Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit a0db941

Browse files
committed
.
1 parent c3efa74 commit a0db941

4 files changed

+297
-44
lines changed
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
import time
2+
from ctypes import *
3+
import mmap
4+
5+
#https://defuse.ca/online-x86-assembler.htm#disassembly2
6+
def translate(s):
    """Extract the raw machine-code bytes from a disassembly listing.

    Each instruction line is expected to look like
    ``<offset>: <hex byte> <hex byte> ... <mnemonic> <operands>``
    (the format produced by objdump or the defuse.ca online assembler).
    Everything up to the first ':' is ignored; the two-digit hex tokens that
    follow are collected until the first token that is not exactly two hex
    digits (the mnemonic).  Lines without a ':' and label lines such as
    ``000000000000004c <L5>:`` contribute nothing.

    The parsing is token based, so it does not depend on whether the columns
    are separated by tabs, single spaces or runs of spaces.

    NOTE: a mnemonic that itself consists of exactly two hex digits would be
    mistaken for a byte; none of the listings in this file contain one.

    Args:
        s: multi-line disassembly text.

    Returns:
        bytes: the concatenated instruction bytes in listing order.
    """
    hex_digits = '0123456789abcdefABCDEF'
    code = bytearray()
    for line in s.splitlines():
        _, colon, rest = line.partition(':')
        if not colon:
            continue
        for token in rest.split():
            # The first token that is not a 2-digit hex byte is the mnemonic.
            if len(token) != 2 or token[0] not in hex_digits or token[1] not in hex_digits:
                break
            code.append(int(token, 16))
    return bytes(code)
16+
17+
def compile_asm(s,ftype):
    """Load a disassembly listing into executable memory and wrap it as a callable.

    The listing is converted to machine code with ``translate``, copied into
    an anonymous read/write/execute mapping, and the start of that mapping is
    wrapped in the ctypes function prototype ``ftype``.

    Args:
        s: disassembly listing understood by ``translate``.
        ftype: ctypes function prototype (e.g. from ``CFUNCTYPE``).

    Returns:
        An ``ftype`` instance whose entry point is the first byte of the code.

    Side effects:
        Rebinds the module-level ``buf`` to the mapping created by this call.
    """
    global buf
    code = translate(s)
    # Round up to whole pages so listings larger than one page still fit.
    pages = max(1, -(-len(code) // mmap.PAGESIZE))
    buf = mmap.mmap(-1, pages * mmap.PAGESIZE,
                    prot=mmap.PROT_READ | mmap.PROT_WRITE | mmap.PROT_EXEC)
    buf.write(code)
    anchor = c_void_p.from_buffer(buf)
    func = ftype(addressof(anchor))
    # A function built from a raw address holds no reference to the mapping.
    # Without this, the next compile_asm() call rebinds `buf`, the old mmap is
    # unmapped, and previously returned functions jump into freed memory.
    func._code_keepalive = (anchor, buf)
    return func
23+
24+
#order: edi,esi,edx,ecx,r8d
25+
#-Ofast
26+
# asm_xor_in(ptr, n, y, l, r) -> int
# Counts the elements x among the n ints at ptr with l <= (x ^ y) <= r
# (signed 32-bit comparisons).  The routine reads five arguments
# (rdi, esi, edx, ecx, r8d), so the prototype declares all five; previously
# the fifth was only passed through ctypes' untyped extra-argument handling.
# Compiler output (-Ofast): a 4-wide SSE2 main loop followed by a scalar tail.
asm_xor_in=compile_asm('''
0: 85 f6 test esi,esi
2: 0f 8e 07 01 00 00 jle 10f <L7>
8: 8d 46 ff lea eax,[rsi-0x1]
b: 49 89 f9 mov r9,rdi
e: 89 d7 mov edi,edx
10: 83 f8 02 cmp eax,0x2
13: 0f 86 f9 00 00 00 jbe 112 <L8>
19: 66 0f 6e fa movd xmm7,edx
1d: 89 f2 mov edx,esi
1f: 66 0f ef d2 pxor xmm2,xmm2
23: 4c 89 c8 mov rax,r9
26: 66 0f 70 f7 00 pshufd xmm6,xmm7,0x0
2b: c1 ea 02 shr edx,0x2
2e: 66 0f 6e f9 movd xmm7,ecx
32: 66 0f 6f da movdqa xmm3,xmm2
36: 66 0f 70 ef 00 pshufd xmm5,xmm7,0x0
3b: 48 c1 e2 04 shl rdx,0x4
3f: 66 41 0f 6e f8 movd xmm7,r8d
44: 66 0f 70 e7 00 pshufd xmm4,xmm7,0x0
49: 4c 01 ca add rdx,r9
000000000000004c <L5>:
4c: f3 0f 6f 00 movdqu xmm0,XMMWORD PTR [rax]
50: 66 0f 6f cd movdqa xmm1,xmm5
54: 48 83 c0 10 add rax,0x10
58: 66 0f ef c6 pxor xmm0,xmm6
5c: 66 0f 66 c8 pcmpgtd xmm1,xmm0
60: 66 0f 66 c4 pcmpgtd xmm0,xmm4
64: 66 0f 76 cb pcmpeqd xmm1,xmm3
68: 66 0f 76 c3 pcmpeqd xmm0,xmm3
6c: 66 0f db c1 pand xmm0,xmm1
70: 66 0f fa d0 psubd xmm2,xmm0
74: 48 39 d0 cmp rax,rdx
77: 75 d3 jne 4c <L5>
79: 66 0f 6f c2 movdqa xmm0,xmm2
7d: 89 f2 mov edx,esi
7f: 66 0f 73 d8 08 psrldq xmm0,0x8
84: 83 e2 fc and edx,0xfffffffc
87: 66 0f fe d0 paddd xmm2,xmm0
8b: 66 0f 6f c2 movdqa xmm0,xmm2
8f: 66 0f 73 d8 04 psrldq xmm0,0x4
94: 66 0f fe d0 paddd xmm2,xmm0
98: 66 0f 7e d0 movd eax,xmm2
9c: 40 f6 c6 03 test sil,0x3
a0: 74 6c je 10e <L13>
00000000000000a2 <L3>:
a2: 4c 63 d2 movsxd r10,edx
a5: 53 push rbx
a6: 47 8b 1c 91 mov r11d,DWORD PTR [r9+r10*4]
aa: 41 31 fb xor r11d,edi
ad: 41 39 cb cmp r11d,ecx
b0: 0f 9d c3 setge bl
b3: 45 31 d2 xor r10d,r10d
b6: 45 39 c3 cmp r11d,r8d
b9: 41 0f 9e c2 setle r10b
bd: 41 21 da and r10d,ebx
c0: 44 01 d0 add eax,r10d
c3: 44 8d 52 01 lea r10d,[rdx+0x1]
c7: 44 39 d6 cmp esi,r10d
ca: 7e 40 jle 10c <L1>
cc: 4d 63 d2 movsxd r10,r10d
cf: 47 8b 1c 91 mov r11d,DWORD PTR [r9+r10*4]
d3: 41 31 fb xor r11d,edi
d6: 44 39 d9 cmp ecx,r11d
d9: 0f 9e c3 setle bl
dc: 45 31 d2 xor r10d,r10d
df: 45 39 d8 cmp r8d,r11d
e2: 41 0f 9d c2 setge r10b
e6: 83 c2 02 add edx,0x2
e9: 41 21 da and r10d,ebx
ec: 44 01 d0 add eax,r10d
ef: 39 d6 cmp esi,edx
f1: 7e 19 jle 10c <L1>
f3: 48 63 d2 movsxd rdx,edx
f6: 41 33 3c 91 xor edi,DWORD PTR [r9+rdx*4]
fa: 41 39 f8 cmp r8d,edi
fd: 40 0f 9d c6 setge sil
101: 31 d2 xor edx,edx
103: 39 f9 cmp ecx,edi
105: 0f 9e c2 setle dl
108: 21 f2 and edx,esi
10a: 01 d0 add eax,edx
000000000000010c <L1>:
10c: 5b pop rbx
10d: c3 ret
000000000000010e <L13>:
10e: c3 ret
000000000000010f <L7>:
10f: 31 c0 xor eax,eax
111: c3 ret
0000000000000112 <L8>:
112: 31 d2 xor edx,edx
114: 31 c0 xor eax,eax
116: eb 8a jmp a2 <L3>
''',CFUNCTYPE(c_int,POINTER(c_int),c_int,c_int,c_int,c_int))
121+
122+
class Solution:
    """Pair counting driven by the native ``asm_xor_in`` routine."""

    def countPairs(self, nums: List[int], l: int, r: int) -> int:
        """Count index pairs i < j with l <= nums[i] ^ nums[j] <= r.

        The values are copied into a contiguous C int array; for every i the
        native routine scans the suffix that follows nums[i] and returns how
        many of its elements satisfy the range test when XORed with nums[i].

        Args:
            nums: values to pair up (each must fit in a C int).
            l: inclusive lower bound on the XOR.
            r: inclusive upper bound on the XOR.

        Returns:
            The number of qualifying pairs.
        """
        n = len(nums)
        a = (c_int * n)(*nums)
        base = addressof(a)
        stride = sizeof(c_int)  # element size, instead of a hard-coded 4
        ans = 0
        # The last element has an empty suffix, so it is skipped rather than
        # handing the routine a one-past-the-end pointer with a count of 0.
        for i in range(n - 1):
            suffix = cast(base + stride * (i + 1), POINTER(c_int))
            ans += asm_xor_in(suffix, n - i - 1, a[i], l, r)
        return ans
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
import time
2+
from ctypes import *
3+
import mmap
4+
5+
#https://defuse.ca/online-x86-assembler.htm#disassembly2
6+
def translate(s):
    """Extract the raw machine-code bytes from a disassembly listing.

    Each instruction line is expected to look like
    ``<offset>: <hex byte> <hex byte> ... <mnemonic> <operands>``
    (the format produced by objdump or the defuse.ca online assembler).
    Everything up to the first ':' is ignored; the two-digit hex tokens that
    follow are collected until the first token that is not exactly two hex
    digits (the mnemonic).  Lines without a ':' and label lines such as
    ``0000000000000016 <L3>:`` contribute nothing.

    The parsing is token based, so it does not depend on whether the columns
    are separated by tabs, single spaces or runs of spaces, nor on leading
    whitespace before the offset.

    NOTE: a mnemonic that itself consists of exactly two hex digits would be
    mistaken for a byte; none of the listings in this file contain one.

    Args:
        s: multi-line disassembly text.

    Returns:
        bytes: the concatenated instruction bytes in listing order.
    """
    hex_digits = '0123456789abcdefABCDEF'
    code = bytearray()
    for line in s.splitlines():
        _, colon, rest = line.partition(':')
        if not colon:
            continue
        for token in rest.split():
            # The first token that is not a 2-digit hex byte is the mnemonic.
            if len(token) != 2 or token[0] not in hex_digits or token[1] not in hex_digits:
                break
            code.append(int(token, 16))
    return bytes(code)
14+
15+
def compile_asm(s,ftype):
    """Load a disassembly listing into executable memory and wrap it as a callable.

    The listing is converted to machine code with ``translate``, copied into
    an anonymous read/write/execute mapping, and the start of that mapping is
    wrapped in the ctypes function prototype ``ftype``.

    Args:
        s: disassembly listing understood by ``translate``.
        ftype: ctypes function prototype (e.g. from ``CFUNCTYPE``).

    Returns:
        An ``ftype`` instance whose entry point is the first byte of the code.

    Side effects:
        Rebinds the module-level ``buf`` to the mapping created by this call.
    """
    global buf
    code = translate(s)
    # Round up to whole pages so listings larger than one page still fit.
    pages = max(1, -(-len(code) // mmap.PAGESIZE))
    buf = mmap.mmap(-1, pages * mmap.PAGESIZE,
                    prot=mmap.PROT_READ | mmap.PROT_WRITE | mmap.PROT_EXEC)
    buf.write(code)
    anchor = c_void_p.from_buffer(buf)
    func = ftype(addressof(anchor))
    # A function built from a raw address holds no reference to the mapping.
    # Without this, the next compile_asm() call rebinds `buf`, the old mmap is
    # unmapped, and previously returned functions jump into freed memory.
    func._code_keepalive = (anchor, buf)
    return func
21+
22+
#order: edi,esi,edx,ecx,r8d
23+
#-O2
24+
# asm_xor_in(ptr, n, y, l, r) -> int
# Counts the elements x among the n ints at ptr with l <= (x ^ y) <= r
# (signed 32-bit comparisons); returns 0 when n <= 0.  The routine reads five
# arguments (rdi, esi, edx, ecx, r8d), so the prototype declares all five;
# previously the fifth was only passed through ctypes' untyped
# extra-argument handling.
asm_xor_in=compile_asm('''
0: 89 f0 mov eax,esi
2: 41 89 ca mov r10d,ecx
5: 89 d6 mov esi,edx
7: 85 c0 test eax,eax
9: 7e 2f jle 3a <L4>
b: 83 e8 01 sub eax,0x1
e: 45 31 c9 xor r9d,r9d
11: 4c 8d 5c 87 04 lea r11,[rdi+rax*4+0x4]
0000000000000016 <L3>:
16: 8b 17 mov edx,DWORD PTR [rdi]
18: 31 f2 xor edx,esi
1a: 44 39 d2 cmp edx,r10d
1d: 0f 9d c1 setge cl
20: 31 c0 xor eax,eax
22: 44 39 c2 cmp edx,r8d
25: 0f 9e c0 setle al
28: 48 83 c7 04 add rdi,0x4
2c: 21 c8 and eax,ecx
2e: 41 01 c1 add r9d,eax
31: 49 39 fb cmp r11,rdi
34: 75 e0 jne 16 <L3>
36: 44 89 c8 mov eax,r9d
39: c3 ret
000000000000003a <L4>:
3a: 45 31 c9 xor r9d,r9d
3d: 44 89 c8 mov eax,r9d
40: c3 ret
''',CFUNCTYPE(c_int,POINTER(c_int),c_int,c_int,c_int,c_int))
53+
54+
class Solution:
    """Pair counting driven by the native ``asm_xor_in`` routine."""

    def countPairs(self, nums: List[int], l: int, r: int) -> int:
        """Count index pairs i < j with l <= nums[i] ^ nums[j] <= r.

        The values are copied into a contiguous C int array; for every i the
        native routine scans the suffix that follows nums[i] and returns how
        many of its elements satisfy the range test when XORed with nums[i].

        Args:
            nums: values to pair up (each must fit in a C int).
            l: inclusive lower bound on the XOR.
            r: inclusive upper bound on the XOR.

        Returns:
            The number of qualifying pairs.
        """
        n = len(nums)
        a = (c_int * n)(*nums)
        base = addressof(a)
        stride = sizeof(c_int)  # element size, instead of a hard-coded 4
        ans = 0
        # The last element has an empty suffix, so it is skipped rather than
        # handing the routine a one-past-the-end pointer with a count of 0.
        for i in range(n - 1):
            suffix = cast(base + stride * (i + 1), POINTER(c_int))
            ans += asm_xor_in(suffix, n - i - 1, a[i], l, r)
        return ans

‎templates/run_asm.py

Lines changed: 107 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import time
2-
32
from ctypes import *
43
import mmap
54

@@ -11,25 +10,37 @@
1110
d=7
1211
d1=c_int(d)
1312

14-
#sum
15-
buf=mmap.mmap(-1,mmap.PAGESIZE,prot=mmap.PROT_READ|mmap.PROT_WRITE|mmap.PROT_EXEC)
16-
ftype=CFUNCTYPE(c_int,POINTER(c_int),c_int)
17-
fpointer=c_void_p.from_buffer(buf)
18-
asm_sum=ftype(addressof(fpointer))
19-
buf.write(
20-
b'\xb9\x00\x00\x00\x00' # mov ecx,0
21-
b'\x48\x8d\x14\xb7' # lea rdx,[rsi*4+rdi]
22-
# begin:
23-
b'\x8b\x1f' # mov ebx,DWORD PTR [rdi]
24-
b'\x01\xd9' # add ecx,ebx
25-
b'\x48\x8d\x7f\x04' # lea rdi,0x4[rdi]
26-
b'\x48\x39\xd7' # cmp rdi,rdx
27-
b'\x74\x02' # jz end
28-
b'\xeb\xf1' # jmp begin
29-
# end:
30-
b'\x89\xc8' # mov eax,ecx
31-
b'\xc3' # ret
32-
)
13+
#https://defuse.ca/online-x86-assembler.htm#disassembly2
14+
def translate(s):
    """Extract the raw machine-code bytes from a disassembly listing.

    Each instruction line is expected to look like
    ``<offset>: <hex byte> <hex byte> ... <mnemonic> <operands>``
    (the format produced by objdump or the defuse.ca online assembler).
    Everything up to the first ':' is ignored; the two-digit hex tokens that
    follow are collected until the first token that is not exactly two hex
    digits (the mnemonic).  Lines without a ':' and label lines such as
    ``0000000000000009 <begin>:`` contribute nothing.

    The parsing is token based, so it does not depend on whether the columns
    are separated by tabs, single spaces or runs of spaces, nor on leading
    whitespace before the offset.

    NOTE: a mnemonic that itself consists of exactly two hex digits would be
    mistaken for a byte; none of the listings in this file contain one.

    Args:
        s: multi-line disassembly text.

    Returns:
        bytes: the concatenated instruction bytes in listing order.
    """
    hex_digits = '0123456789abcdefABCDEF'
    code = bytearray()
    for line in s.splitlines():
        _, colon, rest = line.partition(':')
        if not colon:
            continue
        for token in rest.split():
            # The first token that is not a 2-digit hex byte is the mnemonic.
            if len(token) != 2 or token[0] not in hex_digits or token[1] not in hex_digits:
                break
            code.append(int(token, 16))
    return bytes(code)
22+
23+
def compile_asm(s,ftype):
    """Load a disassembly listing into executable memory and wrap it as a callable.

    The listing is converted to machine code with ``translate``, copied into
    an anonymous read/write/execute mapping, and the start of that mapping is
    wrapped in the ctypes function prototype ``ftype``.

    Args:
        s: disassembly listing understood by ``translate``.
        ftype: ctypes function prototype (e.g. from ``CFUNCTYPE``).

    Returns:
        An ``ftype`` instance whose entry point is the first byte of the code.

    Side effects:
        Rebinds the module-level ``buf`` to the mapping created by this call.
    """
    global buf
    code = translate(s)
    # Round up to whole pages so listings larger than one page still fit.
    pages = max(1, -(-len(code) // mmap.PAGESIZE))
    buf = mmap.mmap(-1, pages * mmap.PAGESIZE,
                    prot=mmap.PROT_READ | mmap.PROT_WRITE | mmap.PROT_EXEC)
    buf.write(code)
    anchor = c_void_p.from_buffer(buf)
    func = ftype(addressof(anchor))
    # This script compiles several routines in a row.  A function built from a
    # raw address holds no reference to its mapping, so without this the next
    # compile_asm() call rebinds `buf`, the old mmap is unmapped, and the
    # earlier routines jump into freed memory.
    func._code_keepalive = (anchor, buf)
    return func
29+
30+
#order: edi,esi,edx,ecx,r8d
31+
#sum x in a
32+
# asm_sum(ptr, n) -> int
# Returns the 32-bit wrapping sum of the n ints at ptr.  The loop reads an
# element before testing the end pointer, so n must be >= 1.
# The scratch register is eax rather than ebx: rbx is callee-saved in the
# System V AMD64 ABI and the routine has no prologue to preserve it.  The
# replacement encodings have the same length, so all offsets are unchanged.
asm_sum=compile_asm('''
0: b9 00 00 00 00 mov ecx,0x0
5: 48 8d 14 b7 lea rdx,[rdi+rsi*4]
0000000000000009 <begin>:
9: 8b 07 mov eax,DWORD PTR [rdi]
b: 01 c1 add ecx,eax
d: 48 8d 7f 04 lea rdi,[rdi+0x4]
11: 48 39 d7 cmp rdi,rdx
14: 75 f3 jne 9 <begin>
16: 89 c8 mov eax,ecx
18: c3 ret
''',CFUNCTYPE(c_int,POINTER(c_int),c_int))
3344

3445
t1=time.time()
3546
for i in range(10000):
@@ -38,37 +49,89 @@
3849
#t=sum(a)
3950
t=asm_sum(a1,n1)
4051
#print(t)
41-
print('time sum=',time.time()-t1) #8129 vs 736
52+
print('time sum=',time.time()-t1) #4633 vs 446
53+
54+
55+
#sum parallel 4
56+
asm_sum_4=compile_asm('''
57+
0: 41 b8 00 00 00 00 mov r8d,0x0
58+
6: 41 b9 00 00 00 00 mov r9d,0x0
59+
c: 41 ba 00 00 00 00 mov r10d,0x0
60+
12: 41 bb 00 00 00 00 mov r11d,0x0
61+
18: 48 8d 14 b7 lea rdx,[rdi+rsi*4]
62+
000000000000001c <begin>:
63+
1c: 44 03 07 add r8d,DWORD PTR [rdi]
64+
1f: 44 03 4f 04 add r9d,DWORD PTR [rdi+0x4]
65+
23: 44 03 57 08 add r10d,DWORD PTR [rdi+0x8]
66+
27: 44 03 5f 0c add r11d,DWORD PTR [rdi+0xc]
67+
2b: 48 8d 7f 10 lea rdi,[rdi+0x10]
68+
2f: 48 39 d7 cmp rdi,rdx
69+
32: 75 e8 jne 1c <begin>
70+
34: 44 89 c0 mov eax,r8d
71+
37: 44 01 c8 add eax,r9d
72+
3a: 44 01 d0 add eax,r10d
73+
3d: 44 01 d8 add eax,r11d
74+
40: c3 ret
75+
''',CFUNCTYPE(c_int,POINTER(c_int),c_int))
4276

43-
#sum div
44-
buf=mmap.mmap(-1,mmap.PAGESIZE,prot=mmap.PROT_READ|mmap.PROT_WRITE|mmap.PROT_EXEC)
45-
ftype=CFUNCTYPE(c_int,POINTER(c_int),c_int,c_int)
46-
fpointer=c_void_p.from_buffer(buf)
47-
asm_sum_div=ftype(addressof(fpointer))
48-
buf.write(
49-
b'\x89\xd3' # mov ebx,edx
50-
b'\xb9\x00\x00\x00\x00' # mov ecx,0
51-
b'\x48\x8d\x34\xb7' # lea rsi,[rsi*4+rdi]
52-
# begin:
53-
b'\x8b\x07' # mov eax,DWORD PTR [rdi]
54-
b'\xba\x00\x00\x00\x00' # mov edx,0
55-
b'\xf7\xfb' # idiv ebx
56-
b'\x01\xc1' # add ecx,eax
57-
b'\x48\x8d\x7f\x04' # lea rdi,0x4[rdi]
58-
b'\x48\x39\xf7' # cmp rdi,rsi
59-
b'\x74\x02' # jz end
60-
b'\xeb\xea' # jmp begin
61-
# end:
62-
b'\x89\xc8' # mov eax,ecx
63-
b'\xc3' # ret
64-
)
77+
78+
#sum x//d where x in a
79+
# asm_sum_div(ptr, n, d) -> int
# Returns the 32-bit wrapping sum of x / d over the n ints at ptr.
# Preconditions: n >= 1 (the loop reads before testing the end pointer) and
# d != 0 (idiv faults on a zero divisor).
# Two fixes relative to the earlier encoding:
#   * the divisor lives in r8d instead of ebx -- rbx is callee-saved in the
#     System V AMD64 ABI and the routine has no prologue to preserve it;
#   * edx is sign-extended from eax with cdq before idiv instead of being
#     zeroed, so negative elements divide correctly instead of being treated
#     as large positive 64-bit dividends (which could raise a divide error).
# NOTE: idiv truncates toward zero, whereas Python's // floors; the results
# differ when x and d have opposite signs.
asm_sum_div=compile_asm('''
0: 41 89 d0 mov r8d,edx
3: b9 00 00 00 00 mov ecx,0x0
8: 48 8d 34 b7 lea rsi,[rdi+rsi*4]
000000000000000c <begin>:
c: 8b 07 mov eax,DWORD PTR [rdi]
e: 99 cdq
f: 41 f7 f8 idiv r8d
12: 01 c1 add ecx,eax
14: 48 8d 7f 04 lea rdi,[rdi+0x4]
18: 48 39 f7 cmp rdi,rsi
1b: 75 ef jne c <begin>
1d: 89 c8 mov eax,ecx
1f: c3 ret
''',CFUNCTYPE(c_int,POINTER(c_int),c_int,c_int))
6594

6695
t1=time.time()
6796
for i in range(1000):
6897
#t=sum(x//d for x in a)
6998
t=asm_sum_div(a1,n1,d1)
7099
#print(t)
71-
print('time sum div=',time.time()-t1) #5324 vs 184
100+
print('time sum div=',time.time()-t1) #3788 vs 173
101+
102+
103+
#count x in a s.t. x xor y in [l,r]
104+
# asm_xor_in(ptr, n, y, l, r) -> int
# Counts the elements x among the n ints at ptr with l <= (x ^ y) <= r
# (signed 32-bit comparisons); returns 0 when n <= 0.  The routine reads five
# arguments (rdi, esi, edx, ecx, r8d), so the prototype declares all five;
# with only four declared, the upper bound r had no typed parameter slot.
asm_xor_in=compile_asm('''
0: 89 f0 mov eax,esi
2: 41 89 ca mov r10d,ecx
5: 89 d6 mov esi,edx
7: 85 c0 test eax,eax
9: 7e 2f jle 3a <L4>
b: 83 e8 01 sub eax,0x1
e: 45 31 c9 xor r9d,r9d
11: 4c 8d 5c 87 04 lea r11,[rdi+rax*4+0x4]
0000000000000016 <L3>:
16: 8b 17 mov edx,DWORD PTR [rdi]
18: 31 f2 xor edx,esi
1a: 44 39 d2 cmp edx,r10d
1d: 0f 9d c1 setge cl
20: 31 c0 xor eax,eax
22: 44 39 c2 cmp edx,r8d
25: 0f 9e c0 setle al
28: 48 83 c7 04 add rdi,0x4
2c: 21 c8 and eax,ecx
2e: 41 01 c1 add r9d,eax
31: 49 39 fb cmp r11,rdi
34: 75 e0 jne 16 <L3>
36: 44 89 c8 mov eax,r9d
39: c3 ret
000000000000003a <L4>:
3a: 45 31 c9 xor r9d,r9d
3d: 44 89 c8 mov eax,r9d
40: c3 ret
''',CFUNCTYPE(c_int,POINTER(c_int),c_int,c_int,c_int,c_int))
133+
72134

73135
#del fpointer
74136
#buf.close()
137+

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /