Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 04e4dfb

Browse files
committed
.
1 parent 3445d43 commit 04e4dfb

File tree

1 file changed

+77
-0
lines changed

1 file changed

+77
-0
lines changed

‎templates/run_asm.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,83 @@ def compile_asm(s,ftype):
144144
''',CFUNCTYPE(c_int,POINTER(c_int),c_int)) #129ms
145145

146146

147+
# sum of a c_int array; disassembly of a build with -Ofast -mavx -mavx2
148+
asm_sum_Ofast_avx=compile_asm('''
149+
0: 49 89 f8 mov r8,rdi
150+
3: 89 f2 mov edx,esi
151+
5: 85 f6 test esi,esi
152+
7: 0f 8e b5 00 00 00 jle c2 <L7>
153+
d: 8d 46 ff lea eax,[rsi-0x1]
154+
10: 83 f8 06 cmp eax,0x6
155+
13: 0f 86 b0 00 00 00 jbe c9 <L8>
156+
19: 89 f1 mov ecx,esi
157+
1b: 48 89 f8 mov rax,rdi
158+
1e: c5 f1 ef c9 vpxor xmm1,xmm1,xmm1
159+
22: c1 e9 03 shr ecx,0x3
160+
25: 48 c1 e1 05 shl rcx,0x5
161+
29: 48 01 f9 add rcx,rdi
162+
000000000000002c <L4>:
163+
2c: c5 f5 fe 08 vpaddd ymm1,ymm1,YMMWORD PTR [rax]
164+
30: 48 83 c0 20 add rax,0x20
165+
34: 48 39 c8 cmp rax,rcx
166+
37: 75 f3 jne 2c <L4>
167+
39: c5 f9 6f c1 vmovdqa xmm0,xmm1
168+
3d: c4 e3 7d 39 c9 01 vextracti128 xmm1,ymm1,0x1
169+
43: 89 d1 mov ecx,edx
170+
45: c5 f9 fe c1 vpaddd xmm0,xmm0,xmm1
171+
49: 83 e1 f8 and ecx,0xfffffff8
172+
4c: c5 f1 73 d8 08 vpsrldq xmm1,xmm0,0x8
173+
51: c5 f9 fe c1 vpaddd xmm0,xmm0,xmm1
174+
55: c5 f1 73 d8 04 vpsrldq xmm1,xmm0,0x4
175+
5a: c5 f9 fe c1 vpaddd xmm0,xmm0,xmm1
176+
5e: c5 f9 7e c0 vmovd eax,xmm0
177+
62: f6 c2 07 test dl,0x7
178+
65: 74 5e je c5 <L12>
179+
67: c5 f8 77 vzeroupper
180+
000000000000006a <L3>:
181+
6a: 48 63 f9 movsxd rdi,ecx
182+
6d: 48 8d 34 bd 00 00 00 lea rsi,[rdi*4+0x0]
183+
74: 00
184+
75: 41 03 04 b8 add eax,DWORD PTR [r8+rdi*4]
185+
79: 8d 79 01 lea edi,[rcx+0x1]
186+
7c: 39 fa cmp edx,edi
187+
7e: 7e 44 jle c4 <L1>
188+
80: 8d 79 02 lea edi,[rcx+0x2]
189+
83: 41 03 44 30 04 add eax,DWORD PTR [r8+rsi*1+0x4]
190+
88: 39 fa cmp edx,edi
191+
8a: 7e 38 jle c4 <L1>
192+
8c: 8d 79 03 lea edi,[rcx+0x3]
193+
8f: 41 03 44 30 08 add eax,DWORD PTR [r8+rsi*1+0x8]
194+
94: 39 fa cmp edx,edi
195+
96: 7e 2c jle c4 <L1>
196+
98: 8d 79 04 lea edi,[rcx+0x4]
197+
9b: 41 03 44 30 0c add eax,DWORD PTR [r8+rsi*1+0xc]
198+
a0: 39 fa cmp edx,edi
199+
a2: 7e 20 jle c4 <L1>
200+
a4: 8d 79 05 lea edi,[rcx+0x5]
201+
a7: 41 03 44 30 10 add eax,DWORD PTR [r8+rsi*1+0x10]
202+
ac: 39 fa cmp edx,edi
203+
ae: 7e 14 jle c4 <L1>
204+
b0: 83 c1 06 add ecx,0x6
205+
b3: 41 03 44 30 14 add eax,DWORD PTR [r8+rsi*1+0x14]
206+
b8: 39 ca cmp edx,ecx
207+
ba: 7e 08 jle c4 <L1>
208+
bc: 41 03 44 30 18 add eax,DWORD PTR [r8+rsi*1+0x18]
209+
c1: c3 ret
210+
00000000000000c2 <L7>:
211+
c2: 31 c0 xor eax,eax
212+
00000000000000c4 <L1>:
213+
c4: c3 ret
214+
00000000000000c5 <L12>:
215+
c5: c5 f8 77 vzeroupper
216+
c8: c3 ret
217+
00000000000000c9 <L8>:
218+
c9: 31 c9 xor ecx,ecx
219+
cb: 31 c0 xor eax,eax
220+
cd: eb 9b jmp 6a <L3>
221+
''',CFUNCTYPE(c_int,POINTER(c_int),c_int)) #59ms
222+
223+
147224
# sum of x//d for each x in a
148225
asm_sum_div=compile_asm('''
149226
0: 89 d3 mov ebx,edx

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /