I am running a math-oriented computation that spends a significant amount of its time doing memcpy, always copying 80 bytes from one location to the next: an array of 20 32-bit ints. The total computation takes around 4-5 days using both cores of my i7, so even a 1% speedup results in about an hour saved.
By using the memcpy from this paper by Intel, I was able to speed up by about 25%, and dropping the size argument and simply declaring it inside the function also seems to have a small effect. However, I feel I am not exploiting the fact that my copying operations are always the same size. That said, I can't come up with a better way.
void *memcpyi80(void* __restrict b, const void* __restrict a){
    size_t n = 80;
    char *s1 = b;
    const char *s2 = a;
    for(; 0 < n; --n) *s1++ = *s2++;
    return b;
}
Some other things that may be useful for optimization:
I use an Intel Core i7-2620M, based on Sandy Bridge. I don't care about portability at all.
I only care about the 16 least significant bits of every int. The other 16 are useless to me and are permanently zeroed out.
Even though I copy 20 32-bit ints per memcpy invocation, I only care about the first 17. I have added 3 as it helps with alignment and therefore speed.
I use GCC 4.6 on Windows 7.
Any ideas?
UPDATE:
I think this is the assembly output (never done this before, there may be more than you need):
memcpyi80:
pushq %r12
.seh_pushreg %r12
pushq %rbp
.seh_pushreg %rbp
pushq %rdi
.seh_pushreg %rdi
pushq %rsi
.seh_pushreg %rsi
pushq %rbx
.seh_pushreg %rbx
.seh_endprologue
movq %rdx, %r9
movq %rcx, %rax
negq %r9
andl 15,ドル %r9d
je .L165
movzbl (%rdx), %ecx
leaq -1(%r9), %r10
movl 79,ドル %esi
andl 7,ドル %r10d
cmpq 1,ドル %r9
movl 79,ドル %ebx
leaq 1(%rdx), %r8
movl 1,ドル %r11d
movb %cl, (%rax)
leaq 1(%rax), %rcx
jbe .L159
testq %r10, %r10
je .L160
cmpq 1,ドル %r10
je .L250
cmpq 2,ドル %r10
je .L251
cmpq 3,ドル %r10
je .L252
cmpq 4,ドル %r10
je .L253
cmpq 5,ドル %r10
je .L254
cmpq 6,ドル %r10
je .L255
movzbl (%r8), %r8d
movl 2,ドル %r11d
movb %r8b, (%rcx)
leaq 2(%rax), %rcx
leaq 2(%rdx), %r8
.L255:
movzbl (%r8), %ebx
addq 1,ドル %r11
addq 1,ドル %r8
movb %bl, (%rcx)
addq 1,ドル %rcx
.L254:
movzbl (%r8), %r10d
addq 1,ドル %r11
addq 1,ドル %r8
movb %r10b, (%rcx)
addq 1,ドル %rcx
.L253:
movzbl (%r8), %edi
addq 1,ドル %r11
addq 1,ドル %r8
movb %dil, (%rcx)
addq 1,ドル %rcx
.L252:
movzbl (%r8), %ebp
addq 1,ドル %r11
addq 1,ドル %r8
movb %bpl, (%rcx)
addq 1,ドル %rcx
.L251:
movzbl (%r8), %r12d
addq 1,ドル %r11
addq 1,ドル %r8
movb %r12b, (%rcx)
addq 1,ドル %rcx
.L250:
movzbl (%r8), %ebx
addq 1,ドル %r8
movb %bl, (%rcx)
movq %rsi, %rbx
addq 1,ドル %rcx
subq %r11, %rbx
addq 1,ドル %r11
cmpq %r11, %r9
jbe .L159
.p2align 4,,10
.L160:
movzbl (%r8), %r12d
movb %r12b, (%rcx)
movzbl 1(%r8), %ebp
movb %bpl, 1(%rcx)
movzbl 2(%r8), %edi
movb %dil, 2(%rcx)
movzbl 3(%r8), %ebx
movb %bl, 3(%rcx)
leaq 7(%r11), %rbx
addq 8,ドル %r11
movzbl 4(%r8), %r10d
movb %r10b, 4(%rcx)
movq %rsi, %r10
movzbl 5(%r8), %r12d
subq %rbx, %r10
movq %r10, %rbx
movb %r12b, 5(%rcx)
movzbl 6(%r8), %ebp
movb %bpl, 6(%rcx)
movzbl 7(%r8), %edi
addq 8,ドル %r8
movb %dil, 7(%rcx)
addq 8,ドル %rcx
cmpq %r11, %r9
ja .L160
.L159:
movl 80,ドル %r12d
subq %r9, %r12
movq %r12, %rsi
shrq 4,ドル %rsi
movq %rsi, %rbp
salq 4,ドル %rbp
testq %rbp, %rbp
je .L161
leaq (%rdx,%r9), %r10
addq %rax, %r9
movl 1,ドル %r11d
leaq -1(%rsi), %rdi
vmovdqa (%r10), %xmm0
movl 16,ドル %edx
andl 7,ドル %edi
cmpq 1,ドル %rsi
vmovdqu %xmm0, (%r9)
jbe .L256
testq %rdi, %rdi
je .L162
cmpq 1,ドル %rdi
je .L244
cmpq 2,ドル %rdi
je .L245
cmpq 3,ドル %rdi
je .L246
cmpq 4,ドル %rdi
je .L247
cmpq 5,ドル %rdi
je .L248
cmpq 6,ドル %rdi
je .L249
vmovdqa 16(%r10), %xmm3
movl 2,ドル %r11d
movl 32,ドル %edx
vmovdqu %xmm3, 16(%r9)
.L249:
vmovdqa (%r10,%rdx), %xmm4
addq 1,ドル %r11
vmovdqu %xmm4, (%r9,%rdx)
addq 16,ドル %rdx
.L248:
vmovdqa (%r10,%rdx), %xmm5
addq 1,ドル %r11
vmovdqu %xmm5, (%r9,%rdx)
addq 16,ドル %rdx
.L247:
vmovdqa (%r10,%rdx), %xmm0
addq 1,ドル %r11
vmovdqu %xmm0, (%r9,%rdx)
addq 16,ドル %rdx
.L246:
vmovdqa (%r10,%rdx), %xmm1
addq 1,ドル %r11
vmovdqu %xmm1, (%r9,%rdx)
addq 16,ドル %rdx
.L245:
vmovdqa (%r10,%rdx), %xmm2
addq 1,ドル %r11
vmovdqu %xmm2, (%r9,%rdx)
addq 16,ドル %rdx
.L244:
vmovdqa (%r10,%rdx), %xmm3
addq 1,ドル %r11
vmovdqu %xmm3, (%r9,%rdx)
addq 16,ドル %rdx
cmpq %r11, %rsi
jbe .L256
.p2align 4,,10
.L162:
vmovdqa (%r10,%rdx), %xmm2
addq 8,ドル %r11
vmovdqu %xmm2, (%r9,%rdx)
vmovdqa 16(%r10,%rdx), %xmm1
vmovdqu %xmm1, 16(%r9,%rdx)
vmovdqa 32(%r10,%rdx), %xmm0
vmovdqu %xmm0, 32(%r9,%rdx)
vmovdqa 48(%r10,%rdx), %xmm5
vmovdqu %xmm5, 48(%r9,%rdx)
vmovdqa 64(%r10,%rdx), %xmm4
vmovdqu %xmm4, 64(%r9,%rdx)
vmovdqa 80(%r10,%rdx), %xmm3
vmovdqu %xmm3, 80(%r9,%rdx)
vmovdqa 96(%r10,%rdx), %xmm2
vmovdqu %xmm2, 96(%r9,%rdx)
vmovdqa 112(%r10,%rdx), %xmm1
vmovdqu %xmm1, 112(%r9,%rdx)
subq $-128, %rdx
cmpq %r11, %rsi
ja .L162
.L256:
addq %rbp, %rcx
addq %rbp, %r8
subq %rbp, %rbx
cmpq %rbp, %r12
je .L163
.L161:
movzbl (%r8), %edx
leaq -1(%rbx), %r9
andl 7,ドル %r9d
movb %dl, (%rcx)
movl 1,ドル %edx
cmpq %rbx, %rdx
je .L163
testq %r9, %r9
je .L164
cmpq 1,ドル %r9
je .L238
cmpq 2,ドル %r9
je .L239
cmpq 3,ドル %r9
je .L240
cmpq 4,ドル %r9
je .L241
cmpq 5,ドル %r9
je .L242
cmpq 6,ドル %r9
je .L243
movzbl 1(%r8), %edx
movb %dl, 1(%rcx)
movl 2,ドル %edx
.L243:
movzbl (%r8,%rdx), %esi
movb %sil, (%rcx,%rdx)
addq 1,ドル %rdx
.L242:
movzbl (%r8,%rdx), %r11d
movb %r11b, (%rcx,%rdx)
addq 1,ドル %rdx
.L241:
movzbl (%r8,%rdx), %r10d
movb %r10b, (%rcx,%rdx)
addq 1,ドル %rdx
.L240:
movzbl (%r8,%rdx), %edi
movb %dil, (%rcx,%rdx)
addq 1,ドル %rdx
.L239:
movzbl (%r8,%rdx), %ebp
movb %bpl, (%rcx,%rdx)
addq 1,ドル %rdx
.L238:
movzbl (%r8,%rdx), %r12d
movb %r12b, (%rcx,%rdx)
addq 1,ドル %rdx
cmpq %rbx, %rdx
je .L163
.p2align 4,,10
.L164:
movzbl (%r8,%rdx), %r9d
movb %r9b, (%rcx,%rdx)
movzbl 1(%r8,%rdx), %r12d
movb %r12b, 1(%rcx,%rdx)
movzbl 2(%r8,%rdx), %ebp
movb %bpl, 2(%rcx,%rdx)
movzbl 3(%r8,%rdx), %edi
movb %dil, 3(%rcx,%rdx)
movzbl 4(%r8,%rdx), %r10d
movb %r10b, 4(%rcx,%rdx)
movzbl 5(%r8,%rdx), %r11d
movb %r11b, 5(%rcx,%rdx)
movzbl 6(%r8,%rdx), %esi
movb %sil, 6(%rcx,%rdx)
movzbl 7(%r8,%rdx), %r9d
movb %r9b, 7(%rcx,%rdx)
addq 8,ドル %rdx
cmpq %rbx, %rdx
jne .L164
.L163:
popq %rbx
popq %rsi
popq %rdi
popq %rbp
popq %r12
ret
.L165:
movq %rdx, %r8
movl 80,ドル %ebx
jmp .L159
.seh_endproc
.p2align 4,,15
.globl memcpyi
.def memcpyi; .scl 2; .type 32; .endef
.seh_proc memcpyi
UPDATE:
By building on Peter Alexander's solution and combining it with ideas from around the thread, I have produced this:
void memcpyi80(void* __restrict b, const void* __restrict a){
    __m128 *s1 = b;
    const __m128 *s2 = a;
    *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++;
}
The speedup is small but measurable (about 1%). Now I guess my next temptation is to figure out how to use __m256 AVX types so I can do it in 3 steps rather than 5.
UPDATE:
The __m256 type requires alignment on a 32-byte boundary, which makes things slower, so it seems __m128 is the sweet spot.
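(Note: assigning through __m128 * compiles to aligned 16-byte moves, so both buffers have to be 16-byte aligned. With GCC that can be requested roughly like this; the buffer names here are just placeholders:)
int src[20] __attribute__ ((aligned (16)));
int dst[20] __attribute__ ((aligned (16)));
memcpyi80(dst, src);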
9 Answers
The fastest way to do this would be to align your data on 16-byte boundaries, then the entire copy just becomes 5 copies through XMM registers.
This is over twice as fast as your version on my machine.
Store your data like this:
#include <xmmintrin.h>
struct Data
{
    union
    {
        int i[20];
        __m128 v[5];
    };
};
Then the copy function is just:
void memcpyv5(__m128* __restrict b, const __m128* __restrict a)
{
    __m128 t0 = a[0];
    __m128 t1 = a[1];
    __m128 t2 = a[2];
    __m128 t3 = a[3];
    __m128 t4 = a[4];
    b[0] = t0;
    b[1] = t1;
    b[2] = t2;
    b[3] = t3;
    b[4] = t4;
}
// Example
Data dst, src;
memcpyv5(dst.v, src.v);
Assembly output:
__Z8memcpyv5PU8__vectorfPKS_:
LFB493:
pushq %rbp
LCFI2:
movq %rsp, %rbp
LCFI3:
movaps 16(%rsi), %xmm3
movaps 32(%rsi), %xmm2
movaps 48(%rsi), %xmm1
movaps 64(%rsi), %xmm0
movaps (%rsi), %xmm4
movaps %xmm4, (%rdi)
movaps %xmm3, 16(%rdi)
movaps %xmm2, 32(%rdi)
movaps %xmm1, 48(%rdi)
movaps %xmm0, 64(%rdi)
leave
ret
- Great use of the hardware, especially since portability is not a concern. (Jeff Mercado, Oct 23, 2011)
- Not looking good. Simply converting my code to use structs takes it from <15 sec to >40 sec. Using the new memcpy function takes it to 37.5 sec. So the function is better but using structs kills the program. I will look for a way to use the xmmintrin commands without structs to see if anything changes and get back. (Alexandros Marinos, Oct 23, 2011)
- You can avoid using structs by manually aligning your data (check your compiler docs) and just casting it to (__m128*). (Peter Alexander, Oct 23, 2011)
- See update, I've made a version that gives a 1% speedup. Thank you very much for the answer. Any idea how to use __m256 vectors? Cheers. (Alexandros Marinos, Oct 23, 2011)
- @AlexandrosMarinos: Sandybridge does 256b loads / stores in 2 cycles anyway. They're still single-uop instructions, but it's only significantly faster with Haswell, which has 256b data paths to/from L1 cache. If your data is 32B-aligned, it could be a small gain on SnB, but not if you get a store-forwarding stall because it was written very recently with 16B writes. Also, 256b ops are slower than 128b until the CPU stops for thousands of cycles to power up the upper 128b lane in the execution units. It powers down if unused for ~1 ms. agner.org/optimize/blog/read.php?i=142#378 (Peter Cordes, Sep 18, 2015)
Taking Advantage of the Out-of-Order Execution Engine
You can also read about the out-of-order execution engine in the "Intel® 64 and IA-32 Architectures Optimization Reference Manual", section 2.1.2, and take advantage of it.
For example, the Intel Skylake processor series (launched in 2015) has:
- 4 execution units for the arithmetic logic unit (ALU) (add, and, cmp, or, test, xor, movzx, movsx, mov, (v)movdqu, (v)movdqa, (v)movap*, (v)movup*),
- 3 execution units for the vector ALU ((v)pand, (v)por, (v)pxor, (v)movq, (v)movap*, (v)movup*, (v)andp*, (v)orp*, (v)paddb/w/d/q, (v)blendv*, (v)blendp*, (v)pblendd).
So we can occupy the above units (3+4) in parallel only with register-only operations; we cannot issue 3+4 instructions in parallel for a memory copy. At most, we can issue two 32-byte loads and one 32-byte store per cycle, even when working from the Level-1 cache.
Please see the Intel manual to understand how to write the fastest memcpy implementation: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
Section 2.2.2 (The Out-of-Order Engine on the Haswell micro-architecture): "The Scheduler controls the dispatch of micro-ops onto the dispatch ports. There are eight dispatch ports to support the out-of-order execution core. Four of the eight ports provided execution resources for computational operations. The other 4 ports support memory operations of up to two 256-bit load and one 256-bit store operation in a cycle."
Section 2.2.4 (Cache and Memory Subsystem) has the following note: "First level data cache supports two load micro-ops each cycle; each micro-op can fetch up to 32-bytes of data."
Section 2.2.4.1 (Load and Store Operation Enhancements) has the following information: The L1 data cache can handle two 256-bit (32 bytes) load and one 256-bit (32 bytes) store operations each cycle. The unified L2 can service one cache line (64 bytes) each cycle. Additionally, there are 72 load buffers and 42 store buffers available to support micro-ops execution in-flight.
The other sections (2.3 and so on, dedicated to Sandy Bridge and other microarchitectures) basically reiterate the above information.
Section 2.3.4 (The Execution Core) gives additional details.
The scheduler can dispatch up to six micro-ops every cycle, one on each port. The following list summarizes which operations can be dispatched on which port.
- Port 0: ALU, Shift, Mul, STTNI, Int-Div, 128b-Mov, Blend, 256b-Mov
- Port 1: ALU, Fast LEA, Slow LEA, MUL, Shuf, Blend, 128bMov, Add, CVT
- Port 2 & Port 3: Load_Addr, Store_addr
- Port 4: Store_data
- Port 5: ALU, Shift, Branch, Fast LEA, Shuf, Blend, 128b-Mov, 256b-Mov
Section 2.3.5.1 (Load and Store Operation Overview) may also be useful for understanding how to make a fast memory copy, as is section 2.4.4.1 (Loads and Stores).
For the other processor microarchitectures it is, again, two load units and one store unit. Table 2-4 (Cache Parameters of the Skylake Microarchitecture) has the following information:
Peak Bandwidth (bytes/cycle):
- First Level Data Cache: 96 bytes (2x32B Load + 1*32B Store)
- Second Level Cache: 64 bytes
- Third Level Cache: 32 bytes.
I have also done speed tests on my Intel Core i5 6600 CPU (Skylake, 14 nm, released in September 2015) with DDR4 memory, which confirmed the theory. For example, my tests have shown that using generic 64-bit registers for memory copy, even many registers in parallel, degrades performance compared to the larger (XMM) registers. Also, using just 2 XMM registers is enough: adding a 3rd doesn't add performance.
If your CPU has the AVX CPUID bit, you may benefit from the large, 256-bit (32-byte) YMM registers to copy memory and occupy two whole load units. AVX support was first introduced by Intel with the Sandy Bridge processors, shipping in Q1 2011, and later by AMD with the Bulldozer processor, shipping in Q3 2011.
// first cycle - use two load units
vmovdqa ymm0, ymmword ptr [esi+0] // load first part (32 bytes)
vmovdqa ymm1, ymmword ptr [esi+32] // load 2nd part (32 bytes)
// second cycle - use one load unit and one store unit
vmovdqa xmm2, xmmword ptr [esi+64] // load 3rd part (16 bytes)
vmovdqa ymmword ptr [edi+0], ymm0 // store first part
// third cycle - use one store unit
vmovdqa ymmword ptr [edi+32], ymm1 // store 2nd part
// fourth cycle - use one store unit
vmovdqa xmmword ptr [edi+64], xmm2 // store 3rd part
Just make sure your data is aligned by 16 bytes (for the XMM registers), or by 32 bytes (for the YMM registers). Otherwise, there will be an Access Violation error. If the data is not aligned, use unaligned commands: vmovdqu and movups, respectively.
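In C, the same 32 + 32 + 16 byte pattern can be sketched with intrinsics, for example (assuming immintrin.h and AVX; the unaligned variants are used here so it works for any alignment, and the function name is just illustrative):
#include <immintrin.h>

// Sketch: copy 80 bytes as two 32-byte moves plus one 16-byte move.
static inline void copy80_avx(void *dst, const void *src)
{
    const char *s = (const char *)src;
    char *d = (char *)dst;
    __m256i a = _mm256_loadu_si256((const __m256i *)(s + 0));   // bytes 0..31
    __m256i b = _mm256_loadu_si256((const __m256i *)(s + 32));  // bytes 32..63
    __m128i c = _mm_loadu_si128((const __m128i *)(s + 64));     // bytes 64..79
    _mm256_storeu_si256((__m256i *)(d + 0), a);
    _mm256_storeu_si256((__m256i *)(d + 32), b);
    _mm_storeu_si128((__m128i *)(d + 64), c);
}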
If you are lucky enough to have an AVX-512 processor, you can copy 80 bytes in just four instructions:
vmovdqu64 zmm30, [esi]
vmovdqu xmm31, [esi+64]
vmovdqu64 [edi], zmm30
vmovdqu [edi+64], xmm31
We are using registers 30 and 31 here to avoid the "upper 256 dirty" state, which is a global state that may incur SSE/AVX transition penalties. Moreover, on some CPU models, vzeroupper or vzeroall is the only way to exit this state, or even to restore max turbo after dirtying a ZMM register. The CPU, however, will not enter this state for writes to (x/y/z)mm16-31, registers which do not exist in SSE/AVX1/AVX2.
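From C you cannot force the compiler to pick zmm30/zmm31 (register allocation is up to it), but the same four-instruction pattern can be sketched with intrinsics roughly like this (assuming AVX-512F and immintrin.h; the function name is just illustrative):
#include <immintrin.h>

// Sketch: copy 80 bytes as one 64-byte and one 16-byte unaligned move.
static inline void copy80_avx512(void *dst, const void *src)
{
    __m512i lo = _mm512_loadu_si512(src);                                     // bytes 0..63
    __m128i hi = _mm_loadu_si128((const __m128i *)((const char *)src + 64));  // bytes 64..79
    _mm512_storeu_si512(dst, lo);
    _mm_storeu_si128((__m128i *)((char *)dst + 64), hi);
}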
Further reading: ERMSB (not needed for copying exactly 80 bytes, but useful for much larger blocks)
If your CPU has the ERMSB (Enhanced REP MOVSB) CPUID bit, then the rep movsb instruction is executed differently than on older processors, and it will be faster than rep movsd (movsq). However, the benefits of rep movsb are only noticeable on large blocks.
rep movsb is faster than a plain "mov rax in a loop" copy only starting from 256-byte blocks, and faster than an AVX copy only starting from 2048-byte blocks.
So, since your block size is 80 bytes only, ERMSB will not give you any benefit.
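(For completeness: from GCC, a rep movsb copy can be expressed with inline assembly roughly as below. This is only a sketch for much larger blocks, not something that helps the 80-byte case.)
#include <stddef.h>

// Sketch: ERMSB-style copy; only worthwhile for large blocks on CPUs with ERMSB.
static inline void copy_rep_movsb(void *dst, const void *src, size_t n)
{
    __asm__ volatile ("rep movsb"
                      : "+D" (dst), "+S" (src), "+c" (n)
                      :
                      : "memory");
}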
Get Microsoft Visual Studio, and look for memcpy.asm - it has different scenarios for different processors and different block sizes - so you will be able to figure out which method is best to use for your processor and your block size.
Meanwhile, I consider Intel's ERMSB "half-baked", because of its high internal startup cost (about 35 cycles) and because of the other limitations.
See the Intel Manual on Optimization, section 3.7.6 Enhanced REP MOVSB and STOSB operation (ERMSB) http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
- startup cost is 35 cycles;
- both the source and destination addresses have to be aligned to a 16-Byte boundary;
- the source region should not overlap with the destination region;
- the length has to be a multiple of 64 to produce higher performance;
- the direction has to be forward (CLD).
I hope that in the future Intel will eliminate such high startup costs.
If you really need this part as fast as possible, one obvious route would be to write it in assembly language. The assembly you've posted looks a bit on the insane side for this task (at least to me). Given a fixed size, the obvious approach would be something like:
; warning: I haven't written a lot of assembly code recently -- I could have
; some of the syntax a bit wrong.
;
memcpyi80 proc dest:ptr byte, src:ptr byte
    mov esi, src
    mov edi, dest
    mov ecx, 20      ; 80/4
    rep movsd
    ret
memcpyi80 endp
That is definitely open to improvement by using (for one example) moves through the SSE registers, but I'll leave that for others to play with. The improvement is pretty small though: recent processors have a special path specifically for memory copies, which this will use, so it's pretty competitive despite its simplicity.
@Mike Dunlavey's comment is good though: most of the time people think they need a faster memory copy, they really need to re-think their code to simply avoid needing it.
- Really old answer, but it wasn't until IvyBridge that fast string ops were introduced (smarter microcode for rep movsb with lower startup overhead, and I think better handling of misalignment). intel.com/content/dam/doc/manual/… section 3.7.7. Since vector-copy wins for general memcpy sizes under 128 bytes even on IvB, and in this case the size is an exact multiple of the vector width, using vectors is going to be better even on IvB and later with fast movsb. (Peter Cordes, Sep 18, 2015)
- Please also consider adding the CLD instruction to clear the direction flag, just in case it was not cleared after an eventual previous move with STD. (Maxim Masiutin, Oct 29, 2020)
What is the assembly generated?
I remember finding that using structs can speed things up:
typedef struct {
    int x[17] __attribute__ ((packed));
    int padding __attribute__ ((packed, unused));
} cbytes __attribute__ ((packed));
void *memcpyi80(cbytes* __restrict b, const cbytes* __restrict a){
    size_t n = 80 / sizeof(cbytes);
    cbytes *s1 = b;
    const cbytes *s2 = a;
    for(; 0 < n; --n) *s1++ = *s2++;
    return b;
}
- Thank you. Added asm output. Will try the struct approach and let you know. (Am afk for the next 20h unfortunately) (Alexandros Marinos, Oct 22, 2011)
The code below is optimized:
#include <string.h>

void *memcpyi72(void* __restrict b, const void* __restrict a)
{
    return memcpy(b, a, 18*sizeof(int));
}
GCC with -O3 generates the same assembly for this function as for the Pubby8 code. There's no need to use structs.
- Using this slows down my computation by approx. 5%. (Alexandros Marinos, Oct 23, 2011)
You know what the size is, and you know it's ints, so do a little insider-trading:
void myCopy(int* dest, int* src){
    dest[ 0] = src[ 0];
    dest[ 1] = src[ 1];
    dest[ 2] = src[ 2];
    ...
    dest[19] = src[19];
}
- This gives approx. 15% slowdown. (Alexandros Marinos, Oct 23, 2011)
- @Alex: Hmm... Then the next thing I would do is take maybe 20 stackshots so I would be confirming / deconfirming any guesses I might have about what's really going on. (Mike Dunlavey, Oct 23, 2011)
The compiler cannot vectorize your version. If you simply change the for loop to be indexed instead of dereferenced, you will see a huge speed improvement. I get a >10x speedup for this:
void *memcpyi80(void* __restrict b, const void* __restrict a) {
    size_t n = 80;
    char *s1 = b;
    const char *s2 = a;
    for(; 0 < n; --n) {
        s1[n] = s2[n];
    }
    return b;
}
- This does not appear to copy correctly. (Alexandros Marinos, Oct 23, 2011)
- -1: array indices start at 0, your code assumes they start at 1. (Tom Knapen, Oct 23, 2011)
- With Tom's fix it takes about 80% longer to run than my current implementation. (Alexandros Marinos, Oct 23, 2011)
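For reference, the index-based version with the off-by-one corrected as the comments suggest (indices 0..79 instead of 1..80) would be:
#include <stddef.h>

void *memcpyi80(void* __restrict b, const void* __restrict a) {
    char *s1 = b;
    const char *s2 = a;
    for (size_t n = 0; n < 80; ++n) {
        s1[n] = s2[n];
    }
    return b;
}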
You are copying byte by byte, so it would be a lot faster to copy int by int instead. Also, unrolling the loop should help:
void *memcpyi80(void* __restrict b, const void* __restrict a){
    int* s1 = b;
    const int* s2 = a;
    *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++;
    *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++;
    *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++;
    *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++;
    *s1++ = *s2++;
    // *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++;
    return b;
}
In C# I have found that separating the access and incrementation is faster, so that's worth a try:
void *memcpyi80(void* __restrict b, const void* __restrict a){
    int* s1 = b;
    const int* s2 = a;
    *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++;
    *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++;
    *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++;
    *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++;
    *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++;
    *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++;
    // *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++; *s1 = *s2;
    return b;
}
- This slows it down by >13% on mine. (Alexandros Marinos, Oct 23, 2011)
- Manual unrolling doesn't make sense. The compiler knows how to do this for you. (Reinderien, Jul 25, 2020)
There's no way any solution in C or C++ could be better than assembly (unless, of course, it was horribly written). The answer with the assembly language from Jerry Coffin above...
memcpyi80 proc dest:ptr byte, src:ptr byte
    mov esi, src     ; load source address
    mov edi, dest    ; load destination address
    mov ecx, 20      ; initialize count register (80/4)
    rep movsd        ; perform transfer
    ret              ; return to caller
memcpyi80 endp
cannot be improved upon, in my opinion, unless it's possible to use a smaller number of larger operands. Naturally, the memory addresses need to be aligned properly. The rep movsd instruction is the only part of the code that does any work, automatically advancing the pointers and decrementing the count register until the operation is complete.
What you might try is to pass the count as a separate parameter and then split the data into as many parts as you have cores, and call the function with a separate thread for each part.
- Oops, the formatting of the assembly got nuked there... (Fred, Oct 23, 2011)
- Another thing: if you run on a 64-bit OS and use 64-bit assembly, you should also be able to use 64-bit operands, i.e. 8 bytes at a time instead of just 4 using 32-bit. (Fred, Oct 23, 2011)
- You want to multithread an 80-byte copy??? Even having a different thread do the whole copy would be a huge performance hit, because the cache line would end up in the "modified" state in the L1 cache of another core, and would have to be transferred back to the core running the main thread. Not to mention that there'd be no way to send a request to another thread in less time than it takes to just copy 80 bytes. rep movsd isn't terrible, but the microcode has high startup overhead. For short copies, fully-unrolled SSE / AVX is faster. (Peter Cordes, Sep 19, 2015)