I am running a math-oriented computation that spends a significant amount of its time doing memcpy, always copying 80 bytes from one location to the next: an array of 20 32-bit ints. The total computation takes around 4-5 days using both cores of my i7, so even a 1% speedup results in about an hour saved.
By using the memcpy from this paper by Intel, I was able to speed up by about 25%, and dropping the size argument and simply declaring it inside the function also seems to have a small effect. However, I feel I am not exploiting the fact that my copying operations are always the same size. That said, I can't come up with a better way.
void *memcpyi80(void* __restrict b, const void* __restrict a){
    size_t n = 80;
    char *s1 = b;
    const char *s2 = a;
    for(; 0 < n; --n) *s1++ = *s2++;
    return b;
}
Some other things that may be useful for optimization:
I use an Intel Core i7-2620M, based on Sandy Bridge. I don't care about portability at all.
I only care about the 16 least significant bits of every int. The other 16 are useless to me and are permanently zeroed out.
Even though I copy 20 32-bit ints per memcpy invocation, I only care about the first 17. I have added 3 as it helps with alignment and therefore speed.
I use GCC 4.6 on Windows 7.
Any ideas?
UPDATE:
I think this is the assembly output (never done this before, there may be more than you need):
memcpyi80:
pushq %r12
.seh_pushreg %r12
pushq %rbp
.seh_pushreg %rbp
pushq %rdi
.seh_pushreg %rdi
pushq %rsi
.seh_pushreg %rsi
pushq %rbx
.seh_pushreg %rbx
.seh_endprologue
movq %rdx, %r9
movq %rcx, %rax
negq %r9
andl 15,ドル %r9d
je .L165
movzbl (%rdx), %ecx
leaq -1(%r9), %r10
movl 79,ドル %esi
andl 7,ドル %r10d
cmpq 1,ドル %r9
movl 79,ドル %ebx
leaq 1(%rdx), %r8
movl 1,ドル %r11d
movb %cl, (%rax)
leaq 1(%rax), %rcx
jbe .L159
testq %r10, %r10
je .L160
cmpq 1,ドル %r10
je .L250
cmpq 2,ドル %r10
je .L251
cmpq 3,ドル %r10
je .L252
cmpq 4,ドル %r10
je .L253
cmpq 5,ドル %r10
je .L254
cmpq 6,ドル %r10
je .L255
movzbl (%r8), %r8d
movl 2,ドル %r11d
movb %r8b, (%rcx)
leaq 2(%rax), %rcx
leaq 2(%rdx), %r8
.L255:
movzbl (%r8), %ebx
addq 1,ドル %r11
addq 1,ドル %r8
movb %bl, (%rcx)
addq 1,ドル %rcx
.L254:
movzbl (%r8), %r10d
addq 1,ドル %r11
addq 1,ドル %r8
movb %r10b, (%rcx)
addq 1,ドル %rcx
.L253:
movzbl (%r8), %edi
addq 1,ドル %r11
addq 1,ドル %r8
movb %dil, (%rcx)
addq 1,ドル %rcx
.L252:
movzbl (%r8), %ebp
addq 1,ドル %r11
addq 1,ドル %r8
movb %bpl, (%rcx)
addq 1,ドル %rcx
.L251:
movzbl (%r8), %r12d
addq 1,ドル %r11
addq 1,ドル %r8
movb %r12b, (%rcx)
addq 1,ドル %rcx
.L250:
movzbl (%r8), %ebx
addq 1,ドル %r8
movb %bl, (%rcx)
movq %rsi, %rbx
addq 1,ドル %rcx
subq %r11, %rbx
addq 1,ドル %r11
cmpq %r11, %r9
jbe .L159
.p2align 4,,10
.L160:
movzbl (%r8), %r12d
movb %r12b, (%rcx)
movzbl 1(%r8), %ebp
movb %bpl, 1(%rcx)
movzbl 2(%r8), %edi
movb %dil, 2(%rcx)
movzbl 3(%r8), %ebx
movb %bl, 3(%rcx)
leaq 7(%r11), %rbx
addq 8,ドル %r11
movzbl 4(%r8), %r10d
movb %r10b, 4(%rcx)
movq %rsi, %r10
movzbl 5(%r8), %r12d
subq %rbx, %r10
movq %r10, %rbx
movb %r12b, 5(%rcx)
movzbl 6(%r8), %ebp
movb %bpl, 6(%rcx)
movzbl 7(%r8), %edi
addq 8,ドル %r8
movb %dil, 7(%rcx)
addq 8,ドル %rcx
cmpq %r11, %r9
ja .L160
.L159:
movl 80,ドル %r12d
subq %r9, %r12
movq %r12, %rsi
shrq 4,ドル %rsi
movq %rsi, %rbp
salq 4,ドル %rbp
testq %rbp, %rbp
je .L161
leaq (%rdx,%r9), %r10
addq %rax, %r9
movl 1,ドル %r11d
leaq -1(%rsi), %rdi
vmovdqa (%r10), %xmm0
movl 16,ドル %edx
andl 7,ドル %edi
cmpq 1,ドル %rsi
vmovdqu %xmm0, (%r9)
jbe .L256
testq %rdi, %rdi
je .L162
cmpq 1,ドル %rdi
je .L244
cmpq 2,ドル %rdi
je .L245
cmpq 3,ドル %rdi
je .L246
cmpq 4,ドル %rdi
je .L247
cmpq 5,ドル %rdi
je .L248
cmpq 6,ドル %rdi
je .L249
vmovdqa 16(%r10), %xmm3
movl 2,ドル %r11d
movl 32,ドル %edx
vmovdqu %xmm3, 16(%r9)
.L249:
vmovdqa (%r10,%rdx), %xmm4
addq 1,ドル %r11
vmovdqu %xmm4, (%r9,%rdx)
addq 16,ドル %rdx
.L248:
vmovdqa (%r10,%rdx), %xmm5
addq 1,ドル %r11
vmovdqu %xmm5, (%r9,%rdx)
addq 16,ドル %rdx
.L247:
vmovdqa (%r10,%rdx), %xmm0
addq 1,ドル %r11
vmovdqu %xmm0, (%r9,%rdx)
addq 16,ドル %rdx
.L246:
vmovdqa (%r10,%rdx), %xmm1
addq 1,ドル %r11
vmovdqu %xmm1, (%r9,%rdx)
addq 16,ドル %rdx
.L245:
vmovdqa (%r10,%rdx), %xmm2
addq 1,ドル %r11
vmovdqu %xmm2, (%r9,%rdx)
addq 16,ドル %rdx
.L244:
vmovdqa (%r10,%rdx), %xmm3
addq 1,ドル %r11
vmovdqu %xmm3, (%r9,%rdx)
addq 16,ドル %rdx
cmpq %r11, %rsi
jbe .L256
.p2align 4,,10
.L162:
vmovdqa (%r10,%rdx), %xmm2
addq 8,ドル %r11
vmovdqu %xmm2, (%r9,%rdx)
vmovdqa 16(%r10,%rdx), %xmm1
vmovdqu %xmm1, 16(%r9,%rdx)
vmovdqa 32(%r10,%rdx), %xmm0
vmovdqu %xmm0, 32(%r9,%rdx)
vmovdqa 48(%r10,%rdx), %xmm5
vmovdqu %xmm5, 48(%r9,%rdx)
vmovdqa 64(%r10,%rdx), %xmm4
vmovdqu %xmm4, 64(%r9,%rdx)
vmovdqa 80(%r10,%rdx), %xmm3
vmovdqu %xmm3, 80(%r9,%rdx)
vmovdqa 96(%r10,%rdx), %xmm2
vmovdqu %xmm2, 96(%r9,%rdx)
vmovdqa 112(%r10,%rdx), %xmm1
vmovdqu %xmm1, 112(%r9,%rdx)
subq $-128, %rdx
cmpq %r11, %rsi
ja .L162
.L256:
addq %rbp, %rcx
addq %rbp, %r8
subq %rbp, %rbx
cmpq %rbp, %r12
je .L163
.L161:
movzbl (%r8), %edx
leaq -1(%rbx), %r9
andl 7,ドル %r9d
movb %dl, (%rcx)
movl 1,ドル %edx
cmpq %rbx, %rdx
je .L163
testq %r9, %r9
je .L164
cmpq 1,ドル %r9
je .L238
cmpq 2,ドル %r9
je .L239
cmpq 3,ドル %r9
je .L240
cmpq 4,ドル %r9
je .L241
cmpq 5,ドル %r9
je .L242
cmpq 6,ドル %r9
je .L243
movzbl 1(%r8), %edx
movb %dl, 1(%rcx)
movl 2,ドル %edx
.L243:
movzbl (%r8,%rdx), %esi
movb %sil, (%rcx,%rdx)
addq 1,ドル %rdx
.L242:
movzbl (%r8,%rdx), %r11d
movb %r11b, (%rcx,%rdx)
addq 1,ドル %rdx
.L241:
movzbl (%r8,%rdx), %r10d
movb %r10b, (%rcx,%rdx)
addq 1,ドル %rdx
.L240:
movzbl (%r8,%rdx), %edi
movb %dil, (%rcx,%rdx)
addq 1,ドル %rdx
.L239:
movzbl (%r8,%rdx), %ebp
movb %bpl, (%rcx,%rdx)
addq 1,ドル %rdx
.L238:
movzbl (%r8,%rdx), %r12d
movb %r12b, (%rcx,%rdx)
addq 1,ドル %rdx
cmpq %rbx, %rdx
je .L163
.p2align 4,,10
.L164:
movzbl (%r8,%rdx), %r9d
movb %r9b, (%rcx,%rdx)
movzbl 1(%r8,%rdx), %r12d
movb %r12b, 1(%rcx,%rdx)
movzbl 2(%r8,%rdx), %ebp
movb %bpl, 2(%rcx,%rdx)
movzbl 3(%r8,%rdx), %edi
movb %dil, 3(%rcx,%rdx)
movzbl 4(%r8,%rdx), %r10d
movb %r10b, 4(%rcx,%rdx)
movzbl 5(%r8,%rdx), %r11d
movb %r11b, 5(%rcx,%rdx)
movzbl 6(%r8,%rdx), %esi
movb %sil, 6(%rcx,%rdx)
movzbl 7(%r8,%rdx), %r9d
movb %r9b, 7(%rcx,%rdx)
addq 8,ドル %rdx
cmpq %rbx, %rdx
jne .L164
.L163:
popq %rbx
popq %rsi
popq %rdi
popq %rbp
popq %r12
ret
.L165:
movq %rdx, %r8
movl 80,ドル %ebx
jmp .L159
.seh_endproc
.p2align 4,,15
.globl memcpyi
.def memcpyi; .scl 2; .type 32; .endef
.seh_proc memcpyi
UPDATE:
By building on Peter Alexander's solution and combining it with ideas from around the thread, I have produced this:
void memcpyi80(void* __restrict b, const void* __restrict a){
    __m128 *s1 = b;
    const __m128 *s2 = a;
    *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++;
}
The speedup is small but measurable (about 1%). Now I guess my next temptation is to figure out how to use __m256 AVX types so I can do it in 3 steps rather than 5.
UPDATE:
The __m256 type requires alignment on a 32-byte boundary, which makes things slower, so it seems __m128 is the sweet spot.
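(Note: assigning through __m128 * compiles to aligned 16-byte moves, so both buffers have to be 16-byte aligned. With GCC that can be requested roughly like this; the buffer names here are just placeholders:)
int src[20] __attribute__ ((aligned (16)));
int dst[20] __attribute__ ((aligned (16)));
memcpyi80(dst, src);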
9 Answers
The fastest way to do this would be to align your data on 16-byte boundaries, then the entire copy just becomes 5 copies through XMM registers.
This is over twice as fast as your version on my machine.
Store your data like this:
#include <xmmintrin.h>
struct Data
{
    union
    {
        int i[20];
        __m128 v[5];
    };
};
Then the copy function is just:
void memcpyv5(__m128* __restrict b, const __m128* __restrict a)
{
    __m128 t0 = a[0];
    __m128 t1 = a[1];
    __m128 t2 = a[2];
    __m128 t3 = a[3];
    __m128 t4 = a[4];
    b[0] = t0;
    b[1] = t1;
    b[2] = t2;
    b[3] = t3;
    b[4] = t4;
}
// Example
Data dst, src;
memcpyv5(dst.v, src.v);
Assembly output:
__Z8memcpyv5PU8__vectorfPKS_:
LFB493:
pushq %rbp
LCFI2:
movq %rsp, %rbp
LCFI3:
movaps 16(%rsi), %xmm3
movaps 32(%rsi), %xmm2
movaps 48(%rsi), %xmm1
movaps 64(%rsi), %xmm0
movaps (%rsi), %xmm4
movaps %xmm4, (%rdi)
movaps %xmm3, 16(%rdi)
movaps %xmm2, 32(%rdi)
movaps %xmm1, 48(%rdi)
movaps %xmm0, 64(%rdi)
leave
ret
- Great use of the hardware, especially since portability is not a concern. (Jeff Mercado, Oct 23, 2011)
- Not looking good. Simply converting my code to use structs takes it from <15 sec to >40 sec. Using the new memcpy function takes it to 37.5 sec. So the function is better but using structs kills the program. I will look for a way to use the xmmintrin commands without structs to see if anything changes and get back. (Alexandros Marinos, Oct 23, 2011)
- You can avoid using structs by manually aligning your data (check your compiler docs) and just casting it to (__m128*). (Peter Alexander, Oct 23, 2011)
- See update, I've made a version that gives a 1% speedup. Thank you very much for the answer. Any idea how to use __m256 vectors? Cheers. (Alexandros Marinos, Oct 23, 2011)
- @AlexandrosMarinos: Sandybridge does 256b loads / stores in 2 cycles anyway. They're still single-uop instructions, but it's only significantly faster with Haswell, which has 256b data paths to/from L1 cache. If your data is 32B-aligned, it could be a small gain on SnB, but not if you get a store-forwarding stall because it was written very recently with 16B writes. Also, 256b ops are slower than 128b until the CPU stops for thousands of cycles to power up the upper 128b lane in the execution units. It powers down if unused for ~1 ms. agner.org/optimize/blog/read.php?i=142#378 (Peter Cordes, Sep 18, 2015)
Taking Advantage of the Out-of-Order Execution Engine
You can also read about the out-of-order execution engine in the "Intel® 64 and IA-32 Architectures Optimization Reference Manual", section 2.1.2, and take advantage of it.
For example, the Intel Skylake processor series (launched in 2015) has:
- 4 execution units for the arithmetic logic unit (ALU) (add, and, cmp, or, test, xor, movzx, movsx, mov, (v)movdqu, (v)movdqa, (v)movap*, (v)movup*),
- 3 execution units for the vector ALU ((v)pand, (v)por, (v)pxor, (v)movq, (v)movap*, (v)movup*, (v)andp*, (v)orp*, (v)paddb/w/d/q, (v)blendv*, (v)blendp*, (v)pblendd).
So we can occupy the above units (3+4) in parallel only with register-only operations; we cannot issue 3+4 instructions in parallel for a memory copy. At most, we can issue two 32-byte loads and one 32-byte store per cycle, even when working from the Level-1 cache.
Please see the Intel manual to understand how to write the fastest memcpy implementation: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
Section 2.2.2 (The Out-of-Order Engine on the Haswell micro-architecture): "The Scheduler controls the dispatch of micro-ops onto the dispatch ports. There are eight dispatch ports to support the out-of-order execution core. Four of the eight ports provided execution resources for computational operations. The other 4 ports support memory operations of up to two 256-bit load and one 256-bit store operation in a cycle."
Section 2.2.4 (Cache and Memory Subsystem) has the following note: "First level data cache supports two load micro-ops each cycle; each micro-op can fetch up to 32-bytes of data."
Section 2.2.4.1 (Load and Store Operation Enhancements) has the following information: The L1 data cache can handle two 256-bit (32 bytes) load and one 256-bit (32 bytes) store operations each cycle. The unified L2 can service one cache line (64 bytes) each cycle. Additionally, there are 72 load buffers and 42 store buffers available to support micro-ops execution in-flight.
The other sections (2.3 and so on, dedicated to Sandy Bridge and other microarchitectures) basically reiterate the above information.
Section 2.3.4 (The Execution Core) gives additional details.
The scheduler can dispatch up to six micro-ops every cycle, one on each port. The following list summarizes which operations can be dispatched on which port.
- Port 0: ALU, Shift, Mul, STTNI, Int-Div, 128b-Mov, Blend, 256b-Mov
- Port 1: ALU, Fast LEA, Slow LEA, MUL, Shuf, Blend, 128bMov, Add, CVT
- Port 2 & Port 3: Load_Addr, Store_addr
- Port 4: Store_data
- Port 5: ALU, Shift, Branch, Fast LEA, Shuf, Blend, 128b-Mov, 256b-Mov
Section 2.3.5.1 (Load and Store Operation Overview) may also be useful for understanding how to make a fast memory copy, as is section 2.4.4.1 (Loads and Stores).
For the other processor microarchitectures it is, again, two load units and one store unit. Table 2-4 (Cache Parameters of the Skylake Microarchitecture) has the following information:
Peak Bandwidth (bytes/cycle):
- First Level Data Cache: 96 bytes (2x32B Load + 1*32B Store)
- Second Level Cache: 64 bytes
- Third Level Cache: 32 bytes.
I have also done speed tests on my Intel Core i5 6600 CPU (Skylake, 14 nm, released in September 2015) with DDR4 memory, which confirmed the theory. For example, my tests have shown that using generic 64-bit registers for memory copy, even many registers in parallel, degrades performance compared to the larger (XMM) registers. Also, using just 2 XMM registers is enough: adding a 3rd doesn't add performance.
If your CPU has the AVX CPUID bit, you may benefit from the large, 256-bit (32-byte) YMM registers to copy memory and occupy two whole load units. AVX support was first introduced by Intel with the Sandy Bridge processors, shipping in Q1 2011, and later by AMD with the Bulldozer processor, shipping in Q3 2011.
// first cycle - use two load units
vmovdqa ymm0, ymmword ptr [esi+0] // load first part (32 bytes)
vmovdqa ymm1, ymmword ptr [esi+32] // load 2nd part (32 bytes)
// second cycle - use one load unit and one store unit
vmovdqa xmm2, xmmword ptr [esi+64] // load 3rd part (16 bytes)
vmovdqa ymmword ptr [edi+0], ymm0 // store first part
// third cycle - use one store unit
vmovdqa ymmword ptr [edi+32], ymm1 // store 2nd part
// fourth cycle - use one store unit
vmovdqa xmmword ptr [edi+64], xmm2 // store 3rd part
Just make sure your data is aligned by 16 bytes (for the XMM registers), or by 32 bytes (for the YMM registers). Otherwise, there will be an Access Violation error. If the data is not aligned, use unaligned commands: vmovdqu and movups, respectively.
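In C, the same 32 + 32 + 16 byte pattern can be sketched with intrinsics, for example (assuming immintrin.h and AVX; the unaligned variants are used here so it works for any alignment, and the function name is just illustrative):
#include <immintrin.h>

// Sketch: copy 80 bytes as two 32-byte moves plus one 16-byte move.
static inline void copy80_avx(void *dst, const void *src)
{
    const char *s = (const char *)src;
    char *d = (char *)dst;
    __m256i a = _mm256_loadu_si256((const __m256i *)(s + 0));   // bytes 0..31
    __m256i b = _mm256_loadu_si256((const __m256i *)(s + 32));  // bytes 32..63
    __m128i c = _mm_loadu_si128((const __m128i *)(s + 64));     // bytes 64..79
    _mm256_storeu_si256((__m256i *)(d + 0), a);
    _mm256_storeu_si256((__m256i *)(d + 32), b);
    _mm_storeu_si128((__m128i *)(d + 64), c);
}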
If you are lucky enough to have an AVX-512 processor, you can copy 80 bytes in just four instructions:
vmovdqu64 zmm30, [esi]
vmovdqu xmm31, [esi+64]
vmovdqu64 [edi], zmm30
vmovdqu [edi+64], xmm31
We are using registers 30 and 31 here to avoid the "upper 256 dirty" state, which is a global state that may incur SSE/AVX transition penalties. Moreover, on some CPU models, vzeroupper or vzeroall is the only way to exit this state, or even to restore max turbo after dirtying a ZMM register. The CPU, however, will not enter this state for writes to (x/y/z)mm16-31, registers which do not exist in SSE/AVX1/AVX2.
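From C you cannot force the compiler to pick zmm30/zmm31 (register allocation is up to it), but the same four-instruction pattern can be sketched with intrinsics roughly like this (assuming AVX-512F and immintrin.h; the function name is just illustrative):
#include <immintrin.h>

// Sketch: copy 80 bytes as one 64-byte and one 16-byte unaligned move.
static inline void copy80_avx512(void *dst, const void *src)
{
    __m512i lo = _mm512_loadu_si512(src);                                     // bytes 0..63
    __m128i hi = _mm_loadu_si128((const __m128i *)((const char *)src + 64));  // bytes 64..79
    _mm512_storeu_si512(dst, lo);
    _mm_storeu_si128((__m128i *)((char *)dst + 64), hi);
}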
Further reading: ERMSB (not needed for copying exactly 80 bytes, but useful for much larger blocks)
If your CPU has the ERMSB (Enhanced REP MOVSB) CPUID bit, then the rep movsb instruction is executed differently than on older processors, and it will be faster than rep movsd (movsq). However, the benefits of rep movsb are only noticeable on large blocks.
rep movsb is faster than a plain "mov rax in a loop" copy only starting from 256-byte blocks, and faster than an AVX copy only starting from 2048-byte blocks.
So, since your block size is 80 bytes only, ERMSB will not give you any benefit.
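(For completeness: from GCC, a rep movsb copy can be expressed with inline assembly roughly as below. This is only a sketch for much larger blocks, not something that helps the 80-byte case.)
#include <stddef.h>

// Sketch: ERMSB-style copy; only worthwhile for large blocks on CPUs with ERMSB.
static inline void copy_rep_movsb(void *dst, const void *src, size_t n)
{
    __asm__ volatile ("rep movsb"
                      : "+D" (dst), "+S" (src), "+c" (n)
                      :
                      : "memory");
}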
Get Microsoft Visual Studio, and look for memcpy.asm - it has different scenarios for different processors and different block sizes - so you will be able to figure out which method is best to use for your processor and your block size.
Meanwhile, I consider Intel's ERMSB "half-baked", because of its high internal startup cost (about 35 cycles) and because of the other limitations.
See the Intel Manual on Optimization, section 3.7.6 Enhanced REP MOVSB and STOSB operation (ERMSB) http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
- startup cost is 35 cycles;
- both the source and destination addresses have to be aligned to a 16-Byte boundary;
- the source region should not overlap with the destination region;
- the length has to be a multiple of 64 to produce higher performance;
- the direction has to be forward (CLD).
I hope that in the future Intel will eliminate such high startup costs.
If you really need this part as fast as possible, one obvious route would be to write it in assembly language. The assembly you've posted looks a bit on the insane side for this task (at least to me). Given a fixed size, the obvious approach would be something like:
; warning: I haven't written a lot of assembly code recently -- I could have
; some of the syntax a bit wrong.
;
memcpyi80 proc dest:ptr byte, src:ptr byte
    mov esi, src
    mov edi, dest
    mov ecx, 20      ; 80/4
    rep movsd
    ret
memcpyi80 endp
That is definitely open to improvement by using (for one example) moves through the SSE registers, but I'll leave that for others to play with. The improvement is pretty small though: recent processors have a special path specifically for memory copies, which this will use, so it's pretty competitive despite its simplicity.
@Mike Dunlavey's comment is good though: most of the time people think they need a faster memory copy, they really need to re-think their code to simply avoid needing it.
- Really old answer, but it wasn't until IvyBridge that fast string ops were introduced (smarter microcode for rep movsb with lower startup overhead, and I think better handling of misalignment). intel.com/content/dam/doc/manual/… section 3.7.7. Since vector-copy wins for general memcpy sizes under 128 bytes even on IvB, and in this case the size is an exact multiple of the vector width, using vectors is going to be better even on IvB and later with fast movsb. (Peter Cordes, Sep 18, 2015)
- Please also consider adding the CLD instruction to clear the direction flag, just in case it was not cleared after an eventual previous move with STD. (Maxim Masiutin, Oct 29, 2020)
What is the assembly generated?
I remember finding that using structs can speed things up:
typedef struct {
    int x[17] __attribute__ ((packed));
    int padding __attribute__ ((packed, unused));
} cbytes __attribute__ ((packed));
void *memcpyi80(cbytes* __restrict b, const cbytes* __restrict a){
    size_t n = 80 / sizeof(cbytes);
    cbytes *s1 = b;
    const cbytes *s2 = a;
    for(; 0 < n; --n) *s1++ = *s2++;
    return b;
}
- Thank you. Added asm output. Will try the struct approach and let you know. (Am afk for the next 20h unfortunately) (Alexandros Marinos, Oct 22, 2011)
The code below is optimized:
#include <string.h>

void *memcpyi72(void* __restrict b, const void* __restrict a)
{
    return memcpy(b, a, 18*sizeof(int));
}
GCC with -O3 generates the same assembly for this function as for the Pubby8 code. There's no need to use structs.
- Using this slows down my computation by approx. 5%. (Alexandros Marinos, Oct 23, 2011)
You know what the size is, and you know it's ints, so do a little insider-trading:
void myCopy(int* dest, int* src){
    dest[ 0] = src[ 0];
    dest[ 1] = src[ 1];
    dest[ 2] = src[ 2];
    ...
    dest[19] = src[19];
}
- This gives approx. 15% slowdown. (Alexandros Marinos, Oct 23, 2011)
- @Alex: Hmm... Then the next thing I would do is take maybe 20 stackshots so I would be confirming / deconfirming any guesses I might have about what's really going on. (Mike Dunlavey, Oct 23, 2011)
The compiler cannot vectorize your version. If you simply change the for loop to be indexed instead of dereferenced, you will see a huge speed improvement. I get a >10x speedup for this:
void *memcpyi80(void* __restrict b, const void* __restrict a) {
    size_t n = 80;
    char *s1 = b;
    const char *s2 = a;
    for(; 0 < n; --n) {
        s1[n] = s2[n];
    }
    return b;
}
- This does not appear to copy correctly. (Alexandros Marinos, Oct 23, 2011)
- -1: array indices start at 0, your code assumes they start at 1. (Tom Knapen, Oct 23, 2011)
- With Tom's fix it takes about 80% longer to run than my current implementation. (Alexandros Marinos, Oct 23, 2011)
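For reference, the index-based version with the off-by-one corrected as the comments suggest (indices 0..79 instead of 1..80) would be:
#include <stddef.h>

void *memcpyi80(void* __restrict b, const void* __restrict a) {
    char *s1 = b;
    const char *s2 = a;
    for (size_t n = 0; n < 80; ++n) {
        s1[n] = s2[n];
    }
    return b;
}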
You are copying byte by byte, so it would be a lot faster to copy int by int instead. Also, unrolling the loop should help:
void *memcpyi80(void* __restrict b, const void* __restrict a){
    int* s1 = b;
    const int* s2 = a;
    *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++;
    *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++;
    *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++;
    *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++;
    *s1++ = *s2++;
    // *s1++ = *s2++; *s1++ = *s2++; *s1++ = *s2++;
    return b;
}
In C# I have found that separating the access and incrementation is faster, so that's worth a try:
void *memcpyi80(void* __restrict b, const void* __restrict a){
    int* s1 = b;
    const int* s2 = a;
    *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++;
    *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++;
    *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++;
    *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++;
    *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++;
    *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++;
    // *s1 = *s2; s1++; s2++; *s1 = *s2; s1++; s2++; *s1 = *s2;
    return b;
}
- This slows it down by >13% on mine. (Alexandros Marinos, Oct 23, 2011)
- Manual unrolling doesn't make sense. The compiler knows how to do this for you. (Reinderien, Jul 25, 2020)
There's no way any solution in C or C++ could be better than assembly (unless, of course, it was horribly written). The answer with the assembly language from Jerry Coffin above...
memcpyi80 proc dest:ptr byte, src:ptr byte
    mov esi, src     ; load source address
    mov edi, dest    ; load destination address
    mov ecx, 20      ; initialize count register (80/4)
    rep movsd        ; perform transfer
    ret              ; return to caller
memcpyi80 endp
cannot be improved upon, in my opinion, unless it's possible to use a smaller number of larger operands. Naturally, the memory addresses need to be aligned properly. The rep movsd instruction is the only part of the code that does any work, automatically advancing the pointers and decrementing the count register until the operation is complete.
What you might try is to pass the count as a separate parameter and then split the data into as many parts as you have cores, and call the function with a separate thread for each part.
- Oops, the formatting of the assembly got nuked there... (Fred, Oct 23, 2011)
- Another thing: if you run on a 64-bit OS and use 64-bit assembly, you should also be able to use 64-bit operands, i.e. 8 bytes at a time instead of just 4 using 32-bit. (Fred, Oct 23, 2011)
- You want to multithread an 80-byte copy??? Even having a different thread do the whole copy would be a huge performance hit, because the cache line would end up in the "modified" state in the L1 cache of another core, and would have to be transferred back to the core running the main thread. Not to mention that there'd be no way to send a request to another thread in less time than it takes to just copy 80 bytes. rep movsd isn't terrible, but the microcode has high startup overhead. For short copies, fully-unrolled SSE / AVX is faster. (Peter Cordes, Sep 19, 2015)