Edit:
By adding the restrict keyword I was able to get my memcpy up to speed with the library implementation (and in this particular test, exceeding the library implementations speed). New results:
| Test case | mem_cpy |
mem_cpy_naive |
memcpy |
|---|---|---|---|
| big string (1000 bytes) | 2.584988s | 3.936075s | 3.952187s |
| small string (8 bytes) | 0.025931s | 0.051899s | 0.025807s |
Note: I tested also it as a part of a bigger implementation I had been working on. Previously I gained about 20% performance by swapping the libc memcpy in place of my own, now there was no difference.
Updated code:
static void
copy_words(void *restrict dst, const void *restrict src, size_t words)
{
const uint64_t *restrict src64;
uint64_t *restrict dst64;
uint64_t pages;
uint64_t offset;
pages = words / 8;
offset = words - pages * 8;
src64 = (const uint64_t *restrict)src;
dst64 = (uint64_t *restrict)dst;
while (pages--)
{
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
}
while (offset--)
*dst64++ = *src64++;
}
static void
copy_small(void *restrict dst, const void *restrict src, size_t size)
{
const uint64_t *restrict src64;
uint64_t *restrict dst64;
src64 = (const uint64_t *restrict)src;
dst64 = (uint64_t *restrict)dst;
*dst64 = *src64;
}
void
*mem_cpy(void *restrict dst, const void *restrict src, const size_t size)
{
const uint8_t *restrict src8;
uint8_t *restrict dst8;
size_t offset;
size_t words;
size_t aligned_size;
if (!src || !dst)
return (NULL);
if (size <= 8)
{
copy_small(dst, src, size);
return (dst);
}
words = size / 8;
aligned_size = words * 8;
offset = size - aligned_size;
copy_words(dst, src, words);
if (offset)
{
src8 = (const uint8_t *restrict)src;
src8 = &src8[aligned_size];
dst8 = (uint8_t *restrict)dst;
dst8 = &dst8[aligned_size];
while (offset--)
*dst8++ = *src8++;
}
return (dst);
}
As a practice in optimization I'm trying to get my memcpy re-creation as close in speed to the libc one as I can. I have used the following techniques to optimize my memcpy:
- Casting the data to as big a datatype as possible for copying.
- Unrolling the main loop 8 times.
- For data <= 8 bytes I bypass the main loop.
My results (I have added a naive 1 byte at a time memcpy for reference):
| Test case | mem_cpy |
mem_cpy_naive |
memcpy |
|---|---|---|---|
| big string (1000 bytes) | 12.452919s | 212.728906s | 0.935605s |
| small string (8 bytes) | 0.367271s | 1.413559s | 0.149886s |
I feel I have exhausted the "low hanging fruit" in terms of optimization. I understand that the libc function could be optimized on a level not accessible to me writing only C, but I wonder if there's still something to be done here or is the next step to write it in assembly. To give a bit more clarification as to my motive for this. I study programming in a school that has performance constrains on our projects, but as of now we are only able to use standard C, so I can't go optimizing on assembly level yet. We are also not allowed to use libc and have to create our own versions of the standard functions we want to use so making my memcpy as fast as possible helps me hitting the performance goals in my projects. And it's great for learning obviously. I welcome any ideas!
Here is the code including the tests, can be compiled as is:
#include <time.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
const size_t iters = 100000000;
//-----------------------------------------------------------------------------
// Optimized memcpy
//
static void copy_words(void *dst, const void *src, size_t words)
{
const uint64_t *src64;
uint64_t *dst64;
uint64_t pages;
uint64_t offset;
pages = words / 8;
offset = words - pages * 8;
src64 = (const uint64_t *)src;
dst64 = (uint64_t *)dst;
while (pages--)
{
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
}
while (offset--)
*dst64++ = *src64++;
}
static void copy_small(void *dst, const void *src, size_t size)
{
const uint64_t *src64;
uint64_t *dst64;
src64 = (const uint64_t *)src;
dst64 = (uint64_t *)dst;
*dst64 = *src64;
}
void *mem_cpy(void *dst, const void *src, const size_t size)
{
const uint8_t *src8;
uint8_t *dst8;
size_t offset;
size_t words;
size_t aligned_size;
if (!src || !dst)
return (NULL);
if (size <= 8)
{
copy_small(dst, src, size);
return (dst);
}
words = size / 8;
aligned_size = words * 8;
offset = size - aligned_size;
copy_words(dst, src, words);
if (offset)
{
src8 = (const uint8_t *)src;
src8 = &src8[aligned_size];
dst8 = (uint8_t *)dst;
dst8 = &dst8[aligned_size];
while (offset--)
*dst8++ = *src8++;
}
return (dst);
}
//-----------------------------------------------------------------------------
// Naive memcpy
//
void *mem_cpy_naive(void *dst, const void *src, size_t n)
{
const uint8_t *src8;
uint8_t *dst8;
if (src == NULL)
return (NULL);
src8 = (const uint8_t *)src;
dst8 = (uint8_t *)dst;
while (n--)
*dst8++ = *src8++;
return (dst);
}
//-----------------------------------------------------------------------------
// Tests
//
int test(int (*f)(), char *test_name)
{
clock_t begin = clock();
f();
clock_t end = clock();
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("%s: %f\n", test_name, time_spent);
return (1);
}
char *big_data()
{
char *out;
size_t i;
out = (char *)malloc(sizeof(char) * 1000);
i = 0;
while (i < 1000)
{
out[i] = 'a';
i++;
}
return (out);
}
int test1()
{
char *src;
char *dst;
size_t i;
src = big_data();
dst = (char *)malloc(sizeof(char) * 1000);
i = 0;
while (i < iters)
{
mem_cpy(dst, src, 1000);
i++;
}
return (1);
}
int test2()
{
char *src;
char *dst;
size_t i;
src = big_data();
dst = (char *)malloc(sizeof(char) * 1000);
i = 0;
while (i < iters)
{
mem_cpy_naive(dst, src, 1000);
i++;
}
return (1);
}
int test3()
{
char *src;
char *dst;
size_t i;
src = big_data();
dst = (char *)malloc(sizeof(char) * 1000);
i = 0;
while (i < iters)
{
memcpy(dst, src, 1000);
i++;
}
return (1);
}
int test4()
{
char *src;
char *dst;
size_t i;
src = "12345678";
dst = (char *)malloc(sizeof(char) * 8);
i = 0;
while (i < iters)
{
mem_cpy(dst, src, 8);
i++;
}
return (1);
}
int test5()
{
char *src;
char *dst;
size_t i;
src = "12345678";
dst = (char *)malloc(sizeof(char) * 8);
i = 0;
while (i < iters)
{
mem_cpy_naive(dst, src, 8);
i++;
}
return (1);
}
int test6()
{
char *src;
char *dst;
size_t i;
src = "12345678";
dst = (char *)malloc(sizeof(char) * 8);
i = 0;
while (i < iters)
{
memcpy(dst, src, 8);
i++;
}
return (1);
}
int main(void)
{
test(test1, "User memcpy (big string)");
test(test2, "User memcpy naive (big string)");
test(test3, "Libc memcpy (big string)");
test(test4, "User memcpy");
test(test5, "USer memcpy naive");
test(test6, "Libc memcpy");
}
I won't paste the assembly, since I think it's more convenient to just put a link to compiler explorer:
4 Answers 4
Fix buffer overrun in copy_small
As you've currently written it, and as Toby previously pointed out, copy_small always writes 8 bytes to dest, even when size < 8. This is a major memory safety bug as it writes past the end of the dest buffer.
void
copy_small(void *restrict dst, const void *restrict src, size_t size)
{
const uint64_t *restrict src64;
uint64_t *restrict dst64;
src64 = (const uint64_t *restrict)src;
dst64 = (uint64_t *restrict)dst;
*dst64 = *src64; // !DANGER WILL ROBINSON!
}
Here is how I would write copy_small. This is safe and, I believe, optimal.
void copy_small(uint8_t *restrict dst, const uint8_t *restrict src, size_t size)
{
if (size & 8) {
*(uint64_t *restrict)dst = *(const uint64_t *restrict)src;
return;
}
if (size & 4) {
*(uint32_t *restrict)dst = *(const uint32_t *restrict)src;
dst += 4;
src += 4;
}
if (size & 2) {
*(uint16_t *restrict)dst = *(const uint16_t *restrict)src;
dst += 2;
src += 2;
}
if (size & 1)
*dst = *src;
}
Don't time malloc in your tests
As Toby pointed out, you need to rewrite your tests so that any memory allocations are completed before the timer starts, otherwise you're contaminating your data by measuring malloc in addition to the copy routines.
Qualify pointer arguments with restrict
As I said in the comments, your original code was missing the restrict qualifier (as with libc memcpy) on pointer arguments to mem_cpy and friends. This was the most significant missed optimization opportunity in your code, and as you say this change led to a significant speedup.
For "fairness," if you haven't done so already, add restrict to the pointer arguments of mem_cpy_naive. Note that you will need to compile with the option -fno-tree-loop-distribute-patterns to prevent GCC from optimizing mem_cpy_naive to a call to libc memcpy.
Micro-optimizations
- You declared
copy_wordsandcopy_smallwithstaticlinkage, presumably because you want them to be inlined, but in that case you should also declare theminline(i.e.static inline). Contrary to popular myth, theinlinespecifier does make the compiler significantly more likely to inline the function. - Passing a null pointer to
memcpyis undefined behavior, meaning you don't have to handle it gracefully. So, this /if (!src || !dst) return (NULL)/ is unnecessary. I'd replace it with an assertion /assert(src && dst)/ and compile with-DNDEBUGfor the benchmarks. You can also declare your functions with__attribute__((nonnull)), which tells GCC to 1) raise a warning if it detects a null pointer being passed to the function and 2) optimize under the assumption that the pointer arguments are never null. - Divisions and multiplications by powers of 2 are equivalent to right or left bit shifts by that power (which are faster). So all the instances of
x / 8in your code can be replaced withx >> 3and all thex * 8can be replaced byx << 3. The compiler is probably smart enough to do this itself, but you might as well make it explicit. - The
if (offset)branch is unnecessary, and the loop it contains can be replaced with a call tocopy_small. And you can make this change while still only callingcopy_smallonce inmem_cpy. Do you see how?
Style suggestions
copy_wordsandcopy_smalldon't have to follow thememcpyAPI exactly. It's a lot less verbose if theirdestandsrcarguments are respectivelyuint64_t*anduint8_t*instead ofvoid*.- Declaring a bunch of uninitialized variables at the top of the function was necessary in ANSI C, but in modern C it's bad style and potentially dangerous if you forget to initialize one of them. As much as possible, variables should both be declared (as
constwhen they aren't modified) and initialized directly before they're used. - As Toby said, the unnecessary parentheses around return values are confusing to the reader.
- As Toby also said, the use of the term
pagesfor something other than a page is confusing. I would replace that withchunksor similar.
Disagreements with Toby
- I wouldn't worry about exotic/nonexistent platforms where
CHAR_BIT != 8oruint64_tisn't supported. If you were actually implementingmemcpyfor GCC, you might have to worry about this. But otherwise, no. - When I write C code, I try to make it compatible with C++ if possible. So, since C++ requires it, I'm in favor of casting calls to
mallocto the type of pointer they're being assigned to.
Your code, as I would write it:
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#if !defined(__GNUC__) && !defined(__attribute__) // no GCC attribute syntax
#define __attribute__(X)
#endif
#ifdef __cplusplus // C++
extern "C" {
#if defined(__GNUC__) || defined(_MSC_VER) || defined(__restrict)
#define restrict __restrict
#elif !defined(restrict) // restrict or __restrict not supported in C++
#define restrict
#endif
#endif
static inline __attribute__((nonnull))
void copy_small(uint8_t *restrict dst, const uint8_t *restrict src, size_t size)
{
if (size >= 8) {
*(uint64_t *restrict)dst = *(const uint64_t *restrict)src;
return;
}
if (size >= 4) {
*(uint32_t *restrict)dst = *(const uint32_t *restrict)src;
dst += 4;
src += 4;
}
if (size & 2) {
*(uint16_t *restrict)dst = *(const uint16_t *restrict)src;
dst += 2;
src += 2;
}
if (size & 1)
*dst = *src;
}
static inline __attribute__((nonnull))
void copy64(uint64_t *restrict dst, const uint64_t *restrict src, size_t n) {
size_t chunks = n >> 3;
size_t offset = n - (chunks << 3);
while (chunks--) {
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
}
while (offset--)
*dst++ = *src++;
}
__attribute__((nonnull))
void *mem_cpy(void *restrict dst, const void *restrict src, size_t size) {
assert(dst && src);
uint8_t *dst8 = (uint8_t*)dst;
const uint8_t *src8 = (const uint8_t*)src;
if (size > 8) {
const size_t qwords = size >> 3;
copy64((uint64_t*)dst, (const uint64_t*)src, qwords);
const size_t aligned_size = qwords << 3;
size -= aligned_size;
dst8 += aligned_size;
src8 += aligned_size;
}
copy_small(dst8, src8, size);
return dst;
}
/* GCC optimizes this to a call to libc memcpy unless compiled with
* -fno-tree-loop-distribute-patterns
*/
__attribute__((nonnull))
void *mem_cpy_naive(void *restrict dst, const void *restrict src, size_t size) {
assert(dst && src);
uint8_t *restrict dst8 = (uint8_t*)dst;
const uint8_t *restrict src8 = (const uint8_t*)src;
while (size--)
*dst8++ = *src8++;
return dst;
}
#ifdef __cplusplus
}
#endif
-
2\$\begingroup\$ You shouldn't have to cast pointers in C++ either, since you should use
newthere, or perhaps astd::vector, instead ofmalloc(). If it's about compiling with a C++ compiler, then that shouldn't work here becauserestrictis not a valid C++ keyword. \$\endgroup\$G. Sliepen– G. Sliepen2021年04月21日 18:39:56 +00:00Commented Apr 21, 2021 at 18:39 -
1\$\begingroup\$ Did you look at the code I wrote at the bottom? I addressed the
restrictissue (not difficult). \$\endgroup\$Ray Hamel– Ray Hamel2021年04月21日 19:43:22 +00:00Commented Apr 21, 2021 at 19:43 -
1\$\begingroup\$ The X32 ABI isn't really maintained, is likely to be deprecated in the near future and I don't think any distros support it. But, maybe it would be better to go with OP's idea and just use
uint64_tto copy with, since it should still be close to optimal even on most platforms with a native word size less than 64 bits. \$\endgroup\$Ray Hamel– Ray Hamel2021年04月21日 20:06:08 +00:00Commented Apr 21, 2021 at 20:06 -
1\$\begingroup\$ FWIW I dropped the
size_tidea, if nothing else it makes my code easier to read. \$\endgroup\$Ray Hamel– Ray Hamel2021年04月21日 21:36:26 +00:00Commented Apr 21, 2021 at 21:36 -
\$\begingroup\$ Would this be a cheapo lazy solution or could it work if I just bitmasked the < 8 byte chunk. Something like
mask = (1 << bytes) - 1; *dst = *src & mask?? \$\endgroup\$Julius– Julius2021年04月22日 15:48:16 +00:00Commented Apr 22, 2021 at 15:48
const uint64_t *restrict src64;
Portability problem - uint64_t is only defined if the platform has a type of exactly 64 bits. Consider using uint_fast64_t (or perhaps uintmax_t) instead.
pages = words / 8; offset = words - pages * 8;
Another portability problem - assumes CHAR_BIT is 64/8 = 8. Use sizeof *src64 instead.
The terminology is strange and confusing. Why are we calling bytes "words" and words "pages"?
src64 = (const uint64_t *restrict)src; dst64 = (uint64_t *restrict)dst;
I don't see where this code guarantees that src64 and dst64 are suitably aligned for read and write as 64-bit quantities. This means that the code will work on some architectures (possibly with performance problems) and fail completely on others (perhaps raising SIGBUS if you're lucky).
The explicit casts are unnecessary given that src and dst are both void*, which converts implicitly to any object pointer. And there's no need to declare separately from initialising.
I don't see how copy_small honours its size argument. It always seems to copy a full uint64_t, potentially clobbering more than requested (and possibly writing outside its legitimate bounds).
The tests have problems, too. Let's look at test1() as a representative example:
int test1() { char *src; char *dst; size_t i;
Again, declare where we initialise.
src = big_data(); dst = (char *)malloc(sizeof(char) * 1000);
There's no need to write an explicit cast to convert from void*. The multiplication by 1 has no effect.
Why do we continue if allocation gives us back a null pointer? That's very slapdash.
More importantly, why are we including malloc() call in our timings? That's going to dominate the result.
i = 0; while (i < iters) { mem_cpy(dst, src, 1000); i++; }
That would be more readable as an ordinary for loop.
return (1); }
Why the pointless parens around the return value? return 1; is more readable.
-
\$\begingroup\$ Can you elabroate on how
uint_fast64_twould help? As for the compatibility, you are very correct. Just didn't get around to it yet. \$\endgroup\$Julius– Julius2021年04月21日 09:18:18 +00:00Commented Apr 21, 2021 at 9:18 -
\$\begingroup\$
uint_fast64_tis defined even if there's not a type of exactly 64 bits. So it is more portable. Perhapsuintmax_twould be an even better choice? \$\endgroup\$Toby Speight– Toby Speight2021年04月21日 10:19:25 +00:00Commented Apr 21, 2021 at 10:19 -
\$\begingroup\$ As for the copy small, I was thinking about that. Having all different types and checking it size is 6-8, 4-6 etc. seemed a bit overkill. As far as I understand malloc guarantees 64bit offset anyways. I played with the idea of still copying the data with a bit-mask though, to make extra sure only relevant data is copied, although I don't see a practical case in which that could become an issue. \$\endgroup\$Julius– Julius2021年04月21日 10:52:56 +00:00Commented Apr 21, 2021 at 10:52
-
\$\begingroup\$ What's wrong with a simple char-by-char approach for small copies? That's what most
memcpy()implementations do. Yes,malloc()should return memory suitably aligned for any type, but that's no help when you need to copy (e.g.) a substring out of somewhere in the middle of a string, or from/to automatic ("stack") storage. \$\endgroup\$Toby Speight– Toby Speight2021年04月21日 12:09:36 +00:00Commented Apr 21, 2021 at 12:09 -
\$\begingroup\$ You are probably right that I should do a byte by byte. I was thinking I could squeeze a cycle or two there with using the wider type, but I guess compiler would optimize it away anyways. Need to do a couple of tests for that. \$\endgroup\$Julius– Julius2021年04月21日 14:00:26 +00:00Commented Apr 21, 2021 at 14:00
Updated results, tests and code
As per suggestions in this thread I have updated my code following several suggestions from the answers. If I have incporporated your code in the result, I want to note that I need to follow certain strict formatting rules so sorry if I've changed the formatting in places.
Test scenario
I have created the following test scenarion:
- Copy a constant amount of data, but changing the amount of bytes fed to memcpy each iteration and thus dividing the amount of iterations.
- Measure maximum throughput in MB/s to find a sweet spot. This could be considered the maximum theoretcial speed for the copy under optimal situations.
Test hardware:
AMD Ryzen 3 3100 4-Core Processor
cpu MHz : 2199.410
cache size : 512 KB
bogomips : 7199.89
TLB size : 3072 4K pages
clflush size : 64
cache_alignment : 64
address sizes : 43 bits physical, 48 bits virtual
Test (32 samples):
total bytes per iteration = 4294MB
| bytes / string | lib (s) | usr (s) | lib (MB/s) | usr (MB/s) | lib / usr |
|---|---|---|---|---|---|
| 1 | 19.419099 | 18.627787 | 221.17 | 230.57 | 1.042480 |
| 2 | 10.235309 | 9.164130 | 419.62 | 468.67 | 1.116888 |
| 4 | 5.488895 | 6.074278 | 782.48 | 707.07 | 0.903629 |
| 8 | 2.734126 | 2.450595 | 1570.87 | 1752.62 | 1.115699 |
| 16 | 1.358260 | 2.307924 | 3162.11 | 1860.97 | 0.588520 |
| 32 | 0.533698 | 1.219608 | 8047.56 | 3521.60 | 0.437598 |
| 64 | 0.269884 | 0.629097 | 15914.12 | 6827.19 | 0.429002 |
| 128 | 0.231149 | 0.376591 | 18580.95 | 11404.86 | 0.613793 |
| 256 | 0.101361 | 0.209152 | 42372.98 | 20535.15 | 0.484628 |
| 512 | 0.098064 | 0.146679 | 43797.59 | 29281.41 | 0.668562 |
| 1024 | 0.091359 | 0.122013 | 47011.98 | 35200.90 | 0.748764 |
| 2048 | 0.088001 | 0.124751 | 48805.89 | 34428.32 | 0.705413 |
| 4096 | 0.092047 | 0.117869 | 46660.59 | 36438.48 | 0.780926 |
| 8192 | 0.083935 | 0.121658 | 51170.16 | 35303.62 | 0.689926 |
| 16384 | 0.089955 | 0.132727 | 47745.73 | 32359.41 | 0.677745 |
| 32768 | 0.122094 | 0.177560 | 35177.55 | 24188.82 | 0.687621 |
| 65536 | 0.132070 | 0.175046 | 32520.39 | 24536.22 | 0.754487 |
| 131072 | 0.132693 | 0.155997 | 32367.70 | 27532.37 | 0.850613 |
| 262144 | 0.158665 | 0.172354 | 27069.41 | 24919.45 | 0.920576 |
| 524288 | 0.192556 | 0.197228 | 22305.03 | 21776.66 | 0.976312 |
| 1048576 | 0.195232 | 0.200701 | 21999.30 | 21399.83 | 0.972751 |
| 2097152 | 0.199710 | 0.206325 | 21506.02 | 20816.51 | 0.967939 |
| 4194304 | 0.916251 | 1.116313 | 4687.54 | 3847.46 | 0.820783 |
| 8388608 | 1.867847 | 1.933270 | 2299.42 | 2221.61 | 0.966159 |
| 16777216 | 1.920515 | 2.298106 | 2236.36 | 1868.92 | 0.835695 |
| 33554432 | 1.905867 | 2.115908 | 2253.55 | 2029.85 | 0.900732 |
| 67108864 | 1.533392 | 1.837277 | 2800.96 | 2337.68 | 0.834600 |
| 134217728 | 1.530375 | 1.882125 | 2806.48 | 2281.98 | 0.813110 |
| 268435456 | 1.405044 | 1.805629 | 3056.82 | 2378.65 | 0.778147 |
| 536870912 | 1.485964 | 1.877592 | 2890.36 | 2287.49 | 0.791420 |
| 1073741824 | 1.434760 | 1.869482 | 2993.51 | 2297.41 | 0.767464 |
| 2147483648 | 1.442237 | 1.838574 | 2977.99 | 2336.03 | 0.784432 |
Updated code with tests
Improvements on the code since original:
- Usage of restrict keyword as suggested by @Ray Hamel
- Correction of the buffer overrun when copying a small string suggested by @Ray Hamel
- Corrections to tests such as not allocating memory inside the timed portion of the tests as suggested by @Toby Spleight and several others.
- For my use case I can expwect that the results ar not NULL so I have
incorporated
__attribute__((nonull))as suggested by @Ray Hamel.
#include <time.h>
#include <math.h>
#include <stdint.h>
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
size_t iters = 0; // 2^36
//-----------------------------------------------------------------------------
// Optimized memcpy
//
static inline void __attribute__((nonnull))
copy_small(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
{
if (n >= 8)
{
*(uint64_t *restrict)dst = *(const uint64_t *restrict)src;
return;
}
if (n >= 4)
{
*(uint32_t *restrict)dst = *(const uint32_t *restrict)src;
dst += 4;
src += 4;
}
if (n & 2)
{
*(uint16_t *restrict)dst = *(const uint16_t *restrict)src;
dst += 2;
src += 2;
}
if (n & 1)
*dst = *src;
}
static inline void __attribute__((nonnull))
copy512(uint64_t *restrict dst, const uint64_t *restrict src, size_t n)
{
size_t chunks;
size_t offset;
chunks = n >> 3;
offset = n - (chunks << 3);
while (chunks--)
{
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
}
while (offset--)
*dst++ = *src++;
}
void __attribute__((nonnull))
*mem_cpy(void *restrict dst, const void *restrict src, size_t n)
{
uint8_t *dst8;
const uint8_t *src8;
size_t qwords;
size_t aligned_size;
dst8 = (uint8_t*)dst;
src8 = (const uint8_t*)src;
qwords = n >> 3;
if (n > 8)
{
copy512((uint64_t*)dst, (const uint64_t*)src, qwords);
return (dst);
}
aligned_size = qwords << 3;
n -= aligned_size;
dst8 += aligned_size;
src8 += aligned_size;
copy_small(dst8, src8, n);
return (dst);
}
//-----------------------------------------------------------------------------
// Tests
//
double test(int (*f)(char *, char *, size_t),
char *test_data,
char *test_dst,
char *test_name,
size_t i)
{
clock_t begin = clock();
f(test_dst, test_data, i);
clock_t end = clock();
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
return (time_spent);
}
char *make_string(size_t size)
{
char *out;
size_t i;
out = (char *)malloc(sizeof(char) * size);
i = 0;
while (i < size)
{
out[i] = i % 128;
i++;
}
return (out);
}
int test_usr(char *dst, char *src, size_t bytes)
{
size_t i;
i = 0;
while (i < iters)
{
mem_cpy(dst, src, bytes);
assert(memcmp(dst, src, bytes) == 0);
i++;
}
return (1);
}
int test_lib(char *dst, char *src, size_t bytes)
{
size_t i;
i = 0;
while (i < iters)
{
memcpy(dst, src, bytes);
assert(memcmp(dst, src, bytes) == 0);
i++;
}
return (1);
}
int test_different_sizes()
{
size_t power;
size_t i;
size_t size;
double lib;
double usr;
double lmbs;
double umbs;
char *src;
char *dst;
i = 0;
power = 32;
iters = pow(2, power);
printf("total bytes per iteration = %zuMB\n", iters / (size_t)pow(10, 6));
printf("| bytes / string | lib (s) | usr (s) | lib (MB/s) | usr (MB/s) | lib / usr |\n");
printf("|---|---|---|---|---|\n");
while (i < power)
{
size = pow(2, i);
src = make_string(size);
dst = make_string(size);
lib = test(test_lib, src, dst, "LIB", size);
usr = test(test_usr, src, dst, "USR", size);
lmbs = ((size * iters) / pow(10, 6)) / lib;
umbs = ((size * iters) / pow(10, 6)) / usr;
printf("|%-10zu|%-10lf|%-10lf|%-10.2lf|%-10.2lf|%-10lf|\n",
size, lib, usr, lmbs, umbs, lib / usr);
iters /= 2;
free(src);
free(dst);
i++;
}
return (1);
}
int main(void)
{
test_different_sizes();
}
If you try this in MacOS, you’ll have an extreme fight on your hands. MacOS will at boot time install code optimised for your particular processor in a fixed place, this is done for memcpy, memmove , memset plus memset for two, four or eight byte values, and for some atomic operations.
The memcpy on my current computer uses vector instructions, uses caching instructions not available in C, and all the tricks in the book. You basically have no chance beating it. And if you beat it on one computer, it won’t work on another.
As far as your code is concerned: You should try to align the pointers first.
If count >= 1 and dst is not two-byte aligned -> copy 1 byte.
If count >= 2 and dst is not four-byte aligned -> copy 2 byte.
If count >= 4 and dst is not eight-byte aligned -> copy 4 byte.
Then you copy eight bytes at a time, then another 4 if needed, another 2, and another byte.
-
\$\begingroup\$ I tried it on a MacBook Pro i9 2019. My code was still faster although not by as big of a margin as in the OP. Mac OS uses clang though even if you type GCC so I had to compile on clang. Surely my compiler can vectorize and use all tricks in the book as well since I'm using -O3, restrict etc. \$\endgroup\$Julius– Julius2021年04月22日 15:40:56 +00:00Commented Apr 22, 2021 at 15:40
You must log in to answer this question.
Explore related questions
See similar questions with these tags.
destandsrcwith the qualifierrestrict( en.cppreference.com/w/c/language/restrict ), as in libc. Like so:void *memcpy(void *restrict dest, const void *restrict src, size_t size)\$\endgroup\$mem_cpywith__attribute__((noinline))(GCC/clang) /__declspec(noinline)(MSVC)? \$\endgroup\$