#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>

#ifdef __clang__
#define __m128i_u __m128i
#define __m256i_u __m256i
#define __m512i_u __m512i
#endif

#ifdef __AVX512F__
#define BYTE_ALIGNMENT 0x3F // For 64-byte alignment
#elif __AVX__
#define BYTE_ALIGNMENT 0x1F // For 32-byte alignment
#else
#define BYTE_ALIGNMENT 0x0F // For 16-byte alignment
#endif

//
// USAGE INFORMATION:
//
// The "len" argument is "# of x bytes to move," e.g. memmove_512bit_u/a needs
// to know "how many multiples of 512 bits (64 bytes) to move." All functions
// with len follow the same pattern, e.g. memmove_512bit_512B_u/a needs to know
// how many multiples of 512 bytes to move, so a len of 4 tells it to move 2 kB.
//
// The "numbytes" argument for functions that use it is just the total
// number of bytes to move.
//
// Some microarchitectural information:
//
// Sources:
// https://www.agner.org/optimize/
// https://software.intel.com/en-us/articles/intel-sdm
// http://blog.stuffedcow.net/2014/01/x86-memory-disambiguation/
//
// It looks as though Haswell and up can do 2 simultaneous aligned loads or 1
// unaligned load in 1 cycle. Alignment means the data is at an address that is
// a multiple of the cache line size, and the CPU most easily loads one cache
// line at a time. All AVX-supporting CPUs have a 64-byte cache line as of Q4 2018.
// The bottleneck here is stores: only 1 store per cycle can be done (there is
// only 1 store port despite 2 load ports). Unaligned loads/stores that cross
// cache line boundaries typically incur relatively significant cycle penalties,
// though Haswell and up fixed that specifically for unaligned loads.
//
// Unaligned loads on Haswell require both load ports, but, since there is only
// one store port, the store port has to do double duty for stores that cross
// cache line boundaries. So stores should be contained within cache line sizes
// for best performance. For memmove, this also means there's no point in doing
// 2 separate aligned loads simultaneously if only one can be written at a time.
//
// BUT it turns out that's not the whole story. We can do 2 aligned loads to
// ensure that no cycle is wasted, i.e. instead of this (comma = simultaneously):
// load 1 -> store 1, load 2 -> store 2, load 3 -> store 3, load 4 -> store 4, etc.
// we can do this with aligned AVX2 loads:
// load 1, load 2 -> store 1, load 3, load 4 -> store 2, load 5, load 6 -> store 3, etc.
// And this is just per core.
//
// For pure memmove, this provides no real improvement, but loops with many
// iterations that require loading two values, doing math on them, and storing a
// single result can see significant throughput gains. Sandy Bridge could perform
// similarly, but in 2 cycles instead of Haswell's 1, and only for the fewer
// 256-bit AVX calculations it had (Haswell can do any size, AVX2 or otherwise).
//
// Skylake-X, with AVX-512, extends Haswell's behavior to include 512-bit values.
//
// If an architecture ever adds 2 store ports, the AVX/(VEX-encoded) SSE
// functions in this file will need to be modified to do 2 loads and 2 stores.
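//
// Worked example of the "len" convention (an illustrative sketch, not part of
// the API below): to move numbytes total bytes with a function that handles
// N-byte chunks, pass len = numbytes / N. A hypothetical byte-count wrapper
// over the 64-bytes-per-iteration function defined later in this file would
// look like:
//
//   void * memmove_512bit_u_numbytes(void *dest, const void *src, size_t numbytes)
//   {
//     return memmove_512bit_u(dest, src, numbytes >> 6); // numbytes / 64
//   }
//
// This assumes numbytes is an exact multiple of 64; a real wrapper would need
// to hand any remainder to one of the narrower functions.
//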
//
//-----------------------------------------------------------------------------
// Individual Functions:
//-----------------------------------------------------------------------------

// 16-bit (2 bytes at a time)
// Len is (# of total bytes/2), so it's "# of 16-bits"
void * memmove_16bit(void *dest, const void *src, size_t len)
{
  const uint16_t* s = (uint16_t*)src;
  uint16_t* d = (uint16_t*)dest;
  const uint16_t *nexts = s + len;
  uint16_t *nextd = d + len;

  if (d < s) // Destination is below the source: copy forward
  {
    while (d != nextd)
    {
      *d++ = *s++;
    }
  }
  else // Destination may overlap the end of the source: copy backward
  {
    while (nextd != d)
    {
      *--nextd = *--nexts;
    }
  }
  return dest;
}

// 32-bit (4 bytes at a time - 1 pixel in a 32-bit linear frame buffer)
// Len is (# of total bytes/4), so it's "# of 32-bits"
void * memmove_32bit(void *dest, const void *src, size_t len)
{
  const uint32_t* s = (uint32_t*)src;
  uint32_t* d = (uint32_t*)dest;
  const uint32_t *nexts = s + len;
  uint32_t *nextd = d + len;

  if (d < s)
  {
    while (d != nextd)
    {
      *d++ = *s++;
    }
  }
  else
  {
    while (nextd != d)
    {
      *--nextd = *--nexts;
    }
  }
  return dest;
}

// 64-bit (8 bytes at a time - 2 pixels in a 32-bit linear frame buffer)
// Len is (# of total bytes/8), so it's "# of 64-bits"
void * memmove_64bit(void *dest, const void *src, size_t len)
{
  const uint64_t* s = (uint64_t*)src;
  uint64_t* d = (uint64_t*)dest;
  const uint64_t *nexts = s + len;
  uint64_t *nextd = d + len;

  if (d < s)
  {
    while (d != nextd)
    {
      *d++ = *s++;
    }
  }
  else
  {
    while (nextd != d)
    {
      *--nextd = *--nexts;
    }
  }
  return dest;
}

//-----------------------------------------------------------------------------
// SSE2 Unaligned:
//-----------------------------------------------------------------------------

// SSE2 (128-bit, 16 bytes at a time - 4 pixels in a 32-bit linear frame buffer)
// Len is (# of total bytes/16), so it's "# of 128-bits"
void * memmove_128bit_u(void *dest, const void *src, size_t len)
{
  const __m128i_u* s = (__m128i_u*)src;
  __m128i_u* d = (__m128i_u*)dest;
  const __m128i_u *nexts = s + len;
  __m128i_u *nextd = d + len;

  if (d < s)
  {
    while (d != nextd)
    {
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++));
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts));
    }
  }
  return dest;
}

// 32 bytes at a time
void * memmove_128bit_32B_u(void *dest, const void *src, size_t len)
{
  const __m128i_u* s = (__m128i_u*)src;
  __m128i_u* d = (__m128i_u*)dest;
  const __m128i_u *nexts = s + (len << 1);
  __m128i_u *nextd = d + (len << 1);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 1
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 2
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 1
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 2
    }
  }
  return dest;
}

// 64 bytes at a time
void * memmove_128bit_64B_u(void *dest, const void *src, size_t len)
{
  const __m128i_u* s = (__m128i_u*)src;
  __m128i_u* d = (__m128i_u*)dest;
  const __m128i_u *nexts = s + (len << 2);
  __m128i_u *nextd = d + (len << 2);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 1
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 2
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 3
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 4
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 1
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 2
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 3
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 4
    }
  }
  return dest;
}

// 128 bytes at a time
void * memmove_128bit_128B_u(void *dest, const void *src, size_t len)
{
  const __m128i_u* s = (__m128i_u*)src;
  __m128i_u* d = (__m128i_u*)dest;
  const __m128i_u *nexts = s + (len << 3);
  __m128i_u *nextd = d + (len << 3);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 1
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 2
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 3
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 4
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 5
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 6
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 7
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 8
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 1
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 2
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 3
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 4
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 5
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 6
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 7
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 8
    }
  }
  return dest;
}

// For fun: 1 load->store for every xmm register
// 256 bytes
void * memmove_128bit_256B_u(void *dest, const void *src, size_t len)
{
  const __m128i_u* s = (__m128i_u*)src;
  __m128i_u* d = (__m128i_u*)dest;
  const __m128i_u *nexts = s + (len << 4);
  __m128i_u *nextd = d + (len << 4);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 1
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 2
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 3
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 4
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 5
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 6
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 7
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 8
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 9
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 10
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 11
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 12
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 13
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 14
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 15
      _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 16
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 1
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 2
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 3
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 4
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 5
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 6
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 7
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 8
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 9
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 10
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 11
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 12
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 13
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 14
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 15
      _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 16
    }
  }
  return dest;
}

//-----------------------------------------------------------------------------
// AVX+ Unaligned:
//-----------------------------------------------------------------------------

// AVX (256-bit, 32 bytes at a time - 8 pixels in a 32-bit linear frame buffer)
// Len is (# of total bytes/32), so it's "# of 256-bits"
// Sandy Bridge and Ryzen and up; Haswell and up for better performance
#ifdef __AVX__
void * memmove_256bit_u(void *dest, const void *src, size_t len)
{
  const __m256i_u* s = (__m256i_u*)src;
  __m256i_u* d = (__m256i_u*)dest;
  const __m256i_u *nexts = s + len;
  __m256i_u *nextd = d + len;

  if (d < s)
  {
    while (d != nextd)
    {
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++));
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts));
    }
  }
  return dest;
}

// 64 bytes at a time
void * memmove_256bit_64B_u(void *dest, const void *src, size_t len)
{
  const __m256i_u* s = (__m256i_u*)src;
  __m256i_u* d = (__m256i_u*)dest;
  const __m256i_u *nexts = s + (len << 1);
  __m256i_u *nextd = d + (len << 1);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 1
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 2
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 1
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 2
    }
  }
  return dest;
}

// 128 bytes at a time
void * memmove_256bit_128B_u(void *dest, const void *src, size_t len)
{
  const __m256i_u* s = (__m256i_u*)src;
  __m256i_u* d = (__m256i_u*)dest;
  const __m256i_u *nexts = s + (len << 2);
  __m256i_u *nextd = d + (len << 2);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 1
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 2
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 3
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 4
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 1
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 2
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 3
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 4
    }
  }
  return dest;
}

// 256 bytes at a time
void * memmove_256bit_256B_u(void *dest, const void *src, size_t len)
{
  const __m256i_u* s = (__m256i_u*)src;
  __m256i_u* d = (__m256i_u*)dest;
  const __m256i_u *nexts = s + (len << 3);
  __m256i_u *nextd = d + (len << 3);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 1
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 2
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 3
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 4
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 5
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 6
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 7
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 8
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 1
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 2
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 3
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 4
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 5
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 6
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 7
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 8
    }
  }
  return dest;
}

// For fun:
// 512 bytes at a time, one load->store for every ymm register (there are 16)
void * memmove_256bit_512B_u(void *dest, const void *src, size_t len)
{
  const __m256i_u* s = (__m256i_u*)src;
  __m256i_u* d = (__m256i_u*)dest;
  const __m256i_u *nexts = s + (len << 4);
  __m256i_u *nextd = d + (len << 4);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 1
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 2
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 3
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 4
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 5
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 6
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 7
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 8
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 9
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 10
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 11
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 12
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 13
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 14
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 15
      _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 16
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 1
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 2
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 3
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 4
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 5
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 6
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 7
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 8
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 9
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 10
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 11
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 12
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 13
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 14
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 15
      _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 16
    }
  }
  return dest;
}
#endif

// AVX-512 (512-bit, 64 bytes at a time - 16 pixels in a 32-bit linear frame buffer)
// Len is (# of total bytes/64), so it's "# of 512-bits"
// Requires AVX512F
#ifdef __AVX512F__
void * memmove_512bit_u(void *dest, const void *src, size_t len)
{
  const __m512i_u* s = (__m512i_u*)src;
  __m512i_u* d = (__m512i_u*)dest;
  const __m512i_u *nexts = s + len;
  __m512i_u *nextd = d + len;

  if (d < s)
  {
    while (d != nextd)
    {
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++));
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts));
    }
  }
  return dest;
}

// 128 bytes at a time
void * memmove_512bit_128B_u(void *dest, const void *src, size_t len)
{
  const __m512i_u* s = (__m512i_u*)src;
  __m512i_u* d = (__m512i_u*)dest;
  const __m512i_u *nexts = s + (len << 1);
  __m512i_u *nextd = d + (len << 1);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 1
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 2
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 1
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 2
    }
  }
  return dest;
}

// 256 bytes at a time
void * memmove_512bit_256B_u(void *dest, const void *src, size_t len)
{
  const __m512i_u* s = (__m512i_u*)src;
  __m512i_u* d = (__m512i_u*)dest;
  const __m512i_u *nexts = s + (len << 2);
  __m512i_u *nextd = d + (len << 2);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 1
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 2
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 3
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 4
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 1
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 2
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 3
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 4
    }
  }
  return dest;
}

// 512 bytes (half a KB!!) at a time
void * memmove_512bit_512B_u(void *dest, const void *src, size_t len)
{
  const __m512i_u* s = (__m512i_u*)src;
  __m512i_u* d = (__m512i_u*)dest;
  const __m512i_u *nexts = s + (len << 3);
  __m512i_u *nextd = d + (len << 3);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 1
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 2
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 3
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 4
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 5
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 6
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 7
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 8
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 1
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 2
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 3
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 4
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 5
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 6
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 7
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 8
    }
  }
  return dest;
}

// Alright I'll admit I got a little carried away...
// 1024 bytes, or 1 kB
void * memmove_512bit_1kB_u(void *dest, const void *src, size_t len)
{
  const __m512i_u* s = (__m512i_u*)src;
  __m512i_u* d = (__m512i_u*)dest;
  const __m512i_u *nexts = s + (len << 4);
  __m512i_u *nextd = d + (len << 4);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 1
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 2
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 3
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 4
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 5
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 6
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 7
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 8
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 9
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 10
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 11
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 12
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 13
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 14
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 15
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 16
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 1
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 2
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 3
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 4
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 5
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 6
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 7
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 8
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 9
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 10
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 11
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 12
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 13
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 14
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 15
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 16
    }
  }
  return dest;
}

// 2048 bytes, or 2 kB
void * memmove_512bit_2kB_u(void *dest, const void *src, size_t len)
{
  const __m512i_u* s = (__m512i_u*)src;
  __m512i_u* d = (__m512i_u*)dest;
  const __m512i_u *nexts = s + (len << 5);
  __m512i_u *nextd = d + (len << 5);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 1
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 2
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 3
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 4
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 5
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 6
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 7
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 8
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 9
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 10
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 11
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 12
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 13
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 14
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 15
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 16
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 17
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 18
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 19
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 20
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 21
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 22
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 23
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 24
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 25
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 26
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 27
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 28
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 29
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 30
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 31
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 32
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 1
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 2
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 3
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 4
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 5
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 6
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 7
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 8
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 9
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 10
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 11
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 12
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 13
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 14
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 15
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 16
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 17
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 18
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 19
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 20
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 21
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 22
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 23
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 24
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 25
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 26
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 27
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 28
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 29
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 30
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 31
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 32
    }
  }
  return dest;
}

// Y'know what? Here's a whole page.
// 4096 bytes, or 4 kB
void * memmove_512bit_4kB_u(void *dest, const void *src, size_t len)
{
  const __m512i_u* s = (__m512i_u*)src;
  __m512i_u* d = (__m512i_u*)dest;
  const __m512i_u *nexts = s + (len << 6);
  __m512i_u *nextd = d + (len << 6);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 1
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 2
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 3
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 4
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 5
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 6
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 7
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 8
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 9
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 10
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 11
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 12
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 13
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 14
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 15
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 16
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 17
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 18
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 19
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 20
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 21
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 22
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 23
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 24
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 25
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 26
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 27
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 28
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 29
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 30
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 31
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 32
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 1
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 2
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 3
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 4
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 5
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 6
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 7
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 8
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 9
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 10
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 11
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 12
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 13
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 14
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 15
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 16
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 17
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 18
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 19
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 20
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 21
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 22
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 23
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 24
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 25
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 26
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 27
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 28
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 29
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 30
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 31
      _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 32
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 1
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 2
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 3
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 4
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 5
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 6
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 7
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 8
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 9
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 10
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 11
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 12
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 13
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 14
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 15
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 16
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 17
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 18
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 19
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 20
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 21
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 22
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 23
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 24
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 25
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 26
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 27
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 28
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 29
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 30
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 31
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 32
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 1
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 2
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 3
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 4
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 5
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 6
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 7
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 8
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 9
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 10
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 11
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 12
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 13
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 14
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 15
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 16
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 17
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 18
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 19
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 20
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 21
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 22
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 23
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 24
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 25
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 26
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 27
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 28
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 29
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 30
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 31
      _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 32
    }
  }
  return dest;
}
#endif

// AVX-1024 support pending existence of the standard. It would be able to fit
// an entire 4 kB page in its registers at one time. Imagine that!
// (AVX-512 maxes at 2 kB, which is why I only used numbers 1-32 above.)
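// Illustrative dispatch sketch (not part of the original function set): given
// a total byte count, pick the widest unaligned AVX-512 variant above whose
// block size evenly divides numbytes. The wrapper name and structure here are
// hypothetical; only the memmove_512bit_*_u functions it calls are defined above.
#ifdef __AVX512F__
static inline void * memmove_512bit_u_pick(void *dest, const void *src, size_t numbytes)
{
  if ((numbytes & 0xFFF) == 0)      // multiple of 4096 bytes
    return memmove_512bit_4kB_u(dest, src, numbytes >> 12);
  else if ((numbytes & 0x7FF) == 0) // multiple of 2048 bytes
    return memmove_512bit_2kB_u(dest, src, numbytes >> 11);
  else if ((numbytes & 0x3FF) == 0) // multiple of 1024 bytes
    return memmove_512bit_1kB_u(dest, src, numbytes >> 10);
  else if ((numbytes & 0x1FF) == 0) // multiple of 512 bytes
    return memmove_512bit_512B_u(dest, src, numbytes >> 9);
  else if ((numbytes & 0xFF) == 0)  // multiple of 256 bytes
    return memmove_512bit_256B_u(dest, src, numbytes >> 8);
  else if ((numbytes & 0x7F) == 0)  // multiple of 128 bytes
    return memmove_512bit_128B_u(dest, src, numbytes >> 7);
  else                              // assumes numbytes is a multiple of 64 bytes
    return memmove_512bit_u(dest, src, numbytes >> 6);
}
#endif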
//-----------------------------------------------------------------------------
// SSE2 Aligned:
//-----------------------------------------------------------------------------

// SSE2 (128-bit, 16 bytes at a time - 4 pixels in a 32-bit linear frame buffer)
// Len is (# of total bytes/16), so it's "# of 128-bits"
void * memmove_128bit_a(void *dest, const void *src, size_t len)
{
  const __m128i* s = (__m128i*)src;
  __m128i* d = (__m128i*)dest;
  const __m128i *nexts = s + len;
  __m128i *nextd = d + len;

  if (d < s)
  {
    while (d != nextd)
    {
      _mm_store_si128(d++, _mm_load_si128(s++));
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm_store_si128(--nextd, _mm_load_si128(--nexts));
    }
  }
  return dest;
}

// 32 bytes at a time
void * memmove_128bit_32B_a(void *dest, const void *src, size_t len)
{
  const __m128i* s = (__m128i*)src;
  __m128i* d = (__m128i*)dest;
  const __m128i *nexts = s + (len << 1);
  __m128i *nextd = d + (len << 1);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm_store_si128(d++, _mm_load_si128(s++)); // 1
      _mm_store_si128(d++, _mm_load_si128(s++)); // 2
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 1
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 2
    }
  }
  return dest;
}

// 64 bytes at a time
void * memmove_128bit_64B_a(void *dest, const void *src, size_t len)
{
  const __m128i* s = (__m128i*)src;
  __m128i* d = (__m128i*)dest;
  const __m128i *nexts = s + (len << 2);
  __m128i *nextd = d + (len << 2);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm_store_si128(d++, _mm_load_si128(s++)); // 1
      _mm_store_si128(d++, _mm_load_si128(s++)); // 2
      _mm_store_si128(d++, _mm_load_si128(s++)); // 3
      _mm_store_si128(d++, _mm_load_si128(s++)); // 4
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 1
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 2
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 3
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 4
    }
  }
  return dest;
}

// 128 bytes at a time
void * memmove_128bit_128B_a(void *dest, const void *src, size_t len)
{
  const __m128i* s = (__m128i*)src;
  __m128i* d = (__m128i*)dest;
  const __m128i *nexts = s + (len << 3);
  __m128i *nextd = d + (len << 3);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm_store_si128(d++, _mm_load_si128(s++)); // 1
      _mm_store_si128(d++, _mm_load_si128(s++)); // 2
      _mm_store_si128(d++, _mm_load_si128(s++)); // 3
      _mm_store_si128(d++, _mm_load_si128(s++)); // 4
      _mm_store_si128(d++, _mm_load_si128(s++)); // 5
      _mm_store_si128(d++, _mm_load_si128(s++)); // 6
      _mm_store_si128(d++, _mm_load_si128(s++)); // 7
      _mm_store_si128(d++, _mm_load_si128(s++)); // 8
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 1
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 2
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 3
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 4
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 5
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 6
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 7
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 8
    }
  }
  return dest;
}

// For fun: 1 load->store for every xmm register (there are 16)
// 256 bytes
void * memmove_128bit_256B_a(void *dest, const void *src, size_t len)
{
  const __m128i* s = (__m128i*)src;
  __m128i* d = (__m128i*)dest;
  const __m128i *nexts = s + (len << 4);
  __m128i *nextd = d + (len << 4);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm_store_si128(d++, _mm_load_si128(s++)); // 1
      _mm_store_si128(d++, _mm_load_si128(s++)); // 2
      _mm_store_si128(d++, _mm_load_si128(s++)); // 3
      _mm_store_si128(d++, _mm_load_si128(s++)); // 4
      _mm_store_si128(d++, _mm_load_si128(s++)); // 5
      _mm_store_si128(d++, _mm_load_si128(s++)); // 6
      _mm_store_si128(d++, _mm_load_si128(s++)); // 7
      _mm_store_si128(d++, _mm_load_si128(s++)); // 8
      _mm_store_si128(d++, _mm_load_si128(s++)); // 9
      _mm_store_si128(d++, _mm_load_si128(s++)); // 10
      _mm_store_si128(d++, _mm_load_si128(s++)); // 11
      _mm_store_si128(d++, _mm_load_si128(s++)); // 12
      _mm_store_si128(d++, _mm_load_si128(s++)); // 13
      _mm_store_si128(d++, _mm_load_si128(s++)); // 14
      _mm_store_si128(d++, _mm_load_si128(s++)); // 15
      _mm_store_si128(d++, _mm_load_si128(s++)); // 16
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 1
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 2
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 3
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 4
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 5
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 6
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 7
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 8
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 9
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 10
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 11
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 12
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 13
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 14
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 15
      _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 16
    }
  }
  return dest;
}

//-----------------------------------------------------------------------------
// AVX+ Aligned:
//-----------------------------------------------------------------------------

// AVX (256-bit, 32 bytes at a time - 8 pixels in a 32-bit linear frame buffer)
// Len is (# of total bytes/32), so it's "# of 256-bits"
// Sandy Bridge and Ryzen and up
#ifdef __AVX__
void * memmove_256bit_a(void *dest, const void *src, size_t len)
{
  const __m256i* s = (__m256i*)src;
  __m256i* d = (__m256i*)dest;
  const __m256i *nexts = s + len;
  __m256i *nextd = d + len;

  if (d < s)
  {
    while (d != nextd)
    {
      _mm256_store_si256(d++, _mm256_load_si256(s++));
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts));
    }
  }
  return dest;
}

// 64 bytes at a time
void * memmove_256bit_64B_a(void *dest, const void *src, size_t len)
{
  const __m256i* s = (__m256i*)src;
  __m256i* d = (__m256i*)dest;
  const __m256i *nexts = s + (len << 1);
  __m256i *nextd = d + (len << 1);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 1
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 2
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 1
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 2
    }
  }
  return dest;
}

// 128 bytes at a time
void * memmove_256bit_128B_a(void *dest, const void *src, size_t len)
{
  const __m256i* s = (__m256i*)src;
  __m256i* d = (__m256i*)dest;
  const __m256i *nexts = s + (len << 2);
  __m256i *nextd = d + (len << 2);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 1
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 2
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 3
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 4
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 1
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 2
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 3
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 4
    }
  }
  return dest;
}

// 256 bytes at a time
void * memmove_256bit_256B_a(void *dest, const void *src, size_t len)
{
  const __m256i* s = (__m256i*)src;
  __m256i* d = (__m256i*)dest;
  const __m256i *nexts = s + (len << 3);
  __m256i *nextd = d + (len << 3);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 1
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 2
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 3
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 4
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 5
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 6
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 7
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 8
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 1
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 2
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 3
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 4
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 5
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 6
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 7
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 8
    }
  }
  return dest;
}

// I just wanted to see what doing one move for every ymm register looks like.
// There are 16 256-bit (ymm) registers.
void * memmove_256bit_512B_a(void *dest, const void *src, size_t len)
{
  const __m256i* s = (__m256i*)src;
  __m256i* d = (__m256i*)dest;
  const __m256i *nexts = s + (len << 4);
  __m256i *nextd = d + (len << 4);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 1
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 2
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 3
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 4
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 5
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 6
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 7
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 8
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 9
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 10
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 11
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 12
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 13
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 14
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 15
      _mm256_store_si256(d++, _mm256_load_si256(s++)); // 16
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 1
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 2
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 3
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 4
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 5
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 6
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 7
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 8
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 9
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 10
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 11
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 12
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 13
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 14
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 15
      _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 16
    }
  }
  return dest;
}
#endif

// AVX-512 (512-bit, 64 bytes at a time - 16 pixels in a 32-bit linear frame buffer)
// Len is (# of total bytes/64), so it's "# of 512-bits"
// Requires AVX512F
#ifdef __AVX512F__
void * memmove_512bit_a(void *dest, const void *src, size_t len)
{
  const __m512i* s = (__m512i*)src;
  __m512i* d = (__m512i*)dest;
  const __m512i *nexts = s + len;
  __m512i *nextd = d + len;

  if (d < s)
  {
    while (d != nextd)
    {
      _mm512_store_si512(d++, _mm512_load_si512(s++));
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts));
    }
  }
  return dest;
}

// 128 bytes at a time
void * memmove_512bit_128B_a(void *dest, const void *src, size_t len)
{
  const __m512i* s = (__m512i*)src;
  __m512i* d = (__m512i*)dest;
  const __m512i *nexts = s + (len << 1);
  __m512i *nextd = d + (len << 1);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 1
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 2
    }
  }
  return dest;
}

// 256 bytes at a time
void * memmove_512bit_256B_a(void *dest, const void *src, size_t len)
{
  const __m512i* s = (__m512i*)src;
  __m512i* d = (__m512i*)dest;
  const __m512i *nexts = s + (len << 2);
  __m512i *nextd = d + (len << 2);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 3
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 4
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 1
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 2
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 3
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 4
    }
  }
  return dest;
}

// 512 bytes (half a KB!!) at a time
void * memmove_512bit_512B_a(void *dest, const void *src, size_t len)
{
  const __m512i* s = (__m512i*)src;
  __m512i* d = (__m512i*)dest;
  const __m512i *nexts = s + (len << 3);
  __m512i *nextd = d + (len << 3);

  if (d < s)
  {
    while (d != nextd) // Post-increment: use d, then increment
    {
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 3
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 4
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 5
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 6
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 7
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 8
    }
  }
  else
  {
    while (nextd != d) // Pre-decrement: decrement nextd, then use
    {
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 1
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 2
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 3
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 4
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 5
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 6
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 7
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 8
    }
  }
  return dest;
}

// The functions below I made just for fun to see what doing one move for every
// zmm register looks like. I think the insanity speaks for itself. :)
// 1024 bytes, or 1 kB
void * memmove_512bit_1kB_a(void *dest, const void *src, size_t len)
{
  const __m512i* s = (__m512i*)src;
  __m512i* d = (__m512i*)dest;
  const __m512i *nexts = s + (len << 4);
  __m512i *nextd = d + (len << 4);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 3
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 4
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 5
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 6
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 7
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 8
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 9
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 10
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 11
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 12
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 13
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 14
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 15
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 16
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 1
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 2
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 3
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 4
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 5
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 6
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 7
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 8
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 9
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 10
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 11
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 12
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 13
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 14
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 15
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 16
    }
  }
  return dest;
}

// 2048 bytes, or 2 kB
// AVX512 has 32x 512-bit registers, so......
void * memmove_512bit_2kB_a(void *dest, const void *src, size_t len)
{
  const __m512i* s = (__m512i*)src;
  __m512i* d = (__m512i*)dest;
  const __m512i *nexts = s + (len << 5);
  __m512i *nextd = d + (len << 5);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 3
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 4
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 5
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 6
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 7
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 8
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 9
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 10
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 11
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 12
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 13
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 14
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 15
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 16
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 17
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 18
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 19
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 20
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 21
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 22
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 23
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 24
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 25
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 26
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 27
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 28
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 29
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 30
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 31
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 32
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 1
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 2
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 3
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 4
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 5
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 6
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 7
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 8
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 9
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 10
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 11
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 12
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 13
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 14
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 15
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 16
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 17
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 18
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 19
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 20
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 21
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 22
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 23
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 24
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 25
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 26
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 27
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 28
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 29
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 30
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 31
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 32
    }
  }
  return dest;
}

// Y'know what? Here's a whole page.
// 4096 bytes, or 4 kB
void * memmove_512bit_4kB_a(void *dest, const void *src, size_t len)
{
  const __m512i* s = (__m512i*)src;
  __m512i* d = (__m512i*)dest;
  const __m512i *nexts = s + (len << 6);
  __m512i *nextd = d + (len << 6);

  if (d < s)
  {
    while (d != nextd)
    {
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 3
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 4
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 5
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 6
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 7
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 8
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 9
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 10
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 11
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 12
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 13
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 14
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 15
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 16
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 17
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 18
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 19
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 20
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 21
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 22
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 23
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 24
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 25
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 26
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 27
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 28
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 29
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 30
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 31
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 32
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 3
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 4
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 5
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 6
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 7
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 8
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 9
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 10
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 11
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 12
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 13
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 14
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 15
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 16
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 17
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 18
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 19
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 20
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 21
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 22
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 23
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 24
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 25
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 26
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 27
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 28
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 29
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 30
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 31
      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 32
    }
  }
  else
  {
    while (nextd != d)
    {
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 1
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 2
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 3
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 4
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 5
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 6
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 7
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 8
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 9
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 10
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 11
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 12
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 13
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 14
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 15
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 16
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 17
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 18
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 19
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 20
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 21
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 22
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 23
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 24
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 25
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 26
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 27
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 28
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 29
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 30
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 31
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 32
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 1
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 2
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 3
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 4
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 5
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 6
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 7
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 8
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 9
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 10
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 11
      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 12
_mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 13 _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 14 _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 15 _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 16 _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 17 _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 18 _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 19 _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 20 _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 21 _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 22 _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 23 _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 24 _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 25 _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 26 _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 27 _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 28 _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 29 _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 30 _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 31 _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 32 } } return dest; } #endif //----------------------------------------------------------------------------- // SSE4.1 Streaming: //----------------------------------------------------------------------------- // SSE4.1 (128-bit, 16 bytes at a time - 4 pixels in a 32-bit linear frame buffer) // Len is (# of total bytes/16), so it's "# of 128-bits" void * memmove_128bit_as(void *dest, const void *src, size_t len) { __m128i* s = (__m128i*)src; __m128i* d = (__m128i*)dest; __m128i *nexts = s + len; __m128i *nextd = d + len; if (d < s) { while (d != nextd) { _mm_stream_si128(d++, _mm_stream_load_si128(s++)); } } else { while (nextd != d) { _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); } } _mm_sfence(); return dest; } // 32 bytes at a time void * memmove_128bit_32B_as(void *dest, const void *src, size_t len) { __m128i* s = (__m128i*)src; __m128i* d = (__m128i*)dest; __m128i *nexts = s + (len << 1); __m128i *nextd = d + (len << 1); if (d < s) { while (d != nextd) { _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 1 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 2 } } else { while (nextd != d) { _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 1 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 2 } } _mm_sfence(); return dest; } // 64 bytes at a time void * memmove_128bit_64B_as(void *dest, const void *src, size_t len) { __m128i* s = (__m128i*)src; __m128i* d = (__m128i*)dest; __m128i *nexts = s + (len << 2); __m128i *nextd = d + (len << 2); if (d < s) { while (d != nextd) { _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 1 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 2 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 3 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 4 } } else { while (nextd != d) { _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 1 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 2 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 3 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 4 } } _mm_sfence(); return dest; } // 128 bytes at a time void * memmove_128bit_128B_as(void *dest, const void *src, size_t len) { __m128i* s = (__m128i*)src; __m128i* d = (__m128i*)dest; __m128i *nexts = s + (len << 3); __m128i *nextd = d + 
(len << 3); if (d < s) { while (d != nextd) { _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 1 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 2 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 3 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 4 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 5 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 6 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 7 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 8 } } else { while (nextd != d) { _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 1 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 2 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 3 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 4 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 5 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 6 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 7 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 8 } } _mm_sfence(); return dest; } // For fun: 1 load->store for every xmm register (there are 16) // 256 bytes void * memmove_128bit_256B_as(void *dest, const void *src, size_t len) { __m128i* s = (__m128i*)src; __m128i* d = (__m128i*)dest; __m128i *nexts = s + (len << 4); __m128i *nextd = d + (len << 4); if (d < s) { while (d != nextd) { _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 1 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 2 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 3 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 4 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 5 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 6 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 7 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 8 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 9 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 10 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 11 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 12 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 13 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 14 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 15 _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 16 } } else { while (nextd != d) { _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 1 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 2 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 3 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 4 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 5 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 6 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 7 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 8 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 9 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 10 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 11 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 12 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 13 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 14 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 15 _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 16 } } _mm_sfence(); return dest; } //----------------------------------------------------------------------------- // AVX2+ Streaming: 
//----------------------------------------------------------------------------- // AVX2 (256-bit, 32 bytes at a time - 8 pixels in a 32-bit linear frame buffer) // Len is (# of total bytes/32), so it's "# of 256-bits" // Haswell and Ryzen and up #ifdef __AVX2__ void * memmove_256bit_as(void *dest, const void *src, size_t len) { const __m256i* s = (__m256i*)src; __m256i* d = (__m256i*)dest; const __m256i *nexts = s + len; __m256i *nextd = d + len; if (d < s) { while (d != nextd) { _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); } } else { while (nextd != d) { _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); } } _mm_sfence(); return dest; } // 64 bytes at a time void * memmove_256bit_64B_as(void *dest, const void *src, size_t len) { const __m256i* s = (__m256i*)src; __m256i* d = (__m256i*)dest; const __m256i *nexts = s + (len << 1); __m256i *nextd = d + (len << 1); if (d < s) { while (d != nextd) { _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 1 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 2 } } else { while (nextd != d) { _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 1 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 2 } } _mm_sfence(); return dest; } // 128 bytes at a time void * memmove_256bit_128B_as(void *dest, const void *src, size_t len) { const __m256i* s = (__m256i*)src; __m256i* d = (__m256i*)dest; const __m256i *nexts = s + (len << 2); __m256i *nextd = d + (len << 2); if (d < s) { while (d != nextd) { _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 1 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 2 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 3 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 4 } } else { while (nextd != d) { _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 1 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 2 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 3 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 4 } } _mm_sfence(); return dest; } // 256 bytes at a time void * memmove_256bit_256B_as(void *dest, const void *src, size_t len) { const __m256i* s = (__m256i*)src; __m256i* d = (__m256i*)dest; const __m256i *nexts = s + (len << 3); __m256i *nextd = d + (len << 3); if (d < s) { while (d != nextd) { _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 1 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 2 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 3 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 4 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 5 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 6 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 7 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 8 } } else { while (nextd != d) { _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 1 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 2 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 3 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 4 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 5 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 6 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 7 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 8 } } _mm_sfence(); return dest; } // I just wanted to see what doing one move 
for every ymm register looks like. // There are 16 256-bit (ymm) registers. void * memmove_256bit_512B_as(void *dest, const void *src, size_t len) { const __m256i* s = (__m256i*)src; __m256i* d = (__m256i*)dest; const __m256i *nexts = s + (len << 4); __m256i *nextd = d + (len << 4); if (d < s) { while (d != nextd) { _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 1 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 2 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 3 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 4 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 5 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 6 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 7 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 8 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 9 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 10 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 11 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 12 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 13 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 14 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 15 _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 16 } } else { while (nextd != d) { _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 1 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 2 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 3 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 4 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 5 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 6 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 7 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 8 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 9 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 10 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 11 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 12 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 13 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 14 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 15 _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 16 } } _mm_sfence(); return dest; } #endif // AVX-512 (512-bit, 64 bytes at a time - 16 pixels in a 32-bit linear frame buffer) // Len is (# of total bytes/64), so it's "# of 512-bits" // Requires AVX512F #ifdef __AVX512F__ void * memmove_512bit_as(void *dest, const void *src, size_t len) { const __m512i* s = (__m512i*)src; __m512i* d = (__m512i*)dest; const __m512i *nexts = s + len; __m512i *nextd = d + len; if (d < s) { while (d != nextd) { _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); } } else { while (nextd != d) { _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); } } _mm_sfence(); return dest; } // 128 bytes at a time void * memmove_512bit_128B_as(void *dest, const void *src, size_t len) { const __m512i* s = (__m512i*)src; __m512i* d = (__m512i*)dest; const __m512i *nexts = s + (len << 1); __m512i *nextd = d + (len << 1); if (d < s) { while (d != nextd) { _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2 } } else { while (nextd != d) { 
_mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 1 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 2 } } _mm_sfence(); return dest; } // 256 bytes at a time void * memmove_512bit_256B_as(void *dest, const void *src, size_t len) { const __m512i* s = (__m512i*)src; __m512i* d = (__m512i*)dest; const __m512i *nexts = s + (len << 2); __m512i *nextd = d + (len << 2); if (d < s) { while (d != nextd) { _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 3 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 4 } } else { while (nextd != d) { _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 1 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 2 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 3 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 4 } } _mm_sfence(); return dest; } // 512 bytes (half a kB!!) at a time void * memmove_512bit_512B_as(void *dest, const void *src, size_t len) { const __m512i* s = (__m512i*)src; __m512i* d = (__m512i*)dest; const __m512i *nexts = s + (len << 3); __m512i *nextd = d + (len << 3); if (d < s) { while (d != nextd) // Post-increment: use d, then increment { _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 3 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 4 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 5 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 6 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 7 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 8 } } else { while (nextd != d) // Pre-decrement: decrement nextd, then use { _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 1 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 2 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 3 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 4 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 5 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 6 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 7 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 8 } } _mm_sfence(); return dest; } // The functions below I made just for fun to see what doing one move for every // zmm register looks like. I think the insanity speaks for itself.
:) // 1024 bytes, or 1 kB void * memmove_512bit_1kB_as(void *dest, const void *src, size_t len) { const __m512i* s = (__m512i*)src; __m512i* d = (__m512i*)dest; const __m512i *nexts = s + (len << 4); __m512i *nextd = d + (len << 4); if (d < s) { while (d != nextd) { _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 3 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 4 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 5 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 6 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 7 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 8 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 9 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 10 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 11 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 12 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 13 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 14 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 15 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 16 } } else { while (nextd != d) { _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 1 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 2 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 3 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 4 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 5 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 6 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 7 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 8 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 9 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 10 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 11 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 12 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 13 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 14 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 15 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 16 } } _mm_sfence(); return dest; } // 2048 bytes, or 2 kB // AVX512 has 32x 512-bit registers, so...... 
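// (Sanity check on the unroll depth: AVX-512 grows the register file to 32
// zmm registers, zmm0-zmm31, and 32 moves x 64 bytes = 2048 bytes, which is
// why the loop body below is unrolled 32-deep.)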
void * memmove_512bit_2kB_as(void *dest, const void *src, size_t len) { const __m512i* s = (__m512i*)src; __m512i* d = (__m512i*)dest; const __m512i *nexts = s + (len << 5); __m512i *nextd = d + (len << 5); if (d < s) { while (d != nextd) { _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 3 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 4 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 5 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 6 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 7 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 8 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 9 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 10 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 11 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 12 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 13 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 14 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 15 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 16 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 17 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 18 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 19 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 20 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 21 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 22 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 23 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 24 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 25 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 26 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 27 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 28 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 29 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 30 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 31 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 32 } } else { while (nextd != d) { _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 1 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 2 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 3 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 4 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 5 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 6 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 7 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 8 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 9 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 10 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 11 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 12 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 13 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 14 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 15 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 16 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 17 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 18 
_mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 19 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 20 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 21 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 22 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 23 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 24 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 25 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 26 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 27 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 28 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 29 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 30 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 31 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 32 } } _mm_sfence(); return dest; } // Y'know what? Here's a whole page. // 4096 bytes, or 4 kB void * memmove_512bit_4kB_as(void *dest, const void *src, size_t len) { const __m512i* s = (__m512i*)src; __m512i* d = (__m512i*)dest; const __m512i *nexts = s + (len << 6); __m512i *nextd = d + (len << 6); if (d < s) { while (d != nextd) { _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 3 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 4 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 5 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 6 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 7 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 8 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 9 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 10 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 11 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 12 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 13 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 14 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 15 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 16 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 17 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 18 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 19 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 20 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 21 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 22 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 23 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 24 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 25 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 26 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 27 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 28 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 29 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 30 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 31 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 32 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 3 _mm512_stream_si512(d++, 
_mm512_stream_load_si512(s++)); // 4 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 5 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 6 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 7 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 8 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 9 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 10 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 11 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 12 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 13 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 14 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 15 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 16 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 17 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 18 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 19 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 20 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 21 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 22 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 23 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 24 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 25 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 26 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 27 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 28 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 29 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 30 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 31 _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 32 } } else { while (nextd != d) { _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 1 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 2 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 3 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 4 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 5 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 6 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 7 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 8 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 9 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 10 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 11 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 12 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 13 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 14 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 15 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 16 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 17 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 18 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 19 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 20 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 21 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 22 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 23 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 24 _mm512_stream_si512(--nextd, 
_mm512_stream_load_si512(--nexts)); // 25 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 26 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 27 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 28 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 29 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 30 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 31 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 32 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 1 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 2 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 3 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 4 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 5 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 6 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 7 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 8 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 9 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 10 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 11 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 12 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 13 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 14 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 15 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 16 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 17 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 18 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 19 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 20 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 21 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 22 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 23 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 24 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 25 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 26 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 27 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 28 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 29 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 30 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 31 _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 32 } } _mm_sfence(); return dest; } #endif //----------------------------------------------------------------------------- // Dispatch Functions: //----------------------------------------------------------------------------- // Move arbitrarily large amounts of data (dest addr < src addr) void * memmove_large(void *dest, void *src, size_t numbytes) { void * returnval = dest; // memmove is supposed to return the destination size_t offset = 0; // Offset size needs to match the size of a pointer while(numbytes) // The biggest sizes will go first for alignment. There's no benefit to using // aligned loads over unaligned loads here, so all are unaligned. // NOTE: Each memmove has its own loop so that any one can be used individually. 
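// Illustrative walk-through, assuming __AVX512F__ is defined: numbytes = 6000
// takes one 4 kB block (6000 >> 12 = 1, leaving 6000 & 4095 = 1904), then one
// 1 kB block (leaving 880), one 512 B block (leaving 368), one 256 B block
// (leaving 112), one 64 B block (leaving 48), one 32 B block (leaving 16), and
// finally one 16 B block (leaving 0), at which point the loop exits.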
{ if(numbytes < 2) // 1 byte { memmove(dest, src, numbytes); offset = numbytes & -1; dest = (char *)dest + offset; src = (char *)src + offset; numbytes = 0; } else if(numbytes < 4) // 2 bytes { memmove_16bit(dest, src, numbytes >> 1); offset = numbytes & -2; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 1; } else if(numbytes < 8) // 4 bytes { memmove_32bit(dest, src, numbytes >> 2); offset = numbytes & -4; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 3; } else if(numbytes < 16) // 8 bytes { memmove_64bit(dest, src, numbytes >> 3); offset = numbytes & -8; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 7; } #ifdef __AVX512F__ else if(numbytes < 32) // 16 bytes { memmove_128bit_u(dest, src, numbytes >> 4); offset = numbytes & -16; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 15; } else if(numbytes < 64) // 32 bytes { memmove_256bit_u(dest, src, numbytes >> 5); offset = numbytes & -32; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 31; } else if(numbytes < 128) // 64 bytes { memmove_512bit_u(dest, src, numbytes >> 6); offset = numbytes & -64; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 63; } else if(numbytes < 256) // 128 bytes { memmove_512bit_128B_u(dest, src, numbytes >> 7); offset = numbytes & -128; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 127; } else if(numbytes < 512) // 256 bytes { memmove_512bit_256B_u(dest, src, numbytes >> 8); offset = numbytes & -256; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 255; } else if(numbytes < 1024) // 512 bytes { memmove_512bit_512B_u(dest, src, numbytes >> 9); offset = numbytes & -512; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 511; } else if(numbytes < 2048) // 1024 bytes (1 kB) { memmove_512bit_1kB_u(dest, src, numbytes >> 10); offset = numbytes & -1024; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 1023; } else if(numbytes < 4096) // 2048 bytes (2 kB) { memmove_512bit_2kB_u(dest, src, numbytes >> 11); offset = numbytes & -2048; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 2047; } else // 4096 bytes (4 kB) { memmove_512bit_4kB_u(dest, src, numbytes >> 12); offset = numbytes & -4096; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 4095; } #elif __AVX__ else if(numbytes < 32) // 16 bytes { memmove_128bit_u(dest, src, numbytes >> 4); offset = numbytes & -16; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 15; } else if(numbytes < 64) // 32 bytes { memmove_256bit_u(dest, src, numbytes >> 5); offset = numbytes & -32; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 31; } else if(numbytes < 128) // 64 bytes { memmove_256bit_64B_u(dest, src, numbytes >> 6); offset = numbytes & -64; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 63; } else if(numbytes < 256) // 128 bytes { memmove_256bit_128B_u(dest, src, numbytes >> 7); offset = numbytes & -128; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 127; } else if(numbytes < 512) // 256 bytes { memmove_256bit_256B_u(dest, src, numbytes >> 8); offset = numbytes & -256; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 255; } else // 512 bytes { memmove_256bit_512B_u(dest, src, numbytes >> 9); offset = numbytes & -512; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 511; } #else // SSE2 only else 
if(numbytes < 32) // 16 bytes { memmove_128bit_u(dest, src, numbytes >> 4); offset = numbytes & -16; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 15; } else if(numbytes < 64) // 32 bytes { memmove_128bit_32B_u(dest, src, numbytes >> 5); offset = numbytes & -32; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 31; } else if(numbytes < 128) // 64 bytes { memmove_128bit_64B_u(dest, src, numbytes >> 6); offset = numbytes & -64; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 63; } else if(numbytes < 256) // 128 bytes { memmove_128bit_128B_u(dest, src, numbytes >> 7); offset = numbytes & -128; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 127; } else // 256 bytes { memmove_128bit_256B_u(dest, src, numbytes >> 8); offset = numbytes & -256; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 255; } #endif } return returnval; } // END MEMMOVE LARGE, UNALIGNED // Move arbitrarily large amounts of data (dest addr < src addr) // Aligned version void * memmove_large_a(void *dest, void *src, size_t numbytes) { void * returnval = dest; // memmove is supposed to return the destination size_t offset = 0; // Offset size needs to match the size of a pointer while(numbytes) // The biggest sizes will go first for alignment. This is the aligned version, // so the aligned load/store variants are used throughout (src and dest must // share the same alignment). // NOTE: Each memmove has its own loop so that any one can be used individually. { if(numbytes < 2) // 1 byte { memmove(dest, src, numbytes); offset = numbytes & -1; dest = (char *)dest + offset; src = (char *)src + offset; numbytes = 0; } else if(numbytes < 4) // 2 bytes { memmove_16bit(dest, src, numbytes >> 1); offset = numbytes & -2; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 1; } else if(numbytes < 8) // 4 bytes { memmove_32bit(dest, src, numbytes >> 2); offset = numbytes & -4; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 3; } else if(numbytes < 16) // 8 bytes { memmove_64bit(dest, src, numbytes >> 3); offset = numbytes & -8; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 7; } #ifdef __AVX512F__ else if(numbytes < 32) // 16 bytes { memmove_128bit_a(dest, src, numbytes >> 4); offset = numbytes & -16; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 15; } else if(numbytes < 64) // 32 bytes { memmove_256bit_a(dest, src, numbytes >> 5); offset = numbytes & -32; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 31; } else if(numbytes < 128) // 64 bytes { memmove_512bit_a(dest, src, numbytes >> 6); offset = numbytes & -64; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 63; } else if(numbytes < 256) // 128 bytes { memmove_512bit_128B_a(dest, src, numbytes >> 7); offset = numbytes & -128; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 127; } else if(numbytes < 512) // 256 bytes { memmove_512bit_256B_a(dest, src, numbytes >> 8); offset = numbytes & -256; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 255; } else if(numbytes < 1024) // 512 bytes { memmove_512bit_512B_a(dest, src, numbytes >> 9); offset = numbytes & -512; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 511; } else if(numbytes < 2048) // 1024 bytes (1 kB) { memmove_512bit_1kB_a(dest, src, numbytes >> 10); offset = numbytes & -1024; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 1023; } else if(numbytes < 4096) // 2048 bytes (2 kB) { memmove_512bit_2kB_a(dest, src, numbytes >> 11); offset = numbytes & -2048; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 2047; } else // 4096 bytes (4 kB) { memmove_512bit_4kB_a(dest, src, numbytes >> 12); offset = numbytes & -4096; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 4095; } #elif __AVX__ else if(numbytes < 32) // 16 bytes { memmove_128bit_a(dest, src, numbytes >> 4); offset = numbytes & -16; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 15; } else if(numbytes < 64) // 32 bytes { memmove_256bit_a(dest, src, numbytes >> 5); offset = numbytes & -32; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 31; } else if(numbytes < 128) // 64 bytes { memmove_256bit_64B_a(dest, src, numbytes >> 6); offset = numbytes & -64; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 63; } else if(numbytes < 256) // 128 bytes { memmove_256bit_128B_a(dest, src, numbytes >> 7); offset = numbytes & -128; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 127; } else if(numbytes < 512) // 256 bytes { memmove_256bit_256B_a(dest, src, numbytes >> 8); offset = numbytes & -256; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 255; } else // 512 bytes { memmove_256bit_512B_a(dest, src, numbytes >> 9); offset = numbytes & -512; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 511; } #else // SSE2 only else if(numbytes < 32) // 16 bytes { memmove_128bit_a(dest, src, numbytes >> 4); offset = numbytes & -16; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 15; } else if(numbytes < 64) // 32 bytes { memmove_128bit_32B_a(dest, src, numbytes >> 5); offset = numbytes & -32; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 31; } else if(numbytes < 128) // 64 bytes { memmove_128bit_64B_a(dest, src, numbytes >> 6); offset = numbytes & -64; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 63; } else if(numbytes < 256) // 128 bytes { memmove_128bit_128B_a(dest, src, numbytes >> 7); offset = numbytes & -128; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 127; } else // 256 bytes { memmove_128bit_256B_a(dest, src, numbytes >> 8); offset = numbytes & -256; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 255; } #endif } return returnval; } // END MEMMOVE LARGE, ALIGNED // Move arbitrarily large amounts of data (dest addr < src addr) // Aligned, streaming version void * memmove_large_as(void *dest, void *src, size_t numbytes) { void * returnval = dest; // memmove is supposed to return the destination size_t offset = 0; // Offset size needs to match the size of a pointer while(numbytes) // The biggest sizes will go first for alignment. This is the aligned, // streaming (non-temporal) version, so the streaming variants are used // throughout (src and dest must share the same alignment). // NOTE: Each memmove has its own loop so that any one can be used individually.
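// Unlike the unaligned path, this streaming path assumes the caller has
// already verified alignment. A minimal, hypothetical caller-side check (not
// part of this file's dispatch) might look like:
//   if( (((uintptr_t)dest | (uintptr_t)src) & BYTE_ALIGNMENT) == 0 )
//     memmove_large_as(dest, src, numbytes);
// Each streaming helper ends with _mm_sfence(), so its non-temporal stores are
// globally visible before any store issued after the helper returns.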
{ if(numbytes < 2) // 1 byte { memmove(dest, src, numbytes); offset = numbytes & -1; dest = (char *)dest + offset; src = (char *)src + offset; numbytes = 0; } else if(numbytes < 4) // 2 bytes { memmove_16bit(dest, src, numbytes >> 1); offset = numbytes & -2; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 1; } else if(numbytes < 8) // 4 bytes { memmove_32bit(dest, src, numbytes >> 2); offset = numbytes & -4; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 3; } else if(numbytes < 16) // 8 bytes { memmove_64bit(dest, src, numbytes >> 3); offset = numbytes & -8; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 7; } #ifdef __AVX512F__ else if(numbytes < 32) // 16 bytes { memmove_128bit_as(dest, src, numbytes >> 4); offset = numbytes & -16; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 15; } else if(numbytes < 64) // 32 bytes { memmove_256bit_as(dest, src, numbytes >> 5); offset = numbytes & -32; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 31; } else if(numbytes < 128) // 64 bytes { memmove_512bit_as(dest, src, numbytes >> 6); offset = numbytes & -64; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 63; } else if(numbytes < 256) // 128 bytes { memmove_512bit_128B_as(dest, src, numbytes >> 7); offset = numbytes & -128; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 127; } else if(numbytes < 512) // 256 bytes { memmove_512bit_256B_as(dest, src, numbytes >> 8); offset = numbytes & -256; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 255; } else if(numbytes < 1024) // 512 bytes { memmove_512bit_512B_as(dest, src, numbytes >> 9); offset = numbytes & -512; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 511; } else if(numbytes < 2048) // 1024 bytes (1 kB) { memmove_512bit_1kB_as(dest, src, numbytes >> 10); offset = numbytes & -1024; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 1023; } else if(numbytes < 4096) // 2048 bytes (2 kB) { memmove_512bit_2kB_as(dest, src, numbytes >> 11); offset = numbytes & -2048; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 2047; } else // 4096 bytes (4 kB) { memmove_512bit_4kB_as(dest, src, numbytes >> 12); offset = numbytes & -4096; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 4095; } #elif __AVX2__ else if(numbytes < 32) // 16 bytes { memmove_128bit_as(dest, src, numbytes >> 4); offset = numbytes & -16; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 15; } else if(numbytes < 64) // 32 bytes { memmove_256bit_as(dest, src, numbytes >> 5); offset = numbytes & -32; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 31; } else if(numbytes < 128) // 64 bytes { memmove_256bit_64B_as(dest, src, numbytes >> 6); offset = numbytes & -64; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 63; } else if(numbytes < 256) // 128 bytes { memmove_256bit_128B_as(dest, src, numbytes >> 7); offset = numbytes & -128; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 127; } else if(numbytes < 512) // 256 bytes { memmove_256bit_256B_as(dest, src, numbytes >> 8); offset = numbytes & -256; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 255; } else // 512 bytes { memmove_256bit_512B_as(dest, src, numbytes >> 9); offset = numbytes & -512; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 511; } #else // 
SSE4.1 only else if(numbytes < 32) // 16 bytes { memmove_128bit_as(dest, src, numbytes >> 4); offset = numbytes & -16; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 15; } else if(numbytes < 64) // 32 bytes { memmove_128bit_32B_as(dest, src, numbytes >> 5); offset = numbytes & -32; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 31; } else if(numbytes < 128) // 64 bytes { memmove_128bit_64B_as(dest, src, numbytes >> 6); offset = numbytes & -64; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 63; } else if(numbytes < 256) // 128 bytes { memmove_128bit_128B_as(dest, src, numbytes >> 7); offset = numbytes & -128; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 127; } else // 256 bytes { memmove_128bit_256B_as(dest, src, numbytes >> 8); offset = numbytes & -256; dest = (char *)dest + offset; src = (char *)src + offset; numbytes &= 255; } #endif } return returnval; } // END MEMMOVE LARGE, ALIGNED, STREAMING // Move arbitrarily large amounts of data in reverse order (ends first) // src addr < dest addr void * memmove_large_reverse(void *dest, void *src, size_t numbytes) { void * returnval = dest; // memmove is supposed to return the destination size_t offset = 0; // Offset size needs to match the size of a pointer void * nextdest = (char *)dest + numbytes; void * nextsrc = (char *)src + numbytes; while(numbytes) // Want smallest sizes to go first, at the tail end, so that the biggest sizes // are aligned later in this operation (AVX_memmove sets the alignment up for // this to work). // NOTE: Each memmove has its own loop so that any one can be used individually. { if(numbytes & 1) // 1 byte { offset = numbytes & 1; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - offset; memmove(nextdest, nextsrc, 1); numbytes &= -2; } else if(numbytes & 2) // 2 bytes { offset = numbytes & 3; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - offset; memmove_16bit(nextdest, nextsrc, 1); numbytes &= -4; } else if(numbytes & 4) // 4 bytes { offset = numbytes & 7; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - offset; memmove_32bit(nextdest, nextsrc, 1); numbytes &= -8; } else if(numbytes & 8) // 8 bytes { offset = numbytes & 15; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - offset; memmove_64bit(nextdest, nextsrc, 1); numbytes &= -16; } #ifdef __AVX512F__ else if(numbytes & 16) // 16 bytes { offset = numbytes & 31; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - offset; memmove_128bit_u(nextdest, nextsrc, 1); numbytes &= -32; } else if(numbytes & 32) // 32 bytes { offset = numbytes & 63; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - offset; memmove_256bit_u(nextdest, nextsrc, 1); numbytes &= -64; } else if(numbytes & 64) // 64 bytes { offset = numbytes & 127; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - offset; memmove_512bit_u(nextdest, nextsrc, 1); numbytes &= -128; } else if(numbytes & 128) // 128 bytes { offset = numbytes & 255; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - offset; memmove_512bit_128B_u(nextdest, nextsrc, 1); numbytes &= -256; } else if(numbytes & 256) // 256 bytes { offset = numbytes & 511; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - offset; memmove_512bit_256B_u(nextdest, nextsrc, 1); numbytes &= -512; } else if(numbytes & 512) // 512 bytes { offset = numbytes & 1023; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - 
offset; memmove_512bit_512B_u(nextdest, nextsrc, 1); numbytes &= -1024; } else if(numbytes & 1024) // 1024 bytes (1 kB) { offset = numbytes & 2047; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - offset; memmove_512bit_1kB_u(nextdest, nextsrc, 1); numbytes &= -2048; } else if(numbytes & 2048) // 2048 bytes (2 kB) { offset = numbytes & 4095; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - offset; memmove_512bit_2kB_u(nextdest, nextsrc, 1); numbytes &= -4096; } else // 4096 bytes (4 kB) { offset = numbytes; nextdest = (char *)nextdest - offset; // These should match initial src/dest nextsrc = (char *)nextsrc - offset; memmove_512bit_4kB_u(nextdest, nextsrc, numbytes >> 12); numbytes = 0; } #elif __AVX__ else if(numbytes & 16) // 16 bytes { offset = numbytes & 31; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - offset; memmove_128bit_u(nextdest, nextsrc, 1); numbytes &= -32; } else if(numbytes & 32) // 32 bytes { offset = numbytes & 63; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - offset; memmove_256bit_u(nextdest, nextsrc, 1); numbytes &= -64; } else if(numbytes & 64) // 64 bytes { offset = numbytes & 127; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - offset; memmove_256bit_64B_u(nextdest, nextsrc, 1); numbytes &= -128; } else if(numbytes & 128) // 128 bytes { offset = numbytes & 255; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - offset; memmove_256bit_128B_u(nextdest, nextsrc, 1); numbytes &= -256; } else if(numbytes & 256) // 256 bytes { offset = numbytes & 511; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - offset; memmove_256bit_256B_u(nextdest, nextsrc, 1); numbytes &= -512; } else // 512 bytes { offset = numbytes; nextdest = (char *)nextdest - offset; // These should match initial src/dest nextsrc = (char *)nextsrc - offset; memmove_256bit_512B_u(nextdest, nextsrc, numbytes >> 9); numbytes = 0; } #else // SSE2 only else if(numbytes & 16) // 16 bytes { offset = numbytes & 31; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - offset; memmove_128bit_u(nextdest, nextsrc, 1); numbytes &= -32; } else if(numbytes & 32) // 32 bytes { offset = numbytes & 63; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - offset; memmove_128bit_32B_u(nextdest, nextsrc, 1); numbytes &= -64; } else if(numbytes & 64) // 64 bytes { offset = numbytes & 127; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - offset; memmove_128bit_64B_u(nextdest, nextsrc, 1); numbytes &= -128; } else if(numbytes & 128)// 128 bytes { offset = numbytes & 255; nextdest = (char *)nextdest - offset; nextsrc = (char *)nextsrc - offset; memmove_128bit_128B_u(nextdest, nextsrc, 1); numbytes &= -256; } else // 256 bytes { offset = numbytes; nextdest = (char *)nextdest - offset; // These should match initial src/dest nextsrc = (char *)nextsrc - offset; memmove_128bit_256B_u(nextdest, nextsrc, numbytes >> 8); numbytes = 0; } #endif } return returnval; } // END MEMMOVE LARGE REVERSE, UNALIGNED // Move arbitrarily large amounts of data in reverse order (ends first) // src addr < dest addr // Aligned version void * memmove_large_reverse_a(void *dest, void *src, size_t numbytes) { void * returnval = dest; // memmove is supposed to return the destination size_t offset = 0; // Offset size needs to match the size of a pointer void * nextdest = (char *)dest + numbytes; void * nextsrc = (char *)src + numbytes; while(numbytes) // Want smallest sizes to go first, at the tail 
// Move arbitrarily large amounts of data in reverse order (ends first)
// src addr < dest addr
// Aligned version
void * memmove_large_reverse_a(void *dest, void *src, size_t numbytes)
{
  void * returnval = dest; // memmove is supposed to return the destination
  size_t offset = 0; // Offset size needs to match the size of a pointer

  void * nextdest = (char *)dest + numbytes;
  void * nextsrc = (char *)src + numbytes;

  // Move the smallest sizes first, from the tail end, so that the bigger
  // blocks are aligned by the time they are moved (memmoveAVX sets the
  // alignment up for this to work).
  // NOTE: Each memmove has its own loop so that any one can be used individually.
  while(numbytes)
  {
    if(numbytes & 1) // 1 byte
    {
      offset = numbytes & 1;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove(nextdest, nextsrc, 1);
      numbytes &= -2;
    }
    else if(numbytes & 2) // 2 bytes
    {
      offset = numbytes & 3;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_16bit(nextdest, nextsrc, 1);
      numbytes &= -4;
    }
    else if(numbytes & 4) // 4 bytes
    {
      offset = numbytes & 7;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_32bit(nextdest, nextsrc, 1);
      numbytes &= -8;
    }
    else if(numbytes & 8) // 8 bytes
    {
      offset = numbytes & 15;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_64bit(nextdest, nextsrc, 1);
      numbytes &= -16;
    }
#ifdef __AVX512F__
    else if(numbytes & 16) // 16 bytes
    {
      offset = numbytes & 31;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_128bit_a(nextdest, nextsrc, 1);
      numbytes &= -32;
    }
    else if(numbytes & 32) // 32 bytes
    {
      offset = numbytes & 63;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_256bit_a(nextdest, nextsrc, 1);
      numbytes &= -64;
    }
    else if(numbytes & 64) // 64 bytes
    {
      offset = numbytes & 127;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_512bit_a(nextdest, nextsrc, 1);
      numbytes &= -128;
    }
    else if(numbytes & 128) // 128 bytes
    {
      offset = numbytes & 255;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_512bit_128B_a(nextdest, nextsrc, 1);
      numbytes &= -256;
    }
    else if(numbytes & 256) // 256 bytes
    {
      offset = numbytes & 511;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_512bit_256B_a(nextdest, nextsrc, 1);
      numbytes &= -512;
    }
    else if(numbytes & 512) // 512 bytes
    {
      offset = numbytes & 1023;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_512bit_512B_a(nextdest, nextsrc, 1);
      numbytes &= -1024;
    }
    else if(numbytes & 1024) // 1024 bytes (1 kB)
    {
      offset = numbytes & 2047;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_512bit_1kB_a(nextdest, nextsrc, 1);
      numbytes &= -2048;
    }
    else if(numbytes & 2048) // 2048 bytes (2 kB)
    {
      offset = numbytes & 4095;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_512bit_2kB_a(nextdest, nextsrc, 1);
      numbytes &= -4096;
    }
    else // 4096 bytes (4 kB)
    {
      offset = numbytes;
      nextdest = (char *)nextdest - offset; // These should match initial src/dest
      nextsrc = (char *)nextsrc - offset;
      memmove_512bit_4kB_a(nextdest, nextsrc, numbytes >> 12);
      numbytes = 0;
    }
#elif __AVX__
    else if(numbytes & 16) // 16 bytes
    {
      offset = numbytes & 31;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_128bit_a(nextdest, nextsrc, 1);
      numbytes &= -32;
    }
    else if(numbytes & 32) // 32 bytes
    {
      offset = numbytes & 63;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_256bit_a(nextdest, nextsrc, 1);
      numbytes &= -64;
    }
    else if(numbytes & 64) // 64 bytes
    {
      offset = numbytes & 127;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_256bit_64B_a(nextdest, nextsrc, 1);
      numbytes &= -128;
    }
    else if(numbytes & 128) // 128 bytes
    {
      offset = numbytes & 255;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_256bit_128B_a(nextdest, nextsrc, 1);
      numbytes &= -256;
    }
    else if(numbytes & 256) // 256 bytes
    {
      offset = numbytes & 511;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_256bit_256B_a(nextdest, nextsrc, 1);
      numbytes &= -512;
    }
    else // 512 bytes
    {
      offset = numbytes;
      nextdest = (char *)nextdest - offset; // These should match initial src/dest
      nextsrc = (char *)nextsrc - offset;
      memmove_256bit_512B_a(nextdest, nextsrc, numbytes >> 9);
      numbytes = 0;
    }
#else // SSE2 only
    else if(numbytes & 16) // 16 bytes
    {
      offset = numbytes & 31;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_128bit_a(nextdest, nextsrc, 1);
      numbytes &= -32;
    }
    else if(numbytes & 32) // 32 bytes
    {
      offset = numbytes & 63;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_128bit_32B_a(nextdest, nextsrc, 1);
      numbytes &= -64;
    }
    else if(numbytes & 64) // 64 bytes
    {
      offset = numbytes & 127;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_128bit_64B_a(nextdest, nextsrc, 1);
      numbytes &= -128;
    }
    else if(numbytes & 128) // 128 bytes
    {
      offset = numbytes & 255;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_128bit_128B_a(nextdest, nextsrc, 1);
      numbytes &= -256;
    }
    else // 256 bytes
    {
      offset = numbytes;
      nextdest = (char *)nextdest - offset; // These should match initial src/dest
      nextsrc = (char *)nextsrc - offset;
      memmove_128bit_256B_a(nextdest, nextsrc, numbytes >> 8);
      numbytes = 0;
    }
#endif
  }
  return returnval;
} // END MEMMOVE LARGE REVERSE, ALIGNED
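// Illustrative demo (hypothetical helper, not in the original file) of why the
// reverse movers copy ends-first. When the regions overlap and src < dest, a
// forward pass would overwrite source bytes before reading them; copying from
// the tail preserves them. The unaligned variant is used here because a stack
// buffer has no alignment guarantee. Returns 1 when the shifted buffer holds
// the expected pattern.
static int reverse_overlap_demo(void)
{
  char buf[8] = { 'a', 'b', 'c', 'd', 'e', 0, 0, 0 };
  // Shift "abcde" up by two bytes within the same buffer (src < dest, overlap):
  memmove_large_reverse_u(buf + 2, buf, 5);
  // A forward byte copy would have produced "abababa" by re-reading bytes it
  // had already overwritten; ends-first yields the correct "ababcde".
  return buf[2] == 'a' && buf[6] == 'e';
}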
// Move arbitrarily large amounts of data in reverse order (ends first)
// src addr < dest addr
// Aligned, streaming version
void * memmove_large_reverse_as(void *dest, void *src, size_t numbytes)
{
  void * returnval = dest; // memmove is supposed to return the destination
  size_t offset = 0; // Offset size needs to match the size of a pointer

  void * nextdest = (char *)dest + numbytes;
  void * nextsrc = (char *)src + numbytes;

  // Move the smallest sizes first, from the tail end, so that the bigger
  // blocks are aligned by the time they are moved (memmoveAVX sets the
  // alignment up for this to work).
  // NOTE: Each memmove has its own loop so that any one can be used individually.
  while(numbytes)
  {
    if(numbytes & 1) // 1 byte
    {
      offset = numbytes & 1;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove(nextdest, nextsrc, 1);
      numbytes &= -2;
    }
    else if(numbytes & 2) // 2 bytes
    {
      offset = numbytes & 3;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_16bit(nextdest, nextsrc, 1);
      numbytes &= -4;
    }
    else if(numbytes & 4) // 4 bytes
    {
      offset = numbytes & 7;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_32bit(nextdest, nextsrc, 1);
      numbytes &= -8;
    }
    else if(numbytes & 8) // 8 bytes
    {
      offset = numbytes & 15;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_64bit(nextdest, nextsrc, 1);
      numbytes &= -16;
    }
#ifdef __AVX512F__
    else if(numbytes & 16) // 16 bytes
    {
      offset = numbytes & 31;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_128bit_as(nextdest, nextsrc, 1);
      numbytes &= -32;
    }
    else if(numbytes & 32) // 32 bytes
    {
      offset = numbytes & 63;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_256bit_as(nextdest, nextsrc, 1);
      numbytes &= -64;
    }
    else if(numbytes & 64) // 64 bytes
    {
      offset = numbytes & 127;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_512bit_as(nextdest, nextsrc, 1);
      numbytes &= -128;
    }
    else if(numbytes & 128) // 128 bytes
    {
      offset = numbytes & 255;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_512bit_128B_as(nextdest, nextsrc, 1);
      numbytes &= -256;
    }
    else if(numbytes & 256) // 256 bytes
    {
      offset = numbytes & 511;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_512bit_256B_as(nextdest, nextsrc, 1);
      numbytes &= -512;
    }
    else if(numbytes & 512) // 512 bytes
    {
      offset = numbytes & 1023;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_512bit_512B_as(nextdest, nextsrc, 1);
      numbytes &= -1024;
    }
    else if(numbytes & 1024) // 1024 bytes (1 kB)
    {
      offset = numbytes & 2047;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_512bit_1kB_as(nextdest, nextsrc, 1);
      numbytes &= -2048;
    }
    else if(numbytes & 2048) // 2048 bytes (2 kB)
    {
      offset = numbytes & 4095;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_512bit_2kB_as(nextdest, nextsrc, 1);
      numbytes &= -4096;
    }
    else // 4096 bytes (4 kB)
    {
      offset = numbytes;
      nextdest = (char *)nextdest - offset; // These should match initial src/dest
      nextsrc = (char *)nextsrc - offset;
      memmove_512bit_4kB_as(nextdest, nextsrc, numbytes >> 12);
      numbytes = 0;
    }
#elif __AVX2__
    else if(numbytes & 16) // 16 bytes
    {
      offset = numbytes & 31;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_128bit_as(nextdest, nextsrc, 1);
      numbytes &= -32;
    }
    else if(numbytes & 32) // 32 bytes
    {
      offset = numbytes & 63;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_256bit_as(nextdest, nextsrc, 1);
      numbytes &= -64;
    }
    else if(numbytes & 64) // 64 bytes
    {
      offset = numbytes & 127;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_256bit_64B_as(nextdest, nextsrc, 1);
      numbytes &= -128;
    }
    else if(numbytes & 128) // 128 bytes
    {
      offset = numbytes & 255;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_256bit_128B_as(nextdest, nextsrc, 1);
      numbytes &= -256;
    }
    else if(numbytes & 256) // 256 bytes
    {
      offset = numbytes & 511;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_256bit_256B_as(nextdest, nextsrc, 1);
      numbytes &= -512;
    }
    else // 512 bytes
    {
      offset = numbytes;
      nextdest = (char *)nextdest - offset; // These should match initial src/dest
      nextsrc = (char *)nextsrc - offset;
      memmove_256bit_512B_as(nextdest, nextsrc, numbytes >> 9);
      numbytes = 0;
    }
#else // SSE4.1 only
    else if(numbytes & 16) // 16 bytes
    {
      offset = numbytes & 31;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_128bit_as(nextdest, nextsrc, 1);
      numbytes &= -32;
    }
    else if(numbytes & 32) // 32 bytes
    {
      offset = numbytes & 63;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_128bit_32B_as(nextdest, nextsrc, 1);
      numbytes &= -64;
    }
    else if(numbytes & 64) // 64 bytes
    {
      offset = numbytes & 127;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_128bit_64B_as(nextdest, nextsrc, 1);
      numbytes &= -128;
    }
    else if(numbytes & 128) // 128 bytes
    {
      offset = numbytes & 255;
      nextdest = (char *)nextdest - offset;
      nextsrc = (char *)nextsrc - offset;
      memmove_128bit_128B_as(nextdest, nextsrc, 1);
      numbytes &= -256;
    }
    else // 256 bytes
    {
      offset = numbytes;
      nextdest = (char *)nextdest - offset; // These should match initial src/dest
      nextsrc = (char *)nextsrc - offset;
      memmove_128bit_256B_as(nextdest, nextsrc, numbytes >> 8);
      numbytes = 0;
    }
#endif
  }
  return returnval;
} // END MEMMOVE LARGE REVERSE, ALIGNED, STREAMING
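// Illustrative sketch (not the file's actual kernel) of the non-temporal
// pattern behind the *_as movers: MOVNTDQ stores bypass the cache hierarchy,
// and MOVNTDQA loads can avoid cache pollution on some memory types, which
// pays off for large buffers that won't be re-read soon. Assumes SSE4.1 (via
// the intrinsics header included at the top of this file) and 16-byte-aligned
// pointers; the sfence makes the weakly-ordered stores globally visible
// before the function returns.
static void stream_copy_16B_blocks(void *dest, void *src, size_t len)
{
  __m128i* d = (__m128i*)dest;
  __m128i* s = (__m128i*)src; // _mm_stream_load_si128 takes a non-const pointer
  while(len--)
  {
    _mm_stream_si128(d++, _mm_stream_load_si128(s++));
  }
  _mm_sfence(); // Fence before handing the buffer to other readers
}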
//-----------------------------------------------------------------------------
// Main Function:
//-----------------------------------------------------------------------------

// General-purpose function to call
void * memmoveAVX(void *dest, void *src, size_t numbytes)
{
  void * returnval = dest;

  if((char*)src == (char*)dest)
  {
    return returnval;
  }

  if(
      ( ((uintptr_t)src & BYTE_ALIGNMENT) == 0 )
      &&
      ( ((uintptr_t)dest & BYTE_ALIGNMENT) == 0 )
    ) // Check alignment
  {
    if((char *)dest < (char *)src)
    {
      // This is the fastest case: src and dest are both aligned to the vector
      // width in use.
      if(numbytes > CACHESIZE)
      {
        memmove_large_as(dest, src, numbytes);
      }
      else
      {
        memmove_large_a(dest, src, numbytes); // Even if numbytes is small, this will work
      }
    }
    else // src < dest
    {
      // Need to move ends first
      if(numbytes > CACHESIZE)
      {
        memmove_large_reverse_as(dest, src, numbytes);
      }
      else
      {
        memmove_large_reverse_a(dest, src, numbytes);
      }
    }
  }
  else // Unaligned
  {
    size_t numbytes_to_align = (BYTE_ALIGNMENT + 1) - ((uintptr_t)dest & BYTE_ALIGNMENT);

    void * destoffset = (char*)dest + numbytes_to_align;
    void * srcoffset = (char*)src + numbytes_to_align;

    if((char *)dest < (char *)src)
    {
      if(numbytes > numbytes_to_align)
      {
        // Get to an aligned position.
        // This may be a little slower, but since it'll be mostly scalar operations
        // alignment doesn't matter. Worst case it uses two vector functions, and
        // this process only needs to be done once per call if dest is unaligned.
        memmove_large(dest, src, numbytes_to_align);
        // Now this should be faster since stores are aligned.
        memmove_large(destoffset, srcoffset, numbytes - numbytes_to_align);
        // NOTE: Can't use streaming due to potential src misalignment.
        // On Haswell and up, cross-cache-line loads have a negligible penalty.
        // Thus this will be slower on Sandy & Ivy Bridge, though Ivy Bridge will
        // fare a little better (~2x, maybe?). Ryzen should generally fall somewhere
        // in between Sandy Bridge and Haswell/Skylake on that front.
        // NOTE: These are just rough theoretical estimates.
      }
      else // Small size
      {
        memmove_large(dest, src, numbytes);
      }
    }
    else // src < dest
    {
      if(numbytes > numbytes_to_align)
      {
        // Move the bulk, down to the lowest alignment boundary
        memmove_large_reverse(destoffset, srcoffset, numbytes - numbytes_to_align);
        // Move the remainder
        memmove_large_reverse(dest, src, numbytes_to_align);
      }
      else // Small size
      {
        memmove_large_reverse(dest, src, numbytes);
      }
    }
  }

  return returnval;
}
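// Hypothetical usage example (not part of the original file): memmoveAVX is a
// drop-in replacement for memmove, overlapping regions included; alignment
// detection and block-size dispatch happen internally. The parameter names
// here (framebuffer, fb_size, row_bytes) are illustrative only.
static void scroll_down_demo(char *framebuffer, size_t fb_size, size_t row_bytes)
{
  // src < dest and the regions overlap, so memmoveAVX takes the reverse
  // (ends-first) path; aligned buffers larger than CACHESIZE additionally
  // get the streaming movers.
  memmoveAVX(framebuffer + row_bytes, framebuffer, fb_size - row_bytes);
}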