diff --git a/kernel/memory/memcmp.c b/kernel/memory/memcmp.c
new file mode 100644
index 0000000..a2c4c06
--- /dev/null
+++ b/kernel/memory/memcmp.c
@@ -0,0 +1,1303 @@
+
+#include <stddef.h>    // size_t
+#include <stdint.h>    // uint16_t, uint32_t, uint64_t, uintptr_t
+#include <immintrin.h> // SSE/AVX intrinsics
+
+// Equality-only version
+int memcmp_eq(const void *str1, const void *str2, size_t count)
+{
+  const unsigned char *s1 = (unsigned char *)str1;
+  const unsigned char *s2 = (unsigned char *)str2;
+
+  while (count-- > 0)
+  {
+    if (*s1++ != *s2++)
+    {
+      return -1; // Makes more sense to me if -1 means unequal.
+    }
+  }
+  return 0; // Return 0 if equal to match normal memcmp
+}
+
+///=============================================================================
+/// LICENSING INFORMATION
+///=============================================================================
+//
+// The code above this comment is in the public domain.
+// The code below this comment is subject to the custom attribution license found
+// here: https://github.com/KNNSpeed/Simple-Kernel/blob/master/LICENSE_KERNEL
+//
+//==============================================================================
+// AVX Memory Functions: AVX Memcmp
+//==============================================================================
+//
+// Version 1.2
+//
+// Author:
+//  KNNSpeed
+//
+// Source Code:
+//  https://github.com/KNNSpeed/Simple-Kernel
+//
+// Minimum requirement:
+//  x86_64 CPU with SSE4.2, but AVX2 or later is recommended
+//
+// This file provides a highly optimized version of memcmp.
+// It also allows selecting a mode: check for equality only, or perform the full
+// greater-than/less-than comparison. For equality-only, pass 0 to the equality
+// argument. Pass 1 for full comparison (or really any nonzero int).
+//
+// In equality mode, a return value of 0 means equal, -1 means unequal.
+// In full comparison mode, -1 -> str1 is less, 0 -> equal, 1 -> str1 is greater.
+//
+
+#ifdef __clang__
+#define __m128i_u __m128i
+#define __m256i_u __m256i
+#define __m512i_u __m512i
+#define _mm_cvtsi128_si64x _mm_cvtsi128_si64
+#define _mm_cvtsi64x_si128 _mm_cvtsi64_si128
+#endif
+
+#ifdef __AVX512F__
+#define BYTE_ALIGNMENT 0x3F // For 64-byte alignment
+#elif __AVX2__
+#define BYTE_ALIGNMENT 0x1F // For 32-byte alignment
+#else
+#define BYTE_ALIGNMENT 0x0F // For 16-byte alignment
+#endif
+
+//-----------------------------------------------------------------------------
+// Individual Functions:
+//-----------------------------------------------------------------------------
+//
+// The following memcmps return -1 or 1 depending on the sign of the first unit
+// of their respective sizes, as opposed to the first byte (it seems memcmp(3)
+// is only defined for byte-by-byte comparisons, not, e.g., 16-byte-by-16-byte).
+//
+// These functions are written so that they still behave properly even if they
+// run off the edge of the desired memory area (e.g. numbytes was larger than the
+// desired area for whatever reason). The returned value won't necessarily be
+// indicative of the memory area in this case.
+//
+
+// 16-bit (2 bytes at a time)
+// Count is (# of total bytes/2), so it's "# of 16-bits"
+
+int memcmp_16bit(const void *str1, const void *str2, size_t count)
+{
+  const uint16_t *s1 = (uint16_t*)str1;
+  const uint16_t *s2 = (uint16_t*)str2;
+
+  while (count-- > 0)
+  {
+    if (*s1++ != *s2++)
+    {
+      return s1[-1] < s2[-1] ?
-1 : 1; + } + } + return 0; +} + +// Equality-only version +int memcmp_16bit_eq(const void *str1, const void *str2, size_t count) +{ + const uint16_t *s1 = (uint16_t*)str1; + const uint16_t *s2 = (uint16_t*)str2; + + while (count--) + { + if (*s1++ != *s2++) + { + return -1; + } + } + return 0; +} + +// 32-bit (4 bytes at a time - 1 pixel in a 32-bit linear frame buffer) +// Count is (# of total bytes/4), so it's "# of 32-bits" + +int memcmp_32bit(const void *str1, const void *str2, size_t count) +{ + const uint32_t *s1 = (uint32_t*)str1; + const uint32_t *s2 = (uint32_t*)str2; + + while (count--) + { + if (*s1++ != *s2++) + { + return s1[-1] < s2[-1] ? -1 : 1; + } + } + return 0; +} + +// Equality-only version +int memcmp_32bit_eq(const void *str1, const void *str2, size_t count) +{ + const uint32_t *s1 = (uint32_t*)str1; + const uint32_t *s2 = (uint32_t*)str2; + + while (count--) + { + if (*s1++ != *s2++) + { + return -1; + } + } + return 0; +} + +// 64-bit (8 bytes at a time - 2 pixels in a 32-bit linear frame buffer) +// Count is (# of total bytes/8), so it's "# of 64-bits" + +int memcmp_64bit(const void *str1, const void *str2, size_t count) +{ + const uint64_t *s1 = (uint64_t*)str1; + const uint64_t *s2 = (uint64_t*)str2; + + while (count--) + { + if (*s1++ != *s2++) + { + return s1[-1] < s2[-1] ? -1 : 1; + } + } + return 0; +} + +// Equality-only version +int memcmp_64bit_eq(const void *str1, const void *str2, size_t count) +{ + const uint64_t *s1 = (uint64_t*)str1; + const uint64_t *s2 = (uint64_t*)str2; + + while (count--) + { + if (*s1++ != *s2++) + { + return -1; + } + } + return 0; +} + +//----------------------------------------------------------------------------- +// SSE4.2 Unaligned: +//----------------------------------------------------------------------------- + +// SSE4.2 (128-bit, 16 bytes at a time - 4 pixels in a 32-bit linear frame buffer) +// Count is (# of total bytes/16), so it's "# of 128-bits" + +int memcmp_128bit_u(const void *str1, const void *str2, size_t count) +{ + const __m128i_u *s1 = (__m128i_u*)str1; + const __m128i_u *s2 = (__m128i_u*)str2; + + while (count--) + { + __m128i item1 = _mm_lddqu_si128(s1++); + __m128i item2 = _mm_lddqu_si128(s2++); + __m128i result = _mm_cmpeq_epi64(item1, item2); + // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is + // true, and 0 per 64-bit portion where false + + // If result is not all ones, then there is a difference here + if(!(unsigned int)_mm_test_all_ones(result)) + {// Ok, now we know they're not equal somewhere + + // In the case where both halves of the 128-bit result integer are + // 0x0000000000000000, that's the same as + // 0x0000000000000000FFFFFFFFFFFFFFFF. Only the MSB matters here as the + // comparison is a greater-than check. + + // Do the greater than comparison here to have it done before the conditional + // Also make it an unsigned compare: + // https://stackoverflow.com/questions/52805528/how-does-the-mm-cmpgt-epi64-intrinsic-work + const __m128i rangeshift = _mm_set1_epi64x(0x8000000000000000); + __m128i resultgt = _mm_cmpgt_epi64(_mm_xor_si128(item1, rangeshift), _mm_xor_si128(item2, rangeshift)); + // cmpgt returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where item1 > item2 is true + + // _mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFF) makes 0x0000000000000000FFFFFFFFFFFFFFFF, + // which is the desired mask inverted. 
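+      // (The ~ in the test below flips it to 0xFFFFFFFFFFFFFFFF0000000000000000.)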
+ // AND the mask with result such that it returns 1 if all zeroes + if((unsigned int)_mm_test_all_zeros(result, ~_mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFF))) + { + // Returned a 1, therefore equality comparison gave 0x0000000000000000 + // for both 64-bits or 0x0000000000000000FFFFFFFFFFFFFFFF - this + // particular case highlights why an unsigned compare is very important. + // CMPGT will have given 0xFFFFFFFFFFFFFFFFYYYYYYYYYYYYYYYY or + // 0x0000000000000000YYYYYYYYYYYYYYYY + + // Right shift to put the desired bits into the lower part of the + // register (overwrite the Ys) + resultgt = _mm_bsrli_si128(resultgt, 8); + // Will either be all ones or all zeros. If all ones, item1 > item2, if + // all zeros, item1 < item2 + if((uint64_t)_mm_cvtsi128_si64x(resultgt)) // Lop off upper half + { + return 1; // 0x[0000000000000000]0000000000000000 + } + else + { + return -1; // 0x[0000000000000000]FFFFFFFFFFFFFFFF + } + } + else // AND mask produced a nonzero value, so the test returned 0. + { + // Therefore equality comparison gave 0xFFFFFFFFFFFFFFFF0000000000000000 + // (which is the same as the mask) and CMPGT will have given + // 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF or 0xFFFFFFFFFFFFFFFF0000000000000000 + // Lower register bits will either be all ones or all zeros. If all ones, + // item1 > item2, if all zeros, item1 < item2 + if((uint64_t)_mm_cvtsi128_si64x(resultgt)) // Lop off upper half + { + return 1; // 0x[FFFFFFFFFFFFFFFF]FFFFFFFFFFFFFFFF + } + else + { + return -1; // 0x[FFFFFFFFFFFFFFFF]0000000000000000 + } + } + } + } + return 0; +} + +// Equality-only version +int memcmp_128bit_eq_u(const void *str1, const void *str2, size_t count) +{ + const __m128i_u *s1 = (__m128i_u*)str1; + const __m128i_u *s2 = (__m128i_u*)str2; + + while (count--) + { + __m128i item1 = _mm_lddqu_si128(s1++); + __m128i item2 = _mm_lddqu_si128(s2++); + __m128i result = _mm_cmpeq_epi64(item1, item2); + // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is + // true, and 0 per 64-bit portion where false + + // If result is not all ones, then there is a difference here + if(!(unsigned int)_mm_test_all_ones(result)) + { + return -1; + } + } + return 0; +} + +//----------------------------------------------------------------------------- +// AVX2+ Unaligned: +//----------------------------------------------------------------------------- + +// AVX2 (256-bit, 32 bytes at a time - 8 pixels in a 32-bit linear frame buffer) +// Count is (# of total bytes/32), so it's "# of 256-bits" +// Haswell and Ryzen and up + +#ifdef __AVX2__ +int memcmp_256bit_u(const void *str1, const void *str2, size_t count) +{ + const __m256i_u *s1 = (__m256i_u*)str1; + const __m256i_u *s2 = (__m256i_u*)str2; + + while (count--) + { + __m256i item1 = _mm256_lddqu_si256(s1++); + __m256i item2 = _mm256_lddqu_si256(s2++); + __m256i result = _mm256_cmpeq_epi64(item1, item2); + // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is + // true, and 0 per 64-bit portion where false + + // If result is not all ones, then there is a difference here. + // This is the same thing as _mm_test_all_ones, but 256-bit + if(!(unsigned int)_mm256_testc_si256(result, _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFF))) + { // Using 0xFFFFFFFFFFFFFFFF explicitly instead of -1 for clarity. + // It really makes no difference on two's complement machines. + + // Ok, now we know they're not equal somewhere. Man, doing a pure != is + // sooo much simpler than > or <.... 
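+      // (The trick: XOR both operands with 0x8000000000000000 to flip their
+      // sign bits, which makes the signed cmpgt behave like an unsigned compare.)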
+ + // Unsigned greater-than compare using signed operations, see: + // https://stackoverflow.com/questions/52805528/how-does-the-mm-cmpgt-epi64-intrinsic-work + const __m256i rangeshift = _mm256_set1_epi64x(0x8000000000000000); + __m256i resultgt = _mm256_cmpgt_epi64(_mm256_xor_si256(item1, rangeshift), _mm256_xor_si256(item2, rangeshift)); + // Returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where item1 > item2 is true + + // 32-bit value, 4 outcomes we care about from cmpeq -> movemask: + // 00YYYYYY FF00YYYY FFFF00YY FFFFFF00, where Y is "don't care." The most + // significant zeroed byte is the inequality we care about. + + // This is the fastest we can do on AVX2. + unsigned int result_to_scan = (unsigned int)_mm256_movemask_epi8(result); + unsigned int resultgt_to_scan = (unsigned int)_mm256_movemask_epi8(resultgt); + // Outcomes from cmpgt are ZZYYYYYY 00ZZYYYY 0000ZZYY 000000ZZ, where + // Z is F if item1 > item2, 0 if item1 < item2, and Y is "don't care." + // The ZZ position of cmpgt will match the corresponding 00 of cmpeq. + + // result_to_scan: 00YYYYYY FF00YYYY FFFF00YY FFFFFF00 --inverted--> + // FFYYYYYY 00FFYYYY 0000FFYY 000000FF. This will either be + // > resultgt_to_scan (ZZ = 00) or it won't (ZZ = FF). + if(~result_to_scan > resultgt_to_scan) + { + return -1; // If ZZ = 00, item1 < item2 + } + else + { + return 1; // If ZZ = FF, item1 > item2 + } + } + } + return 0; +} + +// Equality-only version +int memcmp_256bit_eq_u(const void *str1, const void *str2, size_t count) +{ + const __m256i_u *s1 = (__m256i_u*)str1; + const __m256i_u *s2 = (__m256i_u*)str2; + + while (count--) + { + __m256i item1 = _mm256_lddqu_si256(s1++); + __m256i item2 = _mm256_lddqu_si256(s2++); + __m256i result = _mm256_cmpeq_epi64(item1, item2); + // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is + // true, and 0 per 64-bit portion where false + + // If result is not all ones, then there is a difference here. + // This is the same thing as _mm_test_all_ones, but 256-bit + if(!(unsigned int)_mm256_testc_si256(result, _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFF))) + { // Using 0xFFFFFFFFFFFFFFFF explicitly instead of -1 for clarity. + // It really makes no difference on two's complement machines. + return -1; + } + } + return 0; +} +#endif + +// AVX-512 (512-bit, 64 bytes at a time - 16 pixels in a 32-bit linear frame buffer) +// Count is (# of total bytes/64), so it's "# of 512-bits" +// Requires AVX512F + +#ifdef __AVX512F__ +int memcmp_512bit_u(const void *str1, const void *str2, size_t count) +{ + const __m512i_u *s1 = (__m512i_u*)str1; + const __m512i_u *s2 = (__m512i_u*)str2; + + while (count--) + { + __m512i item1 = _mm512_loadu_si512(s1++); + __m512i item2 = _mm512_loadu_si512(s2++); + unsigned char result = _mm512_cmpneq_epu64_mask(item1, item2); + // All bits == 0 means equal + + if(result) // I don't believe this. I really need a CPU with AVX-512, lol. 
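+    // (result is an 8-bit mask: bit i is set when 64-bit lane i of the inputs differs.)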
+// if(_mm512_mask_cmp_epu64_mask(0xFF, item1, item2, 4)) // 0 is CMPEQ, 4 is CMP_NE, this is the same thing + { + unsigned char resultgt = _mm512_cmpgt_epu64_mask(item1, item2); + // For every set of 64-bits where item1 > item2, the mask will have a 1 bit + // there, else 0 + + if(result > resultgt) // Similar deal as AVX2 + { + return -1; + } + else + { + return 1; + } + } + } + return 0; +} + +// Equality-only version +int memcmp_512bit_eq_u(const void *str1, const void *str2, size_t count) +{ + const __m512i_u *s1 = (__m512i_u*)str1; + const __m512i_u *s2 = (__m512i_u*)str2; + + while (count--) + { + __m512i item1 = _mm512_loadu_si512(s1++); + __m512i item2 = _mm512_loadu_si512(s2++); + unsigned char result = _mm512_cmpneq_epu64_mask(item1, item2); + // All bits == 0 means equal + + if(result) // This is barely bigger than 1-byte memcmp_eq + { + return -1; + } + } + return 0; +} +#endif + +//----------------------------------------------------------------------------- +// SSE4.2 Aligned: +//----------------------------------------------------------------------------- + +// SSE4.2 (128-bit, 16 bytes at a time - 4 pixels in a 32-bit linear frame buffer) +// Count is (# of total bytes/16), so it's "# of 128-bits" + +int memcmp_128bit_a(const void *str1, const void *str2, size_t count) +{ + const __m128i *s1 = (__m128i*)str1; + const __m128i *s2 = (__m128i*)str2; + + while (count--) + { + __m128i item1 = _mm_load_si128(s1++); + __m128i item2 = _mm_load_si128(s2++); + __m128i result = _mm_cmpeq_epi64(item1, item2); + // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is + // true, and 0 per 64-bit portion where false + + // If result is not all ones, then there is a difference here + if(!(unsigned int)_mm_test_all_ones(result)) + {// Ok, now we know they're not equal somewhere + + // In the case where both halves of the 128-bit result integer are + // 0x0000000000000000, that's the same as + // 0x0000000000000000FFFFFFFFFFFFFFFF. Only the MSB matters here as the + // comparison is a greater-than check. + + // Do the greater than comparison here to have it done before the conditional + // Also make it an unsigned compare: + // https://stackoverflow.com/questions/52805528/how-does-the-mm-cmpgt-epi64-intrinsic-work + const __m128i rangeshift = _mm_set1_epi64x(0x8000000000000000); + __m128i resultgt = _mm_cmpgt_epi64(_mm_xor_si128(item1, rangeshift), _mm_xor_si128(item2, rangeshift)); + // cmpgt returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where item1 > item2 is true + + // _mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFF) makes 0x0000000000000000FFFFFFFFFFFFFFFF, + // which is the desired mask inverted. + // AND the mask with result such that it returns 1 if all zeroes + if((unsigned int)_mm_test_all_zeros(result, ~_mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFF))) + { + // Returned a 1, therefore equality comparison gave 0x0000000000000000 + // for both 64-bits or 0x0000000000000000FFFFFFFFFFFFFFFF - this + // particular case highlights why an unsigned compare is very important. + // CMPGT will have given 0xFFFFFFFFFFFFFFFFYYYYYYYYYYYYYYYY or + // 0x0000000000000000YYYYYYYYYYYYYYYY + + // Right shift to put the desired bits into the lower part of the + // register (overwrite the Ys) + resultgt = _mm_bsrli_si128(resultgt, 8); + // Will either be all ones or all zeros. 
If all ones, item1 > item2, if + // all zeros, item1 < item2 + if((uint64_t)_mm_cvtsi128_si64x(resultgt)) // Lop off upper half + { + return 1; // 0x[0000000000000000]0000000000000000 + } + else + { + return -1; // 0x[0000000000000000]FFFFFFFFFFFFFFFF + } + } + else // AND mask produced a nonzero value, so the test returned 0. + { + // Therefore equality comparison gave 0xFFFFFFFFFFFFFFFF0000000000000000 + // (which is the same as the mask) and CMPGT will have given + // 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF or 0xFFFFFFFFFFFFFFFF0000000000000000 + // Lower register bits will either be all ones or all zeros. If all ones, + // item1 > item2, if all zeros, item1 < item2 + if((uint64_t)_mm_cvtsi128_si64x(resultgt)) // Lop off upper half + { + return 1; // 0x[FFFFFFFFFFFFFFFF]FFFFFFFFFFFFFFFF + } + else + { + return -1; // 0x[FFFFFFFFFFFFFFFF]0000000000000000 + } + } + } + } + return 0; +} + +// Equality-only version +int memcmp_128bit_eq_a(const void *str1, const void *str2, size_t count) +{ + const __m128i *s1 = (__m128i*)str1; + const __m128i *s2 = (__m128i*)str2; + + while (count--) + { + __m128i item1 = _mm_load_si128(s1++); + __m128i item2 = _mm_load_si128(s2++); + __m128i result = _mm_cmpeq_epi64(item1, item2); + // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is + // true, and 0 per 64-bit portion where false + + // If result is not all ones, then there is a difference here + if(!(unsigned int)_mm_test_all_ones(result)) + { + return -1; + } + } + return 0; +} + +//----------------------------------------------------------------------------- +// AVX2+ Aligned: +//----------------------------------------------------------------------------- + +// AVX2 (256-bit, 32 bytes at a time - 8 pixels in a 32-bit linear frame buffer) +// Count is (# of total bytes/32), so it's "# of 256-bits" +// Haswell and Ryzen and up + +#ifdef __AVX2__ +int memcmp_256bit_a(const void *str1, const void *str2, size_t count) +{ + const __m256i *s1 = (__m256i*)str1; + const __m256i *s2 = (__m256i*)str2; + + while (count--) + { + __m256i item1 = _mm256_load_si256(s1++); + __m256i item2 = _mm256_load_si256(s2++); + __m256i result = _mm256_cmpeq_epi64(item1, item2); + // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is + // true, and 0 per 64-bit portion where false + + // If result is not all ones, then there is a difference here. + // This is the same thing as _mm_test_all_ones, but 256-bit + if(!(unsigned int)_mm256_testc_si256(result, _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFF))) + { // Using 0xFFFFFFFFFFFFFFFF explicitly instead of -1 for clarity. + // It really makes no difference on two's complement machines. + + // Ok, now we know they're not equal somewhere. Man, doing a pure != is + // sooo much simpler than > or <.... + + // Unsigned greater-than compare using signed operations, see: + // https://stackoverflow.com/questions/52805528/how-does-the-mm-cmpgt-epi64-intrinsic-work + const __m256i rangeshift = _mm256_set1_epi64x(0x8000000000000000); + __m256i resultgt = _mm256_cmpgt_epi64(_mm256_xor_si256(item1, rangeshift), _mm256_xor_si256(item2, rangeshift)); + // Returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where item1 > item2 is true + + // 32-bit value, 4 outcomes we care about from cmpeq -> movemask: + // 00YYYYYY FF00YYYY FFFF00YY FFFFFF00, where Y is "don't care." The most + // significant zeroed byte is the inequality we care about. + + // This is the fastest we can do on AVX2. 
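+      // (_mm256_movemask_epi8 packs the top bit of each of the 32 bytes into a
+      // 32-bit int, so each 64-bit lane of a compare result becomes 8 mask bits.)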
+      unsigned int result_to_scan = (unsigned int)_mm256_movemask_epi8(result);
+      unsigned int resultgt_to_scan = (unsigned int)_mm256_movemask_epi8(resultgt);
+      // Outcomes from cmpgt are ZZYYYYYY 00ZZYYYY 0000ZZYY 000000ZZ, where
+      // Z is F if item1 > item2, 0 if item1 < item2, and Y is "don't care."
+      // The ZZ position of cmpgt will match the corresponding 00 of cmpeq.
+
+      // result_to_scan: 00YYYYYY FF00YYYY FFFF00YY FFFFFF00 --inverted-->
+      // FFYYYYYY 00FFYYYY 0000FFYY 000000FF. This will either be
+      // > resultgt_to_scan (ZZ = 00) or it won't (ZZ = FF).
+      if(~result_to_scan > resultgt_to_scan)
+      {
+        return -1; // If ZZ = 00, item1 < item2
+      }
+      else
+      {
+        return 1; // If ZZ = FF, item1 > item2
+      }
+    }
+  }
+  return 0;
+}
+
+// Equality-only version
+int memcmp_256bit_eq_a(const void *str1, const void *str2, size_t count)
+{
+  const __m256i *s1 = (__m256i*)str1;
+  const __m256i *s2 = (__m256i*)str2;
+
+  while (count--)
+  {
+    __m256i item1 = _mm256_load_si256(s1++);
+    __m256i item2 = _mm256_load_si256(s2++);
+    __m256i result = _mm256_cmpeq_epi64(item1, item2);
+    // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
+    // true, and 0 per 64-bit portion where false
+
+    // If result is not all ones, then there is a difference here.
+    // This is the same thing as _mm_test_all_ones, but 256-bit
+    if(!(unsigned int)_mm256_testc_si256(result, _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFF)))
+    { // Using 0xFFFFFFFFFFFFFFFF explicitly instead of -1 for clarity.
+      // It really makes no difference on two's complement machines.
+      return -1;
+    }
+  }
+  return 0;
+}
+#endif
+
+// AVX-512 (512-bit, 64 bytes at a time - 16 pixels in a 32-bit linear frame buffer)
+// Count is (# of total bytes/64), so it's "# of 512-bits"
+// Requires AVX512F
+
+#ifdef __AVX512F__
+int memcmp_512bit_a(const void *str1, const void *str2, size_t count)
+{
+  const __m512i *s1 = (__m512i*)str1;
+  const __m512i *s2 = (__m512i*)str2;
+
+  while (count--)
+  {
+    __m512i item1 = _mm512_load_si512(s1++);
+    __m512i item2 = _mm512_load_si512(s2++);
+    unsigned char result = _mm512_cmpneq_epu64_mask(item1, item2);
+    // All bits == 0 means equal
+
+    if(result) // I don't believe this. I really need a CPU with AVX-512, lol.
+// if(_mm512_mask_cmp_epu64_mask(0xFF, item1, item2, 4)) // 0 is CMPEQ, 4 is CMP_NE, this is the same thing
+    {
+      unsigned char resultgt = _mm512_cmpgt_epu64_mask(item1, item2);
+      // For every set of 64-bits where item1 > item2, the mask will have a 1 bit
+      // there, else 0
+
+      if(result > resultgt) // Similar deal as AVX2
+      {
+        return -1;
+      }
+      else
+      {
+        return 1;
+      }
+    }
+  }
+  return 0;
+}
+
+// GCC -O3 makes memcmp_512bit_a(...) take 25 lines of assembly. This version
+// (~10 cycles) is around 5 or so cycles slower per set of memory regions than
+// memcmp (~5 cycles). It's the mask operations that take ~3 cycles each...
+//
+// When the latency of jumps is taken into account, this function can compare
+// 64 BYTES of data at around the same speed that memcmp does only 1 byte.
+// The AVX2 version is 1 cycle slower than the AVX512 version in its main loop
+// (i.e. it takes ~11 cycles). When an inequality is found, memcmp takes 3 cycles,
+// AVX2 takes 16 cycles, and AVX512 takes 10 cycles to determine which input is
+// greater.
+//
+// NOTE: These are estimates based solely on instruction latencies per Agner
+// Fog's optimization tables: https://www.agner.org/optimize/.
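+
+// Example use of the top-level dispatcher defined at the bottom of this file
+// (a and b are hypothetical buffers of at least 4096 bytes):
+//
+//   if(AVX_memcmp(a, b, 4096, 0) == 0) { /* buffers are identical */ } // equality-only mode
+//   int order = AVX_memcmp(a, b, 4096, 1); // full -1/0/1 comparison mode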
+ +// Equality-only version +int memcmp_512bit_eq_a(const void *str1, const void *str2, size_t count) +{ + const __m512i *s1 = (__m512i*)str1; + const __m512i *s2 = (__m512i*)str2; + + while (count--) + { + __m512i item1 = _mm512_load_si512(s1++); + __m512i item2 = _mm512_load_si512(s2++); + unsigned char result = _mm512_cmpneq_epu64_mask(item1, item2); + // All bits == 0 means equal + + if(result) // This is barely bigger than byte-by-byte memcmp_eq + { + return -1; + } + } + return 0; +} +#endif + +//----------------------------------------------------------------------------- +// Dispatch Functions (Unaligned): +//----------------------------------------------------------------------------- + +// memcmp for large chunks of memory with arbitrary sizes +int memcmp_large(const void *str1, const void *str2, size_t numbytes) // Worst-case scenario: 127 bytes. +{ + int returnval = 0; // Return value if equal... or numbytes is 0 + size_t offset = 0; + + while(numbytes) + // This loop will, at most, get evaluated 7 times, ending sooner each time. + // At minimum non-trivial case, once. Each memcmp has its own loop. + { + if(numbytes < 2) // 1 byte + { + returnval = memcmp(str1, str2, numbytes); + if(returnval) + { + return returnval; + } + offset = numbytes & -1; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes = 0; + } + else if(numbytes < 4) // 2 bytes + { + returnval = memcmp_16bit(str1, str2, numbytes >> 1); + if(returnval) + { + return returnval; + } + offset = numbytes & -2; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 1; + } + else if(numbytes < 8) // 4 bytes + { + returnval = memcmp_32bit(str1, str2, numbytes >> 2); + if(returnval) + { + return returnval; + } + offset = numbytes & -4; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 3; + } + else if(numbytes < 16) // 8 bytes + { + returnval = memcmp_64bit(str1, str2, numbytes >> 3); + if(returnval) + { + return returnval; + } + offset = numbytes & -8; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 7; + } +#ifdef __AVX512F__ + else if(numbytes < 32) // 16 bytes + { + returnval = memcmp_128bit_u(str1, str2, numbytes >> 4); + if(returnval) + { + return returnval; + } + offset = numbytes & -16; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + returnval = memcmp_256bit_u(str1, str2, numbytes >> 5); + if(returnval) + { + return returnval; + } + offset = numbytes & -32; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 31; + } + else // 64 bytes + { + returnval = memcmp_512bit_u(str1, str2, numbytes >> 6); + if(returnval) + { + return returnval; + } + offset = numbytes & -64; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 63; + } +#elif __AVX2__ + else if(numbytes < 32) // 16 bytes + { + returnval = memcmp_128bit_u(str1, str2, numbytes >> 4); + if(returnval) + { + return returnval; + } + offset = numbytes & -16; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 15; + } + else // 32 bytes + { + returnval = memcmp_256bit_u(str1, str2, numbytes >> 5); + if(returnval) + { + return returnval; + } + offset = numbytes & -32; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 31; + } +#else // SSE4.2 only + else // 16 bytes + { + returnval = memcmp_128bit_u(str1, str2, numbytes >> 4); + if(returnval) + { + return returnval; + } + offset = 
numbytes & -16;
+      str1 = (char *)str1 + offset;
+      str2 = (char *)str2 + offset;
+      numbytes &= 15;
+    }
+#endif
+  }
+  return returnval;
+}
+
+// Equality-only version
+int memcmp_large_eq(const void *str1, const void *str2, size_t numbytes) // Worst-case scenario: 127 bytes.
+{
+  int returnval = 0; // Return value if equal... or numbytes is 0
+  size_t offset = 0;
+
+  while(numbytes)
+  {
+    if(numbytes < 2) // 1 byte
+    {
+      returnval = memcmp_eq(str1, str2, numbytes);
+      if(returnval)
+      {
+        return returnval;
+      }
+      offset = numbytes & -1;
+      str1 = (char *)str1 + offset;
+      str2 = (char *)str2 + offset;
+      numbytes = 0;
+    }
+    else if(numbytes < 4) // 2 bytes
+    {
+      returnval = memcmp_16bit_eq(str1, str2, numbytes >> 1);
+      if(returnval)
+      {
+        return returnval;
+      }
+      offset = numbytes & -2;
+      str1 = (char *)str1 + offset;
+      str2 = (char *)str2 + offset;
+      numbytes &= 1;
+    }
+    else if(numbytes < 8) // 4 bytes
+    {
+      returnval = memcmp_32bit_eq(str1, str2, numbytes >> 2);
+      if(returnval)
+      {
+        return returnval;
+      }
+      offset = numbytes & -4;
+      str1 = (char *)str1 + offset;
+      str2 = (char *)str2 + offset;
+      numbytes &= 3;
+    }
+    else if(numbytes < 16) // 8 bytes
+    {
+      returnval = memcmp_64bit_eq(str1, str2, numbytes >> 3);
+      if(returnval)
+      {
+        return returnval;
+      }
+      offset = numbytes & -8;
+      str1 = (char *)str1 + offset;
+      str2 = (char *)str2 + offset;
+      numbytes &= 7;
+    }
+#ifdef __AVX512F__
+    else if(numbytes < 32) // 16 bytes
+    {
+      returnval = memcmp_128bit_eq_u(str1, str2, numbytes >> 4);
+      if(returnval)
+      {
+        return returnval;
+      }
+      offset = numbytes & -16;
+      str1 = (char *)str1 + offset;
+      str2 = (char *)str2 + offset;
+      numbytes &= 15;
+    }
+    else if(numbytes < 64) // 32 bytes
+    {
+      returnval = memcmp_256bit_eq_u(str1, str2, numbytes >> 5);
+      if(returnval)
+      {
+        return returnval;
+      }
+      offset = numbytes & -32;
+      str1 = (char *)str1 + offset;
+      str2 = (char *)str2 + offset;
+      numbytes &= 31;
+    }
+    else // 64 bytes
+    {
+      returnval = memcmp_512bit_eq_u(str1, str2, numbytes >> 6);
+      if(returnval)
+      {
+        return returnval;
+      }
+      offset = numbytes & -64;
+      str1 = (char *)str1 + offset;
+      str2 = (char *)str2 + offset;
+      numbytes &= 63;
+    }
+#elif __AVX2__
+    else if(numbytes < 32) // 16 bytes
+    {
+      returnval = memcmp_128bit_eq_u(str1, str2, numbytes >> 4);
+      if(returnval)
+      {
+        return returnval;
+      }
+      offset = numbytes & -16;
+      str1 = (char *)str1 + offset;
+      str2 = (char *)str2 + offset;
+      numbytes &= 15;
+    }
+    else // 32 bytes
+    {
+      returnval = memcmp_256bit_eq_u(str1, str2, numbytes >> 5);
+      if(returnval)
+      {
+        return returnval;
+      }
+      offset = numbytes & -32;
+      str1 = (char *)str1 + offset;
+      str2 = (char *)str2 + offset;
+      numbytes &= 31;
+    }
+#else // SSE4.2 only
+    else // 16 bytes
+    {
+      returnval = memcmp_128bit_eq_u(str1, str2, numbytes >> 4);
+      if(returnval)
+      {
+        return returnval;
+      }
+      offset = numbytes & -16;
+      str1 = (char *)str1 + offset;
+      str2 = (char *)str2 + offset;
+      numbytes &= 15;
+    }
+#endif
+  }
+  return returnval;
+}
+
+//-----------------------------------------------------------------------------
+// Dispatch Functions (Aligned):
+//-----------------------------------------------------------------------------
+
+// memcmp for large chunks of memory with arbitrary sizes (aligned)
+int memcmp_large_a(const void *str1, const void *str2, size_t numbytes) // Worst-case scenario: 127 bytes.
+{
+  int returnval = 0; // Return value if equal... or numbytes is 0
+  size_t offset = 0;
+
+  while(numbytes)
+    // This loop will, at most, get evaluated 7 times, ending sooner each time.
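+    // (e.g. with AVX-512 enabled, numbytes = 127 dispatches 64 + 32 + 16 + 8 + 4 + 2 + 1 bytes.)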
+ // At minimum non-trivial case, once. Each memcmp has its own loop. + { + if(numbytes < 2) // 1 byte + { + returnval = memcmp(str1, str2, numbytes); + if(returnval) + { + return returnval; + } + offset = numbytes & -1; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes = 0; + } + else if(numbytes < 4) // 2 bytes + { + returnval = memcmp_16bit(str1, str2, numbytes >> 1); + if(returnval) + { + return returnval; + } + offset = numbytes & -2; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 1; + } + else if(numbytes < 8) // 4 bytes + { + returnval = memcmp_32bit(str1, str2, numbytes >> 2); + if(returnval) + { + return returnval; + } + offset = numbytes & -4; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 3; + } + else if(numbytes < 16) // 8 bytes + { + returnval = memcmp_64bit(str1, str2, numbytes >> 3); + if(returnval) + { + return returnval; + } + offset = numbytes & -8; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 7; + } +#ifdef __AVX512F__ + else if(numbytes < 32) // 16 bytes + { + returnval = memcmp_128bit_a(str1, str2, numbytes >> 4); + if(returnval) + { + return returnval; + } + offset = numbytes & -16; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + returnval = memcmp_256bit_a(str1, str2, numbytes >> 5); + if(returnval) + { + return returnval; + } + offset = numbytes & -32; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 31; + } + else // 64 bytes + { + returnval = memcmp_512bit_a(str1, str2, numbytes >> 6); + if(returnval) + { + return returnval; + } + offset = numbytes & -64; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 63; + } +#elif __AVX2__ + else if(numbytes < 32) // 16 bytes + { + returnval = memcmp_128bit_a(str1, str2, numbytes >> 4); + if(returnval) + { + return returnval; + } + offset = numbytes & -16; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 15; + } + else // 32 bytes + { + returnval = memcmp_256bit_a(str1, str2, numbytes >> 5); + if(returnval) + { + return returnval; + } + offset = numbytes & -32; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 31; + } +#else // SSE4.2 only + else // 16 bytes + { + returnval = memcmp_128bit_a(str1, str2, numbytes >> 4); + if(returnval) + { + return returnval; + } + offset = numbytes & -16; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 15; + } +#endif + } + return returnval; +} + +// Equality-only version (aligned) +int memcmp_large_eq_a(const void *str1, const void *str2, size_t numbytes) // Worst-case scenario: 127 bytes. +{ + int returnval = 0; // Return value if equal... 
or numbytes is 0 + size_t offset = 0; + + while(numbytes) + { + if(numbytes < 2) // 1 byte + { + returnval = memcmp_eq(str1, str2, numbytes); + if(returnval) + { + return returnval; + } + offset = numbytes & -1; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes = 0; + } + else if(numbytes < 4) // 2 bytes + { + returnval = memcmp_16bit_eq(str1, str2, numbytes >> 1); + if(returnval) + { + return returnval; + } + offset = numbytes & -2; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 1; + } + else if(numbytes < 8) // 4 bytes + { + returnval = memcmp_32bit_eq(str1, str2, numbytes >> 2); + if(returnval) + { + return returnval; + } + offset = numbytes & -4; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 3; + } + else if(numbytes < 16) // 8 bytes + { + returnval = memcmp_64bit_eq(str1, str2, numbytes >> 3); + if(returnval) + { + return returnval; + } + offset = numbytes & -8; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 7; + } +#ifdef __AVX512F__ + else if(numbytes < 32) // 16 bytes + { + returnval = memcmp_128bit_eq_a(str1, str2, numbytes >> 4); + if(returnval) + { + return returnval; + } + offset = numbytes & -16; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + returnval = memcmp_256bit_eq_a(str1, str2, numbytes >> 5); + if(returnval) + { + return returnval; + } + offset = numbytes & -32; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 31; + } + else // 64 bytes + { + returnval = memcmp_512bit_eq_a(str1, str2, numbytes >> 6); + if(returnval) + { + return returnval; + } + offset = numbytes & -64; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 63; + } +#elif __AVX2__ + else if(numbytes < 32) // 16 bytes + { + returnval = memcmp_128bit_eq_a(str1, str2, numbytes >> 4); + if(returnval) + { + return returnval; + } + offset = numbytes & -16; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 15; + } + else // 32 bytes + { + returnval = memcmp_256bit_eq_a(str1, str2, numbytes >> 5); + if(returnval) + { + return returnval; + } + offset = numbytes & -32; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 31; + } +#else // SSE4.2 only + else // 16 bytes + { + returnval = memcmp_128bit_eq_a(str1, str2, numbytes >> 4); + if(returnval) + { + return returnval; + } + offset = numbytes & -16; + str1 = (char *)str1 + offset; + str2 = (char *)str2 + offset; + numbytes &= 15; + } +#endif + } + return returnval; +} + +//----------------------------------------------------------------------------- +// Main Function: +//----------------------------------------------------------------------------- + +// Main memcmp function +int AVX_memcmp(const void *str1, const void *str2, size_t numbytes, int equality) +{ + int returnval = 0; + + if( + ( ((uintptr_t)str1 & BYTE_ALIGNMENT) == 0 ) + && + ( ((uintptr_t)str2 & BYTE_ALIGNMENT) == 0 ) + ) // Check alignment + { + // See memmove.c for why it's worth doing special aligned versions of memcmp, which + // is a function that involves 2 loads. 
+    if(equality == 0)
+    {
+      returnval = memcmp_large_eq_a(str1, str2, numbytes);
+    }
+    else
+    {
+      returnval = memcmp_large_a(str1, str2, numbytes);
+    }
+  }
+  else
+  {
+    if(equality == 0)
+    {
+      returnval = memcmp_large_eq(str1, str2, numbytes);
+    }
+    else
+    {
+      returnval = memcmp_large(str1, str2, numbytes);
+    }
+  }
+
+  return returnval;
+}
+
+// AVX-1024+ support pending existence of the standard.
\ No newline at end of file
diff --git a/kernel/memory/memcpy.c b/kernel/memory/memcpy.c
new file mode 100644
index 0000000..7082c54
--- /dev/null
+++ b/kernel/memory/memcpy.c
@@ -0,0 +1,2143 @@
+
+#include <stddef.h>    // size_t
+#include <stdint.h>    // uint16_t, uint32_t, uint64_t
+#include <immintrin.h> // SSE/AVX intrinsics
+//
+// This file provides a highly optimized version of memcpy.
+// Overlapping memory regions are not supported by default: use memmove instead.
+//
+// NOTE: The discussion about microarchitecture in the memmove file applies to
+// this memcpy, as well.
+//
+// ...If for some reason you absolutely, desperately need to use AVX_memcpy
+// instead of AVX_memmove, and you need to use it on overlapping areas, enable
+// the below definition. It will check for overlap and automatically redirect to
+// AVX_memmove if overlap is found.
+// #define OVERLAP_CHECK
+//
+
+#ifdef __clang__
+#define __m128i_u __m128i
+#define __m256i_u __m256i
+#define __m512i_u __m512i
+#endif
+
+#ifdef __AVX512F__
+#define BYTE_ALIGNMENT 0x3F // For 64-byte alignment
+#elif __AVX__
+#define BYTE_ALIGNMENT 0x1F // For 32-byte alignment
+#else
+#define BYTE_ALIGNMENT 0x0F // For 16-byte alignment
+#endif
+
+//
+// USAGE INFORMATION:
+//
+// The "len" argument is "# of x bytes to copy," e.g. memcpy_512bit_u/a needs
+// to know "how many multiples of 512 bits (64 bytes) to copy." The functions
+// with byte sizes larger than their bit/8 sizes follow the same pattern:
+// memcpy_512bit_512B_u/a needs to know how many multiples of 512 bytes to copy.
+//
+// The "numbytes" argument in AVX_memcpy and memcpy_large is just the total
+// number of bytes to copy.
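+//
+// For example (dst and src are hypothetical 4096-byte buffers, AVX-512 build):
+//
+//   memcpy_512bit_512B_u(dst, src, 8); // 8 multiples of 512 bytes = 4096 bytes
+//   AVX_memcpy(dst, src, 4096);        // same copy; the dispatcher picks the width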
+// + +//----------------------------------------------------------------------------- +// Individual Functions: +//----------------------------------------------------------------------------- + +// 16-bit (2 bytes at a time) +// Len is (# of total bytes/2), so it's "# of 16-bits" + +void * memcpy_16bit(void *dest, const void *src, size_t len) +{ + const uint16_t* s = (uint16_t*)src; + uint16_t* d = (uint16_t*)dest; + + while (len--) + { + *d++ = *s++; + } + + return dest; +} + +// 32-bit (4 bytes at a time - 1 pixel in a 32-bit linear frame buffer) +// Len is (# of total bytes/4), so it's "# of 32-bits" + +void * memcpy_32bit(void *dest, const void *src, size_t len) +{ + const uint32_t* s = (uint32_t*)src; + uint32_t* d = (uint32_t*)dest; + + while (len--) + { + *d++ = *s++; + } + + return dest; +} + +// 64-bit (8 bytes at a time - 2 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/8), so it's "# of 64-bits" + +void * memcpy_64bit(void *dest, const void *src, size_t len) +{ + const uint64_t* s = (uint64_t*)src; + uint64_t* d = (uint64_t*)dest; + + while (len--) + { + *d++ = *s++; + } + + return dest; +} + +//----------------------------------------------------------------------------- +// SSE2 Unaligned: +//----------------------------------------------------------------------------- + +// SSE2 (128-bit, 16 bytes at a time - 4 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/16), so it's "# of 128-bits" + +void * memcpy_128bit_u(void *dest, const void *src, size_t len) +{ + const __m128i_u* s = (__m128i_u*)src; + __m128i_u* d = (__m128i_u*)dest; + + while (len--) + { + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); + } + + return dest; +} + +// 32 bytes at a time +void * memcpy_128bit_32B_u(void *dest, const void *src, size_t len) +{ + const __m128i_u* s = (__m128i_u*)src; + __m128i_u* d = (__m128i_u*)dest; + + while (len--) + { + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 1 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 2 + } + + return dest; +} + +// 64 bytes at a time +void * memcpy_128bit_64B_u(void *dest, const void *src, size_t len) +{ + const __m128i_u* s = (__m128i_u*)src; + __m128i_u* d = (__m128i_u*)dest; + + while (len--) + { + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 1 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 2 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 3 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 4 + } + + return dest; +} + +// 128 bytes at a time +void * memcpy_128bit_128B_u(void *dest, const void *src, size_t len) +{ + const __m128i_u* s = (__m128i_u*)src; + __m128i_u* d = (__m128i_u*)dest; + + while (len--) + { + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 1 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 2 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 3 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 4 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 5 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 6 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 7 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 8 + } + + return dest; +} + +// 256 bytes +void * memcpy_128bit_256B_u(void *dest, const void *src, size_t len) +{ + const __m128i_u* s = (__m128i_u*)src; + __m128i_u* d = (__m128i_u*)dest; + + while (len--) + { + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 1 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 2 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 3 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 4 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 5 + 
_mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 6 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 7 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 8 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 9 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 10 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 11 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 12 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 13 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 14 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 15 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 16 + } + + return dest; +} + +//----------------------------------------------------------------------------- +// AVX+ Unaligned: +//----------------------------------------------------------------------------- + +// AVX (256-bit, 32 bytes at a time - 8 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/32), so it's "# of 256-bits" +// Sandybridge and Ryzen and up, Haswell and up for better performance + +#ifdef __AVX__ +void * memcpy_256bit_u(void *dest, const void *src, size_t len) +{ + const __m256i_u* s = (__m256i_u*)src; + __m256i_u* d = (__m256i_u*)dest; + + while (len--) + { + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); + } + + return dest; +} + +// 64 bytes at a time +void * memcpy_256bit_64B_u(void *dest, const void *src, size_t len) +{ + const __m256i_u* s = (__m256i_u*)src; + __m256i_u* d = (__m256i_u*)dest; + + while (len--) + { + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 1 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 2 + } + + return dest; +} + +// 128 bytes at a time +void * memcpy_256bit_128B_u(void *dest, const void *src, size_t len) +{ + const __m256i_u* s = (__m256i_u*)src; + __m256i_u* d = (__m256i_u*)dest; + + while (len--) + { + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 1 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 2 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 3 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 4 + } + + return dest; +} + +// 256 bytes at a time +void * memcpy_256bit_256B_u(void *dest, const void *src, size_t len) +{ + const __m256i_u* s = (__m256i_u*)src; + __m256i_u* d = (__m256i_u*)dest; + + while (len--) + { + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 1 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 2 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 3 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 4 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 5 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 6 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 7 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 8 + } + + return dest; +} + +// 512 bytes at a time, one load->store for every ymm register (there are 16) +void * memcpy_256bit_512B_u(void *dest, const void *src, size_t len) +{ + const __m256i_u* s = (__m256i_u*)src; + __m256i_u* d = (__m256i_u*)dest; + + while (len--) + { + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 1 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 2 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 3 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 4 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 5 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 6 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 7 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 8 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 9 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); 
// 10 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 11 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 12 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 13 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 14 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 15 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 16 + } + + return dest; +} +#endif + +// AVX-512 (512-bit, 64 bytes at a time - 16 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/64), so it's "# of 512-bits" +// Requires AVX512F + +#ifdef __AVX512F__ +void * memcpy_512bit_u(void *dest, const void *src, size_t len) +{ + const __m512i_u* s = (__m512i_u*)src; + __m512i_u* d = (__m512i_u*)dest; + + while (len--) + { + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); + } + + return dest; +} + +// 128 bytes at a time +void * memcpy_512bit_128B_u(void *dest, const void *src, size_t len) +{ + const __m512i_u* s = (__m512i_u*)src; + __m512i_u* d = (__m512i_u*)dest; + + while (len--) + { + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 1 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 2 + } + + return dest; +} + +// 256 bytes at a time +void * memcpy_512bit_256B_u(void *dest, const void *src, size_t len) +{ + const __m512i_u* s = (__m512i_u*)src; + __m512i_u* d = (__m512i_u*)dest; + + while (len--) + { + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 1 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 2 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 3 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 4 + } + + return dest; +} + +// 512 bytes (half a KB!!) at a time +void * memcpy_512bit_512B_u(void *dest, const void *src, size_t len) +{ + const __m512i_u* s = (__m512i_u*)src; + __m512i_u* d = (__m512i_u*)dest; + + while (len--) + { + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 1 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 2 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 3 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 4 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 5 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 6 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 7 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 8 + } + + return dest; +} + +// 1024 bytes, or 1 kB +void * memcpy_512bit_1kB_u(void *dest, const void *src, size_t len) +{ + const __m512i_u* s = (__m512i_u*)src; + __m512i_u* d = (__m512i_u*)dest; + + while (len--) + { + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 1 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 2 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 3 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 4 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 5 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 6 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 7 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 8 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 9 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 10 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 11 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 12 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 13 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 14 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 15 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 16 + } + + return dest; +} + +// 2048 bytes, or 2 kB +void * memcpy_512bit_2kB_u(void *dest, const void *src, size_t 
len) +{ + const __m512i_u* s = (__m512i_u*)src; + __m512i_u* d = (__m512i_u*)dest; + + while (len--) + { + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 1 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 2 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 3 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 4 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 5 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 6 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 7 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 8 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 9 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 10 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 11 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 12 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 13 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 14 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 15 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 16 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 17 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 18 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 19 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 20 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 21 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 22 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 23 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 24 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 25 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 26 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 27 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 28 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 29 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 30 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 31 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 32 + } + + return dest; +} + +// 4096 bytes, or 4 kB +void * memcpy_512bit_4kB_u(void *dest, const void *src, size_t len) +{ + const __m512i_u* s = (__m512i_u*)src; + __m512i_u* d = (__m512i_u*)dest; + + while (len--) + { + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 1 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 2 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 3 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 4 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 5 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 6 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 7 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 8 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 9 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 10 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 11 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 12 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 13 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 14 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 15 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 16 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 17 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 18 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 19 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 20 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 21 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 22 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 23 + 
_mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 24
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 25
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 26
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 27
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 28
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 29
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 30
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 31
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 32
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 33
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 34
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 35
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 36
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 37
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 38
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 39
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 40
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 41
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 42
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 43
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 44
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 45
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 46
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 47
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 48
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 49
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 50
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 51
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 52
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 53
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 54
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 55
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 56
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 57
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 58
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 59
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 60
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 61
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 62
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 63
+    _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 64
+  }
+
+  return dest;
+}
+
+#endif
+
+// AVX-1024 support pending existence of the standard. It would be able to fit
+// an entire 4 kB page in its registers at one time. Imagine that!
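+
+// Note: the "_a" (aligned) variants below require both pointers to be aligned
+// to the register width in use. A caller can check for this the same way
+// AVX_memcmp gates its aligned paths in memcmp.c, e.g.:
+//
+//   if( (((uintptr_t)dest | (uintptr_t)src) & BYTE_ALIGNMENT) == 0 )
+//   {
+//     // Both pointers are sufficiently aligned for the _a variants.
+//   }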
+ +//----------------------------------------------------------------------------- +// SSE2 Aligned: +//----------------------------------------------------------------------------- + +// SSE2 (128-bit, 16 bytes at a time - 4 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/16), so it's "# of 128-bits" + +void * memcpy_128bit_a(void *dest, const void *src, size_t len) +{ + const __m128i* s = (__m128i*)src; + __m128i* d = (__m128i*)dest; + + while (len--) + { + _mm_store_si128(d++, _mm_load_si128(s++)); + } + + return dest; +} + +// 32 bytes at a time +void * memcpy_128bit_32B_a(void *dest, const void *src, size_t len) +{ + const __m128i* s = (__m128i*)src; + __m128i* d = (__m128i*)dest; + + while (len--) + { + _mm_store_si128(d++, _mm_load_si128(s++)); // 1 + _mm_store_si128(d++, _mm_load_si128(s++)); // 2 + } + + return dest; +} + +// 64 bytes at a time +void * memcpy_128bit_64B_a(void *dest, const void *src, size_t len) +{ + const __m128i* s = (__m128i*)src; + __m128i* d = (__m128i*)dest; + + while (len--) + { + _mm_store_si128(d++, _mm_load_si128(s++)); // 1 + _mm_store_si128(d++, _mm_load_si128(s++)); // 2 + _mm_store_si128(d++, _mm_load_si128(s++)); // 3 + _mm_store_si128(d++, _mm_load_si128(s++)); // 4 + } + + return dest; +} + +// 128 bytes at a time +void * memcpy_128bit_128B_a(void *dest, const void *src, size_t len) +{ + const __m128i* s = (__m128i*)src; + __m128i* d = (__m128i*)dest; + + while (len--) + { + _mm_store_si128(d++, _mm_load_si128(s++)); // 1 + _mm_store_si128(d++, _mm_load_si128(s++)); // 2 + _mm_store_si128(d++, _mm_load_si128(s++)); // 3 + _mm_store_si128(d++, _mm_load_si128(s++)); // 4 + _mm_store_si128(d++, _mm_load_si128(s++)); // 5 + _mm_store_si128(d++, _mm_load_si128(s++)); // 6 + _mm_store_si128(d++, _mm_load_si128(s++)); // 7 + _mm_store_si128(d++, _mm_load_si128(s++)); // 8 + } + + return dest; +} + +// 256 bytes +void * memcpy_128bit_256B_a(void *dest, const void *src, size_t len) +{ + const __m128i* s = (__m128i*)src; + __m128i* d = (__m128i*)dest; + + while (len--) + { + _mm_store_si128(d++, _mm_load_si128(s++)); // 1 + _mm_store_si128(d++, _mm_load_si128(s++)); // 2 + _mm_store_si128(d++, _mm_load_si128(s++)); // 3 + _mm_store_si128(d++, _mm_load_si128(s++)); // 4 + _mm_store_si128(d++, _mm_load_si128(s++)); // 5 + _mm_store_si128(d++, _mm_load_si128(s++)); // 6 + _mm_store_si128(d++, _mm_load_si128(s++)); // 7 + _mm_store_si128(d++, _mm_load_si128(s++)); // 8 + _mm_store_si128(d++, _mm_load_si128(s++)); // 9 + _mm_store_si128(d++, _mm_load_si128(s++)); // 10 + _mm_store_si128(d++, _mm_load_si128(s++)); // 11 + _mm_store_si128(d++, _mm_load_si128(s++)); // 12 + _mm_store_si128(d++, _mm_load_si128(s++)); // 13 + _mm_store_si128(d++, _mm_load_si128(s++)); // 14 + _mm_store_si128(d++, _mm_load_si128(s++)); // 15 + _mm_store_si128(d++, _mm_load_si128(s++)); // 16 + } + + return dest; +} + +//----------------------------------------------------------------------------- +// AVX+ Aligned: +//----------------------------------------------------------------------------- + +// AVX (256-bit, 32 bytes at a time - 8 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/32), so it's "# of 256-bits" +// Sandybridge and Ryzen and up + +#ifdef __AVX__ +void * memcpy_256bit_a(void *dest, const void *src, size_t len) +{ + const __m256i* s = (__m256i*)src; + __m256i* d = (__m256i*)dest; + + while (len--) + { + _mm256_store_si256(d++, _mm256_load_si256(s++)); + } + + return dest; +} + +// 64 bytes at a time +void * 
memcpy_256bit_64B_a(void *dest, const void *src, size_t len) +{ + const __m256i* s = (__m256i*)src; + __m256i* d = (__m256i*)dest; + + while (len--) + { + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 1 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 2 + } + + return dest; +} + +// 128 bytes at a time +void * memcpy_256bit_128B_a(void *dest, const void *src, size_t len) +{ + const __m256i* s = (__m256i*)src; + __m256i* d = (__m256i*)dest; + + while (len--) + { + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 1 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 2 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 3 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 4 + } + + return dest; +} + +// 256 bytes at a time +void * memcpy_256bit_256B_a(void *dest, const void *src, size_t len) +{ + const __m256i* s = (__m256i*)src; + __m256i* d = (__m256i*)dest; + + while (len--) + { + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 1 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 2 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 3 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 4 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 5 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 6 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 7 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 8 + } + + return dest; +} + +// 512 bytes +void * memcpy_256bit_512B_a(void *dest, const void *src, size_t len) +{ + const __m256i* s = (__m256i*)src; + __m256i* d = (__m256i*)dest; + + while (len--) + { + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 1 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 2 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 3 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 4 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 5 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 6 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 7 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 8 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 9 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 10 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 11 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 12 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 13 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 14 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 15 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 16 + } + + return dest; +} + +#endif + +// AVX-512 (512-bit, 64 bytes at a time - 16 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/64), so it's "# of 512-bits" +// Requires AVX512F + +#ifdef __AVX512F__ +void * memcpy_512bit_a(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + while (len--) + { + _mm512_store_si512(d++, _mm512_load_si512(s++)); + } + + return dest; +} + +// 128 bytes at a time +void * memcpy_512bit_128B_a(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + while (len--) + { + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2 + } + + return dest; +} + +// 256 bytes at a time +void * memcpy_512bit_256B_a(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + while (len--) + { + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2 + 
_mm512_store_si512(d++, _mm512_load_si512(s++)); // 3 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 4 + } + + return dest; +} + +// 512 bytes (half a KB!!) at a time +void * memcpy_512bit_512B_a(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + while (len--) + { + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 3 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 4 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 5 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 6 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 7 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 8 + } + + return dest; +} + +// 1024 bytes, or 1 kB +void * memcpy_512bit_1kB_a(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + while (len--) + { + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 3 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 4 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 5 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 6 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 7 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 8 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 9 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 10 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 11 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 12 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 13 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 14 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 15 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 16 + } + + return dest; +} + +// 2048 bytes, or 2 kB +void * memcpy_512bit_2kB_a(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + while (len--) + { + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 3 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 4 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 5 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 6 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 7 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 8 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 9 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 10 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 11 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 12 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 13 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 14 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 15 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 16 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 17 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 18 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 19 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 20 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 21 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 22 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 23 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 24 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 25 + _mm512_store_si512(d++, 
_mm512_load_si512(s++)); // 26 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 27 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 28 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 29 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 30 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 31 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 32 + } + + return dest; +} + +// 4096 bytes, or 4 kB +void * memcpy_512bit_4kB_a(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + while (len--) + { + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 3 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 4 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 5 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 6 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 7 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 8 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 9 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 10 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 11 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 12 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 13 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 14 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 15 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 16 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 17 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 18 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 19 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 20 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 21 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 22 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 23 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 24 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 25 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 26 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 27 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 28 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 29 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 30 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 31 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 32 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 3 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 4 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 5 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 6 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 7 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 8 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 9 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 10 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 11 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 12 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 13 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 14 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 15 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 16 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 17 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 18 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 19 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 20 + 
_mm512_store_si512(d++, _mm512_load_si512(s++)); // 21 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 22 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 23 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 24 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 25 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 26 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 27 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 28 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 29 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 30 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 31 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 32 + } + + return dest; +} + +#endif + +//----------------------------------------------------------------------------- +// SSE4.1 Streaming: +//----------------------------------------------------------------------------- + +// SSE4.1 (128-bit, 16 bytes at a time - 4 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/16), so it's "# of 128-bits" + +void * memcpy_128bit_as(void *dest, const void *src, size_t len) +{ + __m128i* s = (__m128i*)src; + __m128i* d = (__m128i*)dest; + + while (len--) + { + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); + } + _mm_sfence(); + + return dest; +} + +// 32 bytes at a time +void * memcpy_128bit_32B_as(void *dest, const void *src, size_t len) +{ + __m128i* s = (__m128i*)src; + __m128i* d = (__m128i*)dest; + + while (len--) + { + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 1 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 2 + } + _mm_sfence(); + + return dest; +} + +// 64 bytes at a time +void * memcpy_128bit_64B_as(void *dest, const void *src, size_t len) +{ + __m128i* s = (__m128i*)src; + __m128i* d = (__m128i*)dest; + + while (len--) + { + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 1 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 2 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 3 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 4 + } + _mm_sfence(); + + return dest; +} + +// 128 bytes at a time +void * memcpy_128bit_128B_as(void *dest, const void *src, size_t len) +{ + __m128i* s = (__m128i*)src; + __m128i* d = (__m128i*)dest; + + while (len--) + { + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 1 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 2 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 3 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 4 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 5 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 6 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 7 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 8 + } + _mm_sfence(); + + return dest; +} + +// 256 bytes +void * memcpy_128bit_256B_as(void *dest, const void *src, size_t len) +{ + __m128i* s = (__m128i*)src; + __m128i* d = (__m128i*)dest; + + while (len--) + { + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 1 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 2 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 3 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 4 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 5 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 6 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 7 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 8 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 9 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 10 + _mm_stream_si128(d++, 
_mm_stream_load_si128(s++)); // 11 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 12 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 13 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 14 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 15 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 16 + } + _mm_sfence(); + + return dest; +} + +//----------------------------------------------------------------------------- +// AVX2+ Streaming: +//----------------------------------------------------------------------------- + +// AVX2 (256-bit, 32 bytes at a time - 8 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/32), so it's "# of 256-bits" +// Haswell and Ryzen and up + +#ifdef __AVX2__ +void * memcpy_256bit_as(void *dest, const void *src, size_t len) +{ + const __m256i* s = (__m256i*)src; + __m256i* d = (__m256i*)dest; + + while (len--) + { + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); + } + _mm_sfence(); + + return dest; +} + +// 64 bytes at a time +void * memcpy_256bit_64B_as(void *dest, const void *src, size_t len) +{ + const __m256i* s = (__m256i*)src; + __m256i* d = (__m256i*)dest; + + while (len--) + { + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 1 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 2 + } + _mm_sfence(); + + return dest; +} + +// 128 bytes at a time +void * memcpy_256bit_128B_as(void *dest, const void *src, size_t len) +{ + const __m256i* s = (__m256i*)src; + __m256i* d = (__m256i*)dest; + + while (len--) + { + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 1 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 2 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 3 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 4 + } + _mm_sfence(); + + return dest; +} + +// 256 bytes at a time +void * memcpy_256bit_256B_as(void *dest, const void *src, size_t len) +{ + const __m256i* s = (__m256i*)src; + __m256i* d = (__m256i*)dest; + + while (len--) + { + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 1 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 2 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 3 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 4 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 5 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 6 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 7 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 8 + } + _mm_sfence(); + + return dest; +} + +// 512 bytes +void * memcpy_256bit_512B_as(void *dest, const void *src, size_t len) +{ + const __m256i* s = (__m256i*)src; + __m256i* d = (__m256i*)dest; + + while (len--) + { + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 1 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 2 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 3 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 4 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 5 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 6 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 7 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 8 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 9 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 10 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 11 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 12 + 
_mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 13 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 14 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 15 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 16 + } + _mm_sfence(); + + return dest; +} + +#endif + +// AVX-512 (512-bit, 64 bytes at a time - 16 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/64), so it's "# of 512-bits" +// Requires AVX512F + +#ifdef __AVX512F__ +void * memcpy_512bit_as(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + while (len--) + { + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); + } + _mm_sfence(); + + return dest; +} + +// 128 bytes at a time +void * memcpy_512bit_128B_as(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + while (len--) + { + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2 + } + _mm_sfence(); + + return dest; +} + +// 256 bytes at a time +void * memcpy_512bit_256B_as(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + while (len--) + { + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 3 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 4 + } + _mm_sfence(); + + return dest; +} + +// 512 bytes (half a KB!!) at a time +void * memcpy_512bit_512B_as(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + while (len--) + { + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 3 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 4 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 5 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 6 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 7 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 8 + } + _mm_sfence(); + + return dest; +} + +// 1024 bytes, or 1 kB +void * memcpy_512bit_1kB_as(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + while (len--) + { + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 3 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 4 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 5 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 6 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 7 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 8 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 9 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 10 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 11 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 12 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 13 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 14 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 15 + _mm512_stream_si512(d++, 
_mm512_stream_load_si512(s++)); // 16 + } + _mm_sfence(); + + return dest; +} + +// 2048 bytes, or 2 kB +void * memcpy_512bit_2kB_as(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + while (len--) + { + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 3 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 4 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 5 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 6 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 7 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 8 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 9 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 10 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 11 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 12 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 13 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 14 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 15 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 16 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 17 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 18 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 19 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 20 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 21 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 22 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 23 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 24 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 25 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 26 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 27 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 28 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 29 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 30 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 31 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 32 + } + _mm_sfence(); + + return dest; +} + +// 4096 bytes, or 4 kB +void * memcpy_512bit_4kB_as(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + while (len--) + { + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 3 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 4 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 5 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 6 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 7 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 8 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 9 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 10 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 11 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 12 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 13 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 14 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 15 + 
_mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 16 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 17 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 18 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 19 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 20 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 21 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 22 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 23 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 24 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 25 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 26 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 27 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 28 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 29 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 30 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 31 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 32 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 3 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 4 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 5 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 6 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 7 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 8 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 9 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 10 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 11 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 12 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 13 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 14 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 15 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 16 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 17 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 18 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 19 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 20 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 21 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 22 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 23 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 24 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 25 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 26 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 27 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 28 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 29 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 30 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 31 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 32 + } + _mm_sfence(); + + return dest; +} + +#endif + +//----------------------------------------------------------------------------- +// Dispatch Functions: +//----------------------------------------------------------------------------- + +// Copy arbitrarily large amounts of data between 2 non-overlapping regions +void * memcpy_large(void *dest, void *src, size_t numbytes) +{ 
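+  // Dispatch strategy: each pass through the loop picks the largest kernel
+  // whose block size fits in the remaining numbytes, copies as many whole
+  // blocks as it can, and leaves the remainder for the smaller cases below.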
+ void * returnval = dest; // memcpy is supposed to return the destination + size_t offset = 0; // Offset size needs to match the size of a pointer + + while(numbytes) + // The biggest sizes will go first for alignment. There's no benefit to using + // aligned loads over unaligned loads here, so all are unaligned. + // NOTE: Each memcpy has its own loop so that any one can be used individually. + { + if(numbytes < 2) // 1 byte + { + memcpy(dest, src, numbytes); + offset = numbytes & -1; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes = 0; + } + else if(numbytes < 4) // 2 bytes + { + memcpy_16bit(dest, src, numbytes >> 1); + offset = numbytes & -2; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 1; + } + else if(numbytes < 8) // 4 bytes + { + memcpy_32bit(dest, src, numbytes >> 2); + offset = numbytes & -4; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 3; + } + else if(numbytes < 16) // 8 bytes + { + memcpy_64bit(dest, src, numbytes >> 3); + offset = numbytes & -8; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 7; + } +#ifdef __AVX512F__ + else if(numbytes < 32) // 16 bytes + { + memcpy_128bit_u(dest, src, numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memcpy_256bit_u(dest, src, numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memcpy_512bit_u(dest, src, numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memcpy_512bit_128B_u(dest, src, numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memcpy_512bit_256B_u(dest, src, numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 255; + } + else if(numbytes < 1024) // 512 bytes + { + memcpy_512bit_512B_u(dest, src, numbytes >> 9); + offset = numbytes & -512; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 511; + } + else if(numbytes < 2048) // 1024 bytes (1 kB) + { + memcpy_512bit_1kB_u(dest, src, numbytes >> 10); + offset = numbytes & -1024; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 1023; + } + else if(numbytes < 4096) // 2048 bytes (2 kB) + { + memcpy_512bit_2kB_u(dest, src, numbytes >> 11); + offset = numbytes & -2048; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 2047; + } + else // 4096 bytes (4 kB) + { + memcpy_512bit_4kB_u(dest, src, numbytes >> 12); + offset = numbytes & -4096; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 4095; + } +#elif __AVX__ + else if(numbytes < 32) // 16 bytes + { + memcpy_128bit_u(dest, src, numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memcpy_256bit_u(dest, src, numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memcpy_256bit_64B_u(dest, src, numbytes >> 6); + offset = numbytes & -64; + dest = (char 
*)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 63;
+    }
+    else if(numbytes < 256) // 128 bytes
+    {
+      memcpy_256bit_128B_u(dest, src, numbytes >> 7);
+      offset = numbytes & -128;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 127;
+    }
+    else if(numbytes < 512) // 256 bytes
+    {
+      memcpy_256bit_256B_u(dest, src, numbytes >> 8);
+      offset = numbytes & -256;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 255;
+    }
+    else // 512 bytes
+    {
+      memcpy_256bit_512B_u(dest, src, numbytes >> 9);
+      offset = numbytes & -512;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 511;
+    }
+#else // SSE2 only
+    else if(numbytes < 32) // 16 bytes
+    {
+      memcpy_128bit_u(dest, src, numbytes >> 4);
+      offset = numbytes & -16;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 15;
+    }
+    else if(numbytes < 64) // 32 bytes
+    {
+      memcpy_128bit_32B_u(dest, src, numbytes >> 5);
+      offset = numbytes & -32;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 31;
+    }
+    else if(numbytes < 128) // 64 bytes
+    {
+      memcpy_128bit_64B_u(dest, src, numbytes >> 6);
+      offset = numbytes & -64;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 63;
+    }
+    else if(numbytes < 256) // 128 bytes
+    {
+      memcpy_128bit_128B_u(dest, src, numbytes >> 7);
+      offset = numbytes & -128;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 127;
+    }
+    else // 256 bytes
+    {
+      memcpy_128bit_256B_u(dest, src, numbytes >> 8);
+      offset = numbytes & -256;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 255;
+    }
+#endif
+  }
+
+  return returnval;
+} // END MEMCPY LARGE, UNALIGNED
+
+// Copy arbitrarily large amounts of data between 2 non-overlapping regions
+// Aligned version
+void * memcpy_large_a(void *dest, void *src, size_t numbytes)
+{
+  void * returnval = dest; // memcpy is supposed to return the destination
+  size_t offset = 0; // Offset size needs to match the size of a pointer
+
+  while(numbytes)
+  // The biggest sizes will go first for alignment. Since alignment is
+  // guaranteed here, the vector paths dispatch to the aligned (_a) kernels,
+  // which use aligned loads and stores.
+  // NOTE: Each memcpy has its own loop so that any one can be used individually.
+ { + if(numbytes < 2) // 1 byte + { + memcpy(dest, src, numbytes); + offset = numbytes & -1; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes = 0; + } + else if(numbytes < 4) // 2 bytes + { + memcpy_16bit(dest, src, numbytes >> 1); + offset = numbytes & -2; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 1; + } + else if(numbytes < 8) // 4 bytes + { + memcpy_32bit(dest, src, numbytes >> 2); + offset = numbytes & -4; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 3; + } + else if(numbytes < 16) // 8 bytes + { + memcpy_64bit(dest, src, numbytes >> 3); + offset = numbytes & -8; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 7; + } +#ifdef __AVX512F__ + else if(numbytes < 32) // 16 bytes + { + memcpy_128bit_a(dest, src, numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memcpy_256bit_a(dest, src, numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memcpy_512bit_a(dest, src, numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memcpy_512bit_128B_a(dest, src, numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memcpy_512bit_256B_a(dest, src, numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 255; + } + else if(numbytes < 1024) // 512 bytes + { + memcpy_512bit_512B_a(dest, src, numbytes >> 9); + offset = numbytes & -512; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 511; + } + else if(numbytes < 2048) // 1024 bytes (1 kB) + { + memcpy_512bit_1kB_a(dest, src, numbytes >> 10); + offset = numbytes & -1024; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 1023; + } + else if(numbytes < 4096) // 2048 bytes (2 kB) + { + memcpy_512bit_2kB_a(dest, src, numbytes >> 11); + offset = numbytes & -2048; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 2047; + } + else // 4096 bytes (4 kB) + { + memcpy_512bit_4kB_a(dest, src, numbytes >> 12); + offset = numbytes & -4096; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 4095; + } +#elif __AVX__ + else if(numbytes < 32) // 16 bytes + { + memcpy_128bit_a(dest, src, numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memcpy_256bit_a(dest, src, numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memcpy_256bit_64B_a(dest, src, numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memcpy_256bit_128B_a(dest, src, numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memcpy_256bit_256B_a(dest, src, numbytes >> 8); + offset = numbytes & -256; + dest 
= (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 255;
+    }
+    else // 512 bytes
+    {
+      memcpy_256bit_512B_a(dest, src, numbytes >> 9);
+      offset = numbytes & -512;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 511;
+    }
+#else // SSE2 only
+    else if(numbytes < 32) // 16 bytes
+    {
+      memcpy_128bit_a(dest, src, numbytes >> 4);
+      offset = numbytes & -16;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 15;
+    }
+    else if(numbytes < 64) // 32 bytes
+    {
+      memcpy_128bit_32B_a(dest, src, numbytes >> 5);
+      offset = numbytes & -32;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 31;
+    }
+    else if(numbytes < 128) // 64 bytes
+    {
+      memcpy_128bit_64B_a(dest, src, numbytes >> 6);
+      offset = numbytes & -64;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 63;
+    }
+    else if(numbytes < 256) // 128 bytes
+    {
+      memcpy_128bit_128B_a(dest, src, numbytes >> 7);
+      offset = numbytes & -128;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 127;
+    }
+    else // 256 bytes
+    {
+      memcpy_128bit_256B_a(dest, src, numbytes >> 8);
+      offset = numbytes & -256;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 255;
+    }
+#endif
+  }
+
+  return returnval;
+} // END MEMCPY LARGE, ALIGNED
+
+// Copy arbitrarily large amounts of data between 2 non-overlapping regions
+// Aligned, streaming version
+void * memcpy_large_as(void *dest, void *src, size_t numbytes)
+{
+  void * returnval = dest; // memcpy is supposed to return the destination
+  size_t offset = 0; // Offset size needs to match the size of a pointer
+
+  while(numbytes)
+  // The biggest sizes will go first for alignment. Since alignment is
+  // guaranteed here, the vector paths dispatch to the streaming (_as) kernels,
+  // whose non-temporal stores bypass the cache.
+  // NOTE: Each memcpy has its own loop so that any one can be used individually.
+ { + if(numbytes < 2) // 1 byte + { + memcpy(dest, src, numbytes); + offset = numbytes & -1; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes = 0; + } + else if(numbytes < 4) // 2 bytes + { + memcpy_16bit(dest, src, numbytes >> 1); + offset = numbytes & -2; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 1; + } + else if(numbytes < 8) // 4 bytes + { + memcpy_32bit(dest, src, numbytes >> 2); + offset = numbytes & -4; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 3; + } + else if(numbytes < 16) // 8 bytes + { + memcpy_64bit(dest, src, numbytes >> 3); + offset = numbytes & -8; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 7; + } +#ifdef __AVX512F__ + else if(numbytes < 32) // 16 bytes + { + memcpy_128bit_as(dest, src, numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memcpy_256bit_as(dest, src, numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memcpy_512bit_as(dest, src, numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memcpy_512bit_128B_as(dest, src, numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memcpy_512bit_256B_as(dest, src, numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 255; + } + else if(numbytes < 1024) // 512 bytes + { + memcpy_512bit_512B_as(dest, src, numbytes >> 9); + offset = numbytes & -512; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 511; + } + else if(numbytes < 2048) // 1024 bytes (1 kB) + { + memcpy_512bit_1kB_as(dest, src, numbytes >> 10); + offset = numbytes & -1024; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 1023; + } + else if(numbytes < 4096) // 2048 bytes (2 kB) + { + memcpy_512bit_2kB_as(dest, src, numbytes >> 11); + offset = numbytes & -2048; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 2047; + } + else // 4096 bytes (4 kB) + { + memcpy_512bit_4kB_as(dest, src, numbytes >> 12); + offset = numbytes & -4096; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 4095; + } +#elif __AVX2__ + else if(numbytes < 32) // 16 bytes + { + memcpy_128bit_as(dest, src, numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memcpy_256bit_as(dest, src, numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memcpy_256bit_64B_as(dest, src, numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memcpy_256bit_128B_as(dest, src, numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memcpy_256bit_256B_as(dest, src, numbytes >> 8); + offset = numbytes 
& -256; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 255; + } + else // 512 bytes + { + memcpy_256bit_512B_as(dest, src, numbytes >> 9); + offset = numbytes & -512; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 511; + } +#else // SSE4.1 only + else if(numbytes < 32) // 16 bytes + { + memcpy_128bit_as(dest, src, numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memcpy_128bit_32B_as(dest, src, numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memcpy_128bit_64B_as(dest, src, numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memcpy_128bit_128B_as(dest, src, numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 127; + } + else // 256 bytes + { + memcpy_128bit_256B_as(dest, src, numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 255; + } +#endif + } + + return returnval; +} // END MEMCPY LARGE, ALIGNED, STREAMING + +//----------------------------------------------------------------------------- +// Main Function: +//----------------------------------------------------------------------------- + +// General-purpose function to call +void* memcpyAVX(void *dest, void *src, size_t numbytes) +{ + void * returnval = dest; + + if((char*)src == (char*)dest) + { + // Lol. + return returnval; + } + +#ifdef OVERLAP_CHECK + // Overlap check + if( + ( + ( + (char*)dest > (char*)src + ) + && + ( + (char*)dest < ((char*)src + numbytes) + ) + ) + || + ( + ( + (char*)src > (char*)dest + ) + && + ( + (char*)src < ((char*)dest + numbytes) + ) + ) + ) // Why didn't you just use memmove directly??? + { + returnval = AVX_memmove(dest, src, numbytes); + return returnval; + } +#endif + + if( + ( ((uintptr_t)src & BYTE_ALIGNMENT) == 0 ) + && + ( ((uintptr_t)dest & BYTE_ALIGNMENT) == 0 ) + ) // Check alignment + { + // This is the fastest case: src and dest are both cache line aligned. + if(numbytes > CACHESIZE) + { + memcpy_large_as(dest, src, numbytes); + } + else + { + memcpy_large_a(dest, src, numbytes); // Even if numbytes is small this'll work + } + } + else // Unaligned + { + size_t numbytes_to_align = (BYTE_ALIGNMENT + 1) - ((uintptr_t)dest & BYTE_ALIGNMENT); + + if(numbytes > numbytes_to_align) + { + void * destoffset = (char*)dest + numbytes_to_align; + void * srcoffset = (char*)src + numbytes_to_align; + + // Get to an aligned position. + // This may be a little slower, but since it'll be mostly scalar operations + // alignment doesn't matter. Worst case it uses two vector functions, and + // this process only needs to be done once per call if dest is unaligned. + memcpy_large(dest, src, numbytes_to_align); + // Now this should be faster since stores are aligned. + memcpy_large(destoffset, srcoffset, numbytes - numbytes_to_align); // Can't use streaming due to potential src misalignment + // On Haswell and up, cross cache line loads have a negligible penalty. + // Thus this will be slower on Sandy & Ivy Bridge, though Ivy Bridge will + // fare a little better (~2x, maybe?). 
Ryzen should generally fall somewhere
+      // in between Sandy Bridge and Haswell/Skylake on that front.
+      // NOTE: These are just rough theoretical estimates.
+    }
+    else // Small size
+    {
+      memcpy_large(dest, src, numbytes);
+    }
+  }
+
+  return returnval;
+}
+
+// AVX-1024+ support pending existence of the standard.
\ No newline at end of file
diff --git a/kernel/memory/memmove.c b/kernel/memory/memmove.c
new file mode 100644
index 0000000..100e8ae
--- /dev/null
+++ b/kernel/memory/memmove.c
@@ -0,0 +1,4021 @@
+
+#include <stddef.h>
+#include <stdint.h>
+#include <immintrin.h>
+
+#ifdef __clang__
+#define __m128i_u __m128i
+#define __m256i_u __m256i
+#define __m512i_u __m512i
+#endif
+
+#ifdef __AVX512F__
+#define BYTE_ALIGNMENT 0x3F // For 64-byte alignment
+#elif __AVX__
+#define BYTE_ALIGNMENT 0x1F // For 32-byte alignment
+#else
+#define BYTE_ALIGNMENT 0x0F // For 16-byte alignment
+#endif
+
+//
+// USAGE INFORMATION:
+//
+// The "len" argument is "# of x bytes to move," e.g. memmove_512bit_u/a needs
+// to know "how many multiples of 512 bits (64 bytes) to move." All functions
+// with len follow the same pattern, e.g. memmove_512bit_512B_u/a needs to know
+// how many multiples of 512 bytes to move, so a len of 4 tells it to move 2 kB.
+//
+// The "numbytes" argument for functions that use it is just the total
+// number of bytes to move.
+//
+
+// Some microarchitectural information:
+//
+// Sources:
+// https://www.agner.org/optimize/
+// https://software.intel.com/en-us/articles/intel-sdm
+// http://blog.stuffedcow.net/2014/01/x86-memory-disambiguation/
+//
+// It looks as though Haswell and up can do 2 simultaneous aligned loads or 1
+// unaligned load in 1 cycle. Alignment means the data is at an address that is
+// a multiple of the cache line size, and the CPU most easily loads one cache
+// line at a time. All AVX-supporting CPUs have a 64-byte cacheline as of Q4 2018.
+// The bottleneck here is stores: only 1 store per cycle can be done (there is
+// only 1 store port despite 2 load ports). Unaligned loads/stores that cross
+// cache line boundaries typically incur relatively significant cycle penalties,
+// though Haswell and up fixed that specifically for unaligned loads.
+//
+// Unaligned loads on Haswell require both load ports, but, since there is only
+// one store port, the store port has to do double-duty for stores that cross
+// cache line boundaries. So stores should be contained within cache line sizes
+// for best performance. For memmove, this also means there's no point in doing
+// 2 separate aligned loads simultaneously if only one can be written at a time.
+//
+// BUT it turns out that's not the whole story. We can do 2 aligned loads to
+// ensure that no cycle is wasted. i.e. instead of this (comma = simultaneously):
+// load 1 -> store 1, load 2 -> store 2, load 3 -> store 3, load 4 -> store 4 etc.
+// we can do this with aligned AVX2 loads:
+// load 1, load 2 -> store 1, load 3, load 4 -> store 2, load 5, load 6 -> store 3, etc.
+// And this is just per core.
+//
+// For pure memmove, this provides no real improvement, but loops with many
+// iterations that require loading two values, doing math on them, and storing a
+// single result can see significant throughput gains. Sandy Bridge could perform
+// similarly, but in 2 cycles instead of Haswell's 1 and only for the fewer
+// 256-bit AVX calculations it had (Haswell can do any size, AVX2 or otherwise).
+//
+// Skylake-X, with AVX512, extends Haswell's behavior to include 512-bit values.
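+//
+// A quick illustration of the len convention (hypothetical call, shown here
+// for clarity): with memmove_512bit_512B_u below, each unit is 512 bytes, so
+// moving 2 kB worth of data is:
+//
+//   memmove_512bit_512B_u(dest, src, 2048 / 512); // len of 4 -> 2 kB moved
+//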
+// +// If an architecture ever adds 2 store ports, the AVX/(VEX-encoded) SSE +// functions in this file will need to be modified to do 2 loads and 2 stores. +// + +//----------------------------------------------------------------------------- +// Individual Functions: +//----------------------------------------------------------------------------- + +// 16-bit (2 bytes at a time) +// Len is (# of total bytes/2), so it's "# of 16-bits" + +void * memmove_16bit(void *dest, const void *src, size_t len) +{ + const uint16_t* s = (uint16_t*)src; + uint16_t* d = (uint16_t*)dest; + + const uint16_t *nexts = s + len; + uint16_t *nextd = d + len; + + if (d < s) + { + while (d != nextd) + { + *d++ = *s++; + } + } + else + { + while (nextd != d) + { + *--nextd = *--nexts; + } + } + return dest; +} + +// 32-bit (4 bytes at a time - 1 pixel in a 32-bit linear frame buffer) +// Len is (# of total bytes/4), so it's "# of 32-bits" + +void * memmove_32bit(void *dest, const void *src, size_t len) +{ + const uint32_t* s = (uint32_t*)src; + uint32_t* d = (uint32_t*)dest; + + const uint32_t *nexts = s + len; + uint32_t *nextd = d + len; + + if (d < s) + { + while (d != nextd) + { + *d++ = *s++; + } + } + else + { + while (nextd != d) + { + *--nextd = *--nexts; + } + } + return dest; +} + +// 64-bit (8 bytes at a time - 2 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/8), so it's "# of 64-bits" + +void * memmove_64bit(void *dest, const void *src, size_t len) +{ + const uint64_t* s = (uint64_t*)src; + uint64_t* d = (uint64_t*)dest; + + const uint64_t *nexts = s + len; + uint64_t *nextd = d + len; + + if (d < s) + { + while (d != nextd) + { + *d++ = *s++; + } + } + else + { + while (nextd != d) + { + *--nextd = *--nexts; + } + } + return dest; +} + +//----------------------------------------------------------------------------- +// SSE2 Unaligned: +//----------------------------------------------------------------------------- + +// SSE2 (128-bit, 16 bytes at a time - 4 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/16), so it's "# of 128-bits" + +void * memmove_128bit_u(void *dest, const void *src, size_t len) +{ + const __m128i_u* s = (__m128i_u*)src; + __m128i_u* d = (__m128i_u*)dest; + + const __m128i_u *nexts = s + len; + __m128i_u *nextd = d + len; + + if (d < s) + { + while (d != nextd) + { + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); + } + } + else + { + while (nextd != d) + { + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); + } + } + return dest; +} + +// 32 bytes at a time +void * memmove_128bit_32B_u(void *dest, const void *src, size_t len) +{ + const __m128i_u* s = (__m128i_u*)src; + __m128i_u* d = (__m128i_u*)dest; + + const __m128i_u *nexts = s + (len << 1); +__m128i_u *nextd = d + (len << 1); + + if (d < s) + { + while (d != nextd) + { + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 1 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 2 + } + } + else + { + while (nextd != d) + { + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 1 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 2 + } + } + return dest; +} + +// 64 bytes at a time +void * memmove_128bit_64B_u(void *dest, const void *src, size_t len) +{ + const __m128i_u* s = (__m128i_u*)src; + __m128i_u* d = (__m128i_u*)dest; + + const __m128i_u *nexts = s + (len << 2); + __m128i_u *nextd = d + (len << 2); + + if (d < s) + { + while (d != nextd) + { + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 1 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 2 + _mm_storeu_si128(d++, 
_mm_lddqu_si128(s++)); // 3 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 4 + } + } + else + { + while (nextd != d) + { + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 1 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 2 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 3 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 4 + } + } + return dest; +} + +// 128 bytes at a time +void * memmove_128bit_128B_u(void *dest, const void *src, size_t len) +{ + const __m128i_u* s = (__m128i_u*)src; + __m128i_u* d = (__m128i_u*)dest; + + const __m128i_u *nexts = s + (len << 3); + __m128i_u *nextd = d + (len << 3); + + if (d < s) + { + while (d != nextd) + { + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 1 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 2 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 3 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 4 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 5 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 6 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 7 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 8 + } + } + else + { + while (nextd != d) + { + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 1 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 2 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 3 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 4 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 5 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 6 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 7 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 8 + } + } + return dest; +} + +// For fun: 1 load->store for every xmm register +// 256 bytes +void * memmove_128bit_256B_u(void *dest, const void *src, size_t len) +{ + const __m128i_u* s = (__m128i_u*)src; + __m128i_u* d = (__m128i_u*)dest; + + const __m128i_u *nexts = s + (len << 4); + __m128i_u *nextd = d + (len << 4); + + if (d < s) + { + while (d != nextd) + { + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 1 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 2 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 3 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 4 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 5 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 6 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 7 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 8 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 9 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 10 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 11 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 12 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 13 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 14 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 15 + _mm_storeu_si128(d++, _mm_lddqu_si128(s++)); // 16 + } + } + else + { + while (nextd != d) + { + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 1 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 2 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 3 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 4 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 5 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 6 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 7 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 8 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 9 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 10 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 11 + 
_mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 12 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 13 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 14 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 15 + _mm_storeu_si128(--nextd, _mm_lddqu_si128(--nexts)); // 16 + } + } + return dest; +} + +//----------------------------------------------------------------------------- +// AVX+ Unaligned: +//----------------------------------------------------------------------------- + +// AVX (256-bit, 32 bytes at a time - 8 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/32), so it's "# of 256-bits" +// Sandybridge and Ryzen and up, Haswell and up for better performance + +#ifdef __AVX__ + +void * memmove_256bit_u(void *dest, const void *src, size_t len) +{ + const __m256i_u* s = (__m256i_u*)src; + __m256i_u* d = (__m256i_u*)dest; + + const __m256i_u *nexts = s + len; + __m256i_u *nextd = d + len; + + if (d < s) + { + while (d != nextd) + { + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); + } + } + else + { + while (nextd != d) + { + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); + } + } + return dest; +} + +// 64 bytes at a time +void * memmove_256bit_64B_u(void *dest, const void *src, size_t len) +{ + const __m256i_u* s = (__m256i_u*)src; + __m256i_u* d = (__m256i_u*)dest; + + const __m256i_u *nexts = s + (len << 1); + __m256i_u *nextd = d + (len << 1); + + if (d < s) + { + while (d != nextd) + { + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 1 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 2 + } + } + else + { + while (nextd != d) + { + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 1 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 2 + } + } + return dest; +} + +// 128 bytes at a time +void * memmove_256bit_128B_u(void *dest, const void *src, size_t len) +{ + const __m256i_u* s = (__m256i_u*)src; + __m256i_u* d = (__m256i_u*)dest; + + const __m256i_u *nexts = s + (len << 2); + __m256i_u *nextd = d + (len << 2); + + if (d < s) + { + while (d != nextd) + { + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 1 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 2 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 3 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 4 + } + } + else + { + while (nextd != d) + { + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 1 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 2 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 3 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 4 + } + } + return dest; +} + +// 256 bytes at a time +void * memmove_256bit_256B_u(void *dest, const void *src, size_t len) +{ + const __m256i_u* s = (__m256i_u*)src; + __m256i_u* d = (__m256i_u*)dest; + + const __m256i_u *nexts = s + (len << 3); + __m256i_u *nextd = d + (len << 3); + + if (d < s) + { + while (d != nextd) + { + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 1 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 2 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 3 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 4 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 5 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 6 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 7 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 8 + } + } + else + { + while (nextd != d) + { + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 1 + 
_mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 2 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 3 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 4 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 5 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 6 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 7 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 8 + } + } + return dest; +} + +// For fun: +// 512 bytes at a time, one load->store for every ymm register (there are 16) +void * memmove_256bit_512B_u(void *dest, const void *src, size_t len) +{ + const __m256i_u* s = (__m256i_u*)src; + __m256i_u* d = (__m256i_u*)dest; + + const __m256i_u *nexts = s + (len << 4); + __m256i_u *nextd = d + (len << 4); + + if (d < s) + { + while (d != nextd) + { + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 1 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 2 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 3 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 4 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 5 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 6 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 7 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 8 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 9 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 10 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 11 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 12 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 13 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 14 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 15 + _mm256_storeu_si256(d++, _mm256_lddqu_si256(s++)); // 16 + } + } + else + { + while (nextd != d) + { + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 1 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 2 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 3 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 4 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 5 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 6 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 7 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 8 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 9 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 10 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 11 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 12 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 13 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 14 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 15 + _mm256_storeu_si256(--nextd, _mm256_lddqu_si256(--nexts)); // 16 + } + } + return dest; +} +#endif + +// AVX-512 (512-bit, 64 bytes at a time - 16 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/64), so it's "# of 512-bits" +// Requires AVX512F + +#ifdef __AVX512F__ +void * memmove_512bit_u(void *dest, const void *src, size_t len) +{ + const __m512i_u* s = (__m512i_u*)src; + __m512i_u* d = (__m512i_u*)dest; + + const __m512i_u *nexts = s + len; + __m512i_u *nextd = d + len; + + if (d < s) + { + while (d != nextd) + { + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); + } + } + else + { + while (nextd != d) + { + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); + } + } + + return 
dest; +} + +// 128 bytes at a time +void * memmove_512bit_128B_u(void *dest, const void *src, size_t len) +{ + const __m512i_u* s = (__m512i_u*)src; + __m512i_u* d = (__m512i_u*)dest; + + const __m512i_u *nexts = s + (len << 1); + __m512i_u *nextd = d + (len << 1); + + if (d < s) + { + while (d != nextd) + { + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 1 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 2 + } + } + else + { + while (nextd != d) + { + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 1 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 2 + } + } + return dest; +} + +// 256 bytes at a time +void * memmove_512bit_256B_u(void *dest, const void *src, size_t len) +{ + const __m512i_u* s = (__m512i_u*)src; + __m512i_u* d = (__m512i_u*)dest; + + const __m512i_u *nexts = s + (len << 2); + __m512i_u *nextd = d + (len << 2); + + if (d < s) + { + while (d != nextd) + { + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 1 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 2 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 3 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 4 + } + } + else + { + while (nextd != d) + { + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 1 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 2 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 3 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 4 + } + } + return dest; +} + +// 512 bytes (half a KB!!) at a time +void * memmove_512bit_512B_u(void *dest, const void *src, size_t len) +{ + const __m512i_u* s = (__m512i_u*)src; + __m512i_u* d = (__m512i_u*)dest; + + const __m512i_u *nexts = s + (len << 3); + __m512i_u *nextd = d + (len << 3); + + if (d < s) + { + while (d != nextd) + { + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 1 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 2 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 3 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 4 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 5 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 6 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 7 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 8 + } + } + else + { + while (nextd != d) + { + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 1 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 2 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 3 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 4 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 5 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 6 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 7 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 8 + } + } + return dest; +} + +// Alright I'll admit I got a little carried away... 
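+
+// Speaking of getting carried away: a quick usage sketch (illustrative only --
+// this helper is not part of the kernel's API). Len for these unrolled
+// variants counts iterations of the block size, not bytes, so moving one 4 kB
+// page with the 512-bytes-per-iteration copier takes len = 4096/512 = 8.
+// Passing a raw byte count here would move 512x the intended region.
+static inline void * memmove_4kB_page_example(void *dest, const void *src)
+{
+  return memmove_512bit_512B_u(dest, src, 4096 / 512); // len = 8 iterations
+}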
+ +// 1024 bytes, or 1 kB +void * memmove_512bit_1kB_u(void *dest, const void *src, size_t len) +{ + const __m512i_u* s = (__m512i_u*)src; + __m512i_u* d = (__m512i_u*)dest; + + const __m512i_u *nexts = s + (len << 4); + __m512i_u *nextd = d + (len << 4); + + if (d < s) + { + while (d != nextd) + { + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 1 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 2 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 3 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 4 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 5 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 6 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 7 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 8 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 9 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 10 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 11 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 12 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 13 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 14 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 15 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 16 + } + } + else + { + while (nextd != d) + { + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 1 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 2 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 3 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 4 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 5 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 6 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 7 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 8 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 9 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 10 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 11 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 12 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 13 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 14 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 15 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 16 + } + } + return dest; +} + +// 2048 bytes, or 2 kB +void * memmove_512bit_2kB_u(void *dest, const void *src, size_t len) +{ + const __m512i_u* s = (__m512i_u*)src; + __m512i_u* d = (__m512i_u*)dest; + + const __m512i_u *nexts = s + (len << 5); + __m512i_u *nextd = d + (len << 5); + + if (d < s) + { + while (d != nextd) + { + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 1 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 2 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 3 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 4 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 5 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 6 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 7 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 8 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 9 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 10 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 11 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 12 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 13 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 14 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 15 + 
_mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 16 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 17 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 18 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 19 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 20 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 21 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 22 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 23 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 24 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 25 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 26 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 27 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 28 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 29 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 30 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 31 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 32 + } + } + else + { + while (nextd != d) + { + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 1 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 2 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 3 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 4 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 5 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 6 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 7 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 8 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 9 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 10 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 11 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 12 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 13 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 14 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 15 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 16 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 17 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 18 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 19 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 20 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 21 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 22 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 23 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 24 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 25 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 26 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 27 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 28 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 29 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 30 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 31 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 32 + } + } + return dest; +} + +// Y'know what? Here's a whole page. 
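+// (For reference, the shifts above set the len scale across this unaligned
+// AVX-512 family: _u is bytes/64, _128B_u bytes/128, _256B_u bytes/256,
+// _512B_u bytes/512, _1kB_u bytes/1024, _2kB_u bytes/2048, and the one below
+// is bytes/4096.)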
+// 4096 bytes, or 4 kB +void * memmove_512bit_4kB_u(void *dest, const void *src, size_t len) +{ + const __m512i_u* s = (__m512i_u*)src; + __m512i_u* d = (__m512i_u*)dest; + + const __m512i_u *nexts = s + (len << 6); + __m512i_u *nextd = d + (len << 6); + + if (d < s) + { + while (d != nextd) + { + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 1 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 2 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 3 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 4 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 5 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 6 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 7 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 8 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 9 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 10 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 11 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 12 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 13 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 14 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 15 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 16 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 17 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 18 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 19 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 20 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 21 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 22 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 23 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 24 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 25 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 26 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 27 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 28 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 29 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 30 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 31 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 32 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 1 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 2 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 3 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 4 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 5 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 6 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 7 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 8 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 9 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 10 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 11 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 12 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 13 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 14 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 15 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 16 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 17 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 18 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 19 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 20 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 21 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 22 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 23 + _mm512_storeu_si512(d++, 
_mm512_loadu_si512(s++)); // 24 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 25 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 26 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 27 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 28 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 29 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 30 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 31 + _mm512_storeu_si512(d++, _mm512_loadu_si512(s++)); // 32 + } + } + else + { + while (nextd != d) + { + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 1 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 2 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 3 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 4 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 5 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 6 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 7 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 8 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 9 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 10 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 11 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 12 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 13 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 14 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 15 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 16 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 17 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 18 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 19 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 20 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 21 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 22 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 23 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 24 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 25 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 26 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 27 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 28 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 29 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 30 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 31 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 32 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 1 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 2 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 3 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 4 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 5 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 6 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 7 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 8 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 9 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 10 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 11 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 12 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 13 + 
_mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 14 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 15 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 16 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 17 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 18 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 19 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 20 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 21 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 22 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 23 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 24 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 25 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 26 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 27 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 28 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 29 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 30 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 31 + _mm512_storeu_si512(--nextd, _mm512_loadu_si512(--nexts)); // 32 + } + } + return dest; +} + +#endif + +// AVX-1024 support pending existence of the standard. It would be able to fit +// an entire 4 kB page in its registers at one time. Imagine that! +// (AVX-512 maxes at 2 kB, which is why I only used numbers 1-32 above.) + +//----------------------------------------------------------------------------- +// SSE2 Aligned: +//----------------------------------------------------------------------------- + +// SSE2 (128-bit, 16 bytes at a time - 4 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/16), so it's "# of 128-bits" + +void * memmove_128bit_a(void *dest, const void *src, size_t len) +{ + const __m128i* s = (__m128i*)src; + __m128i* d = (__m128i*)dest; + + const __m128i *nexts = s + len; + __m128i *nextd = d + len; + + if (d < s) + { + while (d != nextd) + { + _mm_store_si128(d++, _mm_load_si128(s++)); + } + } + else + { + while (nextd != d) + { + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); + } + } + return dest; +} + +// 32 bytes at a time +void * memmove_128bit_32B_a(void *dest, const void *src, size_t len) +{ + const __m128i* s = (__m128i*)src; + __m128i* d = (__m128i*)dest; + + const __m128i *nexts = s + (len << 1); + __m128i *nextd = d + (len << 1); + + if (d < s) + { + while (d != nextd) + { + _mm_store_si128(d++, _mm_load_si128(s++)); // 1 + _mm_store_si128(d++, _mm_load_si128(s++)); // 2 + } + } + else + { + while (nextd != d) + { + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 1 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 2 + } + } + return dest; +} + +// 64 bytes at a time +void * memmove_128bit_64B_a(void *dest, const void *src, size_t len) +{ + const __m128i* s = (__m128i*)src; + __m128i* d = (__m128i*)dest; + + const __m128i *nexts = s + (len << 2); + __m128i *nextd = d + (len << 2); + + if (d < s) + { + while (d != nextd) + { + _mm_store_si128(d++, _mm_load_si128(s++)); // 1 + _mm_store_si128(d++, _mm_load_si128(s++)); // 2 + _mm_store_si128(d++, _mm_load_si128(s++)); // 3 + _mm_store_si128(d++, _mm_load_si128(s++)); // 4 + } + } + else + { + while (nextd != d) + { + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 1 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 2 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 3 + 
_mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 4 + } + } + return dest; +} + +// 128 bytes at a time +void * memmove_128bit_128B_a(void *dest, const void *src, size_t len) +{ + const __m128i* s = (__m128i*)src; + __m128i* d = (__m128i*)dest; + + const __m128i *nexts = s + (len << 3); + __m128i *nextd = d + (len << 3); + + if (d < s) + { + while (d != nextd) + { + _mm_store_si128(d++, _mm_load_si128(s++)); // 1 + _mm_store_si128(d++, _mm_load_si128(s++)); // 2 + _mm_store_si128(d++, _mm_load_si128(s++)); // 3 + _mm_store_si128(d++, _mm_load_si128(s++)); // 4 + _mm_store_si128(d++, _mm_load_si128(s++)); // 5 + _mm_store_si128(d++, _mm_load_si128(s++)); // 6 + _mm_store_si128(d++, _mm_load_si128(s++)); // 7 + _mm_store_si128(d++, _mm_load_si128(s++)); // 8 + } + } + else + { + while (nextd != d) + { + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 1 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 2 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 3 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 4 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 5 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 6 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 7 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 8 + } + } + return dest; +} + +// For fun: 1 load->store for every xmm register (there are 16) +// 256 bytes +void * memmove_128bit_256B_a(void *dest, const void *src, size_t len) +{ + const __m128i* s = (__m128i*)src; + __m128i* d = (__m128i*)dest; + + const __m128i *nexts = s + (len << 4); + __m128i *nextd = d + (len << 4); + + if (d < s) + { + while (d != nextd) + { + _mm_store_si128(d++, _mm_load_si128(s++)); // 1 + _mm_store_si128(d++, _mm_load_si128(s++)); // 2 + _mm_store_si128(d++, _mm_load_si128(s++)); // 3 + _mm_store_si128(d++, _mm_load_si128(s++)); // 4 + _mm_store_si128(d++, _mm_load_si128(s++)); // 5 + _mm_store_si128(d++, _mm_load_si128(s++)); // 6 + _mm_store_si128(d++, _mm_load_si128(s++)); // 7 + _mm_store_si128(d++, _mm_load_si128(s++)); // 8 + _mm_store_si128(d++, _mm_load_si128(s++)); // 9 + _mm_store_si128(d++, _mm_load_si128(s++)); // 10 + _mm_store_si128(d++, _mm_load_si128(s++)); // 11 + _mm_store_si128(d++, _mm_load_si128(s++)); // 12 + _mm_store_si128(d++, _mm_load_si128(s++)); // 13 + _mm_store_si128(d++, _mm_load_si128(s++)); // 14 + _mm_store_si128(d++, _mm_load_si128(s++)); // 15 + _mm_store_si128(d++, _mm_load_si128(s++)); // 16 + } + } + else + { + while (nextd != d) + { + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 1 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 2 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 3 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 4 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 5 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 6 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 7 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 8 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 9 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 10 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 11 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 12 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 13 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 14 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 15 + _mm_store_si128(--nextd, _mm_load_si128(--nexts)); // 16 + } + } + return dest; +} + +//----------------------------------------------------------------------------- +// AVX+ 
Aligned: +//----------------------------------------------------------------------------- + +// AVX (256-bit, 32 bytes at a time - 8 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/32), so it's "# of 256-bits" +// Sandybridge and Ryzen and up + +#ifdef __AVX__ +void * memmove_256bit_a(void *dest, const void *src, size_t len) +{ + const __m256i* s = (__m256i*)src; + __m256i* d = (__m256i*)dest; + + const __m256i *nexts = s + len; + __m256i *nextd = d + len; + + if (d < s) + { + while (d != nextd) + { + _mm256_store_si256(d++, _mm256_load_si256(s++)); + } + } + else + { + while (nextd != d) + { + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); + } + } + return dest; +} + +// 64 bytes at a time +void * memmove_256bit_64B_a(void *dest, const void *src, size_t len) +{ + const __m256i* s = (__m256i*)src; + __m256i* d = (__m256i*)dest; + + const __m256i *nexts = s + (len << 1); + __m256i *nextd = d + (len << 1); + + if (d < s) + { + while (d != nextd) + { + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 1 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 2 + } + } + else + { + while (nextd != d) + { + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 1 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 2 + } + } + return dest; +} + +// 128 bytes at a time +void * memmove_256bit_128B_a(void *dest, const void *src, size_t len) +{ + const __m256i* s = (__m256i*)src; + __m256i* d = (__m256i*)dest; + + const __m256i *nexts = s + (len << 2); + __m256i *nextd = d + (len << 2); + + if (d < s) + { + while (d != nextd) + { + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 1 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 2 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 3 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 4 + } + } + else + { + while (nextd != d) + { + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 1 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 2 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 3 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 4 + } + } + return dest; +} + +// 256 bytes at a time +void * memmove_256bit_256B_a(void *dest, const void *src, size_t len) +{ + const __m256i* s = (__m256i*)src; + __m256i* d = (__m256i*)dest; + + const __m256i *nexts = s + (len << 3); + __m256i *nextd = d + (len << 3); + + if (d < s) + { + while (d != nextd) + { + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 1 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 2 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 3 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 4 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 5 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 6 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 7 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 8 + } + } + else + { + while (nextd != d) + { + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 1 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 2 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 3 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 4 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 5 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 6 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 7 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 8 + } + } + return dest; +} + +// I just wanted to see what doing one move for every ymm register looks like. 
+// There are 16 256-bit (ymm) registers. +void * memmove_256bit_512B_a(void *dest, const void *src, size_t len) +{ + const __m256i* s = (__m256i*)src; + __m256i* d = (__m256i*)dest; + + const __m256i *nexts = s + (len << 4); + __m256i *nextd = d + (len << 4); + + if (d < s) + { + while (d != nextd) + { + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 1 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 2 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 3 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 4 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 5 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 6 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 7 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 8 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 9 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 10 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 11 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 12 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 13 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 14 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 15 + _mm256_store_si256(d++, _mm256_load_si256(s++)); // 16 + } + } + else + { + while (nextd != d) + { + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 1 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 2 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 3 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 4 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 5 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 6 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 7 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 8 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 9 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 10 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 11 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 12 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 13 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 14 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 15 + _mm256_store_si256(--nextd, _mm256_load_si256(--nexts)); // 16 + } + } + return dest; +} + +#endif + +// AVX-512 (512-bit, 64 bytes at a time - 16 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/64), so it's "# of 512-bits" +// Requires AVX512F + +#ifdef __AVX512F__ +void * memmove_512bit_a(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + const __m512i *nexts = s + len; + __m512i *nextd = d + len; + + if (d < s) + { + while (d != nextd) + { + _mm512_store_si512(d++, _mm512_load_si512(s++)); + } + } + else + { + while (nextd != d) + { + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); + } + } + return dest; +} + +// 128 bytes at a time +void * memmove_512bit_128B_a(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + const __m512i *nexts = s + (len << 1); + __m512i *nextd = d + (len << 1); + + if (d < s) + { + while (d != nextd) + { + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2 + } + } + else + { + while (nextd != d) + { + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 1 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 2 + } + } + return dest; +} + +// 256 bytes at a 
time
+void * memmove_512bit_256B_a(void *dest, const void *src, size_t len)
+{
+  const __m512i* s = (__m512i*)src;
+  __m512i* d = (__m512i*)dest;
+
+  const __m512i *nexts = s + (len << 2);
+  __m512i *nextd = d + (len << 2);
+
+  if (d < s)
+  {
+    while (d != nextd)
+    {
+      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1
+      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2
+      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 3
+      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 4
+    }
+  }
+  else
+  {
+    while (nextd != d)
+    {
+      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 1
+      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 2
+      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 3
+      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 4
+    }
+  }
+  return dest;
+}
+
+// 512 bytes (half a KB!!) at a time
+void * memmove_512bit_512B_a(void *dest, const void *src, size_t len)
+{
+  const __m512i* s = (__m512i*)src;
+  __m512i* d = (__m512i*)dest;
+
+  const __m512i *nexts = s + (len << 3);
+  __m512i *nextd = d + (len << 3);
+
+  if (d < s)
+  {
+    while (d != nextd) // Post-increment: use d, then increment it
+    {
+      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1
+      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2
+      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 3
+      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 4
+      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 5
+      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 6
+      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 7
+      _mm512_store_si512(d++, _mm512_load_si512(s++)); // 8
+    }
+  }
+  else
+  {
+    while (nextd != d) // Pre-decrement: decrement nextd, then use it
+    {
+      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 1
+      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 2
+      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 3
+      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 4
+      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 5
+      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 6
+      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 7
+      _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 8
+    }
+  }
+  return dest;
+}
+
+// The functions below I made just for fun to see what doing one move for every
+// zmm register looks like. I think the insanity speaks for itself.
:) + +// 1024 bytes, or 1 kB +void * memmove_512bit_1kB_a(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + const __m512i *nexts = s + (len << 4); + __m512i *nextd = d + (len << 4); + + if (d < s) + { + while (d != nextd) + { + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 3 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 4 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 5 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 6 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 7 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 8 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 9 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 10 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 11 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 12 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 13 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 14 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 15 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 16 + } + } + else + { + while (nextd != d) + { + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 1 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 2 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 3 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 4 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 5 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 6 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 7 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 8 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 9 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 10 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 11 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 12 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 13 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 14 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 15 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 16 + } + } + return dest; +} + +// 2048 bytes, or 2 kB +// AVX512 has 32x 512-bit registers, so...... 
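+// (32 registers x 64 bytes = 2048 bytes per iteration, which is where the
+// len << 5 below comes from.)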
+void * memmove_512bit_2kB_a(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + const __m512i *nexts = s + (len << 5); + __m512i *nextd = d + (len << 5); + + if (d < s) + { + while (d != nextd) + { + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 3 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 4 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 5 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 6 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 7 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 8 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 9 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 10 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 11 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 12 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 13 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 14 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 15 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 16 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 17 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 18 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 19 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 20 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 21 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 22 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 23 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 24 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 25 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 26 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 27 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 28 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 29 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 30 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 31 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 32 + } + } + else + { + while (nextd != d) + { + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 1 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 2 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 3 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 4 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 5 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 6 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 7 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 8 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 9 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 10 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 11 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 12 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 13 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 14 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 15 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 16 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 17 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 18 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 19 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 20 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 21 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 22 + 
_mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 23 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 24 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 25 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 26 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 27 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 28 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 29 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 30 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 31 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 32 + } + } + return dest; +} + +// Y'know what? Here's a whole page. +// 4096 bytes, or 4 kB +void * memmove_512bit_4kB_a(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + const __m512i *nexts = s + (len << 6); + __m512i *nextd = d + (len << 6); + + if (d < s) + { + while (d != nextd) + { + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 3 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 4 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 5 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 6 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 7 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 8 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 9 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 10 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 11 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 12 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 13 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 14 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 15 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 16 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 17 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 18 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 19 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 20 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 21 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 22 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 23 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 24 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 25 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 26 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 27 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 28 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 29 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 30 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 31 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 32 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 1 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 2 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 3 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 4 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 5 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 6 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 7 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 8 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 9 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 10 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 11 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 12 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 13 + 
_mm512_store_si512(d++, _mm512_load_si512(s++)); // 14 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 15 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 16 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 17 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 18 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 19 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 20 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 21 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 22 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 23 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 24 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 25 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 26 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 27 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 28 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 29 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 30 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 31 + _mm512_store_si512(d++, _mm512_load_si512(s++)); // 32 + } + } + else + { + while (nextd != d) + { + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 1 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 2 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 3 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 4 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 5 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 6 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 7 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 8 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 9 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 10 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 11 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 12 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 13 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 14 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 15 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 16 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 17 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 18 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 19 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 20 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 21 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 22 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 23 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 24 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 25 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 26 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 27 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 28 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 29 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 30 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 31 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 32 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 1 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 2 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 3 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 4 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 5 + _mm512_store_si512(--nextd, 
_mm512_load_si512(--nexts)); // 6 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 7 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 8 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 9 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 10 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 11 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 12 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 13 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 14 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 15 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 16 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 17 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 18 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 19 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 20 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 21 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 22 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 23 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 24 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 25 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 26 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 27 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 28 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 29 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 30 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 31 + _mm512_store_si512(--nextd, _mm512_load_si512(--nexts)); // 32 + } + } + return dest; +} + +#endif + +//----------------------------------------------------------------------------- +// SSE4.1 Streaming: +//----------------------------------------------------------------------------- + +// SSE4.1 (128-bit, 16 bytes at a time - 4 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/16), so it's "# of 128-bits" + +void * memmove_128bit_as(void *dest, const void *src, size_t len) +{ + __m128i* s = (__m128i*)src; + __m128i* d = (__m128i*)dest; + + __m128i *nexts = s + len; + __m128i *nextd = d + len; + + if (d < s) + { + while (d != nextd) + { + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); + } + } + else + { + while (nextd != d) + { + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); + } + } + _mm_sfence(); + + return dest; +} + +// 32 bytes at a time +void * memmove_128bit_32B_as(void *dest, const void *src, size_t len) +{ + __m128i* s = (__m128i*)src; + __m128i* d = (__m128i*)dest; + + __m128i *nexts = s + (len << 1); + __m128i *nextd = d + (len << 1); + + if (d < s) + { + while (d != nextd) + { + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 1 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 2 + } + } + else + { + while (nextd != d) + { + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 1 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 2 + } + } + _mm_sfence(); + + return dest; +} + +// 64 bytes at a time +void * memmove_128bit_64B_as(void *dest, const void *src, size_t len) +{ + __m128i* s = (__m128i*)src; + __m128i* d = (__m128i*)dest; + + __m128i *nexts = s + (len << 2); + __m128i *nextd = d + (len << 2); + + if (d < s) + { + while (d != nextd) + { + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 1 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 2 + _mm_stream_si128(d++, 
_mm_stream_load_si128(s++)); // 3 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 4 + } + } + else + { + while (nextd != d) + { + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 1 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 2 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 3 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 4 + } + } + _mm_sfence(); + + return dest; +} + +// 128 bytes at a time +void * memmove_128bit_128B_as(void *dest, const void *src, size_t len) +{ + __m128i* s = (__m128i*)src; + __m128i* d = (__m128i*)dest; + + __m128i *nexts = s + (len << 3); + __m128i *nextd = d + (len << 3); + + if (d < s) + { + while (d != nextd) + { + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 1 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 2 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 3 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 4 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 5 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 6 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 7 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 8 + } + } + else + { + while (nextd != d) + { + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 1 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 2 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 3 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 4 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 5 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 6 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 7 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 8 + } + } + _mm_sfence(); + + return dest; +} + +// For fun: 1 load->store for every xmm register (there are 16) +// 256 bytes +void * memmove_128bit_256B_as(void *dest, const void *src, size_t len) +{ + __m128i* s = (__m128i*)src; + __m128i* d = (__m128i*)dest; + + __m128i *nexts = s + (len << 4); + __m128i *nextd = d + (len << 4); + + if (d < s) + { + while (d != nextd) + { + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 1 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 2 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 3 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 4 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 5 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 6 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 7 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 8 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 9 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 10 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 11 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 12 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 13 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 14 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 15 + _mm_stream_si128(d++, _mm_stream_load_si128(s++)); // 16 + } + } + else + { + while (nextd != d) + { + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 1 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 2 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 3 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 4 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 5 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 6 + _mm_stream_si128(--nextd, 
_mm_stream_load_si128(--nexts)); // 7 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 8 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 9 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 10 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 11 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 12 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 13 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 14 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 15 + _mm_stream_si128(--nextd, _mm_stream_load_si128(--nexts)); // 16 + } + } + _mm_sfence(); + + return dest; +} + +//----------------------------------------------------------------------------- +// AVX2+ Streaming: +//----------------------------------------------------------------------------- + +// AVX2 (256-bit, 32 bytes at a time - 8 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/32), so it's "# of 256-bits" +// Haswell and Ryzen and up + +#ifdef __AVX2__ +void * memmove_256bit_as(void *dest, const void *src, size_t len) +{ + const __m256i* s = (__m256i*)src; + __m256i* d = (__m256i*)dest; + + const __m256i *nexts = s + len; + __m256i *nextd = d + len; + + if (d < s) + { + while (d != nextd) + { + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); + } + } + else + { + while (nextd != d) + { + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); + } + } + _mm_sfence(); + + return dest; +} + +// 64 bytes at a time +void * memmove_256bit_64B_as(void *dest, const void *src, size_t len) +{ + const __m256i* s = (__m256i*)src; + __m256i* d = (__m256i*)dest; + + const __m256i *nexts = s + (len << 1); + __m256i *nextd = d + (len << 1); + + if (d < s) + { + while (d != nextd) + { + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 1 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 2 + } + } + else + { + while (nextd != d) + { + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 1 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 2 + } + } + _mm_sfence(); + + return dest; +} + +// 128 bytes at a time +void * memmove_256bit_128B_as(void *dest, const void *src, size_t len) +{ + const __m256i* s = (__m256i*)src; + __m256i* d = (__m256i*)dest; + + const __m256i *nexts = s + (len << 2); + __m256i *nextd = d + (len << 2); + + if (d < s) + { + while (d != nextd) + { + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 1 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 2 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 3 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 4 + } + } + else + { + while (nextd != d) + { + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 1 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 2 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 3 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 4 + } + } + _mm_sfence(); + + return dest; +} + +// 256 bytes at a time +void * memmove_256bit_256B_as(void *dest, const void *src, size_t len) +{ + const __m256i* s = (__m256i*)src; + __m256i* d = (__m256i*)dest; + + const __m256i *nexts = s + (len << 3); + __m256i *nextd = d + (len << 3); + + if (d < s) + { + while (d != nextd) + { + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 1 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 2 + _mm256_stream_si256(d++, 
_mm256_stream_load_si256(s++)); // 3 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 4 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 5 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 6 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 7 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 8 + } + } + else + { + while (nextd != d) + { + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 1 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 2 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 3 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 4 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 5 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 6 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 7 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 8 + } + } + _mm_sfence(); + + return dest; +} + +// I just wanted to see what doing one move for every ymm register looks like. +// There are 16 256-bit (ymm) registers. +void * memmove_256bit_512B_as(void *dest, const void *src, size_t len) +{ + const __m256i* s = (__m256i*)src; + __m256i* d = (__m256i*)dest; + + const __m256i *nexts = s + (len << 4); + __m256i *nextd = d + (len << 4); + + if (d < s) + { + while (d != nextd) + { + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 1 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 2 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 3 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 4 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 5 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 6 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 7 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 8 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 9 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 10 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 11 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 12 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 13 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 14 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 15 + _mm256_stream_si256(d++, _mm256_stream_load_si256(s++)); // 16 + } + } + else + { + while (nextd != d) + { + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 1 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 2 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 3 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 4 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 5 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 6 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 7 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 8 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 9 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 10 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 11 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 12 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 13 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 14 + _mm256_stream_si256(--nextd, 
_mm256_stream_load_si256(--nexts)); // 15 + _mm256_stream_si256(--nextd, _mm256_stream_load_si256(--nexts)); // 16 + } + } + _mm_sfence(); + + return dest; +} + +#endif + +// AVX-512 (512-bit, 64 bytes at a time - 16 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/64), so it's "# of 512-bits" +// Requires AVX512F + +#ifdef __AVX512F__ +void * memmove_512bit_as(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + const __m512i *nexts = s + len; + __m512i *nextd = d + len; + + if (d < s) + { + while (d != nextd) + { + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); + } + } + else + { + while (nextd != d) + { + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); + } + } + _mm_sfence(); + + return dest; +} + +// 128 bytes at a time +void * memmove_512bit_128B_as(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + const __m512i *nexts = s + (len << 1); + __m512i *nextd = d + (len << 1); + + if (d < s) + { + while (d != nextd) + { + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2 + } + } + else + { + while (nextd != d) + { + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 1 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 2 + } + } + _mm_sfence(); + + return dest; +} + +// 256 bytes at a time +void * memmove_512bit_256B_as(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + const __m512i *nexts = s + (len << 2); + __m512i *nextd = d + (len << 2); + + if (d < s) + { + while (d != nextd) + { + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 3 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 4 + } + } + else + { + while (nextd != d) + { + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 1 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 2 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 3 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 4 + } + } + _mm_sfence(); + + return dest; +} + +// 512 bytes (half a KB!!) 
at a time
+void * memmove_512bit_512B_as(void *dest, const void *src, size_t len)
+{
+  const __m512i* s = (__m512i*)src;
+  __m512i* d = (__m512i*)dest;
+
+  const __m512i *nexts = s + (len << 3);
+  __m512i *nextd = d + (len << 3);
+
+  if (d < s)
+  {
+    while (d != nextd) // Post-increment: use d then increment
+    {
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 3
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 4
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 5
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 6
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 7
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 8
+    }
+  }
+  else
+  {
+    while (nextd != d) // Pre-decrement: decrement nextd then use
+    {
+      _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 1
+      _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 2
+      _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 3
+      _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 4
+      _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 5
+      _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 6
+      _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 7
+      _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 8
+    }
+  }
+  _mm_sfence();
+
+  return dest;
+}
+
+// The functions below I made just for fun to see what doing one move for every
+// zmm register looks like. I think the insanity speaks for itself. :)
+
+// 1024 bytes, or 1 kB
+void * memmove_512bit_1kB_as(void *dest, const void *src, size_t len)
+{
+  const __m512i* s = (__m512i*)src;
+  __m512i* d = (__m512i*)dest;
+
+  const __m512i *nexts = s + (len << 4);
+  __m512i *nextd = d + (len << 4);
+
+  if (d < s)
+  {
+    while (d != nextd)
+    {
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 3
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 4
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 5
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 6
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 7
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 8
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 9
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 10
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 11
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 12
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 13
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 14
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 15
+      _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 16
+    }
+  }
+  else
+  {
+    while (nextd != d)
+    {
+      _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 1
+      _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 2
+      _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 3
+      _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 4
+      _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 5
+      _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 6
+      _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 
7 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 8 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 9 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 10 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 11 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 12 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 13 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 14 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 15 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 16 + } + } + _mm_sfence(); + + return dest; +} + +// 2048 bytes, or 2 kB +// AVX512 has 32x 512-bit registers, so...... +void * memmove_512bit_2kB_as(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + const __m512i *nexts = s + (len << 5); + __m512i *nextd = d + (len << 5); + + if (d < s) + { + while (d != nextd) + { + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 3 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 4 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 5 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 6 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 7 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 8 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 9 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 10 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 11 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 12 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 13 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 14 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 15 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 16 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 17 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 18 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 19 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 20 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 21 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 22 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 23 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 24 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 25 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 26 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 27 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 28 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 29 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 30 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 31 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 32 + } + } + else + { + while (nextd != d) + { + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 1 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 2 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 3 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 4 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 5 + _mm512_stream_si512(--nextd, 
_mm512_stream_load_si512(--nexts)); // 6 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 7 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 8 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 9 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 10 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 11 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 12 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 13 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 14 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 15 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 16 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 17 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 18 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 19 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 20 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 21 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 22 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 23 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 24 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 25 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 26 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 27 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 28 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 29 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 30 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 31 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 32 + } + } + _mm_sfence(); + + return dest; +} + +// Y'know what? Here's a whole page. 
+// 4096 bytes, or 4 kB +void * memmove_512bit_4kB_as(void *dest, const void *src, size_t len) +{ + const __m512i* s = (__m512i*)src; + __m512i* d = (__m512i*)dest; + + const __m512i *nexts = s + (len << 6); + __m512i *nextd = d + (len << 6); + + if (d < s) + { + while (d != nextd) + { + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 3 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 4 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 5 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 6 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 7 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 8 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 9 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 10 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 11 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 12 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 13 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 14 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 15 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 16 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 17 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 18 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 19 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 20 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 21 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 22 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 23 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 24 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 25 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 26 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 27 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 28 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 29 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 30 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 31 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 32 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 1 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 2 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 3 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 4 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 5 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 6 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 7 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 8 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 9 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 10 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 11 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 12 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 13 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 14 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 15 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 16 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 17 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 18 + _mm512_stream_si512(d++, 
_mm512_stream_load_si512(s++)); // 19 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 20 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 21 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 22 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 23 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 24 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 25 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 26 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 27 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 28 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 29 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 30 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 31 + _mm512_stream_si512(d++, _mm512_stream_load_si512(s++)); // 32 + } + } + else + { + while (nextd != d) + { + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 1 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 2 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 3 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 4 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 5 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 6 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 7 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 8 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 9 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 10 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 11 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 12 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 13 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 14 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 15 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 16 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 17 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 18 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 19 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 20 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 21 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 22 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 23 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 24 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 25 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 26 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 27 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 28 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 29 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 30 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 31 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 32 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 1 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 2 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 3 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 4 + 
_mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 5 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 6 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 7 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 8 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 9 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 10 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 11 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 12 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 13 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 14 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 15 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 16 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 17 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 18 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 19 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 20 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 21 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 22 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 23 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 24 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 25 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 26 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 27 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 28 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 29 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 30 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 31 + _mm512_stream_si512(--nextd, _mm512_stream_load_si512(--nexts)); // 32 + } + } + _mm_sfence(); + + return dest; +} + +#endif + +//----------------------------------------------------------------------------- +// Dispatch Functions: +//----------------------------------------------------------------------------- + +// Move arbitrarily large amounts of data (dest addr < src addr) +void * memmove_large(void *dest, void *src, size_t numbytes) +{ + void * returnval = dest; // memmove is supposed to return the destination + size_t offset = 0; // Offset size needs to match the size of a pointer + + while(numbytes) + // The biggest sizes will go first for alignment. There's no benefit to using + // aligned loads over unaligned loads here, so all are unaligned. + // NOTE: Each memmove has its own loop so that any one can be used individually. 
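+  // Illustrative trace of one call (assuming __AVX512F__ is defined):
+  // numbytes = 5000 -> the 4 kB branch moves 4096 bytes (5000 >> 12 == 1),
+  // leaving 5000 & 4095 == 904; the next passes then move 512, 256, 128 and
+  // finally 8 bytes (904 == 512 + 256 + 128 + 8), at which point numbytes == 0.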
+ { + if(numbytes < 2) // 1 byte + { + memmove(dest, src, numbytes); + offset = numbytes & -1; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes = 0; + } + else if(numbytes < 4) // 2 bytes + { + memmove_16bit(dest, src, numbytes >> 1); + offset = numbytes & -2; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 1; + } + else if(numbytes < 8) // 4 bytes + { + memmove_32bit(dest, src, numbytes >> 2); + offset = numbytes & -4; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 3; + } + else if(numbytes < 16) // 8 bytes + { + memmove_64bit(dest, src, numbytes >> 3); + offset = numbytes & -8; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 7; + } +#ifdef __AVX512F__ + else if(numbytes < 32) // 16 bytes + { + memmove_128bit_u(dest, src, numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memmove_256bit_u(dest, src, numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memmove_512bit_u(dest, src, numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memmove_512bit_128B_u(dest, src, numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memmove_512bit_256B_u(dest, src, numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 255; + } + else if(numbytes < 1024) // 512 bytes + { + memmove_512bit_512B_u(dest, src, numbytes >> 9); + offset = numbytes & -512; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 511; + } + else if(numbytes < 2048) // 1024 bytes (1 kB) + { + memmove_512bit_1kB_u(dest, src, numbytes >> 10); + offset = numbytes & -1024; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 1023; + } + else if(numbytes < 4096) // 2048 bytes (2 kB) + { + memmove_512bit_2kB_u(dest, src, numbytes >> 11); + offset = numbytes & -2048; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 2047; + } + else // 4096 bytes (4 kB) + { + memmove_512bit_4kB_u(dest, src, numbytes >> 12); + offset = numbytes & -4096; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 4095; + } +#elif __AVX__ + else if(numbytes < 32) // 16 bytes + { + memmove_128bit_u(dest, src, numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memmove_256bit_u(dest, src, numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memmove_256bit_64B_u(dest, src, numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memmove_256bit_128B_u(dest, src, numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memmove_256bit_256B_u(dest, src, numbytes >> 8); + offset = 
numbytes & -256;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 255;
+    }
+    else // 512 bytes
+    {
+      memmove_256bit_512B_u(dest, src, numbytes >> 9);
+      offset = numbytes & -512;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 511;
+    }
+#else // SSE2 only
+    else if(numbytes < 32) // 16 bytes
+    {
+      memmove_128bit_u(dest, src, numbytes >> 4);
+      offset = numbytes & -16;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 15;
+    }
+    else if(numbytes < 64) // 32 bytes
+    {
+      memmove_128bit_32B_u(dest, src, numbytes >> 5);
+      offset = numbytes & -32;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 31;
+    }
+    else if(numbytes < 128) // 64 bytes
+    {
+      memmove_128bit_64B_u(dest, src, numbytes >> 6);
+      offset = numbytes & -64;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 63;
+    }
+    else if(numbytes < 256) // 128 bytes
+    {
+      memmove_128bit_128B_u(dest, src, numbytes >> 7);
+      offset = numbytes & -128;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 127;
+    }
+    else // 256 bytes
+    {
+      memmove_128bit_256B_u(dest, src, numbytes >> 8);
+      offset = numbytes & -256;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 255;
+    }
+#endif
+  }
+  return returnval;
+} // END MEMMOVE LARGE, UNALIGNED
+
+// Move arbitrarily large amounts of data (dest addr < src addr)
+// Aligned version
+void * memmove_large_a(void *dest, void *src, size_t numbytes)
+{
+  void * returnval = dest; // memmove is supposed to return the destination
+  size_t offset = 0; // Offset size needs to match the size of a pointer
+
+  while(numbytes)
+  // The biggest sizes will go first for alignment. This is the aligned version:
+  // all moves use the aligned (_a) loads and stores, so dest and src must be
+  // suitably aligned.
+  // NOTE: Each memmove has its own loop so that any one can be used individually.
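+  // A minimal sketch of the precondition this variant relies on (BYTE_ALIGNMENT
+  // is the mask defined in this patch; the check itself is assumed to be done by
+  // the caller, e.g. a dispatcher like AVX_memmove):
+  //   ((uintptr_t)dest & BYTE_ALIGNMENT) == 0 && ((uintptr_t)src & BYTE_ALIGNMENT) == 0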
+ { + if(numbytes < 2) // 1 byte + { + memmove(dest, src, numbytes); + offset = numbytes & -1; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes = 0; + } + else if(numbytes < 4) // 2 bytes + { + memmove_16bit(dest, src, numbytes >> 1); + offset = numbytes & -2; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 1; + } + else if(numbytes < 8) // 4 bytes + { + memmove_32bit(dest, src, numbytes >> 2); + offset = numbytes & -4; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 3; + } + else if(numbytes < 16) // 8 bytes + { + memmove_64bit(dest, src, numbytes >> 3); + offset = numbytes & -8; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 7; + } +#ifdef __AVX512F__ + else if(numbytes < 32) // 16 bytes + { + memmove_128bit_a(dest, src, numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memmove_256bit_a(dest, src, numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memmove_512bit_a(dest, src, numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memmove_512bit_128B_a(dest, src, numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memmove_512bit_256B_a(dest, src, numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 255; + } + else if(numbytes < 1024) // 512 bytes + { + memmove_512bit_512B_a(dest, src, numbytes >> 9); + offset = numbytes & -512; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 511; + } + else if(numbytes < 2048) // 1024 bytes (1 kB) + { + memmove_512bit_1kB_a(dest, src, numbytes >> 10); + offset = numbytes & -1024; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 1023; + } + else if(numbytes < 4096) // 2048 bytes (2 kB) + { + memmove_512bit_2kB_a(dest, src, numbytes >> 11); + offset = numbytes & -2048; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 2047; + } + else // 4096 bytes (4 kB) + { + memmove_512bit_4kB_a(dest, src, numbytes >> 12); + offset = numbytes & -4096; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 4095; + } +#elif __AVX__ + else if(numbytes < 32) // 16 bytes + { + memmove_128bit_a(dest, src, numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memmove_256bit_a(dest, src, numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memmove_256bit_64B_a(dest, src, numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memmove_256bit_128B_a(dest, src, numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memmove_256bit_256B_a(dest, src, numbytes >> 8); + offset = 
numbytes & -256;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 255;
+    }
+    else // 512 bytes
+    {
+      memmove_256bit_512B_a(dest, src, numbytes >> 9);
+      offset = numbytes & -512;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 511;
+    }
+#else // SSE2 only
+    else if(numbytes < 32) // 16 bytes
+    {
+      memmove_128bit_a(dest, src, numbytes >> 4);
+      offset = numbytes & -16;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 15;
+    }
+    else if(numbytes < 64) // 32 bytes
+    {
+      memmove_128bit_32B_a(dest, src, numbytes >> 5);
+      offset = numbytes & -32;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 31;
+    }
+    else if(numbytes < 128) // 64 bytes
+    {
+      memmove_128bit_64B_a(dest, src, numbytes >> 6);
+      offset = numbytes & -64;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 63;
+    }
+    else if(numbytes < 256) // 128 bytes
+    {
+      memmove_128bit_128B_a(dest, src, numbytes >> 7);
+      offset = numbytes & -128;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 127;
+    }
+    else // 256 bytes
+    {
+      memmove_128bit_256B_a(dest, src, numbytes >> 8);
+      offset = numbytes & -256;
+      dest = (char *)dest + offset;
+      src = (char *)src + offset;
+      numbytes &= 255;
+    }
+#endif
+  }
+  return returnval;
+} // END MEMMOVE LARGE, ALIGNED
+
+// Move arbitrarily large amounts of data (dest addr < src addr)
+// Aligned, streaming version
+void * memmove_large_as(void *dest, void *src, size_t numbytes)
+{
+  void * returnval = dest; // memmove is supposed to return the destination
+  size_t offset = 0; // Offset size needs to match the size of a pointer
+
+  while(numbytes)
+  // The biggest sizes will go first for alignment. This version uses the aligned
+  // streaming (_as) moves, so dest and src must be suitably aligned.
+  // NOTE: Each memmove has its own loop so that any one can be used individually.
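+  // Hypothetical usage sketch (buffer names are illustrative): streaming moves
+  // bypass the cache, which suits big one-shot copies like blitting a 64-byte-
+  // aligned back buffer to a linear framebuffer:
+  //   memmove_large_as(framebuffer, backbuffer, 1920 * 1080 * 4);
+  // Every streaming helper above ends with _mm_sfence(), so the non-temporal
+  // stores are ordered before this function returns.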
+ { + if(numbytes < 2) // 1 byte + { + memmove(dest, src, numbytes); + offset = numbytes & -1; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes = 0; + } + else if(numbytes < 4) // 2 bytes + { + memmove_16bit(dest, src, numbytes >> 1); + offset = numbytes & -2; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 1; + } + else if(numbytes < 8) // 4 bytes + { + memmove_32bit(dest, src, numbytes >> 2); + offset = numbytes & -4; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 3; + } + else if(numbytes < 16) // 8 bytes + { + memmove_64bit(dest, src, numbytes >> 3); + offset = numbytes & -8; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 7; + } +#ifdef __AVX512F__ + else if(numbytes < 32) // 16 bytes + { + memmove_128bit_as(dest, src, numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memmove_256bit_as(dest, src, numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memmove_512bit_as(dest, src, numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memmove_512bit_128B_as(dest, src, numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memmove_512bit_256B_as(dest, src, numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 255; + } + else if(numbytes < 1024) // 512 bytes + { + memmove_512bit_512B_as(dest, src, numbytes >> 9); + offset = numbytes & -512; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 511; + } + else if(numbytes < 2048) // 1024 bytes (1 kB) + { + memmove_512bit_1kB_as(dest, src, numbytes >> 10); + offset = numbytes & -1024; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 1023; + } + else if(numbytes < 4096) // 2048 bytes (2 kB) + { + memmove_512bit_2kB_as(dest, src, numbytes >> 11); + offset = numbytes & -2048; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 2047; + } + else // 4096 bytes (4 kB) + { + memmove_512bit_4kB_as(dest, src, numbytes >> 12); + offset = numbytes & -4096; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 4095; + } +#elif __AVX2__ + else if(numbytes < 32) // 16 bytes + { + memmove_128bit_as(dest, src, numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memmove_256bit_as(dest, src, numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memmove_256bit_64B_as(dest, src, numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memmove_256bit_128B_as(dest, src, numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memmove_256bit_256B_as(dest, src, numbytes >> 8); + 
offset = numbytes & -256; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 255; + } + else // 512 bytes + { + memmove_256bit_512B_as(dest, src, numbytes >> 9); + offset = numbytes & -512; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 511; + } +#else // SSE4.1 only + else if(numbytes < 32) // 16 bytes + { + memmove_128bit_as(dest, src, numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memmove_128bit_32B_as(dest, src, numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memmove_128bit_64B_as(dest, src, numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memmove_128bit_128B_as(dest, src, numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 127; + } + else // 256 bytes + { + memmove_128bit_256B_as(dest, src, numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + src = (char *)src + offset; + numbytes &= 255; + } +#endif + } + return returnval; +} // END MEMMOVE LARGE, ALIGNED, STREAMING + +// Move arbitrarily large amounts of data in reverse order (ends first) +// src addr < dest addr +void * memmove_large_reverse(void *dest, void *src, size_t numbytes) +{ + void * returnval = dest; // memmove is supposed to return the destination + size_t offset = 0; // Offset size needs to match the size of a pointer + + void * nextdest = (char *)dest + numbytes; + void * nextsrc = (char *)src + numbytes; + + while(numbytes) + // Want smallest sizes to go first, at the tail end, so that the biggest sizes + // are aligned later in this operation (AVX_memmove sets the alignment up for + // this to work). + // NOTE: Each memmove has its own loop so that any one can be used individually. 
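+  // Illustrative trace (assuming __AVX512F__): numbytes = 4104 (4096 + 8) ->
+  // the 8-byte tail moves first (offset = 4104 & 15 == 8), numbytes &= -16
+  // leaves 4096, and the final else then moves one 4 kB block (4096 >> 12 == 1).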
+ { + if(numbytes & 1) // 1 byte + { + offset = numbytes & 1; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove(nextdest, nextsrc, 1); + numbytes &= -2; + } + else if(numbytes & 2) // 2 bytes + { + offset = numbytes & 3; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_16bit(nextdest, nextsrc, 1); + numbytes &= -4; + } + else if(numbytes & 4) // 4 bytes + { + offset = numbytes & 7; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_32bit(nextdest, nextsrc, 1); + numbytes &= -8; + } + else if(numbytes & 8) // 8 bytes + { + offset = numbytes & 15; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_64bit(nextdest, nextsrc, 1); + numbytes &= -16; + } +#ifdef __AVX512F__ + else if(numbytes & 16) // 16 bytes + { + offset = numbytes & 31; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_128bit_u(nextdest, nextsrc, 1); + numbytes &= -32; + } + else if(numbytes & 32) // 32 bytes + { + offset = numbytes & 63; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_256bit_u(nextdest, nextsrc, 1); + numbytes &= -64; + } + else if(numbytes & 64) // 64 bytes + { + offset = numbytes & 127; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_512bit_u(nextdest, nextsrc, 1); + numbytes &= -128; + } + else if(numbytes & 128) // 128 bytes + { + offset = numbytes & 255; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_512bit_128B_u(nextdest, nextsrc, 1); + numbytes &= -256; + } + else if(numbytes & 256) // 256 bytes + { + offset = numbytes & 511; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_512bit_256B_u(nextdest, nextsrc, 1); + numbytes &= -512; + } + else if(numbytes & 512) // 512 bytes + { + offset = numbytes & 1023; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_512bit_512B_u(nextdest, nextsrc, 1); + numbytes &= -1024; + } + else if(numbytes & 1024) // 1024 bytes (1 kB) + { + offset = numbytes & 2047; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_512bit_1kB_u(nextdest, nextsrc, 1); + numbytes &= -2048; + } + else if(numbytes & 2048) // 2048 bytes (2 kB) + { + offset = numbytes & 4095; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_512bit_2kB_u(nextdest, nextsrc, 1); + numbytes &= -4096; + } + else // 4096 bytes (4 kB) + { + offset = numbytes; + nextdest = (char *)nextdest - offset; // These should match initial src/dest + nextsrc = (char *)nextsrc - offset; + memmove_512bit_4kB_u(nextdest, nextsrc, numbytes >> 12); + numbytes = 0; + } +#elif __AVX__ + else if(numbytes & 16) // 16 bytes + { + offset = numbytes & 31; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_128bit_u(nextdest, nextsrc, 1); + numbytes &= -32; + } + else if(numbytes & 32) // 32 bytes + { + offset = numbytes & 63; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_256bit_u(nextdest, nextsrc, 1); + numbytes &= -64; + } + else if(numbytes & 64) // 64 bytes + { + offset = numbytes & 127; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_256bit_64B_u(nextdest, nextsrc, 1); + numbytes &= -128; + } + else if(numbytes & 128) // 128 bytes + { + offset = numbytes & 255; + 
nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_256bit_128B_u(nextdest, nextsrc, 1); + numbytes &= -256; + } + else if(numbytes & 256) // 256 bytes + { + offset = numbytes & 511; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_256bit_256B_u(nextdest, nextsrc, 1); + numbytes &= -512; + } + else // 512 bytes + { + offset = numbytes; + nextdest = (char *)nextdest - offset; // These should match initial src/dest + nextsrc = (char *)nextsrc - offset; + memmove_256bit_512B_u(nextdest, nextsrc, numbytes >> 9); + numbytes = 0; + } +#else // SSE2 only + else if(numbytes & 16) // 16 bytes + { + offset = numbytes & 31; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_128bit_u(nextdest, nextsrc, 1); + numbytes &= -32; + } + else if(numbytes & 32) // 32 bytes + { + offset = numbytes & 63; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_128bit_32B_u(nextdest, nextsrc, 1); + numbytes &= -64; + } + else if(numbytes & 64) // 64 bytes + { + offset = numbytes & 127; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_128bit_64B_u(nextdest, nextsrc, 1); + numbytes &= -128; + } + else if(numbytes & 128)// 128 bytes + { + offset = numbytes & 255; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_128bit_128B_u(nextdest, nextsrc, 1); + numbytes &= -256; + } + else // 256 bytes + { + offset = numbytes; + nextdest = (char *)nextdest - offset; // These should match initial src/dest + nextsrc = (char *)nextsrc - offset; + memmove_128bit_256B_u(nextdest, nextsrc, numbytes >> 8); + numbytes = 0; + } +#endif + } + return returnval; +} // END MEMMOVE LARGE REVERSE, UNALIGNED + +// Move arbitrarily large amounts of data in reverse order (ends first) +// src addr < dest addr +// Aligned version +void * memmove_large_reverse_a(void *dest, void *src, size_t numbytes) +{ + void * returnval = dest; // memmove is supposed to return the destination + size_t offset = 0; // Offset size needs to match the size of a pointer + + void * nextdest = (char *)dest + numbytes; + void * nextsrc = (char *)src + numbytes; + + while(numbytes) + // Want smallest sizes to go first, at the tail end, so that the biggest sizes + // are aligned later in this operation (AVX_memmove sets the alignment up for + // this to work). + // NOTE: Each memmove has its own loop so that any one can be used individually. 
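+  // Worked example (assuming __AVX512F__ with 64-byte-aligned dest and src):
+  // numbytes = 200 -> one 8-byte move at offset 192, then one 64-byte move at
+  // offset 128, then one 128-byte move at offset 0; peeling the ragged tail
+  // first keeps every aligned 512-bit access on a 64-byte boundary.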
+ { + if(numbytes & 1) // 1 byte + { + offset = numbytes & 1; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove(nextdest, nextsrc, 1); + numbytes &= -2; + } + else if(numbytes & 2) // 2 bytes + { + offset = numbytes & 3; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_16bit(nextdest, nextsrc, 1); + numbytes &= -4; + } + else if(numbytes & 4) // 4 bytes + { + offset = numbytes & 7; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_32bit(nextdest, nextsrc, 1); + numbytes &= -8; + } + else if(numbytes & 8) // 8 bytes + { + offset = numbytes & 15; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_64bit(nextdest, nextsrc, 1); + numbytes &= -16; + } +#ifdef __AVX512F__ + else if(numbytes & 16) // 16 bytes + { + offset = numbytes & 31; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_128bit_a(nextdest, nextsrc, 1); + numbytes &= -32; + } + else if(numbytes & 32) // 32 bytes + { + offset = numbytes & 63; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_256bit_a(nextdest, nextsrc, 1); + numbytes &= -64; + } + else if(numbytes & 64) // 64 bytes + { + offset = numbytes & 127; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_512bit_a(nextdest, nextsrc, 1); + numbytes &= -128; + } + else if(numbytes & 128) // 128 bytes + { + offset = numbytes & 255; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_512bit_128B_a(nextdest, nextsrc, 1); + numbytes &= -256; + } + else if(numbytes & 256) // 256 bytes + { + offset = numbytes & 511; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_512bit_256B_a(nextdest, nextsrc, 1); + numbytes &= -512; + } + else if(numbytes & 512) // 512 bytes + { + offset = numbytes & 1023; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_512bit_512B_a(nextdest, nextsrc, 1); + numbytes &= -1024; + } + else if(numbytes & 1024) // 1024 bytes (1 kB) + { + offset = numbytes & 2047; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_512bit_1kB_a(nextdest, nextsrc, 1); + numbytes &= -2048; + } + else if(numbytes & 2048) // 2048 bytes (2 kB) + { + offset = numbytes & 4095; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_512bit_2kB_a(nextdest, nextsrc, 1); + numbytes &= -4096; + } + else // 4096 bytes (4 kB) + { + offset = numbytes; + nextdest = (char *)nextdest - offset; // These should match initial src/dest + nextsrc = (char *)nextsrc - offset; + memmove_512bit_4kB_a(nextdest, nextsrc, numbytes >> 12); + numbytes = 0; + } +#elif __AVX__ + else if(numbytes & 16) // 16 bytes + { + offset = numbytes & 31; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_128bit_a(nextdest, nextsrc, 1); + numbytes &= -32; + } + else if(numbytes & 32) // 32 bytes + { + offset = numbytes & 63; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_256bit_a(nextdest, nextsrc, 1); + numbytes &= -64; + } + else if(numbytes & 64) // 64 bytes + { + offset = numbytes & 127; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_256bit_64B_a(nextdest, nextsrc, 1); + numbytes &= -128; + } + else if(numbytes & 128) // 128 bytes + { + offset = numbytes & 255; + 
nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_256bit_128B_a(nextdest, nextsrc, 1); + numbytes &= -256; + } + else if(numbytes & 256) // 256 bytes + { + offset = numbytes & 511; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_256bit_256B_a(nextdest, nextsrc, 1); + numbytes &= -512; + } + else // 512 bytes + { + offset = numbytes; + nextdest = (char *)nextdest - offset; // These should match initial src/dest + nextsrc = (char *)nextsrc - offset; + memmove_256bit_512B_a(nextdest, nextsrc, numbytes >> 9); + numbytes = 0; + } +#else // SSE2 only + else if(numbytes & 16) // 16 bytes + { + offset = numbytes & 31; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_128bit_a(nextdest, nextsrc, 1); + numbytes &= -32; + } + else if(numbytes & 32) // 32 bytes + { + offset = numbytes & 63; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_128bit_32B_a(nextdest, nextsrc, 1); + numbytes &= -64; + } + else if(numbytes & 64) // 64 bytes + { + offset = numbytes & 127; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_128bit_64B_a(nextdest, nextsrc, 1); + numbytes &= -128; + } + else if(numbytes & 128)// 128 bytes + { + offset = numbytes & 255; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_128bit_128B_a(nextdest, nextsrc, 1); + numbytes &= -256; + } + else // 256 bytes + { + offset = numbytes; + nextdest = (char *)nextdest - offset; // These should match initial src/dest + nextsrc = (char *)nextsrc - offset; + memmove_128bit_256B_a(nextdest, nextsrc, numbytes >> 8); + numbytes = 0; + } +#endif + } + return returnval; +} // END MEMMOVE LARGE REVERSE, ALIGNED + +// Move arbitrarily large amounts of data in reverse order (ends first) +// src addr < dest addr +// Aligned, streaming version +void * memmove_large_reverse_as(void *dest, void *src, size_t numbytes) +{ + void * returnval = dest; // memmove is supposed to return the destination + size_t offset = 0; // Offset size needs to match the size of a pointer + + void * nextdest = (char *)dest + numbytes; + void * nextsrc = (char *)src + numbytes; + + while(numbytes) + // Want smallest sizes to go first, at the tail end, so that the biggest sizes + // are aligned later in this operation (AVX_memmove sets the alignment up for + // this to work). + // NOTE: Each memmove has its own loop so that any one can be used individually. 
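+  // This streaming variant uses the non-temporal (_as) movers, which bypass
+  // the cache. memmoveAVX below selects it only when numbytes > CACHESIZE,
+  // where filling the cache with data that won't be re-read would hurt.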
+ { + if(numbytes & 1) // 1 byte + { + offset = numbytes & 1; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove(nextdest, nextsrc, 1); + numbytes &= -2; + } + else if(numbytes & 2) // 2 bytes + { + offset = numbytes & 3; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_16bit(nextdest, nextsrc, 1); + numbytes &= -4; + } + else if(numbytes & 4) // 4 bytes + { + offset = numbytes & 7; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_32bit(nextdest, nextsrc, 1); + numbytes &= -8; + } + else if(numbytes & 8) // 8 bytes + { + offset = numbytes & 15; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_64bit(nextdest, nextsrc, 1); + numbytes &= -16; + } +#ifdef __AVX512F__ + else if(numbytes & 16) // 16 bytes + { + offset = numbytes & 31; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_128bit_as(nextdest, nextsrc, 1); + numbytes &= -32; + } + else if(numbytes & 32) // 32 bytes + { + offset = numbytes & 63; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_256bit_as(nextdest, nextsrc, 1); + numbytes &= -64; + } + else if(numbytes & 64) // 64 bytes + { + offset = numbytes & 127; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_512bit_as(nextdest, nextsrc, 1); + numbytes &= -128; + } + else if(numbytes & 128) // 128 bytes + { + offset = numbytes & 255; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_512bit_128B_as(nextdest, nextsrc, 1); + numbytes &= -256; + } + else if(numbytes & 256) // 256 bytes + { + offset = numbytes & 511; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_512bit_256B_as(nextdest, nextsrc, 1); + numbytes &= -512; + } + else if(numbytes & 512) // 512 bytes + { + offset = numbytes & 1023; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_512bit_512B_as(nextdest, nextsrc, 1); + numbytes &= -1024; + } + else if(numbytes & 1024) // 1024 bytes (1 kB) + { + offset = numbytes & 2047; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_512bit_1kB_as(nextdest, nextsrc, 1); + numbytes &= -2048; + } + else if(numbytes & 2048) // 2048 bytes (2 kB) + { + offset = numbytes & 4095; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_512bit_2kB_as(nextdest, nextsrc, 1); + numbytes &= -4096; + } + else // 4096 bytes (4 kB) + { + offset = numbytes; + nextdest = (char *)nextdest - offset; // These should match initial src/dest + nextsrc = (char *)nextsrc - offset; + memmove_512bit_4kB_as(nextdest, nextsrc, numbytes >> 12); + numbytes = 0; + } +#elif __AVX2__ + else if(numbytes & 16) // 16 bytes + { + offset = numbytes & 31; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_128bit_as(nextdest, nextsrc, 1); + numbytes &= -32; + } + else if(numbytes & 32) // 32 bytes + { + offset = numbytes & 63; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_256bit_as(nextdest, nextsrc, 1); + numbytes &= -64; + } + else if(numbytes & 64) // 64 bytes + { + offset = numbytes & 127; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_256bit_64B_as(nextdest, nextsrc, 1); + numbytes &= -128; + } + else if(numbytes & 128) // 128 bytes + { + offset = numbytes & 
255; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_256bit_128B_as(nextdest, nextsrc, 1); + numbytes &= -256; + } + else if(numbytes & 256) // 256 bytes + { + offset = numbytes & 511; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_256bit_256B_as(nextdest, nextsrc, 1); + numbytes &= -512; + } + else // 512 bytes + { + offset = numbytes; + nextdest = (char *)nextdest - offset; // These should match initial src/dest + nextsrc = (char *)nextsrc - offset; + memmove_256bit_512B_as(nextdest, nextsrc, numbytes >> 9); + numbytes = 0; + } +#else // SSE4.1 only + else if(numbytes & 16) // 16 bytes + { + offset = numbytes & 31; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_128bit_as(nextdest, nextsrc, 1); + numbytes &= -32; + } + else if(numbytes & 32) // 32 bytes + { + offset = numbytes & 63; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_128bit_32B_as(nextdest, nextsrc, 1); + numbytes &= -64; + } + else if(numbytes & 64) // 64 bytes + { + offset = numbytes & 127; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_128bit_64B_as(nextdest, nextsrc, 1); + numbytes &= -128; + } + else if(numbytes & 128)// 128 bytes + { + offset = numbytes & 255; + nextdest = (char *)nextdest - offset; + nextsrc = (char *)nextsrc - offset; + memmove_128bit_128B_as(nextdest, nextsrc, 1); + numbytes &= -256; + } + else // 256 bytes + { + offset = numbytes; + nextdest = (char *)nextdest - offset; // These should match initial src/dest + nextsrc = (char *)nextsrc - offset; + memmove_128bit_256B_as(nextdest, nextsrc, numbytes >> 8); + numbytes = 0; + } +#endif + } + return returnval; +} // END MEMMOVE LARGE REVERSE, ALIGNED, STREAMING + +//----------------------------------------------------------------------------- +// Main Function: +//----------------------------------------------------------------------------- + +// General-purpose function to call +void * memmoveAVX(void *dest, void *src, size_t numbytes) +{ + void * returnval = dest; + + if((char*)src == (char*)dest) + { + return returnval; + } + + if( + ( ((uintptr_t)src & BYTE_ALIGNMENT) == 0 ) + && + ( ((uintptr_t)dest & BYTE_ALIGNMENT) == 0 ) + ) // Check alignment + { + if((char *)dest < (char *)src) + { + // This is the fastest case: src and dest are both cache line aligned. + if(numbytes > CACHESIZE) + { + memmove_large_as(dest, src, numbytes); + } + else + { + memmove_large_a(dest, src, numbytes); // Even if numbytes is small this'll work + } + } + else // src < dest + { // Need to move ends first + if(numbytes > CACHESIZE) + { + memmove_large_reverse_as(dest, src, numbytes); + } + else + { + memmove_large_reverse_a(dest, src, numbytes); + } + } + } + else // Unaligned + { + size_t numbytes_to_align = (BYTE_ALIGNMENT + 1) - ((uintptr_t)dest & BYTE_ALIGNMENT); + + void * destoffset = (char*)dest + numbytes_to_align; + void * srcoffset = (char*)src + numbytes_to_align; + + if((char *)dest < (char *)src) + { + if(numbytes > numbytes_to_align) + { + // Get to an aligned position. + // This may be a little slower, but since it'll be mostly scalar operations + // alignment doesn't matter. Worst case it uses two vector functions, and + // this process only needs to be done once per call if dest is unaligned. + memmove_large(dest, src, numbytes_to_align); + // Now this should be faster since stores are aligned. 
+        memmove_large(destoffset, srcoffset, numbytes - numbytes_to_align); // NOTE: Can't use streaming due to potential src misalignment
+        // On Haswell and up, cross cache line loads have a negligible penalty.
+        // Thus this will be slower on Sandy & Ivy Bridge, though Ivy Bridge will
+        // fare a little better (~2x, maybe?). Ryzen should generally fall somewhere
+        // in between Sandy Bridge and Haswell/Skylake on that front.
+        // NOTE: These are just rough theoretical estimates.
+      }
+      else // Small size
+      {
+        memmove_large(dest, src, numbytes);
+      }
+    }
+    else // src < dest
+    {
+      if(numbytes > numbytes_to_align)
+      {
+        // Move bulk, up to lowest alignment line
+        memmove_large_reverse(destoffset, srcoffset, numbytes - numbytes_to_align);
+        // Move remainder
+        memmove_large_reverse(dest, src, numbytes_to_align);
+      }
+      else // Small size
+      {
+        memmove_large_reverse(dest, src, numbytes);
+      }
+    }
+  }
+
+  return returnval;
+}
diff --git a/kernel/memory/memset.c b/kernel/memory/memset.c
new file mode 100644
index 0000000..80671d9
--- /dev/null
+++ b/kernel/memory/memset.c
@@ -0,0 +1,3071 @@
+
+#include <stddef.h>
+#include <stdint.h>
+#include <immintrin.h>
+
+// This file provides a highly optimized version of memset.
+//
+// There is also an AVX_memset_4B for mass-setting of 4-byte data types; for
+// example, a framebuffer with 32 bits per pixel could use it to fill a
+// contiguous portion of the buffer with one color.
+//
+// If you just want to zero a big array, use plain AVX_memset, since it
+// implements a dedicated zeroing function (AVX_memset_4B does not).
+//
+
+#ifdef __clang__
+#define __m128i_u __m128i
+#define __m256i_u __m256i
+#define __m512i_u __m512i
+#endif
+
+#ifdef __AVX512F__
+#define BYTE_ALIGNMENT 0x3F // For 64-byte alignment
+#elif __AVX__
+#define BYTE_ALIGNMENT 0x1F // For 32-byte alignment
+#else
+#define BYTE_ALIGNMENT 0x0F // For 16-byte alignment
+#endif
+
+void* memset_zeroes(void*,size_t);
+void* memset_zeroes_a(void*,size_t);
+void* memset_zeroes_as(void*,size_t);
+
+//-----------------------------------------------------------------------------
+// Individual Functions:
+//-----------------------------------------------------------------------------
+
+// 16-bit (2 bytes at a time)
+// Len is (# of total bytes/2), so it's "# of 16-bits"
+
+void * memset_16bit(void *dest, const uint16_t val, size_t len)
+{
+  uint16_t *ptr = (uint16_t*)dest;
+
+  while (len--)
+  {
+    *ptr++ = val;
+  }
+
+  return dest;
+}
+
+// 32-bit (4 bytes at a time - 1 pixel in a 32-bit linear frame buffer)
+// Len is (# of total bytes/4), so it's "# of 32-bits"
+
+void * memset_32bit(void *dest, const uint32_t val, size_t len)
+{
+  uint32_t *ptr = (uint32_t*)dest;
+
+  while (len--)
+  {
+    *ptr++ = val;
+  }
+
+  return dest;
+}
+
+// 64-bit (8 bytes at a time - 2 pixels in a 32-bit linear frame buffer)
+// Len is (# of total bytes/8), so it's "# of 64-bits"
+
+void * memset_64bit(void *dest, const uint64_t val, size_t len)
+{
+  uint64_t *ptr = (uint64_t*)dest;
+
+  while (len--)
+  {
+    *ptr++ = val;
+  }
+
+  return dest;
+}
+
+//-----------------------------------------------------------------------------
+// SSE2 Unaligned:
+//-----------------------------------------------------------------------------
+
+// SSE2 (128-bit, 16 bytes at a time - 4 pixels in a 32-bit linear frame buffer)
+// Len is (# of total bytes/16), so it's "# of 128-bits"
+
+void * memset_128bit_u(void *dest, const __m128i_u val, size_t len)
+{
+  __m128i_u *ptr = (__m128i_u*)dest;
+
+  while (len--)
+  {
+    _mm_storeu_si128(ptr++, val);
+  }
+
+  return dest;
+}
+
+// 32 bytes
+void *
memset_128bit_32B_u(void *dest, const __m128i_u val, size_t len) +{ + __m128i_u *ptr = (__m128i_u*)dest; + + while (len--) + { + _mm_storeu_si128(ptr++, val); // 1 + _mm_storeu_si128(ptr++, val); // 2 + } + + return dest; +} + +// 64 bytes +void * memset_128bit_64B_u(void *dest, const __m128i_u val, size_t len) +{ + __m128i_u *ptr = (__m128i_u*)dest; + + while (len--) + { + _mm_storeu_si128(ptr++, val); // 1 + _mm_storeu_si128(ptr++, val); // 2 + _mm_storeu_si128(ptr++, val); // 3 + _mm_storeu_si128(ptr++, val); // 4 + } + + return dest; +} + +// 128 bytes +void * memset_128bit_128B_u(void *dest, const __m128i_u val, size_t len) +{ + __m128i_u *ptr = (__m128i_u*)dest; + + while (len--) + { + _mm_storeu_si128(ptr++, val); // 1 + _mm_storeu_si128(ptr++, val); // 2 + _mm_storeu_si128(ptr++, val); // 3 + _mm_storeu_si128(ptr++, val); // 4 + _mm_storeu_si128(ptr++, val); // 5 + _mm_storeu_si128(ptr++, val); // 6 + _mm_storeu_si128(ptr++, val); // 7 + _mm_storeu_si128(ptr++, val); // 8 + } + + return dest; +} + +// 256 bytes +void * memset_128bit_256B_u(void *dest, const __m128i_u val, size_t len) +{ + __m128i_u *ptr = (__m128i_u*)dest; + + while (len--) + { + _mm_storeu_si128(ptr++, val); // 1 + _mm_storeu_si128(ptr++, val); // 2 + _mm_storeu_si128(ptr++, val); // 3 + _mm_storeu_si128(ptr++, val); // 4 + _mm_storeu_si128(ptr++, val); // 5 + _mm_storeu_si128(ptr++, val); // 6 + _mm_storeu_si128(ptr++, val); // 7 + _mm_storeu_si128(ptr++, val); // 8 + _mm_storeu_si128(ptr++, val); // 9 + _mm_storeu_si128(ptr++, val); // 10 + _mm_storeu_si128(ptr++, val); // 11 + _mm_storeu_si128(ptr++, val); // 12 + _mm_storeu_si128(ptr++, val); // 13 + _mm_storeu_si128(ptr++, val); // 14 + _mm_storeu_si128(ptr++, val); // 15 + _mm_storeu_si128(ptr++, val); // 16 + } + + return dest; +} + +//----------------------------------------------------------------------------- +// AVX+ Unaligned: +//----------------------------------------------------------------------------- + +// AVX (256-bit, 32 bytes at a time - 8 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/32), so it's "# of 256-bits" +// Sandybridge and Ryzen and up, Haswell and up for better performance + +#ifdef __AVX__ + +void * memset_256bit_u(void *dest, const __m256i_u val, size_t len) +{ + __m256i_u *ptr = (__m256i_u*)dest; + + while (len--) + { + _mm256_storeu_si256(ptr++, val); + } + + return dest; +} + +// 64 bytes +void * memset_256bit_64B_u(void *dest, const __m256i_u val, size_t len) +{ + __m256i_u *ptr = (__m256i_u*)dest; + + while (len--) + { + _mm256_storeu_si256(ptr++, val); // 1 + _mm256_storeu_si256(ptr++, val); // 2 + } + + return dest; +} + +// 128 bytes +void * memset_256bit_128B_u(void *dest, const __m256i_u val, size_t len) +{ + __m256i_u *ptr = (__m256i_u*)dest; + + while (len--) + { + _mm256_storeu_si256(ptr++, val); // 1 + _mm256_storeu_si256(ptr++, val); // 2 + _mm256_storeu_si256(ptr++, val); // 3 + _mm256_storeu_si256(ptr++, val); // 4 + } + + return dest; +} + +// 256 bytes +void * memset_256bit_256B_u(void *dest, const __m256i_u val, size_t len) +{ + __m256i_u *ptr = (__m256i_u*)dest; + + while (len--) + { + _mm256_storeu_si256(ptr++, val); // 1 + _mm256_storeu_si256(ptr++, val); // 2 + _mm256_storeu_si256(ptr++, val); // 3 + _mm256_storeu_si256(ptr++, val); // 4 + _mm256_storeu_si256(ptr++, val); // 5 + _mm256_storeu_si256(ptr++, val); // 6 + _mm256_storeu_si256(ptr++, val); // 7 + _mm256_storeu_si256(ptr++, val); // 8 + } + + return dest; +} + +// 512 bytes +void * memset_256bit_512B_u(void *dest, const 
__m256i_u val, size_t len) +{ + __m256i_u *ptr = (__m256i_u*)dest; + + while (len--) + { + _mm256_storeu_si256(ptr++, val); // 1 + _mm256_storeu_si256(ptr++, val); // 2 + _mm256_storeu_si256(ptr++, val); // 3 + _mm256_storeu_si256(ptr++, val); // 4 + _mm256_storeu_si256(ptr++, val); // 5 + _mm256_storeu_si256(ptr++, val); // 6 + _mm256_storeu_si256(ptr++, val); // 7 + _mm256_storeu_si256(ptr++, val); // 8 + _mm256_storeu_si256(ptr++, val); // 9 + _mm256_storeu_si256(ptr++, val); // 10 + _mm256_storeu_si256(ptr++, val); // 11 + _mm256_storeu_si256(ptr++, val); // 12 + _mm256_storeu_si256(ptr++, val); // 13 + _mm256_storeu_si256(ptr++, val); // 14 + _mm256_storeu_si256(ptr++, val); // 15 + _mm256_storeu_si256(ptr++, val); // 16 + } + + return dest; +} + +#endif + +// AVX-512 (512-bit, 64 bytes at a time - 16 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/64), so it's "# of 512-bits" +// Requires AVX512F + +#ifdef __AVX512F__ + +void * memset_512bit_u(void *dest, const __m512i_u val, size_t len) +{ + __m512i_u *ptr = (__m512i_u*)dest; + + while (len--) + { + _mm512_storeu_si512(ptr++, val); + } + + return dest; +} + +// 128 bytes +void * memset_512bit_128B_u(void *dest, const __m512i_u val, size_t len) +{ + __m512i_u *ptr = (__m512i_u*)dest; + + while (len--) + { + _mm512_storeu_si512(ptr++, val); // 1 + _mm512_storeu_si512(ptr++, val); // 2 + } + + return dest; +} + +// 256 bytes +void * memset_512bit_256B_u(void *dest, const __m512i_u val, size_t len) +{ + __m512i_u *ptr = (__m512i_u*)dest; + + while (len--) + { + _mm512_storeu_si512(ptr++, val); // 1 + _mm512_storeu_si512(ptr++, val); // 2 + _mm512_storeu_si512(ptr++, val); // 3 + _mm512_storeu_si512(ptr++, val); // 4 + } + + return dest; +} + +// 512 bytes +void * memset_512bit_512B_u(void *dest, const __m512i_u val, size_t len) +{ + __m512i_u *ptr = (__m512i_u*)dest; + + while (len--) + { + _mm512_storeu_si512(ptr++, val); // 1 + _mm512_storeu_si512(ptr++, val); // 2 + _mm512_storeu_si512(ptr++, val); // 3 + _mm512_storeu_si512(ptr++, val); // 4 + _mm512_storeu_si512(ptr++, val); // 5 + _mm512_storeu_si512(ptr++, val); // 6 + _mm512_storeu_si512(ptr++, val); // 7 + _mm512_storeu_si512(ptr++, val); // 8 + } + + return dest; +} + +// 1024 bytes, or 1 kB +void * memset_512bit_1kB_u(void *dest, const __m512i_u val, size_t len) +{ + __m512i_u *ptr = (__m512i_u*)dest; + + while (len--) + { + _mm512_storeu_si512(ptr++, val); // 1 + _mm512_storeu_si512(ptr++, val); // 2 + _mm512_storeu_si512(ptr++, val); // 3 + _mm512_storeu_si512(ptr++, val); // 4 + _mm512_storeu_si512(ptr++, val); // 5 + _mm512_storeu_si512(ptr++, val); // 6 + _mm512_storeu_si512(ptr++, val); // 7 + _mm512_storeu_si512(ptr++, val); // 8 + _mm512_storeu_si512(ptr++, val); // 9 + _mm512_storeu_si512(ptr++, val); // 10 + _mm512_storeu_si512(ptr++, val); // 11 + _mm512_storeu_si512(ptr++, val); // 12 + _mm512_storeu_si512(ptr++, val); // 13 + _mm512_storeu_si512(ptr++, val); // 14 + _mm512_storeu_si512(ptr++, val); // 15 + _mm512_storeu_si512(ptr++, val); // 16 + } + + return dest; +} + +// 2048 bytes, or 2 kB +void * memset_512bit_2kB_u(void *dest, const __m512i_u val, size_t len) +{ + __m512i_u *ptr = (__m512i_u*)dest; + + while (len--) + { + _mm512_storeu_si512(ptr++, val); // 1 + _mm512_storeu_si512(ptr++, val); // 2 + _mm512_storeu_si512(ptr++, val); // 3 + _mm512_storeu_si512(ptr++, val); // 4 + _mm512_storeu_si512(ptr++, val); // 5 + _mm512_storeu_si512(ptr++, val); // 6 + _mm512_storeu_si512(ptr++, val); // 7 + _mm512_storeu_si512(ptr++, val); // 8 + 
_mm512_storeu_si512(ptr++, val); // 9 + _mm512_storeu_si512(ptr++, val); // 10 + _mm512_storeu_si512(ptr++, val); // 11 + _mm512_storeu_si512(ptr++, val); // 12 + _mm512_storeu_si512(ptr++, val); // 13 + _mm512_storeu_si512(ptr++, val); // 14 + _mm512_storeu_si512(ptr++, val); // 15 + _mm512_storeu_si512(ptr++, val); // 16 + _mm512_storeu_si512(ptr++, val); // 17 + _mm512_storeu_si512(ptr++, val); // 18 + _mm512_storeu_si512(ptr++, val); // 19 + _mm512_storeu_si512(ptr++, val); // 20 + _mm512_storeu_si512(ptr++, val); // 21 + _mm512_storeu_si512(ptr++, val); // 22 + _mm512_storeu_si512(ptr++, val); // 23 + _mm512_storeu_si512(ptr++, val); // 24 + _mm512_storeu_si512(ptr++, val); // 25 + _mm512_storeu_si512(ptr++, val); // 26 + _mm512_storeu_si512(ptr++, val); // 27 + _mm512_storeu_si512(ptr++, val); // 28 + _mm512_storeu_si512(ptr++, val); // 29 + _mm512_storeu_si512(ptr++, val); // 30 + _mm512_storeu_si512(ptr++, val); // 31 + _mm512_storeu_si512(ptr++, val); // 32 + } + + return dest; +} + +// 4096 bytes, or 4 kB, also 1 page +void * memset_512bit_4kB_u(void *dest, const __m512i_u val, size_t len) +{ + __m512i_u *ptr = (__m512i_u*)dest; + + while (len--) + { + _mm512_storeu_si512(ptr++, val); // 1 + _mm512_storeu_si512(ptr++, val); // 2 + _mm512_storeu_si512(ptr++, val); // 3 + _mm512_storeu_si512(ptr++, val); // 4 + _mm512_storeu_si512(ptr++, val); // 5 + _mm512_storeu_si512(ptr++, val); // 6 + _mm512_storeu_si512(ptr++, val); // 7 + _mm512_storeu_si512(ptr++, val); // 8 + _mm512_storeu_si512(ptr++, val); // 9 + _mm512_storeu_si512(ptr++, val); // 10 + _mm512_storeu_si512(ptr++, val); // 11 + _mm512_storeu_si512(ptr++, val); // 12 + _mm512_storeu_si512(ptr++, val); // 13 + _mm512_storeu_si512(ptr++, val); // 14 + _mm512_storeu_si512(ptr++, val); // 15 + _mm512_storeu_si512(ptr++, val); // 16 + _mm512_storeu_si512(ptr++, val); // 17 + _mm512_storeu_si512(ptr++, val); // 18 + _mm512_storeu_si512(ptr++, val); // 19 + _mm512_storeu_si512(ptr++, val); // 20 + _mm512_storeu_si512(ptr++, val); // 21 + _mm512_storeu_si512(ptr++, val); // 22 + _mm512_storeu_si512(ptr++, val); // 23 + _mm512_storeu_si512(ptr++, val); // 24 + _mm512_storeu_si512(ptr++, val); // 25 + _mm512_storeu_si512(ptr++, val); // 26 + _mm512_storeu_si512(ptr++, val); // 27 + _mm512_storeu_si512(ptr++, val); // 28 + _mm512_storeu_si512(ptr++, val); // 29 + _mm512_storeu_si512(ptr++, val); // 30 + _mm512_storeu_si512(ptr++, val); // 31 + _mm512_storeu_si512(ptr++, val); // 32 + _mm512_storeu_si512(ptr++, val); // 1 + _mm512_storeu_si512(ptr++, val); // 2 + _mm512_storeu_si512(ptr++, val); // 3 + _mm512_storeu_si512(ptr++, val); // 4 + _mm512_storeu_si512(ptr++, val); // 5 + _mm512_storeu_si512(ptr++, val); // 6 + _mm512_storeu_si512(ptr++, val); // 7 + _mm512_storeu_si512(ptr++, val); // 8 + _mm512_storeu_si512(ptr++, val); // 9 + _mm512_storeu_si512(ptr++, val); // 10 + _mm512_storeu_si512(ptr++, val); // 11 + _mm512_storeu_si512(ptr++, val); // 12 + _mm512_storeu_si512(ptr++, val); // 13 + _mm512_storeu_si512(ptr++, val); // 14 + _mm512_storeu_si512(ptr++, val); // 15 + _mm512_storeu_si512(ptr++, val); // 16 + _mm512_storeu_si512(ptr++, val); // 17 + _mm512_storeu_si512(ptr++, val); // 18 + _mm512_storeu_si512(ptr++, val); // 19 + _mm512_storeu_si512(ptr++, val); // 20 + _mm512_storeu_si512(ptr++, val); // 21 + _mm512_storeu_si512(ptr++, val); // 22 + _mm512_storeu_si512(ptr++, val); // 23 + _mm512_storeu_si512(ptr++, val); // 24 + _mm512_storeu_si512(ptr++, val); // 25 + _mm512_storeu_si512(ptr++, val); // 26 + 
_mm512_storeu_si512(ptr++, val); // 27 + _mm512_storeu_si512(ptr++, val); // 28 + _mm512_storeu_si512(ptr++, val); // 29 + _mm512_storeu_si512(ptr++, val); // 30 + _mm512_storeu_si512(ptr++, val); // 31 + _mm512_storeu_si512(ptr++, val); // 32 + } + + return dest; +} + +#endif + +//----------------------------------------------------------------------------- +// SSE2 Aligned: +//----------------------------------------------------------------------------- + +// SSE2 (128-bit, 16 bytes at a time - 4 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/16), so it's "# of 128-bits" + +void * memset_128bit_a(void *dest, const __m128i val, size_t len) +{ + __m128i *ptr = (__m128i*)dest; + + while (len--) + { + _mm_store_si128(ptr++, val); + } + + return dest; +} + +// 32 bytes +void * memset_128bit_32B_a(void *dest, const __m128i val, size_t len) +{ + __m128i *ptr = (__m128i*)dest; + + while (len--) + { + _mm_store_si128(ptr++, val); // 1 + _mm_store_si128(ptr++, val); // 2 + } + + return dest; +} + +// 64 bytes +void * memset_128bit_64B_a(void *dest, const __m128i val, size_t len) +{ + __m128i *ptr = (__m128i*)dest; + + while (len--) + { + _mm_store_si128(ptr++, val); // 1 + _mm_store_si128(ptr++, val); // 2 + _mm_store_si128(ptr++, val); // 3 + _mm_store_si128(ptr++, val); // 4 + } + + return dest; +} + +// 128 bytes +void * memset_128bit_128B_a(void *dest, const __m128i val, size_t len) +{ + __m128i *ptr = (__m128i*)dest; + + while (len--) + { + _mm_store_si128(ptr++, val); // 1 + _mm_store_si128(ptr++, val); // 2 + _mm_store_si128(ptr++, val); // 3 + _mm_store_si128(ptr++, val); // 4 + _mm_store_si128(ptr++, val); // 5 + _mm_store_si128(ptr++, val); // 6 + _mm_store_si128(ptr++, val); // 7 + _mm_store_si128(ptr++, val); // 8 + } + + return dest; +} + +// 256 bytes +void * memset_128bit_256B_a(void *dest, const __m128i val, size_t len) +{ + __m128i *ptr = (__m128i*)dest; + + while (len--) + { + _mm_store_si128(ptr++, val); // 1 + _mm_store_si128(ptr++, val); // 2 + _mm_store_si128(ptr++, val); // 3 + _mm_store_si128(ptr++, val); // 4 + _mm_store_si128(ptr++, val); // 5 + _mm_store_si128(ptr++, val); // 6 + _mm_store_si128(ptr++, val); // 7 + _mm_store_si128(ptr++, val); // 8 + _mm_store_si128(ptr++, val); // 9 + _mm_store_si128(ptr++, val); // 10 + _mm_store_si128(ptr++, val); // 11 + _mm_store_si128(ptr++, val); // 12 + _mm_store_si128(ptr++, val); // 13 + _mm_store_si128(ptr++, val); // 14 + _mm_store_si128(ptr++, val); // 15 + _mm_store_si128(ptr++, val); // 16 + } + + return dest; +} + +//----------------------------------------------------------------------------- +// AVX+ Aligned: +//----------------------------------------------------------------------------- + +// AVX (256-bit, 32 bytes at a time - 8 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/32), so it's "# of 256-bits" +// Sandybridge and Ryzen and up + +#ifdef __AVX__ + +void * memset_256bit_a(void *dest, const __m256i val, size_t len) +{ + __m256i *ptr = (__m256i*)dest; + + while (len--) + { + _mm256_store_si256(ptr++, val); + } + + return dest; +} + +// 64 bytes +void * memset_256bit_64B_a(void *dest, const __m256i val, size_t len) +{ + __m256i *ptr = (__m256i*)dest; + + while (len--) + { + _mm256_store_si256(ptr++, val); // 1 + _mm256_store_si256(ptr++, val); // 2 + } + + return dest; +} + +// 128 bytes +void * memset_256bit_128B_a(void *dest, const __m256i val, size_t len) +{ + __m256i *ptr = (__m256i*)dest; + + while (len--) + { + _mm256_store_si256(ptr++, val); // 1 + 
_mm256_store_si256(ptr++, val); // 2 + _mm256_store_si256(ptr++, val); // 3 + _mm256_store_si256(ptr++, val); // 4 + } + + return dest; +} + +// 256 bytes +void * memset_256bit_256B_a(void *dest, const __m256i val, size_t len) +{ + __m256i *ptr = (__m256i*)dest; + + while (len--) + { + _mm256_store_si256(ptr++, val); // 1 + _mm256_store_si256(ptr++, val); // 2 + _mm256_store_si256(ptr++, val); // 3 + _mm256_store_si256(ptr++, val); // 4 + _mm256_store_si256(ptr++, val); // 5 + _mm256_store_si256(ptr++, val); // 6 + _mm256_store_si256(ptr++, val); // 7 + _mm256_store_si256(ptr++, val); // 8 + } + + return dest; +} + +// 512 bytes +void * memset_256bit_512B_a(void *dest, const __m256i val, size_t len) +{ + __m256i *ptr = (__m256i*)dest; + + while (len--) + { + _mm256_store_si256(ptr++, val); // 1 + _mm256_store_si256(ptr++, val); // 2 + _mm256_store_si256(ptr++, val); // 3 + _mm256_store_si256(ptr++, val); // 4 + _mm256_store_si256(ptr++, val); // 5 + _mm256_store_si256(ptr++, val); // 6 + _mm256_store_si256(ptr++, val); // 7 + _mm256_store_si256(ptr++, val); // 8 + _mm256_store_si256(ptr++, val); // 9 + _mm256_store_si256(ptr++, val); // 10 + _mm256_store_si256(ptr++, val); // 11 + _mm256_store_si256(ptr++, val); // 12 + _mm256_store_si256(ptr++, val); // 13 + _mm256_store_si256(ptr++, val); // 14 + _mm256_store_si256(ptr++, val); // 15 + _mm256_store_si256(ptr++, val); // 16 + } + + return dest; +} + +#endif + +// AVX-512 (512-bit, 64 bytes at a time - 16 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/64), so it's "# of 512-bits" +// Requires AVX512F + +#ifdef __AVX512F__ + +void * memset_512bit_a(void *dest, const __m512i val, size_t len) +{ + __m512i *ptr = (__m512i*)dest; + + while (len--) + { + _mm512_store_si512(ptr++, val); + } + + return dest; +} + +// 128 bytes +void * memset_512bit_128B_a(void *dest, const __m512i val, size_t len) +{ + __m512i *ptr = (__m512i*)dest; + + while (len--) + { + _mm512_store_si512(ptr++, val); // 1 + _mm512_store_si512(ptr++, val); // 2 + } + + return dest; +} + +// 256 bytes +void * memset_512bit_256B_a(void *dest, const __m512i val, size_t len) +{ + __m512i *ptr = (__m512i*)dest; + + while (len--) + { + _mm512_store_si512(ptr++, val); // 1 + _mm512_store_si512(ptr++, val); // 2 + _mm512_store_si512(ptr++, val); // 3 + _mm512_store_si512(ptr++, val); // 4 + } + + return dest; +} + +// 512 bytes +void * memset_512bit_512B_a(void *dest, const __m512i val, size_t len) +{ + __m512i *ptr = (__m512i*)dest; + + while (len--) + { + _mm512_store_si512(ptr++, val); // 1 + _mm512_store_si512(ptr++, val); // 2 + _mm512_store_si512(ptr++, val); // 3 + _mm512_store_si512(ptr++, val); // 4 + _mm512_store_si512(ptr++, val); // 5 + _mm512_store_si512(ptr++, val); // 6 + _mm512_store_si512(ptr++, val); // 7 + _mm512_store_si512(ptr++, val); // 8 + } + + return dest; +} + +// 1024 bytes, or 1 kB +void * memset_512bit_1kB_a(void *dest, const __m512i val, size_t len) +{ + __m512i *ptr = (__m512i*)dest; + + while (len--) + { + _mm512_store_si512(ptr++, val); // 1 + _mm512_store_si512(ptr++, val); // 2 + _mm512_store_si512(ptr++, val); // 3 + _mm512_store_si512(ptr++, val); // 4 + _mm512_store_si512(ptr++, val); // 5 + _mm512_store_si512(ptr++, val); // 6 + _mm512_store_si512(ptr++, val); // 7 + _mm512_store_si512(ptr++, val); // 8 + _mm512_store_si512(ptr++, val); // 9 + _mm512_store_si512(ptr++, val); // 10 + _mm512_store_si512(ptr++, val); // 11 + _mm512_store_si512(ptr++, val); // 12 + _mm512_store_si512(ptr++, val); // 13 + 
_mm512_store_si512(ptr++, val); // 14 + _mm512_store_si512(ptr++, val); // 15 + _mm512_store_si512(ptr++, val); // 16 + } + + return dest; +} + +// 2048 bytes, or 2 kB +void * memset_512bit_2kB_a(void *dest, const __m512i val, size_t len) +{ + __m512i *ptr = (__m512i*)dest; + + while (len--) + { + _mm512_store_si512(ptr++, val); // 1 + _mm512_store_si512(ptr++, val); // 2 + _mm512_store_si512(ptr++, val); // 3 + _mm512_store_si512(ptr++, val); // 4 + _mm512_store_si512(ptr++, val); // 5 + _mm512_store_si512(ptr++, val); // 6 + _mm512_store_si512(ptr++, val); // 7 + _mm512_store_si512(ptr++, val); // 8 + _mm512_store_si512(ptr++, val); // 9 + _mm512_store_si512(ptr++, val); // 10 + _mm512_store_si512(ptr++, val); // 11 + _mm512_store_si512(ptr++, val); // 12 + _mm512_store_si512(ptr++, val); // 13 + _mm512_store_si512(ptr++, val); // 14 + _mm512_store_si512(ptr++, val); // 15 + _mm512_store_si512(ptr++, val); // 16 + _mm512_store_si512(ptr++, val); // 17 + _mm512_store_si512(ptr++, val); // 18 + _mm512_store_si512(ptr++, val); // 19 + _mm512_store_si512(ptr++, val); // 20 + _mm512_store_si512(ptr++, val); // 21 + _mm512_store_si512(ptr++, val); // 22 + _mm512_store_si512(ptr++, val); // 23 + _mm512_store_si512(ptr++, val); // 24 + _mm512_store_si512(ptr++, val); // 25 + _mm512_store_si512(ptr++, val); // 26 + _mm512_store_si512(ptr++, val); // 27 + _mm512_store_si512(ptr++, val); // 28 + _mm512_store_si512(ptr++, val); // 29 + _mm512_store_si512(ptr++, val); // 30 + _mm512_store_si512(ptr++, val); // 31 + _mm512_store_si512(ptr++, val); // 32 + } + + return dest; +} + +// 4096 bytes, or 4 kB, also 1 page +void * memset_512bit_4kB_a(void *dest, const __m512i val, size_t len) +{ + __m512i *ptr = (__m512i*)dest; + + while (len--) + { + _mm512_store_si512(ptr++, val); // 1 + _mm512_store_si512(ptr++, val); // 2 + _mm512_store_si512(ptr++, val); // 3 + _mm512_store_si512(ptr++, val); // 4 + _mm512_store_si512(ptr++, val); // 5 + _mm512_store_si512(ptr++, val); // 6 + _mm512_store_si512(ptr++, val); // 7 + _mm512_store_si512(ptr++, val); // 8 + _mm512_store_si512(ptr++, val); // 9 + _mm512_store_si512(ptr++, val); // 10 + _mm512_store_si512(ptr++, val); // 11 + _mm512_store_si512(ptr++, val); // 12 + _mm512_store_si512(ptr++, val); // 13 + _mm512_store_si512(ptr++, val); // 14 + _mm512_store_si512(ptr++, val); // 15 + _mm512_store_si512(ptr++, val); // 16 + _mm512_store_si512(ptr++, val); // 17 + _mm512_store_si512(ptr++, val); // 18 + _mm512_store_si512(ptr++, val); // 19 + _mm512_store_si512(ptr++, val); // 20 + _mm512_store_si512(ptr++, val); // 21 + _mm512_store_si512(ptr++, val); // 22 + _mm512_store_si512(ptr++, val); // 23 + _mm512_store_si512(ptr++, val); // 24 + _mm512_store_si512(ptr++, val); // 25 + _mm512_store_si512(ptr++, val); // 26 + _mm512_store_si512(ptr++, val); // 27 + _mm512_store_si512(ptr++, val); // 28 + _mm512_store_si512(ptr++, val); // 29 + _mm512_store_si512(ptr++, val); // 30 + _mm512_store_si512(ptr++, val); // 31 + _mm512_store_si512(ptr++, val); // 32 + _mm512_store_si512(ptr++, val); // 1 + _mm512_store_si512(ptr++, val); // 2 + _mm512_store_si512(ptr++, val); // 3 + _mm512_store_si512(ptr++, val); // 4 + _mm512_store_si512(ptr++, val); // 5 + _mm512_store_si512(ptr++, val); // 6 + _mm512_store_si512(ptr++, val); // 7 + _mm512_store_si512(ptr++, val); // 8 + _mm512_store_si512(ptr++, val); // 9 + _mm512_store_si512(ptr++, val); // 10 + _mm512_store_si512(ptr++, val); // 11 + _mm512_store_si512(ptr++, val); // 12 + _mm512_store_si512(ptr++, val); // 13 + 
_mm512_store_si512(ptr++, val); // 14 + _mm512_store_si512(ptr++, val); // 15 + _mm512_store_si512(ptr++, val); // 16 + _mm512_store_si512(ptr++, val); // 17 + _mm512_store_si512(ptr++, val); // 18 + _mm512_store_si512(ptr++, val); // 19 + _mm512_store_si512(ptr++, val); // 20 + _mm512_store_si512(ptr++, val); // 21 + _mm512_store_si512(ptr++, val); // 22 + _mm512_store_si512(ptr++, val); // 23 + _mm512_store_si512(ptr++, val); // 24 + _mm512_store_si512(ptr++, val); // 25 + _mm512_store_si512(ptr++, val); // 26 + _mm512_store_si512(ptr++, val); // 27 + _mm512_store_si512(ptr++, val); // 28 + _mm512_store_si512(ptr++, val); // 29 + _mm512_store_si512(ptr++, val); // 30 + _mm512_store_si512(ptr++, val); // 31 + _mm512_store_si512(ptr++, val); // 32 + } + + return dest; +} + +#endif + + +//----------------------------------------------------------------------------- +// SSE2 Streaming: +//----------------------------------------------------------------------------- + +// If non-temporal stores are needed, then it's a big transfer + +void * memset_128bit_as(void *dest, const __m128i val, size_t len) +{ + __m128i *ptr = (__m128i*)dest; + + while (len--) + { + _mm_stream_si128(ptr++, val); + } + _mm_sfence(); + + return dest; +} + +// 32 bytes +void * memset_128bit_32B_as(void *dest, const __m128i val, size_t len) +{ + __m128i *ptr = (__m128i*)dest; + + while (len--) + { + _mm_stream_si128(ptr++, val); // 1 + _mm_stream_si128(ptr++, val); // 2 + } + _mm_sfence(); + + return dest; +} + +// 64 bytes +void * memset_128bit_64B_as(void *dest, const __m128i val, size_t len) +{ + __m128i *ptr = (__m128i*)dest; + + while (len--) + { + _mm_stream_si128(ptr++, val); // 1 + _mm_stream_si128(ptr++, val); // 2 + _mm_stream_si128(ptr++, val); // 3 + _mm_stream_si128(ptr++, val); // 4 + } + _mm_sfence(); + + return dest; +} + +// 128 bytes +void * memset_128bit_128B_as(void *dest, const __m128i val, size_t len) +{ + __m128i *ptr = (__m128i*)dest; + + while (len--) + { + _mm_stream_si128(ptr++, val); // 1 + _mm_stream_si128(ptr++, val); // 2 + _mm_stream_si128(ptr++, val); // 3 + _mm_stream_si128(ptr++, val); // 4 + _mm_stream_si128(ptr++, val); // 5 + _mm_stream_si128(ptr++, val); // 6 + _mm_stream_si128(ptr++, val); // 7 + _mm_stream_si128(ptr++, val); // 8 + } + _mm_sfence(); + + return dest; +} + +// 256 bytes +void * memset_128bit_256B_as(void *dest, const __m128i val, size_t len) +{ + __m128i *ptr = (__m128i*)dest; + + while (len--) + { + _mm_stream_si128(ptr++, val); // 1 + _mm_stream_si128(ptr++, val); // 2 + _mm_stream_si128(ptr++, val); // 3 + _mm_stream_si128(ptr++, val); // 4 + _mm_stream_si128(ptr++, val); // 5 + _mm_stream_si128(ptr++, val); // 6 + _mm_stream_si128(ptr++, val); // 7 + _mm_stream_si128(ptr++, val); // 8 + _mm_stream_si128(ptr++, val); // 9 + _mm_stream_si128(ptr++, val); // 10 + _mm_stream_si128(ptr++, val); // 11 + _mm_stream_si128(ptr++, val); // 12 + _mm_stream_si128(ptr++, val); // 13 + _mm_stream_si128(ptr++, val); // 14 + _mm_stream_si128(ptr++, val); // 15 + _mm_stream_si128(ptr++, val); // 16 + } + _mm_sfence(); + + return dest; +} + +//----------------------------------------------------------------------------- +// AVX+ Streaming: +//----------------------------------------------------------------------------- + +// AVX (256-bit, 32 bytes at a time - 8 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/32), so it's "# of 256-bits" +// Sandybridge and Ryzen and up + +#ifdef __AVX__ + +void * memset_256bit_as(void *dest, const __m256i val, size_t len) +{ 
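+  // Non-temporal stores write around the cache; the _mm_sfence() after the
+  // loop makes the streamed data globally visible before the function returns.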
+ __m256i *ptr = (__m256i*)dest; + + while (len--) + { + _mm256_stream_si256(ptr++, val); + } + _mm_sfence(); + + return dest; +} + +// 64 bytes +void * memset_256bit_64B_as(void *dest, const __m256i val, size_t len) +{ + __m256i *ptr = (__m256i*)dest; + + while (len--) + { + _mm256_stream_si256(ptr++, val); // 1 + _mm256_stream_si256(ptr++, val); // 2 + } + _mm_sfence(); + + return dest; +} + +// 128 bytes +void * memset_256bit_128B_as(void *dest, const __m256i val, size_t len) +{ + __m256i *ptr = (__m256i*)dest; + + while (len--) + { + _mm256_stream_si256(ptr++, val); // 1 + _mm256_stream_si256(ptr++, val); // 2 + _mm256_stream_si256(ptr++, val); // 3 + _mm256_stream_si256(ptr++, val); // 4 + } + _mm_sfence(); + + return dest; +} + +// 256 bytes +void * memset_256bit_256B_as(void *dest, const __m256i val, size_t len) +{ + __m256i *ptr = (__m256i*)dest; + + while (len--) + { + _mm256_stream_si256(ptr++, val); // 1 + _mm256_stream_si256(ptr++, val); // 2 + _mm256_stream_si256(ptr++, val); // 3 + _mm256_stream_si256(ptr++, val); // 4 + _mm256_stream_si256(ptr++, val); // 5 + _mm256_stream_si256(ptr++, val); // 6 + _mm256_stream_si256(ptr++, val); // 7 + _mm256_stream_si256(ptr++, val); // 8 + } + _mm_sfence(); + + return dest; +} + +// 512 bytes +void * memset_256bit_512B_as(void *dest, const __m256i val, size_t len) +{ + __m256i *ptr = (__m256i*)dest; + + while (len--) + { + _mm256_stream_si256(ptr++, val); // 1 + _mm256_stream_si256(ptr++, val); // 2 + _mm256_stream_si256(ptr++, val); // 3 + _mm256_stream_si256(ptr++, val); // 4 + _mm256_stream_si256(ptr++, val); // 5 + _mm256_stream_si256(ptr++, val); // 6 + _mm256_stream_si256(ptr++, val); // 7 + _mm256_stream_si256(ptr++, val); // 8 + _mm256_stream_si256(ptr++, val); // 9 + _mm256_stream_si256(ptr++, val); // 10 + _mm256_stream_si256(ptr++, val); // 11 + _mm256_stream_si256(ptr++, val); // 12 + _mm256_stream_si256(ptr++, val); // 13 + _mm256_stream_si256(ptr++, val); // 14 + _mm256_stream_si256(ptr++, val); // 15 + _mm256_stream_si256(ptr++, val); // 16 + } + _mm_sfence(); + + return dest; +} + +#endif + +// AVX-512 (512-bit, 64 bytes at a time - 16 pixels in a 32-bit linear frame buffer) +// Len is (# of total bytes/64), so it's "# of 512-bits" +// Requires AVX512F + +#ifdef __AVX512F__ + +void * memset_512bit_as(void *dest, const __m512i val, size_t len) +{ + __m512i *ptr = (__m512i*)dest; + + while (len--) + { + _mm512_stream_si512(ptr++, val); + } + _mm_sfence(); + + return dest; +} + +// 128 bytes +void * memset_512bit_128B_as(void *dest, const __m512i val, size_t len) +{ + __m512i *ptr = (__m512i*)dest; + + while (len--) + { + _mm512_stream_si512(ptr++, val); // 1 + _mm512_stream_si512(ptr++, val); // 2 + } + _mm_sfence(); + + return dest; +} + +// 256 bytes +void * memset_512bit_256B_as(void *dest, const __m512i val, size_t len) +{ + __m512i *ptr = (__m512i*)dest; + + while (len--) + { + _mm512_stream_si512(ptr++, val); // 1 + _mm512_stream_si512(ptr++, val); // 2 + _mm512_stream_si512(ptr++, val); // 3 + _mm512_stream_si512(ptr++, val); // 4 + } + _mm_sfence(); + + return dest; +} + +// 512 bytes +void * memset_512bit_512B_as(void *dest, const __m512i val, size_t len) +{ + __m512i *ptr = (__m512i*)dest; + + while (len--) + { + _mm512_stream_si512(ptr++, val); // 1 + _mm512_stream_si512(ptr++, val); // 2 + _mm512_stream_si512(ptr++, val); // 3 + _mm512_stream_si512(ptr++, val); // 4 + _mm512_stream_si512(ptr++, val); // 5 + _mm512_stream_si512(ptr++, val); // 6 + _mm512_stream_si512(ptr++, val); // 7 + _mm512_stream_si512(ptr++, 
val); // 8 + } + _mm_sfence(); + + return dest; +} + +// 1024 bytes, or 1 kB +void * memset_512bit_1kB_as(void *dest, const __m512i val, size_t len) +{ + __m512i *ptr = (__m512i*)dest; + + while (len--) + { + _mm512_stream_si512(ptr++, val); // 1 + _mm512_stream_si512(ptr++, val); // 2 + _mm512_stream_si512(ptr++, val); // 3 + _mm512_stream_si512(ptr++, val); // 4 + _mm512_stream_si512(ptr++, val); // 5 + _mm512_stream_si512(ptr++, val); // 6 + _mm512_stream_si512(ptr++, val); // 7 + _mm512_stream_si512(ptr++, val); // 8 + _mm512_stream_si512(ptr++, val); // 9 + _mm512_stream_si512(ptr++, val); // 10 + _mm512_stream_si512(ptr++, val); // 11 + _mm512_stream_si512(ptr++, val); // 12 + _mm512_stream_si512(ptr++, val); // 13 + _mm512_stream_si512(ptr++, val); // 14 + _mm512_stream_si512(ptr++, val); // 15 + _mm512_stream_si512(ptr++, val); // 16 + } + _mm_sfence(); + + return dest; +} + +// 2048 bytes, or 2 kB +void * memset_512bit_2kB_as(void *dest, const __m512i val, size_t len) +{ + __m512i *ptr = (__m512i*)dest; + + while (len--) + { + _mm512_stream_si512(ptr++, val); // 1 + _mm512_stream_si512(ptr++, val); // 2 + _mm512_stream_si512(ptr++, val); // 3 + _mm512_stream_si512(ptr++, val); // 4 + _mm512_stream_si512(ptr++, val); // 5 + _mm512_stream_si512(ptr++, val); // 6 + _mm512_stream_si512(ptr++, val); // 7 + _mm512_stream_si512(ptr++, val); // 8 + _mm512_stream_si512(ptr++, val); // 9 + _mm512_stream_si512(ptr++, val); // 10 + _mm512_stream_si512(ptr++, val); // 11 + _mm512_stream_si512(ptr++, val); // 12 + _mm512_stream_si512(ptr++, val); // 13 + _mm512_stream_si512(ptr++, val); // 14 + _mm512_stream_si512(ptr++, val); // 15 + _mm512_stream_si512(ptr++, val); // 16 + _mm512_stream_si512(ptr++, val); // 17 + _mm512_stream_si512(ptr++, val); // 18 + _mm512_stream_si512(ptr++, val); // 19 + _mm512_stream_si512(ptr++, val); // 20 + _mm512_stream_si512(ptr++, val); // 21 + _mm512_stream_si512(ptr++, val); // 22 + _mm512_stream_si512(ptr++, val); // 23 + _mm512_stream_si512(ptr++, val); // 24 + _mm512_stream_si512(ptr++, val); // 25 + _mm512_stream_si512(ptr++, val); // 26 + _mm512_stream_si512(ptr++, val); // 27 + _mm512_stream_si512(ptr++, val); // 28 + _mm512_stream_si512(ptr++, val); // 29 + _mm512_stream_si512(ptr++, val); // 30 + _mm512_stream_si512(ptr++, val); // 31 + _mm512_stream_si512(ptr++, val); // 32 + } + _mm_sfence(); + + return dest; +} + +// 4096 bytes, or 4 kB, also 1 page +void * memset_512bit_4kB_as(void *dest, const __m512i val, size_t len) +{ + __m512i *ptr = (__m512i*)dest; + + while (len--) + { + _mm512_stream_si512(ptr++, val); // 1 + _mm512_stream_si512(ptr++, val); // 2 + _mm512_stream_si512(ptr++, val); // 3 + _mm512_stream_si512(ptr++, val); // 4 + _mm512_stream_si512(ptr++, val); // 5 + _mm512_stream_si512(ptr++, val); // 6 + _mm512_stream_si512(ptr++, val); // 7 + _mm512_stream_si512(ptr++, val); // 8 + _mm512_stream_si512(ptr++, val); // 9 + _mm512_stream_si512(ptr++, val); // 10 + _mm512_stream_si512(ptr++, val); // 11 + _mm512_stream_si512(ptr++, val); // 12 + _mm512_stream_si512(ptr++, val); // 13 + _mm512_stream_si512(ptr++, val); // 14 + _mm512_stream_si512(ptr++, val); // 15 + _mm512_stream_si512(ptr++, val); // 16 + _mm512_stream_si512(ptr++, val); // 17 + _mm512_stream_si512(ptr++, val); // 18 + _mm512_stream_si512(ptr++, val); // 19 + _mm512_stream_si512(ptr++, val); // 20 + _mm512_stream_si512(ptr++, val); // 21 + _mm512_stream_si512(ptr++, val); // 22 + _mm512_stream_si512(ptr++, val); // 23 + _mm512_stream_si512(ptr++, val); // 24 + 
_mm512_stream_si512(ptr++, val); // 25
+    _mm512_stream_si512(ptr++, val); // 26
+    _mm512_stream_si512(ptr++, val); // 27
+    _mm512_stream_si512(ptr++, val); // 28
+    _mm512_stream_si512(ptr++, val); // 29
+    _mm512_stream_si512(ptr++, val); // 30
+    _mm512_stream_si512(ptr++, val); // 31
+    _mm512_stream_si512(ptr++, val); // 32
+    _mm512_stream_si512(ptr++, val); // 1
+    _mm512_stream_si512(ptr++, val); // 2
+    _mm512_stream_si512(ptr++, val); // 3
+    _mm512_stream_si512(ptr++, val); // 4
+    _mm512_stream_si512(ptr++, val); // 5
+    _mm512_stream_si512(ptr++, val); // 6
+    _mm512_stream_si512(ptr++, val); // 7
+    _mm512_stream_si512(ptr++, val); // 8
+    _mm512_stream_si512(ptr++, val); // 9
+    _mm512_stream_si512(ptr++, val); // 10
+    _mm512_stream_si512(ptr++, val); // 11
+    _mm512_stream_si512(ptr++, val); // 12
+    _mm512_stream_si512(ptr++, val); // 13
+    _mm512_stream_si512(ptr++, val); // 14
+    _mm512_stream_si512(ptr++, val); // 15
+    _mm512_stream_si512(ptr++, val); // 16
+    _mm512_stream_si512(ptr++, val); // 17
+    _mm512_stream_si512(ptr++, val); // 18
+    _mm512_stream_si512(ptr++, val); // 19
+    _mm512_stream_si512(ptr++, val); // 20
+    _mm512_stream_si512(ptr++, val); // 21
+    _mm512_stream_si512(ptr++, val); // 22
+    _mm512_stream_si512(ptr++, val); // 23
+    _mm512_stream_si512(ptr++, val); // 24
+    _mm512_stream_si512(ptr++, val); // 25
+    _mm512_stream_si512(ptr++, val); // 26
+    _mm512_stream_si512(ptr++, val); // 27
+    _mm512_stream_si512(ptr++, val); // 28
+    _mm512_stream_si512(ptr++, val); // 29
+    _mm512_stream_si512(ptr++, val); // 30
+    _mm512_stream_si512(ptr++, val); // 31
+    _mm512_stream_si512(ptr++, val); // 32
+  }
+  _mm_sfence();
+
+  return dest;
+}
+
+#endif
+
+//-----------------------------------------------------------------------------
+// Dispatch Functions:
+//-----------------------------------------------------------------------------
+
+// Set arbitrarily large amounts of a single byte
+void * memset_large(void *dest, const uint8_t val, size_t numbytes)
+{
+  void * returnval = dest; // Memset is supposed to return the initial destination
+
+  if(val == 0) // Someone called this instead of memset_zeroes directly
+  {
+    memset_zeroes(dest, numbytes);
+    return returnval;
+  }
+
+  size_t offset = 0; // Offset size needs to match the size of a pointer
+
+  while(numbytes)
+  // Each memset has its own loop.
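+  // Worked example (illustrative, AVX-512 path): numbytes == 5000 first sets
+  // one 4 kB block (5000 >> 12 == 1), leaving 904 bytes; later iterations
+  // take the 512 B, 256 B, and 128 B cases, and the final 8 bytes fall
+  // through to plain memset.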
+ { + if(numbytes < 16) // 1-15 bytes (the other scalars would need to be memset anyways) + { + memset(dest, val, numbytes); + offset = numbytes; + dest = (char *)dest + offset; + numbytes = 0; + } +#ifdef __AVX512F__ + else if(numbytes < 32) // 16 bytes + { + memset_128bit_u(dest, _mm_set1_epi8((char)val), numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memset_256bit_u(dest, _mm256_set1_epi8((char)val), numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memset_512bit_u(dest, _mm512_set1_epi8((char)val), numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memset_512bit_128B_u(dest, _mm512_set1_epi8((char)val), numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memset_512bit_256B_u(dest, _mm512_set1_epi8((char)val), numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + numbytes &= 255; + } + else if(numbytes < 1024) // 512 bytes + { + memset_512bit_512B_u(dest, _mm512_set1_epi8((char)val), numbytes >> 9); + offset = numbytes & -512; + dest = (char *)dest + offset; + numbytes &= 511; + } + else if(numbytes < 2048) // 1024 bytes (1 kB) + { + memset_512bit_1kB_u(dest, _mm512_set1_epi8((char)val), numbytes >> 10); + offset = numbytes & -1024; + dest = (char *)dest + offset; + numbytes &= 1023; + } + else if(numbytes < 4096) // 2048 bytes (2 kB) + { + memset_512bit_2kB_u(dest, _mm512_set1_epi8((char)val), numbytes >> 11); + offset = numbytes & -2048; + dest = (char *)dest + offset; + numbytes &= 2047; + } + else // 4096 bytes (4 kB) + { + memset_512bit_4kB_u(dest, _mm512_set1_epi8((char)val), numbytes >> 12); + offset = numbytes & -4096; + dest = (char *)dest + offset; + numbytes &= 4095; + } +#elif __AVX__ + else if(numbytes < 32) // 16 bytes + { + memset_128bit_u(dest, _mm_set1_epi8((char)val), numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memset_256bit_u(dest, _mm256_set1_epi8((char)val), numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memset_256bit_64B_u(dest, _mm256_set1_epi8((char)val), numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memset_256bit_128B_u(dest, _mm256_set1_epi8((char)val), numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memset_256bit_256B_u(dest, _mm256_set1_epi8((char)val), numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + numbytes &= 255; + } + else // 512 bytes + { + memset_256bit_512B_u(dest, _mm256_set1_epi8((char)val), numbytes >> 9); + offset = numbytes & -512; + dest = (char *)dest + offset; + numbytes &= 511; + } +#else // SSE2 only + else if(numbytes < 32) // 16 bytes + { + memset_128bit_u(dest, _mm_set1_epi8((char)val), numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memset_128bit_32B_u(dest, _mm_set1_epi8((char)val), numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + 
numbytes &= 31;
+    }
+    else if(numbytes < 128) // 64 bytes
+    {
+      memset_128bit_64B_u(dest, _mm_set1_epi8((char)val), numbytes >> 6);
+      offset = numbytes & -64;
+      dest = (char *)dest + offset;
+      numbytes &= 63;
+    }
+    else if(numbytes < 256) // 128 bytes
+    {
+      memset_128bit_128B_u(dest, _mm_set1_epi8((char)val), numbytes >> 7);
+      offset = numbytes & -128;
+      dest = (char *)dest + offset;
+      numbytes &= 127;
+    }
+    else // 256 bytes
+    {
+      memset_128bit_256B_u(dest, _mm_set1_epi8((char)val), numbytes >> 8);
+      offset = numbytes & -256;
+      dest = (char *)dest + offset;
+      numbytes &= 255;
+    }
+#endif
+  }
+  return returnval;
+} // END MEMSET LARGE, UNALIGNED
+
+// Set arbitrarily large amounts of a single byte
+// Aligned version
+void * memset_large_a(void *dest, const uint8_t val, size_t numbytes)
+{
+  void * returnval = dest; // Memset is supposed to return the initial destination
+
+  if(val == 0) // Someone called this instead of memset_zeroes directly
+  {
+    memset_zeroes_a(dest, numbytes);
+    return returnval;
+  }
+
+  size_t offset = 0; // Offset size needs to match the size of a pointer
+
+  while(numbytes)
+  // Each memset has its own loop.
+  {
+    if(numbytes < 16) // 1-15 bytes (the other scalars would need to be memset anyways)
+    {
+      memset(dest, val, numbytes);
+      offset = numbytes;
+      dest = (char *)dest + offset;
+      numbytes = 0;
+    }
+#ifdef __AVX512F__
+    else if(numbytes < 32) // 16 bytes
+    {
+      memset_128bit_a(dest, _mm_set1_epi8((char)val), numbytes >> 4);
+      offset = numbytes & -16;
+      dest = (char *)dest + offset;
+      numbytes &= 15;
+    }
+    else if(numbytes < 64) // 32 bytes
+    {
+      memset_256bit_a(dest, _mm256_set1_epi8((char)val), numbytes >> 5);
+      offset = numbytes & -32;
+      dest = (char *)dest + offset;
+      numbytes &= 31;
+    }
+    else if(numbytes < 128) // 64 bytes
+    {
+      memset_512bit_a(dest, _mm512_set1_epi8((char)val), numbytes >> 6);
+      offset = numbytes & -64;
+      dest = (char *)dest + offset;
+      numbytes &= 63;
+    }
+    else if(numbytes < 256) // 128 bytes
+    {
+      memset_512bit_128B_a(dest, _mm512_set1_epi8((char)val), numbytes >> 7);
+      offset = numbytes & -128;
+      dest = (char *)dest + offset;
+      numbytes &= 127;
+    }
+    else if(numbytes < 512) // 256 bytes
+    {
+      memset_512bit_256B_a(dest, _mm512_set1_epi8((char)val), numbytes >> 8);
+      offset = numbytes & -256;
+      dest = (char *)dest + offset;
+      numbytes &= 255;
+    }
+    else if(numbytes < 1024) // 512 bytes
+    {
+      memset_512bit_512B_a(dest, _mm512_set1_epi8((char)val), numbytes >> 9);
+      offset = numbytes & -512;
+      dest = (char *)dest + offset;
+      numbytes &= 511;
+    }
+    else if(numbytes < 2048) // 1024 bytes (1 kB)
+    {
+      memset_512bit_1kB_a(dest, _mm512_set1_epi8((char)val), numbytes >> 10);
+      offset = numbytes & -1024;
+      dest = (char *)dest + offset;
+      numbytes &= 1023;
+    }
+    else if(numbytes < 4096) // 2048 bytes (2 kB)
+    {
+      memset_512bit_2kB_a(dest, _mm512_set1_epi8((char)val), numbytes >> 11);
+      offset = numbytes & -2048;
+      dest = (char *)dest + offset;
+      numbytes &= 2047;
+    }
+    else // 4096 bytes (4 kB)
+    {
+      memset_512bit_4kB_a(dest, _mm512_set1_epi8((char)val), numbytes >> 12);
+      offset = numbytes & -4096;
+      dest = (char *)dest + offset;
+      numbytes &= 4095;
+    }
+#elif __AVX__
+    else if(numbytes < 32) // 16 bytes
+    {
+      memset_128bit_a(dest, _mm_set1_epi8((char)val), numbytes >> 4);
+      offset = numbytes & -16;
+      dest = (char *)dest + offset;
+      numbytes &= 15;
+    }
+    else if(numbytes < 64) // 32 bytes
+    {
+      memset_256bit_a(dest, _mm256_set1_epi8((char)val), numbytes >> 5);
+      offset = numbytes & -32;
+      dest = (char *)dest + offset;
+      numbytes &= 31;
+    }
+    else if(numbytes < 128) // 64 bytes
+    {
+      memset_256bit_64B_a(dest, _mm256_set1_epi8((char)val), numbytes >> 6);
+      offset = numbytes & -64;
+      dest = (char *)dest + offset;
+      numbytes &= 63;
+    }
+    else if(numbytes < 256) // 128 bytes
+    {
+      memset_256bit_128B_a(dest, _mm256_set1_epi8((char)val), numbytes >> 7);
+      offset = numbytes & -128;
+      dest = (char *)dest + offset;
+      numbytes &= 127;
+    }
+    else if(numbytes < 512) // 256 bytes
+    {
+      memset_256bit_256B_a(dest, _mm256_set1_epi8((char)val), numbytes >> 8);
+      offset = numbytes & -256;
+      dest = (char *)dest + offset;
+      numbytes &= 255;
+    }
+    else // 512 bytes
+    {
+      memset_256bit_512B_a(dest, _mm256_set1_epi8((char)val), numbytes >> 9);
+      offset = numbytes & -512;
+      dest = (char *)dest + offset;
+      numbytes &= 511;
+    }
+#else // SSE2 only
+    else if(numbytes < 32) // 16 bytes
+    {
+      memset_128bit_a(dest, _mm_set1_epi8((char)val), numbytes >> 4);
+      offset = numbytes & -16;
+      dest = (char *)dest + offset;
+      numbytes &= 15;
+    }
+    else if(numbytes < 64) // 32 bytes
+    {
+      memset_128bit_32B_a(dest, _mm_set1_epi8((char)val), numbytes >> 5);
+      offset = numbytes & -32;
+      dest = (char *)dest + offset;
+      numbytes &= 31;
+    }
+    else if(numbytes < 128) // 64 bytes
+    {
+      memset_128bit_64B_a(dest, _mm_set1_epi8((char)val), numbytes >> 6);
+      offset = numbytes & -64;
+      dest = (char *)dest + offset;
+      numbytes &= 63;
+    }
+    else if(numbytes < 256) // 128 bytes
+    {
+      memset_128bit_128B_a(dest, _mm_set1_epi8((char)val), numbytes >> 7);
+      offset = numbytes & -128;
+      dest = (char *)dest + offset;
+      numbytes &= 127;
+    }
+    else // 256 bytes
+    {
+      memset_128bit_256B_a(dest, _mm_set1_epi8((char)val), numbytes >> 8);
+      offset = numbytes & -256;
+      dest = (char *)dest + offset;
+      numbytes &= 255;
+    }
+#endif
+  }
+  return returnval;
+} // END MEMSET LARGE, ALIGNED
+
+// Set arbitrarily large amounts of a single byte
+// Aligned, streaming version
+void * memset_large_as(void *dest, const uint8_t val, size_t numbytes)
+{
+  void * returnval = dest; // Memset is supposed to return the initial destination
+
+  if(val == 0) // Someone called this instead of memset_zeroes directly
+  {
+    memset_zeroes_as(dest, numbytes);
+    return returnval;
+  }
+
+  size_t offset = 0; // Offset size needs to match the size of a pointer
+
+  while(numbytes)
+  // Each memset has its own loop.
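+  // Same dispatch as memset_large_a above, but through the streaming (_as)
+  // stores. dest must already meet BYTE_ALIGNMENT here: non-temporal stores
+  // fault on misaligned addresses.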
+ { + if(numbytes < 16) // 1-15 bytes (the other scalars would need to be memset anyways) + { + memset(dest, val, numbytes); + offset = numbytes; + dest = (char *)dest + offset; + numbytes = 0; + } +#ifdef __AVX512F__ + else if(numbytes < 32) // 16 bytes + { + memset_128bit_as(dest, _mm_set1_epi8((char)val), numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memset_256bit_as(dest, _mm256_set1_epi8((char)val), numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memset_512bit_as(dest, _mm512_set1_epi8((char)val), numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memset_512bit_128B_as(dest, _mm512_set1_epi8((char)val), numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memset_512bit_256B_as(dest, _mm512_set1_epi8((char)val), numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + numbytes &= 255; + } + else if(numbytes < 1024) // 512 bytes + { + memset_512bit_512B_as(dest, _mm512_set1_epi8((char)val), numbytes >> 9); + offset = numbytes & -512; + dest = (char *)dest + offset; + numbytes &= 511; + } + else if(numbytes < 2048) // 1024 bytes (1 kB) + { + memset_512bit_1kB_as(dest, _mm512_set1_epi8((char)val), numbytes >> 10); + offset = numbytes & -1024; + dest = (char *)dest + offset; + numbytes &= 1023; + } + else if(numbytes < 4096) // 2048 bytes (2 kB) + { + memset_512bit_2kB_as(dest, _mm512_set1_epi8((char)val), numbytes >> 11); + offset = numbytes & -2048; + dest = (char *)dest + offset; + numbytes &= 2047; + } + else // 4096 bytes (4 kB) + { + memset_512bit_4kB_as(dest, _mm512_set1_epi8((char)val), numbytes >> 12); + offset = numbytes & -4096; + dest = (char *)dest + offset; + numbytes &= 4095; + } +#elif __AVX__ + else if(numbytes < 32) // 16 bytes + { + memset_128bit_as(dest, _mm_set1_epi8((char)val), numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memset_256bit_as(dest, _mm256_set1_epi8((char)val), numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memset_256bit_64B_as(dest, _mm256_set1_epi8((char)val), numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memset_256bit_128B_as(dest, _mm256_set1_epi8((char)val), numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memset_256bit_256B_as(dest, _mm256_set1_epi8((char)val), numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + numbytes &= 255; + } + else // 512 bytes + { + memset_256bit_512B_as(dest, _mm256_set1_epi8((char)val), numbytes >> 9); + offset = numbytes & -512; + dest = (char *)dest + offset; + numbytes &= 511; + } +#else // SSE2 only + else if(numbytes < 32) // 16 bytes + { + memset_128bit_as(dest, _mm_set1_epi8((char)val), numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memset_128bit_32B_as(dest, _mm_set1_epi8((char)val), numbytes >> 5); + offset = numbytes & -32; + dest = (char 
*)dest + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memset_128bit_64B_as(dest, _mm_set1_epi8((char)val), numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memset_128bit_128B_as(dest, _mm_set1_epi8((char)val), numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + numbytes &= 127; + } + else // 256 bytes + { + memset_128bit_256B_as(dest, _mm_set1_epi8((char)val), numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + numbytes &= 255; + } +#endif + } + return returnval; +} // END MEMSET LARGE, ALIGNED, STREAMING + +// Set arbitrarily large amounts of only zeroes +void * memset_zeroes(void *dest, size_t numbytes) // Worst-case scenario: 127 bytes +{ + void * returnval = dest; // Memset is supposed to return the initial destination + size_t offset = 0; + + while(numbytes) + // Each memset has its own loop. + { + if(numbytes < 2) // 1 byte + { + memset(dest, 0, numbytes); + offset = numbytes & -1; + dest = (char *)dest + offset; + numbytes = 0; + } + else if(numbytes < 4) // 2 bytes + { + memset_16bit(dest, 0, numbytes >> 1); + offset = numbytes & -2; + dest = (char *)dest + offset; + numbytes &= 1; + } + else if(numbytes < 8) // 4 bytes + { + memset_32bit(dest, 0, numbytes >> 2); + offset = numbytes & -4; + dest = (char *)dest + offset; + numbytes &= 3; + } + else if(numbytes < 16) // 8 bytes + { + memset_64bit(dest, 0, numbytes >> 3); + offset = numbytes & -8; + dest = (char *)dest + offset; + numbytes &= 7; + } +#ifdef __AVX512F__ + else if(numbytes < 32) // 16 bytes + { + memset_128bit_u(dest, _mm_setzero_si128(), numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memset_256bit_u(dest, _mm256_setzero_si256(), numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memset_512bit_u(dest, _mm512_setzero_si512(), numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memset_512bit_128B_u(dest, _mm512_setzero_si512(), numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memset_512bit_256B_u(dest, _mm512_setzero_si512(), numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + numbytes &= 255; + } + else if(numbytes < 1024) // 512 bytes + { + memset_512bit_512B_u(dest, _mm512_setzero_si512(), numbytes >> 9); + offset = numbytes & -512; + dest = (char *)dest + offset; + numbytes &= 511; + } + else if(numbytes < 2048) // 1024 bytes (1 kB) + { + memset_512bit_1kB_u(dest, _mm512_setzero_si512(), numbytes >> 10); + offset = numbytes & -1024; + dest = (char *)dest + offset; + numbytes &= 1023; + } + else if(numbytes < 4096) // 2048 bytes (2 kB) + { + memset_512bit_2kB_u(dest, _mm512_setzero_si512(), numbytes >> 11); + offset = numbytes & -2048; + dest = (char *)dest + offset; + numbytes &= 2047; + } + else // 4096 bytes (4 kB) + { + memset_512bit_4kB_u(dest, _mm512_setzero_si512(), numbytes >> 12); + offset = numbytes & -4096; + dest = (char *)dest + offset; + numbytes &= 4095; + } +#elif __AVX__ + else if(numbytes < 32) // 16 bytes + { + memset_128bit_u(dest, _mm_setzero_si128(), numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + numbytes &= 
15; + } + else if(numbytes < 64) // 32 bytes + { + memset_256bit_u(dest, _mm256_setzero_si256(), numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memset_256bit_64B_u(dest, _mm256_setzero_si256(), numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memset_256bit_128B_u(dest, _mm256_setzero_si256(), numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memset_256bit_256B_u(dest, _mm256_setzero_si256(), numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + numbytes &= 255; + } + else // 512 bytes + { + memset_256bit_512B_u(dest, _mm256_setzero_si256(), numbytes >> 9); + offset = numbytes & -512; + dest = (char *)dest + offset; + numbytes &= 511; + } +#else // SSE2 only + else if(numbytes < 32) // 16 bytes + { + memset_128bit_u(dest, _mm_setzero_si128(), numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memset_128bit_32B_u(dest, _mm_setzero_si128(), numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memset_128bit_64B_u(dest, _mm_setzero_si128(), numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memset_128bit_128B_u(dest, _mm_setzero_si128(), numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + numbytes &= 127; + } + else // 256 bytes + { + memset_128bit_256B_u(dest, _mm_setzero_si128(), numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + numbytes &= 255; + } +#endif + } + return returnval; +} // END MEMSET ZEROES, UNALIGNED + +// Set arbitrarily large amounts of only zeroes +// Aligned version +void * memset_zeroes_a(void *dest, size_t numbytes) // Worst-case scenario: 127 bytes +{ + void * returnval = dest; // Memset is supposed to return the initial destination + size_t offset = 0; + + while(numbytes) + // Each memset has its own loop. 
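+ // Note on the index math used below: for a power of two 2^k,
+ // (numbytes & -(2^k)) rounds numbytes down to a multiple of 2^k (the bytes
+ // just written) and (numbytes & (2^k - 1)) keeps the remainder, so the two
+ // always sum to the original numbytes. For example, numbytes = 45 with
+ // k = 3: 45 & -8 = 40 and 45 & 7 = 5.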
+ { + if(numbytes < 2) // 1 byte + { + memset(dest, 0, numbytes); + offset = numbytes & -1; + dest = (char *)dest + offset; + numbytes = 0; + } + else if(numbytes < 4) // 2 bytes + { + memset_16bit(dest, 0, numbytes >> 1); + offset = numbytes & -2; + dest = (char *)dest + offset; + numbytes &= 1; + } + else if(numbytes < 8) // 4 bytes + { + memset_32bit(dest, 0, numbytes >> 2); + offset = numbytes & -4; + dest = (char *)dest + offset; + numbytes &= 3; + } + else if(numbytes < 16) // 8 bytes + { + memset_64bit(dest, 0, numbytes >> 3); + offset = numbytes & -8; + dest = (char *)dest + offset; + numbytes &= 7; + } +#ifdef __AVX512F__ + else if(numbytes < 32) // 16 bytes + { + memset_128bit_a(dest, _mm_setzero_si128(), numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memset_256bit_a(dest, _mm256_setzero_si256(), numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memset_512bit_a(dest, _mm512_setzero_si512(), numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memset_512bit_128B_a(dest, _mm512_setzero_si512(), numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memset_512bit_256B_a(dest, _mm512_setzero_si512(), numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + numbytes &= 255; + } + else if(numbytes < 1024) // 512 bytes + { + memset_512bit_512B_a(dest, _mm512_setzero_si512(), numbytes >> 9); + offset = numbytes & -512; + dest = (char *)dest + offset; + numbytes &= 511; + } + else if(numbytes < 2048) // 1024 bytes (1 kB) + { + memset_512bit_1kB_a(dest, _mm512_setzero_si512(), numbytes >> 10); + offset = numbytes & -1024; + dest = (char *)dest + offset; + numbytes &= 1023; + } + else if(numbytes < 4096) // 2048 bytes (2 kB) + { + memset_512bit_2kB_a(dest, _mm512_setzero_si512(), numbytes >> 11); + offset = numbytes & -2048; + dest = (char *)dest + offset; + numbytes &= 2047; + } + else // 4096 bytes (4 kB) + { + memset_512bit_4kB_a(dest, _mm512_setzero_si512(), numbytes >> 12); + offset = numbytes & -4096; + dest = (char *)dest + offset; + numbytes &= 4095; + } +#elif __AVX__ + else if(numbytes < 32) // 16 bytes + { + memset_128bit_a(dest, _mm_setzero_si128(), numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memset_256bit_a(dest, _mm256_setzero_si256(), numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memset_256bit_64B_a(dest, _mm256_setzero_si256(), numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memset_256bit_128B_a(dest, _mm256_setzero_si256(), numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memset_256bit_256B_a(dest, _mm256_setzero_si256(), numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + numbytes &= 255; + } + else // 512 bytes + { + memset_256bit_512B_a(dest, _mm256_setzero_si256(), numbytes >> 9); + offset = numbytes & -512; + dest = (char *)dest + offset; + numbytes &= 511; + } +#else // SSE2 only + else 
if(numbytes < 32) // 16 bytes + { + memset_128bit_a(dest, _mm_setzero_si128(), numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memset_128bit_32B_a(dest, _mm_setzero_si128(), numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memset_128bit_64B_a(dest, _mm_setzero_si128(), numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memset_128bit_128B_a(dest, _mm_setzero_si128(), numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + numbytes &= 127; + } + else // 256 bytes + { + memset_128bit_256B_a(dest, _mm_setzero_si128(), numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + numbytes &= 255; + } +#endif + } + return returnval; +} // END MEMSET ZEROES, ALIGNED + +// Set arbitrarily large amounts of only zeroes +// Aligned, streaming version +void * memset_zeroes_as(void *dest, size_t numbytes) // Worst-case scenario: 127 bytes +{ + void * returnval = dest; // Memset is supposed to return the initial destination + size_t offset = 0; + + while(numbytes) + // Each memset has its own loop. + { + if(numbytes < 2) // 1 byte + { + memset(dest, 0, numbytes); + offset = numbytes & -1; + dest = (char *)dest + offset; + numbytes = 0; + } + else if(numbytes < 4) // 2 bytes + { + memset_16bit(dest, 0, numbytes >> 1); + offset = numbytes & -2; + dest = (char *)dest + offset; + numbytes &= 1; + } + else if(numbytes < 8) // 4 bytes + { + memset_32bit(dest, 0, numbytes >> 2); + offset = numbytes & -4; + dest = (char *)dest + offset; + numbytes &= 3; + } + else if(numbytes < 16) // 8 bytes + { + memset_64bit(dest, 0, numbytes >> 3); + offset = numbytes & -8; + dest = (char *)dest + offset; + numbytes &= 7; + } +#ifdef __AVX512F__ + else if(numbytes < 32) // 16 bytes + { + memset_128bit_as(dest, _mm_setzero_si128(), numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memset_256bit_as(dest, _mm256_setzero_si256(), numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memset_512bit_as(dest, _mm512_setzero_si512(), numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memset_512bit_128B_as(dest, _mm512_setzero_si512(), numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memset_512bit_256B_as(dest, _mm512_setzero_si512(), numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + numbytes &= 255; + } + else if(numbytes < 1024) // 512 bytes + { + memset_512bit_512B_as(dest, _mm512_setzero_si512(), numbytes >> 9); + offset = numbytes & -512; + dest = (char *)dest + offset; + numbytes &= 511; + } + else if(numbytes < 2048) // 1024 bytes (1 kB) + { + memset_512bit_1kB_as(dest, _mm512_setzero_si512(), numbytes >> 10); + offset = numbytes & -1024; + dest = (char *)dest + offset; + numbytes &= 1023; + } + else if(numbytes < 4096) // 2048 bytes (2 kB) + { + memset_512bit_2kB_as(dest, _mm512_setzero_si512(), numbytes >> 11); + offset = numbytes & -2048; + dest = (char *)dest + offset; + numbytes &= 2047; + } + else // 4096 bytes (4 kB) + { + 
memset_512bit_4kB_as(dest, _mm512_setzero_si512(), numbytes >> 12); + offset = numbytes & -4096; + dest = (char *)dest + offset; + numbytes &= 4095; + } +#elif __AVX__ + else if(numbytes < 32) // 16 bytes + { + memset_128bit_as(dest, _mm_setzero_si128(), numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memset_256bit_as(dest, _mm256_setzero_si256(), numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memset_256bit_64B_as(dest, _mm256_setzero_si256(), numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memset_256bit_128B_as(dest, _mm256_setzero_si256(), numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + numbytes &= 127; + } + else if(numbytes < 512) // 256 bytes + { + memset_256bit_256B_as(dest, _mm256_setzero_si256(), numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + numbytes &= 255; + } + else // 512 bytes + { + memset_256bit_512B_as(dest, _mm256_setzero_si256(), numbytes >> 9); + offset = numbytes & -512; + dest = (char *)dest + offset; + numbytes &= 511; + } +#else // SSE2 only + else if(numbytes < 32) // 16 bytes + { + memset_128bit_as(dest, _mm_setzero_si128(), numbytes >> 4); + offset = numbytes & -16; + dest = (char *)dest + offset; + numbytes &= 15; + } + else if(numbytes < 64) // 32 bytes + { + memset_128bit_32B_as(dest, _mm_setzero_si128(), numbytes >> 5); + offset = numbytes & -32; + dest = (char *)dest + offset; + numbytes &= 31; + } + else if(numbytes < 128) // 64 bytes + { + memset_128bit_64B_as(dest, _mm_setzero_si128(), numbytes >> 6); + offset = numbytes & -64; + dest = (char *)dest + offset; + numbytes &= 63; + } + else if(numbytes < 256) // 128 bytes + { + memset_128bit_128B_as(dest, _mm_setzero_si128(), numbytes >> 7); + offset = numbytes & -128; + dest = (char *)dest + offset; + numbytes &= 127; + } + else // 256 bytes + { + memset_128bit_256B_as(dest, _mm_setzero_si128(), numbytes >> 8); + offset = numbytes & -256; + dest = (char *)dest + offset; + numbytes &= 255; + } +#endif + } + return returnval; +} // END MEMSET ZEROES, ALIGNED, STREAMING + +// Set arbitrarily large amounts of 4-byte values +// numbytes_div_4 is total number of bytes / 4 (since this is 4 bytes at a time) +void * memset_large_4B(void *dest, const uint32_t val, size_t numbytes_div_4) +{ + void * returnval = dest; // Memset is supposed to return the initial destination + size_t offset = 0; // Offset size needs to match the size of a pointer + + while(numbytes_div_4) + // Each memset has its own loop. 
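+ // Reminder: the count here is in 4-byte units, not bytes. For example,
+ // filling one 640-pixel scanline of a 32-bit framebuffer would be
+ // memset_large_4B(row, color, 640), which writes 640 * 4 = 2560 bytes.
+ // (`row` and `color` are placeholder names for illustration.)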
+ { + if(numbytes_div_4 < 4) // 4, 8, 12 bytes (the other scalars would need to be memset anyways) + { + memset_32bit(dest, val, numbytes_div_4); + offset = numbytes_div_4; + dest = (char *)dest + offset; + numbytes_div_4 = 0; + } +#ifdef __AVX512F__ + else if(numbytes_div_4 < 8) // 16 bytes + { + memset_128bit_u(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 2); + offset = numbytes_div_4 & -4; + dest = (char *)dest + offset; + numbytes_div_4 &= 3; + } + else if(numbytes_div_4 < 16) // 32 bytes + { + memset_256bit_u(dest, _mm256_set1_epi32((int32_t)val), numbytes_div_4 >> 3); + offset = numbytes_div_4 & -8; + dest = (char *)dest + offset; + numbytes_div_4 &= 7; + } + else if(numbytes_div_4 < 32) // 64 bytes + { + memset_512bit_u(dest, _mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 4); + offset = numbytes_div_4 & -16; + dest = (char *)dest + offset; + numbytes_div_4 &= 15; + } + else if(numbytes_div_4 < 64) // 128 bytes + { + memset_512bit_128B_u(dest, _mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 5); + offset = numbytes_div_4 & -32; + dest = (char *)dest + offset; + numbytes_div_4 &= 31; + } + else if(numbytes_div_4 < 128) // 256 bytes + { + memset_512bit_256B_u(dest, _mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 6); + offset = numbytes_div_4 & -64; + dest = (char *)dest + offset; + numbytes_div_4 &= 63; + } + else if(numbytes_div_4 < 256) // 512 bytes + { + memset_512bit_512B_u(dest, _mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 7); + offset = numbytes_div_4 & -128; + dest = (char *)dest + offset; + numbytes_div_4 &= 127; + } + else if(numbytes_div_4 < 512) // 1024 bytes (1 kB) + { + memset_512bit_1kB_u(dest, _mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 8); + offset = numbytes_div_4 & -256; + dest = (char *)dest + offset; + numbytes_div_4 &= 255; + } + else if(numbytes_div_4 < 1024) // 2048 bytes (2 kB) + { + memset_512bit_2kB_u(dest, _mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 9); + offset = numbytes_div_4 & -512; + dest = (char *)dest + offset; + numbytes_div_4 &= 511; + } + else // 4096 bytes (4 kB) + { + memset_512bit_4kB_u(dest, _mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 10); + offset = numbytes_div_4 & -1024; + dest = (char *)dest + offset; + numbytes_div_4 &= 1023; + } +#elif __AVX__ + else if(numbytes_div_4 < 8) // 16 bytes + { + memset_128bit_u(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 2); + offset = numbytes_div_4 & -4; + dest = (char *)dest + offset; + numbytes_div_4 &= 3; + } + else if(numbytes_div_4 < 16) // 32 bytes + { + memset_256bit_u(dest, _mm256_set1_epi32((int32_t)val), numbytes_div_4 >> 3); + offset = numbytes_div_4 & -8; + dest = (char *)dest + offset; + numbytes_div_4 &= 7; + } + else if(numbytes_div_4 < 32) // 64 bytes + { + memset_256bit_64B_u(dest, _mm256_set1_epi32((int32_t)val), numbytes_div_4 >> 4); + offset = numbytes_div_4 & -16; + dest = (char *)dest + offset; + numbytes_div_4 &= 15; + } + else if(numbytes_div_4 < 64) // 128 bytes + { + memset_256bit_128B_u(dest, _mm256_set1_epi32((int32_t)val), numbytes_div_4 >> 5); + offset = numbytes_div_4 & -32; + dest = (char *)dest + offset; + numbytes_div_4 &= 31; + } + else if(numbytes_div_4 < 128) // 256 bytes + { + memset_256bit_256B_u(dest, _mm256_set1_epi32((int32_t)val), numbytes_div_4 >> 6); + offset = numbytes_div_4 & -64; + dest = (char *)dest + offset; + numbytes_div_4 &= 63; + } + else // 512 bytes + { + memset_256bit_512B_u(dest, _mm256_set1_epi32((int32_t)val), numbytes_div_4 >> 7); + offset = numbytes_div_4 & -128; + dest = (char *)dest + 
offset; + numbytes_div_4 &= 127; + } +#else // SSE2 only + else if(numbytes_div_4 < 8) // 16 bytes + { + memset_128bit_u(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 2); + offset = numbytes_div_4 & -4; + dest = (char *)dest + offset; + numbytes_div_4 &= 3; + } + else if(numbytes_div_4 < 16) // 32 bytes + { + memset_128bit_32B_u(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 3); + offset = numbytes_div_4 & -8; + dest = (char *)dest + offset; + numbytes_div_4 &= 7; + } + else if(numbytes_div_4 < 32) // 64 bytes + { + memset_128bit_64B_u(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 4); + offset = numbytes_div_4 & -16; + dest = (char *)dest + offset; + numbytes_div_4 &= 15; + } + else if(numbytes_div_4 < 64) // 128 bytes + { + memset_128bit_128B_u(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 5); + offset = numbytes_div_4 & -32; + dest = (char *)dest + offset; + numbytes_div_4 &= 31; + } + else // 256 bytes + { + memset_128bit_256B_u(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 6); + offset = numbytes_div_4 & -64; + dest = (char *)dest + offset; + numbytes_div_4 &= 63; + } +#endif + } + return returnval; +} // END MEMSET LARGE, 4B, UNALIGNED + +// Set arbitrarily large amounts of 4-byte values +// numbytes_div_4 is total number of bytes / 4 (since this is 4 bytes at a time) +// Aligned version +void * memset_large_4B_a(void *dest, const uint32_t val, size_t numbytes_div_4) +{ + void * returnval = dest; // Memset is supposed to return the initial destination + size_t offset = 0; // Offset size needs to match the size of a pointer + + while(numbytes_div_4) + // Each memset has its own loop. + { + if(numbytes_div_4 < 4) // 4, 8, 12 bytes (the other scalars would need to be memset anyways) + { + memset_32bit(dest, val, numbytes_div_4); + offset = numbytes_div_4; + dest = (char *)dest + offset; + numbytes_div_4 = 0; + } +#ifdef __AVX512F__ + else if(numbytes_div_4 < 8) // 16 bytes + { + memset_128bit_a(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 2); + offset = numbytes_div_4 & -4; + dest = (char *)dest + offset; + numbytes_div_4 &= 3; + } + else if(numbytes_div_4 < 16) // 32 bytes + { + memset_256bit_a(dest, _mm256_set1_epi32((int32_t)val), numbytes_div_4 >> 3); + offset = numbytes_div_4 & -8; + dest = (char *)dest + offset; + numbytes_div_4 &= 7; + } + else if(numbytes_div_4 < 32) // 64 bytes + { + memset_512bit_a(dest, _mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 4); + offset = numbytes_div_4 & -16; + dest = (char *)dest + offset; + numbytes_div_4 &= 15; + } + else if(numbytes_div_4 < 64) // 128 bytes + { + memset_512bit_128B_a(dest, _mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 5); + offset = numbytes_div_4 & -32; + dest = (char *)dest + offset; + numbytes_div_4 &= 31; + } + else if(numbytes_div_4 < 128) // 256 bytes + { + memset_512bit_256B_a(dest, _mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 6); + offset = numbytes_div_4 & -64; + dest = (char *)dest + offset; + numbytes_div_4 &= 63; + } + else if(numbytes_div_4 < 256) // 512 bytes + { + memset_512bit_512B_a(dest, _mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 7); + offset = numbytes_div_4 & -128; + dest = (char *)dest + offset; + numbytes_div_4 &= 127; + } + else if(numbytes_div_4 < 512) // 1024 bytes (1 kB) + { + memset_512bit_1kB_a(dest, _mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 8); + offset = numbytes_div_4 & -256; + dest = (char *)dest + offset; + numbytes_div_4 &= 255; + } + else if(numbytes_div_4 < 1024) // 2048 bytes (2 kB) + { + memset_512bit_2kB_a(dest, 
_mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 9); + offset = numbytes_div_4 & -512; + dest = (char *)dest + offset; + numbytes_div_4 &= 511; + } + else // 4096 bytes (4 kB) + { + memset_512bit_4kB_a(dest, _mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 10); + offset = numbytes_div_4 & -1024; + dest = (char *)dest + offset; + numbytes_div_4 &= 1023; + } +#elif __AVX__ + else if(numbytes_div_4 < 8) // 16 bytes + { + memset_128bit_a(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 2); + offset = numbytes_div_4 & -4; + dest = (char *)dest + offset; + numbytes_div_4 &= 3; + } + else if(numbytes_div_4 < 16) // 32 bytes + { + memset_256bit_a(dest, _mm256_set1_epi32((int32_t)val), numbytes_div_4 >> 3); + offset = numbytes_div_4 & -8; + dest = (char *)dest + offset; + numbytes_div_4 &= 7; + } + else if(numbytes_div_4 < 32) // 64 bytes + { + memset_256bit_64B_a(dest, _mm256_set1_epi32((int32_t)val), numbytes_div_4 >> 4); + offset = numbytes_div_4 & -16; + dest = (char *)dest + offset; + numbytes_div_4 &= 15; + } + else if(numbytes_div_4 < 64) // 128 bytes + { + memset_256bit_128B_a(dest, _mm256_set1_epi32((int32_t)val), numbytes_div_4 >> 5); + offset = numbytes_div_4 & -32; + dest = (char *)dest + offset; + numbytes_div_4 &= 31; + } + else if(numbytes_div_4 < 128) // 256 bytes + { + memset_256bit_256B_a(dest, _mm256_set1_epi32((int32_t)val), numbytes_div_4 >> 6); + offset = numbytes_div_4 & -64; + dest = (char *)dest + offset; + numbytes_div_4 &= 63; + } + else // 512 bytes + { + memset_256bit_512B_a(dest, _mm256_set1_epi32((int32_t)val), numbytes_div_4 >> 7); + offset = numbytes_div_4 & -128; + dest = (char *)dest + offset; + numbytes_div_4 &= 127; + } +#else // SSE2 only + else if(numbytes_div_4 < 8) // 16 bytes + { + memset_128bit_a(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 2); + offset = numbytes_div_4 & -4; + dest = (char *)dest + offset; + numbytes_div_4 &= 3; + } + else if(numbytes_div_4 < 16) // 32 bytes + { + memset_128bit_32B_a(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 3); + offset = numbytes_div_4 & -8; + dest = (char *)dest + offset; + numbytes_div_4 &= 7; + } + else if(numbytes_div_4 < 32) // 64 bytes + { + memset_128bit_64B_a(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 4); + offset = numbytes_div_4 & -16; + dest = (char *)dest + offset; + numbytes_div_4 &= 15; + } + else if(numbytes_div_4 < 64) // 128 bytes + { + memset_128bit_128B_a(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 5); + offset = numbytes_div_4 & -32; + dest = (char *)dest + offset; + numbytes_div_4 &= 31; + } + else // 256 bytes + { + memset_128bit_256B_a(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 6); + offset = numbytes_div_4 & -64; + dest = (char *)dest + offset; + numbytes_div_4 &= 63; + } +#endif + } + return returnval; +} // END MEMSET LARGE, 4B, ALIGNED + +// Set arbitrarily large amounts of 4-byte values +// numbytes_div_4 is total number of bytes / 4 (since this is 4 bytes at a time) +// Aligned, streaming version +void * memset_large_4B_as(void *dest, const uint32_t val, size_t numbytes_div_4) +{ + void * returnval = dest; // Memset is supposed to return the initial destination + size_t offset = 0; // Offset size needs to match the size of a pointer + + while(numbytes_div_4) + // Each memset has its own loop. 
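+ // The "_as" primitives use streaming (non-temporal) stores, which bypass
+ // the cache. That avoids evicting the current working set during very
+ // large fills, but offers no benefit for buffers that fit in cache, which
+ // is why memsetAVX below only takes a streaming path when the size
+ // exceeds CACHESIZE.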
+ { + if(numbytes_div_4 < 4) // 4, 8, 12 bytes (the other scalars would need to be memset anyways) + { + memset_32bit(dest, val, numbytes_div_4); + offset = numbytes_div_4; + dest = (char *)dest + offset; + numbytes_div_4 = 0; + } +#ifdef __AVX512F__ + else if(numbytes_div_4 < 8) // 16 bytes + { + memset_128bit_as(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 2); + offset = numbytes_div_4 & -4; + dest = (char *)dest + offset; + numbytes_div_4 &= 3; + } + else if(numbytes_div_4 < 16) // 32 bytes + { + memset_256bit_as(dest, _mm256_set1_epi32((int32_t)val), numbytes_div_4 >> 3); + offset = numbytes_div_4 & -8; + dest = (char *)dest + offset; + numbytes_div_4 &= 7; + } + else if(numbytes_div_4 < 32) // 64 bytes + { + memset_512bit_as(dest, _mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 4); + offset = numbytes_div_4 & -16; + dest = (char *)dest + offset; + numbytes_div_4 &= 15; + } + else if(numbytes_div_4 < 64) // 128 bytes + { + memset_512bit_128B_as(dest, _mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 5); + offset = numbytes_div_4 & -32; + dest = (char *)dest + offset; + numbytes_div_4 &= 31; + } + else if(numbytes_div_4 < 128) // 256 bytes + { + memset_512bit_256B_as(dest, _mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 6); + offset = numbytes_div_4 & -64; + dest = (char *)dest + offset; + numbytes_div_4 &= 63; + } + else if(numbytes_div_4 < 256) // 512 bytes + { + memset_512bit_512B_as(dest, _mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 7); + offset = numbytes_div_4 & -128; + dest = (char *)dest + offset; + numbytes_div_4 &= 127; + } + else if(numbytes_div_4 < 512) // 1024 bytes (1 kB) + { + memset_512bit_1kB_as(dest, _mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 8); + offset = numbytes_div_4 & -256; + dest = (char *)dest + offset; + numbytes_div_4 &= 255; + } + else if(numbytes_div_4 < 1024) // 2048 bytes (2 kB) + { + memset_512bit_2kB_as(dest, _mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 9); + offset = numbytes_div_4 & -512; + dest = (char *)dest + offset; + numbytes_div_4 &= 511; + } + else // 4096 bytes (4 kB) + { + memset_512bit_4kB_as(dest, _mm512_set1_epi32((int32_t)val), numbytes_div_4 >> 10); + offset = numbytes_div_4 & -1024; + dest = (char *)dest + offset; + numbytes_div_4 &= 1023; + } +#elif __AVX__ + else if(numbytes_div_4 < 8) // 16 bytes + { + memset_128bit_as(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 2); + offset = numbytes_div_4 & -4; + dest = (char *)dest + offset; + numbytes_div_4 &= 3; + } + else if(numbytes_div_4 < 16) // 32 bytes + { + memset_256bit_as(dest, _mm256_set1_epi32((int32_t)val), numbytes_div_4 >> 3); + offset = numbytes_div_4 & -8; + dest = (char *)dest + offset; + numbytes_div_4 &= 7; + } + else if(numbytes_div_4 < 32) // 64 bytes + { + memset_256bit_64B_as(dest, _mm256_set1_epi32((int32_t)val), numbytes_div_4 >> 4); + offset = numbytes_div_4 & -16; + dest = (char *)dest + offset; + numbytes_div_4 &= 15; + } + else if(numbytes_div_4 < 64) // 128 bytes + { + memset_256bit_128B_as(dest, _mm256_set1_epi32((int32_t)val), numbytes_div_4 >> 5); + offset = numbytes_div_4 & -32; + dest = (char *)dest + offset; + numbytes_div_4 &= 31; + } + else if(numbytes_div_4 < 128) // 256 bytes + { + memset_256bit_256B_as(dest, _mm256_set1_epi32((int32_t)val), numbytes_div_4 >> 6); + offset = numbytes_div_4 & -64; + dest = (char *)dest + offset; + numbytes_div_4 &= 63; + } + else // 512 bytes + { + memset_256bit_512B_as(dest, _mm256_set1_epi32((int32_t)val), numbytes_div_4 >> 7); + offset = numbytes_div_4 & -128; + dest = (char 
*)dest + offset; + numbytes_div_4 &= 127; + } +#else // SSE2 only + else if(numbytes_div_4 < 8) // 16 bytes + { + memset_128bit_as(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 2); + offset = numbytes_div_4 & -4; + dest = (char *)dest + offset; + numbytes_div_4 &= 3; + } + else if(numbytes_div_4 < 16) // 32 bytes + { + memset_128bit_32B_as(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 3); + offset = numbytes_div_4 & -8; + dest = (char *)dest + offset; + numbytes_div_4 &= 7; + } + else if(numbytes_div_4 < 32) // 64 bytes + { + memset_128bit_64B_as(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 4); + offset = numbytes_div_4 & -16; + dest = (char *)dest + offset; + numbytes_div_4 &= 15; + } + else if(numbytes_div_4 < 64) // 128 bytes + { + memset_128bit_128B_as(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 5); + offset = numbytes_div_4 & -32; + dest = (char *)dest + offset; + numbytes_div_4 &= 31; + } + else // 256 bytes + { + memset_128bit_256B_as(dest, _mm_set1_epi32((int32_t)val), numbytes_div_4 >> 6); + offset = numbytes_div_4 & -64; + dest = (char *)dest + offset; + numbytes_div_4 &= 63; + } +#endif + } + return returnval; +} // END MEMSET LARGE, 4B, ALIGNED, STREAMING + +//----------------------------------------------------------------------------- +// Main Functions: +//----------------------------------------------------------------------------- + +// To set values of sizes > 1 byte, call the desired memset functions directly +// instead. A 4-byte version exists below, however. +void * memsetAVX(void *dest, const uint8_t val, size_t numbytes) +{ + void * returnval = dest; + + if( ((uintptr_t)dest & BYTE_ALIGNMENT) == 0 ) // Check alignment + { + if(val == 0) + { + if(numbytes > CACHESIZE) + { + memset_zeroes_as(dest, numbytes); + } + else + { + memset_zeroes_a(dest, numbytes); + } + } + else + { + if(numbytes > CACHESIZE) + { + memset_large_as(dest, val, numbytes); + } + else + { + memset_large_a(dest, val, numbytes); + } + } + } + else + { + size_t numbytes_to_align = (BYTE_ALIGNMENT + 1) - ((uintptr_t)dest & BYTE_ALIGNMENT); + + void * destoffset = (char*)dest + numbytes_to_align; + + if(val == 0) + { + if(numbytes > numbytes_to_align) + { + // Get to an aligned position. + // This may be a little slower, but since it'll be mostly scalar operations + // alignment doesn't matter. Worst case it uses two vector functions, and + // this process only needs to be done once per call if dest is unaligned. + memset_zeroes(dest, numbytes_to_align); + // Now this should be near the fastest possible since stores are aligned. + if((numbytes - numbytes_to_align) > CACHESIZE) + { + memset_zeroes_as(destoffset, numbytes - numbytes_to_align); + } + else + { + memset_zeroes_a(destoffset, numbytes - numbytes_to_align); + } + } + else // Small size + { + memset_zeroes(dest, numbytes); + } + } + else + { + if(numbytes > numbytes_to_align) + { + // Get to an aligned position. + // This may be a little slower, but since it'll be mostly scalar operations + // alignment doesn't matter. Worst case it uses two vector functions, and + // this process only needs to be done once per call if dest is unaligned. + memset_large(dest, val, numbytes_to_align); + // Now this should be near the fastest possible since stores are aligned. 
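+ // (For example, with AVX2's 32-byte alignment: if dest & 0x1F == 19, then
+ // numbytes_to_align = 32 - 19 = 13, so the call above handles 13 bytes
+ // and destoffset = dest + 13 is 32-byte aligned for everything after.)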
+ if((numbytes - numbytes_to_align) > CACHESIZE) + { + memset_large_as(destoffset, val, numbytes - numbytes_to_align); + } + else + { + memset_large_a(destoffset, val, numbytes - numbytes_to_align); + } + } + else // Small size + { + memset_large(dest, val, numbytes); + } + } + } + + return returnval; +} + +// Set 4 bytes at a time, mainly for 32-bit framebuffers. +// Only use this if you know your set size is always going to be a multiple of +// 4 bytes, for example in a video framebuffer where 4 bytes is one pixel. +// numbytes_div_4 is the total number of bytes / 4. +// Also, the destination address can, at worst, only be misaligned from the +// cacheline by a value that is a multiple of 4 bytes. +void * memsetAVX_By4Bytes(void *dest, const uint32_t val, size_t numbytes_div_4) +{ + void * returnval = dest; + + if( ((uintptr_t)dest & BYTE_ALIGNMENT) == 0 ) // Check alignment + { + if((numbytes_div_4 * 4) > CACHESIZE) + { + memset_large_4B_as(dest, val, numbytes_div_4); + } + else + { + memset_large_4B_a(dest, val, numbytes_div_4); + } + } + else + { + size_t numbytes_to_align = (BYTE_ALIGNMENT + 1) - ((uintptr_t)dest & BYTE_ALIGNMENT); + if(numbytes_to_align & 0x03) // Sanity check, return NULL if not alignable in 4B increments + { + return NULL; + } + void * destoffset = (char*)dest + numbytes_to_align; + + if(numbytes_div_4 > (numbytes_to_align >> 2)) + { + // Get to an aligned position. + // This process only needs to be done once per call if dest is unaligned. + memset_large_4B(dest, val, numbytes_to_align >> 2); + // Now this should be near the fastest possible since stores are aligned. + // ...and in memset there are only stores. + if((numbytes_div_4 * 4 - numbytes_to_align) > CACHESIZE) + { + memset_large_4B_as(destoffset, val, numbytes_div_4 - (numbytes_to_align >> 2)); + } + else + { + memset_large_4B_a(destoffset, val, numbytes_div_4 - (numbytes_to_align >> 2)); + } + } + else // Small size + { + memset_large_4B(dest, val, numbytes_div_4); + } + } + + return returnval; +} + +// AVX-1024+ support pending existence of the standard. \ No newline at end of file
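+
+// Usage sketch (illustrative only; buffer and dimension names below are
+// hypothetical, not part of this file):
+//
+//   static uint8_t scratch[8192];
+//   memsetAVX(scratch, 0x00, sizeof(scratch)); // zero a buffer
+//   memsetAVX(scratch, 0xAA, 100);             // arbitrary byte fill
+//
+//   // 32-bit ARGB framebuffer fill, one 4-byte value per pixel:
+//   // memsetAVX_By4Bytes(framebuffer, 0x00FF0000, width * height);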