#include <stddef.h>    // size_t
#include <stdint.h>    // uint16_t, uint32_t, uint64_t, uintptr_t
#include <string.h>    // memcmp (used as the 1-byte fallback)
#include <immintrin.h> // SSE4.2/AVX2/AVX-512 intrinsics

// Equality-only version
int memcmp_eq(const void *str1, const void *str2, size_t count)
{
  const unsigned char *s1 = (unsigned char *)str1;
  const unsigned char *s2 = (unsigned char *)str2;

  while (count-- > 0)
  {
    if (*s1++ != *s2++)
    {
      return -1; // Makes more sense to me if -1 means unequal.
    }
  }
  return 0; // Return 0 if equal to match normal memcmp
}

///=============================================================================
/// LICENSING INFORMATION
///=============================================================================
//
// The code above this comment is in the public domain.
// The code below this comment is subject to the custom attribution license
// found here:
// https://github.com/KNNSpeed/Simple-Kernel/blob/master/LICENSE_KERNEL
//
//==============================================================================
// AVX Memory Functions: AVX Memcmp
//==============================================================================
//
// Version 1.2
//
// Author:
// KNNSpeed
//
// Source Code:
// https://github.com/KNNSpeed/Simple-Kernel
//
// Minimum requirement:
// x86_64 CPU with SSE4.2, but AVX2 or later is recommended
//
// This file provides a highly optimized version of memcmp. It also allows
// selection between two modes: "check for equality" and the full
// greater-than/less-than comparison. For equality-only, pass 0 as the
// equality argument; pass 1 (or really any nonzero int) for the full
// comparison.
//
// In equality mode, a return value of 0 means equal and -1 means unequal.
// In full comparison mode, -1 means str1 is less, 0 means equal, and 1 means
// str1 is greater.
//

#ifdef __clang__
#define __m128i_u __m128i
#define __m256i_u __m256i
#define __m512i_u __m512i
#define _mm_cvtsi128_si64x _mm_cvtsi128_si64
#define _mm_cvtsi64x_si128 _mm_cvtsi64_si128
#endif

#ifdef __AVX512F__
#define BYTE_ALIGNMENT 0x3F // For 64-byte alignment
#elif defined(__AVX2__)
#define BYTE_ALIGNMENT 0x1F // For 32-byte alignment
#else
#define BYTE_ALIGNMENT 0x0F // For 16-byte alignment
#endif

//-----------------------------------------------------------------------------
// Individual Functions:
//-----------------------------------------------------------------------------
//
// The following memcmps return -1 or 1 based on the first differing unit of
// their respective sizes, as opposed to the first differing byte (memcmp(3)
// appears to be defined only for byte-by-byte comparison, not, e.g.,
// 16-byte-by-16-byte comparison).
//
// These functions are written such that they work properly even if they run
// off the edge of the desired memory area (e.g. if numbytes was larger than
// the desired area for whatever reason), though the returned value won't
// necessarily be indicative of the memory area in that case.
//
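// Illustrative example (not from the original file): the count argument of
// every function below is in units of that function's width, not in bytes.
// Assuming a and b point to at least 16 valid bytes, all of the following
// compare the same 16 bytes for equality, returning 0 if equal and -1 if not:
//
//   memcmp_eq(a, b, 16);       // 16 x 1-byte units
//   memcmp_16bit_eq(a, b, 8);  //  8 x 2-byte units
//   memcmp_32bit_eq(a, b, 4);  //  4 x 4-byte units
//   memcmp_64bit_eq(a, b, 2);  //  2 x 8-byte units
//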
// 16-bit (2 bytes at a time)
// Count is (# of total bytes)/2, i.e. the number of 16-bit units
int memcmp_16bit(const void *str1, const void *str2, size_t count)
{
  const uint16_t *s1 = (uint16_t*)str1;
  const uint16_t *s2 = (uint16_t*)str2;

  while (count-- > 0)
  {
    if (*s1++ != *s2++)
    {
      return s1[-1] < s2[-1] ? -1 : 1;
    }
  }
  return 0;
}

// Equality-only version
int memcmp_16bit_eq(const void *str1, const void *str2, size_t count)
{
  const uint16_t *s1 = (uint16_t*)str1;
  const uint16_t *s2 = (uint16_t*)str2;

  while (count--)
  {
    if (*s1++ != *s2++)
    {
      return -1;
    }
  }
  return 0;
}

// 32-bit (4 bytes at a time - 1 pixel in a 32-bit linear frame buffer)
// Count is (# of total bytes)/4, i.e. the number of 32-bit units
int memcmp_32bit(const void *str1, const void *str2, size_t count)
{
  const uint32_t *s1 = (uint32_t*)str1;
  const uint32_t *s2 = (uint32_t*)str2;

  while (count--)
  {
    if (*s1++ != *s2++)
    {
      return s1[-1] < s2[-1] ? -1 : 1;
    }
  }
  return 0;
}

// Equality-only version
int memcmp_32bit_eq(const void *str1, const void *str2, size_t count)
{
  const uint32_t *s1 = (uint32_t*)str1;
  const uint32_t *s2 = (uint32_t*)str2;

  while (count--)
  {
    if (*s1++ != *s2++)
    {
      return -1;
    }
  }
  return 0;
}

// 64-bit (8 bytes at a time - 2 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes)/8, i.e. the number of 64-bit units
int memcmp_64bit(const void *str1, const void *str2, size_t count)
{
  const uint64_t *s1 = (uint64_t*)str1;
  const uint64_t *s2 = (uint64_t*)str2;

  while (count--)
  {
    if (*s1++ != *s2++)
    {
      return s1[-1] < s2[-1] ? -1 : 1;
    }
  }
  return 0;
}

// Equality-only version
int memcmp_64bit_eq(const void *str1, const void *str2, size_t count)
{
  const uint64_t *s1 = (uint64_t*)str1;
  const uint64_t *s2 = (uint64_t*)str2;

  while (count--)
  {
    if (*s1++ != *s2++)
    {
      return -1;
    }
  }
  return 0;
}
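// Illustrative note (not from the original file): ordering is by unit value,
// so on a little-endian machine the sign of the result can differ from
// byte-wise memcmp. For example, given
//   uint8_t a[2] = { 0x01, 0x02 }; // reads as the 16-bit value 0x0201
//   uint8_t b[2] = { 0x02, 0x01 }; // reads as the 16-bit value 0x0102
// memcmp(a, b, 2) reports a < b (first byte: 0x01 < 0x02), while
// memcmp_16bit(a, b, 1) returns 1 (unit value: 0x0201 > 0x0102).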
//-----------------------------------------------------------------------------
// SSE4.2 Unaligned:
//-----------------------------------------------------------------------------

// SSE4.2 (128-bit, 16 bytes at a time - 4 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes)/16, i.e. the number of 128-bit units
int memcmp_128bit_u(const void *str1, const void *str2, size_t count)
{
  const __m128i_u *s1 = (__m128i_u*)str1;
  const __m128i_u *s2 = (__m128i_u*)str2;

  while (count--)
  {
    __m128i item1 = _mm_lddqu_si128(s1++);
    __m128i item2 = _mm_lddqu_si128(s2++);
    __m128i result = _mm_cmpeq_epi64(item1, item2);
    // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
    // true, and 0 per 64-bit portion where false

    // If result is not all ones, then there is a difference here
    if(!(unsigned int)_mm_test_all_ones(result))
    { // Ok, now we know they're not equal somewhere
      // Only the most significant differing 64-bit half matters, since the
      // comparison below is a greater-than check: the case where both halves
      // of the 128-bit result are 0x0000000000000000 is handled the same as
      // 0x0000000000000000FFFFFFFFFFFFFFFF (upper half differs, lower half
      // irrelevant).

      // Do the greater-than comparison here to have it done before the
      // conditional. Also make it an unsigned compare:
      // https://stackoverflow.com/questions/52805528/how-does-the-mm-cmpgt-epi64-intrinsic-work
      const __m128i rangeshift = _mm_set1_epi64x(0x8000000000000000);
      __m128i resultgt = _mm_cmpgt_epi64(_mm_xor_si128(item1, rangeshift), _mm_xor_si128(item2, rangeshift));
      // cmpgt returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where item1 > item2 is true

      // _mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFF) makes
      // 0x0000000000000000FFFFFFFFFFFFFFFF, which is the desired mask
      // inverted. AND the mask with result such that the test returns 1 if
      // the upper half of result is all zeroes.
      if((unsigned int)_mm_test_all_zeros(result, ~_mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFF)))
      {
        // Returned a 1, therefore the equality comparison gave
        // 0x0000000000000000 in the upper 64 bits: the upper halves differ.
        // (This case highlights why an unsigned compare is very important.)
        // CMPGT will have given 0xFFFFFFFFFFFFFFFFYYYYYYYYYYYYYYYY or
        // 0x0000000000000000YYYYYYYYYYYYYYYY, where Y is "don't care."
        // Right shift to put the desired bits into the lower part of the
        // register (overwriting the Ys)
        resultgt = _mm_bsrli_si128(resultgt, 8);
        // The low half will now be either all ones or all zeros. If all ones,
        // item1 > item2; if all zeros, item1 < item2.
        if((uint64_t)_mm_cvtsi128_si64x(resultgt)) // Lop off upper half
        {
          return 1; // 0x[0000000000000000]FFFFFFFFFFFFFFFF
        }
        else
        {
          return -1; // 0x[0000000000000000]0000000000000000
        }
      }
      else // The AND produced a nonzero value, so the test returned 0.
      {
        // Therefore the equality comparison gave
        // 0xFFFFFFFFFFFFFFFF0000000000000000: the upper halves are equal and
        // the lower halves differ, so CMPGT's lower 64 bits decide. They will
        // be either all ones (item1 > item2) or all zeros (item1 < item2).
        if((uint64_t)_mm_cvtsi128_si64x(resultgt)) // Lop off upper half
        {
          return 1; // 0x[0000000000000000]FFFFFFFFFFFFFFFF
        }
        else
        {
          return -1; // 0x[0000000000000000]0000000000000000
        }
      }
    }
  }
  return 0;
}
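// Illustrative sketch (not from the original file): the XOR with
// 0x8000000000000000 above is the standard trick for deriving an unsigned
// compare from a signed-only compare instruction. A scalar equivalent, with a
// hypothetical helper name:
static inline int unsigned_gt_via_signed_demo(uint64_t x, uint64_t y)
{
  // Flipping the sign bit maps [0, 2^64) onto [INT64_MIN, INT64_MAX] while
  // preserving order, so a signed compare of the flipped values yields the
  // unsigned comparison of the originals.
  return (int64_t)(x ^ 0x8000000000000000ULL) > (int64_t)(y ^ 0x8000000000000000ULL);
}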
// Equality-only version
int memcmp_128bit_eq_u(const void *str1, const void *str2, size_t count)
{
  const __m128i_u *s1 = (__m128i_u*)str1;
  const __m128i_u *s2 = (__m128i_u*)str2;

  while (count--)
  {
    __m128i item1 = _mm_lddqu_si128(s1++);
    __m128i item2 = _mm_lddqu_si128(s2++);
    __m128i result = _mm_cmpeq_epi64(item1, item2);
    // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
    // true, and 0 per 64-bit portion where false

    // If result is not all ones, then there is a difference here
    if(!(unsigned int)_mm_test_all_ones(result))
    {
      return -1;
    }
  }
  return 0;
}

//-----------------------------------------------------------------------------
// AVX2+ Unaligned:
//-----------------------------------------------------------------------------

// AVX2 (256-bit, 32 bytes at a time - 8 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes)/32, i.e. the number of 256-bit units
// Haswell and Ryzen and up
#ifdef __AVX2__
int memcmp_256bit_u(const void *str1, const void *str2, size_t count)
{
  const __m256i_u *s1 = (__m256i_u*)str1;
  const __m256i_u *s2 = (__m256i_u*)str2;

  while (count--)
  {
    __m256i item1 = _mm256_lddqu_si256(s1++);
    __m256i item2 = _mm256_lddqu_si256(s2++);
    __m256i result = _mm256_cmpeq_epi64(item1, item2);
    // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
    // true, and 0 per 64-bit portion where false

    // If result is not all ones, then there is a difference here.
    // This is the same thing as _mm_test_all_ones, but 256-bit
    if(!(unsigned int)_mm256_testc_si256(result, _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFF)))
    {
      // Using 0xFFFFFFFFFFFFFFFF explicitly instead of -1 for clarity.
      // It really makes no difference on two's complement machines.

      // Ok, now we know they're not equal somewhere. Man, doing a pure != is
      // sooo much simpler than > or <....

      // Unsigned greater-than and less-than compares via signed operations, see:
      // https://stackoverflow.com/questions/52805528/how-does-the-mm-cmpgt-epi64-intrinsic-work
      const __m256i rangeshift = _mm256_set1_epi64x(0x8000000000000000);
      __m256i shifted1 = _mm256_xor_si256(item1, rangeshift);
      __m256i shifted2 = _mm256_xor_si256(item2, rangeshift);
      __m256i resultgt = _mm256_cmpgt_epi64(shifted1, shifted2);
      // 0xFFFFFFFFFFFFFFFF per 64-bit portion where item1 > item2 is true
      __m256i resultlt = _mm256_cmpgt_epi64(shifted2, shifted1);
      // 0xFFFFFFFFFFFFFFFF per 64-bit portion where item1 < item2 is true

      // movemask packs each result into a 32-bit value: 8 mask bits per
      // 64-bit lane, with the lowest-addressed lane in the least significant
      // bits. In every differing lane exactly one of the two masks has its
      // 8 bits set; in equal lanes both have 0. Comparing the two masks as
      // plain integers is therefore decided by the most significant differing
      // lane, which orders the 256-bit units as little-endian 256-bit
      // unsigned integers - the same convention as the 128-bit functions.
      // This is about the fastest we can do on AVX2.
      unsigned int resultgt_to_scan = (unsigned int)_mm256_movemask_epi8(resultgt);
      unsigned int resultlt_to_scan = (unsigned int)_mm256_movemask_epi8(resultlt);

      if(resultlt_to_scan > resultgt_to_scan)
      {
        return -1; // item1 < item2 in the most significant differing lane
      }
      else
      {
        return 1; // item1 > item2 in the most significant differing lane
      }
    }
  }
  return 0;
}
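// Illustrative sketch (not from the original file): why comparing the two
// movemask values as plain integers picks out the most significant differing
// lane. The lane masks never overlap, so the integer compare is decided at
// the highest 8-bit group where they differ. Hypothetical 4-lane outcome:
// lane 2 has item1 > item2, lane 0 has item1 < item2, lanes 1 and 3 equal:
static inline int movemask_order_demo(void) // hypothetical helper
{
  unsigned int gt_mask = 0x00FF0000u; // 8 bits per lane, lane 0 at the bottom
  unsigned int lt_mask = 0x000000FFu;
  // lt_mask > gt_mask is false, so lane 2 (the most significant differing
  // lane) decides: item1 > item2, i.e. the function returns 1.
  return (lt_mask > gt_mask) ? -1 : 1;
}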
// Equality-only version
int memcmp_256bit_eq_u(const void *str1, const void *str2, size_t count)
{
  const __m256i_u *s1 = (__m256i_u*)str1;
  const __m256i_u *s2 = (__m256i_u*)str2;

  while (count--)
  {
    __m256i item1 = _mm256_lddqu_si256(s1++);
    __m256i item2 = _mm256_lddqu_si256(s2++);
    __m256i result = _mm256_cmpeq_epi64(item1, item2);
    // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
    // true, and 0 per 64-bit portion where false

    // If result is not all ones, then there is a difference here.
    // This is the same thing as _mm_test_all_ones, but 256-bit
    if(!(unsigned int)_mm256_testc_si256(result, _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFF)))
    {
      // Using 0xFFFFFFFFFFFFFFFF explicitly instead of -1 for clarity.
      // It really makes no difference on two's complement machines.
      return -1;
    }
  }
  return 0;
}
#endif

// AVX-512 (512-bit, 64 bytes at a time - 16 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes)/64, i.e. the number of 512-bit units
// Requires AVX512F
#ifdef __AVX512F__
int memcmp_512bit_u(const void *str1, const void *str2, size_t count)
{
  const __m512i_u *s1 = (__m512i_u*)str1;
  const __m512i_u *s2 = (__m512i_u*)str2;

  while (count--)
  {
    __m512i item1 = _mm512_loadu_si512(s1++);
    __m512i item2 = _mm512_loadu_si512(s2++);
    unsigned char result = _mm512_cmpneq_epu64_mask(item1, item2);
    // One mask bit per 64-bit lane; all bits == 0 means equal

    if(result) // I don't believe this. I really need a CPU with AVX-512, lol.
    // if(_mm512_mask_cmp_epu64_mask(0xFF, item1, item2, 4)) // 0 is CMPEQ, 4 is CMP_NE, this is the same thing
    {
      unsigned char resultgt = _mm512_cmpgt_epu64_mask(item1, item2);
      // For every set of 64 bits where item1 > item2, the mask has a 1 bit
      // there, else 0
      unsigned char resultlt = _mm512_cmplt_epu64_mask(item1, item2);
      // Likewise for item1 < item2. Same trick as the AVX2 version: exactly
      // one of the two masks has a 1 in each differing lane, so an integer
      // compare of the masks is decided by the most significant differing lane.
      if(resultlt > resultgt)
      {
        return -1;
      }
      else
      {
        return 1;
      }
    }
  }
  return 0;
}
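// Illustrative note (not from the original file): with AVX-512 the compare
// results are already bitmasks - one bit per 64-bit lane, bit 0 being the
// lowest-addressed lane - so no movemask step is needed. E.g. if lane 5 has
// item1 < item2 and lane 2 has item1 > item2 (all other lanes equal):
//   resultlt = 0b00100000, resultgt = 0b00000100
// resultlt > resultgt, so the most significant differing lane decides: -1.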
// Equality-only version
int memcmp_512bit_eq_u(const void *str1, const void *str2, size_t count)
{
  const __m512i_u *s1 = (__m512i_u*)str1;
  const __m512i_u *s2 = (__m512i_u*)str2;

  while (count--)
  {
    __m512i item1 = _mm512_loadu_si512(s1++);
    __m512i item2 = _mm512_loadu_si512(s2++);
    unsigned char result = _mm512_cmpneq_epu64_mask(item1, item2);
    // All bits == 0 means equal

    if(result) // This is barely bigger than 1-byte memcmp_eq
    {
      return -1;
    }
  }
  return 0;
}
#endif

//-----------------------------------------------------------------------------
// SSE4.2 Aligned:
//-----------------------------------------------------------------------------

// SSE4.2 (128-bit, 16 bytes at a time - 4 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes)/16, i.e. the number of 128-bit units
int memcmp_128bit_a(const void *str1, const void *str2, size_t count)
{
  const __m128i *s1 = (__m128i*)str1;
  const __m128i *s2 = (__m128i*)str2;

  while (count--)
  {
    __m128i item1 = _mm_load_si128(s1++);
    __m128i item2 = _mm_load_si128(s2++);
    __m128i result = _mm_cmpeq_epi64(item1, item2);
    // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
    // true, and 0 per 64-bit portion where false

    // If result is not all ones, then there is a difference here
    if(!(unsigned int)_mm_test_all_ones(result))
    { // Ok, now we know they're not equal somewhere
      // Only the most significant differing 64-bit half matters, since the
      // comparison below is a greater-than check: the case where both halves
      // of the 128-bit result are 0x0000000000000000 is handled the same as
      // 0x0000000000000000FFFFFFFFFFFFFFFF (upper half differs, lower half
      // irrelevant).

      // Do the greater-than comparison here to have it done before the
      // conditional. Also make it an unsigned compare:
      // https://stackoverflow.com/questions/52805528/how-does-the-mm-cmpgt-epi64-intrinsic-work
      const __m128i rangeshift = _mm_set1_epi64x(0x8000000000000000);
      __m128i resultgt = _mm_cmpgt_epi64(_mm_xor_si128(item1, rangeshift), _mm_xor_si128(item2, rangeshift));
      // cmpgt returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where item1 > item2 is true

      // _mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFF) makes
      // 0x0000000000000000FFFFFFFFFFFFFFFF, which is the desired mask
      // inverted. AND the mask with result such that the test returns 1 if
      // the upper half of result is all zeroes.
      if((unsigned int)_mm_test_all_zeros(result, ~_mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFF)))
      {
        // Returned a 1, therefore the equality comparison gave
        // 0x0000000000000000 in the upper 64 bits: the upper halves differ.
        // (This case highlights why an unsigned compare is very important.)
        // CMPGT will have given 0xFFFFFFFFFFFFFFFFYYYYYYYYYYYYYYYY or
        // 0x0000000000000000YYYYYYYYYYYYYYYY, where Y is "don't care."
        // Right shift to put the desired bits into the lower part of the
        // register (overwriting the Ys)
        resultgt = _mm_bsrli_si128(resultgt, 8);
        // The low half will now be either all ones or all zeros. If all ones,
        // item1 > item2; if all zeros, item1 < item2.
        if((uint64_t)_mm_cvtsi128_si64x(resultgt)) // Lop off upper half
        {
          return 1; // 0x[0000000000000000]FFFFFFFFFFFFFFFF
        }
        else
        {
          return -1; // 0x[0000000000000000]0000000000000000
        }
      }
      else // The AND produced a nonzero value, so the test returned 0.
      {
        // Therefore the equality comparison gave
        // 0xFFFFFFFFFFFFFFFF0000000000000000: the upper halves are equal and
        // the lower halves differ, so CMPGT's lower 64 bits decide. They will
        // be either all ones (item1 > item2) or all zeros (item1 < item2).
        if((uint64_t)_mm_cvtsi128_si64x(resultgt)) // Lop off upper half
        {
          return 1; // 0x[0000000000000000]FFFFFFFFFFFFFFFF
        }
        else
        {
          return -1; // 0x[0000000000000000]0000000000000000
        }
      }
    }
  }
  return 0;
}

// Equality-only version
int memcmp_128bit_eq_a(const void *str1, const void *str2, size_t count)
{
  const __m128i *s1 = (__m128i*)str1;
  const __m128i *s2 = (__m128i*)str2;

  while (count--)
  {
    __m128i item1 = _mm_load_si128(s1++);
    __m128i item2 = _mm_load_si128(s2++);
    __m128i result = _mm_cmpeq_epi64(item1, item2);
    // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
    // true, and 0 per 64-bit portion where false

    // If result is not all ones, then there is a difference here
    if(!(unsigned int)_mm_test_all_ones(result))
    {
      return -1;
    }
  }
  return 0;
}

//-----------------------------------------------------------------------------
// AVX2+ Aligned:
//-----------------------------------------------------------------------------

// AVX2 (256-bit, 32 bytes at a time - 8 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes)/32, i.e. the number of 256-bit units
// Haswell and Ryzen and up
#ifdef __AVX2__
int memcmp_256bit_a(const void *str1, const void *str2, size_t count)
{
  const __m256i *s1 = (__m256i*)str1;
  const __m256i *s2 = (__m256i*)str2;

  while (count--)
  {
    __m256i item1 = _mm256_load_si256(s1++);
    __m256i item2 = _mm256_load_si256(s2++);
    __m256i result = _mm256_cmpeq_epi64(item1, item2);
    // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
    // true, and 0 per 64-bit portion where false

    // If result is not all ones, then there is a difference here.
    // This is the same thing as _mm_test_all_ones, but 256-bit
    if(!(unsigned int)_mm256_testc_si256(result, _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFF)))
    {
      // Using 0xFFFFFFFFFFFFFFFF explicitly instead of -1 for clarity.
      // It really makes no difference on two's complement machines.

      // Ok, now we know they're not equal somewhere. Man, doing a pure != is
      // sooo much simpler than > or <....

      // Unsigned greater-than and less-than compares via signed operations, see:
      // https://stackoverflow.com/questions/52805528/how-does-the-mm-cmpgt-epi64-intrinsic-work
      const __m256i rangeshift = _mm256_set1_epi64x(0x8000000000000000);
      __m256i shifted1 = _mm256_xor_si256(item1, rangeshift);
      __m256i shifted2 = _mm256_xor_si256(item2, rangeshift);
      __m256i resultgt = _mm256_cmpgt_epi64(shifted1, shifted2);
      // 0xFFFFFFFFFFFFFFFF per 64-bit portion where item1 > item2 is true
      __m256i resultlt = _mm256_cmpgt_epi64(shifted2, shifted1);
      // 0xFFFFFFFFFFFFFFFF per 64-bit portion where item1 < item2 is true

      // movemask packs each result into a 32-bit value: 8 mask bits per
      // 64-bit lane, with the lowest-addressed lane in the least significant
      // bits. In every differing lane exactly one of the two masks has its
      // 8 bits set; in equal lanes both have 0. Comparing the two masks as
      // plain integers is therefore decided by the most significant differing
      // lane, which orders the 256-bit units as little-endian 256-bit
      // unsigned integers - the same convention as the 128-bit functions.
      unsigned int resultgt_to_scan = (unsigned int)_mm256_movemask_epi8(resultgt);
      unsigned int resultlt_to_scan = (unsigned int)_mm256_movemask_epi8(resultlt);

      if(resultlt_to_scan > resultgt_to_scan)
      {
        return -1; // item1 < item2 in the most significant differing lane
      }
      else
      {
        return 1; // item1 > item2 in the most significant differing lane
      }
    }
  }
  return 0;
}
// Equality-only version
int memcmp_256bit_eq_a(const void *str1, const void *str2, size_t count)
{
  const __m256i *s1 = (__m256i*)str1;
  const __m256i *s2 = (__m256i*)str2;

  while (count--)
  {
    __m256i item1 = _mm256_load_si256(s1++);
    __m256i item2 = _mm256_load_si256(s2++);
    __m256i result = _mm256_cmpeq_epi64(item1, item2);
    // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
    // true, and 0 per 64-bit portion where false

    // If result is not all ones, then there is a difference here.
    // This is the same thing as _mm_test_all_ones, but 256-bit
    if(!(unsigned int)_mm256_testc_si256(result, _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFF)))
    {
      // Using 0xFFFFFFFFFFFFFFFF explicitly instead of -1 for clarity.
      // It really makes no difference on two's complement machines.
      return -1;
    }
  }
  return 0;
}
#endif

// AVX-512 (512-bit, 64 bytes at a time - 16 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes)/64, i.e. the number of 512-bit units
// Requires AVX512F
#ifdef __AVX512F__
int memcmp_512bit_a(const void *str1, const void *str2, size_t count)
{
  const __m512i *s1 = (__m512i*)str1;
  const __m512i *s2 = (__m512i*)str2;

  while (count--)
  {
    __m512i item1 = _mm512_load_si512(s1++);
    __m512i item2 = _mm512_load_si512(s2++);
    unsigned char result = _mm512_cmpneq_epu64_mask(item1, item2);
    // All bits == 0 means equal

    if(result) // I don't believe this. I really need a CPU with AVX-512, lol.
    // if(_mm512_mask_cmp_epu64_mask(0xFF, item1, item2, 4)) // 0 is CMPEQ, 4 is CMP_NE, this is the same thing
    {
      unsigned char resultgt = _mm512_cmpgt_epu64_mask(item1, item2);
      // For every set of 64 bits where item1 > item2, the mask has a 1 bit
      // there, else 0
      unsigned char resultlt = _mm512_cmplt_epu64_mask(item1, item2);
      // Likewise for item1 < item2. Same trick as the AVX2 version: the
      // integer compare of the masks is decided by the most significant
      // differing lane.
      if(resultlt > resultgt)
      {
        return -1;
      }
      else
      {
        return 1;
      }
    }
  }
  return 0;
}

// GCC -O3 makes memcmp_512bit_a(...) take 25 lines of assembly. This version
// (~10 cycles) is around 5 or so cycles slower per set of memory regions than
// memcmp (~5 cycles). It's the mask operations that take ~3 cycles each...
//
// When the latency of jumps is taken into account, that means this function
// can compare 64 BYTES of data at around the same speed that memcmp does only
// 1 byte. The AVX2 version is 1 cycle slower than the AVX512 version in its
// main loop (i.e. it takes ~11 cycles). When an inequality is found, memcmp
// takes 3 cycles, AVX2 takes 16 cycles, and AVX512 takes 10 cycles to
// determine which input is greater.
//
// NOTE: These are estimates based solely on instruction latencies per Agner
// Fog's optimization tables: https://www.agner.org/optimize/.

// Equality-only version
int memcmp_512bit_eq_a(const void *str1, const void *str2, size_t count)
{
  const __m512i *s1 = (__m512i*)str1;
  const __m512i *s2 = (__m512i*)str2;

  while (count--)
  {
    __m512i item1 = _mm512_load_si512(s1++);
    __m512i item2 = _mm512_load_si512(s2++);
    unsigned char result = _mm512_cmpneq_epu64_mask(item1, item2);
    // All bits == 0 means equal

    if(result) // This is barely bigger than byte-by-byte memcmp_eq
    {
      return -1;
    }
  }
  return 0;
}
#endif

//-----------------------------------------------------------------------------
// Dispatch Functions (Unaligned):
//-----------------------------------------------------------------------------
// memcmp for large chunks of memory with arbitrary sizes
int memcmp_large(const void *str1, const void *str2, size_t numbytes)
// Worst-case scenario: 127 bytes.
{
  int returnval = 0; // Return value if equal... or numbytes is 0
  size_t offset = 0;

  while(numbytes)
  // This loop gets evaluated at most 7 times, ending sooner each time; in the
  // minimum non-trivial case it runs once. Each memcmp has its own loop.
  {
    if(numbytes < 2) // 1 byte
    {
      returnval = memcmp(str1, str2, numbytes);
      if(returnval) { return returnval; }
      offset = numbytes & -1;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes = 0;
    }
    else if(numbytes < 4) // 2 bytes
    {
      returnval = memcmp_16bit(str1, str2, numbytes >> 1);
      if(returnval) { return returnval; }
      offset = numbytes & -2;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 1;
    }
    else if(numbytes < 8) // 4 bytes
    {
      returnval = memcmp_32bit(str1, str2, numbytes >> 2);
      if(returnval) { return returnval; }
      offset = numbytes & -4;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 3;
    }
    else if(numbytes < 16) // 8 bytes
    {
      returnval = memcmp_64bit(str1, str2, numbytes >> 3);
      if(returnval) { return returnval; }
      offset = numbytes & -8;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 7;
    }
#ifdef __AVX512F__
    else if(numbytes < 32) // 16 bytes
    {
      returnval = memcmp_128bit_u(str1, str2, numbytes >> 4);
      if(returnval) { return returnval; }
      offset = numbytes & -16;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 15;
    }
    else if(numbytes < 64) // 32 bytes
    {
      returnval = memcmp_256bit_u(str1, str2, numbytes >> 5);
      if(returnval) { return returnval; }
      offset = numbytes & -32;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 31;
    }
    else // 64 bytes
    {
      returnval = memcmp_512bit_u(str1, str2, numbytes >> 6);
      if(returnval) { return returnval; }
      offset = numbytes & -64;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 63;
    }
#elif defined(__AVX2__)
    else if(numbytes < 32) // 16 bytes
    {
      returnval = memcmp_128bit_u(str1, str2, numbytes >> 4);
      if(returnval) { return returnval; }
      offset = numbytes & -16;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 15;
    }
    else // 32 bytes
    {
      returnval = memcmp_256bit_u(str1, str2, numbytes >> 5);
      if(returnval) { return returnval; }
      offset = numbytes & -32;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 31;
    }
#else // SSE4.2 only
    else // 16 bytes
    {
      returnval = memcmp_128bit_u(str1, str2, numbytes >> 4);
      if(returnval) { return returnval; }
      offset = numbytes & -16;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 15;
    }
#endif
  }
  return returnval;
}
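// Illustrative note (not from the original file): for a hypothetical numbytes
// of 127 on an AVX-512 build, the loop above takes each branch exactly once,
// largest first:
//   127 = 64 + 32 + 16 + 8 + 4 + 2 + 1
// i.e. one 512-bit compare, then one 256-bit, 128-bit, 64-bit, 32-bit,
// 16-bit, and finally a single byte - hence the 127-byte worst case.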
// Equality-only version
int memcmp_large_eq(const void *str1, const void *str2, size_t numbytes)
// Worst-case scenario: 127 bytes.
{
  int returnval = 0; // Return value if equal... or numbytes is 0
  size_t offset = 0;

  while(numbytes)
  {
    if(numbytes < 2) // 1 byte
    {
      returnval = memcmp_eq(str1, str2, numbytes);
      if(returnval) { return returnval; }
      offset = numbytes & -1;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes = 0;
    }
    else if(numbytes < 4) // 2 bytes
    {
      returnval = memcmp_16bit_eq(str1, str2, numbytes >> 1);
      if(returnval) { return returnval; }
      offset = numbytes & -2;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 1;
    }
    else if(numbytes < 8) // 4 bytes
    {
      returnval = memcmp_32bit_eq(str1, str2, numbytes >> 2);
      if(returnval) { return returnval; }
      offset = numbytes & -4;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 3;
    }
    else if(numbytes < 16) // 8 bytes
    {
      returnval = memcmp_64bit_eq(str1, str2, numbytes >> 3);
      if(returnval) { return returnval; }
      offset = numbytes & -8;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 7;
    }
#ifdef __AVX512F__
    else if(numbytes < 32) // 16 bytes
    {
      returnval = memcmp_128bit_eq_u(str1, str2, numbytes >> 4);
      if(returnval) { return returnval; }
      offset = numbytes & -16;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 15;
    }
    else if(numbytes < 64) // 32 bytes
    {
      returnval = memcmp_256bit_eq_u(str1, str2, numbytes >> 5);
      if(returnval) { return returnval; }
      offset = numbytes & -32;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 31;
    }
    else // 64 bytes
    {
      returnval = memcmp_512bit_eq_u(str1, str2, numbytes >> 6);
      if(returnval) { return returnval; }
      offset = numbytes & -64;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 63;
    }
#elif defined(__AVX2__)
    else if(numbytes < 32) // 16 bytes
    {
      returnval = memcmp_128bit_eq_u(str1, str2, numbytes >> 4);
      if(returnval) { return returnval; }
      offset = numbytes & -16;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 15;
    }
    else // 32 bytes
    {
      returnval = memcmp_256bit_eq_u(str1, str2, numbytes >> 5);
      if(returnval) { return returnval; }
      offset = numbytes & -32;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 31;
    }
#else // SSE4.2 only
    else // 16 bytes
    {
      returnval = memcmp_128bit_eq_u(str1, str2, numbytes >> 4);
      if(returnval) { return returnval; }
      offset = numbytes & -16;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 15;
    }
#endif
  }
  return returnval;
}

//-----------------------------------------------------------------------------
// Dispatch Functions (Aligned):
//-----------------------------------------------------------------------------
// memcmp for large chunks of memory with arbitrary sizes (aligned)
int memcmp_large_a(const void *str1, const void *str2, size_t numbytes)
// Worst-case scenario: 127 bytes.
{
  int returnval = 0; // Return value if equal... or numbytes is 0
  size_t offset = 0;

  while(numbytes)
  // This loop gets evaluated at most 7 times, ending sooner each time; in the
  // minimum non-trivial case it runs once. Each memcmp has its own loop.
  {
    if(numbytes < 2) // 1 byte
    {
      returnval = memcmp(str1, str2, numbytes);
      if(returnval) { return returnval; }
      offset = numbytes & -1;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes = 0;
    }
    else if(numbytes < 4) // 2 bytes
    {
      returnval = memcmp_16bit(str1, str2, numbytes >> 1);
      if(returnval) { return returnval; }
      offset = numbytes & -2;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 1;
    }
    else if(numbytes < 8) // 4 bytes
    {
      returnval = memcmp_32bit(str1, str2, numbytes >> 2);
      if(returnval) { return returnval; }
      offset = numbytes & -4;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 3;
    }
    else if(numbytes < 16) // 8 bytes
    {
      returnval = memcmp_64bit(str1, str2, numbytes >> 3);
      if(returnval) { return returnval; }
      offset = numbytes & -8;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 7;
    }
#ifdef __AVX512F__
    else if(numbytes < 32) // 16 bytes
    {
      returnval = memcmp_128bit_a(str1, str2, numbytes >> 4);
      if(returnval) { return returnval; }
      offset = numbytes & -16;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 15;
    }
    else if(numbytes < 64) // 32 bytes
    {
      returnval = memcmp_256bit_a(str1, str2, numbytes >> 5);
      if(returnval) { return returnval; }
      offset = numbytes & -32;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 31;
    }
    else // 64 bytes
    {
      returnval = memcmp_512bit_a(str1, str2, numbytes >> 6);
      if(returnval) { return returnval; }
      offset = numbytes & -64;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 63;
    }
#elif defined(__AVX2__)
    else if(numbytes < 32) // 16 bytes
    {
      returnval = memcmp_128bit_a(str1, str2, numbytes >> 4);
      if(returnval) { return returnval; }
      offset = numbytes & -16;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 15;
    }
    else // 32 bytes
    {
      returnval = memcmp_256bit_a(str1, str2, numbytes >> 5);
      if(returnval) { return returnval; }
      offset = numbytes & -32;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 31;
    }
#else // SSE4.2 only
    else // 16 bytes
    {
      returnval = memcmp_128bit_a(str1, str2, numbytes >> 4);
      if(returnval) { return returnval; }
      offset = numbytes & -16;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 15;
    }
#endif
  }
  return returnval;
}
// Equality-only version (aligned)
int memcmp_large_eq_a(const void *str1, const void *str2, size_t numbytes)
// Worst-case scenario: 127 bytes.
{
  int returnval = 0; // Return value if equal... or numbytes is 0
  size_t offset = 0;

  while(numbytes)
  {
    if(numbytes < 2) // 1 byte
    {
      returnval = memcmp_eq(str1, str2, numbytes);
      if(returnval) { return returnval; }
      offset = numbytes & -1;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes = 0;
    }
    else if(numbytes < 4) // 2 bytes
    {
      returnval = memcmp_16bit_eq(str1, str2, numbytes >> 1);
      if(returnval) { return returnval; }
      offset = numbytes & -2;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 1;
    }
    else if(numbytes < 8) // 4 bytes
    {
      returnval = memcmp_32bit_eq(str1, str2, numbytes >> 2);
      if(returnval) { return returnval; }
      offset = numbytes & -4;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 3;
    }
    else if(numbytes < 16) // 8 bytes
    {
      returnval = memcmp_64bit_eq(str1, str2, numbytes >> 3);
      if(returnval) { return returnval; }
      offset = numbytes & -8;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 7;
    }
#ifdef __AVX512F__
    else if(numbytes < 32) // 16 bytes
    {
      returnval = memcmp_128bit_eq_a(str1, str2, numbytes >> 4);
      if(returnval) { return returnval; }
      offset = numbytes & -16;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 15;
    }
    else if(numbytes < 64) // 32 bytes
    {
      returnval = memcmp_256bit_eq_a(str1, str2, numbytes >> 5);
      if(returnval) { return returnval; }
      offset = numbytes & -32;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 31;
    }
    else // 64 bytes
    {
      returnval = memcmp_512bit_eq_a(str1, str2, numbytes >> 6);
      if(returnval) { return returnval; }
      offset = numbytes & -64;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 63;
    }
#elif defined(__AVX2__)
    else if(numbytes < 32) // 16 bytes
    {
      returnval = memcmp_128bit_eq_a(str1, str2, numbytes >> 4);
      if(returnval) { return returnval; }
      offset = numbytes & -16;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 15;
    }
    else // 32 bytes
    {
      returnval = memcmp_256bit_eq_a(str1, str2, numbytes >> 5);
      if(returnval) { return returnval; }
      offset = numbytes & -32;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 31;
    }
#else // SSE4.2 only
    else // 16 bytes
    {
      returnval = memcmp_128bit_eq_a(str1, str2, numbytes >> 4);
      if(returnval) { return returnval; }
      offset = numbytes & -16;
      str1 = (char *)str1 + offset;
      str2 = (char *)str2 + offset;
      numbytes &= 15;
    }
#endif
  }
  return returnval;
}

//-----------------------------------------------------------------------------
// Main Function:
//-----------------------------------------------------------------------------

// Main memcmp function
int AVX_memcmp(const void *str1, const void *str2, size_t numbytes, int equality)
{
  int returnval = 0;

  if( ( ((uintptr_t)str1 & BYTE_ALIGNMENT) == 0 )
      &&
      ( ((uintptr_t)str2 & BYTE_ALIGNMENT) == 0 ) ) // Check alignment
  {
    // See memmove.c for why it's worth doing special aligned versions of
    // memcmp, a function that performs 2 loads per iteration.
    if(equality == 0)
    {
      returnval = memcmp_large_eq_a(str1, str2, numbytes);
    }
    else
    {
      returnval = memcmp_large_a(str1, str2, numbytes);
    }
  }
  else
  {
    if(equality == 0)
    {
      returnval = memcmp_large_eq(str1, str2, numbytes);
    }
    else
    {
      returnval = memcmp_large(str1, str2, numbytes);
    }
  }

  return returnval;
}

// AVX-1024+ support pending existence of the standard.
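// Illustrative usage sketch (not from the original file); AVX_MEMCMP_DEMO is
// a hypothetical guard macro. Compile with it defined to build a tiny test:
#ifdef AVX_MEMCMP_DEMO
#include <stdio.h>

int main(void)
{
  // _Alignas(64) satisfies BYTE_ALIGNMENT for every build flavor, so this
  // exercises the aligned dispatch path; unaligned pointers would simply
  // fall through to the unaligned one.
  _Alignas(64) char a[100] = "The quick brown fox jumps over the lazy dog";
  _Alignas(64) char b[100] = "The quick brown fox jumps over the lazy dot";

  printf("equality mode: %d\n", AVX_memcmp(a, b, 100, 0)); // -1 (unequal)
  printf("full mode:     %d\n", AVX_memcmp(a, b, 100, 1)); // -1 ('g' < 't')
  printf("same buffer:   %d\n", AVX_memcmp(a, a, 100, 1)); // 0 (equal)
  return 0;
}
#endif // AVX_MEMCMP_DEMO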