// kernel/memory/memcmp.c

#include <kernel.h> // Expected to provide size_t, the uintN_t types, memcmp, and the x86 intrinsics used below
// Equality-only version
int memcmp_eq(const void *str1, const void *str2, size_t count)
{
const unsigned char *s1 = (const unsigned char *)str1;
const unsigned char *s2 = (const unsigned char *)str2;
while (count-- > 0)
{
if (*s1++ != *s2++)
{
return -1; // Makes more sense to me if -1 means unequal.
}
}
return 0; // Return 0 if equal to match normal memcmp
}
///=============================================================================
/// LICENSING INFORMATION
///=============================================================================
//
// The code above this comment is in the public domain.
// The code below this comment is subject to the custom attribution license found
// here: https://github.com/KNNSpeed/Simple-Kernel/blob/master/LICENSE_KERNEL
//
//==============================================================================
// AVX Memory Functions: AVX Memcmp
//==============================================================================
//
// Version 1.2
//
// Author:
// KNNSpeed
//
// Source Code:
// https://github.com/KNNSpeed/Simple-Kernel
//
// Minimum requirement:
// x86_64 CPU with SSE4.2, but AVX2 or later is recommended
//
// This file provides a highly optimized version of memcmp. It also allows
// selecting the comparison mode: "check for equality" only, or the full
// greater-than/less-than comparison. Pass 0 as the equality argument for
// equality-only mode; pass 1 (or really any nonzero int) for full comparison.
//
// In equality mode, a return value of 0 means equal, -1 means unequal.
// In full comparison mode, -1 -> str1 is less, 0 -> equal, 1 -> str1 is greater.
//
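// Usage sketch (illustrative only, not part of the original interface):
//
// char a[32] = "abcdefghijklmnopqrstuvwxyz12345";
// char b[32] = "abcdefghijklmnopqrstuvwxyz12345";
// AVX_memcmp(a, b, 32, 0); // equality mode: returns 0 (equal)
// b[31] = '!';
// AVX_memcmp(a, b, 32, 0); // equality mode: returns -1 (unequal)
// AVX_memcmp(a, b, 32, 1); // full mode: returns -1 here, since str1's
//                          // differing unit compares less than str2's
//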
#ifdef __clang__
#define __m128i_u __m128i
#define __m256i_u __m256i
#define __m512i_u __m512i
#define _mm_cvtsi128_si64x _mm_cvtsi128_si64
#define _mm_cvtsi64x_si128 _mm_cvtsi64_si128
#endif
#ifdef __AVX512F__
#define BYTE_ALIGNMENT 0x3F // For 64-byte alignment
#elif __AVX2__
#define BYTE_ALIGNMENT 0x1F // For 32-byte alignment
#else
#define BYTE_ALIGNMENT 0x0F // For 16-byte alignment
#endif
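// Both inputs must be aligned to (BYTE_ALIGNMENT + 1) bytes for the aligned
// code paths below to get selected. An illustrative way to guarantee that in
// C11, regardless of which instruction set this file is built for:
//
// _Alignas(64) static uint8_t buffer_a[4096]; // 64 satisfies all three masks
// _Alignas(64) static uint8_t buffer_b[4096];
// // ((uintptr_t)buffer_a & BYTE_ALIGNMENT) == 0 now holds in every build mode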
//-----------------------------------------------------------------------------
// Individual Functions:
//-----------------------------------------------------------------------------
//
// The following memcmps return -1 or 1 based on the first differing unit of
// their respective sizes, not the first differing byte: memcmp(3) is defined
// byte-by-byte, but these functions compare in larger units, each treated as
// one little-endian integer. On x86_64 this can order buffers differently
// than a byte-wise memcmp (see the worked example below).
//
// The way these functions are made allows them to work properly even if they
// run off the edge of the desired memory area (e.g. if numbytes was larger
// than the desired area for whatever reason). The returned value won't
// necessarily be indicative of the memory area in this case.
//
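// Worked example of the unit-vs-byte distinction on little-endian x86_64:
//
// uint8_t a[2] = { 0x01, 0x02 }; // read as a uint16_t, this is 0x0201
// uint8_t b[2] = { 0x02, 0x01 }; // read as a uint16_t, this is 0x0102
//
// Byte-wise memcmp(a, b, 2) is negative because 0x01 < 0x02 at offset 0, but
// memcmp_16bit(a, b, 1) returns 1 because 0x0201 > 0x0102 as 16-bit units.
//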
// 16-bit (2 bytes at a time)
// Count is (# of total bytes / 2), i.e. the number of 16-bit units
int memcmp_16bit(const void *str1, const void *str2, size_t count)
{
const uint16_t *s1 = (const uint16_t*)str1;
const uint16_t *s2 = (const uint16_t*)str2;
while (count-- > 0)
{
if (*s1++ != *s2++)
{
return s1[-1] < s2[-1] ? -1 : 1;
}
}
return 0;
}
// Equality-only version
int memcmp_16bit_eq(const void *str1, const void *str2, size_t count)
{
const uint16_t *s1 = (const uint16_t*)str1;
const uint16_t *s2 = (const uint16_t*)str2;
while (count--)
{
if (*s1++ != *s2++)
{
return -1;
}
}
return 0;
}
// 32-bit (4 bytes at a time - 1 pixel in a 32-bit linear frame buffer)
// Count is (# of total bytes / 4), i.e. the number of 32-bit units
int memcmp_32bit(const void *str1, const void *str2, size_t count)
{
const uint32_t *s1 = (const uint32_t*)str1;
const uint32_t *s2 = (const uint32_t*)str2;
while (count--)
{
if (*s1++ != *s2++)
{
return s1[-1] < s2[-1] ? -1 : 1;
}
}
return 0;
}
// Equality-only version
int memcmp_32bit_eq(const void *str1, const void *str2, size_t count)
{
const uint32_t *s1 = (const uint32_t*)str1;
const uint32_t *s2 = (const uint32_t*)str2;
while (count--)
{
if (*s1++ != *s2++)
{
return -1;
}
}
return 0;
}
// 64-bit (8 bytes at a time - 2 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes / 8), i.e. the number of 64-bit units
int memcmp_64bit(const void *str1, const void *str2, size_t count)
{
const uint64_t *s1 = (const uint64_t*)str1;
const uint64_t *s2 = (const uint64_t*)str2;
while (count--)
{
if (*s1++ != *s2++)
{
return s1[-1] < s2[-1] ? -1 : 1;
}
}
return 0;
}
// Equality-only version
int memcmp_64bit_eq(const void *str1, const void *str2, size_t count)
{
const uint64_t *s1 = (const uint64_t*)str1;
const uint64_t *s2 = (const uint64_t*)str2;
while (count--)
{
if (*s1++ != *s2++)
{
return -1;
}
}
return 0;
}
//-----------------------------------------------------------------------------
// SSE4.2 Unaligned:
//-----------------------------------------------------------------------------
// SSE4.2 (128-bit, 16 bytes at a time - 4 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes / 16), i.e. the number of 128-bit units
int memcmp_128bit_u(const void *str1, const void *str2, size_t count)
{
const __m128i_u *s1 = (const __m128i_u*)str1;
const __m128i_u *s2 = (const __m128i_u*)str2;
while (count--)
{
__m128i item1 = _mm_lddqu_si128(s1++);
__m128i item2 = _mm_lddqu_si128(s2++);
__m128i result = _mm_cmpeq_epi64(item1, item2);
// cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
// true, and 0 per 64-bit portion where false
// If result is not all ones, then there is a difference here
if(!(unsigned int)_mm_test_all_ones(result))
{// Ok, now we know they're not equal somewhere
// The 128-bit unit is compared as one little-endian integer, so the upper
// 64-bit half dominates. A result where both halves differ (all zeroes) is
// handled the same way as 0x0000000000000000FFFFFFFFFFFFFFFF, where only the
// upper half differs: either way, the upper half decides.
// Do the greater-than comparison up front so it's ready before the
// conditional. It must be an unsigned compare, done by biasing both operands:
// https://stackoverflow.com/questions/52805528/how-does-the-mm-cmpgt-epi64-intrinsic-work
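// Worked example of the bias trick on one 64-bit lane: comparing
// 0xFFFFFFFFFFFFFFFF (unsigned max) with 0x0000000000000001, a signed compare
// sees -1 < 1 and orders them backwards. XORing both operands with
// 0x8000000000000000 gives 0x7FFFFFFFFFFFFFFF and 0x8000000000000001, whose
// signed order matches the unsigned order of the original values.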
const __m128i rangeshift = _mm_set1_epi64x(0x8000000000000000);
__m128i resultgt = _mm_cmpgt_epi64(_mm_xor_si128(item1, rangeshift), _mm_xor_si128(item2, rangeshift));
// cmpgt returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where item1 > item2 is true
// _mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFF) makes 0x0000000000000000FFFFFFFFFFFFFFFF,
// which is the desired mask inverted.
// AND the mask with result such that it returns 1 if all zeroes
if((unsigned int)_mm_test_all_zeros(result, ~_mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFF)))
{
// Returned a 1, therefore equality comparison gave 0x0000000000000000
// for both 64-bits or 0x0000000000000000FFFFFFFFFFFFFFFF - this
// particular case highlights why an unsigned compare is very important.
// CMPGT will have given 0xFFFFFFFFFFFFFFFFYYYYYYYYYYYYYYYY or
// 0x0000000000000000YYYYYYYYYYYYYYYY
// Right shift to put the desired bits into the lower part of the
// register (overwrite the Ys)
resultgt = _mm_bsrli_si128(resultgt, 8);
// Will either be all ones or all zeros. If all ones, item1 > item2, if
// all zeros, item1 < item2
if((uint64_t)_mm_cvtsi128_si64x(resultgt)) // Lop off upper half
{
return 1; // 0x[0000000000000000]0000000000000000
}
else
{
return -1; // 0x[0000000000000000]FFFFFFFFFFFFFFFF
}
}
else // AND mask produced a nonzero value, so the test returned 0.
{
// Therefore equality comparison gave 0xFFFFFFFFFFFFFFFF0000000000000000
// (which is the same as the mask) and CMPGT will have given
// 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF or 0xFFFFFFFFFFFFFFFF0000000000000000
// Lower register bits will either be all ones or all zeros. If all ones,
// item1 > item2, if all zeros, item1 < item2
if((uint64_t)_mm_cvtsi128_si64x(resultgt)) // Lop off upper half
{
return 1; // 0x[FFFFFFFFFFFFFFFF]FFFFFFFFFFFFFFFF
}
else
{
return -1; // 0x[FFFFFFFFFFFFFFFF]0000000000000000
}
}
}
}
return 0;
}
// Equality-only version
int memcmp_128bit_eq_u(const void *str1, const void *str2, size_t count)
{
const __m128i_u *s1 = (const __m128i_u*)str1;
const __m128i_u *s2 = (const __m128i_u*)str2;
while (count--)
{
__m128i item1 = _mm_lddqu_si128(s1++);
__m128i item2 = _mm_lddqu_si128(s2++);
__m128i result = _mm_cmpeq_epi64(item1, item2);
// cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
// true, and 0 per 64-bit portion where false
// If result is not all ones, then there is a difference here
if(!(unsigned int)_mm_test_all_ones(result))
{
return -1;
}
}
return 0;
}
//-----------------------------------------------------------------------------
// AVX2+ Unaligned:
//-----------------------------------------------------------------------------
// AVX2 (256-bit, 32 bytes at a time - 8 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes / 32), i.e. the number of 256-bit units
// Haswell and Ryzen and up
#ifdef __AVX2__
int memcmp_256bit_u(const void *str1, const void *str2, size_t count)
{
const __m256i_u *s1 = (const __m256i_u*)str1;
const __m256i_u *s2 = (const __m256i_u*)str2;
while (count--)
{
__m256i item1 = _mm256_lddqu_si256(s1++);
__m256i item2 = _mm256_lddqu_si256(s2++);
__m256i result = _mm256_cmpeq_epi64(item1, item2);
// cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
// true, and 0 per 64-bit portion where false
// If result is not all ones, then there is a difference here.
// This is the same thing as _mm_test_all_ones, but 256-bit
if(!(unsigned int)_mm256_testc_si256(result, _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFF)))
{ // Using 0xFFFFFFFFFFFFFFFF explicitly instead of -1 for clarity.
// It really makes no difference on two's complement machines.
// Ok, now we know they're not equal somewhere. Man, doing a pure != is
// sooo much simpler than > or <....
// Unsigned greater-than compare using signed operations, see:
// https://stackoverflow.com/questions/52805528/how-does-the-mm-cmpgt-epi64-intrinsic-work
const __m256i rangeshift = _mm256_set1_epi64x(0x8000000000000000);
__m256i resultgt = _mm256_cmpgt_epi64(_mm256_xor_si256(item1, rangeshift), _mm256_xor_si256(item2, rangeshift));
// Returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where item1 > item2 is true
// movemask_epi8 packs the top bit of each byte into a 32-bit value, so each
// 64-bit lane maps to one byte of the mask (lane 0 -> bits 0-7, ..., lane 3
// -> bits 24-31). The 256-bit unit is compared as one little-endian integer,
// so the most significant differing lane decides the result.
unsigned int result_to_scan = (unsigned int)_mm256_movemask_epi8(result);
unsigned int resultgt_to_scan = (unsigned int)_mm256_movemask_epi8(resultgt);
// ~result_to_scan has its bits set at exactly the differing lanes, and
// resultgt_to_scan at the differing lanes where item1 > item2 (a subset of
// the former). XORing the two isolates the lanes where item1 < item2;
// whichever mask holds the higher set bit owns the most significant differing
// lane.
if((~result_to_scan ^ resultgt_to_scan) > resultgt_to_scan)
{
return -1; // Most significant differing lane has item1 < item2
}
else
{
return 1; // Most significant differing lane has item1 > item2
}
}
}
return 0;
}
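// Worked example of the mask comparison above: suppose lane 3 (the most
// significant 64 bits of the 256-bit unit) has item1 > item2 while lane 0 has
// item1 < item2. Then ~result_to_scan = 0xFF0000FF and resultgt_to_scan =
// 0xFF000000, and the XOR isolates 0x000000FF - the item1 < item2 lanes.
// Since 0x000000FF is not greater than 0xFF000000, the function returns 1:
// lane 3 dominates, exactly as if the 256-bit units were compared as integers.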
// Equality-only version
int memcmp_256bit_eq_u(const void *str1, const void *str2, size_t count)
{
const __m256i_u *s1 = (const __m256i_u*)str1;
const __m256i_u *s2 = (const __m256i_u*)str2;
while (count--)
{
__m256i item1 = _mm256_lddqu_si256(s1++);
__m256i item2 = _mm256_lddqu_si256(s2++);
__m256i result = _mm256_cmpeq_epi64(item1, item2);
// cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
// true, and 0 per 64-bit portion where false
// If result is not all ones, then there is a difference here.
// This is the same thing as _mm_test_all_ones, but 256-bit
if(!(unsigned int)_mm256_testc_si256(result, _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFF)))
{ // Using 0xFFFFFFFFFFFFFFFF explicitly instead of -1 for clarity.
// It really makes no difference on two's complement machines.
return -1;
}
}
return 0;
}
#endif
// AVX-512 (512-bit, 64 bytes at a time - 16 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes / 64), i.e. the number of 512-bit units
// Requires AVX512F
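// The AVX-512 compare intrinsics used below return a k-mask with one bit per
// 64-bit lane, bit 0 corresponding to the lowest-addressed lane. For example,
// if only the lowest lane differs, _mm512_cmpneq_epu64_mask() returns 0x01.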
#ifdef __AVX512F__
int memcmp_512bit_u(const void *str1, const void *str2, size_t count)
{
const __m512i_u *s1 = (const __m512i_u*)str1;
const __m512i_u *s2 = (const __m512i_u*)str2;
while (count--)
{
__m512i item1 = _mm512_loadu_si512(s1++);
__m512i item2 = _mm512_loadu_si512(s2++);
unsigned char result = _mm512_cmpneq_epu64_mask(item1, item2);
// All bits == 0 means equal
if(result) // I don't believe this. I really need a CPU with AVX-512, lol.
// if(_mm512_mask_cmp_epu64_mask(0xFF, item1, item2, 4)) // 0 is CMPEQ, 4 is CMP_NE, this is the same thing
{
unsigned char resultgt = _mm512_cmpgt_epu64_mask(item1, item2);
// For every 64-bit lane where item1 > item2, the mask has a 1 bit there, else
// 0. result ^ resultgt isolates the lanes where item1 < item2; whichever mask
// holds the higher set bit owns the most significant differing lane, which
// decides the comparison of the 512-bit unit (same idea as the AVX2 version).
if((unsigned char)(result ^ resultgt) > resultgt)
{
return -1;
}
else
{
return 1;
}
}
}
return 0;
}
// Equality-only version
int memcmp_512bit_eq_u(const void *str1, const void *str2, size_t count)
{
const __m512i_u *s1 = (const __m512i_u*)str1;
const __m512i_u *s2 = (const __m512i_u*)str2;
while (count--)
{
__m512i item1 = _mm512_loadu_si512(s1++);
__m512i item2 = _mm512_loadu_si512(s2++);
unsigned char result = _mm512_cmpneq_epu64_mask(item1, item2);
// All bits == 0 means equal
if(result) // This is barely bigger than 1-byte memcmp_eq
{
return -1;
}
}
return 0;
}
#endif
//-----------------------------------------------------------------------------
// SSE4.2 Aligned:
//-----------------------------------------------------------------------------
// SSE4.2 (128-bit, 16 bytes at a time - 4 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes / 16), i.e. the number of 128-bit units
int memcmp_128bit_a(const void *str1, const void *str2, size_t count)
{
const __m128i *s1 = (const __m128i*)str1;
const __m128i *s2 = (const __m128i*)str2;
while (count--)
{
__m128i item1 = _mm_load_si128(s1++);
__m128i item2 = _mm_load_si128(s2++);
__m128i result = _mm_cmpeq_epi64(item1, item2);
// cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
// true, and 0 per 64-bit portion where false
// If result is not all ones, then there is a difference here
if(!(unsigned int)_mm_test_all_ones(result))
{// Ok, now we know they're not equal somewhere
// The 128-bit unit is compared as one little-endian integer, so the upper
// 64-bit half dominates. A result where both halves differ (all zeroes) is
// handled the same way as 0x0000000000000000FFFFFFFFFFFFFFFF, where only the
// upper half differs: either way, the upper half decides.
// Do the greater-than comparison up front so it's ready before the
// conditional. It must be an unsigned compare, done by biasing both operands:
// https://stackoverflow.com/questions/52805528/how-does-the-mm-cmpgt-epi64-intrinsic-work
const __m128i rangeshift = _mm_set1_epi64x(0x8000000000000000);
__m128i resultgt = _mm_cmpgt_epi64(_mm_xor_si128(item1, rangeshift), _mm_xor_si128(item2, rangeshift));
// cmpgt returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where item1 > item2 is true
// _mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFF) makes 0x0000000000000000FFFFFFFFFFFFFFFF,
// which is the desired mask inverted.
// AND the mask with result such that it returns 1 if all zeroes
if((unsigned int)_mm_test_all_zeros(result, ~_mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFF)))
{
// Returned a 1, therefore equality comparison gave 0x0000000000000000
// for both 64-bits or 0x0000000000000000FFFFFFFFFFFFFFFF - this
// particular case highlights why an unsigned compare is very important.
// CMPGT will have given 0xFFFFFFFFFFFFFFFFYYYYYYYYYYYYYYYY or
// 0x0000000000000000YYYYYYYYYYYYYYYY
// Right shift to put the desired bits into the lower part of the
// register (overwrite the Ys)
resultgt = _mm_bsrli_si128(resultgt, 8);
// Will either be all ones or all zeros. If all ones, item1 > item2, if
// all zeros, item1 < item2
if((uint64_t)_mm_cvtsi128_si64x(resultgt)) // Lop off upper half
{
return 1; // 0x[0000000000000000]0000000000000000
}
else
{
return -1; // 0x[0000000000000000]FFFFFFFFFFFFFFFF
}
}
else // AND mask produced a nonzero value, so the test returned 0.
{
// Therefore equality comparison gave 0xFFFFFFFFFFFFFFFF0000000000000000
// (which is the same as the mask) and CMPGT will have given
// 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF or 0xFFFFFFFFFFFFFFFF0000000000000000
// Lower register bits will either be all ones or all zeros. If all ones,
// item1 > item2, if all zeros, item1 < item2
if((uint64_t)_mm_cvtsi128_si64x(resultgt)) // Lop off upper half
{
return 1; // 0x[FFFFFFFFFFFFFFFF]FFFFFFFFFFFFFFFF
}
else
{
return -1; // 0x[FFFFFFFFFFFFFFFF]0000000000000000
}
}
}
}
return 0;
}
// Equality-only version
int memcmp_128bit_eq_a(const void *str1, const void *str2, size_t count)
{
const __m128i *s1 = (const __m128i*)str1;
const __m128i *s2 = (const __m128i*)str2;
while (count--)
{
__m128i item1 = _mm_load_si128(s1++);
__m128i item2 = _mm_load_si128(s2++);
__m128i result = _mm_cmpeq_epi64(item1, item2);
// cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
// true, and 0 per 64-bit portion where false
// If result is not all ones, then there is a difference here
if(!(unsigned int)_mm_test_all_ones(result))
{
return -1;
}
}
return 0;
}
//-----------------------------------------------------------------------------
// AVX2+ Aligned:
//-----------------------------------------------------------------------------
// AVX2 (256-bit, 32 bytes at a time - 8 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes / 32), i.e. the number of 256-bit units
// Haswell and Ryzen and up
#ifdef __AVX2__
int memcmp_256bit_a(const void *str1, const void *str2, size_t count)
{
const __m256i *s1 = (const __m256i*)str1;
const __m256i *s2 = (const __m256i*)str2;
while (count--)
{
__m256i item1 = _mm256_load_si256(s1++);
__m256i item2 = _mm256_load_si256(s2++);
__m256i result = _mm256_cmpeq_epi64(item1, item2);
// cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
// true, and 0 per 64-bit portion where false
// If result is not all ones, then there is a difference here.
// This is the same thing as _mm_test_all_ones, but 256-bit
if(!(unsigned int)_mm256_testc_si256(result, _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFF)))
{ // Using 0xFFFFFFFFFFFFFFFF explicitly instead of -1 for clarity.
// It really makes no difference on two's complement machines.
// Ok, now we know they're not equal somewhere. Man, doing a pure != is
// sooo much simpler than > or <....
// Unsigned greater-than compare using signed operations, see:
// https://stackoverflow.com/questions/52805528/how-does-the-mm-cmpgt-epi64-intrinsic-work
const __m256i rangeshift = _mm256_set1_epi64x(0x8000000000000000);
__m256i resultgt = _mm256_cmpgt_epi64(_mm256_xor_si256(item1, rangeshift), _mm256_xor_si256(item2, rangeshift));
// Returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where item1 > item2 is true
// movemask_epi8 packs the top bit of each byte into a 32-bit value, so each
// 64-bit lane maps to one byte of the mask (lane 0 -> bits 0-7, ..., lane 3
// -> bits 24-31). The 256-bit unit is compared as one little-endian integer,
// so the most significant differing lane decides the result.
unsigned int result_to_scan = (unsigned int)_mm256_movemask_epi8(result);
unsigned int resultgt_to_scan = (unsigned int)_mm256_movemask_epi8(resultgt);
// ~result_to_scan has its bits set at exactly the differing lanes, and
// resultgt_to_scan at the differing lanes where item1 > item2 (a subset of
// the former). XORing the two isolates the lanes where item1 < item2;
// whichever mask holds the higher set bit owns the most significant differing
// lane.
if((~result_to_scan ^ resultgt_to_scan) > resultgt_to_scan)
{
return -1; // Most significant differing lane has item1 < item2
}
else
{
return 1; // Most significant differing lane has item1 > item2
}
}
}
return 0;
}
// Equality-only version
int memcmp_256bit_eq_a(const void *str1, const void *str2, size_t count)
{
const __m256i *s1 = (const __m256i*)str1;
const __m256i *s2 = (const __m256i*)str2;
while (count--)
{
__m256i item1 = _mm256_load_si256(s1++);
__m256i item2 = _mm256_load_si256(s2++);
__m256i result = _mm256_cmpeq_epi64(item1, item2);
// cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
// true, and 0 per 64-bit portion where false
// If result is not all ones, then there is a difference here.
// This is the same thing as _mm_test_all_ones, but 256-bit
if(!(unsigned int)_mm256_testc_si256(result, _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFF)))
{ // Using 0xFFFFFFFFFFFFFFFF explicitly instead of -1 for clarity.
// It really makes no difference on two's complement machines.
return -1;
}
}
return 0;
}
#endif
// AVX-512 (512-bit, 64 bytes at a time - 16 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes / 64), i.e. the number of 512-bit units
// Requires AVX512F
#ifdef __AVX512F__
int memcmp_512bit_a(const void *str1, const void *str2, size_t count)
{
const __m512i *s1 = (const __m512i*)str1;
const __m512i *s2 = (const __m512i*)str2;
while (count--)
{
__m512i item1 = _mm512_load_si512(s1++);
__m512i item2 = _mm512_load_si512(s2++);
unsigned char result = _mm512_cmpneq_epu64_mask(item1, item2);
// All bits == 0 means equal
if(result) // I don't believe this. I really need a CPU with AVX-512, lol.
// if(_mm512_mask_cmp_epu64_mask(0xFF, item1, item2, 4)) // 0 is CMPEQ, 4 is CMP_NE, this is the same thing
{
unsigned char resultgt = _mm512_cmpgt_epu64_mask(item1, item2);
// For every 64-bit lane where item1 > item2, the mask has a 1 bit there, else
// 0. result ^ resultgt isolates the lanes where item1 < item2; whichever mask
// holds the higher set bit owns the most significant differing lane, which
// decides the comparison of the 512-bit unit (same idea as the AVX2 version).
if((unsigned char)(result ^ resultgt) > resultgt)
{
return -1;
}
else
{
return 1;
}
}
}
return 0;
}
// GCC -O3 compiles memcmp_512bit_a(...) to about 25 lines of assembly. Its
// main loop (~10 cycles per iteration) is around 5 cycles slower per compared
// block than byte-wise memcmp's loop (~5 cycles); the mask operations take ~3
// cycles each...
//
// Once jump latencies are taken into account, that means this function can
// compare 64 BYTES of data in around the same time memcmp takes for just 1
// byte. The AVX2 version's main loop is 1 cycle slower than the AVX-512
// version's (i.e. ~11 cycles). When an inequality is found, memcmp takes 3
// cycles, AVX2 takes 16 cycles, and AVX512 takes 10 cycles to determine which
// input is greater.
//
// NOTE: These are estimates based solely on instruction latencies per Agner
// Fog's optimization tables: https://www.agner.org/optimize/.
// Equality-only version
int memcmp_512bit_eq_a(const void *str1, const void *str2, size_t count)
{
const __m512i *s1 = (const __m512i*)str1;
const __m512i *s2 = (const __m512i*)str2;
while (count--)
{
__m512i item1 = _mm512_load_si512(s1++);
__m512i item2 = _mm512_load_si512(s2++);
unsigned char result = _mm512_cmpneq_epu64_mask(item1, item2);
// All bits == 0 means equal
if(result) // This is barely bigger than byte-by-byte memcmp_eq
{
return -1;
}
}
return 0;
}
#endif
//-----------------------------------------------------------------------------
// Dispatch Functions (Unaligned):
//-----------------------------------------------------------------------------
// memcmp for large chunks of memory with arbitrary sizes
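// Worked example with AVX2 defined and numbytes = 100: iteration 1 compares
// three 32-byte blocks (100 >> 5 == 3) and leaves numbytes = 4 (100 & 31);
// iteration 2 compares one 4-byte block and leaves numbytes = 0. Any
// inequality returns immediately from inside the matching branch.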
int memcmp_large(const void *str1, const void *str2, size_t numbytes) // Worst case for the dispatch loop: 127 bytes (hits every size class once on an AVX-512 build)
{
int returnval = 0; // Return value if equal... or numbytes is 0
size_t offset = 0;
while(numbytes)
// This loop gets evaluated at most 7 times (on an AVX-512 build), handling a
// smaller remainder each iteration; the minimal non-trivial case runs it
// once. Each memcmp variant has its own internal loop.
{
if(numbytes < 2) // 1 byte
{
returnval = memcmp(str1, str2, numbytes);
if(returnval)
{
return returnval;
}
offset = numbytes & -1;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes = 0;
}
else if(numbytes < 4) // 2 bytes
{
returnval = memcmp_16bit(str1, str2, numbytes >> 1);
if(returnval)
{
return returnval;
}
offset = numbytes & -2;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 1;
}
else if(numbytes < 8) // 4 bytes
{
returnval = memcmp_32bit(str1, str2, numbytes >> 2);
if(returnval)
{
return returnval;
}
offset = numbytes & -4;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 3;
}
else if(numbytes < 16) // 8 bytes
{
returnval = memcmp_64bit(str1, str2, numbytes >> 3);
if(returnval)
{
return returnval;
}
offset = numbytes & -8;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 7;
}
#ifdef __AVX512F__
else if(numbytes < 32) // 16 bytes
{
returnval = memcmp_128bit_u(str1, str2, numbytes >> 4);
if(returnval)
{
return returnval;
}
offset = numbytes & -16;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 15;
}
else if(numbytes < 64) // 32 bytes
{
returnval = memcmp_256bit_u(str1, str2, numbytes >> 5);
if(returnval)
{
return returnval;
}
offset = numbytes & -32;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 31;
}
else // 64 bytes
{
returnval = memcmp_512bit_u(str1, str2, numbytes >> 6);
if(returnval)
{
return returnval;
}
offset = numbytes & -64;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 63;
}
#elif __AVX2__
else if(numbytes < 32) // 16 bytes
{
returnval = memcmp_128bit_u(str1, str2, numbytes >> 4);
if(returnval)
{
return returnval;
}
offset = numbytes & -16;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 15;
}
else // 32 bytes
{
returnval = memcmp_256bit_u(str1, str2, numbytes >> 5);
if(returnval)
{
return returnval;
}
offset = numbytes & -32;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 31;
}
#else // SSE4.2 only
else // 16 bytes
{
returnval = memcmp_128bit_u(str1, str2, numbytes >> 4);
if(returnval)
{
return returnval;
}
offset = numbytes & -16;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 15;
}
#endif
}
return returnval;
}
// Equality-only version
int memcmp_large_eq(const void *str1, const void *str2, size_t numbytes) // Worst case for the dispatch loop: 127 bytes (hits every size class once on an AVX-512 build)
{
int returnval = 0; // Return value if equal... or numbytes is 0
size_t offset = 0;
while(numbytes)
{
if(numbytes < 2) // 1 byte
{
returnval = memcmp_eq(str1, str2, numbytes);
if(returnval)
{
return returnval;
}
offset = numbytes & -1;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes = 0;
}
else if(numbytes < 4) // 2 bytes
{
returnval = memcmp_16bit_eq(str1, str2, numbytes >> 1);
if(returnval)
{
return returnval;
}
offset = numbytes & -2;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 1;
}
else if(numbytes < 8) // 4 bytes
{
returnval = memcmp_32bit_eq(str1, str2, numbytes >> 2);
if(returnval)
{
return returnval;
}
offset = numbytes & -4;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 3;
}
else if(numbytes < 16) // 8 bytes
{
returnval = memcmp_64bit_eq(str1, str2, numbytes >> 3);
if(returnval)
{
return returnval;
}
offset = numbytes & -8;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 7;
}
#ifdef __AVX512F__
else if(numbytes < 32) // 16 bytes
{
returnval = memcmp_128bit_eq_u(str1, str2, numbytes >> 4);
if(returnval)
{
return returnval;
}
offset = numbytes & -16;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 15;
}
else if(numbytes < 64) // 32 bytes
{
returnval = memcmp_256bit_eq_u(str1, str2, numbytes >> 5);
if(returnval)
{
return returnval;
}
offset = numbytes & -32;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 31;
}
else // 64 bytes
{
returnval = memcmp_512bit_eq_u(str1, str2, numbytes >> 6);
if(returnval)
{
return returnval;
}
offset = numbytes & -64;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 63;
}
#elif __AVX2__
else if(numbytes < 32) // 16 bytes
{
returnval = memcmp_128bit_eq_u(str1, str2, numbytes >> 4);
if(returnval)
{
return returnval;
}
offset = numbytes & -16;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 15;
}
else // 32 bytes
{
returnval = memcmp_256bit_eq_u(str1, str2, numbytes >> 5);
if(returnval)
{
return returnval;
}
offset = numbytes & -32;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 31;
}
#else // SSE4.2 only
else // 16 bytes
{
returnval = memcmp_128bit_eq_u(str1, str2, numbytes >> 4);
if(returnval)
{
return returnval;
}
offset = numbytes & -16;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 15;
}
#endif
}
return returnval;
}
//-----------------------------------------------------------------------------
// Dispatch Functions (Aligned):
//-----------------------------------------------------------------------------
// memcmp for large chunks of memory with arbitrary sizes (aligned)
int memcmp_large_a(const void *str1, const void *str2, size_t numbytes) // Worst case for the dispatch loop: 127 bytes (hits every size class once on an AVX-512 build)
{
int returnval = 0; // Return value if equal... or numbytes is 0
size_t offset = 0;
while(numbytes)
// This loop gets evaluated at most 7 times (on an AVX-512 build), handling a
// smaller remainder each iteration; the minimal non-trivial case runs it
// once. Each memcmp variant has its own internal loop.
{
if(numbytes < 2) // 1 byte
{
returnval = memcmp(str1, str2, numbytes);
if(returnval)
{
return returnval;
}
offset = numbytes & -1;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes = 0;
}
else if(numbytes < 4) // 2 bytes
{
returnval = memcmp_16bit(str1, str2, numbytes >> 1);
if(returnval)
{
return returnval;
}
offset = numbytes & -2;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 1;
}
else if(numbytes < 8) // 4 bytes
{
returnval = memcmp_32bit(str1, str2, numbytes >> 2);
if(returnval)
{
return returnval;
}
offset = numbytes & -4;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 3;
}
else if(numbytes < 16) // 8 bytes
{
returnval = memcmp_64bit(str1, str2, numbytes >> 3);
if(returnval)
{
return returnval;
}
offset = numbytes & -8;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 7;
}
#ifdef __AVX512F__
else if(numbytes < 32) // 16 bytes
{
returnval = memcmp_128bit_a(str1, str2, numbytes >> 4);
if(returnval)
{
return returnval;
}
offset = numbytes & -16;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 15;
}
else if(numbytes < 64) // 32 bytes
{
returnval = memcmp_256bit_a(str1, str2, numbytes >> 5);
if(returnval)
{
return returnval;
}
offset = numbytes & -32;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 31;
}
else // 64 bytes
{
returnval = memcmp_512bit_a(str1, str2, numbytes >> 6);
if(returnval)
{
return returnval;
}
offset = numbytes & -64;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 63;
}
#elif __AVX2__
else if(numbytes < 32) // 16 bytes
{
returnval = memcmp_128bit_a(str1, str2, numbytes >> 4);
if(returnval)
{
return returnval;
}
offset = numbytes & -16;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 15;
}
else // 32 bytes
{
returnval = memcmp_256bit_a(str1, str2, numbytes >> 5);
if(returnval)
{
return returnval;
}
offset = numbytes & -32;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 31;
}
#else // SSE4.2 only
else // 16 bytes
{
returnval = memcmp_128bit_a(str1, str2, numbytes >> 4);
if(returnval)
{
return returnval;
}
offset = numbytes & -16;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 15;
}
#endif
}
return returnval;
}
// Equality-only version (aligned)
int memcmp_large_eq_a(const void *str1, const void *str2, size_t numbytes) // Worst case for the dispatch loop: 127 bytes (hits every size class once on an AVX-512 build)
{
int returnval = 0; // Return value if equal... or numbytes is 0
size_t offset = 0;
while(numbytes)
{
if(numbytes < 2) // 1 byte
{
returnval = memcmp_eq(str1, str2, numbytes);
if(returnval)
{
return returnval;
}
offset = numbytes & -1;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes = 0;
}
else if(numbytes < 4) // 2 bytes
{
returnval = memcmp_16bit_eq(str1, str2, numbytes >> 1);
if(returnval)
{
return returnval;
}
offset = numbytes & -2;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 1;
}
else if(numbytes < 8) // 4 bytes
{
returnval = memcmp_32bit_eq(str1, str2, numbytes >> 2);
if(returnval)
{
return returnval;
}
offset = numbytes & -4;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 3;
}
else if(numbytes < 16) // 8 bytes
{
returnval = memcmp_64bit_eq(str1, str2, numbytes >> 3);
if(returnval)
{
return returnval;
}
offset = numbytes & -8;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 7;
}
#ifdef __AVX512F__
else if(numbytes < 32) // 16 bytes
{
returnval = memcmp_128bit_eq_a(str1, str2, numbytes >> 4);
if(returnval)
{
return returnval;
}
offset = numbytes & -16;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 15;
}
else if(numbytes < 64) // 32 bytes
{
returnval = memcmp_256bit_eq_a(str1, str2, numbytes >> 5);
if(returnval)
{
return returnval;
}
offset = numbytes & -32;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 31;
}
else // 64 bytes
{
returnval = memcmp_512bit_eq_a(str1, str2, numbytes >> 6);
if(returnval)
{
return returnval;
}
offset = numbytes & -64;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 63;
}
#elif __AVX2__
else if(numbytes < 32) // 16 bytes
{
returnval = memcmp_128bit_eq_a(str1, str2, numbytes >> 4);
if(returnval)
{
return returnval;
}
offset = numbytes & -16;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 15;
}
else // 32 bytes
{
returnval = memcmp_256bit_eq_a(str1, str2, numbytes >> 5);
if(returnval)
{
return returnval;
}
offset = numbytes & -32;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 31;
}
#else // SSE4.2 only
else // 16 bytes
{
returnval = memcmp_128bit_eq_a(str1, str2, numbytes >> 4);
if(returnval)
{
return returnval;
}
offset = numbytes & -16;
str1 = (char *)str1 + offset;
str2 = (char *)str2 + offset;
numbytes &= 15;
}
#endif
}
return returnval;
}
//-----------------------------------------------------------------------------
// Main Function:
//-----------------------------------------------------------------------------
// Main memcmp function
int AVX_memcmp(const void *str1, const void *str2, size_t numbytes, int equality)
{
int returnval = 0;
if(
( ((uintptr_t)str1 & BYTE_ALIGNMENT) == 0 )
&&
( ((uintptr_t)str2 & BYTE_ALIGNMENT) == 0 )
) // Check alignment
{
// See memmove.c for why special aligned versions are worth having for memcmp,
// a function that performs 2 loads per iteration.
if(equality == 0)
{
returnval = memcmp_large_eq_a(str1, str2, numbytes);
}
else
{
returnval = memcmp_large_a(str1, str2, numbytes);
}
}
else
{
if(equality == 0)
{
returnval = memcmp_large_eq(str1, str2, numbytes);
}
else
{
returnval = memcmp_large(str1, str2, numbytes);
}
}
return returnval;
}
// AVX-1024+ support pending existence of the standard.