#include <kernel.h>

// Equality-only version
int memcmp_eq(const void *str1, const void *str2, size_t count)
{
  const unsigned char *s1 = (const unsigned char *)str1;
  const unsigned char *s2 = (const unsigned char *)str2;

  while (count--)
  {
    if (*s1++ != *s2++)
    {
      return -1; // Makes more sense to me if -1 means unequal.
    }
  }
  return 0; // Return 0 if equal to match normal memcmp
}

///=============================================================================
/// LICENSING INFORMATION
///=============================================================================
//
// The code above this comment is in the public domain.
// The code below this comment is subject to the custom attribution license found
// here: https://github.com/KNNSpeed/Simple-Kernel/blob/master/LICENSE_KERNEL
//
//==============================================================================
// AVX Memory Functions: AVX Memcmp
//==============================================================================
//
// Version 1.2
//
// Author:
//  KNNSpeed
//
// Source Code:
//  https://github.com/KNNSpeed/Simple-Kernel
//
// Minimum requirement:
//  x86_64 CPU with SSE4.2, but AVX2 or later is recommended
//
// This file provides a highly optimized version of memcmp.
// It also allows for selection of modes: "check for equality" or the full
// greater-than/less-than comparison. For equality-only, pass 0 to the
// equality argument; pass 1 (or really any nonzero int) for full comparison.
//
// In equality mode, a return value of 0 means equal and -1 means unequal.
// In full comparison mode, -1 means str1 is less, 0 means equal, and 1 means
// str1 is greater.
//

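// A minimal usage sketch (illustrative only; the helper names below are
// hypothetical and not part of this file's API - the prototype matches the
// definition of AVX_memcmp at the bottom of this file):

int AVX_memcmp(const void *str1, const void *str2, size_t numbytes, int equality);

// Equality-only mode: a nonzero result here means the buffers match.
static inline int buffers_match(const void *a, const void *b, size_t len)
{
  return AVX_memcmp(a, b, len, 0) == 0; // 0 -> equal, -1 -> unequal
}

// Full comparison mode: same -1/0/1 sign convention as standard memcmp,
// but per comparison unit rather than per byte (see the notes below).
static inline int buffers_order(const void *a, const void *b, size_t len)
{
  return AVX_memcmp(a, b, len, 1);
}
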
#ifdef __clang__
#define __m128i_u __m128i
#define __m256i_u __m256i
#define __m512i_u __m512i
#define _mm_cvtsi128_si64x _mm_cvtsi128_si64
#define _mm_cvtsi64x_si128 _mm_cvtsi64_si128
#endif

#ifdef __AVX512F__
#define BYTE_ALIGNMENT 0x3F // For 64-byte alignment
#elif __AVX2__
#define BYTE_ALIGNMENT 0x1F // For 32-byte alignment
#else
#define BYTE_ALIGNMENT 0x0F // For 16-byte alignment
#endif

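// Illustrative sketch (hypothetical helper, not used elsewhere in this
// file): a pointer is suitably aligned for the widest enabled vector width
// when the low address bits selected by BYTE_ALIGNMENT are all zero.
// AVX_memcmp at the bottom of this file performs exactly this test on both
// of its inputs to choose between the aligned and unaligned dispatch paths.
static inline int is_vector_aligned(const void *p)
{
  return ((uintptr_t)p & BYTE_ALIGNMENT) == 0;
}
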
//-----------------------------------------------------------------------------
// Individual Functions:
//-----------------------------------------------------------------------------
//
// The following memcmps return -1 or 1 depending on the sign of the first unit
// of their respective sizes, as opposed to the first byte (memcmp(3) is only
// defined for byte-by-byte comparisons, not, e.g., 16-byte-by-16-byte).
//
// These functions are written such that they still behave safely even if they
// run off the end of the intended memory area (e.g. if numbytes was larger
// than the intended area for whatever reason). The return value won't
// necessarily be indicative of the intended memory area in that case.
//

// 16-bit (2 bytes at a time)
// Count is (# of total bytes / 2), i.e. the number of 16-bit units

int memcmp_16bit(const void *str1, const void *str2, size_t count)
{
  const uint16_t *s1 = (const uint16_t*)str1;
  const uint16_t *s2 = (const uint16_t*)str2;

  while (count--)
  {
    if (*s1++ != *s2++)
    {
      return s1[-1] < s2[-1] ? -1 : 1;
    }
  }
  return 0;
}

// Equality-only version
int memcmp_16bit_eq(const void *str1, const void *str2, size_t count)
{
  const uint16_t *s1 = (const uint16_t*)str1;
  const uint16_t *s2 = (const uint16_t*)str2;

  while (count--)
  {
    if (*s1++ != *s2++)
    {
      return -1;
    }
  }
  return 0;
}

// 32-bit (4 bytes at a time - 1 pixel in a 32-bit linear frame buffer)
// Count is (# of total bytes / 4), i.e. the number of 32-bit units

int memcmp_32bit(const void *str1, const void *str2, size_t count)
{
  const uint32_t *s1 = (const uint32_t*)str1;
  const uint32_t *s2 = (const uint32_t*)str2;

  while (count--)
  {
    if (*s1++ != *s2++)
    {
      return s1[-1] < s2[-1] ? -1 : 1;
    }
  }
  return 0;
}

// Equality-only version
int memcmp_32bit_eq(const void *str1, const void *str2, size_t count)
{
  const uint32_t *s1 = (const uint32_t*)str1;
  const uint32_t *s2 = (const uint32_t*)str2;

  while (count--)
  {
    if (*s1++ != *s2++)
    {
      return -1;
    }
  }
  return 0;
}

// 64-bit (8 bytes at a time - 2 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes / 8), i.e. the number of 64-bit units

int memcmp_64bit(const void *str1, const void *str2, size_t count)
{
  const uint64_t *s1 = (const uint64_t*)str1;
  const uint64_t *s2 = (const uint64_t*)str2;

  while (count--)
  {
    if (*s1++ != *s2++)
    {
      return s1[-1] < s2[-1] ? -1 : 1;
    }
  }
  return 0;
}

// Equality-only version
int memcmp_64bit_eq(const void *str1, const void *str2, size_t count)
{
  const uint64_t *s1 = (const uint64_t*)str1;
  const uint64_t *s2 = (const uint64_t*)str2;

  while (count--)
  {
    if (*s1++ != *s2++)
    {
      return -1;
    }
  }
  return 0;
}

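// Example of the count conventions above (an illustrative, hypothetical
// helper): the caller divides the byte count by the unit size. For a
// 12-byte region, the 32-bit version compares 12 >> 2 = 3 units, which
// exactly covers all 12 bytes.
static inline int compare_12_bytes(const void *a, const void *b)
{
  return memcmp_32bit(a, b, 12 >> 2);
}
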
//-----------------------------------------------------------------------------
// SSE4.2 Unaligned:
//-----------------------------------------------------------------------------

// SSE4.2 (128-bit, 16 bytes at a time - 4 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes / 16), i.e. the number of 128-bit units

int memcmp_128bit_u(const void *str1, const void *str2, size_t count)
{
  const __m128i_u *s1 = (const __m128i_u*)str1;
  const __m128i_u *s2 = (const __m128i_u*)str2;

  while (count--)
  {
    __m128i item1 = _mm_lddqu_si128(s1++);
    __m128i item2 = _mm_lddqu_si128(s2++);
    __m128i result = _mm_cmpeq_epi64(item1, item2);
    // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
    // true, and 0 per 64-bit portion where false

    // If result is not all ones, then there is a difference here
    if(!(unsigned int)_mm_test_all_ones(result))
    { // Ok, now we know they're not equal somewhere

      // In the case where both halves of the 128-bit result integer are
      // 0x0000000000000000, that's the same as
      // 0x0000000000000000FFFFFFFFFFFFFFFF. Only the MSB matters here as the
      // comparison is a greater-than check.

      // Do the greater-than comparison here to have it done before the
      // conditional. Also make it an unsigned compare:
      // https://stackoverflow.com/questions/52805528/how-does-the-mm-cmpgt-epi64-intrinsic-work
      const __m128i rangeshift = _mm_set1_epi64x(0x8000000000000000);
      __m128i resultgt = _mm_cmpgt_epi64(_mm_xor_si128(item1, rangeshift), _mm_xor_si128(item2, rangeshift));
      // cmpgt returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where item1 > item2 is true

      // _mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFF) makes 0x0000000000000000FFFFFFFFFFFFFFFF,
      // which is the desired mask inverted.
      // AND the mask with result such that the test returns 1 if all zeroes
      if((unsigned int)_mm_test_all_zeros(result, ~_mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFF)))
      {
        // Returned a 1, therefore the equality comparison gave 0x0000000000000000
        // for both 64-bit halves or 0x0000000000000000FFFFFFFFFFFFFFFF - this
        // particular case highlights why an unsigned compare is very important.
        // CMPGT will have given 0xFFFFFFFFFFFFFFFFYYYYYYYYYYYYYYYY or
        // 0x0000000000000000YYYYYYYYYYYYYYYY

        // Right shift to put the desired bits into the lower part of the
        // register (overwriting the Ys)
        resultgt = _mm_bsrli_si128(resultgt, 8);
        // Will either be all ones or all zeros. If all ones, item1 > item2; if
        // all zeros, item1 < item2
        if((uint64_t)_mm_cvtsi128_si64x(resultgt)) // Lop off upper half
        {
          return 1; // 0x[0000000000000000]0000000000000000
        }
        else
        {
          return -1; // 0x[0000000000000000]FFFFFFFFFFFFFFFF
        }
      }
      else // AND mask produced a nonzero value, so the test returned 0.
      {
        // Therefore the equality comparison gave 0xFFFFFFFFFFFFFFFF0000000000000000
        // (which is the same as the mask) and CMPGT will have given
        // 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF or 0xFFFFFFFFFFFFFFFF0000000000000000

        // Lower register bits will either be all ones or all zeros. If all ones,
        // item1 > item2; if all zeros, item1 < item2
        if((uint64_t)_mm_cvtsi128_si64x(resultgt)) // Lop off upper half
        {
          return 1; // 0x[FFFFFFFFFFFFFFFF]FFFFFFFFFFFFFFFF
        }
        else
        {
          return -1; // 0x[FFFFFFFFFFFFFFFF]0000000000000000
        }
      }
    }
  }
  return 0;
}

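// Scalar illustration of the range-shift trick used above (the helper is
// hypothetical and only for exposition): XORing both operands with
// 0x8000000000000000 flips their sign bits, so a signed greater-than on the
// shifted values yields the unsigned ordering of the originals. This is why
// the vector code XORs with rangeshift before the signed _mm_cmpgt_epi64.
static inline int unsigned_gt_via_signed(uint64_t a, uint64_t b)
{
  int64_t sa = (int64_t)(a ^ 0x8000000000000000ULL);
  int64_t sb = (int64_t)(b ^ 0x8000000000000000ULL);
  return sa > sb; // same truth value as the unsigned comparison a > b
}
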
// Equality-only version
int memcmp_128bit_eq_u(const void *str1, const void *str2, size_t count)
{
  const __m128i_u *s1 = (const __m128i_u*)str1;
  const __m128i_u *s2 = (const __m128i_u*)str2;

  while (count--)
  {
    __m128i item1 = _mm_lddqu_si128(s1++);
    __m128i item2 = _mm_lddqu_si128(s2++);
    __m128i result = _mm_cmpeq_epi64(item1, item2);
    // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
    // true, and 0 per 64-bit portion where false

    // If result is not all ones, then there is a difference here
    if(!(unsigned int)_mm_test_all_ones(result))
    {
      return -1;
    }
  }
  return 0;
}

//-----------------------------------------------------------------------------
// AVX2+ Unaligned:
//-----------------------------------------------------------------------------

// AVX2 (256-bit, 32 bytes at a time - 8 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes / 32), i.e. the number of 256-bit units
// Haswell and Ryzen and up

#ifdef __AVX2__
int memcmp_256bit_u(const void *str1, const void *str2, size_t count)
{
  const __m256i_u *s1 = (const __m256i_u*)str1;
  const __m256i_u *s2 = (const __m256i_u*)str2;

  while (count--)
  {
    __m256i item1 = _mm256_lddqu_si256(s1++);
    __m256i item2 = _mm256_lddqu_si256(s2++);
    __m256i result = _mm256_cmpeq_epi64(item1, item2);
    // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
    // true, and 0 per 64-bit portion where false

    // If result is not all ones, then there is a difference here.
    // This is the same thing as _mm_test_all_ones, but 256-bit
    if(!(unsigned int)_mm256_testc_si256(result, _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFF)))
    { // Using 0xFFFFFFFFFFFFFFFF explicitly instead of -1 for clarity;
      // it makes no difference on two's complement machines.

      // Ok, now we know they're not equal somewhere. Man, doing a pure != is
      // so much simpler than > or <...

      // Unsigned greater-than compare using signed operations, see:
      // https://stackoverflow.com/questions/52805528/how-does-the-mm-cmpgt-epi64-intrinsic-work
      const __m256i rangeshift = _mm256_set1_epi64x(0x8000000000000000);
      __m256i resultgt = _mm256_cmpgt_epi64(_mm256_xor_si256(item1, rangeshift), _mm256_xor_si256(item2, rangeshift));
      // Returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where item1 > item2 is true

      // 32-bit value; the 4 outcomes we care about from cmpeq -> movemask are
      // 00YYYYYY FF00YYYY FFFF00YY FFFFFF00, where Y is "don't care." The most
      // significant zeroed byte is the inequality we care about.

      // This is the fastest we can do on AVX2.
      unsigned int result_to_scan = (unsigned int)_mm256_movemask_epi8(result);
      unsigned int resultgt_to_scan = (unsigned int)_mm256_movemask_epi8(resultgt);
      // Outcomes from cmpgt are ZZYYYYYY 00ZZYYYY 0000ZZYY 000000ZZ, where
      // Z is F if item1 > item2, 0 if item1 < item2, and Y is "don't care."
      // The ZZ position of cmpgt will match the corresponding 00 of cmpeq.

      // result_to_scan: 00YYYYYY FF00YYYY FFFF00YY FFFFFF00 --inverted-->
      // FFYYYYYY 00FFYYYY 0000FFYY 000000FF. This will either be
      // > resultgt_to_scan (ZZ = 00) or it won't (ZZ = FF).
      if(~result_to_scan > resultgt_to_scan)
      {
        return -1; // If ZZ = 00, item1 < item2
      }
      else
      {
        return 1; // If ZZ = FF, item1 > item2
      }
    }
  }
  return 0;
}

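// Worked example of the movemask comparison above (illustrative sketch; the
// helper is hypothetical). Suppose only the lowest 64-bit lane differs, with
// item1 < item2 there. Reading the masks in hex:
//   result_to_scan   = 0xFFFFFF00 (equal lanes give FF bytes)
//   ~result_to_scan  = 0x000000FF
//   resultgt_to_scan = 0x00000000 (no lane has item1 > item2)
// so ~result_to_scan > resultgt_to_scan holds and -1 is returned. With
// item1 > item2 in that lane instead, resultgt_to_scan = 0x000000FF, the
// comparison fails, and 1 is returned.
static inline int direction_from_movemasks(unsigned int eq_mask, unsigned int gt_mask)
{
  // eq_mask: FF bytes where 64-bit lanes are equal; gt_mask: FF bytes where
  // item1 > item2 (unsigned). Mirrors the branch in memcmp_256bit_u.
  return (~eq_mask > gt_mask) ? -1 : 1;
}
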
// Equality-only version
int memcmp_256bit_eq_u(const void *str1, const void *str2, size_t count)
{
  const __m256i_u *s1 = (const __m256i_u*)str1;
  const __m256i_u *s2 = (const __m256i_u*)str2;

  while (count--)
  {
    __m256i item1 = _mm256_lddqu_si256(s1++);
    __m256i item2 = _mm256_lddqu_si256(s2++);
    __m256i result = _mm256_cmpeq_epi64(item1, item2);
    // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
    // true, and 0 per 64-bit portion where false

    // If result is not all ones, then there is a difference here.
    // This is the same thing as _mm_test_all_ones, but 256-bit
    if(!(unsigned int)_mm256_testc_si256(result, _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFF)))
    { // Using 0xFFFFFFFFFFFFFFFF explicitly instead of -1 for clarity;
      // it makes no difference on two's complement machines.
      return -1;
    }
  }
  return 0;
}
#endif

// AVX-512 (512-bit, 64 bytes at a time - 16 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes / 64), i.e. the number of 512-bit units
// Requires AVX512F

#ifdef __AVX512F__
int memcmp_512bit_u(const void *str1, const void *str2, size_t count)
{
  const __m512i_u *s1 = (const __m512i_u*)str1;
  const __m512i_u *s2 = (const __m512i_u*)str2;

  while (count--)
  {
    __m512i item1 = _mm512_loadu_si512(s1++);
    __m512i item2 = _mm512_loadu_si512(s2++);
    unsigned char result = _mm512_cmpneq_epu64_mask(item1, item2);
    // All mask bits == 0 means equal

    if(result) // I don't believe this. I really need a CPU with AVX-512, lol.
    // if(_mm512_mask_cmp_epu64_mask(0xFF, item1, item2, 4)) // 0 is CMP_EQ, 4 is CMP_NE; this is the same thing
    {
      unsigned char resultgt = _mm512_cmpgt_epu64_mask(item1, item2);
      // For every 64-bit lane where item1 > item2, the mask will have a 1 bit
      // there, else 0

      if(result > resultgt) // Similar deal as AVX2
      {
        return -1;
      }
      else
      {
        return 1;
      }
    }
  }
  return 0;
}

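// Mask arithmetic behind "result > resultgt" above (illustrative sketch,
// hypothetical helper): every bit set in resultgt is also set in result,
// since a lane with item1 > item2 is necessarily a differing lane. With a
// single differing lane i, result has only bit i set, and resultgt either
// equals result (item1 > item2 there) or is zero (item1 < item2 there), so
// an unsigned compare of the two masks picks the direction.
static inline int direction_from_bitmasks(unsigned char ne_mask, unsigned char gt_mask)
{
  // ne_mask: one bit per differing 64-bit lane; gt_mask: one bit per lane
  // where item1 > item2. Mirrors the branch in memcmp_512bit_u.
  return (ne_mask > gt_mask) ? -1 : 1;
}
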
// Equality-only version
int memcmp_512bit_eq_u(const void *str1, const void *str2, size_t count)
{
  const __m512i_u *s1 = (const __m512i_u*)str1;
  const __m512i_u *s2 = (const __m512i_u*)str2;

  while (count--)
  {
    __m512i item1 = _mm512_loadu_si512(s1++);
    __m512i item2 = _mm512_loadu_si512(s2++);
    unsigned char result = _mm512_cmpneq_epu64_mask(item1, item2);
    // All mask bits == 0 means equal

    if(result) // This is barely bigger than the 1-byte memcmp_eq
    {
      return -1;
    }
  }
  return 0;
}
#endif

//-----------------------------------------------------------------------------
// SSE4.2 Aligned:
//-----------------------------------------------------------------------------

// SSE4.2 (128-bit, 16 bytes at a time - 4 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes / 16), i.e. the number of 128-bit units

int memcmp_128bit_a(const void *str1, const void *str2, size_t count)
{
  const __m128i *s1 = (const __m128i*)str1;
  const __m128i *s2 = (const __m128i*)str2;

  while (count--)
  {
    __m128i item1 = _mm_load_si128(s1++);
    __m128i item2 = _mm_load_si128(s2++);
    __m128i result = _mm_cmpeq_epi64(item1, item2);
    // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
    // true, and 0 per 64-bit portion where false

    // If result is not all ones, then there is a difference here
    if(!(unsigned int)_mm_test_all_ones(result))
    { // Ok, now we know they're not equal somewhere

      // In the case where both halves of the 128-bit result integer are
      // 0x0000000000000000, that's the same as
      // 0x0000000000000000FFFFFFFFFFFFFFFF. Only the MSB matters here as the
      // comparison is a greater-than check.

      // Do the greater-than comparison here to have it done before the
      // conditional. Also make it an unsigned compare:
      // https://stackoverflow.com/questions/52805528/how-does-the-mm-cmpgt-epi64-intrinsic-work
      const __m128i rangeshift = _mm_set1_epi64x(0x8000000000000000);
      __m128i resultgt = _mm_cmpgt_epi64(_mm_xor_si128(item1, rangeshift), _mm_xor_si128(item2, rangeshift));
      // cmpgt returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where item1 > item2 is true

      // _mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFF) makes 0x0000000000000000FFFFFFFFFFFFFFFF,
      // which is the desired mask inverted.
      // AND the mask with result such that the test returns 1 if all zeroes
      if((unsigned int)_mm_test_all_zeros(result, ~_mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFF)))
      {
        // Returned a 1, therefore the equality comparison gave 0x0000000000000000
        // for both 64-bit halves or 0x0000000000000000FFFFFFFFFFFFFFFF - this
        // particular case highlights why an unsigned compare is very important.
        // CMPGT will have given 0xFFFFFFFFFFFFFFFFYYYYYYYYYYYYYYYY or
        // 0x0000000000000000YYYYYYYYYYYYYYYY

        // Right shift to put the desired bits into the lower part of the
        // register (overwriting the Ys)
        resultgt = _mm_bsrli_si128(resultgt, 8);
        // Will either be all ones or all zeros. If all ones, item1 > item2; if
        // all zeros, item1 < item2
        if((uint64_t)_mm_cvtsi128_si64x(resultgt)) // Lop off upper half
        {
          return 1; // 0x[0000000000000000]0000000000000000
        }
        else
        {
          return -1; // 0x[0000000000000000]FFFFFFFFFFFFFFFF
        }
      }
      else // AND mask produced a nonzero value, so the test returned 0.
      {
        // Therefore the equality comparison gave 0xFFFFFFFFFFFFFFFF0000000000000000
        // (which is the same as the mask) and CMPGT will have given
        // 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF or 0xFFFFFFFFFFFFFFFF0000000000000000

        // Lower register bits will either be all ones or all zeros. If all ones,
        // item1 > item2; if all zeros, item1 < item2
        if((uint64_t)_mm_cvtsi128_si64x(resultgt)) // Lop off upper half
        {
          return 1; // 0x[FFFFFFFFFFFFFFFF]FFFFFFFFFFFFFFFF
        }
        else
        {
          return -1; // 0x[FFFFFFFFFFFFFFFF]0000000000000000
        }
      }
    }
  }
  return 0;
}

// Equality-only version
int memcmp_128bit_eq_a(const void *str1, const void *str2, size_t count)
{
  const __m128i *s1 = (const __m128i*)str1;
  const __m128i *s2 = (const __m128i*)str2;

  while (count--)
  {
    __m128i item1 = _mm_load_si128(s1++);
    __m128i item2 = _mm_load_si128(s2++);
    __m128i result = _mm_cmpeq_epi64(item1, item2);
    // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
    // true, and 0 per 64-bit portion where false

    // If result is not all ones, then there is a difference here
    if(!(unsigned int)_mm_test_all_ones(result))
    {
      return -1;
    }
  }
  return 0;
}

//-----------------------------------------------------------------------------
// AVX2+ Aligned:
//-----------------------------------------------------------------------------

// AVX2 (256-bit, 32 bytes at a time - 8 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes / 32), i.e. the number of 256-bit units
// Haswell and Ryzen and up

#ifdef __AVX2__
int memcmp_256bit_a(const void *str1, const void *str2, size_t count)
{
  const __m256i *s1 = (const __m256i*)str1;
  const __m256i *s2 = (const __m256i*)str2;

  while (count--)
  {
    __m256i item1 = _mm256_load_si256(s1++);
    __m256i item2 = _mm256_load_si256(s2++);
    __m256i result = _mm256_cmpeq_epi64(item1, item2);
    // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
    // true, and 0 per 64-bit portion where false

    // If result is not all ones, then there is a difference here.
    // This is the same thing as _mm_test_all_ones, but 256-bit
    if(!(unsigned int)_mm256_testc_si256(result, _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFF)))
    { // Using 0xFFFFFFFFFFFFFFFF explicitly instead of -1 for clarity;
      // it makes no difference on two's complement machines.

      // Ok, now we know they're not equal somewhere. Man, doing a pure != is
      // so much simpler than > or <...

      // Unsigned greater-than compare using signed operations, see:
      // https://stackoverflow.com/questions/52805528/how-does-the-mm-cmpgt-epi64-intrinsic-work
      const __m256i rangeshift = _mm256_set1_epi64x(0x8000000000000000);
      __m256i resultgt = _mm256_cmpgt_epi64(_mm256_xor_si256(item1, rangeshift), _mm256_xor_si256(item2, rangeshift));
      // Returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where item1 > item2 is true

      // 32-bit value; the 4 outcomes we care about from cmpeq -> movemask are
      // 00YYYYYY FF00YYYY FFFF00YY FFFFFF00, where Y is "don't care." The most
      // significant zeroed byte is the inequality we care about.

      // This is the fastest we can do on AVX2.
      unsigned int result_to_scan = (unsigned int)_mm256_movemask_epi8(result);
      unsigned int resultgt_to_scan = (unsigned int)_mm256_movemask_epi8(resultgt);
      // Outcomes from cmpgt are ZZYYYYYY 00ZZYYYY 0000ZZYY 000000ZZ, where
      // Z is F if item1 > item2, 0 if item1 < item2, and Y is "don't care."
      // The ZZ position of cmpgt will match the corresponding 00 of cmpeq.

      // result_to_scan: 00YYYYYY FF00YYYY FFFF00YY FFFFFF00 --inverted-->
      // FFYYYYYY 00FFYYYY 0000FFYY 000000FF. This will either be
      // > resultgt_to_scan (ZZ = 00) or it won't (ZZ = FF).
      if(~result_to_scan > resultgt_to_scan)
      {
        return -1; // If ZZ = 00, item1 < item2
      }
      else
      {
        return 1; // If ZZ = FF, item1 > item2
      }
    }
  }
  return 0;
}

// Equality-only version
int memcmp_256bit_eq_a(const void *str1, const void *str2, size_t count)
{
  const __m256i *s1 = (const __m256i*)str1;
  const __m256i *s2 = (const __m256i*)str2;

  while (count--)
  {
    __m256i item1 = _mm256_load_si256(s1++);
    __m256i item2 = _mm256_load_si256(s2++);
    __m256i result = _mm256_cmpeq_epi64(item1, item2);
    // cmpeq returns 0xFFFFFFFFFFFFFFFF per 64-bit portion where equality is
    // true, and 0 per 64-bit portion where false

    // If result is not all ones, then there is a difference here.
    // This is the same thing as _mm_test_all_ones, but 256-bit
    if(!(unsigned int)_mm256_testc_si256(result, _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFF)))
    { // Using 0xFFFFFFFFFFFFFFFF explicitly instead of -1 for clarity;
      // it makes no difference on two's complement machines.
      return -1;
    }
  }
  return 0;
}
#endif

// AVX-512 (512-bit, 64 bytes at a time - 16 pixels in a 32-bit linear frame buffer)
// Count is (# of total bytes / 64), i.e. the number of 512-bit units
// Requires AVX512F

#ifdef __AVX512F__
int memcmp_512bit_a(const void *str1, const void *str2, size_t count)
{
  const __m512i *s1 = (const __m512i*)str1;
  const __m512i *s2 = (const __m512i*)str2;

  while (count--)
  {
    __m512i item1 = _mm512_load_si512(s1++);
    __m512i item2 = _mm512_load_si512(s2++);
    unsigned char result = _mm512_cmpneq_epu64_mask(item1, item2);
    // All mask bits == 0 means equal

    if(result) // I don't believe this. I really need a CPU with AVX-512, lol.
    // if(_mm512_mask_cmp_epu64_mask(0xFF, item1, item2, 4)) // 0 is CMP_EQ, 4 is CMP_NE; this is the same thing
    {
      unsigned char resultgt = _mm512_cmpgt_epu64_mask(item1, item2);
      // For every 64-bit lane where item1 > item2, the mask will have a 1 bit
      // there, else 0

      if(result > resultgt) // Similar deal as AVX2
      {
        return -1;
      }
      else
      {
        return 1;
      }
    }
  }
  return 0;
}

// GCC -O3 compiles memcmp_512bit_a(...) down to 25 lines of assembly. This
// version (~10 cycles) is around 5 or so cycles slower per set of memory
// regions than memcmp (~5 cycles); it's the mask operations that take ~3
// cycles each...
//
// When the latency of jumps is taken into account, that means this function
// can compare 64 BYTES of data at around the same speed that memcmp does only
// 1 byte. The AVX2 version's main loop is 1 cycle slower than the AVX-512
// version's (i.e. it takes ~11 cycles). When an inequality is found, memcmp
// takes 3 cycles, AVX2 takes 16 cycles, and AVX-512 takes 10 cycles to
// determine which input is greater.
//
// NOTE: These are estimates based solely on instruction latencies per Agner
// Fog's optimization tables: https://www.agner.org/optimize/.

// Equality-only version
int memcmp_512bit_eq_a(const void *str1, const void *str2, size_t count)
{
  const __m512i *s1 = (const __m512i*)str1;
  const __m512i *s2 = (const __m512i*)str2;

  while (count--)
  {
    __m512i item1 = _mm512_load_si512(s1++);
    __m512i item2 = _mm512_load_si512(s2++);
    unsigned char result = _mm512_cmpneq_epu64_mask(item1, item2);
    // All mask bits == 0 means equal

    if(result) // This is barely bigger than the byte-by-byte memcmp_eq
    {
      return -1;
    }
  }
  return 0;
}
#endif

//-----------------------------------------------------------------------------
// Dispatch Functions (Unaligned):
//-----------------------------------------------------------------------------

// memcmp for large chunks of memory with arbitrary sizes
int memcmp_large(const void *str1, const void *str2, size_t numbytes) // Worst-case scenario: 127 bytes.
{
  int returnval = 0; // Return value if equal... or if numbytes is 0
  size_t offset = 0;

  while(numbytes)
  // This loop runs at most 7 times (on AVX-512 builds), handling a strictly
  // smaller remainder each pass; the minimum non-trivial case is one pass.
  // Each memcmp variant has its own internal loop.
  {
    if(numbytes < 2) // 1 byte
    {
      returnval = memcmp(str1, str2, numbytes);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -1;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes = 0;
    }
    else if(numbytes < 4) // 2 bytes
    {
      returnval = memcmp_16bit(str1, str2, numbytes >> 1);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -2;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 1;
    }
    else if(numbytes < 8) // 4 bytes
    {
      returnval = memcmp_32bit(str1, str2, numbytes >> 2);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -4;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 3;
    }
    else if(numbytes < 16) // 8 bytes
    {
      returnval = memcmp_64bit(str1, str2, numbytes >> 3);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -8;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 7;
    }
#ifdef __AVX512F__
    else if(numbytes < 32) // 16 bytes
    {
      returnval = memcmp_128bit_u(str1, str2, numbytes >> 4);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -16;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 15;
    }
    else if(numbytes < 64) // 32 bytes
    {
      returnval = memcmp_256bit_u(str1, str2, numbytes >> 5);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -32;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 31;
    }
    else // 64 bytes
    {
      returnval = memcmp_512bit_u(str1, str2, numbytes >> 6);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -64;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 63;
    }
#elif __AVX2__
    else if(numbytes < 32) // 16 bytes
    {
      returnval = memcmp_128bit_u(str1, str2, numbytes >> 4);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -16;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 15;
    }
    else // 32 bytes
    {
      returnval = memcmp_256bit_u(str1, str2, numbytes >> 5);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -32;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 31;
    }
#else // SSE4.2 only
    else // 16 bytes
    {
      returnval = memcmp_128bit_u(str1, str2, numbytes >> 4);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -16;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 15;
    }
#endif
  }
  return returnval;
}

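// Worked example of the dispatch above (illustrative): on an AVX-512 build,
// numbytes = 127 takes exactly one trip through each branch, largest first:
// 64 + 32 + 16 + 8 + 4 + 2 + 1 = 127 bytes, the worst case noted in the
// comment on memcmp_large.
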
// Equality-only version
int memcmp_large_eq(const void *str1, const void *str2, size_t numbytes) // Worst-case scenario: 127 bytes.
{
  int returnval = 0; // Return value if equal... or if numbytes is 0
  size_t offset = 0;

  while(numbytes)
  {
    if(numbytes < 2) // 1 byte
    {
      returnval = memcmp_eq(str1, str2, numbytes);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -1;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes = 0;
    }
    else if(numbytes < 4) // 2 bytes
    {
      returnval = memcmp_16bit_eq(str1, str2, numbytes >> 1);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -2;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 1;
    }
    else if(numbytes < 8) // 4 bytes
    {
      returnval = memcmp_32bit_eq(str1, str2, numbytes >> 2);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -4;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 3;
    }
    else if(numbytes < 16) // 8 bytes
    {
      returnval = memcmp_64bit_eq(str1, str2, numbytes >> 3);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -8;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 7;
    }
#ifdef __AVX512F__
    else if(numbytes < 32) // 16 bytes
    {
      returnval = memcmp_128bit_eq_u(str1, str2, numbytes >> 4);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -16;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 15;
    }
    else if(numbytes < 64) // 32 bytes
    {
      returnval = memcmp_256bit_eq_u(str1, str2, numbytes >> 5);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -32;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 31;
    }
    else // 64 bytes
    {
      returnval = memcmp_512bit_eq_u(str1, str2, numbytes >> 6);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -64;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 63;
    }
#elif __AVX2__
    else if(numbytes < 32) // 16 bytes
    {
      returnval = memcmp_128bit_eq_u(str1, str2, numbytes >> 4);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -16;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 15;
    }
    else // 32 bytes
    {
      returnval = memcmp_256bit_eq_u(str1, str2, numbytes >> 5);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -32;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 31;
    }
#else // SSE4.2 only
    else // 16 bytes
    {
      returnval = memcmp_128bit_eq_u(str1, str2, numbytes >> 4);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -16;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 15;
    }
#endif
  }
  return returnval;
}

//-----------------------------------------------------------------------------
// Dispatch Functions (Aligned):
//-----------------------------------------------------------------------------

// memcmp for large chunks of memory with arbitrary sizes (aligned)
int memcmp_large_a(const void *str1, const void *str2, size_t numbytes) // Worst-case scenario: 127 bytes.
{
  int returnval = 0; // Return value if equal... or if numbytes is 0
  size_t offset = 0;

  while(numbytes)
  // This loop runs at most 7 times (on AVX-512 builds), handling a strictly
  // smaller remainder each pass; the minimum non-trivial case is one pass.
  // Each memcmp variant has its own internal loop.
  {
    if(numbytes < 2) // 1 byte
    {
      returnval = memcmp(str1, str2, numbytes);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -1;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes = 0;
    }
    else if(numbytes < 4) // 2 bytes
    {
      returnval = memcmp_16bit(str1, str2, numbytes >> 1);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -2;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 1;
    }
    else if(numbytes < 8) // 4 bytes
    {
      returnval = memcmp_32bit(str1, str2, numbytes >> 2);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -4;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 3;
    }
    else if(numbytes < 16) // 8 bytes
    {
      returnval = memcmp_64bit(str1, str2, numbytes >> 3);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -8;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 7;
    }
#ifdef __AVX512F__
    else if(numbytes < 32) // 16 bytes
    {
      returnval = memcmp_128bit_a(str1, str2, numbytes >> 4);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -16;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 15;
    }
    else if(numbytes < 64) // 32 bytes
    {
      returnval = memcmp_256bit_a(str1, str2, numbytes >> 5);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -32;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 31;
    }
    else // 64 bytes
    {
      returnval = memcmp_512bit_a(str1, str2, numbytes >> 6);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -64;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 63;
    }
#elif __AVX2__
    else if(numbytes < 32) // 16 bytes
    {
      returnval = memcmp_128bit_a(str1, str2, numbytes >> 4);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -16;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 15;
    }
    else // 32 bytes
    {
      returnval = memcmp_256bit_a(str1, str2, numbytes >> 5);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -32;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 31;
    }
#else // SSE4.2 only
    else // 16 bytes
    {
      returnval = memcmp_128bit_a(str1, str2, numbytes >> 4);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -16;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 15;
    }
#endif
  }
  return returnval;
}

// Equality-only version (aligned)
int memcmp_large_eq_a(const void *str1, const void *str2, size_t numbytes) // Worst-case scenario: 127 bytes.
{
  int returnval = 0; // Return value if equal... or if numbytes is 0
  size_t offset = 0;

  while(numbytes)
  {
    if(numbytes < 2) // 1 byte
    {
      returnval = memcmp_eq(str1, str2, numbytes);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -1;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes = 0;
    }
    else if(numbytes < 4) // 2 bytes
    {
      returnval = memcmp_16bit_eq(str1, str2, numbytes >> 1);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -2;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 1;
    }
    else if(numbytes < 8) // 4 bytes
    {
      returnval = memcmp_32bit_eq(str1, str2, numbytes >> 2);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -4;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 3;
    }
    else if(numbytes < 16) // 8 bytes
    {
      returnval = memcmp_64bit_eq(str1, str2, numbytes >> 3);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -8;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 7;
    }
#ifdef __AVX512F__
    else if(numbytes < 32) // 16 bytes
    {
      returnval = memcmp_128bit_eq_a(str1, str2, numbytes >> 4);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -16;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 15;
    }
    else if(numbytes < 64) // 32 bytes
    {
      returnval = memcmp_256bit_eq_a(str1, str2, numbytes >> 5);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -32;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 31;
    }
    else // 64 bytes
    {
      returnval = memcmp_512bit_eq_a(str1, str2, numbytes >> 6);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -64;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 63;
    }
#elif __AVX2__
    else if(numbytes < 32) // 16 bytes
    {
      returnval = memcmp_128bit_eq_a(str1, str2, numbytes >> 4);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -16;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 15;
    }
    else // 32 bytes
    {
      returnval = memcmp_256bit_eq_a(str1, str2, numbytes >> 5);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -32;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 31;
    }
#else // SSE4.2 only
    else // 16 bytes
    {
      returnval = memcmp_128bit_eq_a(str1, str2, numbytes >> 4);
      if(returnval)
      {
        return returnval;
      }
      offset = numbytes & -16;
      str1 = (const char *)str1 + offset;
      str2 = (const char *)str2 + offset;
      numbytes &= 15;
    }
#endif
  }
  return returnval;
}

//-----------------------------------------------------------------------------
// Main Function:
//-----------------------------------------------------------------------------

// Main memcmp function
int AVX_memcmp(const void *str1, const void *str2, size_t numbytes, int equality)
{
  int returnval = 0;

  if(
      ( ((uintptr_t)str1 & BYTE_ALIGNMENT) == 0 )
      &&
      ( ((uintptr_t)str2 & BYTE_ALIGNMENT) == 0 )
    ) // Check alignment
  {
    // See memmove.c for why it's worth having special aligned versions of
    // memcmp, a function that performs 2 loads per iteration.
    if(equality == 0)
    {
      returnval = memcmp_large_eq_a(str1, str2, numbytes);
    }
    else
    {
      returnval = memcmp_large_a(str1, str2, numbytes);
    }
  }
  else
  {
    if(equality == 0)
    {
      returnval = memcmp_large_eq(str1, str2, numbytes);
    }
    else
    {
      returnval = memcmp_large(str1, str2, numbytes);
    }
  }

  return returnval;
}

// AVX-1024+ support pending the existence of such a standard.