std/crc32: re-write x86_sse42 implementation

name                                                       old speed      new speed      delta

wuffs_crc32_ieee_10k/clang14                               13.5GB/s ± 0%  21.9GB/s ± 0%  +62.09%  (p=0.016 n=5+4)
wuffs_crc32_ieee_100k/clang14                              22.4GB/s ± 9%  29.7GB/s ± 0%  +32.76%  (p=0.016 n=5+4)

wuffs_crc32_ieee_10k/gcc12                                 14.2GB/s ± 2%  22.2GB/s ± 0%  +56.92%  (p=0.008 n=5+5)
wuffs_crc32_ieee_100k/gcc12                                21.3GB/s ± 3%  29.6GB/s ± 1%  +39.18%  (p=0.008 n=5+5)

wuffs_gzip_decode_10k/clang14                               366MB/s ± 0%   372MB/s ± 0%   +1.41%  (p=0.008 n=5+5)
wuffs_gzip_decode_100k/clang14                              482MB/s ± 0%   494MB/s ± 0%   +2.57%  (p=0.008 n=5+5)

wuffs_gzip_decode_10k/gcc12                                 398MB/s ± 0%   418MB/s ± 0%   +5.19%  (p=0.008 n=5+5)
wuffs_gzip_decode_100k/gcc12                                510MB/s ± 0%   537MB/s ± 0%   +5.21%  (p=0.008 n=5+5)

wuffs_png_decode_image_19k_8bpp/clang14                     263MB/s ± 0%   264MB/s ± 0%   +0.39%  (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/clang14                    297MB/s ± 0%   298MB/s ± 0%   +0.16%  (p=0.008 n=5+5)
wuffs_png_decode_image_77k_8bpp/clang14                     932MB/s ± 0%   945MB/s ± 0%   +1.33%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang14   831MB/s ± 0%   833MB/s ± 0%   +0.25%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang14   799MB/s ± 0%   802MB/s ± 0%   +0.37%  (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/clang14                  306MB/s ± 0%   307MB/s ± 0%   +0.56%  (p=0.008 n=5+5)

wuffs_png_decode_image_19k_8bpp/gcc12                       283MB/s ± 0%   270MB/s ± 0%   -4.84%  (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/gcc12                      330MB/s ± 0%   329MB/s ± 0%   -0.29%  (p=0.016 n=5+4)
wuffs_png_decode_image_77k_8bpp/gcc12                       992MB/s ± 0%   957MB/s ± 0%   -3.56%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc12     920MB/s ± 0%   908MB/s ± 0%   -1.30%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc12     875MB/s ± 0%   867MB/s ± 0%   -0.95%  (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/gcc12                    341MB/s ± 0%   342MB/s ± 0%   +0.25%  (p=0.008 n=5+5)
Author: Nigel Tao
Date:   2024-04-23 22:50:55 +10:00
Parent: e541e0cf79
Commit: 92ff96359a
6 changed files with 377 additions and 512 deletions


@@ -35634,30 +35634,6 @@ WUFFS_CRC32__IEEE_TABLE[16][256] WUFFS_BASE__POTENTIALLY_UNUSED = {
},
};
static const uint8_t
WUFFS_CRC32__IEEE_X86_SSE42_K1K2[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
212u, 43u, 68u, 84u, 1u, 0u, 0u, 0u,
150u, 21u, 228u, 198u, 1u, 0u, 0u, 0u,
};
static const uint8_t
WUFFS_CRC32__IEEE_X86_SSE42_K3K4[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
208u, 151u, 25u, 117u, 1u, 0u, 0u, 0u,
158u, 0u, 170u, 204u, 0u, 0u, 0u, 0u,
};
static const uint8_t
WUFFS_CRC32__IEEE_X86_SSE42_K5ZZ[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
36u, 97u, 205u, 99u, 1u, 0u, 0u, 0u,
0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u,
};
static const uint8_t
WUFFS_CRC32__IEEE_X86_SSE42_PXMU[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
65u, 6u, 113u, 219u, 1u, 0u, 0u, 0u,
65u, 22u, 1u, 247u, 1u, 0u, 0u, 0u,
};
// ---------------- Private Initializer Prototypes
// ---------------- Private Function Prototypes
@@ -35682,14 +35658,6 @@ wuffs_crc32__ieee_hasher__up_arm_crc32(
wuffs_base__slice_u8 a_x);
#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
WUFFS_BASE__GENERATED_C_CODE
static wuffs_base__empty_struct
wuffs_crc32__ieee_hasher__up_x86_avx2(
wuffs_crc32__ieee_hasher* self,
wuffs_base__slice_u8 a_x);
#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
WUFFS_BASE__GENERATED_C_CODE
static wuffs_base__empty_struct
@@ -35845,9 +35813,6 @@ wuffs_crc32__ieee_hasher__update(
#if defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)
wuffs_base__cpu_arch__have_arm_crc32() ? &wuffs_crc32__ieee_hasher__up_arm_crc32 :
#endif
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
wuffs_base__cpu_arch__have_x86_avx2() ? &wuffs_crc32__ieee_hasher__up_x86_avx2 :
#endif
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
wuffs_base__cpu_arch__have_x86_sse42() ? &wuffs_crc32__ieee_hasher__up_x86_sse42 :
#endif
@@ -36073,129 +36038,6 @@ wuffs_crc32__ieee_hasher__up_arm_crc32(
#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)
// ‼ WUFFS MULTI-FILE SECTION -arm_crc32
// ‼ WUFFS MULTI-FILE SECTION +x86_avx2
// -------- func crc32.ieee_hasher.up_x86_avx2
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2")
WUFFS_BASE__GENERATED_C_CODE
static wuffs_base__empty_struct
wuffs_crc32__ieee_hasher__up_x86_avx2(
wuffs_crc32__ieee_hasher* self,
wuffs_base__slice_u8 a_x) {
uint32_t v_s = 0;
wuffs_base__slice_u8 v_p = {0};
__m128i v_k1k2 = {0};
__m128i v_k3k4 = {0};
__m128i v_k5zz = {0};
__m128i v_pxmu = {0};
__m128i v_x0 = {0};
__m128i v_x1 = {0};
__m128i v_x2 = {0};
__m128i v_x3 = {0};
__m128i v_y0 = {0};
__m128i v_y1 = {0};
__m128i v_y2 = {0};
__m128i v_y3 = {0};
uint64_t v_tail_index = 0;
v_s = (4294967295u ^ self->private_impl.f_state);
while ((((uint64_t)(a_x.len)) > 0u) && ((15u & ((uint32_t)(0xFFFu & (uintptr_t)(a_x.ptr)))) != 0u)) {
v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ a_x.ptr[0u]))] ^ (v_s >> 8u));
a_x = wuffs_base__slice_u8__subslice_i(a_x, 1u);
}
if (((uint64_t)(a_x.len)) < 64u) {
{
wuffs_base__slice_u8 i_slice_p = a_x;
v_p.ptr = i_slice_p.ptr;
v_p.len = 1;
const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(i_slice_p.ptr, i_slice_p.len);
while (v_p.ptr < i_end0_p) {
v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ v_p.ptr[0u]))] ^ (v_s >> 8u));
v_p.ptr += 1;
}
v_p.len = 0;
}
self->private_impl.f_state = (4294967295u ^ v_s);
return wuffs_base__make_empty_struct();
}
v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u));
v_x1 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u));
v_x2 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32u));
v_x3 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48u));
v_x0 = _mm_xor_si128(v_x0, _mm_cvtsi32_si128((int32_t)(v_s)));
v_k1k2 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K1K2));
{
wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, 64u);
v_p.ptr = i_slice_p.ptr;
v_p.len = 64;
const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(v_p.ptr, (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 64) * 64));
while (v_p.ptr < i_end0_p) {
v_y0 = _mm_clmulepi64_si128(v_x0, v_k1k2, (int32_t)(0u));
v_y1 = _mm_clmulepi64_si128(v_x1, v_k1k2, (int32_t)(0u));
v_y2 = _mm_clmulepi64_si128(v_x2, v_k1k2, (int32_t)(0u));
v_y3 = _mm_clmulepi64_si128(v_x3, v_k1k2, (int32_t)(0u));
v_x0 = _mm_clmulepi64_si128(v_x0, v_k1k2, (int32_t)(17u));
v_x1 = _mm_clmulepi64_si128(v_x1, v_k1k2, (int32_t)(17u));
v_x2 = _mm_clmulepi64_si128(v_x2, v_k1k2, (int32_t)(17u));
v_x3 = _mm_clmulepi64_si128(v_x3, v_k1k2, (int32_t)(17u));
v_x0 = _mm_xor_si128(_mm_xor_si128(v_x0, v_y0), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 0u)));
v_x1 = _mm_xor_si128(_mm_xor_si128(v_x1, v_y1), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 16u)));
v_x2 = _mm_xor_si128(_mm_xor_si128(v_x2, v_y2), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 32u)));
v_x3 = _mm_xor_si128(_mm_xor_si128(v_x3, v_y3), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 48u)));
v_p.ptr += 64;
}
v_p.len = 0;
}
v_k3k4 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K3K4));
v_y0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(0u));
v_x0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(17u));
v_x0 = _mm_xor_si128(v_x0, v_x1);
v_x0 = _mm_xor_si128(v_x0, v_y0);
v_y0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(0u));
v_x0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(17u));
v_x0 = _mm_xor_si128(v_x0, v_x2);
v_x0 = _mm_xor_si128(v_x0, v_y0);
v_y0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(0u));
v_x0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(17u));
v_x0 = _mm_xor_si128(v_x0, v_x3);
v_x0 = _mm_xor_si128(v_x0, v_y0);
v_x1 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(16u));
v_x2 = _mm_set_epi32((int32_t)(0u), (int32_t)(4294967295u), (int32_t)(0u), (int32_t)(4294967295u));
v_x0 = _mm_srli_si128(v_x0, (int32_t)(8u));
v_x0 = _mm_xor_si128(v_x0, v_x1);
v_k5zz = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K5ZZ));
v_x1 = _mm_srli_si128(v_x0, (int32_t)(4u));
v_x0 = _mm_and_si128(v_x0, v_x2);
v_x0 = _mm_clmulepi64_si128(v_x0, v_k5zz, (int32_t)(0u));
v_x0 = _mm_xor_si128(v_x0, v_x1);
v_pxmu = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_PXMU));
v_x1 = _mm_and_si128(v_x0, v_x2);
v_x1 = _mm_clmulepi64_si128(v_x1, v_pxmu, (int32_t)(16u));
v_x1 = _mm_and_si128(v_x1, v_x2);
v_x1 = _mm_clmulepi64_si128(v_x1, v_pxmu, (int32_t)(0u));
v_x1 = _mm_xor_si128(v_x1, v_x0);
v_s = ((uint32_t)(_mm_extract_epi32(v_x1, (int32_t)(1u))));
v_tail_index = (((uint64_t)(a_x.len)) & 18446744073709551552u);
if (v_tail_index < ((uint64_t)(a_x.len))) {
{
wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, v_tail_index);
v_p.ptr = i_slice_p.ptr;
v_p.len = 1;
const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(i_slice_p.ptr, i_slice_p.len);
while (v_p.ptr < i_end0_p) {
v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ v_p.ptr[0u]))] ^ (v_s >> 8u));
v_p.ptr += 1;
}
v_p.len = 0;
}
}
self->private_impl.f_state = (4294967295u ^ v_s);
return wuffs_base__make_empty_struct();
}
#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
// ‼ WUFFS MULTI-FILE SECTION -x86_avx2
// ‼ WUFFS MULTI-FILE SECTION +x86_sse42
// -------- func crc32.ieee_hasher.up_x86_sse42
@@ -36207,111 +36049,120 @@ wuffs_crc32__ieee_hasher__up_x86_sse42(
wuffs_crc32__ieee_hasher* self,
wuffs_base__slice_u8 a_x) {
uint32_t v_s = 0;
wuffs_base__slice_u8 v_p = {0};
__m128i v_k1k2 = {0};
__m128i v_k3k4 = {0};
__m128i v_k5zz = {0};
__m128i v_pxmu = {0};
__m128i v_kk = {0};
__m128i v_x0 = {0};
__m128i v_x1 = {0};
__m128i v_x2 = {0};
__m128i v_x3 = {0};
__m128i v_x4 = {0};
__m128i v_x5 = {0};
__m128i v_x6 = {0};
__m128i v_x7 = {0};
__m128i v_y0 = {0};
__m128i v_y1 = {0};
__m128i v_y2 = {0};
__m128i v_y3 = {0};
uint64_t v_tail_index = 0;
__m128i v_y4 = {0};
__m128i v_y5 = {0};
__m128i v_y6 = {0};
__m128i v_y7 = {0};
v_s = (4294967295u ^ self->private_impl.f_state);
while ((((uint64_t)(a_x.len)) > 0u) && ((15u & ((uint32_t)(0xFFFu & (uintptr_t)(a_x.ptr)))) != 0u)) {
v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ a_x.ptr[0u]))] ^ (v_s >> 8u));
a_x = wuffs_base__slice_u8__subslice_i(a_x, 1u);
}
if (((uint64_t)(a_x.len)) < 64u) {
{
wuffs_base__slice_u8 i_slice_p = a_x;
v_p.ptr = i_slice_p.ptr;
v_p.len = 1;
const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(i_slice_p.ptr, i_slice_p.len);
while (v_p.ptr < i_end0_p) {
v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ v_p.ptr[0u]))] ^ (v_s >> 8u));
v_p.ptr += 1;
}
v_p.len = 0;
if (((uint64_t)(a_x.len)) >= 128u) {
v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u));
v_x1 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u));
v_x2 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32u));
v_x3 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48u));
v_x4 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 64u));
v_x5 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 80u));
v_x6 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 96u));
v_x7 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 112u));
v_kk = _mm_set_epi32((int32_t)(0u), (int32_t)(2433674945u), (int32_t)(0u), (int32_t)(872412467u));
v_x0 = _mm_xor_si128(v_x0, _mm_cvtsi32_si128((int32_t)(v_s)));
a_x = wuffs_base__slice_u8__subslice_i(a_x, 128u);
while (((uint64_t)(a_x.len)) >= 128u) {
v_y0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(0u));
v_x0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(17u));
v_y1 = _mm_clmulepi64_si128(v_x1, v_kk, (int32_t)(0u));
v_x1 = _mm_clmulepi64_si128(v_x1, v_kk, (int32_t)(17u));
v_y2 = _mm_clmulepi64_si128(v_x2, v_kk, (int32_t)(0u));
v_x2 = _mm_clmulepi64_si128(v_x2, v_kk, (int32_t)(17u));
v_y3 = _mm_clmulepi64_si128(v_x3, v_kk, (int32_t)(0u));
v_x3 = _mm_clmulepi64_si128(v_x3, v_kk, (int32_t)(17u));
v_y4 = _mm_clmulepi64_si128(v_x4, v_kk, (int32_t)(0u));
v_x4 = _mm_clmulepi64_si128(v_x4, v_kk, (int32_t)(17u));
v_y5 = _mm_clmulepi64_si128(v_x5, v_kk, (int32_t)(0u));
v_x5 = _mm_clmulepi64_si128(v_x5, v_kk, (int32_t)(17u));
v_y6 = _mm_clmulepi64_si128(v_x6, v_kk, (int32_t)(0u));
v_x6 = _mm_clmulepi64_si128(v_x6, v_kk, (int32_t)(17u));
v_y7 = _mm_clmulepi64_si128(v_x7, v_kk, (int32_t)(0u));
v_x7 = _mm_clmulepi64_si128(v_x7, v_kk, (int32_t)(17u));
v_y0 = _mm_xor_si128(v_y0, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u)));
v_x0 = _mm_xor_si128(v_x0, v_y0);
v_y1 = _mm_xor_si128(v_y1, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u)));
v_x1 = _mm_xor_si128(v_x1, v_y1);
v_y2 = _mm_xor_si128(v_y2, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32u)));
v_x2 = _mm_xor_si128(v_x2, v_y2);
v_y3 = _mm_xor_si128(v_y3, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48u)));
v_x3 = _mm_xor_si128(v_x3, v_y3);
v_y4 = _mm_xor_si128(v_y4, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 64u)));
v_x4 = _mm_xor_si128(v_x4, v_y4);
v_y5 = _mm_xor_si128(v_y5, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 80u)));
v_x5 = _mm_xor_si128(v_x5, v_y5);
v_y6 = _mm_xor_si128(v_y6, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 96u)));
v_x6 = _mm_xor_si128(v_x6, v_y6);
v_y7 = _mm_xor_si128(v_y7, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 112u)));
v_x7 = _mm_xor_si128(v_x7, v_y7);
a_x = wuffs_base__slice_u8__subslice_i(a_x, 128u);
}
self->private_impl.f_state = (4294967295u ^ v_s);
return wuffs_base__make_empty_struct();
v_kk = _mm_set_epi32((int32_t)(0u), (int32_t)(3433693342u), (int32_t)(0u), (int32_t)(2926088593u));
v_y0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(0u));
v_x0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(17u));
v_y2 = _mm_clmulepi64_si128(v_x2, v_kk, (int32_t)(0u));
v_x2 = _mm_clmulepi64_si128(v_x2, v_kk, (int32_t)(17u));
v_y4 = _mm_clmulepi64_si128(v_x4, v_kk, (int32_t)(0u));
v_x4 = _mm_clmulepi64_si128(v_x4, v_kk, (int32_t)(17u));
v_y6 = _mm_clmulepi64_si128(v_x6, v_kk, (int32_t)(0u));
v_x6 = _mm_clmulepi64_si128(v_x6, v_kk, (int32_t)(17u));
v_y0 = _mm_xor_si128(v_y0, v_x1);
v_x0 = _mm_xor_si128(v_x0, v_y0);
v_y2 = _mm_xor_si128(v_y2, v_x3);
v_x2 = _mm_xor_si128(v_x2, v_y2);
v_y4 = _mm_xor_si128(v_y4, v_x5);
v_x4 = _mm_xor_si128(v_x4, v_y4);
v_y6 = _mm_xor_si128(v_y6, v_x7);
v_x6 = _mm_xor_si128(v_x6, v_y6);
v_kk = _mm_set_epi32((int32_t)(0u), (int32_t)(2166711591u), (int32_t)(0u), (int32_t)(4057597354u));
v_y0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(0u));
v_x0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(17u));
v_y4 = _mm_clmulepi64_si128(v_x4, v_kk, (int32_t)(0u));
v_x4 = _mm_clmulepi64_si128(v_x4, v_kk, (int32_t)(17u));
v_y0 = _mm_xor_si128(v_y0, v_x2);
v_x0 = _mm_xor_si128(v_x0, v_y0);
v_y4 = _mm_xor_si128(v_y4, v_x6);
v_x4 = _mm_xor_si128(v_x4, v_y4);
v_kk = _mm_set_epi32((int32_t)(0u), (int32_t)(496309207u), (int32_t)(0u), (int32_t)(2402626965u));
v_y0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(0u));
v_x0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(17u));
v_y0 = _mm_xor_si128(v_y0, v_x4);
v_x0 = _mm_xor_si128(v_x0, v_y0);
v_kk = _mm_set_epi32((int32_t)(1u), (int32_t)(3681617473u), (int32_t)(3034951717u), (int32_t)(4144043585u));
v_s = ((uint32_t)(_mm_extract_epi32(_mm_clmulepi64_si128(_mm_clmulepi64_si128(_mm_cvtsi64_si128((int64_t)(((uint64_t)(_mm_extract_epi64(v_x0, (int32_t)(0u)))))), v_kk, (int32_t)(0u)), v_kk, (int32_t)(16u)), (int32_t)(2u))));
v_kk = _mm_set_epi32((int32_t)(1u), (int32_t)(3681617473u), (int32_t)(3034951717u), (int32_t)(4144043585u));
v_s = ((uint32_t)(_mm_extract_epi32(_mm_clmulepi64_si128(_mm_clmulepi64_si128(_mm_cvtsi64_si128((int64_t)((((uint64_t)(_mm_extract_epi64(v_x0, (int32_t)(1u)))) ^ ((uint64_t)(v_s))))), v_kk, (int32_t)(0u)), v_kk, (int32_t)(16u)), (int32_t)(2u))));
}
v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u));
v_x1 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u));
v_x2 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32u));
v_x3 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48u));
v_x0 = _mm_xor_si128(v_x0, _mm_cvtsi32_si128((int32_t)(v_s)));
v_k1k2 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K1K2));
{
wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, 64u);
v_p.ptr = i_slice_p.ptr;
v_p.len = 64;
const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(v_p.ptr, (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 64) * 64));
while (v_p.ptr < i_end0_p) {
v_y0 = _mm_clmulepi64_si128(v_x0, v_k1k2, (int32_t)(0u));
v_y1 = _mm_clmulepi64_si128(v_x1, v_k1k2, (int32_t)(0u));
v_y2 = _mm_clmulepi64_si128(v_x2, v_k1k2, (int32_t)(0u));
v_y3 = _mm_clmulepi64_si128(v_x3, v_k1k2, (int32_t)(0u));
v_x0 = _mm_clmulepi64_si128(v_x0, v_k1k2, (int32_t)(17u));
v_x1 = _mm_clmulepi64_si128(v_x1, v_k1k2, (int32_t)(17u));
v_x2 = _mm_clmulepi64_si128(v_x2, v_k1k2, (int32_t)(17u));
v_x3 = _mm_clmulepi64_si128(v_x3, v_k1k2, (int32_t)(17u));
v_x0 = _mm_xor_si128(_mm_xor_si128(v_x0, v_y0), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 0u)));
v_x1 = _mm_xor_si128(_mm_xor_si128(v_x1, v_y1), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 16u)));
v_x2 = _mm_xor_si128(_mm_xor_si128(v_x2, v_y2), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 32u)));
v_x3 = _mm_xor_si128(_mm_xor_si128(v_x3, v_y3), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 48u)));
v_p.ptr += 64;
}
v_p.len = 0;
while (((uint64_t)(a_x.len)) >= 8u) {
v_kk = _mm_set_epi32((int32_t)(1u), (int32_t)(3681617473u), (int32_t)(3034951717u), (int32_t)(4144043585u));
v_s = ((uint32_t)(_mm_extract_epi32(_mm_clmulepi64_si128(_mm_clmulepi64_si128(_mm_cvtsi64_si128((int64_t)((wuffs_base__peek_u64le__no_bounds_check(a_x.ptr) ^ ((uint64_t)(v_s))))), v_kk, (int32_t)(0u)), v_kk, (int32_t)(16u)), (int32_t)(2u))));
a_x = wuffs_base__slice_u8__subslice_i(a_x, 8u);
}
v_k3k4 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K3K4));
v_y0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(0u));
v_x0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(17u));
v_x0 = _mm_xor_si128(v_x0, v_x1);
v_x0 = _mm_xor_si128(v_x0, v_y0);
v_y0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(0u));
v_x0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(17u));
v_x0 = _mm_xor_si128(v_x0, v_x2);
v_x0 = _mm_xor_si128(v_x0, v_y0);
v_y0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(0u));
v_x0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(17u));
v_x0 = _mm_xor_si128(v_x0, v_x3);
v_x0 = _mm_xor_si128(v_x0, v_y0);
v_x1 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(16u));
v_x2 = _mm_set_epi32((int32_t)(0u), (int32_t)(4294967295u), (int32_t)(0u), (int32_t)(4294967295u));
v_x0 = _mm_srli_si128(v_x0, (int32_t)(8u));
v_x0 = _mm_xor_si128(v_x0, v_x1);
v_k5zz = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K5ZZ));
v_x1 = _mm_srli_si128(v_x0, (int32_t)(4u));
v_x0 = _mm_and_si128(v_x0, v_x2);
v_x0 = _mm_clmulepi64_si128(v_x0, v_k5zz, (int32_t)(0u));
v_x0 = _mm_xor_si128(v_x0, v_x1);
v_pxmu = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_PXMU));
v_x1 = _mm_and_si128(v_x0, v_x2);
v_x1 = _mm_clmulepi64_si128(v_x1, v_pxmu, (int32_t)(16u));
v_x1 = _mm_and_si128(v_x1, v_x2);
v_x1 = _mm_clmulepi64_si128(v_x1, v_pxmu, (int32_t)(0u));
v_x1 = _mm_xor_si128(v_x1, v_x0);
v_s = ((uint32_t)(_mm_extract_epi32(v_x1, (int32_t)(1u))));
v_tail_index = (((uint64_t)(a_x.len)) & 18446744073709551552u);
if (v_tail_index < ((uint64_t)(a_x.len))) {
{
wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, v_tail_index);
v_p.ptr = i_slice_p.ptr;
v_p.len = 1;
const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(i_slice_p.ptr, i_slice_p.len);
while (v_p.ptr < i_end0_p) {
v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ v_p.ptr[0u]))] ^ (v_s >> 8u));
v_p.ptr += 1;
}
v_p.len = 0;
}
while (((uint64_t)(a_x.len)) > 0u) {
v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ a_x.ptr[0u]))] ^ (v_s >> 8u));
a_x = wuffs_base__slice_u8__subslice_i(a_x, 1u);
}
self->private_impl.f_state = (4294967295u ^ v_s);
return wuffs_base__make_empty_struct();
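
The hot-loop tail in the new generated code above (the "while (a_x.len >= 8)" part) folds one little-endian u64 of input into the CRC state with two carry-less multiplications. As a standalone restatement — a sketch only, with my own helper name crc32_fold_u64, assuming <immintrin.h> and compiler flags along the lines of -msse4.2 -mpclmul — it looks like this:

#include <immintrin.h>
#include <stdint.h>

// XOR the next 8 input bytes into the state, carry-less multiply twice by
// the same constants as v_kk above, then take 32 bits from lane 2.
static inline uint32_t crc32_fold_u64(uint32_t s, uint64_t word_le) {
  const __m128i kk = _mm_setr_epi32(
      (int32_t)0xF7011641, (int32_t)0xB4E5B025, (int32_t)0xDB710641, 1);
  __m128i a = _mm_cvtsi64_si128((int64_t)(word_le ^ (uint64_t)s));
  a = _mm_clmulepi64_si128(a, kk, 0x00);  // times kk's low  64 bits
  a = _mm_clmulepi64_si128(a, kk, 0x10);  // times kk's high 64 bits
  return (uint32_t)_mm_extract_epi32(a, 2);
}

The 128-byte main loop applies the same CLMUL-fold idea to eight 128-bit registers at a time, with different kk constants.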


@@ -0,0 +1,167 @@
// Copyright 2024 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//
// SPDX-License-Identifier: Apache-2.0 OR MIT
//go:build ignore
// +build ignore
package main
// print-crc32-x86-sse42-code.go prints the std/crc32 x86/SSE4.2 Wuffs code
// based on some C code generated by https://github.com/corsix/fast-crc32/
//
// Usage: go run print-crc32-x86-sse42-code.go
import (
"fmt"
"regexp"
"strconv"
"strings"
)
func main() {
var (
reXEqLoadu = regexp.MustCompile(`^__m128i x(\d+) = _mm_loadu_si128`)
reKEqSetr = regexp.MustCompile(`^k = _mm_setr_epi32\(([^,]+), ([^,]+), ([^,]+), ([^\)]+)\);$`)
reYEqClmul = regexp.MustCompile(`^y(\d+) = clmul_lo\(x(\d+), k\), x(\d+) = clmul_hi\(x(\d+), k\);$`)
reYEqXorLoadu = regexp.MustCompile(`^y(\d+) = _mm_xor_si128\(y(\d+), _mm_loadu_si128`)
reYEqXorYX = regexp.MustCompile(`^y(\d+) = _mm_xor_si128\(y(\d+), x(\d+)\), x(\d+) = _mm_xor_si128\(x(\d+), y(\d+)\);$`)
)
fmt.Println("// BEGIN script/print-crc32-x86-sse42-code.go generated code.")
for src := srcSSECRC32V8; src != ""; {
i := strings.IndexByte(src, '\n')
line := strings.TrimSpace(src[:i])
src = src[i+1:]
if (line == "") || strings.HasPrefix(line, "/*") {
continue
} else if s := reXEqLoadu.FindStringSubmatch(line); len(s) > 0 {
n, _ := strconv.Atoi(s[1])
fmt.Printf("x%d = util.make_m128i_slice128(a: args.x[0x%02X .. 0x%02X])\n", n, 16*(n), 16*(n+1))
} else if line == "__m128i k;" {
continue
} else if s := reKEqSetr.FindStringSubmatch(line); len(s) > 0 {
fmt.Printf("kk = util.make_m128i_multiple_u32(a00: %s, a01: %s, a02: %s, a03: %s)\n", s[1], s[2], s[3], s[4])
} else if line == "x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);" {
fmt.Printf("x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))\n")
} else if line == "buf += 128;" {
fmt.Printf("args.x = args.x[128 ..]\n")
} else if line == "len -= 128;" {
continue
} else if line == "while (len >= 128) {" {
fmt.Printf("while args.x.length() >= 128 {\n")
} else if line == "}" {
fmt.Printf("} endwhile\n")
} else if s := reYEqClmul.FindStringSubmatch(line); len(s) > 0 {
fmt.Printf("y%s = x%s._mm_clmulepi64_si128(b: kk, imm8: 0x00)\n", s[1], s[2])
fmt.Printf("x%s = x%s._mm_clmulepi64_si128(b: kk, imm8: 0x11)\n", s[3], s[4])
} else if s := reYEqXorLoadu.FindStringSubmatch(line); len(s) > 0 {
n, _ := strconv.Atoi(s[1])
fmt.Printf("y%d = y%d._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x%02X .. 0x%02X]))\n", n, n, 16*(n), 16*(n+1))
fmt.Printf("x%d = x%d._mm_xor_si128(b: y%d)\n", n, n, n)
} else if s := reYEqXorYX.FindStringSubmatch(line); len(s) > 0 {
fmt.Printf("y%s = y%s._mm_xor_si128(b: x%s)\n", s[1], s[2], s[3])
fmt.Printf("x%s = x%s._mm_xor_si128(b: y%s)\n", s[4], s[5], s[6])
} else if line == "crc0 = crc_u64(0, _mm_extract_epi64(x0, 0));" {
fmt.Printf("kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)\n")
fmt.Printf("s = util.make_m128i_single_u64(a: x0._mm_extract_epi64(imm8: 0)).\n")
fmt.Printf(" _mm_clmulepi64_si128(b: kk, imm8: 0x00).\n")
fmt.Printf(" _mm_clmulepi64_si128(b: kk, imm8: 0x10).\n")
fmt.Printf(" _mm_extract_epi32(imm8: 2)\n")
// fmt.Printf("s = util.make_m128i_single_u64(a: (s as base.u64) ^ args.x.peek_u64le()).\n")
} else if line == "crc0 = crc_u64(crc0, _mm_extract_epi64(x0, 1));" {
fmt.Printf("kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)\n")
fmt.Printf("s = util.make_m128i_single_u64(a: x0._mm_extract_epi64(imm8: 1) ^ (s as base.u64)).\n")
fmt.Printf(" _mm_clmulepi64_si128(b: kk, imm8: 0x00).\n")
fmt.Printf(" _mm_clmulepi64_si128(b: kk, imm8: 0x10).\n")
fmt.Printf(" _mm_extract_epi32(imm8: 2)\n")
} else {
fmt.Printf("// Could not process %q.\n", line)
break
}
}
fmt.Println("// END script/print-crc32-x86-sse42-code.go generated code.")
}
// This is the core (inside "if (len >= 128)") of the code produced by
// generate.c in https://github.com/corsix/fast-crc32/ when parameterized by
// "./generate -i sse -p crc32 -a v8".
const srcSSECRC32V8 = `
/* First vector chunk. */
__m128i x0 = _mm_loadu_si128((const __m128i*)buf), y0;
__m128i x1 = _mm_loadu_si128((const __m128i*)(buf + 16)), y1;
__m128i x2 = _mm_loadu_si128((const __m128i*)(buf + 32)), y2;
__m128i x3 = _mm_loadu_si128((const __m128i*)(buf + 48)), y3;
__m128i x4 = _mm_loadu_si128((const __m128i*)(buf + 64)), y4;
__m128i x5 = _mm_loadu_si128((const __m128i*)(buf + 80)), y5;
__m128i x6 = _mm_loadu_si128((const __m128i*)(buf + 96)), y6;
__m128i x7 = _mm_loadu_si128((const __m128i*)(buf + 112)), y7;
__m128i k;
k = _mm_setr_epi32(0x33fff533, 0, 0x910eeec1, 0);
x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);
buf += 128;
len -= 128;
/* Main loop. */
while (len >= 128) {
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
y5 = clmul_lo(x5, k), x5 = clmul_hi(x5, k);
y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
y7 = clmul_lo(x7, k), x7 = clmul_hi(x7, k);
y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i*)buf)), x0 = _mm_xor_si128(x0, y0);
y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i*)(buf + 16))), x1 = _mm_xor_si128(x1, y1);
y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i*)(buf + 32))), x2 = _mm_xor_si128(x2, y2);
y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i*)(buf + 48))), x3 = _mm_xor_si128(x3, y3);
y4 = _mm_xor_si128(y4, _mm_loadu_si128((const __m128i*)(buf + 64))), x4 = _mm_xor_si128(x4, y4);
y5 = _mm_xor_si128(y5, _mm_loadu_si128((const __m128i*)(buf + 80))), x5 = _mm_xor_si128(x5, y5);
y6 = _mm_xor_si128(y6, _mm_loadu_si128((const __m128i*)(buf + 96))), x6 = _mm_xor_si128(x6, y6);
y7 = _mm_xor_si128(y7, _mm_loadu_si128((const __m128i*)(buf + 112))), x7 = _mm_xor_si128(x7, y7);
buf += 128;
len -= 128;
}
/* Reduce x0 ... x7 to just x0. */
k = _mm_setr_epi32(0xae689191, 0, 0xccaa009e, 0);
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0);
y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2);
y4 = _mm_xor_si128(y4, x5), x4 = _mm_xor_si128(x4, y4);
y6 = _mm_xor_si128(y6, x7), x6 = _mm_xor_si128(x6, y6);
k = _mm_setr_epi32(0xf1da05aa, 0, 0x81256527, 0);
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0);
y4 = _mm_xor_si128(y4, x6), x4 = _mm_xor_si128(x4, y4);
k = _mm_setr_epi32(0x8f352d95, 0, 0x1d9513d7, 0);
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y0 = _mm_xor_si128(y0, x4), x0 = _mm_xor_si128(x0, y0);
/* Reduce 128 bits to 32 bits, and multiply by x^32. */
crc0 = crc_u64(0, _mm_extract_epi64(x0, 0));
crc0 = crc_u64(crc0, _mm_extract_epi64(x0, 1));
`
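
The fragment above is just the core of generate.c's output, so it leans on helpers that are not reproduced here. Going by how main() translates them (imm8 0x00 for clmul_lo, 0x11 for clmul_hi), those two amount to something like the following sketch (not a verbatim copy of fast-crc32):

#include <immintrin.h>

// Carry-less multiply of the low (resp. high) 64-bit halves of a and b.
static inline __m128i clmul_lo(__m128i a, __m128i b) {
  return _mm_clmulepi64_si128(a, b, 0x00);
}
static inline __m128i clmul_hi(__m128i a, __m128i b) {
  return _mm_clmulepi64_si128(a, b, 0x11);
}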


@@ -13,6 +13,8 @@
package main
// This program is obsolete.
//
// print-crc32-x86-sse42-magic-numbers.go prints the std/crc32
// IEEE_X86_SSE42_ETC magic number tables.
//


@@ -28,7 +28,6 @@ pub func ieee_hasher.update!(x: roslice base.u8) {
if this.state == 0 {
choose up = [
up_arm_crc32,
up_x86_avx2,
up_x86_sse42]
}
this.up!(x: args.x)


@@ -1,137 +0,0 @@
// Copyright 2021 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//
// SPDX-License-Identifier: Apache-2.0 OR MIT
// --------
// See "SIMD Implementations" in README.md for a link to Gopal et al. "Fast CRC
// Computation for Generic Polynomials Using PCLMULQDQ Instruction".
// up_x86_avx2 is exactly the same as up_x86_sse42 except for the "choose
// cpu_arch >= x86_avx2". With AVX, PCLMULQDQ has a three-operand form, not
// just a two-operand form: https://www.felixcloutier.com/x86/pclmulqdq
pri func ieee_hasher.up_x86_avx2!(x: roslice base.u8),
choose cpu_arch >= x86_avx2,
{
var s : base.u32
var p : roslice base.u8
var util : base.x86_sse42_utility
var k1k2 : base.x86_m128i
var k3k4 : base.x86_m128i
var k5zz : base.x86_m128i
var pxmu : base.x86_m128i
var x0 : base.x86_m128i
var x1 : base.x86_m128i
var x2 : base.x86_m128i
var x3 : base.x86_m128i
var y0 : base.x86_m128i
var y1 : base.x86_m128i
var y2 : base.x86_m128i
var y3 : base.x86_m128i
var tail_index : base.u64
s = 0xFFFF_FFFF ^ this.state
// Align to a 16-byte boundary.
while (args.x.length() > 0) and ((15 & args.x.uintptr_low_12_bits()) <> 0) {
s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ args.x[0]] ^ (s >> 8)
args.x = args.x[1 ..]
} endwhile
// For short inputs, just do a simple loop.
if args.x.length() < 64 {
iterate (p = args.x)(length: 1, advance: 1, unroll: 1) {
s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
}
this.state = 0xFFFF_FFFF ^ s
return nothing
}
// Load 128×4 = 512 bits from the first 64-byte chunk.
x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
x1 = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
x2 = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
x3 = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])
// Combine with the initial state.
x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))
// Process the remaining 64-byte chunks.
k1k2 = util.make_m128i_slice128(a: IEEE_X86_SSE42_K1K2[.. 16])
iterate (p = args.x[64 ..])(length: 64, advance: 64, unroll: 1) {
y0 = x0._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
y1 = x1._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
y2 = x2._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
y3 = x3._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
x0 = x0._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
x1 = x1._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
x2 = x2._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
x3 = x3._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
x0 = x0._mm_xor_si128(b: y0)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x00 .. 0x10]))
x1 = x1._mm_xor_si128(b: y1)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x10 .. 0x20]))
x2 = x2._mm_xor_si128(b: y2)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x20 .. 0x30]))
x3 = x3._mm_xor_si128(b: y3)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x30 .. 0x40]))
}
// Reduce 128×4 = 512 bits to 128 bits.
k3k4 = util.make_m128i_slice128(a: IEEE_X86_SSE42_K3K4[.. 16])
y0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
x0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
x0 = x0._mm_xor_si128(b: x1)
x0 = x0._mm_xor_si128(b: y0)
y0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
x0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
x0 = x0._mm_xor_si128(b: x2)
x0 = x0._mm_xor_si128(b: y0)
y0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
x0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
x0 = x0._mm_xor_si128(b: x3)
x0 = x0._mm_xor_si128(b: y0)
// Reduce 128 bits to 64 bits.
x1 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x10)
x2 = util.make_m128i_multiple_u32(
a00: 0xFFFF_FFFF,
a01: 0x0000_0000,
a02: 0xFFFF_FFFF,
a03: 0x0000_0000)
x0 = x0._mm_srli_si128(imm8: 8)
x0 = x0._mm_xor_si128(b: x1)
k5zz = util.make_m128i_slice128(a: IEEE_X86_SSE42_K5ZZ[.. 16])
x1 = x0._mm_srli_si128(imm8: 4)
x0 = x0._mm_and_si128(b: x2)
x0 = x0._mm_clmulepi64_si128(b: k5zz, imm8: 0x00)
x0 = x0._mm_xor_si128(b: x1)
// Reduce 64 bits to 32 bits (Barrett Reduction) and extract.
//
// Barrett Reduction is Algorithm 1 (page 14) of Gopal et al., after
// adjusting for bit-reflection as per Figure 12 (page 21).
pxmu = util.make_m128i_slice128(a: IEEE_X86_SSE42_PXMU[.. 16])
x1 = x0._mm_and_si128(b: x2)
x1 = x1._mm_clmulepi64_si128(b: pxmu, imm8: 0x10)
x1 = x1._mm_and_si128(b: x2)
x1 = x1._mm_clmulepi64_si128(b: pxmu, imm8: 0x00)
x1 = x1._mm_xor_si128(b: x0)
s = x1._mm_extract_epi32(imm8: 1)
// Handle the tail of args.x that wasn't a complete 64-byte chunk.
tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFC0 // And-not 64.
if tail_index < args.x.length() {
iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) {
s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
}
}
this.state = 0xFFFF_FFFF ^ s
}


@@ -8,32 +8,29 @@
//
// SPDX-License-Identifier: Apache-2.0 OR MIT
// --------
// See "SIMD Implementations" in README.md for a link to Gopal et al. "Fast CRC
// Computation for Generic Polynomials Using PCLMULQDQ Instruction".
pri func ieee_hasher.up_x86_sse42!(x: roslice base.u8),
choose cpu_arch >= x86_sse42,
{
var s : base.u32
var p : roslice base.u8
var util : base.x86_sse42_utility
var k1k2 : base.x86_m128i
var k3k4 : base.x86_m128i
var k5zz : base.x86_m128i
var pxmu : base.x86_m128i
var kk : base.x86_m128i
var x0 : base.x86_m128i
var x1 : base.x86_m128i
var x2 : base.x86_m128i
var x3 : base.x86_m128i
var x4 : base.x86_m128i
var x5 : base.x86_m128i
var x6 : base.x86_m128i
var x7 : base.x86_m128i
var y0 : base.x86_m128i
var y1 : base.x86_m128i
var y2 : base.x86_m128i
var y3 : base.x86_m128i
var tail_index : base.u64
var y4 : base.x86_m128i
var y5 : base.x86_m128i
var y6 : base.x86_m128i
var y7 : base.x86_m128i
s = 0xFFFF_FFFF ^ this.state
@@ -43,125 +40,111 @@ pri func ieee_hasher.up_x86_sse42!(x: roslice base.u8),
args.x = args.x[1 ..]
} endwhile
// For short inputs, just do a simple loop.
if args.x.length() < 64 {
iterate (p = args.x)(length: 1, advance: 1, unroll: 1) {
s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
}
this.state = 0xFFFF_FFFF ^ s
return nothing
if args.x.length() >= 128 {
// BEGIN script/print-crc32-x86-sse42-code.go generated code.
x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
x1 = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
x2 = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
x3 = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])
x4 = util.make_m128i_slice128(a: args.x[0x40 .. 0x50])
x5 = util.make_m128i_slice128(a: args.x[0x50 .. 0x60])
x6 = util.make_m128i_slice128(a: args.x[0x60 .. 0x70])
x7 = util.make_m128i_slice128(a: args.x[0x70 .. 0x80])
kk = util.make_m128i_multiple_u32(a00: 0x33FF_F533, a01: 0, a02: 0x910E_EEC1, a03: 0)
x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))
args.x = args.x[128 ..]
while args.x.length() >= 128 {
y0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y1 = x1._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x1 = x1._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y2 = x2._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x2 = x2._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y3 = x3._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x3 = x3._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y5 = x5._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x5 = x5._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y6 = x6._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x6 = x6._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y7 = x7._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x7 = x7._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y0 = y0._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x00 .. 0x10]))
x0 = x0._mm_xor_si128(b: y0)
y1 = y1._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x10 .. 0x20]))
x1 = x1._mm_xor_si128(b: y1)
y2 = y2._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x20 .. 0x30]))
x2 = x2._mm_xor_si128(b: y2)
y3 = y3._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x30 .. 0x40]))
x3 = x3._mm_xor_si128(b: y3)
y4 = y4._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x40 .. 0x50]))
x4 = x4._mm_xor_si128(b: y4)
y5 = y5._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x50 .. 0x60]))
x5 = x5._mm_xor_si128(b: y5)
y6 = y6._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x60 .. 0x70]))
x6 = x6._mm_xor_si128(b: y6)
y7 = y7._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x70 .. 0x80]))
x7 = x7._mm_xor_si128(b: y7)
args.x = args.x[128 ..]
} endwhile
kk = util.make_m128i_multiple_u32(a00: 0xAE68_9191, a01: 0, a02: 0xCCAA_009E, a03: 0)
y0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y2 = x2._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x2 = x2._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y6 = x6._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x6 = x6._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y0 = y0._mm_xor_si128(b: x1)
x0 = x0._mm_xor_si128(b: y0)
y2 = y2._mm_xor_si128(b: x3)
x2 = x2._mm_xor_si128(b: y2)
y4 = y4._mm_xor_si128(b: x5)
x4 = x4._mm_xor_si128(b: y4)
y6 = y6._mm_xor_si128(b: x7)
x6 = x6._mm_xor_si128(b: y6)
kk = util.make_m128i_multiple_u32(a00: 0xF1DA_05AA, a01: 0, a02: 0x8125_6527, a03: 0)
y0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y0 = y0._mm_xor_si128(b: x2)
x0 = x0._mm_xor_si128(b: y0)
y4 = y4._mm_xor_si128(b: x6)
x4 = x4._mm_xor_si128(b: y4)
kk = util.make_m128i_multiple_u32(a00: 0x8F35_2D95, a01: 0, a02: 0x1D95_13D7, a03: 0)
y0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y0 = y0._mm_xor_si128(b: x4)
x0 = x0._mm_xor_si128(b: y0)
kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)
s = util.make_m128i_single_u64(a: x0._mm_extract_epi64(imm8: 0)).
_mm_clmulepi64_si128(b: kk, imm8: 0x00).
_mm_clmulepi64_si128(b: kk, imm8: 0x10).
_mm_extract_epi32(imm8: 2)
kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)
s = util.make_m128i_single_u64(a: x0._mm_extract_epi64(imm8: 1) ^ (s as base.u64)).
_mm_clmulepi64_si128(b: kk, imm8: 0x00).
_mm_clmulepi64_si128(b: kk, imm8: 0x10).
_mm_extract_epi32(imm8: 2)
// END script/print-crc32-x86-sse42-code.go generated code.
}
// Load 128×4 = 512 bits from the first 64-byte chunk.
x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
x1 = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
x2 = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
x3 = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])
while args.x.length() >= 8 {
kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)
s = util.make_m128i_single_u64(a: args.x.peek_u64le() ^ (s as base.u64)).
_mm_clmulepi64_si128(b: kk, imm8: 0x00).
_mm_clmulepi64_si128(b: kk, imm8: 0x10).
_mm_extract_epi32(imm8: 2)
args.x = args.x[8 ..]
} endwhile
// Combine with the initial state.
x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))
// Process the remaining 64-byte chunks.
k1k2 = util.make_m128i_slice128(a: IEEE_X86_SSE42_K1K2[.. 16])
iterate (p = args.x[64 ..])(length: 64, advance: 64, unroll: 1) {
y0 = x0._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
y1 = x1._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
y2 = x2._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
y3 = x3._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
x0 = x0._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
x1 = x1._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
x2 = x2._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
x3 = x3._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
x0 = x0._mm_xor_si128(b: y0)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x00 .. 0x10]))
x1 = x1._mm_xor_si128(b: y1)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x10 .. 0x20]))
x2 = x2._mm_xor_si128(b: y2)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x20 .. 0x30]))
x3 = x3._mm_xor_si128(b: y3)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x30 .. 0x40]))
}
// Reduce 128×4 = 512 bits to 128 bits.
k3k4 = util.make_m128i_slice128(a: IEEE_X86_SSE42_K3K4[.. 16])
y0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
x0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
x0 = x0._mm_xor_si128(b: x1)
x0 = x0._mm_xor_si128(b: y0)
y0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
x0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
x0 = x0._mm_xor_si128(b: x2)
x0 = x0._mm_xor_si128(b: y0)
y0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
x0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
x0 = x0._mm_xor_si128(b: x3)
x0 = x0._mm_xor_si128(b: y0)
// Reduce 128 bits to 64 bits.
x1 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x10)
x2 = util.make_m128i_multiple_u32(
a00: 0xFFFF_FFFF,
a01: 0x0000_0000,
a02: 0xFFFF_FFFF,
a03: 0x0000_0000)
x0 = x0._mm_srli_si128(imm8: 8)
x0 = x0._mm_xor_si128(b: x1)
k5zz = util.make_m128i_slice128(a: IEEE_X86_SSE42_K5ZZ[.. 16])
x1 = x0._mm_srli_si128(imm8: 4)
x0 = x0._mm_and_si128(b: x2)
x0 = x0._mm_clmulepi64_si128(b: k5zz, imm8: 0x00)
x0 = x0._mm_xor_si128(b: x1)
// Reduce 64 bits to 32 bits (Barrett Reduction) and extract.
//
// Barrett Reduction is Algorithm 1 (page 14) of Gopal et al., after
// adjusting for bit-reflection as per Figure 12 (page 21).
pxmu = util.make_m128i_slice128(a: IEEE_X86_SSE42_PXMU[.. 16])
x1 = x0._mm_and_si128(b: x2)
x1 = x1._mm_clmulepi64_si128(b: pxmu, imm8: 0x10)
x1 = x1._mm_and_si128(b: x2)
x1 = x1._mm_clmulepi64_si128(b: pxmu, imm8: 0x00)
x1 = x1._mm_xor_si128(b: x0)
s = x1._mm_extract_epi32(imm8: 1)
// Handle the tail of args.x that wasn't a complete 64-byte chunk.
tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFC0 // And-not 64.
if tail_index < args.x.length() {
iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) {
s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
}
}
while args.x.length() > 0 {
s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ args.x[0]] ^ (s >> 8)
args.x = args.x[1 ..]
} endwhile
this.state = 0xFFFF_FFFF ^ s
}
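
For reference, the Barrett step that the pxmu lines above implement is, in the usual non-bit-reflected statement of Gopal et al.'s Algorithm 1 — my paraphrase, with R(x) the 64-bit remainder, P(x) the degree-32 CRC polynomial and \mu = \lfloor x^{64} / P(x) \rfloor:

    T_1(x) = \lfloor R(x) / x^{32} \rfloor \cdot \mu(x)
    T_2(x) = \lfloor T_1(x) / x^{32} \rfloor \cdot P(x)
    C(x)   = ( R(x) \oplus T_2(x) ) \bmod x^{32}

The _mm_and_si128 masks and the imm8: 0x10 / 0x00 multiplies above correspond to the two floor-divide-and-multiply steps, adjusted for bit-reflection as the comment notes.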
// These constants come from page 22 of Gopal et al. They are also reproduced
// by script/print-crc32-x86-sse42-magic-numbers.go which is runnable online at
// https://play.golang.org/p/wH1q6GfhKOE
//
// The k6' constant from the Gopal paper is unused.
//
// The rkN names match the numbers at
// https://github.com/intel/isa-l/blob/7b30857e20b84e5afab1a28291189b9dc571110d/crc/crc32_gzip_refl_by16_10.asm#L475-L499
//
// The "+§" means a harmless off-by-one difference compared to Intel's numbers.
// https://danlark.org/2021/03/08/how-a-bug-in-the-linux-crc-32-checksum-turned-out-not-to-be-a-bug/
// https://github.com/google/wuffs/commit/b24e046670396d7ef22ccf499051340b9288419b
pri const IEEE_X86_SSE42_K1K2 : roarray[16] base.u8 = [
0xD4, 0x2B, 0x44, 0x54, 0x01, 0x00, 0x00, 0x00, // k1' = 0x1_5444_2BD4 = rk16
0x96, 0x15, 0xE4, 0xC6, 0x01, 0x00, 0x00, 0x00, // k2' = 0x1_C6E4_1596 = rk15
]
pri const IEEE_X86_SSE42_K3K4 : roarray[16] base.u8 = [
0xD0, 0x97, 0x19, 0x75, 0x01, 0x00, 0x00, 0x00, // k3' = 0x1_7519_97D0 = rk2
0x9E, 0x00, 0xAA, 0xCC, 0x00, 0x00, 0x00, 0x00, // k4' = 0x0_CCAA_009E = rk1
]
pri const IEEE_X86_SSE42_K5ZZ : roarray[16] base.u8 = [
0x24, 0x61, 0xCD, 0x63, 0x01, 0x00, 0x00, 0x00, // k5' = 0x1_63CD_6124 = rk6
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Unused
]
pri const IEEE_X86_SSE42_PXMU : roarray[16] base.u8 = [
0x41, 0x06, 0x71, 0xDB, 0x01, 0x00, 0x00, 0x00, // Px' = 0x1_DB71_0641 = rk8+§
0x41, 0x16, 0x01, 0xF7, 0x01, 0x00, 0x00, 0x00, // μ' = 0x1_F701_1641 = rk7+§
]
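
A quick way to sanity-check the byte layout against the k-constant comments is to read each 8-byte half as a little-endian u64. A throwaway C sketch (everything below is illustrative, not part of the Wuffs sources):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  // Same bytes as IEEE_X86_SSE42_K1K2 above.
  const uint8_t k1k2[16] = {0xD4, 0x2B, 0x44, 0x54, 0x01, 0, 0, 0,
                            0x96, 0x15, 0xE4, 0xC6, 0x01, 0, 0, 0};
  uint64_t k1, k2;
  memcpy(&k1, k1k2 + 0, 8);  // assumes a little-endian host
  memcpy(&k2, k1k2 + 8, 8);
  // Should print 0x154442BD4 and 0x1C6E41596, matching k1' and k2'.
  printf("k1' = 0x%" PRIX64 ", k2' = 0x%" PRIX64 "\n", k1, k2);
  return 0;
}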