Mirror of https://github.com/google/wuffs.git, synced 2026-01-18 17:11:32 +01:00
std/crc32: re-write x86_sse42 implementation
name                                                       old speed      new speed      delta
wuffs_crc32_ieee_10k/clang14                               13.5GB/s ± 0%  21.9GB/s ± 0%  +62.09%  (p=0.016 n=5+4)
wuffs_crc32_ieee_100k/clang14                              22.4GB/s ± 9%  29.7GB/s ± 0%  +32.76%  (p=0.016 n=5+4)
wuffs_crc32_ieee_10k/gcc12                                 14.2GB/s ± 2%  22.2GB/s ± 0%  +56.92%  (p=0.008 n=5+5)
wuffs_crc32_ieee_100k/gcc12                                21.3GB/s ± 3%  29.6GB/s ± 1%  +39.18%  (p=0.008 n=5+5)
wuffs_gzip_decode_10k/clang14                               366MB/s ± 0%   372MB/s ± 0%   +1.41%  (p=0.008 n=5+5)
wuffs_gzip_decode_100k/clang14                              482MB/s ± 0%   494MB/s ± 0%   +2.57%  (p=0.008 n=5+5)
wuffs_gzip_decode_10k/gcc12                                 398MB/s ± 0%   418MB/s ± 0%   +5.19%  (p=0.008 n=5+5)
wuffs_gzip_decode_100k/gcc12                                510MB/s ± 0%   537MB/s ± 0%   +5.21%  (p=0.008 n=5+5)
wuffs_png_decode_image_19k_8bpp/clang14                     263MB/s ± 0%   264MB/s ± 0%   +0.39%  (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/clang14                    297MB/s ± 0%   298MB/s ± 0%   +0.16%  (p=0.008 n=5+5)
wuffs_png_decode_image_77k_8bpp/clang14                     932MB/s ± 0%   945MB/s ± 0%   +1.33%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang14   831MB/s ± 0%   833MB/s ± 0%   +0.25%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang14   799MB/s ± 0%   802MB/s ± 0%   +0.37%  (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/clang14                  306MB/s ± 0%   307MB/s ± 0%   +0.56%  (p=0.008 n=5+5)
wuffs_png_decode_image_19k_8bpp/gcc12                       283MB/s ± 0%   270MB/s ± 0%   -4.84%  (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/gcc12                      330MB/s ± 0%   329MB/s ± 0%   -0.29%  (p=0.016 n=5+4)
wuffs_png_decode_image_77k_8bpp/gcc12                       992MB/s ± 0%   957MB/s ± 0%   -3.56%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc12     920MB/s ± 0%   908MB/s ± 0%   -1.30%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc12     875MB/s ± 0%   867MB/s ± 0%   -0.95%  (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/gcc12                    341MB/s ± 0%   342MB/s ± 0%   +0.25%  (p=0.008 n=5+5)
@@ -35634,30 +35634,6 @@ WUFFS_CRC32__IEEE_TABLE[16][256] WUFFS_BASE__POTENTIALLY_UNUSED = {
  },
};

static const uint8_t
WUFFS_CRC32__IEEE_X86_SSE42_K1K2[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
    212u, 43u, 68u, 84u, 1u, 0u, 0u, 0u,
    150u, 21u, 228u, 198u, 1u, 0u, 0u, 0u,
};

static const uint8_t
WUFFS_CRC32__IEEE_X86_SSE42_K3K4[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
    208u, 151u, 25u, 117u, 1u, 0u, 0u, 0u,
    158u, 0u, 170u, 204u, 0u, 0u, 0u, 0u,
};

static const uint8_t
WUFFS_CRC32__IEEE_X86_SSE42_K5ZZ[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
    36u, 97u, 205u, 99u, 1u, 0u, 0u, 0u,
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u,
};

static const uint8_t
WUFFS_CRC32__IEEE_X86_SSE42_PXMU[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
    65u, 6u, 113u, 219u, 1u, 0u, 0u, 0u,
    65u, 22u, 1u, 247u, 1u, 0u, 0u, 0u,
};

// ---------------- Private Initializer Prototypes

// ---------------- Private Function Prototypes

@@ -35682,14 +35658,6 @@ wuffs_crc32__ieee_hasher__up_arm_crc32(
    wuffs_base__slice_u8 a_x);
#endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)

#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
WUFFS_BASE__GENERATED_C_CODE
static wuffs_base__empty_struct
wuffs_crc32__ieee_hasher__up_x86_avx2(
    wuffs_crc32__ieee_hasher* self,
    wuffs_base__slice_u8 a_x);
#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)

#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
WUFFS_BASE__GENERATED_C_CODE
static wuffs_base__empty_struct

@@ -35845,9 +35813,6 @@ wuffs_crc32__ieee_hasher__update(
#if defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)
        wuffs_base__cpu_arch__have_arm_crc32() ? &wuffs_crc32__ieee_hasher__up_arm_crc32 :
#endif
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
        wuffs_base__cpu_arch__have_x86_avx2() ? &wuffs_crc32__ieee_hasher__up_x86_avx2 :
#endif
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
        wuffs_base__cpu_arch__have_x86_sse42() ? &wuffs_crc32__ieee_hasher__up_x86_sse42 :
#endif

@@ -36073,129 +36038,6 @@ wuffs_crc32__ieee_hasher__up_arm_crc32(
#endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)
// ‼ WUFFS MULTI-FILE SECTION -arm_crc32

// ‼ WUFFS MULTI-FILE SECTION +x86_avx2
// -------- func crc32.ieee_hasher.up_x86_avx2

#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2")
WUFFS_BASE__GENERATED_C_CODE
static wuffs_base__empty_struct
wuffs_crc32__ieee_hasher__up_x86_avx2(
    wuffs_crc32__ieee_hasher* self,
    wuffs_base__slice_u8 a_x) {
  uint32_t v_s = 0;
  wuffs_base__slice_u8 v_p = {0};
  __m128i v_k1k2 = {0};
  __m128i v_k3k4 = {0};
  __m128i v_k5zz = {0};
  __m128i v_pxmu = {0};
  __m128i v_x0 = {0};
  __m128i v_x1 = {0};
  __m128i v_x2 = {0};
  __m128i v_x3 = {0};
  __m128i v_y0 = {0};
  __m128i v_y1 = {0};
  __m128i v_y2 = {0};
  __m128i v_y3 = {0};
  uint64_t v_tail_index = 0;

  v_s = (4294967295u ^ self->private_impl.f_state);
  while ((((uint64_t)(a_x.len)) > 0u) && ((15u & ((uint32_t)(0xFFFu & (uintptr_t)(a_x.ptr)))) != 0u)) {
    v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ a_x.ptr[0u]))] ^ (v_s >> 8u));
    a_x = wuffs_base__slice_u8__subslice_i(a_x, 1u);
  }
  if (((uint64_t)(a_x.len)) < 64u) {
    {
      wuffs_base__slice_u8 i_slice_p = a_x;
      v_p.ptr = i_slice_p.ptr;
      v_p.len = 1;
      const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(i_slice_p.ptr, i_slice_p.len);
      while (v_p.ptr < i_end0_p) {
        v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ v_p.ptr[0u]))] ^ (v_s >> 8u));
        v_p.ptr += 1;
      }
      v_p.len = 0;
    }
    self->private_impl.f_state = (4294967295u ^ v_s);
    return wuffs_base__make_empty_struct();
  }
  v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u));
  v_x1 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u));
  v_x2 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32u));
  v_x3 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48u));
  v_x0 = _mm_xor_si128(v_x0, _mm_cvtsi32_si128((int32_t)(v_s)));
  v_k1k2 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K1K2));
  {
    wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, 64u);
    v_p.ptr = i_slice_p.ptr;
    v_p.len = 64;
    const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(v_p.ptr, (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 64) * 64));
    while (v_p.ptr < i_end0_p) {
      v_y0 = _mm_clmulepi64_si128(v_x0, v_k1k2, (int32_t)(0u));
      v_y1 = _mm_clmulepi64_si128(v_x1, v_k1k2, (int32_t)(0u));
      v_y2 = _mm_clmulepi64_si128(v_x2, v_k1k2, (int32_t)(0u));
      v_y3 = _mm_clmulepi64_si128(v_x3, v_k1k2, (int32_t)(0u));
      v_x0 = _mm_clmulepi64_si128(v_x0, v_k1k2, (int32_t)(17u));
      v_x1 = _mm_clmulepi64_si128(v_x1, v_k1k2, (int32_t)(17u));
      v_x2 = _mm_clmulepi64_si128(v_x2, v_k1k2, (int32_t)(17u));
      v_x3 = _mm_clmulepi64_si128(v_x3, v_k1k2, (int32_t)(17u));
      v_x0 = _mm_xor_si128(_mm_xor_si128(v_x0, v_y0), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 0u)));
      v_x1 = _mm_xor_si128(_mm_xor_si128(v_x1, v_y1), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 16u)));
      v_x2 = _mm_xor_si128(_mm_xor_si128(v_x2, v_y2), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 32u)));
      v_x3 = _mm_xor_si128(_mm_xor_si128(v_x3, v_y3), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 48u)));
      v_p.ptr += 64;
    }
    v_p.len = 0;
  }
  v_k3k4 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K3K4));
  v_y0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(0u));
  v_x0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(17u));
  v_x0 = _mm_xor_si128(v_x0, v_x1);
  v_x0 = _mm_xor_si128(v_x0, v_y0);
  v_y0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(0u));
  v_x0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(17u));
  v_x0 = _mm_xor_si128(v_x0, v_x2);
  v_x0 = _mm_xor_si128(v_x0, v_y0);
  v_y0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(0u));
  v_x0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(17u));
  v_x0 = _mm_xor_si128(v_x0, v_x3);
  v_x0 = _mm_xor_si128(v_x0, v_y0);
  v_x1 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(16u));
  v_x2 = _mm_set_epi32((int32_t)(0u), (int32_t)(4294967295u), (int32_t)(0u), (int32_t)(4294967295u));
  v_x0 = _mm_srli_si128(v_x0, (int32_t)(8u));
  v_x0 = _mm_xor_si128(v_x0, v_x1);
  v_k5zz = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K5ZZ));
  v_x1 = _mm_srli_si128(v_x0, (int32_t)(4u));
  v_x0 = _mm_and_si128(v_x0, v_x2);
  v_x0 = _mm_clmulepi64_si128(v_x0, v_k5zz, (int32_t)(0u));
  v_x0 = _mm_xor_si128(v_x0, v_x1);
  v_pxmu = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_PXMU));
  v_x1 = _mm_and_si128(v_x0, v_x2);
  v_x1 = _mm_clmulepi64_si128(v_x1, v_pxmu, (int32_t)(16u));
  v_x1 = _mm_and_si128(v_x1, v_x2);
  v_x1 = _mm_clmulepi64_si128(v_x1, v_pxmu, (int32_t)(0u));
  v_x1 = _mm_xor_si128(v_x1, v_x0);
  v_s = ((uint32_t)(_mm_extract_epi32(v_x1, (int32_t)(1u))));
  v_tail_index = (((uint64_t)(a_x.len)) & 18446744073709551552u);
  if (v_tail_index < ((uint64_t)(a_x.len))) {
    {
      wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, v_tail_index);
      v_p.ptr = i_slice_p.ptr;
      v_p.len = 1;
      const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(i_slice_p.ptr, i_slice_p.len);
      while (v_p.ptr < i_end0_p) {
        v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ v_p.ptr[0u]))] ^ (v_s >> 8u));
        v_p.ptr += 1;
      }
      v_p.len = 0;
    }
  }
  self->private_impl.f_state = (4294967295u ^ v_s);
  return wuffs_base__make_empty_struct();
}
#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
// ‼ WUFFS MULTI-FILE SECTION -x86_avx2

// ‼ WUFFS MULTI-FILE SECTION +x86_sse42
// -------- func crc32.ieee_hasher.up_x86_sse42

@@ -36207,111 +36049,120 @@ wuffs_crc32__ieee_hasher__up_x86_sse42(
    wuffs_crc32__ieee_hasher* self,
    wuffs_base__slice_u8 a_x) {
  uint32_t v_s = 0;
  wuffs_base__slice_u8 v_p = {0};
  __m128i v_k1k2 = {0};
  __m128i v_k3k4 = {0};
  __m128i v_k5zz = {0};
  __m128i v_pxmu = {0};
  __m128i v_kk = {0};
  __m128i v_x0 = {0};
  __m128i v_x1 = {0};
  __m128i v_x2 = {0};
  __m128i v_x3 = {0};
  __m128i v_x4 = {0};
  __m128i v_x5 = {0};
  __m128i v_x6 = {0};
  __m128i v_x7 = {0};
  __m128i v_y0 = {0};
  __m128i v_y1 = {0};
  __m128i v_y2 = {0};
  __m128i v_y3 = {0};
  uint64_t v_tail_index = 0;
  __m128i v_y4 = {0};
  __m128i v_y5 = {0};
  __m128i v_y6 = {0};
  __m128i v_y7 = {0};

  v_s = (4294967295u ^ self->private_impl.f_state);
  while ((((uint64_t)(a_x.len)) > 0u) && ((15u & ((uint32_t)(0xFFFu & (uintptr_t)(a_x.ptr)))) != 0u)) {
    v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ a_x.ptr[0u]))] ^ (v_s >> 8u));
    a_x = wuffs_base__slice_u8__subslice_i(a_x, 1u);
  }
  if (((uint64_t)(a_x.len)) < 64u) {
    {
      wuffs_base__slice_u8 i_slice_p = a_x;
      v_p.ptr = i_slice_p.ptr;
      v_p.len = 1;
      const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(i_slice_p.ptr, i_slice_p.len);
      while (v_p.ptr < i_end0_p) {
        v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ v_p.ptr[0u]))] ^ (v_s >> 8u));
        v_p.ptr += 1;
      }
      v_p.len = 0;
  if (((uint64_t)(a_x.len)) >= 128u) {
    v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u));
    v_x1 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u));
    v_x2 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32u));
    v_x3 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48u));
    v_x4 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 64u));
    v_x5 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 80u));
    v_x6 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 96u));
    v_x7 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 112u));
    v_kk = _mm_set_epi32((int32_t)(0u), (int32_t)(2433674945u), (int32_t)(0u), (int32_t)(872412467u));
    v_x0 = _mm_xor_si128(v_x0, _mm_cvtsi32_si128((int32_t)(v_s)));
    a_x = wuffs_base__slice_u8__subslice_i(a_x, 128u);
    while (((uint64_t)(a_x.len)) >= 128u) {
      v_y0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(0u));
      v_x0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(17u));
      v_y1 = _mm_clmulepi64_si128(v_x1, v_kk, (int32_t)(0u));
      v_x1 = _mm_clmulepi64_si128(v_x1, v_kk, (int32_t)(17u));
      v_y2 = _mm_clmulepi64_si128(v_x2, v_kk, (int32_t)(0u));
      v_x2 = _mm_clmulepi64_si128(v_x2, v_kk, (int32_t)(17u));
      v_y3 = _mm_clmulepi64_si128(v_x3, v_kk, (int32_t)(0u));
      v_x3 = _mm_clmulepi64_si128(v_x3, v_kk, (int32_t)(17u));
      v_y4 = _mm_clmulepi64_si128(v_x4, v_kk, (int32_t)(0u));
      v_x4 = _mm_clmulepi64_si128(v_x4, v_kk, (int32_t)(17u));
      v_y5 = _mm_clmulepi64_si128(v_x5, v_kk, (int32_t)(0u));
      v_x5 = _mm_clmulepi64_si128(v_x5, v_kk, (int32_t)(17u));
      v_y6 = _mm_clmulepi64_si128(v_x6, v_kk, (int32_t)(0u));
      v_x6 = _mm_clmulepi64_si128(v_x6, v_kk, (int32_t)(17u));
      v_y7 = _mm_clmulepi64_si128(v_x7, v_kk, (int32_t)(0u));
      v_x7 = _mm_clmulepi64_si128(v_x7, v_kk, (int32_t)(17u));
      v_y0 = _mm_xor_si128(v_y0, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u)));
      v_x0 = _mm_xor_si128(v_x0, v_y0);
      v_y1 = _mm_xor_si128(v_y1, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u)));
      v_x1 = _mm_xor_si128(v_x1, v_y1);
      v_y2 = _mm_xor_si128(v_y2, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32u)));
      v_x2 = _mm_xor_si128(v_x2, v_y2);
      v_y3 = _mm_xor_si128(v_y3, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48u)));
      v_x3 = _mm_xor_si128(v_x3, v_y3);
      v_y4 = _mm_xor_si128(v_y4, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 64u)));
      v_x4 = _mm_xor_si128(v_x4, v_y4);
      v_y5 = _mm_xor_si128(v_y5, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 80u)));
      v_x5 = _mm_xor_si128(v_x5, v_y5);
      v_y6 = _mm_xor_si128(v_y6, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 96u)));
      v_x6 = _mm_xor_si128(v_x6, v_y6);
      v_y7 = _mm_xor_si128(v_y7, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 112u)));
      v_x7 = _mm_xor_si128(v_x7, v_y7);
      a_x = wuffs_base__slice_u8__subslice_i(a_x, 128u);
    }
    self->private_impl.f_state = (4294967295u ^ v_s);
    return wuffs_base__make_empty_struct();
    v_kk = _mm_set_epi32((int32_t)(0u), (int32_t)(3433693342u), (int32_t)(0u), (int32_t)(2926088593u));
    v_y0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(0u));
    v_x0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(17u));
    v_y2 = _mm_clmulepi64_si128(v_x2, v_kk, (int32_t)(0u));
    v_x2 = _mm_clmulepi64_si128(v_x2, v_kk, (int32_t)(17u));
    v_y4 = _mm_clmulepi64_si128(v_x4, v_kk, (int32_t)(0u));
    v_x4 = _mm_clmulepi64_si128(v_x4, v_kk, (int32_t)(17u));
    v_y6 = _mm_clmulepi64_si128(v_x6, v_kk, (int32_t)(0u));
    v_x6 = _mm_clmulepi64_si128(v_x6, v_kk, (int32_t)(17u));
    v_y0 = _mm_xor_si128(v_y0, v_x1);
    v_x0 = _mm_xor_si128(v_x0, v_y0);
    v_y2 = _mm_xor_si128(v_y2, v_x3);
    v_x2 = _mm_xor_si128(v_x2, v_y2);
    v_y4 = _mm_xor_si128(v_y4, v_x5);
    v_x4 = _mm_xor_si128(v_x4, v_y4);
    v_y6 = _mm_xor_si128(v_y6, v_x7);
    v_x6 = _mm_xor_si128(v_x6, v_y6);
    v_kk = _mm_set_epi32((int32_t)(0u), (int32_t)(2166711591u), (int32_t)(0u), (int32_t)(4057597354u));
    v_y0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(0u));
    v_x0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(17u));
    v_y4 = _mm_clmulepi64_si128(v_x4, v_kk, (int32_t)(0u));
    v_x4 = _mm_clmulepi64_si128(v_x4, v_kk, (int32_t)(17u));
    v_y0 = _mm_xor_si128(v_y0, v_x2);
    v_x0 = _mm_xor_si128(v_x0, v_y0);
    v_y4 = _mm_xor_si128(v_y4, v_x6);
    v_x4 = _mm_xor_si128(v_x4, v_y4);
    v_kk = _mm_set_epi32((int32_t)(0u), (int32_t)(496309207u), (int32_t)(0u), (int32_t)(2402626965u));
    v_y0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(0u));
    v_x0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(17u));
    v_y0 = _mm_xor_si128(v_y0, v_x4);
    v_x0 = _mm_xor_si128(v_x0, v_y0);
    v_kk = _mm_set_epi32((int32_t)(1u), (int32_t)(3681617473u), (int32_t)(3034951717u), (int32_t)(4144043585u));
    v_s = ((uint32_t)(_mm_extract_epi32(_mm_clmulepi64_si128(_mm_clmulepi64_si128(_mm_cvtsi64_si128((int64_t)(((uint64_t)(_mm_extract_epi64(v_x0, (int32_t)(0u)))))), v_kk, (int32_t)(0u)), v_kk, (int32_t)(16u)), (int32_t)(2u))));
    v_kk = _mm_set_epi32((int32_t)(1u), (int32_t)(3681617473u), (int32_t)(3034951717u), (int32_t)(4144043585u));
    v_s = ((uint32_t)(_mm_extract_epi32(_mm_clmulepi64_si128(_mm_clmulepi64_si128(_mm_cvtsi64_si128((int64_t)((((uint64_t)(_mm_extract_epi64(v_x0, (int32_t)(1u)))) ^ ((uint64_t)(v_s))))), v_kk, (int32_t)(0u)), v_kk, (int32_t)(16u)), (int32_t)(2u))));
  }
  v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u));
  v_x1 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u));
  v_x2 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32u));
  v_x3 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48u));
  v_x0 = _mm_xor_si128(v_x0, _mm_cvtsi32_si128((int32_t)(v_s)));
  v_k1k2 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K1K2));
  {
    wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, 64u);
    v_p.ptr = i_slice_p.ptr;
    v_p.len = 64;
    const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(v_p.ptr, (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 64) * 64));
    while (v_p.ptr < i_end0_p) {
      v_y0 = _mm_clmulepi64_si128(v_x0, v_k1k2, (int32_t)(0u));
      v_y1 = _mm_clmulepi64_si128(v_x1, v_k1k2, (int32_t)(0u));
      v_y2 = _mm_clmulepi64_si128(v_x2, v_k1k2, (int32_t)(0u));
      v_y3 = _mm_clmulepi64_si128(v_x3, v_k1k2, (int32_t)(0u));
      v_x0 = _mm_clmulepi64_si128(v_x0, v_k1k2, (int32_t)(17u));
      v_x1 = _mm_clmulepi64_si128(v_x1, v_k1k2, (int32_t)(17u));
      v_x2 = _mm_clmulepi64_si128(v_x2, v_k1k2, (int32_t)(17u));
      v_x3 = _mm_clmulepi64_si128(v_x3, v_k1k2, (int32_t)(17u));
      v_x0 = _mm_xor_si128(_mm_xor_si128(v_x0, v_y0), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 0u)));
      v_x1 = _mm_xor_si128(_mm_xor_si128(v_x1, v_y1), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 16u)));
      v_x2 = _mm_xor_si128(_mm_xor_si128(v_x2, v_y2), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 32u)));
      v_x3 = _mm_xor_si128(_mm_xor_si128(v_x3, v_y3), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 48u)));
      v_p.ptr += 64;
    }
    v_p.len = 0;
  while (((uint64_t)(a_x.len)) >= 8u) {
    v_kk = _mm_set_epi32((int32_t)(1u), (int32_t)(3681617473u), (int32_t)(3034951717u), (int32_t)(4144043585u));
    v_s = ((uint32_t)(_mm_extract_epi32(_mm_clmulepi64_si128(_mm_clmulepi64_si128(_mm_cvtsi64_si128((int64_t)((wuffs_base__peek_u64le__no_bounds_check(a_x.ptr) ^ ((uint64_t)(v_s))))), v_kk, (int32_t)(0u)), v_kk, (int32_t)(16u)), (int32_t)(2u))));
    a_x = wuffs_base__slice_u8__subslice_i(a_x, 8u);
  }
  v_k3k4 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K3K4));
  v_y0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(0u));
  v_x0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(17u));
  v_x0 = _mm_xor_si128(v_x0, v_x1);
  v_x0 = _mm_xor_si128(v_x0, v_y0);
  v_y0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(0u));
  v_x0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(17u));
  v_x0 = _mm_xor_si128(v_x0, v_x2);
  v_x0 = _mm_xor_si128(v_x0, v_y0);
  v_y0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(0u));
  v_x0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(17u));
  v_x0 = _mm_xor_si128(v_x0, v_x3);
  v_x0 = _mm_xor_si128(v_x0, v_y0);
  v_x1 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(16u));
  v_x2 = _mm_set_epi32((int32_t)(0u), (int32_t)(4294967295u), (int32_t)(0u), (int32_t)(4294967295u));
  v_x0 = _mm_srli_si128(v_x0, (int32_t)(8u));
  v_x0 = _mm_xor_si128(v_x0, v_x1);
  v_k5zz = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K5ZZ));
  v_x1 = _mm_srli_si128(v_x0, (int32_t)(4u));
  v_x0 = _mm_and_si128(v_x0, v_x2);
  v_x0 = _mm_clmulepi64_si128(v_x0, v_k5zz, (int32_t)(0u));
  v_x0 = _mm_xor_si128(v_x0, v_x1);
  v_pxmu = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_PXMU));
  v_x1 = _mm_and_si128(v_x0, v_x2);
  v_x1 = _mm_clmulepi64_si128(v_x1, v_pxmu, (int32_t)(16u));
  v_x1 = _mm_and_si128(v_x1, v_x2);
  v_x1 = _mm_clmulepi64_si128(v_x1, v_pxmu, (int32_t)(0u));
  v_x1 = _mm_xor_si128(v_x1, v_x0);
  v_s = ((uint32_t)(_mm_extract_epi32(v_x1, (int32_t)(1u))));
  v_tail_index = (((uint64_t)(a_x.len)) & 18446744073709551552u);
  if (v_tail_index < ((uint64_t)(a_x.len))) {
    {
      wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, v_tail_index);
      v_p.ptr = i_slice_p.ptr;
      v_p.len = 1;
      const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(i_slice_p.ptr, i_slice_p.len);
      while (v_p.ptr < i_end0_p) {
        v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ v_p.ptr[0u]))] ^ (v_s >> 8u));
        v_p.ptr += 1;
      }
      v_p.len = 0;
    }
  }
  while (((uint64_t)(a_x.len)) > 0u) {
    v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ a_x.ptr[0u]))] ^ (v_s >> 8u));
    a_x = wuffs_base__slice_u8__subslice_i(a_x, 1u);
  }
  self->private_impl.f_state = (4294967295u ^ v_s);
  return wuffs_base__make_empty_struct();

script/print-crc32-x86-sse42-code.go (new file, 167 lines)
@@ -0,0 +1,167 @@
// Copyright 2024 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//
// SPDX-License-Identifier: Apache-2.0 OR MIT

//go:build ignore
// +build ignore

package main

// print-crc32-x86-sse42-code.go prints the std/crc32 x86/SSE4.2 Wuffs code
// based on some C code generated by https://github.com/corsix/fast-crc32/
//
// Usage: go run print-crc32-x86-sse42-code.go

import (
    "fmt"
    "regexp"
    "strconv"
    "strings"
)

func main() {
    var (
        reXEqLoadu    = regexp.MustCompile(`^__m128i x(\d+) = _mm_loadu_si128`)
        reKEqSetr     = regexp.MustCompile(`^k = _mm_setr_epi32\(([^,]+), ([^,]+), ([^,]+), ([^\)]+)\);$`)
        reYEqClmul    = regexp.MustCompile(`^y(\d+) = clmul_lo\(x(\d+), k\), x(\d+) = clmul_hi\(x(\d+), k\);$`)
        reYEqXorLoadu = regexp.MustCompile(`^y(\d+) = _mm_xor_si128\(y(\d+), _mm_loadu_si128`)
        reYEqXorYX    = regexp.MustCompile(`^y(\d+) = _mm_xor_si128\(y(\d+), x(\d+)\), x(\d+) = _mm_xor_si128\(x(\d+), y(\d+)\);$`)
    )

    fmt.Println("// BEGIN script/print-crc32-x86-sse42-code.go generated code.")
    for src := srcSSECRC32V8; src != ""; {
        i := strings.IndexByte(src, '\n')
        line := strings.TrimSpace(src[:i])
        src = src[i+1:]

        if (line == "") || strings.HasPrefix(line, "/*") {
            continue

        } else if s := reXEqLoadu.FindStringSubmatch(line); len(s) > 0 {
            n, _ := strconv.Atoi(s[1])
            fmt.Printf("x%d = util.make_m128i_slice128(a: args.x[0x%02X .. 0x%02X])\n", n, 16*(n), 16*(n+1))

        } else if line == "__m128i k;" {
            continue

        } else if s := reKEqSetr.FindStringSubmatch(line); len(s) > 0 {
            fmt.Printf("kk = util.make_m128i_multiple_u32(a00: %s, a01: %s, a02: %s, a03: %s)\n", s[1], s[2], s[3], s[4])

        } else if line == "x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);" {
            fmt.Printf("x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))\n")

        } else if line == "buf += 128;" {
            fmt.Printf("args.x = args.x[128 ..]\n")

        } else if line == "len -= 128;" {
            continue

        } else if line == "while (len >= 128) {" {
            fmt.Printf("while args.x.length() >= 128 {\n")

        } else if line == "}" {
            fmt.Printf("} endwhile\n")

        } else if s := reYEqClmul.FindStringSubmatch(line); len(s) > 0 {
            fmt.Printf("y%s = x%s._mm_clmulepi64_si128(b: kk, imm8: 0x00)\n", s[1], s[2])
            fmt.Printf("x%s = x%s._mm_clmulepi64_si128(b: kk, imm8: 0x11)\n", s[3], s[4])

        } else if s := reYEqXorLoadu.FindStringSubmatch(line); len(s) > 0 {
            n, _ := strconv.Atoi(s[1])
            fmt.Printf("y%d = y%d._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x%02X .. 0x%02X]))\n", n, n, 16*(n), 16*(n+1))
            fmt.Printf("x%d = x%d._mm_xor_si128(b: y%d)\n", n, n, n)

        } else if s := reYEqXorYX.FindStringSubmatch(line); len(s) > 0 {
            fmt.Printf("y%s = y%s._mm_xor_si128(b: x%s)\n", s[1], s[2], s[3])
            fmt.Printf("x%s = x%s._mm_xor_si128(b: y%s)\n", s[4], s[5], s[6])

        } else if line == "crc0 = crc_u64(0, _mm_extract_epi64(x0, 0));" {
            fmt.Printf("kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)\n")
            fmt.Printf("s = util.make_m128i_single_u64(a: x0._mm_extract_epi64(imm8: 0)).\n")
            fmt.Printf("    _mm_clmulepi64_si128(b: kk, imm8: 0x00).\n")
            fmt.Printf("    _mm_clmulepi64_si128(b: kk, imm8: 0x10).\n")
            fmt.Printf("    _mm_extract_epi32(imm8: 2)\n")

        // fmt.Printf("s = util.make_m128i_single_u64(a: (s as base.u64) ^ args.x.peek_u64le()).\n")
        } else if line == "crc0 = crc_u64(crc0, _mm_extract_epi64(x0, 1));" {
            fmt.Printf("kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)\n")
            fmt.Printf("s = util.make_m128i_single_u64(a: x0._mm_extract_epi64(imm8: 1) ^ (s as base.u64)).\n")
            fmt.Printf("    _mm_clmulepi64_si128(b: kk, imm8: 0x00).\n")
            fmt.Printf("    _mm_clmulepi64_si128(b: kk, imm8: 0x10).\n")
            fmt.Printf("    _mm_extract_epi32(imm8: 2)\n")

        } else {
            fmt.Printf("// Could not process %q.\n", line)
            break
        }
    }
    fmt.Println("// END script/print-crc32-x86-sse42-code.go generated code.")
}

// This is the core (inside "if (len >= 128)") of the code produced by
// generate.c in https://github.com/corsix/fast-crc32/ when parameterized by
// "./generate -i sse -p crc32 -a v8".
const srcSSECRC32V8 = `
/* First vector chunk. */
__m128i x0 = _mm_loadu_si128((const __m128i*)buf), y0;
__m128i x1 = _mm_loadu_si128((const __m128i*)(buf + 16)), y1;
__m128i x2 = _mm_loadu_si128((const __m128i*)(buf + 32)), y2;
__m128i x3 = _mm_loadu_si128((const __m128i*)(buf + 48)), y3;
__m128i x4 = _mm_loadu_si128((const __m128i*)(buf + 64)), y4;
__m128i x5 = _mm_loadu_si128((const __m128i*)(buf + 80)), y5;
__m128i x6 = _mm_loadu_si128((const __m128i*)(buf + 96)), y6;
__m128i x7 = _mm_loadu_si128((const __m128i*)(buf + 112)), y7;
__m128i k;
k = _mm_setr_epi32(0x33fff533, 0, 0x910eeec1, 0);
x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);
buf += 128;
len -= 128;
/* Main loop. */
while (len >= 128) {
  y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
  y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
  y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
  y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
  y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
  y5 = clmul_lo(x5, k), x5 = clmul_hi(x5, k);
  y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
  y7 = clmul_lo(x7, k), x7 = clmul_hi(x7, k);
  y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i*)buf)), x0 = _mm_xor_si128(x0, y0);
  y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i*)(buf + 16))), x1 = _mm_xor_si128(x1, y1);
  y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i*)(buf + 32))), x2 = _mm_xor_si128(x2, y2);
  y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i*)(buf + 48))), x3 = _mm_xor_si128(x3, y3);
  y4 = _mm_xor_si128(y4, _mm_loadu_si128((const __m128i*)(buf + 64))), x4 = _mm_xor_si128(x4, y4);
  y5 = _mm_xor_si128(y5, _mm_loadu_si128((const __m128i*)(buf + 80))), x5 = _mm_xor_si128(x5, y5);
  y6 = _mm_xor_si128(y6, _mm_loadu_si128((const __m128i*)(buf + 96))), x6 = _mm_xor_si128(x6, y6);
  y7 = _mm_xor_si128(y7, _mm_loadu_si128((const __m128i*)(buf + 112))), x7 = _mm_xor_si128(x7, y7);
  buf += 128;
  len -= 128;
}
/* Reduce x0 ... x7 to just x0. */
k = _mm_setr_epi32(0xae689191, 0, 0xccaa009e, 0);
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0);
y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2);
y4 = _mm_xor_si128(y4, x5), x4 = _mm_xor_si128(x4, y4);
y6 = _mm_xor_si128(y6, x7), x6 = _mm_xor_si128(x6, y6);
k = _mm_setr_epi32(0xf1da05aa, 0, 0x81256527, 0);
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0);
y4 = _mm_xor_si128(y4, x6), x4 = _mm_xor_si128(x4, y4);
k = _mm_setr_epi32(0x8f352d95, 0, 0x1d9513d7, 0);
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y0 = _mm_xor_si128(y0, x4), x0 = _mm_xor_si128(x0, y0);
/* Reduce 128 bits to 32 bits, and multiply by x^32. */
crc0 = crc_u64(0, _mm_extract_epi64(x0, 0));
crc0 = crc_u64(crc0, _mm_extract_epi64(x0, 1));
`
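
The embedded source above calls corsix/fast-crc32's clmul_lo, clmul_hi and crc_u64 helpers without including their definitions. The script's regexp mapping (imm8: 0x00 for clmul_lo, imm8: 0x11 for clmul_hi) pins down what the first two must mean; a minimal C sketch, an assumption based on that mapping rather than a copy of corsix's code:

  #include <wmmintrin.h>  // _mm_clmulepi64_si128 (PCLMULQDQ)

  // Carry-less multiply of the low 64-bit lanes of a and b.
  static inline __m128i clmul_lo(__m128i a, __m128i b) {
    return _mm_clmulepi64_si128(a, b, 0x00);
  }

  // Carry-less multiply of the high 64-bit lanes of a and b.
  static inline __m128i clmul_hi(__m128i a, __m128i b) {
    return _mm_clmulepi64_si128(a, b, 0x11);
  }

crc_u64 folds 8 bytes into the running CRC. For IEEE CRC-32 there is no dedicated x86 instruction (the SSE4.2 CRC32 instruction implements CRC-32C), so the script translates each crc_u64 call into the two-PCLMULQDQ sequence visible in its "crc0 = crc_u64(...)" branches, and no separate Wuffs-side helper is needed.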
@@ -13,6 +13,8 @@

package main

// This program is obsolete.
//
// print-crc32-x86-sse42-magic-numbers.go prints the std/crc32
// IEEE_X86_SSE42_ETC magic number tables.
//

@@ -28,7 +28,6 @@ pub func ieee_hasher.update!(x: roslice base.u8) {
    if this.state == 0 {
        choose up = [
            up_arm_crc32,
            up_x86_avx2,
            up_x86_sse42]
    }
    this.up!(x: args.x)

@@ -1,137 +0,0 @@
// Copyright 2021 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//
// SPDX-License-Identifier: Apache-2.0 OR MIT

// --------

// See "SIMD Implementations" in README.md for a link to Gopal et al. "Fast CRC
// Computation for Generic Polynomials Using PCLMULQDQ Instruction".

// up_x86_avx2 is exactly the same as up_x86_sse42 except for the "choose
// cpu_arch >= x86_avx2". With AVX, PCLMULQDQ has a three-operand form, not
// just a two-operand form: https://www.felixcloutier.com/x86/pclmulqdq
pri func ieee_hasher.up_x86_avx2!(x: roslice base.u8),
        choose cpu_arch >= x86_avx2,
{
    var s : base.u32
    var p : roslice base.u8

    var util : base.x86_sse42_utility
    var k1k2 : base.x86_m128i
    var k3k4 : base.x86_m128i
    var k5zz : base.x86_m128i
    var pxmu : base.x86_m128i
    var x0 : base.x86_m128i
    var x1 : base.x86_m128i
    var x2 : base.x86_m128i
    var x3 : base.x86_m128i
    var y0 : base.x86_m128i
    var y1 : base.x86_m128i
    var y2 : base.x86_m128i
    var y3 : base.x86_m128i

    var tail_index : base.u64

    s = 0xFFFF_FFFF ^ this.state

    // Align to a 16-byte boundary.
    while (args.x.length() > 0) and ((15 & args.x.uintptr_low_12_bits()) <> 0) {
        s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ args.x[0]] ^ (s >> 8)
        args.x = args.x[1 ..]
    } endwhile

    // For short inputs, just do a simple loop.
    if args.x.length() < 64 {
        iterate (p = args.x)(length: 1, advance: 1, unroll: 1) {
            s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
        }
        this.state = 0xFFFF_FFFF ^ s
        return nothing
    }

    // Load 128×4 = 512 bits from the first 64-byte chunk.
    x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
    x1 = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
    x2 = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
    x3 = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])

    // Combine with the initial state.
    x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))

    // Process the remaining 64-byte chunks.
    k1k2 = util.make_m128i_slice128(a: IEEE_X86_SSE42_K1K2[.. 16])
    iterate (p = args.x[64 ..])(length: 64, advance: 64, unroll: 1) {
        y0 = x0._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
        y1 = x1._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
        y2 = x2._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
        y3 = x3._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)

        x0 = x0._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
        x1 = x1._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
        x2 = x2._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
        x3 = x3._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)

        x0 = x0._mm_xor_si128(b: y0)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x00 .. 0x10]))
        x1 = x1._mm_xor_si128(b: y1)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x10 .. 0x20]))
        x2 = x2._mm_xor_si128(b: y2)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x20 .. 0x30]))
        x3 = x3._mm_xor_si128(b: y3)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x30 .. 0x40]))
    }

    // Reduce 128×4 = 512 bits to 128 bits.
    k3k4 = util.make_m128i_slice128(a: IEEE_X86_SSE42_K3K4[.. 16])
    y0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
    x0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
    x0 = x0._mm_xor_si128(b: x1)
    x0 = x0._mm_xor_si128(b: y0)
    y0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
    x0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
    x0 = x0._mm_xor_si128(b: x2)
    x0 = x0._mm_xor_si128(b: y0)
    y0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
    x0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
    x0 = x0._mm_xor_si128(b: x3)
    x0 = x0._mm_xor_si128(b: y0)

    // Reduce 128 bits to 64 bits.
    x1 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x10)
    x2 = util.make_m128i_multiple_u32(
            a00: 0xFFFF_FFFF,
            a01: 0x0000_0000,
            a02: 0xFFFF_FFFF,
            a03: 0x0000_0000)
    x0 = x0._mm_srli_si128(imm8: 8)
    x0 = x0._mm_xor_si128(b: x1)
    k5zz = util.make_m128i_slice128(a: IEEE_X86_SSE42_K5ZZ[.. 16])
    x1 = x0._mm_srli_si128(imm8: 4)
    x0 = x0._mm_and_si128(b: x2)
    x0 = x0._mm_clmulepi64_si128(b: k5zz, imm8: 0x00)
    x0 = x0._mm_xor_si128(b: x1)

    // Reduce 64 bits to 32 bits (Barrett Reduction) and extract.
    //
    // Barrett Reduction is Algorithm 1 (page 14) of Gopal et al., after
    // adjusting for bit-reflection as per Figure 12 (page 21).
    pxmu = util.make_m128i_slice128(a: IEEE_X86_SSE42_PXMU[.. 16])
    x1 = x0._mm_and_si128(b: x2)
    x1 = x1._mm_clmulepi64_si128(b: pxmu, imm8: 0x10)
    x1 = x1._mm_and_si128(b: x2)
    x1 = x1._mm_clmulepi64_si128(b: pxmu, imm8: 0x00)
    x1 = x1._mm_xor_si128(b: x0)
    s = x1._mm_extract_epi32(imm8: 1)

    // Handle the tail of args.x that wasn't a complete 64-byte chunk.
    tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFC0  // And-not 64.
    if tail_index < args.x.length() {
        iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) {
            s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
        }
    }

    this.state = 0xFFFF_FFFF ^ s
}
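
On the deleted file's point that up_x86_avx2 only differed from up_x86_sse42 in its "choose cpu_arch >= x86_avx2" line: the C intrinsics are identical either way, and only the enabled target features decide whether the compiler may use the three-operand VEX encoding of PCLMULQDQ. A hedged C sketch (fold_step_* are illustrative names, not Wuffs API):

  #include <immintrin.h>

  // Compiled for plain SSE4.2 + PCLMUL, this uses the two-operand PCLMULQDQ
  // encoding, which can cost an extra register-to-register move to keep x.
  __attribute__((target("sse4.2,pclmul")))
  __m128i fold_step_sse42(__m128i x, __m128i k) {
    return _mm_xor_si128(_mm_clmulepi64_si128(x, k, 0x00),
                         _mm_clmulepi64_si128(x, k, 0x11));
  }

  // The same source compiled with AVX enabled may use the three-operand
  // VPCLMULQDQ encoding, avoiding that move.
  __attribute__((target("avx,pclmul")))
  __m128i fold_step_avx(__m128i x, __m128i k) {
    return _mm_xor_si128(_mm_clmulepi64_si128(x, k, 0x00),
                         _mm_clmulepi64_si128(x, k, 0x11));
  }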
@@ -8,32 +8,29 @@
//
// SPDX-License-Identifier: Apache-2.0 OR MIT

// --------

// See "SIMD Implementations" in README.md for a link to Gopal et al. "Fast CRC
// Computation for Generic Polynomials Using PCLMULQDQ Instruction".

pri func ieee_hasher.up_x86_sse42!(x: roslice base.u8),
        choose cpu_arch >= x86_sse42,
{
    var s : base.u32
    var p : roslice base.u8

    var util : base.x86_sse42_utility
    var k1k2 : base.x86_m128i
    var k3k4 : base.x86_m128i
    var k5zz : base.x86_m128i
    var pxmu : base.x86_m128i
    var kk : base.x86_m128i
    var x0 : base.x86_m128i
    var x1 : base.x86_m128i
    var x2 : base.x86_m128i
    var x3 : base.x86_m128i
    var x4 : base.x86_m128i
    var x5 : base.x86_m128i
    var x6 : base.x86_m128i
    var x7 : base.x86_m128i
    var y0 : base.x86_m128i
    var y1 : base.x86_m128i
    var y2 : base.x86_m128i
    var y3 : base.x86_m128i

    var tail_index : base.u64
    var y4 : base.x86_m128i
    var y5 : base.x86_m128i
    var y6 : base.x86_m128i
    var y7 : base.x86_m128i

    s = 0xFFFF_FFFF ^ this.state

@@ -43,125 +40,111 @@ pri func ieee_hasher.up_x86_sse42!(x: roslice base.u8),
        args.x = args.x[1 ..]
    } endwhile

    // For short inputs, just do a simple loop.
    if args.x.length() < 64 {
        iterate (p = args.x)(length: 1, advance: 1, unroll: 1) {
            s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
        }
        this.state = 0xFFFF_FFFF ^ s
        return nothing
    if args.x.length() >= 128 {
        // BEGIN script/print-crc32-x86-sse42-code.go generated code.
        x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
        x1 = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
        x2 = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
        x3 = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])
        x4 = util.make_m128i_slice128(a: args.x[0x40 .. 0x50])
        x5 = util.make_m128i_slice128(a: args.x[0x50 .. 0x60])
        x6 = util.make_m128i_slice128(a: args.x[0x60 .. 0x70])
        x7 = util.make_m128i_slice128(a: args.x[0x70 .. 0x80])
        kk = util.make_m128i_multiple_u32(a00: 0x33FF_F533, a01: 0, a02: 0x910E_EEC1, a03: 0)
        x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))
        args.x = args.x[128 ..]
        while args.x.length() >= 128 {
            y0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x00)
            x0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x11)
            y1 = x1._mm_clmulepi64_si128(b: kk, imm8: 0x00)
            x1 = x1._mm_clmulepi64_si128(b: kk, imm8: 0x11)
            y2 = x2._mm_clmulepi64_si128(b: kk, imm8: 0x00)
            x2 = x2._mm_clmulepi64_si128(b: kk, imm8: 0x11)
            y3 = x3._mm_clmulepi64_si128(b: kk, imm8: 0x00)
            x3 = x3._mm_clmulepi64_si128(b: kk, imm8: 0x11)
            y4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x00)
            x4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x11)
            y5 = x5._mm_clmulepi64_si128(b: kk, imm8: 0x00)
            x5 = x5._mm_clmulepi64_si128(b: kk, imm8: 0x11)
            y6 = x6._mm_clmulepi64_si128(b: kk, imm8: 0x00)
            x6 = x6._mm_clmulepi64_si128(b: kk, imm8: 0x11)
            y7 = x7._mm_clmulepi64_si128(b: kk, imm8: 0x00)
            x7 = x7._mm_clmulepi64_si128(b: kk, imm8: 0x11)
            y0 = y0._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x00 .. 0x10]))
            x0 = x0._mm_xor_si128(b: y0)
            y1 = y1._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x10 .. 0x20]))
            x1 = x1._mm_xor_si128(b: y1)
            y2 = y2._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x20 .. 0x30]))
            x2 = x2._mm_xor_si128(b: y2)
            y3 = y3._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x30 .. 0x40]))
            x3 = x3._mm_xor_si128(b: y3)
            y4 = y4._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x40 .. 0x50]))
            x4 = x4._mm_xor_si128(b: y4)
            y5 = y5._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x50 .. 0x60]))
            x5 = x5._mm_xor_si128(b: y5)
            y6 = y6._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x60 .. 0x70]))
            x6 = x6._mm_xor_si128(b: y6)
            y7 = y7._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x70 .. 0x80]))
            x7 = x7._mm_xor_si128(b: y7)
            args.x = args.x[128 ..]
        } endwhile
        kk = util.make_m128i_multiple_u32(a00: 0xAE68_9191, a01: 0, a02: 0xCCAA_009E, a03: 0)
        y0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x00)
        x0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x11)
        y2 = x2._mm_clmulepi64_si128(b: kk, imm8: 0x00)
        x2 = x2._mm_clmulepi64_si128(b: kk, imm8: 0x11)
        y4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x00)
        x4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x11)
        y6 = x6._mm_clmulepi64_si128(b: kk, imm8: 0x00)
        x6 = x6._mm_clmulepi64_si128(b: kk, imm8: 0x11)
        y0 = y0._mm_xor_si128(b: x1)
        x0 = x0._mm_xor_si128(b: y0)
        y2 = y2._mm_xor_si128(b: x3)
        x2 = x2._mm_xor_si128(b: y2)
        y4 = y4._mm_xor_si128(b: x5)
        x4 = x4._mm_xor_si128(b: y4)
        y6 = y6._mm_xor_si128(b: x7)
        x6 = x6._mm_xor_si128(b: y6)
        kk = util.make_m128i_multiple_u32(a00: 0xF1DA_05AA, a01: 0, a02: 0x8125_6527, a03: 0)
        y0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x00)
        x0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x11)
        y4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x00)
        x4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x11)
        y0 = y0._mm_xor_si128(b: x2)
        x0 = x0._mm_xor_si128(b: y0)
        y4 = y4._mm_xor_si128(b: x6)
        x4 = x4._mm_xor_si128(b: y4)
        kk = util.make_m128i_multiple_u32(a00: 0x8F35_2D95, a01: 0, a02: 0x1D95_13D7, a03: 0)
        y0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x00)
        x0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x11)
        y0 = y0._mm_xor_si128(b: x4)
        x0 = x0._mm_xor_si128(b: y0)
        kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)
        s = util.make_m128i_single_u64(a: x0._mm_extract_epi64(imm8: 0)).
                _mm_clmulepi64_si128(b: kk, imm8: 0x00).
                _mm_clmulepi64_si128(b: kk, imm8: 0x10).
                _mm_extract_epi32(imm8: 2)
        kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)
        s = util.make_m128i_single_u64(a: x0._mm_extract_epi64(imm8: 1) ^ (s as base.u64)).
                _mm_clmulepi64_si128(b: kk, imm8: 0x00).
                _mm_clmulepi64_si128(b: kk, imm8: 0x10).
                _mm_extract_epi32(imm8: 2)
        // END script/print-crc32-x86-sse42-code.go generated code.
    }

    // Load 128×4 = 512 bits from the first 64-byte chunk.
    x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
    x1 = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
    x2 = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
    x3 = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])
    while args.x.length() >= 8 {
        kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)
        s = util.make_m128i_single_u64(a: args.x.peek_u64le() ^ (s as base.u64)).
                _mm_clmulepi64_si128(b: kk, imm8: 0x00).
                _mm_clmulepi64_si128(b: kk, imm8: 0x10).
                _mm_extract_epi32(imm8: 2)
        args.x = args.x[8 ..]
    } endwhile

    // Combine with the initial state.
    x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))

    // Process the remaining 64-byte chunks.
    k1k2 = util.make_m128i_slice128(a: IEEE_X86_SSE42_K1K2[.. 16])
    iterate (p = args.x[64 ..])(length: 64, advance: 64, unroll: 1) {
        y0 = x0._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
        y1 = x1._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
        y2 = x2._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
        y3 = x3._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)

        x0 = x0._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
        x1 = x1._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
        x2 = x2._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
        x3 = x3._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)

        x0 = x0._mm_xor_si128(b: y0)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x00 .. 0x10]))
        x1 = x1._mm_xor_si128(b: y1)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x10 .. 0x20]))
        x2 = x2._mm_xor_si128(b: y2)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x20 .. 0x30]))
        x3 = x3._mm_xor_si128(b: y3)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x30 .. 0x40]))
    }

    // Reduce 128×4 = 512 bits to 128 bits.
    k3k4 = util.make_m128i_slice128(a: IEEE_X86_SSE42_K3K4[.. 16])
    y0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
    x0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
    x0 = x0._mm_xor_si128(b: x1)
    x0 = x0._mm_xor_si128(b: y0)
    y0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
    x0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
    x0 = x0._mm_xor_si128(b: x2)
    x0 = x0._mm_xor_si128(b: y0)
    y0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
    x0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
    x0 = x0._mm_xor_si128(b: x3)
    x0 = x0._mm_xor_si128(b: y0)

    // Reduce 128 bits to 64 bits.
    x1 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x10)
    x2 = util.make_m128i_multiple_u32(
            a00: 0xFFFF_FFFF,
            a01: 0x0000_0000,
            a02: 0xFFFF_FFFF,
            a03: 0x0000_0000)
    x0 = x0._mm_srli_si128(imm8: 8)
    x0 = x0._mm_xor_si128(b: x1)
    k5zz = util.make_m128i_slice128(a: IEEE_X86_SSE42_K5ZZ[.. 16])
    x1 = x0._mm_srli_si128(imm8: 4)
    x0 = x0._mm_and_si128(b: x2)
    x0 = x0._mm_clmulepi64_si128(b: k5zz, imm8: 0x00)
    x0 = x0._mm_xor_si128(b: x1)

    // Reduce 64 bits to 32 bits (Barrett Reduction) and extract.
    //
    // Barrett Reduction is Algorithm 1 (page 14) of Gopal et al., after
    // adjusting for bit-reflection as per Figure 12 (page 21).
    pxmu = util.make_m128i_slice128(a: IEEE_X86_SSE42_PXMU[.. 16])
    x1 = x0._mm_and_si128(b: x2)
    x1 = x1._mm_clmulepi64_si128(b: pxmu, imm8: 0x10)
    x1 = x1._mm_and_si128(b: x2)
    x1 = x1._mm_clmulepi64_si128(b: pxmu, imm8: 0x00)
    x1 = x1._mm_xor_si128(b: x0)
    s = x1._mm_extract_epi32(imm8: 1)

    // Handle the tail of args.x that wasn't a complete 64-byte chunk.
    tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFC0  // And-not 64.
    if tail_index < args.x.length() {
        iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) {
            s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
        }
    }
    while args.x.length() > 0 {
        s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ args.x[0]] ^ (s >> 8)
        args.x = args.x[1 ..]
    } endwhile

    this.state = 0xFFFF_FFFF ^ s
}
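
A scalar model of the Barrett step above may help: it mirrors the x1/pxmu instruction sequence (mask to 32 bits, carry-less multiply by μ', mask, carry-less multiply by Px', XOR, take bits 32..63), with clmul64 as a hypothetical software stand-in for PCLMULQDQ. A minimal sketch, not the Wuffs implementation:

  #include <stdint.h>

  // Software carry-less multiply, keeping the low 64 bits of the product.
  static uint64_t clmul64(uint64_t a, uint64_t b) {
    uint64_t acc = 0;
    for (int i = 0; i < 64; i++) {
      if ((b >> i) & 1) {
        acc ^= a << i;
      }
    }
    return acc;
  }

  // Reduce a 64-bit bit-reflected polynomial r to a 32-bit CRC state:
  // t = ((r mod x^32) * mu') mod x^32, then crc = bits 32..63 of ((t * Px') ^ r).
  static uint32_t barrett_reduce(uint64_t r) {
    const uint64_t mu = 0x1F7011641u;  // mu', from the PXMU table below.
    const uint64_t px = 0x1DB710641u;  // Px', from the PXMU table below.
    uint64_t t = clmul64(r & 0xFFFFFFFFu, mu) & 0xFFFFFFFFu;
    return (uint32_t)((clmul64(t, px) ^ r) >> 32);
  }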

// These constants come from page 22 of Gopal et al. They are also reproduced
// by script/print-crc32-x86-sse42-magic-numbers.go which is runnable online at
// https://play.golang.org/p/wH1q6GfhKOE
//
// The k6' constant from the Gopal paper is unused.
//
// The rkN names match the numbers at
// https://github.com/intel/isa-l/blob/7b30857e20b84e5afab1a28291189b9dc571110d/crc/crc32_gzip_refl_by16_10.asm#L475-L499
//
// The "+§" means a harmless off-by-one difference compared to Intel's numbers.
// https://danlark.org/2021/03/08/how-a-bug-in-the-linux-crc-32-checksum-turned-out-not-to-be-a-bug/
// https://github.com/google/wuffs/commit/b24e046670396d7ef22ccf499051340b9288419b

pri const IEEE_X86_SSE42_K1K2 : roarray[16] base.u8 = [
    0xD4, 0x2B, 0x44, 0x54, 0x01, 0x00, 0x00, 0x00,  // k1' = 0x1_5444_2BD4 = rk16
    0x96, 0x15, 0xE4, 0xC6, 0x01, 0x00, 0x00, 0x00,  // k2' = 0x1_C6E4_1596 = rk15
]

pri const IEEE_X86_SSE42_K3K4 : roarray[16] base.u8 = [
    0xD0, 0x97, 0x19, 0x75, 0x01, 0x00, 0x00, 0x00,  // k3' = 0x1_7519_97D0 = rk2
    0x9E, 0x00, 0xAA, 0xCC, 0x00, 0x00, 0x00, 0x00,  // k4' = 0x0_CCAA_009E = rk1
]

pri const IEEE_X86_SSE42_K5ZZ : roarray[16] base.u8 = [
    0x24, 0x61, 0xCD, 0x63, 0x01, 0x00, 0x00, 0x00,  // k5' = 0x1_63CD_6124 = rk6
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  // Unused
]

pri const IEEE_X86_SSE42_PXMU : roarray[16] base.u8 = [
    0x41, 0x06, 0x71, 0xDB, 0x01, 0x00, 0x00, 0x00,  // Px' = 0x1_DB71_0641 = rk8+§
    0x41, 0x16, 0x01, 0xF7, 0x01, 0x00, 0x00, 0x00,  // μ'  = 0x1_F701_1641 = rk7+§
]
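
A sketch of where these byte tables come from. Over GF(2), each k' remainder constant is the bit-reflection of x^t mod P(x) for the CRC-32 polynomial P(x) = 0x1_04C1_1DB7, shifted left by one (the Figure 12 bit-reflection adjustment); Px' additionally keeps P's x^32 term, which reflects to the low bit. The exponents used below (t = 4·128+64 for k1', 4·128 for k2', 128+64 for k3', 128 for k4', 96 for k5') are an assumption following the paper's fold distances, not something stated in this diff; μ' = floor(x^64 / P(x)) comes from polynomial long division instead and is omitted here.

  #include <stdint.h>
  #include <stdio.h>

  // x^n mod P(x) over GF(2), with bit i holding the coefficient of x^i.
  static uint32_t xn_mod_p(int n) {
    uint32_t r = 1;  // the polynomial 1, i.e. x^0
    while (n-- > 0) {
      r = (r & 0x80000000u) ? ((r << 1) ^ 0x04C11DB7u) : (r << 1);
    }
    return r;
  }

  // Bit-reflect a degree-<32 polynomial across 33 bits: bit i moves to bit 32-i.
  static uint64_t reflect33(uint32_t x) {
    uint64_t r = 0;
    for (int i = 0; i < 32; i++) {
      r |= (uint64_t)((x >> i) & 1) << (32 - i);
    }
    return r;
  }

  int main(void) {
    printf("k1' = 0x%09llX\n", (unsigned long long)reflect33(xn_mod_p(4 * 128 + 64)));
    printf("k2' = 0x%09llX\n", (unsigned long long)reflect33(xn_mod_p(4 * 128)));
    printf("k3' = 0x%09llX\n", (unsigned long long)reflect33(xn_mod_p(128 + 64)));
    printf("k4' = 0x%09llX\n", (unsigned long long)reflect33(xn_mod_p(128)));
    printf("k5' = 0x%09llX\n", (unsigned long long)reflect33(xn_mod_p(96)));
    // P(x) itself: its x^32 term reflects to bit 0, hence the trailing 1.
    printf("Px' = 0x%09llX\n", (unsigned long long)(reflect33(0x04C11DB7u) | 1));
    return 0;
  }

If the exponent assumption holds, this prints the k' and Px' values in the table comments above. The Px' line is fully determined either way: reflecting the 33-bit polynomial 0x1_04C1_1DB7 gives 0x1_DB71_0641.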