std/crc64: re-write x86_sse42 implementation

name                           old speed      new speed       delta

wuffs_crc64_ecma_10k/clang14   5.84GB/s ± 0%  23.43GB/s ± 0%  +301.23%  (p=0.008 n=5+5)
wuffs_crc64_ecma_100k/clang14  6.18GB/s ± 0%  30.23GB/s ± 1%  +389.45%  (p=0.008 n=5+5)

wuffs_crc64_ecma_10k/gcc12     5.87GB/s ± 1%  24.02GB/s ± 0%  +309.21%  (p=0.008 n=5+5)
wuffs_crc64_ecma_100k/gcc12    6.21GB/s ± 0%  30.68GB/s ± 0%  +394.22%  (p=0.008 n=5+5)
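(Delta is new/old - 1: e.g. 23.43 / 5.84 ≈ 4.01, i.e. roughly +301%.)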

$ time example-mzcat < linux-6.8.2.tar.xz > /dev/null
Before: user 0m7.604s
After:  user 0m7.432s
Author: Nigel Tao
Date:   2024-04-22 13:54:03 +10:00
parent  e1f803324b
commit  e541e0cf79

5 changed files with 343 additions and 189 deletions


@@ -49,8 +49,7 @@ func (q *checker) optimizeIOMethodAdvance(receiver *a.Expr, advance *big.Int, ad
 		return q.optimizeIOMethodAdvanceExpr(receiver, advanceExpr, update)
 	}
-	// Check if receiver looks like "a[i .. j]" where i and j are constants and
-	// ((j - i) >= advance).
+	// Check if the receiver looks like "a[i .. j]".
 	if _, i, j, ok := receiver.IsSlice(); ok {
 		icv := (*big.Int)(nil)
 		if i == nil {
@@ -64,11 +63,25 @@ func (q *checker) optimizeIOMethodAdvance(receiver *a.Expr, advance *big.Int, ad
 			jcv = j.ConstValue()
 		}
-		if (icv != nil) && (jcv != nil) {
+		switch {
+		case (icv != nil) && (jcv != nil):
+			// OK if i and j are constants and ((j - i) >= advance).
 			n := big.NewInt(0).Sub(jcv, icv)
 			if n.Cmp(advance) >= 0 {
 				retOK = true
 			}
+		case (i != nil) && (j != nil) && (j.Operator() == t.IDXBinaryPlus):
+			// OK if j is (i + constant) and (constant >= advance).
+			n := (*big.Int)(nil)
+			if j.LHS().AsExpr().Eq(i) {
+				n = j.RHS().AsExpr().ConstValue()
+			} else if j.RHS().AsExpr().Eq(i) {
+				n = j.LHS().AsExpr().ConstValue()
+			}
+			if n != nil && (n.Cmp(advance) >= 0) {
+				retOK = true
+			}
 		}
 	}
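The new second case is presumably what the rewritten crc64 tail handling below needs: it slices buf[24 - args.x.length() .. (24 - args.x.length()) + 16], where i is not a constant but (j - i) is the constant 16, enough for a 16-byte advance.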


@@ -36597,27 +36597,33 @@ WUFFS_CRC64__ECMA_TABLE[8][256] WUFFS_BASE__POTENTIALLY_UNUSED = {
 };
-static const uint8_t
-WUFFS_CRC64__SHUFFLE_707F[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
-    112u, 113u, 114u, 115u, 116u, 117u, 118u, 119u,
-    120u, 121u, 122u, 123u, 124u, 125u, 126u, 127u,
-};
-static const uint8_t
-WUFFS_CRC64__SHUFFLE_8F80[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
-    143u, 142u, 141u, 140u, 139u, 138u, 137u, 136u,
-    135u, 134u, 133u, 132u, 131u, 130u, 129u, 128u,
-};
 static const uint8_t
-WUFFS_CRC64__ECMA_X86_SSE42_K3K4[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
+WUFFS_CRC64__ECMA_X86_SSE42_FOLD1[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
     228u, 58u, 57u, 202u, 151u, 212u, 93u, 224u,
     64u, 95u, 135u, 199u, 175u, 149u, 190u, 218u,
 };
 static const uint8_t
-WUFFS_CRC64__ECMA_X86_SSE42_PXMU[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
-    133u, 30u, 14u, 175u, 43u, 175u, 216u, 146u,
+WUFFS_CRC64__ECMA_X86_SSE42_FOLD2[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
+    68u, 250u, 158u, 138u, 0u, 91u, 9u, 96u,
+    81u, 175u, 225u, 15u, 163u, 83u, 230u, 59u,
+};
+static const uint8_t
+WUFFS_CRC64__ECMA_X86_SSE42_FOLD4[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
+    243u, 65u, 212u, 157u, 187u, 239u, 227u, 106u,
+    244u, 45u, 132u, 167u, 84u, 96u, 31u, 8u,
+};
+static const uint8_t
+WUFFS_CRC64__ECMA_X86_SSE42_FOLD8[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
+    0u, 16u, 204u, 79u, 29u, 215u, 87u, 135u,
+    64u, 231u, 61u, 247u, 42u, 107u, 216u, 215u,
+};
+static const uint8_t
+WUFFS_CRC64__ECMA_X86_SSE42_MUPX[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
     213u, 99u, 41u, 23u, 108u, 70u, 62u, 156u,
+    133u, 30u, 14u, 175u, 43u, 175u, 216u, 146u,
 };
 // ---------------- Private Initializer Prototypes
@@ -36899,89 +36905,122 @@ wuffs_crc64__ecma_hasher__up_x86_sse42(
     wuffs_base__slice_u8 a_x) {
   uint64_t v_s = 0;
   wuffs_base__slice_u8 v_p = {0};
-  __m128i v_s0 = {0};
-  __m128i v_s0_707F = {0};
-  __m128i v_s0_8F80 = {0};
-  __m128i v_x0 = {0};
-  __m128i v_aa = {0};
-  __m128i v_k3k4 = {0};
-  __m128i v_t0 = {0};
-  __m128i v_t1 = {0};
-  __m128i v_t2 = {0};
-  __m128i v_u0 = {0};
-  __m128i v_u1 = {0};
-  __m128i v_u2 = {0};
-  __m128i v_v0 = {0};
-  __m128i v_v1 = {0};
-  __m128i v_pxmu = {0};
-  __m128i v_w1 = {0};
-  __m128i v_w2 = {0};
-  uint64_t v_tail_index = 0;
+  uint8_t v_buf[48] = {0};
+  __m128i v_xa = {0};
+  __m128i v_xb = {0};
+  __m128i v_xc = {0};
+  __m128i v_xd = {0};
+  __m128i v_xe = {0};
+  __m128i v_xf = {0};
+  __m128i v_xg = {0};
+  __m128i v_xh = {0};
+  __m128i v_mu1 = {0};
+  __m128i v_mu2 = {0};
+  __m128i v_mu4 = {0};
+  __m128i v_mu8 = {0};
+  __m128i v_mupx = {0};
   v_s = (18446744073709551615u ^ self->private_impl.f_state);
   while ((((uint64_t)(a_x.len)) > 0u) && ((15u & ((uint32_t)(0xFFFu & (uintptr_t)(a_x.ptr)))) != 0u)) {
     v_s = (WUFFS_CRC64__ECMA_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ a_x.ptr[0u]))] ^ (v_s >> 8u));
     a_x = wuffs_base__slice_u8__subslice_i(a_x, 1u);
   }
-  if (((uint64_t)(a_x.len)) < 32u) {
-    {
-      wuffs_base__slice_u8 i_slice_p = a_x;
-      v_p.ptr = i_slice_p.ptr;
-      v_p.len = 1;
-      const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(i_slice_p.ptr, i_slice_p.len);
-      while (v_p.ptr < i_end0_p) {
-        v_s = (WUFFS_CRC64__ECMA_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ v_p.ptr[0u]))] ^ (v_s >> 8u));
-        v_p.ptr += 1;
-      }
-      v_p.len = 0;
-    }
-    self->private_impl.f_state = (18446744073709551615u ^ v_s);
-    return wuffs_base__make_empty_struct();
-  }
-  v_s0 = _mm_cvtsi64_si128((int64_t)(v_s));
-  v_s0_707F = _mm_shuffle_epi8(v_s0, _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__SHUFFLE_707F)));
-  v_s0_8F80 = _mm_shuffle_epi8(v_s0, _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__SHUFFLE_8F80)));
-  v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u));
-  a_x = wuffs_base__slice_u8__subslice_i(a_x, 16u);
-  v_k3k4 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__ECMA_X86_SSE42_K3K4));
-  v_t0 = _mm_xor_si128(v_s0_707F, v_x0);
-  v_t1 = _mm_clmulepi64_si128(v_t0, v_k3k4, (int32_t)(0u));
-  v_t2 = _mm_clmulepi64_si128(v_t0, v_k3k4, (int32_t)(17u));
-  v_aa = _mm_xor_si128(_mm_xor_si128(v_t1, v_t2), v_s0_8F80);
-  while (((uint64_t)(a_x.len)) >= 32u) {
-    v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u));
-    a_x = wuffs_base__slice_u8__subslice_i(a_x, 16u);
-    v_u0 = _mm_xor_si128(v_aa, v_x0);
-    v_u1 = _mm_clmulepi64_si128(v_u0, v_k3k4, (int32_t)(0u));
-    v_u2 = _mm_clmulepi64_si128(v_u0, v_k3k4, (int32_t)(17u));
-    v_aa = _mm_xor_si128(v_u1, v_u2);
-  }
-  if (((uint64_t)(a_x.len)) < 16u) {
-    return wuffs_base__make_empty_struct();
-  }
-  v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u));
-  a_x = wuffs_base__slice_u8__subslice_i(a_x, 16u);
-  v_v0 = _mm_xor_si128(v_aa, v_x0);
-  v_v1 = _mm_clmulepi64_si128(v_v0, v_k3k4, (int32_t)(16u));
-  v_aa = _mm_xor_si128(v_v1, _mm_srli_si128(v_v0, (int32_t)(8u)));
-  v_pxmu = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__ECMA_X86_SSE42_PXMU));
-  v_w1 = _mm_clmulepi64_si128(v_aa, v_pxmu, (int32_t)(16u));
-  v_w2 = _mm_clmulepi64_si128(v_w1, v_pxmu, (int32_t)(0u));
-  v_s = ((uint64_t)(_mm_extract_epi64(_mm_xor_si128(v_aa, _mm_xor_si128(v_w2, _mm_slli_si128(v_w1, (int32_t)(8u)))), (int32_t)(1u))));
-  v_tail_index = (((uint64_t)(a_x.len)) & 18446744073709551600u);
-  if (v_tail_index < ((uint64_t)(a_x.len))) {
-    {
-      wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, v_tail_index);
-      v_p.ptr = i_slice_p.ptr;
-      v_p.len = 1;
-      const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(i_slice_p.ptr, i_slice_p.len);
-      while (v_p.ptr < i_end0_p) {
-        v_s = (WUFFS_CRC64__ECMA_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ v_p.ptr[0u]))] ^ (v_s >> 8u));
-        v_p.ptr += 1;
-      }
-      v_p.len = 0;
-    }
-  }
+  do {
+    do {
+      if (((uint64_t)(a_x.len)) >= 128u) {
+      } else if (((uint64_t)(a_x.len)) >= 64u) {
+        v_xa = _mm_xor_si128(_mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u)), _mm_cvtsi64_si128((int64_t)(v_s)));
+        v_xb = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u));
+        v_xc = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32u));
+        v_xd = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48u));
+        a_x = wuffs_base__slice_u8__subslice_i(a_x, 64u);
+        break;
+      } else if (((uint64_t)(a_x.len)) >= 32u) {
+        v_xa = _mm_xor_si128(_mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u)), _mm_cvtsi64_si128((int64_t)(v_s)));
+        v_xb = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u));
+        a_x = wuffs_base__slice_u8__subslice_i(a_x, 32u);
+        goto label__chain2__break;
+      } else {
+        {
+          wuffs_base__slice_u8 i_slice_p = a_x;
+          v_p.ptr = i_slice_p.ptr;
+          v_p.len = 1;
+          const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(i_slice_p.ptr, i_slice_p.len);
+          while (v_p.ptr < i_end0_p) {
+            v_s = (WUFFS_CRC64__ECMA_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ v_p.ptr[0u]))] ^ (v_s >> 8u));
+            v_p.ptr += 1;
+          }
+          v_p.len = 0;
+        }
+        self->private_impl.f_state = (18446744073709551615u ^ v_s);
+        return wuffs_base__make_empty_struct();
+      }
+      v_xa = _mm_xor_si128(_mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u)), _mm_cvtsi64_si128((int64_t)(v_s)));
+      v_xb = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u));
+      v_xc = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32u));
+      v_xd = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48u));
+      v_xe = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 64u));
+      v_xf = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 80u));
+      v_xg = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 96u));
+      v_xh = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 112u));
+      a_x = wuffs_base__slice_u8__subslice_i(a_x, 128u);
+      v_mu8 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__ECMA_X86_SSE42_FOLD8));
+      while (((uint64_t)(a_x.len)) >= 128u) {
+        v_xa = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xa, v_mu8, (int32_t)(0u)), _mm_clmulepi64_si128(v_xa, v_mu8, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u)));
+        v_xb = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xb, v_mu8, (int32_t)(0u)), _mm_clmulepi64_si128(v_xb, v_mu8, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u)));
+        v_xc = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xc, v_mu8, (int32_t)(0u)), _mm_clmulepi64_si128(v_xc, v_mu8, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32u)));
+        v_xd = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xd, v_mu8, (int32_t)(0u)), _mm_clmulepi64_si128(v_xd, v_mu8, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48u)));
+        v_xe = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xe, v_mu8, (int32_t)(0u)), _mm_clmulepi64_si128(v_xe, v_mu8, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 64u)));
+        v_xf = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xf, v_mu8, (int32_t)(0u)), _mm_clmulepi64_si128(v_xf, v_mu8, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 80u)));
+        v_xg = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xg, v_mu8, (int32_t)(0u)), _mm_clmulepi64_si128(v_xg, v_mu8, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 96u)));
+        v_xh = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xh, v_mu8, (int32_t)(0u)), _mm_clmulepi64_si128(v_xh, v_mu8, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 112u)));
+        a_x = wuffs_base__slice_u8__subslice_i(a_x, 128u);
+      }
+      v_mu4 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__ECMA_X86_SSE42_FOLD4));
+      v_xa = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xa, v_mu4, (int32_t)(0u)), _mm_clmulepi64_si128(v_xa, v_mu4, (int32_t)(17u))), v_xe);
+      v_xb = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xb, v_mu4, (int32_t)(0u)), _mm_clmulepi64_si128(v_xb, v_mu4, (int32_t)(17u))), v_xf);
+      v_xc = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xc, v_mu4, (int32_t)(0u)), _mm_clmulepi64_si128(v_xc, v_mu4, (int32_t)(17u))), v_xg);
+      v_xd = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xd, v_mu4, (int32_t)(0u)), _mm_clmulepi64_si128(v_xd, v_mu4, (int32_t)(17u))), v_xh);
+      if (((uint64_t)(a_x.len)) > 64u) {
+        v_xa = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xa, v_mu4, (int32_t)(0u)), _mm_clmulepi64_si128(v_xa, v_mu4, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u)));
+        v_xb = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xb, v_mu4, (int32_t)(0u)), _mm_clmulepi64_si128(v_xb, v_mu4, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u)));
+        v_xc = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xc, v_mu4, (int32_t)(0u)), _mm_clmulepi64_si128(v_xc, v_mu4, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32u)));
+        v_xd = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xd, v_mu4, (int32_t)(0u)), _mm_clmulepi64_si128(v_xd, v_mu4, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48u)));
+        a_x = wuffs_base__slice_u8__subslice_i(a_x, 64u);
+      }
+    } while (0);
+    v_mu2 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__ECMA_X86_SSE42_FOLD2));
+    v_xa = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xa, v_mu2, (int32_t)(0u)), _mm_clmulepi64_si128(v_xa, v_mu2, (int32_t)(17u))), v_xc);
+    v_xb = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xb, v_mu2, (int32_t)(0u)), _mm_clmulepi64_si128(v_xb, v_mu2, (int32_t)(17u))), v_xd);
+    if (((uint64_t)(a_x.len)) > 32u) {
+      v_xa = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xa, v_mu2, (int32_t)(0u)), _mm_clmulepi64_si128(v_xa, v_mu2, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u)));
+      v_xb = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xb, v_mu2, (int32_t)(0u)), _mm_clmulepi64_si128(v_xb, v_mu2, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u)));
+      a_x = wuffs_base__slice_u8__subslice_i(a_x, 32u);
+    }
+  } while (0);
+  label__chain2__break:;
+  v_mu1 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__ECMA_X86_SSE42_FOLD1));
+  v_xa = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xa, v_mu1, (int32_t)(0u)), _mm_clmulepi64_si128(v_xa, v_mu1, (int32_t)(17u))), v_xb);
+  if (((uint64_t)(a_x.len)) > 24u) {
+    v_xa = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xa, v_mu1, (int32_t)(0u)), _mm_clmulepi64_si128(v_xa, v_mu1, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u)));
+    a_x = wuffs_base__slice_u8__subslice_i(a_x, 16u);
+    if (((uint64_t)(a_x.len)) > 24u) {
+      return wuffs_base__make_empty_struct();
+    }
+  }
+  _mm_storeu_si128((__m128i*)(void*)(v_buf + (24u - ((uint64_t)(a_x.len)))), v_xa);
+  wuffs_private_impl__slice_u8__copy_from_slice(wuffs_base__make_slice_u8_ij(v_buf, ((24u - ((uint64_t)(a_x.len))) + 16u), 48), a_x);
+  v_mu2 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__ECMA_X86_SSE42_FOLD2));
+  v_xa = _mm_lddqu_si128((const __m128i*)(const void*)(v_buf + 0u));
+  v_xb = _mm_lddqu_si128((const __m128i*)(const void*)(v_buf + 16u));
+  v_xc = _mm_lddqu_si128((const __m128i*)(const void*)(v_buf + 32u));
+  v_xd = _mm_xor_si128(_mm_clmulepi64_si128(v_xa, v_mu2, (int32_t)(0u)), _mm_clmulepi64_si128(v_xa, v_mu2, (int32_t)(17u)));
+  v_xe = _mm_xor_si128(_mm_clmulepi64_si128(v_xb, v_mu1, (int32_t)(0u)), _mm_clmulepi64_si128(v_xb, v_mu1, (int32_t)(17u)));
+  v_xa = _mm_xor_si128(v_xd, _mm_xor_si128(v_xe, v_xc));
+  v_mupx = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__ECMA_X86_SSE42_MUPX));
+  v_xb = _mm_clmulepi64_si128(v_xa, v_mupx, (int32_t)(0u));
+  v_xc = _mm_clmulepi64_si128(v_xb, v_mupx, (int32_t)(16u));
+  v_s = ((uint64_t)(_mm_extract_epi64(_mm_xor_si128(_mm_xor_si128(v_xc, _mm_slli_si128(v_xb, (int32_t)(8u))), v_xa), (int32_t)(1u))));
   self->private_impl.f_state = (18446744073709551615u ^ v_s);
   return wuffs_base__make_empty_struct();
 }
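For readers skimming the generated code, here is a minimal standalone C sketch (an illustration, not the generated code itself) of the final Barrett reduction above, assuming x_acc is the folded 128-bit remainder and mu_px holds μ' in its low 64-bit lane and Px' in its high lane, as in WUFFS_CRC64__ECMA_X86_SSE42_MUPX:

#include <immintrin.h>
#include <stdint.h>

// Reduce a 128-bit carry-less remainder to the 64-bit CRC value.
// This mirrors the v_mupx / v_xb / v_xc lines in the generated code above.
static inline uint64_t crc64_barrett_reduce(__m128i x_acc, __m128i mu_px) {
  __m128i t1 = _mm_clmulepi64_si128(x_acc, mu_px, 0x00);  // x_acc.lo64 * mu'
  __m128i t2 = _mm_clmulepi64_si128(t1, mu_px, 0x10);     // t1.lo64 * Px'
  __m128i t3 = _mm_xor_si128(_mm_xor_si128(t2, _mm_slli_si128(t1, 8)), x_acc);
  return ((uint64_t)(_mm_extract_epi64(t3, 1)));  // the high 64 bits are the CRC
}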


@@ -105,10 +105,14 @@ func calcMu(name string) {
 }
 func main() {
 	show("Px'", px)
-	calcKn("k1'", 512+64)
-	calcKn("k2'", 512)
-	calcKn("k3'", 128+64)
-	calcKn("k4'", 128)
+	calcKn("fold1a'", 128+64)
+	calcKn("fold1b'", 128)
+	calcKn("fold2a'", 256+64)
+	calcKn("fold2b'", 256)
+	calcKn("fold4a'", 512+64)
+	calcKn("fold4b'", 512)
+	calcKn("fold8a'", 1024+64)
+	calcKn("fold8b'", 1024)
 	calcMu("μ' ")
 	show("Px'", px)
 }
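For context: the calcKn arguments are fold distances in bits. Folding an accumulator forward by k 16-byte blocks multiplies its two 64-bit halves by x^(128·k + 64) and x^(128·k) modulo Px (in the bit-reflected domain), which is why fold1 uses 128+64 and 128, fold2 uses 256+64 and 256, and so on up to fold8.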


@@ -12,6 +12,8 @@
 // Like std/crc32's x86 SIMD implementation, this is based on Gopal et al.
 // "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction".
+// and further optimized a la
+// https://github.com/JoernEngel/joernblog/blob/778f0007b9a580e477608691f1aa86369f0efdd2/crc64.c
 pri func ecma_hasher.up_x86_sse42!(x: roslice base.u8),
     choose cpu_arch >= x86_sse42,
@@ -19,26 +21,22 @@ pri func ecma_hasher.up_x86_sse42!(x: roslice base.u8),
     var s : base.u64
     var p : roslice base.u8
-    var util : base.x86_sse42_utility
-    var s0 : base.x86_m128i
-    var s0_707F : base.x86_m128i
-    var s0_8F80 : base.x86_m128i
-    var x0 : base.x86_m128i
-    var aa : base.x86_m128i
-    var k3k4 : base.x86_m128i
-    var t0 : base.x86_m128i
-    var t1 : base.x86_m128i
-    var t2 : base.x86_m128i
-    var u0 : base.x86_m128i
-    var u1 : base.x86_m128i
-    var u2 : base.x86_m128i
-    var v0 : base.x86_m128i
-    var v1 : base.x86_m128i
-    var pxmu : base.x86_m128i
-    var w1 : base.x86_m128i
-    var w2 : base.x86_m128i
-    var tail_index : base.u64
+    var buf : array[48] base.u8
+    var util : base.x86_sse42_utility
+    var xa : base.x86_m128i
+    var xb : base.x86_m128i
+    var xc : base.x86_m128i
+    var xd : base.x86_m128i
+    var xe : base.x86_m128i
+    var xf : base.x86_m128i
+    var xg : base.x86_m128i
+    var xh : base.x86_m128i
+    var mu1 : base.x86_m128i
+    var mu2 : base.x86_m128i
+    var mu4 : base.x86_m128i
+    var mu8 : base.x86_m128i
+    var mupx : base.x86_m128i
     s = 0xFFFF_FFFF_FFFF_FFFF ^ this.state
@@ -48,8 +46,31 @@ pri func ecma_hasher.up_x86_sse42!(x: roslice base.u8),
         args.x = args.x[1 ..]
     } endwhile
-    // For short inputs, just do a simple loop.
-    if args.x.length() < 0x20 {
+    while.chain2 true {{
+    while.chain4 true {{
+        if args.x.length() >= 0x80 {
+            // No-op.
+        } else if args.x.length() >= 0x40 {
+            // Set up 4 accumulators.
+            xa = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])._mm_xor_si128(b:
+                    util.make_m128i_single_u64(a: s))
+            xb = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
+            xc = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
+            xd = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])
+            args.x = args.x[0x40 ..]
+            break.chain4
+        } else if args.x.length() >= 0x20 {
+            // Set up 2 accumulators.
+            xa = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])._mm_xor_si128(b:
+                    util.make_m128i_single_u64(a: s))
+            xb = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
+            args.x = args.x[0x20 ..]
+            break.chain2
+        } else {
+            // For short inputs, just do a simple loop.
             iterate (p = args.x)(length: 1, advance: 1, unroll: 1) {
                 s = ECMA_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
             }
@@ -57,97 +78,174 @@ pri func ecma_hasher.up_x86_sse42!(x: roslice base.u8),
             return nothing
         }
-    // Process N 16-byte chunks (1 on ramp, N-2 main loop, 1 off ramp). The
-    // algorithm is loosely based on https://github.com/rawrunprotected/crc but
-    // modified to always start and end on 16-byte alignment boundaries. It
-    // also doesn't crash on zero-length input.
-
-    // On ramp.
-    s0 = util.make_m128i_single_u64(a: s)
-    s0_707F = s0._mm_shuffle_epi8(b: util.make_m128i_slice128(a: SHUFFLE_707F[.. 16]))
-    s0_8F80 = s0._mm_shuffle_epi8(b: util.make_m128i_slice128(a: SHUFFLE_8F80[.. 16]))
-    x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
-    args.x = args.x[0x10 ..]
-    k3k4 = util.make_m128i_slice128(a: ECMA_X86_SSE42_K3K4[.. 16])
-    t0 = s0_707F._mm_xor_si128(b: x0)
-    t1 = t0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
-    t2 = t0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
-    aa = t1._mm_xor_si128(b: t2)._mm_xor_si128(b: s0_8F80)
-
-    // Main loop.
-    while args.x.length() >= 0x20 {
-        x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
-        args.x = args.x[0x10 ..]
-        u0 = aa._mm_xor_si128(b: x0)
-        u1 = u0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
-        u2 = u0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
-        aa = u1._mm_xor_si128(b: u2)
-    } endwhile
-
-    // Off ramp.
-    if args.x.length() < 0x10 {
-        // Unreachable, but the "if" is needed by the bounds checker.
-        return nothing
-    }
-    x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
-    args.x = args.x[0x10 ..]
-    v0 = aa._mm_xor_si128(b: x0)
-    v1 = v0._mm_clmulepi64_si128(b: k3k4, imm8: 0x10)
-    aa = v1._mm_xor_si128(b: v0._mm_srli_si128(imm8: 8))
-
-    // Barrett reduction.
-    pxmu = util.make_m128i_slice128(a: ECMA_X86_SSE42_PXMU[.. 16])
-    w1 = aa._mm_clmulepi64_si128(b: pxmu, imm8: 0x10)
-    w2 = w1._mm_clmulepi64_si128(b: pxmu, imm8: 0x00)
-    s = aa._mm_xor_si128(b:
-            w2._mm_xor_si128(b:
-            w1._mm_slli_si128(imm8: 8)))._mm_extract_epi64(imm8: 1)
-
-    // Handle the tail of args.x that wasn't a complete 16-byte chunk.
-    tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFF0  // And-not 16.
-    if tail_index < args.x.length() {
-        iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) {
-            s = ECMA_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
-        }
-    }
+        // Set up 8 accumulators.
+        xa = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])._mm_xor_si128(b:
+                util.make_m128i_single_u64(a: s))
+        xb = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
+        xc = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
+        xd = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])
+        xe = util.make_m128i_slice128(a: args.x[0x40 .. 0x50])
+        xf = util.make_m128i_slice128(a: args.x[0x50 .. 0x60])
+        xg = util.make_m128i_slice128(a: args.x[0x60 .. 0x70])
+        xh = util.make_m128i_slice128(a: args.x[0x70 .. 0x80])
+        args.x = args.x[0x80 ..]
+
+        mu8 = util.make_m128i_slice128(a: ECMA_X86_SSE42_FOLD8[.. 16])
+        while args.x.length() >= 0x80 {
+            xa = xa._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
+                    xa._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
+                    util.make_m128i_slice128(a: args.x[0x00 .. 0x10]))
+            xb = xb._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
+                    xb._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
+                    util.make_m128i_slice128(a: args.x[0x10 .. 0x20]))
+            xc = xc._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
+                    xc._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
+                    util.make_m128i_slice128(a: args.x[0x20 .. 0x30]))
+            xd = xd._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
+                    xd._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
+                    util.make_m128i_slice128(a: args.x[0x30 .. 0x40]))
+            xe = xe._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
+                    xe._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
+                    util.make_m128i_slice128(a: args.x[0x40 .. 0x50]))
+            xf = xf._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
+                    xf._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
+                    util.make_m128i_slice128(a: args.x[0x50 .. 0x60]))
+            xg = xg._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
+                    xg._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
+                    util.make_m128i_slice128(a: args.x[0x60 .. 0x70]))
+            xh = xh._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
+                    xh._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
+                    util.make_m128i_slice128(a: args.x[0x70 .. 0x80]))
+            args.x = args.x[0x80 ..]
+        } endwhile
+
+        // Reduce 8 chains down to 4.
+        mu4 = util.make_m128i_slice128(a: ECMA_X86_SSE42_FOLD4[.. 16])
+        xa = xa._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
+                xa._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
+                xe)
+        xb = xb._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
+                xb._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
+                xf)
+        xc = xc._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
+                xc._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
+                xg)
+        xd = xd._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
+                xd._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
+                xh)
+        if args.x.length() > 0x40 {
+            xa = xa._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
+                    xa._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
+                    util.make_m128i_slice128(a: args.x[0x00 .. 0x10]))
+            xb = xb._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
+                    xb._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
+                    util.make_m128i_slice128(a: args.x[0x10 .. 0x20]))
+            xc = xc._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
+                    xc._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
+                    util.make_m128i_slice128(a: args.x[0x20 .. 0x30]))
+            xd = xd._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
+                    xd._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
+                    util.make_m128i_slice128(a: args.x[0x30 .. 0x40]))
+            args.x = args.x[0x40 ..]
+        }
+        break.chain4
+    }} endwhile.chain4
+
+    // Reduce 4 chains down to 2.
+    mu2 = util.make_m128i_slice128(a: ECMA_X86_SSE42_FOLD2[.. 16])
+    xa = xa._mm_clmulepi64_si128(b: mu2, imm8: 0x00)._mm_xor_si128(b:
+            xa._mm_clmulepi64_si128(b: mu2, imm8: 0x11))._mm_xor_si128(b:
+            xc)
+    xb = xb._mm_clmulepi64_si128(b: mu2, imm8: 0x00)._mm_xor_si128(b:
+            xb._mm_clmulepi64_si128(b: mu2, imm8: 0x11))._mm_xor_si128(b:
+            xd)
+    if args.x.length() > 0x20 {
+        xa = xa._mm_clmulepi64_si128(b: mu2, imm8: 0x00)._mm_xor_si128(b:
+                xa._mm_clmulepi64_si128(b: mu2, imm8: 0x11))._mm_xor_si128(b:
+                util.make_m128i_slice128(a: args.x[0x00 .. 0x10]))
+        xb = xb._mm_clmulepi64_si128(b: mu2, imm8: 0x00)._mm_xor_si128(b:
+                xb._mm_clmulepi64_si128(b: mu2, imm8: 0x11))._mm_xor_si128(b:
+                util.make_m128i_slice128(a: args.x[0x10 .. 0x20]))
+        args.x = args.x[0x20 ..]
+    }
+    break.chain2
+    }} endwhile.chain2
+
+    // Reduce 2 chains down to 1.
+    mu1 = util.make_m128i_slice128(a: ECMA_X86_SSE42_FOLD1[.. 16])
+    xa = xa._mm_clmulepi64_si128(b: mu1, imm8: 0x00)._mm_xor_si128(b:
+            xa._mm_clmulepi64_si128(b: mu1, imm8: 0x11))._mm_xor_si128(b:
+            xb)
+    if args.x.length() > 24 {
+        xa = xa._mm_clmulepi64_si128(b: mu1, imm8: 0x00)._mm_xor_si128(b:
+                xa._mm_clmulepi64_si128(b: mu1, imm8: 0x11))._mm_xor_si128(b:
+                util.make_m128i_slice128(a: args.x[0x00 .. 0x10]))
+        args.x = args.x[0x10 ..]
+        if args.x.length() > 24 {
+            return nothing  // Unreachable.
+        }
+    }
+    assert args.x.length() <= 24
+
+    // Copy our accumulator xa and the remaining data to buf, a 48-byte buffer.
+    // The buffer includes the implicitly appended 8 bytes for the CRC.
+    //
+    // | 0-padding  | xa       | args.x     | 0-padding |
+    // | 0-24 bytes | 16 bytes | 0-24 bytes | 8 bytes   |
+    //
+    assert (24 - args.x.length()) <= ((24 - args.x.length()) + 16) via "a <= (a + b): 0 <= b"()
+    xa.store_slice128!(a: buf[24 - args.x.length() .. (24 - args.x.length()) + 16])
+    buf[(24 - args.x.length()) + 16 ..].copy_from_slice!(s: args.x)
+
+    // Reduce the 48-byte buffer down to a 16-byte (128-bit) accumulator.
+    mu2 = util.make_m128i_slice128(a: ECMA_X86_SSE42_FOLD2[.. 16])
+    xa = util.make_m128i_slice128(a: buf[0x00 .. 0x10])
+    xb = util.make_m128i_slice128(a: buf[0x10 .. 0x20])
+    xc = util.make_m128i_slice128(a: buf[0x20 .. 0x30])
+    xd = xa._mm_clmulepi64_si128(b: mu2, imm8: 0x00)._mm_xor_si128(b:
+            xa._mm_clmulepi64_si128(b: mu2, imm8: 0x11))
+    xe = xb._mm_clmulepi64_si128(b: mu1, imm8: 0x00)._mm_xor_si128(b:
+            xb._mm_clmulepi64_si128(b: mu1, imm8: 0x11))
+    xa = xd._mm_xor_si128(b: xe._mm_xor_si128(b: xc))
+
+    // Reduce 128-bit to 64-bit via Barrett reduction.
+    mupx = util.make_m128i_slice128(a: ECMA_X86_SSE42_MUPX[.. 16])
+    xb = xa._mm_clmulepi64_si128(b: mupx, imm8: 0x00)
+    xc = xb._mm_clmulepi64_si128(b: mupx, imm8: 0x10)
+    s = xc._mm_xor_si128(b: xb._mm_slli_si128(imm8: 8)).
+            _mm_xor_si128(b: xa)._mm_extract_epi64(imm8: 1)
 
     this.state = 0xFFFF_FFFF_FFFF_FFFF ^ s
 }
 
-pri const SHUFFLE_707F : roarray[16] base.u8 = [
-    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
-    0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
-]
-
-pri const SHUFFLE_8F80 : roarray[16] base.u8 = [
-    0x8F, 0x8E, 0x8D, 0x8C, 0x8B, 0x8A, 0x89, 0x88,
-    0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80,
-]
-
 // These constants are reproduced by
 // script/print-crc64-x86-sse42-magic-numbers.go
 //
 // The rkN names match the numbers at
 // https://github.com/intel/isa-l/blob/7b30857e20b84e5afab1a28291189b9dc571110d/crc/crc64_ecma_refl_by16_10.asm#L33-L56
-//
-// pri const ECMA_X86_SSE42_K1K2 : roarray[16] base.u8 = [
-//     0xF3, 0x41, 0xD4, 0x9D, 0xBB, 0xEF, 0xE3, 0x6A,  // k1' = 0x6AE3_EFBB_9DD4_41F3 = rk16
-//     0xF4, 0x2D, 0x84, 0xA7, 0x54, 0x60, 0x1F, 0x08,  // k2' = 0x081F_6054_A784_2DF4 = rk15
-// ]
-
-pri const ECMA_X86_SSE42_K3K4 : roarray[16] base.u8 = [
-    0xE4, 0x3A, 0x39, 0xCA, 0x97, 0xD4, 0x5D, 0xE0,  // k3' = 0xE05D_D497_CA39_3AE4 = rk2
-    0x40, 0x5F, 0x87, 0xC7, 0xAF, 0x95, 0xBE, 0xDA,  // k4' = 0xDABE_95AF_C787_5F40 = rk1
-]
-
-pri const ECMA_X86_SSE42_PXMU : roarray[16] base.u8 = [
-    0x85, 0x1E, 0x0E, 0xAF, 0x2B, 0xAF, 0xD8, 0x92,  // Px' = 0x92D8_AF2B_AF0E_1E85 = rk8
-    0xD5, 0x63, 0x29, 0x17, 0x6C, 0x46, 0x3E, 0x9C,  // μ' = 0x9C3E_466C_1729_63D5 = rk7
-]
+pri const ECMA_X86_SSE42_FOLD1 : roarray[16] base.u8 = [
+    0xE4, 0x3A, 0x39, 0xCA, 0x97, 0xD4, 0x5D, 0xE0,  // fold1a' = 0xE05D_D497_CA39_3AE4 = rk2
+    0x40, 0x5F, 0x87, 0xC7, 0xAF, 0x95, 0xBE, 0xDA,  // fold1b' = 0xDABE_95AF_C787_5F40 = rk1
+]
+
+pri const ECMA_X86_SSE42_FOLD2 : roarray[16] base.u8 = [
+    0x44, 0xFA, 0x9E, 0x8A, 0x00, 0x5B, 0x09, 0x60,  // fold2a' = 0x6009_5B00_8A9E_FA44 = rk20
+    0x51, 0xAF, 0xE1, 0x0F, 0xA3, 0x53, 0xE6, 0x3B,  // fold2b' = 0x3BE6_53A3_0FE1_AF51 = rk19
+]
+
+pri const ECMA_X86_SSE42_FOLD4 : roarray[16] base.u8 = [
+    0xF3, 0x41, 0xD4, 0x9D, 0xBB, 0xEF, 0xE3, 0x6A,  // fold4a' = 0x6AE3_EFBB_9DD4_41F3 = rk16
+    0xF4, 0x2D, 0x84, 0xA7, 0x54, 0x60, 0x1F, 0x08,  // fold4b' = 0x081F_6054_A784_2DF4 = rk15
+]
+
+pri const ECMA_X86_SSE42_FOLD8 : roarray[16] base.u8 = [
+    0x00, 0x10, 0xCC, 0x4F, 0x1D, 0xD7, 0x57, 0x87,  // fold8a' = 0x8757_D71D_4FCC_1000 = rk4
+    0x40, 0xE7, 0x3D, 0xF7, 0x2A, 0x6B, 0xD8, 0xD7,  // fold8b' = 0xD7D8_6B2A_F73D_E740 = rk3
+]
+
+pri const ECMA_X86_SSE42_MUPX : roarray[16] base.u8 = [
+    0xD5, 0x63, 0x29, 0x17, 0x6C, 0x46, 0x3E, 0x9C,  // μ' = 0x9C3E_466C_1729_63D5 = rk7
+    0x85, 0x1E, 0x0E, 0xAF, 0x2B, 0xAF, 0xD8, 0x92,  // Px' = 0x92D8_AF2B_AF0E_1E85 = rk8
+]
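Why eight chains: PCLMULQDQ has multi-cycle latency but good throughput, so eight independent accumulators (the joernblog optimization linked above) keep the multiplier pipeline busy instead of serializing on one chain. A minimal standalone C sketch (an illustration, not the wuffs output) of the main loop's shape, assuming x[0 .. 8) were seeded from the first 128 bytes and fold8 holds the fold8a'/fold8b' constants:

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

// One fold step: multiply both 64-bit halves of the accumulator by the
// matching fold constants, then XOR in the next 16 bytes of input.
static inline __m128i fold_16_bytes(__m128i acc, __m128i k, const uint8_t* p) {
  return _mm_xor_si128(
      _mm_xor_si128(_mm_clmulepi64_si128(acc, k, 0x00),   // acc.lo64 * k.lo64
                    _mm_clmulepi64_si128(acc, k, 0x11)),  // acc.hi64 * k.hi64
      _mm_lddqu_si128((const __m128i*)(const void*)p));
}

// Eight independent dependency chains: each iteration of chain i depends
// only on chain i's previous value, so the clmuls overlap in the pipeline.
static void fold_by_8(__m128i x[8], __m128i fold8, const uint8_t** pp, size_t* len) {
  while (*len >= 128) {
    for (size_t i = 0; i < 8; i++) {
      x[i] = fold_16_bytes(x[i], fold8, *pp + (16 * i));
    }
    *pp += 128;
    *len -= 128;
  }
}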


@@ -377,7 +377,7 @@ bench_wuffs_crc64_ecma_10k() {
   return do_bench_io_buffers(
       wuffs_bench_crc64_ecma,
       WUFFS_INITIALIZE__LEAVE_INTERNAL_BUFFERS_UNINITIALIZED, tcounter_src,
-      &g_crc64_midsummer_gt, UINT64_MAX, UINT64_MAX, 200);
+      &g_crc64_midsummer_gt, UINT64_MAX, UINT64_MAX, 2000);
 }
 
 const char* //
@@ -386,7 +386,7 @@ bench_wuffs_crc64_ecma_100k() {
   return do_bench_io_buffers(
       wuffs_bench_crc64_ecma,
       WUFFS_INITIALIZE__LEAVE_INTERNAL_BUFFERS_UNINITIALIZED, tcounter_src,
-      &g_crc64_pi_gt, UINT64_MAX, UINT64_MAX, 20);
+      &g_crc64_pi_gt, UINT64_MAX, UINT64_MAX, 200);
 }
// ---------------- Mimic Benches