diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl index 8dc06dd52a..f38a64bc6b 100755 --- a/crypto/arm64cpuid.pl +++ b/crypto/arm64cpuid.pl @@ -120,6 +120,14 @@ _armv8_sve2_probe: ret .size _armv8_sve2_probe,.-_armv8_sve2_probe +.globl _armv8_sve_get_vl_bytes +.type _armv8_sve_get_vl_bytes,%function +_armv8_sve_get_vl_bytes: + AARCH64_VALID_CALL_TARGET + .inst 0x0420e3e0 // cntb x0 + ret +.size _armv8_sve_get_vl_bytes,.-_armv8_sve_get_vl_bytes + .globl _armv8_cpuid_probe .type _armv8_cpuid_probe,%function _armv8_cpuid_probe: diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h index fc780a7080..b037e1b9f1 100644 --- a/crypto/arm_arch.h +++ b/crypto/arm_arch.h @@ -86,9 +86,10 @@ extern unsigned int OPENSSL_armv8_rsa_neonized; # define ARMV8_SHA3 (1<<11) # define ARMV8_UNROLL8_EOR3 (1<<12) # define ARMV8_SVE (1<<13) -# define ARMV8_SVE2 (1<<14) +# define ARMV9_SVE2 (1<<14) # define ARMV8_HAVE_SHA3_AND_WORTH_USING (1<<15) # define ARMV8_UNROLL12_EOR3 (1<<16) +# define ARMV9_SVE2_POLY1305 (1<<17) /* * MIDR_EL1 system register diff --git a/crypto/armcap.c b/crypto/armcap.c index 7eeea93bd1..84f621aeb8 100644 --- a/crypto/armcap.c +++ b/crypto/armcap.c @@ -24,11 +24,18 @@ #include #endif #include "arm_arch.h" +#ifdef __aarch64__ +#include +#endif unsigned int OPENSSL_armcap_P = 0; unsigned int OPENSSL_arm_midr = 0; unsigned int OPENSSL_armv8_rsa_neonized = 0; +#ifdef __aarch64__ +uint64_t _armv8_sve_get_vl_bytes(void); +#endif + #ifdef _WIN32 void OPENSSL_cpuid_setup(void) { @@ -346,7 +353,7 @@ void OPENSSL_cpuid_setup(void) OPENSSL_armcap_P |= ARMV8_SVE; if (getauxval(OSSL_HWCAP2) & OSSL_HWCAP2_SVE2) - OPENSSL_armcap_P |= ARMV8_SVE2; + OPENSSL_armcap_P |= ARMV9_SVE2; if (getauxval(OSSL_HWCAP2) & OSSL_HWCAP2_RNG) OPENSSL_armcap_P |= ARMV8_RNG; @@ -391,7 +398,7 @@ void OPENSSL_cpuid_setup(void) } # ifdef __aarch64__ OPENSSL_armcap_P |= arm_probe_for(_armv8_sve_probe, ARMV8_SVE); - OPENSSL_armcap_P |= arm_probe_for(_armv8_sve2_probe, ARMV8_SVE2); + OPENSSL_armcap_P |= arm_probe_for(_armv8_sve2_probe, ARMV9_SVE2); OPENSSL_armcap_P |= arm_probe_for(_armv8_rng_probe, ARMV8_RNG); # endif @@ -450,6 +457,17 @@ void OPENSSL_cpuid_setup(void) MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_QCOMM, QCOM_CPU_PART_ORYON_X1)) && (OPENSSL_armcap_P & ARMV8_SHA3)) OPENSSL_armcap_P |= ARMV8_HAVE_SHA3_AND_WORTH_USING; + if (OPENSSL_armcap_P & ARMV9_SVE2) { + uint64_t vl_bytes = _armv8_sve_get_vl_bytes(); + + if (vl_bytes > 16 && (vl_bytes & (vl_bytes - 1)) == 0) { + /* + * This implementation faster if vector length > 128 bits + * But vector length must be a power of 2 (e.g. 
256, 512 bits) + */ + OPENSSL_armcap_P |= ARMV9_SVE2_POLY1305; + } + } # endif } #endif /* _WIN32, __ARM_MAX_ARCH__ >= 7 */ diff --git a/crypto/chacha/asm/chacha-armv8-sve.pl b/crypto/chacha/asm/chacha-armv8-sve.pl index 62a8be6fe1..40454c3322 100755 --- a/crypto/chacha/asm/chacha-armv8-sve.pl +++ b/crypto/chacha/asm/chacha-armv8-sve.pl @@ -756,7 +756,7 @@ ChaCha20_ctr32_sve: mov $sve2flag,0 adrp $tmp,OPENSSL_armcap_P ldr $tmpw,[$tmp,#:lo12:OPENSSL_armcap_P] - tst $tmpw,#ARMV8_SVE2 + tst $tmpw,#ARMV9_SVE2 b.eq 1f mov $sve2flag,1 b 2f diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl index cc2052ecc9..6659cd631f 100755 --- a/crypto/poly1305/asm/poly1305-armv8.pl +++ b/crypto/poly1305/asm/poly1305-armv8.pl @@ -69,6 +69,8 @@ $code.=<<___; .globl poly1305_emit .hidden poly1305_emit +.extern poly1305_blocks_sve2 + .type poly1305_init,%function .align 5 poly1305_init: @@ -109,6 +111,13 @@ poly1305_init: csel $d0,$d0,$r0,eq csel $d1,$d1,$r1,eq + tst w17, #ARMV9_SVE2_POLY1305 + + adrp $r0,poly1305_blocks_sve2 + add $r0,$r0,#:lo12:poly1305_blocks_sve2 + + csel $d0,$d0,$r0,eq + #ifdef __ILP32__ stp w12,w13,[$len] #else diff --git a/crypto/poly1305/asm/poly1305-armv9-sve2.pl b/crypto/poly1305/asm/poly1305-armv9-sve2.pl new file mode 100755 index 0000000000..b68741fe58 --- /dev/null +++ b/crypto/poly1305/asm/poly1305-armv9-sve2.pl @@ -0,0 +1,1420 @@ +#! /usr/bin/env perl +# Copyright 2016-2025 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +############################################################################## +# +# Copyright (c) 2025, Iakov Polyak +# This file is an SVE2 port-and-merge of POLY1305 hash algorithm, derived from +# the OpenSSL Neon implementation and a vector length agnostic (VLA) +# RISC-V implementation from the CRYPTOGAMS project. +# +############################################################################## +# +# Copyright (c) 2006, CRYPTOGAMS by +# All rights reserved. +# +#Redistribution and use in source and binary forms, with or without +#modification, are permitted provided that the following conditions +#are met: +# +# * Redistributions of source code must retain copyright notices, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# * Neither the name of the CRYPTOGAMS nor the names of its +# copyright holder and contributors may be used to endorse or +# promote products derived from this software without specific +# prior written permission. +# +#ALTERNATIVELY, provided that this notice is retained in full, this +#product may be distributed under the terms of the GNU General Public +#License (GPL), in which case the provisions of the GPL apply INSTEAD OF +#those given above. +# +#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +#A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +#OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +#SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +#LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +#DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +#THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +#OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +############################################################################## +# +# September 2025 +# +# This is a 100% vector length agnostic implementation and has +# been tested with QEMU for the vector length of up to 2048 bits. +# +# On Graviton4, with the vector register length of 128 bits, +# it is less efficient than the Neon implementation by only 6%. +# This number has been obtained by running +# `openssl speed -evp ChaCha20-POLY1305` and +# `openssl speed -evp ChaCha20`, pinned to a single CPU, +# converting the 8192-byte result to cycles per byte +# using actual average runtime CPU frequency from `perf stat`, +# and taking the difference. On Graviton 4, this results in +# 0.62 cpb for Neon and 0.66 for SVE2. +# +# While Neon should probably be the default choice on a 128-bit architecture, +# speed-up is clearly expected with 256-bit and larger vector registers +# in the future. + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; +*STDOUT=*OUT; + +my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); + +my ($h0,$h1,$h2,$r0,$r1,$r2,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); + +my ($SVE_R0,$SVE_R1,$SVE_S1,$SVE_R2,$SVE_S2,$SVE_R3,$SVE_S3,$SVE_R4,$SVE_S4) = map("z$_.s",(0..8)); +my ($SVE_INlo_0,$SVE_INlo_1,$SVE_INlo_2,$SVE_INlo_3,$SVE_INlo_4) = map("z$_.s",(9..13)); +my ($SVE_INhi_0,$SVE_INhi_1,$SVE_INhi_2,$SVE_INhi_3,$SVE_INhi_4) = map("z$_.s",(14..18)); +my ($SVE_ACC0,$SVE_ACC1,$SVE_ACC2,$SVE_ACC3,$SVE_ACC4) = map("z$_.d",(19..23)); +my ($SVE_H0,$SVE_H1,$SVE_H2,$SVE_H3,$SVE_H4) = map("z$_.s",(24..28)); +my ($SVE_T0,$SVE_T1,$SVE_MASK) = map("z$_",(29..31)); + +my ($vl,$vl0,$vl1,$vl2,$vl3,$vl4) = ("x16",$h0,$h1,$h2,$r0,$r1); +my ($cs0,$cs1,$cs2,$cs3,$cs4,$cs5) = map("x$_",(19..24)); +my ($pwr,$mask) = map("x$_",(25..26)); +my $is_base2_26 = "w17"; + +$code.=<<___; +#include "arm_arch.h" + +.text + +.arch armv8-a + +.extern poly1305_blocks + +// --- poly1305_sw_2_26 --- +// Performs conversion of 3 base2_44 to 5 base2_26 scalars and +// stores them in memory at addresses [x5], [x5,#28], [x5,#56], +// [x5,#84] and [x5,#112]. +// +// This is a leaf function and does not modify stack. +// +// Calling Convention: +// Inputs: +// x5: Pointer into memory where 1st value should be stored. 
+// x7-x9: The three base2_44 scalar values (r0-r2) +// Clobbers (uses as temporaries): +// x10-x15 +.type poly1305_sw_2_26,%function +.align 5 +poly1305_sw_2_26: + // Converts 3 base2_44 -> 5 base2_26 values and stores + mov x15,#0x3ffffff // w15 : 2^26-1 mask + and x10,$r0,x15 // w10 -> r0 + lsr x11,$r0,#26 // w11 : top 18 bits of r0 + str w10,[x5] // Store r0 + bfi x11,$r1,#18,#8 // w11 -> r1 + ubfx x12,$r1,#8,#26 // w12 -> r2 + str w11,[x5,#28] // Store r1 + lsr x13,$r1,#34 // w13 : top 10 bits of r1 + str w12,[x5,#56] // Store r2 + bfi x13,$r2,#10,#16 // w13 -> r3 + lsr x14,$r2,#16 // w14 -> r4 + str w13,[x5,#84] // Store r3 + str w14,[x5,#112] // Store r4 + ret +.size poly1305_sw_2_26,.-poly1305_sw_2_26 + +// --- poly1305_sqr_2_44 --- +// Calculates base2_44 squaring operation. +// +// This is a leaf function and does not modify stack. +// It however uses callee-saved registers as scratch, so those must be +// saved on stack prior to calling. +// +// Calling Convention: +// Inputs: +// x7-x9: The three base2_44 scalar values (r0-r2) +// Outputs: +// x7-x9: The three base2_44 scalar values, squared (r0-r2) +// Clobbers (uses as temporaries): +// x10-x15, x19-x24, x26 +.type poly1305_sqr_2_44,%function +.align 5 +poly1305_sqr_2_44: + + // Pre-calculate constants and doubled terms. + mov x12,#20 + lsl x13,$r1,#1 // x13 = r1 * 2 + mul x12,$r2,x12 // x12 = r2 * 20 + lsl x10,$r0,#1 // x10 = r0 * 2 + + // --- Calculate d2 = r1*r1 + 2*r0*r2 --- + umulh $cs5,$r1,$r1 // high part of r1*r1 + mul $cs4,$r1,$r1 // low part of r1*r1 + umulh x15,x10,$r2 // high part of (r0*2)*r2 + mul x14,x10,$r2 // low part of (r0*2)*r2 + + // --- Calculate d0 = r0*r0 + 20*(2*r1*r2) --- + umulh $cs1,$r0,$r0 // high part of r0*r0 + mul $cs0,$r0,$r0 // low part of r0*r0 + umulh x11,x13,x12 // high part of (r1*2)*(r2*20) + mul x10,x13,x12 // low part of (r1*2)*(r2*20) + + adds $cs4,$cs4,x14 // d2_lo + adc $cs5,$cs5,x15 // d2_hi + + // --- Calculate d1 = 2*r0*r1 + 20*r2*r2 --- + // d1 is a 128-bit result stored in x7:x6 (hi:lo) + umulh $cs3,$r0,x13 // high part of r0*(r1*2) + mul $cs2,$r0,x13 // low part of r0*(r1*2) + umulh x13,$r2,x12 // high part of r2*(r2*20) + mul x12,$r2,x12 // low part of r2*(r2*20) + + adds $cs0,$cs0,x10 // d0_lo + adc $cs1,$cs1,x11 // d0_hi + + adds $cs2,$cs2,x12 // d1_lo + adc $cs3,$cs3,x13 // d1_hi + + // --- Reduction and Carry Propagation --- + // Reduce the 128-bit d0, d1, d2 back to three 44-bit limbs in x0, x1, x2 + lsr x10,$cs0,#44 // (d0_lo >> 44) + lsl x11,$cs1,#20 // (d0_hi << 20) - high 20 bits are zero + and $r0,$cs0,$mask // r0 -> d0_lo & mask + orr x10,x10,x11 // x10 -> 64-bit carry from d0 + + lsr x12,$cs2,#44 // (d1_lo >> 44) + lsl x13,$cs3,#20 // (d1_hi << 20) + and $r1,$cs2,$mask // r1 -> d1_lo & mask + orr x12,x12,x13 // x12 -> 64-bit carry from d1 + add $r1,$r1,x10 // r1 += carry from d0 + + lsr x11,$mask,#2 // x11 -> 2^42-1 mask for d2 reduction + lsr x10,$cs4,#42 // (d2_lo >> 42) + lsl x13,$cs5,#22 // (d2_hi << 22) + and $r2,$cs4,x11 // r2 -> d2_lo & 2^42-1 mask + orr x10,x10,x13 // x10 -> final carry from d2 + add $r2,$r2,x12 // r2 += carry from d1 + + // Handle ripple-carry from r2 and apply the *5 reduction. + lsr x13,$r2,#42 // Get carry from r2 (if r2 >= 2^42) + and $r2,$r2,x11 // Mask r2 back down to 42 bits + add x10,x10,x13 // Add this ripple-carry to the final carry + + add x11,x10,x10,lsl #2 // x11 -> final_carry * 5 + add $r0,$r0,x11 // r0 += final_carry * 5 + + // Final ripple-carry chain to ensure all limbs are 44 bits. 
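+	// (The *5 folding above relies on the Poly1305 prime 2^130-5:
+	// 2^130 == 5 (mod 2^130-5), so a carry out of the 44+44+42-bit
+	// limb set re-enters the low limb with weight 5, i.e.
+	// carry*5 = carry + (carry << 2). The two single-step passes
+	// below then propagate r1 -> r2 and r0 -> r1 once more.)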
+ lsr x11,$r1,#44 // Get carry from r1 + and $r1,$r1,$mask // Mask r1 to 44 bits + add $r2,$r2,x11 // r2 += carry from r1 + + lsr x10,$r0,#44 // Get carry from r0 + and $r0,$r0,$mask // Mask r0 to 44 bits + add $r1,$r1,x10 // r1 += carry from r0 + + ret +.size poly1305_sqr_2_44,.-poly1305_sqr_2_44 + +// --- poly1305_lazy_reduce_sve2 --- +// Performs lazy reduction on five accumulator vectors as discussed +// in "NEON crypto" by D.J. Bernstein and P. Schwabe. +// +// This is a leaf function and does not modify GPRs or the stack. +// +// Calling Convention: +// Inputs: +// z19-z23: The five 64-bit .d accumulator vectors (ACC0-ACC4) +// Outputs: +// z24-z28: The five 32-bit .s final limb vectors (H0-H4) +// z31: All-zeros (resets mask) +// Clobbers (uses as temporaries): +// z29, z30 + +.type poly1305_lazy_reduce_sve2,%function +.align 5 +poly1305_lazy_reduce_sve2: + dup ${SVE_MASK}.d,#-1 + lsr ${SVE_T0}.d,$SVE_ACC3,#26 + trn1 $SVE_H3,z22.s,z24.s // reproducing Neon's `xtn` - treat ACC3 as a .s vector + lsr ${SVE_MASK}.d,${SVE_MASK}.d,#38 + lsr ${SVE_T1}.d,$SVE_ACC0,#26 + and $SVE_ACC0,$SVE_ACC0,${SVE_MASK}.d + add $SVE_ACC4,$SVE_ACC4,${SVE_T0}.d // h3 -> h4 + // Neon's bic is replaced with &=$SVE_MASK (because of using even-indexed elements) + and z27.d,z27.d,${SVE_MASK}.d // refer to SVE_H3 as .d + add $SVE_ACC1,$SVE_ACC1,${SVE_T1}.d // h0 -> h1 + + lsr ${SVE_T0}.d,$SVE_ACC4,#26 + trn1 $SVE_H4,z23.s,z24.s // reproducing Neon's `xtn` - treat ACC4 as a .s vector + lsr ${SVE_T1}.d,$SVE_ACC1,#26 + trn1 $SVE_H1,z20.s,z24.s // reproducing Neon's `xtn` - treat ACC1 as a .s vector + and z28.d,z28.d,${SVE_MASK}.d // refer to SVE_H4 as .d + add $SVE_ACC2,$SVE_ACC2,${SVE_T1}.d // h1 -> h2 + + add $SVE_ACC0,$SVE_ACC0,${SVE_T0}.d + lsl ${SVE_T0}.d,${SVE_T0}.d,#2 + shrnb ${SVE_T1}.s,$SVE_ACC2,#26 // check it's OK + trn1 $SVE_H2,z21.s,z24.s // reproducing Neon's `xtn` - treat ACC2 as a .s vector + add $SVE_ACC0,$SVE_ACC0,${SVE_T0}.d // h4 -> h0 + and z25.d,z25.d,${SVE_MASK}.d // refer to SVE_H1 as .d + add $SVE_H3,$SVE_H3,${SVE_T1}.s // h2 -> h3 + and z26.d,z26.d,${SVE_MASK}.d // refer to SVE_H2 as .d + + shrnb ${SVE_T0}.s,$SVE_ACC0,#26 + trn1 $SVE_H0,z19.s,z24.s // reproducing Neon's `xtn` - treat ACC0 as a .s vector - re-writing H0 here... + lsr ${SVE_T1}.s,$SVE_H3,#26 + and z27.d,z27.d,${SVE_MASK}.d // refer to SVE_H3 as .d + add $SVE_H1,$SVE_H1,${SVE_T0}.s // h0 -> h1 + and z24.d,z24.d,${SVE_MASK}.d // refer to SVE_H0 as .d + add $SVE_H4,$SVE_H4,${SVE_T1}.s // h3 -> h4 + + eor ${SVE_MASK}.d,${SVE_MASK}.d,${SVE_MASK}.d // reset zero mask + + ret +.size poly1305_lazy_reduce_sve2,.-poly1305_lazy_reduce_sve2 + +// --- poly1305_blocks_sve2 --- +// Main function, implementing POLY1305 algorithm as discussed +// in "NEON crypto" by D.J. Bernstein and P. Schwabe, in a VLA fashion, +// using SVE2. +// +// It is mostly a port-and-merge of the 128-bit Neon implementation herein and +// a VLA risc-v implementation in https://github.com/dot-asm/cryptogams. +// +.globl poly1305_blocks_sve2 +.type poly1305_blocks_sve2,%function +.align 5 +poly1305_blocks_sve2: +.Lpoly1305_blocks_sve2: + AARCH64_VALID_CALL_TARGET + ldr $is_base2_26,[$ctx,#24] + // Estimate vector width and branch to scalar if input too short + cntd $vl // vector width in 64-bit lanes (vl) + lsl $vl0,$vl,#4 // vl * 16 (bytes per vector input blocks) + add $vl1,$vl0,$vl0,lsl #1 // 3 * vl * 16 - new threshold. 
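+	// (e.g. with 128-bit vectors cntd gives vl = 2, so the vector
+	// block size is vl*16 = 32 bytes and the threshold is 3*32 = 96
+	// bytes; shorter inputs fall through to the scalar
+	// poly1305_blocks unless the state is already in base 2^26)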
+ cmp $len,$vl1 + b.hs .Lblocks_sve2 + cbz $is_base2_26,.Lshort_blocks // Call scalar f-n if short; if in base 2^26 - proceed + +.Lblocks_sve2: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-144]! // Allowing for callee-saved reg-s + add x29,sp,#0 + + //Store some callee-saved GPRs + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + + ands $len,$len,#-16 + b.eq .Lno_data_sve2 + + cbz $is_base2_26,.Lbase2_64_sve2 + + ldp w10,w11,[$ctx] // load hash value base 2^26 + ldp w12,w13,[$ctx,#8] + ldr w14,[$ctx,#16] + + neg $vl1,$vl0 // - (vl * 16) + sub $vl0,$vl0,#1 // (vl * 16) - 1 + and $vl2,$len,$vl1 // $len - ($len % (vl * 16)) -> VLA length + and $vl4,$len,$vl0 // $len % (vl * 16) -> scalar remainder + cbz $vl4,.Leven_sve2 // If no scalar "head", proceed to VLA + add $vl3,$inp,$vl4 // Pointer to the start of the VLA data + stp $vl2,$vl3,[sp,#-16]! // Backup VLA length and ptr + mov $len,$vl4 // So that scalar part knows it's length + + add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 + lsr $h1,x12,#12 + adds $h0,$h0,x12,lsl#52 + add $h1,$h1,x13,lsl#14 + adc $h1,$h1,xzr + lsr $h2,x14,#24 + adds $h1,$h1,x14,lsl#40 + adc $d2,$h2,xzr // can be partially reduced... + + and $t0,$d2,#-4 // ... so reduce + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$h0,$t0 + adcs $h1,$h1,xzr + adc $h2,$h2,xzr + + stp $h0,$h1,[$ctx] // store hash value base 2^64 + str $h2,[$ctx,#16] + + bl poly1305_blocks // Calculate the scalar "head" + ldp $len,$inp,[sp],#16 // Recover updated length and input ptr + ldr x30,[sp,#8] + + cbz $padbit,.Lzero_padbit_sve2 // hash already stored in poly1305_blocks + + ldp $h0,$h1,[$ctx] // load hash value base 2^64 + ldr $h2,[$ctx,#16] + + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,$h0,#26,#26 + extr x12,$h1,$h0,#52 + and x12,x12,#0x03ffffff + ubfx x13,$h1,#14,#26 + extr x14,$h2,$h1,#40 + + cbnz $len,.Leven_sve2 + + stp w10,w11,[$ctx] // store hash value base 2^26 + stp w12,w13,[$ctx,#8] + str w14,[$ctx,#16] + b .Lno_data_sve2 + +.align 4 +.Lzero_padbit_sve2: + str xzr,[$ctx,#24] + b .Lno_data_sve2 + +.align 4 +.Lbase2_64_sve2: + neg $vl1,$vl0 // - (vl * 16) + sub $vl0,$vl0,#1 // (vl * 16) - 1 + and $vl2,$len,$vl1 // $len - ($len % (vl * 16)) -> VLA length + and $vl4,$len,$vl0 // $len % (vl * 16) -> scalar remainder + cbz $vl4,.Linit_sve2 // If no scalar "head", proceed to VLA + add $vl3,$inp,$vl4 // Pointer to the start of the VLA data + stp $vl2,$vl3,[sp,#-16]! // Backup VLA length and ptr + mov $len,$vl4 // So that scalar part knows it's length + bl poly1305_blocks // Calculate the scalar "head" + ldp $len,$inp,[sp],#16 // Recover updated length and input ptr + +.Linit_sve2: + // Calculating and storing r-powers (powers of a key). + // The layout of how r-powers are stored in memory: + ////////////////////////////////////////////////////////////////////////////////////// + // lobe 1 lobe 2 etc. // + // | .. r^{max},r^{max/2},...,r^2,r | .. r^{max},r^{max/2},...,r^2,r | .. 
// + // / \ / \ / \ // + // [$ctx,48] [$ctx,48+28] [$ctx,48+56] // + ////////////////////////////////////////////////////////////////////////////////////// + + ldr w5,[$ctx,#28] // Load top power (if exists - 0 by default) + add $pwr,$ctx,#48+28 // Point to the end of powers allocation (1st lobe) + + mov $mask,#-1 + lsr $mask,$mask,#20 //2^44-1 + + cbnz w5,.Lpwrs_precomputed + + ldp $r0,$r1,[$ctx,#32] // load key value + + lsr $r2,$r1,#24 // base2_64 -> base2_44 + extr $r1,$r1,$r0,#44 + and $r0,$r0,$mask + and $r1,$r1,$mask + + mov x4,$vl + add x5,$pwr,#-4 + bl poly1305_sw_2_26 + +.Loop_pwrs_sqr: + lsr x4,x4,#1 + add x5,x5,#-4 + bl poly1305_sqr_2_44 + bl poly1305_sw_2_26 + cbnz x4,.Loop_pwrs_sqr + + sub x5,x5,$pwr + str w5,[$ctx,#28] + +.Lpwrs_precomputed: + ldp $h0,$h1,[$ctx] // load hash value base 2^64 + ldr $h2,[$ctx,#16] + + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,$h0,#26,#26 + extr x12,$h1,$h0,#52 + and x12,x12,#0x03ffffff + ubfx x13,$h1,#14,#26 + extr x14,$h2,$h1,#40 + + stp d8,d9,[sp,#80] // meet ABI requirements + stp d10,d11,[sp,#96] + stp d12,d13,[sp,#112] + stp d14,d15,[sp,#128] + + // Zeroing H0-H4 registers + eor z24.d,z24.d,z24.d // H0 + eor z25.d,z25.d,z25.d // H1 + eor z26.d,z26.d,z26.d // H2 + eor z27.d,z27.d,z27.d // H3 + eor z28.d,z28.d,z28.d // H4 + + // Using Neon's fmov here for speed. + // We only need the low 26 bits in the first step so no need for post-mov reshuffle. + fmov d24,x10 // H0 + fmov d25,x11 // H1 + fmov d26,x12 // H2 + fmov d27,x13 // H3 + fmov d28,x14 // H4 + + ldr x30,[sp,#8] + + mov x4,#1 + stur w4,[$ctx,#24] // set is_base2_26 + b .Ldo_sve2 + +.align 4 +.Leven_sve2: + // In principle all this could be moved to Ldo_sve2 + stp d8,d9,[sp,#80] // meet ABI requirements + stp d10,d11,[sp,#96] + stp d12,d13,[sp,#112] + stp d14,d15,[sp,#128] + + eor z24.d,z24.d,z24.d // H0 + eor z25.d,z25.d,z25.d // H1 + eor z26.d,z26.d,z26.d // H2 + eor z27.d,z27.d,z27.d // H3 + eor z28.d,z28.d,z28.d // H4 + + fmov d24,x10 // H0 + fmov d25,x11 // H1 + fmov d26,x12 // H2 + fmov d27,x13 // H3 + fmov d28,x14 // H4 + +.Ldo_sve2: + ptrue p0.b, ALL // Set all-true predicate + + // Load r-powers. + // They are stored in five lobes, in the order r^{max},...,r^2,r^1 each. + // We need specific powers to be at specific R- and S-vector indices. + // Hence we can't load all of them, an arbitrary amount, dependent on VL. + // Instead we load {r^{max},r^{max/2}} and {r^2,r^1} in batches, + // and then interleave them using zip1 as {r^{max},r^2,r^{max/2},r}. + // We don't really care where r^{max} and r^{max/2} are, but we want + // r^2 and r to be in either even or odd lanes. We chose lanes 1 and 3. + // Intermediate r-powers (r^{max/4},..,r^4), if applicable, will be + // reloaded into lane 0 iteratively in Loop_reduce_sve2. + + ldr w5,[$ctx,#28] + sxtw x5,w5 // Zero-extend + add $pwr,$ctx,#48+28 // Pointer to the end of the r-powers 1st lobe + add x10,$ctx,#48+20 // Pointer to r^2. + add $pwr,$pwr,x5 // Pointer to the r^{max} + + mov x15,#2 + whilelo p1.s,xzr,x15 + + // If wouldn't need to load in two chunks, could use ld1rqw - + // optimisation potential for 256-bit vector. 
+ ld1w { $SVE_R0 },p1/z,[$pwr] + ld1w { $SVE_T0.s },p1/z,[x10] + add $pwr,$pwr,#28 + add x10,x10,#28 + zip1 $SVE_R0,$SVE_R0,$SVE_T0.s + + ld1w { $SVE_R1 },p1/z,[$pwr] + ld1w { $SVE_T1.s },p1/z,[x10] + add $pwr,$pwr,#28 + add x10,x10,#28 + zip1 $SVE_R1,$SVE_R1,$SVE_T1.s + + ld1w { $SVE_R2 },p1/z,[$pwr] + ld1w { $SVE_T0.s },p1/z,[x10] + add $pwr,$pwr,#28 + add x10,x10,#28 + zip1 $SVE_R2,$SVE_R2,$SVE_T0.s + + ld1w { $SVE_R3 },p1/z,[$pwr] + ld1w { $SVE_T1.s },p1/z,[x10] + add $pwr,$pwr,#28 + add x10,x10,#28 + zip1 $SVE_R3,$SVE_R3,$SVE_T1.s + + ld1w { $SVE_R4 },p1/z,[$pwr] + ld1w { $SVE_T0.s },p1/z,[x10] + sub $pwr,$pwr,#104 // Adjust to 1st lobe, 3d power + zip1 $SVE_R4,$SVE_R4,$SVE_T0.s + + // Broadcast r-powers loaded above to higher parts of the R-vectors. + cmp $vl,#2 + b.eq .L_skip_dup_broadcast + dup z0.q,z0.q[0] + dup z1.q,z1.q[0] + dup z3.q,z3.q[0] + dup z5.q,z5.q[0] + dup z7.q,z7.q[0] + +.L_skip_dup_broadcast: + // Calculate S-vectors (r^x*5) + adr $SVE_S1,[$SVE_R1,$SVE_R1,lsl #2] + adr $SVE_S2,[$SVE_R2,$SVE_R2,lsl #2] + adr $SVE_S3,[$SVE_R3,$SVE_R3,lsl #2] + adr $SVE_S4,[$SVE_R4,$SVE_R4,lsl #2] + + // Load initial input blocks + lsr x15,$len,#4 + whilelo p1.s,xzr,x15 // Set predicate for blocks loading + lsl $padbit,$padbit,#24 + ld4w { z9.s-z12.s },p1/z,[$inp] // Loading all blocks at once + +#ifdef __AARCH64EB__ + revb z9.s, p0/m, z9.s + revb z10.s, p0/m, z10.s + revb z11.s, p0/m, z11.s + revb z12.s, p0/m, z12.s +#endif + + // In-vector (VLA) conversion base2_64 -> base2_26. + dup ${SVE_MASK}.s,#-1 + lsr ${SVE_MASK}.s,${SVE_MASK}.s,#6 + + lsr ${SVE_T0}.s,z11.s,#14 // T0 -> z11 >> 14 + lsr z13.s,z12.s,#8 // z13 -> l4 + lsl z11.s,z11.s,#12 // z11 -> upper part of l2 + lsl z12.s,z12.s,#18 // z12 -> upper part of l3 + lsr ${SVE_T1}.s,z10.s,#20 // T1 -> z10 >> 20 + orr z12.d,z12.d,${SVE_T0}.d // z12 -> final l3 + lsl z10.s,z10.s,#6 // z10 -> upper part of l1 + lsr ${SVE_T0}.s,z9.s,#26 // T0 -> z9 >> 26 + and z9.d,z9.d,${SVE_MASK}.d // z9 is now final l0 + orr z11.d,z11.d,${SVE_T1}.d // z11 -> final l2 + orr z10.d,z10.d,${SVE_T0}.d // z10 -> final l1 + dup ${SVE_T1}.s,w3 // x3 -> $padbit but need it as a word + eor ${SVE_T0}.d,${SVE_T0}.d,${SVE_T0}.d // set zero mask + orr z13.d,z13.d,${SVE_T1}.d // l4 += padbit + and z12.d,z12.d,${SVE_MASK}.d // Mask l3 + and z11.d,z11.d,${SVE_MASK}.d // Mask l2 + and z10.d,z10.d,${SVE_MASK}.d // Mask l1 + + + // Move high blocks from INlo -> INhi and sparcify (put in even lanes) + zip2 z14.s,z9.s,${SVE_T0}.s + zip2 z18.s,z13.s,${SVE_T0}.s + zip2 z17.s,z12.s,${SVE_T0}.s + zip2 z16.s,z11.s,${SVE_T0}.s + zip2 z15.s,z10.s,${SVE_T0}.s + + // Sparcify blocks to even lanes in INlo + zip1 z9.s,z9.s,${SVE_T0}.s + zip1 z13.s,z13.s,${SVE_T0}.s + zip1 z12.s,z12.s,${SVE_T0}.s + zip1 z11.s,z11.s,${SVE_T0}.s + zip1 z10.s,z10.s,${SVE_T0}.s + + subs $len,$len,$vl,lsl #5 // By half vector width * 32 + + b.ls .Lskip_loop_sve2 + +.align 4 +.Loop_sve2: + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // ((inp[0]*r^{vl*2} + inp[vl] *r^{vl} + inp[2*vl] )*r^{vl} + inp[3*vl] )*r^{vl} + //+((inp[1]*r^{vl*2} + inp[vl+1]*r^{vl} + inp[2*vl+1])*r^{vl} + inp[3*vl+1])*r^{vl-1} + //+... 
+ // \_______________________________/ \_________________________________________/ + // first main loop iteration long tail + // + // ((inp[0]*r^{vl*2} + inp[vl] *r^{vl} + inp[2*vl] )*r^{vl*2} + inp[3*vl] *r^{vl} + inp[4*vl] )*r^{vl} + //+((inp[1]*r^{vl*2} + inp[vl+1]*r^{vl} + inp[2*vl+1])*r^{vl*2} + inp[3*vl+1]*r^{vl} + inp[4*vl+1])*r^{vl-1} + //+... + // \_______________________________/ \________________________________________/ \___________________/ + // first main loop iteration second main loop iteration short tail + // + // Note that we start with inp[vl:vl*2]*r^{vl}, as it + // doesn't depend on reduction in previous iteration. + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Hash-key power product f-la for the 5 limbs in base2^26 representation: + // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 + // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 + // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 + // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 + // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 + + add $inp,$inp,$vl,lsl #5 + + umullb $SVE_ACC4,$SVE_INhi_0,${SVE_R4}[2] + umullb $SVE_ACC3,$SVE_INhi_0,${SVE_R3}[2] + umullb $SVE_ACC2,$SVE_INhi_0,${SVE_R2}[2] + umullb $SVE_ACC1,$SVE_INhi_0,${SVE_R1}[2] + umullb $SVE_ACC0,$SVE_INhi_0,${SVE_R0}[2] + + umlalb $SVE_ACC4,$SVE_INhi_1,${SVE_R3}[2] + umlalb $SVE_ACC3,$SVE_INhi_1,${SVE_R2}[2] + umlalb $SVE_ACC2,$SVE_INhi_1,${SVE_R1}[2] + umlalb $SVE_ACC1,$SVE_INhi_1,${SVE_R0}[2] + umlalb $SVE_ACC0,$SVE_INhi_1,${SVE_S4}[2] + + umlalb $SVE_ACC4,$SVE_INhi_2,${SVE_R2}[2] + umlalb $SVE_ACC3,$SVE_INhi_2,${SVE_R1}[2] + umlalb $SVE_ACC2,$SVE_INhi_2,${SVE_R0}[2] + umlalb $SVE_ACC1,$SVE_INhi_2,${SVE_S4}[2] + umlalb $SVE_ACC0,$SVE_INhi_2,${SVE_S3}[2] + + umlalb $SVE_ACC4,$SVE_INhi_3,${SVE_R1}[2] + umlalb $SVE_ACC3,$SVE_INhi_3,${SVE_R0}[2] + umlalb $SVE_ACC2,$SVE_INhi_3,${SVE_S4}[2] + umlalb $SVE_ACC1,$SVE_INhi_3,${SVE_S3}[2] + umlalb $SVE_ACC0,$SVE_INhi_3,${SVE_S2}[2] + + add $SVE_INlo_2,$SVE_INlo_2,$SVE_H2 + umlalb $SVE_ACC4,$SVE_INhi_4,${SVE_R0}[2] + umlalb $SVE_ACC3,$SVE_INhi_4,${SVE_S4}[2] + umlalb $SVE_ACC2,$SVE_INhi_4,${SVE_S3}[2] + umlalb $SVE_ACC1,$SVE_INhi_4,${SVE_S2}[2] + umlalb $SVE_ACC0,$SVE_INhi_4,${SVE_S1}[2] + + ////////////////////////////////////////////////////////////////////// + // (hash+inp[0:vl])*r^{vl*2} and accumulate + // Interleave add+mul with loading and converting the next input batch + + add $SVE_INlo_0,$SVE_INlo_0,$SVE_H0 + lsr x15,$len,#4 + umlalb $SVE_ACC3,$SVE_INlo_2,${SVE_R1}[0] + whilelo p1.s,xzr,x15 + umlalb $SVE_ACC0,$SVE_INlo_2,${SVE_S3}[0] + ld4w { z14.s-z17.s }, p1/z, [$inp] + umlalb $SVE_ACC4,$SVE_INlo_2,${SVE_R2}[0] + umlalb $SVE_ACC1,$SVE_INlo_2,${SVE_S4}[0] + umlalb $SVE_ACC2,$SVE_INlo_2,${SVE_R0}[0] + +#ifdef __AARCH64EB__ + revb z14.s, p0/m, z14.s + revb z15.s, p0/m, z15.s + revb z16.s, p0/m, z16.s + revb z17.s, p0/m, z17.s +#endif + + add $SVE_INlo_1,$SVE_INlo_1,$SVE_H1 + dup ${SVE_MASK}.s,#-1 + umlalb $SVE_ACC3,$SVE_INlo_0,${SVE_R3}[0] + lsr ${SVE_MASK}.s,${SVE_MASK}.s,#6 + umlalb $SVE_ACC4,$SVE_INlo_0,${SVE_R4}[0] + lsr ${SVE_T0}.s,z16.s,#14 // T0 -> z16 >> 14 + umlalb $SVE_ACC2,$SVE_INlo_0,${SVE_R2}[0] + lsr z18.s,z17.s,#8 // z18 -> l4 + umlalb $SVE_ACC0,$SVE_INlo_0,${SVE_R0}[0] + lsl z16.s,z16.s,#12 // z16 -> upper part of l2 + umlalb $SVE_ACC1,$SVE_INlo_0,${SVE_R1}[0] + lsl z17.s,z17.s,#18 // z17 -> upper part of l3 + + add $SVE_INlo_3,$SVE_INlo_3,$SVE_H3 + lsr ${SVE_T1}.s,z15.s,#20 // T1 -> z15 >> 20 + umlalb 
$SVE_ACC3,$SVE_INlo_1,${SVE_R2}[0] + orr z17.d,z17.d,${SVE_T0}.d // z17 -> final l3 + umlalb $SVE_ACC4,$SVE_INlo_1,${SVE_R3}[0] + lsl z15.s,z15.s,#6 // z15 -> upper part of l1 + umlalb $SVE_ACC0,$SVE_INlo_1,${SVE_S4}[0] + lsr ${SVE_T0}.s,z14.s,#26 // T0 -> z14 >> 26 + umlalb $SVE_ACC2,$SVE_INlo_1,${SVE_R1}[0] + and z14.d,z14.d,${SVE_MASK}.d // z14 is now final l0 + umlalb $SVE_ACC1,$SVE_INlo_1,${SVE_R0}[0] + orr z16.d,z16.d,${SVE_T1}.d // z16 -> final l2 + + add $SVE_INlo_4,$SVE_INlo_4,$SVE_H4 + orr z15.d,z15.d,${SVE_T0}.d // z15 -> final l1 + umlalb $SVE_ACC3,$SVE_INlo_3,${SVE_R0}[0] + dup ${SVE_T1}.s,w3 + umlalb $SVE_ACC0,$SVE_INlo_3,${SVE_S2}[0] + eor ${SVE_T0}.d,${SVE_T0}.d,${SVE_T0}.d // set zero mask + umlalb $SVE_ACC4,$SVE_INlo_3,${SVE_R1}[0] + orr z18.d,z18.d,${SVE_T1}.d // l4 += padbit + umlalb $SVE_ACC1,$SVE_INlo_3,${SVE_S3}[0] + and z17.d,z17.d,${SVE_MASK}.d // Mask l3 + umlalb $SVE_ACC2,$SVE_INlo_3,${SVE_S4}[0] + and z16.d,z16.d,${SVE_MASK}.d // Mask l2 + + umlalb $SVE_ACC3,$SVE_INlo_4,${SVE_S4}[0] + and z15.d,z15.d,${SVE_MASK}.d // Mask l1 + umlalb $SVE_ACC0,$SVE_INlo_4,${SVE_S1}[0] + zip1 z9.s,z14.s,${SVE_T0}.s + umlalb $SVE_ACC4,$SVE_INlo_4,${SVE_R0}[0] + zip1 z10.s,z15.s,${SVE_T0}.s + umlalb $SVE_ACC1,$SVE_INlo_4,${SVE_S2}[0] + zip1 z11.s,z16.s,${SVE_T0}.s + umlalb $SVE_ACC2,$SVE_INlo_4,${SVE_S3}[0] + zip1 z12.s,z17.s,${SVE_T0}.s + zip1 z13.s,z18.s,${SVE_T0}.s + + // Sparcify blocks to even lanes in INlo + zip2 z14.s,z14.s,${SVE_T0}.s + zip2 z15.s,z15.s,${SVE_T0}.s + zip2 z16.s,z16.s,${SVE_T0}.s + zip2 z17.s,z17.s,${SVE_T0}.s + zip2 z18.s,z18.s,${SVE_T0}.s + + subs $len,$len,$vl,lsl #5 + + // Lazy reduction + bl poly1305_lazy_reduce_sve2 + ldr x30,[sp,#8] + + b.hi .Loop_sve2 + +.Lskip_loop_sve2: + + adds $len,$len,$vl,lsl #4 // By half the usual input size + b.eq .Lshort_tail_sve2 + +.Long_tail_sve2: + //////////////////////////////////////////////////////////////// + // (hash + inp[lo])*r^{vl} + inp[hi])*r^{vl..1} // + // \____________________/ // + // first part of long tail // + //////////////////////////////////////////////////////////////// + //NB `vl` here (and in the code) is the vector length in double words. + // Intereaving algebra with copying INhi -> INlo for the next steps. 
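+	// (the products here use index [2], i.e. r^{vl}, while the
+	// mov's copy INhi -> INlo for the folding/short-tail steps
+	// that follow, where r^2 and r sit in lanes 1 and 3)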
+ + add $SVE_INlo_2,$SVE_INlo_2,$SVE_H2 + add $SVE_INlo_0,$SVE_INlo_0,$SVE_H0 + add $SVE_INlo_1,$SVE_INlo_1,$SVE_H1 + add $SVE_INlo_3,$SVE_INlo_3,$SVE_H3 + add $SVE_INlo_4,$SVE_INlo_4,$SVE_H4 + + umullb $SVE_ACC3,$SVE_INlo_2,${SVE_R1}[2] + umullb $SVE_ACC0,$SVE_INlo_2,${SVE_S3}[2] + umullb $SVE_ACC4,$SVE_INlo_2,${SVE_R2}[2] + umullb $SVE_ACC1,$SVE_INlo_2,${SVE_S4}[2] + umullb $SVE_ACC2,$SVE_INlo_2,${SVE_R0}[2] + + umlalb $SVE_ACC3,$SVE_INlo_0,${SVE_R3}[2] + umlalb $SVE_ACC4,$SVE_INlo_0,${SVE_R4}[2] + umlalb $SVE_ACC2,$SVE_INlo_0,${SVE_R2}[2] + umlalb $SVE_ACC0,$SVE_INlo_0,${SVE_R0}[2] + umlalb $SVE_ACC1,$SVE_INlo_0,${SVE_R1}[2] + mov z11.d,z16.d + + umlalb $SVE_ACC3,$SVE_INlo_1,${SVE_R2}[2] + umlalb $SVE_ACC4,$SVE_INlo_1,${SVE_R3}[2] + umlalb $SVE_ACC0,$SVE_INlo_1,${SVE_S4}[2] + umlalb $SVE_ACC2,$SVE_INlo_1,${SVE_R1}[2] + umlalb $SVE_ACC1,$SVE_INlo_1,${SVE_R0}[2] + mov z9.d,z14.d + + umlalb $SVE_ACC3,$SVE_INlo_3,${SVE_R0}[2] + umlalb $SVE_ACC0,$SVE_INlo_3,${SVE_S2}[2] + umlalb $SVE_ACC4,$SVE_INlo_3,${SVE_R1}[2] + umlalb $SVE_ACC1,$SVE_INlo_3,${SVE_S3}[2] + umlalb $SVE_ACC2,$SVE_INlo_3,${SVE_S4}[2] + mov z10.d,z15.d + + umlalb $SVE_ACC3,$SVE_INlo_4,${SVE_S4}[2] + umlalb $SVE_ACC0,$SVE_INlo_4,${SVE_S1}[2] + umlalb $SVE_ACC4,$SVE_INlo_4,${SVE_R0}[2] + umlalb $SVE_ACC1,$SVE_INlo_4,${SVE_S2}[2] + umlalb $SVE_ACC2,$SVE_INlo_4,${SVE_S3}[2] + mov z12.d,z17.d + + // Lazy reduction + bl poly1305_lazy_reduce_sve2 + ldr x30,[sp,#8] + + mov z13.d,z18.d + +.Lshort_tail_sve2: + + cmp $vl, #2 + b.ls .Last_reduce_sve2 + + mov x15,#1 + whilelo p1.s,xzr,x15 + +.Loop_reduce_sve2: + //////////////////////////////////////////////////////////////// + // (hash + inp[hi])*r^{vl/2..2} // + // \____________________/ // + // iterative reduction part of the short tail // + //////////////////////////////////////////////////////////////// + // Last column of products is calculated by iteratively "folding" vectors: + // 1. If vl==2 - skip to Last_reduce_sve2 + // 2. calculate product with r^{vl/2} -> ACC{0-4} + // 3. lazy reduction -> H{0-4} + // 4. upper half of vectors (INlo{0-4}) is copied to lower halves + // 5. If vl/2==2 - go to Last_reduce_sve2 + // 6. continue with 2. + // NB: this part is skipped for 128-bit case (vl==2) + // For 256-bit, no intermediate loading is necessary - r^2 is already in [1]. + // So a special case can be easily implemented, when corresponding hardware is available. + + // Load the intermediate r-power into the 0th lanes of vectors + // Interleave with broadcasting and S-vector calculation. + ldr w10,[$pwr] + ldr w11,[$pwr,#28] + ldr w12,[$pwr,#56] + cpy $SVE_R0,p1/m,w10 + ldr w13,[$pwr,#84] + cpy $SVE_R1,p1/m,w11 + dup z0.q,z0.q[0] + ldr w14,[$pwr,#112] + cpy $SVE_R2,p1/m,w12 + dup z1.q,z1.q[0] + cpy $SVE_R3,p1/m,w13 + dup z3.q,z3.q[0] + cpy $SVE_R4,p1/m,w14 + add $pwr,$pwr,#4 // Increment pointer for the next iteration + dup z5.q,z5.q[0] + dup z7.q,z7.q[0] + + // Interleaved hash contraction and S-vector calc. 
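+	// (e.g. for vl == 8, i.e. 512-bit vectors, this loop runs twice:
+	// first multiplying by r^4 and folding the upper half of the
+	// hash lanes onto the lower half, then by r^2, leaving two
+	// lanes for .Last_reduce_sve2; for vl == 4 it runs just once)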
+ add $SVE_INlo_2,$SVE_INlo_2,$SVE_H2 + adr $SVE_S1,[$SVE_R1,$SVE_R1,lsl #2] + add $SVE_INlo_0,$SVE_INlo_0,$SVE_H0 + adr $SVE_S2,[$SVE_R2,$SVE_R2,lsl #2] + add $SVE_INlo_1,$SVE_INlo_1,$SVE_H1 + adr $SVE_S3,[$SVE_R3,$SVE_R3,lsl #2] + add $SVE_INlo_3,$SVE_INlo_3,$SVE_H3 + adr $SVE_S4,[$SVE_R4,$SVE_R4,lsl #2] + add $SVE_INlo_4,$SVE_INlo_4,$SVE_H4 + + umullb $SVE_ACC3,$SVE_INlo_0,${SVE_R3}[0] + umullb $SVE_ACC4,$SVE_INlo_0,${SVE_R4}[0] + umullb $SVE_ACC2,$SVE_INlo_0,${SVE_R2}[0] + umullb $SVE_ACC0,$SVE_INlo_0,${SVE_R0}[0] + umullb $SVE_ACC1,$SVE_INlo_0,${SVE_R1}[0] + + umlalb $SVE_ACC3,$SVE_INlo_1,${SVE_R2}[0] + umlalb $SVE_ACC4,$SVE_INlo_1,${SVE_R3}[0] + umlalb $SVE_ACC0,$SVE_INlo_1,${SVE_S4}[0] + umlalb $SVE_ACC2,$SVE_INlo_1,${SVE_R1}[0] + umlalb $SVE_ACC1,$SVE_INlo_1,${SVE_R0}[0] + + umlalb $SVE_ACC3,$SVE_INlo_2,${SVE_R1}[0] + umlalb $SVE_ACC0,$SVE_INlo_2,${SVE_S3}[0] + umlalb $SVE_ACC4,$SVE_INlo_2,${SVE_R2}[0] + umlalb $SVE_ACC1,$SVE_INlo_2,${SVE_S4}[0] + umlalb $SVE_ACC2,$SVE_INlo_2,${SVE_R0}[0] + + umlalb $SVE_ACC3,$SVE_INlo_3,${SVE_R0}[0] + umlalb $SVE_ACC0,$SVE_INlo_3,${SVE_S2}[0] + umlalb $SVE_ACC4,$SVE_INlo_3,${SVE_R1}[0] + umlalb $SVE_ACC1,$SVE_INlo_3,${SVE_S3}[0] + umlalb $SVE_ACC2,$SVE_INlo_3,${SVE_S4}[0] + + umlalb $SVE_ACC3,$SVE_INlo_4,${SVE_S4}[0] + umlalb $SVE_ACC0,$SVE_INlo_4,${SVE_S1}[0] + umlalb $SVE_ACC4,$SVE_INlo_4,${SVE_R0}[0] + umlalb $SVE_ACC1,$SVE_INlo_4,${SVE_S2}[0] + umlalb $SVE_ACC2,$SVE_INlo_4,${SVE_S3}[0] + + // Lazy reduction + bl poly1305_lazy_reduce_sve2 + ldr x30,[sp,#8] + + // Move higher part of vectors to lower part, depending on current vl + // NB look-up is done in terms of single-word lanes, hence indices + // start from vl (refer to as w16) and not vl/2 + // Higher part now contains "junk" + index ${SVE_T0}.s,w16,#1 + tbl ${SVE_INlo_0},${SVE_INlo_0},${SVE_T0}.s + tbl ${SVE_INlo_1},${SVE_INlo_1},${SVE_T0}.s + tbl ${SVE_INlo_2},${SVE_INlo_2},${SVE_T0}.s + tbl ${SVE_INlo_3},${SVE_INlo_3},${SVE_T0}.s + tbl ${SVE_INlo_4},${SVE_INlo_4},${SVE_T0}.s + lsr $vl,$vl,#1 // vl /= 2 + cmp $vl,#2 + b.hi .Loop_reduce_sve2 + +.Last_reduce_sve2: + //////////////////////////////////////////////////////////////// + // (hash + inp[n-1])*r^2 // + //+(hash + inp[n] )*r // + // \_____________/ // + // Final part of the short tail // + //////////////////////////////////////////////////////////////// + + //Last hash addition - now everything stored in SVE_Hx + add $SVE_H2,$SVE_H2,$SVE_INlo_2 + add $SVE_H0,$SVE_H0,$SVE_INlo_0 + add $SVE_H1,$SVE_H1,$SVE_INlo_1 + add $SVE_H3,$SVE_H3,$SVE_INlo_3 + add $SVE_H4,$SVE_H4,$SVE_INlo_4 + + // Shift even lanes to odd lanes and set even to zero + // because r^2 and r^1 are in lanes 1 and 3 of R-vectors + trn1 $SVE_H2,${SVE_MASK}.s,$SVE_H2 + trn1 $SVE_H0,${SVE_MASK}.s,$SVE_H0 + trn1 $SVE_H1,${SVE_MASK}.s,$SVE_H1 + trn1 $SVE_H3,${SVE_MASK}.s,$SVE_H3 + trn1 $SVE_H4,${SVE_MASK}.s,$SVE_H4 + + umullt $SVE_ACC3,$SVE_H2,${SVE_R1} + umullt $SVE_ACC0,$SVE_H2,${SVE_S3} + umullt $SVE_ACC4,$SVE_H2,${SVE_R2} + umullt $SVE_ACC1,$SVE_H2,${SVE_S4} + umullt $SVE_ACC2,$SVE_H2,${SVE_R0} + + umlalt $SVE_ACC3,$SVE_H0,${SVE_R3} + umlalt $SVE_ACC4,$SVE_H0,${SVE_R4} + umlalt $SVE_ACC2,$SVE_H0,${SVE_R2} + umlalt $SVE_ACC0,$SVE_H0,${SVE_R0} + umlalt $SVE_ACC1,$SVE_H0,${SVE_R1} + + umlalt $SVE_ACC3,$SVE_H1,${SVE_R2} + umlalt $SVE_ACC4,$SVE_H1,${SVE_R3} + umlalt $SVE_ACC0,$SVE_H1,${SVE_S4} + umlalt $SVE_ACC2,$SVE_H1,${SVE_R1} + umlalt $SVE_ACC1,$SVE_H1,${SVE_R0} + + umlalt $SVE_ACC3,$SVE_H3,${SVE_R0} + umlalt $SVE_ACC0,$SVE_H3,${SVE_S2} + umlalt 
$SVE_ACC4,$SVE_H3,${SVE_R1} + umlalt $SVE_ACC1,$SVE_H3,${SVE_S3} + umlalt $SVE_ACC2,$SVE_H3,${SVE_S4} + + umlalt $SVE_ACC3,$SVE_H4,${SVE_S4} + umlalt $SVE_ACC0,$SVE_H4,${SVE_S1} + umlalt $SVE_ACC4,$SVE_H4,${SVE_R0} + umlalt $SVE_ACC1,$SVE_H4,${SVE_S2} + umlalt $SVE_ACC2,$SVE_H4,${SVE_S3} + + // Generate predicate for the last two double words + mov x15,#2 + whilelo p2.d,xzr,x15 + + dup ${SVE_MASK}.d,#-1 + lsr ${SVE_MASK}.d,${SVE_MASK}.d,#38 + + //////////////////////////////////////////////////////////////// + // horizontal add + + //In Neon implementation, one effectively using lower 64 bits of vector registers here. + //Here and below I use hard-coded FP registers. + + uaddv d22,p2,$SVE_ACC3 + ldp d8,d9,[sp,#80] // meet ABI requirements + uaddv d19,p2,$SVE_ACC0 + ldp d10,d11,[sp,#96] + uaddv d23,p2,$SVE_ACC4 + ldp d12,d13,[sp,#112] + uaddv d20,p2,$SVE_ACC1 + ldp d14,d15,[sp,#128] + uaddv d21,p2,$SVE_ACC2 + + //////////////////////////////////////////////////////////////// + // Lazy reduction, but without narrowing + + // Since results were accumulated in the lower 64 bits, + // one can refer to them as FP/aSIMD reg-s. + + ushr d29,d22,#26 + and v22.8b,v22.8b,v31.8b + ushr d30,d19,#26 + and v19.8b,v19.8b,v31.8b + + add d23,d23,d29 // h3 -> h4 + add d20,d20,d30 // h0 -> h1 + + ushr d29,d23,#26 + and v23.8b,v23.8b,v31.8b + ushr d30,d20,#26 + and v20.8b,v20.8b,v31.8b + add d21,d21,d30 // h1 -> h2 + + add d19,d19,d29 + shl d29,d29,#2 + ushr d30,d21,#26 + and v21.8b,v21.8b,v31.8b + add d19,d19,d29 // h4 -> h0 + add d22,d22,d30 // h2 -> h3 + + ushr d29,d19,#26 + and v19.8b,v19.8b,v31.8b + ushr d30,d22,#26 + and v22.8b,v22.8b,v31.8b + add d20,d20,d29 // h0 -> h1 + add d23,d23,d30 // h3 -> h4 + + //////////////////////////////////////////////////////////////// + // write the result, can be partially reduced + + stp s19,s20,[$ctx],#8 + stp s21,s22,[$ctx],#8 + str s23,[$ctx] + +.Lno_data_sve2: + // Restore the callee-saved GPRs + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldp x25,x26,[sp,#64] + ldr x29,[sp],#144 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.Lshort_blocks: + b poly1305_blocks + +.size poly1305_blocks_sve2,.-poly1305_blocks_sve2 +___ + +############################################################################## +# +# SVE instruction encoder, adapted from chacha20-sve.pl +# +############################################################################## + +my $debug_encoder = 0; + +{ +my %opcode_unpred = ( + "eor" => 0x04a03000, + "add" => 0x04200000, + "orr" => 0x04603000, + "mov" => 0x04603000, # Alias for ORR + "and" => 0x04203000, + "lsl" => 0x04209C00, + "lsr" => 0x04209400, + "zip1" => 0x05206000, + "zip2" => 0x05206400, + "trn1" => 0x05207000, + "dup_gpr" => 0x05203800, + "dup_elem" => 0x05302000, + "cntd" => 0x04e0e000, + "tbl" => 0x05203000, + "adr" => 0x04a0a000, + "umullb" => 0x44e0d000, + "umullt" => 0x45c07c00, + "umlalb" => 0x44e09000, + "umlalt" => 0x44c04c00, + "shrnb" => 0x45201000); + +my %opcode_imm_unpred = ( + "dup" => 0x2538C000, + "index" => 0x04204400); + +my %opcode_scalar_pred = ( + "cpy" => 0x0528A000); + +my %opcode_pred = ( + "whilelo" => 0x25200C00, + "ptrue" => 0x2518E000, + "ld4w" => 0xA560E000, + "ld1w" => 0xA540A000, + "revb" => 0x05248000, + "uaddv" => 0x04012000); + +my %tsize = ( + 'b' => 0, + 'h' => 1, + 's' => 2, + 'd' => 3, + 'q' => 3); # To handle dup zx.q,zx.q[i] case + +my %sf = ( + "w" => 0, + "x" => 1); + +my %pattern = ("ALL" => 31); + +sub create_verifier { + my $filename="./compile_sve.sh"; + +$scripts = <<'___'; 
+#! /bin/bash +set -e +CROSS_COMPILE=${CROSS_COMPILE:-'aarch64-linux-gnu-'} + +[ -z "$1" ] && exit 1 +INST_TO_COMPILE="$1" +FILENAME_BASE=${1%% *} +TMPFILE="/tmp/${FILENAME_BASE}_test" +OBJDUMP_LOG="/tmp/${FILENAME_BASE}_objdump.log" + +echo "--- DEBUG INFO ---" >&2 +echo "Received \$1 (Instruction): '$1'" >&2 +echo "Using Filename Base: '$FILENAME_BASE'" >&2 +echo "------------------" >&2 + +ARCH=`uname -p | xargs echo -n` + +if [ $ARCH == 'aarch64' ]; then + CC=gcc-11 + AS=as + OBJDUMP=objdump +else + CC=${CROSS_COMPILE}gcc + AS=${CROSS_COMPILE}as + OBJDUMP=${CROSS_COMPILE}objdump +fi + +cat > "${TMPFILE}.c" << EOF +extern __attribute__((noinline, section("disasm_output"))) void dummy_func() +{ + asm("$INST_TO_COMPILE"); +} +int main(int argc, char *argv[]) +{ +} +EOF + +$CC -march=armv8.2-a+sve+sve2 -S -o "${TMPFILE}.s" "${TMPFILE}.c" + +$AS -march=armv8-a+sve2 -o "${TMPFILE}.o" "${TMPFILE}.s" + +#$OBJDUMP -d "${TMPFILE}.o" > "$OBJDUMP_LOG" + +#cat "$OBJDUMP_LOG" | awk -F"\n" -v RS="\n\n" '$1 ~ /dummy_func/' | awk 'FNR == 2 {printf "%s",$2}' +$OBJDUMP -d "${TMPFILE}.o" | awk -F"\n" -v RS="\n\n" '$1 ~ /dummy_func/' | awk 'FNR == 2 {printf "%s",$2}' + +rm "${TMPFILE}.c" "${TMPFILE}.s" "${TMPFILE}.o" +___ + open(FH, '>', $filename) or die $!; + print FH $scripts; + close(FH); + system("chmod a+x ./compile_sve.sh"); +} + +sub compile_sve { + my $inst = shift; + return `./compile_sve.sh "$inst"`; +} + +sub verify_inst { + my ($code,$inst)=@_; + my $hexcode = (sprintf "%08x", $code); + + if ($debug_encoder == 1) { + my $expect=&compile_sve($inst); + if ($expect ne $hexcode) { + return (sprintf "%s // Encode Error! expect [%s] actual [%s]", $inst, $expect, $hexcode); + } + } + return (sprintf ".inst\t0x%s\t//%s", $hexcode, $inst); +} + +sub reg_code { + my $code = shift; + + if ($code == "zr") { + return "31"; + } + return $code; +} + +sub encode_size_imm() { + my ($mnemonic, $isize, $const)=@_; + my $esize = (8<<$tsize{$isize}); + my $tsize_imm; + if ($mnemonic eq "shrnb") { + # Formula for narrowing shifts + $tsize_imm = $esize - $const; + } elsif ($mnemonic eq "lsr") { + # Formula for logical right shifts + $tsize_imm = 2*$esize - $const; + } else { + # Default formula for logical left shifts (lsl) + $tsize_imm = $esize + $const; + } + return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<16); +} + +sub sve_unpred { + my ($mnemonic,$arg)=@_; + my $inst = (sprintf "%s %s", $mnemonic,$arg); + # Special case: Widening multiplies (indexed and vector) + if (($mnemonic =~ /^(umull[bt]|umlal[bt])/) && $arg =~ m/z([0-9]+)\.d,\s*z([0-9]+)\.s,\s*z([0-9]+)\.s(\[([0-9]+)\])?/o) { + my ($zd, $zn, $zm, $indexed, $imm) = ($1, $2, $3, $4, $5); + my $opcode = $opcode_unpred{$mnemonic}; + if ($indexed) { + # Split the 2-bit immediate index into its parts. + my $i2h = ($imm >> 1) & 0x1; # High bit of index + my $i2l = $imm & 0x1; # Low bit of index + # Get the low 4 bits of the Zm register. + my $zm_low = $zm & 0xF; + return &verify_inst($opcode|($i2h << 20)|($zm_low << 16)|($i2l << 11)|($zn << 5)|$zd,$inst); + } else { + return &verify_inst($opcode|$zd|($zn<<5)|($zm<<16), $inst); + } + # Special case: 3-register vector ADR with lsl #2 + } elsif ($mnemonic eq "adr" && $arg =~ m/z([0-9]+)\.s,\s*\[z([0-9]+)\.s,\s*z([0-9]+)\.s,\s*lsl\s*#2\]/o) { + my ($zd, $zn, $zm) = ($1, $2, $3); + my $opcode = $opcode_unpred{"adr"}; + # Per the manual, the 'sz' bit (22) must be 0 for .s size. + # It is already 0 in our base, so we do nothing. + # The 'msz' field (bits 11-10) must be '10'. We achieve this by setting bit 11. 
+ $opcode |= (1<<11); + return &verify_inst($opcode|$zd|($zn<<5)|($zm<<16), $inst); + # Special case: 'cntd xd' alias + } elsif ($mnemonic eq "cntd" && $arg =~ m/x([0-9]+)/o) { + my ($xd) = ($1); + my $opcode = $opcode_unpred{$mnemonic}; + my $pattern_all = $pattern{"ALL"} << 5; + return &verify_inst($opcode|$xd|$pattern_all, $inst); + # Special parser for SHRNB's unique syntax (Zd.s, Zn.d, #imm) + } elsif ($mnemonic eq "shrnb" && $arg =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.([bhsd]),\s*#([0-9]+)/o) { + my ($zd, $zn, $size_src, $imm) = ($1, $2, $3, $4); + my $opcode = $opcode_unpred{$mnemonic}; + return &verify_inst($opcode|&encode_size_imm($mnemonic,$size_src,$imm)|($zn << 5)|$zd, $inst); + } elsif ($mnemonic eq "dup" && $arg =~ m/z([0-9]+)\.q,\s*z([0-9]+)\.q\[0\]/o) { # DUP from element + my ($zd, $zn) = ($1, $2); + my $opcode = $opcode_unpred{"dup_elem"}; + return &verify_inst($opcode | ($zn << 5) | $zd, $inst); + } elsif ($mnemonic eq "dup" && $arg =~ m/z([0-9]+)\.([bhsdq]),\s*w([0-9]+)/o) { # DUP from GPR (wX/xX) + my ($zd, $size, $rn) = ($1, $2, $3); + my $opcode = $opcode_unpred{"dup_gpr"}; + $opcode |= ($tsize{$size}<<22); + return &verify_inst($opcode|$zd|($rn<<5), $inst); + # Generic argument patterns + } elsif ($arg =~ m/z([0-9]+)\.([bhsdq]),\s*(.*)/o) { + my ($zd, $size, $regs) = ($1, $2, $3); + my $opcode = $opcode_unpred{$mnemonic}; + # Handle shift-by-immediate separately due to its unique encoding. + if ($mnemonic eq "lsl" || $mnemonic eq "lsr") { + if ($regs =~ m/z([0-9]+)\.[bhsd],\s*#([0-9]+)/o) { + my ($zn, $imm) = ($1, $2); + return &verify_inst($opcode|$zd|($zn<<5)|&encode_size_imm($mnemonic,$size,$imm), $inst); + } + } + if ($mnemonic !~ /^(and|orr|eor|mov)$/) { + $opcode |= ($tsize{$size}<<22); + } + if ($regs =~ m/z([0-9]+)\.[bhsdq],\s*z([0-9]+)\.[bhsdq]/o) { # 3-operand vector + my ($zn, $zm) = ($1, $2); + return &verify_inst($opcode|$zd|($zn<<5)|($zm<<16), $inst); + } elsif ($regs =~ m/z([0-9]+)\.[bhsdq]/o) { # 2-operand vector (mov) + my $zn = $1; + my $zm = ($mnemonic eq "mov") ? $zn : 0; + return &verify_inst($opcode|$zd|($zn<<5)|($zm<<16), $inst); + } elsif ($regs =~ m/w([0-9]+),\s*#1/o) { # index + my ($rn, $rm) = ($1, 1); + $opcode = $opcode_imm_unpred{"index"}; + $opcode |= ($tsize{$size}<<22); + return &verify_inst($opcode|$zd|($rn<<5)|($rm<<16), $inst); + } elsif ($regs =~ m/#(-?[0-9]+)/o) { # dup from immediate + my $imm = $1; + $opcode = $opcode_imm_unpred{"dup"}; + $opcode |= ($tsize{$size}<<22); + my $imm_val = $imm & 0xff; # Only accounting for a simple case with zero shift. 
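+            # (sufficient here: the only immediate dup forms emitted by this
+            # file are `dup z.s,#-1` and `dup z.d,#-1`, i.e. imm8 = 0xff)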
+ return &verify_inst($opcode|$zd|($imm_val<<5), $inst); + } + } + sprintf "%s // fail to parse: %s", $mnemonic, $arg; +} + +sub sve_pred { + my ($mnemonic, $arg)=@_; + my $inst = (sprintf "%s %s", $mnemonic,$arg); + # Special case: Multi-register loads (ld4w) + if ($arg =~ m/\{\s*z([0-9]+)\.s-z([0-9]+)\.s\s*\},\s*p([0-9]+)\/z,\s*\[(x[0-9]+)\]/o) { + my ($zt, $pg, $xn) = ($1, $3, $4); + $xn =~ s/x//; + my $opcode = $opcode_pred{$mnemonic}; + return &verify_inst($opcode|$zt|($pg<<10)|($xn<<5), $inst); + # Special case: Single-register loads (ld1w) + } elsif ($arg =~ m/\{\s*z([0-9]+)\.s\s*\},\s*p([0-9]+)\/z,\s*\[(x[0-9]+)\]/o) { + my ($zt, $pg, $xn) = ($1, $2, $3); + $xn =~ s/x//; + my $opcode = $opcode_pred{$mnemonic}; + return &verify_inst($opcode|$zt|($pg<<10)|($xn<<5), $inst); + # Special case: uaddv (scalar destination) + } elsif ($mnemonic eq "uaddv" && $arg =~ m/d([0-9]+),\s*p([0-9]+),\s*z([0-9]+)\.([bhsd])/o) { + my ($vd, $pg, $zn, $size) = ($1, $2, $3, $4); + my $opcode = $opcode_pred{$mnemonic}; + return &verify_inst($opcode|($tsize{$size}<<22)|$vd|($pg<<10)|($zn<<5), $inst); + # Generic pattern: Starts with a predicate register (whilelo, ptrue) + } elsif ($arg =~ m/p([0-9]+)\.([bhsd]),\s*(.*)/o) { + my ($pd, $size, $regs) = ($1, $2, $3); + my $opcode = $opcode_pred{$mnemonic}; + if ($regs =~ m/([wx])(zr|[0-9]+),\s*[wx](zr|[0-9]+)/o) { # whilelo + my ($sf_char, $rn, $rm) = ($1, $2, $3); + return &verify_inst($opcode|($tsize{$size}<<22)|$pd|($sf{$sf_char}<<12)|(®_code($rn)<<5)|(®_code($rm)<<16), $inst); + } elsif ($regs =~ m/(\w+)/o) { # ptrue + my $pat = $1; + return &verify_inst($opcode|($tsize{$size}<<22)|$pd|($pattern{$pat}<<5), $inst); + } + # Generic pattern: Starts with a vector register (cpy, revb) + } elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*p([0-9]+)\/m,\s*(.*)/o) { + my ($zd, $size, $pg, $regs) = ($1, $2, $3, $4); + if ($regs =~ m/w([0-9]+)/o) { # CPY from GPR + my $wn = $1; + my $opcode = $opcode_scalar_pred{"cpy"}; + return &verify_inst($opcode|($tsize{$size}<<22)|$zd|($pg<<10)|($wn<<5), $inst); + } elsif ($regs =~ m/z([0-9]+)\.([bhsd])/o) { # 2-operand predicated (revb) + my ($zn) = ($1); + my $opcode = $opcode_pred{$mnemonic}; + return &verify_inst($opcode|($tsize{$size}<<22)|$zd|($pg<<10)|($zn<<5), $inst); + } + } + sprintf "%s // fail to parse: %s", $mnemonic, $arg; +} + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +if ($debug_encoder == 1) { + &create_verifier(); +} + +foreach my $line (split("\n",$code)) { + my $original_line = $line; + my $encoded_line = ""; + # Perform variable substitution + $line =~ s/\`([^\`]*)\`/eval($1)/ge; + # Predicated instructions + if ($line =~ /^\s*(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/) { + $encoded_line = sve_pred($1, $2); + } + elsif ($line =~ /^\s*(\w+)\s+(d[0-9]+,\s*p[0-9].*)/) { + $encoded_line = sve_pred($1, $2); + } + elsif ($line =~ /^\s*(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/) { + $encoded_line = sve_pred($1, $2); + } + elsif ($line =~ /^\s*(\w+)\s+(p[0-9]+\.[bhsd].*)/) { + $encoded_line = sve_pred($1, $2); + } + # Specific unpredicated instructions + elsif ($line =~ /^\s*(dup)\s+(z[0-9]+\.q,\s*z[0-9]+\.q\[0\])/) { + $encoded_line = sve_unpred($1, $2); + } + elsif ($line =~ /^\s*(dup)\s+(z[0-9]+\.[bhsdq],\s*(?:w|x)[0-9]+)/) { + $encoded_line = sve_unpred($1, $2); + } + elsif ($line =~ /^\s*(mov)\s+(z[0-9]+\.d,\s*z[0-9]+\.d)/) { + $encoded_line = sve_unpred("mov", $2); + } + elsif ($line =~ 
/^\s*(umull[bt]|umlal[bt])\s+(z[0-9]+\.d,\s*z[0-9]+\.s,\s*z[0-9]+\.s(?:\[[0-9]+\])?)/) { + $encoded_line = sve_unpred($1, $2); + } + elsif ($line =~ /^\s*(cntd)\s+((x|w)[0-9]+.*)/) { + $encoded_line = sve_unpred($1, $2); + } + # 3. Generic Unpredicated "catch-all" + elsif ($line =~ /^\s*(\w+)\s+(z[0-9]+\.[bhsdq].*)/) { + $encoded_line = sve_unpred($1, $2); + } + if ($encoded_line) { + print $encoded_line, "\n"; + } else { + print $original_line, "\n"; + } +} + +} + STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/poly1305/build.info b/crypto/poly1305/build.info index e359a2225d..5c35c8ecee 100644 --- a/crypto/poly1305/build.info +++ b/crypto/poly1305/build.info @@ -14,7 +14,7 @@ IF[{- !$disabled{asm} -}] $POLY1305ASM_s390x=poly1305-s390x.S $POLY1305ASM_armv4=poly1305-armv4.S - $POLY1305ASM_aarch64=poly1305-armv8.S + $POLY1305ASM_aarch64=poly1305-armv8.S poly1305-armv9-sve2.S $POLY1305ASM_ppc32=poly1305_ppc.c poly1305-ppc.s poly1305-ppcfp.s $POLY1305ASM_ppc64=$POLY1305ASM_ppc32 @@ -45,7 +45,9 @@ GENERATE[poly1305-ppcfp.s]=asm/poly1305-ppcfp.pl GENERATE[poly1305-armv4.S]=asm/poly1305-armv4.pl INCLUDE[poly1305-armv4.o]=.. GENERATE[poly1305-armv8.S]=asm/poly1305-armv8.pl +GENERATE[poly1305-armv9-sve2.S]=asm/poly1305-armv9-sve2.pl INCLUDE[poly1305-armv8.o]=.. +INCLUDE[poly1305-armv9-sve2.o]=.. GENERATE[poly1305-mips.S]=asm/poly1305-mips.pl INCLUDE[poly1305-mips.o]=.. GENERATE[poly1305-c64xplus.S]=asm/poly1305-c64xplus.pl
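A minimal standalone sketch of how the capability gate added in crypto/armcap.c behaves. The helper name `pick_poly1305_caps`, the `main` driver and the `sve_vl_bytes` parameter are illustrative only and not part of the patch; `sve_vl_bytes` stands in for the value returned by the new `_armv8_sve_get_vl_bytes` probe (CNTB).

```c
#include <stdint.h>
#include <stdio.h>

#define ARMV9_SVE2          (1u << 14)
#define ARMV9_SVE2_POLY1305 (1u << 17)

/* Mirror of the gating logic: enable the SVE2 Poly1305 path only when SVE2
 * is present and the SVE vector length is a power of two larger than 128 bits. */
static unsigned int pick_poly1305_caps(unsigned int caps, uint64_t sve_vl_bytes)
{
    if ((caps & ARMV9_SVE2) != 0
            && sve_vl_bytes > 16
            && (sve_vl_bytes & (sve_vl_bytes - 1)) == 0)
        caps |= ARMV9_SVE2_POLY1305;
    return caps;
}

int main(void)
{
    /* 128-bit vectors: SVE2 present, but the flag stays clear */
    printf("%#x\n", pick_poly1305_caps(ARMV9_SVE2, 16));   /* 0x4000  */
    /* 256-bit vectors: the SVE2 Poly1305 path is enabled */
    printf("%#x\n", pick_poly1305_caps(ARMV9_SVE2, 32));   /* 0x24000 */
    return 0;
}
```

With ARMV9_SVE2_POLY1305 set, the csel added to poly1305_init in poly1305-armv8.pl selects poly1305_blocks_sve2 as the blocks function; when the bit stays clear (as on 128-bit implementations), the previously selected implementation is kept.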