crypto/poly1305: Add SVE2 vector-length agnostic implementation.

Implement Poly1305 using SVE2 VLA instructions for AArch64.

This implementation is selected at runtime if SVE2 is present and the vector length is 256, 512, 1024 or 2048 bits.

Reviewed-by: Tom Cosgrove <tom.cosgrove@arm.com>
Reviewed-by: Paul Dale <paul.dale@oracle.com>
(Merged from https://github.com/openssl/openssl/pull/28454)
This commit is contained in:
Iakov Polyak
2025-09-05 11:19:33 +01:00
committed by Pauli
parent 679a10110e
commit 2657697b6d
7 changed files with 1463 additions and 5 deletions

View File

@@ -120,6 +120,14 @@ _armv8_sve2_probe:
ret
.size _armv8_sve2_probe,.-_armv8_sve2_probe
// uint64_t _armv8_sve_get_vl_bytes(void)
//
// Returns the SVE vector length, in bytes, in x0.
// Takes no arguments and modifies no register other than x0.
// The single instruction is emitted as a raw encoding through .inst
// (presumably so the file still assembles with toolchains that do not
// know SVE mnemonics - TODO confirm).
// NOTE(review): the only visible caller checks the SVE2 capability bit
// before calling; executing CNTB on a CPU without SVE would presumably
// fault - confirm callers always guard the call.
.globl _armv8_sve_get_vl_bytes
.type _armv8_sve_get_vl_bytes,%function
_armv8_sve_get_vl_bytes:
AARCH64_VALID_CALL_TARGET
.inst 0x0420e3e0 // cntb x0
ret
.size _armv8_sve_get_vl_bytes,.-_armv8_sve_get_vl_bytes
.globl _armv8_cpuid_probe
.type _armv8_cpuid_probe,%function
_armv8_cpuid_probe:

View File

@@ -86,9 +86,10 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
# define ARMV8_SHA3 (1<<11)
# define ARMV8_UNROLL8_EOR3 (1<<12)
# define ARMV8_SVE (1<<13)
# define ARMV8_SVE2 (1<<14)
# define ARMV9_SVE2 (1<<14)
# define ARMV8_HAVE_SHA3_AND_WORTH_USING (1<<15)
# define ARMV8_UNROLL12_EOR3 (1<<16)
/* SVE2 is present and the vector length is a power of 2 above 128 bits */
# define ARMV9_SVE2_POLY1305 (1<<17)
/*
* MIDR_EL1 system register

View File

@@ -24,11 +24,18 @@
#include <unistd.h>
#endif
#include "arm_arch.h"
#ifdef __aarch64__
#include <stdint.h>
#endif
unsigned int OPENSSL_armcap_P = 0;
unsigned int OPENSSL_arm_midr = 0;
unsigned int OPENSSL_armv8_rsa_neonized = 0;
#ifdef __aarch64__
uint64_t _armv8_sve_get_vl_bytes(void);
#endif
#ifdef _WIN32
void OPENSSL_cpuid_setup(void)
{
@@ -346,7 +353,7 @@ void OPENSSL_cpuid_setup(void)
OPENSSL_armcap_P |= ARMV8_SVE;
if (getauxval(OSSL_HWCAP2) & OSSL_HWCAP2_SVE2)
OPENSSL_armcap_P |= ARMV8_SVE2;
OPENSSL_armcap_P |= ARMV9_SVE2;
if (getauxval(OSSL_HWCAP2) & OSSL_HWCAP2_RNG)
OPENSSL_armcap_P |= ARMV8_RNG;
@@ -391,7 +398,7 @@ void OPENSSL_cpuid_setup(void)
}
# ifdef __aarch64__
OPENSSL_armcap_P |= arm_probe_for(_armv8_sve_probe, ARMV8_SVE);
OPENSSL_armcap_P |= arm_probe_for(_armv8_sve2_probe, ARMV8_SVE2);
OPENSSL_armcap_P |= arm_probe_for(_armv8_sve2_probe, ARMV9_SVE2);
OPENSSL_armcap_P |= arm_probe_for(_armv8_rng_probe, ARMV8_RNG);
# endif
@@ -450,6 +457,17 @@ void OPENSSL_cpuid_setup(void)
MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_QCOMM, QCOM_CPU_PART_ORYON_X1)) &&
(OPENSSL_armcap_P & ARMV8_SHA3))
OPENSSL_armcap_P |= ARMV8_HAVE_SHA3_AND_WORTH_USING;
if (OPENSSL_armcap_P & ARMV9_SVE2) {
uint64_t vl_bytes = _armv8_sve_get_vl_bytes();
if (vl_bytes > 16 && (vl_bytes & (vl_bytes - 1)) == 0) {
/*
* This implementation is only faster if the vector length is > 128 bits,
* and the vector length must also be a power of 2 (e.g. 256, 512 bits)
*/
OPENSSL_armcap_P |= ARMV9_SVE2_POLY1305;
}
}
# endif
}
#endif /* _WIN32, __ARM_MAX_ARCH__ >= 7 */

View File

@@ -756,7 +756,7 @@ ChaCha20_ctr32_sve:
mov $sve2flag,0
adrp $tmp,OPENSSL_armcap_P
ldr $tmpw,[$tmp,#:lo12:OPENSSL_armcap_P]
tst $tmpw,#ARMV8_SVE2
tst $tmpw,#ARMV9_SVE2
b.eq 1f
mov $sve2flag,1
b 2f

View File

@@ -69,6 +69,8 @@ $code.=<<___;
.globl poly1305_emit
.hidden poly1305_emit
.extern poly1305_blocks_sve2
.type poly1305_init,%function
.align 5
poly1305_init:
@@ -109,6 +111,13 @@ poly1305_init:
csel $d0,$d0,$r0,eq
csel $d1,$d1,$r1,eq
tst w17, #ARMV9_SVE2_POLY1305
adrp $r0,poly1305_blocks_sve2
add $r0,$r0,#:lo12:poly1305_blocks_sve2
csel $d0,$d0,$r0,eq
#ifdef __ILP32__
stp w12,w13,[$len]
#else

File diff suppressed because it is too large Load Diff

View File

@@ -14,7 +14,7 @@ IF[{- !$disabled{asm} -}]
$POLY1305ASM_s390x=poly1305-s390x.S
$POLY1305ASM_armv4=poly1305-armv4.S
$POLY1305ASM_aarch64=poly1305-armv8.S
$POLY1305ASM_aarch64=poly1305-armv8.S poly1305-armv9-sve2.S
$POLY1305ASM_ppc32=poly1305_ppc.c poly1305-ppc.s poly1305-ppcfp.s
$POLY1305ASM_ppc64=$POLY1305ASM_ppc32
@@ -45,7 +45,9 @@ GENERATE[poly1305-ppcfp.s]=asm/poly1305-ppcfp.pl
GENERATE[poly1305-armv4.S]=asm/poly1305-armv4.pl
INCLUDE[poly1305-armv4.o]=..
GENERATE[poly1305-armv8.S]=asm/poly1305-armv8.pl
GENERATE[poly1305-armv9-sve2.S]=asm/poly1305-armv9-sve2.pl
INCLUDE[poly1305-armv8.o]=..
INCLUDE[poly1305-armv9-sve2.o]=..
GENERATE[poly1305-mips.S]=asm/poly1305-mips.pl
INCLUDE[poly1305-mips.o]=..
GENERATE[poly1305-c64xplus.S]=asm/poly1305-c64xplus.pl