diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl index 8dc06dd52a..f38a64bc6b 100755 --- a/crypto/arm64cpuid.pl +++ b/crypto/arm64cpuid.pl @@ -120,6 +120,14 @@ _armv8_sve2_probe: ret .size _armv8_sve2_probe,.-_armv8_sve2_probe +.globl _armv8_sve_get_vl_bytes +.type _armv8_sve_get_vl_bytes,%function +_armv8_sve_get_vl_bytes: + AARCH64_VALID_CALL_TARGET + .inst 0x0420e3e0 // cntb x0 + ret +.size _armv8_sve_get_vl_bytes,.-_armv8_sve_get_vl_bytes + .globl _armv8_cpuid_probe .type _armv8_cpuid_probe,%function _armv8_cpuid_probe: diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h index fc780a7080..b037e1b9f1 100644 --- a/crypto/arm_arch.h +++ b/crypto/arm_arch.h @@ -86,9 +86,10 @@ extern unsigned int OPENSSL_armv8_rsa_neonized; # define ARMV8_SHA3 (1<<11) # define ARMV8_UNROLL8_EOR3 (1<<12) # define ARMV8_SVE (1<<13) -# define ARMV8_SVE2 (1<<14) +# define ARMV9_SVE2 (1<<14) # define ARMV8_HAVE_SHA3_AND_WORTH_USING (1<<15) # define ARMV8_UNROLL12_EOR3 (1<<16) +# define ARMV9_SVE2_POLY1305 (1<<17) /* * MIDR_EL1 system register diff --git a/crypto/armcap.c b/crypto/armcap.c index 7eeea93bd1..84f621aeb8 100644 --- a/crypto/armcap.c +++ b/crypto/armcap.c @@ -24,11 +24,18 @@ #include #endif #include "arm_arch.h" +#ifdef __aarch64__ +#include +#endif unsigned int OPENSSL_armcap_P = 0; unsigned int OPENSSL_arm_midr = 0; unsigned int OPENSSL_armv8_rsa_neonized = 0; +#ifdef __aarch64__ +uint64_t _armv8_sve_get_vl_bytes(void); +#endif + #ifdef _WIN32 void OPENSSL_cpuid_setup(void) { @@ -346,7 +353,7 @@ void OPENSSL_cpuid_setup(void) OPENSSL_armcap_P |= ARMV8_SVE; if (getauxval(OSSL_HWCAP2) & OSSL_HWCAP2_SVE2) - OPENSSL_armcap_P |= ARMV8_SVE2; + OPENSSL_armcap_P |= ARMV9_SVE2; if (getauxval(OSSL_HWCAP2) & OSSL_HWCAP2_RNG) OPENSSL_armcap_P |= ARMV8_RNG; @@ -391,7 +398,7 @@ void OPENSSL_cpuid_setup(void) } # ifdef __aarch64__ OPENSSL_armcap_P |= arm_probe_for(_armv8_sve_probe, ARMV8_SVE); - OPENSSL_armcap_P |= arm_probe_for(_armv8_sve2_probe, ARMV8_SVE2); + OPENSSL_armcap_P |= arm_probe_for(_armv8_sve2_probe, ARMV9_SVE2); OPENSSL_armcap_P |= arm_probe_for(_armv8_rng_probe, ARMV8_RNG); # endif @@ -450,6 +457,17 @@ void OPENSSL_cpuid_setup(void) MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_QCOMM, QCOM_CPU_PART_ORYON_X1)) && (OPENSSL_armcap_P & ARMV8_SHA3)) OPENSSL_armcap_P |= ARMV8_HAVE_SHA3_AND_WORTH_USING; + if (OPENSSL_armcap_P & ARMV9_SVE2) { + uint64_t vl_bytes = _armv8_sve_get_vl_bytes(); + + if (vl_bytes > 16 && (vl_bytes & (vl_bytes - 1)) == 0) { + /* + * This implementation faster if vector length > 128 bits + * But vector length must be a power of 2 (e.g. 
256, 512 bits) + */ + OPENSSL_armcap_P |= ARMV9_SVE2_POLY1305; + } + } # endif } #endif /* _WIN32, __ARM_MAX_ARCH__ >= 7 */ diff --git a/crypto/chacha/asm/chacha-armv8-sve.pl b/crypto/chacha/asm/chacha-armv8-sve.pl index 62a8be6fe1..40454c3322 100755 --- a/crypto/chacha/asm/chacha-armv8-sve.pl +++ b/crypto/chacha/asm/chacha-armv8-sve.pl @@ -756,7 +756,7 @@ ChaCha20_ctr32_sve: mov $sve2flag,0 adrp $tmp,OPENSSL_armcap_P ldr $tmpw,[$tmp,#:lo12:OPENSSL_armcap_P] - tst $tmpw,#ARMV8_SVE2 + tst $tmpw,#ARMV9_SVE2 b.eq 1f mov $sve2flag,1 b 2f diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl index cc2052ecc9..6659cd631f 100755 --- a/crypto/poly1305/asm/poly1305-armv8.pl +++ b/crypto/poly1305/asm/poly1305-armv8.pl @@ -69,6 +69,8 @@ $code.=<<___; .globl poly1305_emit .hidden poly1305_emit +.extern poly1305_blocks_sve2 + .type poly1305_init,%function .align 5 poly1305_init: @@ -109,6 +111,13 @@ poly1305_init: csel $d0,$d0,$r0,eq csel $d1,$d1,$r1,eq + tst w17, #ARMV9_SVE2_POLY1305 + + adrp $r0,poly1305_blocks_sve2 + add $r0,$r0,#:lo12:poly1305_blocks_sve2 + + csel $d0,$d0,$r0,eq + #ifdef __ILP32__ stp w12,w13,[$len] #else diff --git a/crypto/poly1305/asm/poly1305-armv9-sve2.pl b/crypto/poly1305/asm/poly1305-armv9-sve2.pl new file mode 100755 index 0000000000..b68741fe58 --- /dev/null +++ b/crypto/poly1305/asm/poly1305-armv9-sve2.pl @@ -0,0 +1,1420 @@ +#! /usr/bin/env perl +# Copyright 2016-2025 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +############################################################################## +# +# Copyright (c) 2025, Iakov Polyak +# This file is an SVE2 port-and-merge of POLY1305 hash algorithm, derived from +# the OpenSSL Neon implementation and a vector length agnostic (VLA) +# RISC-V implementation from the CRYPTOGAMS project. +# +############################################################################## +# +# Copyright (c) 2006, CRYPTOGAMS by +# All rights reserved. +# +#Redistribution and use in source and binary forms, with or without +#modification, are permitted provided that the following conditions +#are met: +# +# * Redistributions of source code must retain copyright notices, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# * Neither the name of the CRYPTOGAMS nor the names of its +# copyright holder and contributors may be used to endorse or +# promote products derived from this software without specific +# prior written permission. +# +#ALTERNATIVELY, provided that this notice is retained in full, this +#product may be distributed under the terms of the GNU General Public +#License (GPL), in which case the provisions of the GPL apply INSTEAD OF +#those given above. +# +#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +#A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +#OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +#SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +#LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +#DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +#THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +#OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +############################################################################## +# +# September 2025 +# +# This is a 100% vector length agnostic implementation and has +# been tested with QEMU for the vector length of up to 2048 bits. +# +# On Graviton4, with the vector register length of 128 bits, +# it is less efficient than the Neon implementation by only 6%. +# This number has been obtained by running +# `openssl speed -evp ChaCha20-POLY1305` and +# `openssl speed -evp ChaCha20`, pinned to a single CPU, +# converting the 8192-byte result to cycles per byte +# using actual average runtime CPU frequency from `perf stat`, +# and taking the difference. On Graviton 4, this results in +# 0.62 cpb for Neon and 0.66 for SVE2. +# +# While Neon should probably be the default choice on a 128-bit architecture, +# speed-up is clearly expected with 256-bit and larger vector registers +# in the future. + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; +*STDOUT=*OUT; + +my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); + +my ($h0,$h1,$h2,$r0,$r1,$r2,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); + +my ($SVE_R0,$SVE_R1,$SVE_S1,$SVE_R2,$SVE_S2,$SVE_R3,$SVE_S3,$SVE_R4,$SVE_S4) = map("z$_.s",(0..8)); +my ($SVE_INlo_0,$SVE_INlo_1,$SVE_INlo_2,$SVE_INlo_3,$SVE_INlo_4) = map("z$_.s",(9..13)); +my ($SVE_INhi_0,$SVE_INhi_1,$SVE_INhi_2,$SVE_INhi_3,$SVE_INhi_4) = map("z$_.s",(14..18)); +my ($SVE_ACC0,$SVE_ACC1,$SVE_ACC2,$SVE_ACC3,$SVE_ACC4) = map("z$_.d",(19..23)); +my ($SVE_H0,$SVE_H1,$SVE_H2,$SVE_H3,$SVE_H4) = map("z$_.s",(24..28)); +my ($SVE_T0,$SVE_T1,$SVE_MASK) = map("z$_",(29..31)); + +my ($vl,$vl0,$vl1,$vl2,$vl3,$vl4) = ("x16",$h0,$h1,$h2,$r0,$r1); +my ($cs0,$cs1,$cs2,$cs3,$cs4,$cs5) = map("x$_",(19..24)); +my ($pwr,$mask) = map("x$_",(25..26)); +my $is_base2_26 = "w17"; + +$code.=<<___; +#include "arm_arch.h" + +.text + +.arch armv8-a + +.extern poly1305_blocks + +// --- poly1305_sw_2_26 --- +// Performs conversion of 3 base2_44 to 5 base2_26 scalars and +// stores them in memory at addresses [x5], [x5,#28], [x5,#56], +// [x5,#84] and [x5,#112]. +// +// This is a leaf function and does not modify stack. +// +// Calling Convention: +// Inputs: +// x5: Pointer into memory where 1st value should be stored. 
+// x7-x9: The three base2_44 scalar values (r0-r2) +// Clobbers (uses as temporaries): +// x10-x15 +.type poly1305_sw_2_26,%function +.align 5 +poly1305_sw_2_26: + // Converts 3 base2_44 -> 5 base2_26 values and stores + mov x15,#0x3ffffff // w15 : 2^26-1 mask + and x10,$r0,x15 // w10 -> r0 + lsr x11,$r0,#26 // w11 : top 18 bits of r0 + str w10,[x5] // Store r0 + bfi x11,$r1,#18,#8 // w11 -> r1 + ubfx x12,$r1,#8,#26 // w12 -> r2 + str w11,[x5,#28] // Store r1 + lsr x13,$r1,#34 // w13 : top 10 bits of r1 + str w12,[x5,#56] // Store r2 + bfi x13,$r2,#10,#16 // w13 -> r3 + lsr x14,$r2,#16 // w14 -> r4 + str w13,[x5,#84] // Store r3 + str w14,[x5,#112] // Store r4 + ret +.size poly1305_sw_2_26,.-poly1305_sw_2_26 + +// --- poly1305_sqr_2_44 --- +// Calculates base2_44 squaring operation. +// +// This is a leaf function and does not modify stack. +// It however uses callee-saved registers as scratch, so those must be +// saved on stack prior to calling. +// +// Calling Convention: +// Inputs: +// x7-x9: The three base2_44 scalar values (r0-r2) +// Outputs: +// x7-x9: The three base2_44 scalar values, squared (r0-r2) +// Clobbers (uses as temporaries): +// x10-x15, x19-x24, x26 +.type poly1305_sqr_2_44,%function +.align 5 +poly1305_sqr_2_44: + + // Pre-calculate constants and doubled terms. + mov x12,#20 + lsl x13,$r1,#1 // x13 = r1 * 2 + mul x12,$r2,x12 // x12 = r2 * 20 + lsl x10,$r0,#1 // x10 = r0 * 2 + + // --- Calculate d2 = r1*r1 + 2*r0*r2 --- + umulh $cs5,$r1,$r1 // high part of r1*r1 + mul $cs4,$r1,$r1 // low part of r1*r1 + umulh x15,x10,$r2 // high part of (r0*2)*r2 + mul x14,x10,$r2 // low part of (r0*2)*r2 + + // --- Calculate d0 = r0*r0 + 20*(2*r1*r2) --- + umulh $cs1,$r0,$r0 // high part of r0*r0 + mul $cs0,$r0,$r0 // low part of r0*r0 + umulh x11,x13,x12 // high part of (r1*2)*(r2*20) + mul x10,x13,x12 // low part of (r1*2)*(r2*20) + + adds $cs4,$cs4,x14 // d2_lo + adc $cs5,$cs5,x15 // d2_hi + + // --- Calculate d1 = 2*r0*r1 + 20*r2*r2 --- + // d1 is a 128-bit result stored in x7:x6 (hi:lo) + umulh $cs3,$r0,x13 // high part of r0*(r1*2) + mul $cs2,$r0,x13 // low part of r0*(r1*2) + umulh x13,$r2,x12 // high part of r2*(r2*20) + mul x12,$r2,x12 // low part of r2*(r2*20) + + adds $cs0,$cs0,x10 // d0_lo + adc $cs1,$cs1,x11 // d0_hi + + adds $cs2,$cs2,x12 // d1_lo + adc $cs3,$cs3,x13 // d1_hi + + // --- Reduction and Carry Propagation --- + // Reduce the 128-bit d0, d1, d2 back to three 44-bit limbs in x0, x1, x2 + lsr x10,$cs0,#44 // (d0_lo >> 44) + lsl x11,$cs1,#20 // (d0_hi << 20) - high 20 bits are zero + and $r0,$cs0,$mask // r0 -> d0_lo & mask + orr x10,x10,x11 // x10 -> 64-bit carry from d0 + + lsr x12,$cs2,#44 // (d1_lo >> 44) + lsl x13,$cs3,#20 // (d1_hi << 20) + and $r1,$cs2,$mask // r1 -> d1_lo & mask + orr x12,x12,x13 // x12 -> 64-bit carry from d1 + add $r1,$r1,x10 // r1 += carry from d0 + + lsr x11,$mask,#2 // x11 -> 2^42-1 mask for d2 reduction + lsr x10,$cs4,#42 // (d2_lo >> 42) + lsl x13,$cs5,#22 // (d2_hi << 22) + and $r2,$cs4,x11 // r2 -> d2_lo & 2^42-1 mask + orr x10,x10,x13 // x10 -> final carry from d2 + add $r2,$r2,x12 // r2 += carry from d1 + + // Handle ripple-carry from r2 and apply the *5 reduction. + lsr x13,$r2,#42 // Get carry from r2 (if r2 >= 2^42) + and $r2,$r2,x11 // Mask r2 back down to 42 bits + add x10,x10,x13 // Add this ripple-carry to the final carry + + add x11,x10,x10,lsl #2 // x11 -> final_carry * 5 + add $r0,$r0,x11 // r0 += final_carry * 5 + + // Final ripple-carry chain to ensure all limbs are 44 bits. 
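+	// (The *5 folding above relies on the Poly1305 prime 2^130-5:
+	// 2^130 == 5 (mod 2^130-5), so a carry out of the 44+44+42-bit
+	// limb set re-enters the low limb with weight 5, i.e.
+	// carry*5 = carry + (carry << 2). The two single-step passes
+	// below then propagate r1 -> r2 and r0 -> r1 once more.)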
+ lsr x11,$r1,#44 // Get carry from r1 + and $r1,$r1,$mask // Mask r1 to 44 bits + add $r2,$r2,x11 // r2 += carry from r1 + + lsr x10,$r0,#44 // Get carry from r0 + and $r0,$r0,$mask // Mask r0 to 44 bits + add $r1,$r1,x10 // r1 += carry from r0 + + ret +.size poly1305_sqr_2_44,.-poly1305_sqr_2_44 + +// --- poly1305_lazy_reduce_sve2 --- +// Performs lazy reduction on five accumulator vectors as discussed +// in "NEON crypto" by D.J. Bernstein and P. Schwabe. +// +// This is a leaf function and does not modify GPRs or the stack. +// +// Calling Convention: +// Inputs: +// z19-z23: The five 64-bit .d accumulator vectors (ACC0-ACC4) +// Outputs: +// z24-z28: The five 32-bit .s final limb vectors (H0-H4) +// z31: All-zeros (resets mask) +// Clobbers (uses as temporaries): +// z29, z30 + +.type poly1305_lazy_reduce_sve2,%function +.align 5 +poly1305_lazy_reduce_sve2: + dup ${SVE_MASK}.d,#-1 + lsr ${SVE_T0}.d,$SVE_ACC3,#26 + trn1 $SVE_H3,z22.s,z24.s // reproducing Neon's `xtn` - treat ACC3 as a .s vector + lsr ${SVE_MASK}.d,${SVE_MASK}.d,#38 + lsr ${SVE_T1}.d,$SVE_ACC0,#26 + and $SVE_ACC0,$SVE_ACC0,${SVE_MASK}.d + add $SVE_ACC4,$SVE_ACC4,${SVE_T0}.d // h3 -> h4 + // Neon's bic is replaced with &=$SVE_MASK (because of using even-indexed elements) + and z27.d,z27.d,${SVE_MASK}.d // refer to SVE_H3 as .d + add $SVE_ACC1,$SVE_ACC1,${SVE_T1}.d // h0 -> h1 + + lsr ${SVE_T0}.d,$SVE_ACC4,#26 + trn1 $SVE_H4,z23.s,z24.s // reproducing Neon's `xtn` - treat ACC4 as a .s vector + lsr ${SVE_T1}.d,$SVE_ACC1,#26 + trn1 $SVE_H1,z20.s,z24.s // reproducing Neon's `xtn` - treat ACC1 as a .s vector + and z28.d,z28.d,${SVE_MASK}.d // refer to SVE_H4 as .d + add $SVE_ACC2,$SVE_ACC2,${SVE_T1}.d // h1 -> h2 + + add $SVE_ACC0,$SVE_ACC0,${SVE_T0}.d + lsl ${SVE_T0}.d,${SVE_T0}.d,#2 + shrnb ${SVE_T1}.s,$SVE_ACC2,#26 // check it's OK + trn1 $SVE_H2,z21.s,z24.s // reproducing Neon's `xtn` - treat ACC2 as a .s vector + add $SVE_ACC0,$SVE_ACC0,${SVE_T0}.d // h4 -> h0 + and z25.d,z25.d,${SVE_MASK}.d // refer to SVE_H1 as .d + add $SVE_H3,$SVE_H3,${SVE_T1}.s // h2 -> h3 + and z26.d,z26.d,${SVE_MASK}.d // refer to SVE_H2 as .d + + shrnb ${SVE_T0}.s,$SVE_ACC0,#26 + trn1 $SVE_H0,z19.s,z24.s // reproducing Neon's `xtn` - treat ACC0 as a .s vector - re-writing H0 here... + lsr ${SVE_T1}.s,$SVE_H3,#26 + and z27.d,z27.d,${SVE_MASK}.d // refer to SVE_H3 as .d + add $SVE_H1,$SVE_H1,${SVE_T0}.s // h0 -> h1 + and z24.d,z24.d,${SVE_MASK}.d // refer to SVE_H0 as .d + add $SVE_H4,$SVE_H4,${SVE_T1}.s // h3 -> h4 + + eor ${SVE_MASK}.d,${SVE_MASK}.d,${SVE_MASK}.d // reset zero mask + + ret +.size poly1305_lazy_reduce_sve2,.-poly1305_lazy_reduce_sve2 + +// --- poly1305_blocks_sve2 --- +// Main function, implementing POLY1305 algorithm as discussed +// in "NEON crypto" by D.J. Bernstein and P. Schwabe, in a VLA fashion, +// using SVE2. +// +// It is mostly a port-and-merge of the 128-bit Neon implementation herein and +// a VLA risc-v implementation in https://github.com/dot-asm/cryptogams. +// +.globl poly1305_blocks_sve2 +.type poly1305_blocks_sve2,%function +.align 5 +poly1305_blocks_sve2: +.Lpoly1305_blocks_sve2: + AARCH64_VALID_CALL_TARGET + ldr $is_base2_26,[$ctx,#24] + // Estimate vector width and branch to scalar if input too short + cntd $vl // vector width in 64-bit lanes (vl) + lsl $vl0,$vl,#4 // vl * 16 (bytes per vector input blocks) + add $vl1,$vl0,$vl0,lsl #1 // 3 * vl * 16 - new threshold. 
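+	// (e.g. with 128-bit vectors cntd gives vl = 2, so the vector
+	// block size is vl*16 = 32 bytes and the threshold is 3*32 = 96
+	// bytes; shorter inputs fall through to the scalar
+	// poly1305_blocks unless the state is already in base 2^26)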
+ cmp $len,$vl1 + b.hs .Lblocks_sve2 + cbz $is_base2_26,.Lshort_blocks // Call scalar f-n if short; if in base 2^26 - proceed + +.Lblocks_sve2: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-144]! // Allowing for callee-saved reg-s + add x29,sp,#0 + + //Store some callee-saved GPRs + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + + ands $len,$len,#-16 + b.eq .Lno_data_sve2 + + cbz $is_base2_26,.Lbase2_64_sve2 + + ldp w10,w11,[$ctx] // load hash value base 2^26 + ldp w12,w13,[$ctx,#8] + ldr w14,[$ctx,#16] + + neg $vl1,$vl0 // - (vl * 16) + sub $vl0,$vl0,#1 // (vl * 16) - 1 + and $vl2,$len,$vl1 // $len - ($len % (vl * 16)) -> VLA length + and $vl4,$len,$vl0 // $len % (vl * 16) -> scalar remainder + cbz $vl4,.Leven_sve2 // If no scalar "head", proceed to VLA + add $vl3,$inp,$vl4 // Pointer to the start of the VLA data + stp $vl2,$vl3,[sp,#-16]! // Backup VLA length and ptr + mov $len,$vl4 // So that scalar part knows it's length + + add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 + lsr $h1,x12,#12 + adds $h0,$h0,x12,lsl#52 + add $h1,$h1,x13,lsl#14 + adc $h1,$h1,xzr + lsr $h2,x14,#24 + adds $h1,$h1,x14,lsl#40 + adc $d2,$h2,xzr // can be partially reduced... + + and $t0,$d2,#-4 // ... so reduce + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$h0,$t0 + adcs $h1,$h1,xzr + adc $h2,$h2,xzr + + stp $h0,$h1,[$ctx] // store hash value base 2^64 + str $h2,[$ctx,#16] + + bl poly1305_blocks // Calculate the scalar "head" + ldp $len,$inp,[sp],#16 // Recover updated length and input ptr + ldr x30,[sp,#8] + + cbz $padbit,.Lzero_padbit_sve2 // hash already stored in poly1305_blocks + + ldp $h0,$h1,[$ctx] // load hash value base 2^64 + ldr $h2,[$ctx,#16] + + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,$h0,#26,#26 + extr x12,$h1,$h0,#52 + and x12,x12,#0x03ffffff + ubfx x13,$h1,#14,#26 + extr x14,$h2,$h1,#40 + + cbnz $len,.Leven_sve2 + + stp w10,w11,[$ctx] // store hash value base 2^26 + stp w12,w13,[$ctx,#8] + str w14,[$ctx,#16] + b .Lno_data_sve2 + +.align 4 +.Lzero_padbit_sve2: + str xzr,[$ctx,#24] + b .Lno_data_sve2 + +.align 4 +.Lbase2_64_sve2: + neg $vl1,$vl0 // - (vl * 16) + sub $vl0,$vl0,#1 // (vl * 16) - 1 + and $vl2,$len,$vl1 // $len - ($len % (vl * 16)) -> VLA length + and $vl4,$len,$vl0 // $len % (vl * 16) -> scalar remainder + cbz $vl4,.Linit_sve2 // If no scalar "head", proceed to VLA + add $vl3,$inp,$vl4 // Pointer to the start of the VLA data + stp $vl2,$vl3,[sp,#-16]! // Backup VLA length and ptr + mov $len,$vl4 // So that scalar part knows it's length + bl poly1305_blocks // Calculate the scalar "head" + ldp $len,$inp,[sp],#16 // Recover updated length and input ptr + +.Linit_sve2: + // Calculating and storing r-powers (powers of a key). + // The layout of how r-powers are stored in memory: + ////////////////////////////////////////////////////////////////////////////////////// + // lobe 1 lobe 2 etc. // + // | .. r^{max},r^{max/2},...,r^2,r | .. r^{max},r^{max/2},...,r^2,r | .. 
// + // / \ / \ / \ // + // [$ctx,48] [$ctx,48+28] [$ctx,48+56] // + ////////////////////////////////////////////////////////////////////////////////////// + + ldr w5,[$ctx,#28] // Load top power (if exists - 0 by default) + add $pwr,$ctx,#48+28 // Point to the end of powers allocation (1st lobe) + + mov $mask,#-1 + lsr $mask,$mask,#20 //2^44-1 + + cbnz w5,.Lpwrs_precomputed + + ldp $r0,$r1,[$ctx,#32] // load key value + + lsr $r2,$r1,#24 // base2_64 -> base2_44 + extr $r1,$r1,$r0,#44 + and $r0,$r0,$mask + and $r1,$r1,$mask + + mov x4,$vl + add x5,$pwr,#-4 + bl poly1305_sw_2_26 + +.Loop_pwrs_sqr: + lsr x4,x4,#1 + add x5,x5,#-4 + bl poly1305_sqr_2_44 + bl poly1305_sw_2_26 + cbnz x4,.Loop_pwrs_sqr + + sub x5,x5,$pwr + str w5,[$ctx,#28] + +.Lpwrs_precomputed: + ldp $h0,$h1,[$ctx] // load hash value base 2^64 + ldr $h2,[$ctx,#16] + + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,$h0,#26,#26 + extr x12,$h1,$h0,#52 + and x12,x12,#0x03ffffff + ubfx x13,$h1,#14,#26 + extr x14,$h2,$h1,#40 + + stp d8,d9,[sp,#80] // meet ABI requirements + stp d10,d11,[sp,#96] + stp d12,d13,[sp,#112] + stp d14,d15,[sp,#128] + + // Zeroing H0-H4 registers + eor z24.d,z24.d,z24.d // H0 + eor z25.d,z25.d,z25.d // H1 + eor z26.d,z26.d,z26.d // H2 + eor z27.d,z27.d,z27.d // H3 + eor z28.d,z28.d,z28.d // H4 + + // Using Neon's fmov here for speed. + // We only need the low 26 bits in the first step so no need for post-mov reshuffle. + fmov d24,x10 // H0 + fmov d25,x11 // H1 + fmov d26,x12 // H2 + fmov d27,x13 // H3 + fmov d28,x14 // H4 + + ldr x30,[sp,#8] + + mov x4,#1 + stur w4,[$ctx,#24] // set is_base2_26 + b .Ldo_sve2 + +.align 4 +.Leven_sve2: + // In principle all this could be moved to Ldo_sve2 + stp d8,d9,[sp,#80] // meet ABI requirements + stp d10,d11,[sp,#96] + stp d12,d13,[sp,#112] + stp d14,d15,[sp,#128] + + eor z24.d,z24.d,z24.d // H0 + eor z25.d,z25.d,z25.d // H1 + eor z26.d,z26.d,z26.d // H2 + eor z27.d,z27.d,z27.d // H3 + eor z28.d,z28.d,z28.d // H4 + + fmov d24,x10 // H0 + fmov d25,x11 // H1 + fmov d26,x12 // H2 + fmov d27,x13 // H3 + fmov d28,x14 // H4 + +.Ldo_sve2: + ptrue p0.b, ALL // Set all-true predicate + + // Load r-powers. + // They are stored in five lobes, in the order r^{max},...,r^2,r^1 each. + // We need specific powers to be at specific R- and S-vector indices. + // Hence we can't load all of them, an arbitrary amount, dependent on VL. + // Instead we load {r^{max},r^{max/2}} and {r^2,r^1} in batches, + // and then interleave them using zip1 as {r^{max},r^2,r^{max/2},r}. + // We don't really care where r^{max} and r^{max/2} are, but we want + // r^2 and r to be in either even or odd lanes. We chose lanes 1 and 3. + // Intermediate r-powers (r^{max/4},..,r^4), if applicable, will be + // reloaded into lane 0 iteratively in Loop_reduce_sve2. + + ldr w5,[$ctx,#28] + sxtw x5,w5 // Zero-extend + add $pwr,$ctx,#48+28 // Pointer to the end of the r-powers 1st lobe + add x10,$ctx,#48+20 // Pointer to r^2. + add $pwr,$pwr,x5 // Pointer to the r^{max} + + mov x15,#2 + whilelo p1.s,xzr,x15 + + // If wouldn't need to load in two chunks, could use ld1rqw - + // optimisation potential for 256-bit vector. 
+ ld1w { $SVE_R0 },p1/z,[$pwr] + ld1w { $SVE_T0.s },p1/z,[x10] + add $pwr,$pwr,#28 + add x10,x10,#28 + zip1 $SVE_R0,$SVE_R0,$SVE_T0.s + + ld1w { $SVE_R1 },p1/z,[$pwr] + ld1w { $SVE_T1.s },p1/z,[x10] + add $pwr,$pwr,#28 + add x10,x10,#28 + zip1 $SVE_R1,$SVE_R1,$SVE_T1.s + + ld1w { $SVE_R2 },p1/z,[$pwr] + ld1w { $SVE_T0.s },p1/z,[x10] + add $pwr,$pwr,#28 + add x10,x10,#28 + zip1 $SVE_R2,$SVE_R2,$SVE_T0.s + + ld1w { $SVE_R3 },p1/z,[$pwr] + ld1w { $SVE_T1.s },p1/z,[x10] + add $pwr,$pwr,#28 + add x10,x10,#28 + zip1 $SVE_R3,$SVE_R3,$SVE_T1.s + + ld1w { $SVE_R4 },p1/z,[$pwr] + ld1w { $SVE_T0.s },p1/z,[x10] + sub $pwr,$pwr,#104 // Adjust to 1st lobe, 3d power + zip1 $SVE_R4,$SVE_R4,$SVE_T0.s + + // Broadcast r-powers loaded above to higher parts of the R-vectors. + cmp $vl,#2 + b.eq .L_skip_dup_broadcast + dup z0.q,z0.q[0] + dup z1.q,z1.q[0] + dup z3.q,z3.q[0] + dup z5.q,z5.q[0] + dup z7.q,z7.q[0] + +.L_skip_dup_broadcast: + // Calculate S-vectors (r^x*5) + adr $SVE_S1,[$SVE_R1,$SVE_R1,lsl #2] + adr $SVE_S2,[$SVE_R2,$SVE_R2,lsl #2] + adr $SVE_S3,[$SVE_R3,$SVE_R3,lsl #2] + adr $SVE_S4,[$SVE_R4,$SVE_R4,lsl #2] + + // Load initial input blocks + lsr x15,$len,#4 + whilelo p1.s,xzr,x15 // Set predicate for blocks loading + lsl $padbit,$padbit,#24 + ld4w { z9.s-z12.s },p1/z,[$inp] // Loading all blocks at once + +#ifdef __AARCH64EB__ + revb z9.s, p0/m, z9.s + revb z10.s, p0/m, z10.s + revb z11.s, p0/m, z11.s + revb z12.s, p0/m, z12.s +#endif + + // In-vector (VLA) conversion base2_64 -> base2_26. + dup ${SVE_MASK}.s,#-1 + lsr ${SVE_MASK}.s,${SVE_MASK}.s,#6 + + lsr ${SVE_T0}.s,z11.s,#14 // T0 -> z11 >> 14 + lsr z13.s,z12.s,#8 // z13 -> l4 + lsl z11.s,z11.s,#12 // z11 -> upper part of l2 + lsl z12.s,z12.s,#18 // z12 -> upper part of l3 + lsr ${SVE_T1}.s,z10.s,#20 // T1 -> z10 >> 20 + orr z12.d,z12.d,${SVE_T0}.d // z12 -> final l3 + lsl z10.s,z10.s,#6 // z10 -> upper part of l1 + lsr ${SVE_T0}.s,z9.s,#26 // T0 -> z9 >> 26 + and z9.d,z9.d,${SVE_MASK}.d // z9 is now final l0 + orr z11.d,z11.d,${SVE_T1}.d // z11 -> final l2 + orr z10.d,z10.d,${SVE_T0}.d // z10 -> final l1 + dup ${SVE_T1}.s,w3 // x3 -> $padbit but need it as a word + eor ${SVE_T0}.d,${SVE_T0}.d,${SVE_T0}.d // set zero mask + orr z13.d,z13.d,${SVE_T1}.d // l4 += padbit + and z12.d,z12.d,${SVE_MASK}.d // Mask l3 + and z11.d,z11.d,${SVE_MASK}.d // Mask l2 + and z10.d,z10.d,${SVE_MASK}.d // Mask l1 + + + // Move high blocks from INlo -> INhi and sparcify (put in even lanes) + zip2 z14.s,z9.s,${SVE_T0}.s + zip2 z18.s,z13.s,${SVE_T0}.s + zip2 z17.s,z12.s,${SVE_T0}.s + zip2 z16.s,z11.s,${SVE_T0}.s + zip2 z15.s,z10.s,${SVE_T0}.s + + // Sparcify blocks to even lanes in INlo + zip1 z9.s,z9.s,${SVE_T0}.s + zip1 z13.s,z13.s,${SVE_T0}.s + zip1 z12.s,z12.s,${SVE_T0}.s + zip1 z11.s,z11.s,${SVE_T0}.s + zip1 z10.s,z10.s,${SVE_T0}.s + + subs $len,$len,$vl,lsl #5 // By half vector width * 32 + + b.ls .Lskip_loop_sve2 + +.align 4 +.Loop_sve2: + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // ((inp[0]*r^{vl*2} + inp[vl] *r^{vl} + inp[2*vl] )*r^{vl} + inp[3*vl] )*r^{vl} + //+((inp[1]*r^{vl*2} + inp[vl+1]*r^{vl} + inp[2*vl+1])*r^{vl} + inp[3*vl+1])*r^{vl-1} + //+... 
+ // \_______________________________/ \_________________________________________/ + // first main loop iteration long tail + // + // ((inp[0]*r^{vl*2} + inp[vl] *r^{vl} + inp[2*vl] )*r^{vl*2} + inp[3*vl] *r^{vl} + inp[4*vl] )*r^{vl} + //+((inp[1]*r^{vl*2} + inp[vl+1]*r^{vl} + inp[2*vl+1])*r^{vl*2} + inp[3*vl+1]*r^{vl} + inp[4*vl+1])*r^{vl-1} + //+... + // \_______________________________/ \________________________________________/ \___________________/ + // first main loop iteration second main loop iteration short tail + // + // Note that we start with inp[vl:vl*2]*r^{vl}, as it + // doesn't depend on reduction in previous iteration. + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Hash-key power product f-la for the 5 limbs in base2^26 representation: + // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 + // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 + // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 + // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 + // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 + + add $inp,$inp,$vl,lsl #5 + + umullb $SVE_ACC4,$SVE_INhi_0,${SVE_R4}[2] + umullb $SVE_ACC3,$SVE_INhi_0,${SVE_R3}[2] + umullb $SVE_ACC2,$SVE_INhi_0,${SVE_R2}[2] + umullb $SVE_ACC1,$SVE_INhi_0,${SVE_R1}[2] + umullb $SVE_ACC0,$SVE_INhi_0,${SVE_R0}[2] + + umlalb $SVE_ACC4,$SVE_INhi_1,${SVE_R3}[2] + umlalb $SVE_ACC3,$SVE_INhi_1,${SVE_R2}[2] + umlalb $SVE_ACC2,$SVE_INhi_1,${SVE_R1}[2] + umlalb $SVE_ACC1,$SVE_INhi_1,${SVE_R0}[2] + umlalb $SVE_ACC0,$SVE_INhi_1,${SVE_S4}[2] + + umlalb $SVE_ACC4,$SVE_INhi_2,${SVE_R2}[2] + umlalb $SVE_ACC3,$SVE_INhi_2,${SVE_R1}[2] + umlalb $SVE_ACC2,$SVE_INhi_2,${SVE_R0}[2] + umlalb $SVE_ACC1,$SVE_INhi_2,${SVE_S4}[2] + umlalb $SVE_ACC0,$SVE_INhi_2,${SVE_S3}[2] + + umlalb $SVE_ACC4,$SVE_INhi_3,${SVE_R1}[2] + umlalb $SVE_ACC3,$SVE_INhi_3,${SVE_R0}[2] + umlalb $SVE_ACC2,$SVE_INhi_3,${SVE_S4}[2] + umlalb $SVE_ACC1,$SVE_INhi_3,${SVE_S3}[2] + umlalb $SVE_ACC0,$SVE_INhi_3,${SVE_S2}[2] + + add $SVE_INlo_2,$SVE_INlo_2,$SVE_H2 + umlalb $SVE_ACC4,$SVE_INhi_4,${SVE_R0}[2] + umlalb $SVE_ACC3,$SVE_INhi_4,${SVE_S4}[2] + umlalb $SVE_ACC2,$SVE_INhi_4,${SVE_S3}[2] + umlalb $SVE_ACC1,$SVE_INhi_4,${SVE_S2}[2] + umlalb $SVE_ACC0,$SVE_INhi_4,${SVE_S1}[2] + + ////////////////////////////////////////////////////////////////////// + // (hash+inp[0:vl])*r^{vl*2} and accumulate + // Interleave add+mul with loading and converting the next input batch + + add $SVE_INlo_0,$SVE_INlo_0,$SVE_H0 + lsr x15,$len,#4 + umlalb $SVE_ACC3,$SVE_INlo_2,${SVE_R1}[0] + whilelo p1.s,xzr,x15 + umlalb $SVE_ACC0,$SVE_INlo_2,${SVE_S3}[0] + ld4w { z14.s-z17.s }, p1/z, [$inp] + umlalb $SVE_ACC4,$SVE_INlo_2,${SVE_R2}[0] + umlalb $SVE_ACC1,$SVE_INlo_2,${SVE_S4}[0] + umlalb $SVE_ACC2,$SVE_INlo_2,${SVE_R0}[0] + +#ifdef __AARCH64EB__ + revb z14.s, p0/m, z14.s + revb z15.s, p0/m, z15.s + revb z16.s, p0/m, z16.s + revb z17.s, p0/m, z17.s +#endif + + add $SVE_INlo_1,$SVE_INlo_1,$SVE_H1 + dup ${SVE_MASK}.s,#-1 + umlalb $SVE_ACC3,$SVE_INlo_0,${SVE_R3}[0] + lsr ${SVE_MASK}.s,${SVE_MASK}.s,#6 + umlalb $SVE_ACC4,$SVE_INlo_0,${SVE_R4}[0] + lsr ${SVE_T0}.s,z16.s,#14 // T0 -> z16 >> 14 + umlalb $SVE_ACC2,$SVE_INlo_0,${SVE_R2}[0] + lsr z18.s,z17.s,#8 // z18 -> l4 + umlalb $SVE_ACC0,$SVE_INlo_0,${SVE_R0}[0] + lsl z16.s,z16.s,#12 // z16 -> upper part of l2 + umlalb $SVE_ACC1,$SVE_INlo_0,${SVE_R1}[0] + lsl z17.s,z17.s,#18 // z17 -> upper part of l3 + + add $SVE_INlo_3,$SVE_INlo_3,$SVE_H3 + lsr ${SVE_T1}.s,z15.s,#20 // T1 -> z15 >> 20 + umlalb 
$SVE_ACC3,$SVE_INlo_1,${SVE_R2}[0] + orr z17.d,z17.d,${SVE_T0}.d // z17 -> final l3 + umlalb $SVE_ACC4,$SVE_INlo_1,${SVE_R3}[0] + lsl z15.s,z15.s,#6 // z15 -> upper part of l1 + umlalb $SVE_ACC0,$SVE_INlo_1,${SVE_S4}[0] + lsr ${SVE_T0}.s,z14.s,#26 // T0 -> z14 >> 26 + umlalb $SVE_ACC2,$SVE_INlo_1,${SVE_R1}[0] + and z14.d,z14.d,${SVE_MASK}.d // z14 is now final l0 + umlalb $SVE_ACC1,$SVE_INlo_1,${SVE_R0}[0] + orr z16.d,z16.d,${SVE_T1}.d // z16 -> final l2 + + add $SVE_INlo_4,$SVE_INlo_4,$SVE_H4 + orr z15.d,z15.d,${SVE_T0}.d // z15 -> final l1 + umlalb $SVE_ACC3,$SVE_INlo_3,${SVE_R0}[0] + dup ${SVE_T1}.s,w3 + umlalb $SVE_ACC0,$SVE_INlo_3,${SVE_S2}[0] + eor ${SVE_T0}.d,${SVE_T0}.d,${SVE_T0}.d // set zero mask + umlalb $SVE_ACC4,$SVE_INlo_3,${SVE_R1}[0] + orr z18.d,z18.d,${SVE_T1}.d // l4 += padbit + umlalb $SVE_ACC1,$SVE_INlo_3,${SVE_S3}[0] + and z17.d,z17.d,${SVE_MASK}.d // Mask l3 + umlalb $SVE_ACC2,$SVE_INlo_3,${SVE_S4}[0] + and z16.d,z16.d,${SVE_MASK}.d // Mask l2 + + umlalb $SVE_ACC3,$SVE_INlo_4,${SVE_S4}[0] + and z15.d,z15.d,${SVE_MASK}.d // Mask l1 + umlalb $SVE_ACC0,$SVE_INlo_4,${SVE_S1}[0] + zip1 z9.s,z14.s,${SVE_T0}.s + umlalb $SVE_ACC4,$SVE_INlo_4,${SVE_R0}[0] + zip1 z10.s,z15.s,${SVE_T0}.s + umlalb $SVE_ACC1,$SVE_INlo_4,${SVE_S2}[0] + zip1 z11.s,z16.s,${SVE_T0}.s + umlalb $SVE_ACC2,$SVE_INlo_4,${SVE_S3}[0] + zip1 z12.s,z17.s,${SVE_T0}.s + zip1 z13.s,z18.s,${SVE_T0}.s + + // Sparcify blocks to even lanes in INlo + zip2 z14.s,z14.s,${SVE_T0}.s + zip2 z15.s,z15.s,${SVE_T0}.s + zip2 z16.s,z16.s,${SVE_T0}.s + zip2 z17.s,z17.s,${SVE_T0}.s + zip2 z18.s,z18.s,${SVE_T0}.s + + subs $len,$len,$vl,lsl #5 + + // Lazy reduction + bl poly1305_lazy_reduce_sve2 + ldr x30,[sp,#8] + + b.hi .Loop_sve2 + +.Lskip_loop_sve2: + + adds $len,$len,$vl,lsl #4 // By half the usual input size + b.eq .Lshort_tail_sve2 + +.Long_tail_sve2: + //////////////////////////////////////////////////////////////// + // (hash + inp[lo])*r^{vl} + inp[hi])*r^{vl..1} // + // \____________________/ // + // first part of long tail // + //////////////////////////////////////////////////////////////// + //NB `vl` here (and in the code) is the vector length in double words. + // Intereaving algebra with copying INhi -> INlo for the next steps. 
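+	// (the products here use index [2], i.e. r^{vl}, while the
+	// mov's copy INhi -> INlo for the folding/short-tail steps
+	// that follow, where r^2 and r sit in lanes 1 and 3)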
+ + add $SVE_INlo_2,$SVE_INlo_2,$SVE_H2 + add $SVE_INlo_0,$SVE_INlo_0,$SVE_H0 + add $SVE_INlo_1,$SVE_INlo_1,$SVE_H1 + add $SVE_INlo_3,$SVE_INlo_3,$SVE_H3 + add $SVE_INlo_4,$SVE_INlo_4,$SVE_H4 + + umullb $SVE_ACC3,$SVE_INlo_2,${SVE_R1}[2] + umullb $SVE_ACC0,$SVE_INlo_2,${SVE_S3}[2] + umullb $SVE_ACC4,$SVE_INlo_2,${SVE_R2}[2] + umullb $SVE_ACC1,$SVE_INlo_2,${SVE_S4}[2] + umullb $SVE_ACC2,$SVE_INlo_2,${SVE_R0}[2] + + umlalb $SVE_ACC3,$SVE_INlo_0,${SVE_R3}[2] + umlalb $SVE_ACC4,$SVE_INlo_0,${SVE_R4}[2] + umlalb $SVE_ACC2,$SVE_INlo_0,${SVE_R2}[2] + umlalb $SVE_ACC0,$SVE_INlo_0,${SVE_R0}[2] + umlalb $SVE_ACC1,$SVE_INlo_0,${SVE_R1}[2] + mov z11.d,z16.d + + umlalb $SVE_ACC3,$SVE_INlo_1,${SVE_R2}[2] + umlalb $SVE_ACC4,$SVE_INlo_1,${SVE_R3}[2] + umlalb $SVE_ACC0,$SVE_INlo_1,${SVE_S4}[2] + umlalb $SVE_ACC2,$SVE_INlo_1,${SVE_R1}[2] + umlalb $SVE_ACC1,$SVE_INlo_1,${SVE_R0}[2] + mov z9.d,z14.d + + umlalb $SVE_ACC3,$SVE_INlo_3,${SVE_R0}[2] + umlalb $SVE_ACC0,$SVE_INlo_3,${SVE_S2}[2] + umlalb $SVE_ACC4,$SVE_INlo_3,${SVE_R1}[2] + umlalb $SVE_ACC1,$SVE_INlo_3,${SVE_S3}[2] + umlalb $SVE_ACC2,$SVE_INlo_3,${SVE_S4}[2] + mov z10.d,z15.d + + umlalb $SVE_ACC3,$SVE_INlo_4,${SVE_S4}[2] + umlalb $SVE_ACC0,$SVE_INlo_4,${SVE_S1}[2] + umlalb $SVE_ACC4,$SVE_INlo_4,${SVE_R0}[2] + umlalb $SVE_ACC1,$SVE_INlo_4,${SVE_S2}[2] + umlalb $SVE_ACC2,$SVE_INlo_4,${SVE_S3}[2] + mov z12.d,z17.d + + // Lazy reduction + bl poly1305_lazy_reduce_sve2 + ldr x30,[sp,#8] + + mov z13.d,z18.d + +.Lshort_tail_sve2: + + cmp $vl, #2 + b.ls .Last_reduce_sve2 + + mov x15,#1 + whilelo p1.s,xzr,x15 + +.Loop_reduce_sve2: + //////////////////////////////////////////////////////////////// + // (hash + inp[hi])*r^{vl/2..2} // + // \____________________/ // + // iterative reduction part of the short tail // + //////////////////////////////////////////////////////////////// + // Last column of products is calculated by iteratively "folding" vectors: + // 1. If vl==2 - skip to Last_reduce_sve2 + // 2. calculate product with r^{vl/2} -> ACC{0-4} + // 3. lazy reduction -> H{0-4} + // 4. upper half of vectors (INlo{0-4}) is copied to lower halves + // 5. If vl/2==2 - go to Last_reduce_sve2 + // 6. continue with 2. + // NB: this part is skipped for 128-bit case (vl==2) + // For 256-bit, no intermediate loading is necessary - r^2 is already in [1]. + // So a special case can be easily implemented, when corresponding hardware is available. + + // Load the intermediate r-power into the 0th lanes of vectors + // Interleave with broadcasting and S-vector calculation. + ldr w10,[$pwr] + ldr w11,[$pwr,#28] + ldr w12,[$pwr,#56] + cpy $SVE_R0,p1/m,w10 + ldr w13,[$pwr,#84] + cpy $SVE_R1,p1/m,w11 + dup z0.q,z0.q[0] + ldr w14,[$pwr,#112] + cpy $SVE_R2,p1/m,w12 + dup z1.q,z1.q[0] + cpy $SVE_R3,p1/m,w13 + dup z3.q,z3.q[0] + cpy $SVE_R4,p1/m,w14 + add $pwr,$pwr,#4 // Increment pointer for the next iteration + dup z5.q,z5.q[0] + dup z7.q,z7.q[0] + + // Interleaved hash contraction and S-vector calc. 
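+	// (e.g. for vl == 8, i.e. 512-bit vectors, this loop runs twice:
+	// first multiplying by r^4 and folding the upper half of the
+	// hash lanes onto the lower half, then by r^2, leaving two
+	// lanes for .Last_reduce_sve2; for vl == 4 it runs just once)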
+ add $SVE_INlo_2,$SVE_INlo_2,$SVE_H2 + adr $SVE_S1,[$SVE_R1,$SVE_R1,lsl #2] + add $SVE_INlo_0,$SVE_INlo_0,$SVE_H0 + adr $SVE_S2,[$SVE_R2,$SVE_R2,lsl #2] + add $SVE_INlo_1,$SVE_INlo_1,$SVE_H1 + adr $SVE_S3,[$SVE_R3,$SVE_R3,lsl #2] + add $SVE_INlo_3,$SVE_INlo_3,$SVE_H3 + adr $SVE_S4,[$SVE_R4,$SVE_R4,lsl #2] + add $SVE_INlo_4,$SVE_INlo_4,$SVE_H4 + + umullb $SVE_ACC3,$SVE_INlo_0,${SVE_R3}[0] + umullb $SVE_ACC4,$SVE_INlo_0,${SVE_R4}[0] + umullb $SVE_ACC2,$SVE_INlo_0,${SVE_R2}[0] + umullb $SVE_ACC0,$SVE_INlo_0,${SVE_R0}[0] + umullb $SVE_ACC1,$SVE_INlo_0,${SVE_R1}[0] + + umlalb $SVE_ACC3,$SVE_INlo_1,${SVE_R2}[0] + umlalb $SVE_ACC4,$SVE_INlo_1,${SVE_R3}[0] + umlalb $SVE_ACC0,$SVE_INlo_1,${SVE_S4}[0] + umlalb $SVE_ACC2,$SVE_INlo_1,${SVE_R1}[0] + umlalb $SVE_ACC1,$SVE_INlo_1,${SVE_R0}[0] + + umlalb $SVE_ACC3,$SVE_INlo_2,${SVE_R1}[0] + umlalb $SVE_ACC0,$SVE_INlo_2,${SVE_S3}[0] + umlalb $SVE_ACC4,$SVE_INlo_2,${SVE_R2}[0] + umlalb $SVE_ACC1,$SVE_INlo_2,${SVE_S4}[0] + umlalb $SVE_ACC2,$SVE_INlo_2,${SVE_R0}[0] + + umlalb $SVE_ACC3,$SVE_INlo_3,${SVE_R0}[0] + umlalb $SVE_ACC0,$SVE_INlo_3,${SVE_S2}[0] + umlalb $SVE_ACC4,$SVE_INlo_3,${SVE_R1}[0] + umlalb $SVE_ACC1,$SVE_INlo_3,${SVE_S3}[0] + umlalb $SVE_ACC2,$SVE_INlo_3,${SVE_S4}[0] + + umlalb $SVE_ACC3,$SVE_INlo_4,${SVE_S4}[0] + umlalb $SVE_ACC0,$SVE_INlo_4,${SVE_S1}[0] + umlalb $SVE_ACC4,$SVE_INlo_4,${SVE_R0}[0] + umlalb $SVE_ACC1,$SVE_INlo_4,${SVE_S2}[0] + umlalb $SVE_ACC2,$SVE_INlo_4,${SVE_S3}[0] + + // Lazy reduction + bl poly1305_lazy_reduce_sve2 + ldr x30,[sp,#8] + + // Move higher part of vectors to lower part, depending on current vl + // NB look-up is done in terms of single-word lanes, hence indices + // start from vl (refer to as w16) and not vl/2 + // Higher part now contains "junk" + index ${SVE_T0}.s,w16,#1 + tbl ${SVE_INlo_0},${SVE_INlo_0},${SVE_T0}.s + tbl ${SVE_INlo_1},${SVE_INlo_1},${SVE_T0}.s + tbl ${SVE_INlo_2},${SVE_INlo_2},${SVE_T0}.s + tbl ${SVE_INlo_3},${SVE_INlo_3},${SVE_T0}.s + tbl ${SVE_INlo_4},${SVE_INlo_4},${SVE_T0}.s + lsr $vl,$vl,#1 // vl /= 2 + cmp $vl,#2 + b.hi .Loop_reduce_sve2 + +.Last_reduce_sve2: + //////////////////////////////////////////////////////////////// + // (hash + inp[n-1])*r^2 // + //+(hash + inp[n] )*r // + // \_____________/ // + // Final part of the short tail // + //////////////////////////////////////////////////////////////// + + //Last hash addition - now everything stored in SVE_Hx + add $SVE_H2,$SVE_H2,$SVE_INlo_2 + add $SVE_H0,$SVE_H0,$SVE_INlo_0 + add $SVE_H1,$SVE_H1,$SVE_INlo_1 + add $SVE_H3,$SVE_H3,$SVE_INlo_3 + add $SVE_H4,$SVE_H4,$SVE_INlo_4 + + // Shift even lanes to odd lanes and set even to zero + // because r^2 and r^1 are in lanes 1 and 3 of R-vectors + trn1 $SVE_H2,${SVE_MASK}.s,$SVE_H2 + trn1 $SVE_H0,${SVE_MASK}.s,$SVE_H0 + trn1 $SVE_H1,${SVE_MASK}.s,$SVE_H1 + trn1 $SVE_H3,${SVE_MASK}.s,$SVE_H3 + trn1 $SVE_H4,${SVE_MASK}.s,$SVE_H4 + + umullt $SVE_ACC3,$SVE_H2,${SVE_R1} + umullt $SVE_ACC0,$SVE_H2,${SVE_S3} + umullt $SVE_ACC4,$SVE_H2,${SVE_R2} + umullt $SVE_ACC1,$SVE_H2,${SVE_S4} + umullt $SVE_ACC2,$SVE_H2,${SVE_R0} + + umlalt $SVE_ACC3,$SVE_H0,${SVE_R3} + umlalt $SVE_ACC4,$SVE_H0,${SVE_R4} + umlalt $SVE_ACC2,$SVE_H0,${SVE_R2} + umlalt $SVE_ACC0,$SVE_H0,${SVE_R0} + umlalt $SVE_ACC1,$SVE_H0,${SVE_R1} + + umlalt $SVE_ACC3,$SVE_H1,${SVE_R2} + umlalt $SVE_ACC4,$SVE_H1,${SVE_R3} + umlalt $SVE_ACC0,$SVE_H1,${SVE_S4} + umlalt $SVE_ACC2,$SVE_H1,${SVE_R1} + umlalt $SVE_ACC1,$SVE_H1,${SVE_R0} + + umlalt $SVE_ACC3,$SVE_H3,${SVE_R0} + umlalt $SVE_ACC0,$SVE_H3,${SVE_S2} + umlalt 
$SVE_ACC4,$SVE_H3,${SVE_R1} + umlalt $SVE_ACC1,$SVE_H3,${SVE_S3} + umlalt $SVE_ACC2,$SVE_H3,${SVE_S4} + + umlalt $SVE_ACC3,$SVE_H4,${SVE_S4} + umlalt $SVE_ACC0,$SVE_H4,${SVE_S1} + umlalt $SVE_ACC4,$SVE_H4,${SVE_R0} + umlalt $SVE_ACC1,$SVE_H4,${SVE_S2} + umlalt $SVE_ACC2,$SVE_H4,${SVE_S3} + + // Generate predicate for the last two double words + mov x15,#2 + whilelo p2.d,xzr,x15 + + dup ${SVE_MASK}.d,#-1 + lsr ${SVE_MASK}.d,${SVE_MASK}.d,#38 + + //////////////////////////////////////////////////////////////// + // horizontal add + + //In Neon implementation, one effectively using lower 64 bits of vector registers here. + //Here and below I use hard-coded FP registers. + + uaddv d22,p2,$SVE_ACC3 + ldp d8,d9,[sp,#80] // meet ABI requirements + uaddv d19,p2,$SVE_ACC0 + ldp d10,d11,[sp,#96] + uaddv d23,p2,$SVE_ACC4 + ldp d12,d13,[sp,#112] + uaddv d20,p2,$SVE_ACC1 + ldp d14,d15,[sp,#128] + uaddv d21,p2,$SVE_ACC2 + + //////////////////////////////////////////////////////////////// + // Lazy reduction, but without narrowing + + // Since results were accumulated in the lower 64 bits, + // one can refer to them as FP/aSIMD reg-s. + + ushr d29,d22,#26 + and v22.8b,v22.8b,v31.8b + ushr d30,d19,#26 + and v19.8b,v19.8b,v31.8b + + add d23,d23,d29 // h3 -> h4 + add d20,d20,d30 // h0 -> h1 + + ushr d29,d23,#26 + and v23.8b,v23.8b,v31.8b + ushr d30,d20,#26 + and v20.8b,v20.8b,v31.8b + add d21,d21,d30 // h1 -> h2 + + add d19,d19,d29 + shl d29,d29,#2 + ushr d30,d21,#26 + and v21.8b,v21.8b,v31.8b + add d19,d19,d29 // h4 -> h0 + add d22,d22,d30 // h2 -> h3 + + ushr d29,d19,#26 + and v19.8b,v19.8b,v31.8b + ushr d30,d22,#26 + and v22.8b,v22.8b,v31.8b + add d20,d20,d29 // h0 -> h1 + add d23,d23,d30 // h3 -> h4 + + //////////////////////////////////////////////////////////////// + // write the result, can be partially reduced + + stp s19,s20,[$ctx],#8 + stp s21,s22,[$ctx],#8 + str s23,[$ctx] + +.Lno_data_sve2: + // Restore the callee-saved GPRs + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldp x25,x26,[sp,#64] + ldr x29,[sp],#144 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.Lshort_blocks: + b poly1305_blocks + +.size poly1305_blocks_sve2,.-poly1305_blocks_sve2 +___ + +############################################################################## +# +# SVE instruction encoder, adapted from chacha20-sve.pl +# +############################################################################## + +my $debug_encoder = 0; + +{ +my %opcode_unpred = ( + "eor" => 0x04a03000, + "add" => 0x04200000, + "orr" => 0x04603000, + "mov" => 0x04603000, # Alias for ORR + "and" => 0x04203000, + "lsl" => 0x04209C00, + "lsr" => 0x04209400, + "zip1" => 0x05206000, + "zip2" => 0x05206400, + "trn1" => 0x05207000, + "dup_gpr" => 0x05203800, + "dup_elem" => 0x05302000, + "cntd" => 0x04e0e000, + "tbl" => 0x05203000, + "adr" => 0x04a0a000, + "umullb" => 0x44e0d000, + "umullt" => 0x45c07c00, + "umlalb" => 0x44e09000, + "umlalt" => 0x44c04c00, + "shrnb" => 0x45201000); + +my %opcode_imm_unpred = ( + "dup" => 0x2538C000, + "index" => 0x04204400); + +my %opcode_scalar_pred = ( + "cpy" => 0x0528A000); + +my %opcode_pred = ( + "whilelo" => 0x25200C00, + "ptrue" => 0x2518E000, + "ld4w" => 0xA560E000, + "ld1w" => 0xA540A000, + "revb" => 0x05248000, + "uaddv" => 0x04012000); + +my %tsize = ( + 'b' => 0, + 'h' => 1, + 's' => 2, + 'd' => 3, + 'q' => 3); # To handle dup zx.q,zx.q[i] case + +my %sf = ( + "w" => 0, + "x" => 1); + +my %pattern = ("ALL" => 31); + +sub create_verifier { + my $filename="./compile_sve.sh"; + +$scripts = <<'___'; 
+#! /bin/bash +set -e +CROSS_COMPILE=${CROSS_COMPILE:-'aarch64-linux-gnu-'} + +[ -z "$1" ] && exit 1 +INST_TO_COMPILE="$1" +FILENAME_BASE=${1%% *} +TMPFILE="/tmp/${FILENAME_BASE}_test" +OBJDUMP_LOG="/tmp/${FILENAME_BASE}_objdump.log" + +echo "--- DEBUG INFO ---" >&2 +echo "Received \$1 (Instruction): '$1'" >&2 +echo "Using Filename Base: '$FILENAME_BASE'" >&2 +echo "------------------" >&2 + +ARCH=`uname -p | xargs echo -n` + +if [ $ARCH == 'aarch64' ]; then + CC=gcc-11 + AS=as + OBJDUMP=objdump +else + CC=${CROSS_COMPILE}gcc + AS=${CROSS_COMPILE}as + OBJDUMP=${CROSS_COMPILE}objdump +fi + +cat > "${TMPFILE}.c" << EOF +extern __attribute__((noinline, section("disasm_output"))) void dummy_func() +{ + asm("$INST_TO_COMPILE"); +} +int main(int argc, char *argv[]) +{ +} +EOF + +$CC -march=armv8.2-a+sve+sve2 -S -o "${TMPFILE}.s" "${TMPFILE}.c" + +$AS -march=armv8-a+sve2 -o "${TMPFILE}.o" "${TMPFILE}.s" + +#$OBJDUMP -d "${TMPFILE}.o" > "$OBJDUMP_LOG" + +#cat "$OBJDUMP_LOG" | awk -F"\n" -v RS="\n\n" '$1 ~ /dummy_func/' | awk 'FNR == 2 {printf "%s",$2}' +$OBJDUMP -d "${TMPFILE}.o" | awk -F"\n" -v RS="\n\n" '$1 ~ /dummy_func/' | awk 'FNR == 2 {printf "%s",$2}' + +rm "${TMPFILE}.c" "${TMPFILE}.s" "${TMPFILE}.o" +___ + open(FH, '>', $filename) or die $!; + print FH $scripts; + close(FH); + system("chmod a+x ./compile_sve.sh"); +} + +sub compile_sve { + my $inst = shift; + return `./compile_sve.sh "$inst"`; +} + +sub verify_inst { + my ($code,$inst)=@_; + my $hexcode = (sprintf "%08x", $code); + + if ($debug_encoder == 1) { + my $expect=&compile_sve($inst); + if ($expect ne $hexcode) { + return (sprintf "%s // Encode Error! expect [%s] actual [%s]", $inst, $expect, $hexcode); + } + } + return (sprintf ".inst\t0x%s\t//%s", $hexcode, $inst); +} + +sub reg_code { + my $code = shift; + + if ($code == "zr") { + return "31"; + } + return $code; +} + +sub encode_size_imm() { + my ($mnemonic, $isize, $const)=@_; + my $esize = (8<<$tsize{$isize}); + my $tsize_imm; + if ($mnemonic eq "shrnb") { + # Formula for narrowing shifts + $tsize_imm = $esize - $const; + } elsif ($mnemonic eq "lsr") { + # Formula for logical right shifts + $tsize_imm = 2*$esize - $const; + } else { + # Default formula for logical left shifts (lsl) + $tsize_imm = $esize + $const; + } + return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<16); +} + +sub sve_unpred { + my ($mnemonic,$arg)=@_; + my $inst = (sprintf "%s %s", $mnemonic,$arg); + # Special case: Widening multiplies (indexed and vector) + if (($mnemonic =~ /^(umull[bt]|umlal[bt])/) && $arg =~ m/z([0-9]+)\.d,\s*z([0-9]+)\.s,\s*z([0-9]+)\.s(\[([0-9]+)\])?/o) { + my ($zd, $zn, $zm, $indexed, $imm) = ($1, $2, $3, $4, $5); + my $opcode = $opcode_unpred{$mnemonic}; + if ($indexed) { + # Split the 2-bit immediate index into its parts. + my $i2h = ($imm >> 1) & 0x1; # High bit of index + my $i2l = $imm & 0x1; # Low bit of index + # Get the low 4 bits of the Zm register. + my $zm_low = $zm & 0xF; + return &verify_inst($opcode|($i2h << 20)|($zm_low << 16)|($i2l << 11)|($zn << 5)|$zd,$inst); + } else { + return &verify_inst($opcode|$zd|($zn<<5)|($zm<<16), $inst); + } + # Special case: 3-register vector ADR with lsl #2 + } elsif ($mnemonic eq "adr" && $arg =~ m/z([0-9]+)\.s,\s*\[z([0-9]+)\.s,\s*z([0-9]+)\.s,\s*lsl\s*#2\]/o) { + my ($zd, $zn, $zm) = ($1, $2, $3); + my $opcode = $opcode_unpred{"adr"}; + # Per the manual, the 'sz' bit (22) must be 0 for .s size. + # It is already 0 in our base, so we do nothing. + # The 'msz' field (bits 11-10) must be '10'. We achieve this by setting bit 11. 
+ $opcode |= (1<<11); + return &verify_inst($opcode|$zd|($zn<<5)|($zm<<16), $inst); + # Special case: 'cntd xd' alias + } elsif ($mnemonic eq "cntd" && $arg =~ m/x([0-9]+)/o) { + my ($xd) = ($1); + my $opcode = $opcode_unpred{$mnemonic}; + my $pattern_all = $pattern{"ALL"} << 5; + return &verify_inst($opcode|$xd|$pattern_all, $inst); + # Special parser for SHRNB's unique syntax (Zd.s, Zn.d, #imm) + } elsif ($mnemonic eq "shrnb" && $arg =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.([bhsd]),\s*#([0-9]+)/o) { + my ($zd, $zn, $size_src, $imm) = ($1, $2, $3, $4); + my $opcode = $opcode_unpred{$mnemonic}; + return &verify_inst($opcode|&encode_size_imm($mnemonic,$size_src,$imm)|($zn << 5)|$zd, $inst); + } elsif ($mnemonic eq "dup" && $arg =~ m/z([0-9]+)\.q,\s*z([0-9]+)\.q\[0\]/o) { # DUP from element + my ($zd, $zn) = ($1, $2); + my $opcode = $opcode_unpred{"dup_elem"}; + return &verify_inst($opcode | ($zn << 5) | $zd, $inst); + } elsif ($mnemonic eq "dup" && $arg =~ m/z([0-9]+)\.([bhsdq]),\s*w([0-9]+)/o) { # DUP from GPR (wX/xX) + my ($zd, $size, $rn) = ($1, $2, $3); + my $opcode = $opcode_unpred{"dup_gpr"}; + $opcode |= ($tsize{$size}<<22); + return &verify_inst($opcode|$zd|($rn<<5), $inst); + # Generic argument patterns + } elsif ($arg =~ m/z([0-9]+)\.([bhsdq]),\s*(.*)/o) { + my ($zd, $size, $regs) = ($1, $2, $3); + my $opcode = $opcode_unpred{$mnemonic}; + # Handle shift-by-immediate separately due to its unique encoding. + if ($mnemonic eq "lsl" || $mnemonic eq "lsr") { + if ($regs =~ m/z([0-9]+)\.[bhsd],\s*#([0-9]+)/o) { + my ($zn, $imm) = ($1, $2); + return &verify_inst($opcode|$zd|($zn<<5)|&encode_size_imm($mnemonic,$size,$imm), $inst); + } + } + if ($mnemonic !~ /^(and|orr|eor|mov)$/) { + $opcode |= ($tsize{$size}<<22); + } + if ($regs =~ m/z([0-9]+)\.[bhsdq],\s*z([0-9]+)\.[bhsdq]/o) { # 3-operand vector + my ($zn, $zm) = ($1, $2); + return &verify_inst($opcode|$zd|($zn<<5)|($zm<<16), $inst); + } elsif ($regs =~ m/z([0-9]+)\.[bhsdq]/o) { # 2-operand vector (mov) + my $zn = $1; + my $zm = ($mnemonic eq "mov") ? $zn : 0; + return &verify_inst($opcode|$zd|($zn<<5)|($zm<<16), $inst); + } elsif ($regs =~ m/w([0-9]+),\s*#1/o) { # index + my ($rn, $rm) = ($1, 1); + $opcode = $opcode_imm_unpred{"index"}; + $opcode |= ($tsize{$size}<<22); + return &verify_inst($opcode|$zd|($rn<<5)|($rm<<16), $inst); + } elsif ($regs =~ m/#(-?[0-9]+)/o) { # dup from immediate + my $imm = $1; + $opcode = $opcode_imm_unpred{"dup"}; + $opcode |= ($tsize{$size}<<22); + my $imm_val = $imm & 0xff; # Only accounting for a simple case with zero shift. 
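+            # (sufficient here: the only immediate dup forms emitted by this
+            # file are `dup z.s,#-1` and `dup z.d,#-1`, i.e. imm8 = 0xff)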
+ return &verify_inst($opcode|$zd|($imm_val<<5), $inst); + } + } + sprintf "%s // fail to parse: %s", $mnemonic, $arg; +} + +sub sve_pred { + my ($mnemonic, $arg)=@_; + my $inst = (sprintf "%s %s", $mnemonic,$arg); + # Special case: Multi-register loads (ld4w) + if ($arg =~ m/\{\s*z([0-9]+)\.s-z([0-9]+)\.s\s*\},\s*p([0-9]+)\/z,\s*\[(x[0-9]+)\]/o) { + my ($zt, $pg, $xn) = ($1, $3, $4); + $xn =~ s/x//; + my $opcode = $opcode_pred{$mnemonic}; + return &verify_inst($opcode|$zt|($pg<<10)|($xn<<5), $inst); + # Special case: Single-register loads (ld1w) + } elsif ($arg =~ m/\{\s*z([0-9]+)\.s\s*\},\s*p([0-9]+)\/z,\s*\[(x[0-9]+)\]/o) { + my ($zt, $pg, $xn) = ($1, $2, $3); + $xn =~ s/x//; + my $opcode = $opcode_pred{$mnemonic}; + return &verify_inst($opcode|$zt|($pg<<10)|($xn<<5), $inst); + # Special case: uaddv (scalar destination) + } elsif ($mnemonic eq "uaddv" && $arg =~ m/d([0-9]+),\s*p([0-9]+),\s*z([0-9]+)\.([bhsd])/o) { + my ($vd, $pg, $zn, $size) = ($1, $2, $3, $4); + my $opcode = $opcode_pred{$mnemonic}; + return &verify_inst($opcode|($tsize{$size}<<22)|$vd|($pg<<10)|($zn<<5), $inst); + # Generic pattern: Starts with a predicate register (whilelo, ptrue) + } elsif ($arg =~ m/p([0-9]+)\.([bhsd]),\s*(.*)/o) { + my ($pd, $size, $regs) = ($1, $2, $3); + my $opcode = $opcode_pred{$mnemonic}; + if ($regs =~ m/([wx])(zr|[0-9]+),\s*[wx](zr|[0-9]+)/o) { # whilelo + my ($sf_char, $rn, $rm) = ($1, $2, $3); + return &verify_inst($opcode|($tsize{$size}<<22)|$pd|($sf{$sf_char}<<12)|(®_code($rn)<<5)|(®_code($rm)<<16), $inst); + } elsif ($regs =~ m/(\w+)/o) { # ptrue + my $pat = $1; + return &verify_inst($opcode|($tsize{$size}<<22)|$pd|($pattern{$pat}<<5), $inst); + } + # Generic pattern: Starts with a vector register (cpy, revb) + } elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*p([0-9]+)\/m,\s*(.*)/o) { + my ($zd, $size, $pg, $regs) = ($1, $2, $3, $4); + if ($regs =~ m/w([0-9]+)/o) { # CPY from GPR + my $wn = $1; + my $opcode = $opcode_scalar_pred{"cpy"}; + return &verify_inst($opcode|($tsize{$size}<<22)|$zd|($pg<<10)|($wn<<5), $inst); + } elsif ($regs =~ m/z([0-9]+)\.([bhsd])/o) { # 2-operand predicated (revb) + my ($zn) = ($1); + my $opcode = $opcode_pred{$mnemonic}; + return &verify_inst($opcode|($tsize{$size}<<22)|$zd|($pg<<10)|($zn<<5), $inst); + } + } + sprintf "%s // fail to parse: %s", $mnemonic, $arg; +} + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +if ($debug_encoder == 1) { + &create_verifier(); +} + +foreach my $line (split("\n",$code)) { + my $original_line = $line; + my $encoded_line = ""; + # Perform variable substitution + $line =~ s/\`([^\`]*)\`/eval($1)/ge; + # Predicated instructions + if ($line =~ /^\s*(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/) { + $encoded_line = sve_pred($1, $2); + } + elsif ($line =~ /^\s*(\w+)\s+(d[0-9]+,\s*p[0-9].*)/) { + $encoded_line = sve_pred($1, $2); + } + elsif ($line =~ /^\s*(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/) { + $encoded_line = sve_pred($1, $2); + } + elsif ($line =~ /^\s*(\w+)\s+(p[0-9]+\.[bhsd].*)/) { + $encoded_line = sve_pred($1, $2); + } + # Specific unpredicated instructions + elsif ($line =~ /^\s*(dup)\s+(z[0-9]+\.q,\s*z[0-9]+\.q\[0\])/) { + $encoded_line = sve_unpred($1, $2); + } + elsif ($line =~ /^\s*(dup)\s+(z[0-9]+\.[bhsdq],\s*(?:w|x)[0-9]+)/) { + $encoded_line = sve_unpred($1, $2); + } + elsif ($line =~ /^\s*(mov)\s+(z[0-9]+\.d,\s*z[0-9]+\.d)/) { + $encoded_line = sve_unpred("mov", $2); + } + elsif ($line =~ 
/^\s*(umull[bt]|umlal[bt])\s+(z[0-9]+\.d,\s*z[0-9]+\.s,\s*z[0-9]+\.s(?:\[[0-9]+\])?)/) { + $encoded_line = sve_unpred($1, $2); + } + elsif ($line =~ /^\s*(cntd)\s+((x|w)[0-9]+.*)/) { + $encoded_line = sve_unpred($1, $2); + } + # 3. Generic Unpredicated "catch-all" + elsif ($line =~ /^\s*(\w+)\s+(z[0-9]+\.[bhsdq].*)/) { + $encoded_line = sve_unpred($1, $2); + } + if ($encoded_line) { + print $encoded_line, "\n"; + } else { + print $original_line, "\n"; + } +} + +} + STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/poly1305/build.info b/crypto/poly1305/build.info index e359a2225d..5c35c8ecee 100644 --- a/crypto/poly1305/build.info +++ b/crypto/poly1305/build.info @@ -14,7 +14,7 @@ IF[{- !$disabled{asm} -}] $POLY1305ASM_s390x=poly1305-s390x.S $POLY1305ASM_armv4=poly1305-armv4.S - $POLY1305ASM_aarch64=poly1305-armv8.S + $POLY1305ASM_aarch64=poly1305-armv8.S poly1305-armv9-sve2.S $POLY1305ASM_ppc32=poly1305_ppc.c poly1305-ppc.s poly1305-ppcfp.s $POLY1305ASM_ppc64=$POLY1305ASM_ppc32 @@ -45,7 +45,9 @@ GENERATE[poly1305-ppcfp.s]=asm/poly1305-ppcfp.pl GENERATE[poly1305-armv4.S]=asm/poly1305-armv4.pl INCLUDE[poly1305-armv4.o]=.. GENERATE[poly1305-armv8.S]=asm/poly1305-armv8.pl +GENERATE[poly1305-armv9-sve2.S]=asm/poly1305-armv9-sve2.pl INCLUDE[poly1305-armv8.o]=.. +INCLUDE[poly1305-armv9-sve2.o]=.. GENERATE[poly1305-mips.S]=asm/poly1305-mips.pl INCLUDE[poly1305-mips.o]=.. GENERATE[poly1305-c64xplus.S]=asm/poly1305-c64xplus.pl
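A minimal standalone sketch of how the capability gate added in crypto/armcap.c behaves. The helper name `pick_poly1305_caps`, the `main` driver and the `sve_vl_bytes` parameter are illustrative only and not part of the patch; `sve_vl_bytes` stands in for the value returned by the new `_armv8_sve_get_vl_bytes` probe (CNTB).

```c
#include <stdint.h>
#include <stdio.h>

#define ARMV9_SVE2          (1u << 14)
#define ARMV9_SVE2_POLY1305 (1u << 17)

/* Mirror of the gating logic: enable the SVE2 Poly1305 path only when SVE2
 * is present and the SVE vector length is a power of two larger than 128 bits. */
static unsigned int pick_poly1305_caps(unsigned int caps, uint64_t sve_vl_bytes)
{
    if ((caps & ARMV9_SVE2) != 0
            && sve_vl_bytes > 16
            && (sve_vl_bytes & (sve_vl_bytes - 1)) == 0)
        caps |= ARMV9_SVE2_POLY1305;
    return caps;
}

int main(void)
{
    /* 128-bit vectors: SVE2 present, but the flag stays clear */
    printf("%#x\n", pick_poly1305_caps(ARMV9_SVE2, 16));   /* 0x4000  */
    /* 256-bit vectors: the SVE2 Poly1305 path is enabled */
    printf("%#x\n", pick_poly1305_caps(ARMV9_SVE2, 32));   /* 0x24000 */
    return 0;
}
```

With ARMV9_SVE2_POLY1305 set, the csel added to poly1305_init in poly1305-armv8.pl selects poly1305_blocks_sve2 as the blocks function; when the bit stays clear (as on 128-bit implementations), the previously selected implementation is kept.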