
Merge pull request #28055 from nishith-fujitsu:sve_fastGEMM1t

dnn: add SVE optimized fastGEMM1T function and SVE dispatch #28055

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch

**Description**
This PR adds an SVE-vectorized fastGEMM1T kernel for the AArch64 architecture, which is called by the recurrent (LSTM) and fully connected layers through the SVE dispatching mechanism.

**ARM Compatibility:**
Modified the build scripts and configuration files to ensure compatibility with ARM processors.

**Testing**

Code changes have been tested on ARM hardware (Graviton3).

**Modifications**

- Implemented the fastGEMM1T kernel in SVE using a vector-length-agnostic approach.

- Added flags and checks to call the ported kernel from the recurrent (LSTM) and fully connected layers.

- Updated CMakeLists.txt to dispatch the ported kernel for SVE.

- Added the SVE optimization flag to OpenCV's CPU dispatch so the SVE kernel can be built and selected at run time, following the OpenCV CPU-optimizations build guide (https://github.com/opencv/opencv/wiki/CPU-optimizations-build-options); a minimal run-time check is sketched below:
cmake \
    -D CPU_BASELINE=NEON \
    -D CPU_DISPATCH=SVE \
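
With this configuration, whether the dispatched SVE path can actually be taken on the target machine can be verified at run time through OpenCV's public hardware-support API; a minimal sketch (the `CPU_SVE` feature id is the one added by this PR):

```cpp
#include <opencv2/core.hpp>
#include <opencv2/core/utility.hpp>  // checkHardwareSupport, getCPUFeaturesLine
#include <iostream>

int main()
{
    // CPU_SVE (== 104) is the CpuFeatures entry introduced by this PR.
    std::cout << "SVE supported at run time: "
              << (cv::checkHardwareSupport(cv::CPU_SVE) ? "yes" : "no") << std::endl;
    // Lists the CPU features (baseline and dispatched) known to this OpenCV build.
    std::cout << "Build CPU features: " << cv::getCPUFeaturesLine() << std::endl;
    return 0;
}
```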

**Performance Improvement**
- The suggested optimizations improve the performance of the LSTM and fully connected layers, as shown in the tables below (dnn_neon = NEON baseline build, dnn_sve = build with SVE dispatch; the last column is the speedup factor).

Name of Test | dnn_neon | dnn_sve | dnn_sve vs dnn_neon (x-factor)
-- | -- | -- | --
lstm::Layer_LSTM::BATCH=1, IN=64, HIDDEN=192, TS=100 | 2.878 | 2.326 | 1.24
lstm::Layer_LSTM::BATCH=1, IN=192, HIDDEN=192, TS=100 | 4.162 | 3.08 | 1.35
lstm::Layer_LSTM::BATCH=1, IN=192, HIDDEN=512, TS=100 | 18.627 | 16.152 | 1.15
lstm::Layer_LSTM::BATCH=1, IN=1024, HIDDEN=192, TS=100 | 10.98 | 7.976 | 1.38
lstm::Layer_LSTM::BATCH=64, IN=64, HIDDEN=192, TS=2 | 4.41 | 3.459 | 1.27
lstm::Layer_LSTM::BATCH=64, IN=192, HIDDEN=192, TS=2 | 6.567 | 4.807 | 1.37
lstm::Layer_LSTM::BATCH=64, IN=192, HIDDEN=512, TS=2 | 28.471 | 22.909 | 1.24
lstm::Layer_LSTM::BATCH=64, IN=1024, HIDDEN=192, TS=2 | 15.491 | 12.537 | 1.24
lstm::Layer_LSTM::BATCH=128, IN=64, HIDDEN=192, TS=2 | 8.848 | 6.821 | 1.3
lstm::Layer_LSTM::BATCH=128, IN=192, HIDDEN=192, TS=2 | 12.969 | 9.522 | 1.36
lstm::Layer_LSTM::BATCH=128, IN=192, HIDDEN=512, TS=2 | 55.52 | 45.746 | 1.21
lstm::Layer_LSTM::BATCH=128, IN=1024, HIDDEN=192, TS=2 | 31.226 | 26.132 | 1.19



Name of Test | dnn_neon | dnn_sve | dnn_sve vs dnn_neon (x-factor)
-- | -- | -- | --
fc::Layer_FullyConnected::([5, 16, 512, 128], 256, false, OCV/CPU) | 5.086 | 4.483 | 1.13
fc::Layer_FullyConnected::([5, 16, 512, 128], 256, true, OCV/CPU) | 8.512 | 8.347 | 1.02
fc::Layer_FullyConnected::([5, 16, 512, 128], 512, false, OCV/CPU) | 9.467 | 8.965 | 1.06
fc::Layer_FullyConnected::([5, 16, 512, 128], 512, true, OCV/CPU) | 14.855 | 13.527 | 1.1
fc::Layer_FullyConnected::([5, 16, 512, 128], 1024, false, OCV/CPU) | 18.821 | 18.023 | 1.04
fc::Layer_FullyConnected::([5, 16, 512, 128], 1024, true, OCV/CPU) | 27.558 | 24.966 | 1.1
fc::Layer_FullyConnected::([5, 512, 384, 0], 256, false, OCV/CPU) | 0.924 | 0.804 | 1.15
fc::Layer_FullyConnected::([5, 512, 384, 0], 256, true, OCV/CPU) | 1.259 | 1.126 | 1.12
fc::Layer_FullyConnected::([5, 512, 384, 0], 512, false, OCV/CPU) | 1.957 | 1.655 | 1.18
fc::Layer_FullyConnected::([5, 512, 384, 0], 512, true, OCV/CPU) | 2.831 | 2.775 | 1.02
fc::Layer_FullyConnected::([5, 512, 384, 0], 1024, false, OCV/CPU) | 5.92 | 6.379 | 0.93
fc::Layer_FullyConnected::([5, 512, 384, 0], 1024, true, OCV/CPU) | 8.924 | 8.993 | 0.99

Author: nishith-fujitsu
Committed: 2025-12-03 13:12:28 +05:30 (committed by GitHub)
Parent: e3fe5a5681 · Commit: 8efc0fd47b
11 changed files with 241 additions and 5 deletions


@@ -49,7 +49,7 @@
set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F")
list(APPEND CPU_ALL_OPTIMIZATIONS "AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CLX;AVX512_ICL")
list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD NEON_FP16 NEON_BF16)
list(APPEND CPU_ALL_OPTIMIZATIONS SVE NEON VFPV3 FP16 NEON_DOTPROD NEON_FP16 NEON_BF16)
list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
list(APPEND CPU_ALL_OPTIMIZATIONS RVV)
@@ -104,6 +104,7 @@ ocv_optimization_process_obsolete_option(ENABLE_AVX2 AVX2 ON)
ocv_optimization_process_obsolete_option(ENABLE_FMA3 FMA3 ON)
ocv_optimization_process_obsolete_option(ENABLE_VFPV3 VFPV3 OFF)
ocv_optimization_process_obsolete_option(ENABLE_SVE SVE ON)
ocv_optimization_process_obsolete_option(ENABLE_NEON NEON ON)
ocv_optimization_process_obsolete_option(ENABLE_VSX VSX ON)
@@ -352,7 +353,7 @@ if(X86 OR X86_64)
endif()
elseif(ARM OR AARCH64)
ocv_update(CPU_SVE_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_sve.cpp")
ocv_update(CPU_NEON_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon.cpp")
ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp")
ocv_update(CPU_NEON_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon_fp16.cpp")
@@ -369,16 +370,24 @@ elseif(ARM OR AARCH64)
endif()
ocv_update(CPU_FP16_IMPLIES "NEON")
else()
ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD;NEON_FP16;NEON_BF16")
if (UNIX AND NOT APPLE)
# The current Apple silicon M4 does not support SVE,
# but some Xcode versions report that it is supported.
ocv_update(CPU_KNOWN_OPTIMIZATIONS "SVE;NEON;FP16;NEON_DOTPROD;NEON_FP16;NEON_BF16")
else()
ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD;NEON_FP16;NEON_BF16")
endif()
ocv_update(CPU_FP16_IMPLIES "NEON")
ocv_update(CPU_NEON_DOTPROD_IMPLIES "NEON")
ocv_update(CPU_NEON_FP16_IMPLIES "NEON")
ocv_update(CPU_NEON_BF16_IMPLIES "NEON")
if(MSVC)
ocv_update(CPU_SVE_FLAGS_ON "")
ocv_update(CPU_NEON_DOTPROD_FLAGS_ON "")
ocv_update(CPU_NEON_FP16_FLAGS_ON "")
ocv_update(CPU_NEON_BF16_FLAGS_ON "")
else()
ocv_update(CPU_SVE_FLAGS_ON "-march=armv8.2-a+sve")
ocv_update(CPU_NEON_DOTPROD_FLAGS_ON "-march=armv8.2-a+dotprod")
ocv_update(CPU_NEON_FP16_FLAGS_ON "-march=armv8.2-a+fp16")
ocv_update(CPU_NEON_BF16_FLAGS_ON "-march=armv8.2-a+bf16")

cmake/checks/cpu_sve.cpp Normal file

@@ -0,0 +1,24 @@
#include <stdio.h>
#if defined(__ARM_FEATURE_SVE)
# include <arm_sve.h>
# define CV_SVE 1
#endif
#if defined(CV_SVE)
int test()
{
const float src[1024] = {0.0};
svbool_t pg = svptrue_b32();
svfloat32_t val = svld1(pg, src);
return (int)svlastb_f32(pg, val);
}
#else
#error "SVE is not supported"
#endif
int main()
{
printf("%d\n", test());
return 0;
}


@@ -237,6 +237,10 @@ struct VZeroUpperGuard {
#elif defined(__ARM_NEON)
# include <arm_neon.h>
# define CV_NEON 1
#ifdef __ARM_FEATURE_SVE
# include <arm_sve.h>
# define CV_SVE 1
#endif
#elif defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__)
# include <altivec.h>
# undef vector
@@ -362,6 +366,10 @@ struct VZeroUpperGuard {
# define CV_NEON 0
#endif
#ifndef CV_SVE
# define CV_SVE 0
#endif
#ifndef CV_RVV071
# define CV_RVV071 0
#endif


@@ -399,6 +399,27 @@
#endif
#define __CV_CPU_DISPATCH_CHAIN_AVX512_ICL(fn, args, mode, ...) CV_CPU_CALL_AVX512_ICL(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SVE
# define CV_TRY_SVE 1
# define CV_CPU_FORCE_SVE 1
# define CV_CPU_HAS_SUPPORT_SVE 1
# define CV_CPU_CALL_SVE(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_SVE_(fn, args) return (opt_SVE::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SVE
# define CV_TRY_SVE 1
# define CV_CPU_FORCE_SVE 0
# define CV_CPU_HAS_SUPPORT_SVE (cv::checkHardwareSupport(CV_CPU_SVE))
# define CV_CPU_CALL_SVE(fn, args) if (CV_CPU_HAS_SUPPORT_SVE) return (opt_SVE::fn args)
# define CV_CPU_CALL_SVE_(fn, args) if (CV_CPU_HAS_SUPPORT_SVE) return (opt_SVE::fn args)
#else
# define CV_TRY_SVE 0
# define CV_CPU_FORCE_SVE 0
# define CV_CPU_HAS_SUPPORT_SVE 0
# define CV_CPU_CALL_SVE(fn, args)
# define CV_CPU_CALL_SVE_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_SVE(fn, args, mode, ...) CV_CPU_CALL_SVE(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON
# define CV_TRY_NEON 1
# define CV_CPU_FORCE_NEON 1


@@ -279,6 +279,7 @@ namespace cv {
#define CV_CPU_NEON_DOTPROD 101
#define CV_CPU_NEON_FP16 102
#define CV_CPU_NEON_BF16 103
#define CV_CPU_SVE 104
#define CV_CPU_MSA 150
@@ -341,6 +342,7 @@ enum CpuFeatures {
CPU_NEON_DOTPROD = 101,
CPU_NEON_FP16 = 102,
CPU_NEON_BF16 = 103,
CPU_SVE = 104,
CPU_MSA = 150,


@@ -427,6 +427,7 @@ struct HWFeatures
g_hwFeatureNames[CPU_NEON_DOTPROD] = "NEON_DOTPROD";
g_hwFeatureNames[CPU_NEON_FP16] = "NEON_FP16";
g_hwFeatureNames[CPU_NEON_BF16] = "NEON_BF16";
g_hwFeatureNames[CPU_SVE] = "SVE";
g_hwFeatureNames[CPU_VSX] = "VSX";
g_hwFeatureNames[CPU_VSX3] = "VSX3";
@@ -589,6 +590,7 @@ struct HWFeatures
{
have[CV_CPU_NEON_DOTPROD] = (auxv.a_un.a_val & (1 << 20)) != 0; // HWCAP_ASIMDDP
have[CV_CPU_NEON_FP16] = (auxv.a_un.a_val & (1 << 10)) != 0; // HWCAP_ASIMDHP
have[CV_CPU_SVE] = (auxv.a_un.a_val & (1 << 22)) != 0; // HWCAP_SVE
}
#if defined(AT_HWCAP2)
else if (auxv.a_type == AT_HWCAP2)
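
For context, the bit tested above is the standard Linux AArch64 HWCAP_SVE capability flag (bit 22 of AT_HWCAP). An equivalent standalone probe, shown only for illustration and not part of the patch:

```cpp
#include <sys/auxv.h>  // getauxval, AT_HWCAP (Linux)
#include <cstdio>

#ifndef HWCAP_SVE
#define HWCAP_SVE (1 << 22)  // same bit the patch checks against AT_HWCAP
#endif

int main()
{
    // The auxiliary vector exposes the kernel-reported CPU capabilities.
    unsigned long hwcap = getauxval(AT_HWCAP);
    std::printf("SVE %s\n", (hwcap & HWCAP_SVE) ? "available" : "not available");
    return 0;
}
```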


@@ -4,7 +4,7 @@ endif()
set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass")
ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX NEON)
ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX NEON SVE)
ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX RVV LASX NEON)
ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_block" AVX AVX2 NEON NEON_FP16)
ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_depthwise" AVX AVX2 RVV LASX)


@@ -228,6 +228,7 @@ public:
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
p.useRVV = checkHardwareSupport(CPU_RVV);
p.useLASX = checkHardwareSupport(CPU_LASX);
p.useSVE = checkHardwareSupport(CPU_SVE);
parallel_for_(Range(0, nstripes), p, nstripes);
}
@@ -277,6 +278,12 @@ public:
opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize_aligned);
else
#endif
#if CV_TRY_SVE
if( useSVE ) {
opt_SVE::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize_aligned);
}
else
#endif
#if CV_TRY_RVV && CV_RVV
if( useRVV )
opt_RVV::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
@@ -342,6 +349,7 @@ public:
bool useAVX512;
bool useRVV;
bool useLASX;
bool useSVE;
};
#ifdef HAVE_OPENCL


@@ -53,7 +53,119 @@ void fastGEMM( const float* aptr, size_t astep, const float* bptr,
size_t bstep, float* cptr, size_t cstep,
int ma, int na, int nb );
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && defined(CV_CPU_COMPILE_SVE)
#include <arm_sve.h>
// dst = vec * weights^t + bias
void fastGEMM1T( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize )
{
svbool_t pg_all = svptrue_b32();
int i = 0;
int vl = svcntw();
for( ; i <= nvecs - 15; i += 15 )
{
const float* wrow0 = weights + i * wstep; // base pointer for row i
// we will use wrow0 + k, wrow0 + wstep + k, etc
svfloat32_t vs0 = svdup_f32(0.0f), vs1 = svdup_f32(0.0f),
vs2 = svdup_f32(0.0f), vs3 = svdup_f32(0.0f),
vs4 = svdup_f32(0.0f), vs5 = svdup_f32(0.0f),
vs6 = svdup_f32(0.0f), vs7 = svdup_f32(0.0f),
vs8 = svdup_f32(0.0f), vs9 = svdup_f32(0.0f),
vs10 = svdup_f32(0.0f), vs11 = svdup_f32(0.0f),
vs12 = svdup_f32(0.0f), vs13 = svdup_f32(0.0f),
vs14 = svdup_f32(0.0f);
int k = 0;
for( ; k <= vecsize - vl; k += vl )
{
// load input chunk
const float* vecptr = reinterpret_cast<const float*>(vec) + k;
svfloat32_t v = svld1_f32(pg_all, vecptr);
// load weights from each of 15 rows at offset k
vs0 = svmla_f32_m(pg_all, vs0, svld1_f32(pg_all, wrow0 + k), v);
vs1 = svmla_f32_m(pg_all, vs1, svld1_f32(pg_all, wrow0 + wstep + k), v);
vs2 = svmla_f32_m(pg_all, vs2, svld1_f32(pg_all, wrow0 + wstep*2 + k), v);
vs3 = svmla_f32_m(pg_all, vs3, svld1_f32(pg_all, wrow0 + wstep*3 + k), v);
vs4 = svmla_f32_m(pg_all, vs4, svld1_f32(pg_all, wrow0 + wstep*4 + k), v);
vs5 = svmla_f32_m(pg_all, vs5, svld1_f32(pg_all, wrow0 + wstep*5 + k), v);
vs6 = svmla_f32_m(pg_all, vs6, svld1_f32(pg_all, wrow0 + wstep*6 + k), v);
vs7 = svmla_f32_m(pg_all, vs7, svld1_f32(pg_all, wrow0 + wstep*7 + k), v);
vs8 = svmla_f32_m(pg_all, vs8, svld1_f32(pg_all, wrow0 + wstep*8 + k), v);
vs9 = svmla_f32_m(pg_all, vs9, svld1_f32(pg_all, wrow0 + wstep*9 + k), v);
vs10 = svmla_f32_m(pg_all, vs10, svld1_f32(pg_all, wrow0 + wstep*10 + k), v);
vs11 = svmla_f32_m(pg_all, vs11, svld1_f32(pg_all, wrow0 + wstep*11 + k), v);
vs12 = svmla_f32_m(pg_all, vs12, svld1_f32(pg_all, wrow0 + wstep*12 + k), v);
vs13 = svmla_f32_m(pg_all, vs13, svld1_f32(pg_all, wrow0 + wstep*13 + k), v);
vs14 = svmla_f32_m(pg_all, vs14, svld1_f32(pg_all, wrow0 + wstep*14 + k), v);
}
if(k < vecsize){
svbool_t pg_tail = svwhilelt_b32(k, vecsize);
const float* vecptr = reinterpret_cast<const float*>(vec) + k;
svfloat32_t v = svld1_f32(pg_tail, vecptr);
const float* wptr = wrow0 + k;
vs0 = svmla_f32_m(pg_tail, vs0, svld1_f32(pg_tail, wptr), v);
vs1 = svmla_f32_m(pg_tail, vs1, svld1_f32(pg_tail, wptr + wstep), v);
vs2 = svmla_f32_m(pg_tail, vs2, svld1_f32(pg_tail, wptr + wstep*2), v);
vs3 = svmla_f32_m(pg_tail, vs3, svld1_f32(pg_tail, wptr + wstep*3), v);
vs4 = svmla_f32_m(pg_tail, vs4, svld1_f32(pg_tail, wptr + wstep*4), v);
vs5 = svmla_f32_m(pg_tail, vs5, svld1_f32(pg_tail, wptr + wstep*5), v);
vs6 = svmla_f32_m(pg_tail, vs6, svld1_f32(pg_tail, wptr + wstep*6), v);
vs7 = svmla_f32_m(pg_tail, vs7, svld1_f32(pg_tail, wptr + wstep*7), v);
vs8 = svmla_f32_m(pg_tail, vs8, svld1_f32(pg_tail, wptr + wstep*8), v);
vs9 = svmla_f32_m(pg_tail, vs9, svld1_f32(pg_tail, wptr + wstep*9), v);
vs10 = svmla_f32_m(pg_tail, vs10, svld1_f32(pg_tail, wptr + wstep*10), v);
vs11 = svmla_f32_m(pg_tail, vs11, svld1_f32(pg_tail, wptr + wstep*11), v);
vs12 = svmla_f32_m(pg_tail, vs12, svld1_f32(pg_tail, wptr + wstep*12), v);
vs13 = svmla_f32_m(pg_tail, vs13, svld1_f32(pg_tail, wptr + wstep*13), v);
vs14 = svmla_f32_m(pg_tail, vs14, svld1_f32(pg_tail, wptr + wstep*14), v);
}
float sum[15];
sum[0] = svaddv_f32(pg_all, vs0);
sum[1] = svaddv_f32(pg_all, vs1);
sum[2] = svaddv_f32(pg_all, vs2);
sum[3] = svaddv_f32(pg_all, vs3);
sum[4] = svaddv_f32(pg_all, vs4);
sum[5] = svaddv_f32(pg_all, vs5);
sum[6] = svaddv_f32(pg_all, vs6);
sum[7] = svaddv_f32(pg_all, vs7);
sum[8] = svaddv_f32(pg_all, vs8);
sum[9] = svaddv_f32(pg_all, vs9);
sum[10] = svaddv_f32(pg_all, vs10);
sum[11] = svaddv_f32(pg_all, vs11);
sum[12] = svaddv_f32(pg_all, vs12);
sum[13] = svaddv_f32(pg_all, vs13);
sum[14] = svaddv_f32(pg_all, vs14);
for (int j = 0; j < 15; j += vl) {
svbool_t pg = svwhilelt_b32(j, 15);
svfloat32_t v_sum = svld1_f32(pg, sum + j);
svfloat32_t v_bias = svld1_f32(pg, bias + i + j);
svst1_f32(pg, dst + i + j, svadd_f32_z(pg, v_sum, v_bias));
}
}
float temp = 0.f;
for( ; i < nvecs; i++ )
{
const float* wrow = weights + i * wstep;
svfloat32_t vs0 = svdup_f32(0.0f);
int k = 0;
for( ; k <= vecsize - vl; k += vl )
{
svfloat32_t v = svld1_f32(pg_all, reinterpret_cast<const float*>(vec) + k);
vs0 = svmla_f32_m(pg_all, vs0, svld1_f32(pg_all, wrow + k), v);
}
if (k != vecsize) {
svbool_t pg_tail = svwhilelt_b32(k, vecsize);
svfloat32_t v = svld1_f32(pg_tail, reinterpret_cast<const float*>(vec) + k);
vs0 = svmla_f32_m(pg_tail, vs0, svld1_f32(pg_tail, wrow + k), v);
}
temp = svaddv_f32(pg_all, vs0);
dst[i] = temp + bias[i];
}
}
#endif
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON && !defined(CV_CPU_COMPILE_SVE)
static const uint32_t tailMaskArray[7] = {
0u, 0u, 0u, 0u,
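
For reference, the contract implemented by the SVE kernel above (dst = vec * weights^T + bias, as its comment states) reduces to the scalar form below; this is only an illustration of the semantics, and the name fastGEMM1T_ref is not part of the patch:

```cpp
#include <cstddef>  // std::size_t

// Scalar reference for the same computation as the vectorized kernel:
// dst[i] = bias[i] + sum_k vec[k] * weights[i*wstep + k], for i in [0, nvecs).
void fastGEMM1T_ref(const float* vec, const float* weights,
                    std::size_t wstep, const float* bias,
                    float* dst, int nvecs, int vecsize)
{
    for (int i = 0; i < nvecs; i++)
    {
        const float* wrow = weights + i * wstep;  // row i of the weight matrix
        float s = 0.f;
        for (int k = 0; k < vecsize; k++)
            s += vec[k] * wrow[k];
        dst[i] = s + bias[i];
    }
}
```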


@@ -138,6 +138,9 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
#if CV_TRY_AVX2
bool useAVX2;
#endif
#if CV_TRY_SVE
bool useSVE;
#endif
#if CV_TRY_NEON
bool useNEON;
#endif
@@ -156,6 +159,9 @@ public:
#if CV_TRY_AVX2
, useAVX2(checkHardwareSupport(CPU_AVX2))
#endif
#if CV_TRY_SVE
, useSVE(checkHardwareSupport(CPU_SVE))
#endif
#if CV_TRY_NEON
, useNEON(checkHardwareSupport(CPU_NEON))
#endif
@@ -495,6 +501,13 @@ public:
&& Wh.depth() == CV_32F && hInternal.depth() == CV_32F && gates.depth() == CV_32F
&& Wh.cols >= 8;
#endif
#if CV_TRY_SVE
bool canUseSVE = gates.isContinuous() && bias.isContinuous()
&& Wx.depth() == CV_32F && gates.depth() == CV_32F
&& bias.depth() == CV_32F;
bool canUseSVE_hInternal = hInternal.isContinuous() && gates.isContinuous() && bias.isContinuous()
&& Wh.depth() == CV_32F && hInternal.depth() == CV_32F && gates.depth() == CV_32F;
#endif
#if CV_TRY_NEON
bool canUseNeon = gates.isContinuous() && bias.isContinuous()
&& Wx.depth() == CV_32F && gates.depth() == CV_32F
@@ -554,6 +567,23 @@ public:
}
else
#endif
#if CV_TRY_SVE
if (useSVE && canUseSVE && xCurr.isContinuous())
{
for (int n = 0; n < xCurr.rows; n++) {
opt_SVE::fastGEMM1T(
xCurr.ptr<float>(n),
Wx.ptr<float>(),
Wx.step1(),
bias.ptr<float>(),
gates.ptr<float>(n),
Wx.rows,
Wx.cols
);
}
}
else
#endif
#if CV_TRY_NEON
if (useNEON && canUseNeon && xCurr.isContinuous())
{
@@ -610,6 +640,23 @@ public:
}
else
#endif
#if CV_TRY_SVE
if (useSVE && canUseSVE_hInternal)
{
for (int n = 0; n < hInternal.rows; n++) {
opt_SVE::fastGEMM1T(
hInternal.ptr<float>(n),
Wh.ptr<float>(),
Wh.step1(),
gates.ptr<float>(n),
gates.ptr<float>(n),
Wh.rows,
Wh.cols
);
}
}
else
#endif
#if CV_TRY_NEON
if (useNEON && canUseNeon_hInternal)
{


@@ -1,6 +1,9 @@
# see https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html#index-march
function(ocv_set_platform_flags VAR)
unset(flags)
if(ENABLE_SVE)
set(flags "${flags}+sve")
endif()
if(ENABLE_BF16)
set(flags "${flags}+bf16")
endif()