
Merge pull request #28055 from nishith-fujitsu:sve_fastGEMM1t

dnn: add SVE optimized fastGEMM1T function and SVE dispatch #28055

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch

**Description**
This PR adds an SVE-vectorized fastGEMM1T kernel for the AArch64 architecture, which is called by the recurrent (LSTM) and fully connected layers through the SVE dispatching mechanism.

**ARM Compatibility:**
Modified the build scripts and configuration files to ensure compatibility with ARM processors.

**Testing**

Code changes have been tested on ARM hardware (Graviton3).

**Modifications**

- Implemented the fastGEMM1T kernel in SVE using a vector-length-agnostic approach.

- Added flags and checks to call the ported kernel from the recurrent (LSTM) and fully connected layers.

- Updated CMakeLists.txt to dispatch the ported kernel for SVE.

- Added the SVE optimization flag to OpenCV's CPU dispatch so the SVE kernel can be built and selected at run time, following the OpenCV CPU-optimizations build guide (https://github.com/opencv/opencv/wiki/CPU-optimizations-build-options); a minimal run-time check is sketched below:
cmake \
    -D CPU_BASELINE=NEON \
    -D CPU_DISPATCH=SVE \
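
With this configuration, whether the dispatched SVE path can actually be taken on the target machine can be verified at run time through OpenCV's public hardware-support API; a minimal sketch (the `CPU_SVE` feature id is the one added by this PR):

```cpp
#include <opencv2/core.hpp>
#include <opencv2/core/utility.hpp>  // checkHardwareSupport, getCPUFeaturesLine
#include <iostream>

int main()
{
    // CPU_SVE (== 104) is the CpuFeatures entry introduced by this PR.
    std::cout << "SVE supported at run time: "
              << (cv::checkHardwareSupport(cv::CPU_SVE) ? "yes" : "no") << std::endl;
    // Lists the CPU features (baseline and dispatched) known to this OpenCV build.
    std::cout << "Build CPU features: " << cv::getCPUFeaturesLine() << std::endl;
    return 0;
}
```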

**Performance Improvement**
- The suggested optimizations improve the performance of the LSTM and fully connected layers, as shown in the tables below (dnn_neon = NEON baseline build, dnn_sve = build with SVE dispatch; the last column is the speedup factor).

Name of Test | dnn_neon | dnn_sve | dnn_sve vs dnn_neon (x-factor)
-- | -- | -- | --
lstm::Layer_LSTM::BATCH=1, IN=64, HIDDEN=192, TS=100 | 2.878 | 2.326 | 1.24
lstm::Layer_LSTM::BATCH=1, IN=192, HIDDEN=192, TS=100 | 4.162 | 3.08 | 1.35
lstm::Layer_LSTM::BATCH=1, IN=192, HIDDEN=512, TS=100 | 18.627 | 16.152 | 1.15
lstm::Layer_LSTM::BATCH=1, IN=1024, HIDDEN=192, TS=100 | 10.98 | 7.976 | 1.38
lstm::Layer_LSTM::BATCH=64, IN=64, HIDDEN=192, TS=2 | 4.41 | 3.459 | 1.27
lstm::Layer_LSTM::BATCH=64, IN=192, HIDDEN=192, TS=2 | 6.567 | 4.807 | 1.37
lstm::Layer_LSTM::BATCH=64, IN=192, HIDDEN=512, TS=2 | 28.471 | 22.909 | 1.24
lstm::Layer_LSTM::BATCH=64, IN=1024, HIDDEN=192, TS=2 | 15.491 | 12.537 | 1.24
lstm::Layer_LSTM::BATCH=128, IN=64, HIDDEN=192, TS=2 | 8.848 | 6.821 | 1.3
lstm::Layer_LSTM::BATCH=128, IN=192, HIDDEN=192, TS=2 | 12.969 | 9.522 | 1.36
lstm::Layer_LSTM::BATCH=128, IN=192, HIDDEN=512, TS=2 | 55.52 | 45.746 | 1.21
lstm::Layer_LSTM::BATCH=128, IN=1024, HIDDEN=192, TS=2 | 31.226 | 26.132 | 1.19



Name of Test | dnn_neon | dnn_sve | dnn_sve vs dnn_neon (x-factor)
-- | -- | -- | --
fc::Layer_FullyConnected::([5, 16, 512, 128], 256, false, OCV/CPU) | 5.086 | 4.483 | 1.13
fc::Layer_FullyConnected::([5, 16, 512, 128], 256, true, OCV/CPU) | 8.512 | 8.347 | 1.02
fc::Layer_FullyConnected::([5, 16, 512, 128], 512, false, OCV/CPU) | 9.467 | 8.965 | 1.06
fc::Layer_FullyConnected::([5, 16, 512, 128], 512, true, OCV/CPU) | 14.855 | 13.527 | 1.1
fc::Layer_FullyConnected::([5, 16, 512, 128], 1024, false, OCV/CPU) | 18.821 | 18.023 | 1.04
fc::Layer_FullyConnected::([5, 16, 512, 128], 1024, true, OCV/CPU) | 27.558 | 24.966 | 1.1
fc::Layer_FullyConnected::([5, 512, 384, 0], 256, false, OCV/CPU) | 0.924 | 0.804 | 1.15
fc::Layer_FullyConnected::([5, 512, 384, 0], 256, true, OCV/CPU) | 1.259 | 1.126 | 1.12
fc::Layer_FullyConnected::([5, 512, 384, 0], 512, false, OCV/CPU) | 1.957 | 1.655 | 1.18
fc::Layer_FullyConnected::([5, 512, 384, 0], 512, true, OCV/CPU) | 2.831 | 2.775 | 1.02
fc::Layer_FullyConnected::([5, 512, 384, 0], 1024, false, OCV/CPU) | 5.92 | 6.379 | 0.93
fc::Layer_FullyConnected::([5, 512, 384, 0], 1024, true, OCV/CPU) | 8.924 | 8.993 | 0.99

Author: nishith-fujitsu
Committed: 2025-12-03 13:12:28 +05:30 (committed by GitHub)
Parent: e3fe5a5681 · Commit: 8efc0fd47b
11 changed files with 241 additions and 5 deletions


@@ -49,7 +49,7 @@
set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F")
list(APPEND CPU_ALL_OPTIMIZATIONS "AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CLX;AVX512_ICL")
list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD NEON_FP16 NEON_BF16)
list(APPEND CPU_ALL_OPTIMIZATIONS SVE NEON VFPV3 FP16 NEON_DOTPROD NEON_FP16 NEON_BF16)
list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
list(APPEND CPU_ALL_OPTIMIZATIONS RVV)
@@ -104,6 +104,7 @@ ocv_optimization_process_obsolete_option(ENABLE_AVX2 AVX2 ON)
ocv_optimization_process_obsolete_option(ENABLE_FMA3 FMA3 ON)
ocv_optimization_process_obsolete_option(ENABLE_VFPV3 VFPV3 OFF)
ocv_optimization_process_obsolete_option(ENABLE_SVE SVE ON)
ocv_optimization_process_obsolete_option(ENABLE_NEON NEON ON)
ocv_optimization_process_obsolete_option(ENABLE_VSX VSX ON)
@@ -352,7 +353,7 @@ if(X86 OR X86_64)
endif()
elseif(ARM OR AARCH64)
ocv_update(CPU_SVE_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_sve.cpp")
ocv_update(CPU_NEON_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon.cpp")
ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp")
ocv_update(CPU_NEON_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon_fp16.cpp")
@@ -369,16 +370,24 @@ elseif(ARM OR AARCH64)
endif()
ocv_update(CPU_FP16_IMPLIES "NEON")
else()
ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD;NEON_FP16;NEON_BF16")
if (UNIX AND NOT APPLE)
# The current Apple silicon M4 does not support SVE,
# but some Xcode versions report that it is supported.
ocv_update(CPU_KNOWN_OPTIMIZATIONS "SVE;NEON;FP16;NEON_DOTPROD;NEON_FP16;NEON_BF16")
else()
ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD;NEON_FP16;NEON_BF16")
endif()
ocv_update(CPU_FP16_IMPLIES "NEON")
ocv_update(CPU_NEON_DOTPROD_IMPLIES "NEON")
ocv_update(CPU_NEON_FP16_IMPLIES "NEON")
ocv_update(CPU_NEON_BF16_IMPLIES "NEON")
if(MSVC)
ocv_update(CPU_SVE_FLAGS_ON "")
ocv_update(CPU_NEON_DOTPROD_FLAGS_ON "")
ocv_update(CPU_NEON_FP16_FLAGS_ON "")
ocv_update(CPU_NEON_BF16_FLAGS_ON "")
else()
ocv_update(CPU_SVE_FLAGS_ON "-march=armv8.2-a+sve")
ocv_update(CPU_NEON_DOTPROD_FLAGS_ON "-march=armv8.2-a+dotprod")
ocv_update(CPU_NEON_FP16_FLAGS_ON "-march=armv8.2-a+fp16")
ocv_update(CPU_NEON_BF16_FLAGS_ON "-march=armv8.2-a+bf16")

cmake/checks/cpu_sve.cpp Normal file

@@ -0,0 +1,24 @@
#include <stdio.h>
#if defined(__ARM_FEATURE_SVE)
# include <arm_sve.h>
# define CV_SVE 1
#endif
#if defined(CV_SVE)
int test()
{
const float src[1024] = {0.0};
svbool_t pg = svptrue_b32();
svfloat32_t val = svld1(pg, src);
return (int)svlastb_f32(pg, val);
}
#else
#error "SVE is not supported"
#endif
int main()
{
printf("%d\n", test());
return 0;
}


@@ -237,6 +237,10 @@ struct VZeroUpperGuard {
#elif defined(__ARM_NEON)
# include <arm_neon.h>
# define CV_NEON 1
#ifdef __ARM_FEATURE_SVE
# include <arm_sve.h>
# define CV_SVE 1
#endif
#elif defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__)
# include <altivec.h>
# undef vector
@@ -362,6 +366,10 @@ struct VZeroUpperGuard {
# define CV_NEON 0
#endif
#ifndef CV_SVE
# define CV_SVE 0
#endif
#ifndef CV_RVV071
# define CV_RVV071 0
#endif


@@ -399,6 +399,27 @@
#endif
#define __CV_CPU_DISPATCH_CHAIN_AVX512_ICL(fn, args, mode, ...) CV_CPU_CALL_AVX512_ICL(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SVE
# define CV_TRY_SVE 1
# define CV_CPU_FORCE_SVE 1
# define CV_CPU_HAS_SUPPORT_SVE 1
# define CV_CPU_CALL_SVE(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_SVE_(fn, args) return (opt_SVE::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SVE
# define CV_TRY_SVE 1
# define CV_CPU_FORCE_SVE 0
# define CV_CPU_HAS_SUPPORT_SVE (cv::checkHardwareSupport(CV_CPU_SVE))
# define CV_CPU_CALL_SVE(fn, args) if (CV_CPU_HAS_SUPPORT_SVE) return (opt_SVE::fn args)
# define CV_CPU_CALL_SVE_(fn, args) if (CV_CPU_HAS_SUPPORT_SVE) return (opt_SVE::fn args)
#else
# define CV_TRY_SVE 0
# define CV_CPU_FORCE_SVE 0
# define CV_CPU_HAS_SUPPORT_SVE 0
# define CV_CPU_CALL_SVE(fn, args)
# define CV_CPU_CALL_SVE_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_SVE(fn, args, mode, ...) CV_CPU_CALL_SVE(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON
# define CV_TRY_NEON 1
# define CV_CPU_FORCE_NEON 1


@@ -279,6 +279,7 @@ namespace cv {
#define CV_CPU_NEON_DOTPROD 101
#define CV_CPU_NEON_FP16 102
#define CV_CPU_NEON_BF16 103
#define CV_CPU_SVE 104
#define CV_CPU_MSA 150
@@ -341,6 +342,7 @@ enum CpuFeatures {
CPU_NEON_DOTPROD = 101,
CPU_NEON_FP16 = 102,
CPU_NEON_BF16 = 103,
CPU_SVE = 104,
CPU_MSA = 150,


@@ -427,6 +427,7 @@ struct HWFeatures
g_hwFeatureNames[CPU_NEON_DOTPROD] = "NEON_DOTPROD";
g_hwFeatureNames[CPU_NEON_FP16] = "NEON_FP16";
g_hwFeatureNames[CPU_NEON_BF16] = "NEON_BF16";
g_hwFeatureNames[CPU_SVE] = "SVE";
g_hwFeatureNames[CPU_VSX] = "VSX";
g_hwFeatureNames[CPU_VSX3] = "VSX3";
@@ -589,6 +590,7 @@ struct HWFeatures
{
have[CV_CPU_NEON_DOTPROD] = (auxv.a_un.a_val & (1 << 20)) != 0; // HWCAP_ASIMDDP
have[CV_CPU_NEON_FP16] = (auxv.a_un.a_val & (1 << 10)) != 0; // HWCAP_ASIMDHP
have[CV_CPU_SVE] = (auxv.a_un.a_val & (1 << 22)) != 0; // HWCAP_SVE
}
#if defined(AT_HWCAP2)
else if (auxv.a_type == AT_HWCAP2)
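
For context, the bit tested above is the standard Linux AArch64 HWCAP_SVE capability flag (bit 22 of AT_HWCAP). An equivalent standalone probe, shown only for illustration and not part of the patch:

```cpp
#include <sys/auxv.h>  // getauxval, AT_HWCAP (Linux)
#include <cstdio>

#ifndef HWCAP_SVE
#define HWCAP_SVE (1 << 22)  // same bit the patch checks against AT_HWCAP
#endif

int main()
{
    // The auxiliary vector exposes the kernel-reported CPU capabilities.
    unsigned long hwcap = getauxval(AT_HWCAP);
    std::printf("SVE %s\n", (hwcap & HWCAP_SVE) ? "available" : "not available");
    return 0;
}
```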


@@ -4,7 +4,7 @@ endif()
set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass")
ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX NEON)
ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX NEON SVE)
ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX RVV LASX NEON)
ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_block" AVX AVX2 NEON NEON_FP16)
ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_depthwise" AVX AVX2 RVV LASX)


@@ -228,6 +228,7 @@ public:
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
p.useRVV = checkHardwareSupport(CPU_RVV);
p.useLASX = checkHardwareSupport(CPU_LASX);
p.useSVE = checkHardwareSupport(CPU_SVE);
parallel_for_(Range(0, nstripes), p, nstripes);
}
@@ -277,6 +278,12 @@ public:
opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize_aligned);
else
#endif
#if CV_TRY_SVE
if( useSVE ) {
opt_SVE::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize_aligned);
}
else
#endif
#if CV_TRY_RVV && CV_RVV
if( useRVV )
opt_RVV::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
@@ -342,6 +349,7 @@ public:
bool useAVX512;
bool useRVV;
bool useLASX;
bool useSVE;
};
#ifdef HAVE_OPENCL


@@ -53,7 +53,119 @@ void fastGEMM( const float* aptr, size_t astep, const float* bptr,
size_t bstep, float* cptr, size_t cstep,
int ma, int na, int nb );
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && defined(CV_CPU_COMPILE_SVE)
#include <arm_sve.h>
// dst = vec * weights^t + bias
void fastGEMM1T( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize )
{
svbool_t pg_all = svptrue_b32();
int i = 0;
int vl = svcntw();
for( ; i <= nvecs - 15; i += 15 )
{
const float* wrow0 = weights + i * wstep; // base pointer for row i
// we will use wrow0 + k, wrow0 + wstep + k, etc
svfloat32_t vs0 = svdup_f32(0.0f), vs1 = svdup_f32(0.0f),
vs2 = svdup_f32(0.0f), vs3 = svdup_f32(0.0f),
vs4 = svdup_f32(0.0f), vs5 = svdup_f32(0.0f),
vs6 = svdup_f32(0.0f), vs7 = svdup_f32(0.0f),
vs8 = svdup_f32(0.0f), vs9 = svdup_f32(0.0f),
vs10 = svdup_f32(0.0f), vs11 = svdup_f32(0.0f),
vs12 = svdup_f32(0.0f), vs13 = svdup_f32(0.0f),
vs14 = svdup_f32(0.0f);
int k = 0;
for( ; k <= vecsize - vl; k += vl )
{
// load input chunk
const float* vecptr = reinterpret_cast<const float*>(vec) + k;
svfloat32_t v = svld1_f32(pg_all, vecptr);
// load weights from each of 15 rows at offset k
vs0 = svmla_f32_m(pg_all, vs0, svld1_f32(pg_all, wrow0 + k), v);
vs1 = svmla_f32_m(pg_all, vs1, svld1_f32(pg_all, wrow0 + wstep + k), v);
vs2 = svmla_f32_m(pg_all, vs2, svld1_f32(pg_all, wrow0 + wstep*2 + k), v);
vs3 = svmla_f32_m(pg_all, vs3, svld1_f32(pg_all, wrow0 + wstep*3 + k), v);
vs4 = svmla_f32_m(pg_all, vs4, svld1_f32(pg_all, wrow0 + wstep*4 + k), v);
vs5 = svmla_f32_m(pg_all, vs5, svld1_f32(pg_all, wrow0 + wstep*5 + k), v);
vs6 = svmla_f32_m(pg_all, vs6, svld1_f32(pg_all, wrow0 + wstep*6 + k), v);
vs7 = svmla_f32_m(pg_all, vs7, svld1_f32(pg_all, wrow0 + wstep*7 + k), v);
vs8 = svmla_f32_m(pg_all, vs8, svld1_f32(pg_all, wrow0 + wstep*8 + k), v);
vs9 = svmla_f32_m(pg_all, vs9, svld1_f32(pg_all, wrow0 + wstep*9 + k), v);
vs10 = svmla_f32_m(pg_all, vs10, svld1_f32(pg_all, wrow0 + wstep*10 + k), v);
vs11 = svmla_f32_m(pg_all, vs11, svld1_f32(pg_all, wrow0 + wstep*11 + k), v);
vs12 = svmla_f32_m(pg_all, vs12, svld1_f32(pg_all, wrow0 + wstep*12 + k), v);
vs13 = svmla_f32_m(pg_all, vs13, svld1_f32(pg_all, wrow0 + wstep*13 + k), v);
vs14 = svmla_f32_m(pg_all, vs14, svld1_f32(pg_all, wrow0 + wstep*14 + k), v);
}
if(k < vecsize){
svbool_t pg_tail = svwhilelt_b32(k, vecsize);
const float* vecptr = reinterpret_cast<const float*>(vec) + k;
svfloat32_t v = svld1_f32(pg_tail, vecptr);
const float* wptr = wrow0 + k;
vs0 = svmla_f32_m(pg_tail, vs0, svld1_f32(pg_tail, wptr), v);
vs1 = svmla_f32_m(pg_tail, vs1, svld1_f32(pg_tail, wptr + wstep), v);
vs2 = svmla_f32_m(pg_tail, vs2, svld1_f32(pg_tail, wptr + wstep*2), v);
vs3 = svmla_f32_m(pg_tail, vs3, svld1_f32(pg_tail, wptr + wstep*3), v);
vs4 = svmla_f32_m(pg_tail, vs4, svld1_f32(pg_tail, wptr + wstep*4), v);
vs5 = svmla_f32_m(pg_tail, vs5, svld1_f32(pg_tail, wptr + wstep*5), v);
vs6 = svmla_f32_m(pg_tail, vs6, svld1_f32(pg_tail, wptr + wstep*6), v);
vs7 = svmla_f32_m(pg_tail, vs7, svld1_f32(pg_tail, wptr + wstep*7), v);
vs8 = svmla_f32_m(pg_tail, vs8, svld1_f32(pg_tail, wptr + wstep*8), v);
vs9 = svmla_f32_m(pg_tail, vs9, svld1_f32(pg_tail, wptr + wstep*9), v);
vs10 = svmla_f32_m(pg_tail, vs10, svld1_f32(pg_tail, wptr + wstep*10), v);
vs11 = svmla_f32_m(pg_tail, vs11, svld1_f32(pg_tail, wptr + wstep*11), v);
vs12 = svmla_f32_m(pg_tail, vs12, svld1_f32(pg_tail, wptr + wstep*12), v);
vs13 = svmla_f32_m(pg_tail, vs13, svld1_f32(pg_tail, wptr + wstep*13), v);
vs14 = svmla_f32_m(pg_tail, vs14, svld1_f32(pg_tail, wptr + wstep*14), v);
}
float sum[15];
sum[0] = svaddv_f32(pg_all, vs0);
sum[1] = svaddv_f32(pg_all, vs1);
sum[2] = svaddv_f32(pg_all, vs2);
sum[3] = svaddv_f32(pg_all, vs3);
sum[4] = svaddv_f32(pg_all, vs4);
sum[5] = svaddv_f32(pg_all, vs5);
sum[6] = svaddv_f32(pg_all, vs6);
sum[7] = svaddv_f32(pg_all, vs7);
sum[8] = svaddv_f32(pg_all, vs8);
sum[9] = svaddv_f32(pg_all, vs9);
sum[10] = svaddv_f32(pg_all, vs10);
sum[11] = svaddv_f32(pg_all, vs11);
sum[12] = svaddv_f32(pg_all, vs12);
sum[13] = svaddv_f32(pg_all, vs13);
sum[14] = svaddv_f32(pg_all, vs14);
for (int j = 0; j < 15; j += vl) {
svbool_t pg = svwhilelt_b32(j, 15);
svfloat32_t v_sum = svld1_f32(pg, sum + j);
svfloat32_t v_bias = svld1_f32(pg, bias + i + j);
svst1_f32(pg, dst + i + j, svadd_f32_z(pg, v_sum, v_bias));
}
}
float temp = 0.f;
for( ; i < nvecs; i++ )
{
const float* wrow = weights + i * wstep;
svfloat32_t vs0 = svdup_f32(0.0f);
int k = 0;
for( ; k <= vecsize - vl; k += vl )
{
svfloat32_t v = svld1_f32(pg_all, reinterpret_cast<const float*>(vec) + k);
vs0 = svmla_f32_m(pg_all, vs0, svld1_f32(pg_all, wrow + k), v);
}
if (k != vecsize) {
svbool_t pg_tail = svwhilelt_b32(k, vecsize);
svfloat32_t v = svld1_f32(pg_tail, reinterpret_cast<const float*>(vec) + k);
vs0 = svmla_f32_m(pg_tail, vs0, svld1_f32(pg_tail, wrow + k), v);
}
temp = svaddv_f32(pg_all, vs0);
dst[i] = temp + bias[i];
}
}
#endif
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON && !defined(CV_CPU_COMPILE_SVE)
static const uint32_t tailMaskArray[7] = {
0u, 0u, 0u, 0u,
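
For reference, the contract implemented by the SVE kernel above (dst = vec * weights^T + bias, as its comment states) reduces to the scalar form below; this is only an illustration of the semantics, and the name fastGEMM1T_ref is not part of the patch:

```cpp
#include <cstddef>  // std::size_t

// Scalar reference for the same computation as the vectorized kernel:
// dst[i] = bias[i] + sum_k vec[k] * weights[i*wstep + k], for i in [0, nvecs).
void fastGEMM1T_ref(const float* vec, const float* weights,
                    std::size_t wstep, const float* bias,
                    float* dst, int nvecs, int vecsize)
{
    for (int i = 0; i < nvecs; i++)
    {
        const float* wrow = weights + i * wstep;  // row i of the weight matrix
        float s = 0.f;
        for (int k = 0; k < vecsize; k++)
            s += vec[k] * wrow[k];
        dst[i] = s + bias[i];
    }
}
```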


@@ -138,6 +138,9 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
#if CV_TRY_AVX2
bool useAVX2;
#endif
#if CV_TRY_SVE
bool useSVE;
#endif
#if CV_TRY_NEON
bool useNEON;
#endif
@@ -156,6 +159,9 @@ public:
#if CV_TRY_AVX2
, useAVX2(checkHardwareSupport(CPU_AVX2))
#endif
#if CV_TRY_SVE
, useSVE(checkHardwareSupport(CPU_SVE))
#endif
#if CV_TRY_NEON
, useNEON(checkHardwareSupport(CPU_NEON))
#endif
@@ -495,6 +501,13 @@ public:
&& Wh.depth() == CV_32F && hInternal.depth() == CV_32F && gates.depth() == CV_32F
&& Wh.cols >= 8;
#endif
#if CV_TRY_SVE
bool canUseSVE = gates.isContinuous() && bias.isContinuous()
&& Wx.depth() == CV_32F && gates.depth() == CV_32F
&& bias.depth() == CV_32F;
bool canUseSVE_hInternal = hInternal.isContinuous() && gates.isContinuous() && bias.isContinuous()
&& Wh.depth() == CV_32F && hInternal.depth() == CV_32F && gates.depth() == CV_32F;
#endif
#if CV_TRY_NEON
bool canUseNeon = gates.isContinuous() && bias.isContinuous()
&& Wx.depth() == CV_32F && gates.depth() == CV_32F
@@ -554,6 +567,23 @@ public:
}
else
#endif
#if CV_TRY_SVE
if (useSVE && canUseSVE && xCurr.isContinuous())
{
for (int n = 0; n < xCurr.rows; n++) {
opt_SVE::fastGEMM1T(
xCurr.ptr<float>(n),
Wx.ptr<float>(),
Wx.step1(),
bias.ptr<float>(),
gates.ptr<float>(n),
Wx.rows,
Wx.cols
);
}
}
else
#endif
#if CV_TRY_NEON
if (useNEON && canUseNeon && xCurr.isContinuous())
{
@@ -610,6 +640,23 @@ public:
}
else
#endif
#if CV_TRY_SVE
if (useSVE && canUseSVE_hInternal)
{
for (int n = 0; n < hInternal.rows; n++) {
opt_SVE::fastGEMM1T(
hInternal.ptr<float>(n),
Wh.ptr<float>(),
Wh.step1(),
gates.ptr<float>(n),
gates.ptr<float>(n),
Wh.rows,
Wh.cols
);
}
}
else
#endif
#if CV_TRY_NEON
if (useNEON && canUseNeon_hInternal)
{


@@ -1,6 +1,9 @@
# see https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html#index-march
function(ocv_set_platform_flags VAR)
unset(flags)
if(ENABLE_SVE)
set(flags "${flags}+sve")
endif()
if(ENABLE_BF16)
set(flags "${flags}+bf16")
endif()