mirror of
https://github.com/opencv/opencv.git
synced 2026-01-18 17:21:42 +01:00
Merge pull request #28055 from nishith-fujitsu:sve_fastGEMM1t
dnn: add SVE optimized fastGEMM1T function and SVE dispatch #28055 ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch **Description** This PR enables fastGemm1t vectorized with SVE for AARCH64 architecture that called by recurrent layers and fully connected layers with SVE dispatching mechanism. **ARM Compatibility:** Modified the build scripts, and configuration files to ensure compatibility with ARM processors. **Checklist** Code changes have been tested on ARM devices (Graviton3). **Modifications** - Implemented FastGemm1T kernel in SVE with Vector length agnostic approach. - Added Flags and checks to call our ported Kernel in Recurrent Layer and FullyConnected layer. - Changes made to cmakelist.txt to dispatch our ported kernel for SVE. - Flag OpenCV Dispatch with SVE optimization is added to support SVE implemented kernel for OpenCV. According to OpenCV build optimization https://github.com/opencv/opencv/wiki/CPU-optimizations-build-options cmake \ -DCPU_BASELINE=NEON\ -D CPU_DISPATCH=SVE\ **Performance Improvement** - The suggested optimizations Improves the performance of LSTM layer and fully connected layer. <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns="http://www.w3.org/TR/REC-html40"> <head> <meta name=ProgId content=Excel.Sheet> <meta name=Generator content="Microsoft Excel 15"> <link id=Main-File rel=Main-File href="file:///C:/Users/jaiswaln/AppData/Local/Temp/msohtmlclip1/01/clip.htm"> <link rel=File-List href="file:///C:/Users/jaiswaln/AppData/Local/Temp/msohtmlclip1/01/clip_filelist.xml"> <style> <!--table {mso-displayed-decimal-separator:"\."; mso-displayed-thousand-separator:"\,";} @page {margin:.75in .7in .75in .7in; mso-header-margin:.3in; mso-footer-margin:.3in;} tr {mso-height-source:auto;} col {mso-width-source:auto;} br {mso-data-placement:same-cell;} td {padding-top:1px; padding-right:1px; padding-left:1px; mso-ignore:padding; color:black; font-size:11.0pt; font-weight:400; font-style:normal; text-decoration:none; font-family:"Aptos Narrow", sans-serif; mso-font-charset:0; mso-number-format:General; text-align:general; vertical-align:bottom; border:none; mso-background-source:auto; mso-pattern:auto; mso-protection:locked visible; white-space:nowrap; mso-rotate:0;} .xl63 {border:.5pt solid windowtext;} .xl64 {text-align:center;} .xl65 {text-align:center; border:.5pt solid windowtext;} --> </style> </head> <body link="#467886" vlink="#96607D"> Name of Test | dnn_neon | dnn_sve | dnn_sve vs dnn_neon(x-factor) -- | -- | -- | -- lstm::Layer_LSTM::BATCH=1, IN=64, HIDDEN=192, TS=100 | 2.878 | 2.326 | 1.24 lstm::Layer_LSTM::BATCH=1, IN=192, HIDDEN=192, TS=100 | 4.162 | 3.08 | 1.35 lstm::Layer_LSTM::BATCH=1, IN=192, HIDDEN=512, TS=100 | 18.627 | 16.152 | 1.15 lstm::Layer_LSTM::BATCH=1, IN=1024, HIDDEN=192, TS=100 | 10.98 | 7.976 | 1.38 lstm::Layer_LSTM::BATCH=64, IN=64, HIDDEN=192, TS=2 | 4.41 | 3.459 | 1.27 lstm::Layer_LSTM::BATCH=64, IN=192, HIDDEN=192, TS=2 | 6.567 | 4.807 | 1.37 lstm::Layer_LSTM::BATCH=64, IN=192, HIDDEN=512, TS=2 | 28.471 | 22.909 | 1.24 lstm::Layer_LSTM::BATCH=64, IN=1024, HIDDEN=192, TS=2 | 15.491 | 12.537 | 1.24 lstm::Layer_LSTM::BATCH=128, IN=64, HIDDEN=192, TS=2 | 8.848 | 6.821 | 1.3 lstm::Layer_LSTM::BATCH=128, IN=192, HIDDEN=192, TS=2 | 12.969 | 9.522 | 1.36 lstm::Layer_LSTM::BATCH=128, IN=192, HIDDEN=512, TS=2 | 55.52 | 45.746 | 1.21 lstm::Layer_LSTM::BATCH=128, IN=1024, HIDDEN=192, TS=2 | 31.226 | 26.132 | 1.19 </body> </html> <html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns="http://www.w3.org/TR/REC-html40"> <head> <meta name=ProgId content=Excel.Sheet> <meta name=Generator content="Microsoft Excel 15"> <link id=Main-File rel=Main-File href="file:///C:/Users/jaiswaln/AppData/Local/Temp/msohtmlclip1/01/clip.htm"> <link rel=File-List href="file:///C:/Users/jaiswaln/AppData/Local/Temp/msohtmlclip1/01/clip_filelist.xml"> <style> <!--table {mso-displayed-decimal-separator:"\."; mso-displayed-thousand-separator:"\,";} @page {margin:.75in .7in .75in .7in; mso-header-margin:.3in; mso-footer-margin:.3in;} tr {mso-height-source:auto;} col {mso-width-source:auto;} br {mso-data-placement:same-cell;} td {padding-top:1px; padding-right:1px; padding-left:1px; mso-ignore:padding; color:black; font-size:11.0pt; font-weight:400; font-style:normal; text-decoration:none; font-family:"Aptos Narrow", sans-serif; mso-font-charset:0; mso-number-format:General; text-align:general; vertical-align:bottom; border:none; mso-background-source:auto; mso-pattern:auto; mso-protection:locked visible; white-space:nowrap; mso-rotate:0;} .xl65 {border:.5pt solid windowtext;} .xl66 {text-align:center;} .xl67 {text-align:center; border:.5pt solid windowtext;} --> </style> </head> <body link="#467886" vlink="#96607D"> Name of Test | dnn_neon | dnn_sve | dnn_sve vs dnn_neon(x-factor) -- | -- | -- | -- fc::Layer_FullyConnected::([5, 16, 512, 128], 256, false, OCV/CPU) | 5.086 | 4.483 | 1.13 fc::Layer_FullyConnected::([5, 16, 512, 128], 256, true, OCV/CPU) | 8.512 | 8.347 | 1.02 fc::Layer_FullyConnected::([5, 16, 512, 128], 512, false, OCV/CPU) | 9.467 | 8.965 | 1.06 fc::Layer_FullyConnected::([5, 16, 512, 128], 512, true, OCV/CPU) | 14.855 | 13.527 | 1.1 fc::Layer_FullyConnected::([5, 16, 512, 128], 1024, false, OCV/CPU) | 18.821 | 18.023 | 1.04 fc::Layer_FullyConnected::([5, 16, 512, 128], 1024, true, OCV/CPU) | 27.558 | 24.966 | 1.1 fc::Layer_FullyConnected::([5, 512, 384, 0], 256, false, OCV/CPU) | 0.924 | 0.804 | 1.15 fc::Layer_FullyConnected::([5, 512, 384, 0], 256, true, OCV/CPU) | 1.259 | 1.126 | 1.12 fc::Layer_FullyConnected::([5, 512, 384, 0], 512, false, OCV/CPU) | 1.957 | 1.655 | 1.18 fc::Layer_FullyConnected::([5, 512, 384, 0], 512, true, OCV/CPU) | 2.831 | 2.775 | 1.02 fc::Layer_FullyConnected::([5, 512, 384, 0], 1024, false, OCV/CPU) | 5.92 | 6.379 | 0.93 fc::Layer_FullyConnected::([5, 512, 384, 0], 1024, true, OCV/CPU) | 8.924 | 8.993 | 0.99 </body> </html>
This commit is contained in:
@@ -49,7 +49,7 @@
|
||||
|
||||
set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F")
|
||||
list(APPEND CPU_ALL_OPTIMIZATIONS "AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CLX;AVX512_ICL")
|
||||
list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD NEON_FP16 NEON_BF16)
|
||||
list(APPEND CPU_ALL_OPTIMIZATIONS SVE NEON VFPV3 FP16 NEON_DOTPROD NEON_FP16 NEON_BF16)
|
||||
list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
|
||||
list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
|
||||
list(APPEND CPU_ALL_OPTIMIZATIONS RVV)
|
||||
@@ -104,6 +104,7 @@ ocv_optimization_process_obsolete_option(ENABLE_AVX2 AVX2 ON)
|
||||
ocv_optimization_process_obsolete_option(ENABLE_FMA3 FMA3 ON)
|
||||
|
||||
ocv_optimization_process_obsolete_option(ENABLE_VFPV3 VFPV3 OFF)
|
||||
ocv_optimization_process_obsolete_option(ENABLE_SVE SVE ON)
|
||||
ocv_optimization_process_obsolete_option(ENABLE_NEON NEON ON)
|
||||
|
||||
ocv_optimization_process_obsolete_option(ENABLE_VSX VSX ON)
|
||||
@@ -352,7 +353,7 @@ if(X86 OR X86_64)
|
||||
endif()
|
||||
|
||||
elseif(ARM OR AARCH64)
|
||||
|
||||
ocv_update(CPU_SVE_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_sve.cpp")
|
||||
ocv_update(CPU_NEON_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon.cpp")
|
||||
ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp")
|
||||
ocv_update(CPU_NEON_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon_fp16.cpp")
|
||||
@@ -369,16 +370,24 @@ elseif(ARM OR AARCH64)
|
||||
endif()
|
||||
ocv_update(CPU_FP16_IMPLIES "NEON")
|
||||
else()
|
||||
ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD;NEON_FP16;NEON_BF16")
|
||||
if (UNIX AND NOT APPLE)
|
||||
#Current Apple silicone M4 does not support SVE,
|
||||
#but some Xcode versions reports their support.
|
||||
ocv_update(CPU_KNOWN_OPTIMIZATIONS "SVE;NEON;FP16;NEON_DOTPROD;NEON_FP16;NEON_BF16")
|
||||
else()
|
||||
ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD;NEON_FP16;NEON_BF16")
|
||||
endif()
|
||||
ocv_update(CPU_FP16_IMPLIES "NEON")
|
||||
ocv_update(CPU_NEON_DOTPROD_IMPLIES "NEON")
|
||||
ocv_update(CPU_NEON_FP16_IMPLIES "NEON")
|
||||
ocv_update(CPU_NEON_BF16_IMPLIES "NEON")
|
||||
if(MSVC)
|
||||
ocv_update(CPU_SVE_FLAGS_ON "")
|
||||
ocv_update(CPU_NEON_DOTPROD_FLAGS_ON "")
|
||||
ocv_update(CPU_NEON_FP16_FLAGS_ON "")
|
||||
ocv_update(CPU_NEON_BF16_FLAGS_ON "")
|
||||
else()
|
||||
ocv_update(CPU_SVE_FLAGS_ON "-march=armv8.2-a+sve")
|
||||
ocv_update(CPU_NEON_DOTPROD_FLAGS_ON "-march=armv8.2-a+dotprod")
|
||||
ocv_update(CPU_NEON_FP16_FLAGS_ON "-march=armv8.2-a+fp16")
|
||||
ocv_update(CPU_NEON_BF16_FLAGS_ON "-march=armv8.2-a+bf16")
|
||||
|
||||
24
cmake/checks/cpu_sve.cpp
Normal file
24
cmake/checks/cpu_sve.cpp
Normal file
@@ -0,0 +1,24 @@
|
||||
#include <stdio.h>
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
# include <arm_sve.h>
|
||||
# define CV_SVE 1
|
||||
#endif
|
||||
|
||||
#if defined(CV_SVE)
|
||||
int test()
|
||||
{
|
||||
const float src[1024] = {0.0};
|
||||
svbool_t pg = svptrue_b32();
|
||||
svfloat32_t val = svld1(pg, src);
|
||||
return (int)svlastb_f32(pg, val);
|
||||
}
|
||||
#else
|
||||
#error "SVE is not supported"
|
||||
#endif
|
||||
|
||||
int main()
|
||||
{
|
||||
printf("%d\n", test());
|
||||
return 0;
|
||||
}
|
||||
@@ -237,6 +237,10 @@ struct VZeroUpperGuard {
|
||||
#elif defined(__ARM_NEON)
|
||||
# include <arm_neon.h>
|
||||
# define CV_NEON 1
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
# include<arm_sve.h>
|
||||
# define CV_SVE 1
|
||||
#endif
|
||||
#elif defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__)
|
||||
# include <altivec.h>
|
||||
# undef vector
|
||||
@@ -362,6 +366,10 @@ struct VZeroUpperGuard {
|
||||
# define CV_NEON 0
|
||||
#endif
|
||||
|
||||
#ifndef CV_SVE
|
||||
# define CV_SVE 0
|
||||
#endif
|
||||
|
||||
#ifndef CV_RVV071
|
||||
# define CV_RVV071 0
|
||||
#endif
|
||||
|
||||
@@ -399,6 +399,27 @@
|
||||
#endif
|
||||
#define __CV_CPU_DISPATCH_CHAIN_AVX512_ICL(fn, args, mode, ...) CV_CPU_CALL_AVX512_ICL(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
|
||||
|
||||
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SVE
|
||||
# define CV_TRY_SVE 1
|
||||
# define CV_CPU_FORCE_SVE 1
|
||||
# define CV_CPU_HAS_SUPPORT_SVE 1
|
||||
# define CV_CPU_CALL_SVE(fn, args) return (cpu_baseline::fn args)
|
||||
# define CV_CPU_CALL_SVE_(fn, args) return (opt_SVE::fn args)
|
||||
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SVE
|
||||
# define CV_TRY_SVE 1
|
||||
# define CV_CPU_FORCE_SVE 0
|
||||
# define CV_CPU_HAS_SUPPORT_SVE (cv::checkHardwareSupport(CV_CPU_SVE))
|
||||
# define CV_CPU_CALL_SVE(fn, args) if (CV_CPU_HAS_SUPPORT_SVE) return (opt_SVE::fn args)
|
||||
# define CV_CPU_CALL_SVE_(fn, args) if (CV_CPU_HAS_SUPPORT_SVE) return (opt_SVE::fn args)
|
||||
#else
|
||||
# define CV_TRY_SVE 0
|
||||
# define CV_CPU_FORCE_SVE 0
|
||||
# define CV_CPU_HAS_SUPPORT_SVE 0
|
||||
# define CV_CPU_CALL_SVE(fn, args)
|
||||
# define CV_CPU_CALL_SVE_(fn, args)
|
||||
#endif
|
||||
#define __CV_CPU_DISPATCH_CHAIN_SVE(fn, args, mode, ...) CV_CPU_CALL_SVE(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
|
||||
|
||||
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON
|
||||
# define CV_TRY_NEON 1
|
||||
# define CV_CPU_FORCE_NEON 1
|
||||
|
||||
@@ -279,6 +279,7 @@ namespace cv {
|
||||
#define CV_CPU_NEON_DOTPROD 101
|
||||
#define CV_CPU_NEON_FP16 102
|
||||
#define CV_CPU_NEON_BF16 103
|
||||
#define CV_CPU_SVE 104
|
||||
|
||||
#define CV_CPU_MSA 150
|
||||
|
||||
@@ -341,6 +342,7 @@ enum CpuFeatures {
|
||||
CPU_NEON_DOTPROD = 101,
|
||||
CPU_NEON_FP16 = 102,
|
||||
CPU_NEON_BF16 = 103,
|
||||
CPU_SVE = 104,
|
||||
|
||||
CPU_MSA = 150,
|
||||
|
||||
|
||||
@@ -427,6 +427,7 @@ struct HWFeatures
|
||||
g_hwFeatureNames[CPU_NEON_DOTPROD] = "NEON_DOTPROD";
|
||||
g_hwFeatureNames[CPU_NEON_FP16] = "NEON_FP16";
|
||||
g_hwFeatureNames[CPU_NEON_BF16] = "NEON_BF16";
|
||||
g_hwFeatureNames[CPU_SVE] = "SVE";
|
||||
|
||||
g_hwFeatureNames[CPU_VSX] = "VSX";
|
||||
g_hwFeatureNames[CPU_VSX3] = "VSX3";
|
||||
@@ -589,6 +590,7 @@ struct HWFeatures
|
||||
{
|
||||
have[CV_CPU_NEON_DOTPROD] = (auxv.a_un.a_val & (1 << 20)) != 0; // HWCAP_ASIMDDP
|
||||
have[CV_CPU_NEON_FP16] = (auxv.a_un.a_val & (1 << 10)) != 0; // HWCAP_ASIMDHP
|
||||
have[CV_CPU_SVE] = (auxv.a_un.a_val & (1 << 22)) != 0; // HWCAP_SVE
|
||||
}
|
||||
#if defined(AT_HWCAP2)
|
||||
else if (auxv.a_type == AT_HWCAP2)
|
||||
|
||||
@@ -4,7 +4,7 @@ endif()
|
||||
|
||||
set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass")
|
||||
|
||||
ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX NEON)
|
||||
ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX NEON SVE)
|
||||
ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX RVV LASX NEON)
|
||||
ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_block" AVX AVX2 NEON NEON_FP16)
|
||||
ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_depthwise" AVX AVX2 RVV LASX)
|
||||
|
||||
@@ -228,6 +228,7 @@ public:
|
||||
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
|
||||
p.useRVV = checkHardwareSupport(CPU_RVV);
|
||||
p.useLASX = checkHardwareSupport(CPU_LASX);
|
||||
p.useSVE = checkHardwareSupport(CPU_SVE);
|
||||
|
||||
parallel_for_(Range(0, nstripes), p, nstripes);
|
||||
}
|
||||
@@ -277,6 +278,12 @@ public:
|
||||
opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize_aligned);
|
||||
else
|
||||
#endif
|
||||
#if CV_TRY_SVE
|
||||
if( useSVE ) {
|
||||
opt_SVE::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize_aligned);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#if CV_TRY_RVV && CV_RVV
|
||||
if( useRVV )
|
||||
opt_RVV::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
|
||||
@@ -342,6 +349,7 @@ public:
|
||||
bool useAVX512;
|
||||
bool useRVV;
|
||||
bool useLASX;
|
||||
bool useSVE;
|
||||
};
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
@@ -53,7 +53,119 @@ void fastGEMM( const float* aptr, size_t astep, const float* bptr,
|
||||
size_t bstep, float* cptr, size_t cstep,
|
||||
int ma, int na, int nb );
|
||||
|
||||
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON
|
||||
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && defined(CV_CPU_COMPILE_SVE)
|
||||
#include <arm_sve.h>
|
||||
// dst = vec * weights^t + bias
|
||||
|
||||
void fastGEMM1T( const float* vec, const float* weights,
|
||||
size_t wstep, const float* bias,
|
||||
float* dst, int nvecs, int vecsize )
|
||||
{
|
||||
svbool_t pg_all = svptrue_b32();
|
||||
int i = 0;
|
||||
int vl = svcntw();
|
||||
for( ; i <= nvecs - 15; i += 15 )
|
||||
{
|
||||
const float* wrow0 = weights + i * wstep; // base pointer for row i
|
||||
// we will use wrow0 + k, wrow0 + wstep + k, etc
|
||||
svfloat32_t vs0 = svdup_f32(0.0f), vs1 = svdup_f32(0.0f),
|
||||
vs2 = svdup_f32(0.0f), vs3 = svdup_f32(0.0f),
|
||||
vs4 = svdup_f32(0.0f), vs5 = svdup_f32(0.0f),
|
||||
vs6 = svdup_f32(0.0f), vs7 = svdup_f32(0.0f),
|
||||
vs8 = svdup_f32(0.0f), vs9 = svdup_f32(0.0f),
|
||||
vs10 = svdup_f32(0.0f), vs11 = svdup_f32(0.0f),
|
||||
vs12 = svdup_f32(0.0f), vs13 = svdup_f32(0.0f),
|
||||
vs14 = svdup_f32(0.0f);
|
||||
int k = 0;
|
||||
for( ; k <= vecsize - vl; k += vl )
|
||||
{
|
||||
// load input chunk
|
||||
const float* vecptr = reinterpret_cast<const float*>(vec) + k;
|
||||
svfloat32_t v = svld1_f32(pg_all, vecptr);
|
||||
// load weights from each of 15 rows at offset k
|
||||
vs0 = svmla_f32_m(pg_all, vs0, svld1_f32(pg_all, wrow0 + k), v);
|
||||
vs1 = svmla_f32_m(pg_all, vs1, svld1_f32(pg_all, wrow0 + wstep + k), v);
|
||||
vs2 = svmla_f32_m(pg_all, vs2, svld1_f32(pg_all, wrow0 + wstep*2 + k), v);
|
||||
vs3 = svmla_f32_m(pg_all, vs3, svld1_f32(pg_all, wrow0 + wstep*3 + k), v);
|
||||
vs4 = svmla_f32_m(pg_all, vs4, svld1_f32(pg_all, wrow0 + wstep*4 + k), v);
|
||||
vs5 = svmla_f32_m(pg_all, vs5, svld1_f32(pg_all, wrow0 + wstep*5 + k), v);
|
||||
vs6 = svmla_f32_m(pg_all, vs6, svld1_f32(pg_all, wrow0 + wstep*6 + k), v);
|
||||
vs7 = svmla_f32_m(pg_all, vs7, svld1_f32(pg_all, wrow0 + wstep*7 + k), v);
|
||||
vs8 = svmla_f32_m(pg_all, vs8, svld1_f32(pg_all, wrow0 + wstep*8 + k), v);
|
||||
vs9 = svmla_f32_m(pg_all, vs9, svld1_f32(pg_all, wrow0 + wstep*9 + k), v);
|
||||
vs10 = svmla_f32_m(pg_all, vs10, svld1_f32(pg_all, wrow0 + wstep*10 + k), v);
|
||||
vs11 = svmla_f32_m(pg_all, vs11, svld1_f32(pg_all, wrow0 + wstep*11 + k), v);
|
||||
vs12 = svmla_f32_m(pg_all, vs12, svld1_f32(pg_all, wrow0 + wstep*12 + k), v);
|
||||
vs13 = svmla_f32_m(pg_all, vs13, svld1_f32(pg_all, wrow0 + wstep*13 + k), v);
|
||||
vs14 = svmla_f32_m(pg_all, vs14, svld1_f32(pg_all, wrow0 + wstep*14 + k), v);
|
||||
}
|
||||
if(k < vecsize){
|
||||
svbool_t pg_tail = svwhilelt_b32(k, vecsize);
|
||||
const float* vecptr = reinterpret_cast<const float*>(vec) + k;
|
||||
svfloat32_t v = svld1_f32(pg_tail, vecptr);
|
||||
const float* wptr = wrow0 + k;
|
||||
vs0 = svmla_f32_m(pg_tail, vs0, svld1_f32(pg_tail, wptr), v);
|
||||
vs1 = svmla_f32_m(pg_tail, vs1, svld1_f32(pg_tail, wptr + wstep), v);
|
||||
vs2 = svmla_f32_m(pg_tail, vs2, svld1_f32(pg_tail, wptr + wstep*2), v);
|
||||
vs3 = svmla_f32_m(pg_tail, vs3, svld1_f32(pg_tail, wptr + wstep*3), v);
|
||||
vs4 = svmla_f32_m(pg_tail, vs4, svld1_f32(pg_tail, wptr + wstep*4), v);
|
||||
vs5 = svmla_f32_m(pg_tail, vs5, svld1_f32(pg_tail, wptr + wstep*5), v);
|
||||
vs6 = svmla_f32_m(pg_tail, vs6, svld1_f32(pg_tail, wptr + wstep*6), v);
|
||||
vs7 = svmla_f32_m(pg_tail, vs7, svld1_f32(pg_tail, wptr + wstep*7), v);
|
||||
vs8 = svmla_f32_m(pg_tail, vs8, svld1_f32(pg_tail, wptr + wstep*8), v);
|
||||
vs9 = svmla_f32_m(pg_tail, vs9, svld1_f32(pg_tail, wptr + wstep*9), v);
|
||||
vs10 = svmla_f32_m(pg_tail, vs10, svld1_f32(pg_tail, wptr + wstep*10), v);
|
||||
vs11 = svmla_f32_m(pg_tail, vs11, svld1_f32(pg_tail, wptr + wstep*11), v);
|
||||
vs12 = svmla_f32_m(pg_tail, vs12, svld1_f32(pg_tail, wptr + wstep*12), v);
|
||||
vs13 = svmla_f32_m(pg_tail, vs13, svld1_f32(pg_tail, wptr + wstep*13), v);
|
||||
vs14 = svmla_f32_m(pg_tail, vs14, svld1_f32(pg_tail, wptr + wstep*14), v);
|
||||
}
|
||||
float sum[15];
|
||||
sum[0] = svaddv_f32(pg_all, vs0);
|
||||
|
||||
sum[1] = svaddv_f32(pg_all, vs1);
|
||||
sum[2] = svaddv_f32(pg_all, vs2);
|
||||
sum[3] = svaddv_f32(pg_all, vs3);
|
||||
sum[4] = svaddv_f32(pg_all, vs4);
|
||||
sum[5] = svaddv_f32(pg_all, vs5);
|
||||
sum[6] = svaddv_f32(pg_all, vs6);
|
||||
sum[7] = svaddv_f32(pg_all, vs7);
|
||||
sum[8] = svaddv_f32(pg_all, vs8);
|
||||
sum[9] = svaddv_f32(pg_all, vs9);
|
||||
sum[10] = svaddv_f32(pg_all, vs10);
|
||||
sum[11] = svaddv_f32(pg_all, vs11);
|
||||
sum[12] = svaddv_f32(pg_all, vs12);
|
||||
sum[13] = svaddv_f32(pg_all, vs13);
|
||||
sum[14] = svaddv_f32(pg_all, vs14);
|
||||
for (int j = 0; j < 15; j += vl) {
|
||||
svbool_t pg = svwhilelt_b32(j, 15);
|
||||
svfloat32_t v_sum = svld1_f32(pg, sum + j);
|
||||
svfloat32_t v_bias = svld1_f32(pg, bias + i + j);
|
||||
svst1_f32(pg, dst + i + j, svadd_f32_z(pg, v_sum, v_bias));
|
||||
}
|
||||
}
|
||||
float temp = 0.f;
|
||||
for( ; i < nvecs; i++ )
|
||||
{
|
||||
const float* wrow = weights + i * wstep;
|
||||
svfloat32_t vs0 = svdup_f32(0.0f);
|
||||
int k = 0;
|
||||
for( ; k <= vecsize - vl; k += vl )
|
||||
{
|
||||
svfloat32_t v = svld1_f32(pg_all, reinterpret_cast<const float*>(vec) + k);
|
||||
vs0 = svmla_f32_m(pg_all, vs0, svld1_f32(pg_all, wrow + k), v);
|
||||
}
|
||||
if (k != vecsize) {
|
||||
svbool_t pg_tail = svwhilelt_b32(k, vecsize);
|
||||
svfloat32_t v = svld1_f32(pg_tail, reinterpret_cast<const float*>(vec) + k);
|
||||
vs0 = svmla_f32_m(pg_tail, vs0, svld1_f32(pg_tail, wrow + k), v);
|
||||
}
|
||||
temp = svaddv_f32(pg_all, vs0);
|
||||
dst[i] = temp + bias[i];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON && !defined(CV_CPU_COMPILE_SVE)
|
||||
|
||||
static const uint32_t tailMaskArray[7] = {
|
||||
0u, 0u, 0u, 0u,
|
||||
|
||||
@@ -138,6 +138,9 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
|
||||
#if CV_TRY_AVX2
|
||||
bool useAVX2;
|
||||
#endif
|
||||
#if CV_TRY_SVE
|
||||
bool useSVE;
|
||||
#endif
|
||||
#if CV_TRY_NEON
|
||||
bool useNEON;
|
||||
#endif
|
||||
@@ -156,6 +159,9 @@ public:
|
||||
#if CV_TRY_AVX2
|
||||
, useAVX2(checkHardwareSupport(CPU_AVX2))
|
||||
#endif
|
||||
#if CV_TRY_SVE
|
||||
, useSVE(checkHardwareSupport(CPU_SVE))
|
||||
#endif
|
||||
#if CV_TRY_NEON
|
||||
, useNEON(checkHardwareSupport(CPU_NEON))
|
||||
#endif
|
||||
@@ -495,6 +501,13 @@ public:
|
||||
&& Wh.depth() == CV_32F && hInternal.depth() == CV_32F && gates.depth() == CV_32F
|
||||
&& Wh.cols >= 8;
|
||||
#endif
|
||||
#if CV_TRY_SVE
|
||||
bool canUseSVE = gates.isContinuous() && bias.isContinuous()
|
||||
&& Wx.depth() == CV_32F && gates.depth() == CV_32F
|
||||
&& bias.depth() == CV_32F;
|
||||
bool canUseSVE_hInternal = hInternal.isContinuous() && gates.isContinuous() && bias.isContinuous()
|
||||
&& Wh.depth() == CV_32F && hInternal.depth() == CV_32F && gates.depth() == CV_32F;
|
||||
#endif
|
||||
#if CV_TRY_NEON
|
||||
bool canUseNeon = gates.isContinuous() && bias.isContinuous()
|
||||
&& Wx.depth() == CV_32F && gates.depth() == CV_32F
|
||||
@@ -554,6 +567,23 @@ public:
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#if CV_TRY_SVE
|
||||
if (useSVE && canUseSVE && xCurr.isContinuous())
|
||||
{
|
||||
for (int n = 0; n < xCurr.rows; n++) {
|
||||
opt_SVE::fastGEMM1T(
|
||||
xCurr.ptr<float>(n),
|
||||
Wx.ptr<float>(),
|
||||
Wx.step1(),
|
||||
bias.ptr<float>(),
|
||||
gates.ptr<float>(n),
|
||||
Wx.rows,
|
||||
Wx.cols
|
||||
);
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#if CV_TRY_NEON
|
||||
if (useNEON && canUseNeon && xCurr.isContinuous())
|
||||
{
|
||||
@@ -610,6 +640,23 @@ public:
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#if CV_TRY_SVE
|
||||
if (useSVE && canUseSVE_hInternal)
|
||||
{
|
||||
for (int n = 0; n < hInternal.rows; n++) {
|
||||
opt_SVE::fastGEMM1T(
|
||||
hInternal.ptr<float>(n),
|
||||
Wh.ptr<float>(),
|
||||
Wh.step1(),
|
||||
gates.ptr<float>(n),
|
||||
gates.ptr<float>(n),
|
||||
Wh.rows,
|
||||
Wh.cols
|
||||
);
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#if CV_TRY_NEON
|
||||
if (useNEON && canUseNeon_hInternal)
|
||||
{
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
# see https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html#index-march
|
||||
function(ocv_set_platform_flags VAR)
|
||||
unset(flags)
|
||||
if(ENABLE_SVE)
|
||||
set(flags "${flags}+sve")
|
||||
endif()
|
||||
if(ENABLE_BF16)
|
||||
set(flags "${flags}+bf16")
|
||||
endif()
|
||||
|
||||
Reference in New Issue
Block a user