SHA512 performance optimized with RISC-V RVV
This patch improves SHA512 speed using the RISC-V vector cryptography extensions. The performance figures below were measured on a Xuantie C930 FPGA with VLEN=256:
- sha512 speed improves from 197032K to 1010986K

Reviewed-by: Paul Yang <paulyang.inf@gmail.com>
Reviewed-by: Paul Dale <paul.dale@oracle.com>
(Merged from https://github.com/openssl/openssl/pull/29263)
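For reference, throughput figures like these are typically collected with the openssl speed utility. The sketch below is a hypothetical measurement driver, not part of this commit: the binary path and the capability strings are assumptions, and OPENSSL_riscvcap is the environment variable the CI workflow below exercises.

#!/usr/bin/perl
# Hypothetical benchmark driver (not part of this commit). Assumes a riscv64
# build of OpenSSL; compares SHA-512 throughput with the vector crypto
# extensions disabled and enabled via the OPENSSL_riscvcap override.
use strict;
use warnings;

my $openssl = $ENV{OPENSSL_BIN} // "apps/openssl";   # path is an assumption

for my $caps ("rv64gc",                               # scalar baseline
              "rv64gc_zvkb_zvknhb_zvl256") {          # vector SHA-512 path
    local $ENV{OPENSSL_riscvcap} = $caps;
    print "== OPENSSL_riscvcap=$caps ==\n";
    system($openssl, "speed", "-evp", "sha512") == 0
        or die "openssl speed failed: $?";
}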
.github/workflows/riscv-more-cross-compiles.yml (vendored) | 20 ++++++++++++++++++++
diff --git a/.github/workflows/riscv-more-cross-compiles.yml b/.github/workflows/riscv-more-cross-compiles.yml
@@ -161,6 +161,26 @@ jobs:
           qemucpu: "rv64,zba=true,zbb=true,zbc=true,zbs=true,zbkb=true,zbkc=true,zbkx=true,zknd=true,zkne=true,zknh=true,zksed=true,zksh=true,zkr=true,zkt=true,v=true,vlen=128,zvbb=true,zvbc=true,zvkb=true,zvkg=true,zvkned=true,zvknha=true,zvknhb=true,zvksed=true,zvksh=true",
           opensslcapsname: riscvcap, # OPENSSL_riscvcap
           opensslcaps: "rv64gc_zba_zbb_zbc_zbs_zbkb_zbkc_zbkx_zknd_zkne_zknh_zksed_zksh_zkr_zkt_v_zvbb_zvbc_zvkb_zvkg_zvkned_zvknha_zvknhb_zvksed_zvksh"
         }, {
+          # RV64GC with all currently OpenSSL-supported extensions, with zvl256
+          # crypto/sha/asm/sha512-riscv64-zvkb-zvknhb.pl
+          arch: riscv64-linux-gnu,
+          libs: libc6-dev-riscv64-cross,
+          target: linux64-riscv64,
+          fips: no,
+          qemucpu: "rv64,zba=true,zbb=true,zbc=true,zbs=true,zbkb=true,zbkc=true,zbkx=true,zknd=true,zkne=true,zknh=true,zksed=true,zksh=true,zkr=true,zkt=true,v=true,vlen=256,zvbb=true,zvbc=true,zvkb=true,zvkg=true,zvkned=true,zvknha=true,zvknhb=true,zvksed=true,zvksh=true",
+          opensslcapsname: riscvcap, # OPENSSL_riscvcap
+          opensslcaps: "rv64gc_zba_zbb_zbc_zbs_zbkb_zbkc_zbkx_zknd_zkne_zknh_zksed_zksh_zkr_zkt_v_zvbb_zvbc_zvkb_zvkg_zvkned_zvknha_zvknhb_zvksed_zvksh_zvl256"
+        }, {
+          # RV64GC with all currently OpenSSL-supported extensions, with zvl512
+          # crypto/sha/asm/sha512-riscv64-zvkb-zvknhb.pl
+          arch: riscv64-linux-gnu,
+          libs: libc6-dev-riscv64-cross,
+          target: linux64-riscv64,
+          fips: no,
+          qemucpu: "rv64,zba=true,zbb=true,zbc=true,zbs=true,zbkb=true,zbkc=true,zbkx=true,zknd=true,zkne=true,zknh=true,zksed=true,zksh=true,zkr=true,zkt=true,v=true,vlen=512,zvbb=true,zvbc=true,zvkb=true,zvkg=true,zvkned=true,zvknha=true,zvknhb=true,zvksed=true,zvksh=true",
+          opensslcapsname: riscvcap, # OPENSSL_riscvcap
+          opensslcaps: "rv64gc_zba_zbb_zbc_zbs_zbkb_zbkc_zbkx_zknd_zkne_zknh_zksed_zksh_zkr_zkt_v_zvbb_zvbc_zvkb_zvkg_zvkned_zvknha_zvknhb_zvksed_zvksh_zvl512"
+        }, {
           # Inline asm
           # zbb/zbkb:
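The qemucpu and opensslcaps strings above encode the same extension set in two syntaxes: QEMU -cpu properties versus an ISA-style capability string, plus a zvl<N> entry when the configured VLEN exceeds 128. A minimal sketch of that correspondence, assuming only the comma/equals syntax visible in the strings above (the helper itself is hypothetical, not part of the workflow):

#!/usr/bin/perl
# Hypothetical helper: derive an OPENSSL_riscvcap-style string from a
# QEMU -cpu option string like the qemucpu values in the matrix above.
use strict;
use warnings;

sub qemucpu_to_caps {
    my ($qemucpu) = @_;
    my ($base, @props) = split /,/, $qemucpu;
    my (@exts, $zvl);
    for my $prop (@props) {
        my ($name, $val) = split /=/, $prop, 2;
        if ($name eq 'vlen') {
            # vlen=256 implies the Zvl256b guarantee, spelled zvl256 here
            $zvl = "zvl$val" if $val > 128;
        } elsif (defined $val && $val eq 'true') {
            push @exts, $name;            # enabled extension, keep its name
        }
    }
    my $caps = join '_', $base . 'gc', @exts;
    $caps .= "_$zvl" if defined $zvl;
    return $caps;
}

# The vlen=256 entry maps to a caps string ending in "_zvl256":
print qemucpu_to_caps("rv64,zba=true,zbb=true,v=true,vlen=256,zvkb=true,zvknhb=true"), "\n";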
diff --git a/crypto/perlasm/riscv.pm b/crypto/perlasm/riscv.pm
@@ -624,6 +624,14 @@ sub vmv_v_i {
     return ".word ".($template | ($imm << 15) | ($vd << 7));
 }
 
+sub vmv1r_v {
+    # vmv1r.v vd, vs1
+    my $template = 0b1001111_00000_00000_011_00000_1010111;
+    my $vd = read_vreg shift;
+    my $vs1 = read_vreg shift;
+    return ".word ".($template | ($vs1 << 20) | ($vd << 7));
+}
+
 sub vmv_v_x {
     # vmv.v.x vd, rs1
     my $template = 0b0101111_00000_00000_100_00000_1010111;
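The new vmv1r_v helper follows the same pattern as the surrounding subs: OR the register numbers into a fixed opcode template and emit the result as a raw .word, so the generated assembly builds even with assemblers that lack vector-crypto mnemonics. A self-contained sketch of that encoding (read_vreg below is a simplified stand-in for riscv.pm's real parser):

#!/usr/bin/perl
# Standalone sketch of the .word encoding used by vmv1r_v above.
use strict;
use warnings;

sub read_vreg {
    my ($reg) = @_;
    $reg =~ /^v([0-9]|[12][0-9]|3[01])$/ or die "invalid vector register: $reg";
    return $1;
}

sub vmv1r_v {
    # vmv1r.v vd, vs1 - whole-register move; vs1 lands in bits 24:20, vd in 11:7
    my $template = 0b1001111_00000_00000_011_00000_1010111;
    my $vd  = read_vreg shift;
    my $vs1 = read_vreg shift;
    return ".word " . ($template | ($vs1 << 20) | ($vd << 7));
}

# The round loop below uses this to snapshot the state, e.g. vmv1r.v v26, v22:
print vmv1r_v("v26", "v22"), "\n";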
diff --git a/crypto/sha/asm/sha512-riscv64-zvkb-zvknhb.pl b/crypto/sha/asm/sha512-riscv64-zvkb-zvknhb.pl
@@ -70,6 +70,7 @@ my $K512 = "K512";
 
 # Function arguments
 my ($H, $INP, $LEN, $KT, $H2, $INDEX_PATTERN) = ("a0", "a1", "a2", "a3", "t3", "t4");
+my ($T0, $T1) = ("t0", "t1");
 
 ################################################################################
 # void sha512_block_data_order_zvkb_zvknhb(void *c, const void *p, size_t len)
@@ -78,8 +79,6 @@ $code .= <<___;
 .globl sha512_block_data_order_zvkb_zvknhb
 .type sha512_block_data_order_zvkb_zvknhb,\@function
 sha512_block_data_order_zvkb_zvknhb:
-    @{[vsetivli "zero", 4, "e64", "m2", "ta", "ma"]}
-
     # H is stored as {a,b,c,d},{e,f,g,h}, but we need {f,e,b,a},{h,g,d,c}
     # The dst vtype is e64m2 and the index vtype is e8mf4.
     # We use index-load with the following index pattern at v1.
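The comments retained above describe reordering the eight hash words for the vsha2* instructions: an indexed load gathers memory order {a,b,c,d},{e,f,g,h} into register order {f,e,b,a},{h,g,d,c}, and scattering with the same index pattern (the vsuxei8_v stores at the end) undoes the shuffle. A small sketch of that permutation on word indices (the concrete byte-offset pattern loaded into v1 is elided from this excerpt, so the indices below are an illustration):

#!/usr/bin/perl
# Illustration of the state shuffle described in the comments above:
# memory order {a,b,c,d},{e,f,g,h} -> register order {f,e,b,a},{h,g,d,c}.
use strict;
use warnings;

my @state = qw(a b c d e f g h);
my @perm  = (5, 4, 1, 0, 7, 6, 3, 2);    # word indices for f,e,b,a,h,g,d,c
my @regs  = @state[@perm];
print "@regs\n";                          # prints: f e b a h g d c

# Scattering @regs back through the same indices restores @state, which is
# what the indexed vsuxei8_v stores do on the way out.
my @out;
@out[@perm] = @regs;
print "@out\n";                           # prints: a b c d e f g h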
@@ -105,9 +104,226 @@ sha512_block_data_order_zvkb_zvknhb:
     @{[vsetivli "zero", 1, "e8", "m1", "ta", "ma"]}
     @{[vmv_v_i $V0, 0x01]}
 
+    @{[vsetivli "zero", 4, "e64", "m2", "ta", "ma"]}
+    # Obtain VLEN and select the corresponding branch
+    csrr t0, vlenb
+    srl t1, t0, 5
+    beqz t1, sha512_block_data_order_zvkb_zvknhb_zvl128
+sha512_block_data_order_zvkb_zvknhb_zvl256_zvl512:
+    # When vlen=256 or 512, the round constants K512 can be loaded
+    # at once in vector register files.
+    @{[vsetivli "zero", 4, "e64", "m1", "ta", "ma"]}
+    # Load round constants K512
+    la $KT, $K512
+    @{[vle64_v $V2, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V3, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V4, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V5, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V6, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V7, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V8, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V9, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V11, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V13, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V15, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V17, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V19, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V21, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V23, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V25, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V27, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V29, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V30, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V31, ($KT)]}
+
-L_round_loop:
+L_round_loop_256_512:
+    # Decrement length by 1
+    addi $LEN, $LEN, -1
+
+    # Keep the current state as we need it later: H' = H+{a',b',c',...,h'}.
+    @{[vmv1r_v $V26, $V22]}
+    @{[vmv1r_v $V28, $V24]}
+
+    # Load the 1024-bits of the message block in v10, v12, v14, v16
+    # and perform the endian swap.
+    @{[vle64_v $V10, $INP]}
+    @{[vrev8_v $V10, $V10]}
+    addi $INP, $INP, 32
+    @{[vle64_v $V12, $INP]}
+    @{[vrev8_v $V12, $V12]}
+    addi $INP, $INP, 32
+    @{[vle64_v $V14, $INP]}
+    @{[vrev8_v $V14, $V14]}
+    addi $INP, $INP, 32
+    @{[vle64_v $V16, $INP]}
+    @{[vrev8_v $V16, $V16]}
+    addi $INP, $INP, 32
+
+    # Quad-round 0 (+0, v10->v12->v14->v16)
+    @{[vadd_vv $V18, $V2, $V10]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V14, $V12, $V0]}
+    @{[vsha2ms_vv $V10, $V18, $V16]}
+
+    # Quad-round 1 (+1, v12->v14->v16->v10)
+    @{[vadd_vv $V18, $V3, $V12]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V16, $V14, $V0]}
+    @{[vsha2ms_vv $V12, $V18, $V10]}
+
+    # Quad-round 2 (+2, v14->v16->v10->v12)
+    @{[vadd_vv $V18, $V4, $V14]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V10, $V16, $V0]}
+    @{[vsha2ms_vv $V14, $V18, $V12]}
+
+    # Quad-round 3 (+3, v16->v10->v12->v14)
+    @{[vadd_vv $V18, $V5, $V16]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V12, $V10, $V0]}
+    @{[vsha2ms_vv $V16, $V18, $V14]}
+
+    # Quad-round 4 (+4, v10->v12->v14->v16)
+    @{[vadd_vv $V18, $V6, $V10]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V14, $V12, $V0]}
+    @{[vsha2ms_vv $V10, $V18, $V16]}
+
+    # Quad-round 5 (+5, v12->v14->v16->v10)
+    @{[vadd_vv $V18, $V7, $V12]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V16, $V14, $V0]}
+    @{[vsha2ms_vv $V12, $V18, $V10]}
+
+    # Quad-round 6 (+6, v14->v16->v10->v12)
+    @{[vadd_vv $V18, $V8, $V14]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V10, $V16, $V0]}
+    @{[vsha2ms_vv $V14, $V18, $V12]}
+
+    # Quad-round 7 (+7, v16->v10->v12->v14)
+    @{[vadd_vv $V18, $V9, $V16]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V12, $V10, $V0]}
+    @{[vsha2ms_vv $V16, $V18, $V14]}
+
+    # Quad-round 8 (+8, v10->v12->v14->v16)
+    @{[vadd_vv $V18, $V11, $V10]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V14, $V12, $V0]}
+    @{[vsha2ms_vv $V10, $V18, $V16]}
+
+    # Quad-round 9 (+9, v12->v14->v16->v10)
+    @{[vadd_vv $V18, $V13, $V12]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V16, $V14, $V0]}
+    @{[vsha2ms_vv $V12, $V18, $V10]}
+
+    # Quad-round 10 (+10, v14->v16->v10->v12)
+    @{[vadd_vv $V18, $V15, $V14]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V10, $V16, $V0]}
+    @{[vsha2ms_vv $V14, $V18, $V12]}
+
+    # Quad-round 11 (+11, v16->v10->v12->v14)
+    @{[vadd_vv $V18, $V17, $V16]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V12, $V10, $V0]}
+    @{[vsha2ms_vv $V16, $V18, $V14]}
+
+    # Quad-round 12 (+12, v10->v12->v14->v16)
+    @{[vadd_vv $V18, $V19, $V10]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V14, $V12, $V0]}
+    @{[vsha2ms_vv $V10, $V18, $V16]}
+
+    # Quad-round 13 (+13, v12->v14->v16->v10)
+    @{[vadd_vv $V18, $V21, $V12]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V16, $V14, $V0]}
+    @{[vsha2ms_vv $V12, $V18, $V10]}
+
+    # Quad-round 14 (+14, v14->v16->v10->v12)
+    @{[vadd_vv $V18, $V23, $V14]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V10, $V16, $V0]}
+    @{[vsha2ms_vv $V14, $V18, $V12]}
+
+    # Quad-round 15 (+15, v16->v10->v12->v14)
+    @{[vadd_vv $V18, $V25, $V16]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V12, $V10, $V0]}
+    @{[vsha2ms_vv $V16, $V18, $V14]}
+
+    # Quad-round 16 (+0, v10->v12->v14->v16)
+    # Note that we stop generating new message schedule words (Wt, v10-16)
+    # as we already generated all the words we end up consuming (i.e., W[79:76]).
+    @{[vadd_vv $V18, $V27, $V10]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+
+    # Quad-round 17 (+1, v12->v14->v16->v10)
+    @{[vadd_vv $V18, $V29, $V12]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+
+    # Quad-round 18 (+2, v14->v16->v10->v12)
+    @{[vadd_vv $V18, $V30, $V14]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+
+    # Quad-round 19 (+3, v16->v10->v12->v14)
+    @{[vadd_vv $V18, $V31, $V16]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+
+    # H' = H+{a',b',c',...,h'}
+    @{[vadd_vv $V22, $V26, $V22]}
+    @{[vadd_vv $V24, $V28, $V24]}
+    bnez $LEN, L_round_loop_256_512
+
+    # Store {f,e,b,a},{h,g,d,c} back to {a,b,c,d},{e,f,g,h}.
+    @{[vsuxei8_v $V22, ($H), $V1]}
+    @{[vsuxei8_v $V24, ($H2), $V1]}
+
+    ret
+sha512_block_data_order_zvkb_zvknhb_zvl128:
+    @{[vsetivli $T0, 4, "e64", "m2", "ta", "ma"]}
+L_round_loop_128:
     # Load round constants K512
     la $KT, $K512
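The dispatch at the top of this hunk reads the vlenb CSR (VLEN in bytes) and shifts right by 5: only implementations with VLEN >= 256 get a nonzero result and take the path that keeps all of K512 resident in vector registers, while VLEN = 128 machines fall through to the loop that reloads K512 every block. A quick sketch of that arithmetic:

#!/usr/bin/perl
# The arithmetic behind "csrr t0, vlenb; srl t1, t0, 5; beqz t1, ...zvl128".
use strict;
use warnings;

for my $vlen (128, 256, 512) {
    my $vlenb = $vlen / 8;     # vlenb CSR: vector register length in bytes
    my $t1    = $vlenb >> 5;   # srl t1, t0, 5
    printf "VLEN=%-3d vlenb=%-2d t1=%d -> %s\n",
        $vlen, $vlenb, $t1,
        $t1 ? "256/512 path (K512 resident in v2..v31)"
            : "zvl128 path (reload K512 every block)";
}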
@@ -204,7 +420,7 @@ L_round_loop:
     # H' = H+{a',b',c',...,h'}
     @{[vadd_vv $V22, $V26, $V22]}
     @{[vadd_vv $V24, $V28, $V24]}
-    bnez $LEN, L_round_loop
+    bnez $LEN, L_round_loop_128
 
     # Store {f,e,b,a},{h,g,d,c} back to {a,b,c,d},{e,f,g,h}.
     @{[vsuxei8_v $V22, ($H), $V1]}
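Across both loops, the 20 quad-rounds rotate the message-schedule registers v10/v12/v14/v16 while v22/v24 carry the working state, and the last four quad-rounds skip vsha2ms because W[79:76] have already been produced. A sketch of that rotation schedule (the printing helper is illustrative, not part of the generator):

#!/usr/bin/perl
# Rotation schedule of the 20 quad-rounds in the round loops above.
use strict;
use warnings;

my @w = qw(v10 v12 v14 v16);
for my $qr (0 .. 19) {
    my ($w0, $w1, $w2, $w3) = @w;
    my $line = sprintf "quad-round %2d: Kt+%s -> vsha2cl/vsha2ch", $qr, $w0;
    $line .= ", vmerge($w2,$w1), vsha2ms -> $w0" if $qr < 16;   # new W words
    print "$line\n";
    push @w, shift @w;    # rotate: v10 -> v12 -> v14 -> v16 -> v10 ...
}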