This patch introduces an optimized powerpc64 assembly implementation of sha256-compress-n. It takes advantage of the vshasigmaw instruction, and unrolls the round loop so that more instructions can run in parallel.
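For reference, these are the four SHA-256 sigma functions that a single vshasigmaw evaluates, with the ST/SIX operand selections used in the code below (summarized from FIPS 180-4 and the ISA; not part of the patch itself):

  C vshasigmaw VRT, VRA, ST, SIX
  C ST selects lower/upper sigma; each SIX bit selects the 0/1 variant per word
  C (0b0000 = variant 0 for all four words, 0b1111 = variant 1 for all four)
  C   ST=0 SIX=0b0000   sigma0(x) = (x >>> 7)  ^ (x >>> 18) ^ (x >> 3)
  C   ST=0 SIX=0b1111   sigma1(x) = (x >>> 17) ^ (x >>> 19) ^ (x >> 10)
  C   ST=1 SIX=0b0000   Sigma0(x) = (x >>> 2)  ^ (x >>> 13) ^ (x >>> 22)
  C   ST=1 SIX=0b1111   Sigma1(x) = (x >>> 6)  ^ (x >>> 11) ^ (x >>> 25)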
The following data was captured on a POWER 10 LPAR @ ~3.896GHz
Current C implementation:

  Algorithm         mode Mbyte/s
  sha256           update  280.97
  hmac-sha256    64 bytes   80.81
  hmac-sha256   256 bytes  170.50
  hmac-sha256  1024 bytes  241.92
  hmac-sha256  4096 bytes  268.54
  hmac-sha256  single msg  276.16

With optimized assembly:

  Algorithm         mode Mbyte/s
  sha256           update  461.45
  hmac-sha256    64 bytes  123.88
  hmac-sha256   256 bytes  268.81
  hmac-sha256  1024 bytes  390.91
  hmac-sha256  4096 bytes  438.02
  hmac-sha256  single msg  453.83
Signed-off-by: Eric Richter <erichte@linux.ibm.com>
---
I split this patch out to be standalone, rather than delay it even further by trying to update SHA512 at the same time -- I will update the SHA512 implementation once this one stabilizes.
Regarding the load vperm needed for little endian: unfortunately we don't have a spare vector register to store the mask between rounds, so the best that can be done while maintaining p8 support is to store the mask in a VSX register, like the state values, and avoid the reload. That is a negligible performance change, however: around +1 MB/s on larger block counts (update, hmac 1024/4096/single msg) and -1 MB/s on smaller ones (hmac 64/256). A rough sketch of the idea is below.
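Sketch of that approach (untested), assuming vs8 is free to clobber -- the patch itself only uses vs0-vs7 among the low VSX registers, and vs8 is volatile under the ELFv2 ABI:

  define(`VSXMASK', `vs8')	C assumed-free volatile VSX register

  	C Once in the prologue, replacing the per-block DATA_LOAD_VEC:
  	IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)
  	xxlor	VSXMASK, VSR(VT0), VSR(VT0)')

  	C At the top of DOLOADS, recover the mask without touching memory:
  	IF_LE(`xxlor	VSR(VT0), VSXMASK, VSXMASK')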
Dropping p8 support would allow the use of the lxvb16x instruction, whose result does not need to be permuted, but that is likewise a negligible performance improvement, and it comes at the cost of dropping a whole CPU set. So I see a few options:

  A) leave as-is, and consider storing the mask in a VSX register
  B) drop p8 support and use lxvb16x
  C) have a compile-time switch: use the permute on p8, and the single
     instruction on p9 and up (a rough sketch of this follows the list)
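If option C) is preferred, one possible shape for the LOAD macro -- HAVE_LXVB16X is a hypothetical configure-provided define, not something this patch adds, and this sketch is untested:

  define(`LOAD', `
  	IF_BE(`lxvw4x	VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), INPUT')
  	IF_LE(`ifdef(`HAVE_LXVB16X', `
  	lxvb16x	VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), INPUT
  	', `
  	lxvd2x	VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), INPUT
  	vperm	IV($1), IV($1), IV($1), VT0
  	')')
  ')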
v3:
  - use the protected zone instead of allocating stack space
  - add GPR constants for multiples of 4 for the loads
    - around +3.4 MB/s for sha256 update
  - move the extend logic to its own macro, called by EXTENDROUND
  - use 8 VSX registers to store the previous state instead of the stack
    - around +11.0 MB/s for sha256 update
 fat-ppc.c                             |  12 +
 powerpc64/fat/sha256-compress-n-2.asm |  36 +++
 powerpc64/p8/sha256-compress-n.asm    | 364 ++++++++++++++++++++++++++
 3 files changed, 412 insertions(+)
 create mode 100644 powerpc64/fat/sha256-compress-n-2.asm
 create mode 100644 powerpc64/p8/sha256-compress-n.asm
diff --git a/fat-ppc.c b/fat-ppc.c
index cd76f7a1..efbeb2ec 100644
--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -203,6 +203,10 @@ DECLARE_FAT_FUNC(_nettle_poly1305_blocks, poly1305_blocks_func)
 DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, c)
 DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, ppc64)
 
+DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, ppc64)
+
 static void CONSTRUCTOR
 fat_init (void)
 {
@@ -231,6 +235,8 @@ fat_init (void)
 	 _nettle_ghash_update_arm64() */
       _nettle_ghash_set_key_vec = _nettle_ghash_set_key_ppc64;
       _nettle_ghash_update_vec = _nettle_ghash_update_ppc64;
+
+      _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_ppc64;
     }
   else
     {
@@ -239,6 +245,7 @@ fat_init (void)
       _nettle_aes_invert_vec = _nettle_aes_invert_c;
       _nettle_ghash_set_key_vec = _nettle_ghash_set_key_c;
       _nettle_ghash_update_vec = _nettle_ghash_update_c;
+      _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_c;
     }
   if (features.have_altivec)
     {
@@ -338,3 +345,8 @@ DEFINE_FAT_FUNC(_nettle_poly1305_blocks, const uint8_t *,
 		(struct poly1305_ctx *ctx,
 		 size_t blocks, const uint8_t *m),
 		(ctx, blocks, m))
+
+DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *,
+		(uint32_t *state, const uint32_t *k,
+		 size_t blocks, const uint8_t *input),
+		(state, k, blocks, input))
diff --git a/powerpc64/fat/sha256-compress-n-2.asm b/powerpc64/fat/sha256-compress-n-2.asm
new file mode 100644
index 00000000..4f4eee9d
--- /dev/null
+++ b/powerpc64/fat/sha256-compress-n-2.asm
@@ -0,0 +1,36 @@
+C powerpc64/fat/sha256-compress-n-2.asm
+
+ifelse(`
+   Copyright (C) 2024 Eric Richter, IBM Corporation
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_sha256_compress_n) picked up by configure
+
+define(`fat_transform', `$1_ppc64')
+include_src(`powerpc64/p8/sha256-compress-n.asm')
diff --git a/powerpc64/p8/sha256-compress-n.asm b/powerpc64/p8/sha256-compress-n.asm
new file mode 100644
index 00000000..c1ce0e8f
--- /dev/null
+++ b/powerpc64/p8/sha256-compress-n.asm
@@ -0,0 +1,364 @@
+C powerpc64/p8/sha256-compress-n.asm
+
+ifelse(`
+   Copyright (C) 2024 Eric Richter, IBM Corporation
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+.file "sha256-compress-n.asm"
+
+C Parameters in
+define(`SP', `r1')
+define(`STATE', `r3')
+define(`K', `r4')
+define(`NUMBLOCKS', `r5')
+define(`INPUT', `r6')
+
+define(`T0', `r7')
+define(`T1', `r8')
+define(`TK', `r9')
+define(`COUNT', `r10')
+define(`TC0', `0')	C Index instructions allow literal 0 instead of a GPR
+define(`TC4', `r11')
+define(`TC8', `r12')
+define(`TC12', `r14')
+define(`TC16', `r15')
+
+C State registers
+define(`VSA', `v0')
+define(`VSB', `v1')
+define(`VSC', `v2')
+define(`VSD', `v3')
+define(`VSE', `v4')
+define(`VSF', `v5')
+define(`VSG', `v6')
+define(`VSH', `v7')
+
+C Previous state value registers stored in VSX
+define(`VSXA', `vs0')
+define(`VSXB', `vs1')
+define(`VSXC', `vs2')
+define(`VSXD', `vs3')
+define(`VSXE', `vs4')
+define(`VSXF', `vs5')
+define(`VSXG', `vs6')
+define(`VSXH', `vs7')
+
+C Current K values
+define(`VK', `v8')
+
+C Temp registers for math
+define(`VT0', `v9')
+define(`VT1', `v10')
+define(`VT2', `v11')
+define(`VT3', `v12')
+define(`VT4', `v13')
+
+C Convenience named registers for sigma(a) and sigma(e)
+define(`SIGA', `v14')
+define(`SIGE', `v15')
+
+C Registers v16-v31 are used for input words W[0] through W[15]
+
+C Convert an index for W[i] to the corresponding vector register v[16 + i]
+define(`IV', `m4_unquote(v`'eval((($1) % 16) + 16))')
+
+C ROUND(A B C D E F G H R EXT)
+define(`ROUND', `
+
+	vadduwm	VT1, VK, IV($9)		C VT1: k+W
+	vadduwm	VT4, $8, VT1		C VT4: H+k+W
+
+	lxvw4x	VSR(VK), TK, K		C Load Key
+	addi	TK, TK, 4		C Increment pointer to next key
+
+	vadduwm	VT2, $4, $8		C VT2: H+D
+	vadduwm	VT2, VT2, VT1		C VT2: H+D+k+W
+
+	vshasigmaw	SIGE, $5, 1, 0b1111	C Sigma(E)  Se
+	vshasigmaw	SIGA, $1, 1, 0		C Sigma(A)  Sa
+
+	vxor	VT3, $2, $3		C VT3: b^c
+	vsel	VT0, $7, $6, $5		C VT0: Ch.
+	vsel	VT3, $3, $1, VT3	C VT3: Maj(a,b,c)
+
+	vadduwm	VT4, VT4, VT0		C VT4: Hkw + Ch.
+	vadduwm	VT3, VT3, VT4		C VT3: HkW + Ch. + Maj.
+
+	vadduwm	VT0, VT0, VT2		C VT0: Ch. + DHKW
+	vadduwm	$8, SIGE, SIGA		C Anext: Se + Sa
+	vadduwm	$4, VT0, SIGE		C Dnext: Ch. + DHKW + Se
+	vadduwm	$8, $8, VT3		C Anext: Se+Sa+HkW+Ch.+Maj.
+')
+
+C Extend W[i]
+define(`EXTEND', `
+	vshasigmaw	SIGE, IV($1 + 14), 0, 0b1111
+	vshasigmaw	SIGA, IV($1 + 1), 0, 0b0000
+	vadduwm	IV($1), IV($1), SIGE
+	vadduwm	IV($1), IV($1), SIGA
+	vadduwm	IV($1), IV($1), IV($1 + 9)
+')
+
+define(`EXTENDROUND', `
+	ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9)
+	C Schedule (data) for 16th round in future
+	EXTEND($9)
+')
+define(`NOEXTENDROUND', `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9)')
+
+define(`NOEXTENDROUNDS', `
+	NOEXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 0)
+	NOEXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 1)
+	NOEXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 2)
+	NOEXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 3)
+
+	NOEXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 4)
+	NOEXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 5)
+	NOEXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 6)
+	NOEXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 7)
+
+	NOEXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 8)
+	NOEXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 9)
+	NOEXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 10)
+	NOEXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 11)
+
+	NOEXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 12)
+	NOEXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 13)
+	NOEXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 14)
+	NOEXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 15)
+')
+
+define(`EXTENDROUNDS', `
+	EXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 0)
+	EXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 1)
+	EXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 2)
+	EXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 3)
+
+	EXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 4)
+	EXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 5)
+	EXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 6)
+	EXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 7)
+
+	EXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 8)
+	EXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 9)
+	EXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 10)
+	EXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 11)
+
+	EXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 12)
+	EXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 13)
+	EXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 14)
+	EXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 15)
+')
+
+define(`LOAD', `
+	IF_BE(`lxvw4x	VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), INPUT')
+	IF_LE(`
+	lxvd2x	VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), INPUT
+	vperm	IV($1), IV($1), IV($1), VT0
+	')
+')
+
+define(`DOLOADS', `
+	IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)')
+	LOAD(0)
+	LOAD(1)
+	LOAD(2)
+	LOAD(3)
+	addi	INPUT, INPUT, 16
+	LOAD(4)
+	LOAD(5)
+	LOAD(6)
+	LOAD(7)
+	addi	INPUT, INPUT, 16
+	LOAD(8)
+	LOAD(9)
+	LOAD(10)
+	LOAD(11)
+	addi	INPUT, INPUT, 16
+	LOAD(12)
+	LOAD(13)
+	LOAD(14)
+	LOAD(15)
+	addi	INPUT, INPUT, 16
+')
+
+.text
+PROLOGUE(_nettle_sha256_compress_n)
+	cmpwi	0, NUMBLOCKS, 0
+	ble	0, .done
+	mtctr	NUMBLOCKS
+
+	C Store non-volatile registers
+
+	li	T0, -8
+	li	T1, -24
+	stvx	v20, T0, SP
+	stvx	v21, T1, SP
+	subi	T0, T0, 32
+	subi	T1, T1, 32
+	stvx	v22, T0, SP
+	stvx	v23, T1, SP
+	subi	T0, T0, 32
+	subi	T1, T1, 32
+	stvx	v24, T0, SP
+	stvx	v25, T1, SP
+	subi	T0, T0, 32
+	subi	T1, T1, 32
+	stvx	v26, T0, SP
+	stvx	v27, T1, SP
+	subi	T0, T0, 32
+	subi	T1, T1, 32
+	stvx	v28, T0, SP
+	stvx	v29, T1, SP
+	subi	T0, T0, 32
+	subi	T1, T1, 32
+	stvx	v30, T0, SP
+	stvx	v31, T1, SP
+	subi	T0, T0, 32
+	subi	T1, T1, 32
+	stdx	r14, T0, SP
+	stdx	r15, T1, SP
+
+	li	TC4, 4
+	li	TC8, 8
+	li	TC12, 12
+	li	TC16, 16
+
+	C Load state values
+	lxvw4x	VSR(VSA), 0, STATE	C VSA contains A,B,C,D
+	lxvw4x	VSR(VSE), TC16, STATE	C VSE contains E,F,G,H
+
+	vsldoi	VSB, VSA, VSA, 4
+	vsldoi	VSF, VSE, VSE, 4
+
+	vsldoi	VSC, VSA, VSA, 8
+	vsldoi	VSG, VSE, VSE, 8
+
+	vsldoi	VSD, VSA, VSA, 12
+	vsldoi	VSH, VSE, VSE, 12
+
+.loop:
+	xxlor	VSXA, VSR(VSA), VSR(VSA)
+	xxlor	VSXB, VSR(VSB), VSR(VSB)
+	xxlor	VSXC, VSR(VSC), VSR(VSC)
+	xxlor	VSXD, VSR(VSD), VSR(VSD)
+	xxlor	VSXE, VSR(VSE), VSR(VSE)
+	xxlor	VSXF, VSR(VSF), VSR(VSF)
+	xxlor	VSXG, VSR(VSG), VSR(VSG)
+	xxlor	VSXH, VSR(VSH), VSR(VSH)
+
+	li	TK, 0
+	lxvw4x	VSR(VK), TK, K
+	addi	TK, TK, 4
+
+	DOLOADS
+
+	C "permute" state from VSA containing A,B,C,D into VSA,VSB,VSC,VSD
+
+	EXTENDROUNDS
+	EXTENDROUNDS
+	EXTENDROUNDS
+	NOEXTENDROUNDS
+
+	C Reload initial state from VSX registers
+	xxlor	VSR(VT0), VSXA, VSXA
+	xxlor	VSR(VT1), VSXB, VSXB
+	xxlor	VSR(VT2), VSXC, VSXC
+	xxlor	VSR(VT3), VSXD, VSXD
+	xxlor	VSR(VT4), VSXE, VSXE
+	xxlor	VSR(SIGA), VSXF, VSXF
+	xxlor	VSR(SIGE), VSXG, VSXG
+	xxlor	VSR(VK), VSXH, VSXH
+
+	vadduwm	VSA, VSA, VT0
+	vadduwm	VSB, VSB, VT1
+	vadduwm	VSC, VSC, VT2
+	vadduwm	VSD, VSD, VT3
+	vadduwm	VSE, VSE, VT4
+	vadduwm	VSF, VSF, SIGA
+	vadduwm	VSG, VSG, SIGE
+	vadduwm	VSH, VSH, VK
+
+	bdnz	.loop
+
+	C Repack VSA,VSB,VSC,VSD into VSA,VSE for storing
+	vmrghw	VSA, VSA, VSB
+	vmrghw	VSC, VSC, VSD
+	vmrghw	VSE, VSE, VSF
+	vmrghw	VSG, VSG, VSH
+
+	xxmrghd	VSR(VSA), VSR(VSA), VSR(VSC)
+	xxmrghd	VSR(VSE), VSR(VSE), VSR(VSG)
+
+	stxvw4x	VSR(VSA), 0, STATE
+	stxvw4x	VSR(VSE), TC16, STATE
+
+
+	C Restore nonvolatile registers
+	li	T0, -8
+	li	T1, -24
+	lvx	v20, T0, SP
+	lvx	v21, T1, SP
+	subi	T0, T0, 32
+	subi	T1, T1, 32
+	lvx	v22, T0, SP
+	lvx	v23, T1, SP
+	subi	T0, T0, 32
+	subi	T1, T1, 32
+	lvx	v24, T0, SP
+	lvx	v25, T1, SP
+	subi	T0, T0, 32
+	subi	T1, T1, 32
+	lvx	v26, T0, SP
+	lvx	v27, T1, SP
+	subi	T0, T0, 32
+	subi	T1, T1, 32
+	lvx	v28, T0, SP
+	lvx	v29, T1, SP
+	subi	T0, T0, 32
+	subi	T1, T1, 32
+	lvx	v30, T0, SP
+	lvx	v31, T1, SP
+	subi	T0, T0, 32
+	subi	T1, T1, 32
+	ldx	r14, T0, SP
+	ldx	r15, T1, SP
+
+.done:
+	mr	r3, INPUT
+
+	blr
+EPILOGUE(_nettle_sha256_compress_n)
+
+IF_LE(`
+.data
+.align 4
+.load_swap:
+	.byte 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7
+')