According to the ABI, the stack pointer is quadword aligned, so starting the stack storage at offset -8 may cause the return address to be stepped on. Adjust to use -16 as the starting point instead, which also matches other POWER assembly code.
Signed-off-by: Eric Richter <erichte@linux.ibm.com>
---
 powerpc64/p8/sha256-compress-n.asm | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/powerpc64/p8/sha256-compress-n.asm b/powerpc64/p8/sha256-compress-n.asm
index 309db1fa..e08ae132 100644
--- a/powerpc64/p8/sha256-compress-n.asm
+++ b/powerpc64/p8/sha256-compress-n.asm
@@ -216,8 +216,8 @@ PROLOGUE(_nettle_sha256_compress_n)
C Store non-volatile registers
-	li T0, -8
-	li T1, -24
+	li T0, -16
+	li T1, -32
 	stvx v20, T0, SP
 	stvx v21, T1, SP
 	subi T0, T0, 32
@@ -321,8 +321,8 @@ PROLOGUE(_nettle_sha256_compress_n)
 	C Restore nonvolatile registers
-	li T0, -8
-	li T1, -24
+	li T0, -16
+	li T1, -32
 	lvx v20, T0, SP
 	lvx v21, T1, SP
 	subi T0, T0, 32
This patch introduces an optimized powerpc64 assembly implementation for sha512-compress, derived from the implementation for sha256-compress-n.
The following data was captured on a POWER10 LPAR @ ~3.896 GHz:
Current C implementation:

Algorithm      mode         Mbyte/s
sha512         update        474.00
sha512_224     update        474.61
sha512_256     update        474.15
hmac-sha512    64 bytes      104.08
hmac-sha512    256 bytes     220.42
hmac-sha512    1024 bytes    368.58
hmac-sha512    4096 bytes    436.27
hmac-sha512    single msg    460.10
With optimized assembly:

Algorithm      mode         Mbyte/s
sha512         update        746.96
sha512_224     update        746.96
sha512_256     update        746.93
hmac-sha512    64 bytes      150.54
hmac-sha512    256 bytes     327.58
hmac-sha512    1024 bytes    562.49
hmac-sha512    4096 bytes    677.38
hmac-sha512    single msg    713.06
Signed-off-by: Eric Richter <erichte@linux.ibm.com>
---
> Why the offsets -8 and -24?
That was largely my own misinterpretation of the ABI; -16 looks like the correct starting spot. I've adjusted it in this patch, and sent a separate patch to fix it in sha256.
> the gpr register usage could be trimmed a bit
Thanks for the STATE32 suggestion; that removed the need for one GPR. I then discovered that the COUNT register reservation had been needlessly copied over from the sha256 implementation, which saved another.
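For reference, the state loads went from dedicated constant registers to reusing TC16 against a second base pointer; roughly (v1 side reconstructed from memory, so just a sketch):

    C v1 (sketch): constants 32 and 48 each held in their own GPR
    lxvd2x VSR(VSE), TC32, STATE     C E, F
    lxvd2x VSR(VSG), TC48, STATE     C G, H

    C v2: STATE32 = STATE + 32, indexed with literal 0 and TC16
    addi   STATE32, STATE, 32
    lxvd2x VSR(VSE), 0, STATE32      C E, F
    lxvd2x VSR(VSG), TC16, STATE32   C G, H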
This keeps all GPR usage within volatile registers. I can still make the change to reuse T0/T1 for TK to drop at least one more, if preferred.
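Concretely, that could look something like this (untested sketch): after the non-volatile register saves, T0 is otherwise unused until the epilogue, so it could take over TK's role as the running key byte offset:

    C after the prologue saves, reuse T0 instead of a dedicated TK
    li     T0, 0
    lxvd2x VSR(VK), T0, K        C load first K value
    addi   T0, T0, 8

with the corresponding load inside ROUND becoming:

    lxvd2x VSR(VK), T0, K        C Load Key
    addi   T0, T0, 8             C Increment pointer to next key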
Summarized changes from v1:
- replace TC32, TC48 with STATE32 and STATE32 + TC16
- remove unused COUNT register reservation
- adjust stack storage offset starting point to -16
 fat-ppc.c                           |  10 +
 powerpc64/fat/sha512-compress-2.asm |  36 +++
 powerpc64/p8/sha512-compress.asm    | 326 ++++++++++++++++++++++++++++
 3 files changed, 372 insertions(+)
 create mode 100644 powerpc64/fat/sha512-compress-2.asm
 create mode 100644 powerpc64/p8/sha512-compress.asm
diff --git a/fat-ppc.c b/fat-ppc.c
index aaccc116..5b6efd10 100644
--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -215,6 +215,10 @@ DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func)
 DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c)
 DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, ppc64)
+DECLARE_FAT_FUNC(_nettle_sha512_compress, sha512_compress_func)
+DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, c)
+DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, ppc64)
+
 /* Nop implementation for _gcm_aes_encrypt and _gcm_aes_decrypt. */
 static size_t
 gcm_aes_crypt_c (struct gcm_key *key UNUSED, unsigned rounds UNUSED,
@@ -253,6 +257,7 @@ fat_init (void)
       _nettle_gcm_aes_encrypt_vec = _nettle_gcm_aes_encrypt_ppc64;
       _nettle_gcm_aes_decrypt_vec = _nettle_gcm_aes_decrypt_ppc64;
       _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_ppc64;
+      _nettle_sha512_compress_vec = _nettle_sha512_compress_ppc64;
     }
   else
     {
@@ -264,6 +269,7 @@ fat_init (void)
       _nettle_gcm_aes_encrypt_vec = gcm_aes_crypt_c;
       _nettle_gcm_aes_decrypt_vec = gcm_aes_crypt_c;
       _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_c;
+      _nettle_sha512_compress_vec = _nettle_sha512_compress_c;
     }
   if (features.have_altivec)
     {
@@ -378,3 +384,7 @@ DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *,
 		(uint32_t *state, const uint32_t *k, size_t blocks,
 		 const uint8_t *input),
 		(state, k, blocks, input))
+
+DEFINE_FAT_FUNC(_nettle_sha512_compress, void,
+		(uint64_t *state, const uint8_t *input, const uint64_t *k),
+		(state, input, k))
diff --git a/powerpc64/fat/sha512-compress-2.asm b/powerpc64/fat/sha512-compress-2.asm
new file mode 100644
index 00000000..9445e5ba
--- /dev/null
+++ b/powerpc64/fat/sha512-compress-2.asm
@@ -0,0 +1,36 @@
+C powerpc64/fat/sha512-compress-2.asm
+
+ifelse(`
+   Copyright (C) 2024 Eric Richter, IBM Corporation
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_sha512_compress) picked up by configure
+
+define(`fat_transform', `$1_ppc64')
+include_src(`powerpc64/p8/sha512-compress.asm')
diff --git a/powerpc64/p8/sha512-compress.asm b/powerpc64/p8/sha512-compress.asm
new file mode 100644
index 00000000..bf8c1f8c
--- /dev/null
+++ b/powerpc64/p8/sha512-compress.asm
@@ -0,0 +1,326 @@
+C powerpc64/p8/sha512-compress.asm
+
+ifelse(`
+   Copyright (C) 2024 Eric Richter, IBM Corporation
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+.file "sha512-compress.asm"
+
+C Parameters in
+define(`SP', `r1')
+define(`STATE', `r3')
+define(`INPUT', `r4')
+define(`K', `r5')
+
+define(`T0', `r6')
+define(`T1', `r7')
+define(`TK', `r8')
+define(`TC0', `0')	C Index instructions allow literal 0 instead of a GPR
+define(`TC8', `r9')
+define(`TC16', `r10')
+define(`TC24', `r11')
+define(`STATE32', `r12')
+
+C State registers
+define(`VSA', `v0')
+define(`VSB', `v1')
+define(`VSC', `v2')
+define(`VSD', `v3')
+define(`VSE', `v4')
+define(`VSF', `v5')
+define(`VSG', `v6')
+define(`VSH', `v7')
+
+C Previous state value registers stored in VSX
+define(`VSXAB', `vs0')
+define(`VSXCD', `vs1')
+define(`VSXEF', `vs2')
+define(`VSXGH', `vs3')
+
+C Current K values
+define(`VK', `v8')
+
+C Temp registers for math
+define(`VT0', `v9')
+define(`VT1', `v10')
+define(`VT2', `v11')
+define(`VT3', `v12')
+define(`VT4', `v13')
+
+C Convenience named registers for sigma(a) and sigma(e)
+define(`SIGA', `v14')
+define(`SIGE', `v15')
+
+C Registers v16-v31 are used for input words W[0] through W[15]
+
+C Convert an index for W[i] to the corresponding vector register v[16 + i]
+define(`IV', `m4_unquote(v`'eval((($1) % 16) + 16))')
+
+C ROUND(A B C D E F G H R)
+define(`ROUND', `
+
+	vaddudm	VT1, VK, IV($9)       C VT1: k+W
+	vaddudm	VT4, $8, VT1          C VT4: H+k+W
+
+	lxvd2x	VSR(VK), TK, K        C Load Key
+	addi	TK, TK, 8             C Increment Pointer to next key
+
+	vaddudm	VT2, $4, $8           C VT2: H+D
+	vaddudm	VT2, VT2, VT1         C VT2: H+D+k+W
+
+	vshasigmad	SIGE, $5, 1, 0b1111   C Sigma(E)  Se
+	vshasigmad	SIGA, $1, 1, 0        C Sigma(A)  Sa
+
+	vxor	VT3, $2, $3           C VT3: b^c
+	vsel	VT0, $7, $6, $5       C VT0: Ch.
+	vsel	VT3, $3, $1, VT3      C VT3: Maj(a,b,c)
+
+	vaddudm	VT4, VT4, VT0         C VT4: Hkw + Ch.
+	vaddudm	VT3, VT3, VT4         C VT3: HkW + Ch. + Maj.
+
+	vaddudm	VT0, VT0, VT2         C VT0: Ch. + DHKW
+	vaddudm	$8, SIGE, SIGA        C Anext: Se + Sa
+	vaddudm	$4, VT0, SIGE         C Dnext: Ch. + DHKW + Se
+	vaddudm	$8, $8, VT3           C Anext: Se+Sa+HkW+Ch.+Maj.
+')
+
+C Extend W[i]
+define(`EXTEND', `
+	vshasigmad	SIGE, IV($1 + 14), 0, 0b1111
+	vshasigmad	SIGA, IV($1 + 1), 0, 0b0000
+	vaddudm	IV($1), IV($1), SIGE
+	vaddudm	IV($1), IV($1), SIGA
+	vaddudm	IV($1), IV($1), IV($1 + 9)
+')
+
+define(`EXTENDROUND', `
+	ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9)
+	C Schedule (data) for 16th round in future
+	EXTEND($9)
+')
+define(`NOEXTENDROUND', `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9)')
+
+define(`NOEXTENDROUNDS', `
+	NOEXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 0)
+	NOEXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 1)
+	NOEXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 2)
+	NOEXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 3)
+
+	NOEXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 4)
+	NOEXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 5)
+	NOEXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 6)
+	NOEXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 7)
+
+	NOEXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 8)
+	NOEXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 9)
+	NOEXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 10)
+	NOEXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 11)
+
+	NOEXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 12)
+	NOEXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 13)
+	NOEXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 14)
+	NOEXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 15)
+')
+
+define(`EXTENDROUNDS', `
+	EXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 0)
+	EXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 1)
+	EXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 2)
+	EXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 3)
+
+	EXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 4)
+	EXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 5)
+	EXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 6)
+	EXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 7)
+
+	EXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 8)
+	EXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 9)
+	EXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 10)
+	EXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 11)
+
+	EXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 12)
+	EXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 13)
+	EXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 14)
+	EXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 15)
+')
+
+define(`LOAD', `
+	IF_BE(`lxvd2x VSR(IV($1)), $2, INPUT')
+	IF_LE(`
+		lxvd2x VSR(IV($1)), $2, INPUT
+		vperm IV($1), IV($1), IV($1), VT0
+	')
+')
+
+define(`DOLOADS', `
+	IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)')
+	LOAD(0, TC0)
+	LOAD(1, TC8)
+	LOAD(2, TC16)
+	LOAD(3, TC24)
+	addi INPUT, INPUT, 32
+	LOAD(4, TC0)
+	LOAD(5, TC8)
+	LOAD(6, TC16)
+	LOAD(7, TC24)
+	addi INPUT, INPUT, 32
+	LOAD(8, TC0)
+	LOAD(9, TC8)
+	LOAD(10, TC16)
+	LOAD(11, TC24)
+	addi INPUT, INPUT, 32
+	LOAD(12, TC0)
+	LOAD(13, TC8)
+	LOAD(14, TC16)
+	LOAD(15, TC24)
+')
+
+.text
+PROLOGUE(_nettle_sha512_compress)
+	C Store non-volatile registers
+
+	li T0, -16
+	li T1, -32
+	stvx v20, T0, SP
+	stvx v21, T1, SP
+	subi T0, T0, 32
+	subi T1, T1, 32
+	stvx v22, T0, SP
+	stvx v23, T1, SP
+	subi T0, T0, 32
+	subi T1, T1, 32
+	stvx v24, T0, SP
+	stvx v25, T1, SP
+	subi T0, T0, 32
+	subi T1, T1, 32
+	stvx v26, T0, SP
+	stvx v27, T1, SP
+	subi T0, T0, 32
+	subi T1, T1, 32
+	stvx v28, T0, SP
+	stvx v29, T1, SP
+	subi T0, T0, 32
+	subi T1, T1, 32
+	stvx v30, T0, SP
+	stvx v31, T1, SP
+
+	li TC8, 8
+	li TC16, 16
+	li TC24, 24
+
+	addi STATE32, STATE, 32
+
+	C Load state values
+	lxvd2x VSR(VSA), 0, STATE        C VSA contains A, B
+	lxvd2x VSR(VSC), TC16, STATE     C VSC contains C, D
+	lxvd2x VSR(VSE), 0, STATE32      C VSE contains E, F
+	lxvd2x VSR(VSG), TC16, STATE32   C VSG contains G, H
+
+	C Temporarily store the original state values in VSX registers
+	xxlor VSXAB, VSR(VSA), VSR(VSA)
+	xxlor VSXCD, VSR(VSC), VSR(VSC)
+	xxlor VSXEF, VSR(VSE), VSR(VSE)
+	xxlor VSXGH, VSR(VSG), VSR(VSG)
+
+	C Shift second state value into its own state register
+	vsldoi VSB, VSA, VSA, 8
+	vsldoi VSD, VSC, VSC, 8
+	vsldoi VSF, VSE, VSE, 8
+	vsldoi VSH, VSG, VSG, 8
+
+	li TK, 0
+	lxvd2x VSR(VK), TK, K
+	addi TK, TK, 8
+
+	DOLOADS
+
+	EXTENDROUNDS
+	EXTENDROUNDS
+	EXTENDROUNDS
+	EXTENDROUNDS
+	NOEXTENDROUNDS
+
+	C Reload initial state from VSX registers
+	xxlor VSR(VT0), VSXAB, VSXAB
+	xxlor VSR(VT1), VSXCD, VSXCD
+	xxlor VSR(VT2), VSXEF, VSXEF
+	xxlor VSR(VT3), VSXGH, VSXGH
+
+	C Repack state values to two per register for storing
+	xxmrghd VSR(VSA), VSR(VSA), VSR(VSB)
+	xxmrghd VSR(VSC), VSR(VSC), VSR(VSD)
+	xxmrghd VSR(VSE), VSR(VSE), VSR(VSF)
+	xxmrghd VSR(VSG), VSR(VSG), VSR(VSH)
+
+	C Perform the final add of the original state values
+	vaddudm VSA, VSA, VT0
+	vaddudm VSC, VSC, VT1
+	vaddudm VSE, VSE, VT2
+	vaddudm VSG, VSG, VT3
+
+	stxvd2x VSR(VSA), 0, STATE
+	stxvd2x VSR(VSC), TC16, STATE
+	stxvd2x VSR(VSE), 0, STATE32
+	stxvd2x VSR(VSG), TC16, STATE32
+
+	C Restore nonvolatile registers
+	li T0, -16
+	li T1, -32
+	lvx v20, T0, SP
+	lvx v21, T1, SP
+	subi T0, T0, 32
+	subi T1, T1, 32
+	lvx v22, T0, SP
+	lvx v23, T1, SP
+	subi T0, T0, 32
+	subi T1, T1, 32
+	lvx v24, T0, SP
+	lvx v25, T1, SP
+	subi T0, T0, 32
+	subi T1, T1, 32
+	lvx v26, T0, SP
+	lvx v27, T1, SP
+	subi T0, T0, 32
+	subi T1, T1, 32
+	lvx v28, T0, SP
+	lvx v29, T1, SP
+	subi T0, T0, 32
+	subi T1, T1, 32
+	lvx v30, T0, SP
+	lvx v31, T1, SP
+
+	blr
+EPILOGUE(_nettle_sha512_compress)
+
+IF_LE(`
+.data
+.align 4
+.load_swap:
+	.byte 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7
+')
Eric Richter <erichte@linux.ibm.com> writes:
> According to the ABI, the stack pointer is quadword aligned, so starting
> the stack storage at offset -8, may cause the return address to be
> stepped on. Adjusting to use -16 as the starting point, which also
> matches other POWER assembly code.
Thanks, applied!
I've noticed one more memory access issue when re-reading this code. The loading of the input data is done using
define(`LOAD', `
	IF_BE(`lxvw4x VSR(IV($1)), $2, INPUT')
	IF_LE(`
		lxvd2x VSR(IV($1)), $2, INPUT
		vperm IV($1), IV($1), IV($1), VT0
	')
')
[...]
	LOAD(0, TC0)
	LOAD(1, TC4)
	LOAD(2, TC8)
	LOAD(3, TC12)
[...]
As I understand this, like for the state registers, we only use 32 bits of each of the vector registers representing the input block being expanded (it would be nice if we could find a more compact representation without complicating the input expansion logic, but that may be quite difficult).
So we read the 16 bytes at INPUT into register v16, using the first 4 of those bytes, then the 16 bytes at INPUT+4 into v17, using the first 4 bytes, etc.
So we do overlapping reads, and at the end we'll read 12 bytes beyond the end of the input buffer?
I think it should be possible to replace this with something like
	LOAD(0, TC0)
	vsldoi IV(1), IV(0), IV(0), 4
	vsldoi IV(2), IV(0), IV(0), 8
	vsldoi IV(3), IV(0), IV(0), 12
	LOAD(4, TC16)
[...]
Do you agree? We could then eliminate some of the TC registers as well.
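Spelled out a bit more (untested, and assuming a TC16 index is available), the load sequence for one block might then look like:

	LOAD(0, TC0)
	vsldoi IV(1), IV(0), IV(0), 4
	vsldoi IV(2), IV(0), IV(0), 8
	vsldoi IV(3), IV(0), IV(0), 12
	LOAD(4, TC16)
	vsldoi IV(5), IV(4), IV(4), 4
	vsldoi IV(6), IV(4), IV(4), 8
	vsldoi IV(7), IV(4), IV(4), 12
	addi INPUT, INPUT, 32
	LOAD(8, TC0)
	vsldoi IV(9), IV(8), IV(8), 4
	vsldoi IV(10), IV(8), IV(8), 8
	vsldoi IV(11), IV(8), IV(8), 12
	LOAD(12, TC16)
	vsldoi IV(13), IV(12), IV(12), 4
	vsldoi IV(14), IV(12), IV(12), 8
	vsldoi IV(15), IV(12), IV(12), 12

i.e., four 16-byte loads per block instead of sixteen overlapping ones, needing only the 0 and 16 offsets. The INPUT bookkeeping for the multi-block loop would have to be adjusted to match, and the LE permute would need double-checking to confirm the words still land in the expected lane.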
Regards, /Niels