I've updated this set to use the proper conventions for register names, and also adjusted the IV macro according to the suggestions provided.
I can also confirm that I've set up a working build environment based on the approach in the GitLab CI configuration, and that the ppc64 big-endian build does indeed pass tests.
Amended original cover letter:
This set introduces an optimized powerpc64 assembly implementation for SHA256 and SHA512. It has been derived from BSD-2-Clause licensed code authored by IBM, originally released in the IBM POWER Cryptography Reference Implementation project[1], modified to work in Nettle, and is contributed under the GPL license.
Development of this new implementation targeted POWER 10; however, it supports the POWER 8 and later ISA. The following commits provide the performance data I recorded on POWER 10, though similar improvements can be seen on P8/P9.
I have tested this patch set on POWER 8 and POWER 10 hardware running little-endian Linux distributions, and via qemu-user for big-endian ppc64.
Eric Richter (2):
  powerpc64: Add optimized assembly for sha256-compress-n
  powerpc64: Add optimized assembly for sha512-compress-n
 fat-ppc.c                             |  22 ++
 powerpc64/fat/sha256-compress-n-2.asm |  36 +++
 powerpc64/fat/sha512-compress-2.asm   |  36 +++
 powerpc64/p8/sha256-compress-n.asm    | 323 +++++++++++++++++++++++++
 powerpc64/p8/sha512-compress.asm      | 327 ++++++++++++++++++++++++++
 5 files changed, 744 insertions(+)
 create mode 100644 powerpc64/fat/sha256-compress-n-2.asm
 create mode 100644 powerpc64/fat/sha512-compress-2.asm
 create mode 100644 powerpc64/p8/sha256-compress-n.asm
 create mode 100644 powerpc64/p8/sha512-compress.asm
This patch introduces an optimized powerpc64 assembly implementation for sha256-compress-n. It takes advantage of the vshasigma instruction, and unrolls loops so that independent instructions can run in parallel.
The following data was captured on a POWER 10 LPAR @ ~3.896GHz
Current C implementation:

  Algorithm    mode        Mbyte/s
  sha256       update       280.97
  hmac-sha256  64 bytes      80.81
  hmac-sha256  256 bytes    170.50
  hmac-sha256  1024 bytes   241.92
  hmac-sha256  4096 bytes   268.54
  hmac-sha256  single msg   276.16
With optimized assembly:

  Algorithm    mode        Mbyte/s
  sha256       update       446.42
  hmac-sha256  64 bytes     124.89
  hmac-sha256  256 bytes    268.90
  hmac-sha256  1024 bytes   382.06
  hmac-sha256  4096 bytes   425.38
  hmac-sha256  single msg   439.75
Signed-off-by: Eric Richter <erichte@linux.ibm.com>
---
 fat-ppc.c                             |  12 +
 powerpc64/fat/sha256-compress-n-2.asm |  36 +++
 powerpc64/p8/sha256-compress-n.asm    | 323 ++++++++++++++++++++++++++
 3 files changed, 371 insertions(+)
 create mode 100644 powerpc64/fat/sha256-compress-n-2.asm
 create mode 100644 powerpc64/p8/sha256-compress-n.asm
diff --git a/fat-ppc.c b/fat-ppc.c index cd76f7a1..efbeb2ec 100644 --- a/fat-ppc.c +++ b/fat-ppc.c @@ -203,6 +203,10 @@ DECLARE_FAT_FUNC(_nettle_poly1305_blocks, poly1305_blocks_func) DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, c) DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, ppc64)
+DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func) +DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c) +DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, ppc64) +
static void CONSTRUCTOR fat_init (void) @@ -231,6 +235,8 @@ fat_init (void) _nettle_ghash_update_arm64() */ _nettle_ghash_set_key_vec = _nettle_ghash_set_key_ppc64; _nettle_ghash_update_vec = _nettle_ghash_update_ppc64; + + _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_ppc64; } else { @@ -239,6 +245,7 @@ fat_init (void) _nettle_aes_invert_vec = _nettle_aes_invert_c; _nettle_ghash_set_key_vec = _nettle_ghash_set_key_c; _nettle_ghash_update_vec = _nettle_ghash_update_c; + _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_c; } if (features.have_altivec) { @@ -338,3 +345,8 @@ DEFINE_FAT_FUNC(_nettle_poly1305_blocks, const uint8_t *, size_t blocks, const uint8_t *m), (ctx, blocks, m)) + +DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *, + (uint32_t *state, const uint32_t *k, + size_t blocks, const uint8_t *input), + (state, k, blocks, input)) diff --git a/powerpc64/fat/sha256-compress-n-2.asm b/powerpc64/fat/sha256-compress-n-2.asm new file mode 100644 index 00000000..4f4eee9d --- /dev/null +++ b/powerpc64/fat/sha256-compress-n-2.asm @@ -0,0 +1,36 @@ +C powerpc64/fat/sha256-compress-n-2.asm + +ifelse(` + Copyright (C) 2024 Eric Richter, IBM Corporation + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +dnl PROLOGUE(_nettle_sha256_compress_n) picked up by configure + +define(`fat_transform', `$1_ppc64') +include_src(`powerpc64/p8/sha256-compress-n.asm') diff --git a/powerpc64/p8/sha256-compress-n.asm b/powerpc64/p8/sha256-compress-n.asm new file mode 100644 index 00000000..d76f337e --- /dev/null +++ b/powerpc64/p8/sha256-compress-n.asm @@ -0,0 +1,323 @@ +C x86_64/sha256-compress-n.asm + +ifelse(` + Copyright (C) 2024 Eric Richter, IBM Corporation + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + +.file "sha256-compress-n.asm" + +C Parameters in +define(`SP', `r1') +define(`STATE', `r3') +define(`K', `r4') +define(`NUMBLOCKS', `r5') +define(`INPUT', `r6') + +define(`T0', `r7') +define(`T1', `r8') +define(`TK', `r9') +define(`COUNT', `r10') + +C State registers +define(`VSA', `v0') +define(`VSB', `v1') +define(`VSC', `v2') +define(`VSD', `v3') +define(`VSE', `v4') +define(`VSF', `v5') +define(`VSG', `v6') +define(`VSH', `v7') + +C Current K values +define(`VK', `v8') + +C Temp registers for math +define(`VT0', `v9') +define(`VT1', `v10') +define(`VT2', `v11') +define(`VT3', `v12') +define(`VT4', `v13') + +C Convenience named registers for sigma(a) and sigma(e) +define(`SIGA', `v14') +define(`SIGE', `v15') + +C Registers v16-v31 are used for input words W[0] through W[15] + +C Convert an index for W[i] to the corresponding vector register v[16 + i] +define(`IV', `m4_unquote(v`'eval((($1) % 16) + 16))') + +C ROUND(A B C D E F G H R EXT) +define(`ROUND', ` + + vadduwm VT1, VK, IV($9) C VT1: k+W + vadduwm VT4, $8, VT1 C VT4: H+k+W + + lxvw4x VSR(VK), TK, K C Load Key + addi TK, TK, 4 C Increment Pointer to next key + + vadduwm VT2, $4, $8 C VT2: H+D + vadduwm VT2, VT2, VT1 C VT2: H+D+k+W + + vshasigmaw SIGE, $5, 1, 0b1111 C Sigma(E) Se + vshasigmaw SIGA, $1, 1, 0 C Sigma(A) Sa + + vxor VT3, $2, $3 C VT3: b^c + vsel VT0, $7, $6, $5 C VT0: Ch. + vsel VT3, $3, $1, VT3 C VT3: Maj(a,b,c) + + vadduwm VT4, VT4, VT0 C VT4: Hkw + Ch. + vadduwm VT3, VT3, VT4 C VT3: HkW + Ch. + Maj. + + vadduwm VT0, VT0, VT2 C VT0: Ch. + DHKW + vadduwm $8, SIGE, SIGA C Anext: Se + Sa + vadduwm $4, VT0, SIGE C Dnext: Ch. + DHKW + Se + vadduwm $8, $8, VT3 C Anext: Se+Sa+HkW+Ch.+Maj. + + + C Schedule (data) for 16th round in future + C Extend W[i] + ifelse(`$10', `1', ` + vshasigmaw SIGE, IV($9 + 14), 0, 0b1111 + vshasigmaw SIGA, IV($9 + 1), 0, 0b0000 + vadduwm IV($9), IV($9), SIGE + vadduwm IV($9), IV($9), SIGA + vadduwm IV($9), IV($9), IV($9 + 9) + ') +') + +define(`EXTENDROUND', `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9, 1)') +define(`NOEXTENDROUND', `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9, 0)') + +define(`NOEXTENDROUNDS', ` + NOEXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 0) + NOEXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 1) + NOEXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 2) + NOEXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 3) + + NOEXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 4) + NOEXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 5) + NOEXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 6) + NOEXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 7) + + NOEXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 8) + NOEXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 9) + NOEXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 10) + NOEXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 11) + + NOEXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 12) + NOEXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 13) + NOEXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 14) + NOEXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 15) +') + +define(`EXTENDROUNDS', ` + EXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 0) + EXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 1) + EXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 2) + EXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 3) + + EXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 4) + EXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 5) + EXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 6) + EXTENDROUND(VSB, 
VSC, VSD, VSE, VSF, VSG, VSH, VSA, 7) + + EXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 8) + EXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 9) + EXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 10) + EXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 11) + + EXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 12) + EXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 13) + EXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 14) + EXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 15) +') + +define(`LOAD', ` + IF_BE(`lxvw4x VSR(IV($1)), 0, INPUT') + IF_LE(` + lxvd2x VSR(IV($1)), 0, INPUT + vperm IV($1), IV($1), IV($1), VT0 + ') + addi INPUT, INPUT, 4 +') + +define(`DOLOADS', ` + IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)') + LOAD(0) + LOAD(1) + LOAD(2) + LOAD(3) + + LOAD(4) + LOAD(5) + LOAD(6) + LOAD(7) + + LOAD(8) + LOAD(9) + LOAD(10) + LOAD(11) + + LOAD(12) + LOAD(13) + LOAD(14) + LOAD(15) +') + +.text +PROLOGUE(_nettle_sha256_compress_n) + cmpwi 0, NUMBLOCKS, 0 + ble 0, .done + mtctr NUMBLOCKS + + C Store non-volatile registers + subi SP, SP, 64+(12*16) + std T0, 24(SP) + std T1, 16(SP) + std COUNT, 8(SP) + + li T0, 32 + stvx v20, 0, SP + subi T0, T0, 16 + stvx v21, T0, SP + subi T0, T0, 16 + stvx v22, T0, SP + subi T0, T0, 16 + stvx v23, T0, SP + subi T0, T0, 16 + stvx v24, T0, SP + subi T0, T0, 16 + stvx v25, T0, SP + subi T0, T0, 16 + stvx v26, T0, SP + subi T0, T0, 16 + stvx v27, T0, SP + subi T0, T0, 16 + stvx v28, T0, SP + subi T0, T0, 16 + stvx v29, T0, SP + subi T0, T0, 16 + stvx v30, T0, SP + subi T0, T0, 16 + stvx v31, T0, SP + + C Load state values + li T0, 16 + lxvw4x VSR(VSA), 0, STATE C VSA contains A,B,C,D + lxvw4x VSR(VSE), T0, STATE C VSE contains E,F,G,H + +.loop: + li TK, 0 + lxvw4x VSR(VK), TK, K + addi TK, TK, 4 + + DOLOADS + + C "permute" state from VSA containing A,B,C,D into VSA,VSB,VSC,VSD + vsldoi VSB, VSA, VSA, 4 + vsldoi VSF, VSE, VSE, 4 + + vsldoi VSC, VSA, VSA, 8 + vsldoi VSG, VSE, VSE, 8 + + vsldoi VSD, VSA, VSA, 12 + vsldoi VSH, VSE, VSE, 12 + + EXTENDROUNDS + EXTENDROUNDS + EXTENDROUNDS + NOEXTENDROUNDS + + C Reload initial state from stack + li T0, 16 + lxvw4x VSR(VT0), 0, STATE C VSA contains A,B,C,D + lxvw4x VSR(VT1), T0, STATE C VSE contains E,F,G,H + + C Repack VSA,VSB,VSC,VSD into VSA,VSE for storing + vmrghw VSA, VSA, VSB + vmrghw VSC, VSC, VSD + vmrghw VSE, VSE, VSF + vmrghw VSG, VSG, VSH + + xxmrghd VSR(VSA), VSR(VSA), VSR(VSC) + xxmrghd VSR(VSE), VSR(VSE), VSR(VSG) + + vadduwm VSA, VSA, VT0 + vadduwm VSE, VSE, VT1 + + li T0, 16 + stxvw4x VSR(VSA), 0, STATE + stxvw4x VSR(VSE), T0, STATE + + bdnz .loop + + C Restore nonvolatile registers + li T0, 32 + lvx v20, 0, SP + subi T0, T0, 16 + lvx v21, T0, SP + subi T0, T0, 16 + lvx v22, T0, SP + subi T0, T0, 16 + lvx v23, T0, SP + subi T0, T0, 16 + lvx v24, T0, SP + subi T0, T0, 16 + lvx v25, T0, SP + subi T0, T0, 16 + lvx v26, T0, SP + subi T0, T0, 16 + lvx v27, T0, SP + subi T0, T0, 16 + lvx v28, T0, SP + subi T0, T0, 16 + lvx v29, T0, SP + subi T0, T0, 16 + lvx v30, T0, SP + subi T0, T0, 16 + lvx v31, T0, SP + + ld T0, 24(SP) + ld T1, 16(SP) + ld COUNT, 8(SP) + addi SP, SP, 64+(12*16) + +.done: + mr r3, INPUT + + blr +EPILOGUE(_nettle_sha256_compress_n) + +IF_LE(` +.data +.align 4 +.load_swap: + .byte 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 +')
Eric Richter erichte@linux.ibm.com writes:
This patch introduces an optimized powerpc64 assembly implementation for sha256-compress-n. This takes advantage of the vshasigma instruction, as well as unrolling loops to best take advantage of running instructions in parallel.
Thanks. I'm now having a closer read of the assembly code. Comments below.
+C ROUND(A B C D E F G H R EXT) +define(`ROUND', `
- vadduwm VT1, VK, IV($9) C VT1: k+W
- vadduwm VT4, $8, VT1 C VT4: H+k+W
- lxvw4x VSR(VK), TK, K C Load Key
- addi TK, TK, 4 C Increment Pointer to next key
- vadduwm VT2, $4, $8 C VT2: H+D
- vadduwm VT2, VT2, VT1 C VT2: H+D+k+W
- vshasigmaw SIGE, $5, 1, 0b1111 C Sigma(E) Se
- vshasigmaw SIGA, $1, 1, 0 C Sigma(A) Sa
- vxor VT3, $2, $3 C VT3: b^c
- vsel VT0, $7, $6, $5 C VT0: Ch.
- vsel VT3, $3, $1, VT3 C VT3: Maj(a,b,c)
- vadduwm VT4, VT4, VT0 C VT4: Hkw + Ch.
- vadduwm VT3, VT3, VT4 C VT3: HkW + Ch. + Maj.
- vadduwm VT0, VT0, VT2 C VT0: Ch. + DHKW
- vadduwm $8, SIGE, SIGA C Anext: Se + Sa
- vadduwm $4, VT0, SIGE C Dnext: Ch. + DHKW + Se
- vadduwm $8, $8, VT3 C Anext: Se+Sa+HkW+Ch.+Maj.
- C Schedule (data) for 16th round in future
- C Extend W[i]
- ifelse(`$10', `1', `
vshasigmaw SIGE, IV($9 + 14), 0, 0b1111
vshasigmaw SIGA, IV($9 + 1), 0, 0b0000
vadduwm IV($9), IV($9), SIGE
vadduwm IV($9), IV($9), SIGA
vadduwm IV($9), IV($9), IV($9 + 9)
- ')
+')
I think it would be a bit simpler to break the extend logic out into its own macro.
+define(`EXTENDROUND', `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9, 1)')
If you do that, then you would define
define(`EXTENDROUND', `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9) EXTEND($9)')
(In other related code, input expansion is done at the beginning of a round iteration rather than at the end, but doing at the end like you do may be better scheduling).
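Roughly, reusing the code already in the patch, the split could look something like this (a sketch only, not tested):

  define(`EXTEND', `
    vshasigmaw SIGE, IV($1 + 14), 0, 0b1111
    vshasigmaw SIGA, IV($1 + 1), 0, 0b0000
    vadduwm IV($1), IV($1), SIGE
    vadduwm IV($1), IV($1), SIGA
    vadduwm IV($1), IV($1), IV($1 + 9)
  ')

  define(`EXTENDROUND',   `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9) EXTEND($9)')
  define(`NOEXTENDROUND', `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9)')

with ROUND then dropping its tenth argument and the trailing ifelse.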
+define(`LOAD', `
- IF_BE(`lxvw4x VSR(IV($1)), 0, INPUT')
- IF_LE(`
lxvd2x VSR(IV($1)), 0, INPUT
vperm IV($1), IV($1), IV($1), VT0
- ')
- addi INPUT, INPUT, 4
+')
+define(`DOLOADS', `
- IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)')
Could you have a dedicated register for the permutation constant, and load it only once at function entry? If you have general registers to spare, it could also make sense to use, e.g., three registers for the constant values 16, 32, 48, and use for indexing. Then you don't need to update the INPUT pointer as often, and you can use the same constants for other load/store sequences as well.
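As a rough, untested sketch of the indexed-load idea (T4, T8 and T12 are hypothetical spare GPRs holding 4, 8 and 12, which are the natural constants for these 4-byte-strided loads; the little-endian path is shown, with VT0 still holding the byte-swap mask):

  C at function entry
  li     T4, 4
  li     T8, 8
  li     T12, 12

  C then per group of four message words
  lxvd2x VSR(IV(0)), 0, INPUT
  lxvd2x VSR(IV(1)), T4, INPUT
  lxvd2x VSR(IV(2)), T8, INPUT
  lxvd2x VSR(IV(3)), T12, INPUT
  addi   INPUT, INPUT, 16
  vperm  IV(0), IV(0), IV(0), VT0
  vperm  IV(1), IV(1), IV(1), VT0
  vperm  IV(2), IV(2), IV(2), VT0
  vperm  IV(3), IV(3), IV(3), VT0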
- LOAD(0)
- LOAD(1)
- LOAD(2)
- LOAD(3)
+PROLOGUE(_nettle_sha256_compress_n)
- cmpwi 0, NUMBLOCKS, 0
- ble 0, .done
- mtctr NUMBLOCKS
- C Store non-volatile registers
- subi SP, SP, 64+(12*16)
- std T0, 24(SP)
- std T1, 16(SP)
- std COUNT, 8(SP)
For save/restore of registers, I prefer to use the register names, not the defined symbols. And T0, T1, COUNT are defined to use r7, r8, r10, which *are* volatile, right?
Does the data stored fit in the 288 byte "protected zone"? If so, probably best to not modify the stack pointer.
- li T0, 32
- stvx v20, 0, SP
- subi T0, T0, 16
- stvx v21, T0, SP
Here it would also help a bit to allocate constants 16, 32, 48 in registers.
- subi T0, T0, 16
- stvx v22, T0, SP
- subi T0, T0, 16
- stvx v23, T0, SP
- subi T0, T0, 16
- stvx v24, T0, SP
- subi T0, T0, 16
- stvx v25, T0, SP
- subi T0, T0, 16
- stvx v26, T0, SP
- subi T0, T0, 16
- stvx v27, T0, SP
- subi T0, T0, 16
- stvx v28, T0, SP
- subi T0, T0, 16
- stvx v29, T0, SP
- subi T0, T0, 16
- stvx v30, T0, SP
- subi T0, T0, 16
- stvx v31, T0, SP
- C Load state values
- li T0, 16
- lxvw4x VSR(VSA), 0, STATE C VSA contains A,B,C,D
- lxvw4x VSR(VSE), T0, STATE C VSE contains E,F,G,H
+.loop:
- li TK, 0
- lxvw4x VSR(VK), TK, K
- addi TK, TK, 4
- DOLOADS
- C "permute" state from VSA containing A,B,C,D into VSA,VSB,VSC,VSD
Can you give a bit more detail on this permutation? Does the main round operations only use 32 bits each from the state registers? There's no reasonable way to use a more compact representation?
- vsldoi VSB, VSA, VSA, 4
- vsldoi VSF, VSE, VSE, 4
- vsldoi VSC, VSA, VSA, 8
- vsldoi VSG, VSE, VSE, 8
- vsldoi VSD, VSA, VSA, 12
- vsldoi VSH, VSE, VSE, 12
- EXTENDROUNDS
- EXTENDROUNDS
- EXTENDROUNDS
- NOEXTENDROUNDS
- C Reload initial state from stack
- li T0, 16
- lxvw4x VSR(VT0), 0, STATE C VSA contains A,B,C,D
- lxvw4x VSR(VT1), T0, STATE C VSE contains E,F,G,H
- C Repack VSA,VSB,VSC,VSD into VSA,VSE for storing
- vmrghw VSA, VSA, VSB
- vmrghw VSC, VSC, VSD
- vmrghw VSE, VSE, VSF
- vmrghw VSG, VSG, VSH
- xxmrghd VSR(VSA), VSR(VSA), VSR(VSC)
- xxmrghd VSR(VSE), VSR(VSE), VSR(VSG)
- vadduwm VSA, VSA, VT0
- vadduwm VSE, VSE, VT1
It seems unfortunate to have to do this conversion for each iteration of the loop, it would be nice if state could be converted to the most efficient form before entering the loop, and not converted back until after loop exit. But we probably don't have enough registers to keep the old state exploded into many registers. And load/store of exploded state doesn't seem that attractive either.
- li T0, 16
- stxvw4x VSR(VSA), 0, STATE
- stxvw4x VSR(VSE), T0, STATE
- bdnz .loop
Regards, /Niels Möller
On Sun, 2024-05-05 at 16:10 +0200, Niels Möller wrote:
Eric Richter erichte@linux.ibm.com writes:
This patch introduces an optimized powerpc64 assembly implementation for sha256-compress-n. This takes advantage of the vshasigma instruction, as well as unrolling loops to best take advantage of running instructions in parallel.
Thanks. I'm now having a closer read of the assembly code. Comments below.
+C ROUND(A B C D E F G H R EXT) +define(`ROUND', `
- vadduwm VT1, VK, IV($9) C VT1: k+W
- vadduwm VT4, $8, VT1 C VT4: H+k+W
- lxvw4x VSR(VK), TK, K C Load Key
- addi TK, TK, 4 C Increment Pointer to next key
- vadduwm VT2, $4, $8 C VT2: H+D
- vadduwm VT2, VT2, VT1 C VT2: H+D+k+W
- vshasigmaw SIGE, $5, 1, 0b1111 C Sigma(E) Se
- vshasigmaw SIGA, $1, 1, 0 C Sigma(A) Sa
- vxor VT3, $2, $3 C VT3: b^c
- vsel VT0, $7, $6, $5 C VT0: Ch.
- vsel VT3, $3, $1, VT3 C VT3: Maj(a,b,c)
- vadduwm VT4, VT4, VT0 C VT4: Hkw + Ch.
- vadduwm VT3, VT3, VT4 C VT3: HkW + Ch. + Maj.
- vadduwm VT0, VT0, VT2 C VT0: Ch. + DHKW
- vadduwm $8, SIGE, SIGA C Anext: Se + Sa
- vadduwm $4, VT0, SIGE C Dnext: Ch. + DHKW + Se
- vadduwm $8, $8, VT3 C Anext: Se+Sa+HkW+Ch.+Maj.
- C Schedule (data) for 16th round in future
- C Extend W[i]
- ifelse(`$10', `1', `
vshasigmaw SIGE, IV($9 + 14), 0, 0b1111
vshasigmaw SIGA, IV($9 + 1), 0, 0b0000
vadduwm IV($9), IV($9), SIGE
vadduwm IV($9), IV($9), SIGA
vadduwm IV($9), IV($9), IV($9 + 9)
- ')
+')
I think it would be a bit simpler to take out the extend logic to its own macro.
+define(`EXTENDROUND', `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9, 1)')
If you do that, then you would define

  define(`EXTENDROUND', `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9) EXTEND($9)')
(In other related code, input expansion is done at the beginning of a round iteration rather than at the end, but doing at the end like you do may be better scheduling).
Makes sense, I'll move that extend logic into its own macro.
You are correct, the expansion logic was moved to the end of the round for an improvement to scheduling on the CPU. The vshasigma instructions take more cycles and are scheduled on a different unit than the other arithmetic operations. This allows those to work in parallel with the beginning of the next round, as there are no dependent registers until the next vshasigma instructions in-round.
+define(`LOAD', `
- IF_BE(`lxvw4x VSR(IV($1)), 0, INPUT')
- IF_LE(`
lxvd2x VSR(IV($1)), 0, INPUT
vperm IV($1), IV($1), IV($1), VT0
- ')
- addi INPUT, INPUT, 4
+')
+define(`DOLOADS', `
- IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)')
Could you have a dedicated register for the permutation constant, and load it only once at function entry? If you have general registers to spare, it could also make sense to use, e.g., three registers for the constant values 16, 32, 48, and use for indexing. Then you don't need to update the INPUT pointer as often, and you can use the same constants for other load/store sequences as well.
There are plenty of GPRs to spare, I will test and bench a few options for using more GPRs as indexes.
As for VRs, unfortunately the current implementation uses all 32 VRs:
  16 for W[i]
   8 for state
   7 for round arithmetic (two of these specifically for sigma, to avoid a dependency bubble)
   1 for storing the key constant K
That said, I'm going to experiment with some VSX instructions to see if it is possible to spill over certain operations into VSRs, without needing an explicit copy back from VSR to VR.
- LOAD(0)
- LOAD(1)
- LOAD(2)
- LOAD(3)
+PROLOGUE(_nettle_sha256_compress_n)
- cmpwi 0, NUMBLOCKS, 0
- ble 0, .done
- mtctr NUMBLOCKS
- C Store non-volatile registers
- subi SP, SP, 64+(12*16)
- std T0, 24(SP)
- std T1, 16(SP)
- std COUNT, 8(SP)
For save/restore of registers, I prefer to use the register names, not the defined symbols. And T0, T1, COUNT are defined to use r7, r8, r10, which *are* volatile, right?
Ah yep, good catch!
Does the data stored fit in the 288 byte "protected zone"? If so, probably best to not modify the stack pointer.
At the moment it should, as I'm currently moving the stack pointer by 256 bytes. With the removal of the volatile GPR saves and the addition of the new index GPRs, I think it should still fit in the 288-byte zone. I will include this change in the next version if it fits.
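A minimal sketch of that (untested; T0/T1 stay volatile scratch and are no longer saved themselves):

  C Save non-volatile VRs in the 288-byte protected zone below SP,
  C leaving SP itself untouched
  li     T0, -16
  li     T1, -32
  stvx   v20, T0, SP
  stvx   v21, T1, SP
  subi   T0, T0, 32
  subi   T1, T1, 32
  stvx   v22, T0, SP
  stvx   v23, T1, SP
  C continue down to v31; 12*16 = 192 bytes, well within the 288-byte zone

with the matching lvx sequence at the end restoring from the same negative offsets.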
- li T0, 32
- stvx v20, 0, SP
- subi T0, T0, 16
- stvx v21, T0, SP
Here it would also help a bit to allocate constants 16, 32, 48 in registers.
- subi T0, T0, 16
- stvx v22, T0, SP
- subi T0, T0, 16
- stvx v23, T0, SP
- subi T0, T0, 16
- stvx v24, T0, SP
- subi T0, T0, 16
- stvx v25, T0, SP
- subi T0, T0, 16
- stvx v26, T0, SP
- subi T0, T0, 16
- stvx v27, T0, SP
- subi T0, T0, 16
- stvx v28, T0, SP
- subi T0, T0, 16
- stvx v29, T0, SP
- subi T0, T0, 16
- stvx v30, T0, SP
- subi T0, T0, 16
- stvx v31, T0, SP
- C Load state values
- li T0, 16
- lxvw4x VSR(VSA), 0, STATE C VSA contains A,B,C,D
- lxvw4x VSR(VSE), T0, STATE C VSE contains E,F,G,H
+.loop:
- li TK, 0
- lxvw4x VSR(VK), TK, K
- addi TK, TK, 4
- DOLOADS
- C "permute" state from VSA containing A,B,C,D into
VSA,VSB,VSC,VSD
Can you give a bit more detail on this permutation? Does the main round operations only use 32 bits each from the state registers? There's no reasonable way to use a more compact representation?
Correct, the state registers VSA, VSB, VSC, etc. only use the first 32 bits of their respective vector register for arithmetic in a round (64 bits in the logically identical sha512).
Unfortunately compacting state values A,B,C,D into a single VR introduces some performance problems when trying to do steps like:

  vsel VT3, $3, $1, VT3 C VT3: Maj(a,b,c)

as this would require additional instructions extracting A,B,C into three VRs.
So this is largely a scalar implementation using vector instructions, as there is no scalar equivalent to vshasigma.
That said, this "permutation" allows the loading of four state values in a single load instruction rather than four separate loads, with the tradeoff of needing more arithmetic instructions to permute them into place. I think this should end up being faster (combined with parallel scheduling of the non-dependent vsldoi shifts), but I will run some tests to confirm this.
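Concretely, my reading of the data movement (values only, as a sketch; each round then consumes just word 0 of VSA/VSB/VSC/VSD):

  C after  lxvw4x VSR(VSA), 0, STATE      VSA = {A, B, C, D}
  vsldoi VSB, VSA, VSA, 4    C VSB = {B, C, D, A}
  vsldoi VSC, VSA, VSA, 8    C VSC = {C, D, A, B}
  vsldoi VSD, VSA, VSA, 12   C VSD = {D, A, B, C}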
- vsldoi VSB, VSA, VSA, 4
- vsldoi VSF, VSE, VSE, 4
- vsldoi VSC, VSA, VSA, 8
- vsldoi VSG, VSE, VSE, 8
- vsldoi VSD, VSA, VSA, 12
- vsldoi VSH, VSE, VSE, 12
- EXTENDROUNDS
- EXTENDROUNDS
- EXTENDROUNDS
- NOEXTENDROUNDS
- C Reload initial state from stack
- li T0, 16
- lxvw4x VSR(VT0), 0, STATE C VSA contains A,B,C,D
- lxvw4x VSR(VT1), T0, STATE C VSE contains E,F,G,H
- C Repack VSA,VSB,VSC,VSD into VSA,VSE for storing
- vmrghw VSA, VSA, VSB
- vmrghw VSC, VSC, VSD
- vmrghw VSE, VSE, VSF
- vmrghw VSG, VSG, VSH
- xxmrghd VSR(VSA), VSR(VSA), VSR(VSC)
- xxmrghd VSR(VSE), VSR(VSE), VSR(VSG)
- vadduwm VSA, VSA, VT0
- vadduwm VSE, VSE, VT1
It seems unfortunate to have to do this conversion for each iteration of the loop, it would be nice if state could be converted to the most efficient form before entering the loop, and not converted back until after loop exit. But we probably don't have enoguh registers to keep the old state exploded into many registers. And load/store of exploded state doesn't seem that attractive either.
The necessary load/store per loop iteration is probably the worst offender for performance, as the merges can be scheduled in parallel. This is where I'd like to see if we can make the best use of VSRs to temporarily hold the old state, as a VR <-> VSR move *should* be faster than a load/store to memory.
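As a loose sketch of that idea (assuming the assembler accepts vs0/vs1 names for the low, FPR-overlapping VSX registers, which are volatile and unused by this code):

  C before the loop: stash the running state in scratch VSRs
  xxlor  vs0, VSR(VSA), VSR(VSA)
  xxlor  vs1, VSR(VSE), VSR(VSE)

  C per iteration, instead of reloading STATE from memory:
  xxlor  VSR(VT0), vs0, vs0
  xxlor  VSR(VT1), vs1, vs1
  vadduwm VSA, VSA, VT0
  vadduwm VSE, VSE, VT1
  xxlor  vs0, VSR(VSA), VSR(VSA)   C refresh the stash for the next block
  xxlor  vs1, VSR(VSE), VSR(VSE)

Whether that actually beats the lxvw4x reload is something I'd need to benchmark.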
- li T0, 16
- stxvw4x VSR(VSA), 0, STATE
- stxvw4x VSR(VSE), T0, STATE
- bdnz .loop
Regards, /Niels Möller
Thanks for the review!
- Eric
This patch introduces an optimized powerpc64 assembly implementation for sha512-compress, derived from the implementation for sha256-compress-n.
The following data was captured on a POWER 10 LPAR @ ~3.896GHz
Current C implementation:

  Algorithm    mode        Mbyte/s
  sha512       update       447.02
  sha512-224   update       444.30
  sha512-256   update       445.02
  hmac-sha512  64 bytes      97.27
  hmac-sha512  256 bytes    204.55
  hmac-sha512  1024 bytes   342.86
  hmac-sha512  4096 bytes   409.57
  hmac-sha512  single msg   433.95
With optimized assembly:

  Algorithm    mode        Mbyte/s
  sha512       update       705.36
  sha512-224   update       705.63
  sha512-256   update       705.34
  hmac-sha512  64 bytes     141.66
  hmac-sha512  256 bytes    310.26
  hmac-sha512  1024 bytes   534.22
  hmac-sha512  4096 bytes   641.74
  hmac-sha512  single msg   677.14
Signed-off-by: Eric Richter <erichte@linux.ibm.com>
---
 fat-ppc.c                           |  10 +
 powerpc64/fat/sha512-compress-2.asm |  36 +++
 powerpc64/p8/sha512-compress.asm    | 327 ++++++++++++++++++++++++++++
 3 files changed, 373 insertions(+)
 create mode 100644 powerpc64/fat/sha512-compress-2.asm
 create mode 100644 powerpc64/p8/sha512-compress.asm
diff --git a/fat-ppc.c b/fat-ppc.c index efbeb2ec..a228386a 100644 --- a/fat-ppc.c +++ b/fat-ppc.c @@ -207,6 +207,10 @@ DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func) DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c) DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, ppc64)
+DECLARE_FAT_FUNC(_nettle_sha512_compress, sha512_compress_func) +DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, c) +DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, ppc64) +
static void CONSTRUCTOR fat_init (void) @@ -237,6 +241,7 @@ fat_init (void) _nettle_ghash_update_vec = _nettle_ghash_update_ppc64;
_nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_ppc64; + _nettle_sha512_compress_vec = _nettle_sha512_compress_ppc64; } else { @@ -246,6 +251,7 @@ fat_init (void) _nettle_ghash_set_key_vec = _nettle_ghash_set_key_c; _nettle_ghash_update_vec = _nettle_ghash_update_c; _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_c; + _nettle_sha512_compress_vec = _nettle_sha512_compress_c; } if (features.have_altivec) { @@ -350,3 +356,7 @@ DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *, (uint32_t *state, const uint32_t *k, size_t blocks, const uint8_t *input), (state, k, blocks, input)) + +DEFINE_FAT_FUNC(_nettle_sha512_compress, void, + (uint64_t *state, const uint8_t *input, const uint64_t *k), + (state, input, k)) diff --git a/powerpc64/fat/sha512-compress-2.asm b/powerpc64/fat/sha512-compress-2.asm new file mode 100644 index 00000000..9445e5ba --- /dev/null +++ b/powerpc64/fat/sha512-compress-2.asm @@ -0,0 +1,36 @@ +C powerpc64/fat/sha512-compress-2.asm + +ifelse(` + Copyright (C) 2024 Eric Richter, IBM Corporation + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +dnl PROLOGUE(_nettle_sha512_compress) picked up by configure + +define(`fat_transform', `$1_ppc64') +include_src(`powerpc64/p8/sha512-compress.asm') diff --git a/powerpc64/p8/sha512-compress.asm b/powerpc64/p8/sha512-compress.asm new file mode 100644 index 00000000..83fe0e36 --- /dev/null +++ b/powerpc64/p8/sha512-compress.asm @@ -0,0 +1,327 @@ +C x86_64/sha512-compress.asm + +ifelse(` + Copyright (C) 2024 Eric Richter, IBM Corporation + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + +.file "sha512-compress.asm" + +C Parameters in +define(`SP', `r1') +define(`STATE', `r3') +define(`INPUT', `r4') +define(`K', `r5') + +define(`T0', `r7') +define(`T1', `r8') +define(`TK', `r9') +define(`COUNT', `r10') + +C State registers +define(`VSA', `v0') +define(`VSB', `v1') +define(`VSC', `v2') +define(`VSD', `v3') +define(`VSE', `v4') +define(`VSF', `v5') +define(`VSG', `v6') +define(`VSH', `v7') + +C Current K values +define(`VK', `v8') + +C Temp registers for math +define(`VT0', `v9') +define(`VT1', `v10') +define(`VT2', `v11') +define(`VT3', `v12') +define(`VT4', `v13') + +C Convenience named registers for sigma(a) and sigma(e) +define(`SIGA', `v14') +define(`SIGE', `v15') + +C Registers v16-v31 are used for input words W[0] through W[15] + +C Convert an index for W[i] to the corresponding vector register v[16 + i] +define(`IV', `m4_unquote(v`'eval((($1) % 16) + 16))') + +C ROUND(A B C D E F G H R EXT) +define(`ROUND', ` + + vaddudm VT1, VK, IV($9) C VT1: k+W + vaddudm VT4, $8, VT1 C VT4: H+k+W + + lxvd2x VSR(VK), TK, K C Load Key + addi TK, TK, 8 C Increment Pointer to next key + + vaddudm VT2, $4, $8 C VT2: H+D + vaddudm VT2, VT2, VT1 C VT2: H+D+k+W + + vshasigmad SIGE, $5, 1, 0b1111 C Sigma(E) Se + vshasigmad SIGA, $1, 1, 0 C Sigma(A) Sa + + vxor VT3, $2, $3 C VT3: b^c + vsel VT0, $7, $6, $5 C VT0: Ch. + vsel VT3, $3, $1, VT3 C VT3: Maj(a,b,c) + + vaddudm VT4, VT4, VT0 C VT4: Hkw + Ch. + vaddudm VT3, VT3, VT4 C VT3: HkW + Ch. + Maj. + + vaddudm VT0, VT0, VT2 C VT0: Ch. + DHKW + vaddudm $8, SIGE, SIGA C Anext: Se + Sa + vaddudm $4, VT0, SIGE C Dnext: Ch. + DHKW + Se + vaddudm $8, $8, VT3 C Anext: Se+Sa+HkW+Ch.+Maj. + + + C Schedule (data) for 16th round in future + C Extend W[i] + ifelse(`$10', `1', ` + vshasigmad SIGE, IV($9 + 14), 0, 0b1111 + vshasigmad SIGA, IV($9 + 1), 0, 0b0000 + vaddudm IV($9), IV($9), SIGE + vaddudm IV($9), IV($9), SIGA + vaddudm IV($9), IV($9), IV($9 + 9) + ') +') + +define(`EXTENDROUND', `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9, 1)') +define(`NOEXTENDROUND', `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9, 0)') + +define(`NOEXTENDROUNDS', ` + NOEXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 0) + NOEXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 1) + NOEXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 2) + NOEXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 3) + + NOEXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 4) + NOEXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 5) + NOEXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 6) + NOEXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 7) + + NOEXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 8) + NOEXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 9) + NOEXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 10) + NOEXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 11) + + NOEXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 12) + NOEXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 13) + NOEXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 14) + NOEXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 15) +') + +define(`EXTENDROUNDS', ` + EXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 0) + EXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 1) + EXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 2) + EXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 3) + + EXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 4) + EXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 5) + EXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 6) + EXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, 
VSA, 7) + + EXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 8) + EXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 9) + EXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 10) + EXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 11) + + EXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 12) + EXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 13) + EXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 14) + EXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 15) +') + +define(`LOAD', ` + IF_BE(`lxvd2x VSR(IV($1)), 0, INPUT') + IF_LE(` + lxvd2x VSR(IV($1)), 0, INPUT + vperm IV($1), IV($1), IV($1), VT0 + ') + addi INPUT, INPUT, 8 +') + +define(`DOLOADS', ` + IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)') + LOAD(0) + LOAD(1) + LOAD(2) + LOAD(3) + + LOAD(4) + LOAD(5) + LOAD(6) + LOAD(7) + + LOAD(8) + LOAD(9) + LOAD(10) + LOAD(11) + + LOAD(12) + LOAD(13) + LOAD(14) + LOAD(15) +') + +.text +PROLOGUE(_nettle_sha512_compress) + + C Store non-volatile registers + subi SP, SP, 64+(12*16) + std T0, 24(SP) + std T1, 16(SP) + std COUNT, 8(SP) + + li T0, 32 + stvx v20, 0, SP + subi T0, T0, 16 + stvx v21, T0, SP + subi T0, T0, 16 + stvx v22, T0, SP + subi T0, T0, 16 + stvx v23, T0, SP + subi T0, T0, 16 + stvx v24, T0, SP + subi T0, T0, 16 + stvx v25, T0, SP + subi T0, T0, 16 + stvx v26, T0, SP + subi T0, T0, 16 + stvx v27, T0, SP + subi T0, T0, 16 + stvx v28, T0, SP + subi T0, T0, 16 + stvx v29, T0, SP + subi T0, T0, 16 + stvx v30, T0, SP + subi T0, T0, 16 + stvx v31, T0, SP + + C Load state values + li T0, 16 + lxvd2x VSR(VSA), 0, STATE C VSA contains A, B + lxvd2x VSR(VSC), T0, STATE C VSC contains C, D + addi T0, T0, 16 + lxvd2x VSR(VSE), T0, STATE C VSE contains E, F + addi T0, T0, 16 + lxvd2x VSR(VSG), T0, STATE C VSG contains G, H + + li TK, 0 + lxvd2x VSR(VK), TK, K + addi TK, TK, 8 C might need to be moved, or use swizzle + + DOLOADS + + C "permute" state from VSA containing A,B,C,D into VSA,VSB,VSC,VSD + vsldoi VSB, VSA, VSA, 8 + vsldoi VSD, VSC, VSC, 8 + vsldoi VSF, VSE, VSE, 8 + vsldoi VSH, VSG, VSG, 8 + + EXTENDROUNDS + EXTENDROUNDS + EXTENDROUNDS + EXTENDROUNDS + NOEXTENDROUNDS + + DATA_LOAD_VEC(VT4, .pack_lr, T0) + + C Reload initial state from stack + li T0, 16 + lxvd2x VSR(VT0), 0, STATE + lxvd2x VSR(VT1), T0, STATE + addi T0, T0, 16 + lxvd2x VSR(VT2), T0, STATE + addi T0, T0, 16 + lxvd2x VSR(VT3), T0, STATE + + C Repack VSA,VSB,VSC,VSD into VSA,VSC,VSE,VSG for storing + vperm VSA, VSA, VSB, VT4 + vperm VSC, VSC, VSD, VT4 + vperm VSE, VSE, VSF, VT4 + vperm VSG, VSG, VSH, VT4 + + vaddudm VSA, VSA, VT0 + vaddudm VSC, VSC, VT1 + vaddudm VSE, VSE, VT2 + vaddudm VSG, VSG, VT3 + + li T0, 16 + stxvd2x VSR(VSA), 0, STATE + stxvd2x VSR(VSC), T0, STATE + addi T0, T0, 16 + stxvd2x VSR(VSE), T0, STATE + addi T0, T0, 16 + stxvd2x VSR(VSG), T0, STATE + + C Restore nonvolatile registers + li T0, 32 + lvx v20, 0, SP + subi T0, T0, 16 + lvx v21, T0, SP + subi T0, T0, 16 + lvx v22, T0, SP + subi T0, T0, 16 + lvx v23, T0, SP + subi T0, T0, 16 + lvx v24, T0, SP + subi T0, T0, 16 + lvx v25, T0, SP + subi T0, T0, 16 + lvx v26, T0, SP + subi T0, T0, 16 + lvx v27, T0, SP + subi T0, T0, 16 + lvx v28, T0, SP + subi T0, T0, 16 + lvx v29, T0, SP + subi T0, T0, 16 + lvx v30, T0, SP + subi T0, T0, 16 + lvx v31, T0, SP + + ld T0, 24(SP) + ld T1, 16(SP) + ld COUNT, 8(SP) + addi SP, SP, 64+(12*16) + + blr +EPILOGUE(_nettle_sha512_compress) + +IF_LE(` +.data +.align 4 +.load_swap: + .byte 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 +') +.align 4 +.pack_lr: + IF_BE(`.byte 0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23') + 
IF_LE(`.byte 23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0')