This patch introduces an optimized powerpc64 assembly implementation of sha256-compress-n. It takes advantage of the vshasigma instruction, and unrolls the round loop so that independent instructions can run in parallel.
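For reference, the per-word operations that vshasigmaw and the vsel/vxor sequences below implement are the standard SHA-256 round functions. A minimal C sketch, for illustration only (the helper names are mine, not part of Nettle):

  #include <stdint.h>

  static inline uint32_t ror32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

  /* "Big" Sigma functions: vshasigmaw with the type operand set to 1
     computes one of these per 32-bit lane, selected by the 4-bit mask. */
  static inline uint32_t Sigma0(uint32_t a) { return ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22); }
  static inline uint32_t Sigma1(uint32_t e) { return ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25); }

  /* "Small" sigma functions used by the message schedule, computed by
     vshasigmaw with the type operand set to 0. */
  static inline uint32_t sigma0(uint32_t x) { return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3); }
  static inline uint32_t sigma1(uint32_t x) { return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); }

  /* Ch and Maj, handled in the assembly with vsel/vxor. */
  static inline uint32_t Ch(uint32_t e, uint32_t f, uint32_t g)  { return (e & f) ^ (~e & g); }
  static inline uint32_t Maj(uint32_t a, uint32_t b, uint32_t c) { return (a & b) ^ (a & c) ^ (b & c); }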
The following data was captured on a POWER 10 LPAR @ ~3.896GHz
Current C implementation:

Algorithm       mode          Mbyte/s
sha256          update         280.97
hmac-sha256     64 bytes        80.81
hmac-sha256     256 bytes      170.50
hmac-sha256     1024 bytes     241.92
hmac-sha256     4096 bytes     268.54
hmac-sha256     single msg     276.16
With optimized assembly:

Algorithm       mode          Mbyte/s
sha256          update         461.45
hmac-sha256     64 bytes       123.88
hmac-sha256     256 bytes      268.81
hmac-sha256     1024 bytes     390.91
hmac-sha256     4096 bytes     438.02
hmac-sha256     single msg     453.83
Signed-off-by: Eric Richter <erichte@linux.ibm.com>
---
I split this patch out to be standalone, rather than delay it even further by trying to update SHA512 at the same time -- I will update the SHA512 implementation once this one stabilizes.
Regarding the load vperm needed for little endian: unfortunately we don't have a spare vector register to store the mask between rounds, so the best that can be done while maintaining p8 support is to store the mask in a VSX register, like the state values, and avoid the load. This is a negligible performance change, however: around +1 MB/s on larger block counts (update, hmac 1024/4096/single msg) and -1 MB/s on smaller ones (hmac 64/256).
Dropping p8 support allows the use of the lxvb16x instruction, which does not need to be permuted; however, that too is a negligible performance improvement, at the cost of dropping a whole cpu set. So I see a few options:
 A) leave as-is, consider storing the mask in a VSX register
 B) drop p8 support, use lxvb16x
 C) have a compile-time switch to use permute on p8, and use the single instruction for p9 and up.
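Whichever load strategy is used, the result has to be the same: the 64-byte block presented to the rounds as 16 big-endian 32-bit words. A portable C sketch of that requirement (illustrative only, not part of the patch):

  #include <stdint.h>

  /* What lxvw4x (big-endian), lxvd2x+vperm (little-endian p8), or lxvb16x
     (p9 and up) must arrange in the vector registers. */
  static void sha256_load_block(uint32_t w[16], const uint8_t *input)
  {
    for (int i = 0; i < 16; i++)
      w[i] = ((uint32_t) input[4*i]     << 24)
           | ((uint32_t) input[4*i + 1] << 16)
           | ((uint32_t) input[4*i + 2] << 8)
           |  (uint32_t) input[4*i + 3];
  }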
v3:
 - use protected zone instead of allocating stack space
 - add GPR constants for multiples of 4 for loads
   - around +3.4 MB/s for sha256 update
 - move extend logic to its own macro called by EXTENDROUND
 - use 8 VSX registers to store previous state instead of the stack
   - around +11.0 MB/s for sha256 update
 fat-ppc.c                             |  12 +
 powerpc64/fat/sha256-compress-n-2.asm |  36 +++
 powerpc64/p8/sha256-compress-n.asm    | 364 ++++++++++++++++++++++++++
 3 files changed, 412 insertions(+)
 create mode 100644 powerpc64/fat/sha256-compress-n-2.asm
 create mode 100644 powerpc64/p8/sha256-compress-n.asm
diff --git a/fat-ppc.c b/fat-ppc.c
index cd76f7a1..efbeb2ec 100644
--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -203,6 +203,10 @@ DECLARE_FAT_FUNC(_nettle_poly1305_blocks, poly1305_blocks_func)
 DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, c)
 DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, ppc64)
 
+DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, ppc64)
+
 static void CONSTRUCTOR
 fat_init (void)
@@ -231,6 +235,8 @@ fat_init (void)
	 _nettle_ghash_update_arm64() */
       _nettle_ghash_set_key_vec = _nettle_ghash_set_key_ppc64;
       _nettle_ghash_update_vec = _nettle_ghash_update_ppc64;
+
+      _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_ppc64;
     }
   else
     {
@@ -239,6 +245,7 @@ fat_init (void)
       _nettle_aes_invert_vec = _nettle_aes_invert_c;
       _nettle_ghash_set_key_vec = _nettle_ghash_set_key_c;
       _nettle_ghash_update_vec = _nettle_ghash_update_c;
+      _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_c;
     }
   if (features.have_altivec)
     {
@@ -338,3 +345,8 @@ DEFINE_FAT_FUNC(_nettle_poly1305_blocks, const uint8_t *,
		 size_t blocks, const uint8_t *m),
		 (ctx, blocks, m))
+
+DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *,
+		(uint32_t *state, const uint32_t *k,
+		 size_t blocks, const uint8_t *input),
+		(state, k, blocks, input))
diff --git a/powerpc64/fat/sha256-compress-n-2.asm b/powerpc64/fat/sha256-compress-n-2.asm
new file mode 100644
index 00000000..4f4eee9d
--- /dev/null
+++ b/powerpc64/fat/sha256-compress-n-2.asm
@@ -0,0 +1,36 @@
+C powerpc64/fat/sha256-compress-n-2.asm
+
+ifelse(`
+   Copyright (C) 2024 Eric Richter, IBM Corporation
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program. If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_sha256_compress_n) picked up by configure
+
+define(`fat_transform', `$1_ppc64')
+include_src(`powerpc64/p8/sha256-compress-n.asm')
diff --git a/powerpc64/p8/sha256-compress-n.asm b/powerpc64/p8/sha256-compress-n.asm
new file mode 100644
index 00000000..c1ce0e8f
--- /dev/null
+++ b/powerpc64/p8/sha256-compress-n.asm
@@ -0,0 +1,364 @@
+C powerpc64/p8/sha256-compress-n.asm
+
+ifelse(`
+   Copyright (C) 2024 Eric Richter, IBM Corporation
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program. If
+   not, see http://www.gnu.org/licenses/.
+')
+
+.file "sha256-compress-n.asm"
+
+C Parameters in
+define(`SP', `r1')
+define(`STATE', `r3')
+define(`K', `r4')
+define(`NUMBLOCKS', `r5')
+define(`INPUT', `r6')
+
+define(`T0', `r7')
+define(`T1', `r8')
+define(`TK', `r9')
+define(`COUNT', `r10')
+define(`TC0', `0')   C Index instructions allow literal 0 instead of a GPR
+define(`TC4', `r11')
+define(`TC8', `r12')
+define(`TC12', `r14')
+define(`TC16', `r15')
+
+C State registers
+define(`VSA', `v0')
+define(`VSB', `v1')
+define(`VSC', `v2')
+define(`VSD', `v3')
+define(`VSE', `v4')
+define(`VSF', `v5')
+define(`VSG', `v6')
+define(`VSH', `v7')
+
+C Previous state value registers stored in VSX
+define(`VSXA', `vs0')
+define(`VSXB', `vs1')
+define(`VSXC', `vs2')
+define(`VSXD', `vs3')
+define(`VSXE', `vs4')
+define(`VSXF', `vs5')
+define(`VSXG', `vs6')
+define(`VSXH', `vs7')
+
+C Current K values
+define(`VK', `v8')
+
+C Temp registers for math
+define(`VT0', `v9')
+define(`VT1', `v10')
+define(`VT2', `v11')
+define(`VT3', `v12')
+define(`VT4', `v13')
+
+C Convenience named registers for sigma(a) and sigma(e)
+define(`SIGA', `v14')
+define(`SIGE', `v15')
+
+C Registers v16-v31 are used for input words W[0] through W[15]
+
+C Convert an index for W[i] to the corresponding vector register v[16 + i]
+define(`IV', `m4_unquote(v`'eval((($1) % 16) + 16))')
+
+C ROUND(A B C D E F G H R EXT)
+define(`ROUND', `
+
+  vadduwm VT1, VK, IV($9)       C VT1: k+W
+  vadduwm VT4, $8, VT1          C VT4: H+k+W
+
+  lxvw4x VSR(VK), TK, K         C Load Key
+  addi TK, TK, 4                C Increment Pointer to next key
+
+  vadduwm VT2, $4, $8           C VT2: H+D
+  vadduwm VT2, VT2, VT1         C VT2: H+D+k+W
+
+  vshasigmaw SIGE, $5, 1, 0b1111   C Sigma(E)  Se
+  vshasigmaw SIGA, $1, 1, 0        C Sigma(A)  Sa
+
+  vxor VT3, $2, $3              C VT3: b^c
+  vsel VT0, $7, $6, $5          C VT0: Ch.
+  vsel VT3, $3, $1, VT3         C VT3: Maj(a,b,c)
+
+  vadduwm VT4, VT4, VT0         C VT4: Hkw + Ch.
+  vadduwm VT3, VT3, VT4         C VT3: HkW + Ch. + Maj.
+
+  vadduwm VT0, VT0, VT2         C VT0: Ch. + DHKW
+  vadduwm $8, SIGE, SIGA        C Anext: Se + Sa
+  vadduwm $4, VT0, SIGE         C Dnext: Ch. + DHKW + Se
+  vadduwm $8, $8, VT3           C Anext: Se+Sa+HkW+Ch.+Maj.
+')
+
+C Extend W[i]
+define(`EXTEND', `
+  vshasigmaw SIGE, IV($1 + 14), 0, 0b1111
+  vshasigmaw SIGA, IV($1 + 1), 0, 0b0000
+  vadduwm IV($1), IV($1), SIGE
+  vadduwm IV($1), IV($1), SIGA
+  vadduwm IV($1), IV($1), IV($1 + 9)
+')
+
+define(`EXTENDROUND', `
+  ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9)
+  C Schedule (data) for 16th round in future
+  EXTEND($9)
+')
+define(`NOEXTENDROUND', `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9)')
+
+define(`NOEXTENDROUNDS', `
+  NOEXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 0)
+  NOEXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 1)
+  NOEXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 2)
+  NOEXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 3)
+
+  NOEXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 4)
+  NOEXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 5)
+  NOEXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 6)
+  NOEXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 7)
+
+  NOEXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 8)
+  NOEXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 9)
+  NOEXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 10)
+  NOEXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 11)
+
+  NOEXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 12)
+  NOEXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 13)
+  NOEXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 14)
+  NOEXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 15)
+')
+
+define(`EXTENDROUNDS', `
+  EXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 0)
+  EXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 1)
+  EXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 2)
+  EXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 3)
+
+  EXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 4)
+  EXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 5)
+  EXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 6)
+  EXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 7)
+
+  EXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 8)
+  EXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 9)
+  EXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 10)
+  EXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 11)
+
+  EXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 12)
+  EXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 13)
+  EXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 14)
+  EXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 15)
+')
+
+define(`LOAD', `
+  IF_BE(`lxvw4x VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), INPUT')
+  IF_LE(`
+    lxvd2x VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), INPUT
+    vperm IV($1), IV($1), IV($1), VT0
+  ')
+')
+
+define(`DOLOADS', `
+  IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)')
+  LOAD(0)
+  LOAD(1)
+  LOAD(2)
+  LOAD(3)
+  addi INPUT, INPUT, 16
+  LOAD(4)
+  LOAD(5)
+  LOAD(6)
+  LOAD(7)
+  addi INPUT, INPUT, 16
+  LOAD(8)
+  LOAD(9)
+  LOAD(10)
+  LOAD(11)
+  addi INPUT, INPUT, 16
+  LOAD(12)
+  LOAD(13)
+  LOAD(14)
+  LOAD(15)
+  addi INPUT, INPUT, 16
+')
+
+.text
+PROLOGUE(_nettle_sha256_compress_n)
+  cmpwi 0, NUMBLOCKS, 0
+  ble 0, .done
+  mtctr NUMBLOCKS
+
+  C Store non-volatile registers
+
+  li T0, -8
+  li T1, -24
+  stvx v20, T0, SP
+  stvx v21, T1, SP
+  subi T0, T0, 32
+  subi T1, T1, 32
+  stvx v22, T0, SP
+  stvx v23, T1, SP
+  subi T0, T0, 32
+  subi T1, T1, 32
+  stvx v24, T0, SP
+  stvx v25, T1, SP
+  subi T0, T0, 32
+  subi T1, T1, 32
+  stvx v26, T0, SP
+  stvx v27, T1, SP
+  subi T0, T0, 32
+  subi T1, T1, 32
+  stvx v28, T0, SP
+  stvx v29, T1, SP
+  subi T0, T0, 32
+  subi T1, T1, 32
+  stvx v30, T0, SP
+  stvx v31, T1, SP
+  subi T0, T0, 32
+  subi T1, T1, 32
+  stdx r14, T0, SP
+  stdx r15, T1, SP
+
+  li TC4, 4
+  li TC8, 8
+  li TC12, 12
+  li TC16, 16
+
+  C Load state values
+  lxvw4x VSR(VSA), 0, STATE       C VSA contains A,B,C,D
+  lxvw4x VSR(VSE), TC16, STATE    C VSE contains E,F,G,H
+
+  vsldoi VSB, VSA, VSA, 4
+  vsldoi VSF, VSE, VSE, 4
+
+  vsldoi VSC, VSA, VSA, 8
+  vsldoi VSG, VSE, VSE, 8
+
+  vsldoi VSD, VSA, VSA, 12
+  vsldoi VSH, VSE, VSE, 12
+
+.loop:
+  xxlor VSXA, VSR(VSA), VSR(VSA)
+  xxlor VSXB, VSR(VSB), VSR(VSB)
+  xxlor VSXC, VSR(VSC), VSR(VSC)
+  xxlor VSXD, VSR(VSD), VSR(VSD)
+  xxlor VSXE, VSR(VSE), VSR(VSE)
+  xxlor VSXF, VSR(VSF), VSR(VSF)
+  xxlor VSXG, VSR(VSG), VSR(VSG)
+  xxlor VSXH, VSR(VSH), VSR(VSH)
+
+  li TK, 0
+  lxvw4x VSR(VK), TK, K
+  addi TK, TK, 4
+
+  DOLOADS
+
+  C "permute" state from VSA containing A,B,C,D into VSA,VSB,VSC,VSD
+
+  EXTENDROUNDS
+  EXTENDROUNDS
+  EXTENDROUNDS
+  NOEXTENDROUNDS
+
+  C Reload initial state from VSX registers
+  xxlor VSR(VT0), VSXA, VSXA
+  xxlor VSR(VT1), VSXB, VSXB
+  xxlor VSR(VT2), VSXC, VSXC
+  xxlor VSR(VT3), VSXD, VSXD
+  xxlor VSR(VT4), VSXE, VSXE
+  xxlor VSR(SIGA), VSXF, VSXF
+  xxlor VSR(SIGE), VSXG, VSXG
+  xxlor VSR(VK), VSXH, VSXH
+
+  vadduwm VSA, VSA, VT0
+  vadduwm VSB, VSB, VT1
+  vadduwm VSC, VSC, VT2
+  vadduwm VSD, VSD, VT3
+  vadduwm VSE, VSE, VT4
+  vadduwm VSF, VSF, SIGA
+  vadduwm VSG, VSG, SIGE
+  vadduwm VSH, VSH, VK
+
+  bdnz .loop
+
+  C Repack VSA,VSB,VSC,VSD into VSA,VSE for storing
+  vmrghw VSA, VSA, VSB
+  vmrghw VSC, VSC, VSD
+  vmrghw VSE, VSE, VSF
+  vmrghw VSG, VSG, VSH
+
+  xxmrghd VSR(VSA), VSR(VSA), VSR(VSC)
+  xxmrghd VSR(VSE), VSR(VSE), VSR(VSG)
+
+  stxvw4x VSR(VSA), 0, STATE
+  stxvw4x VSR(VSE), TC16, STATE
+
+
+  C Restore nonvolatile registers
+  li T0, -8
+  li T1, -24
+  lvx v20, T0, SP
+  lvx v21, T1, SP
+  subi T0, T0, 32
+  subi T1, T1, 32
+  lvx v22, T0, SP
+  lvx v23, T1, SP
+  subi T0, T0, 32
+  subi T1, T1, 32
+  lvx v24, T0, SP
+  lvx v25, T1, SP
+  subi T0, T0, 32
+  subi T1, T1, 32
+  lvx v26, T0, SP
+  lvx v27, T1, SP
+  subi T0, T0, 32
+  subi T1, T1, 32
+  lvx v28, T0, SP
+  lvx v29, T1, SP
+  subi T0, T0, 32
+  subi T1, T1, 32
+  lvx v30, T0, SP
+  lvx v31, T1, SP
+  subi T0, T0, 32
+  subi T1, T1, 32
+  ldx r14, T0, SP
+  ldx r15, T1, SP
+
+.done:
+  mr r3, INPUT
+
+  blr
+EPILOGUE(_nettle_sha256_compress_n)
+
+IF_LE(`
+.data
+.align 4
+.load_swap:
+  .byte 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7
+')
Eric Richter <erichte@linux.ibm.com> writes:
Dropping p8 support allows the use of the lxvb16x instruction, which does not need to be permuted; however, that too is a negligible performance improvement, at the cost of dropping a whole cpu set. So I see a few options:
 A) leave as-is, consider storing the mask in a VSX register
 B) drop p8 support, use lxvb16x
 C) have a compile-time switch to use permute on p8, and use the single instruction for p9 and up.
I'd say leave as is (unless we find some way to get a spare vector register).
v3:
- use protected zone instead of allocating stack space
- add GPR constants for multiples of 4 for loads
- around +3.4 MB/s for sha256 update
- move extend logic to its own macro called by EXTENDROUND
- use 8 VSX registers to store previous state instead of the stack
- around +11.0 MB/s for sha256 update
I think I'd be happy to merge this version, and do any incremental improvement on top of that. Some comments below:
+C ROUND(A B C D E F G H R EXT)
+define(`ROUND', `
+  vadduwm VT1, VK, IV($9)   C VT1: k+W
+  vadduwm VT4, $8, VT1      C VT4: H+k+W
+
+  lxvw4x VSR(VK), TK, K     C Load Key
+  addi TK, TK, 4            C Increment Pointer to next key
+
+  vadduwm VT2, $4, $8       C VT2: H+D
+  vadduwm VT2, VT2, VT1     C VT2: H+D+k+W
Could the above two instructions be changed to
  vadduwm VT2, VT4, $4      C Should be the same, (H+k+W) + D
(which would need one less register)? I realize there's slight change in the dependency chain. Do you know how many cycles one of these rounds takes, and what's the bottleneck (I would guess either latency of the dependency chain between rounds, or throughput of one of the execution units, or instruction issue rate).
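In C terms the rewrite relies only on 32-bit wraparound addition being associative and commutative; a throwaway check (values arbitrary, names mine, not from the patch):

  #include <assert.h>
  #include <stdint.h>

  int main(void)
  {
    /* arbitrary stand-ins for D, H, K[i], W[i] */
    uint32_t d = 0x510e527fu, h = 0x5be0cd19u, k = 0x428a2f98u, w = 0xdeadbeefu;

    uint32_t vt1 = k + w;               /* VT1: k+W   */
    uint32_t vt4 = h + vt1;             /* VT4: H+k+W */

    uint32_t current  = (d + h) + vt1;  /* the two vadduwm in the patch */
    uint32_t proposed = vt4 + d;        /* single vadduwm VT2, VT4, $4  */

    assert(current == proposed);
    return 0;
  }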
+define(`LOAD', `
+  IF_BE(`lxvw4x VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), INPUT')
+  IF_LE(`
+    lxvd2x VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), INPUT
+    vperm IV($1), IV($1), IV($1), VT0
+  ')
+')
+define(`DOLOADS', `
+  IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)')
+  LOAD(0)
+  LOAD(1)
+  LOAD(2)
+  LOAD(3)
If you pass the right TCx register as argument to the load macro, you don't need the m4 eval thing, which could make it a bit more readable, imo.
+  C Store non-volatile registers
+
+  li T0, -8
+  li T1, -24
+  stvx v20, T0, SP
+  stvx v21, T1, SP
+  subi T0, T0, 32
+  subi T1, T1, 32
This could be probably be arranged with fewer instructions by having one register that is decremented as we move down in the guard area, and registers with constant values for indexing.
+  C Reload initial state from VSX registers
+  xxlor VSR(VT0), VSXA, VSXA
+  xxlor VSR(VT1), VSXB, VSXB
+  xxlor VSR(VT2), VSXC, VSXC
+  xxlor VSR(VT3), VSXD, VSXD
+  xxlor VSR(VT4), VSXE, VSXE
+  xxlor VSR(SIGA), VSXF, VSXF
+  xxlor VSR(SIGE), VSXG, VSXG
+  xxlor VSR(VK), VSXH, VSXH
+
+  vadduwm VSA, VSA, VT0
+  vadduwm VSB, VSB, VT1
+  vadduwm VSC, VSC, VT2
+  vadduwm VSD, VSD, VT3
+  vadduwm VSE, VSE, VT4
+  vadduwm VSF, VSF, SIGA
+  vadduwm VSG, VSG, SIGE
+  vadduwm VSH, VSH, VK
It's a pity that there seems to be no useful xxadd* instructions? Do you need all eight temporary registers, or would you get the same speed doing just four at a time, i.e., 4 xxlor instructions, 4 vadduwm, 4 xxlor, 4 vadduwm? There's no alias "xxmov" or the like that could be used instead of xxlor?
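For reference, the work this block does is just the usual SHA-256 feed-forward, adding the state saved before the 64 rounds back into the working variables. In C, a sketch with illustrative names:

  #include <stdint.h>

  static void feed_forward(uint32_t abcdefgh[8], const uint32_t saved[8])
  {
    /* saved[] corresponds to the copies kept in vs0-vs7; addition is
       modulo 2^32, like vadduwm per 32-bit lane. */
    for (int i = 0; i < 8; i++)
      abcdefgh[i] += saved[i];
  }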
Thanks for the update! /Niels
Niels Möller <nisse@lysator.liu.se> writes:
I think I'd be happy to merge this version, and do any incremental improvement on top of that.
Applied on branch ppc64-sha256 for testing (there were some minor conflicts in fat-ppc.c), and it seems to work fine. Some more, very minor, comments:
+C ROUND(A B C D E F G H R EXT)
There's no longer an EXT argument, right? (Which is good, since it's not portable to use more than 9 m4 args and access them as $10, $11, etc.)
- C "permute" state from VSA containing A,B,C,D into VSA,VSB,VSC,VSD
I think this comment is now at the wrong place, since permutation is done before the loop?
Regards, /Niels
Niels Möller <nisse@lysator.liu.se> writes:
I think I'd be happy to merge this version, and do any incremental improvement on top of that.
Merged now. Thanks!
Regards, /Niels
On Fri, 2024-06-07 at 14:08 +0200, Niels Möller wrote:
Eric Richter <erichte@linux.ibm.com> writes:
+C ROUND(A B C D E F G H R EXT)
+define(`ROUND', `
+  vadduwm VT1, VK, IV($9)   C VT1: k+W
+  vadduwm VT4, $8, VT1      C VT4: H+k+W
+
+  lxvw4x VSR(VK), TK, K     C Load Key
+  addi TK, TK, 4            C Increment Pointer to next key
+
+  vadduwm VT2, $4, $8       C VT2: H+D
+  vadduwm VT2, VT2, VT1     C VT2: H+D+k+W
Could the above two instructions be changed to
  vadduwm VT2, VT4, $4      C Should be the same, (H+k+W) + D
(which would need one less register)? I realize there's slight change in the dependency chain. Do you know how many cycles one of these rounds takes, and what's the bottleneck (I would guess either latency of the dependency chain between rounds, or throughput of one of the execution units, or instruction issue rate).
Theoretically it should be about 10 cycles per round, but the actual measured performance doesn't quite hit that due to various quirks with scheduling.
With this change, I'm getting about a +1 MB/s gain on hmac 256 bytes, but a slight loss of speed for the rest.
+define(`LOAD', `
+  IF_BE(`lxvw4x VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), INPUT')
+  IF_LE(`
+    lxvd2x VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), INPUT
+    vperm IV($1), IV($1), IV($1), VT0
+  ')
+')
+define(`DOLOADS', `
+  IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)')
+  LOAD(0)
+  LOAD(1)
+  LOAD(2)
+  LOAD(3)
If you pass the right TCx register as argument to the load macro, you don't need the m4 eval thing, which could make it a bit more readable, imo.
+  C Store non-volatile registers
+
+  li T0, -8
+  li T1, -24
+  stvx v20, T0, SP
+  stvx v21, T1, SP
+  subi T0, T0, 32
+  subi T1, T1, 32
This could be probably be arranged with fewer instructions by having one register that is decremented as we move down in the guard area, and registers with constant values for indexing.
+  C Reload initial state from VSX registers
+  xxlor VSR(VT0), VSXA, VSXA
+  xxlor VSR(VT1), VSXB, VSXB
+  xxlor VSR(VT2), VSXC, VSXC
+  xxlor VSR(VT3), VSXD, VSXD
+  xxlor VSR(VT4), VSXE, VSXE
+  xxlor VSR(SIGA), VSXF, VSXF
+  xxlor VSR(SIGE), VSXG, VSXG
+  xxlor VSR(VK), VSXH, VSXH
+
+  vadduwm VSA, VSA, VT0
+  vadduwm VSB, VSB, VT1
+  vadduwm VSC, VSC, VT2
+  vadduwm VSD, VSD, VT3
+  vadduwm VSE, VSE, VT4
+  vadduwm VSF, VSF, SIGA
+  vadduwm VSG, VSG, SIGE
+  vadduwm VSH, VSH, VK
It's a pity that there seems to be no useful xxadd* instructions? Do you need all eight temporary registers, or would you get the same speed doing just four at a time, i.e., 4 xxlor instructions, 4 vadduwm, 4 xxlor, 4 vadduwm? There's no alias "xxmov" or the like that could be used instead of xxlor?
Unfortunately most of the VSX instructions (particularly those in the p8 ISA) are for floating point operations; using them this way is a bit of a hack. I'll test four at a time, but it will likely be similar performance unless the xxlor's are issued on a different unit.
I'm not aware of an xxmov/xxmr extended mnemonic, but this could always be macroed instead for clarity.
Thanks for the update! /Niels
Thanks for merging! I'll have a clean-up patch up soon, hopefully with the SHA512 implementation as well.