The last patch follows the C implementation but I just figured out a decent
way to do it.
---
powerpc64/p7/chacha-core-internal.asm | 22 +++++++++++++++++++++-
1 file changed, 21 insertions(+), 1 deletion(-)
diff --git a/powerpc64/p7/chacha-core-internal.asm
b/powerpc64/p7/chacha-core-internal.asm
index 33c721c1..76ca0d45 100644
--- a/powerpc64/p7/chacha-core-internal.asm
+++ b/powerpc64/p7/chacha-core-internal.asm
@@ -53,6 +53,10 @@ define(`S1', `v9')
define(`S2', `v10')
define(`S3', `v11')
+C Big-endian working state
+define(`LE_MASK', `v12')
+define(`LE_TEMP', `v13')
+
C QROUND(X0, X1, X2, X3)
define(`QROUND', `
C x0 += x1, x3 ^= x0, x3 lrot 16
@@ -77,10 +81,18 @@ define(`QROUND', `
vrlw $2, $2, ROT7
')
+C LE_SWAP32(X0, X1, X2, X3)
+define(`LE_SWAP32', `IF_BE(`
+ vperm X0, X0, X0, LE_MASK
+ vperm X1, X1, X1, LE_MASK
+ vperm X2, X2, X2, LE_MASK
+ vperm X3, X3, X3, LE_MASK
+')')
+
.text
- .align 4
C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_chacha_core)
li r6, 0x10 C set up some...
@@ -91,6 +103,12 @@ PROLOGUE(_nettle_chacha_core)
vspltisw ROT12, 12
vspltisw ROT8, 8
vspltisw ROT7, 7
+IF_BE(`
+ li r9, 0
+ lvsl LE_MASK, r9, r9
+ vspltisb LE_TEMP, 0x03
+ vxor LE_MASK, LE_MASK, LE_TEMP
+')
lxvw4x VSR(X0), 0, SRC
lxvw4x VSR(X1), r6, SRC
@@ -131,6 +149,8 @@ PROLOGUE(_nettle_chacha_core)
vadduwm X2, X2, S2
vadduwm X3, X3, S3
+ LE_SWAP32(X0, X1, X2, X3)
+
stxvw4x VSR(X0), 0, DST
stxvw4x VSR(X1), r6, DST
stxvw4x VSR(X2), r7, DST
--
2.17.1