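The last patch follows the C implementation, but I just figured out a decent way to do the big-endian byte swap in the assembly itself: build a permute mask once and apply it to the state words with vperm before they are stored. --- powerpc64/p7/chacha-core-internal.asm | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-)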
diff --git a/powerpc64/p7/chacha-core-internal.asm b/powerpc64/p7/chacha-core-internal.asm index 33c721c1..76ca0d45 100644 --- a/powerpc64/p7/chacha-core-internal.asm +++ b/powerpc64/p7/chacha-core-internal.asm @@ -53,6 +53,10 @@ define(`S1', `v9') define(`S2', `v10') define(`S3', `v11')
+C Big-endian working state +define(`LE_MASK', `v12') +define(`LE_TEMP', `v13') + C QROUND(X0, X1, X2, X3) define(`QROUND', ` C x0 += x1, x3 ^= x0, x3 lrot 16 @@ -77,10 +81,18 @@ define(`QROUND', ` vrlw $2, $2, ROT7 ')
+C LE_SWAP32(X0, X1, X2, X3) +define(`LE_SWAP32', `IF_BE(` + vperm X0, X0, X0, LE_MASK + vperm X1, X1, X1, LE_MASK + vperm X2, X2, X2, LE_MASK + vperm X3, X3, X3, LE_MASK +')') + .text - .align 4 C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+define(`FUNC_ALIGN', `5') PROLOGUE(_nettle_chacha_core)
li r6, 0x10 C set up some... @@ -91,6 +103,12 @@ PROLOGUE(_nettle_chacha_core) vspltisw ROT12, 12 vspltisw ROT8, 8 vspltisw ROT7, 7 +IF_BE(` + li r9, 0 + lvsl LE_MASK, r9, r9 + vspltisb LE_TEMP, 0x03 + vxor LE_MASK, LE_MASK, LE_TEMP +')
lxvw4x VSR(X0), 0, SRC lxvw4x VSR(X1), r6, SRC @@ -131,6 +149,8 @@ PROLOGUE(_nettle_chacha_core) vadduwm X2, X2, S2 vadduwm X3, X3, S3
+ LE_SWAP32(X0, X1, X2, X3) + stxvw4x VSR(X0), 0, DST stxvw4x VSR(X1), r6, DST stxvw4x VSR(X2), r7, DST