---
 powerpc64/p7/chacha-core-internal.asm | 55 ++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/powerpc64/p7/chacha-core-internal.asm b/powerpc64/p7/chacha-core-internal.asm
index 33c721c1..922050ff 100644
--- a/powerpc64/p7/chacha-core-internal.asm
+++ b/powerpc64/p7/chacha-core-internal.asm
@@ -53,6 +53,18 @@ define(`S1', `v9')
 define(`S2', `v10')
 define(`S3', `v11')
 
+C Big-endian working state
+define(`ROT24', `v12')
+define(`ODD', `v13')
+define(`EVEN', `v14')
+define(`ZERO', `v15')
+define(`NEG', `v16')
+
+define(`XR0', `v15')
+define(`XR1', `v16')
+define(`XR2', `v17')
+define(`XR3', `v18')
+
 C QROUND(X0, X1, X2, X3)
 define(`QROUND', `
 	C x0 += x1, x3 ^= x0, x3 lrot 16
@@ -77,10 +89,42 @@ define(`QROUND', `
 	vrlw	$2, $2, ROT7
 ')
 
+C LE_SWAP32(X0, X1, X2, X3)
+define(`LE_SWAP32', `IF_BE(`
+	C xr = x lrot 8, xr &= 0x00FF00FF
+	C x = x lrot 24, x &= 0xFF00FF00
+	C x |= xr
+
+	vrlw	XR0, X0, ROT8
+	vrlw	XR1, X1, ROT8
+	vrlw	XR2, X2, ROT8
+	vrlw	XR3, X3, ROT8
+
+	vand	XR0, XR0, ODD
+	vand	XR1, XR1, ODD
+	vand	XR2, XR2, ODD
+	vand	XR3, XR3, ODD
+
+	vrlw	X0, X0, ROT24
+	vrlw	X1, X1, ROT24
+	vrlw	X2, X2, ROT24
+	vrlw	X3, X3, ROT24
+
+	vand	X0, X0, EVEN
+	vand	X1, X1, EVEN
+	vand	X2, X2, EVEN
+	vand	X3, X3, EVEN
+
+	vor	X0, X0, XR0
+	vor	X1, X1, XR1
+	vor	X2, X2, XR2
+	vor	X3, X3, XR3
+')')
+
 	.text
-	.align 4
 	C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+define(`FUNC_ALIGN', `5')
 PROLOGUE(_nettle_chacha_core)
 	li	r6, 0x10	C set up some...
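
The LE_SWAP32 macro added above byte-swaps every 32-bit lane using two rotates
and two masks rather than a permute. A minimal C sketch of the same identity;
the names rotl32 and swap32_rotmask are made up for illustration and are not
nettle identifiers:

#include <assert.h>
#include <stdint.h>

static uint32_t
rotl32(uint32_t x, unsigned n)
{
  return (x << n) | (x >> (32 - n));
}

static uint32_t
swap32_rotmask(uint32_t x)
{
  uint32_t xr = rotl32(x, 8) & 0x00FF00FF;   /* xr = x lrot 8, xr &= 0x00FF00FF */
  x = rotl32(x, 24) & 0xFF00FF00;            /* x = x lrot 24, x &= 0xFF00FF00  */
  return x | xr;                             /* x |= xr                         */
}

int
main(void)
{
  /* The two-rotate-and-mask form is a full 32-bit byte swap. */
  assert(swap32_rotmask(0x01020304) == 0x04030201);
  assert(swap32_rotmask(0xDEADBEEF) == 0xEFBEADDE);
  return 0;
}

On a big-endian build this produces the little-endian word encoding the ChaCha
output stream requires; on little-endian builds the IF_BE wrapper expands to
nothing and the macro is a no-op.
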
@@ -91,6 +135,13 @@ PROLOGUE(_nettle_chacha_core)
 	vspltisw ROT12, 12
 	vspltisw ROT8, 8
 	vspltisw ROT7, 7
+IF_BE(`
+	vspltisw ZERO, 0
+	vspltisw NEG, -1
+	vmrghb	ODD, ZERO, NEG
+	vmrghb	EVEN, NEG, ZERO
+	vadduwm	ROT24, ROT12, ROT12
+')
 
 	lxvw4x	VSR(X0), 0, SRC
 	lxvw4x	VSR(X1), r6, SRC
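
The IF_BE block above builds the constants that LE_SWAP32 needs. vspltisw only
takes a 5-bit signed immediate (-16..15), so 24 cannot be splatted directly and
the rotate count is formed as 12 + 12; the 0x00FF00FF/0xFF00FF00 masks come
from interleaving all-zero and all-ones bytes with vmrghb. A rough C model of
what those vectors end up holding, with plain byte arrays standing in for the
vector registers:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int
main(void)
{
  /* vspltisw ZERO, 0 and vspltisw NEG, -1 splat one byte value
     across the whole 16-byte register. */
  uint8_t zero[16], neg[16], odd_bytes[16], even_bytes[16];
  unsigned i;
  memset(zero, 0x00, sizeof(zero));
  memset(neg, 0xFF, sizeof(neg));

  /* vmrghb interleaves the high halves of its two inputs byte by byte:
     ODD = 00,FF,00,FF,...  EVEN = FF,00,FF,00,... */
  for (i = 0; i < 8; i++)
    {
      odd_bytes[2*i] = zero[i];
      odd_bytes[2*i + 1] = neg[i];
      even_bytes[2*i] = neg[i];
      even_bytes[2*i + 1] = zero[i];
    }

  /* Read one big-endian word lane of each mask. */
  uint32_t odd = ((uint32_t) odd_bytes[0] << 24) | ((uint32_t) odd_bytes[1] << 16)
    | ((uint32_t) odd_bytes[2] << 8) | odd_bytes[3];
  uint32_t even = ((uint32_t) even_bytes[0] << 24) | ((uint32_t) even_bytes[1] << 16)
    | ((uint32_t) even_bytes[2] << 8) | even_bytes[3];

  assert(odd == 0x00FF00FF);    /* mask used by the "vand ..., ODD" step  */
  assert(even == 0xFF00FF00);   /* mask used by the "vand ..., EVEN" step */
  assert(12 + 12 == 24);        /* vadduwm ROT24, ROT12, ROT12            */
  return 0;
}

Note that ZERO and NEG share v15/v16 with XR0/XR1; that is safe because they
are only needed here, while the masks are being built, and never again once
LE_SWAP32 runs.
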
@@ -131,6 +182,8 @@ PROLOGUE(_nettle_chacha_core)
 	vadduwm	X2, X2, S2
 	vadduwm	X3, X3, S3
 
+	LE_SWAP32(X0, X1, X2, X3)
+
 	stxvw4x	VSR(X0), 0, DST
 	stxvw4x	VSR(X1), r6, DST
 	stxvw4x	VSR(X2), r7, DST
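
The swap is applied after the saved input state has been added back in and
immediately before the stores, so dst receives the keystream words in
little-endian byte order regardless of host endianness. A hedged C sketch of
what this tail computes per word on a big-endian build; bswap32 and
store_output_bigendian are stand-ins, not nettle identifiers:

#include <stdint.h>

static uint32_t
bswap32(uint32_t v)
{
  return (v >> 24) | ((v >> 8) & 0xFF00) | ((v << 8) & 0xFF0000) | (v << 24);
}

/* x[] is the state after the rounds, s[] the saved input state. */
static void
store_output_bigendian(uint32_t *dst, const uint32_t *x, const uint32_t *s)
{
  unsigned i;
  for (i = 0; i < 16; i++)
    /* vadduwm X, X, S; LE_SWAP32(X0, X1, X2, X3); stxvw4x */
    dst[i] = bswap32(x[i] + s[i]);
}

int
main(void)
{
  uint32_t x[16] = {0}, s[16] = {0}, dst[16];
  x[0] = 1;
  s[0] = 1;
  store_output_bigendian(dst, x, s);
  /* bswap32(2) == 0x02000000; on a big-endian host those bytes sit in
     memory as 02 00 00 00, the little-endian encoding ChaCha expects. */
  return dst[0] == 0x02000000 ? 0 : 1;
}
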