diff --git a/arm/neon/chacha-3core.asm b/arm/neon/chacha-3core.asm
index bd1cf63c..65638fad 100644
--- a/arm/neon/chacha-3core.asm
+++ b/arm/neon/chacha-3core.asm
@@ -64,10 +64,10 @@ define(`T3', `q7')
 	C _chacha_3core(uint32_t *dst, const uint32_t *src, unsigned rounds)
 
 PROLOGUE(_nettle_chacha_3core)
-	vldm	SRC, {X0,X1,X2,X3}
+	vldm	SRC, {s0-s15}
 	vpush	{q4,q5,q6,q7}
 	adr	r12, .Lcount1
-	vld1.64	{Z3}, [r12]
+	vld1.32	{Z3}, [r12]
 
 	vadd.i64	Y3, X3, Z3	C Increment 64-bit counter
 	vadd.i64	Z3, Y3, Z3
@@ -213,17 +213,31 @@ PROLOGUE(_nettle_chacha_3core)
 	vadd.i32	Y3, Y3, T2
 	vadd.i32	Z3, Z3, T3
 
-	vldm	SRC, {T0,T1,T2,T3}
+	vldm	SRC, {s16-s31}
 	vadd.i32	X0, X0, T0
 	vadd.i32	X1, X1, T1
 	vadd.i32	X2, X2, T2
 	vadd.i32	X3, X3, T3
-	vstmia	DST!, {X0,X1,X2,X3}
+
+	C caller expects result little-endian
+IF_BE(`	vrev32.u8	X0, X0
+	vrev32.u8	X1, X1
+	vrev32.u8	X2, X2
+	vrev32.u8	X3, X3')
+	vstmia	DST!, {s0-s15}
 
 	vadd.i32	Y0, Y0, T0
 	vadd.i32	Y1, Y1, T1
 	vadd.i32	Y2, Y2, T2
-	vstmia	DST!, {Y0,Y1,Y2,Y3}
+
+IF_LE(`	vstmia	DST!, {Y0,Y1,Y2,Y3}')
+	C caller expects result little-endian and there's no single-word
+	C access to high q registers
+IF_BE(`	vrev32.u8	X0, Y0
+	vrev32.u8	X1, Y1
+	vrev32.u8	X2, Y2
+	vrev32.u8	X3, Y3
+	vstmia	DST!, {s0-s15}')
 
 	vadd.i32	Z0, Z0, T0
 	vadd.i32	Z1, Z1, T1
@@ -231,15 +245,20 @@ PROLOGUE(_nettle_chacha_3core)
 
 	vpop	{q4,q5,q6,q7}
 
-	vstm	DST, {Z0,Z1,Z2,Z3}
+IF_LE(`	vstm	DST, {Z0,Z1,Z2,Z3}')
+IF_BE(`	vrev32.u8	X0, Z0
+	vrev32.u8	X1, Z1
+	vrev32.u8	X2, Z2
+	vrev32.u8	X3, Z3
+	vstm	DST, {s0-s15}')
 	bx	lr
 EPILOGUE(_nettle_chacha_3core)
 
 PROLOGUE(_nettle_chacha_3core32)
-	vldm	SRC, {X0,X1,X2,X3}
+	vldm	SRC, {s0-s15}
 	vpush	{q4,q5,q6,q7}
 	adr	r12, .Lcount1
-	vld1.64	{Z3}, [r12]
+	vld1.32	{Z3}, [r12]
 
 	vadd.i32	Y3, X3, Z3	C Increment 32-bit counter
 	vadd.i32	Z3, Y3, Z3
diff --git a/arm/neon/chacha-core-internal.asm b/arm/neon/chacha-core-internal.asm
index b0a775bd..8e72c6b0 100644
--- a/arm/neon/chacha-core-internal.asm
+++ b/arm/neon/chacha-core-internal.asm
@@ -83,7 +83,7 @@ define(`QROUND', `
 	C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
 
 PROLOGUE(_nettle_chacha_core)
-	vldm	SRC, {X0,X1,X2,X3}
+	vldm	SRC, {s0-s15}
 
 	vmov	S0, X0
 	vmov	S1, X1
@@ -96,15 +96,6 @@ PROLOGUE(_nettle_chacha_core)
 	C  8  9 10 11	X2
 	C 12 13 14 15	X3
 
-	C Input rows big-endian:
-	C  1  0  3  2	X0
-	C  5  4  7  6	X1
-	C  9  8 11 10	X2
-	C 13 12 15 14	X3
-	C even and odd columns switched because
-	C vldm loads consecutive doublewords and
-	C switches words inside them to make them BE
-
 .Loop:
 	QROUND(X0, X1, X2, X3)
 
@@ -113,29 +104,17 @@ PROLOGUE(_nettle_chacha_core)
 	C  5  6  7  4  >>> 3
 	C 10 11  8  9  >>> 2
 	C 15 12 13 14  >>> 1
-
-	C In big-endian rotate rows, to get
-	C  1  0  3  2
-	C  6  5  4  7  >>> 1
-	C 11 10  9  8  >>> 2
-	C 12 15 14 13  >>> 3
-	C different number of elements needs to be
-	C extracted on BE because of different column order
-IF_LE(`	vext.32	X1, X1, X1, #1')
-IF_BE(`	vext.32	X1, X1, X1, #3')
+	vext.32	X1, X1, X1, #1
 	vext.32	X2, X2, X2, #2
-IF_LE(`	vext.32	X3, X3, X3, #3')
-IF_BE(`	vext.32	X3, X3, X3, #1')
+	vext.32	X3, X3, X3, #3
 
 	QROUND(X0, X1, X2, X3)
 
 	subs	ROUNDS, ROUNDS, #2
 	C Inverse rotation
-IF_LE(`	vext.32	X1, X1, X1, #3')
-IF_BE(`	vext.32	X1, X1, X1, #1')
+	vext.32	X1, X1, X1, #3
 	vext.32	X2, X2, X2, #2
-IF_LE(`	vext.32	X3, X3, X3, #1')
-IF_BE(`	vext.32	X3, X3, X3, #3')
+	vext.32	X3, X3, X3, #1
 
 	bhi	.Loop
 
@@ -150,7 +129,7 @@ IF_BE(`	vrev32.u8	X0, X0
 	vrev32.u8	X2, X2
 	vrev32.u8	X3, X3')
 
-	vstm	DST, {X0,X1,X2,X3}
+	vstm	DST, {s0-s15}
 	bx	lr
 EPILOGUE(_nettle_chacha_core)
 
diff --git a/arm/neon/salsa20-2core.asm b/arm/neon/salsa20-2core.asm
index d622edd6..efd2626d 100644
--- a/arm/neon/salsa20-2core.asm
+++ b/arm/neon/salsa20-2core.asm
@@ -38,18 +38,18 @@ define(`SRC', `r1')
 define(`ROUNDS', `r2')
 
 C State, even elements in X, odd elements in Y
-define(`X0', `q0')
-define(`X1', `q1')
-define(`X2', `q2')
-define(`X3', `q3')
-define(`Y0', `q8')
-define(`Y1', `q9')
-define(`Y2', `q10')
-define(`Y3', `q11')
-define(`T0', `q12')
-define(`T1', `q13')
-define(`T2', `q14')
-define(`T3', `q15')
+define(`X0', `q8')
+define(`X1', `q9')
+define(`X2', `q10')
+define(`X3', `q11')
+define(`Y0', `q12')
+define(`Y1', `q13')
+define(`Y2', `q14')
+define(`Y3', `q15')
+define(`T0', `q0')
+define(`T1', `q1')
+define(`T2', `q2')
+define(`T3', `q3')
 
 	.text
 	.align 4
@@ -58,18 +58,23 @@ define(`T3', `q15')
 	C _salsa20_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
 
 PROLOGUE(_nettle_salsa20_2core)
-	vldm	SRC, {X0,X1,X2,X3}
+IF_LE(`	vldm	SRC, {X0,X1,X2,X3}')
+IF_BE(`	vldm	SRC, {s0-s15}
+	vmov	X0, T0
+	vmov	X1, T1
+	vmov	X2, T2
+	vmov	X3, T3')
 	adr	r12, .Lcount1
 
 	vmov	Y3, X0
-	vld1.64	{Y1}, [r12]
+	vld1.32	{Y1}, [r12]
 	vmov	Y0, X1
 	vadd.i64	Y1, Y1, X2	C Increment counter
 	vmov	Y2, X3
 
 	vtrn.32	X0, Y3	C X0:  0  0  2  2  Y3:  1  1  3  3
 	vtrn.32	X1, Y0	C X1:  4  4  6  6  Y0:  5  5  7  7
-	vtrn.32	X2, Y1	C X2:  8  8 10 10  Y1:  9  9  1  1
+	vtrn.32	X2, Y1	C X2:  8  8 10 10  Y1:  9  9 11 11
 	vtrn.32	X3, Y2	C X3: 12 12 14 14  Y2: 13 13 15 15
 
 	C Swap, to get
@@ -180,7 +185,7 @@ C Inverse swaps and transpositions
 	vswp	D1REG(Y0), D1REG(Y2)
 	vswp	D1REG(Y1), D1REG(Y3)
 
-	vldm	SRC, {T0,T1,T2,T3}
+	vldm	SRC, {s0-s15}
 
 	vtrn.32	X0, Y3
 	vtrn.32	X1, Y0
@@ -193,14 +198,24 @@ C Add in the original context
 	vadd.i32	X2, X2, T2
 	vadd.i32	X3, X3, T3
 
-	vstmia	DST!, {X0,X1,X2,X3}
-	vld1.64	{X0}, [r12]
+IF_LE(`	vstmia	DST!, {X0,X1,X2,X3}')
+IF_BE(`	vrev32.u8	T0, X0
+	vrev32.u8	T1, X1
+	vrev32.u8	T2, X2
+	vrev32.u8	T3, X3
+	vstmia	DST!, {s0-s15}')
+	vld1.32	{X0}, [r12]
+IF_BE(`	vldm	SRC, {s0-s15}')
 	vadd.i32	T0, T0, Y3
 	vadd.i64	T2, T2, X0
 	vadd.i32	T1, T1, Y0
 	vadd.i32	T2, T2, Y1
 	vadd.i32	T3, T3, Y2
-	vstm	DST, {T0,T1,T2,T3}
+IF_BE(`	vrev32.u8	T0, T0
+	vrev32.u8	T1, T1
+	vrev32.u8	T2, T2
+	vrev32.u8	T3, T3')
+	vstm	DST, {s0-s15}
 	bx	lr
 EPILOGUE(_nettle_salsa20_2core)
 
diff --git a/arm/neon/salsa20-core-internal.asm b/arm/neon/salsa20-core-internal.asm
index d59d7b80..5f8ed15c 100644
--- a/arm/neon/salsa20-core-internal.asm
+++ b/arm/neon/salsa20-core-internal.asm
@@ -86,7 +86,18 @@ define(`QROUND', `
 	C _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
 
 PROLOGUE(_nettle_salsa20_core)
-	vldm	SRC, {X0,X1,X2,X3}
+	C FIXME: Construct in some other way?
+	adr	r12, .Lmasks
+IF_LE(`	vldm	r12, {M0101, M0110, M0011}')
+	C we need to load single words to avoid word-swapping by
+	C d-register loads and have no single word access to high q
+	C registers here
+IF_BE(`	vldm	r12, {s0-s11}
+	vmov	M0101, X0
+	vmov	M0110, X1
+	vmov	M0011, X2')
+
+	vldm	SRC, {s0-s15}
 
 	C Input rows little-endian:
 	C  0  1  2  3	X0
@@ -99,24 +110,6 @@ PROLOGUE(_nettle_salsa20_core)
 	C  8 13  2  7
 	C 12  1  6 11
 
-	C Input rows big-endian:
-	C  1  0  3  2	X0
-	C  5  4  7  6	X1
-	C  9  8 11 10	X2
-	C 13 12 15 14	X3
-	C even and odd columns switched because
-	C vldm loads consecutive doublewords and
-	C switches words inside them to make them BE
-	C Permuted to:
-	C  5  0 15 10
-	C  9  4  3 14
-	C 13  8  7  2
-	C  1 12 11  6
-
-	C FIXME: Construct in some other way?
-	adr	r12, .Lmasks
-	vldm	r12, {M0101, M0110, M0011}
-
 	vmov	S1, X1
 	vmov	S2, X2
 	vmov	S3, X3
@@ -160,29 +153,17 @@ PROLOGUE(_nettle_salsa20_core)
 	C  3  4  9 14  >>> 1
 	C  2  7  8 13  >>> 2
 	C  1  6 11 12  >>> 3
-
-	C In big-endian rotate rows, to get
-	C  5  0 15 10
-	C  4  3 14  9  >>> 3
-	C  7  2 13  8  >>> 2
-	C  6  1 12 11  >>> 1
-	C different number of elements needs to be
-	C extracted on BE because of different column order
-IF_LE(`	vext.32	X1, X1, X1, #3')
-IF_BE(`	vext.32	X1, X1, X1, #1')
+	vext.32	X1, X1, X1, #3
 	vext.32	X2, X2, X2, #2
-IF_LE(`	vext.32	X3, X3, X3, #1')
-IF_BE(`	vext.32	X3, X3, X3, #3')
+	vext.32	X3, X3, X3, #1
 
 	QROUND(X0, X3, X2, X1)
 
 	subs	ROUNDS, ROUNDS, #2
 	C Inverse rotation
-IF_LE(`	vext.32	X1, X1, X1, #1')
-IF_BE(`	vext.32	X1, X1, X1, #3')
+	vext.32	X1, X1, X1, #1
 	vext.32	X2, X2, X2, #2
-IF_LE(`	vext.32	X3, X3, X3, #3')
-IF_BE(`	vext.32	X3, X3, X3, #1')
+	vext.32	X3, X3, X3, #3
 
 	bhi	.Loop
 
@@ -202,7 +183,7 @@ IF_BE(`	vext.32	X3, X3, X3, #1')
 	vbit	X2, X3, M0101
 	vbit	X3, T1, M0101
 
-	vld1.64	{T0}, [SRC]
+	vld1.32	{T0}, [SRC]
 	vadd.u32	X0, X0, T0
 	vadd.u32	X1, X1, S1
 	vadd.u32	X2, X2, S2
@@ -214,7 +195,7 @@ IF_BE(`	vrev32.u8	X0, X0
 	vrev32.u8	X2, X2
 	vrev32.u8	X3, X3')
 
-	vstm	DST, {X0,X1,X2,X3}
+	vstm	DST, {s0-s15}
 	bx	lr
 EPILOGUE(_nettle_salsa20_core)
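
For reference only, and not part of the patch: the IF_BE vrev32.u8 fix-ups added before each store byte-reverse every 32-bit lane, so the words written to the destination buffer come out little-endian as the callers expect regardless of host byte order. A minimal C model of that step is sketched below; the helper name le_fixup_words and the tiny test in main are hypothetical, invented for this note.

#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>

/* Byte-reverse each 32-bit word, the same transformation vrev32.u8
   applies per NEON lane. */
static void
le_fixup_words (uint32_t *x, size_t n)
{
  for (size_t i = 0; i < n; i++)
    x[i] = ((x[i] & 0xff000000u) >> 24)
	 | ((x[i] & 0x00ff0000u) >>  8)
	 | ((x[i] & 0x0000ff00u) <<  8)
	 | ((x[i] & 0x000000ffu) << 24);
}

int
main (void)
{
  uint32_t block[16] = { 0x61707865 };	/* first ChaCha constant, "expa" */
  le_fixup_words (block, 16);		/* what a big-endian build must do before storing */
  printf ("%08" PRIx32 "\n", block[0]);	/* prints 65787061 */
  return 0;
}

The other half of the change, loading and storing through {s0-s15}/{s16-s31} instead of whole q registers, side-steps the per-doubleword word swap that vldm performs on big-endian (described in the comments the patch removes), which is why the IF_BE vext offsets inside the round loops are no longer needed.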