diff --git a/arm/neon/chacha-3core.asm b/arm/neon/chacha-3core.asm
index bd1cf63c..14a06a7a 100644
--- a/arm/neon/chacha-3core.asm
+++ b/arm/neon/chacha-3core.asm
@@ -69,8 +69,17 @@ PROLOGUE(_nettle_chacha_3core)
 	adr	r12, .Lcount1
 	vld1.64	{Z3}, [r12]
 
+	C The two words of the state treated as 64-bit counter here appear
+	C reversed to a big-endian machine and need to be switched before
+	C adding to it. Results need to be reverted back so the rest of the
+	C 32-bit operations can be applied as before.
+IF_BE(`	vrev64.u32	Z3, Z3
+	vrev64.u32	X3, X3')
 	vadd.i64	Y3, X3, Z3	C Increment 64-bit counter
 	vadd.i64	Z3, Y3, Z3
+IF_BE(`	vrev64.u32	Z3, Z3
+	vrev64.u32	X3, X3
+	vrev64.u32	Y3, Y3')
 
 .Lshared_entry:
 	vmov	Y0, X0
@@ -122,33 +131,39 @@ PROLOGUE(_nettle_chacha_3core)
 
 	vadd.i32	X2, X2, X3
 	vsri.u32	Y3, T0, #24
-	vext.32	X3, X3, X3, #3
+IF_LE(`	vext.32	X3, X3, X3, #3')
+IF_BE(`	vext.32	X3, X3, X3, #1')
 	vshl.i32	Z3, T1, #8
 	veor	T0, X1, X2
 	vadd.i32	Y2, Y2, Y3
 	vsri.u32	Z3, T1, #24
-	vext.32	Y3, Y3, Y3, #3
+IF_LE(`	vext.32	Y3, Y3, Y3, #3')
+IF_BE(`	vext.32	Y3, Y3, Y3, #1')
 	vshl.i32	X1, T0, #7
 	veor	T1, Y1, Y2
 	vadd.i32	Z2, Z2, Z3
 	vsri.u32	X1, T0, #25
 	vshl.i32	Y1, T1, #7
 	veor	T0, Z1, Z2
-	vext.32	X1, X1, X1, #1
+IF_LE(`	vext.32	X1, X1, X1, #1')
+IF_BE(`	vext.32	X1, X1, X1, #3')
 	vsri.u32	Y1, T1, #25
 	vshl.i32	Z1, T0, #7
 	vext.32	Y2, Y2, Y2, #2
-	vext.32	Y1, Y1, Y1, #1
+IF_LE(`	vext.32	Y1, Y1, Y1, #1')
+IF_BE(`	vext.32	Y1, Y1, Y1, #3')
 	vsri.u32	Z1, T0, #25
 	vext.32	X2, X2, X2, #2
 
 	C Second QROUND
 	vadd.i32	X0, X0, X1
 	vext.32	Z2, Z2, Z2, #2
-	vext.32	Z1, Z1, Z1, #1
+IF_LE(`	vext.32	Z1, Z1, Z1, #1')
+IF_BE(`	vext.32	Z1, Z1, Z1, #3')
 	veor	X3, X3, X0
 	vadd.i32	Y0, Y0, Y1
-	vext.32	Z3, Z3, Z3, #3
+IF_LE(`	vext.32	Z3, Z3, Z3, #3')
+IF_BE(`	vext.32	Z3, Z3, Z3, #1')
 	vrev32.16	X3, X3	C lrot 16
 	veor	Y3, Y3, Y0
 	vadd.i32	Z0, Z0, Z1
@@ -181,31 +196,37 @@ PROLOGUE(_nettle_chacha_3core)
 
 	vadd.i32	X2, X2, X3
 	vsri.u32	Y3, T0, #24
-	vext.32	X3, X3, X3, #1
+IF_LE(`	vext.32	X3, X3, X3, #1')
+IF_BE(`	vext.32	X3, X3, X3, #3')
 	vshl.i32	Z3, T1, #8
 	veor	T0, X1, X2
 	vext.32	X2, X2, X2, #2
 	vadd.i32	Y2, Y2, Y3
-	vext.32	Y3, Y3, Y3, #1
+IF_LE(`	vext.32	Y3, Y3, Y3, #1')
+IF_BE(`	vext.32	Y3, Y3, Y3, #3')
 	vsri.u32	Z3, T1, #24
 	vshl.i32	X1, T0, #7
 	veor	T1, Y1, Y2
 	vext.32	Y2, Y2, Y2, #2
 	vadd.i32	Z2, Z2, Z3
-	vext.32	Z3, Z3, Z3, #1
+IF_LE(`	vext.32	Z3, Z3, Z3, #1')
+IF_BE(`	vext.32	Z3, Z3, Z3, #3')
 	vsri.u32	X1, T0, #25
 	vshl.i32	Y1, T1, #7
 	veor	T0, Z1, Z2
 	vext.32	Z2, Z2, Z2, #2
-	vext.32	X1, X1, X1, #3
+IF_LE(`	vext.32	X1, X1, X1, #3')
+IF_BE(`	vext.32	X1, X1, X1, #1')
 	vsri.u32	Y1, T1, #25
 	vshl.i32	Z1, T0, #7
-	vext.32	Y1, Y1, Y1, #3
+IF_LE(`	vext.32	Y1, Y1, Y1, #3')
+IF_BE(`	vext.32	Y1, Y1, Y1, #1')
 	vsri.u32	Z1, T0, #25
 	subs	ROUNDS, ROUNDS, #2
-	vext.32	Z1, Z1, Z1, #3
+IF_LE(`	vext.32	Z1, Z1, Z1, #3')
+IF_BE(`	vext.32	Z1, Z1, Z1, #1')
 	bhi	.Loop
 
@@ -218,11 +239,23 @@ PROLOGUE(_nettle_chacha_3core)
 	vadd.i32	X1, X1, T1
 	vadd.i32	X2, X2, T2
 	vadd.i32	X3, X3, T3
+
+	C caller expects result little-endian
+IF_BE(`	vrev32.u8	X0, X0
+	vrev32.u8	X1, X1
+	vrev32.u8	X2, X2
+	vrev32.u8	X3, X3')
 	vstmia	DST!, {X0,X1,X2,X3}
 
 	vadd.i32	Y0, Y0, T0
 	vadd.i32	Y1, Y1, T1
 	vadd.i32	Y2, Y2, T2
+
+	C caller expects result little-endian
+IF_BE(`	vrev32.u8	Y0, Y0
+	vrev32.u8	Y1, Y1
+	vrev32.u8	Y2, Y2
+	vrev32.u8	Y3, Y3')
 	vstmia	DST!, {Y0,Y1,Y2,Y3}
 
 	vadd.i32	Z0, Z0, T0
@@ -231,6 +264,11 @@ PROLOGUE(_nettle_chacha_3core)
 
 	vpop	{q4,q5,q6,q7}
 
+	C caller expects result little-endian
+IF_BE(`	vrev32.u8	Z0, Z0
+	vrev32.u8	Z1, Z1
+	vrev32.u8	Z2, Z2
+	vrev32.u8	Z3, Z3')
 	vstm	DST, {Z0,Z1,Z2,Z3}
 	bx	lr
 EPILOGUE(_nettle_chacha_3core)
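
For readers less used to the NEON lane semantics, the following scalar C sketch (not part of the patch; helper names are made up for illustration) shows the two fixups the patch adds for big-endian builds: the vrev64.u32 pair that swaps the two 32-bit counter words around the 64-bit add, and the vrev32.u8 byte reversal that converts each output word to the little-endian byte order the caller expects.

/* Scalar sketch of the big-endian fixups above; illustrative only, not the
   Nettle code.  Helper names are invented for this example. */
#include <stdint.h>
#include <stdio.h>

/* Analogue of vrev32.u8: reverse the bytes within one 32-bit word, so a
   word held in native big-endian order is emitted little-endian. */
static uint32_t rev32_u8(uint32_t w)
{
  return (w >> 24) | ((w >> 8) & 0x0000ff00u)
       | ((w << 8) & 0x00ff0000u) | (w << 24);
}

/* The ChaCha block counter is two 32-bit state words, low word first.
   vadd.i64 can bump both blocks' counters in one go only when the two
   words pair up as low:high inside the 64-bit lane; on big-endian they
   pair up the other way round, hence the vrev64.u32 before and after the
   add.  Here the pairing is simply done explicitly. */
static void counter_add(uint32_t words[2], uint64_t inc)
{
  uint64_t c = ((uint64_t) words[1] << 32) | words[0];
  c += inc;
  words[0] = (uint32_t) c;
  words[1] = (uint32_t) (c >> 32);
}

int main(void)
{
  uint32_t ctr[2] = { 0xffffffffu, 0 };   /* low word about to wrap */
  counter_add(ctr, 1);
  printf("counter high=%u low=%u\n", ctr[1], ctr[0]);    /* high=1 low=0 */
  printf("0x01020304 -> 0x%08x\n", rev32_u8(0x01020304u)); /* 0x04030201 */
  return 0;
}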