diff --git a/arm/neon/salsa20-2core.asm b/arm/neon/salsa20-2core.asm index d622edd6..d247800b 100644 --- a/arm/neon/salsa20-2core.asm +++ b/arm/neon/salsa20-2core.asm @@ -64,13 +64,22 @@ PROLOGUE(_nettle_salsa20_2core) vmov Y3, X0 vld1.64 {Y1}, [r12] vmov Y0, X1 +IF_BE(` vrev64.u32 Y1, Y1 + vrev64.u32 X2, X2') vadd.i64 Y1, Y1, X2 C Increment counter +IF_BE(` vrev64.u32 Y1, Y1 + vrev64.u32 X2, X2') vmov Y2, X3 vtrn.32 X0, Y3 C X0: 0 0 2 2 Y3: 1 1 3 3 vtrn.32 X1, Y0 C X1: 4 4 6 6 Y0: 5 5 7 7 - vtrn.32 X2, Y1 C X2: 8 8 10 10 Y1: 9 9 1 1 + vtrn.32 X2, Y1 C X2: 8 8 10 10 Y1: 9 9 11 11 vtrn.32 X3, Y2 C X3: 12 12 14 14 Y2: 13 13 15 15 + C BE: + C X0: 3 3 1 1 Y3: 2 2 0 0 + C X1: 7 7 5 5 Y0: 6 6 4 4 + C X2: 11 11 9 9 Y1: 10 10 8 8 + C X3: 15 15 13 13 Y2: 14 14 12 12 C Swap, to get C X0: 0 10 Y0: 5 15 @@ -82,6 +91,12 @@ PROLOGUE(_nettle_salsa20_2core) vswp D1REG(Y0), D1REG(Y2) vswp D1REG(Y1), D1REG(Y3) + C BE: + C X0: 11 1 Y0: 14 4 + C X1: 15 5 Y1: 2 8 + C X2: 3 9 Y2: 6 12 + C X3: 7 13 Y3: 10 0 + .Loop: C Register layout (A is first block, B is second block) C @@ -196,7 +211,11 @@ C Add in the original context vstmia DST!, {X0,X1,X2,X3} vld1.64 {X0}, [r12] vadd.i32 T0, T0, Y3 +IF_BE(` vrev64.u32 X0, X0 + vrev64.u32 T2, T2') vadd.i64 T2, T2, X0 +IF_BE(` vrev64.u32 X0, X0 + vrev64.u32 T2, T2') vadd.i32 T1, T1, Y0 vadd.i32 T2, T2, Y1 vadd.i32 T3, T3, Y2