From d487ab6b942407671b5d5b02f0d61ef493af214c Mon Sep 17 00:00:00 2001
From: Michael Weiser <michael.weiser@gmx.de>
Date: Fri, 25 Dec 2020 17:13:52 +0100
Subject: [PATCH] arm: Unify neon asm for big- and little-endian modes

Switch the ARM NEON assembler routines to endianness-agnostic loads and
stores where possible to avoid modifications to the rest of the code.
This involves switching to vld1.32 for loading consecutive 32-bit words
in host endianness, as well as vst1.8 for storing back to memory in
little-endian order as required by the caller. Where necessary, r3 is
used to hold the precalculated address of the second half of the source
vector for the secondary load operations. vstm is kept for little-endian
platforms because it is faster than vst1 on most ARM implementations.
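
As a simplified sketch (register choice assumed for illustration, not
the literal code from the routines), the pattern is:

	vld1.32	{q0,q1}, [SRC]		C host-endian load of 32-bit words
	...				C calculations
IF_BE(`	vst1.8	{q0,q1}, [DST]!')	C byte store gives little-endian
IF_LE(`	vstmia	DST!, {q0,q1}')		C vstm, faster on little-endian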

vst1.x (at least on the Allwinner A20 Cortex-A7 implementation) seems to
interfere with itself on subsequent calls, slowing it down further. So we
reschedule some instructions to do stores as soon as results become
available, placing other calculations or loads between consecutive
vst1.x instructions. This reliably saves two additional cycles per block
on salsa20 and chacha which would otherwise be incurred.
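
A simplified illustration of the interleaving (register assignments made
up for the example, not taken verbatim from the diff below):

	vadd.i32	q0, q0, q8
	vadd.i32	q1, q1, q9
IF_BE(`	vst1.8	{q0,q1}, [DST]!')	C store as soon as available

	vld1.32	{q10,q11}, [SRCp32]	C load and adds separate this ...
	vadd.i32	q2, q2, q10
	vadd.i32	q3, q3, q11
IF_BE(`	vst1.8	{q2,q3}, [DST]!')	C ... from the previous vst1.8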

vld1.x does not seem to suffer from this, or at least not to a level
where two consecutive vld1.x run slower than an equivalent vldm.
Rescheduling them similarly did not improve performance beyond that of
vldm.

Signed-off-by: Michael Weiser <michael.weiser@gmx.de>
---
 arm/README                         | 22 ++++++++++-
 arm/neon/chacha-3core.asm          | 36 ++++++++++++++----
 arm/neon/chacha-core-internal.asm  | 47 +++++++-----------------
 arm/neon/salsa20-2core.asm         | 28 ++++++++++----
 arm/neon/salsa20-core-internal.asm | 59 ++++++++++--------------------
 5 files changed, 103 insertions(+), 89 deletions(-)

diff --git a/arm/README b/arm/README
index 1ba54e0d..03149002 100644
--- a/arm/README
+++ b/arm/README
@@ -70,12 +70,32 @@ If data is to be processed with bit operations only, endianness can be ignored
 because byte-swapping on load and store will cancel each other out. Shifts
 however have to be inverted. See arm/memxor.asm for an example.
 
-3. vld1.8
+3. v{ld,st}1.{8,32}
 
 NEON's vld instruction can be used to produce endianness-neutral code. vld1.8
 will load a byte sequence into a register regardless of memory endianness. This
 can be used to process byte sequences. See arm/neon/umac-nh.asm for example.
 
+In the same fashion, vst1.8 can be used to do a little-endian store. See the
+arm/neon salsa20 and chacha routines for examples.
+
+NOTE: vst1.x (at least on the Allwinner A20 Cortex-A7 implementation) seems to
+interfere with itself on subsequent calls, slowing it down. This can be avoided
+by putting calculations or loads between two vst1.x stores.
+
+Similarly, vld1.32 is used in the chacha and salsa20 routines where 32-bit
+operands are stored in host endianness in RAM but need to be loaded sequentially
+without the word swapping vldm would introduce on big-endian systems. Consecutive
+vld1.x instructions do not seem to suffer from the same slowdown as vst1.x.
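+
+A rough sketch of the combined pattern (illustrative only, not the exact
+code from those files):
+
+	vld1.32	{q0,q1}, [r1]		C load host-endian 32-bit words
+	...				C calculations
+IF_BE(`	vst1.8	{q0,q1}, [r0]!')	C store little-endian on BE
+IF_LE(`	vstmia	r0!, {q0,q1}')		C vstm is faster on LE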
+
 4. vldm/vstm
 
 Care has to be taken when using vldm/vstm because they have two non-obvious
diff --git a/arm/neon/chacha-3core.asm b/arm/neon/chacha-3core.asm
index bd1cf63c..c29c62a5 100644
--- a/arm/neon/chacha-3core.asm
+++ b/arm/neon/chacha-3core.asm
@@ -36,6 +36,7 @@ ifelse(`
 define(`DST', `r0')
 define(`SRC', `r1')
 define(`ROUNDS', `r2')
+define(`SRCp32', `r3')
 
 C State, X, Y and Z representing consecutive blocks
 define(`X0', `q0')
@@ -64,10 +65,13 @@ define(`T3', `q7')
 	C _chacha_3core(uint32_t *dst, const uint32_t *src, unsigned rounds)
 
 PROLOGUE(_nettle_chacha_3core)
-	vldm	SRC, {X0,X1,X2,X3}
+	C Loads use vld1.32 to be endianness-neutral wrt consecutive 32-bit words
+	add	SRCp32, SRC, #32
+	vld1.32	{X0,X1}, [SRC]
+	vld1.32	{X2,X3}, [SRCp32]
 	vpush	{q4,q5,q6,q7}
 	adr	r12, .Lcount1
-	vld1.64 {Z3}, [r12]
+	vld1.32 {Z3}, [r12]
 
 	vadd.i64	Y3, X3, Z3	C Increment 64-bit counter
 	vadd.i64	Z3, Y3, Z3
@@ -213,33 +217,49 @@ PROLOGUE(_nettle_chacha_3core)
 	vadd.i32	Y3, Y3, T2
 	vadd.i32	Z3, Z3, T3
 
-	vldm	SRC, {T0,T1,T2,T3}
+	vld1.32	{T0,T1}, [SRC]
 	vadd.i32	X0, X0, T0
 	vadd.i32	X1, X1, T1
+
+	C vst1.8 because caller expects results little-endian
+	C interleave loads, calculations and stores to save cycles on stores
+	C use vstm when little-endian for some additional speedup
+IF_BE(`	vst1.8	{X0,X1}, [DST]!')
+
+	vld1.32	{T2,T3}, [SRCp32]
 	vadd.i32	X2, X2, T2
 	vadd.i32	X3, X3, T3
-	vstmia	DST!, {X0,X1,X2,X3}
+IF_BE(`	vst1.8	{X2,X3}, [DST]!')
+IF_LE(`	vstmia	DST!, {X0,X1,X2,X3}')
 
 	vadd.i32	Y0, Y0, T0
 	vadd.i32	Y1, Y1, T1
+IF_BE(`	vst1.8	{Y0,Y1}, [DST]!')
+
 	vadd.i32	Y2, Y2, T2
-	vstmia	DST!, {Y0,Y1,Y2,Y3}
+IF_BE(`	vst1.8	{Y2,Y3}, [DST]!')
+IF_LE(`	vstmia	DST!, {Y0,Y1,Y2,Y3}')
 
 	vadd.i32	Z0, Z0, T0
 	vadd.i32	Z1, Z1, T1
+IF_BE(`	vst1.8	{Z0,Z1}, [DST]!')
+
 	vadd.i32	Z2, Z2, T2
 
 	vpop	{q4,q5,q6,q7}
 
-	vstm	DST, {Z0,Z1,Z2,Z3}
+IF_BE(`	vst1.8	{Z2,Z3}, [DST]')
+IF_LE(`	vstm	DST, {Z0,Z1,Z2,Z3}')
 	bx	lr
 EPILOGUE(_nettle_chacha_3core)
 
 PROLOGUE(_nettle_chacha_3core32)
-	vldm	SRC, {X0,X1,X2,X3}
+	add	SRCp32, SRC, #32
+	vld1.32	{X0,X1}, [SRC]
+	vld1.32	{X2,X3}, [SRCp32]
 	vpush	{q4,q5,q6,q7}
 	adr	r12, .Lcount1
-	vld1.64 {Z3}, [r12]
+	vld1.32 {Z3}, [r12]
 
 	vadd.i32	Y3, X3, Z3	C Increment 32-bit counter
 	vadd.i32	Z3, Y3, Z3
diff --git a/arm/neon/chacha-core-internal.asm b/arm/neon/chacha-core-internal.asm
index b0a775bd..5095be6a 100644
--- a/arm/neon/chacha-core-internal.asm
+++ b/arm/neon/chacha-core-internal.asm
@@ -83,7 +83,9 @@ define(`QROUND', `
 	C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
 
 PROLOGUE(_nettle_chacha_core)
-	vldm	SRC, {X0,X1,X2,X3}
+	C Loads use vld1.32 to be endianness-neutral wrt consecutive 32-bit words
+	vld1.32	{X0,X1}, [SRC]!		C SRC changed!
+	vld1.32	{X2,X3}, [SRC]
 
 	vmov	S0, X0
 	vmov	S1, X1
@@ -96,15 +98,6 @@ PROLOGUE(_nettle_chacha_core)
 	C	 8  9 10 11	X2
 	C	12 13 14 15	X3
 
-	C Input rows big-endian:
-	C	 1  0  3  2	X0
-	C	 5  4  7  6	X1
-	C	 9  8 11 10	X2
-	C	13 12 15 14	X3
-	C even and odd columns switched because
-	C vldm loads consecutive doublewords and
-	C switches words inside them to make them BE
-
 .Loop:
 	QROUND(X0, X1, X2, X3)
 
@@ -113,44 +106,32 @@ PROLOGUE(_nettle_chacha_core)
 	C	 5  6  7  4  >>> 3
 	C	10 11  8  9  >>> 2
 	C	15 12 13 14  >>> 1
-
-	C In big-endian rotate rows, to get
-	C	 1  0  3  2
-	C	 6  5  4  7  >>> 1
-	C	11 10  9  8  >>> 2
-	C	12 15 14 13  >>> 3
-	C different number of elements needs to be
-	C extracted on BE because of different column order
-IF_LE(`	vext.32	X1, X1, X1, #1')
-IF_BE(`	vext.32	X1, X1, X1, #3')
+	vext.32	X1, X1, X1, #1
 	vext.32	X2, X2, X2, #2
-IF_LE(`	vext.32	X3, X3, X3, #3')
-IF_BE(`	vext.32	X3, X3, X3, #1')
+	vext.32	X3, X3, X3, #3
 
 	QROUND(X0, X1, X2, X3)
 
 	subs	ROUNDS, ROUNDS, #2
 	C Inverse rotation
-IF_LE(`	vext.32	X1, X1, X1, #3')
-IF_BE(`	vext.32	X1, X1, X1, #1')
+	vext.32	X1, X1, X1, #3
 	vext.32	X2, X2, X2, #2
-IF_LE(`	vext.32	X3, X3, X3, #1')
-IF_BE(`	vext.32	X3, X3, X3, #3')
+	vext.32	X3, X3, X3, #1
 
 	bhi	.Loop
 
 	vadd.u32	X0, X0, S0
 	vadd.u32	X1, X1, S1
+
+	C vst1.8 because caller expects results little-endian
+	C use vstm when little-endian for some additional speedup
+IF_BE(`	vst1.8	{X0,X1}, [DST]!')
+
 	vadd.u32	X2, X2, S2
 	vadd.u32	X3, X3, S3
 
-	C caller expects result little-endian
-IF_BE(`	vrev32.u8	X0, X0
-	vrev32.u8	X1, X1
-	vrev32.u8	X2, X2
-	vrev32.u8	X3, X3')
-
-	vstm	DST, {X0,X1,X2,X3}
+IF_BE(`	vst1.8	{X2,X3}, [DST]')
+IF_LE(`	vstm	DST, {X0,X1,X2,X3}')
 	bx	lr
 EPILOGUE(_nettle_chacha_core)
 
diff --git a/arm/neon/salsa20-2core.asm b/arm/neon/salsa20-2core.asm
index b3fe7e94..4d9da79b 100644
--- a/arm/neon/salsa20-2core.asm
+++ b/arm/neon/salsa20-2core.asm
@@ -36,6 +36,7 @@ ifelse(`
 define(`DST', `r0')
 define(`SRC', `r1')
 define(`ROUNDS', `r2')
+define(`SRCp32', `r3')
 
 C State, even elements in X, odd elements in Y
 define(`X0', `q0')
@@ -58,11 +59,14 @@ define(`T3', `q15')
 
 	C _salsa20_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
 PROLOGUE(_nettle_salsa20_2core)
-	vldm	SRC, {X0,X1,X2,X3}
+	C Loads use vld1.32 to be endianness-neutral wrt consecutive 32-bit words
+	add	SRCp32, SRC, #32
+	vld1.32	{X0,X1}, [SRC]
+	vld1.32	{X2,X3}, [SRCp32]
 	adr	r12, .Lcount1
 
 	vmov	Y3, X0
-	vld1.64 {Y1}, [r12]
+	vld1.32 {Y1}, [r12]
 	vmov	Y0, X1
 	vadd.i64 Y1, Y1, X2	C Increment counter
 	vmov	Y2, X3
@@ -180,7 +184,8 @@ C Inverse swaps and transpositions
 	vswp	D1REG(Y0), D1REG(Y2)
 	vswp	D1REG(Y1), D1REG(Y3)
 
-	vldm	SRC, {T0,T1,T2,T3}
+	vld1.32	{T0,T1}, [SRC]
+	vld1.32	{T2,T3}, [SRCp32]
 
 	vtrn.32	X0, Y3
 	vtrn.32	X1, Y0
@@ -190,17 +195,26 @@ C Inverse swaps and transpositions
 C Add in the original context
 	vadd.i32	X0, X0, T0
 	vadd.i32	X1, X1, T1
+
+C vst1.8 because caller expects results little-endian
+C interleave loads, calculations and stores to save cycles on stores
+C use vstm when little-endian for some additional speedup
+IF_BE(`	vst1.8	{X0,X1}, [DST]!')
+
 	vadd.i32	X2, X2, T2
 	vadd.i32	X3, X3, T3
+IF_BE(`	vst1.8	{X2,X3}, [DST]!')
+IF_LE(`	vstmia	DST!, {X0,X1,X2,X3}')
 
-	vstmia	DST!, {X0,X1,X2,X3}
-	vld1.64 {X0}, [r12]
+	vld1.32 {X0}, [r12]
 	vadd.i32	T0, T0, Y3
 	vadd.i64	T2, T2, X0
 	vadd.i32	T1, T1, Y0
+IF_BE(`	vst1.8	{T0,T1}, [DST]!')
+
 	vadd.i32	T2, T2, Y1
 	vadd.i32	T3, T3, Y2
-
-	vstm	DST, {T0,T1,T2,T3}
+IF_BE(`	vst1.8	{T2,T3}, [DST]')
+IF_LE(`	vstm	DST, {T0,T1,T2,T3}')
 	bx	lr
 EPILOGUE(_nettle_salsa20_2core)
diff --git a/arm/neon/salsa20-core-internal.asm b/arm/neon/salsa20-core-internal.asm
index d59d7b80..c5785da4 100644
--- a/arm/neon/salsa20-core-internal.asm
+++ b/arm/neon/salsa20-core-internal.asm
@@ -36,6 +36,7 @@ ifelse(`
 define(`DST', `r0')
 define(`SRC', `r1')
 define(`ROUNDS', `r2')
+define(`SRCp32', `r3')
 
 define(`X0', `q0')
 define(`X1', `q1')
@@ -86,7 +87,10 @@ define(`QROUND', `
 	C _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
 
 PROLOGUE(_nettle_salsa20_core)
-	vldm	SRC, {X0,X1,X2,X3}
+	C Loads use vld1.32 to be endianness-neutral wrt consecutive 32-bit words
+	add	SRCp32, SRC, #32
+	vld1.32	{X0,X1}, [SRC]
+	vld1.32	{X2,X3}, [SRCp32]
 
 	C Input rows little-endian:
 	C	 0  1  2  3	X0
@@ -99,23 +103,10 @@ PROLOGUE(_nettle_salsa20_core)
 	C	 8 13  2  7
 	C	12  1  6 11
 
-	C Input rows big-endian:
-	C	 1  0  3  2	X0
-	C	 5  4  7  6	X1
-	C	 9  8 11 10	X2
-	C	13 12 15 14	X3
-	C even and odd columns switched because
-	C vldm loads consecutive doublewords and
-	C switches words inside them to make them BE
-	C Permuted to:
-	C	 5  0 15 10
-	C	 9  4  3 14
-	C	13  8  7  2
-	C	 1 12 11  6
-
 	C FIXME: Construct in some other way?
 	adr	r12, .Lmasks
-	vldm	r12, {M0101, M0110, M0011}
+	vld1.32	{M0101, M0110}, [r12]!
+	vld1.32	{M0011}, [r12]
 
 	vmov	S1, X1
 	vmov	S2, X2
@@ -160,29 +151,17 @@ PROLOGUE(_nettle_salsa20_core)
 	C	 3  4  9 14  >>> 1
 	C	 2  7  8 13  >>> 2
 	C	 1  6 11 12  >>> 3
-
-	C In big-endian rotate rows, to get
-	C	 5  0 15 10
-	C	 4  3 14  9  >>> 3
-	C	 7  2 13  8  >>> 2
-	C	 6  1 12 11  >>> 1
-	C different number of elements needs to be
-	C extracted on BE because of different column order
-IF_LE(`	vext.32	X1, X1, X1, #3')
-IF_BE(`	vext.32	X1, X1, X1, #1')
+	vext.32	X1, X1, X1, #3
 	vext.32	X2, X2, X2, #2
-IF_LE(`	vext.32	X3, X3, X3, #1')
-IF_BE(`	vext.32	X3, X3, X3, #3')
+	vext.32	X3, X3, X3, #1
 
 	QROUND(X0, X3, X2, X1)
 
 	subs	ROUNDS, ROUNDS, #2
 	C Inverse rotation
-IF_LE(`	vext.32	X1, X1, X1, #1')
-IF_BE(`	vext.32	X1, X1, X1, #3')
+	vext.32	X1, X1, X1, #1
 	vext.32	X2, X2, X2, #2
-IF_LE(`	vext.32	X3, X3, X3, #3')
-IF_BE(`	vext.32	X3, X3, X3, #1')
+	vext.32	X3, X3, X3, #3
 
 	bhi	.Loop
 
@@ -202,19 +181,19 @@ IF_BE(`	vext.32	X3, X3, X3, #1')
 	vbit	X2, X3, M0101
 	vbit	X3, T1, M0101
 
-	vld1.64	{T0}, [SRC]
+	vld1.32	{T0}, [SRC]
 	vadd.u32	X0, X0, T0
 	vadd.u32	X1, X1, S1
+
+	C vst1.8 because caller expects results little-endian
+	C use vstm when little-endian for some additional speedup
+IF_BE(`	vst1.8	{X0,X1}, [DST]!')
+
 	vadd.u32	X2, X2, S2
 	vadd.u32	X3, X3, S3
 
-	C caller expects result little-endian
-IF_BE(`	vrev32.u8	X0, X0
-	vrev32.u8	X1, X1
-	vrev32.u8	X2, X2
-	vrev32.u8	X3, X3')
-
-	vstm	DST, {X0,X1,X2,X3}
+IF_BE(`	vst1.8	{X2,X3}, [DST]')
+IF_LE(`	vstm	DST, {X0,X1,X2,X3}')
 	bx	lr
 EPILOGUE(_nettle_salsa20_core)
 
-- 
2.30.0

