>From 68b069504c34d773439d7ef4cb2b38d0a2af5ce8 Mon Sep 17 00:00:00 2001
From: Michael Weiser <michael.weiser@gmx.de>
Date: Tue, 29 Dec 2020 20:35:43 +0100
Subject: [PATCH 2/2] arm: Use vldm to s regs where possible

This doesn't yield a performance increase, though.
---
 arm/neon/chacha-3core.asm          | 14 ++++----------
 arm/neon/chacha-core-internal.asm  |  5 ++---
 arm/neon/salsa20-2core.asm         | 12 +++++-------
 arm/neon/salsa20-core-internal.asm |  7 ++-----
 4 files changed, 13 insertions(+), 25 deletions(-)

diff --git a/arm/neon/chacha-3core.asm b/arm/neon/chacha-3core.asm
index f9497c09..32cffa3d 100644
--- a/arm/neon/chacha-3core.asm
+++ b/arm/neon/chacha-3core.asm
@@ -36,7 +36,6 @@ ifelse(`
 define(`DST', `r0')
 define(`SRC', `r1')
 define(`ROUNDS', `r2')
-define(`SRCp32', `r3')
 
 C State, X, Y and Z representing consecutive blocks
 define(`X0', `q0')
@@ -65,10 +64,8 @@ define(`T3', `q7')
 	C _chacha_3core(uint32_t *dst, const uint32_t *src, unsigned rounds)
 
 PROLOGUE(_nettle_chacha_3core)
-	C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words
-	add	SRCp32, SRC, #32
-	vld1.32	{X0,X1}, [SRC]
-	vld1.32	{X2,X3}, [SRCp32]
+	C load to s regs to be endianness-neutral wrt consecutive 32-bit words
+	vldm	SRC, {s0-s15}
 	vpush	{q4,q5,q6,q7}
 	adr	r12, .Lcount1
 	vld1.32 {Z3}, [r12]
@@ -217,7 +214,7 @@ PROLOGUE(_nettle_chacha_3core)
 	vadd.i32	Y3, Y3, T2
 	vadd.i32	Z3, Z3, T3
 
-	vld1.32	{T0,T1}, [SRC]
+	vldm	SRC, {s16-s31}
 	vadd.i32	X0, X0, T0
 	vadd.i32	X1, X1, T1
 
@@ -225,7 +222,6 @@ PROLOGUE(_nettle_chacha_3core)
 	C interleave loads, calculations and stores to save cycles on stores
 	vst1.8	{X0,X1}, [DST]!
 
-	vld1.32	{T2,T3}, [SRCp32]
 	vadd.i32	X2, X2, T2
 	vadd.i32	X3, X3, T3
 	vst1.8	{X2,X3}, [DST]!
@@ -250,9 +246,7 @@ PROLOGUE(_nettle_chacha_3core)
 EPILOGUE(_nettle_chacha_3core)
 
 PROLOGUE(_nettle_chacha_3core32)
-	add	SRCp32, SRC, #32
-	vld1.32	{X0,X1}, [SRC]
-	vld1.32	{X2,X3}, [SRCp32]
+	vldm	SRC, {s0-s15}
 	vpush	{q4,q5,q6,q7}
 	adr	r12, .Lcount1
 	vld1.32 {Z3}, [r12]
diff --git a/arm/neon/chacha-core-internal.asm b/arm/neon/chacha-core-internal.asm
index 914815f2..a4b77122 100644
--- a/arm/neon/chacha-core-internal.asm
+++ b/arm/neon/chacha-core-internal.asm
@@ -83,9 +83,8 @@ define(`QROUND', `
 	C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
 
 PROLOGUE(_nettle_chacha_core)
-	C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words
-	vld1.32	{X0,X1}, [SRC]!		C SRC changed!
-	vld1.32	{X2,X3}, [SRC]
+	C load to s regs to be endianness-neutral wrt consecutive 32-bit words
+	vldm	SRC, {s0-s15}
 
 	vmov	S0, X0
 	vmov	S1, X1
diff --git a/arm/neon/salsa20-2core.asm b/arm/neon/salsa20-2core.asm
index e90147ed..ac09af69 100644
--- a/arm/neon/salsa20-2core.asm
+++ b/arm/neon/salsa20-2core.asm
@@ -36,7 +36,6 @@ ifelse(`
 define(`DST', `r0')
 define(`SRC', `r1')
 define(`ROUNDS', `r2')
-define(`SRCp32', `r3')
 
 C State, even elements in X, odd elements in Y
 define(`X0', `q0')
@@ -59,10 +58,8 @@ define(`T3', `q15')
 
 	C _salsa20_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
 PROLOGUE(_nettle_salsa20_2core)
-	C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words
-	add	SRCp32, SRC, #32
-	vld1.32	{X0,X1}, [SRC]
-	vld1.32	{X2,X3}, [SRCp32]
+	C load to s regs to be endianness-neutral wrt consecutive 32-bit words
+	vldm	SRC, {s0-s15}
 	adr	r12, .Lcount1
 
 	vmov	Y3, X0
@@ -184,8 +181,9 @@ C Inverse swaps and transpositions
 	vswp	D1REG(Y0), D1REG(Y2)
 	vswp	D1REG(Y1), D1REG(Y3)
 
-	vld1.32	{T0,T1}, [SRC]
-	vld1.32	{T2,T3}, [SRCp32]
+	C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words
+	vld1.32	{T0,T1}, [SRC]!		C SRC changed!
+	vld1.32	{T2,T3}, [SRC]
 
 	vtrn.32	X0, Y3
 	vtrn.32	X1, Y0
diff --git a/arm/neon/salsa20-core-internal.asm b/arm/neon/salsa20-core-internal.asm
index 9f691a14..83ef263d 100644
--- a/arm/neon/salsa20-core-internal.asm
+++ b/arm/neon/salsa20-core-internal.asm
@@ -36,7 +36,6 @@ ifelse(`
 define(`DST', `r0')
 define(`SRC', `r1')
 define(`ROUNDS', `r2')
-define(`SRCp32', `r3')
 
 define(`X0', `q0')
 define(`X1', `q1')
@@ -87,10 +86,8 @@ define(`QROUND', `
 	C _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
 
 PROLOGUE(_nettle_salsa20_core)
-	C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words
-	add	SRCp32, SRC, #32
-	vld1.32	{X0,X1}, [SRC]
-	vld1.32	{X2,X3}, [SRCp32]
+	C load to s regs to be endianness-neutral wrt consecutive 32-bit words
+	vldm	SRC, {s0-s15}
 
 	C Input rows little-endian:
 	C	 0  1  2  3	X0
-- 
2.29.2