Hello Mamone,
On Fri, Jan 22, 2021 at 10:14:36PM +0200, Maamoun TK wrote:
> > The difference in index in dup EMSB nicely shows the doubleword transposition compared to LE. If on LE the dup was done after the rev64, it'd be H.b[7] vs. H.b[15].
> I see what you did here, but I'm confused about the ld1 and st1 instructions, so let me clarify one thing before going on: how do ld1 and st1 load and store from/into memory in BE mode? If they behave in the normal way then there is no point in using ldr at all; I only used it because it handles an immediate offset. So to replace the line "ldr HQ,[TABLE,#16*H_Idx]" we can just add the offset to the register that holds the address, "add x1,TABLE,#16*H_Idx", then
I've just retested and reread some ARM documents. Here's a patch that uses ld1.16b and thus eliminates almost all special BE treatment, but consequently has to leave in all the rev64s as well. This has the testsuite passing on BE and (still) on LE. My attempt at an explanation is below.
diff --git a/arm64/v8/gcm-hash.asm b/arm64/v8/gcm-hash.asm
index 1c14db54..8c8a370e 100644
--- a/arm64/v8/gcm-hash.asm
+++ b/arm64/v8/gcm-hash.asm
@@ -55,17 +55,10 @@ C common macros:
 .endm
 
 .macro REDUCTION out
-IF_BE(`
-    pmull T.1q,F.1d,POLY.1d
-    ext \out().16b,F.16b,F.16b,#8
-    eor R.16b,R.16b,T.16b
-    eor \out().16b,\out().16b,R.16b
-',`
     pmull T.1q,F.1d,POLY.1d
     eor R.16b,R.16b,T.16b
     ext R.16b,R.16b,R.16b,#8
     eor \out().16b,F.16b,R.16b
-')
 .endm
 
 C void gcm_init_key (union gcm_block *table)
@@ -108,27 +101,20 @@ define(`H4M', `v29')
 define(`H4L', `v30')
 
 .macro PMUL_PARAM in, param1, param2
-IF_BE(`
-    pmull2 Hp.1q,\in().2d,POLY.2d
-    ext Hm.16b,\in().16b,\in().16b,#8
-    eor Hm.16b,Hm.16b,Hp.16b
-    zip \param1().2d,\in().2d,Hm.2d
-    zip2 \param2().2d,\in().2d,Hm.2d
-',`
     pmull2 Hp.1q,\in().2d,POLY.2d
     eor Hm.16b,\in().16b,Hp.16b
     ext \param1().16b,Hm.16b,\in().16b,#8
     ext \param2().16b,\in().16b,Hm.16b,#8
     ext \param1().16b,\param1().16b,\param1().16b,#8
-')
 .endm
 
 PROLOGUE(_nettle_gcm_init_key)
-    ldr HQ,[TABLE,#16*H_Idx]
+    C LSB vector load: x1+0 into H.b[0] and x1+15 into H.b[15]
+    add x1,TABLE,#16*H_Idx
+    ld1 {H.16b},[x1]
     dup EMSB.16b,H.b[0]
-IF_LE(`
+    C treat H as two MSB doublewords
     rev64 H.16b,H.16b
-')
     mov x1,#0xC200000000000000
     mov x2,#1
     mov POLY.d[0],x1
@@ -221,9 +207,7 @@ PROLOGUE(_nettle_gcm_hash)
     mov POLY.d[0],x4
 
     ld1 {D.16b},[X]
-IF_LE(`
     rev64 D.16b,D.16b
-')
 
     ands x4,LENGTH,#-64
     b.eq L2x
@@ -234,12 +218,10 @@ IF_LE(`
 
 L4x_loop:
     ld1 {C0.16b,C1.16b,C2.16b,C3.16b},[DATA],#64
-IF_LE(`
     rev64 C0.16b,C0.16b
     rev64 C1.16b,C1.16b
     rev64 C2.16b,C2.16b
     rev64 C3.16b,C3.16b
-')
 
     eor C0.16b,C0.16b,D.16b
 
@@ -262,10 +244,8 @@ L2x:
     ld1 {H1M.16b,H1L.16b,H2M.16b,H2L.16b},[TABLE]
 
     ld1 {C0.16b,C1.16b},[DATA],#32
-IF_LE(`
     rev64 C0.16b,C0.16b
     rev64 C1.16b,C1.16b
-')
 
     eor C0.16b,C0.16b,D.16b
 
@@ -283,9 +263,7 @@ L1x:
     ld1 {H1M.16b,H1L.16b},[TABLE]
 
     ld1 {C0.16b},[DATA],#16
-IF_LE(`
     rev64 C0.16b,C0.16b
-')
 
     eor C0.16b,C0.16b,D.16b
 
@@ -335,9 +313,7 @@ Lmod_8_done:
     REDUCTION D
 
 Ldone:
-IF_LE(`
     rev64 D.16b,D.16b
-')
     st1 {D.16b},[X]
     ret
 EPILOGUE(_nettle_gcm_hash)
My understanding is that ld1 and st1 are "single-element structure" operations. (Identical to vld1 in arm32 NEON, which we discussed recently for the chacha and salsa20 asm.) That means they load a number of elements of a given type from consecutive memory locations into the corresponding vector register indices.
ld1 {v0.4s},[x0] would load four 32bit words from consecutive memory locations and put them into v0.s[0] through v0.s[3]. So bytes x0+0..3 would go into v0.s[0], x0+4..7 would go into v0.s[1], and so on. Endianness applies to the internal byte order of the elements, so each word would be loaded MSB-first in BE mode and LSB-first in LE mode.
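To convince myself I sketched that in C (only an analogy, of course; the v0_s array is made up to stand in for the register, and I haven't run this on BE silicon):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* C analogy of ld1 {v0.4s},[x0]: element order follows memory order,
   byte order inside each element follows host endianness. */
int main(void)
{
    uint8_t mem[16];
    uint32_t v0_s[4];   /* stands in for v0.s[0..3] */
    int i;

    for (i = 0; i < 16; i++)
        mem[i] = (uint8_t)i;

    memcpy(v0_s, mem, sizeof v0_s);   /* four plain host-endian loads */

    for (i = 0; i < 4; i++)
        printf("v0.s[%d] = 0x%08" PRIx32 "\n", i, v0_s[i]);
    /* LE host: 0x03020100 0x07060504 ...
       BE host: 0x00010203 0x04050607 ... */
    return 0;
}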
So, given memory content such as:
x0 +    0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
byte    0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
We should get on BE:
          MSB          LSB
v0.s[0]:    0  1  2    3
v0.s[1]:    4  5  6    7
v0.s[2]:    8  9 10   11
v0.s[3]:   12 13 14   15
Or looked at as byte-vectors:
         | v0.s[0] | v0.s[1] | v0.s[2]   | v0.s[3]   |
          v0.b[0]                            v0.b[15]
v0.16b:    3  2  1 0  7  6 5 4  11 10  9 8  15 14 13 12
On LE we should get:
          MSB          LSB
v0.s[0]:    3  2  1    0
v0.s[1]:    7  6  5    4
v0.s[2]:   11 10  9    8
v0.s[3]:   15 14 13   12
         v0.b[0]                                  v0.b[15]
v0.16b:    0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
This was just meant as an intro. I've not actually tested this. I hope I got it right and haven't just added to everyone's confusion (mine included). :/
Back to ld1.16b: this now loads a vector of 16 bytes consecutively. Since bytes have no endianness, there will be no change in order in either LE or BE mode. The register content will look the same in both:
x0 +     0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
byte:    0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15

         v0.b[0]                                  v0.b[15]
v0.16b:    0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
So larger datatypes loaded that way should be stored little-endian in memory to make sense as e.g. .d[0] after such a load. Or we need to rev64 them.
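In C terms rev64 on a byte vector amounts to something like this (rev64_16b is just my model, not the instruction):

#include <stdint.h>

/* C model of rev64 v.16b: reverse the byte order within each of the
   two 64bit lanes; the lanes themselves stay in place. */
static void rev64_16b(uint8_t v[16])
{
    int lane, i;

    for (lane = 0; lane < 2; lane++)
        for (i = 0; i < 4; i++) {
            uint8_t t = v[8*lane + i];
            v[8*lane + i] = v[8*lane + 7 - i];
            v[8*lane + 7 - i] = t;
        }
}

Applied after the byte-order-invariant ld1.16b, it leaves .d[0] holding the big-endian interpretation of the first eight memory bytes, in both modes.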
> load the H value by using ld1, "ld1 {H.16b},[x1]". This way we still have to deal with LE as transposed doublewords and with BE in the normal way (not transposed doublewords or a transposed quadword).
After sending my last email I realised that the doublewords aren't actually transposed with BE as such. They're only transposed compared to the original LE routine: the ldr instruction loads in completely reversed order in each mode, and the LE routine converts the internal byte order of the doublewords to BE but not the overall order of the 128bit quadword, because it doesn't need to and regards them as a vector of two doublewords anyway.
ld1.16b doesn't change that at all. It just behaves the same on LE and BE: we'll always load vectors of bytes, and it'll always be an LSB-first load. If we want to treat them as big-endian doublewords we have to adjust them accordingly. That's why we now also need all the rev64s on BE above.
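For my own notes, the difference between the two loads as C models (my reading of the ARM ARM, untested on BE silicon; both helpers are made up):

#include <stdint.h>
#include <string.h>

/* ld1 {v.16b},[x0]: v.b[i] = mem[i], identical in LE and BE mode. */
static void model_ld1_16b(uint8_t v[16], const uint8_t *mem)
{
    memcpy(v, mem, 16);
}

/* ldr q,[x0] in BE mode: one 128bit MSB-first load, so the whole
   quadword ends up byte-reversed in the register, mem[0] in v.b[15]. */
static void model_ldr_q_be(uint8_t v[16], const uint8_t *mem)
{
    int i;

    for (i = 0; i < 16; i++)
        v[i] = mem[15 - i];
}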
That opens another topic: as you may have noticed, I haven't got the slightest idea of what the code is actually doing, and assembly isn't my first language either. I'm only mechanically trying to get BE mode to produce the same results as LE.
This made me realise that I haven't the faintest idea what we're getting as input and producing as output either. :/ Are we working on blocks of bytes and producing blocks of bytes, just treating them as big-endian 64bit doublewords internally to exploit the availability of instructions that can work on these types? Or could we actually declare the elements of TABLE to be quadwords in host endianness? Then we could throw ld1.2d at them and eliminate all the rev64s.
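To pin down what I mean by the first alternative, a hypothetical helper (load_be_doublewords is my invention):

#include <stdint.h>

/* Read a 16-byte block as two big-endian 64bit doublewords: what ends
   up in .d[0]/.d[1] after ld1.16b plus rev64 (or after ld1.2d on a BE
   host). */
static void load_be_doublewords(uint64_t out[2], const uint8_t in[16])
{
    int i, j;

    for (i = 0; i < 2; i++) {
        uint64_t w = 0;
        for (j = 0; j < 8; j++)
            w = (w << 8) | in[8*i + j];
        out[i] = w;
    }
}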
Duh, I think we can regardless, at least for BE:
diff --git a/arm64/v8/gcm-hash.asm b/arm64/v8/gcm-hash.asm
index 1c14db54..642e3840 100644
--- a/arm64/v8/gcm-hash.asm
+++ b/arm64/v8/gcm-hash.asm
@@ -55,17 +55,10 @@ C common macros:
 .endm
 
 .macro REDUCTION out
-IF_BE(`
-    pmull T.1q,F.1d,POLY.1d
-    ext \out().16b,F.16b,F.16b,#8
-    eor R.16b,R.16b,T.16b
-    eor \out().16b,\out().16b,R.16b
-',`
     pmull T.1q,F.1d,POLY.1d
     eor R.16b,R.16b,T.16b
     ext R.16b,R.16b,R.16b,#8
     eor \out().16b,F.16b,R.16b
-')
 .endm
 
 C void gcm_init_key (union gcm_block *table)
@@ -108,27 +101,20 @@ define(`H4M', `v29')
 define(`H4L', `v30')
 
 .macro PMUL_PARAM in, param1, param2
-IF_BE(`
-    pmull2 Hp.1q,\in().2d,POLY.2d
-    ext Hm.16b,\in().16b,\in().16b,#8
-    eor Hm.16b,Hm.16b,Hp.16b
-    zip \param1().2d,\in().2d,Hm.2d
-    zip2 \param2().2d,\in().2d,Hm.2d
-',`
     pmull2 Hp.1q,\in().2d,POLY.2d
     eor Hm.16b,\in().16b,Hp.16b
     ext \param1().16b,Hm.16b,\in().16b,#8
     ext \param2().16b,\in().16b,Hm.16b,#8
     ext \param1().16b,\param1().16b,\param1().16b,#8
-')
 .endm
 
 PROLOGUE(_nettle_gcm_init_key)
-    ldr HQ,[TABLE,#16*H_Idx]
-    dup EMSB.16b,H.b[0]
+    add x1,TABLE,#16*H_Idx
+    ld1 {H.2d},[x1]
 IF_LE(`
     rev64 H.16b,H.16b
 ')
+    dup EMSB.16b,H.b[7]
     mov x1,#0xC200000000000000
     mov x2,#1
     mov POLY.d[0],x1
@@ -220,7 +206,7 @@ PROLOGUE(_nettle_gcm_hash)
     mov x4,#0xC200000000000000
     mov POLY.d[0],x4
 
-    ld1 {D.16b},[X]
+    ld1 {D.2d},[X]
 IF_LE(`
     rev64 D.16b,D.16b
 ')
@@ -233,7 +219,7 @@ IF_LE(`
     ld1 {H3M.16b,H3L.16b,H4M.16b,H4L.16b},[x5]
 
 L4x_loop:
-    ld1 {C0.16b,C1.16b,C2.16b,C3.16b},[DATA],#64
+    ld1 {C0.2d,C1.2d,C2.2d,C3.2d},[DATA],#64
 IF_LE(`
     rev64 C0.16b,C0.16b
     rev64 C1.16b,C1.16b
@@ -261,7 +247,7 @@ L2x:
 
     ld1 {H1M.16b,H1L.16b,H2M.16b,H2L.16b},[TABLE]
 
-    ld1 {C0.16b,C1.16b},[DATA],#32
+    ld1 {C0.2d,C1.2d},[DATA],#32
 IF_LE(`
     rev64 C0.16b,C0.16b
     rev64 C1.16b,C1.16b
@@ -282,7 +268,7 @@ L1x:
 
     ld1 {H1M.16b,H1L.16b},[TABLE]
 
-    ld1 {C0.16b},[DATA],#16
+    ld1 {C0.2d},[DATA],#16
 IF_LE(`
     rev64 C0.16b,C0.16b
 ')
@@ -335,9 +321,7 @@ Lmod_8_done:
     REDUCTION D
 
 Ldone:
-IF_LE(`
     rev64 D.16b,D.16b
-')
     st1 {D.16b},[X]
     ret
 EPILOGUE(_nettle_gcm_hash)
Please excuse my laboured and long-winded thinking. ;) I really have to start thinking in vectors as well.
This also works for the whole TABLE and gives host-endianness storage there (whereas ld1.16b should have caused it to be little-endian before, if that's at all relevant):
diff --git a/arm64/v8/gcm-hash.asm b/arm64/v8/gcm-hash.asm
index 1c14db54..bd6820b3 100644
--- a/arm64/v8/gcm-hash.asm
+++ b/arm64/v8/gcm-hash.asm
@@ -55,17 +55,10 @@ C common macros:
 .endm
 
 .macro REDUCTION out
-IF_BE(`
-    pmull T.1q,F.1d,POLY.1d
-    ext \out().16b,F.16b,F.16b,#8
-    eor R.16b,R.16b,T.16b
-    eor \out().16b,\out().16b,R.16b
-',`
     pmull T.1q,F.1d,POLY.1d
     eor R.16b,R.16b,T.16b
     ext R.16b,R.16b,R.16b,#8
     eor \out().16b,F.16b,R.16b
-')
 .endm
 
 C void gcm_init_key (union gcm_block *table)
@@ -108,27 +101,20 @@ define(`H4M', `v29')
 define(`H4L', `v30')
 
 .macro PMUL_PARAM in, param1, param2
-IF_BE(`
-    pmull2 Hp.1q,\in().2d,POLY.2d
-    ext Hm.16b,\in().16b,\in().16b,#8
-    eor Hm.16b,Hm.16b,Hp.16b
-    zip \param1().2d,\in().2d,Hm.2d
-    zip2 \param2().2d,\in().2d,Hm.2d
-',`
     pmull2 Hp.1q,\in().2d,POLY.2d
     eor Hm.16b,\in().16b,Hp.16b
     ext \param1().16b,Hm.16b,\in().16b,#8
     ext \param2().16b,\in().16b,Hm.16b,#8
     ext \param1().16b,\param1().16b,\param1().16b,#8
-')
 .endm
 
 PROLOGUE(_nettle_gcm_init_key)
-    ldr HQ,[TABLE,#16*H_Idx]
-    dup EMSB.16b,H.b[0]
+    add x1,TABLE,#16*H_Idx
+    ld1 {H.2d},[x1]
 IF_LE(`
     rev64 H.16b,H.16b
 ')
+    dup EMSB.16b,H.b[7]
     mov x1,#0xC200000000000000
     mov x2,#1
     mov POLY.d[0],x1
@@ -154,7 +140,7 @@ IF_LE(`
 
     PMUL_PARAM H2,H2M,H2L
 
-    st1 {H1M.16b,H1L.16b,H2M.16b,H2L.16b},[TABLE],#64
+    st1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE],#64
 
     C --- calculate H^3 = H^1*H^2 ---
 
@@ -172,7 +158,7 @@ IF_LE(`
 
     PMUL_PARAM H4,H4M,H4L
 
-    st1 {H3M.16b,H3L.16b,H4M.16b,H4L.16b},[TABLE]
+    st1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[TABLE]
 
     ret
 EPILOGUE(_nettle_gcm_init_key)
@@ -220,7 +206,7 @@ PROLOGUE(_nettle_gcm_hash)
     mov x4,#0xC200000000000000
     mov POLY.d[0],x4
 
-    ld1 {D.16b},[X]
+    ld1 {D.2d},[X]
 IF_LE(`
     rev64 D.16b,D.16b
 ')
@@ -229,11 +215,11 @@ IF_LE(`
     b.eq L2x
 
     add x5,TABLE,#64
-    ld1 {H1M.16b,H1L.16b,H2M.16b,H2L.16b},[TABLE]
-    ld1 {H3M.16b,H3L.16b,H4M.16b,H4L.16b},[x5]
+    ld1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE]
+    ld1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[x5]
 
 L4x_loop:
-    ld1 {C0.16b,C1.16b,C2.16b,C3.16b},[DATA],#64
+    ld1 {C0.2d,C1.2d,C2.2d,C3.2d},[DATA],#64
 IF_LE(`
     rev64 C0.16b,C0.16b
     rev64 C1.16b,C1.16b
@@ -259,9 +245,9 @@ L2x:
     tst LENGTH,#-32
     b.eq L1x
 
-    ld1 {H1M.16b,H1L.16b,H2M.16b,H2L.16b},[TABLE]
+    ld1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE]
 
-    ld1 {C0.16b,C1.16b},[DATA],#32
+    ld1 {C0.2d,C1.2d},[DATA],#32
 IF_LE(`
     rev64 C0.16b,C0.16b
     rev64 C1.16b,C1.16b
@@ -280,9 +266,9 @@ L1x:
     tst LENGTH,#-16
     b.eq Lmod
 
-    ld1 {H1M.16b,H1L.16b},[TABLE]
+    ld1 {H1M.2d,H1L.2d},[TABLE]
 
-    ld1 {C0.16b},[DATA],#16
+    ld1 {C0.2d},[DATA],#16
 IF_LE(`
     rev64 C0.16b,C0.16b
 ')
@@ -297,7 +283,7 @@ Lmod:
     tst LENGTH,#15
     b.eq Ldone
 
-    ld1 {H1M.16b,H1L.16b},[TABLE]
+    ld1 {H1M.2d,H1L.2d},[TABLE]
 
     tbz LENGTH,3,Lmod_8
     ldr C0D,[DATA],#8
@@ -338,6 +324,6 @@ Ldone:
 IF_LE(`
     rev64 D.16b,D.16b
 ')
-    st1 {D.16b},[X]
+    st1 {D.2d},[X]
     ret
 EPILOGUE(_nettle_gcm_hash)
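In C terms, I'd describe what that host-endianness storage means roughly like this (union block16 and model_st1_2d are made up for illustration, layout assumed similar to nettle's union gcm_block):

#include <stdint.h>

/* st1 {v.2d} stores two host-endian 64bit elements, so C code reads
   the same logical doubleword back with a plain uint64_t access on
   both LE and BE. */
union block16 { uint8_t b[16]; uint64_t u64[2]; };

static void model_st1_2d(union block16 *dst, const uint64_t d[2])
{
    dst->u64[0] = d[0];
    dst->u64[1] = d[1];
}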
And as always after all this guesswork I have found a likely very relevant comment in gcm.c:
  /* Shift uses big-endian representation. */
#if WORDS_BIGENDIAN
  reduce = shift_table[x->u64[1] & 0xff];
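If I read that right, the lookup picks the byte that shifts out of the big-endian end of the block. On LE the same memory byte would sit at the top of u64[1], so I'd expect a counterpart along these lines (the LE branch here is my guess, not quoted from gcm.c):

#if WORDS_BIGENDIAN
  reduce = shift_table[x->u64[1] & 0xff];          /* last byte of the block */
#else
  reduce = shift_table[(x->u64[1] >> 56) & 0xff];  /* guessed LE counterpart */
#endif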
Is that it? Or is TABLE just internal to the routine and we can store there however we please? (Apart from H at TABLE[128] initialised for us by gcm_set_key and stored BE?)