I made a merge request in the main repo that enables optimized GHASH on the AArch64 architecture. The implementation is based on Niels Möller's enhanced algorithm, which yields a larger speedup on AArch64 than the Intel algorithm. Combining the Karatsuba algorithm with the Intel algorithm added overhead, so I dropped its benchmark result. I'll attach the Intel-algorithm implementation here since it's not included in the MR.
Here is the benchmark result on AArch64:
*---------------------------------------------------------------------*
| C version   | Intel algorithm | Niels Möller's enhanced algorithm   |
| 208 Mbyte/s | 2781 Mbyte/s    | 3255 Mbyte/s                        |
*---------------------------------------------------------------------*
This is a +17% performance boost for the enhanced algorithm over the Intel algorithm. It's not as impressive as the PowerPC benchmark result, but it does a great job on AArch64 considering that the PMULL instruction doesn't offer the assistance that vpmsumd does by multiplying four polynomials and summing the products.
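To illustrate that difference, here is a small sketch of my own using compiler intrinsics (not code from the MR, and the exact folding in the patch differs): on POWER8 one vpmsumd already multiplies both doubleword pairs and XORs the two products, while on AArch64 the same step needs a pmull, a pmull2 and an eor.

/* Sketch only: one vpmsumd on POWER8 vs. the pmull/pmull2/eor sequence
   needed on AArch64 for "multiply two doubleword pairs and XOR the
   products", the basic folding step in GHASH-style code. */
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
#include <arm_neon.h>          /* build with -march=armv8-a+crypto */

static inline uint64x2_t
clmul_fold(poly64x2_t a, poly64x2_t b)
{
  poly128_t lo = vmull_p64(vgetq_lane_p64(a, 0), vgetq_lane_p64(b, 0)); /* pmull  */
  poly128_t hi = vmull_high_p64(a, b);                                  /* pmull2 */
  return veorq_u64(vreinterpretq_u64_p128(lo),
                   vreinterpretq_u64_p128(hi));                         /* eor    */
}

#elif defined(__PPC64__) && defined(__POWER8_VECTOR__)
#include <altivec.h>           /* build with -mcpu=power8 */

static inline vector unsigned long long
clmul_fold(vector unsigned long long a, vector unsigned long long b)
{
  /* One instruction: both 64x64 carry-less products, already XORed. */
  return __builtin_crypto_vpmsumd(a, b);
}
#endif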
I tried to avoid using the stack in this implementation, so I wrote a procedure that handles leftovers using only registers; let me know if there's room for improvement here.
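For readers who don't want to trace the Lmod path in the attached assembly, here is my reading of that leftover handling expressed as a C sketch (the names are mine and the rev64 byte swapping done in the assembly is left out):

#include <stdint.h>
#include <string.h>

/* Pack the trailing (length % 16) bytes into a zero-padded 16-byte block
   without going through a stack buffer: an optional full 8-byte word is
   loaded directly, and the remaining 1-7 bytes are shifted into a single
   64-bit register, most significant byte first, as in Lmod_8_loop. */
static void
pack_leftover(uint64_t block[2], const uint8_t *data, size_t length)
{
  size_t left = length & 15;
  uint64_t w = 0;
  unsigned shift = 64;

  block[0] = block[1] = 0;

  if (left & 8)
    {
      memcpy(&block[0], data, 8);   /* the asm uses a single ldr for this */
      data += 8;
    }

  for (size_t i = 0; i < (left & 7); i++)
    {
      shift -= 8;
      w |= (uint64_t) data[i] << shift;
    }

  if (left & 8)
    block[1] = w;                   /* partial word goes in the high half */
  else
    block[0] = w;
}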
regards, Mamone
C arm/v8/gcm-hash.asm
ifelse(`
   Copyright (C) 2020 Niels Möller and Mamone Tarsha

   This file is part of GNU Nettle.

   GNU Nettle is free software: you can redistribute it and/or
   modify it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free
       Software Foundation; either version 2 of the License, or (at your
       option) any later version.

   or both in parallel, as here.

   GNU Nettle is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received copies of the GNU General Public License and
   the GNU Lesser General Public License along with this program.  If
   not, see http://www.gnu.org/licenses/.
')
C gcm_set_key() assigns H value in the middle element of the table
define(`H_Idx', `128')
.file "gcm-hash.asm"
.text
C void gcm_init_key (union gcm_block *table)
C This function populates the gcm table as the following layout
C *******************************************************************************
C | H1M = (H1 div x⁶⁴)||((H1 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴               |
C | H1L = (H1 mod x⁶⁴)||(((H1 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H1 div x⁶⁴)  |
C |                                                                              |
C | H2M = (H2 div x⁶⁴)||((H2 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴               |
C | H2L = (H2 mod x⁶⁴)||(((H2 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H2 div x⁶⁴)  |
C |                                                                              |
C | H3M = (H3 div x⁶⁴)||((H3 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴               |
C | H3L = (H3 mod x⁶⁴)||(((H3 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H3 div x⁶⁴)  |
C |                                                                              |
C | H4M = (H4 div x⁶⁴)||((H4 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴               |
C | H4L = (H4 mod x⁶⁴)||(((H4 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H4 div x⁶⁴)  |
C *******************************************************************************
define(`TABLE', `x0')
define(`ZERO', `v0')
define(`EMSB', `v1')
define(`POLY', `v2')
define(`B', `v3')

define(`H', `v4')
define(`HQ', `q4')
define(`H_t', `v5')
define(`H2', `v6')
define(`H2_t', `v7')
define(`H3', `v16')
define(`H3_t', `v17')
define(`H4', `v18')
define(`H4_t', `v19')
define(`H_m', `v20')
define(`H_m1', `v21')
define(`H_h', `v22')
define(`H_l', `v23')
define(`RP', `v24')
define(`Ml', `v25')
define(`Mh', `v26')
PROLOGUE(_nettle_gcm_init_key)
    ldr        HQ,[TABLE,#16*H_Idx]
    dup        EMSB.16b,H.b[0]
    rev64      H.16b,H.16b
    mov        x9,#0xC200000000000000
    mov        x10,#1
    mov        POLY.d[0],x9
    mov        POLY.d[1],x10
    sshr       EMSB.16b,EMSB.16b,#7
    and        EMSB.16b,EMSB.16b,POLY.16b
    ushr       B.2d,H.2d,#63
    and        B.16b,B.16b,POLY.16b
    ext        B.16b,B.16b,B.16b,#8
    shl        H.2d,H.2d,#1
    orr        H.16b,H.16b,B.16b
    eor        H.16b,H.16b,EMSB.16b

    eor        ZERO.16b,ZERO.16b,ZERO.16b
    dup        POLY.2d,POLY.d[0]
    ext        H_t.16b,H.16b,H.16b,#8

    pmull      H_m.1q,H.1d,H_t.1d
    pmull2     H_m1.1q,H.2d,H_t.2d
    pmull      H_h.1q,H.1d,H.1d
    pmull2     H_l.1q,H.2d,H.2d

    eor        H_m.16b,H_m.16b,H_m1.16b
    pmull      RP.1q,H_l.1d,POLY.1d
    ext        Ml.16b,ZERO.16b,H_m.16b,#8
    ext        Mh.16b,H_m.16b,ZERO.16b,#8
    ext        RP.16b,RP.16b,RP.16b,#8
    eor        H_l.16b,H_l.16b,Ml.16b
    eor        H_h.16b,H_h.16b,Mh.16b
    eor        H_l.16b,H_l.16b,RP.16b

    pmull2     RP.1q,H_l.2d,POLY.2d
    eor        H_h.16b,H_h.16b,H_l.16b
    eor        H2_t.16b,H_h.16b,RP.16b
    ext        H2.16b,H2_t.16b,H2_t.16b,#8

    st1        {H.16b,H_t.16b,H2.16b,H2_t.16b},[TABLE],#64

    pmull      H_m.1q,H.1d,H2_t.1d
    pmull2     H_m1.1q,H.2d,H2_t.2d
    pmull      H_h.1q,H.1d,H2.1d
    pmull2     H_l.1q,H.2d,H2.2d

    eor        H_m.16b,H_m.16b,H_m1.16b
    pmull      RP.1q,H_l.1d,POLY.1d
    ext        Ml.16b,ZERO.16b,H_m.16b,#8
    ext        Mh.16b,H_m.16b,ZERO.16b,#8
    ext        RP.16b,RP.16b,RP.16b,#8
    eor        H_l.16b,H_l.16b,Ml.16b
    eor        H_h.16b,H_h.16b,Mh.16b
    eor        H_l.16b,H_l.16b,RP.16b

    pmull2     RP.1q,H_l.2d,POLY.2d
    eor        H_h.16b,H_h.16b,H_l.16b
    eor        H3_t.16b,H_h.16b,RP.16b
    ext        H3.16b,H3_t.16b,H3_t.16b,#8

    pmull      H_m.1q,H2.1d,H2_t.1d
    pmull2     H_m1.1q,H2.2d,H2_t.2d
    pmull      H_h.1q,H2.1d,H2.1d
    pmull2     H_l.1q,H2.2d,H2.2d

    eor        H_m.16b,H_m.16b,H_m1.16b
    pmull      RP.1q,H_l.1d,POLY.1d
    ext        Ml.16b,ZERO.16b,H_m.16b,#8
    ext        Mh.16b,H_m.16b,ZERO.16b,#8
    ext        RP.16b,RP.16b,RP.16b,#8
    eor        H_l.16b,H_l.16b,Ml.16b
    eor        H_h.16b,H_h.16b,Mh.16b
    eor        H_l.16b,H_l.16b,RP.16b

    pmull2     RP.1q,H_l.2d,POLY.2d
    eor        H_h.16b,H_h.16b,H_l.16b
    eor        H4_t.16b,H_h.16b,RP.16b
    ext        H4.16b,H4_t.16b,H4_t.16b,#8

    st1        {H3.16b,H3_t.16b,H4.16b,H4_t.16b},[TABLE]

    ret
EPILOGUE(_nettle_gcm_init_key)
define(`TABLE', `x0')
define(`X', `x1')
define(`LENGTH', `x2')
define(`DATA', `x3')

define(`POLY', `v0')
define(`ZERO', `v1')

define(`D', `v2')
define(`C0', `v3')
define(`C0D', `d3')
define(`C1', `v4')
define(`C2', `v5')
define(`C3', `v6')
define(`RP', `v7')
define(`H', `v16')
define(`H_t', `v17')
define(`H2', `v18')
define(`H2_t', `v19')
define(`H3', `v20')
define(`H3_t', `v21')
define(`H4', `v22')
define(`H4_t', `v23')
define(`H_m', `v24')
define(`H_m1', `v25')
define(`H_h', `v26')
define(`H_l', `v27')
define(`H_m2', `v28')
define(`H_m3', `v29')
define(`H_h2', `v30')
define(`H_l2', `v31')
define(`Ml', `v4')
define(`Mh', `v5')

C void gcm_hash (const struct gcm_key *key, union gcm_block *x,
C                size_t length, const uint8_t *data)
PROLOGUE(_nettle_gcm_hash)
    mov        x10,#0xC200000000000000
    mov        POLY.d[0],x10
    dup        POLY.2d,POLY.d[0]
    eor        ZERO.16b,ZERO.16b,ZERO.16b

    ld1        {D.16b},[X]
    rev64      D.16b,D.16b

    ands       x10,LENGTH,#-64
    b.eq       L2x

    add        x9,TABLE,64
    ld1        {H.16b,H_t.16b,H2.16b,H2_t.16b},[TABLE]
    ld1        {H3.16b,H3_t.16b,H4.16b,H4_t.16b},[x9]

L4x_loop:
    ld1        {C0.16b,C1.16b,C2.16b,C3.16b},[DATA],#64
    rev64      C0.16b,C0.16b
    rev64      C1.16b,C1.16b
    rev64      C2.16b,C2.16b
    rev64      C3.16b,C3.16b

    eor        C0.16b,C0.16b,D.16b

    pmull      H_m.1q,C1.1d,H3_t.1d
    pmull2     H_m1.1q,C1.2d,H3_t.2d
    pmull      H_h.1q,C1.1d,H3.1d
    pmull2     H_l.1q,C1.2d,H3.2d

    pmull      H_m2.1q,C2.1d,H2_t.1d
    pmull2     H_m3.1q,C2.2d,H2_t.2d
    pmull      H_h2.1q,C2.1d,H2.1d
    pmull2     H_l2.1q,C2.2d,H2.2d

    eor        H_m.16b,H_m.16b,H_m2.16b
    eor        H_m1.16b,H_m1.16b,H_m3.16b
    eor        H_h.16b,H_h.16b,H_h2.16b
    eor        H_l.16b,H_l.16b,H_l2.16b

    pmull      H_m2.1q,C3.1d,H_t.1d
    pmull2     H_m3.1q,C3.2d,H_t.2d
    pmull      H_h2.1q,C3.1d,H.1d
    pmull2     H_l2.1q,C3.2d,H.2d

    eor        H_m.16b,H_m.16b,H_m2.16b
    eor        H_m1.16b,H_m1.16b,H_m3.16b
    eor        H_h.16b,H_h.16b,H_h2.16b
    eor        H_l.16b,H_l.16b,H_l2.16b

    pmull      H_m2.1q,C0.1d,H4_t.1d
    pmull2     H_m3.1q,C0.2d,H4_t.2d
    pmull      H_h2.1q,C0.1d,H4.1d
    pmull2     H_l2.1q,C0.2d,H4.2d

    eor        H_m.16b,H_m.16b,H_m2.16b
    eor        H_m1.16b,H_m1.16b,H_m3.16b
    eor        H_h.16b,H_h.16b,H_h2.16b
    eor        H_l.16b,H_l.16b,H_l2.16b

    eor        H_m.16b,H_m.16b,H_m1.16b
    pmull      RP.1q,H_l.1d,POLY.1d
    ext        Ml.16b,ZERO.16b,H_m.16b,#8
    ext        Mh.16b,H_m.16b,ZERO.16b,#8
    ext        RP.16b,RP.16b,RP.16b,#8
    eor        H_l.16b,H_l.16b,Ml.16b
    eor        H_h.16b,H_h.16b,Mh.16b
    eor        H_l.16b,H_l.16b,RP.16b

    pmull2     RP.1q,H_l.2d,POLY.2d
    eor        H_h.16b,H_h.16b,H_l.16b
    eor        D.16b,H_h.16b,RP.16b
    ext        D.16b,D.16b,D.16b,#8

    subs       x10,x10,64
    b.ne       L4x_loop

    and        LENGTH,LENGTH,#63

L2x:
    tst        LENGTH,#-32
    b.eq       L1x

    ld1        {H.16b,H_t.16b,H2.16b,H2_t.16b},[TABLE]

    ld1        {C0.16b,C1.16b},[DATA],#32
    rev64      C0.16b,C0.16b
    rev64      C1.16b,C1.16b

    eor        C0.16b,C0.16b,D.16b

    pmull      H_m.1q,C1.1d,H_t.1d
    pmull2     H_m1.1q,C1.2d,H_t.2d
    pmull      H_h.1q,C1.1d,H.1d
    pmull2     H_l.1q,C1.2d,H.2d

    pmull      H_m2.1q,C0.1d,H2_t.1d
    pmull2     H_m3.1q,C0.2d,H2_t.2d
    pmull      H_h2.1q,C0.1d,H2.1d
    pmull2     H_l2.1q,C0.2d,H2.2d

    eor        H_m.16b,H_m.16b,H_m2.16b
    eor        H_m1.16b,H_m1.16b,H_m3.16b
    eor        H_h.16b,H_h.16b,H_h2.16b
    eor        H_l.16b,H_l.16b,H_l2.16b

    eor        H_m.16b,H_m.16b,H_m1.16b
    pmull      RP.1q,H_l.1d,POLY.1d
    ext        Ml.16b,ZERO.16b,H_m.16b,#8
    ext        Mh.16b,H_m.16b,ZERO.16b,#8
    ext        RP.16b,RP.16b,RP.16b,#8
    eor        H_l.16b,H_l.16b,Ml.16b
    eor        H_h.16b,H_h.16b,Mh.16b
    eor        H_l.16b,H_l.16b,RP.16b

    pmull2     RP.1q,H_l.2d,POLY.2d
    eor        H_h.16b,H_h.16b,H_l.16b
    eor        D.16b,H_h.16b,RP.16b
    ext        D.16b,D.16b,D.16b,#8

    and        LENGTH,LENGTH,#31

L1x:
    tst        LENGTH,#-16
    b.eq       Lmod

    ld1        {H.16b,H_t.16b},[TABLE]

    ld1        {C0.16b},[DATA],#16
    rev64      C0.16b,C0.16b

    eor        C0.16b,C0.16b,D.16b

    pmull      H_m.1q,C0.1d,H_t.1d
    pmull2     H_m1.1q,C0.2d,H_t.2d
    pmull      H_h.1q,C0.1d,H.1d
    pmull2     H_l.1q,C0.2d,H.2d

    eor        H_m.16b,H_m.16b,H_m1.16b
    pmull      RP.1q,H_l.1d,POLY.1d
    ext        Ml.16b,ZERO.16b,H_m.16b,#8
    ext        Mh.16b,H_m.16b,ZERO.16b,#8
    ext        RP.16b,RP.16b,RP.16b,#8
    eor        H_l.16b,H_l.16b,Ml.16b
    eor        H_h.16b,H_h.16b,Mh.16b
    eor        H_l.16b,H_l.16b,RP.16b

    pmull2     RP.1q,H_l.2d,POLY.2d
    eor        H_h.16b,H_h.16b,H_l.16b
    eor        D.16b,H_h.16b,RP.16b
    ext        D.16b,D.16b,D.16b,#8

Lmod:
    tst        LENGTH,#15
    b.eq       Ldone

    ld1        {H.16b,H_t.16b},[TABLE]

    tbz        LENGTH,3,Lmod_8
    ldr        C0D,[DATA],#8
    rev64      C0.16b,C0.16b
    mov        x10,#0
    mov        C0.d[1],x10

Lmod_8:
    tst        LENGTH,#7
    b.eq       Lmod_8_done
    mov        x9,#0
    mov        x8,#64
    and        x7,LENGTH,#7

Lmod_8_loop:
    mov        x10,#0
    ldrb       w10,[DATA],#1
    sub        x8,x8,#8
    lsl        x10,x10,x8
    orr        x9,x9,x10
    subs       x7,x7,#1
    b.ne       Lmod_8_loop

    tbz        LENGTH,3,Lmod_8_load
    mov        C0.d[1],x9
    b          Lmod_8_done

Lmod_8_load:
    mov        x10,#0
    mov        C0.d[0],x9
    mov        C0.d[1],x10

Lmod_8_done:
    eor        C0.16b,C0.16b,D.16b

    pmull      H_m.1q,C0.1d,H_t.1d
    pmull2     H_m1.1q,C0.2d,H_t.2d
    pmull      H_h.1q,C0.1d,H.1d
    pmull2     H_l.1q,C0.2d,H.2d

    eor        H_m.16b,H_m.16b,H_m1.16b
    pmull      RP.1q,H_l.1d,POLY.1d
    ext        Ml.16b,ZERO.16b,H_m.16b,#8
    ext        Mh.16b,H_m.16b,ZERO.16b,#8
    ext        RP.16b,RP.16b,RP.16b,#8
    eor        H_l.16b,H_l.16b,Ml.16b
    eor        H_h.16b,H_h.16b,Mh.16b
    eor        H_l.16b,H_l.16b,RP.16b

    pmull2     RP.1q,H_l.2d,POLY.2d
    eor        H_h.16b,H_h.16b,H_l.16b
    eor        D.16b,H_h.16b,RP.16b
    ext        D.16b,D.16b,D.16b,#8

Ldone:
    rev64      D.16b,D.16b
    st1        {D.16b},[X]
    ret
EPILOGUE(_nettle_gcm_hash)
I forgot to mention that I ran the benchmark on gcc17 in the GCC Farm.
regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
I made a merge request in the main repo that enables optimized GHASH on AArch64 architecture.
Nice! I've had a quick first look. For the organization, I think aarch64 assembly should go in its own directory, arm64/, like it's done for x86 and sparc.

I wonder which assembly files we should use if the target host is aarch64, but ABI=32? I guess the arm/v6/ code can be used unconditionally. Can we also use arm/neon/ code unconditionally?

Do you agree with aiming for a release pretty soon, including the new powerpc64 code, but no aarch64 code?
Regards, /Niels
I wonder which assembly files we should use if target host is aarch64, but ABI=32? I guess the arm/v6/ code can be used unconditionally. Can we also use arm/neon/ code unconditionally?
It seems gcc for aarch64 doesn't support building 32-bit binaries, maybe we should remove the check of ABI since 64-bit is the only option. I tried adding arm/v6 and arm/neon unconditionally, both yield a bunch of errors such as the integer register is r4 instead of w4 or x4 plus getting a few unknown mnemonics.
Do you agree with aiming for a release pretty soon, including the new powerpc64 code, but no aarch64 code?
Isn't starting a new version with both the powerpc64 and aarch64 changes more reasonable? I'm not sure here; if there are a few commits before the powerpc64 patches, then it makes sense to wrap up the current version with the powerpc64 code. It's up to you to decide. You could also consider the AES modes optimizations for the S390x arch, whose patch I'll drop in the next few days.
regards, Mamone
I created a couple of merge requests in the repo. With those MRs merged, I think the powerpc code is stable enough to be included in the upcoming version of Nettle.
regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
I created a couple of merge requests in the repo, with those MRs merged I think the powerpc code is stable to be included in the upcoming version of nettle.
Thanks. I've merged the "Use 32-bit offset to load data".
For the other one, https://git.lysator.liu.se/nettle/nettle/-/merge_requests/15 "Use signal to detect CPU features when getauxval() isn't available", can you explain for which systems that is needed? In the current code, you handle gnu/linux (depends on glibc, I guess), freebsd and aix.

I hesitate to add signal code, because it seems a bit dangerous and brittle for a library to modify signal handlers. In particular, I worry about what happens to other threads, since sigaction modifies the process-global signal handler.

The fat setup code is otherwise threadsafe, under the assumption that writes to a function pointer variable are atomic on the relevant architecture. In the unlikely case that we get concurrent calls to fat_init, both threads will come to the same conclusion and store identical values in the target variables, so it shouldn't matter in which order (and how late) writes propagate to other cores.
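For readers unfamiliar with the fat setup, here is a minimal sketch of the pattern being described (not Nettle's actual fat-*.c code; all names are made up): the resolved routine is reached through a function pointer, and because every racing initializer stores the same value, an atomic pointer-sized store is the only requirement.

#include <stdio.h>

/* Stand-ins for the portable C implementation and the optimized
   assembly implementation selected at runtime. */
static int impl_generic(void)   { return 0; }
static int impl_optimized(void) { return 1; }

/* Placeholder for the real CPU feature probe (getauxval() etc.). */
static int cpu_has_feature(void) { return 1; }

/* The function pointer through which callers reach the routine. */
static int (*selected_impl)(void) = impl_generic;

static void
fat_init(void)
{
  /* If two threads race here, both compute the same answer and store
     the same pointer, so the only assumption is that a pointer-sized
     store is atomic on the platform. */
  selected_impl = cpu_has_feature() ? impl_optimized : impl_generic;
}

int
main(void)
{
  fat_init();
  printf("using the %s implementation\n",
         selected_impl() ? "optimized" : "generic");
  return 0;
}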
If there's some way to setup (and restore) a thread-local signal handler for SIGILL, that would be safer, but I don't know if that's at all possible.
Regards, /Niels
On Sat, Dec 19, 2020 at 11:27 AM Niels Möller nisse@lysator.liu.se wrote:
For the other one, https://git.lysator.liu.se/nettle/nettle/-/merge_requests/15 "Use signal to detect CPU features when getauxval() isn't available", can you explain for which systems is that needed? In the current code, you handle gnu/linux (depends on glibc, I guess), freebsd and aix.
I hesitate adding signal code, because it seems a bit dangerous and brittle for a library to modify signal handlers. In particular, I worry about what happens to other threads, since sigaction modifies the process-global signal handler.
The fat setup code is otherwise threadsafe, under the assumption that writes to a function pointer variable is atomic on the relevant architecture. In the unlikely case that we get concurrent calls to fat_init, both threads will get to the same conclusion and store identical values in the target variables, so then it shouldn't matter in which order (and how late) writes propagate to other cores.
If there's some way to setup (and restore) a thread-local signal handler for SIGILL, that would be safer, but I don't know if that's at all possible.
fat-ppc.c uses the getauxval() function to detect CPU features on Linux systems. The problem is that getauxval was introduced in glibc 2.16, which was released in 2012, so with the fat option enabled the build will fail for older glibc versions. To get around that, I implemented CPU feature detection using a signal when an old glibc version is used, but as you mentioned, signals are process-wide on UNIX, which could be problematic here under certain circumstances. However, I'm not aware of any approach that achieves thread-safe signal handling, and even if such an approach exists, I don't think it's worth complicating the procedure this much just to detect CPU features. Do you have any suggestions, or do we have to look for alternative solutions?
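For context, the kind of probe under discussion looks roughly like this (a sketch of the general SIGILL technique, not the code in merge request 15; try_insn is a placeholder for executing one optional instruction):

#include <setjmp.h>
#include <signal.h>

static sigjmp_buf ill_jmp;

static void
ill_handler(int sig)
{
  (void) sig;
  siglongjmp(ill_jmp, 1);
}

/* Returns 1 if try_insn() ran without raising SIGILL, 0 otherwise.
   Note the process-global sigaction() calls: this is exactly the part
   that is problematic for a library, as discussed above. */
static int
probe_insn(void (*try_insn)(void))
{
  struct sigaction sa, old;
  int present = 0;

  sa.sa_handler = ill_handler;
  sigemptyset(&sa.sa_mask);
  sa.sa_flags = 0;
  sigaction(SIGILL, &sa, &old);

  if (sigsetjmp(ill_jmp, 1) == 0)
    {
      try_insn();    /* e.g. one POWER8 crypto instruction */
      present = 1;   /* only reached if it didn't trap */
    }

  sigaction(SIGILL, &old, NULL);   /* restore the previous handler */
  return present;
}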
regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
fat-ppc.c uses getauxval() function to detect cpu features for Linux systems, the problem is that getauxval was introduced in glibc v2.16 which released in 2012 so in case fat option enabled, the build will fail for older glibc versions.
I agree it's not so nice that the build fails on old systems. Do you have any idea how common such old systems might be?
Maybe add a configure check for getauxval, and either fail at configure time if --enable-fat is specified but we can't support it, or fall back to assuming that none of the optional features are present at runtime?
Some preprocessor check of glibc version in fat-ppc.c could work too, if that's simpler.
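The preprocessor variant could look something like this (an illustrative sketch, not the patch that was eventually merged; the AT_HWCAP2 use mirrors what fat-ppc.c needs):

#include <stdlib.h>   /* pulls in the glibc feature-test macros, if any */

#if defined(__GLIBC__) && defined(__GLIBC_PREREQ)
# if __GLIBC_PREREQ(2, 16)
#  define USE_GETAUXVAL 1
# endif
#endif

#ifdef USE_GETAUXVAL
# include <sys/auxv.h>
static unsigned long
get_hwcap2(void)
{
  return getauxval(AT_HWCAP2);
}
#else
static unsigned long
get_hwcap2(void)
{
  /* Old glibc (or non-glibc): assume no optional features. */
  return 0;
}
#endif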
Regards, /Niels
On Sat, Dec 19, 2020 at 9:05 PM Niels Möller nisse@lysator.liu.se wrote:
Do you have any idea how common such old systems might be?
I don't have a specific number, but I think using such old versions of glibc is uncommon, especially for POWER8 and later processors, considering those versions are more than 8 years old.
Maybe add a configure check for getauxval, and either fail at configure time if --enable-fat is specified but we can't support it, or fall back to assuming that none of the optional features are present at runtime?
Some preprocessor check of glibc version in fat-ppc.c could work too, if that's simpler.
That's what I ended up with; I made a new merge request for these changes and closed the old one.
regards, Mamone
On Sun, Dec 20, 2020 at 12:14 PM Maamoun TK maamoun.tk@googlemail.com wrote:
On Sat, Dec 19, 2020 at 9:05 PM Niels Möller nisse@lysator.liu.se wrote:
Do you have any idea how common such old systems might be?
I don't have a specific number but I think using that old versions of glibc is uncommon specially for POWER8 and above processors considering those versions are more than 8 years old.
PPC64LE Linux is the primary focus of Linux on Power. The PPC64LE ABI specifies Power8 as the minimum ISA. GLIBC 2.16 or higher will be available on all such PPC64LE Linux systems. I doubt that an older, PPC64 Linux big endian system would install the latest libgcrypt and that configuration only is in maintenance mode for customers.
Again, it's your choice, but I would not invest a lot of effort to support such a rare, old configuration that is unlikely to use a new release of libgcrypt.
Thanks, David
Maamoun TK maamoun.tk@googlemail.com writes:
Some preprocessor check of glibc version in fat-ppc.c could work too, if that's simpler.
That's what I ended up with, I made a new merge request for these changes and closed the old one.
Thanks, looks pretty good. I added a few minor comments on the mr (https://git.lysator.liu.se/nettle/nettle/-/merge_requests/16 for reference).
Regards, /Niels
On Mon, Dec 21, 2020 at 9:29 AM Niels Möller nisse@lysator.liu.se wrote:
Thanks, looks pretty good. I added a few minor comments on the mr (https://git.lysator.liu.se/nettle/nettle/-/merge_requests/16 for reference).
Thank you, I made a commit with the changes.
regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
Thank you, I made a commit with the changes.
Thanks! Merged now.
Regards, /Niels
Maamoun TK maamoun.tk@googlemail.com writes:
It seems gcc for aarch64 doesn't support building 32-bit binaries, maybe we should remove the check of ABI since 64-bit is the only option.
Ok, that's a bit confusing. There's a command line flag for it, not -m32 but -mabi=ilp32, but that doesn't work out of the box with my (debian-packaged) cross compiler. Searching turns up this old (2015) email saying that gcc support is work-in-progress: https://gcc.gnu.org/legacy-ml/gcc-help/2015-02/msg00034.html
I would suggest keeping the ABI check, but leave asm_path empty (or maybe use asm_path=arm), until we have figured out how to build and test for that configuration.
Regards, /Niels
On Fri, Dec 18, 2020 at 11:31 AM Niels Möller nisse@lysator.liu.se wrote:
Maamoun TK maamoun.tk@googlemail.com writes:
It seems gcc for aarch64 doesn't support building 32-bit binaries, maybe we should remove the check of ABI since 64-bit is the only option.
Ok, that's a bit confusing. There's a command line flag for it, not -m32 but -mabi=ilp32, but that doesn't work out of the box with my (debian-packaged) cross compiler. Searching turns up this old (2015) email saying that gcc support is work-in-progress: https://gcc.gnu.org/legacy-ml/gcc-help/2015-02/msg00034.html
Also see https://gcc.gnu.org/legacy-ml/gcc-help/2016-06/msg00097.html
Jeff
nisse@lysator.liu.se (Niels Möller) writes:
Maamoun TK maamoun.tk@googlemail.com writes:
I made a merge request in the main repo that enables optimized GHASH on AArch64 architecture.
Nice! I've had a quick first look. For the organization, I think aarch64 assembly should go in it's own directory, arm64/, like it's done for x86 and sparc.
I've made a new branch "arm64" with the configure changes. If you think that looks ok, can you add your new ghash code on top of that?
(I'd like to make a similar branch for S390x. It would be good to also get S390x into the ci system, before adding s390x-specific assembly. I hope that should be easy to do with the same cross setup as for arm, arm64, mips, etc).
I wonder which assembly files we should use if target host is aarch64, but ABI=32? I guess the arm/v6/ code can be used unconditionally. Can we also use arm/neon/ code unconditionally?
The reference manual says
Armv8 can support the following levels of support for Advanced SIMD and floating-point instructions:
* Full SIMD and floating-point support without exception trapping.
* Full SIMD and floating-point support with exception trapping.
* No floating-point or SIMD support. This option is licensed only for implementations targeting specialized markets.
As far as I understand, that means Neon should be always available, in both 32-bit and 64-bit mode.
Regards, /Niels
On Tue, Jan 5, 2021 at 8:23 AM Niels Möller nisse@lysator.liu.se wrote:
nisse@lysator.liu.se (Niels Möller) writes:
...
The reference manual says
Armv8 can support the following levels of support for Advanced SIMD and floating-point instructions:
Full SIMD and floating-point support without exception trapping.
Full SIMD and floating-point support with exception trapping.
No floating-point or SIMD support. This option is licensed only for implementations targeting specialized markets.
As far as I understand, that means Neon should be always available, in both 32-bit and 64-bit mode.
NEON is called ASIMD under ARMv8. It is part of the base machine, like SSE2 is part of x86_64.
Jeff
On Tue, Jan 5, 2021 at 3:23 PM Niels Möller nisse@lysator.liu.se wrote:
I've made a new branch "arm64" with the configure changes. If you think that looks ok, can you add your new ghash code on top of that?
Great. I'll add the ghash code to the branch once I finish the big-endian support.
(It would be good to also get S390x into the ci system, before adding s390x-specific assembly. I hope that should be easy to do with the same cross setup as for arm, arm64, mips, etc).
This is not possible, since qemu doesn't support the cipher functions; it implements subcode 0 (query) without the actual encipher/decipher operations. Take a look here: https://git.qemu.org/?p=qemu.git;a=commit;h=be2b567018d987591647935a7c9648e9...

I had a talk with David Edelsohn about this issue and concluded that there is no support for the cipher functions in qemu, and it's unlikely to happen anytime soon. However, I updated the testutils to cover the s390x-specific assembly, so the patch can easily be tested manually by executing 'make check'. I have also tested every aspect of this patch to make sure everything will go well once it's merged.
I wonder which assembly files we should use if target host is aarch64, but ABI=32? I guess the arm/v6/ code can be used unconditionally. Can we also use arm/neon/ code unconditionally?
The reference manual says
Armv8 can support the following levels of support for Advanced SIMD and floating-point instructions:
Full SIMD and floating-point support without exception trapping.
Full SIMD and floating-point support with exception trapping.
No floating-point or SIMD support. This option is licensed only for implementations targeting specialized markets.
As far as I understand, that means Neon should be always available, in both 32-bit and 64-bit mode.
I'll investigate how we can build the existing NEON implementations on 64-bit systems.
regards, Mamone
Hello Maamoun,
On Tue, Jan 05, 2021 at 05:52:35PM +0200, Maamoun TK wrote:
I've made a new branch "arm64" with the configure changes. If you think that looks ok, can you add your new ghash code on top of that?
Great. I'll add the ghash code to the branch once I finish the big-endian support.
I've dusted off the pine64s I mentioned before. Both are running Gentoo, one little-endian, the other big-endian. I'd be happy to give anything you throw my way a whirl on real hardware.
# uname -a
Linux v 4.16.0-rc5-00012-g7cfbc0d114ca #1 SMP Tue Mar 13 18:55:14 CET 2018 aarch64_be GNU/Linux
(A newer kernel is coming.)
# file /usr/lib64/libnettle.so.8.0
/usr/lib64/libnettle.so.8.0: ELF 64-bit MSB shared object, ARM aarch64, version 1 (SYSV), dynamically linked, stripped
Regarding CI: I've recently updated my buildroot-based armv[567]b container images.[1] Something similar should be doable for aarch64_be.
[1] https://hub.docker.com/r/michaelweisernettleci/buildroot
Thank you, I will keep you updated about the progress of big-endian support for GHASH on the arm64 arch so we can test the patch on a real device before sending it to Niels.
regards, Mamone
Hello Maamoun,
On Tue, Jan 05, 2021 at 09:04:59PM +0200, Maamoun TK wrote:
Thank you, I will keep you updated about progress of big-endian support for GHASH on arm64 arch so we can test the patch on real device before sending it to Niels.
I've added aarch64_be buildroot toolchain container images to https://hub.docker.com/r/michaelweisernettleci/buildroot. Tags are michaelweisernettleci/buildroot:2020.11.1-aarch64_be-glibc and michaelweisernettleci/buildroot:2020.11.1-aarch64_be-uclibc.
I've also updated the arm CI branch[1] with an aarch64_be build[2] that runs the testsuite through qemu-user.
[1] https://gitlab.com/michaelweiser/nettle/-/tree/arm-ci-fat [2] https://gitlab.com/michaelweiser/nettle/-/blob/arm-ci-fat/.gitlab-ci.yml#L17...
The BE pine64 board is also all updated now and standing by.
I have tuned the ghash patch to support big-endian mode, but I'm really having difficulties testing it through emulation. I'll attach the patch here so you can test it, but I'm not sure how I can fix any bugs on a big-endian system. Feel free to send debugging info or set up a remote ssh connection so we can get it working properly.
The patch is built on top of the master branch.
regards, Mamone
Hello Mamone,
On Mon, Jan 11, 2021 at 11:39:43PM +0200, Maamoun TK wrote:
I have tuned the ghash patch to support big-endian mode but I'm really having difficulties testing it out through emulating, I'll attach the patch here so you can test it but I'm not sure how I can fix the bugs on big-endian system if any, you can feel free to send debugging info or setup a remote ssh connection so we can get it work properly.
Out of curiosity as I can't seem to find the beginning of the discussion: Is there anyone but me with an actual use-case for big-endian arm64 here? If not, I'd hate to cause a lot of effort for you and would certainly put in the effort to get this going myself.
The patch is built on top of the master branch.
First it failed to compile gcm-hash.o with error "No rule to make target" which turned out to be caused by a missing arm64/machine.m4. After I added an empty file there it compiled fine on aarch64 and the testsuite succeeded on the actual hardware as well as under qemu-aarch64 user mode emulation (both LE).
On aarch64_be it fails to compile with the following error message:
gcm-hash.s:113: Error: unknown mnemonic `zip' -- `zip v23.2d,v2.2d,v22.2d'
gcm-hash.s:119: Error: unknown mnemonic `zip' -- `zip v25.2d,v3.2d,v22.2d'
gcm-hash.s:129: Error: unknown mnemonic `zip' -- `zip v27.2d,v4.2d,v22.2d'
gcm-hash.s:137: Error: unknown mnemonic `zip' -- `zip v29.2d,v5.2d,v22.2d'
This happens with gcc 10.2.0 on my hardware board as well as cross gcc 9.3.0 of Buildroot 2020.11.1 in a container.
I did a search of the aarch64 instruction set and saw that there's zip1 and zip2 instructions. So as a first test I just changed zip to zip1 which made it compile. As was to be expected, the testsuite failed though.
Before you try and get me up to speed on what the routine is supposed to be doing there's also an option for you to get a cross toolchain and emulator for your own tests without too much effort. Here's how I cross-compile nettle and run the testsuite using rootless podman (docker should do just as well) on my x86_64 box:
cd ~/Downloads
mkdir nettle
cd nettle
git clone https://git.lysator.liu.se/nettle/nettle
cd nettle
git apply ~/arm64_ghash.patch
./.bootstrap
podman run -it -v ~/Downloads/nettle:/nettle michaelweisernettleci/buildroot:2020.11.1-aarch64_be-glibc-gdb
cd /nettle/
mkdir build-aarch64_be
cd build-aarch64_be/
../nettle/configure --host=$(cat /buildroot/triple) --enable-armv8-a-crypto
make -j4
make -j4 check EMULATOR=/buildroot/qemu
Unfortunately, because in this case qemu-aarch64_be is running the testsuite binaries under emulation and doesn't support the ptrace syscall (and containers usually don't either), you can't just run it under an aarch64_be native gdb to see what it's executing.
One option would be to boot a full BE system image with kernel in qemu-system-aarch64 including a native gdb. But that's a bit of a hassle (building a rootfs and kernel e.g. using buildroot, getting it to boot in qemu, accessing it via console or network, ...)
qemu-user can however serve as a gdb server similar to qemu-system[1]. [1] https://qemu.readthedocs.io/en/latest/system/gdb.html
As luck would have it, above container image contains an x86_64-native gdb targeting aarch64_be. So you can start the testsuite test under qemu with the -g option and a port to listen on for the gdb remote debugging connection and then fire up gdb and connect there. After that you can debug as usual, single-step and look at register values:
root@6c85515d3939:/nettle/build-aarch64_be/testsuite# /buildroot/qemu -E LD_LIBRARY_PATH=../.lib -g 9000 ./gcm-test &
[1] 4205
root@6c85515d3939:/nettle/build-aarch64_be/testsuite# aarch64_be-buildroot-linux-gnu-gdb ./gcm-test
GNU gdb (GDB) 8.3.1
[...]
Reading symbols from ./gcm-test...
(gdb) break main
Breakpoint 1 at 0x4037b0: file ../../nettle/testsuite/testutils.c, line 123.
(gdb) target remote localhost:9000
Remote debugging using localhost:9000
warning: remote target does not support file transfer, attempting to access files from local filesystem.
warning: Unable to find dynamic linker breakpoint function.
GDB will be unable to debug shared library initializers and track explicitly loaded dynamic code.
0x0000004000802040 in ?? ()
(gdb) c
Continuing.
warning: Could not load shared library symbols for 3 libraries, e.g. /usr/lib64/libgmp.so.10.
Use the "info sharedlibrary" command to see the complete listing.
Do you need "set solib-search-path" or "set sysroot"?

Breakpoint 1, main (argc=1, argv=0x4000800d58) at ../../nettle/testsuite/testutils.c:123
123       if (argc > 1)
(gdb) b _nettle_gcm_init_key
Breakpoint 2 at 0x40008b69f4: file gcm-hash.s, line 93.
(gdb) c
Continuing.

Breakpoint 2, _nettle_gcm_init_key () at gcm-hash.s:93
93        ldr q2,[x0,#16*128]
(gdb) s
94        dup v0.16b,v2.b[0]
(gdb)
96        mov x1,#0xC200000000000000
(gdb)
97        mov x2,#1
(gdb)
98        mov v6.d[0],x1
(gdb)
99        mov v6.d[1],x2
(gdb)
100       sshr v0.16b,v0.16b,#7
(gdb)
101       and v0.16b,v0.16b,v6.16b
(gdb)
102       ushr v1.2d,v2.2d,#63
(gdb)
103       and v1.16b,v1.16b,v6.16b
(gdb)
104       ext v1.16b,v1.16b,v1.16b,#8
(gdb)
105       shl v2.2d,v2.2d,#1
(gdb)
106       orr v2.16b,v2.16b,v1.16b
(gdb)
107       eor v2.16b,v2.16b,v0.16b
(gdb)
109       dup v6.2d,v6.d[0]
(gdb)
113       PMUL_PARAM v2,v23,v24
          ^--- doesn't seem to expand the macro here
(gdb)
115       PMUL v2,v23,v24
(gdb)
117       REDUCTION v3
(gdb) i r
x0             0x423390            4338576
x1             0xc200000000000000  -4467570830351532032
[...]
x30            0x406c44            4222020
sp             0x4000800ad0        0x4000800ad0
pc             0x40008b6a5c        0x40008b6a5c <_nettle_gcm_init_key+104>
cpsr           0x80000000          -2147483648
fpsr           0x0                 0
fpcr           0x0                 0
The trick to see and single-step the individual instructions of the macro seems to be disp/i $pc combined with stepi:
(gdb) disp/i $pc
1: x/i $pc
=> 0x40008b6a30 <_nettle_gcm_init_key+60>:   pmull2  v20.1q, v2.2d, v6.2d
(gdb) stepi
0x00000040008b6a34   113   PMUL_PARAM v2,v23,v24
1: x/i $pc
=> 0x40008b6a34 <_nettle_gcm_init_key+64>:   ext     v22.16b, v2.16b, v2.16b, #8
(gdb)
0x00000040008b6a38   113   PMUL_PARAM v2,v23,v24
1: x/i $pc
=> 0x40008b6a38 <_nettle_gcm_init_key+68>:   eor     v22.16b, v22.16b, v20.16b
(gdb)
0x00000040008b6a3c   113   PMUL_PARAM v2,v23,v24
1: x/i $pc
=> 0x40008b6a3c <_nettle_gcm_init_key+72>:   zip1    v23.2d, v2.2d, v22.2d
From here I would now continue to compare register contents after each instruction on LE and BE to see where it's going wrong.
How would you like to proceed? Shall I dig into it or do you want to? :)
BTW: In case you want to build the image yourself, the diff to the Dockerfile.aarch64[3] is this:
diff --git a/Dockerfile.aarch64 b/Dockerfile.aarch64
index 36af2c5..5b51c17 100644
--- a/Dockerfile.aarch64
+++ b/Dockerfile.aarch64
@@ -41,6 +41,7 @@ RUN br_libc="${BR_LIBC}" ; \
          echo "BR2_TOOLCHAIN_BUILDROOT_${libcopt}=y" ; \
          echo 'BR2_KERNEL_HEADERS_4_19=y' ; \
          echo 'BR2_PACKAGE_GMP=y' ; \
+         echo 'BR2_PACKAGE_HOST_GDB=y' ; \
          echo 'BR2_PER_PACKAGE_DIRECTORIES=y' ; \
        ) > .config && \
        make olddefconfig && \
@@ -75,7 +76,7 @@ MAINTAINER Michael Weiser michael.weiser@gmx.de
 RUN apt-get update -qq -y && \
        apt-get dist-upgrade -y && \
        apt-get autoremove -y && \
-       apt-get install -y autoconf dash g++ make qemu-user && \
+       apt-get install -y autoconf dash g++ libncurses6 libexpat1 make qemu-user && \
        apt-get clean all && \
        rm -rf /var/lib/apt/lists/*
[3] https://github.com/michaelweiser-nettle-ci/docker-buildroot/blob/master/Dock...
The command to build the image is:
podman build -f Dockerfile.aarch64 --build-arg BR_LIBC=glibc -t buildroot:2020.11.1-aarch64_be-glibc-gdb .
Hi Michael,
On Wed, Jan 13, 2021 at 8:00 PM Michael Weiser michael.weiser@gmx.de wrote:
Out of curiosity as I can't seem to find the beginning of the discussion: Is there anyone but me with an actual use-case for big-endian arm64 here? If not, I'd hate to cause a lot of effort for you and would certainly put in the effort to get this going myself.
It would be nice to get the implementation of the enhanced algorithm working for both endian modes, as it yields a good performance boost. Also, there is not much effort involved here; the only thing I'm struggling with is getting the binary built for aarch64_be. I'm using Ubuntu on x86_64 as the host, and it seems there is no official package to cross-compile for aarch64_be.
The patch is built on top of the master branch.
First it failed to compile gcm-hash.o with error "No rule to make target" which turned out to be caused by a missing arm64/machine.m4. After I added an empty file there it compiled fine on aarch64 and the testsuite succeeded on the actual hardware as well as under qemu-aarch64 user mode emulation (both LE).
On aarch64_be it fails to compile with the following error message:
gcm-hash.s:113: Error: unknown mnemonic `zip' -- `zip v23.2d,v2.2d,v22.2d' gcm-hash.s:119: Error: unknown mnemonic `zip' -- `zip v25.2d,v3.2d,v22.2d' gcm-hash.s:129: Error: unknown mnemonic `zip' -- `zip v27.2d,v4.2d,v22.2d' gcm-hash.s:137: Error: unknown mnemonic `zip' -- `zip v29.2d,v5.2d,v22.2d'
This happens with gcc 10.2.0 on my hardware board as well as cross gcc 9.3.0 of Buildroot 2020.11.1 in a container.
I did a search of the aarch64 instruction set and saw that there's zip1 and zip2 instructions. So as a first test I just changed zip to zip1 which made it compile. As was to be expected, the testsuite failed though.
You are on the right track so far.
Before you try and get me up to speed on what the routine is supposed to be doing there's also an option for you to get a cross toolchain and emulator for your own tests without too much effort. Here's how I cross-compile nettle and run the testsuite using rootless podman (docker should do just as well) on my x86_64 box:
cd ~/Downloads
mkdir nettle
cd nettle
git clone https://git.lysator.liu.se/nettle/nettle
cd nettle
git apply ~/arm64_ghash.patch
./.bootstrap
podman run -it -v ~/Downloads/nettle:/nettle michaelweisernettleci/buildroot:2020.11.1-aarch64_be-glibc-gdb
cd /nettle/
mkdir build-aarch64_be
cd build-aarch64_be/
../nettle/configure --host=$(cat /buildroot/triple) --enable-armv8-a-crypto
make -j4
make -j4 check EMULATOR=/buildroot/qemu
I tried that, but I'm having difficulty getting it to work. It seems there is a problem in my system configuration that prevents podman from establishing a socket for the connection, and I spent some time looking for alternative solutions with no luck. Do you have any other solutions? All I can think of is to either set up an ssh connection or work together to get it working, if you are into it!
regards, Mamone
Hello Mamone,
On Mon, Jan 18, 2021 at 06:27:40PM +0200, Maamoun TK wrote:
It would be nice to get the implementation of the enhanced algorithm working for both endian modes as it yields a good performance boost. Also, there is no much effort here, the only thing I'm struggling with is to get the binary built for Aarch64_be, I'm using Ubuntu on x86_64 as host and it seems there is no official package to cross compile for Aarch64_be.
Yes, there are no packages for aarch64_be in any mainstream distribution I'm aware of. Buildroot and Gentoo are the ones I know that can target it, Yocto likely as well. All are compile-yourself distributions and not for the faint of heart. Also, I've just learned that Buildroot has made a conscious decision not to produce native toolchains for the target. So you can only ever cross-compile nettle to it, run it on an actual board or under qemu and then go back to the cross-compiler on the host.
I did a search of the aarch64 instruction set and saw that there's zip1 and zip2 instructions. So as a first test I just changed zip to zip1 which made it compile. As was to be expected, the testsuite failed though.
You are on the right track so far.
I've poked at the code a bit more and seemingly made the key init function work by eliminating all the BE-specific macros and instead adjusting the load from memory to produce the same register content. At least register values and the final output to memory look the same in an x/64xb $x0-64 and x/64xb $x0 for the first test cases in gcm-test (which they did not before).
137       PMUL_PARAM v5,v29,v30
(gdb)
139       st1 {v27.16b,v28.16b,v29.16b,v30.16b},[x0]
(gdb)
141       ret
(gdb) x/64xb $x0-64
0xaaaaaaac5390: 0x77 0x58 0x14 0xdf 0xa9 0x97 0xd2 0xcd
[.. all the same on BE and LE ...]
0xaaaaaaac53c8: 0x0d 0x12 0x63 0x69 0x37 0x20 0xd3 0xfe
(gdb) x/64xb $x0
0xaaaaaaac53d0: 0xf9 0xfa 0x22 0xc3 0x02 0xe7 0x95 0x86
[.. all the same on BE and LE ...]
0xaaaaaaac5408: 0x45 0x91 0xbd 0x48 0x73 0xd9 0x8b 0x5c
(gdb)
The problem here once more seems to be that after a 128-bit LE load which is later used as two 64-bit operands, not only are the bytes of the operands reversed (which you already counter by rev64'ing them, I gather), but the operands (doublewords) also end up transposed in the register. This is something the rest of the routine expects, but it is only true on LE. So I adjusted for it on BE in a very pedestrian way:
diff --git a/arm64/v8/gcm-hash.asm b/arm64/v8/gcm-hash.asm
index 1c14db54..74cd656a 100644
--- a/arm64/v8/gcm-hash.asm
+++ b/arm64/v8/gcm-hash.asm
@@ -55,17 +55,10 @@ C common macros:
 .endm

 .macro REDUCTION out
-IF_BE(`
-    pmull          T.1q,F.1d,POLY.1d
-    ext            \out().16b,F.16b,F.16b,#8
-    eor            R.16b,R.16b,T.16b
-    eor            \out().16b,\out().16b,R.16b
-',`
     pmull          T.1q,F.1d,POLY.1d
     eor            R.16b,R.16b,T.16b
     ext            R.16b,R.16b,R.16b,#8
     eor            \out().16b,F.16b,R.16b
-')
 .endm

 C void gcm_init_key (union gcm_block *table)
@@ -108,19 +101,11 @@ define(`H4M', `v29')
 define(`H4L', `v30')

 .macro PMUL_PARAM in, param1, param2
-IF_BE(`
-    pmull2         Hp.1q,\in().2d,POLY.2d
-    ext            Hm.16b,\in().16b,\in().16b,#8
-    eor            Hm.16b,Hm.16b,Hp.16b
-    zip            \param1().2d,\in().2d,Hm.2d
-    zip2           \param2().2d,\in().2d,Hm.2d
-',`
     pmull2         Hp.1q,\in().2d,POLY.2d
     eor            Hm.16b,\in().16b,Hp.16b
     ext            \param1().16b,Hm.16b,\in().16b,#8
     ext            \param2().16b,\in().16b,Hm.16b,#8
     ext            \param1().16b,\param1().16b,\param1().16b,#8
-')
 .endm

 PROLOGUE(_nettle_gcm_init_key)
@@ -128,6 +113,10 @@ PROLOGUE(_nettle_gcm_init_key)
     dup            EMSB.16b,H.b[0]
 IF_LE(`
     rev64          H.16b,H.16b
+',`
+    mov            x1,H.d[0]
+    mov            H.d[0],H.d[1]
+    mov            H.d[1],x1
 ')
     mov            x1,#0xC200000000000000
     mov            x2,#1
If my understanding is correct, we could avoid the doubleword swap for both LE and BE if we were to load using ld1 into {H.16b} instead (with a precalculation of the offset, because ld1 won't take an immediate offset that high, correct?). But then the rest of the routine would need to change its expectation of what H.d[0] and H.d[1] contain, respectively, because they would no longer be transposed by either the load on LE or an explicit swap on BE.
Somehow I have a feeling I'm terribly missing the actual point here, though. Are the zip instructions likely to give even further speedup beyond the LE version? Could this be exploited for LE as well by adjusting the loading scheme even more?
Also, it's not fully working yet. Before digging deeper I wanted to give a bit of an update and get guidance as to how to proceed.
podman run -it -v ~/Downloads/nettle:/nettle
I tried that but I'm having difficulty getting it work, it seems there is a problem in my system configuration that prevents podman establishing a socket for connection, I spend some time looking for alternative solutions with no chance. Do you have any other solutions? all what I can think of is either setup ssh connection or work together to get it work if you are into it!
I mulled this over from all directions. Access to the actual board is somewhat complicated by the limits of my available Internet connections (CGNAT being one, missing DMZ functionality on the routers another). It can certainly be done, I just would need some time to set it up.
But I have made the cross-compiling and -debugging setup of the container available on a vserver on the Net. Send me a mail directly with an SSH ID public key if you'd like to try this out and I'll send you instructions for login and use. We could meet up there in a tmux/screen session and work on it together as well.
I have also tried to extract the buildroot toolchain from the image and run it on my Gentoo box as well as Debian. It even seems relocatable, so you can just put it anywhere and add it to PATH and it'll work. If you want, I can put a tarball with the toolchain and qemu wrappers up on a web server somewhere for you to grab. (I just thought, a container image would be the easier delivery method nowadays. :)
Otherwise, what's your error message from podman? It's got no deamon, so it shouldn't need a socket to connect to it like docker does. Out to the Internet for image download it's also a standard client and respects environment variables for proxies as usual.
rootless podman (running as your standard user instead of root) can take a bit of tweaking before it stops throwing error messages but once that's done it works nicely. I've never actually run podman as root by luck of late birth with regards to containers.
Here's my command sequence on a Ubuntu 20.04 VM that's never seen rootless podman before as per https://www.vultr.com/docs/how-to-install-and-use-podman-on-ubuntu-20-04 (literally the first hit on search, can't vouch for the packages from opensuse though):
michael@demo:~$ podman
Command 'podman' not found, did you mean:
command 'pod2man' from deb perl (5.30.0-9ubuntu0.2)
Try: sudo apt install <deb name>
michael@demo:~$ source /etc/os-release
michael@demo:~$ sudo sh -c "echo 'deb http://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stabl... /' > /etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list"
michael@demo:~$ wget -nv https://download.opensuse.org/repositories/devel:kubic:libcontainers:stable/... -O- | sudo apt-key add -
2021-01-19 21:13:19 URL:https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stab... [1093/1093] -> "-" [1] OK
michael@demo:~$ sudo apt-get update -qq
michael@demo:~$ sudo apt-get -qq --yes install podman fuse-overlayfs slirp4netns
[...]
michael@demo:~$ podman run -it michaelweisernettleci/buildroot:2020.11.1-aarch64_be-glibc-gdb
Completed short name "michaelweisernettleci/buildroot" with unqualified-search registries (origin: /etc/containers/registries.conf)
Trying to pull docker.io/michaelweisernettleci/buildroot:2020.11.1-aarch64_be-glibc-gdb...
Getting image source signatures
Copying blob 6c33745f49b4 done
Copying blob ff35d554f2d5 done
Copying blob 3927b287d6b9 done
Copying blob 6bbc022f227c done
Copying config 21663e44fe done
Writing manifest to image destination
Storing signatures
root@06e70f1e12e4:/# aarch64_be-buildroot-linux-gnu-gcc -v
Using built-in specs.
COLLECT_GCC=/buildroot/output/host/bin/aarch64_be-buildroot-linux-gnu-gcc.br_real
COLLECT_LTO_WRAPPER=/buildroot/output/host/bin/../libexec/gcc/aarch64_be-buildroot-linux-gnu/9.3.0/lto-wrapper
Target: aarch64_be-buildroot-linux-gnu
Configured with: ./configure --prefix=/buildroot/output/per-package/host-gcc-final/host [...] --enable-shared --disable-libgomp --silent
Thread model: posix
gcc version 9.3.0 (Buildroot 2020.11.1)
root@06e70f1e12e4:/# git clone https://git.lysator.liu.se/nettle/nettle
bash: git: command not found
root@06e70f1e12e4:/# apt-get update
Get:1 http://security.debian.org/debian-security buster/updates InRelease [65.4 kB]
Get:2 http://deb.debian.org/debian buster InRelease [121 kB]
[...]
root@06e70f1e12e4:/# apt-get install git
Reading package lists... Done
Building dependency tree
Reading state information... Done
The following additional packages will be installed:
  ca-certificates git-man krb5-locales less libbsd0 libcurl3-gnutls
[...]
root@06e70f1e12e4:/# git clone https://git.lysator.liu.se/nettle/nettle
Cloning into 'nettle'...
warning: redirecting to https://git.lysator.liu.se/nettle/nettle.git/
remote: Enumerating objects: 721, done.
remote: Counting objects: 100% (721/721), done.
remote: Compressing objects: 100% (349/349), done.
remote: Total 21095 (delta 479), reused 593 (delta 372), pack-reused 20374
Receiving objects: 100% (21095/21095), 5.90 MiB | 3.47 MiB/s, done.
Resolving deltas: 100% (15748/15748), done.
root@06e70f1e12e4:/#
That was a lot easier than even I expected. Necessary stuff like entries in /etc/subuid are automatically added by useradd as standard nowadays without podman even being installed:
michael@demo:~$ cat /etc/subuid
michael:100000:65536
Hope that helps.
If all else fails and it's not too trying for your patience I'm up for making it work iteratively by trial, error and discussion as above. ;)
Hello Michael,
On Tue, Jan 19, 2021 at 11:45 PM Michael Weiser michael.weiser@gmx.de wrote:
Yes, there are no packages for aarch64_be in any mainstream distribution I'm aware of. Buildroot and Gentoo are the ones I know that can target it; Yocto likely as well. All are compile-it-yourself distributions and not for the faint of heart. Also, I've just learned that Buildroot has made a conscious decision not to produce native toolchains for the target. So you can only ever cross-compile nettle to it, run it on an actual board or under qemu, and then go back to the cross-compiler on the host.
I'm trying to install Gentoo on VMware by walking through this recipe https://medium.com/@steensply/vmware-installation-of-gentoo-linux-from-scrat... I'm in the middle of it now; there are a lot of instructions, but I'm going to get the OS working in the end.
I did a search of the aarch64 instruction set and saw that there are zip1 and zip2 instructions. So as a first test I just changed zip to zip1, which made it compile. As was to be expected, the testsuite failed though.
You are on the right track so far.
I've poked at the code a bit more and seemingly made the key init function work by eliminating all the BE-specific macros and instead adjusting the load from memory to produce the same register content. At least the register values and the final output to memory look the same in an x/64xb $x0-64 and x/64xb $x0 for the first test cases in gcm-test (which they did not before).
137         PMUL_PARAM v5,v29,v30
(gdb)
139         st1 {v27.16b,v28.16b,v29.16b,v30.16b},[x0]
(gdb)
141         ret
(gdb) x/64xb $x0-64
0xaaaaaaac5390: 0x77 0x58 0x14 0xdf 0xa9 0x97 0xd2 0xcd
[.. all the same on BE and LE ...]
0xaaaaaaac53c8: 0x0d 0x12 0x63 0x69 0x37 0x20 0xd3 0xfe
(gdb) x/64xb $x0
0xaaaaaaac53d0: 0xf9 0xfa 0x22 0xc3 0x02 0xe7 0x95 0x86
[.. all the same on BE and LE ...]
0xaaaaaaac5408: 0x45 0x91 0xbd 0x48 0x73 0xd9 0x8b 0x5c
(gdb)
Here is how I get the vector instructions to operate on registers in LE mode; I'll take this instruction as an example: pmull v0.1q,v1.1d,v2.1d
Input represented as indexes:
v1: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
v2: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
The instruction byte-reverses each 64-bit part of the register, so it considers the registers as follows:
v1: 7 6 5 4 3 2 1 0 15 14 13 12 11 10 9 8
v2: 7 6 5 4 3 2 1 0 15 14 13 12 11 10 9 8
So what I did in LE mode is reverse the 64-bit parts with rev64 before executing the doubleword operation; accordingly, the pmull output will be 128-bit byte-reversed.
Output represented as indexes:
v0: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
What I'm assuming in BE mode is that operations are performed the normal way on the register side, so there's no need to reverse the inputs, and we additionally get normal output; hence the macros "REDUCTION" and "PMUL_PARAM" differ in their structure. It's not a matter of the zip instruction performing better, but of how to handle the weird situation in LE mode.
The problem here once more seems to be that after a 128bit LE load which is later used as two 64bit operands, not only the bytes of the operands are reversed (which you already counter by rev64'ing them, I gather) but the operands (doublewords) also end up transposed in the register. This is something the rest of the routine expects but is only true on LE. So I adjusted for it on BE in a very pedestrian way:
diff --git a/arm64/v8/gcm-hash.asm b/arm64/v8/gcm-hash.asm
index 1c14db54..74cd656a 100644
--- a/arm64/v8/gcm-hash.asm
+++ b/arm64/v8/gcm-hash.asm
@@ -55,17 +55,10 @@ C common macros:
 .endm

 .macro REDUCTION out
-IF_BE(`
-    pmull    T.1q,F.1d,POLY.1d
-    ext      \out().16b,F.16b,F.16b,#8
-    eor      R.16b,R.16b,T.16b
-    eor      \out().16b,\out().16b,R.16b
-',`
     pmull    T.1q,F.1d,POLY.1d
     eor      R.16b,R.16b,T.16b
     ext      R.16b,R.16b,R.16b,#8
     eor      \out().16b,F.16b,R.16b
-')
 .endm

 C void gcm_init_key (union gcm_block *table)
@@ -108,19 +101,11 @@ define(`H4M', `v29')
 define(`H4L', `v30')

 .macro PMUL_PARAM in, param1, param2
-IF_BE(`
-    pmull2   Hp.1q,\in().2d,POLY.2d
-    ext      Hm.16b,\in().16b,\in().16b,#8
-    eor      Hm.16b,Hm.16b,Hp.16b
-    zip      \param1().2d,\in().2d,Hm.2d
-    zip2     \param2().2d,\in().2d,Hm.2d
-',`
     pmull2   Hp.1q,\in().2d,POLY.2d
     eor      Hm.16b,\in().16b,Hp.16b
     ext      \param1().16b,Hm.16b,\in().16b,#8
     ext      \param2().16b,\in().16b,Hm.16b,#8
     ext      \param1().16b,\param1().16b,\param1().16b,#8
-')
 .endm

 PROLOGUE(_nettle_gcm_init_key)
@@ -128,6 +113,10 @@ PROLOGUE(_nettle_gcm_init_key)
     dup      EMSB.16b,H.b[0]
 IF_LE(`
     rev64    H.16b,H.16b
+',`
+    mov      x1,H.d[0]
+    mov      H.d[0],H.d[1]
+    mov      H.d[1],x1
 ')
     mov      x1,#0xC200000000000000
     mov      x2,#1
If my understanding is correct, we could avoid the doubleword swap for both LE and BE if we were to load using ld1 into {H.16b} instead (with a precalculation of the offset, because ld1 won't take an immediate offset that high, correct?). But then the rest of the routine would need to change its expectation of what H.d[0] and H.d[1] contain, respectively, because they will no longer be transposed by either the load on LE or an explicit swap on BE.
Somehow I have a feeling, I'm terribly missing the actual point here, though. Are the zip instructions likely to give even further speedup beyond the LE version? Could this be exploited for LE as well by adjusting the loading scheme even more?
If my assumption about how the instruction operates in BE mode is right, then yes, this is not the actual point.
But I have made the cross-compiling and -debugging setup of the container available on a vserver on the Net. Send me a mail directly with an SSH ID public key if you'd like to try this out and I'll send you instructions for login and use. We could meet up there in a tmux/screen session and work on it together as well.
Let's try the second solution before we get to this.
I have also tried to extract the buildroot toolchain from the image and run it on my Gentoo box as well as Debian. It even seems relocatable, so you can just put it anywhere and add it to PATH and it'll work. If you want, I can put a tarball with the toolchain and qemu wrappers up on a web server somewhere for you to grab. (I just thought, a container image would be the easier delivery method nowadays. :)
I would like to try this method in case my Gentoo installation fails, or if it's simply easier to extract your uploaded packages and add them to PATH. Update: while writing this message I got: no space left on device. It seems I set low numbers while partitioning the device. Let's try the above method before I start over and install Gentoo again.
Otherwise, what's your error message from podman? It's got no daemon, so it shouldn't need a socket to connect to it like docker does. For image downloads out to the Internet it's also a standard client and respects the usual proxy environment variables.
I got "Error: error creating network namespace for container". I think I can fix it by tracing the problem, but let's try the other methods first as I think they're going to be simpler for me.
regards, Mamone
Hello Mamone,
On Wed, Jan 20, 2021 at 10:25:19PM +0200, Maamoun TK wrote:
I'm trying to install Gentoo on VMware by walking through this receip https://medium.com/@steensply/vmware-installation-of-gentoo-linux-from-scrat... I'm in the middle of receip now but there a lot of instruction there so I'm gonna get the os working in the end.
As far as I can tell that recipe only encompasses basic installation. You'd additionally need to run crossdev to create a cross-toolchain and then install qemu as well. Gentoo has a very steep learning curve. There's no benefit compared to buildroot for our use-case here, IMO.
Here how I get the vector instruction operate on registers in LE mode, i'll take this instruction as example: pmull v0.1q,v1.1d,v2.1d Input represented as indexes v1: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 v2: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 the instruction byte-reverse each of 64-bit parts of register so the instruction consider the register as follow v1: 7 6 5 4 3 2 1 0 15 14 13 12 11 10 9 8 v2: 7 6 5 4 3 2 1 0 15 14 13 12 11 10 9 8 so what I did in LE mode is reverse the 64-bit parts before execute the doublework operation using rev64 instruction, according to that the pmull output will be 128-bit byte-reversed Output represented as indexes v0: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
What I'm assuming in BE mode is operations are performed in normal way in registers side so no need to reverse the inputs in addition to get normal output hence the macros "REDUCTION" and "PMUL_PARAM" have differences in their structure, it's not matter of zip instruction perform better but how to handle the weird situation in LE mode.
I've tried for a number of hours to make this work today. Always when I added correct handling of the transposed doublewords to one macro, another broke down. To me the problem comes down to this: ldr HQ,[TABLE...] and st1.16b are fighting each other and can't be brought together without a lot of additional instructions (at least not by me).
Longer story: ldr does a 128bit load. This loads bytes in exactly reverse order into the register on LE and BE. As you describe above, the macros for LE expect a state which is neither of those: the bytes transposed as if BE, but the doublewords as loaded on LE. For BE this poses the opposite problem: it natively loads bytes in the order LE has to reproduce using rev64, but then needs to reproduce the doubleword order of LE for the LE routines to work, or basically have native BE routines.
That's what my last pedestrian change did. After today I'd perhaps write it like this (untested):
@@ -125,10 +135,12 @@ IF_BE(`
 PROLOGUE(_nettle_gcm_init_key)
     ldr      HQ,[TABLE,#16*H_Idx]
-    dup      EMSB.16b,H.b[0]
 IF_LE(`
     rev64    H.16b,H.16b
+',`
+    ext      H.16b,H.16b,H.16b,#8
 ')
+    dup      EMSB.16b,H.b[7]
     mov      x1,#0xC200000000000000
     mov      x2,#1
     mov      POLY.d[0],x1
When trying to cater to the current layout on LE, all the other vectors which are later used in conjunction with H need to be reversed as well. That leads to this diff on top of your initial patch:
@@ -125,14 +135,21 @@ IF_BE(`
 PROLOGUE(_nettle_gcm_init_key)
     ldr      HQ,[TABLE,#16*H_Idx]
-    dup      EMSB.16b,H.b[0]
 IF_LE(`
+    dup      EMSB.16b,H.b[0]
     rev64    H.16b,H.16b
+',`
+    dup      EMSB.16b,H.b[15]
 ')
     mov      x1,#0xC200000000000000
     mov      x2,#1
+IF_LE(`
     mov      POLY.d[0],x1
     mov      POLY.d[1],x2
+',`
+    mov      POLY.d[1],x1
+    mov      POLY.d[0],x2
+')
     sshr     EMSB.16b,EMSB.16b,#7
     and      EMSB.16b,EMSB.16b,POLY.16b
     ushr     B.2d,H.2d,#63
@@ -142,7 +159,11 @@ IF_LE(`
     orr      H.16b,H.16b,B.16b
     eor      H.16b,H.16b,EMSB.16b
+IF_LE(`
     dup      POLY.2d,POLY.d[0]
+',`
+    dup      POLY.2d,POLY.d[1]
+')
C --- calculate H^2 = H*H ---
The difference in index in dup EMSB nicely shows the doubleword transposition compared to LE. If on LE the dup was done after the rev64, it'd be H.b[7] vs. H.b[15].
With this layout PMUL_PARAM can work on H and POLY but then needs to use pmull instead of pmull2 because the relevant data is in the other doublewords compared to LE. On the other hand, since the output of PMUL_PARAM is to be stored using st1.16b it must not have the doublewords transposed ("load-inverted" I termed it in the comments ;). That leads to the following adjustment and makes the first 16bytes of TABLE identical to LE:
@@ -109,11 +118,12 @@ define(`H4L', `v30')
 .macro PMUL_PARAM in, param1, param2
 IF_BE(`
-    pmull2   Hp.1q,\in().2d,POLY.2d
+    pmull    Hp.1q,\in().1d,POLY.1d
     ext      Hm.16b,\in().16b,\in().16b,#8
     eor      Hm.16b,Hm.16b,Hp.16b
-    zip      \param1().2d,\in().2d,Hm.2d
-    zip2     \param2().2d,\in().2d,Hm.2d
+    C output must be in native register order (not load-inverted) for st1.16b to work
+    zip2     \param1().2d,\in().2d,Hm.2d
+    zip1     \param2().2d,\in().2d,Hm.2d
 ',`
     pmull2   Hp.1q,\in().2d,POLY.2d
     eor      Hm.16b,\in().16b,Hp.16b
PMUL is where it breaks down, at least for my brain: its first call is handed H (which has doublewords "transposed" from the initial ldr) and H1M and H1L (which must not have doublewords transposed, so that st1.16b writes them to memory in the correct order). It wants to pmull/pmull2 them, which requires corresponding doublewords at the same index. So we'd need to temporarily transpose \in for that:
@@ -46,25 +46,34 @@ define(`R1', `v19')
 C common macros:
 .macro PMUL in, param1, param2
-    pmull    F.1q,\param2().1d,\in().1d
-    pmull2   F1.1q,\param2().2d,\in().2d
-    pmull    R.1q,\param1().1d,\in().1d
-    pmull2   R1.1q,\param1().2d,\in().2d
+    C PMUL_PARAM left us with \param1 and \param2 in native register order but
+    C \in is load-inverted from initial load of H using ldr, something must give
+IF_BE(`
+    ext      T.16b,\in().16b,\in().16b,#8
+',`
+    mov      T.16b,\in().16b
+')
+    pmull    F.1q,\param2().1d,T.1d
+    pmull2   F1.1q,\param2().2d,T.2d
+    pmull    R.1q,\param1().1d,T.1d
+    pmull2   R1.1q,\param1().2d,T.2d
     eor      F.16b,F.16b,F1.16b
     eor      R.16b,R.16b,R1.16b
 .endm
If we finally artificially restore the doubleword transposition in REDUCTION for H2 and H3, we're all set for the next calls:
 .macro REDUCTION out
 IF_BE(`
-    pmull    T.1q,F.1d,POLY.1d
     ext      \out().16b,F.16b,F.16b,#8
-    eor      R.16b,R.16b,T.16b
-    eor      \out().16b,\out().16b,R.16b
+    pmull2   T.1q,\out().2d,POLY.2d
 ',`
     pmull    T.1q,F.1d,POLY.1d
+')
     eor      R.16b,R.16b,T.16b
     ext      R.16b,R.16b,R.16b,#8
     eor      \out().16b,F.16b,R.16b
+C artificially restore load inversion for PMUL_PARAM :-(
+IF_BE(`
+    ext      \out().16b,\out().16b,\out().16b,#8
 ')
 .endm
So all we're doing is catering to the quirk of the very first ldr operation. The easiest solution seems to me to align all types of load and store operations with each other or counteract their quirks right after or before executing them. That way we end up with identical register contents on LE and BE and don't have to maintain separate implementations.
That'd be in line with what we ended up with on arm32 NEON as well. memxor3.asm does do the dance of working with different register content but there it's only bitwise operations and the load and store operations have identical behaviour.
The advantage of the current implementation with transposed doublewords and only the LE routines seems to me that overhead on LE and BE would be about the same.
Do you think it makes sense to try and adjust the code to work with the BE layout natively and have a full 128-bit reverse after ldr-like loads on LE instead (considering that 99.999% of aarch64 users will run LE)?
Otherwise, what's your error message from podman? It's got no daemon, so it shouldn't need a socket to connect to it like docker does. For image downloads out to the Internet it's also a standard client and respects the usual proxy environment variables.
I got "Error: error creating network namespace for container". I think I can fix it by tracing the problem, but let's try the other methods first as I think they're going to be simpler for me.
I found this error on the Net in conjunction with a Debian/Ubuntu security-related custom kernel knob for disabling unprivileged user namespaces that was enabled by default once. I tested that with Ubuntu 18.04, 20.04 and 20.10 yesterday and it's disabled (i.e. namespaces for unprivileged users enabled) on all of them. You can still have a look at your setting in /proc/sys/kernel/unprivileged_userns_clone or with sysctl kernel.unprivileged_userns_clone. It needs to be set to 1 for rootless podman to work.
You're not by any chance running the Windows Subsystem for Linux (WSL)? https://github.com/containers/podman/issues/3288#issuecomment-501356136 :)
Or inside another container at a hosting service? https://github.com/containers/podman/issues/4056
Otherwise I have no idea what could be causing that and have never seen that error.
On Fri, Jan 22, 2021 at 1:45 AM Michael Weiser michael.weiser@gmx.de wrote:
Longer story: ldr does a 128bit load. This loads bytes in exactly reverse order into the register on LE and BE. As you describe above, the macros for LE expect a state which is neither of those: The bytes transposed as if BE but the doublewords as loaded on LE. For BE this poses the opposite problem: It natively loads bytes in the order LE has to reproduce using rev64 but then needs to reproduce the doubleword order of LE for the LE routines to work or basically have native BE routines.
That's what my last pedestrian change did. After today I'd perhaps write it like this (untested):
@@ -125,10 +135,12 @@ IF_BE(`
PROLOGUE(_nettle_gcm_init_key) ldr HQ,[TABLE,#16*H_Idx]
- dup EMSB.16b,H.b[0]
IF_LE(` rev64 H.16b,H.16b +',`
- ext H.16b,H.16b,H.16b,#8
')
- dup EMSB.16b,H.b[7] mov x1,#0xC200000000000000 mov x2,#1 mov POLY.d[0],x1
When trying to cater to the current layout on LE, all the other vectors which are later used in conjunction with H to be reversed as well. That leads to this diff to your initial patch:
@@ -125,14 +135,21 @@ IF_BE(`
PROLOGUE(_nettle_gcm_init_key) ldr HQ,[TABLE,#16*H_Idx]
- dup EMSB.16b,H.b[0]
IF_LE(`
- dup EMSB.16b,H.b[0] rev64 H.16b,H.16b
+',`
- dup EMSB.16b,H.b[15]
') mov x1,#0xC200000000000000 mov x2,#1 +IF_LE(` mov POLY.d[0],x1 mov POLY.d[1],x2 +',`
- mov POLY.d[1],x1
- mov POLY.d[0],x2
+') sshr EMSB.16b,EMSB.16b,#7 and EMSB.16b,EMSB.16b,POLY.16b ushr B.2d,H.2d,#63 @@ -142,7 +159,11 @@ IF_LE(` orr H.16b,H.16b,B.16b eor H.16b,H.16b,EMSB.16b
+IF_LE(` dup POLY.2d,POLY.d[0] +',`
- dup POLY.2d,POLY.d[1]
+')
C --- calculate H^2 = H*H ---
The difference in index in dup EMSB nicely shows the doubleword transposition compared to LE. If on LE the dup was done after the rev64, it'd be H.b[7] vs. H.b[15].
I see what you did here, but I'm confused about the ld1 and st1 instructions, so let me clarify one thing before going on: how do ld1 and st1 load and store from/into memory in BE mode? If they behave in the normal way, then there is no point in using ldr at all; I just used it because it handles an immediate offset. So to replace the line "ldr HQ,[TABLE,#16*H_Idx]" we can simply add the offset to the register that holds the address, "add x1,TABLE,#16*H_Idx", and then load the H value using ld1, "ld1 {H.16b},[x1]". This way we can still deal with LE as transposed doublewords and with BE in the normal way (neither transposed doublewords nor a transposed quadword).
regards, Mamone
Hello Mamone,
On Fri, Jan 22, 2021 at 10:14:36PM +0200, Maamoun TK wrote:
The difference in index in dup EMSB nicely shows the doubleword transposition compared to LE. If on LE the dup was done after the rev64, it'd be H.b[7] vs. H.b[15].
I see what you did here, but I'm confused about ld1 and st1 instructions so let me clarify one thing before going on, how do ld1 and st1 load and store from/into memory in BE mode? If they perform in a normal way then there is no point of using ldr at all, I just used it because it handles imm offset. so to replace this line "ldr HQ,[TABLE,#16*H_Idx]" we can just add the offset to the register that hold the address "add x1,TABLE,#16*H_Idx" then
I've just retested and reread some ARM documents. Here's a patch that uses ld1.16b and thus eliminates almost all special BE treatment but subsequently has to leave in all the rev64s as well. This has the testsuite passing on BE and (still) LE. My take at an explanation below.
diff --git a/arm64/v8/gcm-hash.asm b/arm64/v8/gcm-hash.asm index 1c14db54..8c8a370e 100644 --- a/arm64/v8/gcm-hash.asm +++ b/arm64/v8/gcm-hash.asm @@ -55,17 +55,10 @@ C common macros: .endm
.macro REDUCTION out -IF_BE(` - pmull T.1q,F.1d,POLY.1d - ext \out().16b,F.16b,F.16b,#8 - eor R.16b,R.16b,T.16b - eor \out().16b,\out().16b,R.16b -',` pmull T.1q,F.1d,POLY.1d eor R.16b,R.16b,T.16b ext R.16b,R.16b,R.16b,#8 eor \out().16b,F.16b,R.16b -') .endm
C void gcm_init_key (union gcm_block *table) @@ -108,27 +101,20 @@ define(`H4M', `v29') define(`H4L', `v30')
.macro PMUL_PARAM in, param1, param2 -IF_BE(` - pmull2 Hp.1q,\in().2d,POLY.2d - ext Hm.16b,\in().16b,\in().16b,#8 - eor Hm.16b,Hm.16b,Hp.16b - zip \param1().2d,\in().2d,Hm.2d - zip2 \param2().2d,\in().2d,Hm.2d -',` pmull2 Hp.1q,\in().2d,POLY.2d eor Hm.16b,\in().16b,Hp.16b ext \param1().16b,Hm.16b,\in().16b,#8 ext \param2().16b,\in().16b,Hm.16b,#8 ext \param1().16b,\param1().16b,\param1().16b,#8 -') .endm
PROLOGUE(_nettle_gcm_init_key) - ldr HQ,[TABLE,#16*H_Idx] + C LSB vector load: x1+0 into H.b[0] and x1+15 into H.b[15] + add x1,TABLE,#16*H_Idx + ld1 {H.16b},[x1] dup EMSB.16b,H.b[0] -IF_LE(` + C treat H as two MSB doublewords rev64 H.16b,H.16b -') mov x1,#0xC200000000000000 mov x2,#1 mov POLY.d[0],x1 @@ -221,9 +207,7 @@ PROLOGUE(_nettle_gcm_hash) mov POLY.d[0],x4
ld1 {D.16b},[X] -IF_LE(` rev64 D.16b,D.16b -')
ands x4,LENGTH,#-64 b.eq L2x @@ -234,12 +218,10 @@ IF_LE(`
L4x_loop: ld1 {C0.16b,C1.16b,C2.16b,C3.16b},[DATA],#64 -IF_LE(` rev64 C0.16b,C0.16b rev64 C1.16b,C1.16b rev64 C2.16b,C2.16b rev64 C3.16b,C3.16b -')
eor C0.16b,C0.16b,D.16b
@@ -262,10 +244,8 @@ L2x: ld1 {H1M.16b,H1L.16b,H2M.16b,H2L.16b},[TABLE]
ld1 {C0.16b,C1.16b},[DATA],#32 -IF_LE(` rev64 C0.16b,C0.16b rev64 C1.16b,C1.16b -')
eor C0.16b,C0.16b,D.16b
@@ -283,9 +263,7 @@ L1x: ld1 {H1M.16b,H1L.16b},[TABLE]
ld1 {C0.16b},[DATA],#16 -IF_LE(` rev64 C0.16b,C0.16b -')
eor C0.16b,C0.16b,D.16b
@@ -335,9 +313,7 @@ Lmod_8_done: REDUCTION D
Ldone: -IF_LE(` rev64 D.16b,D.16b -') st1 {D.16b},[X] ret EPILOGUE(_nettle_gcm_hash)
My understanding is that ld1 and st1 are "single-element structure" operations. (Identical to vld1 in arm32 NEON we discussed recently for the chacha and salsa20 asm.) That means they load a number of elements of a given type from consecutive memory locations into the corresponding vector register indices.
ld1 {v0.4s},[x0] would load four 32bit words from consecutive memory locations and put them into v0.s[0] through v0.s[3]. So x0+0..3 (bytes) would go into v0.s[0], x0+4..7 would go into v0.s[1] and so on. Endianness would apply to the internal byte order of the elements, so each word would be loaded MSB-first in BE-mode and LSB-first in LE-mode.
So, given memory content such as:
x0 +    0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
byte    0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
We should get on BE:
         MSB        LSB
v0.s[0]:  0  1  2  3
v0.s[1]:  4  5  6  7
v0.s[2]:  8  9 10 11
v0.s[3]: 12 13 14 15
Or looked at as byte-vectors:
        |v0.s[0]|v0.s[1]|v0.s[2]    |v0.s[3]    |
        v0.b[0]                        v0.b[15]
v0.16b:  3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12
On LE we should get:
         MSB        LSB
v0.s[0]:  3  2  1  0
v0.s[1]:  7  6  5  4
v0.s[2]: 11 10  9  8
v0.s[3]: 15 14 13 12
        v0.b[0]                       v0.b[15]
v0.16b:  0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
This was just meant as intro. I've not actually tested this. I hope I got it right and not just added to everyone's confusion (mine included). :/
Back to ld1.16b: This now loads a vector of 16 bytes consecutively. Since bytes have no endianness there will be no change in order in either LE or BE mode. The register content will look the same on both:
x0 +    0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
byte:   0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15

        v0.b[0]                       v0.b[15]
v0.16b:  0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
So larger datatypes loaded that way should be stored little-endian in memory to make sense as e.g. .d[0] after such a load. Or we need to rev64 them.
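To make the rev64 part concrete, here is a minimal C sketch (an illustration only, not code from the patches; rev64_16b is a made-up helper name): it models what rev64 v.16b does, namely byte-swapping each 64-bit lane of a 16-byte vector without exchanging the lanes, which is exactly the fix-up needed when a byte sequence loaded LSB-first is to be treated as two big-endian doublewords.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Model of rev64 v.16b: byte-swap each 64-bit lane of a 16-byte vector,
   leaving the order of the two lanes untouched. */
static void rev64_16b(uint8_t v[16])
{
    for (int lane = 0; lane < 16; lane += 8)
        for (int i = 0; i < 4; i++) {
            uint8_t t = v[lane + i];
            v[lane + i] = v[lane + 7 - i];
            v[lane + 7 - i] = t;
        }
}

int main(void)
{
    /* Byte sequence as ld1.16b would deliver it: v.b[0] = 0, ..., v.b[15] = 15. */
    uint8_t v[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
    uint64_t d0, d1;

    rev64_16b(v);
    /* On a little-endian host, d0/d1 now read as the big-endian interpretation
       of bytes 0..7 and 8..15, i.e. 0x0001020304050607 and 0x08090a0b0c0d0e0f. */
    memcpy(&d0, v, 8);
    memcpy(&d1, v + 8, 8);
    printf("d[0] = %016llx, d[1] = %016llx\n",
           (unsigned long long) d0, (unsigned long long) d1);
    return 0;
}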
load the H value by using ld1 "ld1 {H.16b},[x1]" in this way we can still have to deal with LE as transposed doublewords and with BE in normal way (not transposed doublewords or transposed quadword).
After sending my last email I realised that the doublewords aren't actually transposed with BE as such. They're just transposed compared to the original LE routine because the ldr instruction loads in completely reversed order in each mode and the LE routine does convert the internal byte order of the doublewords to BE but not the overall order of the 128bit quadword because it doesn't need to and regards them as a vector of two doublewords anyway.
ld1.16b doesn't change that at all. It just behaves the same on LE and BE. So we'll always load vectors of bytes. And it'll always be an LSB load. And if we want to treat them as big-endian doublewords we have to adjust them accordingly. That's why we now also need all the rev64s on BE above.
That opens another topic: As you may have noticed I haven't got the slightest idea of what the code is actually doing. Assembly also isn't my first language either. I'm only mechanically trying to get BE mode to produce the same results as LE.
This made me realise that I haven't the faintest idea what we're getting as input and producing as output either. :/ So are we working on blocks of bytes and producing blocks of bytes and just treating them as big-endian 64bit doublewords internally to exploit availability of instructions that can work on these types or could we actually declare the elements of TABLE to be quadwords in host endianness? Then we could actually throw ld1.2d at them and eliminate all the rev64s.
Duh, I think we can regardless, at least for BE:
diff --git a/arm64/v8/gcm-hash.asm b/arm64/v8/gcm-hash.asm index 1c14db54..642e3840 100644 --- a/arm64/v8/gcm-hash.asm +++ b/arm64/v8/gcm-hash.asm @@ -55,17 +55,10 @@ C common macros: .endm
.macro REDUCTION out -IF_BE(` - pmull T.1q,F.1d,POLY.1d - ext \out().16b,F.16b,F.16b,#8 - eor R.16b,R.16b,T.16b - eor \out().16b,\out().16b,R.16b -',` pmull T.1q,F.1d,POLY.1d eor R.16b,R.16b,T.16b ext R.16b,R.16b,R.16b,#8 eor \out().16b,F.16b,R.16b -') .endm
C void gcm_init_key (union gcm_block *table) @@ -108,27 +101,20 @@ define(`H4M', `v29') define(`H4L', `v30')
.macro PMUL_PARAM in, param1, param2 -IF_BE(` - pmull2 Hp.1q,\in().2d,POLY.2d - ext Hm.16b,\in().16b,\in().16b,#8 - eor Hm.16b,Hm.16b,Hp.16b - zip \param1().2d,\in().2d,Hm.2d - zip2 \param2().2d,\in().2d,Hm.2d -',` pmull2 Hp.1q,\in().2d,POLY.2d eor Hm.16b,\in().16b,Hp.16b ext \param1().16b,Hm.16b,\in().16b,#8 ext \param2().16b,\in().16b,Hm.16b,#8 ext \param1().16b,\param1().16b,\param1().16b,#8 -') .endm
PROLOGUE(_nettle_gcm_init_key) - ldr HQ,[TABLE,#16*H_Idx] - dup EMSB.16b,H.b[0] + add x1,TABLE,#16*H_Idx + ld1 {H.2d},[x1] IF_LE(` rev64 H.16b,H.16b ') + dup EMSB.16b,H.b[7] mov x1,#0xC200000000000000 mov x2,#1 mov POLY.d[0],x1 @@ -220,7 +206,7 @@ PROLOGUE(_nettle_gcm_hash) mov x4,#0xC200000000000000 mov POLY.d[0],x4
- ld1 {D.16b},[X] + ld1 {D.2d},[X] IF_LE(` rev64 D.16b,D.16b ') @@ -233,7 +219,7 @@ IF_LE(` ld1 {H3M.16b,H3L.16b,H4M.16b,H4L.16b},[x5]
L4x_loop: - ld1 {C0.16b,C1.16b,C2.16b,C3.16b},[DATA],#64 + ld1 {C0.2d,C1.2d,C2.2d,C3.2d},[DATA],#64 IF_LE(` rev64 C0.16b,C0.16b rev64 C1.16b,C1.16b @@ -261,7 +247,7 @@ L2x:
ld1 {H1M.16b,H1L.16b,H2M.16b,H2L.16b},[TABLE]
- ld1 {C0.16b,C1.16b},[DATA],#32 + ld1 {C0.2d,C1.2d},[DATA],#32 IF_LE(` rev64 C0.16b,C0.16b rev64 C1.16b,C1.16b @@ -282,7 +268,7 @@ L1x:
ld1 {H1M.16b,H1L.16b},[TABLE]
- ld1 {C0.16b},[DATA],#16 + ld1 {C0.2d},[DATA],#16 IF_LE(` rev64 C0.16b,C0.16b ') @@ -335,9 +321,7 @@ Lmod_8_done: REDUCTION D
Ldone: -IF_LE(` rev64 D.16b,D.16b -') st1 {D.16b},[X] ret EPILOGUE(_nettle_gcm_hash)
Please excuse my laboured and longwinded thinking. ;) I really have to start thinking in vectors also.
This also works for the whole TABLE and gives host-endianness storage there (where ld1.16b should have caused it to be little-endian before, if that's at all relevant):
diff --git a/arm64/v8/gcm-hash.asm b/arm64/v8/gcm-hash.asm index 1c14db54..bd6820b3 100644 --- a/arm64/v8/gcm-hash.asm +++ b/arm64/v8/gcm-hash.asm @@ -55,17 +55,10 @@ C common macros: .endm
.macro REDUCTION out -IF_BE(` - pmull T.1q,F.1d,POLY.1d - ext \out().16b,F.16b,F.16b,#8 - eor R.16b,R.16b,T.16b - eor \out().16b,\out().16b,R.16b -',` pmull T.1q,F.1d,POLY.1d eor R.16b,R.16b,T.16b ext R.16b,R.16b,R.16b,#8 eor \out().16b,F.16b,R.16b -') .endm
C void gcm_init_key (union gcm_block *table) @@ -108,27 +101,20 @@ define(`H4M', `v29') define(`H4L', `v30')
.macro PMUL_PARAM in, param1, param2 -IF_BE(` - pmull2 Hp.1q,\in().2d,POLY.2d - ext Hm.16b,\in().16b,\in().16b,#8 - eor Hm.16b,Hm.16b,Hp.16b - zip \param1().2d,\in().2d,Hm.2d - zip2 \param2().2d,\in().2d,Hm.2d -',` pmull2 Hp.1q,\in().2d,POLY.2d eor Hm.16b,\in().16b,Hp.16b ext \param1().16b,Hm.16b,\in().16b,#8 ext \param2().16b,\in().16b,Hm.16b,#8 ext \param1().16b,\param1().16b,\param1().16b,#8 -') .endm
PROLOGUE(_nettle_gcm_init_key) - ldr HQ,[TABLE,#16*H_Idx] - dup EMSB.16b,H.b[0] + add x1,TABLE,#16*H_Idx + ld1 {H.2d},[x1] IF_LE(` rev64 H.16b,H.16b ') + dup EMSB.16b,H.b[7] mov x1,#0xC200000000000000 mov x2,#1 mov POLY.d[0],x1 @@ -154,7 +140,7 @@ IF_LE(`
PMUL_PARAM H2,H2M,H2L
- st1 {H1M.16b,H1L.16b,H2M.16b,H2L.16b},[TABLE],#64 + st1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE],#64
C --- calculate H^3 = H^1*H^2 ---
@@ -172,7 +158,7 @@ IF_LE(`
PMUL_PARAM H4,H4M,H4L
- st1 {H3M.16b,H3L.16b,H4M.16b,H4L.16b},[TABLE] + st1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[TABLE]
ret EPILOGUE(_nettle_gcm_init_key) @@ -220,7 +206,7 @@ PROLOGUE(_nettle_gcm_hash) mov x4,#0xC200000000000000 mov POLY.d[0],x4
- ld1 {D.16b},[X] + ld1 {D.2d},[X] IF_LE(` rev64 D.16b,D.16b ') @@ -229,11 +215,11 @@ IF_LE(` b.eq L2x
add x5,TABLE,#64 - ld1 {H1M.16b,H1L.16b,H2M.16b,H2L.16b},[TABLE] - ld1 {H3M.16b,H3L.16b,H4M.16b,H4L.16b},[x5] + ld1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE] + ld1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[x5]
L4x_loop: - ld1 {C0.16b,C1.16b,C2.16b,C3.16b},[DATA],#64 + ld1 {C0.2d,C1.2d,C2.2d,C3.2d},[DATA],#64 IF_LE(` rev64 C0.16b,C0.16b rev64 C1.16b,C1.16b @@ -259,9 +245,9 @@ L2x: tst LENGTH,#-32 b.eq L1x
- ld1 {H1M.16b,H1L.16b,H2M.16b,H2L.16b},[TABLE] + ld1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE]
- ld1 {C0.16b,C1.16b},[DATA],#32 + ld1 {C0.2d,C1.2d},[DATA],#32 IF_LE(` rev64 C0.16b,C0.16b rev64 C1.16b,C1.16b @@ -280,9 +266,9 @@ L1x: tst LENGTH,#-16 b.eq Lmod
- ld1 {H1M.16b,H1L.16b},[TABLE] + ld1 {H1M.2d,H1L.2d},[TABLE]
- ld1 {C0.16b},[DATA],#16 + ld1 {C0.2d},[DATA],#16 IF_LE(` rev64 C0.16b,C0.16b ') @@ -297,7 +283,7 @@ Lmod: tst LENGTH,#15 b.eq Ldone
- ld1 {H1M.16b,H1L.16b},[TABLE] + ld1 {H1M.2d,H1L.2d},[TABLE]
tbz LENGTH,3,Lmod_8 ldr C0D,[DATA],#8 @@ -338,6 +324,6 @@ Ldone: IF_LE(` rev64 D.16b,D.16b ') - st1 {D.16b},[X] + st1 {D.2d},[X] ret EPILOGUE(_nettle_gcm_hash)
And as always after all this guesswork I have found a likely very relevant comment in gcm.c:
/* Shift uses big-endian representation. */
#if WORDS_BIGENDIAN
  reduce = shift_table[x->u64[1] & 0xff];
Is that it? Or is TABLE just internal to the routine and we can store there however we please? (Apart from H at TABLE[128] initialised for us by gcm_set_key and stored BE?)
Hello Michael,
On Sat, Jan 23, 2021 at 2:45 AM Michael Weiser michael.weiser@gmx.de wrote:
I've just retested and reread some ARM documents. Here's a patch that uses ld1.16b and thus eliminates almost all special BE treatment but subsequently has to leave in all the rev64s as well. This has the testsuite passing on BE and (still) LE. My take at an explanation below.
diff --git a/arm64/v8/gcm-hash.asm b/arm64/v8/gcm-hash.asm index 1c14db54..8c8a370e 100644 --- a/arm64/v8/gcm-hash.asm +++ b/arm64/v8/gcm-hash.asm @@ -55,17 +55,10 @@ C common macros: .endm
.macro REDUCTION out -IF_BE(`
- pmull T.1q,F.1d,POLY.1d
- ext \out().16b,F.16b,F.16b,#8
- eor R.16b,R.16b,T.16b
- eor \out().16b,\out().16b,R.16b
-',` pmull T.1q,F.1d,POLY.1d eor R.16b,R.16b,T.16b ext R.16b,R.16b,R.16b,#8 eor \out().16b,F.16b,R.16b -') .endm
C void gcm_init_key (union gcm_block *table)
@@ -108,27 +101,20 @@ define(`H4M', `v29') define(`H4L', `v30')
.macro PMUL_PARAM in, param1, param2 -IF_BE(`
- pmull2 Hp.1q,\in().2d,POLY.2d
- ext Hm.16b,\in().16b,\in().16b,#8
- eor Hm.16b,Hm.16b,Hp.16b
- zip \param1().2d,\in().2d,Hm.2d
- zip2 \param2().2d,\in().2d,Hm.2d
-',` pmull2 Hp.1q,\in().2d,POLY.2d eor Hm.16b,\in().16b,Hp.16b ext \param1().16b,Hm.16b,\in().16b,#8 ext \param2().16b,\in().16b,Hm.16b,#8 ext \param1().16b,\param1().16b,\param1().16b,#8 -') .endm
PROLOGUE(_nettle_gcm_init_key)
- ldr HQ,[TABLE,#16*H_Idx]
- C LSB vector load: x1+0 into H.b[0] and x1+15 into H.b[15]
- add x1,TABLE,#16*H_Idx
- ld1 {H.16b},[x1] dup EMSB.16b,H.b[0]
-IF_LE(`
- C treat H as two MSB doublewords rev64 H.16b,H.16b
-') mov x1,#0xC200000000000000 mov x2,#1 mov POLY.d[0],x1 @@ -221,9 +207,7 @@ PROLOGUE(_nettle_gcm_hash) mov POLY.d[0],x4
ld1 {D.16b},[X]
-IF_LE(` rev64 D.16b,D.16b -')
ands x4,LENGTH,#-64 b.eq L2x
@@ -234,12 +218,10 @@ IF_LE(`
L4x_loop: ld1 {C0.16b,C1.16b,C2.16b,C3.16b},[DATA],#64 -IF_LE(` rev64 C0.16b,C0.16b rev64 C1.16b,C1.16b rev64 C2.16b,C2.16b rev64 C3.16b,C3.16b -')
eor C0.16b,C0.16b,D.16b
@@ -262,10 +244,8 @@ L2x: ld1 {H1M.16b,H1L.16b,H2M.16b,H2L.16b},[TABLE]
ld1 {C0.16b,C1.16b},[DATA],#32
-IF_LE(` rev64 C0.16b,C0.16b rev64 C1.16b,C1.16b -')
eor C0.16b,C0.16b,D.16b
@@ -283,9 +263,7 @@ L1x: ld1 {H1M.16b,H1L.16b},[TABLE]
ld1 {C0.16b},[DATA],#16
-IF_LE(` rev64 C0.16b,C0.16b -')
eor C0.16b,C0.16b,D.16b
@@ -335,9 +313,7 @@ Lmod_8_done: REDUCTION D
Ldone: -IF_LE(` rev64 D.16b,D.16b -') st1 {D.16b},[X] ret EPILOGUE(_nettle_gcm_hash)
I have one question here: do operations on doublewords transpose both doubleword parts in BE mode? For example, the pmull instruction transposes doublewords in LE mode when it operates; in BE I don't expect the same behavior, hence we can't get this patch working in BE mode. The core of the pmull instruction is shift and xor operations, so we can't perform pmull on byte-reversed doublewords as it's going to produce wrong results.
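For reference, the operation pmull itself computes is just a carry-less (polynomial) multiplication, i.e. shift and xor over GF(2), independent of memory endianness. A rough C sketch of the 64x64 to 128 bit operation (an illustration of the math only, not the Nettle code and not how the hardware implements it; clmul64 is a made-up name):

#include <stdio.h>
#include <stdint.h>

/* Carry-less (polynomial) 64x64 -> 128 bit multiply over GF(2),
   the operation pmull performs on one doubleword lane of each operand. */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
    uint64_t h = 0, l = 0;
    for (int i = 0; i < 64; i++)
        if ((b >> i) & 1) {
            /* xor in (a << i) across the 128-bit result, no carries */
            l ^= a << i;
            if (i)
                h ^= a >> (64 - i);
        }
    *hi = h;
    *lo = l;
}

int main(void)
{
    uint64_t hi, lo;
    clmul64(3, 3, &hi, &lo);  /* (x+1)*(x+1) = x^2+1: 3 clmul 3 = 5, not 9 */
    printf("hi=%llx lo=%llx\n", (unsigned long long) hi, (unsigned long long) lo);
    return 0;
}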
My understanding is that ld1 and st1 are "single-element structure"
operations. (Identical to vld1 in arm32 NEON we discussed recently for chacha and salsa2 asm.) That means they load a number of elements of a given type from consecutive memory locations into the corresponding vector register indices.
ld1 {v0.4s},[x0] would load four 32bit words from consecutive memory locations and put them into v0.s[0] through v0.s[3]. So x0+0..3 (bytes) would go into v0.s[0], x0+4..7 would to into v0.s[1] and so on. Endianness would apply to the internal byte order of the elements, so each word would be loaded MSB-first in BE-mode and LSB-first in LE-mode.
So, given memory content such as:
x0 + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 byte 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
We should get on BE:
MSB LSB
v0.s[0]: 0 1 2 3 v0.s[1]: 4 5 6 7 v0.s[2]: 8 9 10 11 v0.s[3]: 12 13 14 15
Or looked at as byte-vectors:
|v0.s[0]|v0.s[1]| v0.s[2] | v0.s[3] | v0.b[0] v0.b[15]
v0.16b: 3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12
On LE we should get:
MSB LSB
v0.d[0]: 3 2 1 0 v0.d[1]: 7 6 5 4 v0.d[2]: 11 10 9 8 v0.d[3]: 15 14 13 12
v0.b[0] v0.b[15]
v0.16b: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
This was just meant as intro. I've not actually tested this. I hope I got it right and not just added to everyone's confusion (mine included). :/
Back to ld1.16b: This now loads a vector of 16 bytes consecutively. Since bytes have no endianness there will be no changes in order on either LE and BE modes. The register content will look the same on both:
x0 + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 byte: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 v0.b[0] v0.b[15] v0.16b: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
So larger datatypes loaded that way should be stored little-endian in memory to make sense as e.g. .d[0] after such a load. Or we need to rev64 them.
load the H value by using ld1 "ld1 {H.16b},[x1]" in this way we can still have to deal with LE as transposed doublewords and with BE in normal way (not transposed doublewords or transposed quadword).
After sending my last email I realised that the doublewords aren't actually transposed with BE as such. They're just transposed compared to the original LE routine because the ldr instruction loads in completely reversed order in each mode and the LE routine does convert the internal byte order of the doublewords to BE but not the overall order of the 128bit quadword because it doesn't need to and regards them as a vector of two doublewords anyway.
ld1.16b doesn't change that at all. It just behaves the same on LE and BE. So we'll always load vectors of bytes. And it'll always be an LSB load. And if we want to treat them as big-endian doublewords we have to adjust them accordingly. That's why we now also need all the rev64s on BE above.
That opens another topic: As you may have noticed I haven't got the slightest idea of what the code is actually doing. Assembly also isn't my first language either. I'm only mechanically trying to get BE mode to produce the same results as LE.
This made me realise that I haven't the faintest idea what we're getting as input and producing as output either. :/ So are we working on blocks of bytes and producing blocks of bytes and just treating them as big-endian 64bit doublewords internally to exploit availability of instructions that can work on these types or could we actually declare the elements of TABLE to be quadwords in host endianness? Then we could actually throw ld1.2d at them and eliminate all the rev64s.
Duh, I think we can regardless, at least for BE:
diff --git a/arm64/v8/gcm-hash.asm b/arm64/v8/gcm-hash.asm index 1c14db54..642e3840 100644 --- a/arm64/v8/gcm-hash.asm +++ b/arm64/v8/gcm-hash.asm @@ -55,17 +55,10 @@ C common macros: .endm
.macro REDUCTION out -IF_BE(`
- pmull T.1q,F.1d,POLY.1d
- ext \out().16b,F.16b,F.16b,#8
- eor R.16b,R.16b,T.16b
- eor \out().16b,\out().16b,R.16b
-',` pmull T.1q,F.1d,POLY.1d eor R.16b,R.16b,T.16b ext R.16b,R.16b,R.16b,#8 eor \out().16b,F.16b,R.16b -') .endm
C void gcm_init_key (union gcm_block *table)
@@ -108,27 +101,20 @@ define(`H4M', `v29') define(`H4L', `v30')
.macro PMUL_PARAM in, param1, param2 -IF_BE(`
- pmull2 Hp.1q,\in().2d,POLY.2d
- ext Hm.16b,\in().16b,\in().16b,#8
- eor Hm.16b,Hm.16b,Hp.16b
- zip \param1().2d,\in().2d,Hm.2d
- zip2 \param2().2d,\in().2d,Hm.2d
-',` pmull2 Hp.1q,\in().2d,POLY.2d eor Hm.16b,\in().16b,Hp.16b ext \param1().16b,Hm.16b,\in().16b,#8 ext \param2().16b,\in().16b,Hm.16b,#8 ext \param1().16b,\param1().16b,\param1().16b,#8 -') .endm
PROLOGUE(_nettle_gcm_init_key)
- ldr HQ,[TABLE,#16*H_Idx]
- dup EMSB.16b,H.b[0]
- add x1,TABLE,#16*H_Idx
- ld1 {H.2d},[x1]
IF_LE(` rev64 H.16b,H.16b ')
- dup EMSB.16b,H.b[7] mov x1,#0xC200000000000000 mov x2,#1 mov POLY.d[0],x1
@@ -220,7 +206,7 @@ PROLOGUE(_nettle_gcm_hash) mov x4,#0xC200000000000000 mov POLY.d[0],x4
- ld1 {D.16b},[X]
- ld1 {D.2d},[X]
IF_LE(` rev64 D.16b,D.16b ') @@ -233,7 +219,7 @@ IF_LE(` ld1 {H3M.16b,H3L.16b,H4M.16b,H4L.16b},[x5]
L4x_loop:
- ld1 {C0.16b,C1.16b,C2.16b,C3.16b},[DATA],#64
- ld1 {C0.2d,C1.2d,C2.2d,C3.2d},[DATA],#64
IF_LE(` rev64 C0.16b,C0.16b rev64 C1.16b,C1.16b @@ -261,7 +247,7 @@ L2x:
ld1 {H1M.16b,H1L.16b,H2M.16b,H2L.16b},[TABLE]
- ld1 {C0.16b,C1.16b},[DATA],#32
- ld1 {C0.2d,C1.2d},[DATA],#32
IF_LE(` rev64 C0.16b,C0.16b rev64 C1.16b,C1.16b @@ -282,7 +268,7 @@ L1x:
ld1 {H1M.16b,H1L.16b},[TABLE]
- ld1 {C0.16b},[DATA],#16
- ld1 {C0.2d},[DATA],#16
IF_LE(` rev64 C0.16b,C0.16b ') @@ -335,9 +321,7 @@ Lmod_8_done: REDUCTION D
Ldone: -IF_LE(` rev64 D.16b,D.16b -') st1 {D.16b},[X] ret EPILOGUE(_nettle_gcm_hash)
I like your ideas so far, as you're shrinking the gap between the code for the two endiannesses, but if my previous concern is right we still can't get this patch to work either.
Please excuse my laboured and longwinded thinking. ;) I really have to start thinking in vectors also.
Actually, I'm impressed by how you get and handle all these ideas in your mind and turn around quickly once you get a new one. Dealing with vector registers on aarch64 is really challenging; both x86_64 and PowerPC don't drag the endianness issues into the vector registers, they only apply to memory, and once the data is loaded from memory into a vector register all endianness concerns end. Although PowerPC supports both endianness modes, AltiVec instructions operate the same on vector registers in both modes. It's a weird decision made by the Arm side.
And as always after all this guesswork I have found a likely very relevant comment in gcm.c:
/* Shift uses big-endian representation. */ #if WORDS_BIGENDIAN reduce = shift_table[x->u64[1] & 0xff];
Is that it? Or is TABLE just internal to the routine and we can store there however we please? (Apart from H at TABLE[128] initialised for us by gcm_set_key and stored BE?)
The assembly implementation of GHASH has a whole different scheme from the C table-lookup implementation; you don't have to worry about any of that.
regards, Mamone
Hello Mamone,
On Sat, Jan 23, 2021 at 08:52:30PM +0200, Maamoun TK wrote:
@@ -280,9 +266,9 @@ L1x: tst LENGTH,#-16 b.eq Lmod
- ld1 {H1M.16b,H1L.16b},[TABLE]
- ld1 {H1M.2d,H1L.2d},[TABLE]
- ld1 {C0.16b},[DATA],#16
- ld1 {C0.2d},[DATA],#16
IF_LE(` rev64 C0.16b,C0.16b ')
behavior hence we can't get this patch working on BE mode. The core of
First off: All three patches from my previous mail had the test gcm-hash passing on LE and BE. I just reconfirmed the last patch with the whole testsuite on LE and BE. So they should be working and cause no regression.
I have one question here, do operations on doublewords transpose both doubleword parts in BE mode? for example pmull instruction transpose doublewords on LE mode when operated, in BE I don't expect the same behavior hence we can't get this patch working on BE mode. The core of pmull instruction is shift and xor operations so we can't perform pmull instruction on byte-reversed doublewords as it's gonna produce wrong results.
I think this directly corresponds to your next question:
Dealing with vector registers in aarch64 is really challenging, both x86_64 and PowerPC don't drag the endianness issues to vector registers, it's only applied to memory and once the data loaded from memory into vector register all endianness concerns are ended. Although PowerPC supports both endianness modes, AltiVec instructions operate the same on vector registers on both modes. It's a weird decision made by the Arm side.
I think there might be a misunderstanding here (possibly caused by my attempts at explaining what ldr does, sorry):
On arm(32) and aarch64, endianness is also exclusively handled on load and store operations. Register layout and operation behaviour is identical in both modes. I think ARM also speaks of "memory endianness" for just that reason. There is no adjustable "CPU endianness". It's always "CPU-native".
So pmull will behave exactly the same in BE and LE mode. We just have to make sure our load operations put the operands in the correct (i.e. CPU-native) representation into the correct vector register indices upon load.
So as an example:
pmull2 v0.1q,v1.2d,v2.2d
will always work on d[1] (the upper doubleword) of v1 and v2 and put the result into all of v0. And it expects its operands there in exactly one format, i.e. the least significant bit at one end and the most-significant bit at the other (and it's the same ends/bits in both memory-endianness modes :). And it will also store to v0 in exactly the same representation in LE and BE mode. Nothing changes with an endianness mode switch.
That's where load and store come in:
ld1 {v1.2d,v2.2d},[x0]
will load v1 and v2 with one-dimensional vectors from memory. So v1.d[0] will be read from x0+0, v1.d[1] from x0+8 (bytes) and v2.d[0] from x0+16 and v2.d[1] from x0+24. That'll also be the same in LE and BE mode because that's the structure of the vector prescribed by the load operation we choose. Endianness will be applied to the individual doublewords but the order in which they're loaded from memory and in which they're put into d[0] and d[1] won't change, because they're vectors.
So if you've actually stored a vector from CPU registers using st1 {v1.2d, v2.2d},[x0] and then load them back using ld1 {v1.2d, v2.2d},[x0] there's nothing else that needs to be done. The individual bytes of the doublewords will be stored LE in memory in LE mode and BE in BE mode but you won't notice. And the order of the doublewords in memory will be the same in both modes.
If you're loading something that isn't stored LE or has no endianness at all, e.g. just a sequence of data bytes (as in DATA in our code) or something that was explicitly stored BE even on an LE CPU (as in TABLE[128] in our code, I gather) but want to treat it as a larger datatype, then you have to define endianness and need to apply correction. That's why we need to rev64 in one mode (e.g. LE) to get the same register-content in both endianness modes if what's loaded isn't actually stored in that endianness in memory.
Another way is to explicitly load a vector of bytes using ld1 {v1.16b, v2.16b},[x0]. Then you can be sure what you get as register content, no matter what memory endianness the CPU is using. If it's really just a sequence of data bytes stored in their correct and necessary order in memory and we only want to apply shifts and logical operations to each of them, we'd be all set.
Here we can also exploit the different views on the register, but need to be careful to understand them: the fact that b[0] through b[7] are mapped to d[0], and that b[0] will be the least significant byte in d[0] and b[7] will be the MSB. This layout is cpu-native, i.e. also the same in both endianness modes. It's just that an ld1 {v1.16b} will always load consecutive bytes from memory into b[0] through b[15], so it'll always be an LSB-first load when interpreted as a larger data type. If we then look at that data through d[0] it will appear reversed if it isn't really a doubleword that was stored little-endian.
That's why an ld1 {v1.16b,v2.16b},[x0] will produce incorrect results with a pmull2 v0.1q,v1.2d,v2.2d in at least one endianness, because we're telling one operation that it's dealing with a byte-vector while the other expects us to provide a vector of doublewords. If what we're loading is actually something that was stored as doublewords in current memory endianness, then ld1 {v1.2d,v2.2d} is the correct load operation. If it's data bytes we want to *treat* as big-endian doublewords, we can use either ld1 {v1.16b,v2.16b} or {v1.2d,v2.2d}, but in both cases need to rev64 the register content if memory endianness is LE.
Now what *ldr* does is load a single 128bit quadword. And this will indeed transpose the doublewords in BE mode when looked at through d[0] and d[1]. Because as a big-endian load it will of course load the byte at x0 into the most significant byte of e.g. v2, i.e. v2.d[1], i.e. v2.b[15] and not v2.d[0], i.e. v2.b[7] (as with ld1.2d) or v2.b[0] (as with ld1.16b). Similarly, x0+15 will go into v2.b[0] in BE and v2.b[15] in LE mode. So this will only make sense if what we're loading was actually stored using str as a 128bit quadword in current memory endianness. If it's a sequence of bytes (st1.16b) we want to treat as a vector of doublewords, not only will the bytes appear inverted when looked at through d[0] and d[1] but also what's in d[0] will be in d[1] in the other endianness mode and vice-versa. If it's a vector of doublewords in memory endianness (st1.2d), byte order in the register will be correct in both modes (because it's different in memory) but d[0] and d[1] will still be transposed.
That's where all my rambling about doubleword transposition came from. Does that make sense?
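To keep the three load flavours straight, here is a small C model (an illustration only, built from the behaviour described above rather than from the architecture manual; le64/be64 and the load functions are made-up helper names) of what would end up in d[0] and d[1] after loading the same 16 memory bytes:

#include <stdio.h>
#include <stdint.h>

/* Model of what lands in d[0]/d[1] of a NEON register after loading the
   16 memory bytes m[0..15]. "be" selects big-endian memory mode. The
   register layout itself (b[0] = LSB of d[0], b[15] = MSB of d[1]) never
   changes; only the load operation decides which memory byte goes where. */

static uint64_t le64(const uint8_t *p) {            /* LSB-first interpretation */
    uint64_t v = 0;
    for (int i = 7; i >= 0; i--) v = (v << 8) | p[i];
    return v;
}
static uint64_t be64(const uint8_t *p) {            /* MSB-first interpretation */
    uint64_t v = 0;
    for (int i = 0; i < 8; i++) v = (v << 8) | p[i];
    return v;
}

static void ld1_16b(const uint8_t *m, int be, uint64_t d[2]) {
    (void) be;                          /* byte vectors: same in both modes */
    d[0] = le64(m); d[1] = le64(m + 8);
}
static void ld1_2d(const uint8_t *m, int be, uint64_t d[2]) {
    d[0] = be ? be64(m) : le64(m);          /* endianness applies per element, */
    d[1] = be ? be64(m + 8) : le64(m + 8);  /* the element order stays fixed   */
}
static void ldr_q(const uint8_t *m, int be, uint64_t d[2]) {
    if (be) { d[1] = be64(m); d[0] = be64(m + 8); } /* m[0] -> MSB of d[1] */
    else    { d[0] = le64(m); d[1] = le64(m + 8); } /* m[0] -> LSB of d[0] */
}

int main(void) {
    uint8_t m[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
    uint64_t d[2];
    ldr_q(m, 1, d);   /* BE ldr: doublewords come out "transposed" ...      */
    printf("ldr/BE:    d0=%016llx d1=%016llx\n",
           (unsigned long long) d[0], (unsigned long long) d[1]);
    ld1_2d(m, 1, d);  /* ... while ld1.2d keeps them in vector order        */
    printf("ld1.2d/BE: d0=%016llx d1=%016llx\n",
           (unsigned long long) d[0], (unsigned long long) d[1]);
    return 0;
}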
I just found this document from the LLVM guys with pictures! :) https://llvm.org/docs/BigEndianNEON.html
BTW: ARM even goes as far as always storing *instructions* themselves, so the actual opcodes the CPU decodes and executes, little-endian, even in BE binaries. So the instruction fetch and decode stage always operates little-endian. When an instruction is executed, it's then just an additional flag that tells load and store instructions how to behave when accessing memory. (I'm actually extrapolating from what I know to be true for classic arm32, but it makes sense for that to be true for aarch64 as well.)
Please excuse my laboured and longwinded thinking. ;) I really have to start thinking in vectors also.
Actually, I'm impressed how you get and handle all these ideas in your mind and turn around quickly once you get a new one.
Uh, thanks, FWIW. :)
I gather you (same as me) prefer to think in big-endian representation. As for arm and aarch64, little-endian is the default; do you think the routine could be changed to move the special endianness treatment using rev64 to BE mode, i.e. avoid it in the standard LE case? It's certainly beyond me, but it might give some additional speedup.
Or would it be irrelevant compared to the speedup already given by using pmull in the first place?
@@ -335,9 +321,7 @@ Lmod_8_done: REDUCTION D
Ldone: -IF_LE(` rev64 D.16b,D.16b -') st1 {D.16b},[X] ret EPILOGUE(_nettle_gcm_hash)
I like your ideas so far as you're shrinking the gap between both endianness code but if my previous concern is right we still can't get this patch works too.
As said, the testsuite is passing with all three diffs from my previous mail.
[...]
PASS: symbols
PASS: dlopen
====================
All 110 tests passed
====================
make[1]: Leaving directory '/home/michael/build-aarch64_be/testsuite'
Making check in examples
make[1]: Entering directory '/home/michael/build-aarch64_be/examples'
TEST_SHLIB_DIR="/home/michael/build-aarch64_be/.lib" \
  srcdir="../../nettle/examples" EMULATOR="" EXEEXT="" \
  "../../nettle"/run-tests rsa-sign-test rsa-verify-test rsa-encrypt-test
xxxxxx xxxxxx
PASS: rsa-sign
PASS: rsa-verify
PASS: rsa-encrypt
==================
All 3 tests passed
==================
make[1]: Leaving directory '/home/michael/build-aarch64_be/examples'
[michael@aarch64-be:~/build-aarch64_be]
[...]
PASS: symbols
PASS: dlopen
====================
All 110 tests passed
====================
make[1]: Leaving directory '/home/michael/build-aarch64/testsuite'
Making check in examples
make[1]: Entering directory '/home/michael/build-aarch64/examples'
TEST_SHLIB_DIR="/home/michael/build-aarch64/.lib" \
  srcdir="../../nettle/examples" EMULATOR="" EXEEXT="" \
  "../../nettle"/run-tests rsa-sign-test rsa-verify-test rsa-encrypt-test
xxxxxx xxxxxx ee
PASS: rsa-sign
PASS: rsa-verify
PASS: rsa-encrypt
==================
All 3 tests passed
==================
make[1]: Leaving directory '/home/michael/build-aarch64/examples'
[michael@aarch64:~/build-aarch64]
And as always after all this guesswork I have found a likely very relevant comment in gcm.c:
/* Shift uses big-endian representation. */ #if WORDS_BIGENDIAN reduce = shift_table[x->u64[1] & 0xff];
Is that it? Or is TABLE just internal to the routine and we can store there however we please? (Apart from H at TABLE[128] initialised for us by gcm_set_key and stored BE?)
The assembly implementation of GHASH has a whole different scheme from C table-lookup implementation, you don't have to worry about any of that.
Perfect. So whether we use ld1/st1.16b or ld1/st1.2d for TABLE doesn't matter. I wouldn't expect it but we could benchmark whether one is faster than the other though!?
For clarification: How is H, i.e. TABLE[128], defined as an interface to gcm_set_key? I see that gcm_set_key calls a cipher function to fill it. So I guess it provides the routine with a sequence of bytes (similar to DATA), i.e. the key, which will be the same on LE and BE, and we *treat* it as a big-endian doubleword for the sake of using pmull on it. Correct?
Hello Michael,
On Sun, Jan 24, 2021 at 3:15 PM Michael Weiser michael.weiser@gmx.de wrote:
I think there might be a misunderstanding here (possibly caused by my attemps at explaining what ldr does, sorry):
On arm(32) and aarch64, endianness is also exclusively handled on load and store operations. Register layout and operation behaviour is identical in both modes. I think ARM also speaks of "memory endianness" for just that reason. There is no adjustable "CPU endianness". It's always "CPU-native".
So pmull will behave exactly the same in BE and LE mode. We just have to make sure our load operations put the operands in the correct (i.e. CPU-native) representation into the correct vector register indices upon load.
So as an example:
pmull2 v0.1q,v1.2d,v2.2d
will always work on d[1] (the upper doubleword) of v1 and v2 and put the result into all of v0. And it expects its operands there in exactly one format, i.e. the least significant bit at one end and the most-significant bit at the other (and it's the same ends/bits in both memory-endianness modes :). And it will also store to v0 in exactly the same representation in LE and BE mode. Nothing changes with an endianness mode switch.
That's where load and store come in:
ld1 {v1.2d,v2.2d},[x0]
will load v1 and v2 with one-dimensional vectors from memory. So v1.d[0] will be read from x0+0, v1.d[1] from x0+8 (bytes) and v2.d[0] from x0+16 and v2.d[1] from x0+24. That'll also be the same in LE and BE mode because that's the structure of the vector prescribed by the load operation we choose. Endianness will be applied to the individual doublewords but the order in which they're loaded from memory and in which they're put into d[0] and d[1] won't change, because they're vectors.
So if you've actually stored a vector from CPU registers using st1 {v1.2d, v2.2d},[x0] and then load them back using ld1 {v1.2d, v2.2d},[x0] there's nothing else that needs to be done. The individual bytes of the doublewords will be stored LE in memory in LE mode and BE in BE mode but you won't notice. And the order of the doublewords in memory will be the same in both modes.
If you're loading something that isn't stored LE or has no endianness at all, e.g. just a sequence of data bytes (as in DATA in our code) or something that was explicitly stored BE even on an LE CPU (as in TABLE[128] in our code, I gather) but want to treat it as a larger datatype, then you have to define endianness and need to apply correction. That's why we need to rev64 in one mode (e.g. LE) to get the same register-content in both endianness modes if what's loaded isn't actually stored in that endianness in memory.
Another way is to explicitly load a vector of bytes using ld1 {v1.16b, v2.16b},[x0]. Then you can be sure what you get as register content, no matter what memory endianness the CPU is using. If it's really just a sequence of data bytes stored in their correct and necessary order in memory and we only want to apply shifts and logical operations to each of them, we'd be all set.
Here we can also exploit but need to be careful to understand the different views on the register, so the fact that b[0] through b[7] is mapped to d[0] and that b[0] will be the least significant byte in d[0] and b[7] will be MSB. This layout is cpu-native, i.e. also the same in both endianness modes. It's just that an ld1 {v1.16b} will always load a vector of bytes with eight elements as consecutive bytes from memory into b[0] through b[7], so it'll always be an LSB-first load when interpreted as a larger data type. If we then look at that data trough d[0] it will appear reversed if it isn't really a doubleword that was stored little-endian.
That's why an ld1 {v1.b16,v2.b16},[x0] will produce incorrect results with a pmull2 v0.1q,v1.2d,v2.2d in at least one endianness because we're telling one operation that it's dealing with a byte-vector and the other expects us to provide a vector of doublewords. If what we're loading is actually something that was stored as doublewords in current memory endianness, then ld1 {v1.2d,v2.2d} is the correct load operation. If it's data bytes we want to *treat* as a big-endian doubleword, we can use either ld1 {v1.16b,v2.16b} or {v1.2d,v2.2d} but in both cases need to rev64 the register content if memory endianness is LE.
Now what *ldr* does is load a single 128bit quadword. And this will indeed transpose the doublewords in BE mode when looked at through d[0] and d[1]. Because as a big-endian load it will of course load the byte at x0 into the most significant byte of e.g. v2, i.e. v2.d[1], i.e. v2.b[15] and not v2.d[0], i.e. v2.b[7] (as with ld1.2d) or v2.b[0] (as with ld1.16b). Similarly, x0+15 will go into v2.b[0] in BE and v2.b[15] in LE mode. So this will only make sense if what we're loading was actually stored using str as a 128bit quadword in current memory endianness. If it's a sequence of bytes (st1.16b) we want to treat as a vector of doublewords, not only will the bytes appear inverted when looked at through d[0] and d[1] but also what's in d[0] will be in d[1] in the other endianness mode and vice-versa. If it's a vector of doublewords in memory endianness (st1.2d), byte order in the register will be correct in both modes (because it's different in memory) but d[0] and d[1] will still be transposed.
That's where all my rambling about doubleword transposition came from. Does that make sense?
I just found this document from the LLVM guys with pictures! :) https://llvm.org/docs/BigEndianNEON.html
BTW: ARM even goes as far as always storing *instructions* themselves, so the actual opcodes the CPU decodes and executes, little-endian, even in BE binaries. So the instruction fetch and decode stage always operates little-endian. When the instruction is executed it's then just an additional flag that tells load and store instructions how to behave when executed and accessing memory. (I'm actually extrapolation from what I know to be true for classic arm32 but it makes sense for that to be true for aarch64 as well.)
That explains everything. It also explains why the ld1 instruction reverses the byte order according to the load type on BE and always maintains the same order on LE. The non-memory-related instructions behave the same no matter what endianness mode they run in, as they should. Thanks for the detailed explanation. This scheme has a couple of advantages:
- Taking advantage of the performance benefit of the LE data layout on both the memory and the register side.
- Eliminating the overhead caused by transposing the data order for every potential load/store operation on LE, since it's the more popular mode.
I gather that you (same as me) prefer to think in big-endian
representation. As arm and aarch64 default to little-endian, do you think the routine could be changed to move the special endianness treatment using rev64 to BE mode, i.e. avoid the rev64s in the standard LE case? It's certainly beyond me but it might give some additional speedup.
Or would it be irrelevant compared to the speedup already given by using pmull in the first place?
I don't know how it's going to affect the performance, but it's an irrelevant margin indeed. TBH I liked the patch with the special endianness treatment, but it's up to you to decide!
And as always after all this guesswork I have found a likely very relevant comment in gcm.c:
/* Shift uses big-endian representation. */
#if WORDS_BIGENDIAN
  reduce = shift_table[x->u64[1] & 0xff];
Is that it? Or is TABLE just internal to the routine and we can store there however we please? (Apart from H at TABLE[128] initialised for us by gcm_set_key and stored BE?)
The assembly implementation of GHASH has a whole different scheme from the C table-lookup implementation, so you don't have to worry about any of that.
Perfect. So whether we use ld1/st1.16b or ld1/st1.2d for TABLE doesn't matter. I wouldn't expect it but we could benchmark whether one is faster than the other though!?
Yeah, it doesn't matter, since gcm_init_key() and gcm_hash() are the only functions that use the table. Keeping it ld1/st1.16b is fine; either way, there is a table layout at the header of the file that gives a sense of the table structure used by the assembly implementation.
For clarification: How is H, i.e. TABLE[128], defined as an interface to gcm_set_key? I see that gcm_set_key calls a cipher function to fill it. So I guess it provides the routine with a sequence of bytes (similar to DATA), i.e. the key, which will be the same on LE and BE, and we *treat* it as a big-endian doubleword for the sake of using pmull on it. Correct?
The subkey 'H' value is calculated by enciphering (usually using AES) an all-zero block of data, then gcm_set_key() assigns the calculated value (subkey 'H') to the middle of the TABLE array, that is TABLE[0x80]. The remaining fields of the array are meant to be filled by the C gcm_init_key() routine to serve as assistance subkeys for the C table-lookup implementation. Since the assembly implementation uses a different scheme, we don't need those assistance subkeys, so we grab the main subkey (H) value from the middle of the table and hook the assistance values we need onto this table so they can be used by gcm_hash(). Hope it makes sense to you; let me know if you want further explanation.
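A rough C sketch of that flow (an illustration only, not the actual Nettle code or API; encrypt_block and the table shape are made up for the example):

  #include <stdint.h>
  #include <string.h>

  /* Hypothetical 16-byte block cipher call standing in for the cipher
     configured by gcm_set_key (usually AES). */
  void encrypt_block(const void *cipher_ctx, uint8_t dst[16], const uint8_t src[16]);

  void example_set_gcm_subkey(const void *cipher_ctx, uint8_t table[256][16])
  {
    static const uint8_t zero_block[16] = {0};
    uint8_t H[16];

    encrypt_block(cipher_ctx, H, zero_block);  /* H = E_K(0^128) */
    memcpy(table[0x80], H, 16);                /* H lands in the middle of the table */
  }

gcm_init_key() (or the assembly replacement for it) then reads H back from that slot and fills in whatever helper values its own scheme needs.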
regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
The subkey 'H' value is calculated by enciphering (usually using AES) an all-zero block of data, then gcm_set_key() assigns the calculated value (subkey 'H') to the middle of the TABLE array, that is TABLE[0x80],
And the reason for it being stored in the *middle* is the "unnatural" gcm bitorder. The C implementation uses the table for the gcm multiplication, using 8 bits at a time from one of the inputs as the table index. Conceptually, the H value belongs at index 1 in the table, 0000 0001 in binary, but in gcm's opposite bitorder world, that corresponds to 1000 0000. If I remember correctly, the implementation using 8 bit indexing, including the table layout, closely follows the original gcm papers.
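A quick way to see that index (an illustrative sketch, not Nettle code): reversing the bits of the byte 0000 0001 gives 1000 0000, i.e. 0x80 = 128, which is why H ends up in the middle of the 256-entry table.

  #include <stdint.h>

  /* Reverse the bit order of a byte, mirroring gcm's reversed bit order. */
  static uint8_t reverse_bits(uint8_t b)
  {
    uint8_t r = 0;
    for (int i = 0; i < 8; i++)
      r |= (uint8_t)(((b >> i) & 1) << (7 - i));
    return r;
  }

  /* reverse_bits(0x01) == 0x80, i.e. table index 128. */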
Regards, /Niels
Hello Mamone,
On Sun, Jan 24, 2021 at 06:44:33PM +0200, Maamoun TK wrote:
representation. As arm and aarch64 default to little-endian, do you think the routine could be changed to move the special endianness treatment using rev64 to BE mode, i.e. avoid the rev64s in the standard LE case? It's certainly beyond me but it might give some additional speedup.
Or would it be irrelevant compared to the speedup already given by using pmull in the first place?
I don't know how it's going to affect the performance, but it's an irrelevant margin indeed. TBH I liked the patch with the special endianness treatment, but it's up to you to decide!
As you might expect, I like the one where doubleword vectors are used throughout and stored in host endianness in TABLE, because to me it's the most intuitive. For DATA my rationale is that if we want to *treat* it as big-endian doublewords, we should load it as doublewords to make it clearer why and what we need to adjust afterwards. It also avoids the rev64s with BE. I've added some comments with the rationale. I've added a README with an excerpt of the last email as well. Attached are the current patches, the first being your original. What do you think?
As said, I'm up for looking into endianness-specific versions of the macros again. But what were supposed to be the LE versions of PMUL and friends have now become the BE-native versions, and we'd need to come up with variants of them that make the rev64s unnecessary. Any ideas?
Hello Michael,
On Mon, Jan 25, 2021 at 8:45 PM Michael Weiser michael.weiser@gmx.de wrote:
Attached are the current patches, the first being your original. What do you think?
I liked how the patch ended up so far; just give me one or two days to give the patch an additional review before passing it on to Niels.
As said, I'm up for looking into endianness-specific versions of the macros again. But what were supposed to be the LE versions of PMUL and friends have now become the BE-native versions, and we'd need to come up with variants of them that make the rev64s unnecessary. Any ideas?
Are you looking to remove the rev64s on LE? If so, I don't think we can come up with a variant that allows us to continue working on an unsorted register value on LE, as pmull requires the input to be sorted properly, that is, transposed doublewords.
regards, Mamone
Hello Mamone,
On Tue, Jan 26, 2021 at 07:15:22PM +0200, Maamoun TK wrote:
Attached are the current patches, the first being your original. What do you think?
I liked how the patch ended up so far; just give me one or two days to give the patch an additional review before passing it on to Niels.
Perfect.
As said, I'm up for looking into endianness-specific versions of the macros again. But what were supposed to be the LE versions of PMUL and friends have now become the BE-native versions, and we'd need to come up with variants of them that make the rev64s unnecessary. Any ideas?
Are you looking to remove the rev64s on LE? If so, I don't think we can come up with a variant that allows us to continue working on an unsorted register value on LE, as pmull requires the input to be sorted properly, that is, transposed doublewords.
Let's leave it as it is then. I've caused enough effort with my little hobby of running an ARM BE system for now. :)
On Wed, Jan 27, 2021 at 12:45 AM Michael Weiser michael.weiser@gmx.de wrote:
I've caused enough effort with my little hobby of running an ARM BE system for now. :)
Thank you for the great work, we're now able to run the optimized gcm core on big-endian arm64 systems. I enjoyed working with you to get this done, and I also learned a lot about arm endianness.
regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
Are you looking to remove the rev64s on LE? If so, I don't think we can come up with a variant that allows us to continue working on an unsorted register value on LE, as pmull requires the input to be sorted properly, that is, transposed doublewords.
I haven't been following along closely, but it would be nice if gcm_hash could work with a minimum of data shuffling, and let gcm_init_key move the precomputed data around for the best layout.
Regards, /Niels
On Mon, Jan 25, 2021 at 8:45 PM Michael Weiser michael.weiser@gmx.de wrote:
Attached are the current patches.
Everything looks fine to me. I made an additional review and the code seems good for both endianness modes. The patches pass the testsuite on little-endian and big-endian (thanks to Michael Weiser for providing a ready-to-go environment to test the patch in big-endian mode). I made one more patch that adds a proper copyright notice and removes an unused define.
regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
Everything looks fine to me. I made an additional review and the code seems good for both endianness modes. The patches pass the testsuite on little-endian and big-endian (thanks to Michael Weiser for providing a ready-to-go environment to test the patch in big-endian mode). I made one more patch that adds a proper copyright notice and removes an unused define.
Nice!
I've merged the easy parts, machine.m4 and README, onto the arm64 branch. It's not crystal clear to me how the more interesting parts relate, though.
Is 0001-Mamone-s-unmodified-patch.patch the same as https://git.lysator.liu.se/nettle/nettle/-/merge_requests/13? Do you want to update the merge request with recent changes (on top of the current arm64 branch), or should I merge mr13 as is, and then add the other two patches (Michaels's BE support and this "adds proper copyright and removes unused define") on top?
Regards, /Niels
On Sat, Jan 30, 2021 at 6:07 PM Niels Möller nisse@lysator.liu.se wrote:
Is 0001-Mamone-s-unmodified-patch.patch the same as https://git.lysator.liu.se/nettle/nettle/-/merge_requests/13? Do you want to update the merge request with recent changes (on top of the current arm64 branch), or should I merge mr13 as is, and then add the other two patches (Michaels's BE support and this "adds proper copyright and removes unused define") on top?
The merge request is out of date and should be closed. You just need to merge 0001-Mamone-s-unmodified-patch.patch then 0003-aarch64-Adjust-gcm-hash-assembly-for-big-endian-syst.patch on top of the former.
regards, Mamone
This is a new patch to fix the clang build if "armv8-a-crypto" is enabled and should be applied on top of the previous patches.
regards, Mamone
On Sun, Jan 31, 2021 at 1:17 AM Maamoun TK maamoun.tk@googlemail.com wrote:
On Sat, Jan 30, 2021 at 6:07 PM Niels Möller nisse@lysator.liu.se wrote:
Is 0001-Mamone-s-unmodified-patch.patch the same as https://git.lysator.liu.se/nettle/nettle/-/merge_requests/13? Do you want to update the merge request with recent changes (on top of the current arm64 branch), or should I merge mr13 as is, and then add the other two patches (Michaels's BE support and this "adds proper copyright and removes unused define") on top?
The merge request is out of date and should be closed. You just need to merge 0001-Mamone-s-unmodified-patch.patch then 0003-aarch64-Adjust-gcm-hash-assembly-for-big-endian-syst.patch on top of the former.
regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
This is a new patch to fix the clang build if "armv8-a-crypto" is enabled and should be applied on top of the previous patches.
Thanks, merged all the changes to the arm64 branch. Let me know if there's anything I missed. I have a few comments on the main patch, I'll write that in a separate mail.
Regards, /Niels
Michael Weiser michael.weiser@gmx.de writes:
Subject: [PATCH 1/4] Mamone's unmodified patch
Hi, I've merged this, but I have a couple of comments and questions.
--- a/Makefile.in
+++ b/Makefile.in
@@ -616,6 +616,7 @@ distdir: $(DISTFILES)
 	set -e; for d in sparc32 sparc64 x86 \
 	  x86_64 x86_64/aesni x86_64/sha_ni x86_64/fat \
 	  arm arm/neon arm/v6 arm/fat \
+	  arm64 arm64/v8 \
Why the name "v8" for the directory, aren't arm64 and v8 more or less synonyms? I think it would make more sense with a name connected to the extension needed for the pmull instructions.
--- /dev/null
+++ b/arm64/v8/gcm-hash.asm
@@ -0,0 +1,343 @@
+C common macros:
+.macro PMUL in, param1, param2
+    pmull    F.1q,\param2().1d,\in().1d
+    pmull2   F1.1q,\param2().2d,\in().2d
+    pmull    R.1q,\param1().1d,\in().1d
+    pmull2   R1.1q,\param1().2d,\in().2d
+    eor      F.16b,F.16b,F1.16b
+    eor      R.16b,R.16b,R1.16b
+.endm
For consistency, I'd prefer defining all needed macros using m4.
--- a/configure.ac
+++ b/configure.ac
@@ -81,6 +81,10 @@
 AC_ARG_ENABLE(arm-neon,
   AC_HELP_STRING([--enable-arm-neon], [Enable ARM Neon assembly. (default=auto)]),,
   [enable_arm_neon=auto])
+AC_ARG_ENABLE(armv8-a-crypto,
+  AC_HELP_STRING([--enable-armv8-a-crypto], [Enable Armv8-A Crypto extension. (default=no)]),,
+  [enable_armv8_a_crypto=no])
I think this would be more user-friendly without the "a": --enable-armv8-crypto, or --enable-arm64-crypto. Or do you foresee any collision with an incompatible ARMv8-M crypto extension or the like?
  aarch64*)
    if test "$enable_armv8_a_crypto" = yes ; then
      if test "$ABI" = 64 ; then
        CFLAGS="$CFLAGS -Wa,-march=armv8-a+crypto"
(This looks slightly different after merging all the changes).
I think it's unfortunate to have to modify CFLAGS, and in particular using compiler-specific options. Is there any way to use a pseudoop in the .asm file instead, similar to the .fpu neon used in the arm/neon/ files?
One could also consider introducing a separate ASMFLAGS make variable (suggested earlier by Jeffrey Walton, for other reasons).
Regards, /Niels
Hello Niels,
I think this would be more user-friendly without the "a": --enable-armv8-crypto, or --enable-arm64-crypto. Or do you foresee any collision with an incompatible ARMv8-M crypto extension or the like?
FWIW, I like --enable-arm64-crypto because it would nicely match with a directory arm64/crypto for the source and the idea of enabling the crypto extension for the arm64 target of nettle and be in line with --enable-arm-neon and arm/neon as well.
  aarch64*)
    if test "$enable_armv8_a_crypto" = yes ; then
      if test "$ABI" = 64 ; then
        CFLAGS="$CFLAGS -Wa,-march=armv8-a+crypto"
(This looks slightly different after merging all the changes).
I think it's unfortunate to have to modify CFLAGS, and in particular using compiler-specific options. Is there any way to use a pseudoop in the .asm file instead, similar to the .fpu neon used in the arm/neon/ files?
With binutils gas, both .arch and .arch_extension seem to do what you describe. Based on when they appeared in the manual, both are supported in gas since 2.26[4]. I've done a quick test with 2.35.1. I have successfully tried both
.arch armv8-a+crypto (the -a is required here, otherwise errors are still thrown for uses of pmull with just armv8 or armv8-r)
and
.arch_extension crypto
The testsuite still runs with both on LE and BE cross-compiled and run under qemu-user.
binutils 2.26 also knows the crypto extension name and was released in January 2016. aarch64 support seems to have been introduced in 2.23 (October 2012), and with 2.25 (July 2015) the crypto flag to the -march command line option was added. (All based on when it appeared in the documentation.) So we'd likely have a dependency on 2.25 by using the -march option already, and 2.26 wouldn't be a big step.
[4] https://sourceware.org/binutils/docs-2.26/as/AArch64-Directives.html
All this is gas-specific though, I would assume. Some discussion of compatible extensions to llvm-as seems to have happened in 2018 but I have not researched what came out of it[5]. The recent date, and the fact that it's the first search hit with no others linking to documentation or such, doesn't bode well IMO. It might as well be that llvm-as just knows the pmull instruction and assembles it fine but can't check whether the target CPU will be able to run it.
[5] https://lists.llvm.org/pipermail/llvm-dev/2018-September/126346.html
What other assemblers for aarch64 do you have in mind?
On Sun, Jan 31, 2021 at 10:00 PM Michael Weiser michael.weiser@gmx.de wrote:
It might as well be that llvm-as just knows the pmull instruction and assembles it fine but can't check if the target CPU will be able to run it.
llvm-as wouldn't recognize the pmull instruction without adding the -march=armv8-a+crypto flag, at least with the version I use (3.8.1). I tried both .arch armv8-a+crypto and .arch_extension crypto and they worked only for gas, while llvm-as produces errors for the pmull usage.
regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
llvm-as wouldn't recognize the pmull instruction without adding the -march=armv8-a+crypto flag, at least with the version I use (3.8.1). I tried both .arch armv8-a+crypto and .arch_extension crypto and they worked only for gas, while llvm-as produces errors for the pmull usage.
Is there any documentation for llvm-as? Best I could find is the minimal man page https://www.llvm.org/docs/CommandGuide/llvm-as.html, with no info whatsoever on, e.g., supported pseudoops.
Regards, /Niels
Hello Niels,
On Tue, Feb 02, 2021 at 07:40:44AM +0100, Niels Möller wrote:
llvm-as wouldn't recognize the pmull instruction without adding the -march=armv8-a+crypto flag, at least with the version I use (3.8.1).
3.8.1 was released in 2017. It might not support recent aarch64 additions regarding .arch directive and friends.
I tried both .arch armv8-a+crypto and .arch_extension crypto and they worked only for gas, while llvm-as produces errors for the pmull usage.
Is there any documentation for llvm-as? Best I could find is the minimal man page https://www.llvm.org/docs/CommandGuide/llvm-as.html, with no info whatsoever on, e.g., supported pseudoops.
I think my mentioning of llvm-as was a red herring. Looking at the output of clang -v, llvm-as isn't involved at all. This is supported by the man page stating that llvm-as accepts LLVM assembly and emits LLVM bitcode. It appears clang implements the assembler internally, and we'd need documentation on that. The clang man page even says so:
# man clang | grep assembler
Clang also supports the use of an integrated assembler, in which the code generator produces object files directly. This avoids the overhead of generating the ".s" file and of calling the target assembler.
With that info I find [1] which lists the .arch directive including +crypto syntax. armclang seems to be the official ARM toolchain.[2]
[1] https://www.keil.com/support/man/docs/armclang_ref/armclang_ref_hhk151067459... [2] https://developer.arm.com/tools-and-software/embedded/arm-compiler/downloads...
It is unclear to me if it's available upstream as well or an ARM addition to the assembler. I'll try to get clang/llvm installed on my pine64 boards for tests. That might take a few days though. :) I'll see if I can try a prebuilt toolchain in the meantime.
Calling clang on an assembly source with extension .s, it calls itself with the (undocumented) option -cc1as, so likely again the integrated assembler:
# clang -v -c -o t.o t.s
clang version 11.0.1
Target: x86_64-pc-linux-gnu
Thread model: posix
InstalledDir: /usr/lib/llvm/11/bin
Selected GCC installation: /usr/lib/gcc/x86_64-pc-linux-gnu/10.2.0
Candidate multilib: .;@m64
Candidate multilib: 32;@m32
Selected multilib: .;@m64
 "/usr/lib/llvm/11/bin/clang-11" -cc1as -triple x86_64-pc-linux-gnu -filetype obj -main-file-name t.s -target-cpu x86-64 -fdebug-compilation-dir /home/m -dwarf-debug-producer "clang version 11.0.1" -dwarf-version=4 -mrelocation-model static -o t.o t.s
On Tue, Feb 2, 2021 at 8:00 AM Michael Weiser michael.weiser@gmx.de wrote:
llvm-as wouldn't recognize the pmull instruction without adding the -march=armv8-a+crypto flag, at least with the version I use (3.8.1).
3.8.1 was released in 2017. It might not support recent aarch64 additions regarding .arch directive and friends.
I tried both .arch armv8-a+crypto and .arch_extension crypto and they worked only for gas, while llvm-as produces errors for the pmull usage.
Is there any documentation for llvm-as? Best I could find is the minimal man page https://www.llvm.org/docs/CommandGuide/llvm-as.html, with no info whatsoever on, e.g., supported pseudoops.
I think my mentioning of llvm-as was a red herring. Looking at the output of clang -v, llvm-as isn't involved at all. This is supported by the man page stating that llvm-as accepts LLVM assembly and emits LLVM bitcode. It appears clang implements the assembler internally, and we'd need documentation on that. The clang man page even says so:
Clang always uses its integrated assembler unless you pass -fno-integrated-as. If you use -fno-integrated-as, then be sure you have an assembler that supports the ISA you are targeting. On OS X, GNU's AS may not support the ISA.
Clang's assembler is crippled on OS X. Apple's Clang still does not support pmull or crc instructions.
Jeff
On Tue, Feb 2, 2021 at 8:19 AM Jeffrey Walton noloader@gmail.com wrote:
On Tue, Feb 2, 2021 at 8:00 AM Michael Weiser michael.weiser@gmx.de wrote:
llvm-as wouldn't recognize the pmull instruction without adding the -march=armv8-a+crypto flag, at least with the version I use (3.8.1).
3.8.1 was released in 2017. It might not support recent aarch64 additions regarding .arch directive and friends.
I tried both .arch armv8-a+crypto and .arch_extension crypto and they worked only for gas, while llvm-as produces errors for the pmull usage.
Is there any documentation for llvm-as? Best I could find is the minimal man page https://www.llvm.org/docs/CommandGuide/llvm-as.html, with no info whatsoever on, e.g., supported pseudoops.
I think my mentioning of llvm-as was a red herring. Looking at the output of clang -v, llvm-as isn't involved at all. This is supported by the man page stating that llvm-as accepts LLVM assembly and emits LLVM bitcode. It appears clang implements the assembler internally, and we'd need documentation on that. The clang man page even says so:
Clang always uses its integrated assembler unless you pass -fno-integrated-as. If you use -fno-integrated-as, then be sure you have an assembler that supports the ISA you are targeting. On OS X, GNU's AS may not support the ISA.
Clang's assembler is crippled on OS X. Apple's Clang still does not support pmull or crc instructions.
And I forgot to mention... On OS X, when using a port like MacPorts with GCC... You want to pass -Wa,-q to GCC so GCC uses Clang's integrated assembler. Without -Wa,-q, GCC will try to use GNU's AS.
Jeff
Hi all,
On Tue, Feb 02, 2021 at 08:23:39AM -0500, Jeffrey Walton wrote:
I think my mentioning of llvm-as was a red herring. Looking at the output of clang -v, llvm-as isn't involved at all. This is supported by the man page stating that llvm-as accepts LLVM assembly and emits LLVM bitcode. It appears clang implements the assembler internally, and we'd need documentation on that. The clang man page even says so:
Clang always uses its integrated assembler unless you pass -fno-integrated-as. If you use -fno-integrated-as, then be sure you
I've downloaded binary builds of clang for aarch64 from https://releases.llvm.org/download.html. 3.9.1 was the oldest prebuilt toolchain I could find there and 11.0.0 the most recent.
As expected, a one-liner with just a pmull throws errors with gas and the two clangs:
$ cat t.s
pmull v2.1q, v2.1d, v1.1d
$ aarch64-unknown-linux-gnu-as -v -o t.o t.s
GNU assembler version 2.35.1 (aarch64-unknown-linux-gnu) using BFD version (Gentoo 2.35.1 p2) 2.35.1
t.s: Assembler messages:
t.s:1: Error: selected processor does not support `pmull v2.1q,v2.1d,v1.1d'
$ clang+llvm-3.9.1-aarch64-linux-gnu/bin/clang -c -o t.o t.s
t.s:1:1: error: instruction requires: crypto
pmull v2.1q, v2.1d, v1.1d
^
$ clang+llvm-11.0.0-aarch64-linux-gnu/bin/clang -c -o t.o t.s
t.s:1:1: error: instruction requires: aes
pmull v2.1q, v2.1d, v1.1d
^
This can be solved for all three with the -march option:
$ aarch64-unknown-linux-gnu-as -o t.o t.s -march=armv8-a+crypto
$ clang+llvm-3.9.1-aarch64-linux-gnu/bin/clang -c -o t.o t.s -march=armv8-a+crypto
$ clang+llvm-11.0.0-aarch64-linux-gnu/bin/clang -c -o t.o t.s -march=armv8-a+crypto
$
They also all support the .arch directive:
$ cat t.s
.arch armv8-a+crypto
pmull v2.1q, v2.1d, v1.1d
$ aarch64-unknown-linux-gnu-as -o t.o t.s
$ clang+llvm-3.9.1-aarch64-linux-gnu/bin/clang -c -o t.o t.s
$ clang+llvm-11.0.0-aarch64-linux-gnu/bin/clang -c -o t.o t.s
$
clang does not, however, support the .arch_extension directive. 3.9.1 complains about the directive, 11.0.0 seems to silently ignore it:
$ cat t.s
.arch_extension crypto
pmull v2.1q, v2.1d, v1.1d
$ aarch64-unknown-linux-gnu-as -o t.o t.s
$ clang+llvm-3.9.1-aarch64-linux-gnu/bin/clang -c -o t.o t.s
t.s:1:1: error: unknown directive
.arch_extension crypto
^
t.s:2:1: error: instruction requires: crypto
pmull v2.1q, v2.1d, v1.1d
^
$ clang+llvm-11.0.0-aarch64-linux-gnu/bin/clang -c -o t.o t.s
t.s:2:1: error: instruction requires: aes
pmull v2.1q, v2.1d, v1.1d
^
Michael Weiser michael.weiser@gmx.de writes:
I've downloaded binary builds of clang for aarch64 from https://releases.llvm.org/download.html. 3.9.1 was the oldest prebuilt toolchain I could find there and 11.0.0 the most recent.
[...]
They also all support the .arch directive:
$ cat t.s
.arch armv8-a+crypto
pmull v2.1q, v2.1d, v1.1d
$ aarch64-unknown-linux-gnu-as -o t.o t.s
$ clang+llvm-3.9.1-aarch64-linux-gnu/bin/clang -c -o t.o t.s
$ clang+llvm-11.0.0-aarch64-linux-gnu/bin/clang -c -o t.o t.s
Thanks for investigating. The .arch pseudoop it is, then.
I've pushed a change to use that, instead of modifying CFLAGS.
Regards, /Niels
Hello Niels,
On Tue, Feb 02, 2021 at 06:09:42PM +0100, Niels Möller wrote:
I've downloaded binary builds of clang for aarch64 from https://releases.llvm.org/download.html. 3.9.1 was the oldest prebuilt toolchain I could find there and 11.0.0 the most recent.
[...]
They also all support the .arch directive:
$ cat t.s
.arch armv8-a+crypto
pmull v2.1q, v2.1d, v1.1d
$ aarch64-unknown-linux-gnu-as -o t.o t.s
$ clang+llvm-3.9.1-aarch64-linux-gnu/bin/clang -c -o t.o t.s
$ clang+llvm-11.0.0-aarch64-linux-gnu/bin/clang -c -o t.o t.s
Thanks for investigating. The .arch pseudoop it is, then.
I've pushed a change to use that, instead of modifying CFLAGS.
The arm64 branch builds and passes the testsuite on aarch64 and aarch64_be with gcc 10.2 and clang 11.0.1 with and without the optimized assembly routines on my pine64 boards. This is with the .arch directive instead of modifying CFLAGS and the new configure option name --enable-arm64-crypto.
Out of curiosity I've also collected some benchmark numbers for gcm_aes256. (Is that a correct and sensible algorithm for that purpose?)
The speedup from using pmull seems to be around 35% for encrypt/decrypt.
Interestingly, LE is about a cycle per block faster than BE even though it should have quite a few more rev64s to execute than BE. Could this be masked by memory accesses, pipelining or scheduling?
How is the massive speedup in update to be interpreted, and the fact that BE here is indeed quite a bit faster than LE? Do I understand correctly that on update only GCM is run, on unencrypted data, for authentication purposes, so that this number really indicates the pure GCM pmull speedup? If so, it would indicate a 19-fold speedup and an 8.6% advantage for BE.
What's also curious is that the system's openssl 1.1.1i is consistently reported as an order of magnitude faster than nettle. I guess the major factor is that there's no optimized AES for aarch64 in nettle yet, which openssl seems to have. So I built an openssl 1.1.1i without assembly, which produced the last benchmark and supports that guess.
cat /sys/devices/system/cpu/cpufreq/policy0/scaling_governor
performance
cat /sys/devices/system/cpu/cpufreq/policy0/cpuinfo_max_freq
1152000
LD_LIBRARY_PATH=../.lib ./nettle-benchmark -f 1.152e9 gcm_aes256
Algorithm mode Mbyte/s cycles/byte cycles/block
aarch64-le gcc 10.2 with arm64-crypto:
gcm_aes256 encrypt 29.42 37.34 597.41
gcm_aes256 decrypt 29.43 37.34 597.36
gcm_aes256 update 1417.32 0.78 12.40
openssl gcm_aes256 encrypt 391.93 2.80 44.85
openssl gcm_aes256 decrypt 392.35 2.80 44.80
openssl gcm_aes256 update 1246.04 0.88 14.11

aarch64-be gcc 10.2 with arm64-crypto:
gcm_aes256 encrypt 29.35 37.43 598.82
gcm_aes256 decrypt 29.36 37.42 598.77
gcm_aes256 update 1540.34 0.71 11.41
openssl gcm_aes256 encrypt 398.96 2.75 44.06
openssl gcm_aes256 decrypt 397.66 2.76 44.20
openssl gcm_aes256 update 1306.05 0.84 13.46

aarch64-le clang 11.0.1 with arm64-crypto:
gcm_aes256 encrypt 28.76 38.20 611.15
gcm_aes256 decrypt 28.76 38.19 611.10
gcm_aes256 update 1416.17 0.78 12.41
openssl gcm_aes256 encrypt 392.32 2.80 44.81
openssl gcm_aes256 decrypt 392.35 2.80 44.80
openssl gcm_aes256 update 1247.72 0.88 14.09

aarch64-be clang 11.0.1 with arm64-crypto:
gcm_aes256 encrypt 28.70 38.28 612.53
gcm_aes256 decrypt 28.69 38.29 612.59
gcm_aes256 update 1543.87 0.71 11.39
openssl gcm_aes256 encrypt 399.46 2.75 44.00
openssl gcm_aes256 decrypt 398.90 2.75 44.07
openssl gcm_aes256 update 1317.87 0.83 13.34
aarch64-le gcc 10.2 without arm64-crypto:
gcm_aes256 encrypt 21.43 51.27 820.28
gcm_aes256 decrypt 21.43 51.27 820.30
gcm_aes256 update 74.39 14.77 236.30
openssl gcm_aes256 encrypt 391.93 2.80 44.85
openssl gcm_aes256 decrypt 392.17 2.80 44.82
openssl gcm_aes256 update 1245.13 0.88 14.12

aarch64-be gcc 10.2 without arm64-crypto:
gcm_aes256 encrypt 21.71 50.60 809.58
gcm_aes256 decrypt 21.72 50.59 809.43
gcm_aes256 update 79.01 13.90 222.47
openssl gcm_aes256 encrypt 398.43 2.76 44.12
openssl gcm_aes256 decrypt 398.67 2.76 44.09
openssl gcm_aes256 update 1309.52 0.84 13.42

aarch64-le clang 11.0.1 without arm64-crypto:
gcm_aes256 encrypt 18.98 57.89 926.29
gcm_aes256 decrypt 18.98 57.89 926.22
gcm_aes256 update 53.67 20.47 327.53
openssl gcm_aes256 encrypt 392.16 2.80 44.82
openssl gcm_aes256 decrypt 392.17 2.80 44.82
openssl gcm_aes256 update 1248.30 0.88 14.08

aarch64-be clang 11.0.1 without arm64-crypto:
gcm_aes256 encrypt 18.89 58.16 930.49
gcm_aes256 decrypt 18.85 58.28 932.54
gcm_aes256 update 53.67 20.47 327.53
openssl gcm_aes256 encrypt 399.36 2.75 44.02
openssl gcm_aes256 decrypt 398.87 2.75 44.07
openssl gcm_aes256 update 1318.44 0.83 13.33
aarch64-be gcc 10.2 without arm64-crypto and with no-asm openssl:
LD_LIBRARY_PATH=../../openssl-1.1.1i:../.lib ./nettle-benchmark -f 1.152e9 gcm_aes256
Algorithm mode Mbyte/s cycles/byte cycles/block
gcm_aes256 encrypt 21.72 50.59 809.43
gcm_aes256 decrypt 21.72 50.59 809.45
gcm_aes256 update 79.02 13.90 222.45

openssl gcm_aes256 encrypt 21.06 52.17 834.70
openssl gcm_aes256 decrypt 21.34 51.49 823.82
openssl gcm_aes256 update 56.18 19.55 312.87
x86_64 Intel Skylake laptop gcc 10.2 fat as sanity check:
NETTLE_FAT_VERBOSE=1 LD_LIBRARY_PATH=../.lib ./nettle-benchmark -f 4.6e9 aes256
libnettle: fat library initialization.
libnettle: cpu features: vendor:intel,aesni
libnettle: using aes instructions.
libnettle: not using sha_ni instructions.
libnettle: intel SSE2 will be used for memxor.
sha1_compress: 209.50 cycles
salsa20_core: 205.70 cycles
sha3_permute: 918.50 cycles (38.27 / round)
Algorithm mode Mbyte/s cycles/byte cycles/block
aes256 ECB encrypt 4856.60 0.90 14.45
aes256 ECB decrypt 4800.03 0.91 14.62
aes256 CBC encrypt 889.91 4.93 78.87
aes256 CBC decrypt 4331.24 1.01 16.21
aes256 (in-place) 3516.29 1.25 19.96
aes256 CTR 3131.58 1.40 22.41
aes256 (in-place) 2826.07 1.55 24.84

openssl aes256 ECB encrypt 4840.40 0.91 14.50
openssl aes256 ECB decrypt 4835.88 0.91 14.51

gcm_aes256 encrypt 585.60 7.49 119.86
gcm_aes256 decrypt 585.29 7.50 119.92
gcm_aes256 update 697.69 6.29 100.60

openssl gcm_aes256 encrypt 4499.49 0.97 15.60
openssl gcm_aes256 decrypt 4498.84 0.98 15.60
openssl gcm_aes256 update 11383.81 0.39 6.17
Just out of curiosity: I assume there's no aesni-pmull-like GCM implementation for x86_64?
Michael Weiser michael.weiser@gmx.de writes:
The arm64 branch builds and passes the testsuite on aarch64 and aarch64_be with gcc 10.2 and clang 11.0.1 with and without the optimized assembly routines on my pine64 boards. This is with the .arch directive instead of modifying CFLAGS and the new configure option name --enable-arm64-crypto.
Thanks for testing! (My own testing was done with cross-compiler and user-level qemu).
Out of curiosity I've also collected some benchmark numbers for gcm_aes256. (Is that a correct and sensible algorithm for that purpose?)
I think that's appropriate for benchmarking gcm_hash, but the "update" numbers are the ones that reflect gcm_hash performance.
The speedup from using pmull seems to be around 35% for encrypt/decrypt.
Interestingly, LE is about a cycle per block faster than BE even though it should have quite a few more rev64s to execute than BE. Could this be masked by memory accesses, pipelining or scheduling?
For the encrypt/decrypt operations, you also run AES (in CTR mode), which works with little-endian data.
How is the massive speedup in update to be interpreted and that BE here is indeed quite a bit faster than LE? Do I understand correctly that on update only GCM is run on unencrypted data for authentication purposes so that this number really indicates the pure GCM pmull speedup?
That's right, the "update" numbers run only the authentication part of gcm, i.e., gcm_hash. That's useful for benchmarking gcm_hash, but probably not so relevant for real-world applications, since I'd expect it's rare to pass large amounts of "associated data" to gcm.
What's also curious is that the system's openssl 1.1.1i is consistenly reported an order of magnitude faster than nettle. I guess the major factor is that there's no optimized AES for aarch64 yet in nettle which openssl seems to have.
That would be my guess too. And if we look at the update numbers only, the new code appears a bit faster than openssl.
Just out of curiosity: I assume there's no aesni-pmull-like GCM implementation for x86_64?
That's right. There's some assembly code, but using the same algorithm as the C implementation, based on table lookups.
Regards, /Niels
On Tue, 2 Feb 2021, Michael Weiser wrote:
clang does not, however, support the .arch_extension directive. 3.9.1 complains about the directive, 11.0.0 seems to silently ignore it:
$ cat t.s
.arch_extension crypto
pmull v2.1q, v2.1d, v1.1d
$ aarch64-unknown-linux-gnu-as -o t.o t.s
$ clang+llvm-3.9.1-aarch64-linux-gnu/bin/clang -c -o t.o t.s
t.s:1:1: error: unknown directive
.arch_extension crypto
^
t.s:2:1: error: instruction requires: crypto
pmull v2.1q, v2.1d, v1.1d
^
$ clang+llvm-11.0.0-aarch64-linux-gnu/bin/clang -c -o t.o t.s
t.s:2:1: error: instruction requires: aes
pmull v2.1q, v2.1d, v1.1d
^
Clang does actually support .arch_extension for aarch64 in general since Clang 8 - but the "crypto" extension seems to be a bit of a special case, as it expands to a number of other features, including aes and sha2, depending on the base architecture level:
https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AArch64/AsmPa...
This routine is called when enabling extensions in .arch, but not in .arch_extension - which is a bug.
So ".arch_extension aes" would work, but setting the extension via the .arch directive is indeed more compatible.
// Martin
Michael Weiser michael.weiser@gmx.de writes:
FWIW, I like --enable-arm64-crypto because it would nicely match with a directory arm64/crypto for the source and the idea of enabling the crypto extension for the arm64 target of nettle and be in line with --enable-arm-neon and arm/neon as well.
I'll rename both the directory and the configure option then.
Regards, /Niels
On Tue, Feb 2, 2021 at 7:22 PM Niels Möller nisse@lysator.liu.se wrote:
Michael Weiser michael.weiser@gmx.de writes:
FWIW, I like --enable-arm64-crypto because it would nicely match with a directory arm64/crypto for the source and the idea of enabling the crypto extension for the arm64 target of nettle and be in line with --enable-arm-neon and arm/neon as well.
I'll rename both the directory and the configure option then.
I agree with the configure option. I also see directories in x86_64 named after the corresponding features, so the "crypto" name makes sense here too.
regards, Mamone
On Sun, Jan 31, 2021 at 10:35 AM Niels Möller nisse@lysator.liu.se wrote:
--- /dev/null
+++ b/arm64/v8/gcm-hash.asm
@@ -0,0 +1,343 @@
+C common macros:
+.macro PMUL in, param1, param2
+    pmull    F.1q,\param2().1d,\in().1d
+    pmull2   F1.1q,\param2().2d,\in().2d
+    pmull    R.1q,\param1().1d,\in().1d
+    pmull2   R1.1q,\param1().2d,\in().2d
+    eor      F.16b,F.16b,F1.16b
+    eor      R.16b,R.16b,R1.16b
+.endm
For consistency, I'd prefer defining all needed macros using m4.
The macros in the gcm-hash.asm file depend on defines in the same file (shared between the macros and the function implementations), as they are relevant to the implementation context; also, moving those macros to another file would be confusing for the reader IMO.
regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
On Sun, Jan 31, 2021 at 10:35 AM Niels Möller nisse@lysator.liu.se wrote:
For consistency, I'd prefer defining all needed macros using m4.
The macros in the gcm-hash.asm file depend on defines in the same file (shared between the macros and the function implementations), as they are relevant to the implementation context; also, moving those macros to another file would be confusing for the reader IMO.
I'm not suggesting moving them to a different file, just changing the definition to use m4 define, something like (untested):
C PMUL(in, param1, param2)
define(`PMUL', `
    pmull    F.1q, $3.1d, $1.1d
    pmull2   F1.1q, $3.2d, $1.2d
    pmull    R.1q, $2.1d, $1.1d
    pmull2   R1.1q, $2.2d, $1.2d
    eor      F.16b, F.16b, F1.16b
    eor      R.16b, R.16b, R1.16b
')
With the recently added m4-utils.m4, one could also add some checking with m4_assert_numargs(3) at the start of the macro definition, but that's completely optional (other similar macros currently don't do that).
Regards, /Niels
On Fri, Jan 22, 2021 at 1:45 AM Michael Weiser michael.weiser@gmx.de wrote:
Do you think it makes sense to try and adjust the code to work with the BE layout natively and have a full 128-bit reverse after ldr-like loads on LE instead (considering that 99.999% of aarch64 users will run LE)?
If you don't have a use-case, we can suspend big-endian support for the GCM optimization on aarch64 until we get a request or use case, or maybe until aarch64_be gets more support from the main distributions in the future.
regards, Mamone
On Fri, Jan 22, 2021 at 5:48 PM Maamoun TK maamoun.tk@googlemail.com wrote:
On Fri, Jan 22, 2021 at 1:45 AM Michael Weiser michael.weiser@gmx.de wrote:
Do you think it makes sense to try and adjust the code to work with the BE layout natively and have a full 128-bit reverse after ldr-like loads on LE instead (considering that 99.999% of aarch64 users will run LE)?
If you don't have a use-case, we can suspend big-endian support for the GCM optimization on aarch64 until we get a request or use case, or maybe until aarch64_be gets more support from the main distributions in the future.
+1. At minimum, someone needs to produce an image to load on a commodity board. If there are no images for a common board then there's no demand in the market. There's no reason to jump through hoops, like qemu, to solve a problem that does not exist.
Jeff
Hi Mamone, Jeff,
sorry for the duplication, used the wrong sender address for the list again.
On Fri, Jan 22, 2021 at 07:07:46PM -0500, Jeffrey Walton wrote:
Do you think it makes sense to try and adjust the code to work with the BE layout natively and have a full 128-bit reverse after ldr-like loads on LE instead (considering that 99.999% of aarch64 users will run LE)?
If you don't have a use-case, we can suspend big-endian support for the GCM optimization on aarch64 until we get a request or use case, or maybe until aarch64_be gets more support from the main distributions in the future.
I was actually referring to the performance hit for the overwhelming number of users of a possible "mostly natively BE with quite some overhead for converting back and forth on LE" approach compared to "mostly-LE with slight adjustments for BE" as it (seemingly) started out.
But today's session really cleared up for me that it isn't so much LE vs. BE but just vector element order. What endianness remains is just the given interface to the rest of the nettle code being defined as BE. With the last patch from my previous email all this seemed to fall into place nicely.
+1. At minimum, someone needs to produce an image to load on a commodity board. If there are no images for a common board then there's no demand in the market. There's no reason to jump through hoops, like qemu, to solve a problem that does not exist.
My use-case has always been "because I can" and I really appreciate everyone's indulgence so far. So yes, by all means, focus on producing LE asm for arm/aarch64 and I'll either dig into that for BE support or just disable the asm routines on my BE boards.
I also sometimes wonder who is actually producing all this nicely working armeb and aarch64_be support in Qemu and the Linux kernel when there are apparently no users. I can understand that it's 90 or 95% very good programming, where endianness handling for PowerPC or MIPS just also happens to work for BE arm. But the other 5% must come from somewhere. So there must be some demand by someone, but it's certainly very obscure. ;)
On Tue, Jan 5, 2021 at 5:52 PM Maamoun TK maamoun.tk@googlemail.com wrote:
On Tue, Jan 5, 2021 at 3:23 PM Niels Möller nisse@lysator.liu.se wrote:
I wonder which assembly files we should use if target host is aarch64, but ABI=32? I guess the arm/v6/ code can be used unconditionally. Can we also use arm/neon/ code unconditionally?
The reference manual says
Armv8 can support the following levels of support for Advanced SIMD and floating-point instructions:
Full SIMD and floating-point support without exception trapping.
Full SIMD and floating-point support with exception trapping.
No floating-point or SIMD support. This option is licensed only for implementations targeting specialized markets.
As far as I understand, that means Neon should be always available, in both 32-bit and 64-bit mode.
I'll investigate how we can build the existing NEON implementations on 64-bit systems.
I spent some time investigating and testing; it looks like aarch64 gcc cannot handle 32-bit assembly code currently. In order to build 32-bit arm binaries on 64-bit systems, one has to use the 'gcc-arm-linux-gnueabi' or 'gcc-arm-linux-gnueabihf' toolchains. I went through the options available in aarch64 gcc (https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html) but none of them allow us to use 32-bit assembly code; even '-mabi=ilp32' doesn't do that, as I get the same errors with or without it. I'm afraid we need to re-write the 32-bit assembly code in 64-bit form in order to get those optimizations enabled in 64-bit arm binaries.
regards, Mamone
On Tue, Jan 5, 2021 at 5:52 PM Maamoun TK maamoun.tk@googlemail.com wrote:
On Tue, Jan 5, 2021 at 3:23 PM Niels Möller nisse@lysator.liu.se wrote:
I've made a new branch "arm64" with the configure changes. If you think that looks ok, can you add your new ghash code on top of that?
Great. I'll add the ghash code to the branch once I finish the big-endian support.
(It would be good to also get S390x into the ci system, before adding s390x-specific assembly. I hope that should be easy to do with the same cross setup as for arm, arm64, mips, etc).
This is not possible, since qemu doesn't support the cipher functions: it implements subcode 0 (query) without the actual encipher/decipher operations. Take a look here:
https://git.qemu.org/?p=qemu.git;a=commit;h=be2b567018d987591647935a7c9648e9...
I had a talk with David Edelsohn about this issue and concluded that there is no support for the cipher functions in qemu and it's unlikely to happen anytime soon. However, I updated the testutils to cover the s390x-specific assembly so the patch can easily be tested manually by executing 'make check'. I have also tested every aspect of this patch to make sure everything will go well once it's merged.
I wonder which assembly files we should use if target host is aarch64,
but ABI=32? I guess the arm/v6/ code can be used unconditionally. Can we also use arm/neon/ code unconditionally?
The reference manual says
Armv8 can support the following levels of support for Advanced SIMD and floating-point instructions:
Full SIMD and floating-point support without exception trapping.
Full SIMD and floating-point support with exception trapping.
No floating-point or SIMD support. This option is licensed only for implementations targeting specialized markets.
As far as I understand, that means Neon should be always available, in both 32-bit and 64-bit mode.
I'll investigate how we can build the existing NEON implementations on 64-bit systems.
regards, Mamone