I made a merge request in the main repo that enables optimized GHASH on the AArch64 architecture. The implementation is based on Niels Möller's enhanced algorithm, which yields a larger speedup on AArch64 than the Intel algorithm. Combining the Karatsuba algorithm with the Intel algorithm added overhead, so I dropped its benchmark result. I'll attach the Intel algorithm implementation here since it's not included in the MR.
Here is the benchmark result on AArch64:
*------------------------------------------------------------------*
| C version   | Intel algorithm | Niels Möller's enhanced algorithm |
| 208 Mbyte/s | 2781 Mbyte/s    | 3255 Mbyte/s                      |
*------------------------------------------------------------------*
This is a +17% performance boost for the enhanced algorithm over the Intel algorithm. It's not as impressive as the PowerPC benchmark result, but it does a great job on AArch64 considering that the PMULL instruction doesn't offer the assistance that vpmsumd does by multiplying four polynomials and then summing.
I tried to avoid using the stack in this implementation, so I wrote a procedure that handles leftovers using only registers. Let me know if there's room for improvement here.
regards, Mamone
C arm/v8/gcm-hash.asm
ifelse(` Copyright (C) 2020 Niels Möller and Mamone Tarsha This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version.
or
* the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received copies of the GNU General Public License and the GNU Lesser General Public License along with this program. If not, see http://www.gnu.org/licenses/. ')
C gcm_set_key() assigns H value in the middle element of the table
C H_Idx is used below as a 16-byte element index when H is loaded
define(`H_Idx', `128')

.file "gcm-hash.asm"

C NOTE(review): the code below uses pmull/pmull2 on 64-bit polynomials,
C which presumably needs the crypto extension enabled in the assembler.
C No .arch directive is visible here, so confirm the build supplies it.
.text
C void gcm_init_key (union gcm_block *table)
C This function populates the gcm table with the following layout
C *******************************************************************************
C | H1M = (H1 div x⁶⁴)||((H1 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴               |
C | H1L = (H1 mod x⁶⁴)||(((H1 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H1 div x⁶⁴) |
C |                                                                             |
C | H2M = (H2 div x⁶⁴)||((H2 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴               |
C | H2L = (H2 mod x⁶⁴)||(((H2 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H2 div x⁶⁴) |
C |                                                                             |
C | H3M = (H3 div x⁶⁴)||((H3 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴               |
C | H3L = (H3 mod x⁶⁴)||(((H3 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H3 div x⁶⁴) |
C |                                                                             |
C | H4M = (H4 div x⁶⁴)||((H4 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴               |
C | H4L = (H4 mod x⁶⁴)||(((H4 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H4 div x⁶⁴) |
C *******************************************************************************
C NOTE(review): the routine below stores each power next to a copy with its two
C doublewords swapped (H, H_t, H2, H2_t, ...). Confirm that the formulas above
C describe this variant and not the table of the enhanced algorithm.
C Register usage for _nettle_gcm_init_key
define(`TABLE', `x0')

C Constants and scratch used while preparing H
define(`ZERO', `v0')
define(`EMSB', `v1')
define(`POLY', `v2')
define(`B', `v3')

C H and the values derived from it. HQ is the q view of H. Every X_t
C holds X with its two doublewords swapped.
define(`H', `v4')
define(`HQ', `q4')
define(`H_t', `v5')
define(`H2', `v6')
define(`H2_t', `v7')
define(`H3', `v16')
define(`H3_t', `v17')
define(`H4', `v18')
define(`H4_t', `v19')

C Partial products and reduction scratch
define(`H_m', `v20')
define(`H_m1', `v21')
define(`H_h', `v22')
define(`H_l', `v23')
define(`RP', `v24')
define(`Ml', `v25')
define(`Mh', `v26')

C KEY_PMUL(a, b, b_t)
C Four 64x64 carry-less products, where b_t is b with its doublewords
C swapped:
C   H_m  = a.d[0] x b_t.d[0]      H_m1 = a.d[1] x b_t.d[1]
C   H_h  = a.d[0] x b.d[0]        H_l  = a.d[1] x b.d[1]
define(`KEY_PMUL', `
    pmull          H_m.1q,$1.1d,$3.1d
    pmull2         H_m1.1q,$1.2d,$3.2d
    pmull          H_h.1q,$1.1d,$2.1d
    pmull2         H_l.1q,$1.2d,$2.2d
')

C KEY_REDUCE(out_t, out)
C Combines the two middle products, splits the sum across H_l and H_h,
C then folds H_l into H_h with two multiplications by POLY. The folded
C value is written to out_t and its doubleword-swapped copy to out.
C Clobbers H_m, H_h, H_l, RP, Ml and Mh.
define(`KEY_REDUCE', `
    eor            H_m.16b,H_m.16b,H_m1.16b
    pmull          RP.1q,H_l.1d,POLY.1d
    ext            Ml.16b,ZERO.16b,H_m.16b,#8
    ext            Mh.16b,H_m.16b,ZERO.16b,#8
    ext            RP.16b,RP.16b,RP.16b,#8
    eor            H_l.16b,H_l.16b,Ml.16b
    eor            H_h.16b,H_h.16b,Mh.16b
    eor            H_l.16b,H_l.16b,RP.16b
    pmull2         RP.1q,H_l.2d,POLY.2d
    eor            H_h.16b,H_h.16b,H_l.16b
    eor            $1.16b,H_h.16b,RP.16b
    ext            $2.16b,$1.16b,$1.16b,#8
')

PROLOGUE(_nettle_gcm_init_key)
    C Constants: ZERO = 0, POLY.d[0] = 0xC200000000000000, POLY.d[1] = 1
    eor            ZERO.16b,ZERO.16b,ZERO.16b
    mov            x9,#0xC200000000000000
    mov            x10,#1
    mov            POLY.d[0],x9
    mov            POLY.d[1],x10

    C Load H from element H_Idx of the table. Byte 0 as stored is
    C replicated into EMSB before the bytes of each doubleword are reversed.
    ldr            HQ,[TABLE,#16*H_Idx]
    dup            EMSB.16b,H.b[0]
    rev64          H.16b,H.16b

    C EMSB = POLY when the top bit of that byte is set, otherwise zero
    sshr           EMSB.16b,EMSB.16b,#7
    and            EMSB.16b,EMSB.16b,POLY.16b

    C Shift the 128-bit H left by one bit: the bit leaving d[1] is moved
    C into bit 0 of d[0], then EMSB is folded in
    ushr           B.2d,H.2d,#63
    and            B.16b,B.16b,POLY.16b
    ext            B.16b,B.16b,B.16b,#8
    shl            H.2d,H.2d,#1
    orr            H.16b,H.16b,B.16b
    eor            H.16b,H.16b,EMSB.16b

    C From here on POLY carries 0xC200000000000000 in both doublewords
    dup            POLY.2d,POLY.d[0]
    ext            H_t.16b,H.16b,H.16b,#8

    C H2 from H times H
    KEY_PMUL(H, H, H_t)
    KEY_REDUCE(H2_t, H2)

    C First four table elements; TABLE advances by 64 bytes
    st1            {H.16b,H_t.16b,H2.16b,H2_t.16b},[TABLE],#64

    C H3 from H times H2
    KEY_PMUL(H, H2, H2_t)
    KEY_REDUCE(H3_t, H3)

    C H4 from H2 times H2
    KEY_PMUL(H2, H2, H2_t)
    KEY_REDUCE(H4_t, H4)

    C Next four table elements
    st1            {H3.16b,H3_t.16b,H4.16b,H4_t.16b},[TABLE]

    ret
EPILOGUE(_nettle_gcm_init_key)
C Register usage for _nettle_gcm_hash
define(`TABLE', `x0')
define(`X', `x1')
define(`LENGTH', `x2')
define(`DATA', `x3')

C Vector constants
define(`POLY', `v0')
define(`ZERO', `v1')

C Running digest and the message blocks of the current group.
C C0D is the d view of C0.
define(`D', `v2')
define(`C0', `v3')
define(`C0D', `d3')
define(`C1', `v4')
define(`C2', `v5')
define(`C3', `v6')

define(`RP', `v7')

C Table elements as laid out in memory: H, H_t, H2, H2_t, H3, H3_t, H4, H4_t
define(`H', `v16')
define(`H_t', `v17')
define(`H2', `v18')
define(`H2_t', `v19')
define(`H3', `v20')
define(`H3_t', `v21')
define(`H4', `v22')
define(`H4_t', `v23')

C Partial product accumulators
define(`H_m', `v24')
define(`H_m1', `v25')
define(`H_h', `v26')
define(`H_l', `v27')

C Partial products of one further block before they are accumulated
define(`H_m2', `v28')
define(`H_m3', `v29')
define(`H_h2', `v30')
define(`H_l2', `v31')

C Ml and Mh reuse the registers of C1 and C2. They are only written
C during the reduction, after every multiplication of the group is done.
define(`Ml', `v4')
define(`Mh', `v5')

C HASH_PMUL(c, h, h_t)
C Starts a new accumulation with the four 64x64 carry-less products
C of block c and table entry h (h_t being its companion entry):
C   H_m  = c.d[0] x h_t.d[0]      H_m1 = c.d[1] x h_t.d[1]
C   H_h  = c.d[0] x h.d[0]        H_l  = c.d[1] x h.d[1]
define(`HASH_PMUL', `
    pmull          H_m.1q,$1.1d,$3.1d
    pmull2         H_m1.1q,$1.2d,$3.2d
    pmull          H_h.1q,$1.1d,$2.1d
    pmull2         H_l.1q,$1.2d,$2.2d
')

C HASH_PMUL_SUM(c, h, h_t)
C Same four products as HASH_PMUL, computed into H_m2, H_m3, H_h2 and
C H_l2 and then xored into H_m, H_m1, H_h and H_l.
define(`HASH_PMUL_SUM', `
    pmull          H_m2.1q,$1.1d,$3.1d
    pmull2         H_m3.1q,$1.2d,$3.2d
    pmull          H_h2.1q,$1.1d,$2.1d
    pmull2         H_l2.1q,$1.2d,$2.2d
    eor            H_m.16b,H_m.16b,H_m2.16b
    eor            H_m1.16b,H_m1.16b,H_m3.16b
    eor            H_h.16b,H_h.16b,H_h2.16b
    eor            H_l.16b,H_l.16b,H_l2.16b
')

C HASH_REDUCE(out)
C Combines the two middle accumulators, splits the sum across H_l and
C H_h, folds H_l into H_h with two multiplications by POLY and leaves
C the doubleword-swapped result in out.
C Clobbers H_m, H_h, H_l, RP, Ml and Mh.
define(`HASH_REDUCE', `
    eor            H_m.16b,H_m.16b,H_m1.16b
    pmull          RP.1q,H_l.1d,POLY.1d
    ext            Ml.16b,ZERO.16b,H_m.16b,#8
    ext            Mh.16b,H_m.16b,ZERO.16b,#8
    ext            RP.16b,RP.16b,RP.16b,#8
    eor            H_l.16b,H_l.16b,Ml.16b
    eor            H_h.16b,H_h.16b,Mh.16b
    eor            H_l.16b,H_l.16b,RP.16b
    pmull2         RP.1q,H_l.2d,POLY.2d
    eor            H_h.16b,H_h.16b,H_l.16b
    eor            $1.16b,H_h.16b,RP.16b
    ext            $1.16b,$1.16b,$1.16b,#8
')

C void gcm_hash (const struct gcm_key *key, union gcm_block *x,
C                size_t length, const uint8_t *data)
C
C Reads the 16-byte value at x, absorbs length bytes from data in groups
C of 64, 32 and 16 bytes plus a final partial block of 1 to 15 bytes,
C and writes the 16-byte result back to x.
C Uses x7-x10, v0-v7 and v16-v31 as scratch. The stack is not touched.

PROLOGUE(_nettle_gcm_hash)
    C ZERO = 0, POLY = 0xC200000000000000 in both doublewords
    eor            ZERO.16b,ZERO.16b,ZERO.16b
    mov            x10,#0xC200000000000000
    mov            POLY.d[0],x10
    dup            POLY.2d,POLY.d[0]

    C D = value at X with the bytes of each doubleword reversed
    ld1            {D.16b},[X]
    rev64          D.16b,D.16b

    C x10 = number of bytes to process in 64-byte groups
    ands           x10,LENGTH,#-64
    b.eq           L2x

    C All eight table elements are needed for the 4-block loop
    add            x9,TABLE,64
    ld1            {H.16b,H_t.16b,H2.16b,H2_t.16b},[TABLE]
    ld1            {H3.16b,H3_t.16b,H4.16b,H4_t.16b},[x9]

L4x_loop:
    ld1            {C0.16b,C1.16b,C2.16b,C3.16b},[DATA],#64
    rev64          C0.16b,C0.16b
    rev64          C1.16b,C1.16b
    rev64          C2.16b,C2.16b
    rev64          C3.16b,C3.16b

    C The running digest joins the first block of the group
    eor            C0.16b,C0.16b,D.16b

    C C0 pairs with H4, C1 with H3, C2 with H2 and C3 with H
    HASH_PMUL(C1, H3, H3_t)
    HASH_PMUL_SUM(C2, H2, H2_t)
    HASH_PMUL_SUM(C3, H, H_t)
    HASH_PMUL_SUM(C0, H4, H4_t)
    HASH_REDUCE(D)

    subs           x10,x10,64
    b.ne           L4x_loop

    and            LENGTH,LENGTH,#63

L2x:
    C Two whole blocks left
    tst            LENGTH,#-32
    b.eq           L1x

    ld1            {H.16b,H_t.16b,H2.16b,H2_t.16b},[TABLE]

    ld1            {C0.16b,C1.16b},[DATA],#32
    rev64          C0.16b,C0.16b
    rev64          C1.16b,C1.16b

    eor            C0.16b,C0.16b,D.16b

    HASH_PMUL(C1, H, H_t)
    HASH_PMUL_SUM(C0, H2, H2_t)
    HASH_REDUCE(D)

    and            LENGTH,LENGTH,#31

L1x:
    C One whole block left
    tst            LENGTH,#-16
    b.eq           Lmod

    ld1            {H.16b,H_t.16b},[TABLE]

    ld1            {C0.16b},[DATA],#16
    rev64          C0.16b,C0.16b

    eor            C0.16b,C0.16b,D.16b

    HASH_PMUL(C0, H, H_t)
    HASH_REDUCE(D)

Lmod:
    C Partial block: only the low four bits of LENGTH matter from here on
    tst            LENGTH,#15
    b.eq           Ldone

    ld1            {H.16b,H_t.16b},[TABLE]

    C Bit 3 set: the tail starts with an 8-byte chunk, which becomes
    C C0.d[0] while C0.d[1] is cleared
    tbz            LENGTH,3,Lmod_8
    ldr            C0D,[DATA],#8
    rev64          C0.16b,C0.16b
    mov            x10,#0
    mov            C0.d[1],x10
Lmod_8:
    tst            LENGTH,#7
    b.eq           Lmod_8_done

    C Gather the last 1 to 7 bytes into x9, first byte in bits 63..56.
    C x8 is the shift for the next byte, x7 the number of bytes left.
    mov            x9,#0
    mov            x8,#64
    and            x7,LENGTH,#7
Lmod_8_loop:
    C The explicit clear of x10 is kept from the original sequence,
    C the byte load below zero-extends into x10 as well
    mov            x10,#0
    ldrb           w10,[DATA],#1
    sub            x8,x8,#8
    lsl            x10,x10,x8
    orr            x9,x9,x10
    subs           x7,x7,#1
    b.ne           Lmod_8_loop

    C With an 8-byte chunk already in d[0] the gathered bytes go to d[1],
    C otherwise they form d[0] and d[1] is zero
    tbz            LENGTH,3,Lmod_8_load
    mov            C0.d[1],x9
    b              Lmod_8_done
Lmod_8_load:
    mov            x10,#0
    mov            C0.d[0],x9
    mov            C0.d[1],x10
Lmod_8_done:
    eor            C0.16b,C0.16b,D.16b

    HASH_PMUL(C0, H, H_t)
    HASH_REDUCE(D)

Ldone:
    C Restore the stored byte order and write the result back to X
    rev64          D.16b,D.16b
    st1            {D.16b},[X]
    ret
EPILOGUE(_nettle_gcm_hash)