Re: [Aarch64] Optimize SHA1 Compress

14 May 2021

Hi Maamoun,
you added the standard GNU License to these files, but the repository
you mention has no license at all (red flag), and looking at the code
it points to, on which these files are "based", the current license is
ASL 2.0.
How much are your patches "based" on the SHA-Intrinsic source?
The perf improvement is great btw.
Simo.
On Fri, 2021-05-14 at 08:45 +0300, Maamoun TK wrote:
...
This patch optimizes SHA1 compress function for arm64 architecture by
taking advantage of SHA-1 instructions of Armv8 crypto extension.
The SHA-1 instructions:
SHA1C: SHA1 hash update (choose)
SHA1H: SHA1 fixed rotate
SHA1M: SHA1 hash update (majority)
SHA1P: SHA1 hash update (parity)
SHA1SU0: SHA1 schedule update 0
SHA1SU1: SHA1 schedule update 1
The patch is based on sha1-arm.c - ARMv8 SHA extensions using C intrinsics
of repository https://github.com/noloader/SHA-Intrinsics by Jeffrey Walton.
The patch passes the testsuite of nettle library and the benchmark numbers
are considerably improved but the performance of the overall sha1 hash
function doesn't surpass the corresponding openssl numbers.
Benchmark on gcc117 instance of CFarm before applying the patch:
         Algorithm         mode        Mbyte/s
         sha1               update       214.16
         openssl sha1  update       849.44
         hmac-sha1     64 bytes     61.69
         hmac-sha1     256 bytes   131.50
         hmac-sha1    1024 bytes  185.20
         hmac-sha1    4096 bytes  204.55
         hmac-sha1    single msg  210.97
Benchmark on gcc117 instance of CFarm after applying the patch:
         Algorithm         mode        Mbyte/s
         sha1                update       795.57
         openssl sha1   update       849.25
         hmac-sha1      64 bytes    167.65
         hmac-sha1      256 bytes   408.24
         hmac-sha1     1024 bytes  636.68
         hmac-sha1     4096 bytes  739.42
         hmac-sha1     single msg  775.89

arm64/crypto/sha1-compress.asm | 245
+++++++++++++++++++++++++++++++++++++++++
 arm64/machine.m4               |   7 ++
 2 files changed, 252 insertions(+)
 create mode 100644 arm64/crypto/sha1-compress.asm

diff --git a/arm64/crypto/sha1-compress.asm b/arm64/crypto/sha1-compress.asm
new file mode 100644
index 00000000..bb3f1d35
--- /dev/null
+++ b/arm64/crypto/sha1-compress.asm
@@ -0,0 +1,245 @@
+C arm64/crypto/sha1-compress.asm



+ifelse(`

Copyright (C) 2021 Mamone Tarsha

Based on sha1-arm.c - ARMv8 SHA extensions using C intrinsics of
repository https://github.com/noloader/SHA-Intrinsics
sha1-arm.c is written and placed in public domain by Jeffrey Walton,
based on code from ARM, and by Johannes Schneiders, Skip
Hovsmith and Barry O'Rourke for the mbedTLS project.

This file is part of GNU Nettle.

GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:

* the GNU Lesser General Public License as published by the Free


  Software Foundation; either version 3 of the License, or (at your


  option) any later version.



or

* the GNU General Public License as published by the Free


  Software Foundation; either version 2 of the License, or (at your


  option) any later version.



or both in parallel, as here.

GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program.  If
not, see http://www.gnu.org/licenses/.

+')



+.file "sha1-compress.asm"
+C Select the Armv8-A target with the crypto extension; the SHA-1
+C instructions used in this file belong to that extension.
+.arch armv8-a+crypto



+.text



+C Register usage:
+C Every vector register named below lies in v0-v7 or v16-v22;
+C none of them is in the v8-v15 range.



+C Pointer arguments of nettle_sha1_compress
+define(`STATE', `x0')
+define(`INPUT', `x1')



+C CONST0-CONST3: the four round constants, one per vector
+define(`CONST0', `v0')
+define(`CONST1', `v1')
+define(`CONST2', `v2')
+define(`CONST3', `v3')
+C MSG0-MSG3: the four 16-byte quarters of the input block, later
+C overwritten in turn by the message schedule updates
+define(`MSG0', `v4')
+define(`MSG1', `v5')
+define(`MSG2', `v6')
+define(`MSG3', `v7')
+C ABCD holds state words 0-3 and E0 holds state word 4 in lane 0;
+C the _SAVED copies keep the incoming state for the final addition
+define(`ABCD', `v16')
+define(`ABCD_SAVED', `v17')
+define(`E0', `v18')
+define(`E0_SAVED', `v19')
+C E1 alternates with E0 between round groups; TMP0 and TMP1 hold
+C message words with the round constant already added
+define(`E1', `v20')
+define(`TMP0', `v21')
+define(`TMP1', `v22')



+C void nettle_sha1_compress(uint32_t *state, const uint8_t *input)



+PROLOGUE(nettle_sha1_compress)

C Runs the 80 SHA-1 rounds over the 64-byte block at INPUT and adds
C the result into the five 32-bit state words at STATE, in place.
C
C In:     STATE (x0) = five state words, INPUT (x1) = 64-byte block
C Out:    updated state stored back to STATE
C Clobb:  x2, v0-v7, v16-v22 (all caller-saved under AAPCS64)
C
C Rounds are processed four at a time. E0 and E1 alternate as the
C scalar input of consecutive groups, and TMP0 and TMP1 alternate as
C the message-plus-constant input, each prepared two groups before
C the group that consumes it. The instruction order inside a group
C matters: the add that refills TMP0 or TMP1 must follow the hash
C update that reads it.

C Initialize constants
C Each 32-bit round constant is built in w2 with mov and movk, then
C replicated into all four lanes.
mov            w2,#0x7999
movk           w2,#0x5A82,lsl #16
dup            CONST0.4s,w2
mov            w2,#0xEBA1
movk           w2,#0x6ED9,lsl #16
dup            CONST1.4s,w2
mov            w2,#0xBCDC
movk           w2,#0x8F1B,lsl #16
dup            CONST2.4s,w2
mov            w2,#0xC1D6
movk           w2,#0xCA62,lsl #16
dup            CONST3.4s,w2

C Load state
C x2 = STATE+16 addresses the fifth state word. It is reused by the
C final store, so x2 must stay untouched from here on. E0 is zeroed
C first so that only lane 0 carries data.
add            x2,STATE,#16
movi           E0.4s,#0
ld1            {ABCD.4s},[STATE]
ld1            {E0.s}[0],[x2]

C Save state
mov            ABCD_SAVED.16b,ABCD.16b
mov            E0_SAVED.16b,E0.16b

C Load message
ld1            {MSG0.16b,MSG1.16b,MSG2.16b,MSG3.16b},[INPUT]

C Reverse for little endian
C Byte-swap every 32-bit word of the block.
rev32          MSG0.16b,MSG0.16b
rev32          MSG1.16b,MSG1.16b
rev32          MSG2.16b,MSG2.16b
rev32          MSG3.16b,MSG3.16b

C Pre-add the first round constant for rounds 0-3 and 4-7
add            TMP0.4s,MSG0.4s,CONST0.4s
add            TMP1.4s,MSG1.4s,CONST0.4s

C Rounds 0-3
C sha1h reads lane 0 of ABCD before the hash update overwrites ABCD;
C its result is the scalar input of the next group.
C Rounds 0-19 use sha1c (choose).
sha1h          SFP(E1),SFP(ABCD)
sha1c          QFP(ABCD),SFP(E0),TMP0.4s
add            TMP0.4s,MSG2.4s,CONST0.4s
sha1su0        MSG0.4s,MSG1.4s,MSG2.4s

C Rounds 4-7
sha1h          SFP(E0),SFP(ABCD)
sha1c          QFP(ABCD),SFP(E1),TMP1.4s
add            TMP1.4s,MSG3.4s,CONST0.4s
sha1su1        MSG0.4s,MSG3.4s
sha1su0        MSG1.4s,MSG2.4s,MSG3.4s

C Rounds 8-11
sha1h          SFP(E1),SFP(ABCD)
sha1c          QFP(ABCD),SFP(E0),TMP0.4s
add            TMP0.4s,MSG0.4s,CONST0.4s
sha1su1        MSG1.4s,MSG0.4s
sha1su0        MSG2.4s,MSG3.4s,MSG0.4s

C Rounds 12-15
C TMP1 prepared here feeds rounds 20-23, hence the second constant.
sha1h          SFP(E0),SFP(ABCD)
sha1c          QFP(ABCD),SFP(E1),TMP1.4s
add            TMP1.4s,MSG1.4s,CONST1.4s
sha1su1        MSG2.4s,MSG1.4s
sha1su0        MSG3.4s,MSG0.4s,MSG1.4s

C Rounds 16-19
sha1h          SFP(E1),SFP(ABCD)
sha1c          QFP(ABCD),SFP(E0),TMP0.4s
add            TMP0.4s,MSG2.4s,CONST1.4s
sha1su1        MSG3.4s,MSG2.4s
sha1su0        MSG0.4s,MSG1.4s,MSG2.4s

C Rounds 20-23
C Rounds 20-39 use sha1p (parity).
sha1h          SFP(E0),SFP(ABCD)
sha1p          QFP(ABCD),SFP(E1),TMP1.4s
add            TMP1.4s,MSG3.4s,CONST1.4s
sha1su1        MSG0.4s,MSG3.4s
sha1su0        MSG1.4s,MSG2.4s,MSG3.4s

C Rounds 24-27
sha1h          SFP(E1),SFP(ABCD)
sha1p          QFP(ABCD),SFP(E0),TMP0.4s
add            TMP0.4s,MSG0.4s,CONST1.4s
sha1su1        MSG1.4s,MSG0.4s
sha1su0        MSG2.4s,MSG3.4s,MSG0.4s

C Rounds 28-31
sha1h          SFP(E0),SFP(ABCD)
sha1p          QFP(ABCD),SFP(E1),TMP1.4s
add            TMP1.4s,MSG1.4s,CONST1.4s
sha1su1        MSG2.4s,MSG1.4s
sha1su0        MSG3.4s,MSG0.4s,MSG1.4s

C Rounds 32-35
C TMP0 prepared here feeds rounds 40-43, hence the third constant.
sha1h          SFP(E1),SFP(ABCD)
sha1p          QFP(ABCD),SFP(E0),TMP0.4s
add            TMP0.4s,MSG2.4s,CONST2.4s
sha1su1        MSG3.4s,MSG2.4s
sha1su0        MSG0.4s,MSG1.4s,MSG2.4s

C Rounds 36-39
sha1h          SFP(E0),SFP(ABCD)
sha1p          QFP(ABCD),SFP(E1),TMP1.4s
add            TMP1.4s,MSG3.4s,CONST2.4s
sha1su1        MSG0.4s,MSG3.4s
sha1su0        MSG1.4s,MSG2.4s,MSG3.4s

C Rounds 40-43
C Rounds 40-59 use sha1m (majority).
sha1h          SFP(E1),SFP(ABCD)
sha1m          QFP(ABCD),SFP(E0),TMP0.4s
add            TMP0.4s,MSG0.4s,CONST2.4s
sha1su1        MSG1.4s,MSG0.4s
sha1su0        MSG2.4s,MSG3.4s,MSG0.4s

C Rounds 44-47
sha1h          SFP(E0),SFP(ABCD)
sha1m          QFP(ABCD),SFP(E1),TMP1.4s
add            TMP1.4s,MSG1.4s,CONST2.4s
sha1su1        MSG2.4s,MSG1.4s
sha1su0        MSG3.4s,MSG0.4s,MSG1.4s

C Rounds 48-51
sha1h          SFP(E1),SFP(ABCD)
sha1m          QFP(ABCD),SFP(E0),TMP0.4s
add            TMP0.4s,MSG2.4s,CONST2.4s
sha1su1        MSG3.4s,MSG2.4s
sha1su0        MSG0.4s,MSG1.4s,MSG2.4s

C Rounds 52-55
C TMP1 prepared here feeds rounds 60-63, hence the fourth constant.
sha1h          SFP(E0),SFP(ABCD)
sha1m          QFP(ABCD),SFP(E1),TMP1.4s
add            TMP1.4s,MSG3.4s,CONST3.4s
sha1su1        MSG0.4s,MSG3.4s
sha1su0        MSG1.4s,MSG2.4s,MSG3.4s

C Rounds 56-59
sha1h          SFP(E1),SFP(ABCD)
sha1m          QFP(ABCD),SFP(E0),TMP0.4s
add            TMP0.4s,MSG0.4s,CONST3.4s
sha1su1        MSG1.4s,MSG0.4s
sha1su0        MSG2.4s,MSG3.4s,MSG0.4s

C Rounds 60-63
C Rounds 60-79 use sha1p (parity) again.
sha1h          SFP(E0),SFP(ABCD)
sha1p          QFP(ABCD),SFP(E1),TMP1.4s
add            TMP1.4s,MSG1.4s,CONST3.4s
sha1su1        MSG2.4s,MSG1.4s
sha1su0        MSG3.4s,MSG0.4s,MSG1.4s

C Rounds 64-67
C MSG3 completed here is the last schedule vector any round reads
C (it feeds rounds 76-79), so no further sha1su0 is started.
sha1h          SFP(E1),SFP(ABCD)
sha1p          QFP(ABCD),SFP(E0),TMP0.4s
add            TMP0.4s,MSG2.4s,CONST3.4s
sha1su1        MSG3.4s,MSG2.4s

C Rounds 68-71
C Only the final TMP1 is prepared; the schedule is complete.
sha1h          SFP(E0),SFP(ABCD)
sha1p          QFP(ABCD),SFP(E1),TMP1.4s
add            TMP1.4s,MSG3.4s,CONST3.4s

C Rounds 72-75
sha1h          SFP(E1),SFP(ABCD)
sha1p          QFP(ABCD),SFP(E0),TMP0.4s

C Rounds 76-79
sha1h          SFP(E0),SFP(ABCD)
sha1p          QFP(ABCD),SFP(E1),TMP1.4s

C Combine state
C Add the state saved on entry to the result of the 80 rounds.
add            E0.4s,E0.4s,E0_SAVED.4s
add            ABCD.4s,ABCD.4s,ABCD_SAVED.4s

C Store state
C x2 still holds STATE+16 from the load above.
st1            {ABCD.4s},[STATE]
st1            {E0.s}[0],[x2]

ret

+EPILOGUE(nettle_sha1_compress)
diff --git a/arm64/machine.m4 b/arm64/machine.m4
index e69de29b..7df62bcc 100644
--- a/arm64/machine.m4
+++ b/arm64/machine.m4
@@ -0,0 +1,7 @@
+C SFP(VR)
+C Map a vector register name vN to the 32-bit scalar register name sN
+define(`SFP',``s'substr($1,1)')



+C QFP(VR)
+C Map a vector register name vN to the 128-bit scalar register name qN
+define(`QFP',``q'substr($1,1)')
-- 
Simo Sorce
RHEL Crypto Team
Red Hat, Inc





    

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

2008

2007

2006

2005

2004

2003

2002

Re: [Aarch64] Optimize SHA1 Compress