This patch optimizes the SHA1 compress function for the arm64 architecture by taking advantage of the SHA-1 instructions of the Armv8 crypto extension. The SHA-1 instructions:
SHA1C: SHA1 hash update (choose)
SHA1H: SHA1 fixed rotate
SHA1M: SHA1 hash update (majority)
SHA1P: SHA1 hash update (parity)
SHA1SU0: SHA1 schedule update 0
SHA1SU1: SHA1 schedule update 1
The patch is based on sha1-arm.c - ARMv8 SHA extensions using C intrinsics - of the repository https://github.com/noloader/SHA-Intrinsics by Jeffrey Walton.
The patch passes the nettle testsuite, and the benchmark numbers improve considerably, but the performance of the overall sha1 hash function still doesn't surpass the corresponding OpenSSL numbers.
Benchmark on gcc117 instance of CFarm before applying the patch:

Algorithm     mode        Mbyte/s
sha1          update       214.16
openssl sha1  update       849.44
hmac-sha1     64 bytes      61.69
hmac-sha1     256 bytes    131.50
hmac-sha1     1024 bytes   185.20
hmac-sha1     4096 bytes   204.55
hmac-sha1     single msg   210.97
Benchmark on gcc117 instance of CFarm after applying the patch:

Algorithm     mode        Mbyte/s
sha1          update       795.57
openssl sha1  update       849.25
hmac-sha1     64 bytes     167.65
hmac-sha1     256 bytes    408.24
hmac-sha1     1024 bytes   636.68
hmac-sha1     4096 bytes   739.42
hmac-sha1     single msg   775.89
---
 arm64/crypto/sha1-compress.asm | 245 +++++++++++++++++++++++++++++++++++++++++
 arm64/machine.m4               |   7 ++
 2 files changed, 252 insertions(+)
 create mode 100644 arm64/crypto/sha1-compress.asm
diff --git a/arm64/crypto/sha1-compress.asm b/arm64/crypto/sha1-compress.asm
new file mode 100644
index 00000000..bb3f1d35
--- /dev/null
+++ b/arm64/crypto/sha1-compress.asm
@@ -0,0 +1,245 @@
+C arm64/crypto/sha1-compress.asm
+
+ifelse(`
+   Copyright (C) 2021 Mamone Tarsha
+
+   Based on sha1-arm.c - ARMv8 SHA extensions using C intrinsics of
+   repository https://github.com/noloader/SHA-Intrinsics
+   sha1-arm.c is written and placed in public domain by Jeffrey Walton,
+   based on code from ARM, and by Johannes Schneiders, Skip
+   Hovsmith and Barry O'Rourke for the mbedTLS project.
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+.file "sha1-compress.asm"
+.arch armv8-a+crypto
+
+.text
+
+C Register usage:
+
+define(`STATE', `x0')
+define(`INPUT', `x1')
+
+define(`CONST0', `v0')
+define(`CONST1', `v1')
+define(`CONST2', `v2')
+define(`CONST3', `v3')
+define(`MSG0', `v4')
+define(`MSG1', `v5')
+define(`MSG2', `v6')
+define(`MSG3', `v7')
+define(`ABCD', `v16')
+define(`ABCD_SAVED', `v17')
+define(`E0', `v18')
+define(`E0_SAVED', `v19')
+define(`E1', `v20')
+define(`TMP0', `v21')
+define(`TMP1', `v22')
+
+C void nettle_sha1_compress(uint32_t *state, const uint8_t *input)
+
+PROLOGUE(nettle_sha1_compress)
+    C Initialize constants
+    mov w2,#0x7999
+    movk w2,#0x5A82,lsl #16
+    dup CONST0.4s,w2
+    mov w2,#0xEBA1
+    movk w2,#0x6ED9,lsl #16
+    dup CONST1.4s,w2
+    mov w2,#0xBCDC
+    movk w2,#0x8F1B,lsl #16
+    dup CONST2.4s,w2
+    mov w2,#0xC1D6
+    movk w2,#0xCA62,lsl #16
+    dup CONST3.4s,w2
+
+    C Load state
+    add x2,STATE,#16
+    movi E0.4s,#0
+    ld1 {ABCD.4s},[STATE]
+    ld1 {E0.s}[0],[x2]
+
+    C Save state
+    mov ABCD_SAVED.16b,ABCD.16b
+    mov E0_SAVED.16b,E0.16b
+
+    C Load message
+    ld1 {MSG0.16b,MSG1.16b,MSG2.16b,MSG3.16b},[INPUT]
+
+    C Reverse for little endian
+    rev32 MSG0.16b,MSG0.16b
+    rev32 MSG1.16b,MSG1.16b
+    rev32 MSG2.16b,MSG2.16b
+    rev32 MSG3.16b,MSG3.16b
+
+    add TMP0.4s,MSG0.4s,CONST0.4s
+    add TMP1.4s,MSG1.4s,CONST0.4s
+
+    C Rounds 0-3
+    sha1h SFP(E1),SFP(ABCD)
+    sha1c QFP(ABCD),SFP(E0),TMP0.4s
+    add TMP0.4s,MSG2.4s,CONST0.4s
+    sha1su0 MSG0.4s,MSG1.4s,MSG2.4s
+
+    C Rounds 4-7
+    sha1h SFP(E0),SFP(ABCD)
+    sha1c QFP(ABCD),SFP(E1),TMP1.4s
+    add TMP1.4s,MSG3.4s,CONST0.4s
+    sha1su1 MSG0.4s,MSG3.4s
+    sha1su0 MSG1.4s,MSG2.4s,MSG3.4s
+
+    C Rounds 8-11
+    sha1h SFP(E1),SFP(ABCD)
+    sha1c QFP(ABCD),SFP(E0),TMP0.4s
+    add TMP0.4s,MSG0.4s,CONST0.4s
+    sha1su1 MSG1.4s,MSG0.4s
+    sha1su0 MSG2.4s,MSG3.4s,MSG0.4s
+
+    C Rounds 12-15
+    sha1h SFP(E0),SFP(ABCD)
+    sha1c QFP(ABCD),SFP(E1),TMP1.4s
+    add TMP1.4s,MSG1.4s,CONST1.4s
+    sha1su1 MSG2.4s,MSG1.4s
+    sha1su0 MSG3.4s,MSG0.4s,MSG1.4s
+
+    C Rounds 16-19
+    sha1h SFP(E1),SFP(ABCD)
+    sha1c QFP(ABCD),SFP(E0),TMP0.4s
+    add TMP0.4s,MSG2.4s,CONST1.4s
+    sha1su1 MSG3.4s,MSG2.4s
+    sha1su0 MSG0.4s,MSG1.4s,MSG2.4s
+
+    C Rounds 20-23
+    sha1h SFP(E0),SFP(ABCD)
+    sha1p QFP(ABCD),SFP(E1),TMP1.4s
+    add TMP1.4s,MSG3.4s,CONST1.4s
+    sha1su1 MSG0.4s,MSG3.4s
+    sha1su0 MSG1.4s,MSG2.4s,MSG3.4s
+
+    C Rounds 24-27
+    sha1h SFP(E1),SFP(ABCD)
+    sha1p QFP(ABCD),SFP(E0),TMP0.4s
+    add TMP0.4s,MSG0.4s,CONST1.4s
+    sha1su1 MSG1.4s,MSG0.4s
+    sha1su0 MSG2.4s,MSG3.4s,MSG0.4s
+
+    C Rounds 28-31
+    sha1h SFP(E0),SFP(ABCD)
+    sha1p QFP(ABCD),SFP(E1),TMP1.4s
+    add TMP1.4s,MSG1.4s,CONST1.4s
+    sha1su1 MSG2.4s,MSG1.4s
+    sha1su0 MSG3.4s,MSG0.4s,MSG1.4s
+
+    C Rounds 32-35
+    sha1h SFP(E1),SFP(ABCD)
+    sha1p QFP(ABCD),SFP(E0),TMP0.4s
+    add TMP0.4s,MSG2.4s,CONST2.4s
+    sha1su1 MSG3.4s,MSG2.4s
+    sha1su0 MSG0.4s,MSG1.4s,MSG2.4s
+
+    C Rounds 36-39
+    sha1h SFP(E0),SFP(ABCD)
+    sha1p QFP(ABCD),SFP(E1),TMP1.4s
+    add TMP1.4s,MSG3.4s,CONST2.4s
+    sha1su1 MSG0.4s,MSG3.4s
+    sha1su0 MSG1.4s,MSG2.4s,MSG3.4s
+
+    C Rounds 40-43
+    sha1h SFP(E1),SFP(ABCD)
+    sha1m QFP(ABCD),SFP(E0),TMP0.4s
+    add TMP0.4s,MSG0.4s,CONST2.4s
+    sha1su1 MSG1.4s,MSG0.4s
+    sha1su0 MSG2.4s,MSG3.4s,MSG0.4s
+
+    C Rounds 44-47
+    sha1h SFP(E0),SFP(ABCD)
+    sha1m QFP(ABCD),SFP(E1),TMP1.4s
+    add TMP1.4s,MSG1.4s,CONST2.4s
+    sha1su1 MSG2.4s,MSG1.4s
+    sha1su0 MSG3.4s,MSG0.4s,MSG1.4s
+
+    C Rounds 48-51
+    sha1h SFP(E1),SFP(ABCD)
+    sha1m QFP(ABCD),SFP(E0),TMP0.4s
+    add TMP0.4s,MSG2.4s,CONST2.4s
+    sha1su1 MSG3.4s,MSG2.4s
+    sha1su0 MSG0.4s,MSG1.4s,MSG2.4s
+
+    C Rounds 52-55
+    sha1h SFP(E0),SFP(ABCD)
+    sha1m QFP(ABCD),SFP(E1),TMP1.4s
+    add TMP1.4s,MSG3.4s,CONST3.4s
+    sha1su1 MSG0.4s,MSG3.4s
+    sha1su0 MSG1.4s,MSG2.4s,MSG3.4s
+
+    C Rounds 56-59
+    sha1h SFP(E1),SFP(ABCD)
+    sha1m QFP(ABCD),SFP(E0),TMP0.4s
+    add TMP0.4s,MSG0.4s,CONST3.4s
+    sha1su1 MSG1.4s,MSG0.4s
+    sha1su0 MSG2.4s,MSG3.4s,MSG0.4s
+
+    C Rounds 60-63
+    sha1h SFP(E0),SFP(ABCD)
+    sha1p QFP(ABCD),SFP(E1),TMP1.4s
+    add TMP1.4s,MSG1.4s,CONST3.4s
+    sha1su1 MSG2.4s,MSG1.4s
+    sha1su0 MSG3.4s,MSG0.4s,MSG1.4s
+
+    C Rounds 64-67
+    sha1h SFP(E1),SFP(ABCD)
+    sha1p QFP(ABCD),SFP(E0),TMP0.4s
+    add TMP0.4s,MSG2.4s,CONST3.4s
+    sha1su1 MSG3.4s,MSG2.4s
+    sha1su0 MSG0.4s,MSG1.4s,MSG2.4s
+
+    C Rounds 68-71
+    sha1h SFP(E0),SFP(ABCD)
+    sha1p QFP(ABCD),SFP(E1),TMP1.4s
+    add TMP1.4s,MSG3.4s,CONST3.4s
+    sha1su1 MSG0.4s,MSG3.4s
+
+    C Rounds 72-75
+    sha1h SFP(E1),SFP(ABCD)
+    sha1p QFP(ABCD),SFP(E0),TMP0.4s
+
+    C Rounds 76-79
+    sha1h SFP(E0),SFP(ABCD)
+    sha1p QFP(ABCD),SFP(E1),TMP1.4s
+
+    C Combine state
+    add E0.4s,E0.4s,E0_SAVED.4s
+    add ABCD.4s,ABCD.4s,ABCD_SAVED.4s
+
+    C Store state
+    st1 {ABCD.4s},[STATE]
+    st1 {E0.s}[0],[x2]
+
+    ret
+EPILOGUE(nettle_sha1_compress)
diff --git a/arm64/machine.m4 b/arm64/machine.m4
index e69de29b..7df62bcc 100644
--- a/arm64/machine.m4
+++ b/arm64/machine.m4
@@ -0,0 +1,7 @@
+C Get 32-bit floating-point register from vector register
+C SFP(VR)
+define(`SFP',``s'substr($1,1,len($1))')
+
+C Get 128-bit floating-point register from vector register
+C QFP(VR)
+define(`QFP',``q'substr($1,1,len($1))')
Hi Maamoun, you added the standard GNU license to these files, but the repository you mention has no license at all (red flag), and looking at the code it points to, on which these files are "based", the current license is ASL 2.0.
How much are your patches "based" on the SHA-Intrinsic source?
The perf improvement is great btw.
Simo.
On Fri, 2021-05-14 at 08:45 +0300, Maamoun TK wrote:
On Fri, May 14, 2021 at 3:42 PM Simo Sorce simo@redhat.com wrote:
you added the standard GNU License to these files, but the repository you mention has no license at all (red flag), and looking at the code it points to on which these files are "based" the current license is ASL 2.0
How much are your patches "based" on the SHA-Intrinsic source?
I've written the patch from scratch while keeping in mind how to use the SHA-1 instructions of Arm64 crypto extension from sha1-arm.c in Jeffrey's repository. I've Cced Jeffrey in the main message to get his input on this patch.
regards, Maamoun
Maamoun TK maamoun.tk@googlemail.com writes:
I've written the patch from scratch while keeping in mind how to use the SHA-1 instructions of Arm64 crypto extension from sha1-arm.c in Jeffrey's repository.
If that is the case, avoid phrases like "based on" which are easily misread as implying it's a derived work in the copyright sense.
Regards, /Niels
On Thu, May 20, 2021 at 9:16 PM Niels Möller nisse@lysator.liu.se wrote:
Maamoun TK maamoun.tk@googlemail.com writes:
I've written the patch from scratch while keeping in mind how to use the SHA-1 instructions of Arm64 crypto extension from sha1-arm.c in Jeffrey's repository.
If that is the case, avoid phrases like "based on" which are easily misread as implying it's a derived work in the copyright sense.
I'll just mention it in the README file then.
regards, Mamone
I've mentioned it in the README file.
---
 arm64/README                   | 7 +++++++
 arm64/crypto/sha1-compress.asm | 6 ------
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/arm64/README b/arm64/README
index d2745d57..206bb773 100644
--- a/arm64/README
+++ b/arm64/README
@@ -83,5 +83,12 @@
 particular care must be taken if the loaded data is then to be regarded as
 elements of e.g. a doubleword vector. Indicies may appear reversed on
 big-endian systems (because they are).

+Hardware-accelerated SHA Instructions
+
+The SHA optimized cores are implemented using SHA hashing instructions added
+to AArch64 in crypto extensions. The repository [3] illustrates using those
+instructions for optimizing SHA hashing functions.
+
 [1] https://github.com/ARM-software/abi-aa/releases/download/2020Q4/aapcs64.pdf
 [2] https://llvm.org/docs/BigEndianNEON.html
+[3] https://github.com/noloader/SHA-Intrinsics

diff --git a/arm64/crypto/sha1-compress.asm b/arm64/crypto/sha1-compress.asm
index bb3f1d35..f261c93d 100644
--- a/arm64/crypto/sha1-compress.asm
+++ b/arm64/crypto/sha1-compress.asm
@@ -3,12 +3,6 @@ C arm64/crypto/sha1-compress.asm
 ifelse(`
    Copyright (C) 2021 Mamone Tarsha

-   Based on sha1-arm.c - ARMv8 SHA extensions using C intrinsics of
-   repository https://github.com/noloader/SHA-Intrinsics
-   sha1-arm.c is written and placed in public domain by Jeffrey Walton,
-   based on code from ARM, and by Johannes Schneiders, Skip
-   Hovsmith and Barry O'Rourke for the mbedTLS project.
-
    This file is part of GNU Nettle.

    GNU Nettle is free software: you can redistribute it and/or
Maamoun TK maamoun.tk@googlemail.com writes:
Looks pretty good. A few comments and questions below.
This patch optimizes SHA1 compress function for arm64 architecture by taking advantage of SHA-1 instructions of Armv8 crypto extension. The SHA-1 instructions: SHA1C: SHA1 hash update (choose) SHA1H: SHA1 fixed rotate SHA1M: SHA1 hash update (majority) SHA1P: SHA1 hash update (parity) SHA1SU0: SHA1 schedule update 0 SHA1SU1: SHA1 schedule update 1
Can you add this brief summary of instructions as a comment in the asm file?
Benchmark on gcc117 instance of CFarm before applying the patch:

Algorithm     mode    Mbyte/s
sha1          update   214.16
openssl sha1  update   849.44
Benchmark on gcc117 instance of CFarm after applying the patch:

Algorithm     mode    Mbyte/s
sha1          update   795.57
openssl sha1  update   849.25
Great speedup! Any idea why openssl is still slightly faster?
+define(`TMP0', `v21') +define(`TMP1', `v22')
Not sure I understand how these are used, but it looks like the TMP variables are used in some way for the message expansion state? E.g., TMP0 is assigned in the code for rounds 0-3, and this value is used in the code for rounds 8-11. Other implementations don't need extra state for this, but just modify the 16 message words in-place.
It would be nice to either make the TMP registers more temporary (i.e., no round depends on the value in these registers from previous rounds) and keep needed state only on the MSG variables. Or rename them to give a better hint on how they're used.
+C void nettle_sha1_compress(uint32_t *state, const uint8_t *input)
+PROLOGUE(nettle_sha1_compress)
- C Initialize constants
- mov w2,#0x7999
- movk w2,#0x5A82,lsl #16
- dup CONST0.4s,w2
- mov w2,#0xEBA1
- movk w2,#0x6ED9,lsl #16
- dup CONST1.4s,w2
- mov w2,#0xBCDC
- movk w2,#0x8F1B,lsl #16
- dup CONST2.4s,w2
- mov w2,#0xC1D6
- movk w2,#0xCA62,lsl #16
- dup CONST3.4s,w2
Maybe it would be clearer or more efficient to load these from memory? Not sure if there's a nice and concise way to load the four 32-bit values into a 128-bit register, and then copy/duplicate them into the four const registers.
- C Load message
- ld1 {MSG0.16b,MSG1.16b,MSG2.16b,MSG3.16b},[INPUT]
- C Reverse for little endian
- rev32 MSG0.16b,MSG0.16b
- rev32 MSG1.16b,MSG1.16b
- rev32 MSG2.16b,MSG2.16b
- rev32 MSG3.16b,MSG3.16b
How does this work on big-endian? The ld1 with .16b is endian-neutral (according to the README), that means we always get the wrong order, and then we do unconditional byteswapping? Maybe add a comment. Not sure if it's worth the effort to make it work differently (ld1 .4w on big-endian)? It's going to be a pretty small fraction of the per-block processing.
Regards, /Niels
On Sun, May 23, 2021 at 10:52 AM Niels Möller nisse@lysator.liu.se wrote:
Maamoun TK maamoun.tk@googlemail.com writes:
This patch optimizes SHA1 compress function for arm64 architecture by taking advantage of SHA-1 instructions of Armv8 crypto extension. The SHA-1 instructions: SHA1C: SHA1 hash update (choose) SHA1H: SHA1 fixed rotate SHA1M: SHA1 hash update (majority) SHA1P: SHA1 hash update (parity) SHA1SU0: SHA1 schedule update 0 SHA1SU1: SHA1 schedule update 1
Can you add this brief summary of instructions as a comment in the asm file?
Done! I'll attach a patch at the end of the message that performs slightly better as well.
Algorithm     mode        Mbyte/s
sha1          update       800.80
openssl sha1  update       849.17
hmac-sha1     64 bytes     166.10
hmac-sha1     256 bytes    409.24
hmac-sha1     1024 bytes   636.98
hmac-sha1     4096 bytes   739.20
hmac-sha1     single msg   775.67
Benchmark on gcc117 instance of CFarm before applying the patch:
Algorithm     mode    Mbyte/s
sha1          update   214.16
openssl sha1  update   849.44
Benchmark on gcc117 instance of CFarm after applying the patch:

Algorithm     mode    Mbyte/s
sha1          update   795.57
openssl sha1  update   849.25
Great speedup! Any idea why openssl is still slightly faster?
Sure, the OpenSSL implementation uses a loop inside the SHA1 update function, which eliminates the constant initialization and state loading/storing for each block, while nettle does that for every block iteration.
+define(`TMP0', `v21') +define(`TMP1', `v22')
Not sure I understand how these are used, but it looks like the TMP variables are used in some way for the message expansion state? E.g., TMP0 assigned in the code for rounds 0-3, and this value used in the code for rounds 8-11. Other implementations don't need extra state for this, but just modifies the 16 message words in-place.
Modifying the message words in-place will change the value used by the 'sha1su0' and 'sha1su1' instructions. According to the ARM® A64 Instruction Set Architecture:

SHA1SU0 <Vd>.4S, <Vn>.4S, <Vm>.4S
    <Vd> Is the name of the SIMD&FP source and destination register
    ...

SHA1SU1 <Vd>.4S, <Vn>.4S
    <Vd> Is the name of the SIMD&FP source and destination register
    ...

So using a TMP variable is necessary here. I can't think of any replacement; let me know how the other implementations handle this case.
It would be nice to either make the TMP registers more temporary (i.e.,
no round depends on the value in these registers from previous rounds) and keep needed state only on the MSG variables. Or rename them to give a better hint on how they're used.
Done! Yields a slight performance increase, btw.
+C void nettle_sha1_compress(uint32_t *state, const uint8_t *input)
+PROLOGUE(nettle_sha1_compress)
- C Initialize constants
- mov w2,#0x7999
- movk w2,#0x5A82,lsl #16
- dup CONST0.4s,w2
- mov w2,#0xEBA1
- movk w2,#0x6ED9,lsl #16
- dup CONST1.4s,w2
- mov w2,#0xBCDC
- movk w2,#0x8F1B,lsl #16
- dup CONST2.4s,w2
- mov w2,#0xC1D6
- movk w2,#0xCA62,lsl #16
- dup CONST3.4s,w2
Maybe it would be clearer or more efficient to load these from memory? Not sure if there's a nice and concise way to load the four 32-bit values into a 128-bit register, and then copy/duplicate them into the four const registers.
We can load all the constants (including duplicate values) from memory with one instruction. The issue is how to get the data address properly for every supported ABI! So far I've seen solutions with multiple paths for different ABIs, which I don't really like; the easiest solution is to define the data in the .text section to make sure the address is near enough to be loaded with certain instruction. Do you want to do that?
- C Load message
- ld1 {MSG0.16b,MSG1.16b,MSG2.16b,MSG3.16b},[INPUT]
- C Reverse for little endian
- rev32 MSG0.16b,MSG0.16b
- rev32 MSG1.16b,MSG1.16b
- rev32 MSG2.16b,MSG2.16b
- rev32 MSG3.16b,MSG3.16b
How does this work on big-endian? The ld1 with .16b is endian-neutral (according to the README), that means we always get the wrong order, and then we do unconditional byteswapping? Maybe add a comment. Not sure if it's worth the effort to make it work differently (ld1 .4w on big-endian)? It's going to be a pretty small fraction of the per-block processing.
We had an intensive discussion about that in the GCM patch. The short story: this patch should work well in both endianness modes. However, it's not the same way we used in the GCM patch to handle the endianness variation; to follow the GCM patch's way we can do:
C Load message
ld1 {MSG0.4s,MSG1.4s,MSG2.4s,MSG3.4s},[INPUT]

C Reverse for little endian
IF_LE(`
    rev32 MSG0.16b,MSG0.16b
    rev32 MSG1.16b,MSG1.16b
    rev32 MSG2.16b,MSG2.16b
    rev32 MSG3.16b,MSG3.16b
')
regards, Mamone
---
 arm64/crypto/sha1-compress.asm | 93 +++++++++++++++++++++++-------------------
 1 file changed, 50 insertions(+), 43 deletions(-)

diff --git a/arm64/crypto/sha1-compress.asm b/arm64/crypto/sha1-compress.asm
index f261c93d..9f7d9f37 100644
--- a/arm64/crypto/sha1-compress.asm
+++ b/arm64/crypto/sha1-compress.asm
@@ -30,6 +30,15 @@ ifelse(`
    not, see http://www.gnu.org/licenses/.
 ')

+C This implementation uses the SHA-1 instructions of Armv8 crypto
+C extension.
+C SHA1C: SHA1 hash update (choose)
+C SHA1H: SHA1 fixed rotate
+C SHA1M: SHA1 hash update (majority)
+C SHA1P: SHA1 hash update (parity)
+C SHA1SU0: SHA1 schedule update 0
+C SHA1SU1: SHA1 schedule update 1
+
 .file "sha1-compress.asm"
 .arch armv8-a+crypto

@@ -53,8 +62,7 @@ define(`ABCD_SAVED', `v17')
 define(`E0', `v18')
 define(`E0_SAVED', `v19')
 define(`E1', `v20')
-define(`TMP0', `v21')
-define(`TMP1', `v22')
+define(`TMP', `v21')

 C void nettle_sha1_compress(uint32_t *state, const uint8_t *input)

@@ -92,140 +100,139 @@ PROLOGUE(nettle_sha1_compress)
     rev32 MSG2.16b,MSG2.16b
     rev32 MSG3.16b,MSG3.16b

-    add TMP0.4s,MSG0.4s,CONST0.4s
-    add TMP1.4s,MSG1.4s,CONST0.4s
-
     C Rounds 0-3
+    add TMP.4s,MSG0.4s,CONST0.4s
     sha1h SFP(E1),SFP(ABCD)
-    sha1c QFP(ABCD),SFP(E0),TMP0.4s
-    add TMP0.4s,MSG2.4s,CONST0.4s
+    sha1c QFP(ABCD),SFP(E0),TMP.4s
     sha1su0 MSG0.4s,MSG1.4s,MSG2.4s

     C Rounds 4-7
+    add TMP.4s,MSG1.4s,CONST0.4s
     sha1h SFP(E0),SFP(ABCD)
-    sha1c QFP(ABCD),SFP(E1),TMP1.4s
-    add TMP1.4s,MSG3.4s,CONST0.4s
+    sha1c QFP(ABCD),SFP(E1),TMP.4s
     sha1su1 MSG0.4s,MSG3.4s
     sha1su0 MSG1.4s,MSG2.4s,MSG3.4s

     C Rounds 8-11
+    add TMP.4s,MSG2.4s,CONST0.4s
     sha1h SFP(E1),SFP(ABCD)
-    sha1c QFP(ABCD),SFP(E0),TMP0.4s
-    add TMP0.4s,MSG0.4s,CONST0.4s
+    sha1c QFP(ABCD),SFP(E0),TMP.4s
     sha1su1 MSG1.4s,MSG0.4s
     sha1su0 MSG2.4s,MSG3.4s,MSG0.4s

     C Rounds 12-15
+    add TMP.4s,MSG3.4s,CONST0.4s
     sha1h SFP(E0),SFP(ABCD)
-    sha1c QFP(ABCD),SFP(E1),TMP1.4s
-    add TMP1.4s,MSG1.4s,CONST1.4s
+    sha1c QFP(ABCD),SFP(E1),TMP.4s
     sha1su1 MSG2.4s,MSG1.4s
     sha1su0 MSG3.4s,MSG0.4s,MSG1.4s

     C Rounds 16-19
+    add TMP.4s,MSG0.4s,CONST0.4s
     sha1h SFP(E1),SFP(ABCD)
-    sha1c QFP(ABCD),SFP(E0),TMP0.4s
-    add TMP0.4s,MSG2.4s,CONST1.4s
+    sha1c QFP(ABCD),SFP(E0),TMP.4s
     sha1su1 MSG3.4s,MSG2.4s
     sha1su0 MSG0.4s,MSG1.4s,MSG2.4s

     C Rounds 20-23
+    add TMP.4s,MSG1.4s,CONST1.4s
     sha1h SFP(E0),SFP(ABCD)
-    sha1p QFP(ABCD),SFP(E1),TMP1.4s
-    add TMP1.4s,MSG3.4s,CONST1.4s
+    sha1p QFP(ABCD),SFP(E1),TMP.4s
     sha1su1 MSG0.4s,MSG3.4s
     sha1su0 MSG1.4s,MSG2.4s,MSG3.4s

     C Rounds 24-27
+    add TMP.4s,MSG2.4s,CONST1.4s
     sha1h SFP(E1),SFP(ABCD)
-    sha1p QFP(ABCD),SFP(E0),TMP0.4s
-    add TMP0.4s,MSG0.4s,CONST1.4s
+    sha1p QFP(ABCD),SFP(E0),TMP.4s
     sha1su1 MSG1.4s,MSG0.4s
     sha1su0 MSG2.4s,MSG3.4s,MSG0.4s

     C Rounds 28-31
+    add TMP.4s,MSG3.4s,CONST1.4s
     sha1h SFP(E0),SFP(ABCD)
-    sha1p QFP(ABCD),SFP(E1),TMP1.4s
-    add TMP1.4s,MSG1.4s,CONST1.4s
+    sha1p QFP(ABCD),SFP(E1),TMP.4s
     sha1su1 MSG2.4s,MSG1.4s
     sha1su0 MSG3.4s,MSG0.4s,MSG1.4s

     C Rounds 32-35
+    add TMP.4s,MSG0.4s,CONST1.4s
     sha1h SFP(E1),SFP(ABCD)
-    sha1p QFP(ABCD),SFP(E0),TMP0.4s
-    add TMP0.4s,MSG2.4s,CONST2.4s
+    sha1p QFP(ABCD),SFP(E0),TMP.4s
     sha1su1 MSG3.4s,MSG2.4s
     sha1su0 MSG0.4s,MSG1.4s,MSG2.4s

     C Rounds 36-39
+    add TMP.4s,MSG1.4s,CONST1.4s
     sha1h SFP(E0),SFP(ABCD)
-    sha1p QFP(ABCD),SFP(E1),TMP1.4s
-    add TMP1.4s,MSG3.4s,CONST2.4s
+    sha1p QFP(ABCD),SFP(E1),TMP.4s
     sha1su1 MSG0.4s,MSG3.4s
     sha1su0 MSG1.4s,MSG2.4s,MSG3.4s

     C Rounds 40-43
+    add TMP.4s,MSG2.4s,CONST2.4s
     sha1h SFP(E1),SFP(ABCD)
-    sha1m QFP(ABCD),SFP(E0),TMP0.4s
-    add TMP0.4s,MSG0.4s,CONST2.4s
+    sha1m QFP(ABCD),SFP(E0),TMP.4s
     sha1su1 MSG1.4s,MSG0.4s
     sha1su0 MSG2.4s,MSG3.4s,MSG0.4s

     C Rounds 44-47
+    add TMP.4s,MSG3.4s,CONST2.4s
     sha1h SFP(E0),SFP(ABCD)
-    sha1m QFP(ABCD),SFP(E1),TMP1.4s
-    add TMP1.4s,MSG1.4s,CONST2.4s
+    sha1m QFP(ABCD),SFP(E1),TMP.4s
     sha1su1 MSG2.4s,MSG1.4s
     sha1su0 MSG3.4s,MSG0.4s,MSG1.4s

     C Rounds 48-51
+    add TMP.4s,MSG0.4s,CONST2.4s
     sha1h SFP(E1),SFP(ABCD)
-    sha1m QFP(ABCD),SFP(E0),TMP0.4s
-    add TMP0.4s,MSG2.4s,CONST2.4s
+    sha1m QFP(ABCD),SFP(E0),TMP.4s
     sha1su1 MSG3.4s,MSG2.4s
     sha1su0 MSG0.4s,MSG1.4s,MSG2.4s

     C Rounds 52-55
+    add TMP.4s,MSG1.4s,CONST2.4s
     sha1h SFP(E0),SFP(ABCD)
-    sha1m QFP(ABCD),SFP(E1),TMP1.4s
-    add TMP1.4s,MSG3.4s,CONST3.4s
+    sha1m QFP(ABCD),SFP(E1),TMP.4s
     sha1su1 MSG0.4s,MSG3.4s
     sha1su0 MSG1.4s,MSG2.4s,MSG3.4s

     C Rounds 56-59
+    add TMP.4s,MSG2.4s,CONST2.4s
     sha1h SFP(E1),SFP(ABCD)
-    sha1m QFP(ABCD),SFP(E0),TMP0.4s
-    add TMP0.4s,MSG0.4s,CONST3.4s
+    sha1m QFP(ABCD),SFP(E0),TMP.4s
     sha1su1 MSG1.4s,MSG0.4s
     sha1su0 MSG2.4s,MSG3.4s,MSG0.4s

     C Rounds 60-63
+    add TMP.4s,MSG3.4s,CONST3.4s
     sha1h SFP(E0),SFP(ABCD)
-    sha1p QFP(ABCD),SFP(E1),TMP1.4s
-    add TMP1.4s,MSG1.4s,CONST3.4s
+    sha1p QFP(ABCD),SFP(E1),TMP.4s
     sha1su1 MSG2.4s,MSG1.4s
     sha1su0 MSG3.4s,MSG0.4s,MSG1.4s

     C Rounds 64-67
+    add TMP.4s,MSG0.4s,CONST3.4s
     sha1h SFP(E1),SFP(ABCD)
-    sha1p QFP(ABCD),SFP(E0),TMP0.4s
-    add TMP0.4s,MSG2.4s,CONST3.4s
+    sha1p QFP(ABCD),SFP(E0),TMP.4s
     sha1su1 MSG3.4s,MSG2.4s
     sha1su0 MSG0.4s,MSG1.4s,MSG2.4s

     C Rounds 68-71
+    add TMP.4s,MSG1.4s,CONST3.4s
     sha1h SFP(E0),SFP(ABCD)
-    sha1p QFP(ABCD),SFP(E1),TMP1.4s
-    add TMP1.4s,MSG3.4s,CONST3.4s
+    sha1p QFP(ABCD),SFP(E1),TMP.4s
     sha1su1 MSG0.4s,MSG3.4s

     C Rounds 72-75
+    add TMP.4s,MSG2.4s,CONST3.4s
     sha1h SFP(E1),SFP(ABCD)
-    sha1p QFP(ABCD),SFP(E0),TMP0.4s
+    sha1p QFP(ABCD),SFP(E0),TMP.4s

     C Rounds 76-79
+    add TMP.4s,MSG3.4s,CONST3.4s
     sha1h SFP(E0),SFP(ABCD)
-    sha1p QFP(ABCD),SFP(E1),TMP1.4s
+    sha1p QFP(ABCD),SFP(E1),TMP.4s

     C Combine state
     add E0.4s,E0.4s,E0_SAVED.4s
Maamoun TK maamoun.tk@googlemail.com writes:
Great speedup! Any idea why openssl is still slightly faster?
Sure, the OpenSSL implementation uses a loop inside the SHA1 update function, which eliminates the constant initialization and state loading/storing for each block, while nettle does that for every block iteration.
I see, that can make a difference if the actual compressing is fast enough.
Modifying the message words in-place will change the value used by the 'sha1su0' and 'sha1su1' instructions. According to the ARM® A64 Instruction Set Architecture:

SHA1SU0 <Vd>.4S, <Vn>.4S, <Vm>.4S
    <Vd> Is the name of the SIMD&FP source and destination register
    ...

SHA1SU1 <Vd>.4S, <Vn>.4S
    <Vd> Is the name of the SIMD&FP source and destination register
    ...

So using a TMP variable is necessary here. I can't think of any replacement; let me know how the other implementations handle this case.
I'm afraid I have no concrete suggestion, I would need to read up on the aarch64 instructions. Implementations that do only a single round at a time (e.g., the C implementation) uses a 16-word circular buffer for the message expansion state, and updates one of the words per round. If I read the latest patch correctly, you also don't keep any state besides the MSGx registers?
It would be nice to either make the TMP registers more temporary (i.e.,
no round depends on the value in these registers from previous rounds) and keep needed state only on the MSG variables. Or rename them to give a better hint on how they're used.
Done! Yield a slight performance increase btw.
Nice.
We can load all the constants (including duplicate values) from memory with one instruction. The issue is how to get the data address properly for every supported abi!
the easiest solution is to define the data in the .text section to make sure the address is near enough to be loaded with certain instruction. Do you want to do that?
Using .text would probably work, even if it's in some sense more correct to put the constants in rodata segment. But let's leave as is for now.
We have an intensive discussion about that in the GCM patch. The short story, this patch should work well for both endianness modes.
Sounds good.
I've pushed the combined patches to a branch arm64-sha1. Would you like to update the fat build setup, before merging to master?
Regards, /Niels
On Tue, Jun 1, 2021 at 8:02 PM Niels Möller nisse@lysator.liu.se wrote:
Maamoun TK maamoun.tk@googlemail.com writes: If I read the latest patch correctly, you also don't keep any state besides the MSGx registers?
Right, everything is done within the context of each round in the latest patch; nothing is kept beyond it.
the easiest solution is to define the data in the .text section to make sure the address is near enough to be loaded with certain instruction. Do you want to do that?
Using .text would probably work, even if it's in some sense more correct to put the constants in rodata segment. But let's leave as is for now.
I agree, it's acceptable to keep it as is for this case. I'm a little concerned about handling the constant initialization in more complicated cases; we'll discuss it when we get there.
I've pushed the combined patches to a branch arm64-sha1. Would you like to update the fat build setup, before merging to master?
Sure, I just need some time as I have some stuff to sort out before doing the fat build for this patch.
regards, Mamone
nettle-bugs@lists.lysator.liu.se