This patch optimizes the SHA1 compress function for the arm64 architecture by taking advantage of the SHA-1 instructions of the Armv8 Crypto Extensions. The SHA-1 instructions are:
  SHA1C:   SHA1 hash update (choose)
  SHA1H:   SHA1 fixed rotate
  SHA1M:   SHA1 hash update (majority)
  SHA1P:   SHA1 hash update (parity)
  SHA1SU0: SHA1 schedule update 0
  SHA1SU1: SHA1 schedule update 1
The patch is based on sha1-arm.c (ARMv8 SHA extensions using C intrinsics) from the repository https://github.com/noloader/SHA-Intrinsics by Jeffrey Walton.
The patch passes the testsuite of the nettle library and the benchmark numbers are considerably improved, but the overall performance of the sha1 hash function does not surpass the corresponding openssl numbers.
Benchmark on the gcc117 instance of CFarm before applying the patch:
  Algorithm     mode           Mbyte/s
  sha1          update          214.16
  openssl sha1  update          849.44
  hmac-sha1     64 bytes         61.69
  hmac-sha1     256 bytes       131.50
  hmac-sha1     1024 bytes      185.20
  hmac-sha1     4096 bytes      204.55
  hmac-sha1     single msg      210.97
Benchmark on the gcc117 instance of CFarm after applying the patch:
  Algorithm     mode           Mbyte/s
  sha1          update          795.57
  openssl sha1  update          849.25
  hmac-sha1     64 bytes        167.65
  hmac-sha1     256 bytes       408.24
  hmac-sha1     1024 bytes      636.68
  hmac-sha1     4096 bytes      739.42
  hmac-sha1     single msg      775.89
--- arm64/crypto/sha1-compress.asm | 245 +++++++++++++++++++++++++++++++++++++++++ arm64/machine.m4 | 7 ++ 2 files changed, 252 insertions(+) create mode 100644 arm64/crypto/sha1-compress.asm
diff --git a/arm64/crypto/sha1-compress.asm b/arm64/crypto/sha1-compress.asm new file mode 100644 index 00000000..bb3f1d35 --- /dev/null +++ b/arm64/crypto/sha1-compress.asm @@ -0,0 +1,245 @@ +C arm64/crypto/sha1-compress.asm + +ifelse(` + Copyright (C) 2021 Mamone Tarsha + + Based on sha1-arm.c - ARMv8 SHA extensions using C intrinsics of + repository https://github.com/noloader/SHA-Intrinsics + sha1-arm.c is written and placed in public domain by Jeffrey Walton, + based on code from ARM, and by Johannes Schneiders, Skip + Hovsmith and Barry O'Rourke for the mbedTLS project. + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+')
+
+.file "sha1-compress.asm"
+.arch armv8-a+crypto
+
+.text
+
+C Register usage:
+
+define(`STATE', `x0')
+define(`INPUT', `x1')
+
+define(`CONST0', `v0')
+define(`CONST1', `v1')
+define(`CONST2', `v2')
+define(`CONST3', `v3')
+define(`MSG0', `v4')
+define(`MSG1', `v5')
+define(`MSG2', `v6')
+define(`MSG3', `v7')
+define(`ABCD', `v16')
+define(`ABCD_SAVED', `v17')
+define(`E0', `v18')
+define(`E0_SAVED', `v19')
+define(`E1', `v20')
+define(`TMP0', `v21')
+define(`TMP1', `v22')
+
+C void nettle_sha1_compress(uint32_t *state, const uint8_t *input)
+C Folds the 64-byte block at INPUT into the five-word state at STATE.
+PROLOGUE(nettle_sha1_compress)
+    C Build the four SHA-1 round constants and splat each across a vector
+    mov            w2,#0x7999
+    movk           w2,#0x5A82,lsl #16
+    dup            CONST0.4s,w2
+    mov            w2,#0xEBA1
+    movk           w2,#0x6ED9,lsl #16
+    dup            CONST1.4s,w2
+    mov            w2,#0xBCDC
+    movk           w2,#0x8F1B,lsl #16
+    dup            CONST2.4s,w2
+    mov            w2,#0xC1D6
+    movk           w2,#0xCA62,lsl #16
+    dup            CONST3.4s,w2
+
+    C Load state: words 0-3 into ABCD and word 4 into lane 0 of a zeroed E0
+    add            x2,STATE,#16
+    movi           E0.4s,#0
+    ld1            {ABCD.4s},[STATE]
+    ld1            {E0.s}[0],[x2]
+
+    C Keep a copy of the incoming state for the final addition
+    mov            ABCD_SAVED.16b,ABCD.16b
+    mov            E0_SAVED.16b,E0.16b
+
+    C Load the 64-byte message block
+    ld1            {MSG0.16b,MSG1.16b,MSG2.16b,MSG3.16b},[INPUT]
+
+    C Byte-swap each 32-bit word: SHA-1 message words are big-endian
+    rev32          MSG0.16b,MSG0.16b
+    rev32          MSG1.16b,MSG1.16b
+    rev32          MSG2.16b,MSG2.16b
+    rev32          MSG3.16b,MSG3.16b
+
+    add            TMP0.4s,MSG0.4s,CONST0.4s
+    add            TMP1.4s,MSG1.4s,CONST0.4s
+
+    C Rounds 0-3 (rounds 0-19 use sha1c: choose)
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1c          QFP(ABCD),SFP(E0),TMP0.4s
+    add            TMP0.4s,MSG2.4s,CONST0.4s
+    sha1su0        MSG0.4s,MSG1.4s,MSG2.4s
+
+    C Rounds 4-7
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1c          QFP(ABCD),SFP(E1),TMP1.4s
+    add            TMP1.4s,MSG3.4s,CONST0.4s
+    sha1su1        MSG0.4s,MSG3.4s
+    sha1su0        MSG1.4s,MSG2.4s,MSG3.4s
+
+    C Rounds 8-11
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1c          QFP(ABCD),SFP(E0),TMP0.4s
+    add            TMP0.4s,MSG0.4s,CONST0.4s
+    sha1su1        MSG1.4s,MSG0.4s
+    sha1su0        MSG2.4s,MSG3.4s,MSG0.4s
+
+    C Rounds 12-15
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1c          QFP(ABCD),SFP(E1),TMP1.4s
+    add            TMP1.4s,MSG1.4s,CONST1.4s
+    sha1su1        MSG2.4s,MSG1.4s
+    sha1su0        MSG3.4s,MSG0.4s,MSG1.4s
+
+    C Rounds 16-19
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1c          QFP(ABCD),SFP(E0),TMP0.4s
+    add            TMP0.4s,MSG2.4s,CONST1.4s
+    sha1su1        MSG3.4s,MSG2.4s
+    sha1su0        MSG0.4s,MSG1.4s,MSG2.4s
+
+    C Rounds 20-23 (rounds 20-39 use sha1p: parity)
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E1),TMP1.4s
+    add            TMP1.4s,MSG3.4s,CONST1.4s
+    sha1su1        MSG0.4s,MSG3.4s
+    sha1su0        MSG1.4s,MSG2.4s,MSG3.4s
+
+    C Rounds 24-27
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E0),TMP0.4s
+    add            TMP0.4s,MSG0.4s,CONST1.4s
+    sha1su1        MSG1.4s,MSG0.4s
+    sha1su0        MSG2.4s,MSG3.4s,MSG0.4s
+
+    C Rounds 28-31
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E1),TMP1.4s
+    add            TMP1.4s,MSG1.4s,CONST1.4s
+    sha1su1        MSG2.4s,MSG1.4s
+    sha1su0        MSG3.4s,MSG0.4s,MSG1.4s
+
+    C Rounds 32-35
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E0),TMP0.4s
+    add            TMP0.4s,MSG2.4s,CONST2.4s
+    sha1su1        MSG3.4s,MSG2.4s
+    sha1su0        MSG0.4s,MSG1.4s,MSG2.4s
+
+    C Rounds 36-39
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E1),TMP1.4s
+    add            TMP1.4s,MSG3.4s,CONST2.4s
+    sha1su1        MSG0.4s,MSG3.4s
+    sha1su0        MSG1.4s,MSG2.4s,MSG3.4s
+
+    C Rounds 40-43 (rounds 40-59 use sha1m: majority)
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1m          QFP(ABCD),SFP(E0),TMP0.4s
+    add            TMP0.4s,MSG0.4s,CONST2.4s
+    sha1su1        MSG1.4s,MSG0.4s
+    sha1su0        MSG2.4s,MSG3.4s,MSG0.4s
+
+    C Rounds 44-47
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1m          QFP(ABCD),SFP(E1),TMP1.4s
+    add            TMP1.4s,MSG1.4s,CONST2.4s
+    sha1su1        MSG2.4s,MSG1.4s
+    sha1su0        MSG3.4s,MSG0.4s,MSG1.4s
+
+    C Rounds 48-51
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1m          QFP(ABCD),SFP(E0),TMP0.4s
+    add            TMP0.4s,MSG2.4s,CONST2.4s
+    sha1su1        MSG3.4s,MSG2.4s
+    sha1su0        MSG0.4s,MSG1.4s,MSG2.4s
+
+    C Rounds 52-55
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1m          QFP(ABCD),SFP(E1),TMP1.4s
+    add            TMP1.4s,MSG3.4s,CONST3.4s
+    sha1su1        MSG0.4s,MSG3.4s
+    sha1su0        MSG1.4s,MSG2.4s,MSG3.4s
+
+    C Rounds 56-59
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1m          QFP(ABCD),SFP(E0),TMP0.4s
+    add            TMP0.4s,MSG0.4s,CONST3.4s
+    sha1su1        MSG1.4s,MSG0.4s
+    sha1su0        MSG2.4s,MSG3.4s,MSG0.4s
+
+    C Rounds 60-63 (rounds 60-79 use sha1p: parity)
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E1),TMP1.4s
+    add            TMP1.4s,MSG1.4s,CONST3.4s
+    sha1su1        MSG2.4s,MSG1.4s
+    sha1su0        MSG3.4s,MSG0.4s,MSG1.4s
+
+    C Rounds 64-67
+    C MSG3 is the last schedule vector consumed, so no further sha1su0 is started
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E0),TMP0.4s
+    add            TMP0.4s,MSG2.4s,CONST3.4s
+    sha1su1        MSG3.4s,MSG2.4s
+
+    C Rounds 68-71
+    C Schedule is complete; only the constant add for rounds 76-79 remains
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E1),TMP1.4s
+    add            TMP1.4s,MSG3.4s,CONST3.4s
+
+    C Rounds 72-75
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E0),TMP0.4s
+
+    C Rounds 76-79
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E1),TMP1.4s
+
+    C Add the saved input state to the result of the 80 rounds
+    add            E0.4s,E0.4s,E0_SAVED.4s
+    add            ABCD.4s,ABCD.4s,ABCD_SAVED.4s
+
+    C Store state; x2 still holds STATE+16 from the load above
+    st1            {ABCD.4s},[STATE]
+    st1            {E0.s}[0],[x2]
+
+    ret
+EPILOGUE(nettle_sha1_compress)
diff --git a/arm64/machine.m4 b/arm64/machine.m4
index e69de29b..7df62bcc 100644
--- a/arm64/machine.m4
+++ b/arm64/machine.m4
@@ -0,0 +1,7 @@
+C Map a vector register name vN to its 32-bit scalar name sN
+C SFP(VR)
+define(`SFP',``s'substr($1,1,len($1))')
+
+C Map a vector register name vN to its 128-bit scalar name qN
+C QFP(VR)
+define(`QFP',``q'substr($1,1,len($1))')