I made this patch handle AES ciphering with the fixed key sizes of 128, 192, and 256 bits; this way I could eliminate reloading the expanded key on every round. Since this technique yields a performance benefit, I'm planning to keep the implementation as is, and in case handling uncommon key sizes becomes mandatory, I can append an additional branch to process message blocks with any key size. What do you think?
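To make that assumption concrete, here is a minimal C sketch (illustrative only, not part of the patch or of nettle's API; the function name is made up) of the key-size-to-round-count mapping the fixed 10/12/14-round dispatch relies on. Anything outside these three cases is what the extra branch would have to cover:

  /* Illustrative only: standard AES key size -> round count mapping. */
  static unsigned
  aes_rounds_for_key_bits (unsigned key_bits)
  {
    switch (key_bits)
      {
      case 128: return 10;  /* AES-128 */
      case 192: return 12;  /* AES-192 */
      case 256: return 14;  /* AES-256 */
      default:  return 0;   /* nonstandard size: would need a generic fallback branch */
      }
  }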
regards, Mamone
On Sat, May 1, 2021 at 5:39 PM Maamoun TK <maamoun.tk@googlemail.com> wrote:
This patch optimizes the nettle_aes_encrypt() and nettle_aes_decrypt() functions for the arm64 architecture. It takes advantage of the 'aese' and 'aesmc' instructions to optimize the encryption function, and of 'aesd' and 'aesimc' to optimize the decryption function.
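For reference, here is a rough C sketch of what one encrypted block goes through, written with the ACLE NEON intrinsics that correspond to these instructions (illustrative only, not the patch code, which is hand-written assembly; aes_encrypt_block, subkeys and their layout are made-up names, and it needs a crypto-capable target such as -march=armv8-a+crypto):

  #include <arm_neon.h>

  /* Sketch of one-block AES encryption with 'aese'/'aesmc' via intrinsics.
     subkeys[0..rounds] hold the expanded round keys as 128-bit vectors. */
  static uint8x16_t
  aes_encrypt_block (uint8x16_t block, const uint8x16_t *subkeys, unsigned rounds)
  {
    unsigned i;
    /* Middle rounds: aese = AddRoundKey + SubBytes + ShiftRows,
       aesmc = MixColumns. */
    for (i = 0; i < rounds - 1; i++)
      block = vaesmcq_u8 (vaeseq_u8 (block, subkeys[i]));
    /* Last round has no MixColumns; finish with the final AddRoundKey. */
    return veorq_u8 (vaeseq_u8 (block, subkeys[rounds - 1]), subkeys[rounds]);
  }

The decrypt path is analogous, using vaesdq_u8/vaesimcq_u8 with the inverse key schedule.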
The patch passes the nettle testsuite. I also ran the benchmark on the gcc117 instance of CFarm by configuring the library with "--disable-fat --enable-arm64-crypto" and then executing examples/nettle-benchmark:
aes128 ECB encrypt            2522.67
aes128 ECB decrypt            2522.53
aes192 ECB encrypt            2165.06
aes192 ECB decrypt            2165.04
aes256 ECB encrypt            1866.80
aes256 ECB decrypt            1866.38

openssl aes128 ECB encrypt    1043.52
openssl aes128 ECB decrypt    1043.05
openssl aes192 ECB encrypt     904.08
openssl aes192 ECB decrypt     903.85
openssl aes256 ECB encrypt     787.43
openssl aes256 ECB decrypt     787.20

gcm_aes128 encrypt             955.10
gcm_aes128 decrypt             955.06
gcm_aes128 update             3269.18
gcm_aes192 encrypt             896.26
gcm_aes192 decrypt             896.46
gcm_aes192 update             3270.24
gcm_aes256 encrypt             840.17
gcm_aes256 decrypt             843.53
gcm_aes256 update             3270.08

openssl gcm_aes128 encrypt     894.51
openssl gcm_aes128 decrypt     899.05
openssl gcm_aes128 update     1636.61
openssl gcm_aes192 encrypt     834.94
openssl gcm_aes192 decrypt     841.99
openssl gcm_aes192 update     1631.40
openssl gcm_aes256 encrypt     788.48
openssl gcm_aes256 decrypt     791.31
openssl gcm_aes256 update     1635.18
I'm a little suspicious of the benchmark numbers because, as far as I remember, the gcm update performance didn't use to be double the OpenSSL number. I repeated the run, but it kept giving the same performance margin.
 arm64/crypto/aes-decrypt-internal.asm | 223 ++++++++++++++++++++++++++++++++++
 arm64/crypto/aes-encrypt-internal.asm | 223 ++++++++++++++++++++++++++++++++++
 2 files changed, 446 insertions(+)
 create mode 100644 arm64/crypto/aes-decrypt-internal.asm
 create mode 100644 arm64/crypto/aes-encrypt-internal.asm
diff --git a/arm64/crypto/aes-decrypt-internal.asm b/arm64/crypto/aes-decrypt-internal.asm
new file mode 100644
index 00000000..4bfdb314
--- /dev/null
+++ b/arm64/crypto/aes-decrypt-internal.asm
@@ -0,0 +1,223 @@
+C arm64/crypto/aes-decrypt-internal.asm
+ifelse(`
+   Copyright (C) 2021 Mamone Tarsha
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+.file "aes-decrypt-internal.asm" +.arch armv8-a+crypto
+.text
+C Register usage:
+define(`ROUNDS', `x0')
+define(`KEYS', `x1')
+define(`LENGTH', `x3')
+define(`DST', `x4')
+define(`SRC', `x5')
+define(`S0', `v0')
+define(`S1', `v1')
+define(`S2', `v2')
+define(`S3', `v3')
+define(`K0', `v16')
+define(`K1', `v17')
+define(`K2', `v18')
+define(`K3', `v19')
+define(`K4', `v20')
+define(`K5', `v21')
+define(`K6', `v22')
+define(`K7', `v23')
+define(`K8', `v24')
+define(`K9', `v25')
+define(`K10', `v26')
+define(`K11', `v27')
+define(`K12', `v28')
+define(`K13', `v29')
+define(`K14', `v30')
+C AES_ROUND_4B(KEY)
+define(`AES_ROUND_4B', m4_assert_numargs(1)`
+    aesd S0.16b,$1.16b
+    aesimc S0.16b,S0.16b
+    aesd S1.16b,$1.16b
+    aesimc S1.16b,S1.16b
+    aesd S2.16b,$1.16b
+    aesimc S2.16b,S2.16b
+    aesd S3.16b,$1.16b
+    aesimc S3.16b,S3.16b
+')
+C AES_LAST_ROUND_4B(KEY)
+define(`AES_LAST_ROUND_4B', m4_assert_numargs(1)`
+    aesd S0.16b,$1.16b
+    eor S0.16b,S0.16b,K14.16b
+    aesd S1.16b,$1.16b
+    eor S1.16b,S1.16b,K14.16b
+    aesd S2.16b,$1.16b
+    eor S2.16b,S2.16b,K14.16b
+    aesd S3.16b,$1.16b
+    eor S3.16b,S3.16b,K14.16b
+')
+C AES_ROUND_1B(KEY)
+define(`AES_ROUND_1B', m4_assert_numargs(1)`
+    aesd S0.16b,$1.16b
+    aesimc S0.16b,S0.16b
+')
+C AES_LAST_ROUND_1B(KEY)
+define(`AES_LAST_ROUND_1B', m4_assert_numargs(1)`
+    aesd S0.16b,$1.16b
+    eor S0.16b,S0.16b,K14.16b
+')
+C _aes_decrypt(unsigned rounds, const uint32_t *keys,
+C             const struct aes_table *T,
+C             size_t length, uint8_t *dst,
+C             const uint8_t *src)
+PROLOGUE(_nettle_aes_decrypt)
+    ands x6,LENGTH,#-64
+    b.eq L1B
+    mov x7,KEYS
+    ld1 {K0.4s,K1.4s,K2.4s,K3.4s},[x7],#64
+    ld1 {K4.4s,K5.4s,K6.4s,K7.4s},[x7],#64
+    ld1 {K8.4s,K9.4s},[x7],#32
+    cmp ROUNDS,#10
+    b.eq L4B_last_key
+    ld1 {K10.4s,K11.4s},[x7],#32
+    cmp ROUNDS,#12
+    b.eq L4B_last_key
+    ld1 {K12.4s,K13.4s},[x7],#32
+L4B_last_key:
+    ld1 {K14.4s},[x7]
+L4B_loop:
+    ld1 {S0.16b,S1.16b,S2.16b,S3.16b},[SRC],#64
+    AES_ROUND_4B(K0)
+    AES_ROUND_4B(K1)
+    AES_ROUND_4B(K2)
+    AES_ROUND_4B(K3)
+    AES_ROUND_4B(K4)
+    AES_ROUND_4B(K5)
+    AES_ROUND_4B(K6)
+    AES_ROUND_4B(K7)
+    AES_ROUND_4B(K8)
+    cmp ROUNDS,#10
+    b.eq L4B_10_round
+    cmp ROUNDS,#12
+    b.eq L4B_12_round
+    b L4B_14_round
+L4B_10_round:
+    AES_LAST_ROUND_4B(K9)
+    b L4B_done
+L4B_12_round:
+    AES_ROUND_4B(K9)
+    AES_ROUND_4B(K10)
+    AES_LAST_ROUND_4B(K11)
+    b L4B_done
+L4B_14_round:
+    AES_ROUND_4B(K9)
+    AES_ROUND_4B(K10)
+    AES_ROUND_4B(K11)
+    AES_ROUND_4B(K12)
+    AES_LAST_ROUND_4B(K13)
+L4B_done:
+    st1 {S0.16b,S1.16b,S2.16b,S3.16b},[DST],#64
+    subs x6,x6,#64
+    b.ne L4B_loop
+    and LENGTH,LENGTH,#63
+L1B:
+    cbz LENGTH,Ldone
+    mov x6,KEYS
+    ld1 {K0.4s,K1.4s,K2.4s,K3.4s},[x6],#64
+    ld1 {K4.4s,K5.4s,K6.4s,K7.4s},[x6],#64
+    ld1 {K8.4s,K9.4s},[x6],#32
+    cmp ROUNDS,#10
+    b.eq L1B_last_key
+    ld1 {K10.4s,K11.4s},[x6],#32
+    cmp ROUNDS,#12
+    b.eq L1B_last_key
+    ld1 {K12.4s,K13.4s},[x6],#32
+L1B_last_key:
+    ld1 {K14.4s},[x6]
+L1B_loop:
+    ld1 {S0.16b},[SRC],#16
+    AES_ROUND_1B(K0)
+    AES_ROUND_1B(K1)
+    AES_ROUND_1B(K2)
+    AES_ROUND_1B(K3)
+    AES_ROUND_1B(K4)
+    AES_ROUND_1B(K5)
+    AES_ROUND_1B(K6)
+    AES_ROUND_1B(K7)
+    AES_ROUND_1B(K8)
+    cmp ROUNDS,#10
+    b.eq L1B_10_round
+    cmp ROUNDS,#12
+    b.eq L1B_12_round
+    b L1B_14_round
+L1B_10_round:
+    AES_LAST_ROUND_1B(K9)
+    b L1B_done
+L1B_12_round:
+    AES_ROUND_1B(K9)
+    AES_ROUND_1B(K10)
+    AES_LAST_ROUND_1B(K11)
+    b L1B_done
+L1B_14_round:
+    AES_ROUND_1B(K9)
+    AES_ROUND_1B(K10)
+    AES_ROUND_1B(K11)
+    AES_ROUND_1B(K12)
+    AES_LAST_ROUND_1B(K13)
+L1B_done:
+    st1 {S0.16b},[DST],#16
+    subs LENGTH,LENGTH,#16
+    b.ne L1B_loop
+Ldone:
+    ret
+EPILOGUE(_nettle_aes_decrypt)

diff --git a/arm64/crypto/aes-encrypt-internal.asm b/arm64/crypto/aes-encrypt-internal.asm
new file mode 100644
index 00000000..314f9333
--- /dev/null
+++ b/arm64/crypto/aes-encrypt-internal.asm
@@ -0,0 +1,223 @@
+C arm64/crypto/aes-encrypt-internal.asm
+ifelse(`
+   Copyright (C) 2021 Mamone Tarsha
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+.file "aes-encrypt-internal.asm" +.arch armv8-a+crypto
+.text
+C Register usage:
+define(`ROUNDS', `x0')
+define(`KEYS', `x1')
+define(`LENGTH', `x3')
+define(`DST', `x4')
+define(`SRC', `x5')
+define(`S0', `v0')
+define(`S1', `v1')
+define(`S2', `v2')
+define(`S3', `v3')
+define(`K0', `v16')
+define(`K1', `v17')
+define(`K2', `v18')
+define(`K3', `v19')
+define(`K4', `v20')
+define(`K5', `v21')
+define(`K6', `v22')
+define(`K7', `v23')
+define(`K8', `v24')
+define(`K9', `v25')
+define(`K10', `v26')
+define(`K11', `v27')
+define(`K12', `v28')
+define(`K13', `v29')
+define(`K14', `v30')
+C AES_ROUND_4B(KEY)
+define(`AES_ROUND_4B', m4_assert_numargs(1)`
+    aese S0.16b,$1.16b
+    aesmc S0.16b,S0.16b
+    aese S1.16b,$1.16b
+    aesmc S1.16b,S1.16b
+    aese S2.16b,$1.16b
+    aesmc S2.16b,S2.16b
+    aese S3.16b,$1.16b
+    aesmc S3.16b,S3.16b
+')
+C AES_LAST_ROUND_4B(KEY)
+define(`AES_LAST_ROUND_4B', m4_assert_numargs(1)`
+    aese S0.16b,$1.16b
+    eor S0.16b,S0.16b,K14.16b
+    aese S1.16b,$1.16b
+    eor S1.16b,S1.16b,K14.16b
+    aese S2.16b,$1.16b
+    eor S2.16b,S2.16b,K14.16b
+    aese S3.16b,$1.16b
+    eor S3.16b,S3.16b,K14.16b
+')
+C AES_ROUND_1B(KEY)
+define(`AES_ROUND_1B', m4_assert_numargs(1)`
+    aese S0.16b,$1.16b
+    aesmc S0.16b,S0.16b
+')
+C AES_LAST_ROUND_1B(KEY)
+define(`AES_LAST_ROUND_1B', m4_assert_numargs(1)`
+    aese S0.16b,$1.16b
+    eor S0.16b,S0.16b,K14.16b
+')
+C _aes_encrypt(unsigned rounds, const uint32_t *keys,
+C             const struct aes_table *T,
+C             size_t length, uint8_t *dst,
+C             uint8_t *src)
+PROLOGUE(_nettle_aes_encrypt)
+    ands x6,LENGTH,#-64
+    b.eq L1B
+    mov x7,KEYS
+    ld1 {K0.4s,K1.4s,K2.4s,K3.4s},[x7],#64
+    ld1 {K4.4s,K5.4s,K6.4s,K7.4s},[x7],#64
+    ld1 {K8.4s,K9.4s},[x7],#32
+    cmp ROUNDS,#10
+    b.eq L4B_last_key
+    ld1 {K10.4s,K11.4s},[x7],#32
+    cmp ROUNDS,#12
+    b.eq L4B_last_key
+    ld1 {K12.4s,K13.4s},[x7],#32
+L4B_last_key:
+    ld1 {K14.4s},[x7]
+L4B_loop:
+    ld1 {S0.16b,S1.16b,S2.16b,S3.16b},[SRC],#64
+    AES_ROUND_4B(K0)
+    AES_ROUND_4B(K1)
+    AES_ROUND_4B(K2)
+    AES_ROUND_4B(K3)
+    AES_ROUND_4B(K4)
+    AES_ROUND_4B(K5)
+    AES_ROUND_4B(K6)
+    AES_ROUND_4B(K7)
+    AES_ROUND_4B(K8)
+    cmp ROUNDS,#10
+    b.eq L4B_10_round
+    cmp ROUNDS,#12
+    b.eq L4B_12_round
+    b L4B_14_round
+L4B_10_round:
+    AES_LAST_ROUND_4B(K9)
+    b L4B_done
+L4B_12_round:
+    AES_ROUND_4B(K9)
+    AES_ROUND_4B(K10)
+    AES_LAST_ROUND_4B(K11)
+    b L4B_done
+L4B_14_round:
+    AES_ROUND_4B(K9)
+    AES_ROUND_4B(K10)
+    AES_ROUND_4B(K11)
+    AES_ROUND_4B(K12)
+    AES_LAST_ROUND_4B(K13)
+L4B_done:
+    st1 {S0.16b,S1.16b,S2.16b,S3.16b},[DST],#64
+    subs x6,x6,#64
+    b.ne L4B_loop
+    and LENGTH,LENGTH,#63
+L1B:
+    cbz LENGTH,Ldone
+    mov x6,KEYS
+    ld1 {K0.4s,K1.4s,K2.4s,K3.4s},[x6],#64
+    ld1 {K4.4s,K5.4s,K6.4s,K7.4s},[x6],#64
+    ld1 {K8.4s,K9.4s},[x6],#32
+    cmp ROUNDS,#10
+    b.eq L1B_last_key
+    ld1 {K10.4s,K11.4s},[x6],#32
+    cmp ROUNDS,#12
+    b.eq L1B_last_key
+    ld1 {K12.4s,K13.4s},[x6],#32
+L1B_last_key:
+    ld1 {K14.4s},[x6]
+L1B_loop:
+    ld1 {S0.16b},[SRC],#16
+    AES_ROUND_1B(K0)
+    AES_ROUND_1B(K1)
+    AES_ROUND_1B(K2)
+    AES_ROUND_1B(K3)
+    AES_ROUND_1B(K4)
+    AES_ROUND_1B(K5)
+    AES_ROUND_1B(K6)
+    AES_ROUND_1B(K7)
+    AES_ROUND_1B(K8)
+    cmp ROUNDS,#10
+    b.eq L1B_10_round
+    cmp ROUNDS,#12
+    b.eq L1B_12_round
+    b L1B_14_round
+L1B_10_round:
+    AES_LAST_ROUND_1B(K9)
+    b L1B_done
+L1B_12_round:
+    AES_ROUND_1B(K9)
+    AES_ROUND_1B(K10)
+    AES_LAST_ROUND_1B(K11)
+    b L1B_done
+L1B_14_round:
+    AES_ROUND_1B(K9)
+    AES_ROUND_1B(K10)
+    AES_ROUND_1B(K11)
+    AES_ROUND_1B(K12)
+    AES_LAST_ROUND_1B(K13)
+L1B_done:
+    st1 {S0.16b},[DST],#16
+    subs LENGTH,LENGTH,#16
+    b.ne L1B_loop
+Ldone:
+    ret
+EPILOGUE(_nettle_aes_encrypt)
--
2.25.1