From e86ec9188319ef08d635da38c2f8bf891e56f995 Mon Sep 17 00:00:00 2001
From: "Yuriy M. Kaminskiy" yumkam@gmail.com Date: Wed, 2 Jan 2019 19:41:32 +0300 Subject: [PATCH] Add --enable-fat support for arm neon chacha20
On BCM2837B0 (Cortex-A53) @1.4GHz (Raspberry Pi 3B+), Before: `gnutls-cli --benchmark-ciphers` CHACHA20-POLY1305 (16384) 51.54 MB/sec `gnutls-cli --benchmark-tls-ciphers`: ECDHE_RSA_CHACHA20_POLY1305 (payload 1400) 21.31 MB/sec ECDHE_RSA_CHACHA20_POLY1305 (payload 15360) 24.60 MB/sec `nettle-benchmark` chacha encrypt 71.90 chacha decrypt 71.89 chacha_poly1305 encrypt 48.17 chacha_poly1305 decrypt 48.17 chacha_poly1305 update 146.03
After: `gnutls-cli --benchmark-ciphers` CHACHA20-POLY1305 (16384) 68.44 MB/sec `gnutls-cli --benchmark-tls-ciphers`: ECDHE_RSA_CHACHA20_POLY1305 (payload 1400) 27.25 MB/sec ECDHE_RSA_CHACHA20_POLY1305 (payload 15360) 32.41 MB/sec `nettle-benchmark` chacha encrypt 106.00 chacha decrypt 105.94 chacha_poly1305 encrypt 65.94 chacha_poly1305 decrypt 65.96 chacha_poly1305 update 175.24 --- arm/fat/chacha-core-internal-2.asm | 37 +++++++++++++++++++++++++++++++++++++ chacha-core-internal.c | 7 +++++++ configure.ac | 2 ++ fat-arm.c | 10 ++++++++++ fat-setup.h | 2 ++ 5 files changed, 58 insertions(+) create mode 100644 arm/fat/chacha-core-internal-2.asm
P.S. for reference, on same machine, openssl chacha20 encrypt 236.08 openssl chacha20 decrypt 236.14 openssl chacha20-poly1305 encrypt 181.26 openssl chacha20-poly1305 decrypt 182.09 openssl chacha20-poly1305 update 782.67 (with openssl 1.1.0j-1~deb9u1 from debian-stretch/armhf)
diff --git a/arm/fat/chacha-core-internal-2.asm b/arm/fat/chacha-core-internal-2.asm new file mode 100644 index 00000000..66a5c145 --- /dev/null +++ b/arm/fat/chacha-core-internal-2.asm @@ -0,0 +1,37 @@ +C arm/fat/chacha-core-internal-2.asm + + +ifelse(< + Copyright (C) 2015 Niels Möller + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +>) + +dnl PROLOGUE(_nettle_chacha_core) picked up by configure + +define(<fat_transform>, <$1_neon>) +include_src(<arm/neon/chacha-core-internal.asm>) diff --git a/chacha-core-internal.c b/chacha-core-internal.c index af278bb0..0905834e 100644 --- a/chacha-core-internal.c +++ b/chacha-core-internal.c @@ -51,6 +51,13 @@
#include "macros.h"
+/* For fat builds */ +#if HAVE_NATIVE_chacha_core +void +_nettle_chacha_core_c(uint32_t *dst, const uint32_t *src, unsigned rounds); +#define _nettle_chacha_core _nettle_chacha_core_c +#endif + #ifndef CHACHA_DEBUG # define CHACHA_DEBUG 0 #endif diff --git a/configure.ac b/configure.ac index 305977d1..3f409fa4 100644 --- a/configure.ac +++ b/configure.ac @@ -472,6 +472,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \ # Assembler files which generate additional object files if they are used. asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \ aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \ + chacha-core-internal-2.asm \ salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \ sha3-permute-2.asm sha512-compress-2.asm \ umac-nh-n-2.asm umac-nh-2.asm" @@ -573,6 +574,7 @@ AC_SUBST([IF_ASM]) AH_VERBATIM([HAVE_NATIVE], [/* Define to 1 each of the following for which a native (ie. CPU specific) implementation of the corresponding routine exists. */ +#undef HAVE_NATIVE_chacha_core #undef HAVE_NATIVE_ecc_192_modp #undef HAVE_NATIVE_ecc_192_redc #undef HAVE_NATIVE_ecc_224_modp diff --git a/fat-arm.c b/fat-arm.c index 5e656359..56099e6f 100644 --- a/fat-arm.c +++ b/fat-arm.c @@ -171,6 +171,10 @@ DECLARE_FAT_FUNC(_nettle_umac_nh_n, umac_nh_n_func) DECLARE_FAT_FUNC_VAR(umac_nh_n, umac_nh_n_func, c); DECLARE_FAT_FUNC_VAR(umac_nh_n, umac_nh_n_func, neon);
+DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func) +DECLARE_FAT_FUNC_VAR(_chacha_core, chacha_core_func, c); +DECLARE_FAT_FUNC_VAR(_chacha_core, chacha_core_func, neon); + static void CONSTRUCTOR fat_init (void) { @@ -212,6 +216,7 @@ fat_init (void) nettle_sha3_permute_vec = _nettle_sha3_permute_neon; _nettle_umac_nh_vec = _nettle_umac_nh_neon; _nettle_umac_nh_n_vec = _nettle_umac_nh_n_neon; + _nettle_chacha_core_vec = _nettle_chacha_core_neon; } else { @@ -222,6 +227,7 @@ fat_init (void) nettle_sha3_permute_vec = _nettle_sha3_permute_c; _nettle_umac_nh_vec = _nettle_umac_nh_c; _nettle_umac_nh_n_vec = _nettle_umac_nh_n_c; + _nettle_chacha_core_vec = _nettle_chacha_core_c; } }
@@ -267,3 +273,7 @@ DEFINE_FAT_FUNC(_nettle_umac_nh_n, void, unsigned length, const uint8_t *msg), (out, n, key, length, msg))
+DEFINE_FAT_FUNC(_nettle_chacha_core, void, + (uint32_t *dst, const uint32_t *src, unsigned rounds), + (dst, src, rounds)) + diff --git a/fat-setup.h b/fat-setup.h index eb7166a7..b623ebf9 100644 --- a/fat-setup.h +++ b/fat-setup.h @@ -174,3 +174,5 @@ typedef void sha512_compress_func (uint64_t *state, const uint8_t *input, const typedef uint64_t umac_nh_func (const uint32_t *key, unsigned length, const uint8_t *msg); typedef void umac_nh_n_func (uint64_t *out, unsigned n, const uint32_t *key, unsigned length, const uint8_t *msg); + +typedef void chacha_core_func(uint32_t *dst, const uint32_t *src, unsigned rounds);
"Yuriy M. Kaminskiy" yumkam@gmail.com writes:
[PATCH] Add --enable-fat support for arm neon chacha20
Thanks, applied! Just pushed to the master-updates branch.
When the current ARM assembly was written, it was benchmarked on Cortex-A9 and Cortex-A15.
Regards, /Niels
"Yuriy M. Kaminskiy" yumkam@gmail.com writes:
+DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func) +DECLARE_FAT_FUNC_VAR(_chacha_core, chacha_core_func, c); +DECLARE_FAT_FUNC_VAR(_chacha_core, chacha_core_func, neon);
^
The above underscores shouldn't be there. If I delete them, this works fine using debian's cross compilers and qemu.
Regards, /Niels
On 19.01.2019 18:17, Niels Möller wrote:
"Yuriy M. Kaminskiy" yumkam@gmail.com writes:
+DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func) +DECLARE_FAT_FUNC_VAR(_chacha_core, chacha_core_func, c); +DECLARE_FAT_FUNC_VAR(_chacha_core, chacha_core_func, neon);
^
The above underscores shouldn't be there. If I delete them, this works fine using debian's cross compilers and qemu.
Yes, sorry (I changed/fixed that when testing, but a different patch was "top" in quilt at the time, so the fix was missing from the version I sent :-|)
nettle-bugs@lists.lysator.liu.se