nisse@lysator.liu.se (Niels Möller) writes:
(iii) I've considered doing it earlier, to make it easier to implement aes without a round loop (like for all current versions of aes-encrypt-internal.*). E.g., on x86_64, for aes128 we could load all subkeys into registers and still have registers left to do two or more blocks in parallel, but then we'd need to override aes128_encrypt separately from the other aes*_encrypt.
I've given this a try, see experimental patch below. It adds an x86_64/aesni/aes128-encrypt.asm, with a 2-way loop. It gives a very modest speedup, 5%, when I benchmark on my laptop (which is now a pretty fast machine, AMD Ryzen 5). I've also added a cbc-aes128-encrypt.asm. That gives a more significant speedup, almost 60%. I think the main reason for the speedup is that we avoid reloading subkeys between blocks.
If we want to go this way, I wonder how to do it without an explosion of files and functions. For s390x, it seems each function will be very small, but not so for most other archs. There are at least three modes that are similar to cbc encrypt in that they have to process blocks sequentially, with no parallelism: CBC encrypt, CMAC, and XTS (there may be more). It's not so nice if we need (modes × ciphers) assembly files, with lots of duplication.
Regards, /Niels
diff --git a/ChangeLog b/ChangeLog index 3d19b1dd..68b8f632 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,13 @@ 2021-04-01 Niels Möller nisse@lysator.liu.se
+ * cbc-aes128-encrypt.c (nettle_cbc_aes128_encrypt): New file and function. + * x86_64/aesni/cbc-aes128-encrypt.asm: New file. + + * configure.ac (asm_replace_list): Add aes128-encrypt.asm + aes128-decrypt.asm. + * x86_64/aesni/aes128-encrypt.asm: New file, with 2-way loop. + * x86_64/aesni/aes128-decrypt.asm: Likewise. + Move aes128_encrypt and similar functions to their own files. To make it easier for assembly implementations to override specific AES variants. diff --git a/Makefile.in b/Makefile.in index 8d474d1e..b6b983fd 100644 --- a/Makefile.in +++ b/Makefile.in @@ -101,7 +101,8 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c aes-decrypt-table.c \ camellia256-set-encrypt-key.c camellia256-crypt.c \ camellia256-set-decrypt-key.c \ camellia256-meta.c \ - cast128.c cast128-meta.c cbc.c \ + cast128.c cast128-meta.c \ + cbc.c cbc-aes128-encrypt.c \ ccm.c ccm-aes128.c ccm-aes192.c ccm-aes256.c cfb.c \ siv-cmac.c siv-cmac-aes128.c siv-cmac-aes256.c \ cnd-memcpy.c \ diff --git a/cbc-aes128-encrypt.c b/cbc-aes128-encrypt.c new file mode 100644 index 00000000..5f7d1c8c --- /dev/null +++ b/cbc-aes128-encrypt.c @@ -0,0 +1,42 @@ +/* cbc-aes128-encrypt.c + + Copyright (C) 2013, 2014 Niels Möller + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +*/ + +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#include "cbc.h" + +void +nettle_cbc_aes128_encrypt(struct cbc_aes128_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src) +{ + CBC_ENCRYPT(ctx, aes128_encrypt, length, dst, src); +} diff --git a/cbc.h b/cbc.h index 93b2e739..beece610 100644 --- a/cbc.h +++ b/cbc.h @@ -35,6 +35,7 @@ #define NETTLE_CBC_H_INCLUDED
#include "nettle-types.h" +#include "aes.h"
#ifdef __cplusplus extern "C" { @@ -79,6 +80,10 @@ memcpy((ctx)->iv, (data), sizeof((ctx)->iv)) sizeof((self)->iv), (self)->iv, \ (length), (dst), (src)))
+struct cbc_aes128_ctx CBC_CTX(struct aes128_ctx, AES_BLOCK_SIZE); +void +nettle_cbc_aes128_encrypt(struct cbc_aes128_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src); + #ifdef __cplusplus } #endif diff --git a/configure.ac b/configure.ac index be2916c1..26e41d89 100644 --- a/configure.ac +++ b/configure.ac @@ -544,6 +544,7 @@ fi # Files which replace a C source file (or otherwise don't correspond # to a new object file). asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \ + aes128-encrypt.asm aes128-decrypt.asm cbc-aes128-encrypt.asm \ arcfour-crypt.asm camellia-crypt-internal.asm \ md5-compress.asm memxor.asm memxor3.asm \ poly1305-internal.asm \ diff --git a/examples/nettle-benchmark.c b/examples/nettle-benchmark.c index 9ce3a733..686cf3b9 100644 --- a/examples/nettle-benchmark.c +++ b/examples/nettle-benchmark.c @@ -240,6 +240,21 @@ bench_ctr(void *arg) BENCH_BLOCK, info->dst, info->src); }
+struct bench_cbc_aes128_info +{ + struct cbc_aes128_ctx ctx; + + const uint8_t *src; + uint8_t *dst; +}; + +static void +bench_cbc_aes128(void *arg) +{ + struct bench_cbc_aes128_info *info = arg; + nettle_cbc_aes128_encrypt(&info->ctx, BENCH_BLOCK, info->dst, info->src); +} + struct bench_aead_info { void *ctx; @@ -740,6 +755,29 @@ time_cipher(const struct nettle_cipher *cipher) free(key); }
+static void +time_cbc_aes128(void) +{ + struct bench_cbc_aes128_info info; + uint8_t key[AES128_KEY_SIZE]; + uint8_t iv[AES_BLOCK_SIZE]; + + static uint8_t src_data[BENCH_BLOCK]; + static uint8_t data[BENCH_BLOCK]; + + init_key(sizeof(key), key); + init_key(sizeof(iv), iv); + init_data(data); + init_data(src_data); + + aes128_set_encrypt_key(&info.ctx.ctx, key); + CBC_SET_IV(&info.ctx, iv); + info.src = src_data; + info.dst = data; + display("aes128", "new cbc", AES_BLOCK_SIZE, + time_function(bench_cbc_aes128, &info)); +} + static void time_aead(const struct nettle_aead *aead) { @@ -1027,6 +1065,9 @@ main(int argc, char **argv) if (!alg || strstr ("hmac-sha512", alg)) time_hmac_sha512();
+ if (!alg || strstr ("cbc-aes128", alg)) + time_cbc_aes128(); + optind++; } while (alg && argv[optind]);
diff --git a/testsuite/cbc-test.c b/testsuite/cbc-test.c index 9394f1cb..ff0c4cbe 100644 --- a/testsuite/cbc-test.c +++ b/testsuite/cbc-test.c @@ -3,6 +3,43 @@ #include "cbc.h" #include "knuth-lfib.h"
+static void +test_cbc_aes128(const struct tstring *key, + const struct tstring *cleartext, + const struct tstring *ciphertext, + const struct tstring *iiv) +{ + struct cbc_aes128_ctx ctx; + uint8_t *data; + size_t length; + + ASSERT (cleartext->length == ciphertext->length); + length = cleartext->length; + + ASSERT (key->length == AES128_KEY_SIZE); + ASSERT (iiv->length == AES_BLOCK_SIZE); + + data = xalloc(length); + aes128_set_encrypt_key(&ctx.ctx, key->data); + CBC_SET_IV(&ctx, iiv->data); + + nettle_cbc_aes128_encrypt(&ctx, + length, data, cleartext->data); + + if (!MEMEQ(length, data, ciphertext->data)) + { + fprintf(stderr, "CBC encrypt failed:\nInput:"); + tstring_print_hex(cleartext); + fprintf(stderr, "\nOutput: "); + print_hex(length, data); + fprintf(stderr, "\nExpected:"); + tstring_print_hex(ciphertext); + fprintf(stderr, "\n"); + FAIL(); + } + free(data); +} + /* Test with more data and inplace decryption, to check that the * cbc_decrypt buffering works. */ #define CBC_BULK_DATA 0x2710 /* 10000 */ @@ -161,6 +198,17 @@ test_main(void) "b2eb05e2c39be9fcda6c19078c6a9d1b"), SHEX("000102030405060708090a0b0c0d0e0f"));
+ test_cbc_aes128(SHEX("2b7e151628aed2a6abf7158809cf4f3c"), + SHEX("6bc1bee22e409f96e93d7e117393172a" + "ae2d8a571e03ac9c9eb76fac45af8e51" + "30c81c46a35ce411e5fbc1191a0a52ef" + "f69f2445df4f9b17ad2b417be66c3710"), + SHEX("7649abac8119b246cee98e9b12e9197d" + "5086cb9b507219ee95db113a917678b2" + "73bed6b8e3c1743b7116e69e22229516" + "3ff1caa1681fac09120eca307586e1a7"), + SHEX("000102030405060708090a0b0c0d0e0f")); + test_cbc_bulk(); }
diff --git a/x86_64/aesni/aes128-decrypt.asm b/x86_64/aesni/aes128-decrypt.asm new file mode 100644 index 00000000..79111e47 --- /dev/null +++ b/x86_64/aesni/aes128-decrypt.asm @@ -0,0 +1,136 @@ +C x86_64/aesni/aes128-decrypt.asm + +ifelse(` + Copyright (C) 2015, 2018, 2021 Niels Möller + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + +C Input argument +define(`CTX', `%rdi') +define(`LENGTH',`%rsi') +define(`DST', `%rdx') +define(`SRC', `%rcx') + +define(`KEY0', `%xmm0') +define(`KEY1', `%xmm1') +define(`KEY2', `%xmm2') +define(`KEY3', `%xmm3') +define(`KEY4', `%xmm4') +define(`KEY5', `%xmm5') +define(`KEY6', `%xmm6') +define(`KEY7', `%xmm7') +define(`KEY8', `%xmm8') +define(`KEY9', `%xmm9') +define(`KEY10', `%xmm10') +define(`X', `%xmm11') +define(`Y', `%xmm12') + + .file "aes128-decrypt.asm" + + C nettle_aes128_decrypt(const struct aes128_ctx *ctx, + C size_t length, uint8_t *dst, + C const uint8_t *src); + + .text + ALIGN(16) +PROLOGUE(nettle_aes128_decrypt) + W64_ENTRY(4, 13) + shr $4, LENGTH + test LENGTH, LENGTH + jz .Lend + + movups (CTX), KEY0 + movups 16(CTX), KEY1 + movups 32(CTX), KEY2 + movups 48(CTX), KEY3 + movups 64(CTX), KEY4 + movups 80(CTX), KEY5 + movups 96(CTX), KEY6 + movups 112(CTX), KEY7 + movups 128(CTX), KEY8 + movups 144(CTX), KEY9 + movups 160(CTX), KEY10 + shr LENGTH + jnc .Lblock_loop + + movups (SRC), X + pxor KEY0, X + aesdec KEY1, X + aesdec KEY2, X + aesdec KEY3, X + aesdec KEY4, X + aesdec KEY5, X + aesdec KEY6, X + aesdec KEY7, X + aesdec KEY8, X + aesdec KEY9, X + aesdeclast KEY10, X + + movups X, (DST) + add $16, SRC + add $16, DST + test LENGTH, LENGTH + jz .Lend + +.Lblock_loop: + movups (SRC), X + movups 16(SRC), Y + pxor KEY0, X + pxor KEY0, Y + aesdec KEY1, X + aesdec KEY1, Y + aesdec KEY2, X + aesdec KEY2, Y + aesdec KEY3, X + aesdec KEY3, Y + aesdec KEY4, X + aesdec KEY4, Y + aesdec KEY5, X + aesdec KEY5, Y + aesdec KEY6, X + aesdec KEY6, Y + aesdec KEY7, X + aesdec KEY7, Y + aesdec KEY8, X + aesdec KEY8, Y + aesdec KEY9, X + aesdec KEY9, Y + aesdeclast KEY10, X + aesdeclast KEY10, Y + + movups X, (DST) + movups Y, 16(DST) + add $32, SRC + add $32, DST + dec LENGTH + jnz .Lblock_loop + +.Lend: + W64_EXIT(4, 13) + ret +EPILOGUE(nettle_aes128_decrypt) diff --git a/x86_64/aesni/aes128-encrypt.asm b/x86_64/aesni/aes128-encrypt.asm new file mode 
100644 index 00000000..8e7ebe78 --- /dev/null +++ b/x86_64/aesni/aes128-encrypt.asm @@ -0,0 +1,136 @@ +C x86_64/aesni/aes128-encrypt.asm + +ifelse(` + Copyright (C) 2015, 2018, 2021 Niels Möller + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + +C Input argument +define(`CTX', `%rdi') +define(`LENGTH',`%rsi') +define(`DST', `%rdx') +define(`SRC', `%rcx') + +define(`KEY0', `%xmm0') +define(`KEY1', `%xmm1') +define(`KEY2', `%xmm2') +define(`KEY3', `%xmm3') +define(`KEY4', `%xmm4') +define(`KEY5', `%xmm5') +define(`KEY6', `%xmm6') +define(`KEY7', `%xmm7') +define(`KEY8', `%xmm8') +define(`KEY9', `%xmm9') +define(`KEY10', `%xmm10') +define(`X', `%xmm11') +define(`Y', `%xmm12') + + .file "aes128-encrypt.asm" + + C nettle_aes128_encrypt(const struct aes128_ctx *ctx, + C size_t length, uint8_t *dst, + C const uint8_t *src); + + .text + ALIGN(16) +PROLOGUE(nettle_aes128_encrypt) + W64_ENTRY(4, 13) + shr $4, LENGTH + test LENGTH, LENGTH + jz .Lend + + movups (CTX), KEY0 + movups 16(CTX), KEY1 + movups 32(CTX), KEY2 + movups 48(CTX), KEY3 + movups 64(CTX), KEY4 + movups 80(CTX), KEY5 + movups 96(CTX), KEY6 + movups 112(CTX), KEY7 + movups 128(CTX), KEY8 + movups 144(CTX), KEY9 + movups 160(CTX), KEY10 + shr LENGTH + jnc .Lblock_loop + + movups (SRC), X + pxor KEY0, X + aesenc KEY1, X + aesenc KEY2, X + aesenc KEY3, X + aesenc KEY4, X + aesenc KEY5, X + aesenc KEY6, X + aesenc KEY7, X + aesenc KEY8, X + aesenc KEY9, X + aesenclast KEY10, X + + movups X, (DST) + add $16, SRC + add $16, DST + test LENGTH, LENGTH + jz .Lend + +.Lblock_loop: + movups (SRC), X + movups 16(SRC), Y + pxor KEY0, X + pxor KEY0, Y + aesenc KEY1, X + aesenc KEY1, Y + aesenc KEY2, X + aesenc KEY2, Y + aesenc KEY3, X + aesenc KEY3, Y + aesenc KEY4, X + aesenc KEY4, Y + aesenc KEY5, X + aesenc KEY5, Y + aesenc KEY6, X + aesenc KEY6, Y + aesenc KEY7, X + aesenc KEY7, Y + aesenc KEY8, X + aesenc KEY8, Y + aesenc KEY9, X + aesenc KEY9, Y + aesenclast KEY10, X + aesenclast KEY10, Y + + movups X, (DST) + movups Y, 16(DST) + add $32, SRC + add $32, DST + dec LENGTH + jnz .Lblock_loop + +.Lend: + W64_EXIT(4, 13) + ret +EPILOGUE(nettle_aes128_encrypt) diff --git a/x86_64/aesni/cbc-aes128-encrypt.asm b/x86_64/aesni/cbc-aes128-encrypt.asm new file 
mode 100644 index 00000000..04c6c6b0 --- /dev/null +++ b/x86_64/aesni/cbc-aes128-encrypt.asm @@ -0,0 +1,108 @@ +C x86_64/aesni/cbc-aes128-encrypt.asm + +ifelse(` + Copyright (C) 2015, 2018, 2021 Niels Möller + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + +C Input argument +define(`CTX', `%rdi') +define(`LENGTH',`%rsi') +define(`DST', `%rdx') +define(`SRC', `%rcx') + +define(`KEY0', `%xmm0') +define(`KEY1', `%xmm1') +define(`KEY2', `%xmm2') +define(`KEY3', `%xmm3') +define(`KEY4', `%xmm4') +define(`KEY5', `%xmm5') +define(`KEY6', `%xmm6') +define(`KEY7', `%xmm7') +define(`KEY8', `%xmm8') +define(`KEY9', `%xmm9') +define(`KEY10', `%xmm10') +define(`X', `%xmm11') +define(`BLOCK', `%xmm12') + + .file "cbc-aes128-encrypt.asm" + + C nettle_cbc_aes128_encrypt(struct cbc_aes128_ctx *ctx, + C size_t length, uint8_t *dst, + C const uint8_t *src); + + .text + ALIGN(16) +PROLOGUE(nettle_cbc_aes128_encrypt) + W64_ENTRY(4, 13) + shr $4, LENGTH + test LENGTH, LENGTH + jz .Lend + + movups (CTX), KEY0 + movups 16(CTX), KEY1 + movups 32(CTX), KEY2 + movups 48(CTX), KEY3 + movups 64(CTX), KEY4 + movups 80(CTX), KEY5 + movups 96(CTX), KEY6 + movups 112(CTX), KEY7 + movups 128(CTX), KEY8 + movups 144(CTX), KEY9 + movups 160(CTX), KEY10 + movups 176(CTX), X C Load IV + +.Lblock_loop: + movups (SRC), BLOCK C Cleartext block + pxor BLOCK, X + pxor KEY0, X + aesenc KEY1, X + aesenc KEY2, X + aesenc KEY3, X + aesenc KEY4, X + aesenc KEY5, X + aesenc KEY6, X + aesenc KEY7, X + aesenc KEY8, X + aesenc KEY9, X + aesenclast KEY10, X + + movups X, (DST) + add $16, SRC + add $16, DST + + dec LENGTH + jnz .Lblock_loop + + C Save IV + movups X, 176(CTX) + +.Lend: + W64_EXIT(4, 13) + ret +EPILOGUE(nettle_cbc_aes128_encrypt)