This is a stand-alone patch that applies all the previous patches to the optimized GCM implementation. This patch is based on the master upstream so it can be merged directly. It passes the testsuite and yields the expected performance.
---
 configure.ac               |   5 +-
 fat-ppc.c                  |  44 +++
 fat-setup.h                |   9 +
 gcm.c                      |  82 +++-
 powerpc64/README           |   2 +-
 powerpc64/fat/gcm-hash.asm |  40 ++
 powerpc64/p8/gcm-hash.asm  | 956 +++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 1121 insertions(+), 17 deletions(-)
 create mode 100644 powerpc64/fat/gcm-hash.asm
 create mode 100644 powerpc64/p8/gcm-hash.asm
diff --git a/configure.ac b/configure.ac index e9983697..13cfe33a 100644 --- a/configure.ac +++ b/configure.ac @@ -488,7 +488,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \ sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
# Assembler files which generate additional object files if they are used. -asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \ +asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm \ aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \ chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \ salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \ @@ -612,7 +612,8 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_ecc_secp384r1_redc #undef HAVE_NATIVE_ecc_secp521r1_modp #undef HAVE_NATIVE_ecc_secp521r1_redc -#undef HAVE_NATIVE_gcm_init_key8 +#undef HAVE_NATIVE_gcm_init_key +#undef HAVE_NATIVE_gcm_hash #undef HAVE_NATIVE_gcm_hash8 #undef HAVE_NATIVE_gcm_fill #undef HAVE_NATIVE_salsa20_core diff --git a/fat-ppc.c b/fat-ppc.c index 2bc50481..e69dff95 100644 --- a/fat-ppc.c +++ b/fat-ppc.c @@ -120,6 +120,20 @@ DECLARE_FAT_FUNC(_nettle_aes_decrypt, aes_crypt_internal_func) DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, c) DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, ppc64)
+#if GCM_TABLE_BITS == 8 +DECLARE_FAT_FUNC(_nettle_gcm_init_key, gcm_init_key_func) +DECLARE_FAT_FUNC_VAR(gcm_init_key, gcm_init_key_func, c) +DECLARE_FAT_FUNC_VAR(gcm_init_key, gcm_init_key_func, ppc64) + +DECLARE_FAT_FUNC(_nettle_gcm_hash, gcm_hash_func) +DECLARE_FAT_FUNC_VAR(gcm_hash, gcm_hash_func, c) +DECLARE_FAT_FUNC_VAR(gcm_hash, gcm_hash_func, ppc64) +#endif /* GCM_TABLE_BITS == 8 */ + +DECLARE_FAT_FUNC(_nettle_gcm_fill, gcm_fill_func) +DECLARE_FAT_FUNC_VAR(gcm_fill, gcm_fill_func, c) +DECLARE_FAT_FUNC_VAR(gcm_fill, gcm_fill_func, ppc64) + static void CONSTRUCTOR fat_init (void) { @@ -139,11 +153,25 @@ fat_init (void) fprintf (stderr, "libnettle: enabling arch 2.07 code.\n"); _nettle_aes_encrypt_vec = _nettle_aes_encrypt_ppc64; _nettle_aes_decrypt_vec = _nettle_aes_decrypt_ppc64; +#if GCM_TABLE_BITS == 8 + /* Make sure _nettle_gcm_init_key_vec function is compatible + with _nettle_gcm_hash_vec function e.g. _nettle_gcm_init_key_c() + fills gcm_key table with values that are incompatible with + _nettle_gcm_hash_ppc64() */ + _nettle_gcm_init_key_vec = _nettle_gcm_init_key_ppc64; + _nettle_gcm_hash_vec = _nettle_gcm_hash_ppc64; +#endif /* GCM_TABLE_BITS == 8 */ + _nettle_gcm_fill_vec = _nettle_gcm_fill_ppc64; } else { _nettle_aes_encrypt_vec = _nettle_aes_encrypt_c; _nettle_aes_decrypt_vec = _nettle_aes_decrypt_c; +#if GCM_TABLE_BITS == 8 + _nettle_gcm_init_key_vec = _nettle_gcm_init_key_c; + _nettle_gcm_hash_vec = _nettle_gcm_hash_c; +#endif /* GCM_TABLE_BITS == 8 */ + _nettle_gcm_fill_vec = _nettle_gcm_fill_c; } }
@@ -160,3 +188,19 @@ DEFINE_FAT_FUNC(_nettle_aes_decrypt, void, size_t length, uint8_t *dst, const uint8_t *src), (rounds, keys, T, length, dst, src)) + + #if GCM_TABLE_BITS == 8 +DEFINE_FAT_FUNC(_nettle_gcm_init_key, void, + (union nettle_block16 *table), + (table)) + +DEFINE_FAT_FUNC(_nettle_gcm_hash, void, + (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data), + (key, x, length, data)) +#endif /* GCM_TABLE_BITS == 8 */ + +DEFINE_FAT_FUNC(_nettle_gcm_fill, void, + (uint8_t *ctr, size_t blocks, + union nettle_block16 *buffer), + (ctr, blocks, buffer)) diff --git a/fat-setup.h b/fat-setup.h index 99f1ea67..623f9579 100644 --- a/fat-setup.h +++ b/fat-setup.h @@ -162,6 +162,15 @@ typedef void aes_crypt_internal_func (unsigned rounds, const uint32_t *keys, size_t length, uint8_t *dst, const uint8_t *src);
+#if GCM_TABLE_BITS == 8 +typedef void gcm_init_key_func (union nettle_block16 *table); + +typedef void gcm_hash_func (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data); +#endif /* GCM_TABLE_BITS == 8 */ + +typedef void gcm_fill_func (uint8_t *ctr, size_t blocks, union nettle_block16 *buffer); + typedef void *(memxor_func)(void *dst, const void *src, size_t n);
typedef void salsa20_core_func (uint32_t *dst, const uint32_t *src, unsigned rounds); diff --git a/gcm.c b/gcm.c index cf615daf..935d4420 100644 --- a/gcm.c +++ b/gcm.c @@ -140,6 +140,26 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table) memcpy (x->b, Z.b, sizeof(Z)); } # elif GCM_TABLE_BITS == 8 +# if HAVE_NATIVE_gcm_init_key + +#define gcm_init_key _nettle_gcm_init_key +void +_nettle_gcm_init_key (union nettle_block16 *table); +/* For fat builds */ +void +_nettle_gcm_init_key_c (union nettle_block16 *table); +# endif /* HAVE_NATIVE_gcm_init_key */ +# if HAVE_NATIVE_gcm_hash + +#define gcm_hash _nettle_gcm_hash +void +_nettle_gcm_hash (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data); +/* For fat builds */ +void +_nettle_gcm_hash_c (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data); +# endif /* HAVE_NATIVE_gcm_hash */ # if HAVE_NATIVE_gcm_hash8
#define gcm_hash _nettle_gcm_hash8 @@ -225,9 +245,45 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table)
#endif /* GCM_TABLE_BITS */
+#if HAVE_NATIVE_gcm_fill + +#define gcm_fill _nettle_gcm_fill +void +_nettle_gcm_fill (uint8_t *ctr, size_t blocks, union nettle_block16 *buffer); +/* For fat builds */ +void +_nettle_gcm_fill_c (uint8_t *ctr, size_t blocks, union nettle_block16 *buffer); +#endif /* HAVE_NATIVE_gcm_fill */ + /* Increment the rightmost 32 bits. */ #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4)
+#ifdef gcm_init_key +void +_nettle_gcm_init_key_c(union nettle_block16 *table) +#else +static void +gcm_init_key(union nettle_block16 *table) +#endif /* !gcm_init_key */ +{ +#if GCM_TABLE_BITS + /* Middle element if GCM_TABLE_BITS > 0, otherwise the first + element */ + unsigned i = (1<<GCM_TABLE_BITS)/2; + + /* Algorithm 3 from the gcm paper. First do powers of two, then do + the rest by adding. */ + while (i /= 2) + block16_mulx_ghash(&table[i], &table[2*i]); + for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2) + { + unsigned j; + for (j = 1; j < i; j++) + block16_xor3(&table[i+j], &table[i], &table[j]); + } +#endif +} + /* Initialization of GCM. * @ctx: The context of GCM * @cipher: The context of the underlying block cipher @@ -245,24 +301,18 @@ gcm_set_key(struct gcm_key *key, memset(key->h[0].b, 0, GCM_BLOCK_SIZE); f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b);
-#if GCM_TABLE_BITS - /* Algorithm 3 from the gcm paper. First do powers of two, then do - the rest by adding. */ - while (i /= 2) - block16_mulx_ghash(&key->h[i], &key->h[2*i]); - for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2) - { - unsigned j; - for (j = 1; j < i; j++) - block16_xor3(&key->h[i+j], &key->h[i],&key->h[j]); - } -#endif + gcm_init_key(key->h); }
-#ifndef gcm_hash +#ifdef gcm_hash +void +_nettle_gcm_hash_c(const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data) +#else static void gcm_hash(const struct gcm_key *key, union nettle_block16 *x, size_t length, const uint8_t *data) +#endif /* !gcm_hash */ { for (; length >= GCM_BLOCK_SIZE; length -= GCM_BLOCK_SIZE, data += GCM_BLOCK_SIZE) @@ -276,7 +326,6 @@ gcm_hash(const struct gcm_key *key, union nettle_block16 *x, gcm_gf_mul (x, key->h); } } -#endif /* !gcm_hash */
static void gcm_hash_sizes(const struct gcm_key *key, union nettle_block16 *x, @@ -333,9 +382,14 @@ gcm_update(struct gcm_ctx *ctx, const struct gcm_key *key, ctx->auth_size += length; }
+#ifdef gcm_fill +void +_nettle_gcm_fill_c(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer) +#else static nettle_fill16_func gcm_fill; static void gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer) +#endif /* !gcm_fill */ { uint32_t c;
diff --git a/powerpc64/README b/powerpc64/README index 7301953b..fac1108b 100644 --- a/powerpc64/README +++ b/powerpc64/README @@ -52,7 +52,7 @@ storage operands, refer to "6.4.1 Accessing Unaligned Storage Operands" in [3] to see an example of accessing unaligned storage operands. "lxvd2x/stxvd2x" can be used to load/store data into unaligned storage operands but permuting is needed for loading and storing data in -little-endian mode VSX registers are defined with "X" suffix +little-endian mode
Function Prologue
diff --git a/powerpc64/fat/gcm-hash.asm b/powerpc64/fat/gcm-hash.asm new file mode 100644 index 00000000..2d9f281e --- /dev/null +++ b/powerpc64/fat/gcm-hash.asm @@ -0,0 +1,40 @@ +C powerpc64/fat/gcm-hash.asm + + +ifelse(` + Copyright (C) 2020 Mamone Tarsha + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +dnl picked up by configure +dnl PROLOGUE(_nettle_gcm_init_key) +dnl PROLOGUE(_nettle_gcm_hash) +dnl PROLOGUE(_nettle_gcm_fill) + +define(`fat_transform', `$1_ppc64') +include_src(`powerpc64/p8/gcm-hash.asm') diff --git a/powerpc64/p8/gcm-hash.asm b/powerpc64/p8/gcm-hash.asm new file mode 100644 index 00000000..f57cbfe4 --- /dev/null +++ b/powerpc64/p8/gcm-hash.asm @@ -0,0 +1,956 @@ +C powerpc64/p8/gcm-hash.asm + +ifelse(` + Copyright (C) 2020 Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + +C Alignment of gcm_key table elements, which is declared in gcm.h +define(`TableElemAlign', `0x100') + +C Register usage: + +define(`SP', `r1') +define(`TOCP', `r2') + +define(`TABLE', `r3') +define(`X', `r4') +define(`LENGTH', `r5') +define(`DATA', `r6') + +define(`zero', `v0') +define(`swap_mask', `v1') +define(`hidw_mask', `v2') +define(`lodw_mask', `v3') +define(`poly', `v4') +define(`poly_h', `v4') +define(`poly_l', `v5') +define(`RP', `v6') +define(`Mh', `v7') +define(`Ml', `v8') +define(`H', `v9') +define(`Hh', `v10') +define(`Hl', `v11') +define(`RP2', `v9') +define(`M2h', `v10') +define(`M2l', `v11') + +define(`sl1', `v1') +define(`msb', `v5') +define(`H2', `v6') +define(`H2h', `v7') +define(`H2l', `v8') +define(`H_h', `v12') +define(`H_m', `v13') +define(`H_l', `v14') +define(`H_Hh', `v12') +define(`H_H', `v13') +define(`H_Hl', `v14') +define(`H_t', `v15') +define(`H2_h', `v16') +define(`H2_m', `v17') +define(`H2_l', `v18') +define(`H2_t', `v19') + +define(`C0', `v6') +define(`C1', `v7') +define(`C2', `v8') +define(`C3', `v12') +define(`C4', `v6') +define(`C5', `v7') +define(`C6', `v8') +define(`C7', `v12') + +define(`C', `v13') + +define(`Ch', `v14') +define(`Cl', `v15') +define(`Cm', `v16') + +define(`C01h', `v14') +define(`C01l', `v15') +define(`C01', `v16') +define(`C23h', `v17') +define(`C23l', `v18') +define(`C23', `v19') +define(`C45h', `v20') +define(`C45l', `v21') +define(`C45', `v22') +define(`C67h', `v6') +define(`C67l', `v7') +define(`C67', `v8') + +define(`H21', `v9') +define(`H21h', `v10') +define(`H21l', `v11') +define(`H43', `v23') +define(`H43h', `v24') +define(`H43l', `v25') +define(`H65', `v26') +define(`H65h', `v27') +define(`H65l', `v28') +define(`H87', `v29') +define(`H87h', `v30') +define(`H87l', `v31') + +# gcm_fill registers: + +define(`CTR', `r3') +define(`BLOCKS', `r4') +define(`BUFFER', `r5') + +define(`CTR0', `v2') +define(`CTR0S', `v3') +define(`CTR1', `v4') +define(`CTR2', `v5') +define(`CTR3', `v6') +define(`CTR4', `v7') +define(`CTR5', `v8') +define(`CTR6', `v9') +define(`CTR7', `v10') + +define(`I1', `v11') +define(`I2', `v12') +define(`I3', `v13') +define(`I4', `v14') +define(`I5', `v15') +define(`I6', `v16') +define(`I7', `v17') +define(`I8', `v18') + +.file "gcm-hash.asm" + +.text + + # void gcm_init_key (union gcm_block *table) + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_init_key) + DATA_LOAD_VEC(poly,.polynomial,r7) +IF_LE(`DATA_LOAD_VEC(swap_mask,.swap_mask,r7)') + DATA_LOAD_VEC(hidw_mask,.hidw_mask,r7) + DATA_LOAD_VEC(lodw_mask,.lodw_mask,r7) + + li r10,8*TableElemAlign + lxvd2x VSR(H),r10,TABLE # load H +IF_LE(`vperm H,H,H,swap_mask') + + # --- calculate H = H shift left 1 modulo polynomial --- + + vupkhsw msb,H # most significant bit word-extend + vspltisb sl1,1 # splat 1 for shift left + vspltw msb,msb,0 # most significant bit extend + vsl H,H,sl1 # H shift left 1 + vand msb,msb,poly + vxor zero,zero,zero + vxor H_t,H,msb + + vsldoi H,H_t,H_t,8 # doubleword swap + vsldoi Hh,H,zero,8 + vsldoi Hl,zero,H,8 + + # --- calculate H^2 = H*H --- + + # reduction pre-processing + vsldoi poly_h,zero,poly,8 + vsldoi poly_l,poly_h,poly_h,8 + + # polynomial multiplication "classical" + vpmsumd H_h,H_t,Hh # H^1h*H^1h + vpmsumd H_l,H_t,Hl # H^1l*H^1l + vpmsumd H_m,H_t,H # H^1h*H^1l⊕H^1l*H^1h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] + vsldoi Ml,H_m,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor H_h,H_h,Mh # [2] + vxor H_l,H_l,Ml # [2] + 
vxor H_l,H_l,RP # [1] + + # reduction second phase + vpmsumd RP,H_l,poly_l + vxor H_h,H_l,H_h + vxor H2_t,H_h,RP + + vsldoi H2,H2_t,H2_t,8 + vsldoi H2h,H2,zero,8 + vsldoi H2l,zero,H2,8 + + # --- calculate [H^2.Hi⊕H^2.Lo:H^1.Hi⊕H^1.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store H,[H^2.Hi⊕H^2.Lo:H^1.Hi⊕H^1.Lo] --- + + li r8,0*TableElemAlign + li r9,1*TableElemAlign + li r10,2*TableElemAlign + stxvd2x VSR(Hl),r8,TABLE + stxvd2x VSR(H),r9,TABLE + stxvd2x VSR(Hh),r10,TABLE + + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + stxvd2x VSR(H_Hh),r8,TABLE + stxvd2x VSR(H_H),r9,TABLE + stxvd2x VSR(H_Hl),r10,TABLE + + # --- calculate H^3,H^4 --- + + # polynomial multiplication "classical" + vpmsumd H_l,H_t,H2l # H^1l*H^2l + vpmsumd H_m,H_t,H2 # H^1h*H^2l⊕H^1l*H^2h + vpmsumd H_h,H_t,H2h # H^1h*H^2h + vpmsumd H2_l,H2_t,H2l # H^2l*H^2l + vpmsumd H2_m,H2_t,H2 # H^2h*H^2l⊕H^2l*H^2h + vpmsumd H2_h,H2_t,H2h # H^2h*H^2h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] H^3 + vpmsumd RP2,H2_l,poly_h # [1] H^4 + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] H^3 + vsldoi M2h,zero,H2_m,8 # [2] H^4 + vsldoi Ml,H_m,zero,8 # [2] H^3 + vsldoi M2l,H2_m,zero,8 # [2] H^4 + vsldoi RP,RP,RP,8 # [1] H^3 + vsldoi RP2,RP2,RP2,8 # [1] H^4 + vxor H_h,H_h,Mh # [2] H^3 + vxor H2_h,H2_h,M2h # [2] H^4 + vxor H_l,H_l,Ml # [2] H^3 + vxor H2_l,H2_l,M2l # [2] H^4 + vxor H_l,H_l,RP # [1] H^3 + vxor H2_l,H2_l,RP2 # [1] H^4 + + # reduction second phase + vpmsumd RP,H_l,poly_l # H^3 + vpmsumd RP2,H2_l,poly_l # H^4 + vxor H_h,H_l,H_h # H^3 + vxor H2_h,H2_l,H2_h # H^4 + vxor H_h,H_h,RP # H^3 + vxor H2_h,H2_h,RP2 # H^4 + + vsldoi H2,H2_h,H2_h,8 # H^4 + vsldoi H,H_h,H_h,8 # H^3 + vsldoi H2l,zero,H2,8 # H^4 + vsldoi H2h,H2,zero,8 # H^4 + + # --- calculate [H^4.Hi⊕H^4.Lo:H^3.Hi⊕H^3.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store [H^4.Hi⊕H^4.Lo:H^3.Hi⊕H^3.Lo] --- + + li r8,6*TableElemAlign + li r9,7*TableElemAlign + li r10,8*TableElemAlign + stxvd2x VSR(H_Hh),r8,TABLE + stxvd2x VSR(H_H),r9,TABLE + stxvd2x VSR(H_Hl),r10,TABLE + + # --- calculate H^5,H^6 --- + + # polynomial multiplication "classical" + vpmsumd H_l,H_t,H2l # H^1l*H^4l + vpmsumd H_m,H_t,H2 # H^1h*H^4l⊕H^1l*H^4h + vpmsumd H_h,H_t,H2h # H^1h*H^4h + vpmsumd H2_l,H2_t,H2l # H^2l*H^4l + vpmsumd H2_m,H2_t,H2 # H^2h*H^4l⊕H^2l*H^4h + vpmsumd H2_h,H2_t,H2h # H^2h*H^4h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] H^5 + vpmsumd RP2,H2_l,poly_h # [1] H^6 + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] H^5 + vsldoi M2h,zero,H2_m,8 # [2] H^6 + vsldoi Ml,H_m,zero,8 # [2] H^5 + vsldoi M2l,H2_m,zero,8 # [2] H^6 + vsldoi RP,RP,RP,8 # [1] H^5 + vsldoi RP2,RP2,RP2,8 # [1] H^6 + vxor H_h,H_h,Mh # [2] H^5 + vxor H2_h,H2_h,M2h # [2] H^6 + vxor H_l,H_l,Ml # [2] H^5 + vxor H2_l,H2_l,M2l # [2] H^6 + vxor H_l,H_l,RP # [1] H^5 + vxor H2_l,H2_l,RP2 # [1] H^6 + + # reduction second phase + vpmsumd RP,H_l,poly_l # H^5 + vpmsumd RP2,H2_l,poly_l # H^6 + vxor H_h,H_l,H_h # H^5 + vxor H2_h,H2_l,H2_h # H^6 + vxor H_h,H_h,RP # H^5 + vxor H2_h,H2_h,RP2 # H^6 + + vsldoi H2,H2_h,H2_h,8 # H^6 + vsldoi H,H_h,H_h,8 # H^5 + vsldoi H2l,zero,H2,8 # H^6 + vsldoi H2h,H2,zero,8 # H^6 + + # --- calculate [H^6.Hi⊕H^6.Lo:H^5.Hi⊕H^5.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store [H^6.Hi⊕H^6.Lo:H^5.Hi⊕H^5.Lo] --- + + li r8,9*TableElemAlign + 
li r9,10*TableElemAlign + li r10,11*TableElemAlign + stxvd2x VSR(H_Hh),r8,TABLE + stxvd2x VSR(H_H),r9,TABLE + stxvd2x VSR(H_Hl),r10,TABLE + + # --- calculate H^7,H^8 --- + + # polynomial multiplication "classical" + vpmsumd H_l,H_t,H2l # H^1l*H^6l + vpmsumd H_m,H_t,H2 # H^1h*H^6l⊕H^1l*H^6h + vpmsumd H_h,H_t,H2h # H^1h*H^6h + vpmsumd H2_l,H2_t,H2l # H^2l*H^6l + vpmsumd H2_m,H2_t,H2 # H^2h*H^6l⊕H^2l*H^6h + vpmsumd H2_h,H2_t,H2h # H^2h*H^6h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] H^7 + vpmsumd RP2,H2_l,poly_h # [1] H^8 + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] H^7 + vsldoi M2h,zero,H2_m,8 # [2] H^8 + vsldoi Ml,H_m,zero,8 # [2] H^7 + vsldoi M2l,H2_m,zero,8 # [2] H^8 + vsldoi RP,RP,RP,8 # [1] H^7 + vsldoi RP2,RP2,RP2,8 # [1] H^8 + vxor H_h,H_h,Mh # [2] H^7 + vxor H2_h,H2_h,M2h # [2] H^8 + vxor H_l,H_l,Ml # [2] H^7 + vxor H2_l,H2_l,M2l # [2] H^8 + vxor H_l,H_l,RP # [1] H^7 + vxor H2_l,H2_l,RP2 # [1] H^8 + + # reduction second phase + vpmsumd RP,H_l,poly_l # H^7 + vpmsumd RP2,H2_l,poly_l # H^8 + vxor H_h,H_l,H_h # H^7 + vxor H2_h,H2_l,H2_h # H^8 + vxor H_h,H_h,RP # H^7 + vxor H2_h,H2_h,RP2 # H^8 + + vsldoi H,H_h,H_h,8 # H^7 + vsldoi H2,H2_h,H2_h,8 # H^8 + + # --- calculate [H^8.Hi⊕H^8.Lo:H^7.Hi⊕H^7.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store [H^8.Hi⊕H^8.Lo:H^7.Hi⊕H^7.Lo] --- + + li r8,12*TableElemAlign + li r9,13*TableElemAlign + li r10,14*TableElemAlign + stxvd2x VSR(H_Hh),r8,TABLE + stxvd2x VSR(H_H),r9,TABLE + stxvd2x VSR(H_Hl),r10,TABLE + + blr +EPILOGUE(_nettle_gcm_init_key) + + # void gcm_hash (const struct gcm_key *key, union gcm_block *x, + # size_t length, const uint8_t *data) + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_hash) + vxor zero,zero,zero + + DATA_LOAD_VEC(poly,.polynomial,r7) +IF_LE(`DATA_LOAD_VEC(swap_mask,.swap_mask,r7)') + DATA_LOAD_VEC(hidw_mask,.hidw_mask,r7) + DATA_LOAD_VEC(lodw_mask,.lodw_mask,r7) + + vsldoi poly_h,zero,poly,8 + vsldoi poly_l,poly_h,poly_h,8 + + lxvd2x VSR(C),0,X # load X +IF_LE(`vperm C,C,C,swap_mask') + + srdi r7,LENGTH,7 # 8x loop count + cmpldi r7,0 + beq L2x + + # backup registers + stdu SP,-224(SP) + std r28,216(SP) + std r29,208(SP) + std r30,200(SP) + std r31,192(SP) + li 8,176 + stvx 20,8,SP + subi 8,8,16 + stvx 21,8,SP + subi 8,8,16 + stvx 22,8,SP + subi 8,8,16 + stvx 23,8,SP + subi 8,8,16 + stvx 24,8,SP + subi 8,8,16 + stvx 25,8,SP + subi 8,8,16 + stvx 26,8,SP + subi 8,8,16 + stvx 27,8,SP + subi 8,8,16 + stvx 28,8,SP + subi 8,8,16 + stvx 29,8,SP + subi 8,8,16 + stvx 30,8,SP + subi 8,8,16 + stvx 31,8,SP + + # table loading + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + lxvd2x VSR(H21h),r8,TABLE + lxvd2x VSR(H21),r9,TABLE + lxvd2x VSR(H21l),r10,TABLE + li r8,6*TableElemAlign + li r9,7*TableElemAlign + li r10,8*TableElemAlign + lxvd2x VSR(H43h),r8,TABLE + lxvd2x VSR(H43),r9,TABLE + lxvd2x VSR(H43l),r10,TABLE + li r8,9*TableElemAlign + li r9,10*TableElemAlign + li r10,11*TableElemAlign + lxvd2x VSR(H65h),r8,TABLE + lxvd2x VSR(H65),r9,TABLE + lxvd2x VSR(H65l),r10,TABLE + li r8,12*TableElemAlign + li r9,13*TableElemAlign + li r10,14*TableElemAlign + lxvd2x VSR(H87h),r8,TABLE + lxvd2x VSR(H87),r9,TABLE + lxvd2x VSR(H87l),r10,TABLE + + li r8,0x10 + li r9,0x20 + li r10,0x30 + li r28,0x40 + li r29,0x50 + li r30,0x60 + li r31,0x70 + + mtctr r7 +.align 5 +L8x_loop: + # input loading + lxvd2x VSR(C0),0,DATA # load C0 + lxvd2x VSR(C1),r8,DATA # load C1 + lxvd2x VSR(C2),r9,DATA # load C2 + lxvd2x 
VSR(C3),r10,DATA # load C3 + + # swap permuting +IF_LE(`vperm C0,C0,C0,swap_mask + vperm C1,C1,C1,swap_mask + vperm C2,C2,C2,swap_mask + vperm C3,C3,C3,swap_mask') + + # previous digest combining + vxor C0,C0,C + + # polynomial multiplication "karatsuba" pre-processing + vperm C23h,C2,C3,hidw_mask + vperm C23l,C2,C3,lodw_mask + vperm C01h,C0,C1,hidw_mask + vperm C01l,C0,C1,lodw_mask + + # input loading + lxvd2x VSR(C4),r28,DATA # load C4 + lxvd2x VSR(C5),r29,DATA # load C5 + lxvd2x VSR(C6),r30,DATA # load C6 + lxvd2x VSR(C7),r31,DATA # load C7 + + # swap permuting +IF_LE(`vperm C4,C4,C4,swap_mask + vperm C5,C5,C5,swap_mask + vperm C6,C6,C6,swap_mask + vperm C7,C7,C7,swap_mask') + + # polynomial multiplication "karatsuba" pre-processing + vperm C45h,C4,C5,hidw_mask + vperm C45l,C4,C5,lodw_mask + vperm C67h,C6,C7,hidw_mask + vperm C67l,C6,C7,lodw_mask + vxor C23,C23h,C23l + vxor C01,C01h,C01l + vxor C45,C45h,C45l + vxor C67,C67h,C67l + + # polynomial multiplication "karatsuba" + vpmsumd C23h,C23h,H65h # H23 = H^6h*C2h⊕H^5h*C3h + vpmsumd C23l,C23l,H65l # L23 = H^6l*C2l⊕H^5l*C3l + vpmsumd C01h,C01h,H87h # H01 = H^8h*C0h⊕H^7h*C1h + vpmsumd C01l,C01l,H87l # L01 = H^8l*C0l⊕H^7l*C1l + vpmsumd C67h,C67h,H21h # H67 = H^2h*C6h⊕H^1h*C7h + vpmsumd C67l,C67l,H21l # L67 = H^2l*C6l⊕H^1l*C7l + vpmsumd C45h,C45h,H43h # H45 = H^4h*C4h⊕H^3h*C5h + vpmsumd C45l,C45l,H43l # L45 = H^4l*C4l⊕H^3l*C5l + vpmsumd C23,C23,H65 # M23 = (H^6h⊕H^5h)*(C2h⊕C3h)⊕(H^6l⊕H^5l)*(C2l⊕C3l) + vpmsumd C01,C01,H87 # M01 = (H^8h⊕H^7h)*(C0h⊕C1h)⊕(H^8l⊕H^7l)*(C0l⊕C1l) + vpmsumd C45,C45,H43 # M45 = (H^4h⊕H^3h)*(C4h⊕C5h)⊕(H^4l⊕H^3l)*(C4l⊕C5l) + vpmsumd C67,C67,H21 # M67 = (H^2h⊕H^1h)*(C6h⊕C7h)⊕(H^2l⊕H^1l)*(C6l⊕C7l) + + # polynomial multiplication "karatsuba" post-processing + vxor C23,C23,C23h + vxor C01,C01,C01h + vxor C45,C45,C45h + vxor C67,C67,C67h + vxor C23,C23,C23l + vxor C01,C01,C01l + vxor C45,C45,C45l + vxor C67,C67,C67l + + # deferred recombination of partial products + vxor C01h,C01h,C23h # H0 = H01⊕H23 + vxor C45h,C45h,C67h # H1 = H45⊕H67 + vxor C01l,C01l,C23l # L0 = L01⊕L23 + vxor C45l,C45l,C67l # L1 = L45⊕L45 + vxor C01,C01,C23 # M0 = M01⊕M23 + vxor C45,C45,C67 # M1 = M45⊕M45 + vxor C01h,C01h,C45h # H = H0⊕H1 + vxor C01l,C01l,C45l # L = L0⊕L1 + vxor C01,C01,C45 # M = M0⊕M1 + + # reduction first phase # [1] + vpmsumd RP,C01l,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,C01,8 # [2] + vsldoi Ml,C01,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor C01h,C01h,Mh # [2] + vxor C01l,C01l,Ml # [2] + vxor C01l,C01l,RP # [1] + + # reduction second phase + vpmsumd RP,C01l,poly_l + vxor C01h,C01l,C01h + vxor C,C01h,RP + + addi DATA,DATA,0x80 + bdnz L8x_loop + + # restore registers + li 8,0 + lvx 31,8,SP + addi 8,8,16 + lvx 30,8,SP + addi 8,8,16 + lvx 29,8,SP + addi 8,8,16 + lvx 28,8,SP + addi 8,8,16 + lvx 27,8,SP + addi 8,8,16 + lvx 26,8,SP + addi 8,8,16 + lvx 25,8,SP + addi 8,8,16 + lvx 24,8,SP + addi 8,8,16 + lvx 23,8,SP + addi 8,8,16 + lvx 22,8,SP + addi 8,8,16 + lvx 21,8,SP + addi 8,8,16 + lvx 20,8,SP + ld r31,192(SP) + ld r30,200(SP) + ld r29,208(SP) + ld r28,216(SP) + addi SP,SP,224 + + clrldi LENGTH,LENGTH,57 +L2x: + srdi r7,LENGTH,5 + cmpldi r7,0 + beq L1x + + # table loading + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + lxvd2x VSR(H21h),r8,TABLE + lxvd2x VSR(H21),r9,TABLE + lxvd2x VSR(H21l),r10,TABLE + + li r10,0x10 + + mtctr r7 +.align 5 +L2x_loop: + # input loading + lxvd2x VSR(C0),0,DATA # load C0 + lxvd2x VSR(C1),r10,DATA # load C1 + + # swap permuting 
+IF_LE(`vperm C0,C0,C0,swap_mask + vperm C1,C1,C1,swap_mask') + + # previous digest combining + vxor C0,C0,C + + # polynomial multiplication "karatsuba" pre-processing + vperm C01h,C0,C1,hidw_mask + vperm C01l,C0,C1,lodw_mask + vxor C01,C01h,C01l + + # polynomial multiplication "karatsuba" + vpmsumd C01h,C01h,H21h # H01 = H^2h*C0h⊕H^1h*C1h + vpmsumd C01l,C01l,H21l # L01 = H^2l*C0l⊕H^1l*C1l + vpmsumd C01,C01,H21 # M01 = (H^2h⊕H^1h)*(C0h⊕C1h)⊕(H^2l⊕H^1l)*(C0l⊕C1l) + + # polynomial multiplication "karatsuba" post-processing + vxor C01,C01,C01h + vxor C01,C01,C01l + + # reduction first phase # [1] + vpmsumd RP,C01l,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,C01,8 # [2] + vsldoi Ml,C01,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor C01h,C01h,Mh # [2] + vxor C01l,C01l,Ml # [2] + vxor C01l,C01l,RP # [1] + + # reduction second phase + vpmsumd RP,C01l,poly_l + vxor C01h,C01l,C01h + vxor C,C01h,RP + + addi DATA,DATA,0x20 + bdnz L2x_loop + + clrldi LENGTH,LENGTH,59 +L1x: + srdi r7,LENGTH,4 + cmpldi r7,0 + beq Lrem + + # table loading + li r9,1*TableElemAlign + li r10,2*TableElemAlign + lxvd2x VSR(Hl),0,TABLE + lxvd2x VSR(H), r9,TABLE + lxvd2x VSR(Hh),r10,TABLE + + # input loading + lxvd2x VSR(C0),0,DATA # load C0 + + # swap permuting +IF_LE(`vperm C0,C0,C0,swap_mask') + + # previous digest combining + vxor C0,C0,C + + vpmsumd Cl,C0,Hl # L = Hl*Cl + vpmsumd Cm,C0,H # M = Hh*Cl⊕Hl*Ch + vpmsumd Ch,C0,Hh # H = Hh*Ch + + # reduction first phase # [1] + vpmsumd RP,Cl,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,Cm,8 # [2] + vsldoi Ml,Cm,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor Ch,Ch,Mh # [2] + vxor Cl,Cl,Ml # [2] + vxor Cl,Cl,RP # [1] + + # reduction second phase + vpmsumd RP,Cl,poly_l + vxor Ch,Cl,Ch + vxor C,Ch,RP + + addi DATA,DATA,0x10 + clrldi LENGTH,LENGTH,60 +Lrem: + cmpldi LENGTH,0 + beq Ldone + + # table loading + li r9,1*TableElemAlign + li r10,2*TableElemAlign + lxvd2x VSR(Hl),0,TABLE + lxvd2x VSR(H), r9,TABLE + lxvd2x VSR(Hh),r10,TABLE + + # input loading + stdu SP,-16(SP) + stvx zero,0,SP +Lst_loop: + subic. 
LENGTH,LENGTH,1 + lbzx r7,LENGTH,DATA + stbx r7,LENGTH,SP + bne Lst_loop + lxvd2x VSR(C0),0,SP + addi SP,SP,16 + + # swap permuting +IF_LE(`vperm C0,C0,C0,swap_mask') + + # previous digest combining + vxor C0,C0,C + + vpmsumd Cl,C0,Hl # L = Hl*Cl + vpmsumd Cm,C0,H # M = Hh*Cl⊕Hl*Ch + vpmsumd Ch,C0,Hh # H = Hh*Ch + + # reduction first phase # [1] + vpmsumd RP,Cl,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,Cm,8 # [2] + vsldoi Ml,Cm,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor Ch,Ch,Mh # [2] + vxor Cl,Cl,Ml # [2] + vxor Cl,Cl,RP # [1] + + # reduction second phase + vpmsumd RP,Cl,poly_l + vxor Ch,Cl,Ch + vxor C,Ch,RP + +Ldone: +IF_LE(`vperm C,C,C,swap_mask') + stxvd2x VSR(C),0,X # store C + blr +EPILOGUE(_nettle_gcm_hash) + + # gcm_fill (uint8_t *ctr, size_t blocks, union gcm_block *buffer) + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_fill) +IF_LE(`DATA_LOAD_VEC(swap_mask,.swap_mask,r6)') + + vxor zero,zero,zero + vspltisb I1,1 + vspltisb I2,2 + vspltisb I3,3 + vspltisb I4,4 + vspltisb I5,5 + vspltisb I6,6 + vspltisb I7,7 + vspltisb I8,8 + vsldoi I1,zero,I1,1 + vsldoi I2,zero,I2,1 + vsldoi I3,zero,I3,1 + vsldoi I4,zero,I4,1 + vsldoi I5,zero,I5,1 + vsldoi I6,zero,I6,1 + vsldoi I7,zero,I7,1 + vsldoi I8,zero,I8,1 + + lxvd2x VSR(CTR0),0,CTR + IF_LE(`vperm CTR0,CTR0,CTR0,swap_mask') + + srdi r6,BLOCKS,3 # 8x loop count + cmpldi r6,0 + beq Lfill_4x + + std r25,-56(SP); + std r26,-48(SP); + std r27,-40(SP); + std r28,-32(SP); + std r29,-24(SP); + std r30,-16(SP); + std r31,-8(SP); + + li r25,0x10 + li r26,0x20 + li r27,0x30 + li r28,0x40 + li r29,0x50 + li r30,0x60 + li r31,0x70 + + mtctr r6 + L8x_fill_loop: + vadduwm CTR1,CTR0,I1 + vadduwm CTR2,CTR0,I2 + vadduwm CTR3,CTR0,I3 + vadduwm CTR4,CTR0,I4 + vadduwm CTR5,CTR0,I5 + vadduwm CTR6,CTR0,I6 + vadduwm CTR7,CTR0,I7 + + IF_LE(`vperm CTR0S,CTR0,CTR0,swap_mask + vperm CTR1,CTR1,CTR1,swap_mask + vperm CTR2,CTR2,CTR2,swap_mask + vperm CTR3,CTR3,CTR3,swap_mask + vperm CTR4,CTR4,CTR4,swap_mask + vperm CTR5,CTR5,CTR5,swap_mask + vperm CTR6,CTR6,CTR6,swap_mask + vperm CTR7,CTR7,CTR7,swap_mask') + + IF_LE(`stxvd2x VSR(CTR0S),0,BUFFER') + IF_BE(`stxvd2x VSR(CTR0),0,BUFFER') + stxvd2x VSR(CTR1),r25,BUFFER + stxvd2x VSR(CTR2),r26,BUFFER + stxvd2x VSR(CTR3),r27,BUFFER + stxvd2x VSR(CTR4),r28,BUFFER + stxvd2x VSR(CTR5),r29,BUFFER + stxvd2x VSR(CTR6),r30,BUFFER + stxvd2x VSR(CTR7),r31,BUFFER + + vadduwm CTR0,CTR0,I8 + addi BUFFER,BUFFER,0x80 + bdnz L8x_fill_loop + + ld r25,-56(SP); + ld r26,-48(SP); + ld r27,-40(SP); + ld r28,-32(SP); + ld r29,-24(SP); + ld r30,-16(SP); + ld r31,-8(SP); + + clrldi BLOCKS,BLOCKS,61 + + Lfill_4x: + srdi r6,BLOCKS,2 + cmpldi r6,0 + beq Lfill_2x + + li r8,0x10 + li r9,0x20 + li r10,0x30 + + vadduwm CTR1,CTR0,I1 + vadduwm CTR2,CTR0,I2 + vadduwm CTR3,CTR0,I3 + + IF_LE(`vperm CTR0S,CTR0,CTR0,swap_mask + vperm CTR1,CTR1,CTR1,swap_mask + vperm CTR2,CTR2,CTR2,swap_mask + vperm CTR3,CTR3,CTR3,swap_mask') + + IF_LE(`stxvd2x VSR(CTR0S),0,BUFFER') + IF_BE(`stxvd2x VSR(CTR0),0,BUFFER') + stxvd2x VSR(CTR1),r8,BUFFER + stxvd2x VSR(CTR2),r9,BUFFER + stxvd2x VSR(CTR3),r10,BUFFER + + vadduwm CTR0,CTR0,I4 + addi BUFFER,BUFFER,0x40 + + clrldi BLOCKS,BLOCKS,62 + + Lfill_2x: + srdi r6,BLOCKS,1 + cmpldi r6,0 + beq Lfill_1x + + li r10,0x10 + + vadduwm CTR1,CTR0,I1 + + IF_LE(`vperm CTR0S,CTR0,CTR0,swap_mask + vperm CTR1,CTR1,CTR1,swap_mask') + + IF_LE(`stxvd2x VSR(CTR0S),0,BUFFER') + IF_BE(`stxvd2x VSR(CTR0),0,BUFFER') + stxvd2x VSR(CTR1),r10,BUFFER + + vadduwm CTR0,CTR0,I2 + addi BUFFER,BUFFER,0x20 + + clrldi 
BLOCKS,BLOCKS,63 + + Lfill_1x: + cmpldi BLOCKS,0 + beq Lfill_done + + IF_LE(`vperm CTR0S,CTR0,CTR0,swap_mask') + + IF_LE(`stxvd2x VSR(CTR0S),0,BUFFER') + IF_BE(`stxvd2x VSR(CTR0),0,BUFFER') + + vadduwm CTR0,CTR0,I1 + + Lfill_done: + IF_LE(`vperm CTR0,CTR0,CTR0,swap_mask') + stxvd2x VSR(CTR0),0,CTR + + blr +EPILOGUE(_nettle_gcm_fill) + + .data +IF_LE(`.align 4 +.polynomial: + .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 + .align 4 +.swap_mask: + .byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7 + .align 4 +.hidw_mask: + .byte 23,22,21,20,19,18,17,16,7,6,5,4,3,2,1,0 + .align 4 +.lodw_mask: + .byte 31,30,29,28,27,26,25,24,15,14,13,12,11,10,9,8') +IF_BE(`.align 4 +.polynomial: + .byte 0xc2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 + .align 4 +.hidw_mask: + .byte 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23 + .align 4 +.lodw_mask: + .byte 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31')
Maamoun TK <maamoun.tk@googlemail.com> writes:
This is a stand-alone patch that applies all the previous patches to the optimized GCM implementation. This patch is based on the master upstream so it can be merged directly.
Some questions on the overall structure:
What's the speedup you get from assembly gcm_fill? I see the C implementation uses memcpy and WRITE_UINT32, and is likely significantly slower than the ctr_fill16 in ctr.c. But it could be improved using portable means. If done well, it should be a very small fraction of the cpu time spent for gcm encryption.
What table layout is used by the assembly gcm_init_key? I would have expected that not many tables would be needed when using the special multiply instructions. And Nettle generally doesn't try very hard to optimize key setup, under the theory that applications that need high performance also do a lot of work with each key. E.g., we use C code for AES key setup even when it could be sped up by assembly using special instructions.
So it would be easier for me to start with a patch for gcm_hash only (possibly with supporting C code for key setup).
Regards, /Niels
What's the speedup you get from assembly gcm_fill? I see the C implementation uses memcpy and WRITE_UINT32, and is likely significantly slower than the ctr_fill16 in ctr.c. But it could be improved using portable means. If done well, it should be a very small fraction of the cpu time spent for gcm encryption.
Actually, there is a counter involved along with the store operations. The idea here is that the width of a single block is 16 bytes, which fits in a vector register, so only one store operation per block is needed, and executing e.g. 6 addition operations per cycle boosts the function's performance. I measured the execution time of both the C and altivec implementations on POWER8 for 32,768 blocks (512 KB), repeated 10000 times and compiled with -O3:

gcm_fill_c() took 0.000073 seconds to execute
gcm_fill_altivec() took 0.000019 seconds to execute

As you can see, the function itself isn't time consuming at all and maybe optimizing it is not worth it, but gcm_fill is part of AES_CTR, and what other libraries usually do is optimize AES_CTR as a whole, so I considered optimizing it to stay on the same track.
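For reference, that kind of number can be reproduced with a small stand-alone harness like the sketch below. It is a hypothetical benchmark, not Nettle's testsuite; fill_ref() only mimics the portable memcpy/WRITE_UINT32 fill rather than calling the static gcm_fill() in gcm.c. Build with e.g. cc -O3; absolute times of course depend on the machine.

#define _POSIX_C_SOURCE 199309L
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <time.h>

#define BLOCKS 32768   /* 512 KiB worth of 16-byte blocks, as above */
#define REPEAT 10000

/* stand-in for union nettle_block16 */
union block16 { uint8_t b[16]; uint64_t u64[2]; };

typedef void fill_func(uint8_t *ctr, size_t blocks, union block16 *buffer);

/* Portable reference: copy the leading 12 bytes of the counter block,
   store the big-endian 32-bit counter, post-increment it, and write the
   final counter value back. */
static void
fill_ref(uint8_t *ctr, size_t blocks, union block16 *buffer)
{
  size_t i;
  uint32_t c = ((uint32_t) ctr[12] << 24) | ((uint32_t) ctr[13] << 16)
    | ((uint32_t) ctr[14] << 8) | ctr[15];
  for (i = 0; i < blocks; i++, c++)
    {
      memcpy(buffer[i].b, ctr, 12);
      buffer[i].b[12] = c >> 24; buffer[i].b[13] = c >> 16;
      buffer[i].b[14] = c >> 8;  buffer[i].b[15] = c;
    }
  ctr[12] = c >> 24; ctr[13] = c >> 16; ctr[14] = c >> 8; ctr[15] = c;
}

/* Average seconds per call, i.e. per BLOCKS blocks. */
static double
time_fill(fill_func *fill, union block16 *buf)
{
  uint8_t ctr[16] = {0};
  struct timespec t0, t1;
  int r;
  clock_gettime(CLOCK_MONOTONIC, &t0);
  for (r = 0; r < REPEAT; r++)
    fill(ctr, BLOCKS, buf);
  clock_gettime(CLOCK_MONOTONIC, &t1);
  return ((t1.tv_sec - t0.tv_sec) + 1e-9 * (t1.tv_nsec - t0.tv_nsec)) / REPEAT;
}

int
main(void)
{
  static union block16 buf[BLOCKS];
  printf("fill_ref: %f seconds per %u blocks\n",
         time_fill(fill_ref, buf), (unsigned) BLOCKS);
  return 0;
}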
What table layout is used by the assembly gcm_init_key? I would have expected that not many tables would be needed when using the special multiply instructions. And Nettle generally doesn't try very hard to optimize key setup, under the theory that applications that need high performance also do a lot of work with each key. E.g., we use C code for AES key setup even when it could be sped up by assembly using special instructions.
So it would be easier for me to start with a patch for gcm_hash only (possibly with supporting C code for key setup).
It's a little bit more complicated than just using the special multiply instructions; let me explain.
From reference [1]:
To compute the GHASH digest of 4 consecutive blocks, we use a method of parallelization as described in References [3]. The method can be described by the following equations:

Ciphertext inputs: C0, C1, C2, C3. Digest input/output: Digest.

Digest = (((((((Digest⊕C0)*H)⊕C1)*H)⊕C2)*H)⊕C3)*H
       = ((Digest⊕C0)*H^4) ⊕ (C1*H^3) ⊕ (C2*H^2) ⊕ (C3*H)
As you can see, there are pre-computed constants here, "H, H^2, H^3, H^4", which should settle in a table to be used by subsequent hash operations. Furthermore, to handle the bit-reflection of the multiplication product, we precompute "H << 1 modulo the polynomial g(x)" and then use the identity reflected(A) * reflected(H << 1 mod g(x)) = reflected(A*H) mod g(x) to pre-compute "H^2, H^3, H^4": starting from H = H << 1 mod g(x), we get H^2 = H * H and so on. We can use these values as 128-bit constants and avoid shifting the product. Also, in the init function we have to precompute the inputs to the Karatsuba algorithm, so we adapt the previous constants to be compatible with it. In summary, the table layout varies according to the techniques used in the different implementations, so it's not practical to fill such a table with generic C code as a standard key setup; for example, there are several x86 GCM implementations in the OpenSSL library, and the table layouts of the SSE and AVX implementations differ from each other. A rough portable sketch of the idea follows below.
[1] https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/co...
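To make the scheme above concrete, here is a rough, portable C sketch. It is an illustration only: gf128_mul(), ghash_precompute() and ghash_update4() are hypothetical names, not Nettle functions, and this is neither the table layout nor the Karatsuba/vpmsumd arrangement that the PPC assembly actually stores. It precomputes H^1..H^4 once at key setup and then folds four blocks per aggregated round, exactly as in the equation quoted from [1].

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Bit-wise GF(2^128) multiply in the GHASH bit order (bit 0 = most
   significant bit of byte 0), i.e. Algorithm 1 from the GCM spec.
   Orders of magnitude slower than a vpmsumd-based multiply. */
static void
gf128_mul(uint8_t z[16], const uint8_t x[16], const uint8_t y[16])
{
  uint8_t v[16], r[16] = {0};
  int i, j, k;
  memcpy(v, y, 16);
  for (i = 0; i < 16; i++)
    for (j = 7; j >= 0; j--)
      {
        int lsb = v[15] & 1;
        if ((x[i] >> j) & 1)        /* r ^= v for each set bit of x */
          for (k = 0; k < 16; k++)
            r[k] ^= v[k];
        /* v <- v * t mod g(t): shift right, reduce with R = 11100001 || 0^120 */
        for (k = 15; k > 0; k--)
          v[k] = (v[k] >> 1) | (v[k - 1] << 7);
        v[0] >>= 1;
        if (lsb)
          v[0] ^= 0xe1;
      }
  memcpy(z, r, 16);
}

/* Key setup: a conceptual table of the powers H^1..H^4. */
static void
ghash_precompute(uint8_t pow[4][16], const uint8_t h[16])
{
  int i;
  memcpy(pow[0], h, 16);                /* H^1 */
  for (i = 1; i < 4; i++)
    gf128_mul(pow[i], pow[i - 1], h);   /* H^(i+1) = H^i * H */
}

/* One aggregated round over four blocks:
   digest = ((digest⊕C0)*H^4) ⊕ (C1*H^3) ⊕ (C2*H^2) ⊕ (C3*H) */
static void
ghash_update4(uint8_t digest[16], uint8_t pow[4][16], uint8_t c[4][16])
{
  uint8_t t[16], acc[16] = {0};
  int i, k;
  for (i = 0; i < 4; i++)
    {
      memcpy(t, c[i], 16);
      if (i == 0)                       /* fold the running digest into C0 */
        for (k = 0; k < 16; k++)
          t[k] ^= digest[k];
      gf128_mul(t, t, pow[3 - i]);      /* multiply by H^4, H^3, H^2, H^1 */
      for (k = 0; k < 16; k++)
        acc[k] ^= t[k];
    }
  memcpy(digest, acc, 16);
}

int
main(void)
{
  uint8_t h[16], c[4][16], pow[4][16], digest[16] = {0};
  int i;
  for (i = 0; i < 16; i++)
    h[i] = (uint8_t)(i + 1);            /* arbitrary hash subkey, demo only */
  memset(c, 0xab, sizeof(c));           /* arbitrary "ciphertext" blocks */
  ghash_precompute(pow, h);
  ghash_update4(digest, pow, c);
  for (i = 0; i < 16; i++)
    printf("%02x", digest[i]);
  printf("\n");
  return 0;
}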
Maamoun TK <maamoun.tk@googlemail.com> writes:
What's the speedup you get from assembly gcm_fill? I see the C implementation uses memcpy and WRITE_UINT32, and is likely significantly slower than the ctr_fill16 in ctr.c. But it could be improved using portable means. If done well, it should be a very small fraction of the cpu time spent for gcm encryption.
I measured the execution time of both the C and altivec implementations on POWER8 for 32,768 blocks (512 KB), repeated 10000 times and compiled with -O3:

gcm_fill_c() took 0.000073 seconds to execute
gcm_fill_altivec() took 0.000019 seconds to execute

As you can see, the function itself isn't time consuming at all and maybe optimizing it is not worth it,
Can you try the patch below? For now, it is tested on little-endian (x86_64) only, and there the loop compiles to:
  50:   89 c8                   mov    %ecx,%eax
  52:   4c 89 0a                mov    %r9,(%rdx)
  55:   48 83 c2 10             add    $0x10,%rdx
  59:   83 c1 01                add    $0x1,%ecx
  5c:   0f c8                   bswap  %eax
  5e:   48 c1 e0 20             shl    $0x20,%rax
  62:   4c 01 d0                add    %r10,%rax
  65:   48 89 42 f8             mov    %rax,-0x8(%rdx)
  69:   4c 39 c2                cmp    %r8,%rdx
  6c:   75 e2                   jne    50 <gcm_fill+0x20>
Should run in a few cycles per block (6 cycles per block, assuming dual issue and decent out-of-order capabilities). I would expect unrolling, to do multiple blocks in parallel, to give a large performance improvement only on strict in-order processors.
but gcm_fill is part of AES_CTR and what other libraries usually do is optimizing AES_CTR as a whole so I considered optimizing it to stay on the same track.
In Nettle, I strive to go to the extra complexity of assembler implementation only when there's a significant performance benefit.
Regards, /Niels
diff --git a/gcm.c b/gcm.c
index cf615daf..71e9f365 100644
--- a/gcm.c
+++ b/gcm.c
@@ -334,6 +334,46 @@ gcm_update(struct gcm_ctx *ctx, const struct gcm_key *key,
 }
 
 static nettle_fill16_func gcm_fill;
+#if WORDS_BIGENDIAN
+static void
+gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
+{
+  uint64_t hi, mid;
+  uint32_t lo;
+  size_t i;
+  hi = READ_UINT64(ctr);
+  mid = (uint64_t)READ_UINT32(ctr + 8) << 32;
+  lo = READ_UINT32(ctr + 12);
+
+  for (i = 0; i < blocks; i++)
+    {
+      buffer[i].u64[0] = hi;
+      buffer[i].u64[1] = mid + lo++;
+    }
+  WRITE_UINT32(ctr + 12, lo);
+
+}
+#elif HAVE_BUILTIN_BSWAP64
+/* Assume __builtin_bswap32 is also available */
+static void
+gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
+{
+  uint64_t hi, mid;
+  uint32_t lo;
+  size_t i;
+  hi = LE_READ_UINT64(ctr);
+  mid = LE_READ_UINT32(ctr + 8);
+  lo = READ_UINT32(ctr + 12);
+
+  for (i = 0; i < blocks; i++)
+    {
+      buffer[i].u64[0] = hi;
+      buffer[i].u64[1] = mid + ((uint64_t)__builtin_bswap32(lo) << 32);
+      lo++;
+    }
+  WRITE_UINT32(ctr + 12, lo);
+}
+#else
 static void
 gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
 {
@@ -349,6 +389,7 @@ gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
 
   WRITE_UINT32(ctr + GCM_BLOCK_SIZE - 4, c);
 }
+#endif
 
 void
 gcm_encrypt (struct gcm_ctx *ctx, const struct gcm_key *key,
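As a usage note: gcm_fill and gcm_hash are internal helpers, so from an application they are only exercised through the public GCM interface. A minimal sketch of such a caller (key, IV and message below are arbitrary placeholder bytes, not test vectors; link with -lnettle):

#include <stdio.h>
#include <stdint.h>
#include <nettle/gcm.h>

int
main(void)
{
  struct gcm_aes128_ctx ctx;
  uint8_t key[16] = {0};                 /* placeholder 128-bit key */
  uint8_t iv[GCM_IV_SIZE] = {0};         /* placeholder 96-bit IV */
  uint8_t msg[256] = {0};                /* placeholder plaintext */
  uint8_t cipher[sizeof(msg)];
  uint8_t tag[GCM_DIGEST_SIZE];
  size_t i;

  gcm_aes128_set_key(&ctx, key);         /* key setup, including the GHASH key table */
  gcm_aes128_set_iv(&ctx, sizeof(iv), iv);
  /* Encryption generates the counter keystream (gcm_fill) and
     accumulates the GHASH digest (gcm_hash) internally. */
  gcm_aes128_encrypt(&ctx, sizeof(msg), cipher, msg);
  gcm_aes128_digest(&ctx, sizeof(tag), tag);

  for (i = 0; i < sizeof(tag); i++)
    printf("%02x", tag[i]);
  printf("\n");
  return 0;
}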
It's gotten better with this patch; now it takes 0.000049 seconds to execute under the same circumstances.