--- configure.ac | 6 +- fat-ppc.c | 33 ++ fat-setup.h | 7 + gcm.c | 69 +++- powerpc64/fat/gcm-hash.asm | 39 +++ powerpc64/p8/gcm-hash.asm | 773 +++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 909 insertions(+), 18 deletions(-) create mode 100644 powerpc64/fat/gcm-hash.asm create mode 100644 powerpc64/p8/gcm-hash.asm
diff --git a/configure.ac b/configure.ac index e9983697..0129f950 100644 --- a/configure.ac +++ b/configure.ac @@ -488,7 +488,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \ sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
# Assembler files which generate additional object files if they are used. -asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \ +asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm \ aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \ chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \ salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \ @@ -612,9 +612,9 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_ecc_secp384r1_redc #undef HAVE_NATIVE_ecc_secp521r1_modp #undef HAVE_NATIVE_ecc_secp521r1_redc -#undef HAVE_NATIVE_gcm_init_key8 +#undef HAVE_NATIVE_gcm_init_key +#undef HAVE_NATIVE_gcm_hash #undef HAVE_NATIVE_gcm_hash8 -#undef HAVE_NATIVE_gcm_fill #undef HAVE_NATIVE_salsa20_core #undef HAVE_NATIVE_salsa20_2core #undef HAVE_NATIVE_fat_salsa20_2core diff --git a/fat-ppc.c b/fat-ppc.c index 2bc50481..d73b45b0 100644 --- a/fat-ppc.c +++ b/fat-ppc.c @@ -120,6 +120,16 @@ DECLARE_FAT_FUNC(_nettle_aes_decrypt, aes_crypt_internal_func) DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, c) DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, ppc64)
+#if GCM_TABLE_BITS == 8 +DECLARE_FAT_FUNC(_nettle_gcm_init_key, gcm_init_key_func) +DECLARE_FAT_FUNC_VAR(gcm_init_key, gcm_init_key_func, c) +DECLARE_FAT_FUNC_VAR(gcm_init_key, gcm_init_key_func, ppc64) + +DECLARE_FAT_FUNC(_nettle_gcm_hash, gcm_hash_func) +DECLARE_FAT_FUNC_VAR(gcm_hash, gcm_hash_func, c) +DECLARE_FAT_FUNC_VAR(gcm_hash, gcm_hash_func, ppc64) +#endif /* GCM_TABLE_BITS == 8 */ + static void CONSTRUCTOR fat_init (void) { @@ -139,11 +149,23 @@ fat_init (void) fprintf (stderr, "libnettle: enabling arch 2.07 code.\n"); _nettle_aes_encrypt_vec = _nettle_aes_encrypt_ppc64; _nettle_aes_decrypt_vec = _nettle_aes_decrypt_ppc64; +#if GCM_TABLE_BITS == 8 + /* Make sure _nettle_gcm_init_key_vec function is compatible + with _nettle_gcm_hash_vec function e.g. _nettle_gcm_init_key_c() + fills gcm_key table with values that are incompatible with + _nettle_gcm_hash_ppc64() */ + _nettle_gcm_init_key_vec = _nettle_gcm_init_key_ppc64; + _nettle_gcm_hash_vec = _nettle_gcm_hash_ppc64; +#endif /* GCM_TABLE_BITS == 8 */ } else { _nettle_aes_encrypt_vec = _nettle_aes_encrypt_c; _nettle_aes_decrypt_vec = _nettle_aes_decrypt_c; +#if GCM_TABLE_BITS == 8 + _nettle_gcm_init_key_vec = _nettle_gcm_init_key_c; + _nettle_gcm_hash_vec = _nettle_gcm_hash_c; +#endif /* GCM_TABLE_BITS == 8 */ } }
@@ -160,3 +182,14 @@ DEFINE_FAT_FUNC(_nettle_aes_decrypt, void, size_t length, uint8_t *dst, const uint8_t *src), (rounds, keys, T, length, dst, src)) + +#if GCM_TABLE_BITS == 8 +DEFINE_FAT_FUNC(_nettle_gcm_init_key, void, + (union nettle_block16 *table), + (table)) + +DEFINE_FAT_FUNC(_nettle_gcm_hash, void, + (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data), + (key, x, length, data)) +#endif /* GCM_TABLE_BITS == 8 */ diff --git a/fat-setup.h b/fat-setup.h index 99f1ea67..4a9f7969 100644 --- a/fat-setup.h +++ b/fat-setup.h @@ -162,6 +162,13 @@ typedef void aes_crypt_internal_func (unsigned rounds, const uint32_t *keys, size_t length, uint8_t *dst, const uint8_t *src);
+#if GCM_TABLE_BITS == 8 +typedef void gcm_init_key_func (union nettle_block16 *table); + +typedef void gcm_hash_func (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data); +#endif /* GCM_TABLE_BITS == 8 */ + typedef void *(memxor_func)(void *dst, const void *src, size_t n);
typedef void salsa20_core_func (uint32_t *dst, const uint32_t *src, unsigned rounds); diff --git a/gcm.c b/gcm.c index 48b3e75a..787c303e 100644 --- a/gcm.c +++ b/gcm.c @@ -140,6 +140,26 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table) memcpy (x->b, Z.b, sizeof(Z)); } # elif GCM_TABLE_BITS == 8 +# if HAVE_NATIVE_gcm_init_key + +#define gcm_init_key _nettle_gcm_init_key +void +_nettle_gcm_init_key (union nettle_block16 *table); +/* For fat builds */ +void +_nettle_gcm_init_key_c (union nettle_block16 *table); +# endif /* HAVE_NATIVE_gcm_init_key */ +# if HAVE_NATIVE_gcm_hash + +#define gcm_hash _nettle_gcm_hash +void +_nettle_gcm_hash (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data); +/* For fat builds */ +void +_nettle_gcm_hash_c (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data); +# endif /* HAVE_NATIVE_gcm_hash */ # if HAVE_NATIVE_gcm_hash8
#define gcm_hash _nettle_gcm_hash8 @@ -228,6 +248,32 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table) /* Increment the rightmost 32 bits. */ #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4)
+#ifdef gcm_init_key +void +_nettle_gcm_init_key_c(union nettle_block16 *table) +#else +static void +gcm_init_key(union nettle_block16 *table) +#endif /* !gcm_init_key */ +{ +#if GCM_TABLE_BITS + /* Middle element if GCM_TABLE_BITS > 0, otherwise the first + element */ + unsigned i = (1<<GCM_TABLE_BITS)/2; + + /* Algorithm 3 from the gcm paper. First do powers of two, then do + the rest by adding. */ + while (i /= 2) + block16_mulx_ghash(&table[i], &table[2*i]); + for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2) + { + unsigned j; + for (j = 1; j < i; j++) + block16_xor3(&table[i+j], &table[i], &table[j]); + } +#endif +} + /* Initialization of GCM. * @ctx: The context of GCM * @cipher: The context of the underlying block cipher @@ -244,25 +290,19 @@ gcm_set_key(struct gcm_key *key, /* H */ memset(key->h[0].b, 0, GCM_BLOCK_SIZE); f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b); - -#if GCM_TABLE_BITS - /* Algorithm 3 from the gcm paper. First do powers of two, then do - the rest by adding. */ - while (i /= 2) - block16_mulx_ghash(&key->h[i], &key->h[2*i]); - for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2) - { - unsigned j; - for (j = 1; j < i; j++) - block16_xor3(&key->h[i+j], &key->h[i],&key->h[j]); - } -#endif + + gcm_init_key(key->h); }
-#ifndef gcm_hash +#ifdef gcm_hash +void +_nettle_gcm_hash_c(const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data) +#else static void gcm_hash(const struct gcm_key *key, union nettle_block16 *x, size_t length, const uint8_t *data) +#endif /* !gcm_hash */ { for (; length >= GCM_BLOCK_SIZE; length -= GCM_BLOCK_SIZE, data += GCM_BLOCK_SIZE) @@ -276,7 +316,6 @@ gcm_hash(const struct gcm_key *key, union nettle_block16 *x, gcm_gf_mul (x, key->h); } } -#endif /* !gcm_hash */
static void gcm_hash_sizes(const struct gcm_key *key, union nettle_block16 *x, diff --git a/powerpc64/fat/gcm-hash.asm b/powerpc64/fat/gcm-hash.asm new file mode 100644 index 00000000..7fd77223 --- /dev/null +++ b/powerpc64/fat/gcm-hash.asm @@ -0,0 +1,39 @@ +C powerpc64/fat/gcm-hash.asm + + +ifelse(` + Copyright (C) 2020 Mamone Tarsha + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +dnl picked up by configure +dnl PROLOGUE(_nettle_gcm_init_key) +dnl PROLOGUE(_nettle_gcm_hash) + +define(`fat_transform', `$1_ppc64') +include_src(`powerpc64/p8/gcm-hash.asm') diff --git a/powerpc64/p8/gcm-hash.asm b/powerpc64/p8/gcm-hash.asm new file mode 100644 index 00000000..f987f2ca --- /dev/null +++ b/powerpc64/p8/gcm-hash.asm @@ -0,0 +1,773 @@ +C powerpc64/p8/gcm-hash.asm + +ifelse(` + Copyright (C) 2020 Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + +C Alignment of gcm_key table elements, which is declared in gcm.h +define(`TableElemAlign', `0x100') + +C Register usage: + +define(`SP', `r1') +define(`TOCP', `r2') + +define(`TABLE', `r3') +define(`X', `r4') +define(`LENGTH', `r5') +define(`DATA', `r6') + +define(`zero', `v0') +define(`swap_mask', `v1') +define(`hidw_mask', `v2') +define(`lodw_mask', `v3') +define(`poly', `v4') +define(`poly_h', `v4') +define(`poly_l', `v5') +define(`RP', `v6') +define(`Mh', `v7') +define(`Ml', `v8') +define(`H', `v9') +define(`Hh', `v10') +define(`Hl', `v11') +define(`RP2', `v9') +define(`M2h', `v10') +define(`M2l', `v11') + +define(`sl1', `v1') +define(`msb', `v5') +define(`H2', `v6') +define(`H2h', `v7') +define(`H2l', `v8') +define(`H_h', `v12') +define(`H_m', `v13') +define(`H_l', `v14') +define(`H_Hh', `v12') +define(`H_H', `v13') +define(`H_Hl', `v14') +define(`H_t', `v15') +define(`H2_h', `v16') +define(`H2_m', `v17') +define(`H2_l', `v18') +define(`H2_t', `v19') + +define(`C0', `v6') +define(`C1', `v7') +define(`C2', `v8') +define(`C3', `v12') +define(`C4', `v6') +define(`C5', `v7') +define(`C6', `v8') +define(`C7', `v12') + +define(`C', `v13') + +define(`Ch', `v14') +define(`Cl', `v15') +define(`Cm', `v16') + +define(`C01h', `v14') +define(`C01l', `v15') +define(`C01', `v16') +define(`C23h', `v17') +define(`C23l', `v18') +define(`C23', `v19') +define(`C45h', `v20') +define(`C45l', `v21') +define(`C45', `v22') +define(`C67h', `v6') +define(`C67l', `v7') +define(`C67', `v8') + +define(`H21', `v9') +define(`H21h', `v10') +define(`H21l', `v11') +define(`H43', `v23') +define(`H43h', `v24') +define(`H43l', `v25') +define(`H65', `v26') +define(`H65h', `v27') +define(`H65l', `v28') +define(`H87', `v29') +define(`H87h', `v30') +define(`H87l', `v31') + +.file "gcm-hash.asm" + +.text + + # void gcm_init_key (union gcm_block *table) + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_init_key) + DATA_LOAD_VEC(poly,.polynomial,r7) +IF_LE(`DATA_LOAD_VEC(swap_mask,.swap_mask,r7)') + DATA_LOAD_VEC(hidw_mask,.hidw_mask,r7) + DATA_LOAD_VEC(lodw_mask,.lodw_mask,r7) + + li r10,8*TableElemAlign + lxvd2x VSR(H),r10,TABLE # load H +IF_LE(`vperm H,H,H,swap_mask') + + # --- calculate H = H shift left 1 modulo polynomial --- + + vupkhsw msb,H # most significant bit word-extend + vspltisb sl1,1 # splat 1 for shift left + vspltw msb,msb,0 # most significant bit extend + vsl H,H,sl1 # H shift left 1 + vand msb,msb,poly + vxor zero,zero,zero + vxor H_t,H,msb + + vsldoi H,H_t,H_t,8 # doubleword swap + vsldoi Hh,H,zero,8 + vsldoi Hl,zero,H,8 + + # --- calculate H^2 = H*H --- + + # reduction pre-processing + vsldoi poly_h,zero,poly,8 + vsldoi poly_l,poly_h,poly_h,8 + + # polynomial multiplication "classical" + vpmsumd H_h,H_t,Hh # H^1h*H^1h + vpmsumd H_l,H_t,Hl # H^1l*H^1l + vpmsumd H_m,H_t,H # H^1h*H^1l⊕H^1l*H^1h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] + vsldoi Ml,H_m,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor H_h,H_h,Mh # [2] + vxor H_l,H_l,Ml # [2] + vxor H_l,H_l,RP # [1] + + # reduction second phase + vpmsumd RP,H_l,poly_l + vxor H_h,H_l,H_h + vxor H2_t,H_h,RP + + vsldoi H2,H2_t,H2_t,8 + vsldoi H2h,H2,zero,8 + vsldoi H2l,zero,H2,8 + + # --- calculate [H^2.Hi⊕H^2.Lo:H^1.Hi⊕H^1.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store H,[H^2.Hi⊕H^2.Lo:H^1.Hi⊕H^1.Lo] --- + + li r8,0*TableElemAlign + li r9,1*TableElemAlign + li r10,2*TableElemAlign + stxvd2x 
VSR(Hl),r8,TABLE + stxvd2x VSR(H),r9,TABLE + stxvd2x VSR(Hh),r10,TABLE + + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + stxvd2x VSR(H_Hh),r8,TABLE + stxvd2x VSR(H_H),r9,TABLE + stxvd2x VSR(H_Hl),r10,TABLE + + # --- calculate H^3,H^4 --- + + # polynomial multiplication "classical" + vpmsumd H_l,H_t,H2l # H^1l*H^2l + vpmsumd H_m,H_t,H2 # H^1h*H^2l⊕H^1l*H^2h + vpmsumd H_h,H_t,H2h # H^1h*H^2h + vpmsumd H2_l,H2_t,H2l # H^2l*H^2l + vpmsumd H2_m,H2_t,H2 # H^2h*H^2l⊕H^2l*H^2h + vpmsumd H2_h,H2_t,H2h # H^2h*H^2h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] H^3 + vpmsumd RP2,H2_l,poly_h # [1] H^4 + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] H^3 + vsldoi M2h,zero,H2_m,8 # [2] H^4 + vsldoi Ml,H_m,zero,8 # [2] H^3 + vsldoi M2l,H2_m,zero,8 # [2] H^4 + vsldoi RP,RP,RP,8 # [1] H^3 + vsldoi RP2,RP2,RP2,8 # [1] H^4 + vxor H_h,H_h,Mh # [2] H^3 + vxor H2_h,H2_h,M2h # [2] H^4 + vxor H_l,H_l,Ml # [2] H^3 + vxor H2_l,H2_l,M2l # [2] H^4 + vxor H_l,H_l,RP # [1] H^3 + vxor H2_l,H2_l,RP2 # [1] H^4 + + # reduction second phase + vpmsumd RP,H_l,poly_l # H^3 + vpmsumd RP2,H2_l,poly_l # H^4 + vxor H_h,H_l,H_h # H^3 + vxor H2_h,H2_l,H2_h # H^4 + vxor H_h,H_h,RP # H^3 + vxor H2_h,H2_h,RP2 # H^4 + + vsldoi H2,H2_h,H2_h,8 # H^4 + vsldoi H,H_h,H_h,8 # H^3 + vsldoi H2l,zero,H2,8 # H^4 + vsldoi H2h,H2,zero,8 # H^4 + + # --- calculate [H^4.Hi⊕H^4.Lo:H^3.Hi⊕H^3.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store [H^4.Hi⊕H^4.Lo:H^3.Hi⊕H^3.Lo] --- + + li r8,6*TableElemAlign + li r9,7*TableElemAlign + li r10,8*TableElemAlign + stxvd2x VSR(H_Hh),r8,TABLE + stxvd2x VSR(H_H),r9,TABLE + stxvd2x VSR(H_Hl),r10,TABLE + + # --- calculate H^5,H^6 --- + + # polynomial multiplication "classical" + vpmsumd H_l,H_t,H2l # H^1l*H^4l + vpmsumd H_m,H_t,H2 # H^1h*H^4l⊕H^1l*H^4h + vpmsumd H_h,H_t,H2h # H^1h*H^4h + vpmsumd H2_l,H2_t,H2l # H^2l*H^4l + vpmsumd H2_m,H2_t,H2 # H^2h*H^4l⊕H^2l*H^4h + vpmsumd H2_h,H2_t,H2h # H^2h*H^4h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] H^5 + vpmsumd RP2,H2_l,poly_h # [1] H^6 + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] H^5 + vsldoi M2h,zero,H2_m,8 # [2] H^6 + vsldoi Ml,H_m,zero,8 # [2] H^5 + vsldoi M2l,H2_m,zero,8 # [2] H^6 + vsldoi RP,RP,RP,8 # [1] H^5 + vsldoi RP2,RP2,RP2,8 # [1] H^6 + vxor H_h,H_h,Mh # [2] H^5 + vxor H2_h,H2_h,M2h # [2] H^6 + vxor H_l,H_l,Ml # [2] H^5 + vxor H2_l,H2_l,M2l # [2] H^6 + vxor H_l,H_l,RP # [1] H^5 + vxor H2_l,H2_l,RP2 # [1] H^6 + + # reduction second phase + vpmsumd RP,H_l,poly_l # H^5 + vpmsumd RP2,H2_l,poly_l # H^6 + vxor H_h,H_l,H_h # H^5 + vxor H2_h,H2_l,H2_h # H^6 + vxor H_h,H_h,RP # H^5 + vxor H2_h,H2_h,RP2 # H^6 + + vsldoi H2,H2_h,H2_h,8 # H^6 + vsldoi H,H_h,H_h,8 # H^5 + vsldoi H2l,zero,H2,8 # H^6 + vsldoi H2h,H2,zero,8 # H^6 + + # --- calculate [H^6.Hi⊕H^6.Lo:H^5.Hi⊕H^5.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store [H^6.Hi⊕H^6.Lo:H^5.Hi⊕H^5.Lo] --- + + li r8,9*TableElemAlign + li r9,10*TableElemAlign + li r10,11*TableElemAlign + stxvd2x VSR(H_Hh),r8,TABLE + stxvd2x VSR(H_H),r9,TABLE + stxvd2x VSR(H_Hl),r10,TABLE + + # --- calculate H^7,H^8 --- + + # polynomial multiplication "classical" + vpmsumd H_l,H_t,H2l # H^1l*H^6l + vpmsumd H_m,H_t,H2 # H^1h*H^6l⊕H^1l*H^6h + vpmsumd H_h,H_t,H2h # H^1h*H^6h + vpmsumd H2_l,H2_t,H2l # H^2l*H^6l + vpmsumd H2_m,H2_t,H2 # H^2h*H^6l⊕H^2l*H^6h + vpmsumd H2_h,H2_t,H2h # H^2h*H^6h + + # reduction 
first phase # [1] + vpmsumd RP,H_l,poly_h # [1] H^7 + vpmsumd RP2,H2_l,poly_h # [1] H^8 + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] H^7 + vsldoi M2h,zero,H2_m,8 # [2] H^8 + vsldoi Ml,H_m,zero,8 # [2] H^7 + vsldoi M2l,H2_m,zero,8 # [2] H^8 + vsldoi RP,RP,RP,8 # [1] H^7 + vsldoi RP2,RP2,RP2,8 # [1] H^8 + vxor H_h,H_h,Mh # [2] H^7 + vxor H2_h,H2_h,M2h # [2] H^8 + vxor H_l,H_l,Ml # [2] H^7 + vxor H2_l,H2_l,M2l # [2] H^8 + vxor H_l,H_l,RP # [1] H^7 + vxor H2_l,H2_l,RP2 # [1] H^8 + + # reduction second phase + vpmsumd RP,H_l,poly_l # H^7 + vpmsumd RP2,H2_l,poly_l # H^8 + vxor H_h,H_l,H_h # H^7 + vxor H2_h,H2_l,H2_h # H^8 + vxor H_h,H_h,RP # H^7 + vxor H2_h,H2_h,RP2 # H^8 + + vsldoi H,H_h,H_h,8 # H^7 + vsldoi H2,H2_h,H2_h,8 # H^8 + + # --- calculate [H^8.Hi⊕H^8.Lo:H^7.Hi⊕H^7.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store [H^8.Hi⊕H^8.Lo:H^7.Hi⊕H^7.Lo] --- + + li r8,12*TableElemAlign + li r9,13*TableElemAlign + li r10,14*TableElemAlign + stxvd2x VSR(H_Hh),r8,TABLE + stxvd2x VSR(H_H),r9,TABLE + stxvd2x VSR(H_Hl),r10,TABLE + + blr +EPILOGUE(_nettle_gcm_init_key) + + # void gcm_hash (const struct gcm_key *key, union gcm_block *x, + # size_t length, const uint8_t *data) + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_hash) + vxor zero,zero,zero + + DATA_LOAD_VEC(poly,.polynomial,r7) +IF_LE(`DATA_LOAD_VEC(swap_mask,.swap_mask,r7)') + DATA_LOAD_VEC(hidw_mask,.hidw_mask,r7) + DATA_LOAD_VEC(lodw_mask,.lodw_mask,r7) + + vsldoi poly_h,zero,poly,8 + vsldoi poly_l,poly_h,poly_h,8 + + lxvd2x VSR(C),0,X # load X +IF_LE(`vperm C,C,C,swap_mask') + + srdi r7,LENGTH,7 # 8x loop count + cmpldi r7,0 + beq L2x + + # backup registers + stdu SP,-224(SP) + std r28,216(SP) + std r29,208(SP) + std r30,200(SP) + std r31,192(SP) + li 8,176 + stvx 20,8,SP + subi 8,8,16 + stvx 21,8,SP + subi 8,8,16 + stvx 22,8,SP + subi 8,8,16 + stvx 23,8,SP + subi 8,8,16 + stvx 24,8,SP + subi 8,8,16 + stvx 25,8,SP + subi 8,8,16 + stvx 26,8,SP + subi 8,8,16 + stvx 27,8,SP + subi 8,8,16 + stvx 28,8,SP + subi 8,8,16 + stvx 29,8,SP + subi 8,8,16 + stvx 30,8,SP + subi 8,8,16 + stvx 31,8,SP + + # table loading + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + lxvd2x VSR(H21h),r8,TABLE + lxvd2x VSR(H21),r9,TABLE + lxvd2x VSR(H21l),r10,TABLE + li r8,6*TableElemAlign + li r9,7*TableElemAlign + li r10,8*TableElemAlign + lxvd2x VSR(H43h),r8,TABLE + lxvd2x VSR(H43),r9,TABLE + lxvd2x VSR(H43l),r10,TABLE + li r8,9*TableElemAlign + li r9,10*TableElemAlign + li r10,11*TableElemAlign + lxvd2x VSR(H65h),r8,TABLE + lxvd2x VSR(H65),r9,TABLE + lxvd2x VSR(H65l),r10,TABLE + li r8,12*TableElemAlign + li r9,13*TableElemAlign + li r10,14*TableElemAlign + lxvd2x VSR(H87h),r8,TABLE + lxvd2x VSR(H87),r9,TABLE + lxvd2x VSR(H87l),r10,TABLE + + li r8,0x10 + li r9,0x20 + li r10,0x30 + li r28,0x40 + li r29,0x50 + li r30,0x60 + li r31,0x70 + + mtctr r7 +.align 5 +L8x_loop: + # input loading + lxvd2x VSR(C0),0,DATA # load C0 + lxvd2x VSR(C1),r8,DATA # load C1 + lxvd2x VSR(C2),r9,DATA # load C2 + lxvd2x VSR(C3),r10,DATA # load C3 + + # swap permuting +IF_LE(`vperm C0,C0,C0,swap_mask + vperm C1,C1,C1,swap_mask + vperm C2,C2,C2,swap_mask + vperm C3,C3,C3,swap_mask') + + # previous digest combining + vxor C0,C0,C + + # polynomial multiplication "karatsuba" pre-processing + vperm C23h,C2,C3,hidw_mask + vperm C23l,C2,C3,lodw_mask + vperm C01h,C0,C1,hidw_mask + vperm C01l,C0,C1,lodw_mask + + # input loading + lxvd2x VSR(C4),r28,DATA # load C4 + lxvd2x 
VSR(C5),r29,DATA # load C5 + lxvd2x VSR(C6),r30,DATA # load C6 + lxvd2x VSR(C7),r31,DATA # load C7 + + # swap permuting +IF_LE(`vperm C4,C4,C4,swap_mask + vperm C5,C5,C5,swap_mask + vperm C6,C6,C6,swap_mask + vperm C7,C7,C7,swap_mask') + + # polynomial multiplication "karatsuba" pre-processing + vperm C45h,C4,C5,hidw_mask + vperm C45l,C4,C5,lodw_mask + vperm C67h,C6,C7,hidw_mask + vperm C67l,C6,C7,lodw_mask + vxor C23,C23h,C23l + vxor C01,C01h,C01l + vxor C45,C45h,C45l + vxor C67,C67h,C67l + + # polynomial multiplication "karatsuba" + vpmsumd C23h,C23h,H65h # H23 = H^6h*C2h⊕H^5h*C3h + vpmsumd C23l,C23l,H65l # L23 = H^6l*C2l⊕H^5l*C3l + vpmsumd C01h,C01h,H87h # H01 = H^8h*C0h⊕H^7h*C1h + vpmsumd C01l,C01l,H87l # L01 = H^8l*C0l⊕H^7l*C1l + vpmsumd C67h,C67h,H21h # H67 = H^2h*C6h⊕H^1h*C7h + vpmsumd C67l,C67l,H21l # L67 = H^2l*C6l⊕H^1l*C7l + vpmsumd C45h,C45h,H43h # H45 = H^4h*C4h⊕H^3h*C5h + vpmsumd C45l,C45l,H43l # L45 = H^4l*C4l⊕H^3l*C5l + vpmsumd C23,C23,H65 # M23 = (H^6h⊕H^5h)*(C2h⊕C3h)⊕(H^6l⊕H^5l)*(C2l⊕C3l) + vpmsumd C01,C01,H87 # M01 = (H^8h⊕H^7h)*(C0h⊕C1h)⊕(H^8l⊕H^7l)*(C0l⊕C1l) + vpmsumd C45,C45,H43 # M45 = (H^4h⊕H^3h)*(C4h⊕C5h)⊕(H^4l⊕H^3l)*(C4l⊕C5l) + vpmsumd C67,C67,H21 # M67 = (H^2h⊕H^1h)*(C6h⊕C7h)⊕(H^2l⊕H^1l)*(C6l⊕C7l) + + # polynomial multiplication "karatsuba" post-processing + vxor C23,C23,C23h + vxor C01,C01,C01h + vxor C45,C45,C45h + vxor C67,C67,C67h + vxor C23,C23,C23l + vxor C01,C01,C01l + vxor C45,C45,C45l + vxor C67,C67,C67l + + # deferred recombination of partial products + vxor C01h,C01h,C23h # H0 = H01⊕H23 + vxor C45h,C45h,C67h # H1 = H45⊕H67 + vxor C01l,C01l,C23l # L0 = L01⊕L23 + vxor C45l,C45l,C67l # L1 = L45⊕L45 + vxor C01,C01,C23 # M0 = M01⊕M23 + vxor C45,C45,C67 # M1 = M45⊕M45 + vxor C01h,C01h,C45h # H = H0⊕H1 + vxor C01l,C01l,C45l # L = L0⊕L1 + vxor C01,C01,C45 # M = M0⊕M1 + + # reduction first phase # [1] + vpmsumd RP,C01l,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,C01,8 # [2] + vsldoi Ml,C01,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor C01h,C01h,Mh # [2] + vxor C01l,C01l,Ml # [2] + vxor C01l,C01l,RP # [1] + + # reduction second phase + vpmsumd RP,C01l,poly_l + vxor C01h,C01l,C01h + vxor C,C01h,RP + + addi DATA,DATA,0x80 + bdnz L8x_loop + + # restore registers + li 8,0 + lvx 31,8,SP + addi 8,8,16 + lvx 30,8,SP + addi 8,8,16 + lvx 29,8,SP + addi 8,8,16 + lvx 28,8,SP + addi 8,8,16 + lvx 27,8,SP + addi 8,8,16 + lvx 26,8,SP + addi 8,8,16 + lvx 25,8,SP + addi 8,8,16 + lvx 24,8,SP + addi 8,8,16 + lvx 23,8,SP + addi 8,8,16 + lvx 22,8,SP + addi 8,8,16 + lvx 21,8,SP + addi 8,8,16 + lvx 20,8,SP + ld r31,192(SP) + ld r30,200(SP) + ld r29,208(SP) + ld r28,216(SP) + addi SP,SP,224 + + clrldi LENGTH,LENGTH,57 +L2x: + srdi r7,LENGTH,5 + cmpldi r7,0 + beq L1x + + # table loading + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + lxvd2x VSR(H21h),r8,TABLE + lxvd2x VSR(H21),r9,TABLE + lxvd2x VSR(H21l),r10,TABLE + + li r10,0x10 + + mtctr r7 +.align 5 +L2x_loop: + # input loading + lxvd2x VSR(C0),0,DATA # load C0 + lxvd2x VSR(C1),r10,DATA # load C1 + + # swap permuting +IF_LE(`vperm C0,C0,C0,swap_mask + vperm C1,C1,C1,swap_mask') + + # previous digest combining + vxor C0,C0,C + + # polynomial multiplication "karatsuba" pre-processing + vperm C01h,C0,C1,hidw_mask + vperm C01l,C0,C1,lodw_mask + vxor C01,C01h,C01l + + # polynomial multiplication "karatsuba" + vpmsumd C01h,C01h,H21h # H01 = H^2h*C0h⊕H^1h*C1h + vpmsumd C01l,C01l,H21l # L01 = H^2l*C0l⊕H^1l*C1l + vpmsumd C01,C01,H21 # M01 = 
(H^2h⊕H^1h)*(C0h⊕C1h)⊕(H^2l⊕H^1l)*(C0l⊕C1l) + + # polynomial multiplication "karatsuba" post-processing + vxor C01,C01,C01h + vxor C01,C01,C01l + + # reduction first phase # [1] + vpmsumd RP,C01l,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,C01,8 # [2] + vsldoi Ml,C01,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor C01h,C01h,Mh # [2] + vxor C01l,C01l,Ml # [2] + vxor C01l,C01l,RP # [1] + + # reduction second phase + vpmsumd RP,C01l,poly_l + vxor C01h,C01l,C01h + vxor C,C01h,RP + + addi DATA,DATA,0x20 + bdnz L2x_loop + + clrldi LENGTH,LENGTH,59 +L1x: + srdi r7,LENGTH,4 + cmpldi r7,0 + beq Lrem + + # table loading + li r9,1*TableElemAlign + li r10,2*TableElemAlign + lxvd2x VSR(Hl),0,TABLE + lxvd2x VSR(H), r9,TABLE + lxvd2x VSR(Hh),r10,TABLE + + # input loading + lxvd2x VSR(C0),0,DATA # load C0 + + # swap permuting +IF_LE(`vperm C0,C0,C0,swap_mask') + + # previous digest combining + vxor C0,C0,C + + vpmsumd Cl,C0,Hl # L = Hl*Cl + vpmsumd Cm,C0,H # M = Hh*Cl⊕Hl*Ch + vpmsumd Ch,C0,Hh # H = Hh*Ch + + # reduction first phase # [1] + vpmsumd RP,Cl,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,Cm,8 # [2] + vsldoi Ml,Cm,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor Ch,Ch,Mh # [2] + vxor Cl,Cl,Ml # [2] + vxor Cl,Cl,RP # [1] + + # reduction second phase + vpmsumd RP,Cl,poly_l + vxor Ch,Cl,Ch + vxor C,Ch,RP + + addi DATA,DATA,0x10 + clrldi LENGTH,LENGTH,60 +Lrem: + cmpldi LENGTH,0 + beq Ldone + + # table loading + li r9,1*TableElemAlign + li r10,2*TableElemAlign + lxvd2x VSR(Hl),0,TABLE + lxvd2x VSR(H), r9,TABLE + lxvd2x VSR(Hh),r10,TABLE + + # input loading + stdu SP,-16(SP) + stvx zero,0,SP +Lst_loop: + subic. LENGTH,LENGTH,1 + lbzx r7,LENGTH,DATA + stbx r7,LENGTH,SP + bne Lst_loop + lxvd2x VSR(C0),0,SP + addi SP,SP,16 + + # swap permuting +IF_LE(`vperm C0,C0,C0,swap_mask') + + # previous digest combining + vxor C0,C0,C + + vpmsumd Cl,C0,Hl # L = Hl*Cl + vpmsumd Cm,C0,H # M = Hh*Cl⊕Hl*Ch + vpmsumd Ch,C0,Hh # H = Hh*Ch + + # reduction first phase # [1] + vpmsumd RP,Cl,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,Cm,8 # [2] + vsldoi Ml,Cm,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor Ch,Ch,Mh # [2] + vxor Cl,Cl,Ml # [2] + vxor Cl,Cl,RP # [1] + + # reduction second phase + vpmsumd RP,Cl,poly_l + vxor Ch,Cl,Ch + vxor C,Ch,RP + +Ldone: +IF_LE(`vperm C,C,C,swap_mask') + stxvd2x VSR(C),0,X # store C + blr +EPILOGUE(_nettle_gcm_hash) + + .data +IF_LE(`.align 4 +.polynomial: + .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 + .align 4 +.swap_mask: + .byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7 + .align 4 +.hidw_mask: + .byte 23,22,21,20,19,18,17,16,7,6,5,4,3,2,1,0 + .align 4 +.lodw_mask: + .byte 31,30,29,28,27,26,25,24,15,14,13,12,11,10,9,8') +IF_BE(`.align 4 +.polynomial: + .byte 0xc2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 + .align 4 +.hidw_mask: + .byte 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23 + .align 4 +.lodw_mask: + .byte 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31')
gcm_fill() got a C optimization which performs close to the one I implemented using altivec, so the altivec version of gcm_fill has been removed now.
On 28/09/20 8:25 am, Maamoun TK wrote:
gcm_fill() got C optimization which performs close to the one I
What do you mean by "close"? Faster or slower? And is that difference consistent?
If the other implementation is slower (even by a few nanoseconds) it can be useful to keep this fast code around for builds where high performance is critical.
AYJ
I posted a performance test here: https://lists.lysator.liu.se/pipermail/nettle-bugs/2020/009169.html Personally, I prefer keeping the altivec version in the Nettle library since it's faster than the C implementation, but I'm not sure whether the performance margin fits the library's convention for optimizing such functions.
Maamoun TK maamoun.tk@googlemail.com writes:
I posted a performance test here https://lists.lysator.liu.se/pipermail/nettle-bugs/2020/009169.html Personally, I prefer keeping the altivec version in nettle library since it's faster than C implementation but I'm not sure whether the performance margin fits with the library's convention of optimizing such functions.
We can revisit it later, but let's go for the low-hanging fruit first.
Regards, /Niels
Maamoun TK maamoun.tk@googlemail.com writes:
Thanks for the update. This is quite complex for me. I have not yet read the code very carefully. I think I'd like to focus on the main function, gcm_hash, first. Some questions and suggestions, to make it easier:
1. Take out the fat support to its own patch.
2. You could consider doing the init_key in C, if nothing else as documentation. It could be either under some #ifdef in gcm.c, or a separate .c file under powerpc64/p8/, next to gcm-hash.asm. Maybe it's still a good idea to have it in assembly, that's a tradeoff that depends a bit on the complexity of both the C and assembly, and the speedup from doing it in assembly. And I don't have a very strong opinion on this point now.
Even with asm, it might be a bit clearer to move it to its own .asm file, so each file can use define only for the relevant registers for that function.
3. What's TableElemAlign? Assuming GCM_TABLE_BITS is 8 (current Nettle ABI), you can treat struct gcm_key as a blob of size 4096 bytes, with alignment corresponding to what the C compiler uses for uint64_t. Are you using some padding at the start (depending on address) to ensure you get stronger alignment? And 256 byte alignment sounds a bit more than needed?
4. Please document the layout used for the precomputed values stored in struct gcm_key.
5. It would help with comments explaining the naming convention used for the named registers, and the instruction sequence used for a single Karatsuba multiplication, with any needed comments.
6. Is 8-way unrolling really necessary to get full utilization of the execution units? And it's also not yet clear to me what 8-way means: is that 8 blocks of 16 bytes each (i.e., 128 bytes of input), or 8 input bytes?
7. Do you need any bit reversal? As you have mentioned, the multiplication operation is symmetric under bit reversal, so ideally bit reversal should be needed at most when setting the key and extracting the digest, but not by the main workhorse, gcm_hash.
I know you have referenced articles for the used algorithm, but it would be helpful to have the main ideas in comments close to the code.
Regards, /Niels
- Take out the fat support to its own patch.
Done! I will post the assembly part first so you can review it.
2. You could consider doing the init_key in C, if nothing else as
documentation. It could be either under some #ifdef in gcm.c, or a separate .c file under powerpc64/p8/, next to gcm-hash.asm. Maybe it's still a good idea to have it in assembly, that's a tradeoff that depends a bit on the complexity of both the C and assembly, and the speedup from doing it in assembly. And I don't have a very strong opinion on this point now.
Even with asm, it might be a bit clearer to move it to its own .asm file, so each file can use define only for the relevant registers for that function.
Writing init_key() in C for the PowerPC implementation has a couple of disadvantages:
1. init_key() and gcm_hash() are connected to each other through a shared table, so it is easier to modify the implementation if both are written in the same way.
2. We would have to use intrinsics for certain operations like 'vpmsumd'. Furthermore, '__builtin_crypto_vpmsumd' is buggy on certain versions of GCC https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91275 and has a different name on Clang ('__builtin_altivec_crypto_vpmsumd'), so we would end up with a lot of compiler-variant checks, plus inline assembly for 'vpmsumd' wherever the intrinsic is broken.
I still prefer to have both functions in the same file. I separated the 'define' macros for each function, so each function has its own define section above its prologue.
3. What's TableElemAlign? Assuming GCM_TABLE_BITS is 8 (current Nettle
ABI), you can treat struct gcm_key as a blob of size 4096 bytes, with alignment corresponding to what the C compiler uses for uint64_t. Are you using some padding at the start (depending on address) to ensure you get stronger alignment? And 256 byte alignment sounds a bit more than needed?
The compiler aligns each element of the gcm_key array at 0x100, perhaps because the element type is declared as a union. So, for example, to get the 'H' value that is assigned to the 9th element, I have to add 0x800 to the array address.
- Please document the layout used for the precomputed values stored in struct gcm_key.
Done!
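For reference, the precomputed layout as I read it from the posted gcm-hash.asm (this summary is mine; offsets are table elements, i.e. multiples of TableElemAlign):

  h[0..2]   : H^1 in the three forms used by the single-block tail
              (low part, doubleword-swapped, high part)
  h[3..5]   : [H^2 : H^1] packed as high halves, (high⊕low) halves, low halves
  h[6..8]   : [H^4 : H^3] in the same three forms
  h[9..11]  : [H^6 : H^5] in the same three forms
  h[12..14] : [H^8 : H^7] in the same three forms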
5. It would help with comments explaining the naming convention used for
the named registers, and the instruction sequence used for a single Karatsuba multiplication, with any needed comments.
I tried to make a reader-friendly version of the implementation to make it clearer with appropriate comments. Let me know if the documentation still looks sketchy.
6. Is 8-way unrolling really necessary to get full utilization of the
execution units? And it's also not yet clear to me what 8-way means, is that 8 blocks of 16 bytes each (i.e., 128 bytes input), or 8 input bytes?
Processing 4 blocks is enough to saturate the execution units, but processing 8 blocks (each block is 128 bits) halves the number of times the reduction procedure runs compared to 4 blocks per loop, so it performs better. However, to make the implementation easier to review, I cut the loop down to processing 4 blocks.
- Do you need any bit reversal? As you have mentioned, the multiplication operation is symmetric under bit reversal, so ideally bit reversal should be needed at most when setting the key and extracting the digest, but not by the main workhorse, gcm_hash.
You got it. If the bit-reflection of the key is handled, there is no need to handle the bit-reflection of the multiplication product with that key. So any upcoming multiplication will be bit-reversal-free.
I would like to explain the 'vpmsumd' instruction in more detail. On x86, the 'pclmulqdq' instruction is used for carry-less operations; an immediate value passed as the third parameter of the instruction specifies which doublewords will be multiplied. 'vpmsumd', however, does the following operation: (high-order doubleword of the second parameter * high-order doubleword of the third parameter) XOR (low-order doubleword of the second parameter * low-order doubleword of the third parameter)
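For reference, a scalar model of that operation (my own sketch for illustration, not Nettle code; GCM's bit-reflection is ignored here):

#include <stdint.h>

/* A 128-bit value split into two 64-bit doublewords. */
typedef struct { uint64_t hi, lo; } u128;

/* Carry-less 64x64 -> 128 bit multiply (slow bit-by-bit reference). */
static u128
clmul64 (uint64_t a, uint64_t b)
{
  u128 r = {0, 0};
  unsigned i;
  for (i = 0; i < 64; i++)
    if ((b >> i) & 1)
      {
        r.lo ^= a << i;
        if (i > 0)
          r.hi ^= a >> (64 - i);
      }
  return r;
}

/* vpmsumd: (a.hi x b.hi) XOR (a.lo x b.lo), both carry-less. */
static u128
vpmsumd_model (u128 a, u128 b)
{
  u128 h = clmul64 (a.hi, b.hi);
  u128 l = clmul64 (a.lo, b.lo);
  u128 r = { h.hi ^ l.hi, h.lo ^ l.lo };
  return r;
}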
On Sat, Oct 3, 2020 at 7:00 PM Maamoun TK maamoun.tk@googlemail.com wrote:
... 2. ... has different name on CLANG '__builtin_altivec_crypto_vpmsumd' so we will end up using a lot of conditions to check the variant of compiler plus writing inline assembly code for 'vpmsumd' in case the variant has intrinsic issue with it.
And __vpmsumd for IBM's XL C/C++ compiler.
Clang plays games with the preprocessor macros. It pretends to be GCC, Clang and XLC all at once, but it can't handle the other compilers' intrinsics.
Here's how to setup the preprocessor macro tests:
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
  // IBM XL C/C++
#elif defined(__clang__)
  // Clang
#else
  // GCC
#endif
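Putting those tests together with the intrinsic names mentioned in this thread, the dispatch could look roughly like this (an untested sketch, only to show how much per-compiler conditioning the C route needs):

#include <altivec.h>

#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
/* IBM XL C/C++ */
# define VPMSUMD(a, b) __vpmsumd ((a), (b))
#elif defined(__clang__)
/* Clang */
# define VPMSUMD(a, b) __builtin_altivec_crypto_vpmsumd ((a), (b))
#else
/* GCC */
# define VPMSUMD(a, b) __builtin_crypto_vpmsumd ((a), (b))
#endif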
I believe all the PowerPC machines on the GCC compile farm have IBM XL C/C++ installed.
Jeff
Maamoun TK maamoun.tk@googlemail.com writes:
Done! I will post the assembly part first so you can review it.
Thanks. I hope to get the time to read it carefully soon.
- init_key() and gcm_hash() functions are connected to each other through
a shared table, it makes it easier to modify the implementation if both are written in the same way. 2. we have to use intrinsics for certain operations like 'vpmsumd', furthermore '__builtin_crypto_vpmsumd' is buggy on certain versions of GCC https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91275 and has different name on CLANG '__builtin_altivec_crypto_vpmsumd' so we will end up using a lot of conditions to check the variant of compiler plus writing inline assembly code for 'vpmsumd' in case the variant has intrinsic issue with it. I still prefer to have both functions in the same file, I separated the 'define' macros for each function so each function has its own define section above its prologue.
I see. And I wouldn't want C code with machine-specific or compiler-specific intrinsics. If there's no reasonable way to do it in portable C, let's stick to assembly.
- What's TableElemAlign? Assuming GCM_TABLE_BITS is 8 (current Nettle
ABI), you can treat struct gcm_key as a blob of size 4096 bytes, with alignment corresponding to what the C compiler uses for uint64_t. Are you using some padding at the start (depending on address) to ensure you get stronger alignment? And 256 byte alignment sounds a bit more than needed?
The compiler aligns each element of gcm_key array at 0x100 perhaps because the struct is declared as union so for example if I want to get the 'H' value that is assigned into the 9th index, I have to add 0x800 to the array address to get that value.
That's highly unexpected! It makes struct gcm_key 16 times larger than intended, 64 KByte rather than 4 KByte, which seems pretty bad. I would expect more or less any C compiler to use size 16 and 8-byte minimum alignment for the elements (and I wish there were a nice and portable way to enforce minimum 16-byte alignment). Can you double check, and try to find an explanation for this?
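A minimal check, assuming the installed nettle/gcm.h declarations (my sketch):

#include <stdio.h>
#include <stdalign.h>
#include <nettle/gcm.h>

int
main (void)
{
  struct gcm_key k;
  printf ("sizeof (union nettle_block16) = %zu\n", sizeof (union nettle_block16));
  printf ("alignof (union nettle_block16) = %zu\n", (size_t) alignof (union nettle_block16));
  printf ("sizeof (struct gcm_key) = %zu\n", sizeof (struct gcm_key));
  printf ("element stride = %td\n", (char *) &k.h[1] - (char *) &k.h[0]);
  /* On a typical 64-bit ABI I'd expect 16, 8 (or 16), 4096 and 16 here,
     i.e. no 0x100 stride between elements. */
  return 0;
}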
I would like to explain more about 'vpmsumd' instruction, in x86 arch the 'pclmulqdq' instruction is used for carry-less operations. To use 'pclmulqdq' an immediate value should be passed to the third parameter of the instruction to specify which doublewords will be multiplied. However, 'vpmsumd' do the following operation: (High-order doubleword of the second parameter * High-order doubleword of the third parameter) XOR (Low-order doubleword of the second parameter * Low-order doubleword of the third parameter)
Interesting! Do you use inputs where one doubleword is zero, making one or the other of the xored values zero, or is there some clever way to take advantage of the built-in wraparound? I guess one can also do some interesting things with other selected parts of the inputs zero, for example, the middle word of one of the operands, or all odd-numbered bits, or...
Regards, /Niels
nisse@lysator.liu.se (Niels Möller) writes:
Ok, let me see if I can get the math right first (and ignore the issues of bit order and reversal). Thanks a lot for the explanation of this curious instruction; I hope I have understood it correctly, because it seems really useful.
We have the polynomial p(x) = x^128 + x^7 + x^2 + x + 1 over Z_2, which implies that
x^128 = x^7 + x^2 + x + 1 (mod p(x))
so let's call this simpler polynomial p'(x) = x^7 + x^2 + x + 1.
As input we have two polynomials, one depending on the key (i.e., invariant when processing a large message), represented as 128 bits each. Split them in 64-bit pieces we can reason about,
A = a_1(x) x^64 + a_0(x), B = b_1(x) x^64 + b_0(x),
with (unreduced) product C(x) = A(x) B(x), split in 64-bit pieces as
C(x) = c_3(x) x^192 + c_2(x) x^128 + c_1(x) x^64 + c_0 (x)
(where c_3 actually is only 63 bits). To reduce, we use x^128 = p'(x) (mod p(x)). We'd then need to multiply the high half with p'(x), say
D = (c_3(x) x^64 + c_2(x)) p'(x) = d_2(x) x^128 + d_1(x) x^64 + d_0(x)
where we get the high few bits in d_2(x). So we'd need to compute d_2 p'(x) again. And finally, we get the result C(x) mod p by summing/xoring
  c_1(x) x^64 + c_0(x)
  d_1(x) x^64 + d_0(x)
  d_2(x) p'(x)
  --------------------------
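To make that concrete, here is a scalar sketch of exactly this reduction, reusing the u128/clmul64 helpers from my earlier sketch (bit order still ignored):

/* p'(x) = x^7 + x^2 + x + 1 as a bit mask. */
#define P_PRIME 0x87

/* Unreduced product C(x) = c_3 x^192 + c_2 x^128 + c_1 x^64 + c_0. */
typedef struct { uint64_t c3, c2, c1, c0; } u256;

static u128
gcm_reduce (u256 c)
{
  /* D = (c_3 x^64 + c_2) p'(x) = d_2 x^128 + d_1 x^64 + d_0 */
  u128 lo = clmul64 (c.c2, P_PRIME);  /* c_2 p': d_0 and low bits of d_1 */
  u128 hi = clmul64 (c.c3, P_PRIME);  /* c_3 p': rest of d_1, plus d_2 */
  uint64_t d0 = lo.lo;
  uint64_t d1 = lo.hi ^ hi.lo;
  uint64_t d2 = hi.hi;                /* just the high few bits */
  /* Fold d_2 once more with p'(x), then xor the three rows above. */
  u128 e = clmul64 (d2, P_PRIME);     /* fits in the low doubleword */
  u128 r = { c.c1 ^ d1, c.c0 ^ d0 ^ e.lo };
  return r;
}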
Now, let's look closer at the initial multiplication, A(x) B(x), which can be expanded as
a_1(x) b_1(x) x^128 + (a_1(x) b_0(x) + a_0(x) b_1(x)) x^64 + a_0(x) b_0(x)
I see two tricks: First, we can precompute a'_2(x) x^64 + a'_1(x) = a_1(x) x^128 (mod p(x)), and replace the first term with
(a'_2(x) x^64 + a'_1(x)) b_1(x)
That should eliminate the annoying high bits d_2 in the reduction, at the cost of one more 64x64 multiply. I would expect that's the same number of operations, but a shallower dependency path. We get a 192-bit partially reduced result, instead of the full 256-bit result. Let's denote this product as
h_2(x) x^128 + h_1(x) x^64 + h_0(x) = (a'_2(x) x^64 + a'_1(x)) b_1(x)
Second, if we also prepare a register with the swapped A, a_0(x) x^64 + a_1(x), then the vpmsumd instruction, if I understand you correctly, can compute the middle term (a_1(x) b_0(x) + a_0(x) b_1(x)) in one operation. So we get down to 3 64x64 multiplies for free, without the reconstruction needed for standard Karatsuba multiplication. So if we compute the middle term like this,
m_1(x) x^64 + m_0(x) = a_1(x) b_0(x) + a_0(x) b_1(x)
The part we need to fold is m_1(x) + h_2(x); then we get the fully reduced result by summing
  a_0(x) b_0(x)             (127 bits, one vpmsumd with one input half zero)
  m_0(x)                    (64 bits)
  h_1(x) x^64 + h_0(x)      (128 bits)
  (m_1(x) + h_2(x)) p'(x)   (70 or so bits)
If I count it correctly, we get the fully reduced result with 5 vpmsumd multiplications, one for a_0 b_0, one for the middle terms, two for the multiplication with a', and one for the only explicit multiplication with p'(x). And if we do multiple blocks in parallel, they can share this final multiply.
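The same count in a scalar sketch (again reusing clmul64, vpmsumd_model and P_PRIME from the earlier sketches; each clmul64 call stands in for a vpmsumd with the unused doubleword of one operand zeroed):

/* A = a_1 x^64 + a_0 (a.hi = a_1, a.lo = a_0), likewise B.
   ap = a_1(x) p'(x) = a'_2 x^64 + a'_1, precomputed once per key
   as clmul64 (a.hi, P_PRIME). */
static u128
gf_mul_5 (u128 a, u128 ap, u128 b)
{
  u128 as = { a.lo, a.hi };                /* swapped A, also per-key */
  u128 l = clmul64 (a.lo, b.lo);           /* 1: a_0 b_0 */
  u128 m = vpmsumd_model (as, b);          /* 2: m_1 x^64 + m_0 */
  u128 hl = clmul64 (ap.lo, b.hi);         /* 3: a'_1 b_1 */
  u128 hh = clmul64 (ap.hi, b.hi);         /* 4: a'_2 b_1 */
  uint64_t h0 = hl.lo;
  uint64_t h1 = hl.hi ^ hh.lo;
  uint64_t h2 = hh.hi;
  u128 f = clmul64 (m.hi ^ h2, P_PRIME);   /* 5: (m_1 + h_2) p' */
  u128 r = { l.hi ^ m.lo ^ h1 ^ f.hi,
             l.lo ^ h0 ^ f.lo };
  return r;
}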
There are surely further tricks, e.g., it seems the a_0(x) b_0(x) multiply can share a vpmsumd instruction with the a'_1(x) b_1(x) multiply, if we just prepare a register with a'_1(x) x^64 + a_0(x); then replacing a_1 with the twice-as-large a' values comes for free. One could also attempt to let operations corresponding to separate input blocks share vpmsumd instructions.
Regards, /Niels