---
 configure.ac               |   6 +-
 fat-ppc.c                  |  33 ++
 fat-setup.h                |   7 +
 gcm.c                      |  69 +++-
 powerpc64/fat/gcm-hash.asm |  39 +++
 powerpc64/p8/gcm-hash.asm  | 773 +++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 909 insertions(+), 18 deletions(-)
 create mode 100644 powerpc64/fat/gcm-hash.asm
 create mode 100644 powerpc64/p8/gcm-hash.asm
diff --git a/configure.ac b/configure.ac
index e9983697..0129f950 100644
--- a/configure.ac
+++ b/configure.ac
@@ -488,7 +488,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
   sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
 
 # Assembler files which generate additional object files if they are used.
-asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \
+asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm \
   aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \
   chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \
   salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \
@@ -612,9 +612,9 @@ AH_VERBATIM([HAVE_NATIVE],
 #undef HAVE_NATIVE_ecc_secp384r1_redc
 #undef HAVE_NATIVE_ecc_secp521r1_modp
 #undef HAVE_NATIVE_ecc_secp521r1_redc
-#undef HAVE_NATIVE_gcm_init_key8
+#undef HAVE_NATIVE_gcm_init_key
+#undef HAVE_NATIVE_gcm_hash
 #undef HAVE_NATIVE_gcm_hash8
-#undef HAVE_NATIVE_gcm_fill
 #undef HAVE_NATIVE_salsa20_core
 #undef HAVE_NATIVE_salsa20_2core
 #undef HAVE_NATIVE_fat_salsa20_2core
diff --git a/fat-ppc.c b/fat-ppc.c
index 2bc50481..d73b45b0 100644
--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -120,6 +120,16 @@ DECLARE_FAT_FUNC(_nettle_aes_decrypt, aes_crypt_internal_func)
 DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, c)
 DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, ppc64)
 
+#if GCM_TABLE_BITS == 8
+DECLARE_FAT_FUNC(_nettle_gcm_init_key, gcm_init_key_func)
+DECLARE_FAT_FUNC_VAR(gcm_init_key, gcm_init_key_func, c)
+DECLARE_FAT_FUNC_VAR(gcm_init_key, gcm_init_key_func, ppc64)
+
+DECLARE_FAT_FUNC(_nettle_gcm_hash, gcm_hash_func)
+DECLARE_FAT_FUNC_VAR(gcm_hash, gcm_hash_func, c)
+DECLARE_FAT_FUNC_VAR(gcm_hash, gcm_hash_func, ppc64)
+#endif /* GCM_TABLE_BITS == 8 */
+
 static void CONSTRUCTOR
 fat_init (void)
 {
@@ -139,11 +149,23 @@ fat_init (void)
       fprintf (stderr, "libnettle: enabling arch 2.07 code.\n");
       _nettle_aes_encrypt_vec = _nettle_aes_encrypt_ppc64;
       _nettle_aes_decrypt_vec = _nettle_aes_decrypt_ppc64;
+#if GCM_TABLE_BITS == 8
+      /* Switch _nettle_gcm_init_key_vec and _nettle_gcm_hash_vec as a
+         pair: each init_key variant fills the gcm_key table with a
+         layout that only the matching hash variant can read, e.g. the
+         table _nettle_gcm_init_key_c() fills is incompatible with _nettle_gcm_hash_ppc64(). */
+      _nettle_gcm_init_key_vec = _nettle_gcm_init_key_ppc64;
+      _nettle_gcm_hash_vec = _nettle_gcm_hash_ppc64;
+#endif /* GCM_TABLE_BITS == 8 */
     }
   else
     {
       _nettle_aes_encrypt_vec = _nettle_aes_encrypt_c;
       _nettle_aes_decrypt_vec = _nettle_aes_decrypt_c;
+#if GCM_TABLE_BITS == 8
+      _nettle_gcm_init_key_vec = _nettle_gcm_init_key_c;
+      _nettle_gcm_hash_vec = _nettle_gcm_hash_c;
+#endif /* GCM_TABLE_BITS == 8 */
     }
 }
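The arch 2.07 branch above is taken when the loader reports POWER8's ISA
level. As a rough illustration, the check boils down to an auxiliary-vector
query like the sketch below (a minimal model assuming Linux/glibc and the
kernel's HWCAP2 bit; fat-ppc.c's actual probe is more portable than this):

    #include <stdio.h>
    #include <sys/auxv.h>

    int
    main (void)
    {
      /* AT_HWCAP2 and PPC_FEATURE2_ARCH_2_07 come from the kernel via the
         ELF auxiliary vector; guarded because they exist only on powerpc
         targets. */
    #if defined(AT_HWCAP2) && defined(PPC_FEATURE2_ARCH_2_07)
      int p8 = (getauxval (AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07) != 0;
    #else
      int p8 = 0;
    #endif
      printf ("arch 2.07 (POWER8) code path usable: %s\n", p8 ? "yes" : "no");
      return 0;
    }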
@@ -160,3 +182,14 @@ DEFINE_FAT_FUNC(_nettle_aes_decrypt, void,
 		 size_t length, uint8_t *dst,
 		 const uint8_t *src),
 		(rounds, keys, T, length, dst, src))
+
+#if GCM_TABLE_BITS == 8
+DEFINE_FAT_FUNC(_nettle_gcm_init_key, void,
+		(union nettle_block16 *table),
+		(table))
+
+DEFINE_FAT_FUNC(_nettle_gcm_hash, void,
+		(const struct gcm_key *key, union nettle_block16 *x,
+		 size_t length, const uint8_t *data),
+		(key, x, length, data))
+#endif /* GCM_TABLE_BITS == 8 */
diff --git a/fat-setup.h b/fat-setup.h
index 99f1ea67..4a9f7969 100644
--- a/fat-setup.h
+++ b/fat-setup.h
@@ -162,6 +162,13 @@ typedef void aes_crypt_internal_func (unsigned rounds, const uint32_t *keys,
 				      size_t length, uint8_t *dst,
 				      const uint8_t *src);
 
+#if GCM_TABLE_BITS == 8
+typedef void gcm_init_key_func (union nettle_block16 *table);
+
+typedef void gcm_hash_func (const struct gcm_key *key, union nettle_block16 *x,
+			    size_t length, const uint8_t *data);
+#endif /* GCM_TABLE_BITS == 8 */
+
 typedef void *(memxor_func)(void *dst, const void *src, size_t n);
 
 typedef void salsa20_core_func (uint32_t *dst, const uint32_t *src, unsigned rounds);
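These typedefs are the signatures behind the DECLARE_FAT_FUNC and
DEFINE_FAT_FUNC uses above. In rough outline, each public symbol forwards
through a per-function pointer that fat_init() retargets. A simplified model
of that mechanism (made-up names and a toy signature; the real macros also
handle lazy initialization on first call):

    #include <stdio.h>

    typedef void gcm_init_key_func (int *table);

    /* Two stand-in implementations. */
    static void init_key_c (int *table)     { *table = 1; }
    static void init_key_ppc64 (int *table) { *table = 2; }

    /* The "_vec" pointer the dispatcher retargets. */
    static gcm_init_key_func *init_key_vec = init_key_c;

    /* The public symbol stays stable; callers never see the switch. */
    void
    nettle_style_gcm_init_key (int *table)
    {
      init_key_vec (table);
    }

    int
    main (void)
    {
      int t;
      nettle_style_gcm_init_key (&t);
      printf ("dispatched to variant %d\n", t); /* 1: the C fallback */
      init_key_vec = init_key_ppc64;            /* what fat_init() does */
      nettle_style_gcm_init_key (&t);
      printf ("dispatched to variant %d\n", t); /* 2: the ppc64 variant */
      return 0;
    }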
diff --git a/gcm.c b/gcm.c
index 48b3e75a..787c303e 100644
--- a/gcm.c
+++ b/gcm.c
@@ -140,6 +140,26 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table)
   memcpy (x->b, Z.b, sizeof(Z));
 }
 # elif GCM_TABLE_BITS == 8
+# if HAVE_NATIVE_gcm_init_key
+
+#define gcm_init_key _nettle_gcm_init_key
+void
+_nettle_gcm_init_key (union nettle_block16 *table);
+/* For fat builds */
+void
+_nettle_gcm_init_key_c (union nettle_block16 *table);
+# endif /* HAVE_NATIVE_gcm_init_key */
+# if HAVE_NATIVE_gcm_hash
+
+#define gcm_hash _nettle_gcm_hash
+void
+_nettle_gcm_hash (const struct gcm_key *key, union nettle_block16 *x,
+		  size_t length, const uint8_t *data);
+/* For fat builds */
+void
+_nettle_gcm_hash_c (const struct gcm_key *key, union nettle_block16 *x,
+		    size_t length, const uint8_t *data);
+# endif /* HAVE_NATIVE_gcm_hash */
 # if HAVE_NATIVE_gcm_hash8
 
 #define gcm_hash _nettle_gcm_hash8
@@ -228,6 +248,32 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table)
 /* Increment the rightmost 32 bits. */
 #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4)
 
+#ifdef gcm_init_key
+void
+_nettle_gcm_init_key_c(union nettle_block16 *table)
+#else
+static void
+gcm_init_key(union nettle_block16 *table)
+#endif /* !gcm_init_key */
+{
+#if GCM_TABLE_BITS
+  /* Middle element if GCM_TABLE_BITS > 0, otherwise the first
+     element */
+  unsigned i = (1<<GCM_TABLE_BITS)/2;
+
+  /* Algorithm 3 from the gcm paper. First do powers of two, then do
+     the rest by adding. */
+  while (i /= 2)
+    block16_mulx_ghash(&table[i], &table[2*i]);
+  for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2)
+    {
+      unsigned j;
+      for (j = 1; j < i; j++)
+	block16_xor3(&table[i+j], &table[i], &table[j]);
+    }
+#endif
+}
+
 /* Initialization of GCM.
  * @ctx: The context of GCM
  * @cipher: The context of the underlying block cipher
@@ -244,25 +290,19 @@ gcm_set_key(struct gcm_key *key,
   /* H */
   memset(key->h[0].b, 0, GCM_BLOCK_SIZE);
   f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b);
-
-#if GCM_TABLE_BITS
-  /* Algorithm 3 from the gcm paper. First do powers of two, then do
-     the rest by adding. */
-  while (i /= 2)
-    block16_mulx_ghash(&key->h[i], &key->h[2*i]);
-  for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2)
-    {
-      unsigned j;
-      for (j = 1; j < i; j++)
-	block16_xor3(&key->h[i+j], &key->h[i],&key->h[j]);
-    }
-#endif
+
+  gcm_init_key(key->h);
 }
 
-#ifndef gcm_hash
+#ifdef gcm_hash
+void
+_nettle_gcm_hash_c(const struct gcm_key *key, union nettle_block16 *x,
+		   size_t length, const uint8_t *data)
+#else
 static void
 gcm_hash(const struct gcm_key *key, union nettle_block16 *x,
 	 size_t length, const uint8_t *data)
+#endif /* !gcm_hash */
 {
   for (; length >= GCM_BLOCK_SIZE;
        length -= GCM_BLOCK_SIZE, data += GCM_BLOCK_SIZE)
@@ -276,7 +316,6 @@ gcm_hash(const struct gcm_key *key, union nettle_block16 *x,
       gcm_gf_mul (x, key->h);
     }
 }
-#endif /* !gcm_hash */
 
 static void
 gcm_hash_sizes(const struct gcm_key *key, union nettle_block16 *x,
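All of the gcm_hash variants wired up above compute the same primitive:
multiplication in GF(2^128) with GCM's bit order, reduced modulo
x^128 + x^7 + x^2 + x + 1. For orientation, here is the bitwise reference
version of that multiply, following Algorithm 1 of NIST SP 800-38D
(function names are mine; nettle's table-driven gcm_gf_mul() and the
vpmsumd code below are optimizations of exactly this operation):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Reference GF(2^128) multiply: bits of x are taken MSB first, and a
       right shift of v reduces modulo x^128 + x^7 + x^2 + x + 1 via the
       0xE1 constant. */
    static void
    ghash_gf_mul (uint8_t z[16], const uint8_t x[16], const uint8_t y[16])
    {
      uint8_t v[16], r[16] = {0};
      memcpy (v, y, 16);
      for (unsigned i = 0; i < 128; i++)
        {
          if ((x[i / 8] >> (7 - i % 8)) & 1)
            for (unsigned j = 0; j < 16; j++)
              r[j] ^= v[j];
          unsigned lsb = v[15] & 1;
          for (unsigned j = 15; j > 0; j--)
            v[j] = (uint8_t) ((v[j] >> 1) | (v[j - 1] << 7));
          v[0] >>= 1;
          if (lsb)
            v[0] ^= 0xE1;
        }
      memcpy (z, r, 16);
    }

    int
    main (void)
    {
      uint8_t one[16] = { 0x80 }; /* the polynomial "1" in GCM bit order */
      uint8_t h[16], z[16];
      for (unsigned i = 0; i < 16; i++)
        h[i] = (uint8_t) (17 * i + 3);
      ghash_gf_mul (z, one, h);
      printf ("1 * H == H: %s\n", memcmp (z, h, 16) == 0 ? "yes" : "no");
      return 0;
    }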
diff --git a/powerpc64/fat/gcm-hash.asm b/powerpc64/fat/gcm-hash.asm
new file mode 100644
index 00000000..7fd77223
--- /dev/null
+++ b/powerpc64/fat/gcm-hash.asm
@@ -0,0 +1,39 @@
+C powerpc64/fat/gcm-hash.asm
+
+
+ifelse(`
+   Copyright (C) 2020 Mamone Tarsha
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl picked up by configure
+dnl PROLOGUE(_nettle_gcm_init_key)
+dnl PROLOGUE(_nettle_gcm_hash)
+
+define(`fat_transform', `$1_ppc64')
+include_src(`powerpc64/p8/gcm-hash.asm')
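In the P8 implementation that follows, the 8x and 2x block loops use
two-term Karatsuba over 64-bit doublewords: instead of computing the middle
partial product Hh*Cl ⊕ Hl*Ch with two extra multiplies, they compute
(Hh⊕Hl)*(Ch⊕Cl) and XOR back the outer products, which works because
carry-less multiplication distributes over XOR. A self-contained check of
that identity, with a bitwise stand-in for one vpmsumd doubleword product
(all names hypothetical, arbitrary test operands):

    #include <stdint.h>
    #include <stdio.h>

    /* Carry-less 64x64 -> 128 multiply, bit by bit. */
    static void
    clmul64 (uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
    {
      uint64_t h = 0, l = 0;
      for (unsigned i = 0; i < 64; i++)
        if ((b >> i) & 1)
          {
            l ^= a << i;
            if (i)
              h ^= a >> (64 - i);
          }
      *hi = h;
      *lo = l;
    }

    int
    main (void)
    {
      uint64_t hh = 0x0123456789abcdefULL, hl = 0xfedcba9876543210ULL;
      uint64_t ch = 0xdeadbeefcafef00dULL, cl = 0x0badc0ffee123457ULL;
      uint64_t m1h, m1l, m2h, m2l, kh, kl, ah, al, bh, bl;

      /* Schoolbook middle term: Hh*Cl xor Hl*Ch. */
      clmul64 (hh, cl, &m1h, &m1l);
      clmul64 (hl, ch, &m2h, &m2l);

      /* Karatsuba: (Hh^Hl)*(Ch^Cl) xor Hh*Ch xor Hl*Cl. */
      clmul64 (hh ^ hl, ch ^ cl, &kh, &kl);
      clmul64 (hh, ch, &ah, &al);
      clmul64 (hl, cl, &bh, &bl);

      printf ("%d\n", (m1h ^ m2h) == (kh ^ ah ^ bh)
                      && (m1l ^ m2l) == (kl ^ al ^ bl)); /* prints 1 */
      return 0;
    }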
diff --git a/powerpc64/p8/gcm-hash.asm b/powerpc64/p8/gcm-hash.asm
new file mode 100644
index 00000000..f987f2ca
--- /dev/null
+++ b/powerpc64/p8/gcm-hash.asm
@@ -0,0 +1,773 @@
+C powerpc64/p8/gcm-hash.asm
+
+ifelse(`
+   Copyright (C) 2020 Mamone Tarsha
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+C Alignment of the gcm_key table elements; the table is declared in gcm.h
+define(`TableElemAlign', `0x100')
+
+C Register usage:
+
+define(`SP', `r1')
+define(`TOCP', `r2')
+
+define(`TABLE', `r3')
+define(`X', `r4')
+define(`LENGTH', `r5')
+define(`DATA', `r6')
+
+define(`zero', `v0')
+define(`swap_mask', `v1')
+define(`hidw_mask', `v2')
+define(`lodw_mask', `v3')
+define(`poly', `v4')
+define(`poly_h', `v4')
+define(`poly_l', `v5')
+define(`RP', `v6')
+define(`Mh', `v7')
+define(`Ml', `v8')
+define(`H', `v9')
+define(`Hh', `v10')
+define(`Hl', `v11')
+define(`RP2', `v9')
+define(`M2h', `v10')
+define(`M2l', `v11')
+
+define(`sl1', `v1')
+define(`msb', `v5')
+define(`H2', `v6')
+define(`H2h', `v7')
+define(`H2l', `v8')
+define(`H_h', `v12')
+define(`H_m', `v13')
+define(`H_l', `v14')
+define(`H_Hh', `v12')
+define(`H_H', `v13')
+define(`H_Hl', `v14')
+define(`H_t', `v15')
+define(`H2_h', `v16')
+define(`H2_m', `v17')
+define(`H2_l', `v18')
+define(`H2_t', `v19')
+
+define(`C0', `v6')
+define(`C1', `v7')
+define(`C2', `v8')
+define(`C3', `v12')
+define(`C4', `v6')
+define(`C5', `v7')
+define(`C6', `v8')
+define(`C7', `v12')
+
+define(`C', `v13')
+
+define(`Ch', `v14')
+define(`Cl', `v15')
+define(`Cm', `v16')
+
+define(`C01h', `v14')
+define(`C01l', `v15')
+define(`C01', `v16')
+define(`C23h', `v17')
+define(`C23l', `v18')
+define(`C23', `v19')
+define(`C45h', `v20')
+define(`C45l', `v21')
+define(`C45', `v22')
+define(`C67h', `v6')
+define(`C67l', `v7')
+define(`C67', `v8')
+
+define(`H21', `v9')
+define(`H21h', `v10')
+define(`H21l', `v11')
+define(`H43', `v23')
+define(`H43h', `v24')
+define(`H43l', `v25')
+define(`H65', `v26')
+define(`H65h', `v27')
+define(`H65l', `v28')
+define(`H87', `v29')
+define(`H87h', `v30')
+define(`H87l', `v31')
+
+.file "gcm-hash.asm"
+
+.text
+
+ # void gcm_init_key (union nettle_block16 *table)
+
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_gcm_init_key)
+    DATA_LOAD_VEC(poly,.polynomial,r7)
+IF_LE(`DATA_LOAD_VEC(swap_mask,.swap_mask,r7)')
+    DATA_LOAD_VEC(hidw_mask,.hidw_mask,r7)
+    DATA_LOAD_VEC(lodw_mask,.lodw_mask,r7)
+
+    li r10,8*TableElemAlign
+    lxvd2x VSR(H),r10,TABLE  # load H
+IF_LE(`vperm H,H,H,swap_mask')
+
+    # --- calculate H = H shift left 1 modulo polynomial ---
+
+    vupkhsw msb,H            # most significant bit word-extend
+    vspltisb sl1,1           # splat 1 for shift left
+    vspltw msb,msb,0         # most significant bit extend
+    vsl H,H,sl1              # H shift left 1
+    vand msb,msb,poly
+    vxor zero,zero,zero
+    vxor H_t,H,msb
+
+    vsldoi H,H_t,H_t,8       # doubleword swap
+    vsldoi Hh,H,zero,8
+    vsldoi Hl,zero,H,8
+
+    # --- calculate H^2 = H*H ---
+
+    # reduction pre-processing
+    vsldoi poly_h,zero,poly,8
+    vsldoi poly_l,poly_h,poly_h,8
+
+    # polynomial multiplication "classical"
+    vpmsumd H_h,H_t,Hh       # H^1h*H^1h
+    vpmsumd H_l,H_t,Hl       # H^1l*H^1l
+    vpmsumd H_m,H_t,H        # H^1h*H^1l⊕H^1l*H^1h
+
+    # reduction first phase  # [1]
+    vpmsumd RP,H_l,poly_h    # [1]
+
+    # polynomial multiplication post-processing # [2]
+    vsldoi Mh,zero,H_m,8     # [2]
+    vsldoi Ml,H_m,zero,8     # [2]
+    vsldoi RP,RP,RP,8        # [1]
+    vxor H_h,H_h,Mh          # [2]
+    vxor H_l,H_l,Ml          # [2]
+    vxor H_l,H_l,RP          # [1]
+
+    # reduction second phase
+    vpmsumd RP,H_l,poly_l
+    vxor H_h,H_l,H_h
+    vxor H2_t,H_h,RP
+
+    vsldoi H2,H2_t,H2_t,8
+    vsldoi H2h,H2,zero,8
+    vsldoi H2l,zero,H2,8
+
+    # --- calculate [H^2.Hi⊕H^2.Lo:H^1.Hi⊕H^1.Lo] ---
+
+    vperm H_Hh,H2,H,lodw_mask
+    vperm H_Hl,H2,H,hidw_mask
+    vxor H_H,H_Hh,H_Hl
+
+    # --- store H,[H^2.Hi⊕H^2.Lo:H^1.Hi⊕H^1.Lo] ---
+
+    li r8,0*TableElemAlign
+    li r9,1*TableElemAlign
+    li r10,2*TableElemAlign
+    stxvd2x VSR(Hl),r8,TABLE
+    stxvd2x VSR(H),r9,TABLE
+    stxvd2x VSR(Hh),r10,TABLE
+
+    li r8,3*TableElemAlign
+    li r9,4*TableElemAlign
+    li r10,5*TableElemAlign
+    stxvd2x VSR(H_Hh),r8,TABLE
+    stxvd2x VSR(H_H),r9,TABLE
+    stxvd2x VSR(H_Hl),r10,TABLE
+
+    # --- calculate H^3,H^4 ---
+
+    # polynomial multiplication "classical"
+    vpmsumd H_l,H_t,H2l      # H^1l*H^2l
+    vpmsumd H_m,H_t,H2       # H^1h*H^2l⊕H^1l*H^2h
+    vpmsumd H_h,H_t,H2h      # H^1h*H^2h
+    vpmsumd H2_l,H2_t,H2l    # H^2l*H^2l
+    vpmsumd H2_m,H2_t,H2     # H^2h*H^2l⊕H^2l*H^2h
+    vpmsumd H2_h,H2_t,H2h    # H^2h*H^2h
+
+    # reduction first phase  # [1]
+    vpmsumd RP,H_l,poly_h    # [1] H^3
+    vpmsumd RP2,H2_l,poly_h  # [1] H^4
+
+    # polynomial multiplication post-processing # [2]
+    vsldoi Mh,zero,H_m,8     # [2] H^3
+    vsldoi M2h,zero,H2_m,8   # [2] H^4
+    vsldoi Ml,H_m,zero,8     # [2] H^3
+    vsldoi M2l,H2_m,zero,8   # [2] H^4
+    vsldoi RP,RP,RP,8        # [1] H^3
+    vsldoi RP2,RP2,RP2,8     # [1] H^4
+    vxor H_h,H_h,Mh          # [2] H^3
+    vxor H2_h,H2_h,M2h       # [2] H^4
+    vxor H_l,H_l,Ml          # [2] H^3
+    vxor H2_l,H2_l,M2l       # [2] H^4
+    vxor H_l,H_l,RP          # [1] H^3
+    vxor H2_l,H2_l,RP2       # [1] H^4
+
+    # reduction second phase
+    vpmsumd RP,H_l,poly_l    # H^3
+    vpmsumd RP2,H2_l,poly_l  # H^4
+    vxor H_h,H_l,H_h         # H^3
+    vxor H2_h,H2_l,H2_h      # H^4
+    vxor H_h,H_h,RP          # H^3
+    vxor H2_h,H2_h,RP2       # H^4
+
+    vsldoi H2,H2_h,H2_h,8    # H^4
+    vsldoi H,H_h,H_h,8       # H^3
+    vsldoi H2l,zero,H2,8     # H^4
+    vsldoi H2h,H2,zero,8     # H^4
+
+    # --- calculate [H^4.Hi⊕H^4.Lo:H^3.Hi⊕H^3.Lo] ---
+
+    vperm H_Hh,H2,H,lodw_mask
+    vperm H_Hl,H2,H,hidw_mask
+    vxor H_H,H_Hh,H_Hl
+
+    # --- store [H^4.Hi⊕H^4.Lo:H^3.Hi⊕H^3.Lo] ---
+
+    li r8,6*TableElemAlign
+    li r9,7*TableElemAlign
+    li r10,8*TableElemAlign
+    stxvd2x VSR(H_Hh),r8,TABLE
+    stxvd2x VSR(H_H),r9,TABLE
+    stxvd2x VSR(H_Hl),r10,TABLE
+
+    # --- calculate H^5,H^6 ---
+
+    # polynomial multiplication "classical"
+    vpmsumd H_l,H_t,H2l      # H^1l*H^4l
+    vpmsumd H_m,H_t,H2       # H^1h*H^4l⊕H^1l*H^4h
+    vpmsumd H_h,H_t,H2h      # H^1h*H^4h
+    vpmsumd H2_l,H2_t,H2l    # H^2l*H^4l
+    vpmsumd H2_m,H2_t,H2     # H^2h*H^4l⊕H^2l*H^4h
+    vpmsumd H2_h,H2_t,H2h    # H^2h*H^4h
+
+    # reduction first phase  # [1]
+    vpmsumd RP,H_l,poly_h    # [1] H^5
+    vpmsumd RP2,H2_l,poly_h  # [1] H^6
+
+    # polynomial multiplication post-processing # [2]
+    vsldoi Mh,zero,H_m,8     # [2] H^5
+    vsldoi M2h,zero,H2_m,8   # [2] H^6
+    vsldoi Ml,H_m,zero,8     # [2] H^5
+    vsldoi M2l,H2_m,zero,8   # [2] H^6
+    vsldoi RP,RP,RP,8        # [1] H^5
+    vsldoi RP2,RP2,RP2,8     # [1] H^6
+    vxor H_h,H_h,Mh          # [2] H^5
+    vxor H2_h,H2_h,M2h       # [2] H^6
+    vxor H_l,H_l,Ml          # [2] H^5
+    vxor H2_l,H2_l,M2l       # [2] H^6
+    vxor H_l,H_l,RP          # [1] H^5
+    vxor H2_l,H2_l,RP2       # [1] H^6
+
+    # reduction second phase
+    vpmsumd RP,H_l,poly_l    # H^5
+    vpmsumd RP2,H2_l,poly_l  # H^6
+    vxor H_h,H_l,H_h         # H^5
+    vxor H2_h,H2_l,H2_h      # H^6
+    vxor H_h,H_h,RP          # H^5
+    vxor H2_h,H2_h,RP2       # H^6
+
+    vsldoi H2,H2_h,H2_h,8    # H^6
+    vsldoi H,H_h,H_h,8       # H^5
+    vsldoi H2l,zero,H2,8     # H^6
+    vsldoi H2h,H2,zero,8     # H^6
+
+    # --- calculate [H^6.Hi⊕H^6.Lo:H^5.Hi⊕H^5.Lo] ---
+
+    vperm H_Hh,H2,H,lodw_mask
+    vperm H_Hl,H2,H,hidw_mask
+    vxor H_H,H_Hh,H_Hl
+
+    # --- store [H^6.Hi⊕H^6.Lo:H^5.Hi⊕H^5.Lo] ---
+
+    li r8,9*TableElemAlign
+    li r9,10*TableElemAlign
+    li r10,11*TableElemAlign
+    stxvd2x VSR(H_Hh),r8,TABLE
+    stxvd2x VSR(H_H),r9,TABLE
+    stxvd2x VSR(H_Hl),r10,TABLE
+
+    # --- calculate H^7,H^8 ---
+
+    # polynomial multiplication "classical"
+    vpmsumd H_l,H_t,H2l      # H^1l*H^6l
+    vpmsumd H_m,H_t,H2       # H^1h*H^6l⊕H^1l*H^6h
+    vpmsumd H_h,H_t,H2h      # H^1h*H^6h
+    vpmsumd H2_l,H2_t,H2l    # H^2l*H^6l
+    vpmsumd H2_m,H2_t,H2     # H^2h*H^6l⊕H^2l*H^6h
+    vpmsumd H2_h,H2_t,H2h    # H^2h*H^6h
+
+    # reduction first phase  # [1]
+    vpmsumd RP,H_l,poly_h    # [1] H^7
+    vpmsumd RP2,H2_l,poly_h  # [1] H^8
+
+    # polynomial multiplication post-processing # [2]
+    vsldoi Mh,zero,H_m,8     # [2] H^7
+    vsldoi M2h,zero,H2_m,8   # [2] H^8
+    vsldoi Ml,H_m,zero,8     # [2] H^7
+    vsldoi M2l,H2_m,zero,8   # [2] H^8
+    vsldoi RP,RP,RP,8        # [1] H^7
+    vsldoi RP2,RP2,RP2,8     # [1] H^8
+    vxor H_h,H_h,Mh          # [2] H^7
+    vxor H2_h,H2_h,M2h       # [2] H^8
+    vxor H_l,H_l,Ml          # [2] H^7
+    vxor H2_l,H2_l,M2l       # [2] H^8
+    vxor H_l,H_l,RP          # [1] H^7
+    vxor H2_l,H2_l,RP2       # [1] H^8
+
+    # reduction second phase
+    vpmsumd RP,H_l,poly_l    # H^7
+    vpmsumd RP2,H2_l,poly_l  # H^8
+    vxor H_h,H_l,H_h         # H^7
+    vxor H2_h,H2_l,H2_h      # H^8
+    vxor H_h,H_h,RP          # H^7
+    vxor H2_h,H2_h,RP2       # H^8
+
+    vsldoi H,H_h,H_h,8       # H^7
+    vsldoi H2,H2_h,H2_h,8    # H^8
+
+    # --- calculate [H^8.Hi⊕H^8.Lo:H^7.Hi⊕H^7.Lo] ---
+
+    vperm H_Hh,H2,H,lodw_mask
+    vperm H_Hl,H2,H,hidw_mask
+    vxor H_H,H_Hh,H_Hl
+
+    # --- store [H^8.Hi⊕H^8.Lo:H^7.Hi⊕H^7.Lo] ---
+
+    li r8,12*TableElemAlign
+    li r9,13*TableElemAlign
+    li r10,14*TableElemAlign
+    stxvd2x VSR(H_Hh),r8,TABLE
+    stxvd2x VSR(H_H),r9,TABLE
+    stxvd2x VSR(H_Hl),r10,TABLE
+
+    blr
+EPILOGUE(_nettle_gcm_init_key)
+
+ # void gcm_hash (const struct gcm_key *key, union nettle_block16 *x,
+ #                size_t length, const uint8_t *data)
+
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_gcm_hash)
+    vxor zero,zero,zero
+
+    DATA_LOAD_VEC(poly,.polynomial,r7)
+IF_LE(`DATA_LOAD_VEC(swap_mask,.swap_mask,r7)')
+    DATA_LOAD_VEC(hidw_mask,.hidw_mask,r7)
+    DATA_LOAD_VEC(lodw_mask,.lodw_mask,r7)
+
+    vsldoi poly_h,zero,poly,8
+    vsldoi poly_l,poly_h,poly_h,8
+
+    lxvd2x VSR(C),0,X        # load X
+IF_LE(`vperm C,C,C,swap_mask')
+
+    srdi r7,LENGTH,7         # 8x loop count
+    cmpldi r7,0
+    beq L2x
+
+    # backup registers
+    stdu SP,-224(SP)
+    std r28,216(SP)
+    std r29,208(SP)
+    std r30,200(SP)
+    std r31,192(SP)
+    li 8,176
+    stvx 20,8,SP
+    subi 8,8,16
+    stvx 21,8,SP
+    subi 8,8,16
+    stvx 22,8,SP
+    subi 8,8,16
+    stvx 23,8,SP
+    subi 8,8,16
+    stvx 24,8,SP
+    subi 8,8,16
+    stvx 25,8,SP
+    subi 8,8,16
+    stvx 26,8,SP
+    subi 8,8,16
+    stvx 27,8,SP
+    subi 8,8,16
+    stvx 28,8,SP
+    subi 8,8,16
+    stvx 29,8,SP
+    subi 8,8,16
+    stvx 30,8,SP
+    subi 8,8,16
+    stvx 31,8,SP
+
+    # table loading
+    li r8,3*TableElemAlign
+    li r9,4*TableElemAlign
+    li r10,5*TableElemAlign
+    lxvd2x VSR(H21h),r8,TABLE
+    lxvd2x VSR(H21),r9,TABLE
+    lxvd2x VSR(H21l),r10,TABLE
+    li r8,6*TableElemAlign
+    li r9,7*TableElemAlign
+    li r10,8*TableElemAlign
+    lxvd2x VSR(H43h),r8,TABLE
+    lxvd2x VSR(H43),r9,TABLE
+    lxvd2x VSR(H43l),r10,TABLE
+    li r8,9*TableElemAlign
+    li r9,10*TableElemAlign
+    li r10,11*TableElemAlign
+    lxvd2x VSR(H65h),r8,TABLE
+    lxvd2x VSR(H65),r9,TABLE
+    lxvd2x VSR(H65l),r10,TABLE
+    li r8,12*TableElemAlign
+    li r9,13*TableElemAlign
+    li r10,14*TableElemAlign
+    lxvd2x VSR(H87h),r8,TABLE
+    lxvd2x VSR(H87),r9,TABLE
+    lxvd2x VSR(H87l),r10,TABLE
+
+    li r8,0x10
+    li r9,0x20
+    li r10,0x30
+    li r28,0x40
+    li r29,0x50
+    li r30,0x60
+    li r31,0x70
+
+    mtctr r7
+.align 5
+L8x_loop:
+    # input loading
+    lxvd2x VSR(C0),0,DATA    # load C0
+    lxvd2x VSR(C1),r8,DATA   # load C1
+    lxvd2x VSR(C2),r9,DATA   # load C2
+    lxvd2x VSR(C3),r10,DATA  # load C3
+
+    # swap permuting
+IF_LE(`vperm C0,C0,C0,swap_mask
+    vperm C1,C1,C1,swap_mask
+    vperm C2,C2,C2,swap_mask
+    vperm C3,C3,C3,swap_mask')
+
+    # previous digest combining
+    vxor C0,C0,C
+
+    # polynomial multiplication "karatsuba" pre-processing
+    vperm C23h,C2,C3,hidw_mask
+    vperm C23l,C2,C3,lodw_mask
+    vperm C01h,C0,C1,hidw_mask
+    vperm C01l,C0,C1,lodw_mask
+
+    # input loading
+    lxvd2x VSR(C4),r28,DATA  # load C4
+    lxvd2x VSR(C5),r29,DATA  # load C5
+    lxvd2x VSR(C6),r30,DATA  # load C6
+    lxvd2x VSR(C7),r31,DATA  # load C7
+
+    # swap permuting
+IF_LE(`vperm C4,C4,C4,swap_mask
+    vperm C5,C5,C5,swap_mask
+    vperm C6,C6,C6,swap_mask
+    vperm C7,C7,C7,swap_mask')
+
+    # polynomial multiplication "karatsuba" pre-processing
+    vperm C45h,C4,C5,hidw_mask
+    vperm C45l,C4,C5,lodw_mask
+    vperm C67h,C6,C7,hidw_mask
+    vperm C67l,C6,C7,lodw_mask
+    vxor C23,C23h,C23l
+    vxor C01,C01h,C01l
+    vxor C45,C45h,C45l
+    vxor C67,C67h,C67l
+
+    # polynomial multiplication "karatsuba"
+    vpmsumd C23h,C23h,H65h   # H23 = H^6h*C2h⊕H^5h*C3h
+    vpmsumd C23l,C23l,H65l   # L23 = H^6l*C2l⊕H^5l*C3l
+    vpmsumd C01h,C01h,H87h   # H01 = H^8h*C0h⊕H^7h*C1h
+    vpmsumd C01l,C01l,H87l   # L01 = H^8l*C0l⊕H^7l*C1l
+    vpmsumd C67h,C67h,H21h   # H67 = H^2h*C6h⊕H^1h*C7h
+    vpmsumd C67l,C67l,H21l   # L67 = H^2l*C6l⊕H^1l*C7l
+    vpmsumd C45h,C45h,H43h   # H45 = H^4h*C4h⊕H^3h*C5h
+    vpmsumd C45l,C45l,H43l   # L45 = H^4l*C4l⊕H^3l*C5l
+    vpmsumd C23,C23,H65      # M23 = (H^6h⊕H^5h)*(C2h⊕C3h)⊕(H^6l⊕H^5l)*(C2l⊕C3l)
+    vpmsumd C01,C01,H87      # M01 = (H^8h⊕H^7h)*(C0h⊕C1h)⊕(H^8l⊕H^7l)*(C0l⊕C1l)
+    vpmsumd C45,C45,H43      # M45 = (H^4h⊕H^3h)*(C4h⊕C5h)⊕(H^4l⊕H^3l)*(C4l⊕C5l)
+    vpmsumd C67,C67,H21      # M67 = (H^2h⊕H^1h)*(C6h⊕C7h)⊕(H^2l⊕H^1l)*(C6l⊕C7l)
+
+    # polynomial multiplication "karatsuba" post-processing
+    vxor C23,C23,C23h
+    vxor C01,C01,C01h
+    vxor C45,C45,C45h
+    vxor C67,C67,C67h
+    vxor C23,C23,C23l
+    vxor C01,C01,C01l
+    vxor C45,C45,C45l
+    vxor C67,C67,C67l
+
+    # deferred recombination of partial products
+    vxor C01h,C01h,C23h      # H0 = H01⊕H23
+    vxor C45h,C45h,C67h      # H1 = H45⊕H67
+    vxor C01l,C01l,C23l      # L0 = L01⊕L23
+    vxor C45l,C45l,C67l      # L1 = L45⊕L67
+    vxor C01,C01,C23         # M0 = M01⊕M23
+    vxor C45,C45,C67         # M1 = M45⊕M67
+    vxor C01h,C01h,C45h      # H = H0⊕H1
+    vxor C01l,C01l,C45l      # L = L0⊕L1
+    vxor C01,C01,C45         # M = M0⊕M1
+
+    # reduction first phase  # [1]
+    vpmsumd RP,C01l,poly_h   # [1]
+
+    # polynomial multiplication post-processing # [2]
+    vsldoi Mh,zero,C01,8     # [2]
+    vsldoi Ml,C01,zero,8     # [2]
+    vsldoi RP,RP,RP,8        # [1]
+    vxor C01h,C01h,Mh        # [2]
+    vxor C01l,C01l,Ml        # [2]
+    vxor C01l,C01l,RP        # [1]
+
+    # reduction second phase
+    vpmsumd RP,C01l,poly_l
+    vxor C01h,C01l,C01h
+    vxor C,C01h,RP
+
+    addi DATA,DATA,0x80
+    bdnz L8x_loop
+
+    # restore registers
+    li 8,0
+    lvx 31,8,SP
+    addi 8,8,16
+    lvx 30,8,SP
+    addi 8,8,16
+    lvx 29,8,SP
+    addi 8,8,16
+    lvx 28,8,SP
+    addi 8,8,16
+    lvx 27,8,SP
+    addi 8,8,16
+    lvx 26,8,SP
+    addi 8,8,16
+    lvx 25,8,SP
+    addi 8,8,16
+    lvx 24,8,SP
+    addi 8,8,16
+    lvx 23,8,SP
+    addi 8,8,16
+    lvx 22,8,SP
+    addi 8,8,16
+    lvx 21,8,SP
+    addi 8,8,16
+    lvx 20,8,SP
+    ld r31,192(SP)
+    ld r30,200(SP)
+    ld r29,208(SP)
+    ld r28,216(SP)
+    addi SP,SP,224
+
+    clrldi LENGTH,LENGTH,57  # LENGTH mod 128
+L2x:
+    srdi r7,LENGTH,5         # 2x loop count
+    cmpldi r7,0
+    beq L1x
+
+    # table loading
+    li r8,3*TableElemAlign
+    li r9,4*TableElemAlign
+    li r10,5*TableElemAlign
+    lxvd2x VSR(H21h),r8,TABLE
+    lxvd2x VSR(H21),r9,TABLE
+    lxvd2x VSR(H21l),r10,TABLE
+
+    li r10,0x10
+
+    mtctr r7
+.align 5
+L2x_loop:
+    # input loading
+    lxvd2x VSR(C0),0,DATA    # load C0
+    lxvd2x VSR(C1),r10,DATA  # load C1
+
+    # swap permuting
+IF_LE(`vperm C0,C0,C0,swap_mask
+    vperm C1,C1,C1,swap_mask')
+
+    # previous digest combining
+    vxor C0,C0,C
+
+    # polynomial multiplication "karatsuba" pre-processing
+    vperm C01h,C0,C1,hidw_mask
+    vperm C01l,C0,C1,lodw_mask
+    vxor C01,C01h,C01l
+
+    # polynomial multiplication "karatsuba"
+    vpmsumd C01h,C01h,H21h   # H01 = H^2h*C0h⊕H^1h*C1h
+    vpmsumd C01l,C01l,H21l   # L01 = H^2l*C0l⊕H^1l*C1l
+    vpmsumd C01,C01,H21      # M01 = (H^2h⊕H^1h)*(C0h⊕C1h)⊕(H^2l⊕H^1l)*(C0l⊕C1l)
+
+    # polynomial multiplication "karatsuba" post-processing
+    vxor C01,C01,C01h
+    vxor C01,C01,C01l
+
+    # reduction first phase  # [1]
+    vpmsumd RP,C01l,poly_h   # [1]
+
+    # polynomial multiplication post-processing # [2]
+    vsldoi Mh,zero,C01,8     # [2]
+    vsldoi Ml,C01,zero,8     # [2]
+    vsldoi RP,RP,RP,8        # [1]
+    vxor C01h,C01h,Mh        # [2]
+    vxor C01l,C01l,Ml        # [2]
+    vxor C01l,C01l,RP        # [1]
+
+    # reduction second phase
+    vpmsumd RP,C01l,poly_l
+    vxor C01h,C01l,C01h
+    vxor C,C01h,RP
+
+    addi DATA,DATA,0x20
+    bdnz L2x_loop
+
+    clrldi LENGTH,LENGTH,59  # LENGTH mod 32
+L1x:
+    srdi r7,LENGTH,4         # one full block left?
+    cmpldi r7,0
+    beq Lrem
+
+    # table loading
+    li r9,1*TableElemAlign
+    li r10,2*TableElemAlign
+    lxvd2x VSR(Hl),0,TABLE
+    lxvd2x VSR(H), r9,TABLE
+    lxvd2x VSR(Hh),r10,TABLE
+
+    # input loading
+    lxvd2x VSR(C0),0,DATA    # load C0
+
+    # swap permuting
+IF_LE(`vperm C0,C0,C0,swap_mask')
+
+    # previous digest combining
+    vxor C0,C0,C
+
+    vpmsumd Cl,C0,Hl         # L = Hl*Cl
+    vpmsumd Cm,C0,H          # M = Hh*Cl⊕Hl*Ch
+    vpmsumd Ch,C0,Hh         # H = Hh*Ch
+
+    # reduction first phase  # [1]
+    vpmsumd RP,Cl,poly_h     # [1]
+
+    # polynomial multiplication post-processing # [2]
+    vsldoi Mh,zero,Cm,8      # [2]
+    vsldoi Ml,Cm,zero,8      # [2]
+    vsldoi RP,RP,RP,8        # [1]
+    vxor Ch,Ch,Mh            # [2]
+    vxor Cl,Cl,Ml            # [2]
+    vxor Cl,Cl,RP            # [1]
+
+    # reduction second phase
+    vpmsumd RP,Cl,poly_l
+    vxor Ch,Cl,Ch
+    vxor C,Ch,RP
+
+    addi DATA,DATA,0x10
+    clrldi LENGTH,LENGTH,60  # LENGTH mod 16
+Lrem:
+    cmpldi LENGTH,0
+    beq Ldone
+
+    # table loading
+    li r9,1*TableElemAlign
+    li r10,2*TableElemAlign
+    lxvd2x VSR(Hl),0,TABLE
+    lxvd2x VSR(H), r9,TABLE
+    lxvd2x VSR(Hh),r10,TABLE
+
+    # copy the remaining bytes into a zero-padded block on the stack
+    stdu SP,-16(SP)
+    stvx zero,0,SP
+Lst_loop:
+    subic. LENGTH,LENGTH,1
+    lbzx r7,LENGTH,DATA
+    stbx r7,LENGTH,SP
+    bne Lst_loop
+    lxvd2x VSR(C0),0,SP
+    addi SP,SP,16
+
+    # swap permuting
+IF_LE(`vperm C0,C0,C0,swap_mask')
+
+    # previous digest combining
+    vxor C0,C0,C
+
+    vpmsumd Cl,C0,Hl         # L = Hl*Cl
+    vpmsumd Cm,C0,H          # M = Hh*Cl⊕Hl*Ch
+    vpmsumd Ch,C0,Hh         # H = Hh*Ch
+
+    # reduction first phase  # [1]
+    vpmsumd RP,Cl,poly_h     # [1]
+
+    # polynomial multiplication post-processing # [2]
+    vsldoi Mh,zero,Cm,8      # [2]
+    vsldoi Ml,Cm,zero,8      # [2]
+    vsldoi RP,RP,RP,8        # [1]
+    vxor Ch,Ch,Mh            # [2]
+    vxor Cl,Cl,Ml            # [2]
+    vxor Cl,Cl,RP            # [1]
+
+    # reduction second phase
+    vpmsumd RP,Cl,poly_l
+    vxor Ch,Cl,Ch
+    vxor C,Ch,RP
+
+Ldone:
+IF_LE(`vperm C,C,C,swap_mask')
+    stxvd2x VSR(C),0,X       # store C
+    blr
+EPILOGUE(_nettle_gcm_hash)
+
+.data
+IF_LE(`.align 4
+.polynomial:
+    .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+    .align 4
+.swap_mask:
+    .byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7
+    .align 4
+.hidw_mask:
+    .byte 23,22,21,20,19,18,17,16,7,6,5,4,3,2,1,0
+    .align 4
+.lodw_mask:
+    .byte 31,30,29,28,27,26,25,24,15,14,13,12,11,10,9,8')
+IF_BE(`.align 4
+.polynomial:
+    .byte 0xc2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+    .align 4
+.hidw_mask:
+    .byte 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23
+    .align 4
+.lodw_mask:
+    .byte 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31')
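A quick way to exercise the new path end to end is through nettle's public
GCM API: in a fat build on POWER8, the gcm_aes128_update() call below
reaches _nettle_gcm_hash_ppc64(), and the resulting tag must match a build
using the C fallback. A minimal harness (arbitrary all-zero test inputs,
not an official test vector; link with -lnettle):

    #include <stdio.h>
    #include <nettle/gcm.h>

    int
    main (void)
    {
      static const uint8_t key[16] = {0};
      static const uint8_t iv[12] = {0};
      uint8_t msg[64] = {0}, ct[64], tag[GCM_DIGEST_SIZE];
      struct gcm_aes128_ctx ctx;

      gcm_aes128_set_key (&ctx, key);
      gcm_aes128_set_iv (&ctx, sizeof (iv), iv);
      gcm_aes128_update (&ctx, sizeof (msg), msg);  /* AAD -> gcm_hash */
      gcm_aes128_encrypt (&ctx, sizeof (ct), ct, msg);
      gcm_aes128_digest (&ctx, sizeof (tag), tag);

      for (unsigned i = 0; i < sizeof (tag); i++)
        printf ("%02x", tag[i]);
      putchar ('\n');
      return 0;
    }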