--- configure.ac | 6 +- fat-ppc.c | 33 ++ fat-setup.h | 7 + gcm.c | 69 +++- powerpc64/fat/gcm-hash.asm | 39 +++ powerpc64/p8/gcm-hash.asm | 773 +++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 909 insertions(+), 18 deletions(-) create mode 100644 powerpc64/fat/gcm-hash.asm create mode 100644 powerpc64/p8/gcm-hash.asm
diff --git a/configure.ac b/configure.ac index e9983697..0129f950 100644 --- a/configure.ac +++ b/configure.ac @@ -488,7 +488,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \ sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
# Assembler files which generate additional object files if they are used. -asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \ +asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm \ aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \ chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \ salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \ @@ -612,9 +612,9 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_ecc_secp384r1_redc #undef HAVE_NATIVE_ecc_secp521r1_modp #undef HAVE_NATIVE_ecc_secp521r1_redc -#undef HAVE_NATIVE_gcm_init_key8 +#undef HAVE_NATIVE_gcm_init_key +#undef HAVE_NATIVE_gcm_hash #undef HAVE_NATIVE_gcm_hash8 -#undef HAVE_NATIVE_gcm_fill #undef HAVE_NATIVE_salsa20_core #undef HAVE_NATIVE_salsa20_2core #undef HAVE_NATIVE_fat_salsa20_2core diff --git a/fat-ppc.c b/fat-ppc.c index 2bc50481..d73b45b0 100644 --- a/fat-ppc.c +++ b/fat-ppc.c @@ -120,6 +120,16 @@ DECLARE_FAT_FUNC(_nettle_aes_decrypt, aes_crypt_internal_func) DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, c) DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, ppc64)
+#if GCM_TABLE_BITS == 8 +DECLARE_FAT_FUNC(_nettle_gcm_init_key, gcm_init_key_func) +DECLARE_FAT_FUNC_VAR(gcm_init_key, gcm_init_key_func, c) +DECLARE_FAT_FUNC_VAR(gcm_init_key, gcm_init_key_func, ppc64) + +DECLARE_FAT_FUNC(_nettle_gcm_hash, gcm_hash_func) +DECLARE_FAT_FUNC_VAR(gcm_hash, gcm_hash_func, c) +DECLARE_FAT_FUNC_VAR(gcm_hash, gcm_hash_func, ppc64) +#endif /* GCM_TABLE_BITS == 8 */ + static void CONSTRUCTOR fat_init (void) { @@ -139,11 +149,23 @@ fat_init (void) fprintf (stderr, "libnettle: enabling arch 2.07 code.\n"); _nettle_aes_encrypt_vec = _nettle_aes_encrypt_ppc64; _nettle_aes_decrypt_vec = _nettle_aes_decrypt_ppc64; +#if GCM_TABLE_BITS == 8 + /* Make sure _nettle_gcm_init_key_vec function is compatible + with _nettle_gcm_hash_vec function e.g. _nettle_gcm_init_key_c() + fills gcm_key table with values that are incompatible with + _nettle_gcm_hash_ppc64() */ + _nettle_gcm_init_key_vec = _nettle_gcm_init_key_ppc64; + _nettle_gcm_hash_vec = _nettle_gcm_hash_ppc64; +#endif /* GCM_TABLE_BITS == 8 */ } else { _nettle_aes_encrypt_vec = _nettle_aes_encrypt_c; _nettle_aes_decrypt_vec = _nettle_aes_decrypt_c; +#if GCM_TABLE_BITS == 8 + _nettle_gcm_init_key_vec = _nettle_gcm_init_key_c; + _nettle_gcm_hash_vec = _nettle_gcm_hash_c; +#endif /* GCM_TABLE_BITS == 8 */ } }
@@ -160,3 +182,14 @@ DEFINE_FAT_FUNC(_nettle_aes_decrypt, void, size_t length, uint8_t *dst, const uint8_t *src), (rounds, keys, T, length, dst, src)) + +#if GCM_TABLE_BITS == 8 +DEFINE_FAT_FUNC(_nettle_gcm_init_key, void, + (union nettle_block16 *table), + (table)) + +DEFINE_FAT_FUNC(_nettle_gcm_hash, void, + (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data), + (key, x, length, data)) +#endif /* GCM_TABLE_BITS == 8 */ diff --git a/fat-setup.h b/fat-setup.h index 99f1ea67..4a9f7969 100644 --- a/fat-setup.h +++ b/fat-setup.h @@ -162,6 +162,13 @@ typedef void aes_crypt_internal_func (unsigned rounds, const uint32_t *keys, size_t length, uint8_t *dst, const uint8_t *src);
+#if GCM_TABLE_BITS == 8 +typedef void gcm_init_key_func (union nettle_block16 *table); + +typedef void gcm_hash_func (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data); +#endif /* GCM_TABLE_BITS == 8 */ + typedef void *(memxor_func)(void *dst, const void *src, size_t n);
typedef void salsa20_core_func (uint32_t *dst, const uint32_t *src, unsigned rounds); diff --git a/gcm.c b/gcm.c index 48b3e75a..787c303e 100644 --- a/gcm.c +++ b/gcm.c @@ -140,6 +140,26 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table) memcpy (x->b, Z.b, sizeof(Z)); } # elif GCM_TABLE_BITS == 8 +# if HAVE_NATIVE_gcm_init_key + +#define gcm_init_key _nettle_gcm_init_key +void +_nettle_gcm_init_key (union nettle_block16 *table); +/* For fat builds */ +void +_nettle_gcm_init_key_c (union nettle_block16 *table); +# endif /* HAVE_NATIVE_gcm_init_key */ +# if HAVE_NATIVE_gcm_hash + +#define gcm_hash _nettle_gcm_hash +void +_nettle_gcm_hash (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data); +/* For fat builds */ +void +_nettle_gcm_hash_c (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data); +# endif /* HAVE_NATIVE_gcm_hash */ # if HAVE_NATIVE_gcm_hash8
#define gcm_hash _nettle_gcm_hash8 @@ -228,6 +248,32 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table) /* Increment the rightmost 32 bits. */ #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4)
+#ifdef gcm_init_key +void +_nettle_gcm_init_key_c(union nettle_block16 *table) +#else +static void +gcm_init_key(union nettle_block16 *table) +#endif /* !gcm_init_key */ +{ +#if GCM_TABLE_BITS + /* Middle element if GCM_TABLE_BITS > 0, otherwise the first + element */ + unsigned i = (1<<GCM_TABLE_BITS)/2; + + /* Algorithm 3 from the gcm paper. First do powers of two, then do + the rest by adding. */ + while (i /= 2) + block16_mulx_ghash(&table[i], &table[2*i]); + for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2) + { + unsigned j; + for (j = 1; j < i; j++) + block16_xor3(&table[i+j], &table[i], &table[j]); + } +#endif +} + /* Initialization of GCM. * @ctx: The context of GCM * @cipher: The context of the underlying block cipher @@ -244,25 +290,19 @@ gcm_set_key(struct gcm_key *key, /* H */ memset(key->h[0].b, 0, GCM_BLOCK_SIZE); f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b); - -#if GCM_TABLE_BITS - /* Algorithm 3 from the gcm paper. First do powers of two, then do - the rest by adding. */ - while (i /= 2) - block16_mulx_ghash(&key->h[i], &key->h[2*i]); - for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2) - { - unsigned j; - for (j = 1; j < i; j++) - block16_xor3(&key->h[i+j], &key->h[i],&key->h[j]); - } -#endif + + gcm_init_key(key->h); }
-#ifndef gcm_hash +#ifdef gcm_hash +void +_nettle_gcm_hash_c(const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data) +#else static void gcm_hash(const struct gcm_key *key, union nettle_block16 *x, size_t length, const uint8_t *data) +#endif /* !gcm_hash */ { for (; length >= GCM_BLOCK_SIZE; length -= GCM_BLOCK_SIZE, data += GCM_BLOCK_SIZE) @@ -276,7 +316,6 @@ gcm_hash(const struct gcm_key *key, union nettle_block16 *x, gcm_gf_mul (x, key->h); } } -#endif /* !gcm_hash */
static void gcm_hash_sizes(const struct gcm_key *key, union nettle_block16 *x, diff --git a/powerpc64/fat/gcm-hash.asm b/powerpc64/fat/gcm-hash.asm new file mode 100644 index 00000000..7fd77223 --- /dev/null +++ b/powerpc64/fat/gcm-hash.asm @@ -0,0 +1,39 @@ +C powerpc64/fat/gcm-hash.asm + + +ifelse(` + Copyright (C) 2020 Mamone Tarsha + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +dnl picked up by configure +dnl PROLOGUE(_nettle_gcm_init_key) +dnl PROLOGUE(_nettle_gcm_hash) + +define(`fat_transform', `$1_ppc64') +include_src(`powerpc64/p8/gcm-hash.asm') diff --git a/powerpc64/p8/gcm-hash.asm b/powerpc64/p8/gcm-hash.asm new file mode 100644 index 00000000..f987f2ca --- /dev/null +++ b/powerpc64/p8/gcm-hash.asm @@ -0,0 +1,773 @@ +C powerpc64/p8/gcm-hash.asm + +ifelse(` + Copyright (C) 2020 Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + +C Alignment of gcm_key table elements, which is declared in gcm.h +define(`TableElemAlign', `0x100') + +C Register usage: + +define(`SP', `r1') +define(`TOCP', `r2') + +define(`TABLE', `r3') +define(`X', `r4') +define(`LENGTH', `r5') +define(`DATA', `r6') + +define(`zero', `v0') +define(`swap_mask', `v1') +define(`hidw_mask', `v2') +define(`lodw_mask', `v3') +define(`poly', `v4') +define(`poly_h', `v4') +define(`poly_l', `v5') +define(`RP', `v6') +define(`Mh', `v7') +define(`Ml', `v8') +define(`H', `v9') +define(`Hh', `v10') +define(`Hl', `v11') +define(`RP2', `v9') +define(`M2h', `v10') +define(`M2l', `v11') + +define(`sl1', `v1') +define(`msb', `v5') +define(`H2', `v6') +define(`H2h', `v7') +define(`H2l', `v8') +define(`H_h', `v12') +define(`H_m', `v13') +define(`H_l', `v14') +define(`H_Hh', `v12') +define(`H_H', `v13') +define(`H_Hl', `v14') +define(`H_t', `v15') +define(`H2_h', `v16') +define(`H2_m', `v17') +define(`H2_l', `v18') +define(`H2_t', `v19') + +define(`C0', `v6') +define(`C1', `v7') +define(`C2', `v8') +define(`C3', `v12') +define(`C4', `v6') +define(`C5', `v7') +define(`C6', `v8') +define(`C7', `v12') + +define(`C', `v13') + +define(`Ch', `v14') +define(`Cl', `v15') +define(`Cm', `v16') + +define(`C01h', `v14') +define(`C01l', `v15') +define(`C01', `v16') +define(`C23h', `v17') +define(`C23l', `v18') +define(`C23', `v19') +define(`C45h', `v20') +define(`C45l', `v21') +define(`C45', `v22') +define(`C67h', `v6') +define(`C67l', `v7') +define(`C67', `v8') + +define(`H21', `v9') +define(`H21h', `v10') +define(`H21l', `v11') +define(`H43', `v23') +define(`H43h', `v24') +define(`H43l', `v25') +define(`H65', `v26') +define(`H65h', `v27') +define(`H65l', `v28') +define(`H87', `v29') +define(`H87h', `v30') +define(`H87l', `v31') + +.file "gcm-hash.asm" + +.text + + # void gcm_init_key (union gcm_block *table) + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_init_key) + DATA_LOAD_VEC(poly,.polynomial,r7) +IF_LE(`DATA_LOAD_VEC(swap_mask,.swap_mask,r7)') + DATA_LOAD_VEC(hidw_mask,.hidw_mask,r7) + DATA_LOAD_VEC(lodw_mask,.lodw_mask,r7) + + li r10,8*TableElemAlign + lxvd2x VSR(H),r10,TABLE # load H +IF_LE(`vperm H,H,H,swap_mask') + + # --- calculate H = H shift left 1 modulo polynomial --- + + vupkhsw msb,H # most significant bit word-extend + vspltisb sl1,1 # splat 1 for shift left + vspltw msb,msb,0 # most significant bit extend + vsl H,H,sl1 # H shift left 1 + vand msb,msb,poly + vxor zero,zero,zero + vxor H_t,H,msb + + vsldoi H,H_t,H_t,8 # doubleword swap + vsldoi Hh,H,zero,8 + vsldoi Hl,zero,H,8 + + # --- calculate H^2 = H*H --- + + # reduction pre-processing + vsldoi poly_h,zero,poly,8 + vsldoi poly_l,poly_h,poly_h,8 + + # polynomial multiplication "classical" + vpmsumd H_h,H_t,Hh # H^1h*H^1h + vpmsumd H_l,H_t,Hl # H^1l*H^1l + vpmsumd H_m,H_t,H # H^1h*H^1l⊕H^1l*H^1h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] + vsldoi Ml,H_m,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor H_h,H_h,Mh # [2] + vxor H_l,H_l,Ml # [2] + vxor H_l,H_l,RP # [1] + + # reduction second phase + vpmsumd RP,H_l,poly_l + vxor H_h,H_l,H_h + vxor H2_t,H_h,RP + + vsldoi H2,H2_t,H2_t,8 + vsldoi H2h,H2,zero,8 + vsldoi H2l,zero,H2,8 + + # --- calculate [H^2.Hi⊕H^2.Lo:H^1.Hi⊕H^1.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store H,[H^2.Hi⊕H^2.Lo:H^1.Hi⊕H^1.Lo] --- + + li r8,0*TableElemAlign + li r9,1*TableElemAlign + li r10,2*TableElemAlign + stxvd2x 
VSR(Hl),r8,TABLE + stxvd2x VSR(H),r9,TABLE + stxvd2x VSR(Hh),r10,TABLE + + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + stxvd2x VSR(H_Hh),r8,TABLE + stxvd2x VSR(H_H),r9,TABLE + stxvd2x VSR(H_Hl),r10,TABLE + + # --- calculate H^3,H^4 --- + + # polynomial multiplication "classical" + vpmsumd H_l,H_t,H2l # H^1l*H^2l + vpmsumd H_m,H_t,H2 # H^1h*H^2l⊕H^1l*H^2h + vpmsumd H_h,H_t,H2h # H^1h*H^2h + vpmsumd H2_l,H2_t,H2l # H^2l*H^2l + vpmsumd H2_m,H2_t,H2 # H^2h*H^2l⊕H^2l*H^2h + vpmsumd H2_h,H2_t,H2h # H^2h*H^2h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] H^3 + vpmsumd RP2,H2_l,poly_h # [1] H^4 + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] H^3 + vsldoi M2h,zero,H2_m,8 # [2] H^4 + vsldoi Ml,H_m,zero,8 # [2] H^3 + vsldoi M2l,H2_m,zero,8 # [2] H^4 + vsldoi RP,RP,RP,8 # [1] H^3 + vsldoi RP2,RP2,RP2,8 # [1] H^4 + vxor H_h,H_h,Mh # [2] H^3 + vxor H2_h,H2_h,M2h # [2] H^4 + vxor H_l,H_l,Ml # [2] H^3 + vxor H2_l,H2_l,M2l # [2] H^4 + vxor H_l,H_l,RP # [1] H^3 + vxor H2_l,H2_l,RP2 # [1] H^4 + + # reduction second phase + vpmsumd RP,H_l,poly_l # H^3 + vpmsumd RP2,H2_l,poly_l # H^4 + vxor H_h,H_l,H_h # H^3 + vxor H2_h,H2_l,H2_h # H^4 + vxor H_h,H_h,RP # H^3 + vxor H2_h,H2_h,RP2 # H^4 + + vsldoi H2,H2_h,H2_h,8 # H^4 + vsldoi H,H_h,H_h,8 # H^3 + vsldoi H2l,zero,H2,8 # H^4 + vsldoi H2h,H2,zero,8 # H^4 + + # --- calculate [H^4.Hi⊕H^4.Lo:H^3.Hi⊕H^3.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store [H^4.Hi⊕H^4.Lo:H^3.Hi⊕H^3.Lo] --- + + li r8,6*TableElemAlign + li r9,7*TableElemAlign + li r10,8*TableElemAlign + stxvd2x VSR(H_Hh),r8,TABLE + stxvd2x VSR(H_H),r9,TABLE + stxvd2x VSR(H_Hl),r10,TABLE + + # --- calculate H^5,H^6 --- + + # polynomial multiplication "classical" + vpmsumd H_l,H_t,H2l # H^1l*H^4l + vpmsumd H_m,H_t,H2 # H^1h*H^4l⊕H^1l*H^4h + vpmsumd H_h,H_t,H2h # H^1h*H^4h + vpmsumd H2_l,H2_t,H2l # H^2l*H^4l + vpmsumd H2_m,H2_t,H2 # H^2h*H^4l⊕H^2l*H^4h + vpmsumd H2_h,H2_t,H2h # H^2h*H^4h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] H^5 + vpmsumd RP2,H2_l,poly_h # [1] H^6 + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] H^5 + vsldoi M2h,zero,H2_m,8 # [2] H^6 + vsldoi Ml,H_m,zero,8 # [2] H^5 + vsldoi M2l,H2_m,zero,8 # [2] H^6 + vsldoi RP,RP,RP,8 # [1] H^5 + vsldoi RP2,RP2,RP2,8 # [1] H^6 + vxor H_h,H_h,Mh # [2] H^5 + vxor H2_h,H2_h,M2h # [2] H^6 + vxor H_l,H_l,Ml # [2] H^5 + vxor H2_l,H2_l,M2l # [2] H^6 + vxor H_l,H_l,RP # [1] H^5 + vxor H2_l,H2_l,RP2 # [1] H^6 + + # reduction second phase + vpmsumd RP,H_l,poly_l # H^5 + vpmsumd RP2,H2_l,poly_l # H^6 + vxor H_h,H_l,H_h # H^5 + vxor H2_h,H2_l,H2_h # H^6 + vxor H_h,H_h,RP # H^5 + vxor H2_h,H2_h,RP2 # H^6 + + vsldoi H2,H2_h,H2_h,8 # H^6 + vsldoi H,H_h,H_h,8 # H^5 + vsldoi H2l,zero,H2,8 # H^6 + vsldoi H2h,H2,zero,8 # H^6 + + # --- calculate [H^6.Hi⊕H^6.Lo:H^5.Hi⊕H^5.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store [H^6.Hi⊕H^6.Lo:H^5.Hi⊕H^5.Lo] --- + + li r8,9*TableElemAlign + li r9,10*TableElemAlign + li r10,11*TableElemAlign + stxvd2x VSR(H_Hh),r8,TABLE + stxvd2x VSR(H_H),r9,TABLE + stxvd2x VSR(H_Hl),r10,TABLE + + # --- calculate H^7,H^8 --- + + # polynomial multiplication "classical" + vpmsumd H_l,H_t,H2l # H^1l*H^6l + vpmsumd H_m,H_t,H2 # H^1h*H^6l⊕H^1l*H^6h + vpmsumd H_h,H_t,H2h # H^1h*H^6h + vpmsumd H2_l,H2_t,H2l # H^2l*H^6l + vpmsumd H2_m,H2_t,H2 # H^2h*H^6l⊕H^2l*H^6h + vpmsumd H2_h,H2_t,H2h # H^2h*H^6h + + # reduction 
first phase # [1] + vpmsumd RP,H_l,poly_h # [1] H^7 + vpmsumd RP2,H2_l,poly_h # [1] H^8 + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] H^7 + vsldoi M2h,zero,H2_m,8 # [2] H^8 + vsldoi Ml,H_m,zero,8 # [2] H^7 + vsldoi M2l,H2_m,zero,8 # [2] H^8 + vsldoi RP,RP,RP,8 # [1] H^7 + vsldoi RP2,RP2,RP2,8 # [1] H^8 + vxor H_h,H_h,Mh # [2] H^7 + vxor H2_h,H2_h,M2h # [2] H^8 + vxor H_l,H_l,Ml # [2] H^7 + vxor H2_l,H2_l,M2l # [2] H^8 + vxor H_l,H_l,RP # [1] H^7 + vxor H2_l,H2_l,RP2 # [1] H^8 + + # reduction second phase + vpmsumd RP,H_l,poly_l # H^7 + vpmsumd RP2,H2_l,poly_l # H^8 + vxor H_h,H_l,H_h # H^7 + vxor H2_h,H2_l,H2_h # H^8 + vxor H_h,H_h,RP # H^7 + vxor H2_h,H2_h,RP2 # H^8 + + vsldoi H,H_h,H_h,8 # H^7 + vsldoi H2,H2_h,H2_h,8 # H^8 + + # --- calculate [H^8.Hi⊕H^8.Lo:H^7.Hi⊕H^7.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store [H^8.Hi⊕H^8.Lo:H^7.Hi⊕H^7.Lo] --- + + li r8,12*TableElemAlign + li r9,13*TableElemAlign + li r10,14*TableElemAlign + stxvd2x VSR(H_Hh),r8,TABLE + stxvd2x VSR(H_H),r9,TABLE + stxvd2x VSR(H_Hl),r10,TABLE + + blr +EPILOGUE(_nettle_gcm_init_key) + + # void gcm_hash (const struct gcm_key *key, union gcm_block *x, + # size_t length, const uint8_t *data) + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_hash) + vxor zero,zero,zero + + DATA_LOAD_VEC(poly,.polynomial,r7) +IF_LE(`DATA_LOAD_VEC(swap_mask,.swap_mask,r7)') + DATA_LOAD_VEC(hidw_mask,.hidw_mask,r7) + DATA_LOAD_VEC(lodw_mask,.lodw_mask,r7) + + vsldoi poly_h,zero,poly,8 + vsldoi poly_l,poly_h,poly_h,8 + + lxvd2x VSR(C),0,X # load X +IF_LE(`vperm C,C,C,swap_mask') + + srdi r7,LENGTH,7 # 8x loop count + cmpldi r7,0 + beq L2x + + # backup registers + stdu SP,-224(SP) + std r28,216(SP) + std r29,208(SP) + std r30,200(SP) + std r31,192(SP) + li 8,176 + stvx 20,8,SP + subi 8,8,16 + stvx 21,8,SP + subi 8,8,16 + stvx 22,8,SP + subi 8,8,16 + stvx 23,8,SP + subi 8,8,16 + stvx 24,8,SP + subi 8,8,16 + stvx 25,8,SP + subi 8,8,16 + stvx 26,8,SP + subi 8,8,16 + stvx 27,8,SP + subi 8,8,16 + stvx 28,8,SP + subi 8,8,16 + stvx 29,8,SP + subi 8,8,16 + stvx 30,8,SP + subi 8,8,16 + stvx 31,8,SP + + # table loading + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + lxvd2x VSR(H21h),r8,TABLE + lxvd2x VSR(H21),r9,TABLE + lxvd2x VSR(H21l),r10,TABLE + li r8,6*TableElemAlign + li r9,7*TableElemAlign + li r10,8*TableElemAlign + lxvd2x VSR(H43h),r8,TABLE + lxvd2x VSR(H43),r9,TABLE + lxvd2x VSR(H43l),r10,TABLE + li r8,9*TableElemAlign + li r9,10*TableElemAlign + li r10,11*TableElemAlign + lxvd2x VSR(H65h),r8,TABLE + lxvd2x VSR(H65),r9,TABLE + lxvd2x VSR(H65l),r10,TABLE + li r8,12*TableElemAlign + li r9,13*TableElemAlign + li r10,14*TableElemAlign + lxvd2x VSR(H87h),r8,TABLE + lxvd2x VSR(H87),r9,TABLE + lxvd2x VSR(H87l),r10,TABLE + + li r8,0x10 + li r9,0x20 + li r10,0x30 + li r28,0x40 + li r29,0x50 + li r30,0x60 + li r31,0x70 + + mtctr r7 +.align 5 +L8x_loop: + # input loading + lxvd2x VSR(C0),0,DATA # load C0 + lxvd2x VSR(C1),r8,DATA # load C1 + lxvd2x VSR(C2),r9,DATA # load C2 + lxvd2x VSR(C3),r10,DATA # load C3 + + # swap permuting +IF_LE(`vperm C0,C0,C0,swap_mask + vperm C1,C1,C1,swap_mask + vperm C2,C2,C2,swap_mask + vperm C3,C3,C3,swap_mask') + + # previous digest combining + vxor C0,C0,C + + # polynomial multiplication "karatsuba" pre-processing + vperm C23h,C2,C3,hidw_mask + vperm C23l,C2,C3,lodw_mask + vperm C01h,C0,C1,hidw_mask + vperm C01l,C0,C1,lodw_mask + + # input loading + lxvd2x VSR(C4),r28,DATA # load C4 + lxvd2x 
VSR(C5),r29,DATA # load C5 + lxvd2x VSR(C6),r30,DATA # load C6 + lxvd2x VSR(C7),r31,DATA # load C7 + + # swap permuting +IF_LE(`vperm C4,C4,C4,swap_mask + vperm C5,C5,C5,swap_mask + vperm C6,C6,C6,swap_mask + vperm C7,C7,C7,swap_mask') + + # polynomial multiplication "karatsuba" pre-processing + vperm C45h,C4,C5,hidw_mask + vperm C45l,C4,C5,lodw_mask + vperm C67h,C6,C7,hidw_mask + vperm C67l,C6,C7,lodw_mask + vxor C23,C23h,C23l + vxor C01,C01h,C01l + vxor C45,C45h,C45l + vxor C67,C67h,C67l + + # polynomial multiplication "karatsuba" + vpmsumd C23h,C23h,H65h # H23 = H^6h*C2h⊕H^5h*C3h + vpmsumd C23l,C23l,H65l # L23 = H^6l*C2l⊕H^5l*C3l + vpmsumd C01h,C01h,H87h # H01 = H^8h*C0h⊕H^7h*C1h + vpmsumd C01l,C01l,H87l # L01 = H^8l*C0l⊕H^7l*C1l + vpmsumd C67h,C67h,H21h # H67 = H^2h*C6h⊕H^1h*C7h + vpmsumd C67l,C67l,H21l # L67 = H^2l*C6l⊕H^1l*C7l + vpmsumd C45h,C45h,H43h # H45 = H^4h*C4h⊕H^3h*C5h + vpmsumd C45l,C45l,H43l # L45 = H^4l*C4l⊕H^3l*C5l + vpmsumd C23,C23,H65 # M23 = (H^6h⊕H^5h)*(C2h⊕C3h)⊕(H^6l⊕H^5l)*(C2l⊕C3l) + vpmsumd C01,C01,H87 # M01 = (H^8h⊕H^7h)*(C0h⊕C1h)⊕(H^8l⊕H^7l)*(C0l⊕C1l) + vpmsumd C45,C45,H43 # M45 = (H^4h⊕H^3h)*(C4h⊕C5h)⊕(H^4l⊕H^3l)*(C4l⊕C5l) + vpmsumd C67,C67,H21 # M67 = (H^2h⊕H^1h)*(C6h⊕C7h)⊕(H^2l⊕H^1l)*(C6l⊕C7l) + + # polynomial multiplication "karatsuba" post-processing + vxor C23,C23,C23h + vxor C01,C01,C01h + vxor C45,C45,C45h + vxor C67,C67,C67h + vxor C23,C23,C23l + vxor C01,C01,C01l + vxor C45,C45,C45l + vxor C67,C67,C67l + + # deferred recombination of partial products + vxor C01h,C01h,C23h # H0 = H01⊕H23 + vxor C45h,C45h,C67h # H1 = H45⊕H67 + vxor C01l,C01l,C23l # L0 = L01⊕L23 + vxor C45l,C45l,C67l # L1 = L45⊕L45 + vxor C01,C01,C23 # M0 = M01⊕M23 + vxor C45,C45,C67 # M1 = M45⊕M45 + vxor C01h,C01h,C45h # H = H0⊕H1 + vxor C01l,C01l,C45l # L = L0⊕L1 + vxor C01,C01,C45 # M = M0⊕M1 + + # reduction first phase # [1] + vpmsumd RP,C01l,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,C01,8 # [2] + vsldoi Ml,C01,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor C01h,C01h,Mh # [2] + vxor C01l,C01l,Ml # [2] + vxor C01l,C01l,RP # [1] + + # reduction second phase + vpmsumd RP,C01l,poly_l + vxor C01h,C01l,C01h + vxor C,C01h,RP + + addi DATA,DATA,0x80 + bdnz L8x_loop + + # restore registers + li 8,0 + lvx 31,8,SP + addi 8,8,16 + lvx 30,8,SP + addi 8,8,16 + lvx 29,8,SP + addi 8,8,16 + lvx 28,8,SP + addi 8,8,16 + lvx 27,8,SP + addi 8,8,16 + lvx 26,8,SP + addi 8,8,16 + lvx 25,8,SP + addi 8,8,16 + lvx 24,8,SP + addi 8,8,16 + lvx 23,8,SP + addi 8,8,16 + lvx 22,8,SP + addi 8,8,16 + lvx 21,8,SP + addi 8,8,16 + lvx 20,8,SP + ld r31,192(SP) + ld r30,200(SP) + ld r29,208(SP) + ld r28,216(SP) + addi SP,SP,224 + + clrldi LENGTH,LENGTH,57 +L2x: + srdi r7,LENGTH,5 + cmpldi r7,0 + beq L1x + + # table loading + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + lxvd2x VSR(H21h),r8,TABLE + lxvd2x VSR(H21),r9,TABLE + lxvd2x VSR(H21l),r10,TABLE + + li r10,0x10 + + mtctr r7 +.align 5 +L2x_loop: + # input loading + lxvd2x VSR(C0),0,DATA # load C0 + lxvd2x VSR(C1),r10,DATA # load C1 + + # swap permuting +IF_LE(`vperm C0,C0,C0,swap_mask + vperm C1,C1,C1,swap_mask') + + # previous digest combining + vxor C0,C0,C + + # polynomial multiplication "karatsuba" pre-processing + vperm C01h,C0,C1,hidw_mask + vperm C01l,C0,C1,lodw_mask + vxor C01,C01h,C01l + + # polynomial multiplication "karatsuba" + vpmsumd C01h,C01h,H21h # H01 = H^2h*C0h⊕H^1h*C1h + vpmsumd C01l,C01l,H21l # L01 = H^2l*C0l⊕H^1l*C1l + vpmsumd C01,C01,H21 # M01 = 
(H^2h⊕H^1h)*(C0h⊕C1h)⊕(H^2l⊕H^1l)*(C0l⊕C1l) + + # polynomial multiplication "karatsuba" post-processing + vxor C01,C01,C01h + vxor C01,C01,C01l + + # reduction first phase # [1] + vpmsumd RP,C01l,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,C01,8 # [2] + vsldoi Ml,C01,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor C01h,C01h,Mh # [2] + vxor C01l,C01l,Ml # [2] + vxor C01l,C01l,RP # [1] + + # reduction second phase + vpmsumd RP,C01l,poly_l + vxor C01h,C01l,C01h + vxor C,C01h,RP + + addi DATA,DATA,0x20 + bdnz L2x_loop + + clrldi LENGTH,LENGTH,59 +L1x: + srdi r7,LENGTH,4 + cmpldi r7,0 + beq Lrem + + # table loading + li r9,1*TableElemAlign + li r10,2*TableElemAlign + lxvd2x VSR(Hl),0,TABLE + lxvd2x VSR(H), r9,TABLE + lxvd2x VSR(Hh),r10,TABLE + + # input loading + lxvd2x VSR(C0),0,DATA # load C0 + + # swap permuting +IF_LE(`vperm C0,C0,C0,swap_mask') + + # previous digest combining + vxor C0,C0,C + + vpmsumd Cl,C0,Hl # L = Hl*Cl + vpmsumd Cm,C0,H # M = Hh*Cl⊕Hl*Ch + vpmsumd Ch,C0,Hh # H = Hh*Ch + + # reduction first phase # [1] + vpmsumd RP,Cl,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,Cm,8 # [2] + vsldoi Ml,Cm,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor Ch,Ch,Mh # [2] + vxor Cl,Cl,Ml # [2] + vxor Cl,Cl,RP # [1] + + # reduction second phase + vpmsumd RP,Cl,poly_l + vxor Ch,Cl,Ch + vxor C,Ch,RP + + addi DATA,DATA,0x10 + clrldi LENGTH,LENGTH,60 +Lrem: + cmpldi LENGTH,0 + beq Ldone + + # table loading + li r9,1*TableElemAlign + li r10,2*TableElemAlign + lxvd2x VSR(Hl),0,TABLE + lxvd2x VSR(H), r9,TABLE + lxvd2x VSR(Hh),r10,TABLE + + # input loading + stdu SP,-16(SP) + stvx zero,0,SP +Lst_loop: + subic. LENGTH,LENGTH,1 + lbzx r7,LENGTH,DATA + stbx r7,LENGTH,SP + bne Lst_loop + lxvd2x VSR(C0),0,SP + addi SP,SP,16 + + # swap permuting +IF_LE(`vperm C0,C0,C0,swap_mask') + + # previous digest combining + vxor C0,C0,C + + vpmsumd Cl,C0,Hl # L = Hl*Cl + vpmsumd Cm,C0,H # M = Hh*Cl⊕Hl*Ch + vpmsumd Ch,C0,Hh # H = Hh*Ch + + # reduction first phase # [1] + vpmsumd RP,Cl,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,Cm,8 # [2] + vsldoi Ml,Cm,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor Ch,Ch,Mh # [2] + vxor Cl,Cl,Ml # [2] + vxor Cl,Cl,RP # [1] + + # reduction second phase + vpmsumd RP,Cl,poly_l + vxor Ch,Cl,Ch + vxor C,Ch,RP + +Ldone: +IF_LE(`vperm C,C,C,swap_mask') + stxvd2x VSR(C),0,X # store C + blr +EPILOGUE(_nettle_gcm_hash) + + .data +IF_LE(`.align 4 +.polynomial: + .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 + .align 4 +.swap_mask: + .byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7 + .align 4 +.hidw_mask: + .byte 23,22,21,20,19,18,17,16,7,6,5,4,3,2,1,0 + .align 4 +.lodw_mask: + .byte 31,30,29,28,27,26,25,24,15,14,13,12,11,10,9,8') +IF_BE(`.align 4 +.polynomial: + .byte 0xc2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 + .align 4 +.hidw_mask: + .byte 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23 + .align 4 +.lodw_mask: + .byte 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31')
gcm_fill() got a C optimization which performs close to the one I implemented using altivec, so the altivec version of gcm_fill has been removed now.
On 28/09/20 8:25 am, Maamoun TK wrote:
gcm_fill() got C optimization which performs close to the one I
What do you mean by "close"? Faster or slower? And is that difference consistent?
If the other implementation is slower (even by a few nanoseconds) it can be useful to keep this fast code around for builds where high performance is critical.
AYJ
I posted a performance test here: https://lists.lysator.liu.se/pipermail/nettle-bugs/2020/009169.html Personally, I prefer keeping the altivec version in the Nettle library since it's faster than the C implementation, but I'm not sure whether the performance margin fits the library's convention for optimizing such functions.
Maamoun TK maamoun.tk@googlemail.com writes:
I posted a performance test here https://lists.lysator.liu.se/pipermail/nettle-bugs/2020/009169.html Personally, I prefer keeping the altivec version in nettle library since it's faster than C implementation but I'm not sure whether the performance margin fits with the library's convention of optimizing such functions.
We can revisit it later, but let's go for the low-hanging fruit first.
Regards, /Niels
Maamoun TK maamoun.tk@googlemail.com writes:
Thanks for the update. This is quite complex for me. I have not yet read the code very carefully. I think I'd like to focus on the main function, gcm_hash, first. Some questions and suggestions, to make it easier:
1. Take out the fat support to its own patch.
2. You could consider doing the init_key in C, if nothing else as documentation. It could be either under some #ifdef in gcm.c, or a separate .c file under powerpc64/p8/, next to gcm-hash.asm. Maybe it's still a good idea to have it in assembly, that's a tradeoff that depends a bit on the complexity of both the C and assembly, and the speedup from doing it in assembly. And I don't have a very strong opinion on this point now.
Even with asm, it might be a bit clearer to move it to its own .asm file, so each file can use define only for the relevant registers for that function.
3. What's TableElemAlign? Assuming GCM_TABLE_BITS is 8 (current Nettle ABI), you can treat struct gcm_key as a blob of size 4096 bytes, with alignment corresponding to what the C compiler uses for uint64_t. Are you using some padding at the start (depending on address) to ensure you get stronger alignment? And 256 byte alignment sounds a bit more than needed?
4. Please document the layout used for the precomputed values stored in struct gcm_key.
5. It would help with comments explaining the naming convention used for the named registers, and the instruction sequence used for a single Karatsuba multiplication, with any needed comments.
6. Is 8-way unrolling really necessary to get full utilization of the execution units? And it's also not yet clear to me what 8-way means: is that 8 blocks of 16 bytes each (i.e., 128 bytes of input), or 8 input bytes?
7. Do you need any bit reversal? As you have mentioned, the multiplication operation is symmetric under bit reversal, so ideally bit reversal should be needed at most when setting the key and extracting the digest, but not by the main workhorse, gcm_hash.
I know you have referenced articles for the used algorithm, but it would be helpful to have the main ideas in comments close to the code.
Regards, /Niels
- Take out the fat support to its own patch.
Done! I will post the assembly part first so you can review it.
2. You could consider doing the init_key in C, if nothing else as
documentation. It could be either under some #ifdef in gcm.c, or a separate .c file under powerpc64/p8/, next to gcm-hash.asm. Maybe it's still a good idea to have it in assembly, that's a tradeoff that depends a bit on the complexity of both the C and assembly, and the speedup from doing it in assembly. And I don't have a very strong opinion on this point now.
Even with asm, it might be a bit clearer to move it to its own .asm file, so each file can use define only for the relevant registers for that function.
Writing init_key() in C for the PowerPC implementation has a couple of disadvantages:
1. init_key() and gcm_hash() are connected to each other through a shared table, so it is easier to modify the implementation if both are written in the same way.
2. We would have to use intrinsics for certain operations like 'vpmsumd'. Furthermore, '__builtin_crypto_vpmsumd' is buggy on certain versions of GCC https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91275 and has a different name on Clang ('__builtin_altivec_crypto_vpmsumd'), so we would end up with a lot of compiler-variant checks, plus inline assembly for 'vpmsumd' wherever the intrinsic is broken.
I still prefer to have both functions in the same file. I separated the 'define' macros for each function, so each function has its own define section above its prologue.
3. What's TableElemAlign? Assuming GCM_TABLE_BITS is 8 (current Nettle
ABI), you can treat struct gcm_key as a blob of size 4096 bytes, with alignment corresponding to what the C compiler uses for uint64_t. Are you using some padding at the start (depending on address) to ensure you get stronger alignment? And 256 byte alignment sounds a bit more than needed?
The compiler aligns each element of the gcm_key array at 0x100, perhaps because the element type is declared as a union. So, for example, to get the 'H' value that is assigned to the 9th element, I have to add 0x800 to the array address.
- Please document the layout used for the precomputed values stored in struct gcm_key.
Done!
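For reference, the precomputed layout as I read it from the posted gcm-hash.asm (this summary is mine; offsets are table elements, i.e. multiples of TableElemAlign):

  h[0..2]   : H^1 in the three forms used by the single-block tail
              (low part, doubleword-swapped, high part)
  h[3..5]   : [H^2 : H^1] packed as high halves, (high⊕low) halves, low halves
  h[6..8]   : [H^4 : H^3] in the same three forms
  h[9..11]  : [H^6 : H^5] in the same three forms
  h[12..14] : [H^8 : H^7] in the same three forms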
5. It would help with comments explaining the naming convention used for
the named registers, and the instruction sequence used for a single Karatsuba multiplication, with any needed comments.
I tried to make a reader-friendly version of the implementation to make it clearer with appropriate comments. Let me know if the documentation still looks sketchy.
6. Is 8-way unrolling really necessary to get full utilization of the
execution units? And it's also not yet clear to me what 8-way means, is that 8 blocks of 16 bytes each (i.e., 128 bytes input), or 8 input bytes?
Processing 4 blocks is enough to saturate the execution units, but processing 8 blocks (each block is 128 bits) halves the number of times the reduction procedure runs compared to 4 blocks per loop, so it performs better. However, to make the implementation easier to review, I cut the loop down to processing 4 blocks.
- Do you need any bit reversal? As you have mentioned, the multiplication operation is symmetric under bit reversal, so ideally bit reversal should be needed at most when setting the key and extracting the digest, but not by the main workhorse, gcm_hash.
You got it. If the bit-reflection of the key is handled, there is no need to handle the bit-reflection of the multiplication product with that key. So any upcoming multiplication will be bit-reversal-free.
I would like to explain the 'vpmsumd' instruction in more detail. On x86, the 'pclmulqdq' instruction is used for carry-less operations; an immediate value passed as the third parameter of the instruction specifies which doublewords will be multiplied. 'vpmsumd', however, does the following operation: (high-order doubleword of the second parameter * high-order doubleword of the third parameter) XOR (low-order doubleword of the second parameter * low-order doubleword of the third parameter)
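For reference, a scalar model of that operation (my own sketch for illustration, not Nettle code; GCM's bit-reflection is ignored here):

#include <stdint.h>

/* A 128-bit value split into two 64-bit doublewords. */
typedef struct { uint64_t hi, lo; } u128;

/* Carry-less 64x64 -> 128 bit multiply (slow bit-by-bit reference). */
static u128
clmul64 (uint64_t a, uint64_t b)
{
  u128 r = {0, 0};
  unsigned i;
  for (i = 0; i < 64; i++)
    if ((b >> i) & 1)
      {
        r.lo ^= a << i;
        if (i > 0)
          r.hi ^= a >> (64 - i);
      }
  return r;
}

/* vpmsumd: (a.hi x b.hi) XOR (a.lo x b.lo), both carry-less. */
static u128
vpmsumd_model (u128 a, u128 b)
{
  u128 h = clmul64 (a.hi, b.hi);
  u128 l = clmul64 (a.lo, b.lo);
  u128 r = { h.hi ^ l.hi, h.lo ^ l.lo };
  return r;
}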
On Sat, Oct 3, 2020 at 7:00 PM Maamoun TK maamoun.tk@googlemail.com wrote:
... 2. ... has different name on CLANG '__builtin_altivec_crypto_vpmsumd' so we will end up using a lot of conditions to check the variant of compiler plus writing inline assembly code for 'vpmsumd' in case the variant has intrinsic issue with it.
And __vpmsumd for IBM's XL C/C++ compiler.
Clang plays games with the preprocessor macros. It pretends to be GCC, Clang and XLC all at once, but it can't handle the other compilers' intrinsics.
Here's how to setup the preprocessor macro tests:
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
  // IBM XL C/C++
#elif defined(__clang__)
  // Clang
#else
  // GCC
#endif
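Putting those tests together with the intrinsic names mentioned in this thread, the dispatch could look roughly like this (an untested sketch, only to show how much per-compiler conditioning the C route needs):

#include <altivec.h>

#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
/* IBM XL C/C++ */
# define VPMSUMD(a, b) __vpmsumd ((a), (b))
#elif defined(__clang__)
/* Clang */
# define VPMSUMD(a, b) __builtin_altivec_crypto_vpmsumd ((a), (b))
#else
/* GCC */
# define VPMSUMD(a, b) __builtin_crypto_vpmsumd ((a), (b))
#endif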
I believe all the PowerPC machines on the GCC compile farm have IBM XL C/C++ installed.
Jeff
Maamoun TK maamoun.tk@googlemail.com writes:
Done! I will post the assembly part first so you can review it.
Thanks. I hope to get the time to read it carefully soon.
- init_key() and gcm_hash() functions are connected to each other through
a shared table, it makes it easier to modify the implementation if both are written in the same way. 2. we have to use intrinsics for certain operations like 'vpmsumd', furthermore '__builtin_crypto_vpmsumd' is buggy on certain versions of GCC https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91275 and has different name on CLANG '__builtin_altivec_crypto_vpmsumd' so we will end up using a lot of conditions to check the variant of compiler plus writing inline assembly code for 'vpmsumd' in case the variant has intrinsic issue with it. I still prefer to have both functions in the same file, I separated the 'define' macros for each function so each function has its own define section above its prologue.
I see. And I wouldn't want C code with machine-specific or compiler-specific intrinsics. If there's no reasonable way to do it in portable C, let's stick to assembly.
- What's TableElemAlign? Assuming GCM_TABLE_BITS is 8 (current Nettle
ABI), you can treat struct gcm_key as a blob of size 4096 bytes, with alignment corresponding to what the C compiler uses for uint64_t. Are you using some padding at the start (depending on address) to ensure you get stronger alignment? And 256 byte alignment sounds a bit more than needed?
The compiler aligns each element of gcm_key array at 0x100 perhaps because the struct is declared as union so for example if I want to get the 'H' value that is assigned into the 9th index, I have to add 0x800 to the array address to get that value.
That's highly unexpected! It makes struct gcm_key 16 times larger than intended, 64 KByte rather than 4 KByte, which seems pretty bad. I would expect more or less any C compiler to use size 16 and 8-byte minimum alignment for the elements (and I wish there were a nice and portable way to enforce minimum 16-byte alignment). Can you double check, and try to find an explanation for this?
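A minimal check, assuming the installed nettle/gcm.h declarations (my sketch):

#include <stdio.h>
#include <stdalign.h>
#include <nettle/gcm.h>

int
main (void)
{
  struct gcm_key k;
  printf ("sizeof (union nettle_block16) = %zu\n", sizeof (union nettle_block16));
  printf ("alignof (union nettle_block16) = %zu\n", (size_t) alignof (union nettle_block16));
  printf ("sizeof (struct gcm_key) = %zu\n", sizeof (struct gcm_key));
  printf ("element stride = %td\n", (char *) &k.h[1] - (char *) &k.h[0]);
  /* On a typical 64-bit ABI I'd expect 16, 8 (or 16), 4096 and 16 here,
     i.e. no 0x100 stride between elements. */
  return 0;
}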
I would like to explain more about 'vpmsumd' instruction, in x86 arch the 'pclmulqdq' instruction is used for carry-less operations. To use 'pclmulqdq' an immediate value should be passed to the third parameter of the instruction to specify which doublewords will be multiplied. However, 'vpmsumd' do the following operation: (High-order doubleword of the second parameter * High-order doubleword of the third parameter) XOR (Low-order doubleword of the second parameter * Low-order doubleword of the third parameter)
Interesting! Do you use inputs where one doubleword is zero, making one or the other of the xored values zero, or is there some clever way to take advantage of the built-in wraparound? I guess one can also do some interesting things with other selected parts of the inputs zero, for example, the middle word of one of the operands, or all odd-numbered bits, or...
Regards, /Niels
nisse@lysator.liu.se (Niels Möller) writes:
Ok, let me see if I can get the math right first (and ignore the issues of bit order and reversal). Thanks a lot for the explanation of this curious instruction; I hope I have understood it correctly, because it seems really useful.
We have the polynomial p(x) = x^128 + x^7 + x^2 + x + 1 over Z_2, which implies that
x^128 = x^7 + x^2 + x + 1 (mod p(x))
so let's call this simpler polynomial p'(x) = x^7 + x^2 + x + 1.
As input we have two polynomials, one depending on the key (i.e., invariant when processing a large message), represented as 128 bits each. Split them in 64-bit pieces we can reason about,
A = a_1(x) x^64 + a_0(x), B = b_1(x) x^64 + b_0(x),
with (unreduced) product C(x) = A(x) B(x), split in 64-bit pieces as
C(x) = c_3(x) x^192 + c_2(x) x^128 + c_1(x) x^64 + c_0 (x)
(where c_3 actually is only 63 bits). To reduce, we use x^128 = p'(x) (mod p(x)). We'd then need to multiply the high half with p'(x), say
D = (c_3(x) x^64 + c_2(x)) p'(x) = d_2(x) x^128 + d_1(x) x^64 + d_0(x)
where we get the high few bits in d_2(x). So we'd need to compute d_2 p'(x) again. And finally, we get the result C(x) mod p by summing/xoring
  c_1(x) x^64 + c_0(x)
  d_1(x) x^64 + d_0(x)
  d_2(x) p'(x)
  --------------------------
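To make that concrete, here is a scalar sketch of exactly this reduction, reusing the u128/clmul64 helpers from my earlier sketch (bit order still ignored):

/* p'(x) = x^7 + x^2 + x + 1 as a bit mask. */
#define P_PRIME 0x87

/* Unreduced product C(x) = c_3 x^192 + c_2 x^128 + c_1 x^64 + c_0. */
typedef struct { uint64_t c3, c2, c1, c0; } u256;

static u128
gcm_reduce (u256 c)
{
  /* D = (c_3 x^64 + c_2) p'(x) = d_2 x^128 + d_1 x^64 + d_0 */
  u128 lo = clmul64 (c.c2, P_PRIME);  /* c_2 p': d_0 and low bits of d_1 */
  u128 hi = clmul64 (c.c3, P_PRIME);  /* c_3 p': rest of d_1, plus d_2 */
  uint64_t d0 = lo.lo;
  uint64_t d1 = lo.hi ^ hi.lo;
  uint64_t d2 = hi.hi;                /* just the high few bits */
  /* Fold d_2 once more with p'(x), then xor the three rows above. */
  u128 e = clmul64 (d2, P_PRIME);     /* fits in the low doubleword */
  u128 r = { c.c1 ^ d1, c.c0 ^ d0 ^ e.lo };
  return r;
}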
Now, let's look closer at the initial multiplication, A(x) B(x), which can be expanded as
a_1(x) b_1(x) x^128 + (a_1(x) b_0(x) + a_0(x) b_1(x)) x^64 + a_0(x) b_0(x)
I see two tricks: First, we can precompute a'_2(x) x^64 + a'_1(x) = a_1(x) x^128 (mod p(x)), and replace the first term with
(a'_2(x) x^64 + a'_1(x)) b_1(x)
That should eliminate the annoying high bits d_2 in the reduction, at the cost of one more 64x64 multiply. I would expect that's the same number of operations, but a shallower dependency path. We get a 192-bit partially reduced result, instead of the full 256-bit result. Let's denote this product as
h_2(x) x^128 + h_1(x) x^64 + h_0(x) = (a'_2(x) x^64 + a'_1(x)) b_1(x)
Second, if we also prepare a register with the swapped A, a_0(x) x^64 + a_1(x), then the vpmsumd instruction, if I understand you correctly, can compute the middle term (a_1(x) b_0(x) + a_0(x) b_1(x)) in one operation. So we get down to 3 64x64 multiplies for free, without the reconstruction needed for standard Karatsuba multiplication. So if we compute the middle term like this,
m_1(x) x^64 + m_0(x) = a_1(x) b_0(x) + a_0(x) b_1(x)
The part we need to fold is m_1(x) + h_2(x); then we get the fully reduced result by summing
  a_0(x) b_0(x)             (127 bits, one vpmsumd with one input half zero)
  m_0(x)                    (64 bits)
  h_1(x) x^64 + h_0(x)      (128 bits)
  (m_1(x) + h_2(x)) p'(x)   (70 or so bits)
If I count it correctly, we get the fully reduced result with 5 vpmsumd multiplications, one for a_0 b_0, one for the middle terms, two for the multiplication with a', and one for the only explicit multiplication with p'(x). And if we do multiple blocks in parallel, they can share this final multiply.
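The same count in a scalar sketch (again reusing clmul64, vpmsumd_model and P_PRIME from the earlier sketches; each clmul64 call stands in for a vpmsumd with the unused doubleword of one operand zeroed):

/* A = a_1 x^64 + a_0 (a.hi = a_1, a.lo = a_0), likewise B.
   ap = a_1(x) p'(x) = a'_2 x^64 + a'_1, precomputed once per key
   as clmul64 (a.hi, P_PRIME). */
static u128
gf_mul_5 (u128 a, u128 ap, u128 b)
{
  u128 as = { a.lo, a.hi };                /* swapped A, also per-key */
  u128 l = clmul64 (a.lo, b.lo);           /* 1: a_0 b_0 */
  u128 m = vpmsumd_model (as, b);          /* 2: m_1 x^64 + m_0 */
  u128 hl = clmul64 (ap.lo, b.hi);         /* 3: a'_1 b_1 */
  u128 hh = clmul64 (ap.hi, b.hi);         /* 4: a'_2 b_1 */
  uint64_t h0 = hl.lo;
  uint64_t h1 = hl.hi ^ hh.lo;
  uint64_t h2 = hh.hi;
  u128 f = clmul64 (m.hi ^ h2, P_PRIME);   /* 5: (m_1 + h_2) p' */
  u128 r = { l.hi ^ m.lo ^ h1 ^ f.hi,
             l.lo ^ h0 ^ f.lo };
  return r;
}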
There are surely further tricks, e.g., it seems the a_0(x) b_0(x) multiply can share a vpmsumd instruction with the a'_1(x) b_1(x) multiply, if we just prepare a register with a'_1(x) x^64 + a_0(x); then replacing a_1 with the twice-as-large a' values comes for free. One could also attempt to let operations corresponding to separate input blocks share vpmsumd instructions.
Regards, /Niels