This is a stand-alone patch that applies all the previous patches to the optimized GCM implementation. This patch is based on the master upstream so it can be merged directly. It passes the testsuite and yields the expected performance.
---
 configure.ac               |   5 +-
 fat-ppc.c                  |  44 +++
 fat-setup.h                |   9 +
 gcm.c                      |  82 +++-
 powerpc64/README           |   2 +-
 powerpc64/fat/gcm-hash.asm |  40 ++
 powerpc64/p8/gcm-hash.asm  | 956 +++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 1121 insertions(+), 17 deletions(-)
 create mode 100644 powerpc64/fat/gcm-hash.asm
 create mode 100644 powerpc64/p8/gcm-hash.asm
diff --git a/configure.ac b/configure.ac index e9983697..13cfe33a 100644 --- a/configure.ac +++ b/configure.ac @@ -488,7 +488,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \ sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
# Assembler files which generate additional object files if they are used. -asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \ +asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm \ aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \ chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \ salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \ @@ -612,7 +612,8 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_ecc_secp384r1_redc #undef HAVE_NATIVE_ecc_secp521r1_modp #undef HAVE_NATIVE_ecc_secp521r1_redc -#undef HAVE_NATIVE_gcm_init_key8 +#undef HAVE_NATIVE_gcm_init_key +#undef HAVE_NATIVE_gcm_hash #undef HAVE_NATIVE_gcm_hash8 #undef HAVE_NATIVE_gcm_fill #undef HAVE_NATIVE_salsa20_core diff --git a/fat-ppc.c b/fat-ppc.c index 2bc50481..e69dff95 100644 --- a/fat-ppc.c +++ b/fat-ppc.c @@ -120,6 +120,20 @@ DECLARE_FAT_FUNC(_nettle_aes_decrypt, aes_crypt_internal_func) DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, c) DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, ppc64)
+#if GCM_TABLE_BITS == 8 +DECLARE_FAT_FUNC(_nettle_gcm_init_key, gcm_init_key_func) +DECLARE_FAT_FUNC_VAR(gcm_init_key, gcm_init_key_func, c) +DECLARE_FAT_FUNC_VAR(gcm_init_key, gcm_init_key_func, ppc64) + +DECLARE_FAT_FUNC(_nettle_gcm_hash, gcm_hash_func) +DECLARE_FAT_FUNC_VAR(gcm_hash, gcm_hash_func, c) +DECLARE_FAT_FUNC_VAR(gcm_hash, gcm_hash_func, ppc64) +#endif /* GCM_TABLE_BITS == 8 */ + +DECLARE_FAT_FUNC(_nettle_gcm_fill, gcm_fill_func) +DECLARE_FAT_FUNC_VAR(gcm_fill, gcm_fill_func, c) +DECLARE_FAT_FUNC_VAR(gcm_fill, gcm_fill_func, ppc64) + static void CONSTRUCTOR fat_init (void) { @@ -139,11 +153,25 @@ fat_init (void) fprintf (stderr, "libnettle: enabling arch 2.07 code.\n"); _nettle_aes_encrypt_vec = _nettle_aes_encrypt_ppc64; _nettle_aes_decrypt_vec = _nettle_aes_decrypt_ppc64; +#if GCM_TABLE_BITS == 8 + /* Make sure _nettle_gcm_init_key_vec function is compatible + with _nettle_gcm_hash_vec function e.g. _nettle_gcm_init_key_c() + fills gcm_key table with values that are incompatible with + _nettle_gcm_hash_ppc64() */ + _nettle_gcm_init_key_vec = _nettle_gcm_init_key_ppc64; + _nettle_gcm_hash_vec = _nettle_gcm_hash_ppc64; +#endif /* GCM_TABLE_BITS == 8 */ + _nettle_gcm_fill_vec = _nettle_gcm_fill_ppc64; } else { _nettle_aes_encrypt_vec = _nettle_aes_encrypt_c; _nettle_aes_decrypt_vec = _nettle_aes_decrypt_c; +#if GCM_TABLE_BITS == 8 + _nettle_gcm_init_key_vec = _nettle_gcm_init_key_c; + _nettle_gcm_hash_vec = _nettle_gcm_hash_c; +#endif /* GCM_TABLE_BITS == 8 */ + _nettle_gcm_fill_vec = _nettle_gcm_fill_c; } }
@@ -160,3 +188,19 @@ DEFINE_FAT_FUNC(_nettle_aes_decrypt, void, size_t length, uint8_t *dst, const uint8_t *src), (rounds, keys, T, length, dst, src)) + + #if GCM_TABLE_BITS == 8 +DEFINE_FAT_FUNC(_nettle_gcm_init_key, void, + (union nettle_block16 *table), + (table)) + +DEFINE_FAT_FUNC(_nettle_gcm_hash, void, + (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data), + (key, x, length, data)) +#endif /* GCM_TABLE_BITS == 8 */ + +DEFINE_FAT_FUNC(_nettle_gcm_fill, void, + (uint8_t *ctr, size_t blocks, + union nettle_block16 *buffer), + (ctr, blocks, buffer)) diff --git a/fat-setup.h b/fat-setup.h index 99f1ea67..623f9579 100644 --- a/fat-setup.h +++ b/fat-setup.h @@ -162,6 +162,15 @@ typedef void aes_crypt_internal_func (unsigned rounds, const uint32_t *keys, size_t length, uint8_t *dst, const uint8_t *src);
+#if GCM_TABLE_BITS == 8 +typedef void gcm_init_key_func (union nettle_block16 *table); + +typedef void gcm_hash_func (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data); +#endif /* GCM_TABLE_BITS == 8 */ + +typedef void gcm_fill_func (uint8_t *ctr, size_t blocks, union nettle_block16 *buffer); + typedef void *(memxor_func)(void *dst, const void *src, size_t n);
typedef void salsa20_core_func (uint32_t *dst, const uint32_t *src, unsigned rounds); diff --git a/gcm.c b/gcm.c index cf615daf..935d4420 100644 --- a/gcm.c +++ b/gcm.c @@ -140,6 +140,26 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table) memcpy (x->b, Z.b, sizeof(Z)); } # elif GCM_TABLE_BITS == 8 +# if HAVE_NATIVE_gcm_init_key + +#define gcm_init_key _nettle_gcm_init_key +void +_nettle_gcm_init_key (union nettle_block16 *table); +/* For fat builds */ +void +_nettle_gcm_init_key_c (union nettle_block16 *table); +# endif /* HAVE_NATIVE_gcm_init_key */ +# if HAVE_NATIVE_gcm_hash + +#define gcm_hash _nettle_gcm_hash +void +_nettle_gcm_hash (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data); +/* For fat builds */ +void +_nettle_gcm_hash_c (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data); +# endif /* HAVE_NATIVE_gcm_hash */ # if HAVE_NATIVE_gcm_hash8
#define gcm_hash _nettle_gcm_hash8 @@ -225,9 +245,45 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table)
#endif /* GCM_TABLE_BITS */
+#if HAVE_NATIVE_gcm_fill + +#define gcm_fill _nettle_gcm_fill +void +_nettle_gcm_fill (uint8_t *ctr, size_t blocks, union nettle_block16 *buffer); +/* For fat builds */ +void +_nettle_gcm_fill_c (uint8_t *ctr, size_t blocks, union nettle_block16 *buffer); +#endif /* HAVE_NATIVE_gcm_fill */ + /* Increment the rightmost 32 bits. */ #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4)
+#ifdef gcm_init_key +void +_nettle_gcm_init_key_c(union nettle_block16 *table) +#else +static void +gcm_init_key(union nettle_block16 *table) +#endif /* !gcm_init_key */ +{ +#if GCM_TABLE_BITS + /* Middle element if GCM_TABLE_BITS > 0, otherwise the first + element */ + unsigned i = (1<<GCM_TABLE_BITS)/2; + + /* Algorithm 3 from the gcm paper. First do powers of two, then do + the rest by adding. */ + while (i /= 2) + block16_mulx_ghash(&table[i], &table[2*i]); + for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2) + { + unsigned j; + for (j = 1; j < i; j++) + block16_xor3(&table[i+j], &table[i], &table[j]); + } +#endif +} + /* Initialization of GCM. * @ctx: The context of GCM * @cipher: The context of the underlying block cipher @@ -245,24 +301,18 @@ gcm_set_key(struct gcm_key *key, memset(key->h[0].b, 0, GCM_BLOCK_SIZE); f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b);
-#if GCM_TABLE_BITS - /* Algorithm 3 from the gcm paper. First do powers of two, then do - the rest by adding. */ - while (i /= 2) - block16_mulx_ghash(&key->h[i], &key->h[2*i]); - for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2) - { - unsigned j; - for (j = 1; j < i; j++) - block16_xor3(&key->h[i+j], &key->h[i],&key->h[j]); - } -#endif + gcm_init_key(key->h); }
-#ifndef gcm_hash +#ifdef gcm_hash +void +_nettle_gcm_hash_c(const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data) +#else static void gcm_hash(const struct gcm_key *key, union nettle_block16 *x, size_t length, const uint8_t *data) +#endif /* !gcm_hash */ { for (; length >= GCM_BLOCK_SIZE; length -= GCM_BLOCK_SIZE, data += GCM_BLOCK_SIZE) @@ -276,7 +326,6 @@ gcm_hash(const struct gcm_key *key, union nettle_block16 *x, gcm_gf_mul (x, key->h); } } -#endif /* !gcm_hash */
static void gcm_hash_sizes(const struct gcm_key *key, union nettle_block16 *x, @@ -333,9 +382,14 @@ gcm_update(struct gcm_ctx *ctx, const struct gcm_key *key, ctx->auth_size += length; }
+#ifdef gcm_fill +void +_nettle_gcm_fill_c(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer) +#else static nettle_fill16_func gcm_fill; static void gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer) +#endif /* !gcm_fill */ { uint32_t c;
diff --git a/powerpc64/README b/powerpc64/README index 7301953b..fac1108b 100644 --- a/powerpc64/README +++ b/powerpc64/README @@ -52,7 +52,7 @@ storage operands, refer to "6.4.1 Accessing Unaligned Storage Operands" in [3] to see an example of accessing unaligned storage operands. "lxvd2x/stxvd2x" can be used to load/store data into unaligned storage operands but permuting is needed for loading and storing data in -little-endian mode VSX registers are defined with "X" suffix +little-endian mode
Function Prologue
diff --git a/powerpc64/fat/gcm-hash.asm b/powerpc64/fat/gcm-hash.asm new file mode 100644 index 00000000..2d9f281e --- /dev/null +++ b/powerpc64/fat/gcm-hash.asm @@ -0,0 +1,40 @@ +C powerpc64/fat/gcm-hash.asm + + +ifelse(` + Copyright (C) 2020 Mamone Tarsha + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +dnl picked up by configure +dnl PROLOGUE(_nettle_gcm_init_key) +dnl PROLOGUE(_nettle_gcm_hash) +dnl PROLOGUE(_nettle_gcm_fill) + +define(`fat_transform', `$1_ppc64') +include_src(`powerpc64/p8/gcm-hash.asm') diff --git a/powerpc64/p8/gcm-hash.asm b/powerpc64/p8/gcm-hash.asm new file mode 100644 index 00000000..f57cbfe4 --- /dev/null +++ b/powerpc64/p8/gcm-hash.asm @@ -0,0 +1,956 @@ +C powerpc64/p8/gcm-hash.asm + +ifelse(` + Copyright (C) 2020 Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + +C Alignment of gcm_key table elements, which is declared in gcm.h +define(`TableElemAlign', `0x100') + +C Register usage: + +define(`SP', `r1') +define(`TOCP', `r2') + +define(`TABLE', `r3') +define(`X', `r4') +define(`LENGTH', `r5') +define(`DATA', `r6') + +define(`zero', `v0') +define(`swap_mask', `v1') +define(`hidw_mask', `v2') +define(`lodw_mask', `v3') +define(`poly', `v4') +define(`poly_h', `v4') +define(`poly_l', `v5') +define(`RP', `v6') +define(`Mh', `v7') +define(`Ml', `v8') +define(`H', `v9') +define(`Hh', `v10') +define(`Hl', `v11') +define(`RP2', `v9') +define(`M2h', `v10') +define(`M2l', `v11') + +define(`sl1', `v1') +define(`msb', `v5') +define(`H2', `v6') +define(`H2h', `v7') +define(`H2l', `v8') +define(`H_h', `v12') +define(`H_m', `v13') +define(`H_l', `v14') +define(`H_Hh', `v12') +define(`H_H', `v13') +define(`H_Hl', `v14') +define(`H_t', `v15') +define(`H2_h', `v16') +define(`H2_m', `v17') +define(`H2_l', `v18') +define(`H2_t', `v19') + +define(`C0', `v6') +define(`C1', `v7') +define(`C2', `v8') +define(`C3', `v12') +define(`C4', `v6') +define(`C5', `v7') +define(`C6', `v8') +define(`C7', `v12') + +define(`C', `v13') + +define(`Ch', `v14') +define(`Cl', `v15') +define(`Cm', `v16') + +define(`C01h', `v14') +define(`C01l', `v15') +define(`C01', `v16') +define(`C23h', `v17') +define(`C23l', `v18') +define(`C23', `v19') +define(`C45h', `v20') +define(`C45l', `v21') +define(`C45', `v22') +define(`C67h', `v6') +define(`C67l', `v7') +define(`C67', `v8') + +define(`H21', `v9') +define(`H21h', `v10') +define(`H21l', `v11') +define(`H43', `v23') +define(`H43h', `v24') +define(`H43l', `v25') +define(`H65', `v26') +define(`H65h', `v27') +define(`H65l', `v28') +define(`H87', `v29') +define(`H87h', `v30') +define(`H87l', `v31') + +# gcm_fill registers: + +define(`CTR', `r3') +define(`BLOCKS', `r4') +define(`BUFFER', `r5') + +define(`CTR0', `v2') +define(`CTR0S', `v3') +define(`CTR1', `v4') +define(`CTR2', `v5') +define(`CTR3', `v6') +define(`CTR4', `v7') +define(`CTR5', `v8') +define(`CTR6', `v9') +define(`CTR7', `v10') + +define(`I1', `v11') +define(`I2', `v12') +define(`I3', `v13') +define(`I4', `v14') +define(`I5', `v15') +define(`I6', `v16') +define(`I7', `v17') +define(`I8', `v18') + +.file "gcm-hash.asm" + +.text + + # void gcm_init_key (union gcm_block *table) + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_init_key) + DATA_LOAD_VEC(poly,.polynomial,r7) +IF_LE(`DATA_LOAD_VEC(swap_mask,.swap_mask,r7)') + DATA_LOAD_VEC(hidw_mask,.hidw_mask,r7) + DATA_LOAD_VEC(lodw_mask,.lodw_mask,r7) + + li r10,8*TableElemAlign + lxvd2x VSR(H),r10,TABLE # load H +IF_LE(`vperm H,H,H,swap_mask') + + # --- calculate H = H shift left 1 modulo polynomial --- + + vupkhsw msb,H # most significant bit word-extend + vspltisb sl1,1 # splat 1 for shift left + vspltw msb,msb,0 # most significant bit extend + vsl H,H,sl1 # H shift left 1 + vand msb,msb,poly + vxor zero,zero,zero + vxor H_t,H,msb + + vsldoi H,H_t,H_t,8 # doubleword swap + vsldoi Hh,H,zero,8 + vsldoi Hl,zero,H,8 + + # --- calculate H^2 = H*H --- + + # reduction pre-processing + vsldoi poly_h,zero,poly,8 + vsldoi poly_l,poly_h,poly_h,8 + + # polynomial multiplication "classical" + vpmsumd H_h,H_t,Hh # H^1h*H^1h + vpmsumd H_l,H_t,Hl # H^1l*H^1l + vpmsumd H_m,H_t,H # H^1h*H^1l⊕H^1l*H^1h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] + vsldoi Ml,H_m,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor H_h,H_h,Mh # [2] + vxor H_l,H_l,Ml # [2] + 
vxor H_l,H_l,RP # [1] + + # reduction second phase + vpmsumd RP,H_l,poly_l + vxor H_h,H_l,H_h + vxor H2_t,H_h,RP + + vsldoi H2,H2_t,H2_t,8 + vsldoi H2h,H2,zero,8 + vsldoi H2l,zero,H2,8 + + # --- calculate [H^2.Hi⊕H^2.Lo:H^1.Hi⊕H^1.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store H,[H^2.Hi⊕H^2.Lo:H^1.Hi⊕H^1.Lo] --- + + li r8,0*TableElemAlign + li r9,1*TableElemAlign + li r10,2*TableElemAlign + stxvd2x VSR(Hl),r8,TABLE + stxvd2x VSR(H),r9,TABLE + stxvd2x VSR(Hh),r10,TABLE + + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + stxvd2x VSR(H_Hh),r8,TABLE + stxvd2x VSR(H_H),r9,TABLE + stxvd2x VSR(H_Hl),r10,TABLE + + # --- calculate H^3,H^4 --- + + # polynomial multiplication "classical" + vpmsumd H_l,H_t,H2l # H^1l*H^2l + vpmsumd H_m,H_t,H2 # H^1h*H^2l⊕H^1l*H^2h + vpmsumd H_h,H_t,H2h # H^1h*H^2h + vpmsumd H2_l,H2_t,H2l # H^2l*H^2l + vpmsumd H2_m,H2_t,H2 # H^2h*H^2l⊕H^2l*H^2h + vpmsumd H2_h,H2_t,H2h # H^2h*H^2h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] H^3 + vpmsumd RP2,H2_l,poly_h # [1] H^4 + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] H^3 + vsldoi M2h,zero,H2_m,8 # [2] H^4 + vsldoi Ml,H_m,zero,8 # [2] H^3 + vsldoi M2l,H2_m,zero,8 # [2] H^4 + vsldoi RP,RP,RP,8 # [1] H^3 + vsldoi RP2,RP2,RP2,8 # [1] H^4 + vxor H_h,H_h,Mh # [2] H^3 + vxor H2_h,H2_h,M2h # [2] H^4 + vxor H_l,H_l,Ml # [2] H^3 + vxor H2_l,H2_l,M2l # [2] H^4 + vxor H_l,H_l,RP # [1] H^3 + vxor H2_l,H2_l,RP2 # [1] H^4 + + # reduction second phase + vpmsumd RP,H_l,poly_l # H^3 + vpmsumd RP2,H2_l,poly_l # H^4 + vxor H_h,H_l,H_h # H^3 + vxor H2_h,H2_l,H2_h # H^4 + vxor H_h,H_h,RP # H^3 + vxor H2_h,H2_h,RP2 # H^4 + + vsldoi H2,H2_h,H2_h,8 # H^4 + vsldoi H,H_h,H_h,8 # H^3 + vsldoi H2l,zero,H2,8 # H^4 + vsldoi H2h,H2,zero,8 # H^4 + + # --- calculate [H^4.Hi⊕H^4.Lo:H^3.Hi⊕H^3.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store [H^4.Hi⊕H^4.Lo:H^3.Hi⊕H^3.Lo] --- + + li r8,6*TableElemAlign + li r9,7*TableElemAlign + li r10,8*TableElemAlign + stxvd2x VSR(H_Hh),r8,TABLE + stxvd2x VSR(H_H),r9,TABLE + stxvd2x VSR(H_Hl),r10,TABLE + + # --- calculate H^5,H^6 --- + + # polynomial multiplication "classical" + vpmsumd H_l,H_t,H2l # H^1l*H^4l + vpmsumd H_m,H_t,H2 # H^1h*H^4l⊕H^1l*H^4h + vpmsumd H_h,H_t,H2h # H^1h*H^4h + vpmsumd H2_l,H2_t,H2l # H^2l*H^4l + vpmsumd H2_m,H2_t,H2 # H^2h*H^4l⊕H^2l*H^4h + vpmsumd H2_h,H2_t,H2h # H^2h*H^4h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] H^5 + vpmsumd RP2,H2_l,poly_h # [1] H^6 + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] H^5 + vsldoi M2h,zero,H2_m,8 # [2] H^6 + vsldoi Ml,H_m,zero,8 # [2] H^5 + vsldoi M2l,H2_m,zero,8 # [2] H^6 + vsldoi RP,RP,RP,8 # [1] H^5 + vsldoi RP2,RP2,RP2,8 # [1] H^6 + vxor H_h,H_h,Mh # [2] H^5 + vxor H2_h,H2_h,M2h # [2] H^6 + vxor H_l,H_l,Ml # [2] H^5 + vxor H2_l,H2_l,M2l # [2] H^6 + vxor H_l,H_l,RP # [1] H^5 + vxor H2_l,H2_l,RP2 # [1] H^6 + + # reduction second phase + vpmsumd RP,H_l,poly_l # H^5 + vpmsumd RP2,H2_l,poly_l # H^6 + vxor H_h,H_l,H_h # H^5 + vxor H2_h,H2_l,H2_h # H^6 + vxor H_h,H_h,RP # H^5 + vxor H2_h,H2_h,RP2 # H^6 + + vsldoi H2,H2_h,H2_h,8 # H^6 + vsldoi H,H_h,H_h,8 # H^5 + vsldoi H2l,zero,H2,8 # H^6 + vsldoi H2h,H2,zero,8 # H^6 + + # --- calculate [H^6.Hi⊕H^6.Lo:H^5.Hi⊕H^5.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store [H^6.Hi⊕H^6.Lo:H^5.Hi⊕H^5.Lo] --- + + li r8,9*TableElemAlign + 
li r9,10*TableElemAlign + li r10,11*TableElemAlign + stxvd2x VSR(H_Hh),r8,TABLE + stxvd2x VSR(H_H),r9,TABLE + stxvd2x VSR(H_Hl),r10,TABLE + + # --- calculate H^7,H^8 --- + + # polynomial multiplication "classical" + vpmsumd H_l,H_t,H2l # H^1l*H^6l + vpmsumd H_m,H_t,H2 # H^1h*H^6l⊕H^1l*H^6h + vpmsumd H_h,H_t,H2h # H^1h*H^6h + vpmsumd H2_l,H2_t,H2l # H^2l*H^6l + vpmsumd H2_m,H2_t,H2 # H^2h*H^6l⊕H^2l*H^6h + vpmsumd H2_h,H2_t,H2h # H^2h*H^6h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] H^7 + vpmsumd RP2,H2_l,poly_h # [1] H^8 + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] H^7 + vsldoi M2h,zero,H2_m,8 # [2] H^8 + vsldoi Ml,H_m,zero,8 # [2] H^7 + vsldoi M2l,H2_m,zero,8 # [2] H^8 + vsldoi RP,RP,RP,8 # [1] H^7 + vsldoi RP2,RP2,RP2,8 # [1] H^8 + vxor H_h,H_h,Mh # [2] H^7 + vxor H2_h,H2_h,M2h # [2] H^8 + vxor H_l,H_l,Ml # [2] H^7 + vxor H2_l,H2_l,M2l # [2] H^8 + vxor H_l,H_l,RP # [1] H^7 + vxor H2_l,H2_l,RP2 # [1] H^8 + + # reduction second phase + vpmsumd RP,H_l,poly_l # H^7 + vpmsumd RP2,H2_l,poly_l # H^8 + vxor H_h,H_l,H_h # H^7 + vxor H2_h,H2_l,H2_h # H^8 + vxor H_h,H_h,RP # H^7 + vxor H2_h,H2_h,RP2 # H^8 + + vsldoi H,H_h,H_h,8 # H^7 + vsldoi H2,H2_h,H2_h,8 # H^8 + + # --- calculate [H^8.Hi⊕H^8.Lo:H^7.Hi⊕H^7.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store [H^8.Hi⊕H^8.Lo:H^7.Hi⊕H^7.Lo] --- + + li r8,12*TableElemAlign + li r9,13*TableElemAlign + li r10,14*TableElemAlign + stxvd2x VSR(H_Hh),r8,TABLE + stxvd2x VSR(H_H),r9,TABLE + stxvd2x VSR(H_Hl),r10,TABLE + + blr +EPILOGUE(_nettle_gcm_init_key) + + # void gcm_hash (const struct gcm_key *key, union gcm_block *x, + # size_t length, const uint8_t *data) + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_hash) + vxor zero,zero,zero + + DATA_LOAD_VEC(poly,.polynomial,r7) +IF_LE(`DATA_LOAD_VEC(swap_mask,.swap_mask,r7)') + DATA_LOAD_VEC(hidw_mask,.hidw_mask,r7) + DATA_LOAD_VEC(lodw_mask,.lodw_mask,r7) + + vsldoi poly_h,zero,poly,8 + vsldoi poly_l,poly_h,poly_h,8 + + lxvd2x VSR(C),0,X # load X +IF_LE(`vperm C,C,C,swap_mask') + + srdi r7,LENGTH,7 # 8x loop count + cmpldi r7,0 + beq L2x + + # backup registers + stdu SP,-224(SP) + std r28,216(SP) + std r29,208(SP) + std r30,200(SP) + std r31,192(SP) + li 8,176 + stvx 20,8,SP + subi 8,8,16 + stvx 21,8,SP + subi 8,8,16 + stvx 22,8,SP + subi 8,8,16 + stvx 23,8,SP + subi 8,8,16 + stvx 24,8,SP + subi 8,8,16 + stvx 25,8,SP + subi 8,8,16 + stvx 26,8,SP + subi 8,8,16 + stvx 27,8,SP + subi 8,8,16 + stvx 28,8,SP + subi 8,8,16 + stvx 29,8,SP + subi 8,8,16 + stvx 30,8,SP + subi 8,8,16 + stvx 31,8,SP + + # table loading + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + lxvd2x VSR(H21h),r8,TABLE + lxvd2x VSR(H21),r9,TABLE + lxvd2x VSR(H21l),r10,TABLE + li r8,6*TableElemAlign + li r9,7*TableElemAlign + li r10,8*TableElemAlign + lxvd2x VSR(H43h),r8,TABLE + lxvd2x VSR(H43),r9,TABLE + lxvd2x VSR(H43l),r10,TABLE + li r8,9*TableElemAlign + li r9,10*TableElemAlign + li r10,11*TableElemAlign + lxvd2x VSR(H65h),r8,TABLE + lxvd2x VSR(H65),r9,TABLE + lxvd2x VSR(H65l),r10,TABLE + li r8,12*TableElemAlign + li r9,13*TableElemAlign + li r10,14*TableElemAlign + lxvd2x VSR(H87h),r8,TABLE + lxvd2x VSR(H87),r9,TABLE + lxvd2x VSR(H87l),r10,TABLE + + li r8,0x10 + li r9,0x20 + li r10,0x30 + li r28,0x40 + li r29,0x50 + li r30,0x60 + li r31,0x70 + + mtctr r7 +.align 5 +L8x_loop: + # input loading + lxvd2x VSR(C0),0,DATA # load C0 + lxvd2x VSR(C1),r8,DATA # load C1 + lxvd2x VSR(C2),r9,DATA # load C2 + lxvd2x 
VSR(C3),r10,DATA # load C3 + + # swap permuting +IF_LE(`vperm C0,C0,C0,swap_mask + vperm C1,C1,C1,swap_mask + vperm C2,C2,C2,swap_mask + vperm C3,C3,C3,swap_mask') + + # previous digest combining + vxor C0,C0,C + + # polynomial multiplication "karatsuba" pre-processing + vperm C23h,C2,C3,hidw_mask + vperm C23l,C2,C3,lodw_mask + vperm C01h,C0,C1,hidw_mask + vperm C01l,C0,C1,lodw_mask + + # input loading + lxvd2x VSR(C4),r28,DATA # load C4 + lxvd2x VSR(C5),r29,DATA # load C5 + lxvd2x VSR(C6),r30,DATA # load C6 + lxvd2x VSR(C7),r31,DATA # load C7 + + # swap permuting +IF_LE(`vperm C4,C4,C4,swap_mask + vperm C5,C5,C5,swap_mask + vperm C6,C6,C6,swap_mask + vperm C7,C7,C7,swap_mask') + + # polynomial multiplication "karatsuba" pre-processing + vperm C45h,C4,C5,hidw_mask + vperm C45l,C4,C5,lodw_mask + vperm C67h,C6,C7,hidw_mask + vperm C67l,C6,C7,lodw_mask + vxor C23,C23h,C23l + vxor C01,C01h,C01l + vxor C45,C45h,C45l + vxor C67,C67h,C67l + + # polynomial multiplication "karatsuba" + vpmsumd C23h,C23h,H65h # H23 = H^6h*C2h⊕H^5h*C3h + vpmsumd C23l,C23l,H65l # L23 = H^6l*C2l⊕H^5l*C3l + vpmsumd C01h,C01h,H87h # H01 = H^8h*C0h⊕H^7h*C1h + vpmsumd C01l,C01l,H87l # L01 = H^8l*C0l⊕H^7l*C1l + vpmsumd C67h,C67h,H21h # H67 = H^2h*C6h⊕H^1h*C7h + vpmsumd C67l,C67l,H21l # L67 = H^2l*C6l⊕H^1l*C7l + vpmsumd C45h,C45h,H43h # H45 = H^4h*C4h⊕H^3h*C5h + vpmsumd C45l,C45l,H43l # L45 = H^4l*C4l⊕H^3l*C5l + vpmsumd C23,C23,H65 # M23 = (H^6h⊕H^5h)*(C2h⊕C3h)⊕(H^6l⊕H^5l)*(C2l⊕C3l) + vpmsumd C01,C01,H87 # M01 = (H^8h⊕H^7h)*(C0h⊕C1h)⊕(H^8l⊕H^7l)*(C0l⊕C1l) + vpmsumd C45,C45,H43 # M45 = (H^4h⊕H^3h)*(C4h⊕C5h)⊕(H^4l⊕H^3l)*(C4l⊕C5l) + vpmsumd C67,C67,H21 # M67 = (H^2h⊕H^1h)*(C6h⊕C7h)⊕(H^2l⊕H^1l)*(C6l⊕C7l) + + # polynomial multiplication "karatsuba" post-processing + vxor C23,C23,C23h + vxor C01,C01,C01h + vxor C45,C45,C45h + vxor C67,C67,C67h + vxor C23,C23,C23l + vxor C01,C01,C01l + vxor C45,C45,C45l + vxor C67,C67,C67l + + # deferred recombination of partial products + vxor C01h,C01h,C23h # H0 = H01⊕H23 + vxor C45h,C45h,C67h # H1 = H45⊕H67 + vxor C01l,C01l,C23l # L0 = L01⊕L23 + vxor C45l,C45l,C67l # L1 = L45⊕L45 + vxor C01,C01,C23 # M0 = M01⊕M23 + vxor C45,C45,C67 # M1 = M45⊕M45 + vxor C01h,C01h,C45h # H = H0⊕H1 + vxor C01l,C01l,C45l # L = L0⊕L1 + vxor C01,C01,C45 # M = M0⊕M1 + + # reduction first phase # [1] + vpmsumd RP,C01l,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,C01,8 # [2] + vsldoi Ml,C01,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor C01h,C01h,Mh # [2] + vxor C01l,C01l,Ml # [2] + vxor C01l,C01l,RP # [1] + + # reduction second phase + vpmsumd RP,C01l,poly_l + vxor C01h,C01l,C01h + vxor C,C01h,RP + + addi DATA,DATA,0x80 + bdnz L8x_loop + + # restore registers + li 8,0 + lvx 31,8,SP + addi 8,8,16 + lvx 30,8,SP + addi 8,8,16 + lvx 29,8,SP + addi 8,8,16 + lvx 28,8,SP + addi 8,8,16 + lvx 27,8,SP + addi 8,8,16 + lvx 26,8,SP + addi 8,8,16 + lvx 25,8,SP + addi 8,8,16 + lvx 24,8,SP + addi 8,8,16 + lvx 23,8,SP + addi 8,8,16 + lvx 22,8,SP + addi 8,8,16 + lvx 21,8,SP + addi 8,8,16 + lvx 20,8,SP + ld r31,192(SP) + ld r30,200(SP) + ld r29,208(SP) + ld r28,216(SP) + addi SP,SP,224 + + clrldi LENGTH,LENGTH,57 +L2x: + srdi r7,LENGTH,5 + cmpldi r7,0 + beq L1x + + # table loading + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + lxvd2x VSR(H21h),r8,TABLE + lxvd2x VSR(H21),r9,TABLE + lxvd2x VSR(H21l),r10,TABLE + + li r10,0x10 + + mtctr r7 +.align 5 +L2x_loop: + # input loading + lxvd2x VSR(C0),0,DATA # load C0 + lxvd2x VSR(C1),r10,DATA # load C1 + + # swap permuting 
+IF_LE(`vperm C0,C0,C0,swap_mask + vperm C1,C1,C1,swap_mask') + + # previous digest combining + vxor C0,C0,C + + # polynomial multiplication "karatsuba" pre-processing + vperm C01h,C0,C1,hidw_mask + vperm C01l,C0,C1,lodw_mask + vxor C01,C01h,C01l + + # polynomial multiplication "karatsuba" + vpmsumd C01h,C01h,H21h # H01 = H^2h*C0h⊕H^1h*C1h + vpmsumd C01l,C01l,H21l # L01 = H^2l*C0l⊕H^1l*C1l + vpmsumd C01,C01,H21 # M01 = (H^2h⊕H^1h)*(C0h⊕C1h)⊕(H^2l⊕H^1l)*(C0l⊕C1l) + + # polynomial multiplication "karatsuba" post-processing + vxor C01,C01,C01h + vxor C01,C01,C01l + + # reduction first phase # [1] + vpmsumd RP,C01l,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,C01,8 # [2] + vsldoi Ml,C01,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor C01h,C01h,Mh # [2] + vxor C01l,C01l,Ml # [2] + vxor C01l,C01l,RP # [1] + + # reduction second phase + vpmsumd RP,C01l,poly_l + vxor C01h,C01l,C01h + vxor C,C01h,RP + + addi DATA,DATA,0x20 + bdnz L2x_loop + + clrldi LENGTH,LENGTH,59 +L1x: + srdi r7,LENGTH,4 + cmpldi r7,0 + beq Lrem + + # table loading + li r9,1*TableElemAlign + li r10,2*TableElemAlign + lxvd2x VSR(Hl),0,TABLE + lxvd2x VSR(H), r9,TABLE + lxvd2x VSR(Hh),r10,TABLE + + # input loading + lxvd2x VSR(C0),0,DATA # load C0 + + # swap permuting +IF_LE(`vperm C0,C0,C0,swap_mask') + + # previous digest combining + vxor C0,C0,C + + vpmsumd Cl,C0,Hl # L = Hl*Cl + vpmsumd Cm,C0,H # M = Hh*Cl⊕Hl*Ch + vpmsumd Ch,C0,Hh # H = Hh*Ch + + # reduction first phase # [1] + vpmsumd RP,Cl,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,Cm,8 # [2] + vsldoi Ml,Cm,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor Ch,Ch,Mh # [2] + vxor Cl,Cl,Ml # [2] + vxor Cl,Cl,RP # [1] + + # reduction second phase + vpmsumd RP,Cl,poly_l + vxor Ch,Cl,Ch + vxor C,Ch,RP + + addi DATA,DATA,0x10 + clrldi LENGTH,LENGTH,60 +Lrem: + cmpldi LENGTH,0 + beq Ldone + + # table loading + li r9,1*TableElemAlign + li r10,2*TableElemAlign + lxvd2x VSR(Hl),0,TABLE + lxvd2x VSR(H), r9,TABLE + lxvd2x VSR(Hh),r10,TABLE + + # input loading + stdu SP,-16(SP) + stvx zero,0,SP +Lst_loop: + subic. 
LENGTH,LENGTH,1 + lbzx r7,LENGTH,DATA + stbx r7,LENGTH,SP + bne Lst_loop + lxvd2x VSR(C0),0,SP + addi SP,SP,16 + + # swap permuting +IF_LE(`vperm C0,C0,C0,swap_mask') + + # previous digest combining + vxor C0,C0,C + + vpmsumd Cl,C0,Hl # L = Hl*Cl + vpmsumd Cm,C0,H # M = Hh*Cl⊕Hl*Ch + vpmsumd Ch,C0,Hh # H = Hh*Ch + + # reduction first phase # [1] + vpmsumd RP,Cl,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,Cm,8 # [2] + vsldoi Ml,Cm,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor Ch,Ch,Mh # [2] + vxor Cl,Cl,Ml # [2] + vxor Cl,Cl,RP # [1] + + # reduction second phase + vpmsumd RP,Cl,poly_l + vxor Ch,Cl,Ch + vxor C,Ch,RP + +Ldone: +IF_LE(`vperm C,C,C,swap_mask') + stxvd2x VSR(C),0,X # store C + blr +EPILOGUE(_nettle_gcm_hash) + + # gcm_fill (uint8_t *ctr, size_t blocks, union gcm_block *buffer) + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_fill) +IF_LE(`DATA_LOAD_VEC(swap_mask,.swap_mask,r6)') + + vxor zero,zero,zero + vspltisb I1,1 + vspltisb I2,2 + vspltisb I3,3 + vspltisb I4,4 + vspltisb I5,5 + vspltisb I6,6 + vspltisb I7,7 + vspltisb I8,8 + vsldoi I1,zero,I1,1 + vsldoi I2,zero,I2,1 + vsldoi I3,zero,I3,1 + vsldoi I4,zero,I4,1 + vsldoi I5,zero,I5,1 + vsldoi I6,zero,I6,1 + vsldoi I7,zero,I7,1 + vsldoi I8,zero,I8,1 + + lxvd2x VSR(CTR0),0,CTR + IF_LE(`vperm CTR0,CTR0,CTR0,swap_mask') + + srdi r6,BLOCKS,3 # 8x loop count + cmpldi r6,0 + beq Lfill_4x + + std r25,-56(SP); + std r26,-48(SP); + std r27,-40(SP); + std r28,-32(SP); + std r29,-24(SP); + std r30,-16(SP); + std r31,-8(SP); + + li r25,0x10 + li r26,0x20 + li r27,0x30 + li r28,0x40 + li r29,0x50 + li r30,0x60 + li r31,0x70 + + mtctr r6 + L8x_fill_loop: + vadduwm CTR1,CTR0,I1 + vadduwm CTR2,CTR0,I2 + vadduwm CTR3,CTR0,I3 + vadduwm CTR4,CTR0,I4 + vadduwm CTR5,CTR0,I5 + vadduwm CTR6,CTR0,I6 + vadduwm CTR7,CTR0,I7 + + IF_LE(`vperm CTR0S,CTR0,CTR0,swap_mask + vperm CTR1,CTR1,CTR1,swap_mask + vperm CTR2,CTR2,CTR2,swap_mask + vperm CTR3,CTR3,CTR3,swap_mask + vperm CTR4,CTR4,CTR4,swap_mask + vperm CTR5,CTR5,CTR5,swap_mask + vperm CTR6,CTR6,CTR6,swap_mask + vperm CTR7,CTR7,CTR7,swap_mask') + + IF_LE(`stxvd2x VSR(CTR0S),0,BUFFER') + IF_BE(`stxvd2x VSR(CTR0),0,BUFFER') + stxvd2x VSR(CTR1),r25,BUFFER + stxvd2x VSR(CTR2),r26,BUFFER + stxvd2x VSR(CTR3),r27,BUFFER + stxvd2x VSR(CTR4),r28,BUFFER + stxvd2x VSR(CTR5),r29,BUFFER + stxvd2x VSR(CTR6),r30,BUFFER + stxvd2x VSR(CTR7),r31,BUFFER + + vadduwm CTR0,CTR0,I8 + addi BUFFER,BUFFER,0x80 + bdnz L8x_fill_loop + + ld r25,-56(SP); + ld r26,-48(SP); + ld r27,-40(SP); + ld r28,-32(SP); + ld r29,-24(SP); + ld r30,-16(SP); + ld r31,-8(SP); + + clrldi BLOCKS,BLOCKS,61 + + Lfill_4x: + srdi r6,BLOCKS,2 + cmpldi r6,0 + beq Lfill_2x + + li r8,0x10 + li r9,0x20 + li r10,0x30 + + vadduwm CTR1,CTR0,I1 + vadduwm CTR2,CTR0,I2 + vadduwm CTR3,CTR0,I3 + + IF_LE(`vperm CTR0S,CTR0,CTR0,swap_mask + vperm CTR1,CTR1,CTR1,swap_mask + vperm CTR2,CTR2,CTR2,swap_mask + vperm CTR3,CTR3,CTR3,swap_mask') + + IF_LE(`stxvd2x VSR(CTR0S),0,BUFFER') + IF_BE(`stxvd2x VSR(CTR0),0,BUFFER') + stxvd2x VSR(CTR1),r8,BUFFER + stxvd2x VSR(CTR2),r9,BUFFER + stxvd2x VSR(CTR3),r10,BUFFER + + vadduwm CTR0,CTR0,I4 + addi BUFFER,BUFFER,0x40 + + clrldi BLOCKS,BLOCKS,62 + + Lfill_2x: + srdi r6,BLOCKS,1 + cmpldi r6,0 + beq Lfill_1x + + li r10,0x10 + + vadduwm CTR1,CTR0,I1 + + IF_LE(`vperm CTR0S,CTR0,CTR0,swap_mask + vperm CTR1,CTR1,CTR1,swap_mask') + + IF_LE(`stxvd2x VSR(CTR0S),0,BUFFER') + IF_BE(`stxvd2x VSR(CTR0),0,BUFFER') + stxvd2x VSR(CTR1),r10,BUFFER + + vadduwm CTR0,CTR0,I2 + addi BUFFER,BUFFER,0x20 + + clrldi 
BLOCKS,BLOCKS,63 + + Lfill_1x: + cmpldi BLOCKS,0 + beq Lfill_done + + IF_LE(`vperm CTR0S,CTR0,CTR0,swap_mask') + + IF_LE(`stxvd2x VSR(CTR0S),0,BUFFER') + IF_BE(`stxvd2x VSR(CTR0),0,BUFFER') + + vadduwm CTR0,CTR0,I1 + + Lfill_done: + IF_LE(`vperm CTR0,CTR0,CTR0,swap_mask') + stxvd2x VSR(CTR0),0,CTR + + blr +EPILOGUE(_nettle_gcm_fill) + + .data +IF_LE(`.align 4 +.polynomial: + .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 + .align 4 +.swap_mask: + .byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7 + .align 4 +.hidw_mask: + .byte 23,22,21,20,19,18,17,16,7,6,5,4,3,2,1,0 + .align 4 +.lodw_mask: + .byte 31,30,29,28,27,26,25,24,15,14,13,12,11,10,9,8') +IF_BE(`.align 4 +.polynomial: + .byte 0xc2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 + .align 4 +.hidw_mask: + .byte 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23 + .align 4 +.lodw_mask: + .byte 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31')
Maamoun TK <maamoun.tk@googlemail.com> writes:
This is a stand-alone patch that applies all the previous patches to the optimized GCM implementation. This patch is based on the master upstream so it can be merged directly.
Some questions on the overall structure:
What's the speedup you get from assembly gcm_fill? I see the C implementation uses memcpy and WRITE_UINT32, and is likely significantly slower than the ctr_fill16 in ctr.c. But it could be improved using portable means. If done well, it should be a very small fraction of the cpu time spent for gcm encryption.
What table layout is used by the assembly gcm_init_key? I would have expected that not many tables would be needed when using the special multiply instructions. And Nettle generally doesn't try very hard to optimize key setup, under the theory that applications that need high performance also do a lot of work with each key. E.g., we use C code for AES key setup even when it could be sped up by assembly using special instructions.
So it would be easier for me to start with a patch for gcm_hash only (possibly with supporting C code for key setup).
Regards, /Niels
What's the speedup you get from assembly gcm_fill? I see the C implementation uses memcpy and WRITE_UINT32, and is likely significantly slower than the ctr_fill16 in ctr.c. But it could be improved using portable means. If done well, it should be a very small fraction of the cpu time spent for gcm encryption.
Actually, there is a counter involved along with the store operations. The idea here is that the width of a single block is 16 bytes, which fits in a vector register, so only one store operation per block is needed, and executing e.g. 6 addition operations per cycle boosts the function's performance. I measured the execution time of both the C and altivec implementations on POWER8 for 32,768 blocks (512 KB), repeated 10000 times and compiled with -O3:

gcm_fill_c() took 0.000073 seconds to execute
gcm_fill_altivec() took 0.000019 seconds to execute

As you can see, the function itself isn't time consuming at all and maybe optimizing it is not worth it, but gcm_fill is part of AES_CTR, and what other libraries usually do is optimize AES_CTR as a whole, so I considered optimizing it to stay on the same track.
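For reference, that kind of number can be reproduced with a small stand-alone harness like the sketch below. It is a hypothetical benchmark, not Nettle's testsuite; fill_ref() only mimics the portable memcpy/WRITE_UINT32 fill rather than calling the static gcm_fill() in gcm.c. Build with e.g. cc -O3; absolute times of course depend on the machine.

#define _POSIX_C_SOURCE 199309L
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <time.h>

#define BLOCKS 32768   /* 512 KiB worth of 16-byte blocks, as above */
#define REPEAT 10000

/* stand-in for union nettle_block16 */
union block16 { uint8_t b[16]; uint64_t u64[2]; };

typedef void fill_func(uint8_t *ctr, size_t blocks, union block16 *buffer);

/* Portable reference: copy the leading 12 bytes of the counter block,
   store the big-endian 32-bit counter, post-increment it, and write the
   final counter value back. */
static void
fill_ref(uint8_t *ctr, size_t blocks, union block16 *buffer)
{
  size_t i;
  uint32_t c = ((uint32_t) ctr[12] << 24) | ((uint32_t) ctr[13] << 16)
    | ((uint32_t) ctr[14] << 8) | ctr[15];
  for (i = 0; i < blocks; i++, c++)
    {
      memcpy(buffer[i].b, ctr, 12);
      buffer[i].b[12] = c >> 24; buffer[i].b[13] = c >> 16;
      buffer[i].b[14] = c >> 8;  buffer[i].b[15] = c;
    }
  ctr[12] = c >> 24; ctr[13] = c >> 16; ctr[14] = c >> 8; ctr[15] = c;
}

/* Average seconds per call, i.e. per BLOCKS blocks. */
static double
time_fill(fill_func *fill, union block16 *buf)
{
  uint8_t ctr[16] = {0};
  struct timespec t0, t1;
  int r;
  clock_gettime(CLOCK_MONOTONIC, &t0);
  for (r = 0; r < REPEAT; r++)
    fill(ctr, BLOCKS, buf);
  clock_gettime(CLOCK_MONOTONIC, &t1);
  return ((t1.tv_sec - t0.tv_sec) + 1e-9 * (t1.tv_nsec - t0.tv_nsec)) / REPEAT;
}

int
main(void)
{
  static union block16 buf[BLOCKS];
  printf("fill_ref: %f seconds per %u blocks\n",
         time_fill(fill_ref, buf), (unsigned) BLOCKS);
  return 0;
}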
What table layout is used by the assembly gcm_init_key? I would have expected that not many tables would be needed when using the special multiply instructions. And Nettle generally doesn't try very hard to optimize key setup, under the theory that applications that need high performance also do a lot of work with each key. E.g., we use C code for AES key setup even when it could be sped up by assembly using special instructions.
So it would be easier for me to start with a patch for gcm_hash only (possibly with supporting C code for key setup).
It's a little bit more complicated than just using the special multiply instructions; let me explain.
From reference [1]:
To compute the GHASH digest of 4 consecutive blocks, we use a method of parallelization as described in References [3]. The method can be described by the following equations:

Ciphertext inputs: C0, C1, C2, C3. Digest input/output: Digest.

Digest = (((((((Digest⊕C0)*H)⊕C1)*H)⊕C2)*H)⊕C3)*H
       = ((Digest⊕C0)*H^4) ⊕ (C1*H^3) ⊕ (C2*H^2) ⊕ (C3*H)
As you can see, there are pre-computed constants here, "H, H^2, H^3, H^4", which should settle in a table to be used by subsequent hash operations. Furthermore, to handle the bit-reflection of the multiplication product, we precompute "H << 1 modulo the polynomial g(x)" and then use the identity reflected(A) * reflected(H << 1 mod g(x)) = reflected(A*H) mod g(x) to pre-compute "H^2, H^3, H^4": starting from H = H << 1 mod g(x), we get H^2 = H * H and so on. We can use these values as 128-bit constants and avoid shifting the product. Also, in the init function we have to precompute the inputs to the Karatsuba algorithm, so we adapt the previous constants to be compatible with it. In summary, the table layout varies according to the techniques used in the different implementations, so it's not practical to fill such a table with generic C code as a standard key setup; for example, there are several x86 GCM implementations in the OpenSSL library, and the table layouts of the SSE and AVX implementations differ from each other. A rough portable sketch of the idea follows below.
[1] https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/co...
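To make the scheme above concrete, here is a rough, portable C sketch. It is an illustration only: gf128_mul(), ghash_precompute() and ghash_update4() are hypothetical names, not Nettle functions, and this is neither the table layout nor the Karatsuba/vpmsumd arrangement that the PPC assembly actually stores. It precomputes H^1..H^4 once at key setup and then folds four blocks per aggregated round, exactly as in the equation quoted from [1].

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Bit-wise GF(2^128) multiply in the GHASH bit order (bit 0 = most
   significant bit of byte 0), i.e. Algorithm 1 from the GCM spec.
   Orders of magnitude slower than a vpmsumd-based multiply. */
static void
gf128_mul(uint8_t z[16], const uint8_t x[16], const uint8_t y[16])
{
  uint8_t v[16], r[16] = {0};
  int i, j, k;
  memcpy(v, y, 16);
  for (i = 0; i < 16; i++)
    for (j = 7; j >= 0; j--)
      {
        int lsb = v[15] & 1;
        if ((x[i] >> j) & 1)        /* r ^= v for each set bit of x */
          for (k = 0; k < 16; k++)
            r[k] ^= v[k];
        /* v <- v * t mod g(t): shift right, reduce with R = 11100001 || 0^120 */
        for (k = 15; k > 0; k--)
          v[k] = (v[k] >> 1) | (v[k - 1] << 7);
        v[0] >>= 1;
        if (lsb)
          v[0] ^= 0xe1;
      }
  memcpy(z, r, 16);
}

/* Key setup: a conceptual table of the powers H^1..H^4. */
static void
ghash_precompute(uint8_t pow[4][16], const uint8_t h[16])
{
  int i;
  memcpy(pow[0], h, 16);                /* H^1 */
  for (i = 1; i < 4; i++)
    gf128_mul(pow[i], pow[i - 1], h);   /* H^(i+1) = H^i * H */
}

/* One aggregated round over four blocks:
   digest = ((digest⊕C0)*H^4) ⊕ (C1*H^3) ⊕ (C2*H^2) ⊕ (C3*H) */
static void
ghash_update4(uint8_t digest[16], uint8_t pow[4][16], uint8_t c[4][16])
{
  uint8_t t[16], acc[16] = {0};
  int i, k;
  for (i = 0; i < 4; i++)
    {
      memcpy(t, c[i], 16);
      if (i == 0)                       /* fold the running digest into C0 */
        for (k = 0; k < 16; k++)
          t[k] ^= digest[k];
      gf128_mul(t, t, pow[3 - i]);      /* multiply by H^4, H^3, H^2, H^1 */
      for (k = 0; k < 16; k++)
        acc[k] ^= t[k];
    }
  memcpy(digest, acc, 16);
}

int
main(void)
{
  uint8_t h[16], c[4][16], pow[4][16], digest[16] = {0};
  int i;
  for (i = 0; i < 16; i++)
    h[i] = (uint8_t)(i + 1);            /* arbitrary hash subkey, demo only */
  memset(c, 0xab, sizeof(c));           /* arbitrary "ciphertext" blocks */
  ghash_precompute(pow, h);
  ghash_update4(digest, pow, c);
  for (i = 0; i < 16; i++)
    printf("%02x", digest[i]);
  printf("\n");
  return 0;
}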
Maamoun TK <maamoun.tk@googlemail.com> writes:
What's the speedup you get from assembly gcm_fill? I see the C implementation uses memcpy and WRITE_UINT32, and is likely significantly slower than the ctr_fill16 in ctr.c. But it could be improved using portable means. If done well, it should be a very small fraction of the cpu time spent for gcm encryption.
I measured the execution time of both the C and altivec implementations on POWER8 for 32,768 blocks (512 KB), repeated 10000 times and compiled with -O3:

gcm_fill_c() took 0.000073 seconds to execute
gcm_fill_altivec() took 0.000019 seconds to execute

As you can see, the function itself isn't time consuming at all and maybe optimizing it is not worth it,
Can you try the patch below? For now, it is tested on little-endian (x86_64) only, and there the loop compiles to:
  50:   89 c8                   mov    %ecx,%eax
  52:   4c 89 0a                mov    %r9,(%rdx)
  55:   48 83 c2 10             add    $0x10,%rdx
  59:   83 c1 01                add    $0x1,%ecx
  5c:   0f c8                   bswap  %eax
  5e:   48 c1 e0 20             shl    $0x20,%rax
  62:   4c 01 d0                add    %r10,%rax
  65:   48 89 42 f8             mov    %rax,-0x8(%rdx)
  69:   4c 39 c2                cmp    %r8,%rdx
  6c:   75 e2                   jne    50 <gcm_fill+0x20>
Should run in a few cycles per block (6 cycles per block, assuming dual issue and decent out-of-order capabilities). I would expect unrolling, to do multiple blocks in parallel, to give a large performance improvement only on strict in-order processors.
but gcm_fill is part of AES_CTR and what other libraries usually do is optimizing AES_CTR as a whole so I considered optimizing it to stay on the same track.
In Nettle, I strive to go to the extra complexity of assembler implementation only when there's a significant performance benefit.
Regards, /Niels
diff --git a/gcm.c b/gcm.c
index cf615daf..71e9f365 100644
--- a/gcm.c
+++ b/gcm.c
@@ -334,6 +334,46 @@ gcm_update(struct gcm_ctx *ctx, const struct gcm_key *key,
 }
 
 static nettle_fill16_func gcm_fill;
+#if WORDS_BIGENDIAN
+static void
+gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
+{
+  uint64_t hi, mid;
+  uint32_t lo;
+  size_t i;
+  hi = READ_UINT64(ctr);
+  mid = (uint64_t)READ_UINT32(ctr + 8) << 32;
+  lo = READ_UINT32(ctr + 12);
+
+  for (i = 0; i < blocks; i++)
+    {
+      buffer[i].u64[0] = hi;
+      buffer[i].u64[1] = mid + lo++;
+    }
+  WRITE_UINT32(ctr + 12, lo);
+
+}
+#elif HAVE_BUILTIN_BSWAP64
+/* Assume __builtin_bswap32 is also available */
+static void
+gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
+{
+  uint64_t hi, mid;
+  uint32_t lo;
+  size_t i;
+  hi = LE_READ_UINT64(ctr);
+  mid = LE_READ_UINT32(ctr + 8);
+  lo = READ_UINT32(ctr + 12);
+
+  for (i = 0; i < blocks; i++)
+    {
+      buffer[i].u64[0] = hi;
+      buffer[i].u64[1] = mid + ((uint64_t)__builtin_bswap32(lo) << 32);
+      lo++;
+    }
+  WRITE_UINT32(ctr + 12, lo);
+}
+#else
 static void
 gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
 {
@@ -349,6 +389,7 @@ gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
 
   WRITE_UINT32(ctr + GCM_BLOCK_SIZE - 4, c);
 }
+#endif
 
 void
 gcm_encrypt (struct gcm_ctx *ctx, const struct gcm_key *key,
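As a usage note: gcm_fill and gcm_hash are internal helpers, so from an application they are only exercised through the public GCM interface. A minimal sketch of such a caller (key, IV and message below are arbitrary placeholder bytes, not test vectors; link with -lnettle):

#include <stdio.h>
#include <stdint.h>
#include <nettle/gcm.h>

int
main(void)
{
  struct gcm_aes128_ctx ctx;
  uint8_t key[16] = {0};                 /* placeholder 128-bit key */
  uint8_t iv[GCM_IV_SIZE] = {0};         /* placeholder 96-bit IV */
  uint8_t msg[256] = {0};                /* placeholder plaintext */
  uint8_t cipher[sizeof(msg)];
  uint8_t tag[GCM_DIGEST_SIZE];
  size_t i;

  gcm_aes128_set_key(&ctx, key);         /* key setup, including the GHASH key table */
  gcm_aes128_set_iv(&ctx, sizeof(iv), iv);
  /* Encryption generates the counter keystream (gcm_fill) and
     accumulates the GHASH digest (gcm_hash) internally. */
  gcm_aes128_encrypt(&ctx, sizeof(msg), cipher, msg);
  gcm_aes128_digest(&ctx, sizeof(tag), tag);

  for (i = 0; i < sizeof(tag); i++)
    printf("%02x", tag[i]);
  printf("\n");
  return 0;
}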
It's gotten better with this patch; now it takes 0.000049 seconds to execute under the same circumstances.