--- configure.ac | 6 +- gcm.c | 49 +++- powerpc64/p8/gcm-hash.asm | 607 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 647 insertions(+), 15 deletions(-) create mode 100644 powerpc64/p8/gcm-hash.asm
diff --git a/configure.ac b/configure.ac index e9983697..0129f950 100644 --- a/configure.ac +++ b/configure.ac @@ -488,7 +488,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \ sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
# Assembler files which generate additional object files if they are used. -asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \ +asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm \ aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \ chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \ salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \ @@ -612,9 +612,9 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_ecc_secp384r1_redc #undef HAVE_NATIVE_ecc_secp521r1_modp #undef HAVE_NATIVE_ecc_secp521r1_redc -#undef HAVE_NATIVE_gcm_init_key8 +#undef HAVE_NATIVE_gcm_init_key +#undef HAVE_NATIVE_gcm_hash #undef HAVE_NATIVE_gcm_hash8 -#undef HAVE_NATIVE_gcm_fill #undef HAVE_NATIVE_salsa20_core #undef HAVE_NATIVE_salsa20_2core #undef HAVE_NATIVE_fat_salsa20_2core diff --git a/gcm.c b/gcm.c index 48b3e75a..81981c1c 100644 --- a/gcm.c +++ b/gcm.c @@ -140,6 +140,19 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table) memcpy (x->b, Z.b, sizeof(Z)); } # elif GCM_TABLE_BITS == 8 +# if HAVE_NATIVE_gcm_init_key + +#define gcm_init_key _nettle_gcm_init_key +void +_nettle_gcm_init_key (union nettle_block16 *table); +# endif /* HAVE_NATIVE_gcm_init_key */ +# if HAVE_NATIVE_gcm_hash + +#define gcm_hash _nettle_gcm_hash +void +_nettle_gcm_hash (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data); +# endif /* HAVE_NATIVE_gcm_hash */ # if HAVE_NATIVE_gcm_hash8
#define gcm_hash _nettle_gcm_hash8 @@ -228,6 +241,29 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table) /* Increment the rightmost 32 bits. */ #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4)
+#ifndef gcm_init_key +static void +gcm_init_key(union nettle_block16 *table) +{ +#if GCM_TABLE_BITS + /* Middle element if GCM_TABLE_BITS > 0, otherwise the first + element */ + unsigned i = (1<<GCM_TABLE_BITS)/2; + + /* Algorithm 3 from the gcm paper. First do powers of two, then do + the rest by adding. */ + while (i /= 2) + block16_mulx_ghash(&table[i], &table[2*i]); + for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2) + { + unsigned j; + for (j = 1; j < i; j++) + block16_xor3(&table[i+j], &table[i], &table[j]); + } +#endif +} +#endif /* !gcm_init_key */ + /* Initialization of GCM. * @ctx: The context of GCM * @cipher: The context of the underlying block cipher @@ -245,18 +281,7 @@ gcm_set_key(struct gcm_key *key, memset(key->h[0].b, 0, GCM_BLOCK_SIZE); f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b);
-#if GCM_TABLE_BITS - /* Algorithm 3 from the gcm paper. First do powers of two, then do - the rest by adding. */ - while (i /= 2) - block16_mulx_ghash(&key->h[i], &key->h[2*i]); - for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2) - { - unsigned j; - for (j = 1; j < i; j++) - block16_xor3(&key->h[i+j], &key->h[i],&key->h[j]); - } -#endif + gcm_init_key(key->h); }
#ifndef gcm_hash diff --git a/powerpc64/p8/gcm-hash.asm b/powerpc64/p8/gcm-hash.asm new file mode 100644 index 00000000..540c9f97 --- /dev/null +++ b/powerpc64/p8/gcm-hash.asm @@ -0,0 +1,607 @@ +C powerpc64/p8/gcm-hash.asm + +ifelse(` + Copyright (D) 2020 Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +C Alignment of gcm_key table elements, which is declared in gcm.h +define(`TableElemAlign', `0x100') + +C Register usage: + +define(`SP', `r1') +define(`TOCP', `r2') + +define(`TABLE', `r3') + +define(`ZERO', `v0') +define(`B1', `v1') +define(`EMSB', `v2') +define(`POLY', `v7') +define(`POLY_L', `v1') +define(`POLY_H', `v2') + +define(`H', `v3') +define(`H3', `v3') +define(`Hl', `v4') +define(`Hm', `v5') +define(`Hh', `v6') +define(`RP', `v7') +define(`Mh', `v8') +define(`Ml', `v9') +define(`H2', `v7') +define(`H4', `v7') +define(`H2m', `v8') +define(`H2l', `v9') +define(`H2h', `v10') +define(`RP2', `v11') +define(`M2h', `v12') +define(`M2l', `v13') + +define(`H2_l', `v14') +define(`H2_m', `v15') +define(`H2_h', `v16') +define(`H3_l', `v14') +define(`H3_m', `v15') +define(`H3_h', `v16') +define(`H4_l', `v17') +define(`H4_m', `v18') +define(`H4_h', `v19') + +define(`H21l', `v16') +define(`H21h', `v17') +define(`H3m', `v16') +define(`H4m', `v17') +define(`H43l', `v18') +define(`H43h', `v19') + +define(`LE_TEMP', `v18') +define(`LE_MASK', `v19') + +.file "gcm-hash.asm" + +.text + + C void gcm_init_key (union gcm_block *table) + +C This function populates the gcm table as the following layout +C ******************************************************************** +C | Hm = low-order doubleword of H^1:high-order doubleword of H^1 | +C | Hl = 64-bits zeros:low-order doubleword of H^1 | +C | Hh = high-order doubleword of H^1:64-bits zeros | +C | | +C | H2m = low-order doubleword of H^2:high-order doubleword of H^2 | +C | H21l = low-order doubleword of H^2:low-order doubleword of H^1 | +C | H21h = high-order doubleword of H^2:high-order doubleword of H^1 | +C | | +C | H3m = low-order doubleword of H^3:high-order doubleword of H^3 | +C | H4m = low-order doubleword of H^4:high-order doubleword of H^4 | +C | H43l = low-order doubleword of H^4:low-order doubleword of H^3 | +C | H43h = high-order doubleword of H^4:high-order doubleword of H^3 | +C ******************************************************************** + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_init_key) + DATA_LOAD_VEC(POLY,.polynomial,r7) C 0xC2000000000000000000000000000001 +IF_LE(` + li r8,0 + lvsl LE_MASK,0,r8 C 0x000102030405060708090A0B0C0D0E0F + vspltisb LE_TEMP,0x07 C 0x07070707070707070707070707070707 + vxor LE_MASK,LE_MASK,LE_TEMP C 
0x07060504030201000F0E0D0C0B0A0908 +') + + C 'H' is assigned by gcm_set_key() to the middle element of the table + li r10,8*TableElemAlign + lxvd2x VSR(H),r10,TABLE C load 'H' + C byte-reverse of each doubleword permuting on little-endian mode +IF_LE(` + vperm H,H,H,LE_MASK +') + + C --- calculate [H = H << 1 modulo polynomial] --- + + vupkhsb EMSB,H C extend most significant bit to first byte + vspltisb B1,1 C 0x01010101010101010101010101010101 + vspltb EMSB,EMSB,0 C first byte quadword-extend + vsl H,H,B1 C H = H << 1 + vand EMSB,EMSB,POLY C EMSB &= 0xC2000000000000000000000000000001 + vxor ZERO,ZERO,ZERO C 0x00000000000000000000000000000000 + vxor H,H,EMSB C H ^= EMSB + + C calculate [Hl = 0:H^1l, Hm = H^1l:H^1h, Hh = H^1h:0] + xxmrgld VSR(Hl),VSR(ZERO),VSR(H) + xxswapd VSR(Hm),VSR(H) + xxmrghd VSR(Hh),VSR(H),VSR(ZERO) + + C --- calculate H^2 = H*H --- + + C reduction pre-processing + xxmrghd VSR(POLY_H),VSR(POLY),VSR(ZERO) C 0xC2000000000000000000000000000000 + xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY) C 0x0000000000000000C200000000000000 + + C polynomial multiplication "classical" + vpmsumd H2_l,H,Hl C H^1l*H^1l + vpmsumd H2_m,H,Hm C H^1h*H^1l⊕H^1l*H^1h + vpmsumd H2_h,H,Hh C H^1h*H^1h + + C reduction first phase [1] + vpmsumd RP,H2_l,POLY_L C [1] + + C polynomial multiplication post-processing [2] + xxmrghd VSR(Mh),VSR(ZERO),VSR(H2_m) C [2] + xxmrgld VSR(Ml),VSR(H2_m),VSR(ZERO) C [2] + xxswapd VSR(RP),VSR(RP) C [1] + vxor H2_h,H2_h,Mh C [2] + vxor H2_l,H2_l,Ml C [2] + vxor H2_l,H2_l,RP C [1] + + C reduction second phase + vpmsumd RP,H2_l,POLY_H + vxor H2_h,H2_h,H2_l + vxor H2,H2_h,RP + + C store [H2m = H^2l:H^2h, H2l = 0:H^2l, H2h = H^2h:0] + xxswapd VSR(H2m),VSR(H2) + xxmrgld VSR(H2l),VSR(ZERO),VSR(H2) + xxmrghd VSR(H2h),VSR(H2),VSR(ZERO) + + C calculate [H21l = H^2l:H^1l, H21h = H^2h:H^1h] + xxmrgld VSR(H21l),VSR(H2),VSR(H) + xxmrghd VSR(H21h),VSR(H2),VSR(H) + + C store [Hm, Hl, Hh] + li r9,1*TableElemAlign + li r10,2*TableElemAlign + stxvd2x VSR(Hm),0,TABLE + stxvd2x VSR(Hl),r9,TABLE + stxvd2x VSR(Hh),r10,TABLE + + C store [H2m, H21l, H21h] + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + stxvd2x VSR(H2m),r8,TABLE + stxvd2x VSR(H21l),r9,TABLE + stxvd2x VSR(H21h),r10,TABLE + + C --- calculate H^3 = H^1*H^2, H^4 = H^2*H^2 --- + + C polynomial multiplication "classical" + vpmsumd H3_l,H,H2l C H^1l*H^2l + vpmsumd H4_l,H2,H2l C H^2l*H^2l + vpmsumd H3_m,H,H2m C H^1h*H^2l⊕H^1l*H^2h + vpmsumd H4_m,H2,H2m C H^2h*H^2l⊕H^2l*H^2h + vpmsumd H3_h,H,H2h C H^1h*H^2h + vpmsumd H4_h,H2,H2h C H^2h*H^2h + + C reduction first phase [1] + vpmsumd RP,H3_l,POLY_L C [1] H^3 + vpmsumd RP2,H4_l,POLY_L C [1] H^4 + + C polynomial multiplication post-processing [2] + xxmrghd VSR(Mh),VSR(ZERO),VSR(H3_m) C [2] H^3 + xxmrghd VSR(M2h),VSR(ZERO),VSR(H4_m) C [2] H^4 + xxmrgld VSR(Ml),VSR(H3_m),VSR(ZERO) C [2] H^3 + xxmrgld VSR(M2l),VSR(H4_m),VSR(ZERO) C [2] H^4 + xxswapd VSR(RP),VSR(RP) C [1] H^3 + xxswapd VSR(RP2),VSR(RP2) C [1] H^4 + vxor H3_h,H3_h,Mh C [2] H^3 + vxor H4_h,H4_h,M2h C [2] H^4 + vxor H3_l,H3_l,Ml C [2] H^3 + vxor H4_l,H4_l,M2l C [2] H^4 + vxor H3_l,H3_l,RP C [1] H^3 + vxor H4_l,H4_l,RP2 C [1] H^4 + + C reduction second phase + vpmsumd RP,H3_l,POLY_H C H^3 + vpmsumd RP2,H4_l,POLY_H C H^4 + vxor H3_h,H3_h,H3_l C H^3 + vxor H4_h,H4_h,H4_l C H^4 + vxor H3,H3_h,RP C H^3 + vxor H4,H4_h,RP2 C H^4 + + C calculate [H3m = H^3l:H^3h, H4m = H^4l:H^4h, H43l = H^4l:H^3l, H43h = H^4h:H^3h] + xxswapd VSR(H3m),VSR(H3) + xxswapd VSR(H4m),VSR(H4) + xxmrgld VSR(H43l),VSR(H4),VSR(H3) + xxmrghd 
VSR(H43h),VSR(H4),VSR(H3) + + C store [H3m, H4m, H43l, H43h] + li r7,6*TableElemAlign + li r8,7*TableElemAlign + li r9,8*TableElemAlign + li r10,9*TableElemAlign + stxvd2x VSR(H3m),r7,TABLE + stxvd2x VSR(H4m),r8,TABLE + stxvd2x VSR(H43l),r9,TABLE + stxvd2x VSR(H43h),r10,TABLE + + blr +EPILOGUE(_nettle_gcm_init_key) + +define(`TABLE', `r3') +define(`X', `r4') +define(`LENGTH', `r5') +define(`DATA', `r6') + +define(`ZERO', `v0') +define(`POLY', `v3') +define(`POLY_L', `v1') +define(`POLY_H', `v2') + +define(`D', `v3') +define(`C0', `v4') +define(`C1', `v5') +define(`C2', `v6') +define(`C3', `v7') +define(`Mh', `v8') +define(`Ml', `v9') +define(`RP', `v10') +define(`C01h', `v11') +define(`C01l', `v12') +define(`C23h', `v13') +define(`C23l', `v14') + +define(`H1', `v15') +define(`H2', `v16') +define(`H21l', `v17') +define(`H21h', `v18') +define(`H3', `v20') +define(`H4', `v21') +define(`H43l', `v22') +define(`H43h', `v23') + +define(`Cl', `v5') +define(`Cm', `v6') +define(`Ch', `v7') +define(`Hl', `v15') +define(`H', `v16') +define(`Hh', `v17') + +define(`LE_TEMP', `v18') +define(`LE_MASK', `v19') + + C void gcm_hash (const struct gcm_key *key, union gcm_block *x, + C size_t length, const uint8_t *data) + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_hash) + DATA_LOAD_VEC(POLY,.polynomial,r7) +IF_LE(` + li r8,0 + lvsl LE_MASK,0,r8 + vspltisb LE_TEMP,0x07 + vxor LE_MASK,LE_MASK,LE_TEMP +') + vxor ZERO,ZERO,ZERO + + xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY) + xxmrghd VSR(POLY_H),VSR(POLY),VSR(ZERO) + + lxvd2x VSR(D),0,X C load 'X' pointer + C byte-reverse of each doubleword permuting on little-endian mode +IF_LE(` + vperm D,D,D,LE_MASK +') + + C --- process 4 blocks '128-bit each' per one loop --- + + srdi r7,LENGTH,6 C 4-blocks loop count 'LENGTH / (4 * 16)' + cmpldi r7,0 + beq L2x + + mtctr r7 C assign counter register to loop count + + C backup non-volatile vector registers + addi r8,SP,-64 + stvx 20,0,r8 + addi r8,r8,16 + stvx 21,0,r8 + addi r8,r8,16 + stvx 22,0,r8 + addi r8,r8,16 + stvx 23,0,r8 + + C load table elements + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + lxvd2x VSR(H1),0,TABLE + lxvd2x VSR(H2),r8,TABLE + lxvd2x VSR(H21l),r9,TABLE + lxvd2x VSR(H21h),r10,TABLE + li r7,6*TableElemAlign + li r8,7*TableElemAlign + li r9,8*TableElemAlign + li r10,9*TableElemAlign + lxvd2x VSR(H3),r7,TABLE + lxvd2x VSR(H4),r8,TABLE + lxvd2x VSR(H43l),r9,TABLE + lxvd2x VSR(H43h),r10,TABLE + + li r8,0x10 + li r9,0x20 + li r10,0x30 +.align 5 +L4x_loop: + C input loading + lxvd2x VSR(C0),0,DATA C load C0 + lxvd2x VSR(C1),r8,DATA C load C1 + lxvd2x VSR(C2),r9,DATA C load C2 + lxvd2x VSR(C3),r10,DATA C load C3 + +IF_LE(` + vperm C0,C0,C0,LE_MASK + vperm C1,C1,C1,LE_MASK + vperm C2,C2,C2,LE_MASK + vperm C3,C3,C3,LE_MASK +') + + C previous digest combining + vxor C0,C0,D + + C polynomial multiplication "classical" pre-processing + xxmrghd VSR(C23h),VSR(C2),VSR(C3) + xxmrgld VSR(C23l),VSR(C2),VSR(C3) + xxmrghd VSR(C01h),VSR(C0),VSR(C1) + xxmrgld VSR(C01l),VSR(C0),VSR(C1) + + C polynomial multiplication "classical" + vpmsumd C3,C3,H1 C M3 = H^1l*C3h⊕H^1h*C3l + vpmsumd C2,C2,H2 C M2 = H^2l*C2h⊕H^2h*C2l + vpmsumd C1,C1,H3 C M1 = H^3l*C1h⊕H^3h*C1l + vpmsumd C0,C0,H4 C M0 = H^4l*C0h⊕H^4h*C0l + vpmsumd C23h,C23h,H21h C H23 = H^2h*C2h⊕H^1h*C3h + vpmsumd C23l,C23l,H21l C L23 = H^2l*C2l⊕H^1l*C3l + vpmsumd C01h,C01h,H43h C H01 = H^4h*C0h⊕H^4h*C1h + vpmsumd C01l,C01l,H43l C L01 = H^4l*C0l⊕H^4l*C1l + + C polynomial multiplication "classical" post-processing + vxor C2,C2,C3 C M2 = M2⊕M3 + 
vxor C0,C0,C1 C M0 = M0⊕M1 + + C deferred recombination of partial products + vxor C01h,C01h,C23h C H0 = H01⊕H23 + vxor C01l,C01l,C23l C L0 = L01⊕L23 + vxor C0,C0,C2 C M0 = M0⊕M2 + + C reduction first phase [1] + vpmsumd RP,C01l,POLY_L C [1] + + C polynomial multiplication post-processing [2] + xxmrghd VSR(Mh),VSR(ZERO),VSR(C0) C [2] + xxmrgld VSR(Ml),VSR(C0),VSR(ZERO) C [2] + xxswapd VSR(RP),VSR(RP) C [1] + vxor C01h,C01h,Mh C [2] + vxor C01l,C01l,Ml C [2] + vxor C01l,C01l,RP C [1] + + C reduction second phase + vpmsumd RP,C01l,POLY_H + vxor C01h,C01l,C01h + vxor D,C01h,RP + + addi DATA,DATA,0x40 + bdnz L4x_loop + + C restore non-volatile vector registers + addi r8,SP,-64 + lvx 20,0,r8 + addi r8,r8,16 + lvx 21,0,r8 + addi r8,r8,16 + lvx 22,0,r8 + addi r8,r8,16 + lvx 23,0,r8 + + clrldi LENGTH,LENGTH,58 C 'set the high-order 58 bits to zeros' +L2x: + C --- process 2 blocks --- + + srdi r7,LENGTH,5 C 'LENGTH / (2 * 16)' + cmpldi r7,0 + beq L1x + + C load table elements + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + lxvd2x VSR(H1),0,TABLE + lxvd2x VSR(H2),r8,TABLE + lxvd2x VSR(H21l),r9,TABLE + lxvd2x VSR(H21h),r10,TABLE + + C input loading + li r10,0x10 + lxvd2x VSR(C0),0,DATA C load C0 + lxvd2x VSR(C1),r10,DATA C load C1 + +IF_LE(` + vperm C0,C0,C0,LE_MASK + vperm C1,C1,C1,LE_MASK +') + + C previous digest combining + vxor C0,C0,D + + C polynomial multiplication "classical" pre-processing + xxmrghd VSR(C01h),VSR(C0),VSR(C1) + xxmrgld VSR(C01l),VSR(C0),VSR(C1) + + C polynomial multiplication "classical" + vpmsumd C1,C1,H1 C M1 = H^1l*C1h⊕H^1h*C1l + vpmsumd C0,C0,H2 C M0 = H^2l*C0h⊕H^2h*C0l + vpmsumd C01h,C01h,H21h C H01 = H^2h*C0h⊕H^1h*C1h + vpmsumd C01l,C01l,H21l C L01 = H^2l*C0l⊕H^1l*C1l + + C deferred recombination of partial products + vxor C0,C0,C1 C M0 = M0⊕M1 + + C reduction first phase [1] + vpmsumd RP,C01l,POLY_L C [1] + + C polynomial multiplication post-processing [2] + xxmrghd VSR(Mh),VSR(ZERO),VSR(C0) C [2] + xxmrgld VSR(Ml),VSR(C0),VSR(ZERO) C [2] + xxswapd VSR(RP),VSR(RP) C [1] + vxor C01h,C01h,Mh C [2] + vxor C01l,C01l,Ml C [2] + vxor C01l,C01l,RP C [1] + + C reduction second phase + vpmsumd RP,C01l,POLY_H + vxor C01h,C01l,C01h + vxor D,C01h,RP + + addi DATA,DATA,0x20 + clrldi LENGTH,LENGTH,59 C 'set the high-order 59 bits to zeros' +L1x: + C --- process 1 block --- + + srdi r7,LENGTH,4 C 'LENGTH / (1 * 16)' + cmpldi r7,0 + beq Lmod + + C load table elements + li r9,1*TableElemAlign + li r10,2*TableElemAlign + lxvd2x VSR(H),0,TABLE + lxvd2x VSR(Hl),r9,TABLE + lxvd2x VSR(Hh),r10,TABLE + + C input loading + lxvd2x VSR(C0),0,DATA C load C0 + +IF_LE(` + vperm C0,C0,C0,LE_MASK +') + + C previous digest combining + vxor C0,C0,D + + C polynomial multiplication "classical" + vpmsumd Cl,C0,Hl C L = Hl*Cl + vpmsumd Cm,C0,H C M = Hh*Cl⊕Hl*Ch + vpmsumd Ch,C0,Hh C H = Hh*Ch + + C reduction first phase C [1] + vpmsumd RP,Cl,POLY_L C [1] + + C polynomial multiplication post-processing [2] + xxmrghd VSR(Mh),VSR(ZERO),VSR(Cm) C [2] + xxmrgld VSR(Ml),VSR(Cm),VSR(ZERO) C [2] + xxswapd VSR(RP),VSR(RP) C [1] + vxor Ch,Ch,Mh C [2] + vxor Cl,Cl,Ml C [2] + vxor Cl,Cl,RP C [1] + + C reduction second phase + vpmsumd RP,Cl,POLY_H + vxor Ch,Cl,Ch + vxor D,Ch,RP + + addi DATA,DATA,0x10 + clrldi LENGTH,LENGTH,60 C 'set the high-order 60 bits to zeros' +Lmod: + C --- process the modulo bytes, padding the low-order bytes with zeros --- + + cmpldi LENGTH,0 + beq Ldone + + C load table elements + li r9,1*TableElemAlign + li r10,2*TableElemAlign + lxvd2x VSR(H),0,TABLE + lxvd2x 
VSR(Hl),r9,TABLE + lxvd2x VSR(Hh),r10,TABLE + + C push every modulo byte to the stack and load them with padding into vector register + addi r8,SP,-16 + stvx ZERO,0,r8 +Lstb_loop: + subic. LENGTH,LENGTH,1 + lbzx r7,LENGTH,DATA + stbx r7,LENGTH,r8 + bne Lstb_loop + lxvd2x VSR(C0),0,r8 + +IF_LE(` + vperm C0,C0,C0,LE_MASK +') + + C previous digest combining + vxor C0,C0,D + + C polynomial multiplication "classical" + vpmsumd Cl,C0,Hl C L = Hl*Cl + vpmsumd Cm,C0,H C M = Hh*Cl⊕Hl*Ch + vpmsumd Ch,C0,Hh C H = Hh*Ch + + C reduction first phase [1] + vpmsumd RP,Cl,POLY_L C [1] + + C polynomial multiplication post-processing [2] + xxmrghd VSR(Mh),VSR(ZERO),VSR(Cm) C [2] + xxmrgld VSR(Ml),VSR(Cm),VSR(ZERO) C [2] + xxswapd VSR(RP),VSR(RP) C [1] + vxor Ch,Ch,Mh C [2] + vxor Cl,Cl,Ml C [2] + vxor Cl,Cl,RP C [1] + + C reduction second phase + vpmsumd RP,Cl,POLY_H + vxor Ch,Cl,Ch + vxor D,Ch,RP + +Ldone: + C byte-reverse of each doubleword permuting on little-endian mode +IF_LE(` + vperm D,D,D,LE_MASK +') + stxvd2x VSR(D),0,X C store digest 'D' + + blr +EPILOGUE(_nettle_gcm_hash) + +.data + C 0xC2000000000000000000000000000001 +.polynomial: +.align 4 +IF_BE(` +.byte 0xC2 +.rept 14 +.byte 0x00 +.endr +.byte 0x01 +',` +.byte 0x01 +.rept 14 +.byte 0x00 +.endr +.byte 0xC2 +')
Maamoun TK maamoun.tk@googlemail.com writes:
+L4x_loop:
[...]
- C polynomial multiplication "classical" pre-processing
- xxmrghd VSR(C23h),VSR(C2),VSR(C3)
- xxmrgld VSR(C23l),VSR(C2),VSR(C3)
- xxmrghd VSR(C01h),VSR(C0),VSR(C1)
- xxmrgld VSR(C01l),VSR(C0),VSR(C1)
- C polynomial multiplication "classical"
- vpmsumd C3,C3,H1 C M3 = H^1l*C3h⊕H^1h*C3l
- vpmsumd C2,C2,H2 C M2 = H^2l*C2h⊕H^2h*C2l
- vpmsumd C1,C1,H3 C M1 = H^3l*C1h⊕H^3h*C1l
- vpmsumd C0,C0,H4 C M0 = H^4l*C0h⊕H^4h*C0l
- vpmsumd C23h,C23h,H21h C H23 = H^2h*C2h⊕H^1h*C3h
- vpmsumd C23l,C23l,H21l C L23 = H^2l*C2l⊕H^1l*C3l
- vpmsumd C01h,C01h,H43h C H01 = H^4h*C0h⊕H^4h*C1h
- vpmsumd C01l,C01l,H43l C L01 = H^4l*C0l⊕H^4l*C1l
So you do 4 blocks with only 10 vpmsumd instructions (the 8 above, and 2 more below for the reduction). That's nice! It would be even nicer if the H terms could be rearranged to eliminate the pre-processing of the C values.
And it is all with the polynomial's bits reversed compared to the spec? I find the spec, https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pd..., not crystal clear, but as I understand it, the bit that is least significant when interpreted as a plain integer corresponds to the highest power of x, while instructions like vpmsumd use the opposite (and more natural, imo) convention.
This is how I understand bit reversal. We want
c(x) = a(x) b(x) mod p(x) = a(x) b(x) + q(x) p(x)
where a, b and c all are 128 bits, p is 129 bits, and q is 127 bits (chosen so the high 127 bits of the 255 bit product cancel). Reverse as
c'(x) = x^127 c(1/x), similarly for a', b',
q'(x) = x^126 q(1/x), p'(x) = x^128 p(1/x)
Then we get
c'(x) = [a'(x) b'(x) + q'(x) p'(x)] / x^127
i.e., q' now cancels the *low* 127 bits of the product. Which can also be written as
c'(x) = a'(x) b'(x) / x^127 (mod p'(x))
So in this way, bit reversal is a bit similar to montgomery representation for modular arithmetic on integers. And if x^128, with its corresponding bit boundary, is more convenient than x^127, one can either shift the product one bit left, or premultiply one of the factors by x mod p'(x).
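To make the reversal concrete, here is a small standalone C check (just a sketch, not part of the patch; clmul64 and rev64 are made-up helper names) of the underlying identity: the carry-less product of the bit-reversed operands equals the bit-reversed 128-bit product of the original operands, shifted right by one bit.

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

/* 128-bit carry-less product, p = hi x^64 + lo. */
struct b128 { uint64_t hi, lo; };

/* Schoolbook 64x64 carry-less multiply. */
static struct b128
clmul64 (uint64_t a, uint64_t b)
{
  struct b128 r = { 0, 0 };
  int i;
  for (i = 0; i < 64; i++)
    if ((b >> i) & 1)
      {
        r.lo ^= a << i;
        if (i)
          r.hi ^= a >> (64 - i);
      }
  return r;
}

/* Reverse the bits of a 64-bit word. */
static uint64_t
rev64 (uint64_t x)
{
  uint64_t r = 0;
  int i;
  for (i = 0; i < 64; i++)
    r |= ((x >> i) & 1) << (63 - i);
  return r;
}

int
main (void)
{
  uint64_t a = 0x123456789abcdef0ULL, b = 0xfedcba9876543210ULL;
  struct b128 p = clmul64 (a, b);
  struct b128 q = clmul64 (rev64 (a), rev64 (b));

  /* Bit-reverse the 128-bit product of a and b, then shift right one
     bit; the result should equal the product of the reversed operands. */
  uint64_t rhi = rev64 (p.lo), rlo = rev64 (p.hi);
  rlo = (rlo >> 1) | (rhi << 63);
  rhi >>= 1;

  assert (rhi == q.hi && rlo == q.lo);
  printf ("reversal identity holds\n");
  return 0;
}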
Regards, /Niels
nisse@lysator.liu.se (Niels Möller) writes:
Maamoun TK maamoun.tk@googlemail.com writes:
+L4x_loop:
[...]
- C polynomial multiplication "classical" pre-processing
- xxmrghd VSR(C23h),VSR(C2),VSR(C3)
- xxmrgld VSR(C23l),VSR(C2),VSR(C3)
- xxmrghd VSR(C01h),VSR(C0),VSR(C1)
- xxmrgld VSR(C01l),VSR(C0),VSR(C1)
- C polynomial multiplication "classical"
- vpmsumd C3,C3,H1 C M3 = H^1l*C3h⊕H^1h*C3l
- vpmsumd C2,C2,H2 C M2 = H^2l*C2h⊕H^2h*C2l
- vpmsumd C1,C1,H3 C M1 = H^3l*C1h⊕H^3h*C1l
- vpmsumd C0,C0,H4 C M0 = H^4l*C0h⊕H^4h*C0l
- vpmsumd C23h,C23h,H21h C H23 = H^2h*C2h⊕H^1h*C3h
- vpmsumd C23l,C23l,H21l C L23 = H^2l*C2l⊕H^1l*C3l
- vpmsumd C01h,C01h,H43h C H01 = H^4h*C0h⊕H^4h*C1h
- vpmsumd C01l,C01l,H43l C L01 = H^4l*C0l⊕H^4l*C1l
So you do 4 blocks with only 10 vpmsumd instructions (the 8 above, and 2 more below for the reduction). That's nice! It would be even nicer if the H terms could be rearranged to eliminate the pre-processing of the C values.
I think I've found a slightly better way to organize it. I'm assuming bit reversal and that we have pre-multiplied the key factor by x, so that we need to compute
A(x) B(x) / x^128 (mod P(x))
where P(x) is the reversed polynomial P(x) = x^128 + x^127 + x^126 + x^121 + 1. Denote the 64-bit pieces as
A(x) = a_1(x) x^64 + a_0(x), B(x) = b_1(x) x^64 + b_0(x)
We do precomputations on B (the key, invariant when processing a message of multiple blocks).
First, compute b_0(x) / x^64 (mod P(x)), which expands it from 64 bits to 128,
c_1(x) x^64 + c_0(x) = b_0(x) / x^64 (mod P(x))
Next, add together d_1 = b_1 + c_0. We can then write (to simplify notation, everything below is (mod P(x)), + means xor, and we also omit the (x) arguments).
  A B / x^128 = a_1 b_1 + (a_0 b_1 + a_1 b_0) / x^64 + a_0 b_0 / x^128
              = a_1 b_1 + (a_0 b_1 + a_1 b_0) / x^64 + a_0 (c_1 x^64 + c_0) / x^64
              = a_1 b_1 + a_0 c_1 + (a_0 b_1 + a_1 b_0 + a_0 c_0) / x^64
              = a_1 b_1 + a_0 c_1 + (a_0 d_1 + a_1 b_0) / x^64
                `------R--------'   `------F--------'

So if we have the input in register A (loaded from memory with no processing besides ensuring proper *byte* order), and precompute two values, M representing b_1(x) x^64 + c_1(x), and L representing b_0(x) x^64 + d_1(x), then we get the two halves above with two vpmsumd,

  vpmsumd R, M, A
  vpmsumd F, L, A
When doing more than one block at a time, I think it's easiest to accumulate the R and F values separately.
After accumulation, we have the pieces
R = r_1 x^64 + r_0, F = f_1 x^64 + f_0
To do the division by x^64 (mod P), and reduce to a 128-bit result, we need to cancel f_0, and we do that by multiplying f_0 P and adding it in:

  (f_1 x^64 + f_0 + f_0 P) / x^64 = f_1 + f_0 (x^64 + x^63 + x^62 + x^57)

If we prepare a register G representing the constant polynomial (x^63 + x^62 + x^57), i.e., all-zero high part, we can do that as
vpmsumd T, F, G
and then to get the final result, we need to xor together the three values
  r_1 x^64 + r_0
  f_0 x^64 + f_1    (F swapped)
  t_1 x^64 + t_0
It may be worth noting that both r_1 and t_1 are only 63 bits, so it's only the contribution of f_0 in the high half position that increases the size of the result from 127 bits to 128.
There are some possible variations of the final reduction, e.g., one could use G' = x^63 + x^62 + x^61 + x^56 (i.e., one more term), but then left shift the result of f_0 G'. With some different and really clever handling of the extra x factor (here, assumed premultiplied into b), maybe one could use a single vpmsumd to multiply f_0 G' and add in f_1 at the right place before shifting.
So to sum up, with this method, there's no preprocessing of the input, two vpmsumd per block to produce the values to be accumulated (same as in your patch), and a single vpmsumd (one less than in your patch) for the final reduction.
Furthermore, there's actually no need to do the final reduction until the end of the message. If we can keep the accumulated R and F values separately in the gcm_ctx struct (an ABI change, though), the final reduction can be postponed until gcm_digest is called.
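To check that I've got the algebra right, here is a rough scalar C sketch of the scheme (not part of the patch; all the names are invented, and clmul64 stands in for one half of a vpmsumd). It assumes the key value B passed to the precomputation is already x H(x) (mod P(x)), i.e. premultiplied by x as above, and it uses the fact that dividing by x^64 (mod P) is the same as multiplying by x^64 + x^63 + x^62 + x^57.

#include <stdint.h>

/* 128-bit element of GF(2)[x], v = hi x^64 + lo, bit-reflected convention. */
struct b128 { uint64_t hi, lo; };

/* Schoolbook 64x64 carry-less multiply; stands in for half a vpmsumd. */
static struct b128
clmul64 (uint64_t a, uint64_t b)
{
  struct b128 r = { 0, 0 };
  int i;
  for (i = 0; i < 64; i++)
    if ((b >> i) & 1)
      {
        r.lo ^= a << i;
        if (i)
          r.hi ^= a >> (64 - i);
      }
  return r;
}

static struct b128
xor128 (struct b128 x, struct b128 y)
{
  struct b128 r = { x.hi ^ y.hi, x.lo ^ y.lo };
  return r;
}

/* G = x^63 + x^62 + x^57, i.e. (P(x) + 1) / x^64 with the x^64 term kept apart. */
#define G_POLY 0xC200000000000000ULL

/* Precomputed key material: M = b_1 x^64 + c_1 and L = b_0 x^64 + d_1,
   where c_1 x^64 + c_0 = b_0 / x^64 (mod P) and d_1 = b_1 + c_0. */
struct gcm_precomp { struct b128 m, l; };

void
gcm_precompute (struct gcm_precomp *pre, struct b128 b)
{
  /* b_0 / x^64 (mod P) = b_0 (x^64 + x^63 + x^62 + x^57); the product
     has degree at most 127, so no reduction is needed. */
  struct b128 c = clmul64 (b.lo, G_POLY);
  c.hi ^= b.lo;                                 /* the b_0 x^64 term */
  pre->m.hi = b.hi;  pre->m.lo = c.hi;          /* M = b_1 : c_1 */
  pre->l.hi = b.lo;  pre->l.lo = b.hi ^ c.lo;   /* L = b_0 : d_1 */
}

/* One block times one precomputed key: the two vpmsumd.  R and F can be
   accumulated over several such products before the final reduction. */
void
gcm_block (const struct gcm_precomp *pre, struct b128 a,
           struct b128 *r, struct b128 *f)
{
  *r = xor128 (*r, xor128 (clmul64 (a.hi, pre->m.hi),    /* a_1 b_1 */
                           clmul64 (a.lo, pre->m.lo)));  /* a_0 c_1 */
  *f = xor128 (*f, xor128 (clmul64 (a.hi, pre->l.hi),    /* a_1 b_0 */
                           clmul64 (a.lo, pre->l.lo)));  /* a_0 d_1 */
}

/* Final reduction, X = R + F / x^64 (mod P): one more carry-less
   multiply, then the xor of the three 128-bit values listed above. */
struct b128
gcm_reduce (struct b128 r, struct b128 f)
{
  struct b128 t = clmul64 (f.lo, G_POLY);   /* t_1 x^64 + t_0 */
  r.hi ^= f.lo ^ t.hi;                      /* f_0 in the high half, t_1 */
  r.lo ^= f.hi ^ t.lo;                      /* f_1 (F swapped), t_0 */
  return r;
}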
Regards, /Niels
Hi Niels,
I tried to apply your method but can't get it to work; while applying it, one question came to my mind.
First, compute b_0(x) / x^64 (mod P(x)), which expands it from 64 bits to 128,
c_1(x) x^64 + c_0(x) = b_0(x) / x^64 (mod P(x))
Here you are trying to get a partially reduced product by computing b_0(x) / x^64 (mod P(x)), but since the degree of the input is 127, we can use the polynomial defining the finite field with 2^64 elements, in this case P(x) = X^64+X^4+X^3+X+1 and P' = P^-1 (mod X^64) = X^63+X^61+X^60+1, which is the same constant 0xB0, and the function now becomes: c_1(x) x^64 + c_0(x) = ((b_0 mod X^64) * p') mod X^64
Forgot to mention that P(x) = X^64+X^63+X^61+X^60+1 after being reflected.
Regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
Hi Niels,
I tried to apply your method but can't get it to work;
Hmm, do you think I've missed something in the math, or are there other difficulties?
while applying it, one question came to my mind.
First, compute b_0(x) / x^64 (mod P(x)), which expands it from 64 bits to 128,
c_1(x) x^64 + c_0(x) = b_0(x) / x^64 (mod P(x))
Here you are trying to get a partially reduced product by computing b_0(x) / x^64 (mod P(x)), but since the degree of the input is 127, we can use the polynomial defining the finite field with 2^64 elements, in this case P(x) = X^64+X^4+X^3+X+1 and P' = P^-1 (mod X^64) = X^63+X^61+X^60+1, which is the same constant 0xB0, and the function now becomes: c_1(x) x^64 + c_0(x) = ((b_0 mod X^64) * p') mod X^64
For correctness, I think it is important that the computation b_0(x) / x^64 is done modulo the gcm polynomial (originally, x^128 + x^7 + x^2 + x + 1, but after bit reflection, P(x) = x^128 + x^127 + x^126 + x^121 + 1).

I don't see how one can do part of the computation in GF(2^64), or how your degree-64 polynomial relates to the original degree-128 polynomial. If there's some useful embedding of GF(2^64) as a subfield of GF(2^128), please explain?
That said, division by x^64 is fairly cheap, since P(x) = 1 (mod x^64). I think we get
  b_0(x) / x^64 (mod P(x)) = b_0(x) (1 + P(x)) / x^64 (mod P(x))

where we can simplify (P(x) + 1) / x^64 to x^64 + x^63 + x^62 + x^57, or

  b_0(x) / x^64 (mod P(x)) = b_0(x) (x^64 + x^63 + x^62 + x^57)
So no reduction needed, just split the product in high and low part to get c_1 and c_0.
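In scalar C terms (a sketch only, reusing the hypothetical b128/clmul64 helpers from the earlier sketch, not any nettle API), that is one carry-less multiply and one xor:

/* c_1 x^64 + c_0 = b_0 / x^64 (mod P(x)): multiply b_0 by
   x^63 + x^62 + x^57 and fold in the b_0 x^64 term; the result has
   degree at most 127, so no reduction is needed. */
struct b128
div_x64 (uint64_t b0)
{
  struct b128 c = clmul64 (b0, 0xC200000000000000ULL);
  c.hi ^= b0;       /* c_1 is c.hi, c_0 is c.lo */
  return c;
}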
Regards, /Niels
Thanks for the clarification, I just misunderstood the division as a partial reduction in a previous reply.
Ok, so you mean a polynomial division of b_0(x) by P(x) where

  P(x) = X^128 + X^127 + X^126 + X^121 + 1
  b_0(x)/P(x) = (b_0(x)*(p^-1 mod P(x))) mod P(x)
  b_0(x)/P(x) = (b_0(x)*(p')) mod P(x)
  P(x) = X^64 + X^63 + X^62 + X^57
  P' = p^-1 mod P(x) = X^63 + X^62 + X^57

so the constant 0xC2
let me show you part of the new implementation of _nettle_gcm_init_key in PPC
C --- calculate [H = H << 1 modulo polynomial] ---
    vupkhsb EMSB,H          C extend most significant bit to first byte
    vspltisb B1,1           C 0x01010101010101010101010101010101
    vspltb EMSB,EMSB,0      C first byte quadword-extend
    vsl H,H,B1              C H = H << 1
    vand EMSB,EMSB,POLY     C EMSB &= 0xC2000000000000000000000000000001
    vxor ZERO,ZERO,ZERO     C 0x00000000000000000000000000000000
    vxor H,H,EMSB           C H ^= EMSB

    xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY)   C 0x0000000000000000C200000000000000
    xxmrghd VSR(POLY_H),VSR(POLY),VSR(ZERO)   C 0xC2000000000000000000000000000000
C --- calculate [H^2 = H*H] ---
    xxswapd VSR(Hm),VSR(H)
    vpmsumd HP,H,POLY_L
    vxor L,HP,Hm
    xxmrghd VSR(M),VSR(H),VSR(HP)
    xxmrgld VSR(L),VSR(H),VSR(L)

    vpmsumd R,M,H
    vpmsumd F,L,H

    vpmsumd T,F,POLY_H
    xxswapd VSR(F),VSR(F)
    vxor R,R,T
    vxor R,R,F
R still yields the wrong value of H^2. Did I miss something in the implementation, or did I get it wrong?
Maamoun TK maamoun.tk@googlemail.com writes:
Thanks for the clarification, I just misunderstood the division as a partial reduction in a previous reply.
Ok, so you mean a polynomial division of b_0(x) by P(x) where

  P(x) = X^128 + X^127 + X^126 + X^121 + 1
  b_0(x)/P(x) = (b_0(x)*(p^-1 mod P(x))) mod P(x)
  b_0(x)/P(x) = (b_0(x)*(p')) mod P(x)
  P(x) = X^64 + X^63 + X^62 + X^57
  P' = p^-1 mod P(x) = X^63 + X^62 + X^57

so the constant 0xC2
Hmm, I'm not following you (and it seems you are using P(x) to refer to two different polynomials). The operation is a bit similar to Montgomery redc, and it has taken me some time to get used to the initially strange mix of operations mod P(x) and mod x^k (or in the integer case, it's a mix of mod m, with m odd, and mod 2^k).
b_0(x) / x^64 (mod P(x))
can be computed in two different ways, giving the same result:
(i) Compute the inverse u(x) = (x^64)^{-1} mod P(x), then multiply b_0(x) u(x) (mod P(x))
We get u(x) = x^64 + x^63 + x^62 + x^57, with degree only 64. (This is thanks to the special structure of P(x); an inverse of some arbitrary element mod a 128-degree polynomial would be expected to be larger).
(ii) Add a multiple of P(x) to b_0(x) to make the lowest 64 coefficients all cancel, then divide by x^64 by simply shifting / subtracting 64 from the exponents of the remaining terms. And because of the special structure of P(x), P(x) = 1 mod x^64, the right multiple is b_0(x) P(x). So
b_0(x) + P(x) b_0(x)
is a degree 191 polynomial, with the least 64 coefficients all zero. When simplified, it boils down to the same b_0(x) u(x), no additional reduction needed.
I tend to think about reductions (from either end) in terms of cancelling bits, so to me, (ii) is a familiar way to think about it.
let me show you part of the new implementation of _nettle_gcm_init_key in PPC
C --- calculate [H = H << 1 modulo polynomial] ---

    vupkhsb EMSB,H          C extend most significant bit to first byte
    vspltisb B1,1           C 0x01010101010101010101010101010101
    vspltb EMSB,EMSB,0      C first byte quadword-extend
    vsl H,H,B1              C H = H << 1
    vand EMSB,EMSB,POLY     C EMSB &= 0xC2000000000000000000000000000001
    vxor ZERO,ZERO,ZERO     C 0x00000000000000000000000000000000
    vxor H,H,EMSB           C H ^= EMSB

    xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY)   C 0x0000000000000000C200000000000000
    xxmrghd VSR(POLY_H),VSR(POLY),VSR(ZERO)   C 0xC2000000000000000000000000000000

C --- calculate [H^2 = H*H] ---

    xxswapd VSR(Hm),VSR(H)
    vpmsumd HP,H,POLY_L
    vxor L,HP,Hm
    xxmrghd VSR(M),VSR(H),VSR(HP)
    xxmrgld VSR(L),VSR(H),VSR(L)

    vpmsumd R,M,H
    vpmsumd F,L,H

    vpmsumd T,F,POLY_H
    xxswapd VSR(F),VSR(F)
    vxor R,R,T
    vxor R,R,F
R still yields the wrong value of H^2. Did I miss something in the implementation, or did I get it wrong?
I'm afraid I don't follow all details (and I would suggest trying to get the most basic variant working first, doing only a single block at a time). But one potential issue is the powers of x. If H(x) is the original key, then I think you need to precompute values corresponding to
  x H(x)   (mod P(x))
  x H(x)^2 (mod P(x))
But it looks like you may be computing
(x H(x))^2 / x^128 = x^2 H(x)^2 / x^128
which is different. It's easy to get confused, but I think the precomputation needs a standard mod P(x), i.e., adding a multiple of P(x) cancelling the most significant coefficients, rather than the more efficient reduction cancelling the least significant coefficients.
If you're familiar with montgomery multiplication of integers, there it's possible to consistently use the operation a,b -> ab / 2^k everywhere, but that requires that all inputs are transformed to montgomery form, a --> 2^k a mod m. And in this case, it's desirable to scale the precomputed values appropriately, so no such transformation is needed for the inputs corresponding to message blocks.
To compute more powers of H, one could do the standard reduction only once, essentially transforming H to montgomery form,
H' = H(x) x^128 (mod P(x)),
then further powers of H can use the reduction of least significant coefficients,
(x H(x)) H' / x^128 = x H(x)^2
(x H(x)^2) H' / x^128 = x H(x)^3
etc.
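As a C sketch (hypothetical names only: gf128_mul_redc(a, b) is assumed to compute a b / x^128 (mod P), for instance built from the precompute/accumulate/reduce steps sketched earlier, and hprime = H x^128 (mod P) is computed once with an ordinary reduction), the key setup would then look like:

/* Fill k[0..n-1] with x H, x H^2, ..., x H^n (mod P), ready to be
   used as the per-block key values. */
void
compute_key_powers (struct b128 *k, unsigned n,
                    struct b128 xh, struct b128 hprime)
{
  unsigned i;
  k[0] = xh;                    /* x H (mod P) */
  for (i = 1; i < n; i++)
    k[i] = gf128_mul_redc (k[i - 1], hprime);   /* x H^i -> x H^(i+1) */
}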
Regards, /Niels
nisse@lysator.liu.se (Niels Möller) writes:
So if we have the input in register A (loaded from memory with no processing besides ensuring proper *byte* order), and precompute two values, M representing b_1(x) x^64 + c_1(x), and L representing b_0(x) x^64 + d_1(x)), then we get the two halves above with two vpmsumd,
vpmsumd R, M, A vpmsumd F, L, A
When doing more than one block at a time, I think it's easiest to accumulate the R and F values separately.
BTW, I wonder if a similar organization would make sense for Arm Neon. Neon doesn't have vpmsumd; the widest carryless multiplication available is vmull.p8, which is an 8-bit to 15-bit multiply, 8 in parallel.

I'm sketching an instruction sequence doing the equivalent of two vpmsumd using 32 vmull.p8, with good parallelism and not too many instructions to shuffle data around to the right places. Is that a good idea? It's to be compared to what the C code does: a loop of 16 iterations, each doing some table lookups, shifts and xors.

With this large number of multiply instructions, it might pay off to use Karatsuba, which could reduce it to 24 multiplies (one level) or 18 (two levels), at the cost of more xors and data movement instructions, and lots of complexity.
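For reference, here is the scalar structure such a vmull.p8-based 64x64 multiply would follow (plain C, not Neon intrinsics; clmul8 plays the role of a single 8-bit lane of vmull.p8, and the real instruction computes eight such partial products at once):

#include <stdint.h>

/* 8x8-bit carry-less multiply, the scalar analogue of one vmull.p8 lane. */
static uint16_t
clmul8 (uint8_t a, uint8_t b)
{
  uint16_t r = 0;
  int i;
  for (i = 0; i < 8; i++)
    if ((b >> i) & 1)
      r ^= (uint16_t) a << i;
  return r;
}

/* 64x64 carry-less multiply assembled from the 64 8x8 partial products,
   i.e. the work of eight vmull.p8 plus the shifting and xoring needed
   to put each partial product in the right place. */
void
clmul64_from_8 (uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
  uint64_t h = 0, l = 0;
  int i, j;
  for (i = 0; i < 8; i++)
    for (j = 0; j < 8; j++)
      {
        uint64_t p = clmul8 ((uint8_t)(a >> (8*i)), (uint8_t)(b >> (8*j)));
        int s = 8 * (i + j);
        if (s < 64)
          {
            l ^= p << s;
            if (s > 48)
              h ^= p >> (64 - s);
          }
        else
          h ^= p << (s - 64);
      }
  *hi = h;
  *lo = l;
}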
(There have been ARM Neon code for gcm posted to the list earlier, but if I remember correctly, that code didn't work in bit-reversed representation, but used a bunch of explicit reversal operations).
Regards, /Niels
On Sun, Oct 11, 2020 at 1:42 PM Niels Möller nisse@lysator.liu.se wrote:
nisse@lysator.liu.se (Niels Möller) writes:
So if we have the input in register A (loaded from memory with no processing besides ensuring proper *byte* order), and precompute two values, M representing b_1(x) x^64 + c_1(x), and L representing b_0(x) x^64 + d_1(x)), then we get the two halves above with two vpmsumd,
vpmsumd R, M, A vpmsumd F, L, A
When doing more than one block at a time, I think it's easiest to accumulate the R and F values separately.
BTW, I wonder if similar organization would make sense for Arm Neon. Now, Neon doesn't have vpmsumd, the widest carryless multiplication available is vmull.p8, which is an 8-bit to 15-bit multiply, 8 in parallel...
I may be mistaken, but I believe 64-bit poly multiplies are available. Or they are available on Aarch64 with Crypto extensions.
I'm not aware of poly multiplies on other ARM arches, like ARMv6 or ARMv7 with NEON.
Jeff
Jeffrey Walton noloader@gmail.com writes:
I may be mistaken, but I believe 64-bit poly multiplies are available. Or they are available on Aarch64 with Crypto extensions.
I'm looking in the Arm Instruction Set Reference Guide, labeled version 1.0, 2018.
It includes a section on cryptographic instructions, but that's aes, sha1 and sha256, no carry-less multiplication.
But I may well be missing something, I'm not really familiar with Aarch64.
I'm not aware of poly multiplies on other ARM arches, like ARMv6 or ARMv7 with NEON.
I think the "p8" SIMD datatype and vmull.p8 have been part of the Neon instruction set for a long time, at least since I wrote my first ARM code back in 2013. It's just a bit annoyning that one needs so many of them to do a wide multiply.
Regards, /Niels
On Sun, Oct 11, 2020 at 2:03 PM Niels Möller nisse@lysator.liu.se wrote:
Jeffrey Walton noloader@gmail.com writes:
I may be mistaken, but I believe 64-bit poly multiplies are available. Or they are available on Aarch64 with Crypto extensions.
I'm looking in the Arm Instruction Set Reference Guide, labeled version 1.0, 2018.
It includes a section on cryptographic instructions, but that's aes, sha1 and sha256, no carry-less multiplication.
But I may well be missing something, I'm not really familiar with Aarch64.
I'm not aware of poly multiplies on other ARM arches, like ARMv6 or ARMv7 with NEON.
I think the "p8" SIMD datatype and vmull.p8 have been part of the Neon instruction set for a long time, at least since I wrote my first ARM code back in 2013. It's just a bit annoyning that one needs so many of them to do a wide multiply.
Oh, you're right. There is a vmull for NEON.
According to an early NEON programming guide from ARM (https://static.docs.arm.com/den0018/a/DEN0018A_neon_programmers_guide_en.pdf), the widest you can perform is P16 poly multiply.
Jeff