--- configure.ac | 6 +- gcm.c | 49 +++- powerpc64/p8/gcm-hash.asm | 607 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 647 insertions(+), 15 deletions(-) create mode 100644 powerpc64/p8/gcm-hash.asm
diff --git a/configure.ac b/configure.ac index e9983697..0129f950 100644 --- a/configure.ac +++ b/configure.ac @@ -488,7 +488,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \ sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
# Assembler files which generate additional object files if they are used. -asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \ +asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm \ aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \ chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \ salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \ @@ -612,9 +612,9 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_ecc_secp384r1_redc #undef HAVE_NATIVE_ecc_secp521r1_modp #undef HAVE_NATIVE_ecc_secp521r1_redc -#undef HAVE_NATIVE_gcm_init_key8 +#undef HAVE_NATIVE_gcm_init_key +#undef HAVE_NATIVE_gcm_hash #undef HAVE_NATIVE_gcm_hash8 -#undef HAVE_NATIVE_gcm_fill #undef HAVE_NATIVE_salsa20_core #undef HAVE_NATIVE_salsa20_2core #undef HAVE_NATIVE_fat_salsa20_2core diff --git a/gcm.c b/gcm.c index 48b3e75a..81981c1c 100644 --- a/gcm.c +++ b/gcm.c @@ -140,6 +140,19 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table) memcpy (x->b, Z.b, sizeof(Z)); } # elif GCM_TABLE_BITS == 8 +# if HAVE_NATIVE_gcm_init_key + +#define gcm_init_key _nettle_gcm_init_key +void +_nettle_gcm_init_key (union nettle_block16 *table); +# endif /* HAVE_NATIVE_gcm_init_key */ +# if HAVE_NATIVE_gcm_hash + +#define gcm_hash _nettle_gcm_hash +void +_nettle_gcm_hash (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data); +# endif /* HAVE_NATIVE_gcm_hash */ # if HAVE_NATIVE_gcm_hash8
#define gcm_hash _nettle_gcm_hash8 @@ -228,6 +241,29 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table) /* Increment the rightmost 32 bits. */ #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4)
+#ifndef gcm_init_key +static void +gcm_init_key(union nettle_block16 *table) +{ +#if GCM_TABLE_BITS + /* Middle element if GCM_TABLE_BITS > 0, otherwise the first + element */ + unsigned i = (1<<GCM_TABLE_BITS)/2; + + /* Algorithm 3 from the gcm paper. First do powers of two, then do + the rest by adding. */ + while (i /= 2) + block16_mulx_ghash(&table[i], &table[2*i]); + for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2) + { + unsigned j; + for (j = 1; j < i; j++) + block16_xor3(&table[i+j], &table[i], &table[j]); + } +#endif +} +#endif /* !gcm_init_key */ + /* Initialization of GCM. * @ctx: The context of GCM * @cipher: The context of the underlying block cipher @@ -245,18 +281,7 @@ gcm_set_key(struct gcm_key *key, memset(key->h[0].b, 0, GCM_BLOCK_SIZE); f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b);
-#if GCM_TABLE_BITS - /* Algorithm 3 from the gcm paper. First do powers of two, then do - the rest by adding. */ - while (i /= 2) - block16_mulx_ghash(&key->h[i], &key->h[2*i]); - for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2) - { - unsigned j; - for (j = 1; j < i; j++) - block16_xor3(&key->h[i+j], &key->h[i],&key->h[j]); - } -#endif + gcm_init_key(key->h); }
#ifndef gcm_hash diff --git a/powerpc64/p8/gcm-hash.asm b/powerpc64/p8/gcm-hash.asm new file mode 100644 index 00000000..540c9f97 --- /dev/null +++ b/powerpc64/p8/gcm-hash.asm @@ -0,0 +1,607 @@ +C powerpc64/p8/gcm-hash.asm + +ifelse(` + Copyright (C) 2020 Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +C Alignment of gcm_key table elements, which is declared in gcm.h +define(`TableElemAlign', `0x100') + +C Register usage: + +define(`SP', `r1') +define(`TOCP', `r2') + +define(`TABLE', `r3') + +define(`ZERO', `v0') +define(`B1', `v1') +define(`EMSB', `v2') +define(`POLY', `v7') +define(`POLY_L', `v1') +define(`POLY_H', `v2') + +define(`H', `v3') +define(`H3', `v3') +define(`Hl', `v4') +define(`Hm', `v5') +define(`Hh', `v6') +define(`RP', `v7') +define(`Mh', `v8') +define(`Ml', `v9') +define(`H2', `v7') +define(`H4', `v7') +define(`H2m', `v8') +define(`H2l', `v9') +define(`H2h', `v10') +define(`RP2', `v11') +define(`M2h', `v12') +define(`M2l', `v13') + +define(`H2_l', `v14') +define(`H2_m', `v15') +define(`H2_h', `v16') +define(`H3_l', `v14') +define(`H3_m', `v15') +define(`H3_h', `v16') +define(`H4_l', `v17') +define(`H4_m', `v18') +define(`H4_h', `v19') + +define(`H21l', `v16') +define(`H21h', `v17') +define(`H3m', `v16') +define(`H4m', `v17') +define(`H43l', `v18') +define(`H43h', `v19') + +define(`LE_TEMP', `v18') +define(`LE_MASK', `v19') + +.file "gcm-hash.asm" + +.text + + C void gcm_init_key (union gcm_block *table) + +C This function populates the gcm table with the following layout +C ******************************************************************** +C | Hm = low-order doubleword of H^1:high-order doubleword of H^1 | +C | Hl = 64-bits zeros:low-order doubleword of H^1 | +C | Hh = high-order doubleword of H^1:64-bits zeros | +C | | +C | H2m = low-order doubleword of H^2:high-order doubleword of H^2 | +C | H21l = low-order doubleword of H^2:low-order doubleword of H^1 | +C | H21h = high-order doubleword of H^2:high-order doubleword of H^1 | +C | | +C | H3m = low-order doubleword of H^3:high-order doubleword of H^3 | +C | H4m = low-order doubleword of H^4:high-order doubleword of H^4 | +C | H43l = low-order doubleword of H^4:low-order doubleword of H^3 | +C | H43h = high-order doubleword of H^4:high-order doubleword of H^3 | +C ******************************************************************** + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_init_key) + DATA_LOAD_VEC(POLY,.polynomial,r7) C 0xC2000000000000000000000000000001 +IF_LE(` + li r8,0 + lvsl LE_MASK,0,r8 C 0x000102030405060708090A0B0C0D0E0F + vspltisb LE_TEMP,0x07 C 0x07070707070707070707070707070707 + vxor LE_MASK,LE_MASK,LE_TEMP C 
0x07060504030201000F0E0D0C0B0A0908 +') + + C 'H' is assigned by gcm_set_key() to the middle element of the table + li r10,8*TableElemAlign + lxvd2x VSR(H),r10,TABLE C load 'H' + C byte-reverse of each doubleword permuting on little-endian mode +IF_LE(` + vperm H,H,H,LE_MASK +') + + C --- calculate [H = H << 1 modulo polynomial] --- + + vupkhsb EMSB,H C extend most significant bit to first byte + vspltisb B1,1 C 0x01010101010101010101010101010101 + vspltb EMSB,EMSB,0 C first byte quadword-extend + vsl H,H,B1 C H = H << 1 + vand EMSB,EMSB,POLY C EMSB &= 0xC2000000000000000000000000000001 + vxor ZERO,ZERO,ZERO C 0x00000000000000000000000000000000 + vxor H,H,EMSB C H ^= EMSB + + C calculate [Hl = 0:H^1l, Hm = H^1l:H^1h, Hh = H^1h:0] + xxmrgld VSR(Hl),VSR(ZERO),VSR(H) + xxswapd VSR(Hm),VSR(H) + xxmrghd VSR(Hh),VSR(H),VSR(ZERO) + + C --- calculate H^2 = H*H --- + + C reduction pre-processing + xxmrghd VSR(POLY_H),VSR(POLY),VSR(ZERO) C 0xC2000000000000000000000000000000 + xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY) C 0x0000000000000000C200000000000000 + + C polynomial multiplication "classical" + vpmsumd H2_l,H,Hl C H^1l*H^1l + vpmsumd H2_m,H,Hm C H^1h*H^1l⊕H^1l*H^1h + vpmsumd H2_h,H,Hh C H^1h*H^1h + + C reduction first phase [1] + vpmsumd RP,H2_l,POLY_L C [1] + + C polynomial multiplication post-processing [2] + xxmrghd VSR(Mh),VSR(ZERO),VSR(H2_m) C [2] + xxmrgld VSR(Ml),VSR(H2_m),VSR(ZERO) C [2] + xxswapd VSR(RP),VSR(RP) C [1] + vxor H2_h,H2_h,Mh C [2] + vxor H2_l,H2_l,Ml C [2] + vxor H2_l,H2_l,RP C [1] + + C reduction second phase + vpmsumd RP,H2_l,POLY_H + vxor H2_h,H2_h,H2_l + vxor H2,H2_h,RP + + C store [H2m = H^2l:H^2h, H2l = 0:H^2l, H2h = H^2h:0] + xxswapd VSR(H2m),VSR(H2) + xxmrgld VSR(H2l),VSR(ZERO),VSR(H2) + xxmrghd VSR(H2h),VSR(H2),VSR(ZERO) + + C calculate [H21l = H^2l:H^1l, H21h = H^2h:H^1h] + xxmrgld VSR(H21l),VSR(H2),VSR(H) + xxmrghd VSR(H21h),VSR(H2),VSR(H) + + C store [Hm, Hl, Hh] + li r9,1*TableElemAlign + li r10,2*TableElemAlign + stxvd2x VSR(Hm),0,TABLE + stxvd2x VSR(Hl),r9,TABLE + stxvd2x VSR(Hh),r10,TABLE + + C store [H2m, H21l, H21h] + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + stxvd2x VSR(H2m),r8,TABLE + stxvd2x VSR(H21l),r9,TABLE + stxvd2x VSR(H21h),r10,TABLE + + C --- calculate H^3 = H^1*H^2, H^4 = H^2*H^2 --- + + C polynomial multiplication "classical" + vpmsumd H3_l,H,H2l C H^1l*H^2l + vpmsumd H4_l,H2,H2l C H^2l*H^2l + vpmsumd H3_m,H,H2m C H^1h*H^2l⊕H^1l*H^2h + vpmsumd H4_m,H2,H2m C H^2h*H^2l⊕H^2l*H^2h + vpmsumd H3_h,H,H2h C H^1h*H^2h + vpmsumd H4_h,H2,H2h C H^2h*H^2h + + C reduction first phase [1] + vpmsumd RP,H3_l,POLY_L C [1] H^3 + vpmsumd RP2,H4_l,POLY_L C [1] H^4 + + C polynomial multiplication post-processing [2] + xxmrghd VSR(Mh),VSR(ZERO),VSR(H3_m) C [2] H^3 + xxmrghd VSR(M2h),VSR(ZERO),VSR(H4_m) C [2] H^4 + xxmrgld VSR(Ml),VSR(H3_m),VSR(ZERO) C [2] H^3 + xxmrgld VSR(M2l),VSR(H4_m),VSR(ZERO) C [2] H^4 + xxswapd VSR(RP),VSR(RP) C [1] H^3 + xxswapd VSR(RP2),VSR(RP2) C [1] H^4 + vxor H3_h,H3_h,Mh C [2] H^3 + vxor H4_h,H4_h,M2h C [2] H^4 + vxor H3_l,H3_l,Ml C [2] H^3 + vxor H4_l,H4_l,M2l C [2] H^4 + vxor H3_l,H3_l,RP C [1] H^3 + vxor H4_l,H4_l,RP2 C [1] H^4 + + C reduction second phase + vpmsumd RP,H3_l,POLY_H C H^3 + vpmsumd RP2,H4_l,POLY_H C H^4 + vxor H3_h,H3_h,H3_l C H^3 + vxor H4_h,H4_h,H4_l C H^4 + vxor H3,H3_h,RP C H^3 + vxor H4,H4_h,RP2 C H^4 + + C calculate [H3m = H^3l:H^3h, H4m = H^4l:H^4h, H43l = H^4l:H^3l, H43h = H^4h:H^3h] + xxswapd VSR(H3m),VSR(H3) + xxswapd VSR(H4m),VSR(H4) + xxmrgld VSR(H43l),VSR(H4),VSR(H3) + xxmrghd 
VSR(H43h),VSR(H4),VSR(H3) + + C store [H3m, H4m, H43l, H43h] + li r7,6*TableElemAlign + li r8,7*TableElemAlign + li r9,8*TableElemAlign + li r10,9*TableElemAlign + stxvd2x VSR(H3m),r7,TABLE + stxvd2x VSR(H4m),r8,TABLE + stxvd2x VSR(H43l),r9,TABLE + stxvd2x VSR(H43h),r10,TABLE + + blr +EPILOGUE(_nettle_gcm_init_key) + +define(`TABLE', `r3') +define(`X', `r4') +define(`LENGTH', `r5') +define(`DATA', `r6') + +define(`ZERO', `v0') +define(`POLY', `v3') +define(`POLY_L', `v1') +define(`POLY_H', `v2') + +define(`D', `v3') +define(`C0', `v4') +define(`C1', `v5') +define(`C2', `v6') +define(`C3', `v7') +define(`Mh', `v8') +define(`Ml', `v9') +define(`RP', `v10') +define(`C01h', `v11') +define(`C01l', `v12') +define(`C23h', `v13') +define(`C23l', `v14') + +define(`H1', `v15') +define(`H2', `v16') +define(`H21l', `v17') +define(`H21h', `v18') +define(`H3', `v20') +define(`H4', `v21') +define(`H43l', `v22') +define(`H43h', `v23') + +define(`Cl', `v5') +define(`Cm', `v6') +define(`Ch', `v7') +define(`Hl', `v15') +define(`H', `v16') +define(`Hh', `v17') + +define(`LE_TEMP', `v18') +define(`LE_MASK', `v19') + + C void gcm_hash (const struct gcm_key *key, union gcm_block *x, + C size_t length, const uint8_t *data) + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_hash) + DATA_LOAD_VEC(POLY,.polynomial,r7) +IF_LE(` + li r8,0 + lvsl LE_MASK,0,r8 + vspltisb LE_TEMP,0x07 + vxor LE_MASK,LE_MASK,LE_TEMP +') + vxor ZERO,ZERO,ZERO + + xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY) + xxmrghd VSR(POLY_H),VSR(POLY),VSR(ZERO) + + lxvd2x VSR(D),0,X C load 'X' pointer + C byte-reverse of each doubleword permuting on little-endian mode +IF_LE(` + vperm D,D,D,LE_MASK +') + + C --- process 4 blocks '128-bit each' per one loop --- + + srdi r7,LENGTH,6 C 4-blocks loop count 'LENGTH / (4 * 16)' + cmpldi r7,0 + beq L2x + + mtctr r7 C assign counter register to loop count + + C backup non-volatile vector registers + addi r8,SP,-64 + stvx 20,0,r8 + addi r8,r8,16 + stvx 21,0,r8 + addi r8,r8,16 + stvx 22,0,r8 + addi r8,r8,16 + stvx 23,0,r8 + + C load table elements + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + lxvd2x VSR(H1),0,TABLE + lxvd2x VSR(H2),r8,TABLE + lxvd2x VSR(H21l),r9,TABLE + lxvd2x VSR(H21h),r10,TABLE + li r7,6*TableElemAlign + li r8,7*TableElemAlign + li r9,8*TableElemAlign + li r10,9*TableElemAlign + lxvd2x VSR(H3),r7,TABLE + lxvd2x VSR(H4),r8,TABLE + lxvd2x VSR(H43l),r9,TABLE + lxvd2x VSR(H43h),r10,TABLE + + li r8,0x10 + li r9,0x20 + li r10,0x30 +.align 5 +L4x_loop: + C input loading + lxvd2x VSR(C0),0,DATA C load C0 + lxvd2x VSR(C1),r8,DATA C load C1 + lxvd2x VSR(C2),r9,DATA C load C2 + lxvd2x VSR(C3),r10,DATA C load C3 + +IF_LE(` + vperm C0,C0,C0,LE_MASK + vperm C1,C1,C1,LE_MASK + vperm C2,C2,C2,LE_MASK + vperm C3,C3,C3,LE_MASK +') + + C previous digest combining + vxor C0,C0,D + + C polynomial multiplication "classical" pre-processing + xxmrghd VSR(C23h),VSR(C2),VSR(C3) + xxmrgld VSR(C23l),VSR(C2),VSR(C3) + xxmrghd VSR(C01h),VSR(C0),VSR(C1) + xxmrgld VSR(C01l),VSR(C0),VSR(C1) + + C polynomial multiplication "classical" + vpmsumd C3,C3,H1 C M3 = H^1l*C3h⊕H^1h*C3l + vpmsumd C2,C2,H2 C M2 = H^2l*C2h⊕H^2h*C2l + vpmsumd C1,C1,H3 C M1 = H^3l*C1h⊕H^3h*C1l + vpmsumd C0,C0,H4 C M0 = H^4l*C0h⊕H^4h*C0l + vpmsumd C23h,C23h,H21h C H23 = H^2h*C2h⊕H^1h*C3h + vpmsumd C23l,C23l,H21l C L23 = H^2l*C2l⊕H^1l*C3l + vpmsumd C01h,C01h,H43h C H01 = H^4h*C0h⊕H^3h*C1h + vpmsumd C01l,C01l,H43l C L01 = H^4l*C0l⊕H^3l*C1l + + C polynomial multiplication "classical" post-processing + vxor C2,C2,C3 C M2 = M2⊕M3 + 
vxor C0,C0,C1 C M0 = M0⊕M1 + + C deferred recombination of partial products + vxor C01h,C01h,C23h C H0 = H01⊕H23 + vxor C01l,C01l,C23l C L0 = L01⊕L23 + vxor C0,C0,C2 C M0 = M0⊕M2 + + C reduction first phase [1] + vpmsumd RP,C01l,POLY_L C [1] + + C polynomial multiplication post-processing [2] + xxmrghd VSR(Mh),VSR(ZERO),VSR(C0) C [2] + xxmrgld VSR(Ml),VSR(C0),VSR(ZERO) C [2] + xxswapd VSR(RP),VSR(RP) C [1] + vxor C01h,C01h,Mh C [2] + vxor C01l,C01l,Ml C [2] + vxor C01l,C01l,RP C [1] + + C reduction second phase + vpmsumd RP,C01l,POLY_H + vxor C01h,C01l,C01h + vxor D,C01h,RP + + addi DATA,DATA,0x40 + bdnz L4x_loop + + C restore non-volatile vector registers + addi r8,SP,-64 + lvx 20,0,r8 + addi r8,r8,16 + lvx 21,0,r8 + addi r8,r8,16 + lvx 22,0,r8 + addi r8,r8,16 + lvx 23,0,r8 + + clrldi LENGTH,LENGTH,58 C 'set the high-order 58 bits to zeros' +L2x: + C --- process 2 blocks --- + + srdi r7,LENGTH,5 C 'LENGTH / (2 * 16)' + cmpldi r7,0 + beq L1x + + C load table elements + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + lxvd2x VSR(H1),0,TABLE + lxvd2x VSR(H2),r8,TABLE + lxvd2x VSR(H21l),r9,TABLE + lxvd2x VSR(H21h),r10,TABLE + + C input loading + li r10,0x10 + lxvd2x VSR(C0),0,DATA C load C0 + lxvd2x VSR(C1),r10,DATA C load C1 + +IF_LE(` + vperm C0,C0,C0,LE_MASK + vperm C1,C1,C1,LE_MASK +') + + C previous digest combining + vxor C0,C0,D + + C polynomial multiplication "classical" pre-processing + xxmrghd VSR(C01h),VSR(C0),VSR(C1) + xxmrgld VSR(C01l),VSR(C0),VSR(C1) + + C polynomial multiplication "classical" + vpmsumd C1,C1,H1 C M1 = H^1l*C1h⊕H^1h*C1l + vpmsumd C0,C0,H2 C M0 = H^2l*C0h⊕H^2h*C0l + vpmsumd C01h,C01h,H21h C H01 = H^2h*C0h⊕H^1h*C1h + vpmsumd C01l,C01l,H21l C L01 = H^2l*C0l⊕H^1l*C1l + + C deferred recombination of partial products + vxor C0,C0,C1 C M0 = M0⊕M1 + + C reduction first phase [1] + vpmsumd RP,C01l,POLY_L C [1] + + C polynomial multiplication post-processing [2] + xxmrghd VSR(Mh),VSR(ZERO),VSR(C0) C [2] + xxmrgld VSR(Ml),VSR(C0),VSR(ZERO) C [2] + xxswapd VSR(RP),VSR(RP) C [1] + vxor C01h,C01h,Mh C [2] + vxor C01l,C01l,Ml C [2] + vxor C01l,C01l,RP C [1] + + C reduction second phase + vpmsumd RP,C01l,POLY_H + vxor C01h,C01l,C01h + vxor D,C01h,RP + + addi DATA,DATA,0x20 + clrldi LENGTH,LENGTH,59 C 'set the high-order 59 bits to zeros' +L1x: + C --- process 1 block --- + + srdi r7,LENGTH,4 C 'LENGTH / (1 * 16)' + cmpldi r7,0 + beq Lmod + + C load table elements + li r9,1*TableElemAlign + li r10,2*TableElemAlign + lxvd2x VSR(H),0,TABLE + lxvd2x VSR(Hl),r9,TABLE + lxvd2x VSR(Hh),r10,TABLE + + C input loading + lxvd2x VSR(C0),0,DATA C load C0 + +IF_LE(` + vperm C0,C0,C0,LE_MASK +') + + C previous digest combining + vxor C0,C0,D + + C polynomial multiplication "classical" + vpmsumd Cl,C0,Hl C L = Hl*Cl + vpmsumd Cm,C0,H C M = Hh*Cl⊕Hl*Ch + vpmsumd Ch,C0,Hh C H = Hh*Ch + + C reduction first phase C [1] + vpmsumd RP,Cl,POLY_L C [1] + + C polynomial multiplication post-processing [2] + xxmrghd VSR(Mh),VSR(ZERO),VSR(Cm) C [2] + xxmrgld VSR(Ml),VSR(Cm),VSR(ZERO) C [2] + xxswapd VSR(RP),VSR(RP) C [1] + vxor Ch,Ch,Mh C [2] + vxor Cl,Cl,Ml C [2] + vxor Cl,Cl,RP C [1] + + C reduction second phase + vpmsumd RP,Cl,POLY_H + vxor Ch,Cl,Ch + vxor D,Ch,RP + + addi DATA,DATA,0x10 + clrldi LENGTH,LENGTH,60 C 'set the high-order 60 bits to zeros' +Lmod: + C --- process the modulo bytes, padding the low-order bytes with zeros --- + + cmpldi LENGTH,0 + beq Ldone + + C load table elements + li r9,1*TableElemAlign + li r10,2*TableElemAlign + lxvd2x VSR(H),0,TABLE + lxvd2x 
VSR(Hl),r9,TABLE + lxvd2x VSR(Hh),r10,TABLE + + C push every modulo byte to the stack and load them with padding into vector register + addi r8,SP,-16 + stvx ZERO,0,r8 +Lstb_loop: + subic. LENGTH,LENGTH,1 + lbzx r7,LENGTH,DATA + stbx r7,LENGTH,r8 + bne Lstb_loop + lxvd2x VSR(C0),0,r8 + +IF_LE(` + vperm C0,C0,C0,LE_MASK +') + + C previous digest combining + vxor C0,C0,D + + C polynomial multiplication "classical" + vpmsumd Cl,C0,Hl C L = Hl*Cl + vpmsumd Cm,C0,H C M = Hh*Cl⊕Hl*Ch + vpmsumd Ch,C0,Hh C H = Hh*Ch + + C reduction first phase [1] + vpmsumd RP,Cl,POLY_L C [1] + + C polynomial multiplication post-processing [2] + xxmrghd VSR(Mh),VSR(ZERO),VSR(Cm) C [2] + xxmrgld VSR(Ml),VSR(Cm),VSR(ZERO) C [2] + xxswapd VSR(RP),VSR(RP) C [1] + vxor Ch,Ch,Mh C [2] + vxor Cl,Cl,Ml C [2] + vxor Cl,Cl,RP C [1] + + C reduction second phase + vpmsumd RP,Cl,POLY_H + vxor Ch,Cl,Ch + vxor D,Ch,RP + +Ldone: + C byte-reverse of each doubleword permuting on little-endian mode +IF_LE(` + vperm D,D,D,LE_MASK +') + stxvd2x VSR(D),0,X C store digest 'D' + + blr +EPILOGUE(_nettle_gcm_hash) + +.data + C 0xC2000000000000000000000000000001 +.polynomial: +.align 4 +IF_BE(` +.byte 0xC2 +.rept 14 +.byte 0x00 +.endr +.byte 0x01 +',` +.byte 0x01 +.rept 14 +.byte 0x00 +.endr +.byte 0xC2 +')
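
Note (not part of the patch): the L4x_loop above relies on the standard aggregated-reduction identity, folding four dependent Horner steps D = (D ⊕ Ci)*H into one pass over the powers H^1..H^4 that gcm_init_key precomputes: D = (D ⊕ C0)*H^4 ⊕ C1*H^3 ⊕ C2*H^2 ⊕ C3*H. The following self-contained C sketch only checks that identity; gf128_mul() is a hypothetical bit-by-bit GHASH multiply written for illustration and is unrelated to the vpmsumd-based code and table layout in this file.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical helper (illustration only): bit-by-bit GHASH multiply,
   x = x*y in GF(2^128) with the bit-reflected GCM convention. */
static void
gf128_mul (uint8_t x[16], const uint8_t y[16])
{
  uint8_t z[16] = {0};
  uint8_t v[16];
  unsigned i, j;

  memcpy (v, y, 16);
  for (i = 0; i < 128; i++)
    {
      if (x[i / 8] & (0x80 >> (i % 8)))   /* bit i of x, MSB first */
        for (j = 0; j < 16; j++)
          z[j] ^= v[j];
      /* v >>= 1, reducing by the GHASH polynomial on carry-out */
      unsigned lsb = v[15] & 1;
      for (j = 15; j > 0; j--)
        v[j] = (uint8_t)((v[j] >> 1) | (v[j-1] << 7));
      v[0] >>= 1;
      if (lsb)
        v[0] ^= 0xE1;
    }
  memcpy (x, z, 16);
}

int
main (void)
{
  uint8_t h[16], d[16], c[4][16], hp[4][16], horner[16], agg[16], t[16];
  unsigned i, j;

  /* arbitrary test values */
  for (i = 0; i < 16; i++)
    {
      h[i] = (uint8_t)(7*i + 1);
      d[i] = (uint8_t)(13*i + 5);
      for (j = 0; j < 4; j++)
        c[j][i] = (uint8_t)(i + 17*j + 3);
    }

  /* hp[j] = H^(j+1), the powers gcm_init_key stores in the table */
  memcpy (hp[0], h, 16);
  for (j = 1; j < 4; j++)
    {
      memcpy (hp[j], hp[j-1], 16);
      gf128_mul (hp[j], h);
    }

  /* one block at a time: D = (D ^ C[j]) * H */
  memcpy (horner, d, 16);
  for (j = 0; j < 4; j++)
    {
      for (i = 0; i < 16; i++)
        horner[i] ^= c[j][i];
      gf128_mul (horner, h);
    }

  /* aggregated, as in L4x_loop:
     D = (D ^ C[0])*H^4 ^ C[1]*H^3 ^ C[2]*H^2 ^ C[3]*H */
  memset (agg, 0, 16);
  for (j = 0; j < 4; j++)
    {
      memcpy (t, c[j], 16);
      if (j == 0)
        for (i = 0; i < 16; i++)
          t[i] ^= d[i];
      gf128_mul (t, hp[3-j]);
      for (i = 0; i < 16; i++)
        agg[i] ^= t[i];
    }

  printf (memcmp (horner, agg, 16) == 0 ? "match\n" : "MISMATCH\n");
  return 0;
}

The sketch covers only the algebra; the assembly additionally splits each product into low/middle/high vpmsumd partial products and folds them with the two-phase reduction by 0xC2000000000000000000000000000001 shown in the comments above.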