This implementation takes advantage of research by Niels Möller to optimize GCM on PowerPC; the optimization yields a +27.7% performance boost on POWER8 over the previous implementation, which was based on the Intel documents. The performance comparison was made by processing 4 blocks per loop without any further optimizations. I added some documentation between the lines, but I suggest writing a document similar to the Intel ones that goes into more detail and clarifies why this method is preferable. I'm also curious whether this method can also make a difference on other architectures such as ARM; I'm planning to try it out for ARM to find out.
---
 configure.ac              |   6 +-
 gcm.c                     |  49 +++--
 powerpc64/p8/gcm-hash.asm | 502 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 542 insertions(+), 15 deletions(-)
 create mode 100644 powerpc64/p8/gcm-hash.asm
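Before the patch itself, here is a rough sketch (my own notation, not part of the patch) of the identity behind the 4-blocks-per-loop path. Unrolling the per-block GHASH recurrence D_i = (D_{i-1} \oplus C_i) \cdot H four times over GF(2^128) gives

    D' = (D \oplus C_1) \cdot H^4 \;\oplus\; C_2 \cdot H^3 \;\oplus\; C_3 \cdot H^2 \;\oplus\; C_4 \cdot H

so the four carry-less multiplications are independent, the partial products are recombined with plain XORs, and only one reduction is needed per loop iteration, which is what the L4x_loop below does with the precomputed H^1..H^4 table entries.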
diff --git a/configure.ac b/configure.ac
index 2a47f940..20f7cf74 100644
--- a/configure.ac
+++ b/configure.ac
@@ -497,7 +497,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
 		sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
 
 # Assembler files which generate additional object files if they are used.
-asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \
+asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm \
   aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \
   chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \
   salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \
@@ -621,9 +621,9 @@ AH_VERBATIM([HAVE_NATIVE],
 #undef HAVE_NATIVE_ecc_secp384r1_redc
 #undef HAVE_NATIVE_ecc_secp521r1_modp
 #undef HAVE_NATIVE_ecc_secp521r1_redc
-#undef HAVE_NATIVE_gcm_init_key8
+#undef HAVE_NATIVE_gcm_init_key
+#undef HAVE_NATIVE_gcm_hash
 #undef HAVE_NATIVE_gcm_hash8
-#undef HAVE_NATIVE_gcm_fill
 #undef HAVE_NATIVE_salsa20_core
 #undef HAVE_NATIVE_salsa20_2core
 #undef HAVE_NATIVE_fat_salsa20_2core
diff --git a/gcm.c b/gcm.c
index 48b3e75a..81981c1c 100644
--- a/gcm.c
+++ b/gcm.c
@@ -140,6 +140,19 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table)
   memcpy (x->b, Z.b, sizeof(Z));
 }
 # elif GCM_TABLE_BITS == 8
+# if HAVE_NATIVE_gcm_init_key
+
+#define gcm_init_key _nettle_gcm_init_key
+void
+_nettle_gcm_init_key (union nettle_block16 *table);
+# endif /* HAVE_NATIVE_gcm_init_key */
+# if HAVE_NATIVE_gcm_hash
+
+#define gcm_hash _nettle_gcm_hash
+void
+_nettle_gcm_hash (const struct gcm_key *key, union nettle_block16 *x,
+		  size_t length, const uint8_t *data);
+# endif /* HAVE_NATIVE_gcm_hash */
 # if HAVE_NATIVE_gcm_hash8
 
 #define gcm_hash _nettle_gcm_hash8
@@ -228,6 +241,29 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table)
 /* Increment the rightmost 32 bits. */
 #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4)
 
+#ifndef gcm_init_key
+static void
+gcm_init_key(union nettle_block16 *table)
+{
+#if GCM_TABLE_BITS
+  /* Middle element if GCM_TABLE_BITS > 0, otherwise the first
+     element */
+  unsigned i = (1<<GCM_TABLE_BITS)/2;
+
+  /* Algorithm 3 from the gcm paper. First do powers of two, then do
+     the rest by adding. */
+  while (i /= 2)
+    block16_mulx_ghash(&table[i], &table[2*i]);
+  for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2)
+    {
+      unsigned j;
+      for (j = 1; j < i; j++)
+	block16_xor3(&table[i+j], &table[i], &table[j]);
+    }
+#endif
+}
+#endif /* !gcm_init_key */
+
 /* Initialization of GCM.
  * @ctx: The context of GCM
  * @cipher: The context of the underlying block cipher
@@ -245,18 +281,7 @@ gcm_set_key(struct gcm_key *key,
   memset(key->h[0].b, 0, GCM_BLOCK_SIZE);
   f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b);
 
-#if GCM_TABLE_BITS
-  /* Algorithm 3 from the gcm paper. First do powers of two, then do
-     the rest by adding. */
-  while (i /= 2)
-    block16_mulx_ghash(&key->h[i], &key->h[2*i]);
-  for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2)
-    {
-      unsigned j;
-      for (j = 1; j < i; j++)
-	block16_xor3(&key->h[i+j], &key->h[i],&key->h[j]);
-    }
-#endif
+  gcm_init_key(key->h);
 }
 
 #ifndef gcm_hash
diff --git a/powerpc64/p8/gcm-hash.asm b/powerpc64/p8/gcm-hash.asm
new file mode 100644
index 00000000..e79fbdc2
--- /dev/null
+++ b/powerpc64/p8/gcm-hash.asm
@@ -0,0 +1,502 @@
+C powerpc64/p8/gcm-hash.asm
+
+ifelse(`
+   Copyright (C) 2020 Niels Möller and Mamone Tarsha
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+C Alignment of gcm_key table elements, which is declared in gcm.h
+define(`TableElemAlign', `0x100')
+
+C Register usage:
+
+define(`SP', `r1')
+define(`TOCP', `r2')
+
+define(`TABLE', `r3')
+
+define(`ZERO', `v0')
+define(`B1', `v1')
+define(`EMSB', `v16')
+define(`POLY', `v17')
+define(`POLY_L', `v1')
+
+define(`H', `v2')
+define(`H2', `v3')
+define(`H3', `v4')
+define(`H4', `v5')
+define(`H1M', `v6')
+define(`H1L', `v7')
+define(`H2M', `v8')
+define(`H2L', `v9')
+define(`Hl', `v10')
+define(`Hm', `v11')
+define(`Hp', `v12')
+define(`Hl2', `v13')
+define(`Hm2', `v14')
+define(`Hp2', `v15')
+define(`R', `v13')
+define(`F', `v14')
+define(`T', `v15')
+define(`R2', `v16')
+define(`F2', `v17')
+define(`T2', `v18')
+
+define(`LE_TEMP', `v18')
+define(`LE_MASK', `v19')
+
+.file "gcm-hash.asm"
+
+.text
+
+    C void gcm_init_key (union gcm_block *table)
+
+C This function populates the gcm table as the following layout
+C *******************************************************************************
+C | H1M = (H1 div x⁶⁴)||((H1 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴              |
+C | H1L = (H1 mod x⁶⁴)||(((H1 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H1 div x⁶⁴) |
+C |                                                                             |
+C | H2M = (H2 div x⁶⁴)||((H2 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴              |
+C | H2L = (H2 mod x⁶⁴)||(((H2 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H2 div x⁶⁴) |
+C |                                                                             |
+C | H3M = (H3 div x⁶⁴)||((H3 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴              |
+C | H3L = (H3 mod x⁶⁴)||(((H3 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H3 div x⁶⁴) |
+C |                                                                             |
+C | H4M = (H3 div x⁶⁴)||((H4 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴              |
+C | H4L = (H3 mod x⁶⁴)||(((H4 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H4 div x⁶⁴) |
+C *******************************************************************************
+
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_gcm_init_key)
+    DATA_LOAD_VEC(POLY,.polynomial,r7)          C 0xC2000000000000000000000000000001
+IF_LE(`
+    li          r8,0
+    lvsl        LE_MASK,0,r8                    C 0x000102030405060708090A0B0C0D0E0F
+    vspltisb    LE_TEMP,0x07                    C 0x07070707070707070707070707070707
+    vxor        LE_MASK,LE_MASK,LE_TEMP         C 0x07060504030201000F0E0D0C0B0A0908
+')
+
+    C 'H' is assigned by gcm_set_key() to the middle element of the table
+    li          r10,8*TableElemAlign
+    lxvd2x      VSR(H),r10,TABLE                C load 'H'
+    C byte-reverse of each doubleword permuting on little-endian mode
+IF_LE(`
+    vperm       H,H,H,LE_MASK
+')
+
+    C --- calculate H = H << 1 mod P(X), P(X) = (x¹²⁸+x¹²⁷+x¹²⁶+x¹²¹+1) ---
+
+    vupkhsb     EMSB,H                          C extend most significant bit to first byte
+    vspltisb    B1,1                            C 0x01010101010101010101010101010101
+    vspltb      EMSB,EMSB,0                     C first byte quadword-extend
+    vsl         H,H,B1                          C H = H << 1
+    vand        EMSB,EMSB,POLY                  C EMSB &= 0xC2000000000000000000000000000001
+    vxor        ZERO,ZERO,ZERO                  C 0x00000000000000000000000000000000
+    vxor        H,H,EMSB                        C H ^= EMSB
+
+    C --- calculate H^2 = H*H ---
+
+    xxmrghd     VSR(POLY_L),VSR(ZERO),VSR(POLY) C 0x0000000000000000C200000000000000
+
+    C --- Hp = (H mod x⁶⁴) / x⁶⁴ mod P(X) ---
+    C --- Hp = (H mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷) mod P(X), deg(Hp) ≤ 127 ---
+    C --- Hp = (H mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷) ---
+    vpmsumd     Hp,H,POLY_L                     C Hp = (H mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)
+    xxmrgld     VSR(Hl),VSR(H),VSR(ZERO)        C Hl = (H mod x⁶⁴) × x⁶⁴
+    xxswapd     VSR(Hm),VSR(H)
+    vxor        Hl,Hl,Hp                        C Hl = Hl + Hp
+    vxor        Hm,Hm,Hp                        C Hm = Hm + Hp
+    xxmrghd     VSR(H1M),VSR(H),VSR(Hl)         C H1M = (H div x⁶⁴)||(Hl div x⁶⁴)
+    xxmrgld     VSR(H1L),VSR(H),VSR(Hm)         C H1L = (H mod x⁶⁴)||(Hl mod x⁶⁴)
+
+    vpmsumd     F,H1L,H                         C F = (H1Lh × Hh) + (H1Ll × Hl)
+    vpmsumd     R,H1M,H                         C R = (H1Mh × Hh) + (H1Ml × Hl)
+
+    C --- rduction ---
+    vpmsumd     T,F,POLY_L                      C T = (F mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)
+    xxswapd     VSR(H2),VSR(F)
+    vxor        R,R,T                           C R = R + T
+    vxor        H2,R,H2
+
+    xxmrgld     VSR(Hl),VSR(H2),VSR(ZERO)
+    xxswapd     VSR(Hm),VSR(H2)
+    vpmsumd     Hp,H2,POLY_L
+    vxor        Hl,Hl,Hp
+    vxor        Hm,Hm,Hp
+    xxmrghd     VSR(H2M),VSR(H2),VSR(Hl)
+    xxmrgld     VSR(H2L),VSR(H2),VSR(Hm)
+
+    C store H1M, H1L, H2M, H2L
+    li          r8,1*TableElemAlign
+    li          r9,2*TableElemAlign
+    li          r10,3*TableElemAlign
+    stxvd2x     VSR(H1M),0,TABLE
+    stxvd2x     VSR(H1L),r8,TABLE
+    stxvd2x     VSR(H2M),r9,TABLE
+    stxvd2x     VSR(H2L),r10,TABLE
+
+    C --- calculate H^3 = H^1*H^2, H^4 = H^2*H^2 ---
+
+    vpmsumd     F,H1L,H2
+    vpmsumd     F2,H2L,H2
+    vpmsumd     R,H1M,H2
+    vpmsumd     R2,H2M,H2
+
+    vpmsumd     T,F,POLY_L
+    vpmsumd     T2,F2,POLY_L
+    xxswapd     VSR(H3),VSR(F)
+    xxswapd     VSR(H4),VSR(F2)
+    vxor        R,R,T
+    vxor        R2,R2,T2
+    vxor        H3,R,H3
+    vxor        H4,R2,H4
+
+    xxmrgld     VSR(Hl),VSR(H3),VSR(ZERO)
+    xxmrgld     VSR(Hl2),VSR(H4),VSR(ZERO)
+    xxswapd     VSR(Hm),VSR(H3)
+    xxswapd     VSR(Hm2),VSR(H4)
+    vpmsumd     Hp,H3,POLY_L
+    vpmsumd     Hp2,H4,POLY_L
+    vxor        Hl,Hl,Hp
+    vxor        Hl2,Hl2,Hp2
+    vxor        Hm,Hm,Hp
+    vxor        Hm2,Hm2,Hp2
+    xxmrghd     VSR(H1M),VSR(H3),VSR(Hl)
+    xxmrghd     VSR(H2M),VSR(H4),VSR(Hl2)
+    xxmrgld     VSR(H1L),VSR(H3),VSR(Hm)
+    xxmrgld     VSR(H2L),VSR(H4),VSR(Hm2)
+
+    C store H3M, H3L, H4M, H4L
+    li          r7,4*TableElemAlign
+    li          r8,5*TableElemAlign
+    li          r9,6*TableElemAlign
+    li          r10,7*TableElemAlign
+    stxvd2x     VSR(H1M),r7,TABLE
+    stxvd2x     VSR(H1L),r8,TABLE
+    stxvd2x     VSR(H2M),r9,TABLE
+    stxvd2x     VSR(H2L),r10,TABLE
+
+    blr
+EPILOGUE(_nettle_gcm_init_key)
+
+define(`TABLE', `r3')
+define(`X', `r4')
+define(`LENGTH', `r5')
+define(`DATA', `r6')
+
+define(`ZERO', `v16')
+define(`POLY', `v17')
+define(`POLY_L', `v0')
+
+define(`D', `v1')
+define(`C0', `v2')
+define(`C1', `v3')
+define(`C2', `v4')
+define(`C3', `v5')
+define(`H1M', `v6')
+define(`H1L', `v7')
+define(`H2M', `v8')
+define(`H2L', `v9')
+define(`H3M', `v10')
+define(`H3L', `v11')
+define(`H4M', `v12')
+define(`H4L', `v13')
+define(`R', `v14')
+define(`F', `v15')
+define(`R2', `v16')
+define(`F2', `v17')
+define(`R3', `v18')
+define(`F3', `v20')
+define(`R4', `v21')
+define(`F4', `v22')
+define(`T', `v23')
+
+define(`LE_TEMP', `v18')
+define(`LE_MASK', `v19')
+
+    C void gcm_hash (const struct gcm_key *key, union gcm_block *x,
+    C                size_t length, const uint8_t *data)
+
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_gcm_hash)
+    DATA_LOAD_VEC(POLY,.polynomial,r7)
+IF_LE(`
+    li          r8,0
+    lvsl        LE_MASK,0,r8
+    vspltisb    LE_TEMP,0x07
+    vxor        LE_MASK,LE_MASK,LE_TEMP
+')
+    vxor        ZERO,ZERO,ZERO
+    xxmrghd     VSR(POLY_L),VSR(ZERO),VSR(POLY)
+
+    lxvd2x      VSR(D),0,X                      C load 'X' pointer
+    C byte-reverse of each doubleword permuting on little-endian mode
+IF_LE(`
+    vperm       D,D,D,LE_MASK
+')
+
+    C --- process 4 blocks '128-bit each' per one loop ---
+
+    srdi        r7,LENGTH,6                     C 4-blocks loop count 'LENGTH / (4 * 16)'
+    cmpldi      r7,0
+    beq         L2x
+
+    mtctr       r7                              C assign counter register to loop count
+
+    C store non-volatile vector registers
+    addi        r8,SP,-64
+    stvx        20,0,r8
+    addi        r8,r8,16
+    stvx        21,0,r8
+    addi        r8,r8,16
+    stvx        22,0,r8
+    addi        r8,r8,16
+    stvx        23,0,r8
+
+    C load table elements
+    li          r8,1*TableElemAlign
+    li          r9,2*TableElemAlign
+    li          r10,3*TableElemAlign
+    lxvd2x      VSR(H1M),0,TABLE
+    lxvd2x      VSR(H1L),r8,TABLE
+    lxvd2x      VSR(H2M),r9,TABLE
+    lxvd2x      VSR(H2L),r10,TABLE
+    li          r7,4*TableElemAlign
+    li          r8,5*TableElemAlign
+    li          r9,6*TableElemAlign
+    li          r10,7*TableElemAlign
+    lxvd2x      VSR(H3M),r7,TABLE
+    lxvd2x      VSR(H3L),r8,TABLE
+    lxvd2x      VSR(H4M),r9,TABLE
+    lxvd2x      VSR(H4L),r10,TABLE
+
+    li          r8,0x10
+    li          r9,0x20
+    li          r10,0x30
+.align 5
+L4x_loop:
+    C input loading
+    lxvd2x      VSR(C0),0,DATA                  C load C0
+    lxvd2x      VSR(C1),r8,DATA                 C load C1
+    lxvd2x      VSR(C2),r9,DATA                 C load C2
+    lxvd2x      VSR(C3),r10,DATA                C load C3
+
+IF_LE(`
+    vperm       C0,C0,C0,LE_MASK
+    vperm       C1,C1,C1,LE_MASK
+    vperm       C2,C2,C2,LE_MASK
+    vperm       C3,C3,C3,LE_MASK
+')
+
+    C previous digest combining
+    vxor        C0,C0,D
+
+    C polynomial multiplication
+    vpmsumd     F2,H3L,C1
+    vpmsumd     R2,H3M,C1
+    vpmsumd     F3,H2L,C2
+    vpmsumd     R3,H2M,C2
+    vpmsumd     F4,H1L,C3
+    vpmsumd     R4,H1M,C3
+    vpmsumd     F,H4L,C0
+    vpmsumd     R,H4M,C0
+
+    C deferred recombination of partial products
+    vxor        F3,F3,F4
+    vxor        R3,R3,R4
+    vxor        F,F,F2
+    vxor        R,R,R2
+    vxor        F,F,F3
+    vxor        R,R,R3
+
+    C reduction
+    vpmsumd     T,F,POLY_L
+    xxswapd     VSR(D),VSR(F)
+    vxor        R,R,T
+    vxor        D,R,D
+
+    addi        DATA,DATA,0x40
+    bdnz        L4x_loop
+
+    C restore non-volatile vector registers
+    addi        r8,SP,-64
+    lvx         20,0,r8
+    addi        r8,r8,16
+    lvx         21,0,r8
+    addi        r8,r8,16
+    lvx         22,0,r8
+    addi        r8,r8,16
+    lvx         23,0,r8
+
+    clrldi      LENGTH,LENGTH,58                C 'set the high-order 58 bits to zeros'
+L2x:
+    C --- process 2 blocks ---
+
+    srdi        r7,LENGTH,5                     C 'LENGTH / (2 * 16)'
+    cmpldi      r7,0
+    beq         L1x
+
+    C load table elements
+    li          r8,1*TableElemAlign
+    li          r9,2*TableElemAlign
+    li          r10,3*TableElemAlign
+    lxvd2x      VSR(H1M),0,TABLE
+    lxvd2x      VSR(H1L),r8,TABLE
+    lxvd2x      VSR(H2M),r9,TABLE
+    lxvd2x      VSR(H2L),r10,TABLE
+
+    C input loading
+    li          r10,0x10
+    lxvd2x      VSR(C0),0,DATA                  C load C0
+    lxvd2x      VSR(C1),r10,DATA                C load C1
+
+IF_LE(`
+    vperm       C0,C0,C0,LE_MASK
+    vperm       C1,C1,C1,LE_MASK
+')
+
+    C previous digest combining
+    vxor        C0,C0,D
+
+    C polynomial multiplication
+    vpmsumd     F2,H1L,C1
+    vpmsumd     R2,H1M,C1
+    vpmsumd     F,H2L,C0
+    vpmsumd     R,H2M,C0
+
+    C deferred recombination of partial products
+    vxor        F,F,F2
+    vxor        R,R,R2
+
+    C reduction
+    vpmsumd     T,F,POLY_L
+    xxswapd     VSR(D),VSR(F)
+    vxor        R,R,T
+    vxor        D,R,D
+
+    addi        DATA,DATA,0x20
+    clrldi      LENGTH,LENGTH,59                C 'set the high-order 59 bits to zeros'
+L1x:
+    C --- process 1 block ---
+
+    srdi        r7,LENGTH,4                     C 'LENGTH / (1 * 16)'
+    cmpldi      r7,0
+    beq         Lmod
+
+    C load table elements
+    li          r8,1*TableElemAlign
+    lxvd2x      VSR(H1M),0,TABLE
+    lxvd2x      VSR(H1L),r8,TABLE
+
+    C input loading
+    lxvd2x      VSR(C0),0,DATA                  C load C0
+
+IF_LE(`
+    vperm       C0,C0,C0,LE_MASK
+')
+
+    C previous digest combining
+    vxor        C0,C0,D
+
+    C polynomial multiplication
+    vpmsumd     F,H1L,C0
+    vpmsumd     R,H1M,C0
+
+    C reduction
+    vpmsumd     T,F,POLY_L
+    xxswapd     VSR(D),VSR(F)
+    vxor        R,R,T
+    vxor        D,R,D
+
+    addi        DATA,DATA,0x10
+    clrldi      LENGTH,LENGTH,60                C 'set the high-order 60 bits to zeros'
+Lmod:
+    C --- process the modulo bytes, padding the low-order bytes with zeros ---
+
+    cmpldi      LENGTH,0
+    beq         Ldone
+
+    C load table elements
+    li          r8,1*TableElemAlign
+    lxvd2x      VSR(H1M),0,TABLE
+    lxvd2x      VSR(H1L),r8,TABLE
+
+    C push every modulo byte to the stack and load them with padding into vector register
+    vxor        ZERO,ZERO,ZERO
+    addi        r8,SP,-16
+    stvx        ZERO,0,r8
+Lstb_loop:
+    subic.      LENGTH,LENGTH,1
+    lbzx        r7,LENGTH,DATA
+    stbx        r7,LENGTH,r8
+    bne         Lstb_loop
+    lxvd2x      VSR(C0),0,r8
+
+IF_LE(`
+    vperm       C0,C0,C0,LE_MASK
+')
+
+    C previous digest combining
+    vxor        C0,C0,D
+
+    C polynomial multiplication
+    vpmsumd     F,H1L,C0
+    vpmsumd     R,H1M,C0
+
+    C reduction
+    vpmsumd     T,F,POLY_L
+    xxswapd     VSR(D),VSR(F)
+    vxor        R,R,T
+    vxor        D,R,D
+
+Ldone:
+    C byte-reverse of each doubleword permuting on little-endian mode
+IF_LE(`
+    vperm       D,D,D,LE_MASK
+')
+    stxvd2x     VSR(D),0,X                      C store digest 'D'
+
+    blr
+EPILOGUE(_nettle_gcm_hash)
+
+.data
+    C 0xC2000000000000000000000000000001
+.polynomial:
+.align 4
+IF_BE(`
+.byte 0xC2
+.rept 14
+.byte 0x00
+.endr
+.byte 0x01
+',`
+.byte 0x01
+.rept 14
+.byte 0x00
+.endr
+.byte 0xC2
+')
I think I mislabeled the performance comparison percentage: the new method achieves a 27.7% reduction in time on POWER8, which corresponds to a 37.9% increase in performance.
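For anyone converting between the two figures, the relation is just (my notation; the exact percentage depends on rounding of the underlying timings):

    \text{throughput gain} = \frac{t_{\text{old}}}{t_{\text{new}}} - 1 = \frac{1}{1-r} - 1 = \frac{r}{1-r}, \qquad r = 0.277 \;\Rightarrow\; \approx 38\%

where r is the fractional reduction in time.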
On Wed, Nov 11, 2020 at 02:17:41AM +0200, Maamoun TK wrote:
I think I mislabeled the percentage of performance comparison, the new method achieved 27.7% reduction in time on POWER8 that corresponds to 37.9% increase in performance.
Hi Maamoun,
Many thanks to you and Niels. We plan to test this on POWER9.
The results of benchmarking this implementation on POWER9:

| Baseline                                                     | Result                          |
|--------------------------------------------------------------|---------------------------------|
| Lookup-table-based C implementation                          | 2288% increase in performance   |
| Hardware acceleration using the Intel optimization documents | 44.48% increase in performance  |
regards, Mamone
- li r8,0x10
- li r9,0x20
- li r10,0x30
+.align 5 +L4x_loop:
- C input loading
- lxvd2x VSR(C0),0,DATA C load C0
- lxvd2x VSR(C1),r8,DATA C load C1
- lxvd2x VSR(C2),r9,DATA C load C2
- lxvd2x VSR(C3),r10,DATA C load C3
+IF_LE(`
- vperm C0,C0,C0,LE_MASK
- vperm C1,C1,C1,LE_MASK
- vperm C2,C2,C2,LE_MASK
- vperm C3,C3,C3,LE_MASK
+')
- C previous digest combining
- vxor C0,C0,D
- C polynomial multiplication
- vpmsumd F2,H3L,C1
- vpmsumd R2,H3M,C1
- vpmsumd F3,H2L,C2
- vpmsumd R3,H2M,C2
- vpmsumd F4,H1L,C3
- vpmsumd R4,H1M,C3
- vpmsumd F,H4L,C0
- vpmsumd R,H4M,C0
- C deferred recombination of partial products
- vxor F3,F3,F4
- vxor R3,R3,R4
- vxor F,F,F2
- vxor R,R,R2
- vxor F,F,F3
- vxor R,R,R3
- C reduction
- vpmsumd T,F,POLY_L
- xxswapd VSR(D),VSR(F)
- vxor R,R,T
- vxor D,R,D
- addi DATA,DATA,0x40
- bdnz L4x_loop
- C restore non-volatile vector registers
- addi r8,SP,-64
- lvx 20,0,r8
- addi r8,r8,16
- lvx 21,0,r8
- addi r8,r8,16
- lvx 22,0,r8
- addi r8,r8,16
- lvx 23,0,r8
- clrldi LENGTH,LENGTH,58 C 'set the
high-order 58
bits to zeros' +L2x:
- C --- process 2 blocks ---
- srdi r7,LENGTH,5 C 'LENGTH / (2 * 16)'
- cmpldi r7,0
- beq L1x
- C load table elements
- li r8,1*TableElemAlign
- li r9,2*TableElemAlign
- li r10,3*TableElemAlign
- lxvd2x VSR(H1M),0,TABLE
- lxvd2x VSR(H1L),r8,TABLE
- lxvd2x VSR(H2M),r9,TABLE
- lxvd2x VSR(H2L),r10,TABLE
- C input loading
- li r10,0x10
- lxvd2x VSR(C0),0,DATA C load C0
- lxvd2x VSR(C1),r10,DATA C load C1
+IF_LE(`
- vperm C0,C0,C0,LE_MASK
- vperm C1,C1,C1,LE_MASK
+')
- C previous digest combining
- vxor C0,C0,D
- C polynomial multiplication
- vpmsumd F2,H1L,C1
- vpmsumd R2,H1M,C1
- vpmsumd F,H2L,C0
- vpmsumd R,H2M,C0
- C deferred recombination of partial products
- vxor F,F,F2
- vxor R,R,R2
- C reduction
- vpmsumd T,F,POLY_L
- xxswapd VSR(D),VSR(F)
- vxor R,R,T
- vxor D,R,D
- addi DATA,DATA,0x20
- clrldi LENGTH,LENGTH,59 C 'set the
high-order 59
bits to zeros' +L1x:
- C --- process 1 block ---
- srdi r7,LENGTH,4 C 'LENGTH / (1 * 16)'
- cmpldi r7,0
- beq Lmod
- C load table elements
- li r8,1*TableElemAlign
- lxvd2x VSR(H1M),0,TABLE
- lxvd2x VSR(H1L),r8,TABLE
- C input loading
- lxvd2x VSR(C0),0,DATA C load C0
+IF_LE(`
- vperm C0,C0,C0,LE_MASK
+')
- C previous digest combining
- vxor C0,C0,D
- C polynomial multiplication
- vpmsumd F,H1L,C0
- vpmsumd R,H1M,C0
- C reduction
- vpmsumd T,F,POLY_L
- xxswapd VSR(D),VSR(F)
- vxor R,R,T
- vxor D,R,D
- addi DATA,DATA,0x10
- clrldi LENGTH,LENGTH,60 C 'set the
high-order 60
bits to zeros' +Lmod:
- C --- process the modulo bytes, padding the low-order bytes with
zeros ---
- cmpldi LENGTH,0
- beq Ldone
- C load table elements
- li r8,1*TableElemAlign
- lxvd2x VSR(H1M),0,TABLE
- lxvd2x VSR(H1L),r8,TABLE
- C push every modulo byte to the stack and load them with padding
into
vector register
- vxor ZERO,ZERO,ZERO
- addi r8,SP,-16
- stvx ZERO,0,r8
+Lstb_loop:
- subic. LENGTH,LENGTH,1
- lbzx r7,LENGTH,DATA
- stbx r7,LENGTH,r8
- bne Lstb_loop
- lxvd2x VSR(C0),0,r8
+IF_LE(`
- vperm C0,C0,C0,LE_MASK
+')
- C previous digest combining
- vxor C0,C0,D
- C polynomial multiplication
- vpmsumd F,H1L,C0
- vpmsumd R,H1M,C0
- C reduction
- vpmsumd T,F,POLY_L
- xxswapd VSR(D),VSR(F)
- vxor R,R,T
- vxor D,R,D
+Ldone:
- C byte-reverse of each doubleword permuting on little-endian mode
+IF_LE(`
- vperm D,D,D,LE_MASK
+')
- stxvd2x VSR(D),0,X C store digest 'D'
- blr
+EPILOGUE(_nettle_gcm_hash)
+.data
- C 0xC2000000000000000000000000000001
+.polynomial: +.align 4 +IF_BE(` +.byte 0xC2 +.rept 14 +.byte 0x00 +.endr +.byte 0x01 +',` +.byte 0x01 +.rept 14 +.byte 0x00 +.endr +.byte 0xC2 +')
-- 2.17.1
Maamoun TK maamoun.tk@googlemail.com writes:
This implementation takes advantage of research made by Niels Möller to optimize GCM on PowerPC, this optimization yields a +27.7% performance boost on POWER8 over the previous implementation that was based on intel documents. The performance comparison is made by processing 4 blocks per loop without any further optimizations.
Hi, the patch didn't apply cleanly due to email line breaks (maybe try posting as a text attachment next time?), but I've applied it semi-manually, and pushed it to a branch ppc-gcm.
I gave it a test run on gcc112 in the gcc compile farm, and speedup of gcm update seems to be 26 times(!) compared to the C version.
I made some documentations between the lines but I suggest writing a document similar to the intel ones that go into more details and clarify the preference of this method.
Where would that documentation be published? In the Nettle manual, as some IBM white paper, or as a more-or-less academic paper, e.g., on arxiv? I will not be able to spend much time on writing, but I'd be happy to review.
I'm also curious if this method can also make a difference in other architectures like ARM, I'm planning to try it out for ARM to figure that out.
I have a sketch of ARM Neon code doing the equivalent of two vpmsumd, with reasonable parallelism. Quite a lot of instructions needed.
Regards, /Niels
+C Alignment of gcm_key table elements, which is declared in gcm.h +define(`TableElemAlign', `0x100')
I still find this large constant puzzling. If I try
  struct gcm_key key;
  printf("sizeof (key): %zd, sizeof(key.h[0]): %zd\n",
	 sizeof(key), sizeof(key.h[0]));
(I added it to the start of test_main in gcm-test.c) and run on the gcc112 machine, I get
sizeof (key): 4096, sizeof(key.h[0]): 16
Which is what I'd expect, with elements of size 16 bytes, not 256 bytes.
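(For reference, if I read gcm.h right, the table is just an array of 16-byte blocks, with no per-element alignment or padding:

  struct gcm_key
  {
    union nettle_block16 h[1 << GCM_TABLE_BITS];
  };

so with GCM_TABLE_BITS == 8 that is 256 elements of 16 bytes each, i.e., 4096 bytes in total.)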
I haven't yet had the time to read the code carefully.
Regards, /Niels
I re-uploaded the patch as an attachment since it didn't apply due to email line breaks. I also fixed the gcm table alignment issue; thanks to Niels Möller.
Maamoun TK maamoun.tk@googlemail.com writes:
+Lmod:
+	C --- process the modulo bytes, padding the low-order bytes with zeros ---
+	cmpldi	LENGTH,0
+	beq	Ldone
+	C load table elements
+	li	r8,1*TableElemAlign
+	lxvd2x	VSR(H1M),0,TABLE
+	lxvd2x	VSR(H1L),r8,TABLE
+	C push every modulo byte to the stack and load them with padding into vector register
+	vxor	ZERO,ZERO,ZERO
+	addi	r8,SP,-16
+	stvx	ZERO,0,r8
+Lstb_loop:
+	subic.	LENGTH,LENGTH,1
+	lbzx	r7,LENGTH,DATA
+	stbx	r7,LENGTH,r8
+	bne	Lstb_loop
+	lxvd2x	VSR(C0),0,r8
It's always a bit annoying to have to deal with leftovers like this in the assembly code. Can we avoid having to store it to memory and read back? I can see three other approaches:
1. Loop, reading a byte at a time, and shift into a target register. I guess we would need to assemble the bytes in a regular register, and then transfer the final value to a vector register. Is that expensive?
2. Round the address down to make it aligned, read an aligned word and, only if needed, the next word. And shift and mask to get the needed bytes. I think it is fine to read a few bytes outside of the input area, as long as the reads do *not* cross any word boundary (and hence a potential page boundary). We do things like this in some other places, but then for reading unaligned data in general, not just leftover parts.
3. Adapt the internal C/asm interface, so that the assembly routine only needs to handle complete blocks. It could provide a gcm_gf_mul, and let the C code handle partial blocks using memxor + gcm_gf_mul.
I would guess (1) or maybe (3) is the most reasonable. I don't think performance is that important, since it looks like for each message, this case can happen only for the last call to gcm_update and the last call to gcm_encrypt/gcm_decrypt.
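To make the third option concrete, the C side could look roughly like this. It's only a sketch, assuming the assembly exports a single-block multiply; the name _nettle_gcm_gf_mul and the exact interface are made up here:

  /* In gcm.c, with gcm.h and memxor.h available. */
  static void
  gcm_hash (const struct gcm_key *key, union nettle_block16 *x,
	    size_t length, const uint8_t *data)
  {
    /* Let the assembly routine handle complete blocks only. */
    size_t blocks = length & ~(size_t) (GCM_BLOCK_SIZE - 1);
    _nettle_gcm_hash (key, x, blocks, data);
    if (length > blocks)
      {
	/* xor the final partial block into the digest (equivalent to
	   zero-padding the data), then one multiplication by H. */
	memxor (x->b, data + blocks, length - blocks);
	_nettle_gcm_gf_mul (x, key->h);	  /* hypothetical single-block mul */
      }
  }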
What about test coverage? It looks like we have test cases for sizes up to 8 blocks, and for partial blocks, so I guess that should be fine?
Regards, /Niels
For the first approach I can think of this method:

	lxvd2x	VSR(C0),0,DATA
IF_LE(`
	vperm	C0,C0,C0,LE_MASK
')
	slwi	LENGTH,LENGTH,4		(shift left 4 bits because vsro gets bit[121:124])
	vspltisb	v10,-1		(0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF)
	mtvrwz	v11,LENGTH		(LENGTH in bit[57:60])
	xxspltd	VSR(v11),VSR(v11),0	(LENGTH in bit[121:124])
	vsro	v10,v10,v11		(shift right by octets)
	vnot	v10,v10
	vand	C0,C0,v10
I recommend the third approach so we don't have to deal with the leftover bytes in the upcoming implementations, but the problem is that gcm_init_key() initializes the table for its matching gcm_hash() function. That means we can't process the remaining bytes using gcm_gf_mul() or gcm_gf_shift_8(), because their table potentially has not been initialized. So I'm thinking of keeping the gcm_gf_mul() variant that doesn't need a table (the GCM_TABLE_BITS == 0 one) and always processing the remaining bytes with that function.

The test coverage is fine; I can't think of any potential untested cases.
regards, Mamone
I replaced the stack-based method of handling the leftovers with the first approach. I also changed some vector registers in the defines, because I had defined `LE_MASK' in a non-volatile register, which is not always preserved.

This patch is built on top of the ppc-gcm branch.
regards, Mamone
Another patch for register defines. I apologize for that.
regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
For the first approach I can think of this method:

	lxvd2x	VSR(C0),0,DATA
IF_LE(`
	vperm	C0,C0,C0,LE_MASK
')
	slwi	LENGTH,LENGTH,4		(shift left 4 bits because vsro gets bit[121:124])
	vspltisb	v10,-1		(0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF)
	mtvrwz	v11,LENGTH		(LENGTH in bit[57:60])
	xxspltd	VSR(v11),VSR(v11),0	(LENGTH in bit[121:124])
	vsro	v10,v10,v11		(shift right by octets)
	vnot	v10,v10
	vand	C0,C0,v10
I'm having some difficulty following along. Is this a loop, part of a loop, or is there some vector load instruction that lets you pass a byte length?
I recommend the third approach so we don't have to deal with the leftover bytes in the upcoming implementations, but the problem is that gcm_init_key() initializes the table for its matching gcm_hash() function,
If we go this way, the power assembly file would have to provide an implementation of gcm_gf_mul, compatible with its gcm_init_key. It would do essentially the same thing as the single-block part of gcm_hash. But approach 1 is fine too, if it doesn't get too complicated.
Your recent mails have not included actual patches, neither inline, nor as attachments. E.g., https://lists.lysator.liu.se/pipermail/nettle-bugs/2020/009234.html. (The mailist software might discard some attachments, but content-type: text/x-patch and the like should be fine). If your mail client doesn't cooperate, feel free to create a pull request on git.lysator.liu.se instead (and ping the list).
Regards, /Niels
On Sat, Nov 21, 2020 at 5:32 PM Niels Möller nisse@lysator.liu.se wrote:
Is this a loop, part of a loop, or is there some vector load instruction that lets you pass a byte length?
It generates a mask matching the length of the leftovers. For example, if the length is 1, the generated mask is 0xFF000000000000000000000000000000; the mask is then ANDed with the vector register holding the leftovers to clear the extra, unneeded bytes. It's not exactly the first approach, but it avoids using the stack and handles the leftovers inside the assembly implementation. Sorry for mixing things up.
Your recent mails have not included actual patches, neither inline, nor as attachments. E.g., https://lists.lysator.liu.se/pipermail/nettle-bugs/2020/009234.html. (The mailist software might discard some attachments, but content-type: text/x-patch and the like should be fine). If your mail client doesn't cooperate, feel free to create a pull request on git.lysator.liu.se instead (and ping the list).
I made a merge request on git.lysator.liu.se; it ended up being easier for me to push patches to the repository this way. I hope you don't mind dealing with future patches the same way.
regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
It generates a mask matching the length of the leftovers. For example, if the length is 1, the generated mask is 0xFF000000000000000000000000000000; the mask is then ANDed with the vector register holding the leftovers to clear the extra, unneeded bytes. It's not exactly the first approach, but it avoids using the stack and handles the leftovers inside the assembly implementation. Sorry for mixing things up.
I see. I'm a bit worried that it may read too far. E.g., assume that the leftover size to read is 5 bytes, and those 5 bytes start at address 1ffffff8. Then the final
lxvd2x VSR(C0),0,DATA
will read 16 bytes from memory, including a few bytes starting at address 20000000, which may result in a segfault. Getting this right would need approach 2, "Round the address down to make it aligned, read an aligned word and, only if needed, the next word. And shift and mask to get the needed bytes."
I would expect that the simplest is to go with approach one: Have a loop to read a byte at a time, and shift into a register.
I made a merge request on git.lysator.liu.se; it ended up being easier for me to push patches to the repository this way. I hope you don't mind dealing with future patches the same way.
Thanks, that's fine. But you may need to ping me, since I don't look at the gitlab web interface that often.
Regards, /Niels
I'm not aware of a simple way to accomplish either approach on POWER8. I recommend using an allocated stack buffer to assist in handling leftovers rather than making it complicated. Alternatively, we can use the POWER9-specific instruction 'lxvll', which can be used to load a vector with a length passed in a general register as a parameter; it also works in both endian modes without any post-loading operations. Another benefit of switching to POWER ISA 3.0 is that we can use 'lxvb16x/stxvb16x' to load/store input and output data instead of the 'lxvd2x/stxvd2x' instructions, which eliminates the need for post-loading/pre-storing permute operations in little-endian mode.
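Roughly, and untested — assuming I remember the operand convention correctly, with the byte count taken from the high-order byte of the length register and the remaining vector bytes zeroed — the leftover load could become something like:

	C POWER9-only sketch, not part of the patch
	sldi	r7,LENGTH,56		C lxvll takes the length from bits 0:7 of the GPR
	lxvll	VSR(C0),DATA,r7		C load LENGTH bytes, zero the rest of the vector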
regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
I'm not aware of a simple way to accomplish either approach on POWER8. I recommend using an allocated stack buffer to assist in handling leftovers rather than making it complicated.
Let's leave that as is, then. Do you want to make another pull request with only the fixes for register usage?
Alternatively, we can use the POWER9-specific instruction 'lxvll', which can be used to load a vector with a length passed in a general register as a parameter; it also works in both endian modes without any post-loading operations. Another benefit of switching to POWER ISA 3.0 is that we can use 'lxvb16x/stxvb16x' to load/store input and output data instead of the 'lxvd2x/stxvd2x' instructions, which eliminates the need for post-loading/pre-storing permute operations in little-endian mode.
I was thinking of something similar to how the unaligned input is handled in arm/v6/sha1-compress.asm. And then, to handle leftovers at the end, one would need to compare leftover size with the alignment related address bits, to decide whether or not to load one more word. But perhaps only worth the effort if there's a performance advantage in avoiding unaligned loads also in the main loop.
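For concreteness, the decision could be something like this little C sketch (names made up, not from any existing file):

  #include <stddef.h>
  #include <stdint.h>

  /* With 16-byte aligned vector loads: does the leftover spill over
     into a second aligned quadword? */
  static int
  needs_second_load (const uint8_t *p, size_t leftover)
  {
    size_t offset = (uintptr_t) p & 15;  /* position within the aligned quadword */
    return leftover > 16 - offset;       /* data extends past the first load */
  }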
Regards, /Niels
On Wed, Nov 25, 2020 at 10:15 AM Niels Möller nisse@lysator.liu.se wrote:
Let's leave that as is, then. Do you want to make another pull request with only the fixes for register usage?
Sure. I updated the pull request.
I was thinking of something similar to how the unaligned input is handled in arm/v6/sha1-compress.asm. And then, to handle leftovers at the end, one would need to compare leftover size with the alignment related address bits, to decide whether or not to load one more word. But perhaps only worth the effort if there's a performance advantage in avoiding unaligned loads also in the main loop.
Yes, it makes sense to avoid unaligned loads in the main loop by checking the low-order bits of the address, but I still can't see it being simpler in this case. Stack buffers are allocated very often over the lifetime of a process, and I think it's OK to use one for this purpose.
regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
Sure. I updated the pull request.
Thanks. Merged (first time I try the merge button on gitlab).
Yes, it makes sense to avoid unaligned loads in the main loop by checking the low-order bits of the address, but I still can't see it being simpler in this case. Stack buffers are allocated very often over the lifetime of a process, and I think it's OK to use one for this purpose.
It's no big problem; it just seems slightly wasteful, with an extra round of load and store to memory.
We could revisit it if we ever get to rearranging the loads for the main loop.
Regards, /Niels
Niels Möller nisse@lysator.liu.se writes:
Maamoun TK maamoun.tk@googlemail.com writes:
Sure. I updated the pull request.
Thanks. Merged (first time I try the merge button on gitlab).
It remains to wire it up for fat-ppc.c. Anything else that is missing?
Regards, /Niels
On Wed, Nov 25, 2020 at 9:21 PM Niels Möller nisse@lysator.liu.se wrote:
It remains to wire it up for fat-ppc.c. Anything else that is missing?
No, I'll make a pull request for fat build support.
On Wed, Nov 25, 2020 at 10:13 PM Maamoun TK maamoun.tk@googlemail.com wrote:
I'll make a pull request for fat build support.
Done!
Maamoun TK maamoun.tk@googlemail.com writes:
I'll make a pull request for fat build support.
Done!
I added two comments on the merge request.
Regards, /Niels
Niels Möller nisse@lysator.liu.se writes:
Maamoun TK maamoun.tk@googlemail.com writes:
I'll make a pull request for fat build support.
Done!
I added two comments on the merge request.
I reorganized the ifdefs a bit more, and pushed to the ppc-gcm branch. Tested on gcc112. Please try it out.
Regards, /Niels
Great. It works on PowerPC with the configure options "./configure", "./configure --enable-power-crypto-ext", and "./configure --enable-fat", and I get the expected results.

However, two warnings popped up when configured with --enable-power-crypto-ext:
gcm.c: In function ‘nettle_gcm_set_key’:
gcm.c:287:3: warning: implicit declaration of function ‘_nettle_gcm_init_key’; did you mean ‘nettle_gcm_set_key’? [-Wimplicit-function-declaration]
  287 |   _nettle_gcm_init_key(key->h);
      |   ^~~~~~~~~~~~~~~~~~~~
      |   nettle_gcm_set_key
gcm.c:287:3: warning: nested extern declaration of ‘_nettle_gcm_init_key’ [-Wnested-externs]
gcm.c: In function ‘gcm_hash_sizes’:
gcm.c:325:3: warning: implicit declaration of function ‘_nettle_gcm_hash’; did you mean ‘nettle_get_hashes’? [-Wimplicit-function-declaration]
  325 |   _nettle_gcm_hash(key, x, GCM_BLOCK_SIZE, buffer);
      |   ^~~~~~~~~~~~~~~~
      |   nettle_get_hashes
gcm.c:325:3: warning: nested extern declaration of ‘_nettle_gcm_hash’ [-Wnested-externs]
To suppress these warnings we need to declare a prototype for _nettle_gcm_init_key() and _nettle_gcm_hash() if "HAVE_NATIVE_gcm_init_key" and "HAVE_NATIVE_gcm_hash" are defined respectively.
Also, I think an error will pop up in the x86_64 build if gcm_hash8 is enabled. We can fix this error by replacing line 156, "#define gcm_hash _nettle_gcm_hash8", with "#define _nettle_gcm_hash _nettle_gcm_hash8".
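That is, the change in gcm.c would just be (sketch):

  /* current */
  #define gcm_hash _nettle_gcm_hash8
  /* proposed */
  #define _nettle_gcm_hash _nettle_gcm_hash8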
Let me know if you want me to make a pull request for these changes.
regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
To suppress these warnings we need to declare a prototype for _nettle_gcm_init_key() and _nettle_gcm_hash() if "HAVE_NATIVE_gcm_init_key" and "HAVE_NATIVE_gcm_hash" are defined respectively.
Could be fixed in the new gcm-internal.h file. (I don't quite like that it needs any ifdefs around the declarations; the reason I had to add that was that I'd like to have the definitions be static in the case that it's all defined in C, and then it conflicts with non-static declarations in this file).
Also, I think an error will pop up in the x86_64 build if gcm_hash8 is enabled. We can fix this error by replacing line 156, "#define gcm_hash _nettle_gcm_hash8", with "#define _nettle_gcm_hash _nettle_gcm_hash8".
You're right, it's broken on x86_64.
Let me know if you want me to make a pull request for these changes.
If you can help out with that, that's much appreciated.
Regards, /Niels
I made a pull request in the repository.
regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
I made a pull request in the repository.
Merged, thanks! I wonder if gcm-internal.h can be cut down a bit, to
/* Functions available only in some configurations */
void
_nettle_gcm_init_key (union nettle_block16 *table);

void
_nettle_gcm_hash (const struct gcm_key *key, union nettle_block16 *x,
		  size_t length, const uint8_t *data);

#if HAVE_NATIVE_fat_gcm_init_key
void
_nettle_gcm_init_key_c (union nettle_block16 *table);
#endif

#if HAVE_NATIVE_fat_gcm_hash
void
_nettle_gcm_hash_c (const struct gcm_key *key, union nettle_block16 *x,
		    size_t length, const uint8_t *data);
#endif
(it's only the _c-functions that are static in some configurations, and need ifdefs). I've tested on gcc112, configurations --enable-power-crypto-ext, --enable-fat, and --disable-assembly, and I see no warnings or errors.
Regards, /Niels
On Fri, Nov 27, 2020 at 8:13 PM Niels Möller nisse@lysator.liu.se wrote:
I wonder if gcm-internal.h can be cut down a bit, to
/* Functions available only in some configurations */
void
_nettle_gcm_init_key (union nettle_block16 *table);

void
_nettle_gcm_hash (const struct gcm_key *key, union nettle_block16 *x,
		  size_t length, const uint8_t *data);
But if HAVE_NATIVE_gcm_init_key and HAVE_NATIVE_gcm_hash are not defined, there are no definitions for _nettle_gcm_init_key() and _nettle_gcm_hash() respectively. Maybe it doesn't yield a warning or error because it's ok for the compiler to have a prototype declaration without function definition.
Maamoun TK maamoun.tk@googlemail.com writes:
But if HAVE_NATIVE_gcm_init_key and HAVE_NATIVE_gcm_hash are not defined, there are no definitions for _nettle_gcm_init_key() and _nettle_gcm_hash() respectively. Maybe it doesn't yield a warning or error because it's ok for the compiler to have a prototype declaration without function definition.
It's harmless to declare a function that's neither defined nor used.
Regards, /Niels
Maamoun TK maamoun.tk@googlemail.com writes:
On Wed, Nov 25, 2020 at 10:13 PM Maamoun TK maamoun.tk@googlemail.com wrote:
I'll make a pull request for fat build support.
The gcm code is now merged to the master branch. Thanks!
Regards, /Niels