--- configure.ac | 6 +- gcm.c | 49 +++- powerpc64/p8/gcm-hash.asm | 607 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 647 insertions(+), 15 deletions(-) create mode 100644 powerpc64/p8/gcm-hash.asm
diff --git a/configure.ac b/configure.ac index e9983697..0129f950 100644 --- a/configure.ac +++ b/configure.ac @@ -488,7 +488,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \ sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
# Assembler files which generate additional object files if they are used. -asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \ +asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm \ aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \ chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \ salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \ @@ -612,9 +612,9 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_ecc_secp384r1_redc #undef HAVE_NATIVE_ecc_secp521r1_modp #undef HAVE_NATIVE_ecc_secp521r1_redc -#undef HAVE_NATIVE_gcm_init_key8 +#undef HAVE_NATIVE_gcm_init_key +#undef HAVE_NATIVE_gcm_hash #undef HAVE_NATIVE_gcm_hash8 -#undef HAVE_NATIVE_gcm_fill #undef HAVE_NATIVE_salsa20_core #undef HAVE_NATIVE_salsa20_2core #undef HAVE_NATIVE_fat_salsa20_2core diff --git a/gcm.c b/gcm.c index 48b3e75a..81981c1c 100644 --- a/gcm.c +++ b/gcm.c @@ -140,6 +140,19 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table) memcpy (x->b, Z.b, sizeof(Z)); } # elif GCM_TABLE_BITS == 8 +# if HAVE_NATIVE_gcm_init_key + +#define gcm_init_key _nettle_gcm_init_key +void +_nettle_gcm_init_key (union nettle_block16 *table); +# endif /* HAVE_NATIVE_gcm_init_key */ +# if HAVE_NATIVE_gcm_hash + +#define gcm_hash _nettle_gcm_hash +void +_nettle_gcm_hash (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data); +# endif /* HAVE_NATIVE_gcm_hash */ # if HAVE_NATIVE_gcm_hash8
#define gcm_hash _nettle_gcm_hash8 @@ -228,6 +241,29 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table) /* Increment the rightmost 32 bits. */ #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4)
+#ifndef gcm_init_key +static void +gcm_init_key(union nettle_block16 *table) +{ +#if GCM_TABLE_BITS + /* Middle element if GCM_TABLE_BITS > 0, otherwise the first + element */ + unsigned i = (1<<GCM_TABLE_BITS)/2; + + /* Algorithm 3 from the gcm paper. First do powers of two, then do + the rest by adding. */ + while (i /= 2) + block16_mulx_ghash(&table[i], &table[2*i]); + for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2) + { + unsigned j; + for (j = 1; j < i; j++) + block16_xor3(&table[i+j], &table[i], &table[j]); + } +#endif +} +#endif /* !gcm_init_key */ + /* Initialization of GCM. * @ctx: The context of GCM * @cipher: The context of the underlying block cipher @@ -245,18 +281,7 @@ gcm_set_key(struct gcm_key *key, memset(key->h[0].b, 0, GCM_BLOCK_SIZE); f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b);
-#if GCM_TABLE_BITS - /* Algorithm 3 from the gcm paper. First do powers of two, then do - the rest by adding. */ - while (i /= 2) - block16_mulx_ghash(&key->h[i], &key->h[2*i]); - for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2) - { - unsigned j; - for (j = 1; j < i; j++) - block16_xor3(&key->h[i+j], &key->h[i],&key->h[j]); - } -#endif + gcm_init_key(key->h); }
#ifndef gcm_hash diff --git a/powerpc64/p8/gcm-hash.asm b/powerpc64/p8/gcm-hash.asm new file mode 100644 index 00000000..540c9f97 --- /dev/null +++ b/powerpc64/p8/gcm-hash.asm @@ -0,0 +1,607 @@ +C powerpc64/p8/gcm-hash.asm + +ifelse(` + Copyright (D) 2020 Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +C Alignment of gcm_key table elements, which is declared in gcm.h +define(`TableElemAlign', `0x100') + +C Register usage: + +define(`SP', `r1') +define(`TOCP', `r2') + +define(`TABLE', `r3') + +define(`ZERO', `v0') +define(`B1', `v1') +define(`EMSB', `v2') +define(`POLY', `v7') +define(`POLY_L', `v1') +define(`POLY_H', `v2') + +define(`H', `v3') +define(`H3', `v3') +define(`Hl', `v4') +define(`Hm', `v5') +define(`Hh', `v6') +define(`RP', `v7') +define(`Mh', `v8') +define(`Ml', `v9') +define(`H2', `v7') +define(`H4', `v7') +define(`H2m', `v8') +define(`H2l', `v9') +define(`H2h', `v10') +define(`RP2', `v11') +define(`M2h', `v12') +define(`M2l', `v13') + +define(`H2_l', `v14') +define(`H2_m', `v15') +define(`H2_h', `v16') +define(`H3_l', `v14') +define(`H3_m', `v15') +define(`H3_h', `v16') +define(`H4_l', `v17') +define(`H4_m', `v18') +define(`H4_h', `v19') + +define(`H21l', `v16') +define(`H21h', `v17') +define(`H3m', `v16') +define(`H4m', `v17') +define(`H43l', `v18') +define(`H43h', `v19') + +define(`LE_TEMP', `v18') +define(`LE_MASK', `v19') + +.file "gcm-hash.asm" + +.text + + C void gcm_init_key (union gcm_block *table) + +C This function populates the gcm table as the following layout +C ******************************************************************** +C | Hm = low-order doubleword of H^1:high-order doubleword of H^1 | +C | Hl = 64-bits zeros:low-order doubleword of H^1 | +C | Hh = high-order doubleword of H^1:64-bits zeros | +C | | +C | H2m = low-order doubleword of H^2:high-order doubleword of H^2 | +C | H21l = low-order doubleword of H^2:low-order doubleword of H^1 | +C | H21h = high-order doubleword of H^2:high-order doubleword of H^1 | +C | | +C | H3m = low-order doubleword of H^3:high-order doubleword of H^3 | +C | H4m = low-order doubleword of H^4:high-order doubleword of H^4 | +C | H43l = low-order doubleword of H^4:low-order doubleword of H^3 | +C | H43h = high-order doubleword of H^4:high-order doubleword of H^3 | +C ******************************************************************** + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_init_key) + DATA_LOAD_VEC(POLY,.polynomial,r7) C 0xC2000000000000000000000000000001 +IF_LE(` + li r8,0 + lvsl LE_MASK,0,r8 C 0x000102030405060708090A0B0C0D0E0F + vspltisb LE_TEMP,0x07 C 0x07070707070707070707070707070707 + vxor LE_MASK,LE_MASK,LE_TEMP C 
0x07060504030201000F0E0D0C0B0A0908 +') + + C 'H' is assigned by gcm_set_key() to the middle element of the table + li r10,8*TableElemAlign + lxvd2x VSR(H),r10,TABLE C load 'H' + C byte-reverse of each doubleword permuting on little-endian mode +IF_LE(` + vperm H,H,H,LE_MASK +') + + C --- calculate [H = H << 1 modulo polynomial] --- + + vupkhsb EMSB,H C extend most significant bit to first byte + vspltisb B1,1 C 0x01010101010101010101010101010101 + vspltb EMSB,EMSB,0 C first byte quadword-extend + vsl H,H,B1 C H = H << 1 + vand EMSB,EMSB,POLY C EMSB &= 0xC2000000000000000000000000000001 + vxor ZERO,ZERO,ZERO C 0x00000000000000000000000000000000 + vxor H,H,EMSB C H ^= EMSB + + C calculate [Hl = 0:H^1l, Hm = H^1l:H^1h, Hh = H^1h:0] + xxmrgld VSR(Hl),VSR(ZERO),VSR(H) + xxswapd VSR(Hm),VSR(H) + xxmrghd VSR(Hh),VSR(H),VSR(ZERO) + + C --- calculate H^2 = H*H --- + + C reduction pre-processing + xxmrghd VSR(POLY_H),VSR(POLY),VSR(ZERO) C 0xC2000000000000000000000000000000 + xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY) C 0x0000000000000000C200000000000000 + + C polynomial multiplication "classical" + vpmsumd H2_l,H,Hl C H^1l*H^1l + vpmsumd H2_m,H,Hm C H^1h*H^1l⊕H^1l*H^1h + vpmsumd H2_h,H,Hh C H^1h*H^1h + + C reduction first phase [1] + vpmsumd RP,H2_l,POLY_L C [1] + + C polynomial multiplication post-processing [2] + xxmrghd VSR(Mh),VSR(ZERO),VSR(H2_m) C [2] + xxmrgld VSR(Ml),VSR(H2_m),VSR(ZERO) C [2] + xxswapd VSR(RP),VSR(RP) C [1] + vxor H2_h,H2_h,Mh C [2] + vxor H2_l,H2_l,Ml C [2] + vxor H2_l,H2_l,RP C [1] + + C reduction second phase + vpmsumd RP,H2_l,POLY_H + vxor H2_h,H2_h,H2_l + vxor H2,H2_h,RP + + C store [H2m = H^2l:H^2h, H2l = 0:H^2l, H2h = H^2h:0] + xxswapd VSR(H2m),VSR(H2) + xxmrgld VSR(H2l),VSR(ZERO),VSR(H2) + xxmrghd VSR(H2h),VSR(H2),VSR(ZERO) + + C calculate [H21l = H^2l:H^1l, H21h = H^2h:H^1h] + xxmrgld VSR(H21l),VSR(H2),VSR(H) + xxmrghd VSR(H21h),VSR(H2),VSR(H) + + C store [Hm, Hl, Hh] + li r9,1*TableElemAlign + li r10,2*TableElemAlign + stxvd2x VSR(Hm),0,TABLE + stxvd2x VSR(Hl),r9,TABLE + stxvd2x VSR(Hh),r10,TABLE + + C store [H2m, H21l, H21h] + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + stxvd2x VSR(H2m),r8,TABLE + stxvd2x VSR(H21l),r9,TABLE + stxvd2x VSR(H21h),r10,TABLE + + C --- calculate H^3 = H^1*H^2, H^4 = H^2*H^2 --- + + C polynomial multiplication "classical" + vpmsumd H3_l,H,H2l C H^1l*H^2l + vpmsumd H4_l,H2,H2l C H^2l*H^2l + vpmsumd H3_m,H,H2m C H^1h*H^2l⊕H^1l*H^2h + vpmsumd H4_m,H2,H2m C H^2h*H^2l⊕H^2l*H^2h + vpmsumd H3_h,H,H2h C H^1h*H^2h + vpmsumd H4_h,H2,H2h C H^2h*H^2h + + C reduction first phase [1] + vpmsumd RP,H3_l,POLY_L C [1] H^3 + vpmsumd RP2,H4_l,POLY_L C [1] H^4 + + C polynomial multiplication post-processing [2] + xxmrghd VSR(Mh),VSR(ZERO),VSR(H3_m) C [2] H^3 + xxmrghd VSR(M2h),VSR(ZERO),VSR(H4_m) C [2] H^4 + xxmrgld VSR(Ml),VSR(H3_m),VSR(ZERO) C [2] H^3 + xxmrgld VSR(M2l),VSR(H4_m),VSR(ZERO) C [2] H^4 + xxswapd VSR(RP),VSR(RP) C [1] H^3 + xxswapd VSR(RP2),VSR(RP2) C [1] H^4 + vxor H3_h,H3_h,Mh C [2] H^3 + vxor H4_h,H4_h,M2h C [2] H^4 + vxor H3_l,H3_l,Ml C [2] H^3 + vxor H4_l,H4_l,M2l C [2] H^4 + vxor H3_l,H3_l,RP C [1] H^3 + vxor H4_l,H4_l,RP2 C [1] H^4 + + C reduction second phase + vpmsumd RP,H3_l,POLY_H C H^3 + vpmsumd RP2,H4_l,POLY_H C H^4 + vxor H3_h,H3_h,H3_l C H^3 + vxor H4_h,H4_h,H4_l C H^4 + vxor H3,H3_h,RP C H^3 + vxor H4,H4_h,RP2 C H^4 + + C calculate [H3m = H^3l:H^3h, H4m = H^4l:H^4h, H43l = H^4l:H^3l, H43h = H^4h:H^3h] + xxswapd VSR(H3m),VSR(H3) + xxswapd VSR(H4m),VSR(H4) + xxmrgld VSR(H43l),VSR(H4),VSR(H3) + xxmrghd 
VSR(H43h),VSR(H4),VSR(H3) + + C store [H3m, H4m, H43l, H43h] + li r7,6*TableElemAlign + li r8,7*TableElemAlign + li r9,8*TableElemAlign + li r10,9*TableElemAlign + stxvd2x VSR(H3m),r7,TABLE + stxvd2x VSR(H4m),r8,TABLE + stxvd2x VSR(H43l),r9,TABLE + stxvd2x VSR(H43h),r10,TABLE + + blr +EPILOGUE(_nettle_gcm_init_key) + +define(`TABLE', `r3') +define(`X', `r4') +define(`LENGTH', `r5') +define(`DATA', `r6') + +define(`ZERO', `v0') +define(`POLY', `v3') +define(`POLY_L', `v1') +define(`POLY_H', `v2') + +define(`D', `v3') +define(`C0', `v4') +define(`C1', `v5') +define(`C2', `v6') +define(`C3', `v7') +define(`Mh', `v8') +define(`Ml', `v9') +define(`RP', `v10') +define(`C01h', `v11') +define(`C01l', `v12') +define(`C23h', `v13') +define(`C23l', `v14') + +define(`H1', `v15') +define(`H2', `v16') +define(`H21l', `v17') +define(`H21h', `v18') +define(`H3', `v20') +define(`H4', `v21') +define(`H43l', `v22') +define(`H43h', `v23') + +define(`Cl', `v5') +define(`Cm', `v6') +define(`Ch', `v7') +define(`Hl', `v15') +define(`H', `v16') +define(`Hh', `v17') + +define(`LE_TEMP', `v18') +define(`LE_MASK', `v19') + + C void gcm_hash (const struct gcm_key *key, union gcm_block *x, + C size_t length, const uint8_t *data) + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_hash) + DATA_LOAD_VEC(POLY,.polynomial,r7) +IF_LE(` + li r8,0 + lvsl LE_MASK,0,r8 + vspltisb LE_TEMP,0x07 + vxor LE_MASK,LE_MASK,LE_TEMP +') + vxor ZERO,ZERO,ZERO + + xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY) + xxmrghd VSR(POLY_H),VSR(POLY),VSR(ZERO) + + lxvd2x VSR(D),0,X C load 'X' pointer + C byte-reverse of each doubleword permuting on little-endian mode +IF_LE(` + vperm D,D,D,LE_MASK +') + + C --- process 4 blocks '128-bit each' per one loop --- + + srdi r7,LENGTH,6 C 4-blocks loop count 'LENGTH / (4 * 16)' + cmpldi r7,0 + beq L2x + + mtctr r7 C assign counter register to loop count + + C backup non-volatile vector registers + addi r8,SP,-64 + stvx 20,0,r8 + addi r8,r8,16 + stvx 21,0,r8 + addi r8,r8,16 + stvx 22,0,r8 + addi r8,r8,16 + stvx 23,0,r8 + + C load table elements + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + lxvd2x VSR(H1),0,TABLE + lxvd2x VSR(H2),r8,TABLE + lxvd2x VSR(H21l),r9,TABLE + lxvd2x VSR(H21h),r10,TABLE + li r7,6*TableElemAlign + li r8,7*TableElemAlign + li r9,8*TableElemAlign + li r10,9*TableElemAlign + lxvd2x VSR(H3),r7,TABLE + lxvd2x VSR(H4),r8,TABLE + lxvd2x VSR(H43l),r9,TABLE + lxvd2x VSR(H43h),r10,TABLE + + li r8,0x10 + li r9,0x20 + li r10,0x30 +.align 5 +L4x_loop: + C input loading + lxvd2x VSR(C0),0,DATA C load C0 + lxvd2x VSR(C1),r8,DATA C load C1 + lxvd2x VSR(C2),r9,DATA C load C2 + lxvd2x VSR(C3),r10,DATA C load C3 + +IF_LE(` + vperm C0,C0,C0,LE_MASK + vperm C1,C1,C1,LE_MASK + vperm C2,C2,C2,LE_MASK + vperm C3,C3,C3,LE_MASK +') + + C previous digest combining + vxor C0,C0,D + + C polynomial multiplication "classical" pre-processing + xxmrghd VSR(C23h),VSR(C2),VSR(C3) + xxmrgld VSR(C23l),VSR(C2),VSR(C3) + xxmrghd VSR(C01h),VSR(C0),VSR(C1) + xxmrgld VSR(C01l),VSR(C0),VSR(C1) + + C polynomial multiplication "classical" + vpmsumd C3,C3,H1 C M3 = H^1l*C3h⊕H^1h*C3l + vpmsumd C2,C2,H2 C M2 = H^2l*C2h⊕H^2h*C2l + vpmsumd C1,C1,H3 C M1 = H^3l*C1h⊕H^3h*C1l + vpmsumd C0,C0,H4 C M0 = H^4l*C0h⊕H^4h*C0l + vpmsumd C23h,C23h,H21h C H23 = H^2h*C2h⊕H^1h*C3h + vpmsumd C23l,C23l,H21l C L23 = H^2l*C2l⊕H^1l*C3l + vpmsumd C01h,C01h,H43h C H01 = H^4h*C0h⊕H^4h*C1h + vpmsumd C01l,C01l,H43l C L01 = H^4l*C0l⊕H^4l*C1l + + C polynomial multiplication "classical" post-processing + vxor C2,C2,C3 C M2 = M2⊕M3 + 
vxor C0,C0,C1 C M0 = M0⊕M1 + + C deferred recombination of partial products + vxor C01h,C01h,C23h C H0 = H01⊕H23 + vxor C01l,C01l,C23l C L0 = L01⊕L23 + vxor C0,C0,C2 C M0 = M0⊕M2 + + C reduction first phase [1] + vpmsumd RP,C01l,POLY_L C [1] + + C polynomial multiplication post-processing [2] + xxmrghd VSR(Mh),VSR(ZERO),VSR(C0) C [2] + xxmrgld VSR(Ml),VSR(C0),VSR(ZERO) C [2] + xxswapd VSR(RP),VSR(RP) C [1] + vxor C01h,C01h,Mh C [2] + vxor C01l,C01l,Ml C [2] + vxor C01l,C01l,RP C [1] + + C reduction second phase + vpmsumd RP,C01l,POLY_H + vxor C01h,C01l,C01h + vxor D,C01h,RP + + addi DATA,DATA,0x40 + bdnz L4x_loop + + C restore non-volatile vector registers + addi r8,SP,-64 + lvx 20,0,r8 + addi r8,r8,16 + lvx 21,0,r8 + addi r8,r8,16 + lvx 22,0,r8 + addi r8,r8,16 + lvx 23,0,r8 + + clrldi LENGTH,LENGTH,58 C 'set the high-order 58 bits to zeros' +L2x: + C --- process 2 blocks --- + + srdi r7,LENGTH,5 C 'LENGTH / (2 * 16)' + cmpldi r7,0 + beq L1x + + C load table elements + li r8,3*TableElemAlign + li r9,4*TableElemAlign + li r10,5*TableElemAlign + lxvd2x VSR(H1),0,TABLE + lxvd2x VSR(H2),r8,TABLE + lxvd2x VSR(H21l),r9,TABLE + lxvd2x VSR(H21h),r10,TABLE + + C input loading + li r10,0x10 + lxvd2x VSR(C0),0,DATA C load C0 + lxvd2x VSR(C1),r10,DATA C load C1 + +IF_LE(` + vperm C0,C0,C0,LE_MASK + vperm C1,C1,C1,LE_MASK +') + + C previous digest combining + vxor C0,C0,D + + C polynomial multiplication "classical" pre-processing + xxmrghd VSR(C01h),VSR(C0),VSR(C1) + xxmrgld VSR(C01l),VSR(C0),VSR(C1) + + C polynomial multiplication "classical" + vpmsumd C1,C1,H1 C M1 = H^1l*C1h⊕H^1h*C1l + vpmsumd C0,C0,H2 C M0 = H^2l*C0h⊕H^2h*C0l + vpmsumd C01h,C01h,H21h C H01 = H^2h*C0h⊕H^1h*C1h + vpmsumd C01l,C01l,H21l C L01 = H^2l*C0l⊕H^1l*C1l + + C deferred recombination of partial products + vxor C0,C0,C1 C M0 = M0⊕M1 + + C reduction first phase [1] + vpmsumd RP,C01l,POLY_L C [1] + + C polynomial multiplication post-processing [2] + xxmrghd VSR(Mh),VSR(ZERO),VSR(C0) C [2] + xxmrgld VSR(Ml),VSR(C0),VSR(ZERO) C [2] + xxswapd VSR(RP),VSR(RP) C [1] + vxor C01h,C01h,Mh C [2] + vxor C01l,C01l,Ml C [2] + vxor C01l,C01l,RP C [1] + + C reduction second phase + vpmsumd RP,C01l,POLY_H + vxor C01h,C01l,C01h + vxor D,C01h,RP + + addi DATA,DATA,0x20 + clrldi LENGTH,LENGTH,59 C 'set the high-order 59 bits to zeros' +L1x: + C --- process 1 block --- + + srdi r7,LENGTH,4 C 'LENGTH / (1 * 16)' + cmpldi r7,0 + beq Lmod + + C load table elements + li r9,1*TableElemAlign + li r10,2*TableElemAlign + lxvd2x VSR(H),0,TABLE + lxvd2x VSR(Hl),r9,TABLE + lxvd2x VSR(Hh),r10,TABLE + + C input loading + lxvd2x VSR(C0),0,DATA C load C0 + +IF_LE(` + vperm C0,C0,C0,LE_MASK +') + + C previous digest combining + vxor C0,C0,D + + C polynomial multiplication "classical" + vpmsumd Cl,C0,Hl C L = Hl*Cl + vpmsumd Cm,C0,H C M = Hh*Cl⊕Hl*Ch + vpmsumd Ch,C0,Hh C H = Hh*Ch + + C reduction first phase C [1] + vpmsumd RP,Cl,POLY_L C [1] + + C polynomial multiplication post-processing [2] + xxmrghd VSR(Mh),VSR(ZERO),VSR(Cm) C [2] + xxmrgld VSR(Ml),VSR(Cm),VSR(ZERO) C [2] + xxswapd VSR(RP),VSR(RP) C [1] + vxor Ch,Ch,Mh C [2] + vxor Cl,Cl,Ml C [2] + vxor Cl,Cl,RP C [1] + + C reduction second phase + vpmsumd RP,Cl,POLY_H + vxor Ch,Cl,Ch + vxor D,Ch,RP + + addi DATA,DATA,0x10 + clrldi LENGTH,LENGTH,60 C 'set the high-order 60 bits to zeros' +Lmod: + C --- process the modulo bytes, padding the low-order bytes with zeros --- + + cmpldi LENGTH,0 + beq Ldone + + C load table elements + li r9,1*TableElemAlign + li r10,2*TableElemAlign + lxvd2x VSR(H),0,TABLE + lxvd2x 
VSR(Hl),r9,TABLE + lxvd2x VSR(Hh),r10,TABLE + + C push every modulo byte to the stack and load them with padding into vector register + addi r8,SP,-16 + stvx ZERO,0,r8 +Lstb_loop: + subic. LENGTH,LENGTH,1 + lbzx r7,LENGTH,DATA + stbx r7,LENGTH,r8 + bne Lstb_loop + lxvd2x VSR(C0),0,r8 + +IF_LE(` + vperm C0,C0,C0,LE_MASK +') + + C previous digest combining + vxor C0,C0,D + + C polynomial multiplication "classical" + vpmsumd Cl,C0,Hl C L = Hl*Cl + vpmsumd Cm,C0,H C M = Hh*Cl⊕Hl*Ch + vpmsumd Ch,C0,Hh C H = Hh*Ch + + C reduction first phase [1] + vpmsumd RP,Cl,POLY_L C [1] + + C polynomial multiplication post-processing [2] + xxmrghd VSR(Mh),VSR(ZERO),VSR(Cm) C [2] + xxmrgld VSR(Ml),VSR(Cm),VSR(ZERO) C [2] + xxswapd VSR(RP),VSR(RP) C [1] + vxor Ch,Ch,Mh C [2] + vxor Cl,Cl,Ml C [2] + vxor Cl,Cl,RP C [1] + + C reduction second phase + vpmsumd RP,Cl,POLY_H + vxor Ch,Cl,Ch + vxor D,Ch,RP + +Ldone: + C byte-reverse of each doubleword permuting on little-endian mode +IF_LE(` + vperm D,D,D,LE_MASK +') + stxvd2x VSR(D),0,X C store digest 'D' + + blr +EPILOGUE(_nettle_gcm_hash) + +.data + C 0xC2000000000000000000000000000001 +.polynomial: +.align 4 +IF_BE(` +.byte 0xC2 +.rept 14 +.byte 0x00 +.endr +.byte 0x01 +',` +.byte 0x01 +.rept 14 +.byte 0x00 +.endr +.byte 0xC2 +')
Maamoun TK maamoun.tk@googlemail.com writes:
+L4x_loop:
[...]
- C polynomial multiplication "classical" pre-processing
- xxmrghd VSR(C23h),VSR(C2),VSR(C3)
- xxmrgld VSR(C23l),VSR(C2),VSR(C3)
- xxmrghd VSR(C01h),VSR(C0),VSR(C1)
- xxmrgld VSR(C01l),VSR(C0),VSR(C1)
- C polynomial multiplication "classical"
- vpmsumd C3,C3,H1 C M3 = H^1l*C3h⊕H^1h*C3l
- vpmsumd C2,C2,H2 C M2 = H^2l*C2h⊕H^2h*C2l
- vpmsumd C1,C1,H3 C M1 = H^3l*C1h⊕H^3h*C1l
- vpmsumd C0,C0,H4 C M0 = H^4l*C0h⊕H^4h*C0l
- vpmsumd C23h,C23h,H21h C H23 = H^2h*C2h⊕H^1h*C3h
- vpmsumd C23l,C23l,H21l C L23 = H^2l*C2l⊕H^1l*C3l
- vpmsumd C01h,C01h,H43h C H01 = H^4h*C0h⊕H^4h*C1h
- vpmsumd C01l,C01l,H43l C L01 = H^4l*C0l⊕H^4l*C1l
So you do 4 blocks with only 10 vpmsumd instructions (the 8 above, and 2 more below for the reduction). That's nice! It would be even nicer if the H terms could be rearranged to eliminate the pre-processing of the C values.
And it is all with the polynomial's bits reversed compared to the spec? I find the spec, https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pd..., not crystal clear, but as I understand it, the bit that is least significant when interpreted as a plain integer corresponds to the highest power of x, while instructions like vpmsumd use the opposite (and more natural, imo) convention.
This is how I understand bit reversal. We want
c(x) = a(x) b(x) mod p(x) = a(x) b(x) + q(x) p(x)
where a, b and c all are 128 bits, p is 129 bits, and q is 127 bits (chosen so the high 127 bits of the 255 bit product cancel). Reverse as
c'(x) = x^127 c(1/x), similarly for a', b',
q'(x) = x^126 q(1/x), p'(x) = x^128 p(1/x)
Then we get
c'(x) = [a'(x) b'(x) + q'(x) p'(x)] / x^127
i.e., q' now cancels the *low* 127 bits of the product. Which can also be written as
c'(x) = a'(x) b'(x) / x^127 (mod p'(x))
So in this way, bit reversal is a bit similar to montgomery representation for modular arithmetic on integers. And if x^128, with its corresponding bit boundary, is more convenient than x^127, one can either shift the product one bit left, or premultiply one of the factors by x mod p'(x).
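To make the reversal concrete, here is a small standalone C check (just a sketch, not part of the patch; clmul64 and rev64 are made-up helper names) of the underlying identity: the carry-less product of the bit-reversed operands equals the bit-reversed 128-bit product of the original operands, shifted right by one bit.

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

/* 128-bit carry-less product, p = hi x^64 + lo. */
struct b128 { uint64_t hi, lo; };

/* Schoolbook 64x64 carry-less multiply. */
static struct b128
clmul64 (uint64_t a, uint64_t b)
{
  struct b128 r = { 0, 0 };
  int i;
  for (i = 0; i < 64; i++)
    if ((b >> i) & 1)
      {
        r.lo ^= a << i;
        if (i)
          r.hi ^= a >> (64 - i);
      }
  return r;
}

/* Reverse the bits of a 64-bit word. */
static uint64_t
rev64 (uint64_t x)
{
  uint64_t r = 0;
  int i;
  for (i = 0; i < 64; i++)
    r |= ((x >> i) & 1) << (63 - i);
  return r;
}

int
main (void)
{
  uint64_t a = 0x123456789abcdef0ULL, b = 0xfedcba9876543210ULL;
  struct b128 p = clmul64 (a, b);
  struct b128 q = clmul64 (rev64 (a), rev64 (b));

  /* Bit-reverse the 128-bit product of a and b, then shift right one
     bit; the result should equal the product of the reversed operands. */
  uint64_t rhi = rev64 (p.lo), rlo = rev64 (p.hi);
  rlo = (rlo >> 1) | (rhi << 63);
  rhi >>= 1;

  assert (rhi == q.hi && rlo == q.lo);
  printf ("reversal identity holds\n");
  return 0;
}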
Regards, /Niels
nisse@lysator.liu.se (Niels Möller) writes:
Maamoun TK maamoun.tk@googlemail.com writes:
+L4x_loop:
[...]
- C polynomial multiplication "classical" pre-processing
- xxmrghd VSR(C23h),VSR(C2),VSR(C3)
- xxmrgld VSR(C23l),VSR(C2),VSR(C3)
- xxmrghd VSR(C01h),VSR(C0),VSR(C1)
- xxmrgld VSR(C01l),VSR(C0),VSR(C1)
- C polynomial multiplication "classical"
- vpmsumd C3,C3,H1 C M3 = H^1l*C3h⊕H^1h*C3l
- vpmsumd C2,C2,H2 C M2 = H^2l*C2h⊕H^2h*C2l
- vpmsumd C1,C1,H3 C M1 = H^3l*C1h⊕H^3h*C1l
- vpmsumd C0,C0,H4 C M0 = H^4l*C0h⊕H^4h*C0l
- vpmsumd C23h,C23h,H21h C H23 = H^2h*C2h⊕H^1h*C3h
- vpmsumd C23l,C23l,H21l C L23 = H^2l*C2l⊕H^1l*C3l
- vpmsumd C01h,C01h,H43h C H01 = H^4h*C0h⊕H^4h*C1h
- vpmsumd C01l,C01l,H43l C L01 = H^4l*C0l⊕H^4l*C1l
So you do 4 blocks with only 10 vpmsumd instructions (the 8 above, and 2 more below for the reduction). That's nice! It would be even nicer if the H terms could be rearranged to eliminate the pre-processing of the C values.
I think I've found a slightly better way to organize it. I'm assuming bit reversal and that we have pre-multiplied the key factor by x, so that we need to compute
A(x) B(x) / x^128 (mod P(x))
where P(x) is the reversed polynomial P(x) = x^128 + x^127 + x^126 + x^121 + 1. Denote the 64-bit pieces as
A(x) = a_1(x) x^64 + a_0(x), B(x) = b_1(x) x^64 + b_0(x)
We do precomputations on B (the key, invariant when processing a message of multiple blocks).
First, compute b_0(x) / x^64 (mod P(x)), which expands it from 64 bits to 128,
c_1(x) x^64 + c_0(x) = b_0(x) / x^64 (mod P(x))
Next, add together d_1 = b_1 + c_0. We can then write (to simplify notation, everything below is (mod P(x)), + means xor, and we also omit the (x) arguments).
  A B / x^128 = a_1 b_1 + (a_0 b_1 + a_1 b_0) / x^64 + a_0 b_0 / x^128
              = a_1 b_1 + (a_0 b_1 + a_1 b_0) / x^64 + a_0 (c_1 x^64 + c_0) / x^64
              = a_1 b_1 + a_0 c_1 + (a_0 b_1 + a_1 b_0 + a_0 c_0) / x^64
              = a_1 b_1 + a_0 c_1 + (a_0 d_1 + a_1 b_0) / x^64
                `------R--------'   `------F--------'

So if we have the input in register A (loaded from memory with no processing besides ensuring proper *byte* order), and precompute two values, M representing b_1(x) x^64 + c_1(x), and L representing b_0(x) x^64 + d_1(x), then we get the two halves above with two vpmsumd,

  vpmsumd R, M, A
  vpmsumd F, L, A
When doing more than one block at a time, I think it's easiest to accumulate the R and F values separately.
After accumulation, we have the pieces
R = r_1 x^64 + r_0, F = f_1 x^64 + f_0
To do the division by x^64 (mod P), and reduce to a 128-bit result, we need to cancel f_0, and we do that by multiplying f_0 P and adding it in:

  (f_1 x^64 + f_0 + f_0 P) / x^64 = f_1 + f_0 (x^64 + x^63 + x^62 + x^57)

If we prepare a register G representing the constant polynomial (x^63 + x^62 + x^57), i.e., all-zero high part, we can do that as
vpmsumd T, F, G
and then to get the final result, we need to xor together the three values
  r_1 x^64 + r_0
  f_0 x^64 + f_1    (F swapped)
  t_1 x^64 + t_0
It may be worth noting that both r_1 and t_1 are only 63 bits, so it's only the contribution of f_0 in the high half position that increases the size of the result from 127 bits to 128.
There are some possible variations of the final reduction, e.g., one could use G' = x^63 + x^62 + x^61 + x^56 (i.e., one more term), but then left shift the result of f_0 G'. With some different and really clever handling of the extra x factor (here, assumed premultiplied into b), maybe one could use a single vpmsumd to multiply f_0 G' and add in f_1 at the right place before shifting.
So to sum up, with this method, there's no preprocessing of the input, two vpmsumd per block to produce the values to be accumulated (same as in your patch), and a single vpmsumd (one less than in your patch) for the final reduction.
Furthermore, there's actually no need to do the final reduction until the end of the message. If we can keep the accumulated R and F values separately in the gcm_ctx struct (an ABI change, though), the final reduction can be postponed until gcm_digest is called.
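To check that I've got the algebra right, here is a rough scalar C sketch of the scheme (not part of the patch; all the names are invented, and clmul64 stands in for one half of a vpmsumd). It assumes the key value B passed to the precomputation is already x H(x) (mod P(x)), i.e. premultiplied by x as above, and it uses the fact that dividing by x^64 (mod P) is the same as multiplying by x^64 + x^63 + x^62 + x^57.

#include <stdint.h>

/* 128-bit element of GF(2)[x], v = hi x^64 + lo, bit-reflected convention. */
struct b128 { uint64_t hi, lo; };

/* Schoolbook 64x64 carry-less multiply; stands in for half a vpmsumd. */
static struct b128
clmul64 (uint64_t a, uint64_t b)
{
  struct b128 r = { 0, 0 };
  int i;
  for (i = 0; i < 64; i++)
    if ((b >> i) & 1)
      {
        r.lo ^= a << i;
        if (i)
          r.hi ^= a >> (64 - i);
      }
  return r;
}

static struct b128
xor128 (struct b128 x, struct b128 y)
{
  struct b128 r = { x.hi ^ y.hi, x.lo ^ y.lo };
  return r;
}

/* G = x^63 + x^62 + x^57, i.e. (P(x) + 1) / x^64 with the x^64 term kept apart. */
#define G_POLY 0xC200000000000000ULL

/* Precomputed key material: M = b_1 x^64 + c_1 and L = b_0 x^64 + d_1,
   where c_1 x^64 + c_0 = b_0 / x^64 (mod P) and d_1 = b_1 + c_0. */
struct gcm_precomp { struct b128 m, l; };

void
gcm_precompute (struct gcm_precomp *pre, struct b128 b)
{
  /* b_0 / x^64 (mod P) = b_0 (x^64 + x^63 + x^62 + x^57); the product
     has degree at most 127, so no reduction is needed. */
  struct b128 c = clmul64 (b.lo, G_POLY);
  c.hi ^= b.lo;                                 /* the b_0 x^64 term */
  pre->m.hi = b.hi;  pre->m.lo = c.hi;          /* M = b_1 : c_1 */
  pre->l.hi = b.lo;  pre->l.lo = b.hi ^ c.lo;   /* L = b_0 : d_1 */
}

/* One block times one precomputed key: the two vpmsumd.  R and F can be
   accumulated over several such products before the final reduction. */
void
gcm_block (const struct gcm_precomp *pre, struct b128 a,
           struct b128 *r, struct b128 *f)
{
  *r = xor128 (*r, xor128 (clmul64 (a.hi, pre->m.hi),    /* a_1 b_1 */
                           clmul64 (a.lo, pre->m.lo)));  /* a_0 c_1 */
  *f = xor128 (*f, xor128 (clmul64 (a.hi, pre->l.hi),    /* a_1 b_0 */
                           clmul64 (a.lo, pre->l.lo)));  /* a_0 d_1 */
}

/* Final reduction, X = R + F / x^64 (mod P): one more carry-less
   multiply, then the xor of the three 128-bit values listed above. */
struct b128
gcm_reduce (struct b128 r, struct b128 f)
{
  struct b128 t = clmul64 (f.lo, G_POLY);   /* t_1 x^64 + t_0 */
  r.hi ^= f.lo ^ t.hi;                      /* f_0 in the high half, t_1 */
  r.lo ^= f.hi ^ t.lo;                      /* f_1 (F swapped), t_0 */
  return r;
}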
Regards, /Niels
Hi Niels,
I tried to apply your method but can't get it to work; while applying it, one question came to my mind.
First, compute b_0(x) / x^64 (mod P(x)), which expands it from 64 bits to 128,
c_1(x) x^64 + c_0(x) = b_0(x) / x^64 (mod P(x))
Here you are trying to get a partially reduced product by computing b_0(x) / x^64 (mod P(x)), but since the degree of the input is 127, we can use the polynomial defining the finite field with 2^64 elements, in this case P(x) = X^64+X^4+X^3+X+1 and P' = P^-1 (mod X^64) = X^63+X^61+X^60+1, which is the same constant 0xB0, and the function now becomes: c_1(x) x^64 + c_0(x) = ((b_0 mod X^64) * p') mod X^64
Forgot to mention that P(x) = X^64+X^63+X^61+X^60+1 after being reflected.
Regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
Hi Niels,
I tried to apply your method but can't get it to work;
Hmm, do you think I've missed something in the math, or are there other difficulties?
while applying it, one question came to my mind.
First, compute b_0(x) / x^64 (mod P(x)), which expands it from 64 bits to 128,
c_1(x) x^64 + c_0(x) = b_0(x) / x^64 (mod P(x))
Here you are trying to get a partially reduced product by computing b_0(x) / x^64 (mod P(x)), but since the degree of the input is 127, we can use the polynomial defining the finite field with 2^64 elements, in this case P(x) = X^64+X^4+X^3+X+1 and P' = P^-1 (mod X^64) = X^63+X^61+X^60+1, which is the same constant 0xB0, and the function now becomes: c_1(x) x^64 + c_0(x) = ((b_0 mod X^64) * p') mod X^64
For correctness, I think it is important that the computation b_0(x) / x^64 is done modulo the gcm polynomial (originally, x^128 + x^7 + x^2 + x + 1, but after bit reflection, P(x) = x^128 + x^127 + x^126 + x^121 + 1).

I don't see how one can do part of the computation in GF(2^64), or how your degree-64 polynomial relates to the original degree-128 polynomial. If there's some useful embedding of GF(2^64) as a subfield of GF(2^128), please explain?
That said, division by x^64 is fairly cheap, since P(x) = 1 (mod x^64). I think we get
  b_0(x) / x^64 (mod P(x)) = b_0(x) (1 + P(x)) / x^64 (mod P(x))

where we can simplify (P(x) + 1) / x^64 to x^64 + x^63 + x^62 + x^57, or

  b_0(x) / x^64 (mod P(x)) = b_0(x) (x^64 + x^63 + x^62 + x^57)
So no reduction needed, just split the product in high and low part to get c_1 and c_0.
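In scalar C terms (a sketch only, reusing the hypothetical b128/clmul64 helpers from the earlier sketch, not any nettle API), that is one carry-less multiply and one xor:

/* c_1 x^64 + c_0 = b_0 / x^64 (mod P(x)): multiply b_0 by
   x^63 + x^62 + x^57 and fold in the b_0 x^64 term; the result has
   degree at most 127, so no reduction is needed. */
struct b128
div_x64 (uint64_t b0)
{
  struct b128 c = clmul64 (b0, 0xC200000000000000ULL);
  c.hi ^= b0;       /* c_1 is c.hi, c_0 is c.lo */
  return c;
}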
Regards, /Niels
Thanks for the clarification, I just misunderstood the division as a partial reduction in a previous reply.
Ok, so you mean a polynomial division of b_0(x) by P(x) where

  P(x) = X^128 + X^127 + X^126 + X^121 + 1
  b_0(x)/P(x) = (b_0(x)*(p^-1 mod P(x))) mod P(x)
  b_0(x)/P(x) = (b_0(x)*(p')) mod P(x)
  P(x) = X^64 + X^63 + X^62 + X^57
  P' = p^-1 mod P(x) = X^63 + X^62 + X^57

so the constant 0xC2
let me show you part of the new implementation of _nettle_gcm_init_key in PPC
C --- calculate [H = H << 1 modulo polynomial] ---
    vupkhsb EMSB,H          C extend most significant bit to first byte
    vspltisb B1,1           C 0x01010101010101010101010101010101
    vspltb EMSB,EMSB,0      C first byte quadword-extend
    vsl H,H,B1              C H = H << 1
    vand EMSB,EMSB,POLY     C EMSB &= 0xC2000000000000000000000000000001
    vxor ZERO,ZERO,ZERO     C 0x00000000000000000000000000000000
    vxor H,H,EMSB           C H ^= EMSB

    xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY)   C 0x0000000000000000C200000000000000
    xxmrghd VSR(POLY_H),VSR(POLY),VSR(ZERO)   C 0xC2000000000000000000000000000000
C --- calculate [H^2 = H*H] ---
    xxswapd VSR(Hm),VSR(H)
    vpmsumd HP,H,POLY_L
    vxor L,HP,Hm
    xxmrghd VSR(M),VSR(H),VSR(HP)
    xxmrgld VSR(L),VSR(H),VSR(L)

    vpmsumd R,M,H
    vpmsumd F,L,H

    vpmsumd T,F,POLY_H
    xxswapd VSR(F),VSR(F)
    vxor R,R,T
    vxor R,R,F
R still yields the wrong value of H^2. Did I miss something in the implementation, or did I get it wrong?
Maamoun TK maamoun.tk@googlemail.com writes:
Thanks for the clarification, I just misunderstood the division as a partial reduction in a previous reply.
Ok, so you mean a polynomial division of b_0(x) by P(x) where

  P(x) = X^128 + X^127 + X^126 + X^121 + 1
  b_0(x)/P(x) = (b_0(x)*(p^-1 mod P(x))) mod P(x)
  b_0(x)/P(x) = (b_0(x)*(p')) mod P(x)
  P(x) = X^64 + X^63 + X^62 + X^57
  P' = p^-1 mod P(x) = X^63 + X^62 + X^57

so the constant 0xC2
Hmm, I'm not following you (and it seems you are using P(x) to refer to two different polynomials). The operation is a bit similar to Montgomery redc, and it has taken me some time to get used to the initially strange mix of operations mod P(x) and mod x^k (or in the integer case, it's a mix of mod m, with m odd, and mod 2^k).
b_0(x) / x^64 (mod P(x))
can be computed in two different ways, giving the same result:
(i) Compute the inverse u(x) = (x^64)^{-1} mod P(x), then multiply b_0(x) u(x) (mod P(x))
We get u(x) = x^64 + x^63 + x^62 + x^57, with degree only 64. (This is thanks to the special structure of P(x); an inverse of some arbitrary element mod a 128-degree polynomial would be expected to be larger).
(ii) Add a multiple of P(x) to b_0(x) to make the lowest 64 coefficients all cancel, then divide by x^64 by simply shifting / subtracting 64 from the exponents of the remaining terms. And because of the special structure of P(x), P(x) = 1 mod x^64, the right multiple is b_0(x) P(x). So
b_0(x) + P(x) b_0(x)
is a degree 191 polynomial, with the least 64 coefficients all zero. When simplified, it boils down to the same b_0(x) u(x), no additional reduction needed.
I tend to think about reductions (from either end) in terms of cancelling bits, so to me, (ii) is a familiar way to think about it.
let me show you part of the new implementation of _nettle_gcm_init_key in PPC
C --- calculate [H = H << 1 modulo polynomial] ---

    vupkhsb EMSB,H          C extend most significant bit to first byte
    vspltisb B1,1           C 0x01010101010101010101010101010101
    vspltb EMSB,EMSB,0      C first byte quadword-extend
    vsl H,H,B1              C H = H << 1
    vand EMSB,EMSB,POLY     C EMSB &= 0xC2000000000000000000000000000001
    vxor ZERO,ZERO,ZERO     C 0x00000000000000000000000000000000
    vxor H,H,EMSB           C H ^= EMSB

    xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY)   C 0x0000000000000000C200000000000000
    xxmrghd VSR(POLY_H),VSR(POLY),VSR(ZERO)   C 0xC2000000000000000000000000000000

C --- calculate [H^2 = H*H] ---

    xxswapd VSR(Hm),VSR(H)
    vpmsumd HP,H,POLY_L
    vxor L,HP,Hm
    xxmrghd VSR(M),VSR(H),VSR(HP)
    xxmrgld VSR(L),VSR(H),VSR(L)

    vpmsumd R,M,H
    vpmsumd F,L,H

    vpmsumd T,F,POLY_H
    xxswapd VSR(F),VSR(F)
    vxor R,R,T
    vxor R,R,F
R still yields the wrong value of H^2. Did I miss something in the implementation, or did I get it wrong?
I'm afraid I don't follow all details (and I would suggest trying to get the most basic variant working first, doing only a single block at a time). But one potential issue is the powers of x. If H(x) is the original key, then I think you need to precompute values corresponding to
  x H(x)   (mod P(x))
  x H(x)^2 (mod P(x))
But it looks like you may be computing
(x H(x))^2 / x^128 = x^2 H(x)^2 / x^128
which is different. It's easy to get confused, but I think the precomputation needs a standard mod P(x), i.e., adding a multiple of P(x) cancelling the most significant coefficients, rather than the more efficient reduction cancelling the least significant coefficients.
If you're familiar with montgomery multiplication of integers, there it's possible to consistently use the operation a,b -> ab / 2^k everywhere, but that requires that all inputs are transformed to montgomery form, a --> 2^k a mod m. And in this case, it's desirable to scale the precomputed values appropriately, so no such transformation is needed for the inputs corresponding to message blocks.
To compute more powers of H, one could do the standard reduction only once, essentially transforming H to montgomery form,
H' = H(x) x^128 (mod P(x)),
then further powers of H can use the reduction of least significant coefficients,
(x H(x)) H' / x^128 = x H(x)^2
(x H(x)^2) H' / x^128 = x H(x)^3
etc.
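As a C sketch (hypothetical names only: gf128_mul_redc(a, b) is assumed to compute a b / x^128 (mod P), for instance built from the precompute/accumulate/reduce steps sketched earlier, and hprime = H x^128 (mod P) is computed once with an ordinary reduction), the key setup would then look like:

/* Fill k[0..n-1] with x H, x H^2, ..., x H^n (mod P), ready to be
   used as the per-block key values. */
void
compute_key_powers (struct b128 *k, unsigned n,
                    struct b128 xh, struct b128 hprime)
{
  unsigned i;
  k[0] = xh;                    /* x H (mod P) */
  for (i = 1; i < n; i++)
    k[i] = gf128_mul_redc (k[i - 1], hprime);   /* x H^i -> x H^(i+1) */
}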
Regards, /Niels
nisse@lysator.liu.se (Niels Möller) writes:
So if we have the input in register A (loaded from memory with no processing besides ensuring proper *byte* order), and precompute two values, M representing b_1(x) x^64 + c_1(x), and L representing b_0(x) x^64 + d_1(x)), then we get the two halves above with two vpmsumd,
vpmsumd R, M, A vpmsumd F, L, A
When doing more than one block at a time, I think it's easiest to accumulate the R and F values separately.
BTW, I wonder if a similar organization would make sense for Arm Neon. Neon doesn't have vpmsumd; the widest carryless multiplication available is vmull.p8, which is an 8-bit to 15-bit multiply, 8 in parallel.

I'm sketching an instruction sequence doing the equivalent of two vpmsumd using 32 vmull.p8, with good parallelism and not too many instructions to shuffle data around to the right places. Is that a good idea? It's to be compared to what the C code does: a loop of 16 iterations, each doing some table lookups, shifts and xors.

With this large number of multiply instructions, it might pay off to use Karatsuba, which could reduce it to 24 multiplies (one level) or 18 (two levels), at the cost of more xors and data movement instructions, and lots of complexity.
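For reference, here is the scalar structure such a vmull.p8-based 64x64 multiply would follow (plain C, not Neon intrinsics; clmul8 plays the role of a single 8-bit lane of vmull.p8, and the real instruction computes eight such partial products at once):

#include <stdint.h>

/* 8x8-bit carry-less multiply, the scalar analogue of one vmull.p8 lane. */
static uint16_t
clmul8 (uint8_t a, uint8_t b)
{
  uint16_t r = 0;
  int i;
  for (i = 0; i < 8; i++)
    if ((b >> i) & 1)
      r ^= (uint16_t) a << i;
  return r;
}

/* 64x64 carry-less multiply assembled from the 64 8x8 partial products,
   i.e. the work of eight vmull.p8 plus the shifting and xoring needed
   to put each partial product in the right place. */
void
clmul64_from_8 (uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
  uint64_t h = 0, l = 0;
  int i, j;
  for (i = 0; i < 8; i++)
    for (j = 0; j < 8; j++)
      {
        uint64_t p = clmul8 ((uint8_t)(a >> (8*i)), (uint8_t)(b >> (8*j)));
        int s = 8 * (i + j);
        if (s < 64)
          {
            l ^= p << s;
            if (s > 48)
              h ^= p >> (64 - s);
          }
        else
          h ^= p << (s - 64);
      }
  *hi = h;
  *lo = l;
}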
(There have been ARM Neon code for gcm posted to the list earlier, but if I remember correctly, that code didn't work in bit-reversed representation, but used a bunch of explicit reversal operations).
Regards, /Niels
On Sun, Oct 11, 2020 at 1:42 PM Niels Möller nisse@lysator.liu.se wrote:
nisse@lysator.liu.se (Niels Möller) writes:
So if we have the input in register A (loaded from memory with no processing besides ensuring proper *byte* order), and precompute two values, M representing b_1(x) x^64 + c_1(x), and L representing b_0(x) x^64 + d_1(x)), then we get the two halves above with two vpmsumd,
vpmsumd R, M, A vpmsumd F, L, A
When doing more than one block at a time, I think it's easiest to accumulate the R and F values separately.
BTW, I wonder if similar organization would make sense for Arm Neon. Now, Neon doesn't have vpmsumd, the widest carryless multiplication available is vmull.p8, which is an 8-bit to 15-bit multiply, 8 in parallel...
I may be mistaken, but I believe 64-bit poly multiplies are available. Or they are available on Aarch64 with Crypto extensions.
I'm not aware of poly multiplies on other ARM arches, like ARMv6 or ARMv7 with NEON.
Jeff
Jeffrey Walton noloader@gmail.com writes:
I may be mistaken, but I believe 64-bit poly multiplies are available. Or they are available on Aarch64 with Crypto extensions.
I'm looking in the Arm Instruction Set Reference Guide, labeled version 1.0, 2018.
It includes a section on cryptographic instructions, but that's aes, sha1 and sha256, no carry-less multiplication.
But I may well be missing something, I'm not really familiar with Aarch64.
I'm not aware of poly multiplies on other ARM arches, like ARMv6 or ARMv7 with NEON.
I think the "p8" SIMD datatype and vmull.p8 have been part of the Neon instruction set for a long time, at least since I wrote my first ARM code back in 2013. It's just a bit annoyning that one needs so many of them to do a wide multiply.
Regards, /Niels
On Sun, Oct 11, 2020 at 2:03 PM Niels Möller nisse@lysator.liu.se wrote:
Jeffrey Walton noloader@gmail.com writes:
I may be mistaken, but I believe 64-bit poly multiplies are available. Or they are available on Aarch64 with Crypto extensions.
I'm looking in the Arm Instruction Set Reference Guide, labeled version 1.0, 2018.
It includes a section on cryptographic instructions, but that's aes, sha1 and sha256, no carry-less multiplication.
But I may well be missing something, I'm not really familiar with Aarch64.
I'm not aware of poly multiplies on other ARM arches, like ARMv6 or ARMv7 with NEON.
I think the "p8" SIMD datatype and vmull.p8 have been part of the Neon instruction set for a long time, at least since I wrote my first ARM code back in 2013. It's just a bit annoyning that one needs so many of them to do a wide multiply.
Oh, you're right. There is a vmull for NEON.
According to an early NEON programming guide from ARM (https://static.docs.arm.com/den0018/a/DEN0018A_neon_programmers_guide_en.pdf), the widest you can perform is P16 poly multiply.
Jeff