Patch implementation benchmark for GCM_AES (Tested on POWER8):

little-endian:
- Encrypt x~17.5 of nettle C implementation
- Decrypt x~17.5 of nettle C implementation
- Update x~30 of nettle C implementation

big-endian:
- Encrypt x~18.5 of nettle C implementation
- Decrypt x~18.5 of nettle C implementation
- Update x~28.5 of nettle C implementation
I investigated falling back to processing 4 blocks instead of 8; the 8x loop outperforms the 4x loop by x~1.22 on POWER8. Processing only 4 blocks leaves plenty of registers on the table, and to reduce the performance margin those spare registers can be used to overlap instructions inside the loop. The GHASH loop has two separate operations:
- Digest calculation
- Reduction
The reduction procedure is hard to parallelize on its own; to overcome that, the two procedures can be overlapped as follows:
|`````````````````````````````````````````|
|Head:                                     |
|Digest calculation [0..3]                 |
|`````````````````````````````````````````|
|Loop:                                     |
|Overlapped Reduction [i+0..i+3] and       |
|Digest calculation [i+4..i+7]             |
|`````````````````````````````````````````|
|Tail:                                     |
|Reduction [i+4..i+7]                      |
```````````````````````````````````````````
With that implemented the performance margin shrinks to x~1.15, but the code gets messier. I can add more documentation so the reader can still keep track of what's going on, but with that said the remaining margin is still considerable, and I think it's better to stick with the 8x loop.
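To make the head/loop/tail overlap concrete, here is a rough C-level sketch of the idea (illustrative only: the block/product types and the gf_mul/gf_add/gf_reduce/xor_block helpers are made-up stand-ins for the vpmsumd/vxor sequences in the assembly, and their bodies are omitted):

  #include <stddef.h>
  #include <stdint.h>

  /* 128-bit block and unreduced 256-bit product, as abstract types. */
  typedef struct { uint64_t hi, lo; } block;
  typedef struct { block hi, lo; } product;

  static product gf_mul (block a, block b);     /* carry-less multiply, unreduced */
  static product gf_add (product a, product b); /* xor of partial products */
  static block   gf_reduce (product p);         /* fold back modulo the GCM polynomial */
  static block   xor_block (block a, block b);

  /* GHASH over groups of 4 blocks, Y <- (Y ^ C0)*H^4 ^ C1*H^3 ^ C2*H^2 ^ C3*H,
     with the reduction of one group overlapped with the digest calculation
     (the multiplications) of the next group. */
  static block
  ghash_4x_overlapped (block y, const block *c, size_t groups,
                       block h1, block h2, block h3, block h4)
  {
    /* Head: digest calculation for blocks [0..3]. */
    product pending = gf_add (gf_mul (xor_block (c[0], y), h4),
                              gf_add (gf_mul (c[1], h3),
                                      gf_add (gf_mul (c[2], h2),
                                              gf_mul (c[3], h1))));
    for (size_t i = 1; i < groups; i++)
      {
        /* Digest calculation [i+4..i+7]: the products that do not depend
           on the running digest can be issued early...  */
        product indep = gf_add (gf_mul (c[4*i + 1], h3),
                                gf_add (gf_mul (c[4*i + 2], h2),
                                        gf_mul (c[4*i + 3], h1)));
        /* ...overlapped with the reduction of the previous group [i+0..i+3]. */
        y = gf_reduce (pending);
        pending = gf_add (gf_mul (xor_block (c[4*i], y), h4), indep);
      }
    /* Tail: only the reduction of the last group is left. */
    return gf_reduce (pending);
  }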
- The patch passed the testsuite for both ppc64 and ppc64el.
---
 Makefile.in                        |    2 +-
 asm.m4                             |   17 +-
 config.m4.in                       |    1 +
 configure.ac                       |   12 +
 gcm.c                              |   19 +-
 powerpc64/aes-decrypt-internal.asm |  584 +++++++++++++++++++++
 powerpc64/aes-encrypt-internal.asm |  545 +++++++++++++++++++
 powerpc64/gcm-hash8.asm            | 1015 ++++++++++++++++++++++++++++++++++++
 powerpc64/machine.m4               |    0
 testsuite/gcm-test.c               |   23 +
 10 files changed, 2214 insertions(+), 4 deletions(-)
 create mode 100644 powerpc64/aes-decrypt-internal.asm
 create mode 100644 powerpc64/aes-encrypt-internal.asm
 create mode 100644 powerpc64/gcm-hash8.asm
 create mode 100644 powerpc64/machine.m4
diff --git a/Makefile.in b/Makefile.in
index 64ff1001..fdc819f1 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -603,7 +603,7 @@ distdir: $(DISTFILES)
 	done
 	set -e; for d in sparc32 sparc64 x86 \
 		x86_64 x86_64/aesni x86_64/sha_ni x86_64/fat \
-		arm arm/neon arm/v6 arm/fat ; do \
+		arm arm/neon arm/v6 arm/fat powerpc64 ; do \
 	  mkdir "$(distdir)/$$d" ; \
 	  find "$(srcdir)/$$d" -maxdepth 1 '(' -name '*.asm' -o -name '*.m4' ')' \
 	    -exec cp '{}' "$(distdir)/$$d" ';' ; \
diff --git a/asm.m4 b/asm.m4
index 59d64098..1033a326 100644
--- a/asm.m4
+++ b/asm.m4
@@ -30,13 +30,26 @@ COFF_STYLE, yes,
 define(<GMP_NUMB_BITS>,<>)dnl

 define(<PROLOGUE>,
+<ifelse(ASM_POWERPC64,no,
 <.globl C_NAME($1)
 DECLARE_FUNC(C_NAME($1))
-C_NAME($1): ASM_X86_ENDBR>)
+C_NAME($1): ASM_X86_ENDBR>,
+<.globl C_NAME($1)
+DECLARE_FUNC(C_NAME($1))
+.section ".opd","aw"
+.align 3
+C_NAME($1):
+.quad .C_NAME($1),.TOC.@tocbase,0
+.previous
+.align 5
+.C_NAME($1):>)>)

 define(<EPILOGUE>,
 <ifelse(ELF_STYLE,yes,
-<.size C_NAME($1), . - C_NAME($1)>,<>)>)
+<ifelse(ASM_POWERPC64,no,
+<.size C_NAME($1), . - C_NAME($1)>,
+<.size .C_NAME($1), . - .C_NAME($1)
+.size C_NAME($1), . - .C_NAME($1)>)>,<>)>)

 define(<m4_log2>, <m4_log2_internal($1,1,0)>)
 define(<m4_log2_internal>,
diff --git a/config.m4.in b/config.m4.in
index f7f5f283..6265a679 100644
--- a/config.m4.in
+++ b/config.m4.in
@@ -10,6 +10,7 @@ define(<RODATA>, <@ASM_RODATA@>)dnl
 define(<WORDS_BIGENDIAN>, <@ASM_WORDS_BIGENDIAN@>)dnl
 define(<ASM_X86_ENDBR>,<@ASM_X86_ENDBR@>)dnl
 define(<ASM_X86_MARK_CET_ALIGN>,<@ASM_X86_MARK_CET_ALIGN@>)dnl
+define(<ASM_POWERPC64>,<@ASM_POWERPC64@>)dnl
 divert(1)
 @ASM_X86_MARK_CET@
 @ASM_MARK_NOEXEC_STACK@
diff --git a/configure.ac b/configure.ac
index 90ea1ea8..d7baface 100644
--- a/configure.ac
+++ b/configure.ac
@@ -435,6 +435,9 @@ if test "x$enable_assembler" = xyes ; then
 	esac
       fi
       ;;
+    *powerpc64*)
+      asm_path=powerpc64
+      ;;
     *)
       enable_assembler=no
       ;;
@@ -572,7 +575,9 @@ AH_VERBATIM([HAVE_NATIVE],
 #undef HAVE_NATIVE_ecc_secp384r1_redc
 #undef HAVE_NATIVE_ecc_secp521r1_modp
 #undef HAVE_NATIVE_ecc_secp521r1_redc
+#undef HAVE_NATIVE_gcm_init_key8
 #undef HAVE_NATIVE_gcm_hash8
+#undef HAVE_NATIVE_gcm_fill
 #undef HAVE_NATIVE_salsa20_core
 #undef HAVE_NATIVE_sha1_compress
 #undef HAVE_NATIVE_sha256_compress
@@ -866,6 +871,12 @@ if test "$nettle_cv_asm_x86_gnu_property" = yes; then
 .popsection'
 fi

+if test "$host_cpu" = "powerpc64" ; then
+  ASM_POWERPC64='yes'
+else
+  ASM_POWERPC64='no'
+fi
+
 AC_SUBST(ASM_SYMBOL_PREFIX)
 AC_SUBST(ASM_ELF_STYLE)
 AC_SUBST(ASM_COFF_STYLE)
@@ -879,6 +890,7 @@ AC_SUBST(EMULATOR)
 AC_SUBST(ASM_X86_ENDBR)
 AC_SUBST(ASM_X86_MARK_CET)
 AC_SUBST(ASM_X86_MARK_CET_ALIGN)
+AC_SUBST(ASM_POWERPC64)
 AC_SUBST(LIBNETTLE_MAJOR)
 AC_SUBST(LIBNETTLE_MINOR)
diff --git a/gcm.c b/gcm.c
index cf615daf..809c03bc 100644
--- a/gcm.c
+++ b/gcm.c
@@ -140,6 +140,12 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table)
   memcpy (x->b, Z.b, sizeof(Z));
 }
 # elif GCM_TABLE_BITS == 8
+# if HAVE_NATIVE_gcm_init_key8
+
+#define gcm_init_key _nettle_gcm_init_key8
+void
+_nettle_gcm_init_key8 (union nettle_block16 *table);
+# endif /* HAVE_NATIVE_gcm_init_key8 */
 # if HAVE_NATIVE_gcm_hash8

 #define gcm_hash _nettle_gcm_hash8
@@ -225,6 +231,13 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table)
#endif /* GCM_TABLE_BITS */
+#if HAVE_NATIVE_gcm_fill
+
+#define gcm_fill _nettle_gcm_fill
+void
+_nettle_gcm_fill (uint8_t *ctr, size_t blocks, union nettle_block16 *buffer);
+#endif /* HAVE_NATIVE_gcm_fill */
+
 /* Increment the rightmost 32 bits. */
 #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4)
@@ -245,7 +258,9 @@ gcm_set_key(struct gcm_key *key,
   memset(key->h[0].b, 0, GCM_BLOCK_SIZE);
   f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b);

-#if GCM_TABLE_BITS
+#ifdef gcm_init_key
+  gcm_init_key(key->h);
+#elif GCM_TABLE_BITS
   /* Algorithm 3 from the gcm paper. First do powers of two, then do
      the rest by adding. */
   while (i /= 2)
@@ -333,6 +348,7 @@ gcm_update(struct gcm_ctx *ctx, const struct gcm_key *key,
   ctx->auth_size += length;
 }

+#ifndef gcm_fill
 static nettle_fill16_func gcm_fill;
 static void
 gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
@@ -349,6 +365,7 @@ gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
   WRITE_UINT32(ctr + GCM_BLOCK_SIZE - 4, c);
 }
+#endif /* !gcm_fill */
void gcm_encrypt (struct gcm_ctx *ctx, const struct gcm_key *key, diff --git a/powerpc64/aes-decrypt-internal.asm b/powerpc64/aes-decrypt-internal.asm new file mode 100644 index 00000000..70fd69f0 --- /dev/null +++ b/powerpc64/aes-decrypt-internal.asm @@ -0,0 +1,584 @@ +C powerpc64/aes-decrypt-internal.asm + +ifelse(< + Copyright (C) 2020 Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +>) + +C Register usage: + +define(<SP>, <1>) +define(<TOCP>, <2>) + +define(<ROUNDS>, <3>) +define(<KEYS>, <4>) +define(<LENGTH>, <6>) +define(<DST>, <7>) +define(<SRC>, <8>) + +define(<swap_mask>, <0>) + +define(<K>, <1>) +define(<S0>, <2>) +define(<S1>, <3>) +define(<S2>, <4>) +define(<S3>, <5>) +define(<S4>, <6>) +define(<S5>, <7>) +define(<S6>, <8>) +define(<S7>, <9>) +define(<S8>, <10>) +define(<S9>, <11>) +define(<S10>, <12>) +define(<S11>, <13>) +define(<S12>, <14>) +define(<S13>, <15>) +define(<S14>, <16>) +define(<S15>, <17>) + +define(<KX>, <33>) +define(<S0X>, <34>) +define(<S1X>, <35>) +define(<S2X>, <36>) +define(<S3X>, <37>) +define(<S4X>, <38>) +define(<S5X>, <39>) +define(<S6X>, <40>) +define(<S7X>, <41>) +define(<S8X>, <42>) +define(<S9X>, <43>) +define(<S10X>, <44>) +define(<S11X>, <45>) +define(<S12X>, <46>) +define(<S13X>, <47>) +define(<S14X>, <48>) +define(<S15X>, <49>) + +C ZERO vector register is used in place of RoundKey +C for vncipher instruction because the order of InvMixColumns +C and Xor processes are flipped in that instruction. +C The Xor process with RoundKey is executed afterward. 
+define(<ZERO>, <18>) + +.file "aes-decrypt-internal.asm" + +.machine "any" + +IF_LE(<.abiversion 2>) +.text + + C _aes_decrypt(unsigned rounds, const uint32_t *keys, + C const struct aes_table *T, + C size_t length, uint8_t *dst, + C uint8_t *src) + +IF_LE(<.align 5>) +PROLOGUE(_nettle_aes_decrypt) +IF_LE(<.localentry _nettle_aes_decrypt,0>) + + vxor ZERO,ZERO,ZERO + + ld 5,.swap_mask@got(TOCP) + lvx swap_mask,0,5 + + subi ROUNDS,ROUNDS,1 + srdi LENGTH,LENGTH,4 + + srdi 5,LENGTH,4 #16x loop count + cmpldi 5,0 + beq L8x + + std 17,-120(SP); + std 18,-112(SP); + std 19,-104(SP); + std 20,-96(SP); + std 21,-88(SP); + std 22,-80(SP); + std 23,-72(SP); + std 24,-64(SP); + std 25,-56(SP); + std 26,-48(SP); + std 27,-40(SP); + std 28,-32(SP); + std 29,-24(SP); + std 30,-16(SP); + std 31,-8(SP); + + li 17,0x10 + li 18,0x20 + li 19,0x30 + li 20,0x40 + li 21,0x50 + li 22,0x60 + li 23,0x70 + li 24,0x80 + li 25,0x90 + li 26,0xA0 + li 27,0xB0 + li 28,0xC0 + li 29,0xD0 + li 30,0xE0 + li 31,0xF0 + +.align 5 +Lx16_loop: + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + lxvd2x S1X,17,SRC + lxvd2x S2X,18,SRC + lxvd2x S3X,19,SRC + lxvd2x S4X,20,SRC + lxvd2x S5X,21,SRC + lxvd2x S6X,22,SRC + lxvd2x S7X,23,SRC + lxvd2x S8X,24,SRC + lxvd2x S9X,25,SRC + lxvd2x S10X,26,SRC + lxvd2x S11X,27,SRC + lxvd2x S12X,28,SRC + lxvd2x S13X,29,SRC + lxvd2x S14X,30,SRC + lxvd2x S15X,31,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask + vperm S8,S8,S8,swap_mask + vperm S9,S9,S9,swap_mask + vperm S10,S10,S10,swap_mask + vperm S11,S11,S11,swap_mask + vperm S12,S12,S12,swap_mask + vperm S13,S13,S13,swap_mask + vperm S14,S14,S14,swap_mask + vperm S15,S15,S15,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + vxor S8,S8,K + vxor S9,S9,K + vxor S10,S10,K + vxor S11,S11,K + vxor S12,S12,K + vxor S13,S13,K + vxor S14,S14,K + vxor S15,S15,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L16x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vncipher S1,S1,ZERO + vncipher S2,S2,ZERO + vncipher S3,S3,ZERO + vncipher S4,S4,ZERO + vncipher S5,S5,ZERO + vncipher S6,S6,ZERO + vncipher S7,S7,ZERO + vncipher S8,S8,ZERO + vncipher S9,S9,ZERO + vncipher S10,S10,ZERO + vncipher S11,S11,ZERO + vncipher S12,S12,ZERO + vncipher S13,S13,ZERO + vncipher S14,S14,ZERO + vncipher S15,S15,ZERO + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + vxor S8,S8,K + vxor S9,S9,K + vxor S10,S10,K + vxor S11,S11,K + vxor S12,S12,K + vxor S13,S13,K + vxor S14,S14,K + vxor S15,S15,K + addi 10,10,0x10 + bdnz L16x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + vncipherlast S1,S1,K + vncipherlast S2,S2,K + vncipherlast S3,S3,K + vncipherlast S4,S4,K + vncipherlast S5,S5,K + vncipherlast S6,S6,K + vncipherlast S7,S7,K + vncipherlast S8,S8,K + vncipherlast S9,S9,K + vncipherlast S10,S10,K + vncipherlast S11,S11,K + vncipherlast S12,S12,K + vncipherlast S13,S13,K + vncipherlast S14,S14,K + vncipherlast S15,S15,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask + vperm S8,S8,S8,swap_mask + vperm 
S9,S9,S9,swap_mask + vperm S10,S10,S10,swap_mask + vperm S11,S11,S11,swap_mask + vperm S12,S12,S12,swap_mask + vperm S13,S13,S13,swap_mask + vperm S14,S14,S14,swap_mask + vperm S15,S15,S15,swap_mask>) + + stxvd2x S0X,0,DST + stxvd2x S1X,17,DST + stxvd2x S2X,18,DST + stxvd2x S3X,19,DST + stxvd2x S4X,20,DST + stxvd2x S5X,21,DST + stxvd2x S6X,22,DST + stxvd2x S7X,23,DST + stxvd2x S8X,24,DST + stxvd2x S9X,25,DST + stxvd2x S10X,26,DST + stxvd2x S11X,27,DST + stxvd2x S12X,28,DST + stxvd2x S13X,29,DST + stxvd2x S14X,30,DST + stxvd2x S15X,31,DST + + addi SRC,SRC,0x100 + addi DST,DST,0x100 + subic. 5,5,1 + bne Lx16_loop + + ld 17,-120(SP); + ld 18,-112(SP); + ld 19,-104(SP); + ld 20,-96(SP); + ld 21,-88(SP); + ld 22,-80(SP); + ld 23,-72(SP); + ld 24,-64(SP); + ld 25,-56(SP); + ld 26,-48(SP); + ld 27,-40(SP); + ld 28,-32(SP); + ld 29,-24(SP); + ld 30,-16(SP); + ld 31,-8(SP); + + clrldi LENGTH,LENGTH,60 + +L8x: + srdi 5,LENGTH,3 + cmpldi 5,0 + beq L4x + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + addi 9,9,0x10 + lxvd2x S2X,9,SRC + addi 9,9,0x10 + lxvd2x S3X,9,SRC + addi 9,9,0x10 + lxvd2x S4X,9,SRC + addi 9,9,0x10 + lxvd2x S5X,9,SRC + addi 9,9,0x10 + lxvd2x S6X,9,SRC + addi 9,9,0x10 + lxvd2x S7X,9,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L8x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vncipher S1,S1,ZERO + vncipher S2,S2,ZERO + vncipher S3,S3,ZERO + vncipher S4,S4,ZERO + vncipher S5,S5,ZERO + vncipher S6,S6,ZERO + vncipher S7,S7,ZERO + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + addi 10,10,0x10 + bdnz L8x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + vncipherlast S1,S1,K + vncipherlast S2,S2,K + vncipherlast S3,S3,K + vncipherlast S4,S4,K + vncipherlast S5,S5,K + vncipherlast S6,S6,K + vncipherlast S7,S7,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask>) + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + addi 9,9,0x10 + stxvd2x S2X,9,DST + addi 9,9,0x10 + stxvd2x S3X,9,DST + addi 9,9,0x10 + stxvd2x S4X,9,DST + addi 9,9,0x10 + stxvd2x S5X,9,DST + addi 9,9,0x10 + stxvd2x S6X,9,DST + addi 9,9,0x10 + stxvd2x S7X,9,DST + + addi SRC,SRC,0x80 + addi DST,DST,0x80 + + clrldi LENGTH,LENGTH,61 + +L4x: + srdi 5,LENGTH,2 + cmpldi 5,0 + beq L2x + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + addi 9,9,0x10 + lxvd2x S2X,9,SRC + addi 9,9,0x10 + lxvd2x S3X,9,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L4x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vncipher S1,S1,ZERO + vncipher S2,S2,ZERO + vncipher S3,S3,ZERO + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + addi 10,10,0x10 + bdnz L4x_round_loop + + lxvd2x KX,10,KEYS + 
vperm K,K,K,swap_mask + vncipherlast S0,S0,K + vncipherlast S1,S1,K + vncipherlast S2,S2,K + vncipherlast S3,S3,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask>) + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + addi 9,9,0x10 + stxvd2x S2X,9,DST + addi 9,9,0x10 + stxvd2x S3X,9,DST + + addi SRC,SRC,0x40 + addi DST,DST,0x40 + + clrldi LENGTH,LENGTH,62 + +L2x: + srdi 5,LENGTH,1 + cmpldi 5,0 + beq L1x + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L2x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vncipher S1,S1,ZERO + vxor S0,S0,K + vxor S1,S1,K + addi 10,10,0x10 + bdnz L2x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + vncipherlast S1,S1,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask>) + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + + addi SRC,SRC,0x20 + addi DST,DST,0x20 + + clrldi LENGTH,LENGTH,63 + +L1x: + cmpldi LENGTH,0 + beq Ldone + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask>) + + vxor S0,S0,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L1x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vxor S0,S0,K + addi 10,10,0x10 + bdnz L1x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + +IF_LE(<vperm S0,S0,S0,swap_mask>) + + stxvd2x S0X,0,DST + +Ldone: + blr +EPILOGUE(_nettle_aes_decrypt) + + .data + .align 4 +.swap_mask: +IF_LE(<.byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7>) +IF_BE(<.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12>) diff --git a/powerpc64/aes-encrypt-internal.asm b/powerpc64/aes-encrypt-internal.asm new file mode 100644 index 00000000..20711598 --- /dev/null +++ b/powerpc64/aes-encrypt-internal.asm @@ -0,0 +1,545 @@ +C powerpc64/aes-encrypt-internal.asm + +ifelse(< + Copyright (C) 2020 Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+>) + +C Register usage: + +define(<SP>, <1>) +define(<TOCP>, <2>) + +define(<ROUNDS>, <3>) +define(<KEYS>, <4>) +define(<LENGTH>, <6>) +define(<DST>, <7>) +define(<SRC>, <8>) + +define(<swap_mask>, <0>) + +define(<K>, <1>) +define(<S0>, <2>) +define(<S1>, <3>) +define(<S2>, <4>) +define(<S3>, <5>) +define(<S4>, <6>) +define(<S5>, <7>) +define(<S6>, <8>) +define(<S7>, <9>) +define(<S8>, <10>) +define(<S9>, <11>) +define(<S10>, <12>) +define(<S11>, <13>) +define(<S12>, <14>) +define(<S13>, <15>) +define(<S14>, <16>) +define(<S15>, <17>) + +define(<KX>, <33>) +define(<S0X>, <34>) +define(<S1X>, <35>) +define(<S2X>, <36>) +define(<S3X>, <37>) +define(<S4X>, <38>) +define(<S5X>, <39>) +define(<S6X>, <40>) +define(<S7X>, <41>) +define(<S8X>, <42>) +define(<S9X>, <43>) +define(<S10X>, <44>) +define(<S11X>, <45>) +define(<S12X>, <46>) +define(<S13X>, <47>) +define(<S14X>, <48>) +define(<S15X>, <49>) + +.file "aes-encrypt-internal.asm" + +.machine "any" + +IF_LE(<.abiversion 2>) +.text + + C _aes_encrypt(unsigned rounds, const uint32_t *keys, + C const struct aes_table *T, + C size_t length, uint8_t *dst, + C uint8_t *src) + +IF_LE(<.align 5>) +PROLOGUE(_nettle_aes_encrypt) +IF_LE(<.localentry _nettle_aes_encrypt,0>) + + ld 5,.swap_mask@got(TOCP) + lvx swap_mask,0,5 + + subi ROUNDS,ROUNDS,1 + srdi LENGTH,LENGTH,4 + + srdi 5,LENGTH,4 #16x loop count + cmpldi 5,0 + beq L8x + + std 17,-120(SP); + std 18,-112(SP); + std 19,-104(SP); + std 20,-96(SP); + std 21,-88(SP); + std 22,-80(SP); + std 23,-72(SP); + std 24,-64(SP); + std 25,-56(SP); + std 26,-48(SP); + std 27,-40(SP); + std 28,-32(SP); + std 29,-24(SP); + std 30,-16(SP); + std 31,-8(SP); + + li 17,0x10 + li 18,0x20 + li 19,0x30 + li 20,0x40 + li 21,0x50 + li 22,0x60 + li 23,0x70 + li 24,0x80 + li 25,0x90 + li 26,0xA0 + li 27,0xB0 + li 28,0xC0 + li 29,0xD0 + li 30,0xE0 + li 31,0xF0 + +.align 5 +Lx16_loop: + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + lxvd2x S1X,17,SRC + lxvd2x S2X,18,SRC + lxvd2x S3X,19,SRC + lxvd2x S4X,20,SRC + lxvd2x S5X,21,SRC + lxvd2x S6X,22,SRC + lxvd2x S7X,23,SRC + lxvd2x S8X,24,SRC + lxvd2x S9X,25,SRC + lxvd2x S10X,26,SRC + lxvd2x S11X,27,SRC + lxvd2x S12X,28,SRC + lxvd2x S13X,29,SRC + lxvd2x S14X,30,SRC + lxvd2x S15X,31,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask + vperm S8,S8,S8,swap_mask + vperm S9,S9,S9,swap_mask + vperm S10,S10,S10,swap_mask + vperm S11,S11,S11,swap_mask + vperm S12,S12,S12,swap_mask + vperm S13,S13,S13,swap_mask + vperm S14,S14,S14,swap_mask + vperm S15,S15,S15,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + vxor S8,S8,K + vxor S9,S9,K + vxor S10,S10,K + vxor S11,S11,K + vxor S12,S12,K + vxor S13,S13,K + vxor S14,S14,K + vxor S15,S15,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L16x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + vcipher S1,S1,K + vcipher S2,S2,K + vcipher S3,S3,K + vcipher S4,S4,K + vcipher S5,S5,K + vcipher S6,S6,K + vcipher S7,S7,K + vcipher S8,S8,K + vcipher S9,S9,K + vcipher S10,S10,K + vcipher S11,S11,K + vcipher S12,S12,K + vcipher S13,S13,K + vcipher S14,S14,K + vcipher S15,S15,K + addi 10,10,0x10 + bdnz L16x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + vcipherlast S1,S1,K + vcipherlast S2,S2,K + vcipherlast S3,S3,K + 
vcipherlast S4,S4,K + vcipherlast S5,S5,K + vcipherlast S6,S6,K + vcipherlast S7,S7,K + vcipherlast S8,S8,K + vcipherlast S9,S9,K + vcipherlast S10,S10,K + vcipherlast S11,S11,K + vcipherlast S12,S12,K + vcipherlast S13,S13,K + vcipherlast S14,S14,K + vcipherlast S15,S15,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask + vperm S8,S8,S8,swap_mask + vperm S9,S9,S9,swap_mask + vperm S10,S10,S10,swap_mask + vperm S11,S11,S11,swap_mask + vperm S12,S12,S12,swap_mask + vperm S13,S13,S13,swap_mask + vperm S14,S14,S14,swap_mask + vperm S15,S15,S15,swap_mask>) + + stxvd2x S0X,0,DST + stxvd2x S1X,17,DST + stxvd2x S2X,18,DST + stxvd2x S3X,19,DST + stxvd2x S4X,20,DST + stxvd2x S5X,21,DST + stxvd2x S6X,22,DST + stxvd2x S7X,23,DST + stxvd2x S8X,24,DST + stxvd2x S9X,25,DST + stxvd2x S10X,26,DST + stxvd2x S11X,27,DST + stxvd2x S12X,28,DST + stxvd2x S13X,29,DST + stxvd2x S14X,30,DST + stxvd2x S15X,31,DST + + addi SRC,SRC,0x100 + addi DST,DST,0x100 + subic. 5,5,1 + bne Lx16_loop + + ld 17,-120(SP); + ld 18,-112(SP); + ld 19,-104(SP); + ld 20,-96(SP); + ld 21,-88(SP); + ld 22,-80(SP); + ld 23,-72(SP); + ld 24,-64(SP); + ld 25,-56(SP); + ld 26,-48(SP); + ld 27,-40(SP); + ld 28,-32(SP); + ld 29,-24(SP); + ld 30,-16(SP); + ld 31,-8(SP); + + clrldi LENGTH,LENGTH,60 + +L8x: + srdi 5,LENGTH,3 + cmpldi 5,0 + beq L4x + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + addi 9,9,0x10 + lxvd2x S2X,9,SRC + addi 9,9,0x10 + lxvd2x S3X,9,SRC + addi 9,9,0x10 + lxvd2x S4X,9,SRC + addi 9,9,0x10 + lxvd2x S5X,9,SRC + addi 9,9,0x10 + lxvd2x S6X,9,SRC + addi 9,9,0x10 + lxvd2x S7X,9,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L8x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + vcipher S1,S1,K + vcipher S2,S2,K + vcipher S3,S3,K + vcipher S4,S4,K + vcipher S5,S5,K + vcipher S6,S6,K + vcipher S7,S7,K + addi 10,10,0x10 + bdnz L8x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + vcipherlast S1,S1,K + vcipherlast S2,S2,K + vcipherlast S3,S3,K + vcipherlast S4,S4,K + vcipherlast S5,S5,K + vcipherlast S6,S6,K + vcipherlast S7,S7,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask>) + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + addi 9,9,0x10 + stxvd2x S2X,9,DST + addi 9,9,0x10 + stxvd2x S3X,9,DST + addi 9,9,0x10 + stxvd2x S4X,9,DST + addi 9,9,0x10 + stxvd2x S5X,9,DST + addi 9,9,0x10 + stxvd2x S6X,9,DST + addi 9,9,0x10 + stxvd2x S7X,9,DST + + addi SRC,SRC,0x80 + addi DST,DST,0x80 + + clrldi LENGTH,LENGTH,61 + +L4x: + srdi 5,LENGTH,2 + cmpldi 5,0 + beq L2x + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + addi 9,9,0x10 + lxvd2x S2X,9,SRC + addi 9,9,0x10 + lxvd2x S3X,9,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask>) + + 
vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L4x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + vcipher S1,S1,K + vcipher S2,S2,K + vcipher S3,S3,K + addi 10,10,0x10 + bdnz L4x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + vcipherlast S1,S1,K + vcipherlast S2,S2,K + vcipherlast S3,S3,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask>) + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + addi 9,9,0x10 + stxvd2x S2X,9,DST + addi 9,9,0x10 + stxvd2x S3X,9,DST + + addi SRC,SRC,0x40 + addi DST,DST,0x40 + + clrldi LENGTH,LENGTH,62 + +L2x: + srdi 5,LENGTH,1 + cmpldi 5,0 + beq L1x + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L2x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + vcipher S1,S1,K + addi 10,10,0x10 + bdnz L2x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + vcipherlast S1,S1,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask>) + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + + addi SRC,SRC,0x20 + addi DST,DST,0x20 + + clrldi LENGTH,LENGTH,63 + +L1x: + cmpldi LENGTH,0 + beq Ldone + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask>) + + vxor S0,S0,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L1x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + addi 10,10,0x10 + bdnz L1x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + +IF_LE(<vperm S0,S0,S0,swap_mask>) + + stxvd2x S0X,0,DST + +Ldone: + blr +EPILOGUE(_nettle_aes_encrypt) + + .data + .align 4 +.swap_mask: +IF_LE(<.byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7>) +IF_BE(<.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12>) diff --git a/powerpc64/gcm-hash8.asm b/powerpc64/gcm-hash8.asm new file mode 100644 index 00000000..fc9ce739 --- /dev/null +++ b/powerpc64/gcm-hash8.asm @@ -0,0 +1,1015 @@ +C powerpc64/gcm-hash8.asm + +ifelse(< + Copyright (C) 2020 Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+>) + +C Register usage: +C VSX instructions is used to load and store data to memory "lxvd2x, stxvd2x" +C instead of VR instructions "lvx, stvx" as a workaround to access unaligned data +C VSX registers are defined with "X" suffix + +define(<SP>, <1>) +define(<TOCP>, <2>) + +define(<TABLE>, <3>) +define(<X>, <4>) +define(<LENGTH>, <5>) +define(<DATA>, <6>) + +define(<zero>, <0>) +define(<swap_mask>, <1>) +define(<hidw_mask>, <2>) +define(<lodw_mask>, <3>) +define(<poly>, <4>) +define(<poly_h>, <4>) +define(<poly_l>, <5>) +define(<RP>, <6>) +define(<Mh>, <7>) +define(<Ml>, <8>) +define(<H>, <9>) +define(<Hh>, <10>) +define(<Hl>, <11>) +define(<RP2>, <9>) +define(<M2h>, <10>) +define(<M2l>, <11>) + +define(<HX>, <41>) +define(<HhX>, <42>) +define(<HlX>, <43>) +define(<H_HhX>, <44>) +define(<H_HX>, <45>) +define(<H_HlX>, <46>) + +define(<sl1>, <1>) +define(<msb>, <5>) +define(<H2>, <6>) +define(<H2h>, <7>) +define(<H2l>, <8>) +define(<H_h>, <12>) +define(<H_m>, <13>) +define(<H_l>, <14>) +define(<H_Hh>, <12>) +define(<H_H>, <13>) +define(<H_Hl>, <14>) +define(<H_t>, <15>) +define(<H2_h>, <16>) +define(<H2_m>, <17>) +define(<H2_l>, <18>) +define(<H2_t>, <19>) + +define(<C0X>, <38>) +define(<C1X>, <39>) +define(<C2X>, <40>) +define(<C3X>, <44>) +define(<C4X>, <38>) +define(<C5X>, <39>) +define(<C6X>, <40>) +define(<C7X>, <44>) + +define(<CX>, <45>) + +define(<C0>, <6>) +define(<C1>, <7>) +define(<C2>, <8>) +define(<C3>, <12>) +define(<C4>, <6>) +define(<C5>, <7>) +define(<C6>, <8>) +define(<C7>, <12>) + +define(<C>, <13>) + +define(<Ch>, <14>) +define(<Cl>, <15>) +define(<Cm>, <16>) + +define(<C01h>, <14>) +define(<C01l>, <15>) +define(<C01>, <16>) +define(<C23h>, <17>) +define(<C23l>, <18>) +define(<C23>, <19>) +define(<C45h>, <20>) +define(<C45l>, <21>) +define(<C45>, <22>) +define(<C67h>, <6>) +define(<C67l>, <7>) +define(<C67>, <8>) + +define(<H21>, <9>) +define(<H21h>, <10>) +define(<H21l>, <11>) +define(<H43>, <23>) +define(<H43h>, <24>) +define(<H43l>, <25>) +define(<H65>, <26>) +define(<H65h>, <27>) +define(<H65l>, <28>) +define(<H87>, <29>) +define(<H87h>, <30>) +define(<H87l>, <31>) + +define(<H21X>, <41>) +define(<H21hX>, <42>) +define(<H21lX>, <43>) +define(<H43X>, <55>) +define(<H43hX>, <56>) +define(<H43lX>, <57>) +define(<H65X>, <58>) +define(<H65hX>, <59>) +define(<H65lX>, <60>) +define(<H87X>, <61>) +define(<H87hX>, <62>) +define(<H87lX>, <63>) + +# gcm_fill registers: + +define(<CTR>, <3>) +define(<BLOCKS>, <4>) +define(<BUFFER>, <5>) + +define(<CTR0>, <2>) +define(<CTR0S>, <3>) +define(<CTR1>, <4>) +define(<CTR2>, <5>) +define(<CTR3>, <6>) +define(<CTR4>, <7>) +define(<CTR5>, <8>) +define(<CTR6>, <9>) +define(<CTR7>, <10>) + +define(<CTR0X>, <34>) +define(<CTR0SX>, <35>) +define(<CTR1X>, <36>) +define(<CTR2X>, <37>) +define(<CTR3X>, <38>) +define(<CTR4X>, <39>) +define(<CTR5X>, <40>) +define(<CTR6X>, <41>) +define(<CTR7X>, <42>) + +define(<I1>, <11>) +define(<I2>, <12>) +define(<I3>, <13>) +define(<I4>, <14>) +define(<I5>, <15>) +define(<I6>, <16>) +define(<I7>, <17>) +define(<I8>, <18>) + +.file "gcm-hash8.asm" + +.machine "any" + +IF_LE(<.abiversion 2>) +.text + + # void gcm_init_key (union gcm_block *table) + +IF_LE(<.align 5>) +PROLOGUE(_nettle_gcm_init_key8) +IF_LE(<.localentry _nettle_gcm_init_key8,0>) + + ld 7,.polynomial@got(TOCP) + lvx poly,0,7 +IF_LE(<ld 7,.swap_mask@got(TOCP) + lvx swap_mask,0,7>) + ld 7,.hidw_mask@got(TOCP) + lvx hidw_mask,0,7 + ld 7,.lodw_mask@got(TOCP) + lvx lodw_mask,0,7 + + li 10,0x800 + lxvd2x HX,10,TABLE # load H +IF_LE(<vperm 
H,H,H,swap_mask>) + + # --- calculate H = H shift left 1 modulo polynomial --- + + vupkhsw msb,H # most significant bit word-extend + vspltisb sl1,1 # splat 1 for shift left + vspltw msb,msb,0 # most significant bit extend + vsl H,H,sl1 # H shift left 1 + vand msb,msb,poly + vxor zero,zero,zero + vxor H_t,H,msb + + vsldoi H,H_t,H_t,8 # doubleword swap + vsldoi Hh,H,zero,8 + vsldoi Hl,zero,H,8 + + # --- calculate H^2 = H*H --- + + # reduction pre-processing + vsldoi poly_h,zero,poly,8 + vsldoi poly_l,poly_h,poly_h,8 + + # polynomial multiplication "classical" + vpmsumd H_h,H_t,Hh # H^1h*H^1h + vpmsumd H_l,H_t,Hl # H^1l*H^1l + vpmsumd H_m,H_t,H # H^1h*H^1l⊕H^1l*H^1h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] + vsldoi Ml,H_m,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor H_h,H_h,Mh # [2] + vxor H_l,H_l,Ml # [2] + vxor H_l,H_l,RP # [1] + + # reduction second phase + vpmsumd RP,H_l,poly_l + vxor H_h,H_l,H_h + vxor H2_t,H_h,RP + + vsldoi H2,H2_t,H2_t,8 + vsldoi H2h,H2,zero,8 + vsldoi H2l,zero,H2,8 + + # --- calculate [H^2.Hi⊕H^2.Lo:H^1.Hi⊕H^1.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store H,[H^2.Hi⊕H^2.Lo:H^1.Hi⊕H^1.Lo] --- + + li 8,0x00 + li 9,0x100 + li 10,0x200 + stxvd2x HlX,8,TABLE + stxvd2x HX,9,TABLE + stxvd2x HhX,10,TABLE + + li 8,0x300 + li 9,0x400 + li 10,0x500 + stxvd2x H_HhX,8,TABLE + stxvd2x H_HX,9,TABLE + stxvd2x H_HlX,10,TABLE + + # --- calculate H^3,H^4 --- + + # polynomial multiplication "classical" + vpmsumd H_l,H_t,H2l # H^1l*H^2l + vpmsumd H_m,H_t,H2 # H^1h*H^2l⊕H^1l*H^2h + vpmsumd H_h,H_t,H2h # H^1h*H^2h + vpmsumd H2_l,H2_t,H2l # H^2l*H^2l + vpmsumd H2_m,H2_t,H2 # H^2h*H^2l⊕H^2l*H^2h + vpmsumd H2_h,H2_t,H2h # H^2h*H^2h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] H^3 + vpmsumd RP2,H2_l,poly_h # [1] H^4 + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] H^3 + vsldoi M2h,zero,H2_m,8 # [2] H^4 + vsldoi Ml,H_m,zero,8 # [2] H^3 + vsldoi M2l,H2_m,zero,8 # [2] H^4 + vsldoi RP,RP,RP,8 # [1] H^3 + vsldoi RP2,RP2,RP2,8 # [1] H^4 + vxor H_h,H_h,Mh # [2] H^3 + vxor H2_h,H2_h,M2h # [2] H^4 + vxor H_l,H_l,Ml # [2] H^3 + vxor H2_l,H2_l,M2l # [2] H^4 + vxor H_l,H_l,RP # [1] H^3 + vxor H2_l,H2_l,RP2 # [1] H^4 + + # reduction second phase + vpmsumd RP,H_l,poly_l # H^3 + vpmsumd RP2,H2_l,poly_l # H^4 + vxor H_h,H_l,H_h # H^3 + vxor H2_h,H2_l,H2_h # H^4 + vxor H_h,H_h,RP # H^3 + vxor H2_h,H2_h,RP2 # H^4 + + vsldoi H2,H2_h,H2_h,8 # H^4 + vsldoi H,H_h,H_h,8 # H^3 + vsldoi H2l,zero,H2,8 # H^4 + vsldoi H2h,H2,zero,8 # H^4 + + # --- calculate [H^4.Hi⊕H^4.Lo:H^3.Hi⊕H^3.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store [H^4.Hi⊕H^4.Lo:H^3.Hi⊕H^3.Lo] --- + + li 8,0x600 + li 9,0x700 + li 10,0x800 + stxvd2x H_HhX,8,TABLE + stxvd2x H_HX,9,TABLE + stxvd2x H_HlX,10,TABLE + + # --- calculate H^5,H^6 --- + + # polynomial multiplication "classical" + vpmsumd H_l,H_t,H2l # H^1l*H^4l + vpmsumd H_m,H_t,H2 # H^1h*H^4l⊕H^1l*H^4h + vpmsumd H_h,H_t,H2h # H^1h*H^4h + vpmsumd H2_l,H2_t,H2l # H^2l*H^4l + vpmsumd H2_m,H2_t,H2 # H^2h*H^4l⊕H^2l*H^4h + vpmsumd H2_h,H2_t,H2h # H^2h*H^4h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] H^5 + vpmsumd RP2,H2_l,poly_h # [1] H^6 + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] H^5 + vsldoi M2h,zero,H2_m,8 # [2] H^6 + vsldoi Ml,H_m,zero,8 # [2] H^5 + vsldoi M2l,H2_m,zero,8 # 
[2] H^6 + vsldoi RP,RP,RP,8 # [1] H^5 + vsldoi RP2,RP2,RP2,8 # [1] H^6 + vxor H_h,H_h,Mh # [2] H^5 + vxor H2_h,H2_h,M2h # [2] H^6 + vxor H_l,H_l,Ml # [2] H^5 + vxor H2_l,H2_l,M2l # [2] H^6 + vxor H_l,H_l,RP # [1] H^5 + vxor H2_l,H2_l,RP2 # [1] H^6 + + # reduction second phase + vpmsumd RP,H_l,poly_l # H^5 + vpmsumd RP2,H2_l,poly_l # H^6 + vxor H_h,H_l,H_h # H^5 + vxor H2_h,H2_l,H2_h # H^6 + vxor H_h,H_h,RP # H^5 + vxor H2_h,H2_h,RP2 # H^6 + + vsldoi H2,H2_h,H2_h,8 # H^6 + vsldoi H,H_h,H_h,8 # H^5 + vsldoi H2l,zero,H2,8 # H^6 + vsldoi H2h,H2,zero,8 # H^6 + + # --- calculate [H^6.Hi⊕H^6.Lo:H^5.Hi⊕H^5.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store [H^6.Hi⊕H^6.Lo:H^5.Hi⊕H^5.Lo] --- + + li 8,0x900 + li 9,0xA00 + li 10,0xB00 + stxvd2x H_HhX,8,TABLE + stxvd2x H_HX,9,TABLE + stxvd2x H_HlX,10,TABLE + + # --- calculate H^7,H^8 --- + + # polynomial multiplication "classical" + vpmsumd H_l,H_t,H2l # H^1l*H^6l + vpmsumd H_m,H_t,H2 # H^1h*H^6l⊕H^1l*H^6h + vpmsumd H_h,H_t,H2h # H^1h*H^6h + vpmsumd H2_l,H2_t,H2l # H^2l*H^6l + vpmsumd H2_m,H2_t,H2 # H^2h*H^6l⊕H^2l*H^6h + vpmsumd H2_h,H2_t,H2h # H^2h*H^6h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] H^7 + vpmsumd RP2,H2_l,poly_h # [1] H^8 + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] H^7 + vsldoi M2h,zero,H2_m,8 # [2] H^8 + vsldoi Ml,H_m,zero,8 # [2] H^7 + vsldoi M2l,H2_m,zero,8 # [2] H^8 + vsldoi RP,RP,RP,8 # [1] H^7 + vsldoi RP2,RP2,RP2,8 # [1] H^8 + vxor H_h,H_h,Mh # [2] H^7 + vxor H2_h,H2_h,M2h # [2] H^8 + vxor H_l,H_l,Ml # [2] H^7 + vxor H2_l,H2_l,M2l # [2] H^8 + vxor H_l,H_l,RP # [1] H^7 + vxor H2_l,H2_l,RP2 # [1] H^8 + + # reduction second phase + vpmsumd RP,H_l,poly_l # H^7 + vpmsumd RP2,H2_l,poly_l # H^8 + vxor H_h,H_l,H_h # H^7 + vxor H2_h,H2_l,H2_h # H^8 + vxor H_h,H_h,RP # H^7 + vxor H2_h,H2_h,RP2 # H^8 + + vsldoi H,H_h,H_h,8 # H^7 + vsldoi H2,H2_h,H2_h,8 # H^8 + + # --- calculate [H^8.Hi⊕H^8.Lo:H^7.Hi⊕H^7.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store [H^8.Hi⊕H^8.Lo:H^7.Hi⊕H^7.Lo] --- + + li 8,0xC00 + li 9,0xD00 + li 10,0xE00 + stxvd2x H_HhX,8,TABLE + stxvd2x H_HX,9,TABLE + stxvd2x H_HlX,10,TABLE + + blr +EPILOGUE(_nettle_gcm_init_key8) + + # void gcm_hash (const struct gcm_key *key, union gcm_block *x, + # size_t length, const uint8_t *data) + +IF_LE(<.align 5>) +PROLOGUE(_nettle_gcm_hash8) +IF_LE(<.localentry _nettle_gcm_hash8,0>) + + vxor zero,zero,zero + + ld 7,.polynomial@got(TOCP) + lvx poly,0,7 +IF_LE(<ld 7,.swap_mask@got(TOCP) + lvx swap_mask,0,7>) + ld 7,.hidw_mask@got(TOCP) + lvx hidw_mask,0,7 + ld 7,.lodw_mask@got(TOCP) + lvx lodw_mask,0,7 + + vsldoi poly_h,zero,poly,8 + vsldoi poly_l,poly_h,poly_h,8 + + lxvd2x CX,0,X # load X +IF_LE(<vperm C,C,C,swap_mask>) + + srdi 7,LENGTH,7 # 8x loop count + cmpldi 7,0 + beq L2x + + # backup registers + stdu SP,-224(SP) + std 28,216(SP) + std 29,208(SP) + std 30,200(SP) + std 31,192(SP) + li 8,176 + stvx 20,8,SP + subi 8,8,16 + stvx 21,8,SP + subi 8,8,16 + stvx 22,8,SP + subi 8,8,16 + stvx 23,8,SP + subi 8,8,16 + stvx 24,8,SP + subi 8,8,16 + stvx 25,8,SP + subi 8,8,16 + stvx 26,8,SP + subi 8,8,16 + stvx 27,8,SP + subi 8,8,16 + stvx 28,8,SP + subi 8,8,16 + stvx 29,8,SP + subi 8,8,16 + stvx 30,8,SP + subi 8,8,16 + stvx 31,8,SP + + # table loading + li 8,0x300 + li 9,0x400 + li 10,0x500 + lxvd2x H21hX,8,TABLE + lxvd2x H21X,9,TABLE + lxvd2x H21lX,10,TABLE + li 8,0x600 + li 9,0x700 + li 10,0x800 + lxvd2x H43hX,8,TABLE + lxvd2x 
H43X,9,TABLE + lxvd2x H43lX,10,TABLE + li 8,0x900 + li 9,0xA00 + li 10,0xB00 + lxvd2x H65hX,8,TABLE + lxvd2x H65X,9,TABLE + lxvd2x H65lX,10,TABLE + li 8,0xC00 + li 9,0xD00 + li 10,0xE00 + lxvd2x H87hX,8,TABLE + lxvd2x H87X,9,TABLE + lxvd2x H87lX,10,TABLE + + li 8,0x10 + li 9,0x20 + li 10,0x30 + li 28,0x40 + li 29,0x50 + li 30,0x60 + li 31,0x70 + + mtctr 7 +.align 5 +L8x_loop: + # input loading + lxvd2x C0X,0,DATA # load C0 + lxvd2x C1X,8,DATA # load C1 + lxvd2x C2X,9,DATA # load C2 + lxvd2x C3X,10,DATA # load C3 + + # swap permuting +IF_LE(<vperm C0,C0,C0,swap_mask + vperm C1,C1,C1,swap_mask + vperm C2,C2,C2,swap_mask + vperm C3,C3,C3,swap_mask>) + + # previous digest combining + vxor C0,C0,C + + # polynomial multiplication "karatsuba" pre-processing + vperm C23h,C2,C3,hidw_mask + vperm C23l,C2,C3,lodw_mask + vperm C01h,C0,C1,hidw_mask + vperm C01l,C0,C1,lodw_mask + + # input loading + lxvd2x C4X,28,DATA # load C4 + lxvd2x C5X,29,DATA # load C5 + lxvd2x C6X,30,DATA # load C6 + lxvd2x C7X,31,DATA # load C7 + + # swap permuting +IF_LE(<vperm C4,C4,C4,swap_mask + vperm C5,C5,C5,swap_mask + vperm C6,C6,C6,swap_mask + vperm C7,C7,C7,swap_mask>) + + # polynomial multiplication "karatsuba" pre-processing + vperm C45h,C4,C5,hidw_mask + vperm C45l,C4,C5,lodw_mask + vperm C67h,C6,C7,hidw_mask + vperm C67l,C6,C7,lodw_mask + vxor C23,C23h,C23l + vxor C01,C01h,C01l + vxor C45,C45h,C45l + vxor C67,C67h,C67l + + # polynomial multiplication "karatsuba" + vpmsumd C23h,C23h,H65h # H23 = H^6h*C2h⊕H^5h*C3h + vpmsumd C23l,C23l,H65l # L23 = H^6l*C2l⊕H^5l*C3l + vpmsumd C01h,C01h,H87h # H01 = H^8h*C0h⊕H^7h*C1h + vpmsumd C01l,C01l,H87l # L01 = H^8l*C0l⊕H^7l*C1l + vpmsumd C67h,C67h,H21h # H67 = H^2h*C6h⊕H^1h*C7h + vpmsumd C67l,C67l,H21l # L67 = H^2l*C6l⊕H^1l*C7l + vpmsumd C45h,C45h,H43h # H45 = H^4h*C4h⊕H^3h*C5h + vpmsumd C45l,C45l,H43l # L45 = H^4l*C4l⊕H^3l*C5l + vpmsumd C23,C23,H65 # M23 = (H^6h⊕H^5h)*(C2h⊕C3h)⊕(H^6l⊕H^5l)*(C2l⊕C3l) + vpmsumd C01,C01,H87 # M01 = (H^8h⊕H^7h)*(C0h⊕C1h)⊕(H^8l⊕H^7l)*(C0l⊕C1l) + vpmsumd C45,C45,H43 # M45 = (H^4h⊕H^3h)*(C4h⊕C5h)⊕(H^4l⊕H^3l)*(C4l⊕C5l) + vpmsumd C67,C67,H21 # M67 = (H^2h⊕H^1h)*(C6h⊕C7h)⊕(H^2l⊕H^1l)*(C6l⊕C7l) + + # polynomial multiplication "karatsuba" post-processing + vxor C23,C23,C23h + vxor C01,C01,C01h + vxor C45,C45,C45h + vxor C67,C67,C67h + vxor C23,C23,C23l + vxor C01,C01,C01l + vxor C45,C45,C45l + vxor C67,C67,C67l + + # deferred recombination of partial products + vxor C01h,C01h,C23h # H0 = H01⊕H23 + vxor C45h,C45h,C67h # H1 = H45⊕H67 + vxor C01l,C01l,C23l # L0 = L01⊕L23 + vxor C45l,C45l,C67l # L1 = L45⊕L45 + vxor C01,C01,C23 # M0 = M01⊕M23 + vxor C45,C45,C67 # M1 = M45⊕M45 + vxor C01h,C01h,C45h # H = H0⊕H1 + vxor C01l,C01l,C45l # L = L0⊕L1 + vxor C01,C01,C45 # M = M0⊕M1 + + # reduction first phase # [1] + vpmsumd RP,C01l,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,C01,8 # [2] + vsldoi Ml,C01,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor C01h,C01h,Mh # [2] + vxor C01l,C01l,Ml # [2] + vxor C01l,C01l,RP # [1] + + # reduction second phase + vpmsumd RP,C01l,poly_l + vxor C01h,C01l,C01h + vxor C,C01h,RP + + addi DATA,DATA,0x80 + bdnz L8x_loop + + # restore registers + li 8,0 + lvx 31,8,SP + addi 8,8,16 + lvx 30,8,SP + addi 8,8,16 + lvx 29,8,SP + addi 8,8,16 + lvx 28,8,SP + addi 8,8,16 + lvx 27,8,SP + addi 8,8,16 + lvx 26,8,SP + addi 8,8,16 + lvx 25,8,SP + addi 8,8,16 + lvx 24,8,SP + addi 8,8,16 + lvx 23,8,SP + addi 8,8,16 + lvx 22,8,SP + addi 8,8,16 + lvx 21,8,SP + addi 8,8,16 + lvx 20,8,SP + ld 31,192(SP) + ld 30,200(SP) + ld 
29,208(SP) + ld 28,216(SP) + addi SP,SP,224 + + clrldi LENGTH,LENGTH,57 +L2x: + srdi 7,LENGTH,5 + cmpldi 7,0 + beq L1x + + # table loading + li 8,0x300 + li 9,0x400 + li 10,0x500 + lxvd2x H21hX,8,TABLE + lxvd2x H21X,9,TABLE + lxvd2x H21lX,10,TABLE + + li 10,0x10 + + mtctr 7 +.align 5 +L2x_loop: + # input loading + lxvd2x C0X,0,DATA # load C0 + lxvd2x C1X,10,DATA # load C1 + + # swap permuting +IF_LE(<vperm C0,C0,C0,swap_mask + vperm C1,C1,C1,swap_mask>) + + # previous digest combining + vxor C0,C0,C + + # polynomial multiplication "karatsuba" pre-processing + vperm C01h,C0,C1,hidw_mask + vperm C01l,C0,C1,lodw_mask + vxor C01,C01h,C01l + + # polynomial multiplication "karatsuba" + vpmsumd C01h,C01h,H21h # H01 = H^2h*C0h⊕H^1h*C1h + vpmsumd C01l,C01l,H21l # L01 = H^2l*C0l⊕H^1l*C1l + vpmsumd C01,C01,H21 # M01 = (H^2h⊕H^1h)*(C0h⊕C1h)⊕(H^2l⊕H^1l)*(C0l⊕C1l) + + # polynomial multiplication "karatsuba" post-processing + vxor C01,C01,C01h + vxor C01,C01,C01l + + # reduction first phase # [1] + vpmsumd RP,C01l,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,C01,8 # [2] + vsldoi Ml,C01,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor C01h,C01h,Mh # [2] + vxor C01l,C01l,Ml # [2] + vxor C01l,C01l,RP # [1] + + # reduction second phase + vpmsumd RP,C01l,poly_l + vxor C01h,C01l,C01h + vxor C,C01h,RP + + addi DATA,DATA,0x20 + bdnz L2x_loop + + clrldi LENGTH,LENGTH,59 +L1x: + srdi 7,LENGTH,4 + cmpldi 7,0 + beq Lrem + + # table loading + li 9,0x100 + li 10,0x200 + lxvd2x HlX,0,TABLE + lxvd2x HX, 9,TABLE + lxvd2x HhX,10,TABLE + + # input loading + lxvd2x C0X,0,DATA # load C0 + + # swap permuting +IF_LE(<vperm C0,C0,C0,swap_mask>) + + # previous digest combining + vxor C0,C0,C + + vpmsumd Cl,C0,Hl # L = Hl*Cl + vpmsumd Cm,C0,H # M = Hh*Cl⊕Hl*Ch + vpmsumd Ch,C0,Hh # H = Hh*Ch + + # reduction first phase # [1] + vpmsumd RP,Cl,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,Cm,8 # [2] + vsldoi Ml,Cm,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor Ch,Ch,Mh # [2] + vxor Cl,Cl,Ml # [2] + vxor Cl,Cl,RP # [1] + + # reduction second phase + vpmsumd RP,Cl,poly_l + vxor Ch,Cl,Ch + vxor C,Ch,RP + + addi DATA,DATA,0x10 + clrldi LENGTH,LENGTH,60 +Lrem: + cmpldi LENGTH,0 + beq Ldone + + # table loading + li 9,0x100 + li 10,0x200 + lxvd2x HlX,0,TABLE + lxvd2x HX, 9,TABLE + lxvd2x HhX,10,TABLE + + # input loading + stdu SP,-16(SP) + stvx zero,0,SP +Lst_loop: + subic. 
LENGTH,LENGTH,1 + lbzx 7,LENGTH,DATA + stbx 7,LENGTH,SP + bne Lst_loop + lxvd2x C0X,0,SP + addi SP,SP,16 + + # swap permuting +IF_LE(<vperm C0,C0,C0,swap_mask>) + + # previous digest combining + vxor C0,C0,C + + vpmsumd Cl,C0,Hl # L = Hl*Cl + vpmsumd Cm,C0,H # M = Hh*Cl⊕Hl*Ch + vpmsumd Ch,C0,Hh # H = Hh*Ch + + # reduction first phase # [1] + vpmsumd RP,Cl,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,Cm,8 # [2] + vsldoi Ml,Cm,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor Ch,Ch,Mh # [2] + vxor Cl,Cl,Ml # [2] + vxor Cl,Cl,RP # [1] + + # reduction second phase + vpmsumd RP,Cl,poly_l + vxor Ch,Cl,Ch + vxor C,Ch,RP + +Ldone: +IF_LE(<vperm C,C,C,swap_mask>) + stxvd2x CX,0,X # store C + blr +EPILOGUE(_nettle_gcm_hash8) + + # gcm_fill (uint8_t *ctr, size_t blocks, union gcm_block *buffer) + +IF_LE(<.align 5>) +PROLOGUE(_nettle_gcm_fill) +IF_LE(<.localentry _nettle_gcm_fill,0>) + +IF_LE(<ld 6,.swap_mask@got(TOCP) + lvx swap_mask,0,6>) + + vxor zero,zero,zero + vspltisb I1,1 + vspltisb I2,2 + vspltisb I3,3 + vspltisb I4,4 + vspltisb I5,5 + vspltisb I6,6 + vspltisb I7,7 + vspltisb I8,8 + vsldoi I1,zero,I1,1 + vsldoi I2,zero,I2,1 + vsldoi I3,zero,I3,1 + vsldoi I4,zero,I4,1 + vsldoi I5,zero,I5,1 + vsldoi I6,zero,I6,1 + vsldoi I7,zero,I7,1 + vsldoi I8,zero,I8,1 + + lxvd2x CTR0X,0,CTR + IF_LE(<vperm CTR0,CTR0,CTR0,swap_mask>) + + srdi 6,BLOCKS,3 # 8x loop count + cmpldi 6,0 + beq Lfill_4x + + std 25,-56(SP); + std 26,-48(SP); + std 27,-40(SP); + std 28,-32(SP); + std 29,-24(SP); + std 30,-16(SP); + std 31,-8(SP); + + li 25,0x10 + li 26,0x20 + li 27,0x30 + li 28,0x40 + li 29,0x50 + li 30,0x60 + li 31,0x70 + + mtctr 6 + L8x_fill_loop: + vadduwm CTR1,CTR0,I1 + vadduwm CTR2,CTR0,I2 + vadduwm CTR3,CTR0,I3 + vadduwm CTR4,CTR0,I4 + vadduwm CTR5,CTR0,I5 + vadduwm CTR6,CTR0,I6 + vadduwm CTR7,CTR0,I7 + + IF_LE(<vperm CTR0S,CTR0,CTR0,swap_mask + vperm CTR1,CTR1,CTR1,swap_mask + vperm CTR2,CTR2,CTR2,swap_mask + vperm CTR3,CTR3,CTR3,swap_mask + vperm CTR4,CTR4,CTR4,swap_mask + vperm CTR5,CTR5,CTR5,swap_mask + vperm CTR6,CTR6,CTR6,swap_mask + vperm CTR7,CTR7,CTR7,swap_mask>) + + IF_LE(<stxvd2x CTR0SX,0,BUFFER>) + IF_BE(<stxvd2x CTR0X,0,BUFFER>) + stxvd2x CTR1X,25,BUFFER + stxvd2x CTR2X,26,BUFFER + stxvd2x CTR3X,27,BUFFER + stxvd2x CTR4X,28,BUFFER + stxvd2x CTR5X,29,BUFFER + stxvd2x CTR6X,30,BUFFER + stxvd2x CTR7X,31,BUFFER + + vadduwm CTR0,CTR0,I8 + addi BUFFER,BUFFER,0x80 + bdnz L8x_fill_loop + + ld 25,-56(SP); + ld 26,-48(SP); + ld 27,-40(SP); + ld 28,-32(SP); + ld 29,-24(SP); + ld 30,-16(SP); + ld 31,-8(SP); + + clrldi BLOCKS,BLOCKS,61 + + Lfill_4x: + srdi 6,BLOCKS,2 + cmpldi 6,0 + beq Lfill_2x + + li 8,0x10 + li 9,0x20 + li 10,0x30 + + vadduwm CTR1,CTR0,I1 + vadduwm CTR2,CTR0,I2 + vadduwm CTR3,CTR0,I3 + + IF_LE(<vperm CTR0S,CTR0,CTR0,swap_mask + vperm CTR1,CTR1,CTR1,swap_mask + vperm CTR2,CTR2,CTR2,swap_mask + vperm CTR3,CTR3,CTR3,swap_mask>) + + IF_LE(<stxvd2x CTR0SX,0,BUFFER>) + IF_BE(<stxvd2x CTR0X,0,BUFFER>) + stxvd2x CTR1X,8,BUFFER + stxvd2x CTR2X,9,BUFFER + stxvd2x CTR3X,10,BUFFER + + vadduwm CTR0,CTR0,I4 + addi BUFFER,BUFFER,0x40 + + clrldi BLOCKS,BLOCKS,62 + + Lfill_2x: + srdi 6,BLOCKS,1 + cmpldi 6,0 + beq Lfill_1x + + li 10,0x10 + + vadduwm CTR1,CTR0,I1 + + IF_LE(<vperm CTR0S,CTR0,CTR0,swap_mask + vperm CTR1,CTR1,CTR1,swap_mask>) + + IF_LE(<stxvd2x CTR0SX,0,BUFFER>) + IF_BE(<stxvd2x CTR0X,0,BUFFER>) + stxvd2x CTR1X,10,BUFFER + + vadduwm CTR0,CTR0,I2 + addi BUFFER,BUFFER,0x20 + + clrldi BLOCKS,BLOCKS,63 + + Lfill_1x: + cmpldi BLOCKS,0 + beq Lfill_done + + IF_LE(<vperm 
CTR0S,CTR0,CTR0,swap_mask>) + + IF_LE(<stxvd2x CTR0SX,0,BUFFER>) + IF_BE(<stxvd2x CTR0X,0,BUFFER>) + + vadduwm CTR0,CTR0,I1 + + Lfill_done: + IF_LE(<vperm CTR0,CTR0,CTR0,swap_mask>) + stxvd2x CTR0X,0,CTR + + blr +EPILOGUE(_nettle_gcm_fill) + + .data +IF_LE(<.align 4 +.polynomial: + .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 + .align 4 +.swap_mask: + .byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7 + .align 4 +.hidw_mask: + .byte 23,22,21,20,19,18,17,16,7,6,5,4,3,2,1,0 + .align 4 +.lodw_mask: + .byte 31,30,29,28,27,26,25,24,15,14,13,12,11,10,9,8>) +IF_BE(<.align 4 +.polynomial: + .byte 0xc2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 + .align 4 +.hidw_mask: + .byte 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23 + .align 4 +.lodw_mask: + .byte 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31>) diff --git a/powerpc64/machine.m4 b/powerpc64/machine.m4 new file mode 100644 index 00000000..e69de29b diff --git a/testsuite/gcm-test.c b/testsuite/gcm-test.c index c8174019..df1fc94a 100644 --- a/testsuite/gcm-test.c +++ b/testsuite/gcm-test.c @@ -170,6 +170,29 @@ test_main(void) "16aedbf5a0de6a57a637b39b"), SHEX("619cc5aefffe0bfa462af43c1699d050"));
+  /* Test 128 bytes */
+  test_aead(&nettle_gcm_aes128, NULL,
+	    SHEX("feffe9928665731c6d6a8f9467308308"),
+	    SHEX(""),
+	    SHEX("d9313225f88406e5a55909c5aff5269a"
+		 "86a7a9531534f7da2e4c303d8a318a72"
+		 "1c3c0c95956809532fcf0e2449a6b525"
+		 "b16aedf5aa0de657ba637b391aafd255"
+		 "5ae376bc5e9f6a1b08e34db7a6ee0736"
+		 "9ba662ea12f6f197e6bc3ed69d2480f3"
+		 "ea5691347f2ba69113eb37910ebc18c8"
+		 "0f697234582016fa956ca8f63ae6b473"),
+	    SHEX("42831ec2217774244b7221b784d0d49c"
+		 "e3aa212f2c02a4e035c17e2329aca12e"
+		 "21d514b25466931c7d8f6a5aac84aa05"
+		 "1ba30b396a0aac973d58e091473f5985"
+		 "874b1178906ddbeab04ab2fe6cce8c57"
+		 "8d7e961bd13fd6a8c56b66ca5e576492"
+		 "1a48cd8bda04e66343e73055118b69b9"
+		 "ced486813846958a11e602c03cfc232b"),
+	    SHEX("cafebabefacedbaddecaf888"),
+	    SHEX("796836f1246c9d735c5e1be0a715ccc3"));
+
   /* Test case 7 */
   test_aead(&nettle_gcm_aes192, NULL,
 	    SHEX("00000000000000000000000000000000"
On Tue, Jun 30, 2020 at 5:14 AM Maamoun TK maamoun.tk@googlemail.com wrote:
Patch implementation benchmark for GCM_AES (Tested on POWER8): little-endian:
- Encrypt x~17.5 of nettle C implementation
- Decrypt x~17.5 of nettle C implementation
- Update x~30 of nettle C implementation
big-endian:
- Encrypt x~18.5 of nettle C implementation
- Decrypt x~18.5 of nettle C implementation
- Update x~28.5 of nettle C implementation
...
One small comment for aes_encrypt and aes_decrypt... src and dst are usually user supplied buffers. Using lxvd2x to load a vector may produce incorrect results if the user is feeding a stream to an encryptor or decryptor that is not naturally aligned to that of an unsigned int. (On the other hand, Nettle controls the round keys array so lxvd2x should be fine.)
Instead of lxvd2x and friends for the user's buffers you should consider using lvx and doing the lvsl thing to fix the data in the registers.
Jeff
On Tue, Jun 30, 2020 at 5:29 AM Jeffrey Walton noloader@gmail.com wrote:
In fact, you might want to add a test case like this:
uint8_t plain[19] = {0, 1, ..., 17, 18};
uint8_t cipher[16], recover[16];
Then send plain, plain+1, plain+2 and plain+3 into the encryptor and see if it round trips. lxvd2x will choke even on POWER9 because two of the tests will not even be naturally aligned for a byte.
Jeff
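For reference, a minimal round-trip check along the lines Jeff sketches could look like this (untested illustration; it assumes nettle's public gcm_aes128 interface from <nettle/gcm.h>, and the all-zero key/nonce and the single 16-byte block are arbitrary choices, not a documented test vector):

  #include <string.h>
  #include <nettle/gcm.h>

  /* Encrypt 16 bytes starting at plain+0..plain+3, so most of the source
     pointers are not 16-byte aligned, and check that decryption recovers
     the input.  Returns nonzero on success. */
  static int
  unaligned_roundtrip (void)
  {
    static const uint8_t key[16] = { 0 };          /* AES-128 key */
    static const uint8_t nonce[GCM_IV_SIZE] = { 0 };
    uint8_t plain[19], cipher[16], recover[16];
    unsigned offset, i;

    for (i = 0; i < sizeof (plain); i++)
      plain[i] = i;

    for (offset = 0; offset < 4; offset++)
      {
        struct gcm_aes128_ctx ctx;

        gcm_aes128_set_key (&ctx, key);
        gcm_aes128_set_iv (&ctx, GCM_IV_SIZE, nonce);
        gcm_aes128_encrypt (&ctx, 16, cipher, plain + offset);

        gcm_aes128_set_key (&ctx, key);
        gcm_aes128_set_iv (&ctx, GCM_IV_SIZE, nonce);
        gcm_aes128_decrypt (&ctx, 16, recover, cipher);

        if (memcmp (recover, plain + offset, 16) != 0)
          return 0;
      }
    return 1;
  }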
On Tue, Jun 30, 2020 at 7:55 AM Maamoun TK maamoun.tk@googlemail.com wrote:
I tested something similar, I tried to load data at address 0xXXXXXXX1 using lxvd2x and it loaded it properly.
I think you should reconsider.
Consider, even Andy Polyakov uses lvx/lvsl when loading [potentially] unaligned buffers on POWER8: https://github.com/dot-asm/cryptogams/blob/master/ppc/aesp8-ppc.pl. Andy uses it for the user's key and data.
Jeff
From POWER ISA 2.07 B:
- Load Vector Indexed X-form
  lvx VRT,RA,RB
  VRT ← MEM(EA & 0xFFFF_FFFF_FFFF_FFF0, 16)

- Load VSX Vector Doubleword*2 Indexed XX1-form
  lxvd2x XT,RA,RB
  VSR[XT]{0:63} ← MEM(EA,8)
  VSR[XT]{64:127} ← MEM(EA+8,8)
lxvd2x doesn't force the least significant 4 bits of the effective address to zero. That alone isn't enough to prove that lxvd2x loads unaligned data properly, so I also tested the instruction on unaligned data and it did succeed.
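In C-like terms, the difference in effective-address handling quoted above is roughly the following (a simplified model only; it ignores the doubleword element ordering that lxvd2x produces on little-endian, which is why the patch pairs it with vperm, and the model_* names are invented for illustration):

  #include <stdint.h>
  #include <string.h>

  /* lvx clears the low 4 bits of the effective address, so it loads the
     aligned quadword containing EA; recovering the bytes at an unaligned
     EA then needs the lvsl/vperm fix-up. */
  static void
  model_lvx (uint8_t vrt[16], const uint8_t *ea)
  {
    const uint8_t *aligned = (const uint8_t *) ((uintptr_t) ea & ~(uintptr_t) 0xF);
    memcpy (vrt, aligned, 16);
  }

  /* lxvd2x uses EA as given and loads two doublewords from it, so an
     unaligned EA still fetches the intended 16 bytes. */
  static void
  model_lxvd2x (uint8_t xt[16], const uint8_t *ea)
  {
    memcpy (xt, ea, 8);         /* VSR[XT]{0:63}   <- MEM(EA,8)   */
    memcpy (xt + 8, ea + 8, 8); /* VSR[XT]{64:127} <- MEM(EA+8,8) */
  }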
Andy Polyakov did use lxvd2x to load the user's buffers in his ghash implementation for ppc: https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl He uses lvx_u, which I guess stands for lvx_unaligned; the Perl wrapper translates that instruction to lxvd2x before passing it to the assembler.
regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
--- a/asm.m4
+++ b/asm.m4
@@ -30,13 +30,26 @@ COFF_STYLE, yes,
 define(<GMP_NUMB_BITS>,<>)dnl

 define(<PROLOGUE>,
+<ifelse(ASM_POWERPC64,no,
 <.globl C_NAME($1)
 DECLARE_FUNC(C_NAME($1))
-C_NAME($1): ASM_X86_ENDBR>)
+C_NAME($1): ASM_X86_ENDBR>,
+<.globl C_NAME($1)
+DECLARE_FUNC(C_NAME($1))
+.section ".opd","aw"
+.align 3
+C_NAME($1):
+.quad .C_NAME($1),.TOC.@tocbase,0
+.previous
+.align 5
+.C_NAME($1):>)>)

 define(<EPILOGUE>,
 <ifelse(ELF_STYLE,yes,
-<.size C_NAME($1), . - C_NAME($1)>,<>)>)
+<ifelse(ASM_POWERPC64,no,
+<.size C_NAME($1), . - C_NAME($1)>,
+<.size .C_NAME($1), . - .C_NAME($1)
+.size C_NAME($1), . - .C_NAME($1)>)>,<>)>)
Can you explain a bit more what's special with the powerpc64 prologue and epilogue? What are the .C_NAME($1) symbols?
If possible, definitions could be moved to powerpc64/machine.m4 instead, avoiding ifelse(ASM_POWERPC64,...).
It would also be nice with a powerpc64/README summarizing register usage and calling conventions.
Regards, /Niels
On Tue, Jun 30, 2020 at 11:06 PM Niels Möller nisse@lysator.liu.se wrote:
Can you explain a bit more what's special with the powerpc64 prologue
and epilogue? What are the .C_NAME($1) symbols?
This is ELFv1 ABI stuff that is still required to get the functions working on PowerPC64 big-endian systems. PowerPC64 little-endian systems use the ELFv2 ABI, which has the classical prologue along with other features.
If possible, definitions could be moved to powerpc64/machine.m4 instead,
avoiding ifelse(ASM_POWERPC64,...).
Is it possible to override the PROLOGUE definition in powerpc64/machine.m4?
It would also be nice with a powerpc64/README summarizing register usage
and calling conventions.
I will follow the READMEs of the other architectures when writing this one.
regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
Is it possible to override the PROLOGUE definition in powerpc64/machine.m4?
The m4 files are processed in the order asm.m4, machine.m4, config.m4 (not sure why config.m4 is last).
So machine.m4 can override definitions in asm.m4. Or, if you don't want to override all of PROLOGUE, you could define some helper macros which expand to empty by default, and override them instead. A bit like ASM_X86_ENDBR, which is used unconditionally in the definition of PROLOGUE, but expands to the empty string when not enabled by configure.
Regards, /Niels