Makefile.in | 2 +- configure.ac | 5 + gcm.c | 19 +- powerpc64le/aes-decrypt-internal.asm (new +x) | 573 +++++++++++++++ powerpc64le/aes-encrypt-internal.asm (new +x) | 534 ++++++++++++++ powerpc64le/gcm-hash8.asm (new +x) | 992 ++++++++++++++++++++++++++ powerpc64le/machine.m4 (new +x) | 0 testsuite/gcm-test.c | 23 + 8 files changed, 2146 insertions(+), 2 deletions(-) create mode 100644 powerpc64le/aes-decrypt-internal.asm create mode 100644 powerpc64le/aes-encrypt-internal.asm create mode 100644 powerpc64le/gcm-hash8.asm create mode 100644 powerpc64le/machine.m4
diff --git a/Makefile.in b/Makefile.in index 64ff1001..5bbc0f79 100644 --- a/Makefile.in +++ b/Makefile.in @@ -603,7 +603,7 @@ distdir: $(DISTFILES) done set -e; for d in sparc32 sparc64 x86 \ x86_64 x86_64/aesni x86_64/sha_ni x86_64/fat \ - arm arm/neon arm/v6 arm/fat ; do \ + arm arm/neon arm/v6 arm/fat powerpc64le ; do \ mkdir "$(distdir)/$$d" ; \ find "$(srcdir)/$$d" -maxdepth 1 '(' -name '*.asm' -o -name '*.m4' ')' \ -exec cp '{}' "$(distdir)/$$d" ';' ; \ diff --git a/configure.ac b/configure.ac index 90ea1ea8..1ea54ce8 100644 --- a/configure.ac +++ b/configure.ac @@ -435,6 +435,9 @@ if test "x$enable_assembler" = xyes ; then esac fi ;; + *powerpc64le*) + asm_path=powerpc64le + ;; *) enable_assembler=no ;; @@ -572,7 +575,9 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_ecc_secp384r1_redc #undef HAVE_NATIVE_ecc_secp521r1_modp #undef HAVE_NATIVE_ecc_secp521r1_redc +#undef HAVE_NATIVE_gcm_init_key8 #undef HAVE_NATIVE_gcm_hash8 +#undef HAVE_NATIVE_gcm_fill #undef HAVE_NATIVE_salsa20_core #undef HAVE_NATIVE_sha1_compress #undef HAVE_NATIVE_sha256_compress diff --git a/gcm.c b/gcm.c index cf615daf..809c03bc 100644 --- a/gcm.c +++ b/gcm.c @@ -140,6 +140,12 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table) memcpy (x->b, Z.b, sizeof(Z)); } # elif GCM_TABLE_BITS == 8 +# if HAVE_NATIVE_gcm_init_key8 + +#define gcm_init_key _nettle_gcm_init_key8 +void +_nettle_gcm_init_key8 (union nettle_block16 *table); +# endif /* HAVE_NATIVE_gcm_init_key8 */ # if HAVE_NATIVE_gcm_hash8
#define gcm_hash _nettle_gcm_hash8 @@ -225,6 +231,13 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table)
#endif /* GCM_TABLE_BITS */
+#if HAVE_NATIVE_gcm_fill + +#define gcm_fill _nettle_gcm_fill +void +_nettle_gcm_fill (uint8_t *ctr, size_t blocks, union nettle_block16 *buffer); +#endif /* HAVE_NATIVE_gcm_fill */ + /* Increment the rightmost 32 bits. */ #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4)
@@ -245,7 +258,9 @@ gcm_set_key(struct gcm_key *key, memset(key->h[0].b, 0, GCM_BLOCK_SIZE); f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b);
-#if GCM_TABLE_BITS +#ifdef gcm_init_key + gcm_init_key(key->h); +#elif GCM_TABLE_BITS /* Algorithm 3 from the gcm paper. First do powers of two, then do the rest by adding. */ while (i /= 2) @@ -333,6 +348,7 @@ gcm_update(struct gcm_ctx *ctx, const struct gcm_key *key, ctx->auth_size += length; }
+#ifndef gcm_fill static nettle_fill16_func gcm_fill; static void gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer) @@ -349,6 +365,7 @@ gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
WRITE_UINT32(ctr + GCM_BLOCK_SIZE - 4, c); } +#endif /* !gcm_fill */
void gcm_encrypt (struct gcm_ctx *ctx, const struct gcm_key *key, diff --git a/powerpc64le/aes-decrypt-internal.asm b/powerpc64le/aes-decrypt-internal.asm new file mode 100644 index 00000000..bde34779 --- /dev/null +++ b/powerpc64le/aes-decrypt-internal.asm @@ -0,0 +1,573 @@ +C powerpc64le/aes-decrypt-internal.asm + +ifelse(< + Copyright (C) 2020 Mamone Tarsha + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+>) + +C Register usage: + +define(<SP>, <1>) +define(<TOCP>, <2>) + +define(<ROUNDS>, <3>) +define(<KEYS>, <4>) +define(<LENGTH>, <6>) +define(<DST>, <7>) +define(<SRC>, <8>) + +define(<swap_mask>, <0>) + +define(<K>, <1>) +define(<S0>, <2>) +define(<S1>, <3>) +define(<S2>, <4>) +define(<S3>, <5>) +define(<S4>, <6>) +define(<S5>, <7>) +define(<S6>, <8>) +define(<S7>, <9>) +define(<S8>, <10>) +define(<S9>, <11>) +define(<S10>, <12>) +define(<S11>, <13>) +define(<S12>, <14>) +define(<S13>, <15>) +define(<S14>, <16>) +define(<S15>, <17>) + +define(<KX>, <33>) +define(<S0X>, <34>) +define(<S1X>, <35>) +define(<S2X>, <36>) +define(<S3X>, <37>) +define(<S4X>, <38>) +define(<S5X>, <39>) +define(<S6X>, <40>) +define(<S7X>, <41>) +define(<S8X>, <42>) +define(<S9X>, <43>) +define(<S10X>, <44>) +define(<S11X>, <45>) +define(<S12X>, <46>) +define(<S13X>, <47>) +define(<S14X>, <48>) +define(<S15X>, <49>) + +C The ZERO vector register is passed to vncipher in place of the round key +C because the order of the InvMixColumns and XOR steps +C is flipped in that instruction. +C The XOR with the round key is executed afterward. 
+define(<ZERO>, <18>) + + .file "aes-decrypt-internal.asm" + + C _aes_decrypt(unsigned rounds, const uint32_t *keys, + C const struct aes_table *T, + C size_t length, uint8_t *dst, + C uint8_t *src) + + .text +.align 5 +PROLOGUE(_nettle_aes_decrypt) + vxor ZERO,ZERO,ZERO + + ld 5,.swap_mask@got(TOCP) + lvx swap_mask,0,5 + + subi ROUNDS,ROUNDS,1 + srdi LENGTH,LENGTH,4 + + srdi 5,LENGTH,4 # 16x loop count + cmpldi 5,0 + beq L8x + + std 17,-120(SP); + std 18,-112(SP); + std 19,-104(SP); + std 20,-96(SP); + std 21,-88(SP); + std 22,-80(SP); + std 23,-72(SP); + std 24,-64(SP); + std 25,-56(SP); + std 26,-48(SP); + std 27,-40(SP); + std 28,-32(SP); + std 29,-24(SP); + std 30,-16(SP); + std 31,-8(SP); + + li 17,0x10 + li 18,0x20 + li 19,0x30 + li 20,0x40 + li 21,0x50 + li 22,0x60 + li 23,0x70 + li 24,0x80 + li 25,0x90 + li 26,0xA0 + li 27,0xB0 + li 28,0xC0 + li 29,0xD0 + li 30,0xE0 + li 31,0xF0 + +.align 5 +Lx16_loop: + lxvd2x KX,0,KEYS + + lxvd2x S0X,0,SRC + lxvd2x S1X,17,SRC + lxvd2x S2X,18,SRC + lxvd2x S3X,19,SRC + lxvd2x S4X,20,SRC + lxvd2x S5X,21,SRC + lxvd2x S6X,22,SRC + lxvd2x S7X,23,SRC + lxvd2x S8X,24,SRC + lxvd2x S9X,25,SRC + lxvd2x S10X,26,SRC + lxvd2x S11X,27,SRC + lxvd2x S12X,28,SRC + lxvd2x S13X,29,SRC + lxvd2x S14X,30,SRC + lxvd2x S15X,31,SRC + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + vxor S8,S8,K + vxor S9,S9,K + vxor S10,S10,K + vxor S11,S11,K + vxor S12,S12,K + vxor S13,S13,K + vxor S14,S14,K + vxor S15,S15,K + + vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask + vperm S8,S8,S8,swap_mask + vperm S9,S9,S9,swap_mask + vperm S10,S10,S10,swap_mask + vperm S11,S11,S11,swap_mask + vperm S12,S12,S12,swap_mask + vperm S13,S13,S13,swap_mask + vperm S14,S14,S14,swap_mask + vperm S15,S15,S15,swap_mask + + mtctr ROUNDS + li 
10,0x10 +.align 5 +L16x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vncipher S1,S1,ZERO + vncipher S2,S2,ZERO + vncipher S3,S3,ZERO + vncipher S4,S4,ZERO + vncipher S5,S5,ZERO + vncipher S6,S6,ZERO + vncipher S7,S7,ZERO + vncipher S8,S8,ZERO + vncipher S9,S9,ZERO + vncipher S10,S10,ZERO + vncipher S11,S11,ZERO + vncipher S12,S12,ZERO + vncipher S13,S13,ZERO + vncipher S14,S14,ZERO + vncipher S15,S15,ZERO + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + vxor S8,S8,K + vxor S9,S9,K + vxor S10,S10,K + vxor S11,S11,K + vxor S12,S12,K + vxor S13,S13,K + vxor S14,S14,K + vxor S15,S15,K + addi 10,10,0x10 + bdnz L16x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + vncipherlast S1,S1,K + vncipherlast S2,S2,K + vncipherlast S3,S3,K + vncipherlast S4,S4,K + vncipherlast S5,S5,K + vncipherlast S6,S6,K + vncipherlast S7,S7,K + vncipherlast S8,S8,K + vncipherlast S9,S9,K + vncipherlast S10,S10,K + vncipherlast S11,S11,K + vncipherlast S12,S12,K + vncipherlast S13,S13,K + vncipherlast S14,S14,K + vncipherlast S15,S15,K + + vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask + vperm S8,S8,S8,swap_mask + vperm S9,S9,S9,swap_mask + vperm S10,S10,S10,swap_mask + vperm S11,S11,S11,swap_mask + vperm S12,S12,S12,swap_mask + vperm S13,S13,S13,swap_mask + vperm S14,S14,S14,swap_mask + vperm S15,S15,S15,swap_mask + + stxvd2x S0X,0,DST + stxvd2x S1X,17,DST + stxvd2x S2X,18,DST + stxvd2x S3X,19,DST + stxvd2x S4X,20,DST + stxvd2x S5X,21,DST + stxvd2x S6X,22,DST + stxvd2x S7X,23,DST + stxvd2x S8X,24,DST + stxvd2x S9X,25,DST + stxvd2x S10X,26,DST + stxvd2x S11X,27,DST + stxvd2x S12X,28,DST + stxvd2x S13X,29,DST + stxvd2x S14X,30,DST + stxvd2x S15X,31,DST + + addi SRC,SRC,0x100 + addi 
DST,DST,0x100 + subic. 5,5,1 + bne Lx16_loop + + ld 17,-120(SP); + ld 18,-112(SP); + ld 19,-104(SP); + ld 20,-96(SP); + ld 21,-88(SP); + ld 22,-80(SP); + ld 23,-72(SP); + ld 24,-64(SP); + ld 25,-56(SP); + ld 26,-48(SP); + ld 27,-40(SP); + ld 28,-32(SP); + ld 29,-24(SP); + ld 30,-16(SP); + ld 31,-8(SP); + + clrldi LENGTH,LENGTH,60 + +L8x: + srdi 5,LENGTH,3 + cmpldi 5,0 + beq L4x + + lxvd2x KX,0,KEYS + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + addi 9,9,0x10 + lxvd2x S2X,9,SRC + addi 9,9,0x10 + lxvd2x S3X,9,SRC + addi 9,9,0x10 + lxvd2x S4X,9,SRC + addi 9,9,0x10 + lxvd2x S5X,9,SRC + addi 9,9,0x10 + lxvd2x S6X,9,SRC + addi 9,9,0x10 + lxvd2x S7X,9,SRC + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + + vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask + + mtctr ROUNDS + li 10,0x10 +.align 5 +L8x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vncipher S1,S1,ZERO + vncipher S2,S2,ZERO + vncipher S3,S3,ZERO + vncipher S4,S4,ZERO + vncipher S5,S5,ZERO + vncipher S6,S6,ZERO + vncipher S7,S7,ZERO + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + addi 10,10,0x10 + bdnz L8x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + vncipherlast S1,S1,K + vncipherlast S2,S2,K + vncipherlast S3,S3,K + vncipherlast S4,S4,K + vncipherlast S5,S5,K + vncipherlast S6,S6,K + vncipherlast S7,S7,K + + vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + addi 9,9,0x10 + stxvd2x S2X,9,DST + addi 9,9,0x10 + 
stxvd2x S3X,9,DST + addi 9,9,0x10 + stxvd2x S4X,9,DST + addi 9,9,0x10 + stxvd2x S5X,9,DST + addi 9,9,0x10 + stxvd2x S6X,9,DST + addi 9,9,0x10 + stxvd2x S7X,9,DST + + addi SRC,SRC,0x80 + addi DST,DST,0x80 + + clrldi LENGTH,LENGTH,61 + +L4x: + srdi 5,LENGTH,2 + cmpldi 5,0 + beq L2x + + lxvd2x KX,0,KEYS + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + addi 9,9,0x10 + lxvd2x S2X,9,SRC + addi 9,9,0x10 + lxvd2x S3X,9,SRC + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + + vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + + mtctr ROUNDS + li 10,0x10 +.align 5 +L4x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vncipher S1,S1,ZERO + vncipher S2,S2,ZERO + vncipher S3,S3,ZERO + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + addi 10,10,0x10 + bdnz L4x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + vncipherlast S1,S1,K + vncipherlast S2,S2,K + vncipherlast S3,S3,K + + vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + addi 9,9,0x10 + stxvd2x S2X,9,DST + addi 9,9,0x10 + stxvd2x S3X,9,DST + + addi SRC,SRC,0x40 + addi DST,DST,0x40 + + clrldi LENGTH,LENGTH,62 + +L2x: + srdi 5,LENGTH,1 + cmpldi 5,0 + beq L1x + + lxvd2x KX,0,KEYS + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + + vxor S0,S0,K + vxor S1,S1,K + + vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + + mtctr ROUNDS + li 10,0x10 +.align 5 +L2x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vncipher S1,S1,ZERO + vxor S0,S0,K + vxor S1,S1,K + addi 10,10,0x10 + bdnz L2x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + vncipherlast S1,S1,K + + vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + + addi SRC,SRC,0x20 + addi DST,DST,0x20 + 
+ clrldi LENGTH,LENGTH,63 + +L1x: + cmpldi LENGTH,0 + beq Ldone + + lxvd2x KX,0,KEYS + + lxvd2x S0X,0,SRC + + vxor S0,S0,K + + vperm S0,S0,S0,swap_mask + + mtctr ROUNDS + li 10,0x10 +.align 5 +L1x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vxor S0,S0,K + addi 10,10,0x10 + bdnz L1x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + + vperm S0,S0,S0,swap_mask + + stxvd2x S0X,0,DST + +Ldone: + blr +EPILOGUE(_nettle_aes_decrypt) + + .data + .align 4 +.swap_mask: + .byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7 diff --git a/powerpc64le/aes-encrypt-internal.asm b/powerpc64le/aes-encrypt-internal.asm new file mode 100644 index 00000000..1bbd86a8 --- /dev/null +++ b/powerpc64le/aes-encrypt-internal.asm @@ -0,0 +1,534 @@ +C powerpc64le/aes-encrypt-internal.asm + +ifelse(< + Copyright (C) 2020 Mamone Tarsha + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+>) + +C Register usage: + +define(<SP>, <1>) +define(<TOCP>, <2>) + +define(<ROUNDS>, <3>) +define(<KEYS>, <4>) +define(<LENGTH>, <6>) +define(<DST>, <7>) +define(<SRC>, <8>) + +define(<swap_mask>, <0>) + +define(<K>, <1>) +define(<S0>, <2>) +define(<S1>, <3>) +define(<S2>, <4>) +define(<S3>, <5>) +define(<S4>, <6>) +define(<S5>, <7>) +define(<S6>, <8>) +define(<S7>, <9>) +define(<S8>, <10>) +define(<S9>, <11>) +define(<S10>, <12>) +define(<S11>, <13>) +define(<S12>, <14>) +define(<S13>, <15>) +define(<S14>, <16>) +define(<S15>, <17>) + +define(<KX>, <33>) +define(<S0X>, <34>) +define(<S1X>, <35>) +define(<S2X>, <36>) +define(<S3X>, <37>) +define(<S4X>, <38>) +define(<S5X>, <39>) +define(<S6X>, <40>) +define(<S7X>, <41>) +define(<S8X>, <42>) +define(<S9X>, <43>) +define(<S10X>, <44>) +define(<S11X>, <45>) +define(<S12X>, <46>) +define(<S13X>, <47>) +define(<S14X>, <48>) +define(<S15X>, <49>) + + .file "aes-encrypt-internal.asm" + + C _aes_encrypt(unsigned rounds, const uint32_t *keys, + C const struct aes_table *T, + C size_t length, uint8_t *dst, + C uint8_t *src) + + .text +.align 5 +PROLOGUE(_nettle_aes_encrypt) + ld 5,.swap_mask@got(TOCP) + lvx swap_mask,0,5 + + subi ROUNDS,ROUNDS,1 + srdi LENGTH,LENGTH,4 + + srdi 5,LENGTH,4 # 16x loop count + cmpldi 5,0 + beq L8x + + std 17,-120(SP); + std 18,-112(SP); + std 19,-104(SP); + std 20,-96(SP); + std 21,-88(SP); + std 22,-80(SP); + std 23,-72(SP); + std 24,-64(SP); + std 25,-56(SP); + std 26,-48(SP); + std 27,-40(SP); + std 28,-32(SP); + std 29,-24(SP); + std 30,-16(SP); + std 31,-8(SP); + + li 17,0x10 + li 18,0x20 + li 19,0x30 + li 20,0x40 + li 21,0x50 + li 22,0x60 + li 23,0x70 + li 24,0x80 + li 25,0x90 + li 26,0xA0 + li 27,0xB0 + li 28,0xC0 + li 29,0xD0 + li 30,0xE0 + li 31,0xF0 + +.align 5 +Lx16_loop: + lxvd2x KX,0,KEYS + + lxvd2x S0X,0,SRC + lxvd2x S1X,17,SRC + lxvd2x S2X,18,SRC + lxvd2x S3X,19,SRC + lxvd2x S4X,20,SRC + lxvd2x S5X,21,SRC + lxvd2x S6X,22,SRC + lxvd2x S7X,23,SRC + lxvd2x S8X,24,SRC + lxvd2x 
S9X,25,SRC + lxvd2x S10X,26,SRC + lxvd2x S11X,27,SRC + lxvd2x S12X,28,SRC + lxvd2x S13X,29,SRC + lxvd2x S14X,30,SRC + lxvd2x S15X,31,SRC + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + vxor S8,S8,K + vxor S9,S9,K + vxor S10,S10,K + vxor S11,S11,K + vxor S12,S12,K + vxor S13,S13,K + vxor S14,S14,K + vxor S15,S15,K + + vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask + vperm S8,S8,S8,swap_mask + vperm S9,S9,S9,swap_mask + vperm S10,S10,S10,swap_mask + vperm S11,S11,S11,swap_mask + vperm S12,S12,S12,swap_mask + vperm S13,S13,S13,swap_mask + vperm S14,S14,S14,swap_mask + vperm S15,S15,S15,swap_mask + + mtctr ROUNDS + li 10,0x10 +.align 5 +L16x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + vcipher S1,S1,K + vcipher S2,S2,K + vcipher S3,S3,K + vcipher S4,S4,K + vcipher S5,S5,K + vcipher S6,S6,K + vcipher S7,S7,K + vcipher S8,S8,K + vcipher S9,S9,K + vcipher S10,S10,K + vcipher S11,S11,K + vcipher S12,S12,K + vcipher S13,S13,K + vcipher S14,S14,K + vcipher S15,S15,K + addi 10,10,0x10 + bdnz L16x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + vcipherlast S1,S1,K + vcipherlast S2,S2,K + vcipherlast S3,S3,K + vcipherlast S4,S4,K + vcipherlast S5,S5,K + vcipherlast S6,S6,K + vcipherlast S7,S7,K + vcipherlast S8,S8,K + vcipherlast S9,S9,K + vcipherlast S10,S10,K + vcipherlast S11,S11,K + vcipherlast S12,S12,K + vcipherlast S13,S13,K + vcipherlast S14,S14,K + vcipherlast S15,S15,K + + vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask + vperm S8,S8,S8,swap_mask + vperm S9,S9,S9,swap_mask + vperm S10,S10,S10,swap_mask + 
vperm S11,S11,S11,swap_mask + vperm S12,S12,S12,swap_mask + vperm S13,S13,S13,swap_mask + vperm S14,S14,S14,swap_mask + vperm S15,S15,S15,swap_mask + + stxvd2x S0X,0,DST + stxvd2x S1X,17,DST + stxvd2x S2X,18,DST + stxvd2x S3X,19,DST + stxvd2x S4X,20,DST + stxvd2x S5X,21,DST + stxvd2x S6X,22,DST + stxvd2x S7X,23,DST + stxvd2x S8X,24,DST + stxvd2x S9X,25,DST + stxvd2x S10X,26,DST + stxvd2x S11X,27,DST + stxvd2x S12X,28,DST + stxvd2x S13X,29,DST + stxvd2x S14X,30,DST + stxvd2x S15X,31,DST + + addi SRC,SRC,0x100 + addi DST,DST,0x100 + subic. 5,5,1 + bne Lx16_loop + + ld 17,-120(SP); + ld 18,-112(SP); + ld 19,-104(SP); + ld 20,-96(SP); + ld 21,-88(SP); + ld 22,-80(SP); + ld 23,-72(SP); + ld 24,-64(SP); + ld 25,-56(SP); + ld 26,-48(SP); + ld 27,-40(SP); + ld 28,-32(SP); + ld 29,-24(SP); + ld 30,-16(SP); + ld 31,-8(SP); + + clrldi LENGTH,LENGTH,60 + +L8x: + srdi 5,LENGTH,3 + cmpldi 5,0 + beq L4x + + lxvd2x KX,0,KEYS + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + addi 9,9,0x10 + lxvd2x S2X,9,SRC + addi 9,9,0x10 + lxvd2x S3X,9,SRC + addi 9,9,0x10 + lxvd2x S4X,9,SRC + addi 9,9,0x10 + lxvd2x S5X,9,SRC + addi 9,9,0x10 + lxvd2x S6X,9,SRC + addi 9,9,0x10 + lxvd2x S7X,9,SRC + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + + vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask + + mtctr ROUNDS + li 10,0x10 +.align 5 +L8x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + vcipher S1,S1,K + vcipher S2,S2,K + vcipher S3,S3,K + vcipher S4,S4,K + vcipher S5,S5,K + vcipher S6,S6,K + vcipher S7,S7,K + addi 10,10,0x10 + bdnz L8x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + vcipherlast S1,S1,K + vcipherlast S2,S2,K + vcipherlast S3,S3,K + vcipherlast S4,S4,K + vcipherlast S5,S5,K + 
vcipherlast S6,S6,K + vcipherlast S7,S7,K + + vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + addi 9,9,0x10 + stxvd2x S2X,9,DST + addi 9,9,0x10 + stxvd2x S3X,9,DST + addi 9,9,0x10 + stxvd2x S4X,9,DST + addi 9,9,0x10 + stxvd2x S5X,9,DST + addi 9,9,0x10 + stxvd2x S6X,9,DST + addi 9,9,0x10 + stxvd2x S7X,9,DST + + addi SRC,SRC,0x80 + addi DST,DST,0x80 + + clrldi LENGTH,LENGTH,61 + +L4x: + srdi 5,LENGTH,2 + cmpldi 5,0 + beq L2x + + lxvd2x KX,0,KEYS + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + addi 9,9,0x10 + lxvd2x S2X,9,SRC + addi 9,9,0x10 + lxvd2x S3X,9,SRC + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + + vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + + mtctr ROUNDS + li 10,0x10 +.align 5 +L4x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + vcipher S1,S1,K + vcipher S2,S2,K + vcipher S3,S3,K + addi 10,10,0x10 + bdnz L4x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + vcipherlast S1,S1,K + vcipherlast S2,S2,K + vcipherlast S3,S3,K + + vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + addi 9,9,0x10 + stxvd2x S2X,9,DST + addi 9,9,0x10 + stxvd2x S3X,9,DST + + addi SRC,SRC,0x40 + addi DST,DST,0x40 + + clrldi LENGTH,LENGTH,62 + +L2x: + srdi 5,LENGTH,1 + cmpldi 5,0 + beq L1x + + lxvd2x KX,0,KEYS + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + + vxor S0,S0,K + vxor S1,S1,K + + vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + + mtctr ROUNDS + li 10,0x10 +.align 5 +L2x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + vcipher S1,S1,K + addi 10,10,0x10 + bdnz 
L2x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + vcipherlast S1,S1,K + + vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + + addi SRC,SRC,0x20 + addi DST,DST,0x20 + + clrldi LENGTH,LENGTH,63 + +L1x: + cmpldi LENGTH,0 + beq Ldone + + lxvd2x KX,0,KEYS + + lxvd2x S0X,0,SRC + + vxor S0,S0,K + + vperm S0,S0,S0,swap_mask + + mtctr ROUNDS + li 10,0x10 +.align 5 +L1x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + addi 10,10,0x10 + bdnz L1x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + + vperm S0,S0,S0,swap_mask + + stxvd2x S0X,0,DST + +Ldone: + blr +EPILOGUE(_nettle_aes_encrypt) + + .data + .align 4 +.swap_mask: + .byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7 diff --git a/powerpc64le/gcm-hash8.asm b/powerpc64le/gcm-hash8.asm new file mode 100644 index 00000000..a809f6ef --- /dev/null +++ b/powerpc64le/gcm-hash8.asm @@ -0,0 +1,992 @@ +C powerpc64le/gcm-hash8.asm + +ifelse(< + Copyright (C) 2020 Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+>) + +C Register usage: +C VSX instructions "lxvd2x, stxvd2x" are used to load and store data in memory +C instead of the VR instructions "lvx, stvx" as a workaround for accessing unaligned data. +C VSX registers are defined with an "X" suffix. + +define(<SP>, <1>) +define(<TOCP>, <2>) + +define(<TABLE>, <3>) +define(<X>, <4>) +define(<LENGTH>, <5>) +define(<DATA>, <6>) + +define(<zero>, <0>) +define(<swap_mask>, <1>) +define(<hidw_mask>, <2>) +define(<lodw_mask>, <3>) +define(<poly>, <4>) +define(<poly_h>, <4>) +define(<poly_l>, <5>) +define(<RP>, <6>) +define(<Mh>, <7>) +define(<Ml>, <8>) +define(<H>, <9>) +define(<Hh>, <10>) +define(<Hl>, <11>) +define(<RP2>, <9>) +define(<M2h>, <10>) +define(<M2l>, <11>) + +define(<HX>, <41>) +define(<HhX>, <42>) +define(<HlX>, <43>) +define(<H_HhX>, <44>) +define(<H_HX>, <45>) +define(<H_HlX>, <46>) + +define(<sl1>, <1>) +define(<msb>, <5>) +define(<H2>, <6>) +define(<H2h>, <7>) +define(<H2l>, <8>) +define(<H_h>, <12>) +define(<H_m>, <13>) +define(<H_l>, <14>) +define(<H_Hh>, <12>) +define(<H_H>, <13>) +define(<H_Hl>, <14>) +define(<H_t>, <15>) +define(<H2_h>, <16>) +define(<H2_m>, <17>) +define(<H2_l>, <18>) +define(<H2_t>, <19>) + +define(<C0X>, <38>) +define(<C1X>, <39>) +define(<C2X>, <40>) +define(<C3X>, <44>) +define(<C4X>, <38>) +define(<C5X>, <39>) +define(<C6X>, <40>) +define(<C7X>, <44>) + +define(<CX>, <45>) + +define(<C0>, <6>) +define(<C1>, <7>) +define(<C2>, <8>) +define(<C3>, <12>) +define(<C4>, <6>) +define(<C5>, <7>) +define(<C6>, <8>) +define(<C7>, <12>) + +define(<C>, <13>) + +define(<Ch>, <14>) +define(<Cl>, <15>) +define(<Cm>, <16>) + +define(<C01h>, <14>) +define(<C01l>, <15>) +define(<C01>, <16>) +define(<C23h>, <17>) +define(<C23l>, <18>) +define(<C23>, <19>) +define(<C45h>, <20>) +define(<C45l>, <21>) +define(<C45>, <22>) +define(<C67h>, <6>) +define(<C67l>, <7>) +define(<C67>, <8>) + +define(<H21>, <9>) +define(<H21h>, <10>) +define(<H21l>, <11>) +define(<H43>, <23>) +define(<H43h>, <24>) +define(<H43l>, <25>) 
+define(<H65>, <26>) +define(<H65h>, <27>) +define(<H65l>, <28>) +define(<H87>, <29>) +define(<H87h>, <30>) +define(<H87l>, <31>) + +define(<H21X>, <41>) +define(<H21hX>, <42>) +define(<H21lX>, <43>) +define(<H43X>, <55>) +define(<H43hX>, <56>) +define(<H43lX>, <57>) +define(<H65X>, <58>) +define(<H65hX>, <59>) +define(<H65lX>, <60>) +define(<H87X>, <61>) +define(<H87hX>, <62>) +define(<H87lX>, <63>) + +# gcm_fill registers: + +define(<CTR>, <3>) +define(<BLOCKS>, <4>) +define(<BUFFER>, <5>) + +define(<CTR0>, <2>) +define(<CTR0S>, <3>) +define(<CTR1>, <4>) +define(<CTR2>, <5>) +define(<CTR3>, <6>) +define(<CTR4>, <7>) +define(<CTR5>, <8>) +define(<CTR6>, <9>) +define(<CTR7>, <10>) + +define(<CTR0X>, <34>) +define(<CTR0SX>, <35>) +define(<CTR1X>, <36>) +define(<CTR2X>, <37>) +define(<CTR3X>, <38>) +define(<CTR4X>, <39>) +define(<CTR5X>, <40>) +define(<CTR6X>, <41>) +define(<CTR7X>, <42>) + +define(<I1>, <11>) +define(<I2>, <12>) +define(<I3>, <13>) +define(<I4>, <14>) +define(<I5>, <15>) +define(<I6>, <16>) +define(<I7>, <17>) +define(<I8>, <18>) + + .file "gcm-hash8.asm" + + # void gcm_init_key (union gcm_block *table) + + .text +.align 5 +PROLOGUE(_nettle_gcm_init_key8) + ld 7,.polynomial@got(TOCP) + lvx poly,0,7 + ld 7,.swap_mask@got(TOCP) + lvx swap_mask,0,7 + ld 7,.hidw_mask@got(TOCP) + lvx hidw_mask,0,7 + ld 7,.lodw_mask@got(TOCP) + lvx lodw_mask,0,7 + + li 10,0x800 + lxvd2x HX,10,TABLE # load H + vperm H,H,H,swap_mask + + # --- calculate H = H shift left 1 modulo polynomial --- + + vupkhsw msb,H # most significant bit word-extend + vspltisb sl1,1 # splat 1 for shift left + vspltw msb,msb,0 # most significant bit extend + vsl H,H,sl1 # H shift left 1 + vand msb,msb,poly + vxor zero,zero,zero + vxor H_t,H,msb + + vsldoi H,H_t,H_t,8 # doubleword swap + vsldoi Hh,H,zero,8 + vsldoi Hl,zero,H,8 + + # --- calculate H^2 = H*H --- + + # reduction pre-processing + vsldoi poly_h,zero,poly,8 + vsldoi poly_l,poly_h,poly_h,8 + + # polynomial multiplication "classical" + 
vpmsumd H_h,H_t,Hh # H^1h*H^1h + vpmsumd H_l,H_t,Hl # H^1l*H^1l + vpmsumd H_m,H_t,H # H^1h*H^1l⊕H^1l*H^1h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] + vsldoi Ml,H_m,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor H_h,H_h,Mh # [2] + vxor H_l,H_l,Ml # [2] + vxor H_l,H_l,RP # [1] + + # reduction second phase + vpmsumd RP,H_l,poly_l + vxor H_h,H_l,H_h + vxor H2_t,H_h,RP + + vsldoi H2,H2_t,H2_t,8 + vsldoi H2h,H2,zero,8 + vsldoi H2l,zero,H2,8 + + # --- calculate [H^2.Hi⊕H^2.Lo:H^1.Hi⊕H^1.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store H,[H^2.Hi⊕H^2.Lo:H^1.Hi⊕H^1.Lo] --- + + li 8,0x00 + li 9,0x100 + li 10,0x200 + stxvd2x HlX,8,TABLE + stxvd2x HX,9,TABLE + stxvd2x HhX,10,TABLE + + li 8,0x300 + li 9,0x400 + li 10,0x500 + stxvd2x H_HhX,8,TABLE + stxvd2x H_HX,9,TABLE + stxvd2x H_HlX,10,TABLE + + # --- calculate H^3,H^4 --- + + # polynomial multiplication "classical" + vpmsumd H_l,H_t,H2l # H^1l*H^2l + vpmsumd H_m,H_t,H2 # H^1h*H^2l⊕H^1l*H^2h + vpmsumd H_h,H_t,H2h # H^1h*H^2h + vpmsumd H2_l,H2_t,H2l # H^2l*H^2l + vpmsumd H2_m,H2_t,H2 # H^2h*H^2l⊕H^2l*H^2h + vpmsumd H2_h,H2_t,H2h # H^2h*H^2h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] H^3 + vpmsumd RP2,H2_l,poly_h # [1] H^4 + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] H^3 + vsldoi M2h,zero,H2_m,8 # [2] H^4 + vsldoi Ml,H_m,zero,8 # [2] H^3 + vsldoi M2l,H2_m,zero,8 # [2] H^4 + vsldoi RP,RP,RP,8 # [1] H^3 + vsldoi RP2,RP2,RP2,8 # [1] H^4 + vxor H_h,H_h,Mh # [2] H^3 + vxor H2_h,H2_h,M2h # [2] H^4 + vxor H_l,H_l,Ml # [2] H^3 + vxor H2_l,H2_l,M2l # [2] H^4 + vxor H_l,H_l,RP # [1] H^3 + vxor H2_l,H2_l,RP2 # [1] H^4 + + # reduction second phase + vpmsumd RP,H_l,poly_l # H^3 + vpmsumd RP2,H2_l,poly_l # H^4 + vxor H_h,H_l,H_h # H^3 + vxor H2_h,H2_l,H2_h # H^4 + vxor H_h,H_h,RP # H^3 + vxor H2_h,H2_h,RP2 # H^4 + + vsldoi 
H2,H2_h,H2_h,8 # H^4 + vsldoi H,H_h,H_h,8 # H^3 + vsldoi H2l,zero,H2,8 # H^4 + vsldoi H2h,H2,zero,8 # H^4 + + # --- calculate [H^4.Hi⊕H^4.Lo:H^3.Hi⊕H^3.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store [H^4.Hi⊕H^4.Lo:H^3.Hi⊕H^3.Lo] --- + + li 8,0x600 + li 9,0x700 + li 10,0x800 + stxvd2x H_HhX,8,TABLE + stxvd2x H_HX,9,TABLE + stxvd2x H_HlX,10,TABLE + + # --- calculate H^5,H^6 --- + + # polynomial multiplication "classical" + vpmsumd H_l,H_t,H2l # H^1l*H^4l + vpmsumd H_m,H_t,H2 # H^1h*H^4l⊕H^1l*H^4h + vpmsumd H_h,H_t,H2h # H^1h*H^4h + vpmsumd H2_l,H2_t,H2l # H^2l*H^4l + vpmsumd H2_m,H2_t,H2 # H^2h*H^4l⊕H^2l*H^4h + vpmsumd H2_h,H2_t,H2h # H^2h*H^4h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] H^5 + vpmsumd RP2,H2_l,poly_h # [1] H^6 + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] H^5 + vsldoi M2h,zero,H2_m,8 # [2] H^6 + vsldoi Ml,H_m,zero,8 # [2] H^5 + vsldoi M2l,H2_m,zero,8 # [2] H^6 + vsldoi RP,RP,RP,8 # [1] H^5 + vsldoi RP2,RP2,RP2,8 # [1] H^6 + vxor H_h,H_h,Mh # [2] H^5 + vxor H2_h,H2_h,M2h # [2] H^6 + vxor H_l,H_l,Ml # [2] H^5 + vxor H2_l,H2_l,M2l # [2] H^6 + vxor H_l,H_l,RP # [1] H^5 + vxor H2_l,H2_l,RP2 # [1] H^6 + + # reduction second phase + vpmsumd RP,H_l,poly_l # H^5 + vpmsumd RP2,H2_l,poly_l # H^6 + vxor H_h,H_l,H_h # H^5 + vxor H2_h,H2_l,H2_h # H^6 + vxor H_h,H_h,RP # H^5 + vxor H2_h,H2_h,RP2 # H^6 + + vsldoi H2,H2_h,H2_h,8 # H^6 + vsldoi H,H_h,H_h,8 # H^5 + vsldoi H2l,zero,H2,8 # H^6 + vsldoi H2h,H2,zero,8 # H^6 + + # --- calculate [H^6.Hi⊕H^6.Lo:H^5.Hi⊕H^5.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store [H^6.Hi⊕H^6.Lo:H^5.Hi⊕H^5.Lo] --- + + li 8,0x900 + li 9,0xA00 + li 10,0xB00 + stxvd2x H_HhX,8,TABLE + stxvd2x H_HX,9,TABLE + stxvd2x H_HlX,10,TABLE + + # --- calculate H^7,H^8 --- + + # polynomial multiplication "classical" + vpmsumd H_l,H_t,H2l # H^1l*H^6l + vpmsumd H_m,H_t,H2 # 
H^1h*H^6l⊕H^1l*H^6h + vpmsumd H_h,H_t,H2h # H^1h*H^6h + vpmsumd H2_l,H2_t,H2l # H^2l*H^6l + vpmsumd H2_m,H2_t,H2 # H^2h*H^6l⊕H^2l*H^6h + vpmsumd H2_h,H2_t,H2h # H^2h*H^6h + + # reduction first phase # [1] + vpmsumd RP,H_l,poly_h # [1] H^7 + vpmsumd RP2,H2_l,poly_h # [1] H^8 + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,H_m,8 # [2] H^7 + vsldoi M2h,zero,H2_m,8 # [2] H^8 + vsldoi Ml,H_m,zero,8 # [2] H^7 + vsldoi M2l,H2_m,zero,8 # [2] H^8 + vsldoi RP,RP,RP,8 # [1] H^7 + vsldoi RP2,RP2,RP2,8 # [1] H^8 + vxor H_h,H_h,Mh # [2] H^7 + vxor H2_h,H2_h,M2h # [2] H^8 + vxor H_l,H_l,Ml # [2] H^7 + vxor H2_l,H2_l,M2l # [2] H^8 + vxor H_l,H_l,RP # [1] H^7 + vxor H2_l,H2_l,RP2 # [1] H^8 + + # reduction second phase + vpmsumd RP,H_l,poly_l # H^7 + vpmsumd RP2,H2_l,poly_l # H^8 + vxor H_h,H_l,H_h # H^7 + vxor H2_h,H2_l,H2_h # H^8 + vxor H_h,H_h,RP # H^7 + vxor H2_h,H2_h,RP2 # H^8 + + vsldoi H,H_h,H_h,8 # H^7 + vsldoi H2,H2_h,H2_h,8 # H^8 + + # --- calculate [H^8.Hi⊕H^8.Lo:H^7.Hi⊕H^7.Lo] --- + + vperm H_Hh,H2,H,lodw_mask + vperm H_Hl,H2,H,hidw_mask + vxor H_H,H_Hh,H_Hl + + # --- store [H^8.Hi⊕H^8.Lo:H^7.Hi⊕H^7.Lo] --- + + li 8,0xC00 + li 9,0xD00 + li 10,0xE00 + stxvd2x H_HhX,8,TABLE + stxvd2x H_HX,9,TABLE + stxvd2x H_HlX,10,TABLE + + blr +EPILOGUE(_nettle_gcm_init_key8) + + # void gcm_hash (const struct gcm_key *key, union gcm_block *x, + # size_t length, const uint8_t *data) + # + # Folds `length` bytes of `data` into the digest block at X using the + # key table at TABLE: 128 bytes per iteration while possible, then 32, + # then one 16-byte block, then a zero-padded partial block. + +.align 5 +PROLOGUE(_nettle_gcm_hash8) + vxor zero,zero,zero + + ld 7,.polynomial@got(TOCP) + lvx poly,0,7 + ld 7,.swap_mask@got(TOCP) + lvx swap_mask,0,7 + ld 7,.hidw_mask@got(TOCP) + lvx hidw_mask,0,7 + ld 7,.lodw_mask@got(TOCP) + lvx lodw_mask,0,7 + + vsldoi poly_h,zero,poly,8 + vsldoi poly_l,poly_h,poly_h,8 + + lxvd2x CX,0,X # load X + vperm C,C,C,swap_mask + + srdi 7,LENGTH,7 # 8x loop count + cmpldi 7,0 + beq L2x + + # backup registers: r28-r31 and v20-v31 go into a new 224-byte frame + # NOTE(review): the vector save offsets step down from 176 to 0, so the + # v31 slot overwrites the back chain word that stdu just stored at + # offset 0 -- confirm this is intended + stdu SP,-224(SP) + std 28,216(SP) + std 29,208(SP) + std 30,200(SP) + std 31,192(SP) + li 8,176 + stvx 20,8,SP + subi 8,8,16 + stvx 21,8,SP +
subi 8,8,16 + stvx 22,8,SP + subi 8,8,16 + stvx 23,8,SP + subi 8,8,16 + stvx 24,8,SP + subi 8,8,16 + stvx 25,8,SP + subi 8,8,16 + stvx 26,8,SP + subi 8,8,16 + stvx 27,8,SP + subi 8,8,16 + stvx 28,8,SP + subi 8,8,16 + stvx 29,8,SP + subi 8,8,16 + stvx 30,8,SP + subi 8,8,16 + stvx 31,8,SP + + # table loading (entries stored by _nettle_gcm_init_key8 at 0x300..0xE00) + li 8,0x300 + li 9,0x400 + li 10,0x500 + lxvd2x H21hX,8,TABLE + lxvd2x H21X,9,TABLE + lxvd2x H21lX,10,TABLE + li 8,0x600 + li 9,0x700 + li 10,0x800 + lxvd2x H43hX,8,TABLE + lxvd2x H43X,9,TABLE + lxvd2x H43lX,10,TABLE + li 8,0x900 + li 9,0xA00 + li 10,0xB00 + lxvd2x H65hX,8,TABLE + lxvd2x H65X,9,TABLE + lxvd2x H65lX,10,TABLE + li 8,0xC00 + li 9,0xD00 + li 10,0xE00 + lxvd2x H87hX,8,TABLE + lxvd2x H87X,9,TABLE + lxvd2x H87lX,10,TABLE + + # byte offsets of blocks 1..7 inside each 128-byte chunk of DATA + li 8,0x10 + li 9,0x20 + li 10,0x30 + li 28,0x40 + li 29,0x50 + li 30,0x60 + li 31,0x70 + + mtctr 7 +.align 5 +L8x_loop: + # input loading + lxvd2x C0X,0,DATA # load C0 + lxvd2x C1X,8,DATA # load C1 + lxvd2x C2X,9,DATA # load C2 + lxvd2x C3X,10,DATA # load C3 + + # swap permuting + vperm C0,C0,C0,swap_mask + vperm C1,C1,C1,swap_mask + vperm C2,C2,C2,swap_mask + vperm C3,C3,C3,swap_mask + + # previous digest combining + vxor C0,C0,C + + # polynomial multiplication "karatsuba" pre-processing + vperm C23h,C2,C3,hidw_mask + vperm C23l,C2,C3,lodw_mask + vperm C01h,C0,C1,hidw_mask + vperm C01l,C0,C1,lodw_mask + + # input loading + lxvd2x C4X,28,DATA # load C4 + lxvd2x C5X,29,DATA # load C5 + lxvd2x C6X,30,DATA # load C6 + lxvd2x C7X,31,DATA # load C7 + + # swap permuting + vperm C4,C4,C4,swap_mask + vperm C5,C5,C5,swap_mask + vperm C6,C6,C6,swap_mask + vperm C7,C7,C7,swap_mask + + # polynomial multiplication "karatsuba" pre-processing + vperm C45h,C4,C5,hidw_mask + vperm C45l,C4,C5,lodw_mask + vperm C67h,C6,C7,hidw_mask + vperm C67l,C6,C7,lodw_mask + vxor C23,C23h,C23l + vxor C01,C01h,C01l + vxor C45,C45h,C45l + vxor C67,C67h,C67l + + # polynomial multiplication "karatsuba" + vpmsumd C23h,C23h,H65h # H23 = H^6h*C2h⊕H^5h*C3h + vpmsumd
C23l,C23l,H65l # L23 = H^6l*C2l⊕H^5l*C3l + vpmsumd C01h,C01h,H87h # H01 = H^8h*C0h⊕H^7h*C1h + vpmsumd C01l,C01l,H87l # L01 = H^8l*C0l⊕H^7l*C1l + vpmsumd C67h,C67h,H21h # H67 = H^2h*C6h⊕H^1h*C7h + vpmsumd C67l,C67l,H21l # L67 = H^2l*C6l⊕H^1l*C7l + vpmsumd C45h,C45h,H43h # H45 = H^4h*C4h⊕H^3h*C5h + vpmsumd C45l,C45l,H43l # L45 = H^4l*C4l⊕H^3l*C5l + vpmsumd C23,C23,H65 # M23 = (H^6h⊕H^5h)*(C2h⊕C3h)⊕(H^6l⊕H^5l)*(C2l⊕C3l) + vpmsumd C01,C01,H87 # M01 = (H^8h⊕H^7h)*(C0h⊕C1h)⊕(H^8l⊕H^7l)*(C0l⊕C1l) + vpmsumd C45,C45,H43 # M45 = (H^4h⊕H^3h)*(C4h⊕C5h)⊕(H^4l⊕H^3l)*(C4l⊕C5l) + vpmsumd C67,C67,H21 # M67 = (H^2h⊕H^1h)*(C6h⊕C7h)⊕(H^2l⊕H^1l)*(C6l⊕C7l) + + # polynomial multiplication "karatsuba" post-processing + vxor C23,C23,C23h + vxor C01,C01,C01h + vxor C45,C45,C45h + vxor C67,C67,C67h + vxor C23,C23,C23l + vxor C01,C01,C01l + vxor C45,C45,C45l + vxor C67,C67,C67l + + # deferred recombination of partial products + vxor C01h,C01h,C23h # H0 = H01⊕H23 + vxor C45h,C45h,C67h # H1 = H45⊕H67 + vxor C01l,C01l,C23l # L0 = L01⊕L23 + vxor C45l,C45l,C67l # L1 = L45⊕L67 + vxor C01,C01,C23 # M0 = M01⊕M23 + vxor C45,C45,C67 # M1 = M45⊕M67 + vxor C01h,C01h,C45h # H = H0⊕H1 + vxor C01l,C01l,C45l # L = L0⊕L1 + vxor C01,C01,C45 # M = M0⊕M1 + + # reduction first phase # [1] + vpmsumd RP,C01l,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,C01,8 # [2] + vsldoi Ml,C01,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor C01h,C01h,Mh # [2] + vxor C01l,C01l,Ml # [2] + vxor C01l,C01l,RP # [1] + + # reduction second phase + vpmsumd RP,C01l,poly_l + vxor C01h,C01l,C01h + vxor C,C01h,RP + + addi DATA,DATA,0x80 + bdnz L8x_loop + + # restore registers (same slots as the backup, walked upward from offset 0) + li 8,0 + lvx 31,8,SP + addi 8,8,16 + lvx 30,8,SP + addi 8,8,16 + lvx 29,8,SP + addi 8,8,16 + lvx 28,8,SP + addi 8,8,16 + lvx 27,8,SP + addi 8,8,16 + lvx 26,8,SP + addi 8,8,16 + lvx 25,8,SP + addi 8,8,16 + lvx 24,8,SP + addi 8,8,16 + lvx 23,8,SP + addi 8,8,16 + lvx 22,8,SP + addi 8,8,16 + lvx 21,8,SP + addi 8,8,16 + lvx 20,8,SP +
ld 31,192(SP) + ld 30,200(SP) + ld 29,208(SP) + ld 28,216(SP) + addi SP,SP,224 + + clrldi LENGTH,LENGTH,57 # keep LENGTH mod 128 +L2x: + srdi 7,LENGTH,5 # 2x loop count + cmpldi 7,0 + beq L1x + + # table loading + li 8,0x300 + li 9,0x400 + li 10,0x500 + lxvd2x H21hX,8,TABLE + lxvd2x H21X,9,TABLE + lxvd2x H21lX,10,TABLE + + li 10,0x10 + + mtctr 7 +.align 5 +L2x_loop: + # input loading + lxvd2x C0X,0,DATA # load C0 + lxvd2x C1X,10,DATA # load C1 + + # swap permuting + vperm C0,C0,C0,swap_mask + vperm C1,C1,C1,swap_mask + + # previous digest combining + vxor C0,C0,C + + # polynomial multiplication "karatsuba" pre-processing + vperm C01h,C0,C1,hidw_mask + vperm C01l,C0,C1,lodw_mask + vxor C01,C01h,C01l + + # polynomial multiplication "karatsuba" + vpmsumd C01h,C01h,H21h # H01 = H^2h*C0h⊕H^1h*C1h + vpmsumd C01l,C01l,H21l # L01 = H^2l*C0l⊕H^1l*C1l + vpmsumd C01,C01,H21 # M01 = (H^2h⊕H^1h)*(C0h⊕C1h)⊕(H^2l⊕H^1l)*(C0l⊕C1l) + + # polynomial multiplication "karatsuba" post-processing + vxor C01,C01,C01h + vxor C01,C01,C01l + + # reduction first phase # [1] + vpmsumd RP,C01l,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,C01,8 # [2] + vsldoi Ml,C01,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor C01h,C01h,Mh # [2] + vxor C01l,C01l,Ml # [2] + vxor C01l,C01l,RP # [1] + + # reduction second phase + vpmsumd RP,C01l,poly_l + vxor C01h,C01l,C01h + vxor C,C01h,RP + + addi DATA,DATA,0x20 + bdnz L2x_loop + + clrldi LENGTH,LENGTH,59 # keep LENGTH mod 32 +L1x: + srdi 7,LENGTH,4 # nonzero if a whole 16-byte block remains + cmpldi 7,0 + beq Lrem + + # table loading + li 9,0x100 + li 10,0x200 + lxvd2x HlX,0,TABLE + lxvd2x HX, 9,TABLE + lxvd2x HhX,10,TABLE + + # input loading + lxvd2x C0X,0,DATA # load C0 + + # swap permuting + vperm C0,C0,C0,swap_mask + + # previous digest combining + vxor C0,C0,C + + vpmsumd Cl,C0,Hl # L = Hl*Cl + vpmsumd Cm,C0,H # M = Hh*Cl⊕Hl*Ch + vpmsumd Ch,C0,Hh # H = Hh*Ch + + # reduction first phase # [1] + vpmsumd RP,Cl,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,Cm,8 # [2] + vsldoi
Ml,Cm,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor Ch,Ch,Mh # [2] + vxor Cl,Cl,Ml # [2] + vxor Cl,Cl,RP # [1] + + # reduction second phase + vpmsumd RP,Cl,poly_l + vxor Ch,Cl,Ch + vxor C,Ch,RP + + addi DATA,DATA,0x10 + clrldi LENGTH,LENGTH,60 # keep LENGTH mod 16 +Lrem: + cmpldi LENGTH,0 + beq Ldone + + # table loading + li 9,0x100 + li 10,0x200 + lxvd2x HlX,0,TABLE + lxvd2x HX, 9,TABLE + lxvd2x HhX,10,TABLE + + # input loading: copy the trailing LENGTH bytes, last byte first, into a + # zeroed 16-byte stack buffer so the partial block is zero-padded + # NOTE(review): stvx zero at offset 0 replaces the back chain word that + # stdu just stored -- confirm this is intended + stdu SP,-16(SP) + stvx zero,0,SP +Lst_loop: + subic. LENGTH,LENGTH,1 + lbzx 7,LENGTH,DATA + stbx 7,LENGTH,SP + bne Lst_loop + lxvd2x C0X,0,SP + addi SP,SP,16 + + # swap permuting + vperm C0,C0,C0,swap_mask + + # previous digest combining + vxor C0,C0,C + + vpmsumd Cl,C0,Hl # L = Hl*Cl + vpmsumd Cm,C0,H # M = Hh*Cl⊕Hl*Ch + vpmsumd Ch,C0,Hh # H = Hh*Ch + + # reduction first phase # [1] + vpmsumd RP,Cl,poly_h # [1] + + # polynomial multiplication post-processing # [2] + vsldoi Mh,zero,Cm,8 # [2] + vsldoi Ml,Cm,zero,8 # [2] + vsldoi RP,RP,RP,8 # [1] + vxor Ch,Ch,Mh # [2] + vxor Cl,Cl,Ml # [2] + vxor Cl,Cl,RP # [1] + + # reduction second phase + vpmsumd RP,Cl,poly_l + vxor Ch,Cl,Ch + vxor C,Ch,RP + +Ldone: + vperm C,C,C,swap_mask + stxvd2x CX,0,X # store C + blr +EPILOGUE(_nettle_gcm_hash8) + + # gcm_fill (uint8_t *ctr, size_t blocks, union gcm_block *buffer) + # + # Writes BLOCKS consecutive counter blocks to BUFFER, starting from the + # value at CTR, and stores the advanced counter back to CTR. + +.align 5 +PROLOGUE(_nettle_gcm_fill) + ld 6,.swap_mask@got(TOCP) + lvx swap_mask,0,6 + + vxor zero,zero,zero + # I1..I8: splat k into every byte, then shift in 15 zero bytes so that + # a single byte of Ik holds k + vspltisb I1,1 + vspltisb I2,2 + vspltisb I3,3 + vspltisb I4,4 + vspltisb I5,5 + vspltisb I6,6 + vspltisb I7,7 + vspltisb I8,8 + vsldoi I1,zero,I1,1 + vsldoi I2,zero,I2,1 + vsldoi I3,zero,I3,1 + vsldoi I4,zero,I4,1 + vsldoi I5,zero,I5,1 + vsldoi I6,zero,I6,1 + vsldoi I7,zero,I7,1 + vsldoi I8,zero,I8,1 + + lxvd2x CTR0X,0,CTR + vperm CTR0,CTR0,CTR0,swap_mask + + srdi 6,BLOCKS,3 # 8x loop count + cmpldi 6,0 + beq Lfill_4x + + # save r25-r31 below SP without allocating a frame + # NOTE(review): presumably relies on the ABI red zone -- confirm + std 25,-56(SP); + std 26,-48(SP); + std 27,-40(SP); + std 28,-32(SP); + std 29,-24(SP); + std 30,-16(SP); + std 31,-8(SP); + + li 25,0x10 + li 26,0x20 + li 27,0x30 + li 28,0x40 +
li 29,0x50 + li 30,0x60 + li 31,0x70 + + mtctr 6 +L8x_fill_loop: + # CTR1..CTR7 = CTR0 + I1..I7 (vadduwm adds 32-bit words modulo 2^32) + vadduwm CTR1,CTR0,I1 + vadduwm CTR2,CTR0,I2 + vadduwm CTR3,CTR0,I3 + vadduwm CTR4,CTR0,I4 + vadduwm CTR5,CTR0,I5 + vadduwm CTR6,CTR0,I6 + vadduwm CTR7,CTR0,I7 + + # swap permuting (CTR0 itself is kept unswapped; CTR0S is the copy to store) + vperm CTR0S,CTR0,CTR0,swap_mask + vperm CTR1,CTR1,CTR1,swap_mask + vperm CTR2,CTR2,CTR2,swap_mask + vperm CTR3,CTR3,CTR3,swap_mask + vperm CTR4,CTR4,CTR4,swap_mask + vperm CTR5,CTR5,CTR5,swap_mask + vperm CTR6,CTR6,CTR6,swap_mask + vperm CTR7,CTR7,CTR7,swap_mask + + # store eight counter blocks at BUFFER+0x00..0x70 + stxvd2x CTR0SX,0,BUFFER + stxvd2x CTR1X,25,BUFFER + stxvd2x CTR2X,26,BUFFER + stxvd2x CTR3X,27,BUFFER + stxvd2x CTR4X,28,BUFFER + stxvd2x CTR5X,29,BUFFER + stxvd2x CTR6X,30,BUFFER + stxvd2x CTR7X,31,BUFFER + + vadduwm CTR0,CTR0,I8 # base counter for the next batch + addi BUFFER,BUFFER,0x80 + bdnz L8x_fill_loop + + ld 25,-56(SP); + ld 26,-48(SP); + ld 27,-40(SP); + ld 28,-32(SP); + ld 29,-24(SP); + ld 30,-16(SP); + ld 31,-8(SP); + + clrldi BLOCKS,BLOCKS,61 # keep BLOCKS mod 8 + +Lfill_4x: + srdi 6,BLOCKS,2 # nonzero if at least 4 blocks remain + cmpldi 6,0 + beq Lfill_2x + + li 8,0x10 + li 9,0x20 + li 10,0x30 + + vadduwm CTR1,CTR0,I1 + vadduwm CTR2,CTR0,I2 + vadduwm CTR3,CTR0,I3 + + vperm CTR0S,CTR0,CTR0,swap_mask + vperm CTR1,CTR1,CTR1,swap_mask + vperm CTR2,CTR2,CTR2,swap_mask + vperm CTR3,CTR3,CTR3,swap_mask + + stxvd2x CTR0SX,0,BUFFER + stxvd2x CTR1X,8,BUFFER + stxvd2x CTR2X,9,BUFFER + stxvd2x CTR3X,10,BUFFER + + vadduwm CTR0,CTR0,I4 + addi BUFFER,BUFFER,0x40 + + clrldi BLOCKS,BLOCKS,62 # keep BLOCKS mod 4 + +Lfill_2x: + srdi 6,BLOCKS,1 # nonzero if at least 2 blocks remain + cmpldi 6,0 + beq Lfill_1x + + li 10,0x10 + + vadduwm CTR1,CTR0,I1 + + vperm CTR0S,CTR0,CTR0,swap_mask + vperm CTR1,CTR1,CTR1,swap_mask + + stxvd2x CTR0SX,0,BUFFER + stxvd2x CTR1X,10,BUFFER + + vadduwm CTR0,CTR0,I2 + addi BUFFER,BUFFER,0x20 + + clrldi BLOCKS,BLOCKS,63 # keep BLOCKS mod 2 + +Lfill_1x: + cmpldi BLOCKS,0 + beq Lfill_done + + vperm CTR0S,CTR0,CTR0,swap_mask + + stxvd2x CTR0SX,0,BUFFER + + vadduwm CTR0,CTR0,I1 + +Lfill_done: + # write the advanced counter back to CTR + vperm CTR0,CTR0,CTR0,swap_mask + stxvd2x CTR0X,0,CTR + + blr +EPILOGUE(_nettle_gcm_fill) + + .data + .align 4 +.polynomial: + .byte
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 + .align 4 +.swap_mask: + .byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7 + .align 4 +.hidw_mask: + .byte 23,22,21,20,19,18,17,16,7,6,5,4,3,2,1,0 + .align 4 +.lodw_mask: + .byte 31,30,29,28,27,26,25,24,15,14,13,12,11,10,9,8 diff --git a/powerpc64le/machine.m4 b/powerpc64le/machine.m4 new file mode 100644 index 00000000..e69de29b diff --git a/testsuite/gcm-test.c b/testsuite/gcm-test.c index c8174019..df1fc94a 100644 --- a/testsuite/gcm-test.c +++ b/testsuite/gcm-test.c @@ -170,6 +170,29 @@ test_main(void) "16aedbf5a0de6a57a637b39b"), SHEX("619cc5aefffe0bfa462af43c1699d050"));
+ /* Test 128 bytes */ + test_aead(&nettle_gcm_aes128, NULL, + SHEX("feffe9928665731c6d6a8f9467308308"), + SHEX(""), + SHEX("d9313225f88406e5a55909c5aff5269a" + "86a7a9531534f7da2e4c303d8a318a72" + "1c3c0c95956809532fcf0e2449a6b525" + "b16aedf5aa0de657ba637b391aafd255" + "5ae376bc5e9f6a1b08e34db7a6ee0736" + "9ba662ea12f6f197e6bc3ed69d2480f3" + "ea5691347f2ba69113eb37910ebc18c8" + "0f697234582016fa956ca8f63ae6b473"), + SHEX("42831ec2217774244b7221b784d0d49c" + "e3aa212f2c02a4e035c17e2329aca12e" + "21d514b25466931c7d8f6a5aac84aa05" + "1ba30b396a0aac973d58e091473f5985" + "874b1178906ddbeab04ab2fe6cce8c57" + "8d7e961bd13fd6a8c56b66ca5e576492" + "1a48cd8bda04e66343e73055118b69b9" + "ced486813846958a11e602c03cfc232b"), + SHEX("cafebabefacedbaddecaf888"), + SHEX("796836f1246c9d735c5e1be0a715ccc3")); + /* Test case 7 */ test_aead(&nettle_gcm_aes192, NULL, SHEX("00000000000000000000000000000000"
nettle-bugs@lists.lysator.liu.se