This implementation is based on the existing, per-algorithm optimized powerpc64/p8/aes-encrypt-internal.asm and powerpc64/p8/gcm-hash.asm implementations by Niels Möller and Mamone Tarsha.
Significant changes:
- Combine AES + GCM into a single function call which does up to 8x
  unrolled AES followed by 2x 4x unrolled GCM back-to-back.
- Handle the IV|CTR increment in assembly and avoid the somewhat
  costly gcm_fill() call to precalculate the counter values (see the
  C sketch below).
- Use ISA 3.0 (P9) lxvb16x/stxvb16x to load/store unaligned VSX
  registers and avoid permutes on LE machines.
- Use ISA 3.0 (P9) lxvll/stxvll to load/store left-aligned,
  zero-padded partial (<16B) blocks (see the second sketch after the
  diffstat).
- Use ISA 3.0 (P9) lxv/stxv to load/store the non-volatile vector
  registers from/to the stack redzone and avoid using a GPR as an
  index register.
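
For reference, the per-block counter handling that replaces the
gcm_fill() precalculation corresponds roughly to the following C
helper (an illustrative sketch only, not Nettle code; the assembly
keeps the 16-byte IV|CTR block in a vector register and bumps it with
vadduwm):

    #include <stdint.h>

    /* Sketch: increment the trailing 32-bit big-endian counter of the
       16-byte IV|CTR block, once per processed block. */
    static void
    ctr32_increment(uint8_t block[16])
    {
      uint32_t c = ((uint32_t) block[12] << 24) | ((uint32_t) block[13] << 16)
                 | ((uint32_t) block[14] << 8)  |  (uint32_t) block[15];
      c++;
      block[12] = (c >> 24) & 0xff;
      block[13] = (c >> 16) & 0xff;
      block[14] = (c >> 8) & 0xff;
      block[15] = c & 0xff;
    }
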
Signed-off-by: Christopher M. Riedl <cmr@linux.ibm.com>
---
 gcm.c                            |   4 +
 powerpc64/p9/gcm-aes-encrypt.asm | 666 +++++++++++++++++++++++++++++++
 2 files changed, 670 insertions(+)
 create mode 100644 powerpc64/p9/gcm-aes-encrypt.asm
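
Note on the partial-block handling: lxvll/stxvll operate on only the
leftmost LENGTH bytes of a vector register, so the final (<16B) block
goes through the same 16-byte GHASH path as full blocks. The net
effect of the lxvll load is roughly the following (again an
illustrative sketch, not Nettle code):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Sketch: load 'len' (< 16) bytes left-aligned into a 16-byte
       lane and zero-pad the remainder, as lxvll does for the final
       partial block. */
    static void
    load_partial_block(uint8_t out[16], const uint8_t *src, size_t len)
    {
      memset(out, 0, 16);
      memcpy(out, src, len);
    }
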
diff --git a/gcm.c b/gcm.c
index 6fe25a01..39e7a7c7 100644
--- a/gcm.c
+++ b/gcm.c
@@ -61,8 +61,12 @@
    GCM_TABLE_BITS == 8 layout */
 #undef HAVE_NATIVE_gcm_hash
 #undef HAVE_NATIVE_gcm_init_key
+#undef HAVE_NATIVE_gcm_aes_decrypt
+#undef HAVE_NATIVE_gcm_aes_encrypt
 #undef HAVE_NATIVE_fat_gcm_hash
 #undef HAVE_NATIVE_fat_gcm_init_key
+#undef HAVE_NATIVE_fat_gcm_aes_decrypt
+#undef HAVE_NATIVE_fat_gcm_aes_encrypt
 #endif
 
 #if !HAVE_NATIVE_gcm_hash
diff --git a/powerpc64/p9/gcm-aes-encrypt.asm b/powerpc64/p9/gcm-aes-encrypt.asm
new file mode 100644
index 00000000..43f577fa
--- /dev/null
+++ b/powerpc64/p9/gcm-aes-encrypt.asm
@@ -0,0 +1,666 @@
+C powerpc64/p9/gcm-aes-encrypt.asm
+
+ifelse(`
+   Copyright (C) 2020 Niels Möller and Mamone Tarsha
+   Copyright (C) 2021 Christopher M. Riedl
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+
+.file "gcm-aes-encrypt.asm"
+
+.text
+
+C void gcm_aes_encrypt(const struct gcm_key *key, union gcm_block *x,
+C                      size_t length, const uint8_t *src,
+C                      unsigned rounds, const uint32_t *keys,
+C                      uint8_t *dst, uint32_t *ctr)
+
+C Register usage:
+define(`SP', `r1')
+define(`TOCP', `r2')
+
+C Parameters:
+define(`TABLE', `r3')
+define(`X', `r4')       C Output GCM/Ghash tag
+define(`LENGTH',`r5')
+define(`SRC', `r6')     C Plaintext input
+define(`ROUNDS',`r7')
+define(`KEYS', `r8')
+define(`DST', `r9')
+define(`PCTR', `r10')   C Pointer to 12B IV and starting 4B ctr
+
+C GCM/Ghash:
+define(`POLY_L',`v0')
+define(`D', `v1')
+define(`H1M', `v6')
+define(`H1L', `v7')
+define(`H2M', `v8')
+define(`H2L', `v9')
+define(`H3M', `v10')
+define(`H3L', `v11')
+define(`H4M', `v12')
+define(`H4L', `v13')
+define(`R', `v14')
+define(`F', `v15')
+define(`R2', `v16')
+define(`F2', `v17')
+define(`T', `v18')
+define(`R3', `v20')
+define(`F3', `v21')
+define(`R4', `v22')
+define(`F4', `v23')
+
+C AES:
+define(`K', `v25')
+define(`S0', `v2')
+define(`S1', `v3')
+define(`S2', `v4')
+define(`S3', `v5')
+define(`S4', `v26')
+define(`S5', `v27')
+define(`S6', `v28')
+define(`S7', `v29')
+define(`CTR', `v30')
+define(`INC', `v31')
+define(`C0', `v14')
+define(`C1', `v15')
+define(`C2', `v16')
+define(`C3', `v17')
+define(`C4', `v20')
+define(`C5', `v21')
+define(`C6', `v22')
+define(`C7', `v23')
+
+define(`LCNT', `r14')
+define(`ZERO', `v16')
+define(`POLY', `v24')
+C misc: r15,r16,r17
+
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_gcm_aes_encrypt)
+
+    vxor ZERO,ZERO,ZERO
+    subi ROUNDS,ROUNDS,1    C Last AES round uses vcipherlast
+
+    C Store non-volatiles on the 288B stack redzone
+    std r14,-8*1(SP)
+    std r15,-8*2(SP)
+    std r16,-8*3(SP)
+    std r17,-8*4(SP)
+    stxv VSR(v20),-16*3(SP)
+    stxv VSR(v21),-16*4(SP)
+    stxv VSR(v22),-16*5(SP)
+    stxv VSR(v23),-16*6(SP)
+    stxv VSR(v24),-16*7(SP)
+    stxv VSR(v25),-16*8(SP)
+    stxv VSR(v26),-16*9(SP)
+    stxv VSR(v27),-16*10(SP)
+    stxv VSR(v28),-16*11(SP)
+    stxv VSR(v29),-16*12(SP)
+    stxv VSR(v30),-16*13(SP)
+    stxv VSR(v31),-16*14(SP)
+
+    DATA_LOAD_VEC(POLY,.polynomial,r14)
+    DATA_LOAD_VEC(INC,.increment,r14)
+
+    lxvb16x VSR(CTR),0,PCTR     C Load 'ctr' pointer
+    xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY)
+    lxvb16x VSR(D),0,X          C load 'X' pointer
+
+L8x:
+    C --- process 8 blocks '128-bit each' per one loop ---
+    srdi. LCNT,LENGTH,7         C 8-blocks loop count 'LENGTH / (8 * 16)'
+    beq L4x
+
+    C load table elements
+    li r15,4*16
+    li r16,5*16
+    li r17,6*16
+    lxvd2x VSR(H3M),r15,TABLE
+    lxvd2x VSR(H3L),r16,TABLE
+    lxvd2x VSR(H4M),r17,TABLE
+    li r16,7*16
+    lxvd2x VSR(H4L),r16,TABLE
+    li r15,1*16
+    li r16,2*16
+    li r17,3*16
+    lxvd2x VSR(H1M),0,TABLE
+    lxvd2x VSR(H1L),r15,TABLE
+    lxvd2x VSR(H2M),r16,TABLE
+    lxvd2x VSR(H2L),r17,TABLE
+
+L8x_loop:
+L8x_aes:
+    lxvb16x VSR(K),0,KEYS
+
+    C Increment ctr
+    vmr S0,CTR
+    vadduwm CTR,CTR,INC
+    vxor S0,S0,K
+    vmr S1,CTR
+    vadduwm CTR,CTR,INC
+    vxor S1,S1,K
+    vmr S2,CTR
+    vadduwm CTR,CTR,INC
+    vxor S2,S2,K
+    vmr S3,CTR
+    vadduwm CTR,CTR,INC
+    vxor S3,S3,K
+
+    mtctr ROUNDS
+    li r15,1*16
+
+    vmr S4,CTR
+    vadduwm CTR,CTR,INC
+    vxor S4,S4,K
+    vmr S5,CTR
+    vadduwm CTR,CTR,INC
+    vxor S5,S5,K
+    vmr S6,CTR
+    vadduwm CTR,CTR,INC
+    vxor S6,S6,K
+    vmr S7,CTR
+    vadduwm CTR,CTR,INC
+    vxor S7,S7,K
+
+.align 5
+L8x_aes_rnd_loop:
+    lxvb16x VSR(K),r15,KEYS
+    addi r15,r15,1*16
+    vcipher S0,S0,K
+    vcipher S1,S1,K
+    vcipher S2,S2,K
+    vcipher S3,S3,K
+    vcipher S4,S4,K
+    vcipher S5,S5,K
+    vcipher S6,S6,K
+    vcipher S7,S7,K
+    bdnz L8x_aes_rnd_loop
+
+    lxvb16x VSR(K),r15,KEYS
+    vcipherlast S0,S0,K
+    vcipherlast S1,S1,K
+    vcipherlast S2,S2,K
+    vcipherlast S3,S3,K
+    vcipherlast S4,S4,K
+    vcipherlast S5,S5,K
+    vcipherlast S6,S6,K
+    vcipherlast S7,S7,K
+
+    C AES(counter) XOR plaintext = ciphertext
+    li r15,1*16
+    li r16,2*16
+    li r17,3*16
+    lxvb16x VSR(C0),0,SRC
+    lxvb16x VSR(C1),r15,SRC
+    lxvb16x VSR(C2),r16,SRC
+    lxvb16x VSR(C3),r17,SRC
+    vxor S0,C0,S0
+    vxor S1,C1,S1
+    vxor S2,C2,S2
+    vxor S3,C3,S3
+
+    addi SRC,SRC,4*16
+    lxvb16x VSR(C4),0,SRC
+    lxvb16x VSR(C5),r15,SRC
+    lxvb16x VSR(C6),r16,SRC
+    lxvb16x VSR(C7),r17,SRC
+    vxor S4,C4,S4
+    vxor S5,C5,S5
+    vxor S6,C6,S6
+    vxor S7,C7,S7
+
+    C Store ciphertext
+    stxvb16x VSR(S0),0,DST
+    stxvb16x VSR(S1),r15,DST
+    stxvb16x VSR(S2),r16,DST
+    stxvb16x VSR(S3),r17,DST
+    addi DST,DST,4*16
+    stxvb16x VSR(S4),0,DST
+    stxvb16x VSR(S5),r15,DST
+    stxvb16x VSR(S6),r16,DST
+    stxvb16x VSR(S7),r17,DST
+
+    addi SRC,SRC,4*16
+    addi DST,DST,4*16
+
+L8x_gcm:
+    C previous digest combining
+    vxor S0,S0,D
+
+    C polynomial multiplication
+    vpmsumd F2,H3L,S1
+    vpmsumd R2,H3M,S1
+    vpmsumd F3,H2L,S2
+    vpmsumd R3,H2M,S2
+    vpmsumd F4,H1L,S3
+    vpmsumd R4,H1M,S3
+    vpmsumd F,H4L,S0
+    vpmsumd R,H4M,S0
+
+    C deferred recombination of partial products
+    vxor F3,F3,F4
+    vxor R3,R3,R4
+    vxor F,F,F2
+    vxor R,R,R2
+    vxor F,F,F3
+    vxor R,R,R3
+
+    C reduction
+    vpmsumd T,F,POLY_L
+    xxswapd VSR(D),VSR(F)
+    vxor R,R,T
+    vxor D,R,D
+
+    C previous digest combining
+    vxor S4,S4,D
+
+    C polynomial multiplication
+    vpmsumd F2,H3L,S5
+    vpmsumd R2,H3M,S5
+    vpmsumd F3,H2L,S6
+    vpmsumd R3,H2M,S6
+    vpmsumd F4,H1L,S7
+    vpmsumd R4,H1M,S7
+    vpmsumd F,H4L,S4
+    vpmsumd R,H4M,S4
+
+    C deferred recombination of partial products
+    vxor F3,F3,F4
+    vxor R3,R3,R4
+    vxor F,F,F2
+    vxor R,R,R2
+    vxor F,F,F3
+    vxor R,R,R3
+
+    C reduction
+    vpmsumd T,F,POLY_L
+    xxswapd VSR(D),VSR(F)
+    vxor R,R,T
+    vxor D,R,D
+
+    C Decrement 8x block count and check if done
+    subi LCNT,LCNT,1
+    cmpldi LCNT,0
+    bne L8x_loop
+    clrldi LENGTH,LENGTH,57     C 'set the high-order 57 bits to zeros'
+
+L4x:
+    C --- process 4 blocks ---
+    srdi. LCNT,LENGTH,6         C 4-blocks loop count 'LENGTH / (4 * 16)'
+    beq L2x
+
+    C load table elements
+    li r15,4*16
+    li r16,5*16
+    li r17,6*16
+    lxvd2x VSR(H3M),r15,TABLE
+    lxvd2x VSR(H3L),r16,TABLE
+    lxvd2x VSR(H4M),r17,TABLE
+    li r16,7*16
+    lxvd2x VSR(H4L),r16,TABLE
+    li r15,1*16
+    li r16,2*16
+    li r17,3*16
+    lxvd2x VSR(H1M),0,TABLE
+    lxvd2x VSR(H1L),r15,TABLE
+    lxvd2x VSR(H2M),r16,TABLE
+    lxvd2x VSR(H2L),r17,TABLE
+
+L4x_aes:
+    lxvb16x VSR(K),0,KEYS
+
+    C Increment ctr
+    vmr S0,CTR
+    vadduwm CTR,CTR,INC
+    vmr S1,CTR
+    vadduwm CTR,CTR,INC
+    vmr S2,CTR
+    vadduwm CTR,CTR,INC
+    vmr S3,CTR
+    vadduwm CTR,CTR,INC
+
+    vxor S0,S0,K
+    vxor S1,S1,K
+    vxor S2,S2,K
+    vxor S3,S3,K
+
+    mtctr ROUNDS
+    li r15,1*16
+
+.align 5
+L4x_aes_rnd_loop:
+    lxvb16x VSR(K),r15,KEYS
+    vcipher S0,S0,K
+    vcipher S1,S1,K
+    vcipher S2,S2,K
+    vcipher S3,S3,K
+    addi r15,r15,1*16
+    bdnz L4x_aes_rnd_loop
+
+    lxvb16x VSR(K),r15,KEYS
+    vcipherlast S0,S0,K
+    vcipherlast S1,S1,K
+    vcipherlast S2,S2,K
+    vcipherlast S3,S3,K
+
+    C AES(counter) XOR plaintext = ciphertext
+    li r15,1*16
+    li r16,2*16
+    li r17,3*16
+    lxvb16x VSR(C0),0,SRC
+    lxvb16x VSR(C1),r15,SRC
+    lxvb16x VSR(C2),r16,SRC
+    lxvb16x VSR(C3),r17,SRC
+    vxor S0,C0,S0
+    vxor S1,C1,S1
+    vxor S2,C2,S2
+    vxor S3,C3,S3
+
+    C Store ciphertext in DST
+    stxvb16x VSR(S0),0,DST
+    stxvb16x VSR(S1),r15,DST
+    stxvb16x VSR(S2),r16,DST
+    stxvb16x VSR(S3),r17,DST
+
+L4x_gcm:
+    C previous digest combining
+    vxor S0,S0,D
+
+    C polynomial multiplication
+    vpmsumd F2,H3L,S1
+    vpmsumd R2,H3M,S1
+    vpmsumd F3,H2L,S2
+    vpmsumd R3,H2M,S2
+    vpmsumd F4,H1L,S3
+    vpmsumd R4,H1M,S3
+    vpmsumd F,H4L,S0
+    vpmsumd R,H4M,S0
+
+    C deferred recombination of partial products
+    vxor F3,F3,F4
+    vxor R3,R3,R4
+    vxor F,F,F2
+    vxor R,R,R2
+    vxor F,F,F3
+    vxor R,R,R3
+
+    C reduction
+    vpmsumd T,F,POLY_L
+    xxswapd VSR(D),VSR(F)
+    vxor R,R,T
+    vxor D,R,D
+
+    addi DST,DST,4*16
+    addi SRC,SRC,4*16
+    clrldi LENGTH,LENGTH,58     C 'set the high-order 58 bits to zeros'
+
+L2x:
+    C --- process 2 blocks ---
+    srdi. r14,LENGTH,5          C 'LENGTH / (2 * 16)'
+    beq L1x
+
+    C load table elements
+    li r15,1*16
+    li r16,2*16
+    li r17,3*16
+    lxvd2x VSR(H1M),0,TABLE
+    lxvd2x VSR(H1L),r15,TABLE
+    lxvd2x VSR(H2M),r16,TABLE
+    lxvd2x VSR(H2L),r17,TABLE
+
+L2x_aes:
+    lxvb16x VSR(K),0,KEYS
+
+    C Increment ctr
+    vmr S0,CTR
+    vadduwm CTR,CTR,INC
+    vmr S1,CTR
+    vadduwm CTR,CTR,INC
+
+    vxor S0,S0,K
+    vxor S1,S1,K
+
+    mtctr ROUNDS
+    li r15,1*16
+
+.align 5
+L2x_aes_rnd_loop:
+    lxvb16x VSR(K),r15,KEYS
+    vcipher S0,S0,K
+    vcipher S1,S1,K
+    addi r15,r15,1*16
+    bdnz L2x_aes_rnd_loop
+
+    lxvb16x VSR(K),r15,KEYS
+    vcipherlast S0,S0,K
+    vcipherlast S1,S1,K
+
+    C AES(counter) XOR plaintext = ciphertext
+    li r15,1*16
+    lxvb16x VSR(C0),0,SRC
+    lxvb16x VSR(C1),r15,SRC
+    vxor S0,C0,S0
+    vxor S1,C1,S1
+
+    C Store ciphertext in DST
+    stxvb16x VSR(S0),0,DST
+    stxvb16x VSR(S1),r15,DST
+
+L2x_gcm:
+    C previous digest combining
+    vxor S0,S0,D
+
+    C polynomial multiplication
+    vpmsumd F2,H1L,S1
+    vpmsumd R2,H1M,S1
+    vpmsumd F,H2L,S0
+    vpmsumd R,H2M,S0
+
+    C deferred recombination of partial products
+    vxor F,F,F2
+    vxor R,R,R2
+
+    C reduction
+    vpmsumd T,F,POLY_L
+    xxswapd VSR(D),VSR(F)
+    vxor R,R,T
+    vxor D,R,D
+
+    addi DST,DST,2*16
+    addi SRC,SRC,2*16
+    clrldi LENGTH,LENGTH,59     C 'set the high-order 59 bits to zeros'
+
+L1x:
+    C --- process 1 block ---
+    srdi. r14,LENGTH,4          C 'LENGTH / (1 * 16)'
+    beq Lpartial
+
+    C load table elements
+    li r15,1*16
+    lxvd2x VSR(H1M),0,TABLE
+    lxvd2x VSR(H1L),r15,TABLE
+
+L1x_aes:
+    lxvb16x VSR(K),0,KEYS
+
+    C Increment ctr
+    vmr S0,CTR
+    vadduwm CTR,CTR,INC
+
+    vxor S0,S0,K
+
+    mtctr ROUNDS
+    li r15,1*16
+
+.align 5
+L1x_aes_rnd_loop:
+    lxvb16x VSR(K),r15,KEYS
+    vcipher S0,S0,K
+    addi r15,r15,1*16
+    bdnz L1x_aes_rnd_loop
+
+    lxvb16x VSR(K),r15,KEYS
+    vcipherlast S0,S0,K
+
+    C AES(counter) XOR plaintext = ciphertext
+    lxvb16x VSR(C0),0,SRC
+    vxor S0,C0,S0
+
+    C Store ciphertext in DST
+    stxvb16x VSR(S0),0,DST
+
+L1x_gcm:
+    C previous digest combining
+    vxor S0,S0,D
+
+    C polynomial multiplication
+    vpmsumd F,H1L,S0
+    vpmsumd R,H1M,S0
+
+    C reduction
+    vpmsumd T,F,POLY_L
+    xxswapd VSR(D),VSR(F)
+    vxor R,R,T
+    vxor D,R,D
+
+    addi DST,DST,1*16
+    addi SRC,SRC,1*16
+    clrldi LENGTH,LENGTH,60     C 'set the high-order 60 bits to zeros'
+
+Lpartial:
+    C --- process partial block ---
+    cmpldi LENGTH,0
+    beq Ldone
+
+    C load table elements
+    li r15,1*16
+    lxvd2x VSR(H1M),0,TABLE
+    lxvd2x VSR(H1L),r15,TABLE
+
+Lpartial_aes:
+    lxvb16x VSR(K),0,KEYS
+
+    C Increment ctr
+    vmr S0,CTR
+    vadduwm CTR,CTR,INC
+
+    vxor S0,S0,K
+
+    mtctr ROUNDS
+    li r15,1*16
+
+.align 5
+Lpartial_aes_rnd_loop:
+    lxvb16x VSR(K),r15,KEYS
+    vcipher S0,S0,K
+    addi r15,r15,1*16
+    bdnz Lpartial_aes_rnd_loop
+
+    lxvb16x VSR(K),r15,KEYS
+    vcipherlast S0,S0,K
+
+    C Load the partial block left-aligned and zero-padded
+    sldi LENGTH,LENGTH,56
+    lxvll VSR(C0),SRC,LENGTH
+
+    C AES(counter) XOR plaintext = ciphertext
+    vxor S0,C0,S0
+
+    C Store ciphertext in DST
+    stxvll VSR(S0),DST,LENGTH
+
+    C TODO: Lazy, reload the value to zero-out the padding bits again
+    lxvll VSR(S0),DST,LENGTH
+
+Lpartial_gcm:
+    C previous digest combining
+    vxor S0,S0,D
+
+    C polynomial multiplication
+    vpmsumd F,H1L,S0
+    vpmsumd R,H1M,S0
+
+    C reduction
+    vpmsumd T,F,POLY_L
+    xxswapd VSR(D),VSR(F)
+    vxor R,R,T
+    vxor D,R,D
+
+Ldone:
+    stxvb16x VSR(D),0,X         C store digest 'D'
+    stxvb16x VSR(CTR),0,PCTR    C store updated 'ctr'
+
+    C Restore non-volatiles from the 288B stack redzone
+    ld r14,-8*1(SP)
+    ld r15,-8*2(SP)
+    ld r16,-8*3(SP)
+    ld r17,-8*4(SP)
+    lxv VSR(v20),-16*3(SP)
+    lxv VSR(v21),-16*4(SP)
+    lxv VSR(v22),-16*5(SP)
+    lxv VSR(v23),-16*6(SP)
+    lxv VSR(v24),-16*7(SP)
+    lxv VSR(v25),-16*8(SP)
+    lxv VSR(v26),-16*9(SP)
+    lxv VSR(v27),-16*10(SP)
+    lxv VSR(v28),-16*11(SP)
+    lxv VSR(v29),-16*12(SP)
+    lxv VSR(v30),-16*13(SP)
+    lxv VSR(v31),-16*14(SP)
+
+    li r3,0                     C return 0 for success
+    blr
+
+EPILOGUE(_nettle_gcm_aes_encrypt)
+
+.data
+.align 4
+C 0xC2000000000000000000000000000001
+.polynomial:
+IF_BE(`
+    .byte 0xC2
+    .rept 14
+    .byte 0x00
+    .endr
+    .byte 0x01
+',`
+    .byte 0x01
+    .rept 14
+    .byte 0x00
+    .endr
+    .byte 0xC2
+')
+.align 4
+.increment:
+IF_LE(`
+    .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+')
+IF_BE(`
+    .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+')