I measured the latency and throughput of the vcipher/vncipher/vxor instructions on POWER8:

  vcipher/vncipher: throughput 6 instructions per cycle, latency 0.91 clock cycles
  vxor:             throughput 6 instructions per cycle, latency 0.32 clock cycles

So the ideal option for POWER8 is to process 8 blocks at a time; it gives +12% performance over processing 4 blocks.
--- powerpc64/P8/aes-decrypt-internal.asm | 367 ++++++++++++++++++++++++++++++++++ powerpc64/P8/aes-encrypt-internal.asm | 344 +++++++++++++++++++++++++++++++ 2 files changed, 711 insertions(+) create mode 100644 powerpc64/P8/aes-decrypt-internal.asm create mode 100644 powerpc64/P8/aes-encrypt-internal.asm
diff --git a/powerpc64/P8/aes-decrypt-internal.asm b/powerpc64/P8/aes-decrypt-internal.asm
new file mode 100644
index 00000000..f5d64548
--- /dev/null
+++ b/powerpc64/P8/aes-decrypt-internal.asm
@@ -0,0 +1,367 @@
+C powerpc64/P8/aes-decrypt-internal.asm
+
+ifelse(<
+   Copyright (C) 2020 Mamone Tarsha
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program. If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+C Register usage: GPR numbers first (stack pointer, TOC pointer, arguments), then vector register numbers, then the same vector registers under their VSX numbers (n+32) as used by lxvd2x/stxvd2x.
+
+define(<SP>, <1>)
+define(<TOCP>, <2>)
+
+define(<ROUNDS>, <3>)
+define(<KEYS>, <4>)
+define(<LENGTH>, <6>)
+define(<DST>, <7>)
+define(<SRC>, <8>)
+
+define(<swap_mask>, <0>)
+
+define(<K>, <1>)
+define(<S0>, <2>)
+define(<S1>, <3>)
+define(<S2>, <4>)
+define(<S3>, <5>)
+define(<S4>, <6>)
+define(<S5>, <7>)
+define(<S6>, <8>)
+define(<S7>, <9>)
+
+define(<KX>, <33>)
+define(<S0X>, <34>)
+define(<S1X>, <35>)
+define(<S2X>, <36>)
+define(<S3X>, <37>)
+define(<S4X>, <38>)
+define(<S5X>, <39>)
+define(<S6X>, <40>)
+define(<S7X>, <41>)
+
+C The all-zero ZERO vector register is passed to the vncipher
+C instruction in place of the round key, because that instruction
+C applies InvMixColumns and the round-key Xor in the flipped order.
+C The Xor with the real round key is then done by a separate vxor.
+define(<ZERO>, <10>)
+
+.file "aes-decrypt-internal.asm"
+
+IF_LE(<.abiversion 2>)
+.text
+
+    C _aes_decrypt(unsigned rounds, const uint32_t *keys, const struct aes_table *T,
+    C              size_t length, uint8_t *dst, uint8_t *src)
+    C Decrypts length/16 whole 16-byte blocks from src to dst using the round keys at keys.
+    C T is never read here; its argument register, GPR 5, is reused as scratch.
+
+define(<FUNC_ALIGN>, <5>)
+PROLOGUE(_nettle_aes_decrypt)
+    vxor ZERO,ZERO,ZERO # x xor x gives the all-zero vector
+
+    DATA_LOAD_VEC(swap_mask,.swap_mask,5)
+
+    subi ROUNDS,ROUNDS,1 # round loops run rounds-1 times, the last round is issued separately
+    srdi LENGTH,LENGTH,4 # byte count becomes 16-byte block count
+
+    srdi 5,LENGTH,3 #8x loop count
+    cmpldi 5,0
+    beq L4x # fewer than 8 blocks, skip the 8x loop and its register saves
+
+    std 25,-56(SP); # NOTE review: GPR 25-31 are stored below the stack pointer without moving it, assumes the ABI protects that area, confirm
+    std 26,-48(SP);
+    std 27,-40(SP);
+    std 28,-32(SP);
+    std 29,-24(SP);
+    std 30,-16(SP);
+    std 31,-8(SP);
+
+    li 25,0x10 # GPR 25-31 hold the byte offsets of blocks 1-7 for the indexed loads and stores
+    li 26,0x20
+    li 27,0x30
+    li 28,0x40
+    li 29,0x50
+    li 30,0x60
+    li 31,0x70
+
+.align 5
+Lx8_loop: # each iteration decrypts 8 blocks, 0x80 bytes
+    lxvd2x KX,0,KEYS # first round key, at offset 0 of the key array
+    vperm K,K,K,swap_mask # reorder the key bytes with the permute mask, done on both endiannesses
+
+    lxvd2x S0X,0,SRC # load 8 input blocks
+    lxvd2x S1X,25,SRC
+    lxvd2x S2X,26,SRC
+    lxvd2x S3X,27,SRC
+    lxvd2x S4X,28,SRC
+    lxvd2x S5X,29,SRC
+    lxvd2x S6X,30,SRC
+    lxvd2x S7X,31,SRC
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+    vperm S1,S1,S1,swap_mask
+    vperm S2,S2,S2,swap_mask
+    vperm S3,S3,S3,swap_mask
+    vperm S4,S4,S4,swap_mask
+    vperm S5,S5,S5,swap_mask
+    vperm S6,S6,S6,swap_mask
+    vperm S7,S7,S7,swap_mask>)
+
+    vxor S0,S0,K # xor every block with the first round key
+    vxor S1,S1,K
+    vxor S2,S2,K
+    vxor S3,S3,K
+    vxor S4,S4,K
+    vxor S5,S5,K
+    vxor S6,S6,K
+    vxor S7,S7,K
+
+    mtctr ROUNDS # count register = rounds-1
+    li 10,0x10 # GPR 10 = byte offset of the next round key
+.align 5
+L8x_round_loop:
+    lxvd2x KX,10,KEYS # next round key
+    vperm K,K,K,swap_mask
+    vncipher S0,S0,ZERO # round with a zero key, the real key is xored in below
+    vncipher S1,S1,ZERO
+    vncipher S2,S2,ZERO
+    vncipher S3,S3,ZERO
+    vncipher S4,S4,ZERO
+    vncipher S5,S5,ZERO
+    vncipher S6,S6,ZERO
+    vncipher S7,S7,ZERO
+    vxor S0,S0,K # round-key xor for this round
+    vxor S1,S1,K
+    vxor S2,S2,K
+    vxor S3,S3,K
+    vxor S4,S4,K
+    vxor S5,S5,K
+    vxor S6,S6,K
+    vxor S7,S7,K
+    addi 10,10,0x10 # advance to the following round key
+    bdnz L8x_round_loop # decrement the count register, loop while nonzero
+
+    lxvd2x KX,10,KEYS # last round key
+    vperm K,K,K,swap_mask
+    vncipherlast S0,S0,K # last round takes the real round key directly
+    vncipherlast S1,S1,K
+    vncipherlast S2,S2,K
+    vncipherlast S3,S3,K
+    vncipherlast S4,S4,K
+    vncipherlast S5,S5,K
+    vncipherlast S6,S6,K
+    vncipherlast S7,S7,K
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+    vperm S1,S1,S1,swap_mask
+    vperm S2,S2,S2,swap_mask
+    vperm S3,S3,S3,swap_mask
+    vperm S4,S4,S4,swap_mask
+    vperm S5,S5,S5,swap_mask
+    vperm S6,S6,S6,swap_mask
+    vperm S7,S7,S7,swap_mask>)
+
+    stxvd2x S0X,0,DST # store 8 output blocks
+    stxvd2x S1X,25,DST
+    stxvd2x S2X,26,DST
+    stxvd2x S3X,27,DST
+    stxvd2x S4X,28,DST
+    stxvd2x S5X,29,DST
+    stxvd2x S6X,30,DST
+    stxvd2x S7X,31,DST
+
+    addi SRC,SRC,0x80 # advance both pointers by 8 blocks
+    addi DST,DST,0x80
+    subic. 5,5,1 # one 8-block group done, sets the condition used by bne
+    bne Lx8_loop
+
+    ld 25,-56(SP); # restore GPR 25-31
+    ld 26,-48(SP);
+    ld 27,-40(SP);
+    ld 28,-32(SP);
+    ld 29,-24(SP);
+    ld 30,-16(SP);
+    ld 31,-8(SP);
+
+    clrldi LENGTH,LENGTH,61 # keep the low 3 bits, blocks left after the 8x loop
+
+L4x:
+    srdi 5,LENGTH,2 # nonzero when at least 4 blocks remain
+    cmpldi 5,0
+    beq L2x
+
+    lxvd2x KX,0,KEYS # first round key
+    vperm K,K,K,swap_mask
+
+    lxvd2x S0X,0,SRC # load 4 blocks, GPR 9 steps through the offsets
+    li 9,0x10
+    lxvd2x S1X,9,SRC
+    addi 9,9,0x10
+    lxvd2x S2X,9,SRC
+    addi 9,9,0x10
+    lxvd2x S3X,9,SRC
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+    vperm S1,S1,S1,swap_mask
+    vperm S2,S2,S2,swap_mask
+    vperm S3,S3,S3,swap_mask>)
+
+    vxor S0,S0,K # xor every block with the first round key
+    vxor S1,S1,K
+    vxor S2,S2,K
+    vxor S3,S3,K
+
+    mtctr ROUNDS # count register = rounds-1
+    li 10,0x10 # byte offset of the next round key
+.align 5
+L4x_round_loop:
+    lxvd2x KX,10,KEYS
+    vperm K,K,K,swap_mask
+    vncipher S0,S0,ZERO # round with a zero key, the real key is xored in below
+    vncipher S1,S1,ZERO
+    vncipher S2,S2,ZERO
+    vncipher S3,S3,ZERO
+    vxor S0,S0,K
+    vxor S1,S1,K
+    vxor S2,S2,K
+    vxor S3,S3,K
+    addi 10,10,0x10
+    bdnz L4x_round_loop
+
+    lxvd2x KX,10,KEYS # last round key
+    vperm K,K,K,swap_mask
+    vncipherlast S0,S0,K
+    vncipherlast S1,S1,K
+    vncipherlast S2,S2,K
+    vncipherlast S3,S3,K
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+    vperm S1,S1,S1,swap_mask
+    vperm S2,S2,S2,swap_mask
+    vperm S3,S3,S3,swap_mask>)
+
+    stxvd2x S0X,0,DST # store 4 blocks
+    li 9,0x10
+    stxvd2x S1X,9,DST
+    addi 9,9,0x10
+    stxvd2x S2X,9,DST
+    addi 9,9,0x10
+    stxvd2x S3X,9,DST
+
+    addi SRC,SRC,0x40 # advance both pointers by 4 blocks
+    addi DST,DST,0x40
+
+    clrldi LENGTH,LENGTH,62 # keep the low 2 bits of the block count
+
+L2x:
+    srdi 5,LENGTH,1 # nonzero when at least 2 blocks remain
+    cmpldi 5,0
+    beq L1x
+
+    lxvd2x KX,0,KEYS # first round key
+    vperm K,K,K,swap_mask
+
+    lxvd2x S0X,0,SRC # load 2 blocks
+    li 9,0x10
+    lxvd2x S1X,9,SRC
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+    vperm S1,S1,S1,swap_mask>)
+
+    vxor S0,S0,K
+    vxor S1,S1,K
+
+    mtctr ROUNDS # count register = rounds-1
+    li 10,0x10 # byte offset of the next round key
+.align 5
+L2x_round_loop:
+    lxvd2x KX,10,KEYS
+    vperm K,K,K,swap_mask
+    vncipher S0,S0,ZERO # round with a zero key, the real key is xored in below
+    vncipher S1,S1,ZERO
+    vxor S0,S0,K
+    vxor S1,S1,K
+    addi 10,10,0x10
+    bdnz L2x_round_loop
+
+    lxvd2x KX,10,KEYS # last round key
+    vperm K,K,K,swap_mask
+    vncipherlast S0,S0,K
+    vncipherlast S1,S1,K
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+    vperm S1,S1,S1,swap_mask>)
+
+    stxvd2x S0X,0,DST # store 2 blocks
+    li 9,0x10
+    stxvd2x S1X,9,DST
+
+    addi SRC,SRC,0x20 # advance both pointers by 2 blocks
+    addi DST,DST,0x20
+
+    clrldi LENGTH,LENGTH,63 # keep the lowest bit of the block count
+
+L1x:
+    cmpldi LENGTH,0 # at most one block can remain here
+    beq Ldone
+
+    lxvd2x KX,0,KEYS # first round key
+    vperm K,K,K,swap_mask
+
+    lxvd2x S0X,0,SRC # load the final block
+
+IF_LE(<vperm S0,S0,S0,swap_mask>)
+
+    vxor S0,S0,K
+
+    mtctr ROUNDS # count register = rounds-1
+    li 10,0x10 # byte offset of the next round key
+.align 5
+L1x_round_loop:
+    lxvd2x KX,10,KEYS
+    vperm K,K,K,swap_mask
+    vncipher S0,S0,ZERO # round with a zero key, the real key is xored in below
+    vxor S0,S0,K
+    addi 10,10,0x10
+    bdnz L1x_round_loop
+
+    lxvd2x KX,10,KEYS # last round key
+    vperm K,K,K,swap_mask
+    vncipherlast S0,S0,K
+
+IF_LE(<vperm S0,S0,S0,swap_mask>)
+
+    stxvd2x S0X,0,DST # store the final block
+
+Ldone:
+    blr
+EPILOGUE(_nettle_aes_decrypt)
+
+    .data # permute masks for vperm: the little-endian one swaps the two 8-byte halves, the big-endian one reverses the bytes inside each 4-byte word
+    .align 4
+.swap_mask:
+IF_LE(<.byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7>)
+IF_BE(<.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12>)
diff --git a/powerpc64/P8/aes-encrypt-internal.asm b/powerpc64/P8/aes-encrypt-internal.asm
new file mode 100644
index 00000000..3e0fa6f0
--- /dev/null
+++ b/powerpc64/P8/aes-encrypt-internal.asm
@@ -0,0 +1,344 @@
+C powerpc64/P8/aes-encrypt-internal.asm
+
+ifelse(<
+   Copyright (C) 2020 Mamone Tarsha
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program. If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+C Register usage: GPR numbers first (stack pointer, TOC pointer, arguments), then vector register numbers, then the same vector registers under their VSX numbers (n+32) as used by lxvd2x/stxvd2x.
+
+define(<SP>, <1>)
+define(<TOCP>, <2>)
+
+define(<ROUNDS>, <3>)
+define(<KEYS>, <4>)
+define(<LENGTH>, <6>)
+define(<DST>, <7>)
+define(<SRC>, <8>)
+
+define(<swap_mask>, <0>)
+
+define(<K>, <1>)
+define(<S0>, <2>)
+define(<S1>, <3>)
+define(<S2>, <4>)
+define(<S3>, <5>)
+define(<S4>, <6>)
+define(<S5>, <7>)
+define(<S6>, <8>)
+define(<S7>, <9>)
+
+define(<KX>, <33>)
+define(<S0X>, <34>)
+define(<S1X>, <35>)
+define(<S2X>, <36>)
+define(<S3X>, <37>)
+define(<S4X>, <38>)
+define(<S5X>, <39>)
+define(<S6X>, <40>)
+define(<S7X>, <41>)
+
+.file "aes-encrypt-internal.asm"
+
+IF_LE(<.abiversion 2>)
+.text
+
+    C _aes_encrypt(unsigned rounds, const uint32_t *keys, const struct aes_table *T,
+    C              size_t length, uint8_t *dst, uint8_t *src)
+    C Encrypts length/16 whole 16-byte blocks from src to dst using the round keys at keys.
+    C T is never read here; its argument register, GPR 5, is reused as scratch.
+
+define(<FUNC_ALIGN>, <5>)
+PROLOGUE(_nettle_aes_encrypt)
+    DATA_LOAD_VEC(swap_mask,.swap_mask,5)
+
+    subi ROUNDS,ROUNDS,1 # round loops run rounds-1 times, the last round is issued separately
+    srdi LENGTH,LENGTH,4 # byte count becomes 16-byte block count
+
+    srdi 5,LENGTH,3 #8x loop count
+    cmpldi 5,0
+    beq L4x # fewer than 8 blocks, skip the 8x loop and its register saves
+
+    std 25,-56(SP); # NOTE review: GPR 25-31 are stored below the stack pointer without moving it, assumes the ABI protects that area, confirm
+    std 26,-48(SP);
+    std 27,-40(SP);
+    std 28,-32(SP);
+    std 29,-24(SP);
+    std 30,-16(SP);
+    std 31,-8(SP);
+
+    li 25,0x10 # GPR 25-31 hold the byte offsets of blocks 1-7 for the indexed loads and stores
+    li 26,0x20
+    li 27,0x30
+    li 28,0x40
+    li 29,0x50
+    li 30,0x60
+    li 31,0x70
+
+.align 5
+Lx8_loop: # each iteration encrypts 8 blocks, 0x80 bytes
+    lxvd2x KX,0,KEYS # first round key, at offset 0 of the key array
+    vperm K,K,K,swap_mask # reorder the key bytes with the permute mask, done on both endiannesses
+
+    lxvd2x S0X,0,SRC # load 8 input blocks
+    lxvd2x S1X,25,SRC
+    lxvd2x S2X,26,SRC
+    lxvd2x S3X,27,SRC
+    lxvd2x S4X,28,SRC
+    lxvd2x S5X,29,SRC
+    lxvd2x S6X,30,SRC
+    lxvd2x S7X,31,SRC
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+    vperm S1,S1,S1,swap_mask
+    vperm S2,S2,S2,swap_mask
+    vperm S3,S3,S3,swap_mask
+    vperm S4,S4,S4,swap_mask
+    vperm S5,S5,S5,swap_mask
+    vperm S6,S6,S6,swap_mask
+    vperm S7,S7,S7,swap_mask>)
+
+    vxor S0,S0,K # xor every block with the first round key
+    vxor S1,S1,K
+    vxor S2,S2,K
+    vxor S3,S3,K
+    vxor S4,S4,K
+    vxor S5,S5,K
+    vxor S6,S6,K
+    vxor S7,S7,K
+
+    mtctr ROUNDS # count register = rounds-1
+    li 10,0x10 # GPR 10 = byte offset of the next round key
+.align 5
+L8x_round_loop:
+    lxvd2x KX,10,KEYS # next round key
+    vperm K,K,K,swap_mask
+    vcipher S0,S0,K # one round, the round key is passed directly
+    vcipher S1,S1,K
+    vcipher S2,S2,K
+    vcipher S3,S3,K
+    vcipher S4,S4,K
+    vcipher S5,S5,K
+    vcipher S6,S6,K
+    vcipher S7,S7,K
+    addi 10,10,0x10 # advance to the following round key
+    bdnz L8x_round_loop # decrement the count register, loop while nonzero
+
+    lxvd2x KX,10,KEYS # last round key
+    vperm K,K,K,swap_mask
+    vcipherlast S0,S0,K # last round
+    vcipherlast S1,S1,K
+    vcipherlast S2,S2,K
+    vcipherlast S3,S3,K
+    vcipherlast S4,S4,K
+    vcipherlast S5,S5,K
+    vcipherlast S6,S6,K
+    vcipherlast S7,S7,K
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+    vperm S1,S1,S1,swap_mask
+    vperm S2,S2,S2,swap_mask
+    vperm S3,S3,S3,swap_mask
+    vperm S4,S4,S4,swap_mask
+    vperm S5,S5,S5,swap_mask
+    vperm S6,S6,S6,swap_mask
+    vperm S7,S7,S7,swap_mask>)
+
+    stxvd2x S0X,0,DST # store 8 output blocks
+    stxvd2x S1X,25,DST
+    stxvd2x S2X,26,DST
+    stxvd2x S3X,27,DST
+    stxvd2x S4X,28,DST
+    stxvd2x S5X,29,DST
+    stxvd2x S6X,30,DST
+    stxvd2x S7X,31,DST
+
+    addi SRC,SRC,0x80 # advance both pointers by 8 blocks
+    addi DST,DST,0x80
+    subic. 5,5,1 # one 8-block group done, sets the condition used by bne
+    bne Lx8_loop
+
+    ld 25,-56(SP); # restore GPR 25-31
+    ld 26,-48(SP);
+    ld 27,-40(SP);
+    ld 28,-32(SP);
+    ld 29,-24(SP);
+    ld 30,-16(SP);
+    ld 31,-8(SP);
+
+    clrldi LENGTH,LENGTH,61 # keep the low 3 bits, blocks left after the 8x loop
+
+L4x:
+    srdi 5,LENGTH,2 # nonzero when at least 4 blocks remain
+    cmpldi 5,0
+    beq L2x
+
+    lxvd2x KX,0,KEYS # first round key
+    vperm K,K,K,swap_mask
+
+    lxvd2x S0X,0,SRC # load 4 blocks, GPR 9 steps through the offsets
+    li 9,0x10
+    lxvd2x S1X,9,SRC
+    addi 9,9,0x10
+    lxvd2x S2X,9,SRC
+    addi 9,9,0x10
+    lxvd2x S3X,9,SRC
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+    vperm S1,S1,S1,swap_mask
+    vperm S2,S2,S2,swap_mask
+    vperm S3,S3,S3,swap_mask>)
+
+    vxor S0,S0,K # xor every block with the first round key
+    vxor S1,S1,K
+    vxor S2,S2,K
+    vxor S3,S3,K
+
+    mtctr ROUNDS # count register = rounds-1
+    li 10,0x10 # byte offset of the next round key
+.align 5
+L4x_round_loop:
+    lxvd2x KX,10,KEYS
+    vperm K,K,K,swap_mask
+    vcipher S0,S0,K # one round, the round key is passed directly
+    vcipher S1,S1,K
+    vcipher S2,S2,K
+    vcipher S3,S3,K
+    addi 10,10,0x10
+    bdnz L4x_round_loop
+
+    lxvd2x KX,10,KEYS # last round key
+    vperm K,K,K,swap_mask
+    vcipherlast S0,S0,K
+    vcipherlast S1,S1,K
+    vcipherlast S2,S2,K
+    vcipherlast S3,S3,K
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+    vperm S1,S1,S1,swap_mask
+    vperm S2,S2,S2,swap_mask
+    vperm S3,S3,S3,swap_mask>)
+
+    stxvd2x S0X,0,DST # store 4 blocks
+    li 9,0x10
+    stxvd2x S1X,9,DST
+    addi 9,9,0x10
+    stxvd2x S2X,9,DST
+    addi 9,9,0x10
+    stxvd2x S3X,9,DST
+
+    addi SRC,SRC,0x40 # advance both pointers by 4 blocks
+    addi DST,DST,0x40
+
+    clrldi LENGTH,LENGTH,62 # keep the low 2 bits of the block count
+
+L2x:
+    srdi 5,LENGTH,1 # nonzero when at least 2 blocks remain
+    cmpldi 5,0
+    beq L1x
+
+    lxvd2x KX,0,KEYS # first round key
+    vperm K,K,K,swap_mask
+
+    lxvd2x S0X,0,SRC # load 2 blocks
+    li 9,0x10
+    lxvd2x S1X,9,SRC
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+    vperm S1,S1,S1,swap_mask>)
+
+    vxor S0,S0,K
+    vxor S1,S1,K
+
+    mtctr ROUNDS # count register = rounds-1
+    li 10,0x10 # byte offset of the next round key
+.align 5
+L2x_round_loop:
+    lxvd2x KX,10,KEYS
+    vperm K,K,K,swap_mask
+    vcipher S0,S0,K # one round, the round key is passed directly
+    vcipher S1,S1,K
+    addi 10,10,0x10
+    bdnz L2x_round_loop
+
+    lxvd2x KX,10,KEYS # last round key
+    vperm K,K,K,swap_mask
+    vcipherlast S0,S0,K
+    vcipherlast S1,S1,K
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+    vperm S1,S1,S1,swap_mask>)
+
+    stxvd2x S0X,0,DST # store 2 blocks
+    li 9,0x10
+    stxvd2x S1X,9,DST
+
+    addi SRC,SRC,0x20 # advance both pointers by 2 blocks
+    addi DST,DST,0x20
+
+    clrldi LENGTH,LENGTH,63 # keep the lowest bit of the block count
+
+L1x:
+    cmpldi LENGTH,0 # at most one block can remain here
+    beq Ldone
+
+    lxvd2x KX,0,KEYS # first round key
+    vperm K,K,K,swap_mask
+
+    lxvd2x S0X,0,SRC # load the final block
+
+IF_LE(<vperm S0,S0,S0,swap_mask>)
+
+    vxor S0,S0,K
+
+    mtctr ROUNDS # count register = rounds-1
+    li 10,0x10 # byte offset of the next round key
+.align 5
+L1x_round_loop:
+    lxvd2x KX,10,KEYS
+    vperm K,K,K,swap_mask
+    vcipher S0,S0,K # one round, the round key is passed directly
+    addi 10,10,0x10
+    bdnz L1x_round_loop
+
+    lxvd2x KX,10,KEYS # last round key
+    vperm K,K,K,swap_mask
+    vcipherlast S0,S0,K
+
+IF_LE(<vperm S0,S0,S0,swap_mask>)
+
+    stxvd2x S0X,0,DST # store the final block
+
+Ldone:
+    blr
+EPILOGUE(_nettle_aes_encrypt)
+
+    .data # permute masks for vperm: the little-endian one swaps the two 8-byte halves, the big-endian one reverses the bytes inside each 4-byte word
+    .align 4
+.swap_mask:
+IF_LE(<.byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7>)
+IF_BE(<.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12>)