---
 powerpc64/aes-decrypt-internal.asm | 579 +++++++++++++++++++++++++++++++++++++
 powerpc64/aes-encrypt-internal.asm | 540 ++++++++++++++++++++++++++++++++++
 2 files changed, 1119 insertions(+)
 create mode 100644 powerpc64/aes-decrypt-internal.asm
 create mode 100644 powerpc64/aes-encrypt-internal.asm
diff --git a/powerpc64/aes-decrypt-internal.asm b/powerpc64/aes-decrypt-internal.asm new file mode 100644 index 00000000..b194d65f --- /dev/null +++ b/powerpc64/aes-decrypt-internal.asm @@ -0,0 +1,579 @@ +C powerpc64/aes-decrypt-internal.asm + +ifelse(< + Copyright (C) 2020 Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +>) + +C Register usage: + +define(<SP>, <1>) +define(<TOCP>, <2>) + +define(<ROUNDS>, <3>) +define(<KEYS>, <4>) +define(<LENGTH>, <6>) +define(<DST>, <7>) +define(<SRC>, <8>) + +define(<swap_mask>, <0>) + +define(<K>, <1>) +define(<S0>, <2>) +define(<S1>, <3>) +define(<S2>, <4>) +define(<S3>, <5>) +define(<S4>, <6>) +define(<S5>, <7>) +define(<S6>, <8>) +define(<S7>, <9>) +define(<S8>, <10>) +define(<S9>, <11>) +define(<S10>, <12>) +define(<S11>, <13>) +define(<S12>, <14>) +define(<S13>, <15>) +define(<S14>, <16>) +define(<S15>, <17>) + +define(<KX>, <33>) +define(<S0X>, <34>) +define(<S1X>, <35>) +define(<S2X>, <36>) +define(<S3X>, <37>) +define(<S4X>, <38>) +define(<S5X>, <39>) +define(<S6X>, <40>) +define(<S7X>, <41>) +define(<S8X>, <42>) +define(<S9X>, <43>) +define(<S10X>, <44>) +define(<S11X>, <45>) +define(<S12X>, <46>) +define(<S13X>, <47>) +define(<S14X>, <48>) +define(<S15X>, <49>) + +C ZERO vector register is used in place of RoundKey +C for vncipher instruction because the order of InvMixColumns +C and Xor processes are flipped in that instruction. +C The Xor process with RoundKey is executed afterward. 
+define(<ZERO>, <18>) + +.file "aes-decrypt-internal.asm" + +IF_LE(<.abiversion 2>) +.text + + C _aes_decrypt(unsigned rounds, const uint32_t *keys, + C const struct aes_table *T, + C size_t length, uint8_t *dst, + C uint8_t *src) + +PROLOGUE(_nettle_aes_decrypt) + vxor ZERO,ZERO,ZERO + + ld 5,.swap_mask@got(TOCP) + lvx swap_mask,0,5 + + subi ROUNDS,ROUNDS,1 + srdi LENGTH,LENGTH,4 + + srdi 5,LENGTH,4 #16x loop count + cmpldi 5,0 + beq L8x + + std 17,-120(SP); + std 18,-112(SP); + std 19,-104(SP); + std 20,-96(SP); + std 21,-88(SP); + std 22,-80(SP); + std 23,-72(SP); + std 24,-64(SP); + std 25,-56(SP); + std 26,-48(SP); + std 27,-40(SP); + std 28,-32(SP); + std 29,-24(SP); + std 30,-16(SP); + std 31,-8(SP); + + li 17,0x10 + li 18,0x20 + li 19,0x30 + li 20,0x40 + li 21,0x50 + li 22,0x60 + li 23,0x70 + li 24,0x80 + li 25,0x90 + li 26,0xA0 + li 27,0xB0 + li 28,0xC0 + li 29,0xD0 + li 30,0xE0 + li 31,0xF0 + +.align 5 +Lx16_loop: + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + lxvd2x S1X,17,SRC + lxvd2x S2X,18,SRC + lxvd2x S3X,19,SRC + lxvd2x S4X,20,SRC + lxvd2x S5X,21,SRC + lxvd2x S6X,22,SRC + lxvd2x S7X,23,SRC + lxvd2x S8X,24,SRC + lxvd2x S9X,25,SRC + lxvd2x S10X,26,SRC + lxvd2x S11X,27,SRC + lxvd2x S12X,28,SRC + lxvd2x S13X,29,SRC + lxvd2x S14X,30,SRC + lxvd2x S15X,31,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask + vperm S8,S8,S8,swap_mask + vperm S9,S9,S9,swap_mask + vperm S10,S10,S10,swap_mask + vperm S11,S11,S11,swap_mask + vperm S12,S12,S12,swap_mask + vperm S13,S13,S13,swap_mask + vperm S14,S14,S14,swap_mask + vperm S15,S15,S15,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + vxor S8,S8,K + vxor S9,S9,K + vxor S10,S10,K + vxor S11,S11,K + vxor S12,S12,K + vxor S13,S13,K + vxor S14,S14,K + vxor S15,S15,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L16x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vncipher S1,S1,ZERO + vncipher S2,S2,ZERO + vncipher S3,S3,ZERO + vncipher S4,S4,ZERO + vncipher S5,S5,ZERO + vncipher S6,S6,ZERO + vncipher S7,S7,ZERO + vncipher S8,S8,ZERO + vncipher S9,S9,ZERO + vncipher S10,S10,ZERO + vncipher S11,S11,ZERO + vncipher S12,S12,ZERO + vncipher S13,S13,ZERO + vncipher S14,S14,ZERO + vncipher S15,S15,ZERO + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + vxor S8,S8,K + vxor S9,S9,K + vxor S10,S10,K + vxor S11,S11,K + vxor S12,S12,K + vxor S13,S13,K + vxor S14,S14,K + vxor S15,S15,K + addi 10,10,0x10 + bdnz L16x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + vncipherlast S1,S1,K + vncipherlast S2,S2,K + vncipherlast S3,S3,K + vncipherlast S4,S4,K + vncipherlast S5,S5,K + vncipherlast S6,S6,K + vncipherlast S7,S7,K + vncipherlast S8,S8,K + vncipherlast S9,S9,K + vncipherlast S10,S10,K + vncipherlast S11,S11,K + vncipherlast S12,S12,K + vncipherlast S13,S13,K + vncipherlast S14,S14,K + vncipherlast S15,S15,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask + vperm S8,S8,S8,swap_mask + vperm S9,S9,S9,swap_mask + vperm S10,S10,S10,swap_mask + vperm S11,S11,S11,swap_mask + vperm 
S12,S12,S12,swap_mask + vperm S13,S13,S13,swap_mask + vperm S14,S14,S14,swap_mask + vperm S15,S15,S15,swap_mask>) + + stxvd2x S0X,0,DST + stxvd2x S1X,17,DST + stxvd2x S2X,18,DST + stxvd2x S3X,19,DST + stxvd2x S4X,20,DST + stxvd2x S5X,21,DST + stxvd2x S6X,22,DST + stxvd2x S7X,23,DST + stxvd2x S8X,24,DST + stxvd2x S9X,25,DST + stxvd2x S10X,26,DST + stxvd2x S11X,27,DST + stxvd2x S12X,28,DST + stxvd2x S13X,29,DST + stxvd2x S14X,30,DST + stxvd2x S15X,31,DST + + addi SRC,SRC,0x100 + addi DST,DST,0x100 + subic. 5,5,1 + bne Lx16_loop + + ld 17,-120(SP); + ld 18,-112(SP); + ld 19,-104(SP); + ld 20,-96(SP); + ld 21,-88(SP); + ld 22,-80(SP); + ld 23,-72(SP); + ld 24,-64(SP); + ld 25,-56(SP); + ld 26,-48(SP); + ld 27,-40(SP); + ld 28,-32(SP); + ld 29,-24(SP); + ld 30,-16(SP); + ld 31,-8(SP); + + clrldi LENGTH,LENGTH,60 + +L8x: + srdi 5,LENGTH,3 + cmpldi 5,0 + beq L4x + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + addi 9,9,0x10 + lxvd2x S2X,9,SRC + addi 9,9,0x10 + lxvd2x S3X,9,SRC + addi 9,9,0x10 + lxvd2x S4X,9,SRC + addi 9,9,0x10 + lxvd2x S5X,9,SRC + addi 9,9,0x10 + lxvd2x S6X,9,SRC + addi 9,9,0x10 + lxvd2x S7X,9,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L8x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vncipher S1,S1,ZERO + vncipher S2,S2,ZERO + vncipher S3,S3,ZERO + vncipher S4,S4,ZERO + vncipher S5,S5,ZERO + vncipher S6,S6,ZERO + vncipher S7,S7,ZERO + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + addi 10,10,0x10 + bdnz L8x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + vncipherlast S1,S1,K + vncipherlast S2,S2,K + vncipherlast S3,S3,K + vncipherlast S4,S4,K + vncipherlast S5,S5,K + vncipherlast S6,S6,K + vncipherlast S7,S7,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask>) + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + addi 9,9,0x10 + stxvd2x S2X,9,DST + addi 9,9,0x10 + stxvd2x S3X,9,DST + addi 9,9,0x10 + stxvd2x S4X,9,DST + addi 9,9,0x10 + stxvd2x S5X,9,DST + addi 9,9,0x10 + stxvd2x S6X,9,DST + addi 9,9,0x10 + stxvd2x S7X,9,DST + + addi SRC,SRC,0x80 + addi DST,DST,0x80 + + clrldi LENGTH,LENGTH,61 + +L4x: + srdi 5,LENGTH,2 + cmpldi 5,0 + beq L2x + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + addi 9,9,0x10 + lxvd2x S2X,9,SRC + addi 9,9,0x10 + lxvd2x S3X,9,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L4x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vncipher S1,S1,ZERO + vncipher S2,S2,ZERO + vncipher S3,S3,ZERO + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + addi 10,10,0x10 + bdnz L4x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + vncipherlast S1,S1,K + vncipherlast 
S2,S2,K + vncipherlast S3,S3,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask>) + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + addi 9,9,0x10 + stxvd2x S2X,9,DST + addi 9,9,0x10 + stxvd2x S3X,9,DST + + addi SRC,SRC,0x40 + addi DST,DST,0x40 + + clrldi LENGTH,LENGTH,62 + +L2x: + srdi 5,LENGTH,1 + cmpldi 5,0 + beq L1x + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L2x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vncipher S1,S1,ZERO + vxor S0,S0,K + vxor S1,S1,K + addi 10,10,0x10 + bdnz L2x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + vncipherlast S1,S1,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask>) + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + + addi SRC,SRC,0x20 + addi DST,DST,0x20 + + clrldi LENGTH,LENGTH,63 + +L1x: + cmpldi LENGTH,0 + beq Ldone + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask>) + + vxor S0,S0,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L1x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vxor S0,S0,K + addi 10,10,0x10 + bdnz L1x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + +IF_LE(<vperm S0,S0,S0,swap_mask>) + + stxvd2x S0X,0,DST + +Ldone: + blr +EPILOGUE(_nettle_aes_decrypt) + + .data + .align 4 +.swap_mask: +IF_LE(<.byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7>) +IF_BE(<.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12>) diff --git a/powerpc64/aes-encrypt-internal.asm b/powerpc64/aes-encrypt-internal.asm new file mode 100644 index 00000000..76630222 --- /dev/null +++ b/powerpc64/aes-encrypt-internal.asm @@ -0,0 +1,540 @@ +C powerpc64/aes-encrypt-internal.asm + +ifelse(< + Copyright (C) 2020 Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+>) + +C Register usage: + +define(<SP>, <1>) +define(<TOCP>, <2>) + +define(<ROUNDS>, <3>) +define(<KEYS>, <4>) +define(<LENGTH>, <6>) +define(<DST>, <7>) +define(<SRC>, <8>) + +define(<swap_mask>, <0>) + +define(<K>, <1>) +define(<S0>, <2>) +define(<S1>, <3>) +define(<S2>, <4>) +define(<S3>, <5>) +define(<S4>, <6>) +define(<S5>, <7>) +define(<S6>, <8>) +define(<S7>, <9>) +define(<S8>, <10>) +define(<S9>, <11>) +define(<S10>, <12>) +define(<S11>, <13>) +define(<S12>, <14>) +define(<S13>, <15>) +define(<S14>, <16>) +define(<S15>, <17>) + +define(<KX>, <33>) +define(<S0X>, <34>) +define(<S1X>, <35>) +define(<S2X>, <36>) +define(<S3X>, <37>) +define(<S4X>, <38>) +define(<S5X>, <39>) +define(<S6X>, <40>) +define(<S7X>, <41>) +define(<S8X>, <42>) +define(<S9X>, <43>) +define(<S10X>, <44>) +define(<S11X>, <45>) +define(<S12X>, <46>) +define(<S13X>, <47>) +define(<S14X>, <48>) +define(<S15X>, <49>) + +.file "aes-encrypt-internal.asm" + +IF_LE(<.abiversion 2>) +.text + + C _aes_encrypt(unsigned rounds, const uint32_t *keys, + C const struct aes_table *T, + C size_t length, uint8_t *dst, + C uint8_t *src) + +PROLOGUE(_nettle_aes_encrypt) + ld 5,.swap_mask@got(TOCP) + lvx swap_mask,0,5 + + subi ROUNDS,ROUNDS,1 + srdi LENGTH,LENGTH,4 + + srdi 5,LENGTH,4 #16x loop count + cmpldi 5,0 + beq L8x + + std 17,-120(SP); + std 18,-112(SP); + std 19,-104(SP); + std 20,-96(SP); + std 21,-88(SP); + std 22,-80(SP); + std 23,-72(SP); + std 24,-64(SP); + std 25,-56(SP); + std 26,-48(SP); + std 27,-40(SP); + std 28,-32(SP); + std 29,-24(SP); + std 30,-16(SP); + std 31,-8(SP); + + li 17,0x10 + li 18,0x20 + li 19,0x30 + li 20,0x40 + li 21,0x50 + li 22,0x60 + li 23,0x70 + li 24,0x80 + li 25,0x90 + li 26,0xA0 + li 27,0xB0 + li 28,0xC0 + li 29,0xD0 + li 30,0xE0 + li 31,0xF0 + +.align 5 +Lx16_loop: + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + lxvd2x S1X,17,SRC + lxvd2x S2X,18,SRC + lxvd2x S3X,19,SRC + lxvd2x S4X,20,SRC + lxvd2x S5X,21,SRC + lxvd2x S6X,22,SRC + lxvd2x S7X,23,SRC + lxvd2x S8X,24,SRC + lxvd2x S9X,25,SRC + lxvd2x S10X,26,SRC + lxvd2x S11X,27,SRC + lxvd2x S12X,28,SRC + lxvd2x S13X,29,SRC + lxvd2x S14X,30,SRC + lxvd2x S15X,31,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask + vperm S8,S8,S8,swap_mask + vperm S9,S9,S9,swap_mask + vperm S10,S10,S10,swap_mask + vperm S11,S11,S11,swap_mask + vperm S12,S12,S12,swap_mask + vperm S13,S13,S13,swap_mask + vperm S14,S14,S14,swap_mask + vperm S15,S15,S15,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + vxor S8,S8,K + vxor S9,S9,K + vxor S10,S10,K + vxor S11,S11,K + vxor S12,S12,K + vxor S13,S13,K + vxor S14,S14,K + vxor S15,S15,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L16x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + vcipher S1,S1,K + vcipher S2,S2,K + vcipher S3,S3,K + vcipher S4,S4,K + vcipher S5,S5,K + vcipher S6,S6,K + vcipher S7,S7,K + vcipher S8,S8,K + vcipher S9,S9,K + vcipher S10,S10,K + vcipher S11,S11,K + vcipher S12,S12,K + vcipher S13,S13,K + vcipher S14,S14,K + vcipher S15,S15,K + addi 10,10,0x10 + bdnz L16x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + vcipherlast S1,S1,K + vcipherlast S2,S2,K + vcipherlast S3,S3,K + vcipherlast S4,S4,K + vcipherlast S5,S5,K + vcipherlast S6,S6,K + vcipherlast S7,S7,K + 
vcipherlast S8,S8,K + vcipherlast S9,S9,K + vcipherlast S10,S10,K + vcipherlast S11,S11,K + vcipherlast S12,S12,K + vcipherlast S13,S13,K + vcipherlast S14,S14,K + vcipherlast S15,S15,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask + vperm S8,S8,S8,swap_mask + vperm S9,S9,S9,swap_mask + vperm S10,S10,S10,swap_mask + vperm S11,S11,S11,swap_mask + vperm S12,S12,S12,swap_mask + vperm S13,S13,S13,swap_mask + vperm S14,S14,S14,swap_mask + vperm S15,S15,S15,swap_mask>) + + stxvd2x S0X,0,DST + stxvd2x S1X,17,DST + stxvd2x S2X,18,DST + stxvd2x S3X,19,DST + stxvd2x S4X,20,DST + stxvd2x S5X,21,DST + stxvd2x S6X,22,DST + stxvd2x S7X,23,DST + stxvd2x S8X,24,DST + stxvd2x S9X,25,DST + stxvd2x S10X,26,DST + stxvd2x S11X,27,DST + stxvd2x S12X,28,DST + stxvd2x S13X,29,DST + stxvd2x S14X,30,DST + stxvd2x S15X,31,DST + + addi SRC,SRC,0x100 + addi DST,DST,0x100 + subic. 5,5,1 + bne Lx16_loop + + ld 17,-120(SP); + ld 18,-112(SP); + ld 19,-104(SP); + ld 20,-96(SP); + ld 21,-88(SP); + ld 22,-80(SP); + ld 23,-72(SP); + ld 24,-64(SP); + ld 25,-56(SP); + ld 26,-48(SP); + ld 27,-40(SP); + ld 28,-32(SP); + ld 29,-24(SP); + ld 30,-16(SP); + ld 31,-8(SP); + + clrldi LENGTH,LENGTH,60 + +L8x: + srdi 5,LENGTH,3 + cmpldi 5,0 + beq L4x + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + addi 9,9,0x10 + lxvd2x S2X,9,SRC + addi 9,9,0x10 + lxvd2x S3X,9,SRC + addi 9,9,0x10 + lxvd2x S4X,9,SRC + addi 9,9,0x10 + lxvd2x S5X,9,SRC + addi 9,9,0x10 + lxvd2x S6X,9,SRC + addi 9,9,0x10 + lxvd2x S7X,9,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L8x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + vcipher S1,S1,K + vcipher S2,S2,K + vcipher S3,S3,K + vcipher S4,S4,K + vcipher S5,S5,K + vcipher S6,S6,K + vcipher S7,S7,K + addi 10,10,0x10 + bdnz L8x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + vcipherlast S1,S1,K + vcipherlast S2,S2,K + vcipherlast S3,S3,K + vcipherlast S4,S4,K + vcipherlast S5,S5,K + vcipherlast S6,S6,K + vcipherlast S7,S7,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask>) + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + addi 9,9,0x10 + stxvd2x S2X,9,DST + addi 9,9,0x10 + stxvd2x S3X,9,DST + addi 9,9,0x10 + stxvd2x S4X,9,DST + addi 9,9,0x10 + stxvd2x S5X,9,DST + addi 9,9,0x10 + stxvd2x S6X,9,DST + addi 9,9,0x10 + stxvd2x S7X,9,DST + + addi SRC,SRC,0x80 + addi DST,DST,0x80 + + clrldi LENGTH,LENGTH,61 + +L4x: + srdi 5,LENGTH,2 + cmpldi 5,0 + beq L2x + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + addi 9,9,0x10 + lxvd2x S2X,9,SRC + addi 9,9,0x10 + lxvd2x S3X,9,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + + mtctr ROUNDS + li 10,0x10 
+.align 5 +L4x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + vcipher S1,S1,K + vcipher S2,S2,K + vcipher S3,S3,K + addi 10,10,0x10 + bdnz L4x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + vcipherlast S1,S1,K + vcipherlast S2,S2,K + vcipherlast S3,S3,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask>) + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + addi 9,9,0x10 + stxvd2x S2X,9,DST + addi 9,9,0x10 + stxvd2x S3X,9,DST + + addi SRC,SRC,0x40 + addi DST,DST,0x40 + + clrldi LENGTH,LENGTH,62 + +L2x: + srdi 5,LENGTH,1 + cmpldi 5,0 + beq L1x + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L2x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + vcipher S1,S1,K + addi 10,10,0x10 + bdnz L2x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + vcipherlast S1,S1,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask>) + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + + addi SRC,SRC,0x20 + addi DST,DST,0x20 + + clrldi LENGTH,LENGTH,63 + +L1x: + cmpldi LENGTH,0 + beq Ldone + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask>) + + vxor S0,S0,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L1x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + addi 10,10,0x10 + bdnz L1x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + +IF_LE(<vperm S0,S0,S0,swap_mask>) + + stxvd2x S0X,0,DST + +Ldone: + blr +EPILOGUE(_nettle_aes_encrypt) + + .data + .align 4 +.swap_mask: +IF_LE(<.byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7>) +IF_BE(<.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12>)
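The comment near the top of the decrypt file ("ZERO vector register is used in place of RoundKey for vncipher...") is the central trick in this patch. A minimal C sketch of that round structure, written with GCC's POWER8 crypto built-ins (build with gcc -mcpu=power8 -mcrypto); the helper names and layout are illustrative only and are not part of the patch or of Nettle's API:

/* Sketch only, not part of the patch: the per-round structure of the
   decrypt loops above, expressed with GCC built-ins. */
#include <altivec.h>

typedef vector unsigned long long v2di;

/* One middle round as the assembly does it: vncipher with an all-zero
   "round key", then a separate vxor with the real subkey K.  Per the
   comment in the patch, the instruction folds its key operand in on the
   wrong side of InvMixColumns for Nettle's key schedule, so the subkey
   is applied afterward instead. */
static inline v2di
decrypt_round (v2di state, v2di k)
{
  const v2di zero = { 0, 0 };
  state = __builtin_crypto_vncipher (state, zero);  /* vncipher S,S,ZERO */
  return state ^ k;                                 /* vxor     S,S,K    */
}

/* Final round: there is no InvMixColumns step to reorder around, so
   vncipherlast takes the subkey directly. */
static inline v2di
decrypt_last_round (v2di state, v2di k)
{
  return __builtin_crypto_vncipherlast (state, k);  /* vncipherlast S,S,K */
}

The encrypt file needs no such workaround; its loops pass the subkey straight to vcipher and vcipherlast.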
Maamoun TK <maamoun.tk@googlemail.com> writes:
+L16x_round_loop:
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vncipher S0,S0,ZERO
+ vncipher S1,S1,ZERO
+ vncipher S2,S2,ZERO
+ vncipher S3,S3,ZERO
+ vncipher S4,S4,ZERO
+ vncipher S5,S5,ZERO
+ vncipher S6,S6,ZERO
+ vncipher S7,S7,ZERO
+ vncipher S8,S8,ZERO
+ vncipher S9,S9,ZERO
+ vncipher S10,S10,ZERO
+ vncipher S11,S11,ZERO
+ vncipher S12,S12,ZERO
+ vncipher S13,S13,ZERO
+ vncipher S14,S14,ZERO
+ vncipher S15,S15,ZERO
+ vxor S0,S0,K
+ vxor S1,S1,K
+ vxor S2,S2,K
+ vxor S3,S3,K
+ vxor S4,S4,K
+ vxor S5,S5,K
+ vxor S6,S6,K
+ vxor S7,S7,K
+ vxor S8,S8,K
+ vxor S9,S9,K
+ vxor S10,S10,K
+ vxor S11,S11,K
+ vxor S12,S12,K
+ vxor S13,S13,K
+ vxor S14,S14,K
+ vxor S15,S15,K
+ addi 10,10,0x10
+ bdnz L16x_round_loop
Do you really need to go all the way to 16 blocks in parallel to saturate the execution units? I'm used to defining throughput and latency of an instruction (e.g., vncipher) as follows:
Throughput: The number of independent vncipher instructions that can be executed per cycle. Can be measured by benchmarking a loop of independent instructions.
Latency: The number of cycles from the start of execution of a vncipher instruction until execution of an instruction depending on the vncipher result can start. Can be measured by benchmarking a loop where each instruction depends on the result of the preceding instruction.
Do you know the throughput and latency of the vncipher and vxor instructions? (Official manuals are not always to be trusted.) Those numbers determine how much parallelism is needed, typically the product of latency and throughput.
Regards, /Niels
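For concreteness, both measurements Niels describes can be approximated with a small user-space loop. A minimal sketch, assuming GCC's __builtin_crypto_vncipher built-in (gcc -mcpu=power8 -mcrypto) and glibc's __ppc_get_timebase(); the timebase does not tick at the core frequency, so either convert with the known timebase and core clock rates or simply compare the two printed numbers as a ratio:

/* Sketch only: vncipher latency (one dependent chain) versus throughput
   (four independent chains).  Loop count and chain count are arbitrary. */
#include <altivec.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/platform/ppc.h>

typedef vector unsigned long long v2di;

#define N 10000000

int
main (void)
{
  v2di k = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
  v2di a = k, b = k, c = k, d = k;
  uint64_t t0, t1;
  long i;

  /* Latency: every vncipher depends on the previous result. */
  t0 = __ppc_get_timebase ();
  for (i = 0; i < N; i++)
    a = __builtin_crypto_vncipher (a, k);
  t1 = __ppc_get_timebase ();
  printf ("dependent:   %.2f timebase ticks per vncipher\n",
          (double) (t1 - t0) / N);

  /* Throughput: independent chains can execute in parallel. */
  t0 = __ppc_get_timebase ();
  for (i = 0; i < N; i++)
    {
      a = __builtin_crypto_vncipher (a, k);
      b = __builtin_crypto_vncipher (b, k);
      c = __builtin_crypto_vncipher (c, k);
      d = __builtin_crypto_vncipher (d, k);
    }
  t1 = __ppc_get_timebase ();
  printf ("independent: %.2f timebase ticks per vncipher\n",
          (double) (t1 - t0) / (4.0 * N));

  /* Use the results so the compiler cannot drop the loops. */
  return (int) (a[0] ^ b[0] ^ c[0] ^ d[0]);
}

With illustrative numbers only, say a measured latency of 6 cycles and 2 independent vncipher instructions completing per cycle, the latency-throughput product suggests keeping about 12 blocks in flight, which is the kind of estimate that decides between the 8-way and 16-way loops above.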
You are right. I measured the throughput and latency of the vncipher and vxor instructions on POWER8 and updated the patch accordingly.