I measured the latency and throughput of the vcipher/vncipher/vxor instructions for POWER8:

vcipher/vncipher: throughput 6 instructions per cycle, latency 0.91 clock cycles
vxor: throughput 6 instructions per cycle, latency 0.32 clock cycles

So the ideal option for POWER8 is processing 8 blocks; it has +12% performance over processing 4 blocks.
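To illustrate why the 8-block path helps, here is a rough C sketch of the same interleaving idea using GCC's POWER8 crypto built-ins (the function name and the fixed 8-block shape are just for illustration; the real implementation is the assembly below):

/* Rough sketch (not from the patch) of 8-way interleaving with GCC's
   POWER8 crypto built-ins.  Needs gcc -mcpu=power8; the function name
   and argument layout are invented, and the endianness/key-schedule
   handling that the assembly does with lxvd2x/vperm is omitted. */
#include <altivec.h>

typedef vector unsigned long long block_t;

static void
encrypt8(const block_t *round_keys, unsigned rounds, block_t s[8])
{
  unsigned i, b;

  /* Initial AddRoundKey for all eight blocks. */
  for (b = 0; b < 8; b++)
    s[b] = vec_xor(s[b], round_keys[0]);

  /* Middle rounds: eight independent dependency chains, so several
     vcipher instructions can be in flight at once. */
  for (i = 1; i < rounds; i++)
    for (b = 0; b < 8; b++)
      s[b] = __builtin_crypto_vcipher(s[b], round_keys[i]);

  /* Final round. */
  for (b = 0; b < 8; b++)
    s[b] = __builtin_crypto_vcipherlast(s[b], round_keys[rounds]);
}

With eight independent chains in flight, the core can overlap the latency of each vcipher instead of waiting for one block's round to finish before starting the next.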
---
 powerpc64/P8/aes-decrypt-internal.asm | 367 ++++++++++++++++++++++++++++++++++
 powerpc64/P8/aes-encrypt-internal.asm | 344 +++++++++++++++++++++++++++++++
 2 files changed, 711 insertions(+)
 create mode 100644 powerpc64/P8/aes-decrypt-internal.asm
 create mode 100644 powerpc64/P8/aes-encrypt-internal.asm
diff --git a/powerpc64/P8/aes-decrypt-internal.asm b/powerpc64/P8/aes-decrypt-internal.asm new file mode 100644 index 00000000..f5d64548 --- /dev/null +++ b/powerpc64/P8/aes-decrypt-internal.asm @@ -0,0 +1,367 @@ +C powerpc64/P8/aes-decrypt-internal.asm + +ifelse(< + Copyright (C) 2020 Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +>) + +C Register usage: + +define(<SP>, <1>) +define(<TOCP>, <2>) + +define(<ROUNDS>, <3>) +define(<KEYS>, <4>) +define(<LENGTH>, <6>) +define(<DST>, <7>) +define(<SRC>, <8>) + +define(<swap_mask>, <0>) + +define(<K>, <1>) +define(<S0>, <2>) +define(<S1>, <3>) +define(<S2>, <4>) +define(<S3>, <5>) +define(<S4>, <6>) +define(<S5>, <7>) +define(<S6>, <8>) +define(<S7>, <9>) + +define(<KX>, <33>) +define(<S0X>, <34>) +define(<S1X>, <35>) +define(<S2X>, <36>) +define(<S3X>, <37>) +define(<S4X>, <38>) +define(<S5X>, <39>) +define(<S6X>, <40>) +define(<S7X>, <41>) + +C ZERO vector register is used in place of RoundKey +C for vncipher instruction because the order of InvMixColumns +C and Xor processes are flipped in that instruction. +C The Xor process with RoundKey is executed afterward. 
+define(<ZERO>, <10>) + +.file "aes-decrypt-internal.asm" + +IF_LE(<.abiversion 2>) +.text + + C _aes_decrypt(unsigned rounds, const uint32_t *keys, + C const struct aes_table *T, + C size_t length, uint8_t *dst, + C uint8_t *src) + +define(<FUNC_ALIGN>, <5>) +PROLOGUE(_nettle_aes_decrypt) + vxor ZERO,ZERO,ZERO + + DATA_LOAD_VEC(swap_mask,.swap_mask,5) + + subi ROUNDS,ROUNDS,1 + srdi LENGTH,LENGTH,4 + + srdi 5,LENGTH,3 #8x loop count + cmpldi 5,0 + beq L4x + + std 25,-56(SP); + std 26,-48(SP); + std 27,-40(SP); + std 28,-32(SP); + std 29,-24(SP); + std 30,-16(SP); + std 31,-8(SP); + + li 25,0x10 + li 26,0x20 + li 27,0x30 + li 28,0x40 + li 29,0x50 + li 30,0x60 + li 31,0x70 + +.align 5 +Lx8_loop: + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + lxvd2x S1X,25,SRC + lxvd2x S2X,26,SRC + lxvd2x S3X,27,SRC + lxvd2x S4X,28,SRC + lxvd2x S5X,29,SRC + lxvd2x S6X,30,SRC + lxvd2x S7X,31,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L8x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vncipher S1,S1,ZERO + vncipher S2,S2,ZERO + vncipher S3,S3,ZERO + vncipher S4,S4,ZERO + vncipher S5,S5,ZERO + vncipher S6,S6,ZERO + vncipher S7,S7,ZERO + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + addi 10,10,0x10 + bdnz L8x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + vncipherlast S1,S1,K + vncipherlast S2,S2,K + vncipherlast S3,S3,K + vncipherlast S4,S4,K + vncipherlast S5,S5,K + vncipherlast S6,S6,K + vncipherlast S7,S7,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask>) + + stxvd2x S0X,0,DST + stxvd2x S1X,25,DST + stxvd2x S2X,26,DST + stxvd2x S3X,27,DST + stxvd2x S4X,28,DST + stxvd2x S5X,29,DST + stxvd2x S6X,30,DST + stxvd2x S7X,31,DST + + addi SRC,SRC,0x80 + addi DST,DST,0x80 + subic. 
5,5,1 + bne Lx8_loop + + ld 25,-56(SP); + ld 26,-48(SP); + ld 27,-40(SP); + ld 28,-32(SP); + ld 29,-24(SP); + ld 30,-16(SP); + ld 31,-8(SP); + + clrldi LENGTH,LENGTH,61 + +L4x: + srdi 5,LENGTH,2 + cmpldi 5,0 + beq L2x + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + addi 9,9,0x10 + lxvd2x S2X,9,SRC + addi 9,9,0x10 + lxvd2x S3X,9,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L4x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vncipher S1,S1,ZERO + vncipher S2,S2,ZERO + vncipher S3,S3,ZERO + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + addi 10,10,0x10 + bdnz L4x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + vncipherlast S1,S1,K + vncipherlast S2,S2,K + vncipherlast S3,S3,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask>) + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + addi 9,9,0x10 + stxvd2x S2X,9,DST + addi 9,9,0x10 + stxvd2x S3X,9,DST + + addi SRC,SRC,0x40 + addi DST,DST,0x40 + + clrldi LENGTH,LENGTH,62 + +L2x: + srdi 5,LENGTH,1 + cmpldi 5,0 + beq L1x + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L2x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vncipher S1,S1,ZERO + vxor S0,S0,K + vxor S1,S1,K + addi 10,10,0x10 + bdnz L2x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + vncipherlast S1,S1,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask>) + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + + addi SRC,SRC,0x20 + addi DST,DST,0x20 + + clrldi LENGTH,LENGTH,63 + +L1x: + cmpldi LENGTH,0 + beq Ldone + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask>) + + vxor S0,S0,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L1x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vxor S0,S0,K + addi 10,10,0x10 + bdnz L1x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + +IF_LE(<vperm S0,S0,S0,swap_mask>) + + stxvd2x S0X,0,DST + +Ldone: + blr +EPILOGUE(_nettle_aes_decrypt) + + .data + .align 4 +.swap_mask: +IF_LE(<.byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7>) +IF_BE(<.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12>) diff --git a/powerpc64/P8/aes-encrypt-internal.asm b/powerpc64/P8/aes-encrypt-internal.asm new file mode 100644 index 00000000..3e0fa6f0 --- /dev/null +++ b/powerpc64/P8/aes-encrypt-internal.asm @@ -0,0 +1,344 @@ +C powerpc64/P8/aes-encrypt-internal.asm + +ifelse(< + Copyright (C) 2020 Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. 
+ + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +>) + +C Register usage: + +define(<SP>, <1>) +define(<TOCP>, <2>) + +define(<ROUNDS>, <3>) +define(<KEYS>, <4>) +define(<LENGTH>, <6>) +define(<DST>, <7>) +define(<SRC>, <8>) + +define(<swap_mask>, <0>) + +define(<K>, <1>) +define(<S0>, <2>) +define(<S1>, <3>) +define(<S2>, <4>) +define(<S3>, <5>) +define(<S4>, <6>) +define(<S5>, <7>) +define(<S6>, <8>) +define(<S7>, <9>) + +define(<KX>, <33>) +define(<S0X>, <34>) +define(<S1X>, <35>) +define(<S2X>, <36>) +define(<S3X>, <37>) +define(<S4X>, <38>) +define(<S5X>, <39>) +define(<S6X>, <40>) +define(<S7X>, <41>) + +.file "aes-encrypt-internal.asm" + +IF_LE(<.abiversion 2>) +.text + + C _aes_encrypt(unsigned rounds, const uint32_t *keys, + C const struct aes_table *T, + C size_t length, uint8_t *dst, + C uint8_t *src) + +define(<FUNC_ALIGN>, <5>) +PROLOGUE(_nettle_aes_encrypt) + DATA_LOAD_VEC(swap_mask,.swap_mask,5) + + subi ROUNDS,ROUNDS,1 + srdi LENGTH,LENGTH,4 + + srdi 5,LENGTH,3 #8x loop count + cmpldi 5,0 + beq L4x + + std 25,-56(SP); + std 26,-48(SP); + std 27,-40(SP); + std 28,-32(SP); + std 29,-24(SP); + std 30,-16(SP); + std 31,-8(SP); + + li 25,0x10 + li 26,0x20 + li 27,0x30 + li 28,0x40 + li 29,0x50 + li 30,0x60 + li 31,0x70 + +.align 5 +Lx8_loop: + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + lxvd2x S1X,25,SRC + lxvd2x S2X,26,SRC + lxvd2x S3X,27,SRC + lxvd2x S4X,28,SRC + lxvd2x S5X,29,SRC + lxvd2x S6X,30,SRC + lxvd2x S7X,31,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L8x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + vcipher S1,S1,K + vcipher S2,S2,K + vcipher S3,S3,K + vcipher S4,S4,K + vcipher S5,S5,K + vcipher S6,S6,K + vcipher S7,S7,K + addi 10,10,0x10 + bdnz L8x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + vcipherlast S1,S1,K + vcipherlast S2,S2,K + vcipherlast S3,S3,K + vcipherlast S4,S4,K + vcipherlast S5,S5,K + vcipherlast S6,S6,K + vcipherlast S7,S7,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask>) + + stxvd2x S0X,0,DST + stxvd2x S1X,25,DST + stxvd2x S2X,26,DST + stxvd2x S3X,27,DST + stxvd2x S4X,28,DST + stxvd2x S5X,29,DST + stxvd2x S6X,30,DST + stxvd2x S7X,31,DST + + addi SRC,SRC,0x80 + addi DST,DST,0x80 + subic. 
5,5,1 + bne Lx8_loop + + ld 25,-56(SP); + ld 26,-48(SP); + ld 27,-40(SP); + ld 28,-32(SP); + ld 29,-24(SP); + ld 30,-16(SP); + ld 31,-8(SP); + + clrldi LENGTH,LENGTH,61 + +L4x: + srdi 5,LENGTH,2 + cmpldi 5,0 + beq L2x + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + addi 9,9,0x10 + lxvd2x S2X,9,SRC + addi 9,9,0x10 + lxvd2x S3X,9,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L4x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + vcipher S1,S1,K + vcipher S2,S2,K + vcipher S3,S3,K + addi 10,10,0x10 + bdnz L4x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + vcipherlast S1,S1,K + vcipherlast S2,S2,K + vcipherlast S3,S3,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask>) + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + addi 9,9,0x10 + stxvd2x S2X,9,DST + addi 9,9,0x10 + stxvd2x S3X,9,DST + + addi SRC,SRC,0x40 + addi DST,DST,0x40 + + clrldi LENGTH,LENGTH,62 + +L2x: + srdi 5,LENGTH,1 + cmpldi 5,0 + beq L1x + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L2x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + vcipher S1,S1,K + addi 10,10,0x10 + bdnz L2x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + vcipherlast S1,S1,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask>) + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + + addi SRC,SRC,0x20 + addi DST,DST,0x20 + + clrldi LENGTH,LENGTH,63 + +L1x: + cmpldi LENGTH,0 + beq Ldone + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask>) + + vxor S0,S0,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L1x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + addi 10,10,0x10 + bdnz L1x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + +IF_LE(<vperm S0,S0,S0,swap_mask>) + + stxvd2x S0X,0,DST + +Ldone: + blr +EPILOGUE(_nettle_aes_encrypt) + + .data + .align 4 +.swap_mask: +IF_LE(<.byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7>) +IF_BE(<.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12>)
Maamoun TK maamoun.tk@googlemail.com writes:
I measured the latency and throughput of the vcipher/vncipher/vxor instructions for POWER8:

vcipher/vncipher: throughput 6 instructions per cycle, latency 0.91 clock cycles
vxor: throughput 6 instructions per cycle, latency 0.32 clock cycles
Latency less than one cycle sounds wrong. Usually, simple ALU instructions like xor have a latency of exactly one cycle (i.e., when an instruction starts executing, with all inputs available, the result is available to dependent instructions exactly one cycle later), while deeply pipelined instructions, e.g., multiplication, can have a latency of several cycles but still a throughput of one or a few instructions per cycle.
See https://gmplib.org/~tege/x86-timing.pdf for background and lots of numbers for x86 processors.
So the ideal option for POWER8 is processing 8 blocks; it has +12% performance over processing 4 blocks.
Sounds reasonable to me.
powerpc64/P8/aes-decrypt-internal.asm | 367
I take it "P8" in the path is for power 8? Are the crypto extensions always available for power 8? If not, directory should be named differently.
To get going, I've merged this and the machine.m4 patch to a development branch. I'd like to do things stepwise, first do the minimal configure changes to get AES working (and maybe with default on, to get it exercised by the .gitlab-ci machinery), then add ghash and fat builds (not sure in which order). I wanted to also merge the README patch right away, but that failed due to line breaks in the email.
BTW, about fat tests, I'm considering adding a make target "check-fat" which will run make check with some different settings of NETTLE_FAT_OVERRIDE (platform specific, and determined by configure).
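For illustration, the library side of such an override could look roughly like this (a sketch only; the struct and function names are invented and this is not the actual fat-setup code):

/* Sketch: let NETTLE_FAT_OVERRIDE force a particular code path so
   "make check-fat" can exercise each implementation regardless of the
   hardware probe.  Names (ppc_features, get_ppc_features) are
   hypothetical. */
#include <stdlib.h>
#include <string.h>

struct ppc_features
{
  int have_crypto_ext;
};

static void
get_ppc_features(struct ppc_features *features)
{
  const char *s = getenv("NETTLE_FAT_OVERRIDE");

  if (s)
    {
      /* e.g. NETTLE_FAT_OVERRIDE=none or NETTLE_FAT_OVERRIDE=crypto_ext */
      features->have_crypto_ext = (strstr(s, "crypto_ext") != NULL);
      return;
    }

  /* No override: a real implementation would fall back to the runtime
     hardware probe here; this sketch just defaults to the C code. */
  features->have_crypto_ext = 0;
}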
Regards /Niels
nisse@lysator.liu.se (Niels Möller) writes:
To get going, I've merged this and the machine.m4 patch to a development branch. I'd like to do things stepwise, first do the minimal configure changes to get AES working (and maybe with default on, to get it exercised by the .gitlab-ci machinery),
Seems to pass tests! See https://gitlab.com/gnutls/nettle/-/jobs/647514000 and https://gitlab.com/gnutls/nettle/-/jobs/647514003
The branch is named "power-asm-wip".
Regards, /Niels
On Mon, Jul 20, 2020 at 8:41 PM Niels Möller nisse@lysator.liu.se wrote:
Latency less than one cycle sounds wrong. Usually, simple ALU
instructions like xor have a latency of exactly one cycle (i.e., when an instruction starts executing, with all inputs available, the result is available to dependent instructions exactly one cycle later), while deeply pipelined instructions, e.g., multiplication, can have a latency of several cycles but still a throughput of one or a few instructions per cycle.
I had the same concern. I measured the clock time from the start of the instruction's execution until the start of the next dependent instruction. I'm sure about the latency numbers, but not sure how to reconcile them with cycle counts.
I take it "P8" in the path is for power 8? Are the crypto extensions
always available for power 8? If not, directory should be named differently.
Yes, it stands for POWER8. It's the first processor that supports the crypto extensions, so tying the crypto extensions to POWER8 is fine.
To get going, I've merged this and the machine.m4 patch to a development
branch. I'd like to do things stepwise, first do the minimal configure changes to get AES working (and maybe with default on, to get it exercised by the .gitlab-ci machinery), then add ghash and fat builds (not sure in which order). I wanted to also merge the README patch right away, but that failed due to line breaks in the email.
Great, I will reupload the README file without incompatible line breaks.
BTW, about fat tests, I'm considering adding a make target "check-fat"
which will run make check with some different settings of NETTLE_FAT_OVERRIDE (platform specific, and determined by configure).
I can help implement this feature if you give me more details on how to go about it.
Regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
On Mon, Jul 20, 2020 at 8:41 PM Niels Möller nisse@lysator.liu.se wrote:
Latency less than one cycle sounds wrong.
I had the same concern. I measured the clock time from the start of the instruction's execution until the start of the next dependent instruction. I'm sure about the latency numbers, but not sure how to reconcile them with cycle counts.
You may need to have a *long* chain of dependent instructions to get an accurate measurement of latency.
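For instance (a rough sketch, not a polished benchmark): time a long chain where every vec_xor depends on the previous result, and compare with a loop of independent operations for throughput. Note that the POWER time base ticks much more slowly than the core clock, so time base ticks per operation are not cycles per operation without a conversion.

/* Sketch of a latency measurement: each vec_xor depends on the
   previous result, so elapsed time / N approximates the latency of a
   single vxor (in time base ticks, not core cycles).  Assumes GCC on
   POWER with <altivec.h>. */
#include <altivec.h>
#include <stdint.h>
#include <stdio.h>

#define N 100000000L

int
main(void)
{
  vector unsigned char a = vec_splats((unsigned char) 0x55);
  vector unsigned char b = vec_splats((unsigned char) 0xaa);
  uint64_t t0, t1;
  long i;

  t0 = __builtin_ppc_get_timebase();
  for (i = 0; i < N; i++)
    {
      a = vec_xor(a, b);
      /* Empty asm keeps the compiler from collapsing the chain. */
      __asm__ volatile("" : "+v"(a));
    }
  t1 = __builtin_ppc_get_timebase();

  printf("%.3f time base ticks per dependent vxor\n",
         (double) (t1 - t0) / N);
  return 0;
}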
I take it "P8" in the path is for power 8? Are the crypto extensions always available for power 8? If not, directory should be named differently.
Yes, it stands for POWER8. It's the first processor that supports the crypto extensions, so tying the crypto extensions to POWER8 is fine.
But in the patch for fat builds, you do the runtime check as
+ hwcap2 = getauxval(AT_HWCAP2);
+ features->have_crypto_ext =
+   (hwcap2 & PPC_FEATURE2_VEC_CRYPTO) == PPC_FEATURE2_VEC_CRYPTO ? 1 : 0;
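A self-contained version of that probe, for testing outside the library, might look like this (a sketch; the fallback #define is only there in case the libc headers don't expose the constant):

/* Sketch: standalone check for the POWER8 in-core crypto extension
   (vcipher, vncipher, vpmsumd, ...) via the AT_HWCAP2 auxiliary
   vector.  Assumes Linux with glibc's <sys/auxv.h>. */
#include <sys/auxv.h>
#include <stdio.h>

#ifndef PPC_FEATURE2_VEC_CRYPTO
#define PPC_FEATURE2_VEC_CRYPTO 0x02000000  /* value from the kernel headers */
#endif

int
main(void)
{
  unsigned long hwcap2 = getauxval(AT_HWCAP2);
  int have_crypto_ext = (hwcap2 & PPC_FEATURE2_VEC_CRYPTO) != 0;

  printf("vector crypto extension: %s\n",
         have_crypto_ext ? "available" : "not available");
  return 0;
}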
I think I would prefer to have a matching directory name in the source tree, e.g., powerpc64/crypto_ext.
Are the aes instructions and the ghash instructions (which I imagine also have non-cryptographic uses) part of the same extension?
BTW, about fat tests, I'm considering adding a make target "check-fat" which will run make check with some different settings of NETTLE_FAT_OVERRIDE (platform specific, and determined by configure).
I can help implementing this feature if you give me more details on how to go with it.
The main thing I'm unsure about is that I don't know what extensions the ci test machines can be expected to have. For cross tests, it shouldn't be an issue as long as qemu supports all extensions of interest. But for a native x86_64 fat build, do the test machines have, e.g., the "sha_ni" extension? If not, we'd need to find out, and prune what fat variants we test.
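One quick way to find out on a given x86_64 machine is a cpuid probe like the following (a sketch; the SHA extensions are reported in CPUID leaf 7, EBX bit 29, and show up as "sha_ni" in /proc/cpuinfo):

/* Sketch: check for the x86 SHA extensions on the machine running the
   tests.  Assumes gcc or clang, which both ship <cpuid.h>. */
#include <cpuid.h>
#include <stdio.h>

int
main(void)
{
  unsigned eax, ebx, ecx, edx;

  if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
    {
      printf("CPUID leaf 7 not available\n");
      return 1;
    }
  printf("sha_ni: %s\n", (ebx & (1u << 29)) ? "yes" : "no");
  return 0;
}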
Regards, /Niels
On Wed, Jul 22, 2020 at 6:04 PM Niels Möller nisse@lysator.liu.se wrote:
But in the patch for fat builds, you do the runtime check as
hwcap2 = getauxval(AT_HWCAP2);
features->have_crypto_ext =
(hwcap2 & PPC_FEATURE2_VEC_CRYPTO) == PPC_FEATURE2_VEC_CRYPTO ? 1 : 0;
I think I would prefer to have a matching directory name in the source tree, e.g., powerpc64/crypto_ext.
Are the aes instructions and the ghash instructions (which I imagine also has non-cryptographic uses) part of the same extension?
Yes, both are part of the same extension. I considered calling the directory "P8" for three reasons:

- POWER8 is the first processor that supports the crypto extensions
- I measured the throughput and latency of the instructions on POWER8
- The current implementations can be enhanced further for POWER9 and newer by using arch 3.00-specific instructions, which were introduced in POWER9, so we can call the directory of the new implementations "P9"
The main thing I'm unsure about is that I don't know what extensions the ci test machines can be expected to have. For cross tests, it shouldn't be an issue as long as qemu supports all extensions of interest. But for a native x86_64 fat build, do the test machines have, e.g., the "sha_ni" extension? If not, we'd need to find out, and prune what fat variants we test.
I tested on my fork; "sha_ni" is not supported on the GitLab CI machines.
Regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
Yes, both are part of the same extension. I considered calling the directory "P8" for three reasons:
- POWER8 is the first processor that supports the crypto extensions
- I measured the throughput and latency of the instructions on POWER8
- The current implementations can be enhanced further for POWER9 and newer
by using arch 3.00-specific instructions, which were introduced in POWER9, so we can call the directory of the new implementations "P9"
Ok, let's stay with that naming (but I'll consider changing to lowercase "p8", to match other directory names). If it turns out something more fine-grained is needed later, files can be moved around then.
Regards, /Niels
Sounds good.
Thank you, Mamone
On Fri, Jul 31, 2020 at 9:42 PM Niels Möller nisse@lysator.liu.se wrote:
Maamoun TK maamoun.tk@googlemail.com writes:
Yes, both are part of the same extension. I considered calling the directory "P8" for three reasons:
- POWER8 is the first processor that supports the crypto extensions
- I measured the throughput and latency of the instructions on POWER8
- The current implementations can be enhanced further for POWER9 and newer by using arch 3.00-specific instructions, which were introduced in POWER9, so we can call the directory of the new implementations "P9"
Ok, let's stay with that naming (but I'll consider changing to lowercase "p8", to match other directory names). If it turns out something more fine-grained is needed later, files can be moved around then.
Regards, /Niels
-- Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677. Internet email is subject to wholesale government surveillance.
On Mon, Jul 20, 2020 at 8:41 PM Niels Möller nisse@lysator.liu.se wrote:
then add ghash and fat builds (not sure in which order).
I forgot to mention that you can merge them at any order.
Regards, Mamone
nisse@lysator.liu.se (Niels Möller) writes:
BTW, about fat tests, I'm considering adding a make target "check-fat" which will run make check with some different settings of NETTLE_FAT_OVERRIDE (platform specific, and determined by configure).
I've added this now, with fairly solid coverage for ARM and less coverage for x86_64.
Regards, /Niels
I will add PPC to this check.
Thank you, Mamone
On Fri, Jul 31, 2020 at 8:56 PM Niels Möller nisse@lysator.liu.se wrote:
nisse@lysator.liu.se (Niels Möller) writes:
BTW, about fat tests, I'm considering adding a make target "check-fat" which will run make check with some different settings of NETTLE_FAT_OVERRIDE (platform specific, and determined by configure).
I've added this now, with fairly solid coverage for ARM and less coverage for x86_64.
Regards, /Niels
-- Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677. Internet email is subject to wholesale government surveillance.