This patch adds a "VSR" macro to improve the syntax of the assembly code. I will create a separate patch for gcm-hash since it hasn't been merged to master yet. I also removed the TODO from the README: I tried using "lxv/stxv" on POWER9 instead of "lxvd2x/stxvd2x", but gcc produced "lxvd2x/stxvd2x" in the binary. I'm not sure whether that is an issue with the gcc variant I used, but it would be problematic since "lxvd2x/stxvd2x" needs permuting in little-endian mode while "lxv/stxv" is endianness-aware.
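To illustrate the intent of the macro: vector register n occupies the same storage as vector-scalar register 32+n, so VSR() just does that arithmetic at m4 expansion time and the assembler evaluates the resulting expression. A rough sketch of how it reads in the sources (the expansion shown assumes K is vector register 1, as implied by the old define(<KX>, <33>)):

  define(<VSR>, <32+$1>)
  C before:  lxvd2x KX,0,KEYS        C needs a separate define(<KX>, <33>)
  C now:     lxvd2x VSR(K),0,KEYS    C expands to: lxvd2x 32+1,0,KEYS

This way the "X"-suffixed defines (KX, S0X, ..., S7X) no longer have to be kept in sync with the vector register defines by hand.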
---
 powerpc64/README                      |  2 -
 powerpc64/machine.m4                  |  4 ++
 powerpc64/p8/aes-decrypt-internal.asm | 97 ++++++++++++++++-------------------
 powerpc64/p8/aes-encrypt-internal.asm | 97 ++++++++++++++++-------------------
 4 files changed, 90 insertions(+), 110 deletions(-)
diff --git a/powerpc64/README b/powerpc64/README
index 5410791f..7301953b 100644
--- a/powerpc64/README
+++ b/powerpc64/README
@@ -53,8 +53,6 @@ in [3] to see an example of accessing unaligned storage operands.
 "lxvd2x/stxvd2x" can be used to load/store data into unaligned storage
 operands but permuting is needed for loading and storing data in
 little-endian mode
 VSX registers are defined with "X" suffix
-TODO: use architecture 3.0 instructions "lxv/stxv" instead for POWER9
- and newer
Function Prologue
diff --git a/powerpc64/machine.m4 b/powerpc64/machine.m4
index 2f91adec..b76bb8b1 100644
--- a/powerpc64/machine.m4
+++ b/powerpc64/machine.m4
@@ -22,6 +22,10 @@ define(<EPILOGUE>,
 <.size .C_NAME($1), . - .C_NAME($1)
 .size C_NAME($1), . - .C_NAME($1)>)>)
+C Get vector-scalar register from vector register
+C VSR(VR)
+define(<VSR>,<32+$1>)
+
 C Load the quadword in DATA_SRC storage into
 C VEC_DST. GPR is general-purpose register
 C used to obtain the effective address of
diff --git a/powerpc64/p8/aes-decrypt-internal.asm b/powerpc64/p8/aes-decrypt-internal.asm
index 7d518cd9..bfedb32b 100644
--- a/powerpc64/p8/aes-decrypt-internal.asm
+++ b/powerpc64/p8/aes-decrypt-internal.asm
@@ -1,4 +1,4 @@
-C powerpc64/P8/aes-decrypt-internal.asm
+C powerpc64/p8/aes-decrypt-internal.asm
 ifelse(<
    Copyright (C) 2020 Mamone Tarsha
@@ -52,16 +52,6 @@ define(<S5>, <7>)
 define(<S6>, <8>)
 define(<S7>, <9>)
-define(<KX>, <33>)
-define(<S0X>, <34>)
-define(<S1X>, <35>)
-define(<S2X>, <36>)
-define(<S3X>, <37>)
-define(<S4X>, <38>)
-define(<S5X>, <39>)
-define(<S6X>, <40>)
-define(<S7X>, <41>)
-
 C ZERO vector register is used in place of RoundKey
 C for vncipher instruction because the order of InvMixColumns
 C and Xor processes are flipped in that instruction.
@@ -70,7 +60,6 @@ define(<ZERO>, <10>)
.file "aes-decrypt-internal.asm"
-IF_LE(<.abiversion 2>)
 .text
 C _aes_decrypt(unsigned rounds, const uint32_t *keys,
@@ -109,17 +98,17 @@ PROLOGUE(_nettle_aes_decrypt)
 .align 5
 Lx8_loop:
- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
 vperm K,K,K,swap_mask
- lxvd2x S0X,0,SRC
- lxvd2x S1X,25,SRC
- lxvd2x S2X,26,SRC
- lxvd2x S3X,27,SRC
- lxvd2x S4X,28,SRC
- lxvd2x S5X,29,SRC
- lxvd2x S6X,30,SRC
- lxvd2x S7X,31,SRC
+ lxvd2x VSR(S0),0,SRC
+ lxvd2x VSR(S1),25,SRC
+ lxvd2x VSR(S2),26,SRC
+ lxvd2x VSR(S3),27,SRC
+ lxvd2x VSR(S4),28,SRC
+ lxvd2x VSR(S5),29,SRC
+ lxvd2x VSR(S6),30,SRC
+ lxvd2x VSR(S7),31,SRC
 IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S1,S1,S1,swap_mask
@@ -143,7 +132,7 @@ IF_LE(<vperm S0,S0,S0,swap_mask
 li 10,0x10
 .align 5
 L8x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vncipher S0,S0,ZERO
 vncipher S1,S1,ZERO
@@ -164,7 +153,7 @@ L8x_round_loop:
 addi 10,10,0x10
 bdnz L8x_round_loop
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vncipherlast S0,S0,K
 vncipherlast S1,S1,K
@@ -184,14 +173,14 @@ IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S6,S6,S6,swap_mask
 vperm S7,S7,S7,swap_mask>)
- stxvd2x S0X,0,DST
- stxvd2x S1X,25,DST
- stxvd2x S2X,26,DST
- stxvd2x S3X,27,DST
- stxvd2x S4X,28,DST
- stxvd2x S5X,29,DST
- stxvd2x S6X,30,DST
- stxvd2x S7X,31,DST
+ stxvd2x VSR(S0),0,DST
+ stxvd2x VSR(S1),25,DST
+ stxvd2x VSR(S2),26,DST
+ stxvd2x VSR(S3),27,DST
+ stxvd2x VSR(S4),28,DST
+ stxvd2x VSR(S5),29,DST
+ stxvd2x VSR(S6),30,DST
+ stxvd2x VSR(S7),31,DST
 addi SRC,SRC,0x80
 addi DST,DST,0x80
@@ -213,16 +202,16 @@ L4x:
 cmpldi 5,0
 beq L2x
- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
 vperm K,K,K,swap_mask
- lxvd2x S0X,0,SRC
+ lxvd2x VSR(S0),0,SRC
 li 9,0x10
- lxvd2x S1X,9,SRC
+ lxvd2x VSR(S1),9,SRC
 addi 9,9,0x10
- lxvd2x S2X,9,SRC
+ lxvd2x VSR(S2),9,SRC
 addi 9,9,0x10
- lxvd2x S3X,9,SRC
+ lxvd2x VSR(S3),9,SRC
 IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S1,S1,S1,swap_mask
@@ -238,7 +227,7 @@ IF_LE(<vperm S0,S0,S0,swap_mask
 li 10,0x10
 .align 5
 L4x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vncipher S0,S0,ZERO
 vncipher S1,S1,ZERO
@@ -251,7 +240,7 @@ L4x_round_loop:
 addi 10,10,0x10
 bdnz L4x_round_loop
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vncipherlast S0,S0,K
 vncipherlast S1,S1,K
@@ -263,13 +252,13 @@ IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S2,S2,S2,swap_mask
 vperm S3,S3,S3,swap_mask>)
- stxvd2x S0X,0,DST
+ stxvd2x VSR(S0),0,DST
 li 9,0x10
- stxvd2x S1X,9,DST
+ stxvd2x VSR(S1),9,DST
 addi 9,9,0x10
- stxvd2x S2X,9,DST
+ stxvd2x VSR(S2),9,DST
 addi 9,9,0x10
- stxvd2x S3X,9,DST
+ stxvd2x VSR(S3),9,DST
 addi SRC,SRC,0x40
 addi DST,DST,0x40
@@ -281,12 +270,12 @@ L2x:
 cmpldi 5,0
 beq L1x
- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
 vperm K,K,K,swap_mask
- lxvd2x S0X,0,SRC
+ lxvd2x VSR(S0),0,SRC
 li 9,0x10
- lxvd2x S1X,9,SRC
+ lxvd2x VSR(S1),9,SRC
 IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S1,S1,S1,swap_mask>)
@@ -298,7 +287,7 @@ IF_LE(<vperm S0,S0,S0,swap_mask
 li 10,0x10
 .align 5
 L2x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vncipher S0,S0,ZERO
 vncipher S1,S1,ZERO
@@ -307,7 +296,7 @@ L2x_round_loop:
 addi 10,10,0x10
 bdnz L2x_round_loop
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vncipherlast S0,S0,K
 vncipherlast S1,S1,K
@@ -315,9 +304,9 @@ L2x_round_loop:
 IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S1,S1,S1,swap_mask>)
- stxvd2x S0X,0,DST
+ stxvd2x VSR(S0),0,DST
 li 9,0x10
- stxvd2x S1X,9,DST
+ stxvd2x VSR(S1),9,DST
 addi SRC,SRC,0x20
 addi DST,DST,0x20
@@ -328,10 +317,10 @@ L1x:
 cmpldi LENGTH,0
 beq Ldone
- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
 vperm K,K,K,swap_mask
- lxvd2x S0X,0,SRC
+ lxvd2x VSR(S0),0,SRC
IF_LE(<vperm S0,S0,S0,swap_mask>)
@@ -341,20 +330,20 @@ IF_LE(<vperm S0,S0,S0,swap_mask>)
 li 10,0x10
 .align 5
 L1x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vncipher S0,S0,ZERO
 vxor S0,S0,K
 addi 10,10,0x10
 bdnz L1x_round_loop
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vncipherlast S0,S0,K
IF_LE(<vperm S0,S0,S0,swap_mask>)
- stxvd2x S0X,0,DST
+ stxvd2x VSR(S0),0,DST
 Ldone:
 blr
diff --git a/powerpc64/p8/aes-encrypt-internal.asm b/powerpc64/p8/aes-encrypt-internal.asm
index c696a4a3..67c7e597 100644
--- a/powerpc64/p8/aes-encrypt-internal.asm
+++ b/powerpc64/p8/aes-encrypt-internal.asm
@@ -1,4 +1,4 @@
-C powerpc64/P8/aes-encrypt-internal.asm
+C powerpc64/p8/aes-encrypt-internal.asm
 ifelse(<
    Copyright (C) 2020 Mamone Tarsha
@@ -52,19 +52,8 @@ define(<S5>, <7>)
 define(<S6>, <8>)
 define(<S7>, <9>)
-define(<KX>, <33>)
-define(<S0X>, <34>)
-define(<S1X>, <35>)
-define(<S2X>, <36>)
-define(<S3X>, <37>)
-define(<S4X>, <38>)
-define(<S5X>, <39>)
-define(<S6X>, <40>)
-define(<S7X>, <41>)
-
 .file "aes-encrypt-internal.asm"
-IF_LE(<.abiversion 2>)
 .text
 C _aes_encrypt(unsigned rounds, const uint32_t *keys,
@@ -101,17 +90,17 @@ PROLOGUE(_nettle_aes_encrypt)
 .align 5
 Lx8_loop:
- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
 vperm K,K,K,swap_mask
- lxvd2x S0X,0,SRC
- lxvd2x S1X,25,SRC
- lxvd2x S2X,26,SRC
- lxvd2x S3X,27,SRC
- lxvd2x S4X,28,SRC
- lxvd2x S5X,29,SRC
- lxvd2x S6X,30,SRC
- lxvd2x S7X,31,SRC
+ lxvd2x VSR(S0),0,SRC
+ lxvd2x VSR(S1),25,SRC
+ lxvd2x VSR(S2),26,SRC
+ lxvd2x VSR(S3),27,SRC
+ lxvd2x VSR(S4),28,SRC
+ lxvd2x VSR(S5),29,SRC
+ lxvd2x VSR(S6),30,SRC
+ lxvd2x VSR(S7),31,SRC
 IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S1,S1,S1,swap_mask
@@ -135,7 +124,7 @@ IF_LE(<vperm S0,S0,S0,swap_mask
 li 10,0x10
 .align 5
 L8x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vcipher S0,S0,K
 vcipher S1,S1,K
@@ -148,7 +137,7 @@ L8x_round_loop:
 addi 10,10,0x10
 bdnz L8x_round_loop
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vcipherlast S0,S0,K
 vcipherlast S1,S1,K
@@ -168,14 +157,14 @@ IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S6,S6,S6,swap_mask
 vperm S7,S7,S7,swap_mask>)
- stxvd2x S0X,0,DST
- stxvd2x S1X,25,DST
- stxvd2x S2X,26,DST
- stxvd2x S3X,27,DST
- stxvd2x S4X,28,DST
- stxvd2x S5X,29,DST
- stxvd2x S6X,30,DST
- stxvd2x S7X,31,DST
+ stxvd2x VSR(S0),0,DST
+ stxvd2x VSR(S1),25,DST
+ stxvd2x VSR(S2),26,DST
+ stxvd2x VSR(S3),27,DST
+ stxvd2x VSR(S4),28,DST
+ stxvd2x VSR(S5),29,DST
+ stxvd2x VSR(S6),30,DST
+ stxvd2x VSR(S7),31,DST
 addi SRC,SRC,0x80
 addi DST,DST,0x80
@@ -197,16 +186,16 @@ L4x:
 cmpldi 5,0
 beq L2x
- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
 vperm K,K,K,swap_mask
- lxvd2x S0X,0,SRC
+ lxvd2x VSR(S0),0,SRC
 li 9,0x10
- lxvd2x S1X,9,SRC
+ lxvd2x VSR(S1),9,SRC
 addi 9,9,0x10
- lxvd2x S2X,9,SRC
+ lxvd2x VSR(S2),9,SRC
 addi 9,9,0x10
- lxvd2x S3X,9,SRC
+ lxvd2x VSR(S3),9,SRC
 IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S1,S1,S1,swap_mask
@@ -222,7 +211,7 @@ IF_LE(<vperm S0,S0,S0,swap_mask
 li 10,0x10
 .align 5
 L4x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vcipher S0,S0,K
 vcipher S1,S1,K
@@ -231,7 +220,7 @@ L4x_round_loop:
 addi 10,10,0x10
 bdnz L4x_round_loop
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vcipherlast S0,S0,K
 vcipherlast S1,S1,K
@@ -243,13 +232,13 @@ IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S2,S2,S2,swap_mask
 vperm S3,S3,S3,swap_mask>)
- stxvd2x S0X,0,DST
+ stxvd2x VSR(S0),0,DST
 li 9,0x10
- stxvd2x S1X,9,DST
+ stxvd2x VSR(S1),9,DST
 addi 9,9,0x10
- stxvd2x S2X,9,DST
+ stxvd2x VSR(S2),9,DST
 addi 9,9,0x10
- stxvd2x S3X,9,DST
+ stxvd2x VSR(S3),9,DST
 addi SRC,SRC,0x40
 addi DST,DST,0x40
@@ -261,12 +250,12 @@ L2x:
 cmpldi 5,0
 beq L1x
- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
 vperm K,K,K,swap_mask
- lxvd2x S0X,0,SRC
+ lxvd2x VSR(S0),0,SRC
 li 9,0x10
- lxvd2x S1X,9,SRC
+ lxvd2x VSR(S1),9,SRC
 IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S1,S1,S1,swap_mask>)
@@ -278,14 +267,14 @@ IF_LE(<vperm S0,S0,S0,swap_mask
 li 10,0x10
 .align 5
 L2x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vcipher S0,S0,K
 vcipher S1,S1,K
 addi 10,10,0x10
 bdnz L2x_round_loop
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vcipherlast S0,S0,K
 vcipherlast S1,S1,K
@@ -293,9 +282,9 @@ L2x_round_loop:
 IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S1,S1,S1,swap_mask>)
- stxvd2x S0X,0,DST
+ stxvd2x VSR(S0),0,DST
 li 9,0x10
- stxvd2x S1X,9,DST
+ stxvd2x VSR(S1),9,DST
 addi SRC,SRC,0x20
 addi DST,DST,0x20
@@ -306,10 +295,10 @@ L1x:
 cmpldi LENGTH,0
 beq Ldone
- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
 vperm K,K,K,swap_mask
- lxvd2x S0X,0,SRC
+ lxvd2x VSR(S0),0,SRC
IF_LE(<vperm S0,S0,S0,swap_mask>)
@@ -319,19 +308,19 @@ IF_LE(<vperm S0,S0,S0,swap_mask>)
 li 10,0x10
 .align 5
 L1x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vcipher S0,S0,K
 addi 10,10,0x10
 bdnz L1x_round_loop
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vcipherlast S0,S0,K
IF_LE(<vperm S0,S0,S0,swap_mask>)
- stxvd2x S0X,0,DST
+ stxvd2x VSR(S0),0,DST
 Ldone:
 blr