This patch adds a "VSR" macro to improve the syntax of the assembly code. I will create a separate patch for gcm-hash since it hasn't been merged to master yet. I also removed the TODO from the README because I tried using "lxv/stxv" on POWER9 instead of "lxvd2x/stxvd2x", but gcc still produced "lxvd2x/stxvd2x" in the binary. I'm not sure if that is an issue with that particular gcc variant, but it would be problematic, since "lxvd2x/stxvd2x" needs permuting in little-endian mode while "lxv/stxv" is endianness-aware.
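For context, VSX registers vs32-vs63 overlap the vector registers v0-v31, so VSR() just offsets a vector register number by 32. Below is a minimal sketch of the pattern the patch switches to, using the K, KEYS and swap_mask names defined in the .asm files in the diff (illustration only, not an additional change):

define(<VSR>,<32+$1>)
define(<K>, <1>)

C VSR(K) expands to 33, the VSX register that aliases v1, so the
C load below fills the same register the vector instructions use as K.
    lxvd2x VSR(K),0,KEYS
    vperm K,K,K,swap_mask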
---
 powerpc64/README                      |  2 -
 powerpc64/machine.m4                  |  4 ++
 powerpc64/p8/aes-decrypt-internal.asm | 97 ++++++++++++++++-------------------
 powerpc64/p8/aes-encrypt-internal.asm | 97 ++++++++++++++++-------------------
 4 files changed, 90 insertions(+), 110 deletions(-)

diff --git a/powerpc64/README b/powerpc64/README
index 5410791f..7301953b 100644
--- a/powerpc64/README
+++ b/powerpc64/README
@@ -53,8 +53,6 @@ in [3] to see an example of accessing unaligned storage operands.
 "lxvd2x/stxvd2x" can be used to load/store data into unaligned storage
 operands but permuting is needed for loading and storing data in
 little-endian mode
 VSX registers are defined with "X" suffix
-TODO: use architecture 3.0 instructions "lxv/stxv" instead for POWER9
- and newer

 Function Prologue
diff --git a/powerpc64/machine.m4 b/powerpc64/machine.m4
index 2f91adec..b76bb8b1 100644
--- a/powerpc64/machine.m4
+++ b/powerpc64/machine.m4
@@ -22,6 +22,10 @@ define(<EPILOGUE>,
 <.size .C_NAME($1), . - .C_NAME($1)
 .size C_NAME($1), . - .C_NAME($1)>)>)

+C Get vector-scalar register from vector register
+C VSR(VR)
+define(<VSR>,<32+$1>)
+
 C Load the quadword in DATA_SRC storage into
 C VEC_DST. GPR is general-purpose register
 C used to obtain the effective address of
diff --git a/powerpc64/p8/aes-decrypt-internal.asm b/powerpc64/p8/aes-decrypt-internal.asm
index 7d518cd9..bfedb32b 100644
--- a/powerpc64/p8/aes-decrypt-internal.asm
+++ b/powerpc64/p8/aes-decrypt-internal.asm
@@ -1,4 +1,4 @@
-C powerpc64/P8/aes-decrypt-internal.asm
+C powerpc64/p8/aes-decrypt-internal.asm
 ifelse(<
    Copyright (C) 2020 Mamone Tarsha
@@ -52,16 +52,6 @@ define(<S5>, <7>)
 define(<S6>, <8>)
 define(<S7>, <9>)

-define(<KX>, <33>)
-define(<S0X>, <34>)
-define(<S1X>, <35>)
-define(<S2X>, <36>)
-define(<S3X>, <37>)
-define(<S4X>, <38>)
-define(<S5X>, <39>)
-define(<S6X>, <40>)
-define(<S7X>, <41>)
-
 C ZERO vector register is used in place of RoundKey
 C for vncipher instruction because the order of InvMixColumns
 C and Xor processes are flipped in that instruction.
@@ -70,7 +60,6 @@ define(<ZERO>, <10>)
.file "aes-decrypt-internal.asm"
-IF_LE(<.abiversion 2>)
 .text

 C _aes_decrypt(unsigned rounds, const uint32_t *keys,
@@ -109,17 +98,17 @@ PROLOGUE(_nettle_aes_decrypt)
 .align 5
 Lx8_loop:
- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
 vperm K,K,K,swap_mask

- lxvd2x S0X,0,SRC
- lxvd2x S1X,25,SRC
- lxvd2x S2X,26,SRC
- lxvd2x S3X,27,SRC
- lxvd2x S4X,28,SRC
- lxvd2x S5X,29,SRC
- lxvd2x S6X,30,SRC
- lxvd2x S7X,31,SRC
+ lxvd2x VSR(S0),0,SRC
+ lxvd2x VSR(S1),25,SRC
+ lxvd2x VSR(S2),26,SRC
+ lxvd2x VSR(S3),27,SRC
+ lxvd2x VSR(S4),28,SRC
+ lxvd2x VSR(S5),29,SRC
+ lxvd2x VSR(S6),30,SRC
+ lxvd2x VSR(S7),31,SRC

 IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S1,S1,S1,swap_mask
@@ -143,7 +132,7 @@ IF_LE(<vperm S0,S0,S0,swap_mask
 li 10,0x10
 .align 5
 L8x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vncipher S0,S0,ZERO
 vncipher S1,S1,ZERO
@@ -164,7 +153,7 @@ L8x_round_loop:
 addi 10,10,0x10
 bdnz L8x_round_loop

- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vncipherlast S0,S0,K
 vncipherlast S1,S1,K
@@ -184,14 +173,14 @@ IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S6,S6,S6,swap_mask
 vperm S7,S7,S7,swap_mask>)

- stxvd2x S0X,0,DST
- stxvd2x S1X,25,DST
- stxvd2x S2X,26,DST
- stxvd2x S3X,27,DST
- stxvd2x S4X,28,DST
- stxvd2x S5X,29,DST
- stxvd2x S6X,30,DST
- stxvd2x S7X,31,DST
+ stxvd2x VSR(S0),0,DST
+ stxvd2x VSR(S1),25,DST
+ stxvd2x VSR(S2),26,DST
+ stxvd2x VSR(S3),27,DST
+ stxvd2x VSR(S4),28,DST
+ stxvd2x VSR(S5),29,DST
+ stxvd2x VSR(S6),30,DST
+ stxvd2x VSR(S7),31,DST
 addi SRC,SRC,0x80
 addi DST,DST,0x80
@@ -213,16 +202,16 @@ L4x:
 cmpldi 5,0
 beq L2x

- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
 vperm K,K,K,swap_mask

- lxvd2x S0X,0,SRC
+ lxvd2x VSR(S0),0,SRC
 li 9,0x10
- lxvd2x S1X,9,SRC
+ lxvd2x VSR(S1),9,SRC
 addi 9,9,0x10
- lxvd2x S2X,9,SRC
+ lxvd2x VSR(S2),9,SRC
 addi 9,9,0x10
- lxvd2x S3X,9,SRC
+ lxvd2x VSR(S3),9,SRC
 IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S1,S1,S1,swap_mask
@@ -238,7 +227,7 @@ IF_LE(<vperm S0,S0,S0,swap_mask
 li 10,0x10
 .align 5
 L4x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vncipher S0,S0,ZERO
 vncipher S1,S1,ZERO
@@ -251,7 +240,7 @@ L4x_round_loop:
 addi 10,10,0x10
 bdnz L4x_round_loop

- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vncipherlast S0,S0,K
 vncipherlast S1,S1,K
@@ -263,13 +252,13 @@ IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S2,S2,S2,swap_mask
 vperm S3,S3,S3,swap_mask>)

- stxvd2x S0X,0,DST
+ stxvd2x VSR(S0),0,DST
 li 9,0x10
- stxvd2x S1X,9,DST
+ stxvd2x VSR(S1),9,DST
 addi 9,9,0x10
- stxvd2x S2X,9,DST
+ stxvd2x VSR(S2),9,DST
 addi 9,9,0x10
- stxvd2x S3X,9,DST
+ stxvd2x VSR(S3),9,DST
 addi SRC,SRC,0x40
 addi DST,DST,0x40
@@ -281,12 +270,12 @@ L2x:
 cmpldi 5,0
 beq L1x

- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
 vperm K,K,K,swap_mask

- lxvd2x S0X,0,SRC
+ lxvd2x VSR(S0),0,SRC
 li 9,0x10
- lxvd2x S1X,9,SRC
+ lxvd2x VSR(S1),9,SRC
 IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S1,S1,S1,swap_mask>)
@@ -298,7 +287,7 @@ IF_LE(<vperm S0,S0,S0,swap_mask
 li 10,0x10
 .align 5
 L2x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vncipher S0,S0,ZERO
 vncipher S1,S1,ZERO
@@ -307,7 +296,7 @@ L2x_round_loop:
 addi 10,10,0x10
 bdnz L2x_round_loop

- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vncipherlast S0,S0,K
 vncipherlast S1,S1,K
@@ -315,9 +304,9 @@ L2x_round_loop:
 IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S1,S1,S1,swap_mask>)

- stxvd2x S0X,0,DST
+ stxvd2x VSR(S0),0,DST
 li 9,0x10
- stxvd2x S1X,9,DST
+ stxvd2x VSR(S1),9,DST
 addi SRC,SRC,0x20
 addi DST,DST,0x20
@@ -328,10 +317,10 @@ L1x:
 cmpldi LENGTH,0
 beq Ldone

- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
 vperm K,K,K,swap_mask

- lxvd2x S0X,0,SRC
+ lxvd2x VSR(S0),0,SRC
IF_LE(<vperm S0,S0,S0,swap_mask>)
@@ -341,20 +330,20 @@ IF_LE(<vperm S0,S0,S0,swap_mask>)
 li 10,0x10
 .align 5
 L1x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vncipher S0,S0,ZERO
 vxor S0,S0,K
 addi 10,10,0x10
 bdnz L1x_round_loop

- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vncipherlast S0,S0,K
IF_LE(<vperm S0,S0,S0,swap_mask>)
- stxvd2x S0X,0,DST
+ stxvd2x VSR(S0),0,DST

 Ldone:
 blr
diff --git a/powerpc64/p8/aes-encrypt-internal.asm b/powerpc64/p8/aes-encrypt-internal.asm
index c696a4a3..67c7e597 100644
--- a/powerpc64/p8/aes-encrypt-internal.asm
+++ b/powerpc64/p8/aes-encrypt-internal.asm
@@ -1,4 +1,4 @@
-C powerpc64/P8/aes-encrypt-internal.asm
+C powerpc64/p8/aes-encrypt-internal.asm
 ifelse(<
    Copyright (C) 2020 Mamone Tarsha
@@ -52,19 +52,8 @@ define(<S5>, <7>)
 define(<S6>, <8>)
 define(<S7>, <9>)

-define(<KX>, <33>)
-define(<S0X>, <34>)
-define(<S1X>, <35>)
-define(<S2X>, <36>)
-define(<S3X>, <37>)
-define(<S4X>, <38>)
-define(<S5X>, <39>)
-define(<S6X>, <40>)
-define(<S7X>, <41>)
-
 .file "aes-encrypt-internal.asm"

-IF_LE(<.abiversion 2>)
 .text
 C _aes_encrypt(unsigned rounds, const uint32_t *keys,
@@ -101,17 +90,17 @@ PROLOGUE(_nettle_aes_encrypt)

 .align 5
 Lx8_loop:
- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
 vperm K,K,K,swap_mask

- lxvd2x S0X,0,SRC
- lxvd2x S1X,25,SRC
- lxvd2x S2X,26,SRC
- lxvd2x S3X,27,SRC
- lxvd2x S4X,28,SRC
- lxvd2x S5X,29,SRC
- lxvd2x S6X,30,SRC
- lxvd2x S7X,31,SRC
+ lxvd2x VSR(S0),0,SRC
+ lxvd2x VSR(S1),25,SRC
+ lxvd2x VSR(S2),26,SRC
+ lxvd2x VSR(S3),27,SRC
+ lxvd2x VSR(S4),28,SRC
+ lxvd2x VSR(S5),29,SRC
+ lxvd2x VSR(S6),30,SRC
+ lxvd2x VSR(S7),31,SRC
 IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S1,S1,S1,swap_mask
@@ -135,7 +124,7 @@ IF_LE(<vperm S0,S0,S0,swap_mask
 li 10,0x10
 .align 5
 L8x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vcipher S0,S0,K
 vcipher S1,S1,K
@@ -148,7 +137,7 @@ L8x_round_loop:
 addi 10,10,0x10
 bdnz L8x_round_loop

- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vcipherlast S0,S0,K
 vcipherlast S1,S1,K
@@ -168,14 +157,14 @@ IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S6,S6,S6,swap_mask
 vperm S7,S7,S7,swap_mask>)

- stxvd2x S0X,0,DST
- stxvd2x S1X,25,DST
- stxvd2x S2X,26,DST
- stxvd2x S3X,27,DST
- stxvd2x S4X,28,DST
- stxvd2x S5X,29,DST
- stxvd2x S6X,30,DST
- stxvd2x S7X,31,DST
+ stxvd2x VSR(S0),0,DST
+ stxvd2x VSR(S1),25,DST
+ stxvd2x VSR(S2),26,DST
+ stxvd2x VSR(S3),27,DST
+ stxvd2x VSR(S4),28,DST
+ stxvd2x VSR(S5),29,DST
+ stxvd2x VSR(S6),30,DST
+ stxvd2x VSR(S7),31,DST
 addi SRC,SRC,0x80
 addi DST,DST,0x80
@@ -197,16 +186,16 @@ L4x:
 cmpldi 5,0
 beq L2x

- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
 vperm K,K,K,swap_mask

- lxvd2x S0X,0,SRC
+ lxvd2x VSR(S0),0,SRC
 li 9,0x10
- lxvd2x S1X,9,SRC
+ lxvd2x VSR(S1),9,SRC
 addi 9,9,0x10
- lxvd2x S2X,9,SRC
+ lxvd2x VSR(S2),9,SRC
 addi 9,9,0x10
- lxvd2x S3X,9,SRC
+ lxvd2x VSR(S3),9,SRC
 IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S1,S1,S1,swap_mask
@@ -222,7 +211,7 @@ IF_LE(<vperm S0,S0,S0,swap_mask
 li 10,0x10
 .align 5
 L4x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vcipher S0,S0,K
 vcipher S1,S1,K
@@ -231,7 +220,7 @@ L4x_round_loop:
 addi 10,10,0x10
 bdnz L4x_round_loop

- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vcipherlast S0,S0,K
 vcipherlast S1,S1,K
@@ -243,13 +232,13 @@ IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S2,S2,S2,swap_mask
 vperm S3,S3,S3,swap_mask>)

- stxvd2x S0X,0,DST
+ stxvd2x VSR(S0),0,DST
 li 9,0x10
- stxvd2x S1X,9,DST
+ stxvd2x VSR(S1),9,DST
 addi 9,9,0x10
- stxvd2x S2X,9,DST
+ stxvd2x VSR(S2),9,DST
 addi 9,9,0x10
- stxvd2x S3X,9,DST
+ stxvd2x VSR(S3),9,DST
 addi SRC,SRC,0x40
 addi DST,DST,0x40
@@ -261,12 +250,12 @@ L2x:
 cmpldi 5,0
 beq L1x

- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
 vperm K,K,K,swap_mask

- lxvd2x S0X,0,SRC
+ lxvd2x VSR(S0),0,SRC
 li 9,0x10
- lxvd2x S1X,9,SRC
+ lxvd2x VSR(S1),9,SRC
 IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S1,S1,S1,swap_mask>)
@@ -278,14 +267,14 @@ IF_LE(<vperm S0,S0,S0,swap_mask
 li 10,0x10
 .align 5
 L2x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vcipher S0,S0,K
 vcipher S1,S1,K
 addi 10,10,0x10
 bdnz L2x_round_loop

- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vcipherlast S0,S0,K
 vcipherlast S1,S1,K
@@ -293,9 +282,9 @@ L2x_round_loop:
 IF_LE(<vperm S0,S0,S0,swap_mask
 vperm S1,S1,S1,swap_mask>)

- stxvd2x S0X,0,DST
+ stxvd2x VSR(S0),0,DST
 li 9,0x10
- stxvd2x S1X,9,DST
+ stxvd2x VSR(S1),9,DST
 addi SRC,SRC,0x20
 addi DST,DST,0x20
@@ -306,10 +295,10 @@ L1x:
 cmpldi LENGTH,0
 beq Ldone

- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
 vperm K,K,K,swap_mask

- lxvd2x S0X,0,SRC
+ lxvd2x VSR(S0),0,SRC
IF_LE(<vperm S0,S0,S0,swap_mask>)
@@ -319,19 +308,19 @@ IF_LE(<vperm S0,S0,S0,swap_mask>)
 li 10,0x10
 .align 5
 L1x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vcipher S0,S0,K
 addi 10,10,0x10
 bdnz L1x_round_loop

- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
 vperm K,K,K,swap_mask
 vcipherlast S0,S0,K
IF_LE(<vperm S0,S0,S0,swap_mask>)
- stxvd2x S0X,0,DST
+ stxvd2x VSR(S0),0,DST
 Ldone:
 blr
Maamoun TK <maamoun.tk@googlemail.com> writes:

> This patch adds a "VSR" macro to improve the syntax of the assembly code,
Thanks, merged to master now.
Regards, /Niels
Maamoun TK <maamoun.tk@googlemail.com> writes:

> This patch adds a "VSR" macro to improve the syntax of the assembly code,
Speaking of syntax, I've had a quick look at the powerpc64 assembly in GMP, and it seems to use symbols like r1, r2, r3, etc. for the general-purpose registers, and v0, v1, v2, etc. for the vector registers. I think that's a bit clearer than using raw numbers to reference registers. It should also, hopefully, result in compile-time errors if one accidentally uses a general-purpose register where a vector register is expected, or vice versa.
I guess both ways work with relevant assemblers?
Regards, /Niels
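For comparison, the same kind of load written both ways would look roughly like this (register choices are arbitrary, and the named form assumes an assembler that accepts the r/v/vs names):

C raw register numbers, as the files are written today
C (33 is the VSX register that aliases v1)
    lxvd2x 33,0,3
    vperm 1,1,1,2

C symbolic names in the GMP style
    lxvd2x vs33,0,r3
    vperm v1,v1,v1,v2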
If these registers are used explicitly, GAS will yield a bunch of errors like "Error: unsupported relocation against r1" unless "-mregnames" is passed to the assembler. I can add "-Wa,-mregnames" to CFLAGS in configure.ac and modify the assembly files to improve the syntax.
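A hypothetical configure.ac fragment for that, purely as a sketch (the placement and guards are illustrative, not the actual change):

dnl Illustrative only: let GAS accept r1/v1/vs33-style names
dnl when building the powerpc64 assembly files.
case "$host_cpu" in
  powerpc64 | powerpc64le)
    CFLAGS="$CFLAGS -Wa,-mregnames"
    ;;
esac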
Maamoun TK <maamoun.tk@googlemail.com> writes:

> If these registers are used explicitly, GAS will yield a bunch of errors
> like "Error: unsupported relocation against r1" unless "-mregnames" is
> passed to the assembler. I can add "-Wa,-mregnames" to CFLAGS in
> configure.ac and modify the assembly files to improve the syntax.
I think I'll adopt the configure check and related m4 things from GMP. It's documented like this:
dnl GMP_ASM_POWERPC_R_REGISTERS
dnl ---------------------------
dnl Determine whether the assembler takes powerpc registers with an "r" as
dnl in "r6", or as plain "6". The latter is standard, but NeXT, Rhapsody,
dnl and MacOS-X require the "r" forms.
dnl
dnl See also mpn/powerpc32/powerpc-defs.m4 which uses the result of this
dnl test.
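A minimal autoconf sketch of that kind of test (the macro name, cache variable, and probe instruction here are illustrative, not GMP's actual implementation):

AC_DEFUN([NETTLE_ASM_POWERPC_R_REGISTERS],
[AC_CACHE_CHECK([if the assembler wants "r" register names],
  [nettle_cv_asm_powerpc_r_registers],
  [AC_COMPILE_IFELSE(
    [AC_LANG_PROGRAM([], [[__asm__("mr r6, r7");]])],
    [nettle_cv_asm_powerpc_r_registers=yes],
    [nettle_cv_asm_powerpc_r_registers=no])])])

The asm files could then go through a small m4 wrapper that expands a register macro to either "r6" or plain "6" depending on the result, which is roughly what GMP's powerpc-defs.m4 does with the outcome of its test.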
I'd like to also try out -Wa,-mregnames, if that's needed for register names on more mainstream ppc systems.
Regards, /Niels