This patch builds on ppc-m4-macrology.patch. Using explicit register names now works as expected.
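
As a rough illustration (expansions worked out by hand from the new VSR
definition in machine.m4, not generated output), VSR() now yields either a
named vs register or a plain VSX register number, depending on the configure
result:

    dnl With ASM_PPC_WANT_R_REGISTERS=yes, v2 stays a name, so e.g.
    dnl   VSR(S0) = VSR(v2)  expands to  vs34  (strip "v", add 32, prefix "vs")
    dnl With ASM_PPC_WANT_R_REGISTERS=no, v2 is deflit'd to plain 2, so
    dnl   VSR(S0) = VSR(v2)  expands to  34    (eval(32+2))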
---
powerpc64/machine.m4                  |  11 +-
powerpc64/p8/aes-decrypt-internal.asm | 194 +++++++++++++++++-----------------
powerpc64/p8/aes-encrypt-internal.asm | 192 ++++++++++++++++-----------------
3 files changed, 200 insertions(+), 197 deletions(-)
diff --git a/powerpc64/machine.m4 b/powerpc64/machine.m4
index ae161d79..f867ec01 100644
--- a/powerpc64/machine.m4
+++ b/powerpc64/machine.m4
@@ -24,7 +24,10 @@ define(`EPILOGUE',
C Get vector-scalar register from vector register
C VSR(VR)
-define(`VSR',`32+$1')
+define(`VSR',`ifelse(ASM_PPC_WANT_R_REGISTERS,no,
+`eval(32+$1)',
+``vs'eval(32+substr($1,1,len($1)))'
+)')
C Load the quadword in DATA_SRC storage into
C VEC_DST. GPR is general-purpose register
@@ -32,19 +35,19 @@ C used to obtain the effective address of
C DATA_SRC storage.
C DATA_LOAD_VEC(VEC_DST, DATA_SRC, GPR)
define(`DATA_LOAD_VEC',
-`ld $3,$2@got(2)
+`ld $3,$2@got(r2)
lvx $1,0,$3')
dnl Usage: r0 ... r31, cr0 ... cr7
dnl
dnl Registers names, either left as "r0" etc or mapped to plain 0 etc,
-dnl according to the result of the GMP_ASM_POWERPC_REGISTERS configure
+dnl according to the result of the ASM_PPC_WANT_R_REGISTERS configure
dnl test.
ifelse(ASM_PPC_WANT_R_REGISTERS,no,`
forloop(i,0,31,`deflit(`r'i,i)')
forloop(i,0,31,`deflit(`v'i,i)')
+forloop(i,0,63,`deflit(`vs'i,i)')
forloop(i,0,31,`deflit(`f'i,i)')
forloop(i,0,7, `deflit(`cr'i,i)')
')
-
diff --git a/powerpc64/p8/aes-decrypt-internal.asm b/powerpc64/p8/aes-decrypt-internal.asm
index acdbc1bd..7c79ffcb 100644
--- a/powerpc64/p8/aes-decrypt-internal.asm
+++ b/powerpc64/p8/aes-decrypt-internal.asm
@@ -31,32 +31,32 @@ ifelse(`
C Register usage:
-define(`SP', `1')
-define(`TOCP', `2')
-
-define(`ROUNDS', `3')
-define(`KEYS', `4')
-define(`LENGTH', `6')
-define(`DST', `7')
-define(`SRC', `8')
-
-define(`swap_mask', `0')
-
-define(`K', `1')
-define(`S0', `2')
-define(`S1', `3')
-define(`S2', `4')
-define(`S3', `5')
-define(`S4', `6')
-define(`S5', `7')
-define(`S6', `8')
-define(`S7', `9')
+define(`SP', `r1')
+define(`TOCP', `r2')
+
+define(`ROUNDS', `r3')
+define(`KEYS', `r4')
+define(`LENGTH', `r6')
+define(`DST', `r7')
+define(`SRC', `r8')
+
+define(`swap_mask', `v0')
+
+define(`K', `v1')
+define(`S0', `v2')
+define(`S1', `v3')
+define(`S2', `v4')
+define(`S3', `v5')
+define(`S4', `v6')
+define(`S5', `v7')
+define(`S6', `v8')
+define(`S7', `v9')
C ZERO vector register is used in place of RoundKey
C for vncipher instruction because the order of InvMixColumns
C and Xor processes are flipped in that instruction.
C The Xor process with RoundKey is executed afterward.
-define(`ZERO', `10')
+define(`ZERO', `v10')
.file "aes-decrypt-internal.asm"
@@ -71,30 +71,30 @@ define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_aes_decrypt)
vxor ZERO,ZERO,ZERO
- DATA_LOAD_VEC(swap_mask,.swap_mask,5)
+ DATA_LOAD_VEC(swap_mask,.swap_mask,r5)
subi ROUNDS,ROUNDS,1
srdi LENGTH,LENGTH,4
- srdi 5,LENGTH,3 #8x loop count
- cmpldi 5,0
+ srdi r5,LENGTH,3 #8x loop count
+ cmpldi r5,0
beq L4x
- std 25,-56(SP);
- std 26,-48(SP);
- std 27,-40(SP);
- std 28,-32(SP);
- std 29,-24(SP);
- std 30,-16(SP);
- std 31,-8(SP);
-
- li 25,0x10
- li 26,0x20
- li 27,0x30
- li 28,0x40
- li 29,0x50
- li 30,0x60
- li 31,0x70
+ std r25,-56(SP);
+ std r26,-48(SP);
+ std r27,-40(SP);
+ std r28,-32(SP);
+ std r29,-24(SP);
+ std r30,-16(SP);
+ std r31,-8(SP);
+
+ li r25,0x10
+ li r26,0x20
+ li r27,0x30
+ li r28,0x40
+ li r29,0x50
+ li r30,0x60
+ li r31,0x70
.align 5
Lx8_loop:
@@ -102,13 +102,13 @@ Lx8_loop:
vperm K,K,K,swap_mask
lxvd2x VSR(S0),0,SRC
- lxvd2x VSR(S1),25,SRC
- lxvd2x VSR(S2),26,SRC
- lxvd2x VSR(S3),27,SRC
- lxvd2x VSR(S4),28,SRC
- lxvd2x VSR(S5),29,SRC
- lxvd2x VSR(S6),30,SRC
- lxvd2x VSR(S7),31,SRC
+ lxvd2x VSR(S1),r25,SRC
+ lxvd2x VSR(S2),r26,SRC
+ lxvd2x VSR(S3),r27,SRC
+ lxvd2x VSR(S4),r28,SRC
+ lxvd2x VSR(S5),r29,SRC
+ lxvd2x VSR(S6),r30,SRC
+ lxvd2x VSR(S7),r31,SRC
IF_LE(`vperm S0,S0,S0,swap_mask
vperm S1,S1,S1,swap_mask
@@ -129,10 +129,10 @@ IF_LE(`vperm S0,S0,S0,swap_mask
vxor S7,S7,K
mtctr ROUNDS
- li 10,0x10
+ li r10,0x10
.align 5
L8x_round_loop:
- lxvd2x VSR(K),10,KEYS
+ lxvd2x VSR(K),r10,KEYS
vperm K,K,K,swap_mask
vncipher S0,S0,ZERO
vncipher S1,S1,ZERO
@@ -150,10 +150,10 @@ L8x_round_loop:
vxor S5,S5,K
vxor S6,S6,K
vxor S7,S7,K
- addi 10,10,0x10
+ addi r10,r10,0x10
bdnz L8x_round_loop
- lxvd2x VSR(K),10,KEYS
+ lxvd2x VSR(K),r10,KEYS
vperm K,K,K,swap_mask
vncipherlast S0,S0,K
vncipherlast S1,S1,K
@@ -174,44 +174,44 @@ IF_LE(`vperm S0,S0,S0,swap_mask
vperm S7,S7,S7,swap_mask')
stxvd2x VSR(S0),0,DST
- stxvd2x VSR(S1),25,DST
- stxvd2x VSR(S2),26,DST
- stxvd2x VSR(S3),27,DST
- stxvd2x VSR(S4),28,DST
- stxvd2x VSR(S5),29,DST
- stxvd2x VSR(S6),30,DST
- stxvd2x VSR(S7),31,DST
+ stxvd2x VSR(S1),r25,DST
+ stxvd2x VSR(S2),r26,DST
+ stxvd2x VSR(S3),r27,DST
+ stxvd2x VSR(S4),r28,DST
+ stxvd2x VSR(S5),r29,DST
+ stxvd2x VSR(S6),r30,DST
+ stxvd2x VSR(S7),r31,DST
addi SRC,SRC,0x80
addi DST,DST,0x80
- subic. 5,5,1
+ subic. r5,r5,1
bne Lx8_loop
- ld 25,-56(SP);
- ld 26,-48(SP);
- ld 27,-40(SP);
- ld 28,-32(SP);
- ld 29,-24(SP);
- ld 30,-16(SP);
- ld 31,-8(SP);
+ ld r25,-56(SP);
+ ld r26,-48(SP);
+ ld r27,-40(SP);
+ ld r28,-32(SP);
+ ld r29,-24(SP);
+ ld r30,-16(SP);
+ ld r31,-8(SP);
clrldi LENGTH,LENGTH,61
L4x:
- srdi 5,LENGTH,2
- cmpldi 5,0
+ srdi r5,LENGTH,2
+ cmpldi r5,0
beq L2x
lxvd2x VSR(K),0,KEYS
vperm K,K,K,swap_mask
lxvd2x VSR(S0),0,SRC
- li 9,0x10
- lxvd2x VSR(S1),9,SRC
- addi 9,9,0x10
- lxvd2x VSR(S2),9,SRC
- addi 9,9,0x10
- lxvd2x VSR(S3),9,SRC
+ li r9,0x10
+ lxvd2x VSR(S1),r9,SRC
+ addi r9,r9,0x10
+ lxvd2x VSR(S2),r9,SRC
+ addi r9,r9,0x10
+ lxvd2x VSR(S3),r9,SRC
IF_LE(`vperm S0,S0,S0,swap_mask
vperm S1,S1,S1,swap_mask
@@ -224,10 +224,10 @@ IF_LE(`vperm S0,S0,S0,swap_mask
vxor S3,S3,K
mtctr ROUNDS
- li 10,0x10
+ li r10,0x10
.align 5
L4x_round_loop:
- lxvd2x VSR(K),10,KEYS
+ lxvd2x VSR(K),r10,KEYS
vperm K,K,K,swap_mask
vncipher S0,S0,ZERO
vncipher S1,S1,ZERO
@@ -237,10 +237,10 @@ L4x_round_loop:
vxor S1,S1,K
vxor S2,S2,K
vxor S3,S3,K
- addi 10,10,0x10
+ addi r10,r10,0x10
bdnz L4x_round_loop
- lxvd2x VSR(K),10,KEYS
+ lxvd2x VSR(K),r10,KEYS
vperm K,K,K,swap_mask
vncipherlast S0,S0,K
vncipherlast S1,S1,K
@@ -253,12 +253,12 @@ IF_LE(`vperm S0,S0,S0,swap_mask
vperm S3,S3,S3,swap_mask')
stxvd2x VSR(S0),0,DST
- li 9,0x10
- stxvd2x VSR(S1),9,DST
- addi 9,9,0x10
- stxvd2x VSR(S2),9,DST
- addi 9,9,0x10
- stxvd2x VSR(S3),9,DST
+ li r9,0x10
+ stxvd2x VSR(S1),r9,DST
+ addi r9,r9,0x10
+ stxvd2x VSR(S2),r9,DST
+ addi r9,r9,0x10
+ stxvd2x VSR(S3),r9,DST
addi SRC,SRC,0x40
addi DST,DST,0x40
@@ -266,16 +266,16 @@ IF_LE(`vperm S0,S0,S0,swap_mask
clrldi LENGTH,LENGTH,62
L2x:
- srdi 5,LENGTH,1
- cmpldi 5,0
+ srdi r5,LENGTH,1
+ cmpldi r5,0
beq L1x
lxvd2x VSR(K),0,KEYS
vperm K,K,K,swap_mask
lxvd2x VSR(S0),0,SRC
- li 9,0x10
- lxvd2x VSR(S1),9,SRC
+ li r9,0x10
+ lxvd2x VSR(S1),r9,SRC
IF_LE(`vperm S0,S0,S0,swap_mask
vperm S1,S1,S1,swap_mask')
@@ -284,19 +284,19 @@ IF_LE(`vperm S0,S0,S0,swap_mask
vxor S1,S1,K
mtctr ROUNDS
- li 10,0x10
+ li r10,0x10
.align 5
L2x_round_loop:
- lxvd2x VSR(K),10,KEYS
+ lxvd2x VSR(K),r10,KEYS
vperm K,K,K,swap_mask
vncipher S0,S0,ZERO
vncipher S1,S1,ZERO
vxor S0,S0,K
vxor S1,S1,K
- addi 10,10,0x10
+ addi r10,r10,0x10
bdnz L2x_round_loop
- lxvd2x VSR(K),10,KEYS
+ lxvd2x VSR(K),r10,KEYS
vperm K,K,K,swap_mask
vncipherlast S0,S0,K
vncipherlast S1,S1,K
@@ -305,8 +305,8 @@ IF_LE(`vperm S0,S0,S0,swap_mask
vperm S1,S1,S1,swap_mask')
stxvd2x VSR(S0),0,DST
- li 9,0x10
- stxvd2x VSR(S1),9,DST
+ li r9,0x10
+ stxvd2x VSR(S1),r9,DST
addi SRC,SRC,0x20
addi DST,DST,0x20
@@ -327,17 +327,17 @@ IF_LE(`vperm S0,S0,S0,swap_mask')
vxor S0,S0,K
mtctr ROUNDS
- li 10,0x10
+ li r10,0x10
.align 5
L1x_round_loop:
- lxvd2x VSR(K),10,KEYS
+ lxvd2x VSR(K),r10,KEYS
vperm K,K,K,swap_mask
vncipher S0,S0,ZERO
vxor S0,S0,K
- addi 10,10,0x10
+ addi r10,r10,0x10
bdnz L1x_round_loop
- lxvd2x VSR(K),10,KEYS
+ lxvd2x VSR(K),r10,KEYS
vperm K,K,K,swap_mask
vncipherlast S0,S0,K
diff --git a/powerpc64/p8/aes-encrypt-internal.asm b/powerpc64/p8/aes-encrypt-internal.asm
index 482dff25..3dd6e7b5 100644
--- a/powerpc64/p8/aes-encrypt-internal.asm
+++ b/powerpc64/p8/aes-encrypt-internal.asm
@@ -31,26 +31,26 @@ ifelse(`
C Register usage:
-define(`SP', `1')
-define(`TOCP', `2')
-
-define(`ROUNDS', `3')
-define(`KEYS', `4')
-define(`LENGTH', `6')
-define(`DST', `7')
-define(`SRC', `8')
-
-define(`swap_mask', `0')
-
-define(`K', `1')
-define(`S0', `2')
-define(`S1', `3')
-define(`S2', `4')
-define(`S3', `5')
-define(`S4', `6')
-define(`S5', `7')
-define(`S6', `8')
-define(`S7', `9')
+define(`SP', `r1')
+define(`TOCP', `r2')
+
+define(`ROUNDS', `r3')
+define(`KEYS', `r4')
+define(`LENGTH', `r6')
+define(`DST', `r7')
+define(`SRC', `r8')
+
+define(`swap_mask', `v0')
+
+define(`K', `v1')
+define(`S0', `v2')
+define(`S1', `v3')
+define(`S2', `v4')
+define(`S3', `v5')
+define(`S4', `v6')
+define(`S5', `v7')
+define(`S6', `v8')
+define(`S7', `v9')
.file "aes-encrypt-internal.asm"
@@ -63,30 +63,30 @@ define(`S7', `9')
define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_aes_encrypt)
- DATA_LOAD_VEC(swap_mask,.swap_mask,5)
+ DATA_LOAD_VEC(swap_mask,.swap_mask,r5)
subi ROUNDS,ROUNDS,1
srdi LENGTH,LENGTH,4
- srdi 5,LENGTH,3 #8x loop count
- cmpldi 5,0
+ srdi r5,LENGTH,3 #8x loop count
+ cmpldi r5,0
beq L4x
- std 25,-56(SP);
- std 26,-48(SP);
- std 27,-40(SP);
- std 28,-32(SP);
- std 29,-24(SP);
- std 30,-16(SP);
- std 31,-8(SP);
-
- li 25,0x10
- li 26,0x20
- li 27,0x30
- li 28,0x40
- li 29,0x50
- li 30,0x60
- li 31,0x70
+ std r25,-56(SP);
+ std r26,-48(SP);
+ std r27,-40(SP);
+ std r28,-32(SP);
+ std r29,-24(SP);
+ std r30,-16(SP);
+ std r31,-8(SP);
+
+ li r25,0x10
+ li r26,0x20
+ li r27,0x30
+ li r28,0x40
+ li r29,0x50
+ li r30,0x60
+ li r31,0x70
.align 5
Lx8_loop:
@@ -94,13 +94,13 @@ Lx8_loop:
vperm K,K,K,swap_mask
lxvd2x VSR(S0),0,SRC
- lxvd2x VSR(S1),25,SRC
- lxvd2x VSR(S2),26,SRC
- lxvd2x VSR(S3),27,SRC
- lxvd2x VSR(S4),28,SRC
- lxvd2x VSR(S5),29,SRC
- lxvd2x VSR(S6),30,SRC
- lxvd2x VSR(S7),31,SRC
+ lxvd2x VSR(S1),r25,SRC
+ lxvd2x VSR(S2),r26,SRC
+ lxvd2x VSR(S3),r27,SRC
+ lxvd2x VSR(S4),r28,SRC
+ lxvd2x VSR(S5),r29,SRC
+ lxvd2x VSR(S6),r30,SRC
+ lxvd2x VSR(S7),r31,SRC
IF_LE(`vperm S0,S0,S0,swap_mask
vperm S1,S1,S1,swap_mask
@@ -121,10 +121,10 @@ IF_LE(`vperm S0,S0,S0,swap_mask
vxor S7,S7,K
mtctr ROUNDS
- li 10,0x10
+ li r10,0x10
.align 5
L8x_round_loop:
- lxvd2x VSR(K),10,KEYS
+ lxvd2x VSR(K),r10,KEYS
vperm K,K,K,swap_mask
vcipher S0,S0,K
vcipher S1,S1,K
@@ -134,10 +134,10 @@ L8x_round_loop:
vcipher S5,S5,K
vcipher S6,S6,K
vcipher S7,S7,K
- addi 10,10,0x10
+ addi r10,r10,0x10
bdnz L8x_round_loop
- lxvd2x VSR(K),10,KEYS
+ lxvd2x VSR(K),r10,KEYS
vperm K,K,K,swap_mask
vcipherlast S0,S0,K
vcipherlast S1,S1,K
@@ -158,44 +158,44 @@ IF_LE(`vperm S0,S0,S0,swap_mask
vperm S7,S7,S7,swap_mask')
stxvd2x VSR(S0),0,DST
- stxvd2x VSR(S1),25,DST
- stxvd2x VSR(S2),26,DST
- stxvd2x VSR(S3),27,DST
- stxvd2x VSR(S4),28,DST
- stxvd2x VSR(S5),29,DST
- stxvd2x VSR(S6),30,DST
- stxvd2x VSR(S7),31,DST
+ stxvd2x VSR(S1),r25,DST
+ stxvd2x VSR(S2),r26,DST
+ stxvd2x VSR(S3),r27,DST
+ stxvd2x VSR(S4),r28,DST
+ stxvd2x VSR(S5),r29,DST
+ stxvd2x VSR(S6),r30,DST
+ stxvd2x VSR(S7),r31,DST
addi SRC,SRC,0x80
addi DST,DST,0x80
- subic. 5,5,1
+ subic. r5,r5,1
bne Lx8_loop
- ld 25,-56(SP);
- ld 26,-48(SP);
- ld 27,-40(SP);
- ld 28,-32(SP);
- ld 29,-24(SP);
- ld 30,-16(SP);
- ld 31,-8(SP);
+ ld r25,-56(SP);
+ ld r26,-48(SP);
+ ld r27,-40(SP);
+ ld r28,-32(SP);
+ ld r29,-24(SP);
+ ld r30,-16(SP);
+ ld r31,-8(SP);
clrldi LENGTH,LENGTH,61
L4x:
- srdi 5,LENGTH,2
- cmpldi 5,0
+ srdi r5,LENGTH,2
+ cmpldi r5,0
beq L2x
lxvd2x VSR(K),0,KEYS
vperm K,K,K,swap_mask
lxvd2x VSR(S0),0,SRC
- li 9,0x10
- lxvd2x VSR(S1),9,SRC
- addi 9,9,0x10
- lxvd2x VSR(S2),9,SRC
- addi 9,9,0x10
- lxvd2x VSR(S3),9,SRC
+ li r9,0x10
+ lxvd2x VSR(S1),r9,SRC
+ addi r9,r9,0x10
+ lxvd2x VSR(S2),r9,SRC
+ addi r9,r9,0x10
+ lxvd2x VSR(S3),r9,SRC
IF_LE(`vperm S0,S0,S0,swap_mask
vperm S1,S1,S1,swap_mask
@@ -208,19 +208,19 @@ IF_LE(`vperm S0,S0,S0,swap_mask
vxor S3,S3,K
mtctr ROUNDS
- li 10,0x10
+ li r10,0x10
.align 5
L4x_round_loop:
- lxvd2x VSR(K),10,KEYS
+ lxvd2x VSR(K),r10,KEYS
vperm K,K,K,swap_mask
vcipher S0,S0,K
vcipher S1,S1,K
vcipher S2,S2,K
vcipher S3,S3,K
- addi 10,10,0x10
+ addi r10,r10,0x10
bdnz L4x_round_loop
- lxvd2x VSR(K),10,KEYS
+ lxvd2x VSR(K),r10,KEYS
vperm K,K,K,swap_mask
vcipherlast S0,S0,K
vcipherlast S1,S1,K
@@ -233,12 +233,12 @@ IF_LE(`vperm S0,S0,S0,swap_mask
vperm S3,S3,S3,swap_mask')
stxvd2x VSR(S0),0,DST
- li 9,0x10
- stxvd2x VSR(S1),9,DST
- addi 9,9,0x10
- stxvd2x VSR(S2),9,DST
- addi 9,9,0x10
- stxvd2x VSR(S3),9,DST
+ li r9,0x10
+ stxvd2x VSR(S1),r9,DST
+ addi r9,r9,0x10
+ stxvd2x VSR(S2),r9,DST
+ addi r9,r9,0x10
+ stxvd2x VSR(S3),r9,DST
addi SRC,SRC,0x40
addi DST,DST,0x40
@@ -246,16 +246,16 @@ IF_LE(`vperm S0,S0,S0,swap_mask
clrldi LENGTH,LENGTH,62
L2x:
- srdi 5,LENGTH,1
- cmpldi 5,0
+ srdi r5,LENGTH,1
+ cmpldi r5,0
beq L1x
lxvd2x VSR(K),0,KEYS
vperm K,K,K,swap_mask
lxvd2x VSR(S0),0,SRC
- li 9,0x10
- lxvd2x VSR(S1),9,SRC
+ li r9,0x10
+ lxvd2x VSR(S1),r9,SRC
IF_LE(`vperm S0,S0,S0,swap_mask
vperm S1,S1,S1,swap_mask')
@@ -264,17 +264,17 @@ IF_LE(`vperm S0,S0,S0,swap_mask
vxor S1,S1,K
mtctr ROUNDS
- li 10,0x10
+ li r10,0x10
.align 5
L2x_round_loop:
- lxvd2x VSR(K),10,KEYS
+ lxvd2x VSR(K),r10,KEYS
vperm K,K,K,swap_mask
vcipher S0,S0,K
vcipher S1,S1,K
- addi 10,10,0x10
+ addi r10,r10,0x10
bdnz L2x_round_loop
- lxvd2x VSR(K),10,KEYS
+ lxvd2x VSR(K),r10,KEYS
vperm K,K,K,swap_mask
vcipherlast S0,S0,K
vcipherlast S1,S1,K
@@ -283,8 +283,8 @@ IF_LE(`vperm S0,S0,S0,swap_mask
vperm S1,S1,S1,swap_mask')
stxvd2x VSR(S0),0,DST
- li 9,0x10
- stxvd2x VSR(S1),9,DST
+ li r9,0x10
+ stxvd2x VSR(S1),r9,DST
addi SRC,SRC,0x20
addi DST,DST,0x20
@@ -305,16 +305,16 @@ IF_LE(`vperm S0,S0,S0,swap_mask')
vxor S0,S0,K
mtctr ROUNDS
- li 10,0x10
+ li r10,0x10
.align 5
L1x_round_loop:
- lxvd2x VSR(K),10,KEYS
+ lxvd2x VSR(K),r10,KEYS
vperm K,K,K,swap_mask
vcipher S0,S0,K
- addi 10,10,0x10
+ addi r10,r10,0x10
bdnz L1x_round_loop
- lxvd2x VSR(K),10,KEYS
+ lxvd2x VSR(K),r10,KEYS
vperm K,K,K,swap_mask
vcipherlast S0,S0,K
--
2.17.1