This patch is built upon ppc-m4-macrology.patch. Using explicit register names is working as expected now.
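For illustration, here is a minimal standalone sketch (not part of the patch) of what the new VSR() mapping does. It assumes plain GNU m4 and hard-codes only the named-register branch, i.e. the case where ASM_PPC_WANT_R_REGISTERS is not "no"; the VSX registers vs32-vs63 overlay the VMX registers v0-v31, so VSR(vN) should give vs(N+32):

  dnl sketch.m4 -- run with: m4 sketch.m4
  dnl Map a VMX register name vN to the overlapping VSX register vs(N+32).
  define(`VSR', ``vs'eval(32+substr($1,1,len($1)))')dnl
  VSR(v1)
  VSR(v10)

Running it prints vs33 and vs42, which is what the lxvd2x/stxvd2x operands below expand to. When ASM_PPC_WANT_R_REGISTERS is "no" the other branch is taken and 32+$1 is evaluated to a bare number, since the vN names are then deflit'd to plain numbers by the forloop block.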
---
 powerpc64/machine.m4                  |  11 +-
 powerpc64/p8/aes-decrypt-internal.asm | 194 +++++++++++++++++-----------------
 powerpc64/p8/aes-encrypt-internal.asm | 192 ++++++++++++++++-----------------
 3 files changed, 200 insertions(+), 197 deletions(-)

diff --git a/powerpc64/machine.m4 b/powerpc64/machine.m4
index ae161d79..f867ec01 100644
--- a/powerpc64/machine.m4
+++ b/powerpc64/machine.m4
@@ -24,7 +24,10 @@ define(`EPILOGUE',
 
 C Get vector-scalar register from vector register
 C VSR(VR)
-define(`VSR',`32+$1')
+define(`VSR',`ifelse(ASM_PPC_WANT_R_REGISTERS,no,
+`eval(32+$1)',
+``vs'eval(32+substr($1,1,len($1)))'
+)')
 
 C Load the quadword in DATA_SRC storage into
 C VEC_DST. GPR is general-purpose register
@@ -32,19 +35,19 @@ C used to obtain the effective address of
 C DATA_SRC storage.
 C DATA_LOAD_VEC(VEC_DST, DATA_SRC, GPR)
 define(`DATA_LOAD_VEC',
-`ld $3,$2@got(2)
+`ld $3,$2@got(r2)
 lvx $1,0,$3')
 
 dnl Usage: r0 ... r31, cr0 ... cr7
 dnl
 dnl Registers names, either left as "r0" etc or mapped to plain 0 etc,
-dnl according to the result of the GMP_ASM_POWERPC_REGISTERS configure
+dnl according to the result of the ASM_PPC_WANT_R_REGISTERS configure
 dnl test.
 
 ifelse(ASM_PPC_WANT_R_REGISTERS,no,`
 forloop(i,0,31,`deflit(`r'i,i)')
 forloop(i,0,31,`deflit(`v'i,i)')
+forloop(i,0,63,`deflit(`vs'i,i)')
 forloop(i,0,31,`deflit(`f'i,i)')
 forloop(i,0,7, `deflit(`cr'i,i)')
 ')
-
diff --git a/powerpc64/p8/aes-decrypt-internal.asm b/powerpc64/p8/aes-decrypt-internal.asm
index acdbc1bd..7c79ffcb 100644
--- a/powerpc64/p8/aes-decrypt-internal.asm
+++ b/powerpc64/p8/aes-decrypt-internal.asm
@@ -31,32 +31,32 @@ ifelse(`
 
 C Register usage:
 
-define(`SP', `1')
-define(`TOCP', `2')
-
-define(`ROUNDS', `3')
-define(`KEYS', `4')
-define(`LENGTH', `6')
-define(`DST', `7')
-define(`SRC', `8')
-
-define(`swap_mask', `0')
-
-define(`K', `1')
-define(`S0', `2')
-define(`S1', `3')
-define(`S2', `4')
-define(`S3', `5')
-define(`S4', `6')
-define(`S5', `7')
-define(`S6', `8')
-define(`S7', `9')
+define(`SP', `r1')
+define(`TOCP', `r2')
+
+define(`ROUNDS', `r3')
+define(`KEYS', `r4')
+define(`LENGTH', `r6')
+define(`DST', `r7')
+define(`SRC', `r8')
+
+define(`swap_mask', `v0')
+
+define(`K', `v1')
+define(`S0', `v2')
+define(`S1', `v3')
+define(`S2', `v4')
+define(`S3', `v5')
+define(`S4', `v6')
+define(`S5', `v7')
+define(`S6', `v8')
+define(`S7', `v9')
 
 C ZERO vector register is used in place of RoundKey
 C for vncipher instruction because the order of InvMixColumns
 C and Xor processes are flipped in that instruction.
 C The Xor process with RoundKey is executed afterward.
-define(`ZERO', `10')
+define(`ZERO', `v10')
 
 	.file "aes-decrypt-internal.asm"
 
@@ -71,30 +71,30 @@ define(`FUNC_ALIGN', `5')
 PROLOGUE(_nettle_aes_decrypt)
 	vxor ZERO,ZERO,ZERO
 
-	DATA_LOAD_VEC(swap_mask,.swap_mask,5)
+	DATA_LOAD_VEC(swap_mask,.swap_mask,r5)
 
 	subi ROUNDS,ROUNDS,1
 	srdi LENGTH,LENGTH,4
 
-	srdi 5,LENGTH,3 #8x loop count
-	cmpldi 5,0
+	srdi r5,LENGTH,3 #8x loop count
+	cmpldi r5,0
 	beq L4x
 
-	std 25,-56(SP);
-	std 26,-48(SP);
-	std 27,-40(SP);
-	std 28,-32(SP);
-	std 29,-24(SP);
-	std 30,-16(SP);
-	std 31,-8(SP);
-
-	li 25,0x10
-	li 26,0x20
-	li 27,0x30
-	li 28,0x40
-	li 29,0x50
-	li 30,0x60
-	li 31,0x70
+	std r25,-56(SP);
+	std r26,-48(SP);
+	std r27,-40(SP);
+	std r28,-32(SP);
+	std r29,-24(SP);
+	std r30,-16(SP);
+	std r31,-8(SP);
+
+	li r25,0x10
+	li r26,0x20
+	li r27,0x30
+	li r28,0x40
+	li r29,0x50
+	li r30,0x60
+	li r31,0x70
 
 	.align 5
 Lx8_loop:
@@ -102,13 +102,13 @@ Lx8_loop:
 	vperm K,K,K,swap_mask
 
 	lxvd2x VSR(S0),0,SRC
-	lxvd2x VSR(S1),25,SRC
-	lxvd2x VSR(S2),26,SRC
-	lxvd2x VSR(S3),27,SRC
-	lxvd2x VSR(S4),28,SRC
-	lxvd2x VSR(S5),29,SRC
-	lxvd2x VSR(S6),30,SRC
-	lxvd2x VSR(S7),31,SRC
+	lxvd2x VSR(S1),r25,SRC
+	lxvd2x VSR(S2),r26,SRC
+	lxvd2x VSR(S3),r27,SRC
+	lxvd2x VSR(S4),r28,SRC
+	lxvd2x VSR(S5),r29,SRC
+	lxvd2x VSR(S6),r30,SRC
+	lxvd2x VSR(S7),r31,SRC
 
 IF_LE(`vperm S0,S0,S0,swap_mask
 	vperm S1,S1,S1,swap_mask
@@ -129,10 +129,10 @@ IF_LE(`vperm S0,S0,S0,swap_mask
 	vxor S7,S7,K
 
 	mtctr ROUNDS
-	li 10,0x10
+	li r10,0x10
 	.align 5
 L8x_round_loop:
-	lxvd2x VSR(K),10,KEYS
+	lxvd2x VSR(K),r10,KEYS
 	vperm K,K,K,swap_mask
 	vncipher S0,S0,ZERO
 	vncipher S1,S1,ZERO
@@ -150,10 +150,10 @@ L8x_round_loop:
 	vxor S5,S5,K
 	vxor S6,S6,K
 	vxor S7,S7,K
-	addi 10,10,0x10
+	addi r10,r10,0x10
 	bdnz L8x_round_loop
 
-	lxvd2x VSR(K),10,KEYS
+	lxvd2x VSR(K),r10,KEYS
 	vperm K,K,K,swap_mask
 	vncipherlast S0,S0,K
 	vncipherlast S1,S1,K
@@ -174,44 +174,44 @@ IF_LE(`vperm S0,S0,S0,swap_mask
 	vperm S7,S7,S7,swap_mask')
 
 	stxvd2x VSR(S0),0,DST
-	stxvd2x VSR(S1),25,DST
-	stxvd2x VSR(S2),26,DST
-	stxvd2x VSR(S3),27,DST
-	stxvd2x VSR(S4),28,DST
-	stxvd2x VSR(S5),29,DST
-	stxvd2x VSR(S6),30,DST
-	stxvd2x VSR(S7),31,DST
+	stxvd2x VSR(S1),r25,DST
+	stxvd2x VSR(S2),r26,DST
+	stxvd2x VSR(S3),r27,DST
+	stxvd2x VSR(S4),r28,DST
+	stxvd2x VSR(S5),r29,DST
+	stxvd2x VSR(S6),r30,DST
+	stxvd2x VSR(S7),r31,DST
 
 	addi SRC,SRC,0x80
 	addi DST,DST,0x80
-	subic. 5,5,1
+	subic. r5,r5,1
 	bne Lx8_loop
 
-	ld 25,-56(SP);
-	ld 26,-48(SP);
-	ld 27,-40(SP);
-	ld 28,-32(SP);
-	ld 29,-24(SP);
-	ld 30,-16(SP);
-	ld 31,-8(SP);
+	ld r25,-56(SP);
+	ld r26,-48(SP);
+	ld r27,-40(SP);
+	ld r28,-32(SP);
+	ld r29,-24(SP);
+	ld r30,-16(SP);
+	ld r31,-8(SP);
 
 	clrldi LENGTH,LENGTH,61
 
 L4x:
-	srdi 5,LENGTH,2
-	cmpldi 5,0
+	srdi r5,LENGTH,2
+	cmpldi r5,0
 	beq L2x
 
 	lxvd2x VSR(K),0,KEYS
 	vperm K,K,K,swap_mask
 
 	lxvd2x VSR(S0),0,SRC
-	li 9,0x10
-	lxvd2x VSR(S1),9,SRC
-	addi 9,9,0x10
-	lxvd2x VSR(S2),9,SRC
-	addi 9,9,0x10
-	lxvd2x VSR(S3),9,SRC
+	li r9,0x10
+	lxvd2x VSR(S1),r9,SRC
+	addi r9,r9,0x10
+	lxvd2x VSR(S2),r9,SRC
+	addi r9,r9,0x10
+	lxvd2x VSR(S3),r9,SRC
 
 IF_LE(`vperm S0,S0,S0,swap_mask
 	vperm S1,S1,S1,swap_mask
@@ -224,10 +224,10 @@ IF_LE(`vperm S0,S0,S0,swap_mask
 	vxor S3,S3,K
 
 	mtctr ROUNDS
-	li 10,0x10
+	li r10,0x10
 	.align 5
 L4x_round_loop:
-	lxvd2x VSR(K),10,KEYS
+	lxvd2x VSR(K),r10,KEYS
 	vperm K,K,K,swap_mask
 	vncipher S0,S0,ZERO
 	vncipher S1,S1,ZERO
@@ -237,10 +237,10 @@ L4x_round_loop:
 	vxor S1,S1,K
 	vxor S2,S2,K
 	vxor S3,S3,K
-	addi 10,10,0x10
+	addi r10,r10,0x10
 	bdnz L4x_round_loop
 
-	lxvd2x VSR(K),10,KEYS
+	lxvd2x VSR(K),r10,KEYS
 	vperm K,K,K,swap_mask
 	vncipherlast S0,S0,K
 	vncipherlast S1,S1,K
@@ -253,12 +253,12 @@ IF_LE(`vperm S0,S0,S0,swap_mask
 	vperm S3,S3,S3,swap_mask')
 
 	stxvd2x VSR(S0),0,DST
-	li 9,0x10
-	stxvd2x VSR(S1),9,DST
-	addi 9,9,0x10
-	stxvd2x VSR(S2),9,DST
-	addi 9,9,0x10
-	stxvd2x VSR(S3),9,DST
+	li r9,0x10
+	stxvd2x VSR(S1),r9,DST
+	addi r9,r9,0x10
+	stxvd2x VSR(S2),r9,DST
+	addi r9,r9,0x10
+	stxvd2x VSR(S3),r9,DST
 
 	addi SRC,SRC,0x40
 	addi DST,DST,0x40
@@ -266,16 +266,16 @@ IF_LE(`vperm S0,S0,S0,swap_mask
 	clrldi LENGTH,LENGTH,62
 
 L2x:
-	srdi 5,LENGTH,1
-	cmpldi 5,0
+	srdi r5,LENGTH,1
+	cmpldi r5,0
 	beq L1x
 
 	lxvd2x VSR(K),0,KEYS
 	vperm K,K,K,swap_mask
 
 	lxvd2x VSR(S0),0,SRC
-	li 9,0x10
-	lxvd2x VSR(S1),9,SRC
+	li r9,0x10
+	lxvd2x VSR(S1),r9,SRC
 
 IF_LE(`vperm S0,S0,S0,swap_mask
 	vperm S1,S1,S1,swap_mask')
@@ -284,19 +284,19 @@ IF_LE(`vperm S0,S0,S0,swap_mask
 	vxor S1,S1,K
 
 	mtctr ROUNDS
-	li 10,0x10
+	li r10,0x10
 	.align 5
 L2x_round_loop:
-	lxvd2x VSR(K),10,KEYS
+	lxvd2x VSR(K),r10,KEYS
 	vperm K,K,K,swap_mask
 	vncipher S0,S0,ZERO
 	vncipher S1,S1,ZERO
 	vxor S0,S0,K
 	vxor S1,S1,K
-	addi 10,10,0x10
+	addi r10,r10,0x10
 	bdnz L2x_round_loop
 
-	lxvd2x VSR(K),10,KEYS
+	lxvd2x VSR(K),r10,KEYS
 	vperm K,K,K,swap_mask
 	vncipherlast S0,S0,K
 	vncipherlast S1,S1,K
@@ -305,8 +305,8 @@ IF_LE(`vperm S0,S0,S0,swap_mask
 	vperm S1,S1,S1,swap_mask')
 
 	stxvd2x VSR(S0),0,DST
-	li 9,0x10
-	stxvd2x VSR(S1),9,DST
+	li r9,0x10
+	stxvd2x VSR(S1),r9,DST
 
 	addi SRC,SRC,0x20
 	addi DST,DST,0x20
@@ -327,17 +327,17 @@ IF_LE(`vperm S0,S0,S0,swap_mask')
 	vxor S0,S0,K
 
 	mtctr ROUNDS
-	li 10,0x10
+	li r10,0x10
 	.align 5
 L1x_round_loop:
-	lxvd2x VSR(K),10,KEYS
+	lxvd2x VSR(K),r10,KEYS
 	vperm K,K,K,swap_mask
 	vncipher S0,S0,ZERO
 	vxor S0,S0,K
-	addi 10,10,0x10
+	addi r10,r10,0x10
 	bdnz L1x_round_loop
 
-	lxvd2x VSR(K),10,KEYS
+	lxvd2x VSR(K),r10,KEYS
 	vperm K,K,K,swap_mask
 	vncipherlast S0,S0,K
 
diff --git a/powerpc64/p8/aes-encrypt-internal.asm b/powerpc64/p8/aes-encrypt-internal.asm
index 482dff25..3dd6e7b5 100644
--- a/powerpc64/p8/aes-encrypt-internal.asm
+++ b/powerpc64/p8/aes-encrypt-internal.asm
@@ -31,26 +31,26 @@ ifelse(`
 
 C Register usage:
 
-define(`SP', `1')
-define(`TOCP', `2')
-
-define(`ROUNDS', `3')
-define(`KEYS', `4')
-define(`LENGTH', `6')
-define(`DST', `7')
-define(`SRC', `8')
-
-define(`swap_mask', `0')
-
-define(`K', `1')
-define(`S0', `2')
-define(`S1', `3')
-define(`S2', `4')
-define(`S3', `5')
-define(`S4', `6')
-define(`S5', `7')
-define(`S6', `8')
-define(`S7', `9')
+define(`SP', `r1')
+define(`TOCP', `r2')
+
+define(`ROUNDS', `r3')
+define(`KEYS', `r4')
+define(`LENGTH', `r6')
+define(`DST', `r7')
+define(`SRC', `r8')
+
+define(`swap_mask', `v0')
+
+define(`K', `v1')
+define(`S0', `v2')
+define(`S1', `v3')
+define(`S2', `v4')
+define(`S3', `v5')
+define(`S4', `v6')
+define(`S5', `v7')
+define(`S6', `v8')
+define(`S7', `v9')
 
 	.file "aes-encrypt-internal.asm"
 
@@ -63,30 +63,30 @@ define(`S7', `9')
 
 define(`FUNC_ALIGN', `5')
 PROLOGUE(_nettle_aes_encrypt)
-	DATA_LOAD_VEC(swap_mask,.swap_mask,5)
+	DATA_LOAD_VEC(swap_mask,.swap_mask,r5)
 
 	subi ROUNDS,ROUNDS,1
 	srdi LENGTH,LENGTH,4
 
-	srdi 5,LENGTH,3 #8x loop count
-	cmpldi 5,0
+	srdi r5,LENGTH,3 #8x loop count
+	cmpldi r5,0
 	beq L4x
 
-	std 25,-56(SP);
-	std 26,-48(SP);
-	std 27,-40(SP);
-	std 28,-32(SP);
-	std 29,-24(SP);
-	std 30,-16(SP);
-	std 31,-8(SP);
-
-	li 25,0x10
-	li 26,0x20
-	li 27,0x30
-	li 28,0x40
-	li 29,0x50
-	li 30,0x60
-	li 31,0x70
+	std r25,-56(SP);
+	std r26,-48(SP);
+	std r27,-40(SP);
+	std r28,-32(SP);
+	std r29,-24(SP);
+	std r30,-16(SP);
+	std r31,-8(SP);
+
+	li r25,0x10
+	li r26,0x20
+	li r27,0x30
+	li r28,0x40
+	li r29,0x50
+	li r30,0x60
+	li r31,0x70
 
 	.align 5
 Lx8_loop:
@@ -94,13 +94,13 @@ Lx8_loop:
 	vperm K,K,K,swap_mask
 
 	lxvd2x VSR(S0),0,SRC
-	lxvd2x VSR(S1),25,SRC
-	lxvd2x VSR(S2),26,SRC
-	lxvd2x VSR(S3),27,SRC
-	lxvd2x VSR(S4),28,SRC
-	lxvd2x VSR(S5),29,SRC
-	lxvd2x VSR(S6),30,SRC
-	lxvd2x VSR(S7),31,SRC
+	lxvd2x VSR(S1),r25,SRC
+	lxvd2x VSR(S2),r26,SRC
+	lxvd2x VSR(S3),r27,SRC
+	lxvd2x VSR(S4),r28,SRC
+	lxvd2x VSR(S5),r29,SRC
+	lxvd2x VSR(S6),r30,SRC
+	lxvd2x VSR(S7),r31,SRC
 
 IF_LE(`vperm S0,S0,S0,swap_mask
 	vperm S1,S1,S1,swap_mask
@@ -121,10 +121,10 @@ IF_LE(`vperm S0,S0,S0,swap_mask
 	vxor S7,S7,K
 
 	mtctr ROUNDS
-	li 10,0x10
+	li r10,0x10
 	.align 5
 L8x_round_loop:
-	lxvd2x VSR(K),10,KEYS
+	lxvd2x VSR(K),r10,KEYS
 	vperm K,K,K,swap_mask
 	vcipher S0,S0,K
 	vcipher S1,S1,K
@@ -134,10 +134,10 @@ L8x_round_loop:
 	vcipher S5,S5,K
 	vcipher S6,S6,K
 	vcipher S7,S7,K
-	addi 10,10,0x10
+	addi r10,r10,0x10
 	bdnz L8x_round_loop
 
-	lxvd2x VSR(K),10,KEYS
+	lxvd2x VSR(K),r10,KEYS
 	vperm K,K,K,swap_mask
 	vcipherlast S0,S0,K
 	vcipherlast S1,S1,K
@@ -158,44 +158,44 @@ IF_LE(`vperm S0,S0,S0,swap_mask
 	vperm S7,S7,S7,swap_mask')
 
 	stxvd2x VSR(S0),0,DST
-	stxvd2x VSR(S1),25,DST
-	stxvd2x VSR(S2),26,DST
-	stxvd2x VSR(S3),27,DST
-	stxvd2x VSR(S4),28,DST
-	stxvd2x VSR(S5),29,DST
-	stxvd2x VSR(S6),30,DST
-	stxvd2x VSR(S7),31,DST
+	stxvd2x VSR(S1),r25,DST
+	stxvd2x VSR(S2),r26,DST
+	stxvd2x VSR(S3),r27,DST
+	stxvd2x VSR(S4),r28,DST
+	stxvd2x VSR(S5),r29,DST
+	stxvd2x VSR(S6),r30,DST
+	stxvd2x VSR(S7),r31,DST
 
 	addi SRC,SRC,0x80
 	addi DST,DST,0x80
-	subic. 5,5,1
+	subic. r5,r5,1
 	bne Lx8_loop
 
-	ld 25,-56(SP);
-	ld 26,-48(SP);
-	ld 27,-40(SP);
-	ld 28,-32(SP);
-	ld 29,-24(SP);
-	ld 30,-16(SP);
-	ld 31,-8(SP);
+	ld r25,-56(SP);
+	ld r26,-48(SP);
+	ld r27,-40(SP);
+	ld r28,-32(SP);
+	ld r29,-24(SP);
+	ld r30,-16(SP);
+	ld r31,-8(SP);
 
 	clrldi LENGTH,LENGTH,61
 
 L4x:
-	srdi 5,LENGTH,2
-	cmpldi 5,0
+	srdi r5,LENGTH,2
+	cmpldi r5,0
 	beq L2x
 
 	lxvd2x VSR(K),0,KEYS
 	vperm K,K,K,swap_mask
 
 	lxvd2x VSR(S0),0,SRC
-	li 9,0x10
-	lxvd2x VSR(S1),9,SRC
-	addi 9,9,0x10
-	lxvd2x VSR(S2),9,SRC
-	addi 9,9,0x10
-	lxvd2x VSR(S3),9,SRC
+	li r9,0x10
+	lxvd2x VSR(S1),r9,SRC
+	addi r9,r9,0x10
+	lxvd2x VSR(S2),r9,SRC
+	addi r9,r9,0x10
+	lxvd2x VSR(S3),r9,SRC
 
 IF_LE(`vperm S0,S0,S0,swap_mask
 	vperm S1,S1,S1,swap_mask
@@ -208,19 +208,19 @@ IF_LE(`vperm S0,S0,S0,swap_mask
 	vxor S3,S3,K
 
 	mtctr ROUNDS
-	li 10,0x10
+	li r10,0x10
 	.align 5
 L4x_round_loop:
-	lxvd2x VSR(K),10,KEYS
+	lxvd2x VSR(K),r10,KEYS
 	vperm K,K,K,swap_mask
 	vcipher S0,S0,K
 	vcipher S1,S1,K
 	vcipher S2,S2,K
 	vcipher S3,S3,K
-	addi 10,10,0x10
+	addi r10,r10,0x10
 	bdnz L4x_round_loop
 
-	lxvd2x VSR(K),10,KEYS
+	lxvd2x VSR(K),r10,KEYS
 	vperm K,K,K,swap_mask
 	vcipherlast S0,S0,K
 	vcipherlast S1,S1,K
@@ -233,12 +233,12 @@ IF_LE(`vperm S0,S0,S0,swap_mask
 	vperm S3,S3,S3,swap_mask')
 
 	stxvd2x VSR(S0),0,DST
-	li 9,0x10
-	stxvd2x VSR(S1),9,DST
-	addi 9,9,0x10
-	stxvd2x VSR(S2),9,DST
-	addi 9,9,0x10
-	stxvd2x VSR(S3),9,DST
+	li r9,0x10
+	stxvd2x VSR(S1),r9,DST
+	addi r9,r9,0x10
+	stxvd2x VSR(S2),r9,DST
+	addi r9,r9,0x10
+	stxvd2x VSR(S3),r9,DST
 
 	addi SRC,SRC,0x40
 	addi DST,DST,0x40
@@ -246,16 +246,16 @@ IF_LE(`vperm S0,S0,S0,swap_mask
 	clrldi LENGTH,LENGTH,62
 
 L2x:
-	srdi 5,LENGTH,1
-	cmpldi 5,0
+	srdi r5,LENGTH,1
+	cmpldi r5,0
 	beq L1x
 
 	lxvd2x VSR(K),0,KEYS
 	vperm K,K,K,swap_mask
 
 	lxvd2x VSR(S0),0,SRC
-	li 9,0x10
-	lxvd2x VSR(S1),9,SRC
+	li r9,0x10
+	lxvd2x VSR(S1),r9,SRC
 
 IF_LE(`vperm S0,S0,S0,swap_mask
 	vperm S1,S1,S1,swap_mask')
@@ -264,17 +264,17 @@ IF_LE(`vperm S0,S0,S0,swap_mask
 	vxor S1,S1,K
 
 	mtctr ROUNDS
-	li 10,0x10
+	li r10,0x10
 	.align 5
 L2x_round_loop:
-	lxvd2x VSR(K),10,KEYS
+	lxvd2x VSR(K),r10,KEYS
 	vperm K,K,K,swap_mask
 	vcipher S0,S0,K
 	vcipher S1,S1,K
-	addi 10,10,0x10
+	addi r10,r10,0x10
 	bdnz L2x_round_loop
 
-	lxvd2x VSR(K),10,KEYS
+	lxvd2x VSR(K),r10,KEYS
 	vperm K,K,K,swap_mask
 	vcipherlast S0,S0,K
 	vcipherlast S1,S1,K
@@ -283,8 +283,8 @@ IF_LE(`vperm S0,S0,S0,swap_mask
 	vperm S1,S1,S1,swap_mask')
 
 	stxvd2x VSR(S0),0,DST
-	li 9,0x10
-	stxvd2x VSR(S1),9,DST
+	li r9,0x10
+	stxvd2x VSR(S1),r9,DST
 
 	addi SRC,SRC,0x20
 	addi DST,DST,0x20
@@ -305,16 +305,16 @@ IF_LE(`vperm S0,S0,S0,swap_mask')
 	vxor S0,S0,K
 
 	mtctr ROUNDS
-	li 10,0x10
+	li r10,0x10
 	.align 5
 L1x_round_loop:
-	lxvd2x VSR(K),10,KEYS
+	lxvd2x VSR(K),r10,KEYS
 	vperm K,K,K,swap_mask
 	vcipher S0,S0,K
-	addi 10,10,0x10
+	addi r10,r10,0x10
 	bdnz L1x_round_loop
 
-	lxvd2x VSR(K),10,KEYS
+	lxvd2x VSR(K),r10,KEYS
 	vperm K,K,K,swap_mask
 	vcipherlast S0,S0,K
 

Maamoun TK <maamoun.tk@googlemail.com> writes:
> This patch is built upon ppc-m4-macrology.patch. Using explicit register names is working as expected now.
Thanks. This and the next patch are now merged to master-updates.
Regards,
/Niels