Maamoun TK maamoun.tk@googlemail.com writes:
I considered using m4 macros, but m4 "mangles" the parameter names, which makes it hard for the reader to keep track of the macro body. However, I'm still willing to change it to m4 macros if you like.
The patch below seems to work. It's a drawback that m4 doesn't have named parameters, only $1, $2, ..., but I think consistency is worth it, and I don't think the names "param1" and "param2" are that much more helpful than $2 and $3.
But it would be nice to have a bit more documentation of the macros. And for the registers, at least group them to make it clear which registers hold the input data (C0 -- C3?) or other state (e.g., accumulators), which registers are used for the precomputed key-dependent parameters, and which registers are short-lived temporaries.
In other news, I've applied for an account at https://linuxone.cloud.marist.edu, but it seems there's some manual review involved, so it's not completed yet.
Regards, /Niels
diff --git a/arm64/crypto/gcm-hash.asm b/arm64/crypto/gcm-hash.asm index b77b08d6..f86fb504 100644 --- a/arm64/crypto/gcm-hash.asm +++ b/arm64/crypto/gcm-hash.asm @@ -47,21 +47,22 @@ define(`R', `v18') define(`R1', `v19')
C common macros: -.macro PMUL in, param1, param2 - pmull F.1q,\param2().1d,\in().1d - pmull2 F1.1q,\param2().2d,\in().2d - pmull R.1q,\param1().1d,\in().1d - pmull2 R1.1q,\param1().2d,\in().2d +C PMUL(in, param1, param2) +define(`PMUL', m4_assert_numargs(3)` + pmull F.1q,$3.1d,$1.1d + pmull2 F1.1q,$3.2d,$1.2d + pmull R.1q,$2.1d,$1.1d + pmull2 R1.1q,$2.2d,$1.2d eor F.16b,F.16b,F1.16b eor R.16b,R.16b,R1.16b -.endm - -.macro REDUCTION out +') +C REDUCTION(out) +define(`REDUCTION', m4_assert_numargs(1)` pmull T.1q,F.1d,POLY.1d eor R.16b,R.16b,T.16b ext R.16b,R.16b,R.16b,#8 - eor \out().16b,F.16b,R.16b -.endm + eor $1.16b,F.16b,R.16b +')
C void gcm_init_key (union gcm_block *table)
@@ -101,13 +102,14 @@ define(`H3L', `v28') define(`H4M', `v29') define(`H4L', `v30')
-.macro PMUL_PARAM in, param1, param2 - pmull2 Hp.1q,\in().2d,POLY.2d - eor Hm.16b,\in().16b,Hp.16b - ext \param1().16b,Hm.16b,\in().16b,#8 - ext \param2().16b,\in().16b,Hm.16b,#8 - ext \param1().16b,\param1().16b,\param1().16b,#8 -.endm +C PMUL_PARAM(in, param1, param2) +define(`PMUL_PARAM', m4_assert_numargs(3)` + pmull2 Hp.1q,$1.2d,POLY.2d + eor Hm.16b,$1.16b,Hp.16b + ext $2.16b,Hm.16b,$1.16b,#8 + ext $3.16b,$1.16b,Hm.16b,#8 + ext $2.16b,$2.16b,$2.16b,#8 +')
PROLOGUE(_nettle_gcm_init_key) add x1,TABLE,#16*H_Idx @@ -138,13 +140,13 @@ IF_LE(`
C --- calculate H^2 = H*H ---
- PMUL_PARAM H,H1M,H1L + PMUL_PARAM(H,H1M,H1L)
- PMUL H,H1M,H1L + PMUL(H,H1M,H1L)
- REDUCTION H2 + REDUCTION(H2)
- PMUL_PARAM H2,H2M,H2L + PMUL_PARAM(H2,H2M,H2L)
C we store to the table as doubleword-vectors in current memory endianness C because it's our own strictly internal data structure and what gcm_hash @@ -153,19 +155,19 @@ IF_LE(`
C --- calculate H^3 = H^1*H^2 ---
- PMUL H2,H1M,H1L + PMUL(H2,H1M,H1L)
- REDUCTION H3 + REDUCTION(H3)
- PMUL_PARAM H3,H3M,H3L + PMUL_PARAM(H3,H3M,H3L)
C --- calculate H^4 = H^2*H^2 ---
- PMUL H2,H2M,H2L + PMUL(H2,H2M,H2L)
- REDUCTION H4 + REDUCTION(H4)
- PMUL_PARAM H4,H4M,H4L + PMUL_PARAM(H4,H4M,H4L)
st1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[TABLE]
@@ -197,16 +199,17 @@ define(`H3L', `v29') define(`H4M', `v30') define(`H4L', `v31')
-.macro PMUL_SUM in, param1, param2 - pmull F2.1q,\param2().1d,\in().1d - pmull2 F3.1q,\param2().2d,\in().2d - pmull R2.1q,\param1().1d,\in().1d - pmull2 R3.1q,\param1().2d,\in().2d +C PMUL_SUM(in, param1, param2) +define(`PMUL_SUM', m4_assert_numargs(3)` + pmull F2.1q,$3.1d,$1.1d + pmull2 F3.1q,$3.2d,$1.2d + pmull R2.1q,$2.1d,$1.1d + pmull2 R3.1q,$2.2d,$1.2d eor F2.16b,F2.16b,F3.16b eor R2.16b,R2.16b,R3.16b eor F.16b,F.16b,F2.16b eor R.16b,R.16b,R2.16b -.endm +')
C void gcm_hash (const struct gcm_key *key, union gcm_block *x, C size_t length, const uint8_t *data) @@ -238,12 +241,12 @@ IF_LE(`
eor C0.16b,C0.16b,D.16b
- PMUL C1,H3M,H3L - PMUL_SUM C2,H2M,H2L - PMUL_SUM C3,H1M,H1L - PMUL_SUM C0,H4M,H4L + PMUL(C1,H3M,H3L) + PMUL_SUM(C2,H2M,H2L) + PMUL_SUM(C3,H1M,H1L) + PMUL_SUM(C0,H4M,H4L)
- REDUCTION D + REDUCTION(D)
subs x4,x4,#64 b.ne L4x_loop @@ -264,10 +267,10 @@ IF_LE(`
eor C0.16b,C0.16b,D.16b
- PMUL C1,H1M,H1L - PMUL_SUM C0,H2M,H2L + PMUL(C1,H1M,H1L) + PMUL_SUM(C0,H2M,H2L)
- REDUCTION D + REDUCTION(D)
and LENGTH,LENGTH,#31
@@ -284,9 +287,9 @@ IF_LE(`
eor C0.16b,C0.16b,D.16b
- PMUL C0,H1M,H1L + PMUL(C0,H1M,H1L)
- REDUCTION D + REDUCTION(D)
Lmod: tst LENGTH,#15 @@ -325,9 +328,9 @@ Lmod_8_load: Lmod_8_done: eor C0.16b,C0.16b,D.16b
- PMUL C0,H1M,H1L + PMUL(C0,H1M,H1L)
- REDUCTION D + REDUCTION(D)
Ldone: IF_LE(`