This is the same workaround as done in f58d1c288f6 for salsa20-crypt.
---
 x86_64/sha3-permute.asm | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)
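(Editorial note, not part of the patch.) On x86_64, a move between a general-purpose
register and an xmm register is the same machine instruction whether it is spelled
movq or movd; GNU as accepts both spellings, while the Apple assembler accepts only
the movd spelling for this form. A minimal sketch in AT&T syntax, with registers
chosen arbitrarily for illustration:

	movq	%rax, %xmm0	# accepted by GNU as, rejected by the Apple assembler
	movd	%rax, %xmm0	# same instruction, accepted by both assemblers

movq instructions that are not equivalent to a movd (those that do not move between
an xmm register and a general-purpose register) have no movd spelling and are left
unchanged by this patch.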
diff --git a/x86_64/sha3-permute.asm b/x86_64/sha3-permute.asm
index 0ebd70c..360a1f4 100644
--- a/x86_64/sha3-permute.asm
+++ b/x86_64/sha3-permute.asm
@@ -72,6 +72,10 @@ define(<STATE>, <OFFSET($1)(CTX)>)
 
 define(<SWAP64>, <pshufd	<$>0x4e,>)
 
+C movq calls that are equal to the corresponding movd,
+C where the Apple assembler requires them to be written as movd.
+define(<MOVQ>, <movd>)
+
 C ROTL64(rot, register, temp)
 C Caller needs to or together the result.
 define(<ROTL64>, <
@@ -147,12 +151,12 @@ PROLOGUE(nettle_sha3_permute)
 
 	SWAP64	C34, C34	C Holds C4, C3
 	movdqa	C12, D34
-	movq	C0, D12
+	MOVQ	C0, D12
 	punpcklqdq	C12, D12	C Holds C0, C1
 	punpckhqdq	C34, D34	C Holds C2, C3
 	punpcklqdq	D12, C34	C Holds C4, C0
-	movq	C34, D0
-	movq	C12, T0
+	MOVQ	C34, D0
+	MOVQ	C12, T0
 	rolq	$1, T0
 	xorq	T0, D0
 
@@ -236,8 +240,8 @@ PROLOGUE(nettle_sha3_permute)
 	C `-_________-^`-^
 	rolq	$36, A05
-	movq	A05, W0
-	movq	A0607, A05
+	MOVQ	A05, W0
+	MOVQ	A0607, A05
 	rolq	$44, A05	C Done A05
 	ROTL64(6, A0607, W1)
 	por	A0607, W1
@@ -260,8 +264,8 @@ PROLOGUE(nettle_sha3_permute)
 	rolq	$42, A10	C 42 + 25 = 3 (mod 64)
 	SWAP64	A1112, W0
-	movq	A10, A1112
-	movq	W0, A10
+	MOVQ	A10, A1112
+	MOVQ	W0, A10
 	rolq	$43, A10	C Done A10
 
 	punpcklqdq	A1314, A1112
@@ -285,8 +289,8 @@ PROLOGUE(nettle_sha3_permute)
 	SWAP64	A1819, W0
 	rolq	$41, A15
-	movq	A15, W1
-	movq	A1819, A15
+	MOVQ	A15, W1
+	MOVQ	A1819, A15
 	rolq	$21, A15	C Done A15
 	SWAP64	A1617, A1819
 	ROTL64(45, A1617, W2)
@@ -308,7 +312,7 @@ PROLOGUE(nettle_sha3_permute)
 	C _______/
 
 	rolq	$18, A20
-	movq	A20, W0
+	MOVQ	A20, W0
 	SWAP64	A2324, W1
 	movd	W1, A20
 	rolq	$14, A20	C Done A20
@@ -386,21 +390,21 @@ PROLOGUE(nettle_sha3_permute)
 
 	C Swap (A05, A10) <-> A0102, and (A15, A20) <-> A0304,
 	C and also copy to C12 and C34 while at it.
-	movq	A05, C12
-	movq	A15, C34
-	movq	A10, W0
-	movq	A20, W1
+	MOVQ	A05, C12
+	MOVQ	A15, C34
+	MOVQ	A10, W0
+	MOVQ	A20, W1
 	movq	A00, C0
 	punpcklqdq	W0, C12
 	punpcklqdq	W1, C34
-	movq	A0102, A05
-	movq	A0304, A15
+	MOVQ	A0102, A05
+	MOVQ	A0304, A15
 	psrldq	$8, A0102
 	psrldq	$8, A0304
 	xorq	A05, C0
 	xorq	A15, C0
-	movq	A0102, A10
-	movq	A0304, A20
+	MOVQ	A0102, A10
+	MOVQ	A0304, A20
 
 	movdqa	C12, A0102
 	movdqa	C34, A0304
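
(Editorial note, not part of the patch.) A usage sketch of the new macro: with
define(<MOVQ>, <movd>) in place, a line written as

	MOVQ	C0, D12

is expanded by m4 to

	movd	C0, D12

before the file reaches the assembler, so the Apple assembler only ever sees the
movd spelling while the generated machine code is unchanged. (C0 and D12 are
register names defined earlier in the same file and are expanded in the same m4 pass.)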