The stack is not guaranteed to be 16-byte aligned on win64, and movdqa
faults on a misaligned memory operand. So save and restore the xmm
registers with movdqu, which accepts any alignment.
---
 x86_64/machine.m4 | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)
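(Background, not part of the patch: movdqa requires its memory operand
to be 16-byte aligned and raises #GP on a misaligned address, while
movdqu works at any alignment. A minimal sketch of the difference,
using the frame size the macro computes for $2 = 7:)

	sub	$24, %rsp		C 8 + 16*(7 - 6) bytes of frame
	movdqa	%xmm6, 0(%rsp)		C faults unless %rsp % 16 == 0
	movdqu	%xmm6, 0(%rsp)		C safe for any value of %rsp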
diff --git a/x86_64/machine.m4 b/x86_64/machine.m4
index dc23dde..d5d5b37 100644
--- a/x86_64/machine.m4
+++ b/x86_64/machine.m4
@@ -71,34 +71,34 @@ define(<W64_ENTRY>, <
   ifelse(W64_ABI,yes,[
     ifelse(eval($2 > 6), 1, [
       sub	[$]eval(8 + 16*($2 - 6)), %rsp
-      movdqa	%xmm6, 0(%rsp)
+      movdqu	%xmm6, 0(%rsp)
     ])
     ifelse(eval($2 > 7), 1, [
-      movdqa	%xmm7, 16(%rsp)
+      movdqu	%xmm7, 16(%rsp)
     ])
     ifelse(eval($2 > 8), 1, [
-      movdqa	%xmm8, 32(%rsp)
+      movdqu	%xmm8, 32(%rsp)
     ])
     ifelse(eval($2 > 9), 1, [
-      movdqa	%xmm9, 48(%rsp)
+      movdqu	%xmm9, 48(%rsp)
     ])
     ifelse(eval($2 > 10), 1, [
-      movdqa	%xmm10, 64(%rsp)
+      movdqu	%xmm10, 64(%rsp)
     ])
     ifelse(eval($2 > 11), 1, [
-      movdqa	%xmm11, 80(%rsp)
+      movdqu	%xmm11, 80(%rsp)
     ])
     ifelse(eval($2 > 12), 1, [
-      movdqa	%xmm12, 96(%rsp)
+      movdqu	%xmm12, 96(%rsp)
     ])
     ifelse(eval($2 > 13), 1, [
-      movdqa	%xmm13, 112(%rsp)
+      movdqu	%xmm13, 112(%rsp)
     ])
     ifelse(eval($2 > 14), 1, [
-      movdqa	%xmm14, 128(%rsp)
+      movdqu	%xmm14, 128(%rsp)
     ])
     ifelse(eval($2 > 15), 1, [
-      movdqa	%xmm15, 144(%rsp)
+      movdqu	%xmm15, 144(%rsp)
     ])
     ifelse(eval($1 >= 1), 1, [
       push	%rdi
@@ -133,34 +133,34 @@ define(<W64_EXIT>, <
       pop	%rdi
     ])
     ifelse(eval($2 > 15), 1, [
-      movdqa	144(%rsp), %xmm15
+      movdqu	144(%rsp), %xmm15
     ])
     ifelse(eval($2 > 14), 1, [
-      movdqa	128(%rsp), %xmm14
+      movdqu	128(%rsp), %xmm14
     ])
     ifelse(eval($2 > 13), 1, [
-      movdqa	112(%rsp), %xmm13
+      movdqu	112(%rsp), %xmm13
     ])
     ifelse(eval($2 > 12), 1, [
-      movdqa	96(%rsp), %xmm12
+      movdqu	96(%rsp), %xmm12
     ])
     ifelse(eval($2 > 11), 1, [
-      movdqa	80(%rsp), %xmm11
+      movdqu	80(%rsp), %xmm11
     ])
     ifelse(eval($2 > 10), 1, [
-      movdqa	64(%rsp), %xmm10
+      movdqu	64(%rsp), %xmm10
     ])
     ifelse(eval($2 > 9), 1, [
-      movdqa	48(%rsp), %xmm9
+      movdqu	48(%rsp), %xmm9
     ])
     ifelse(eval($2 > 8), 1, [
-      movdqa	32(%rsp), %xmm8
+      movdqu	32(%rsp), %xmm8
     ])
     ifelse(eval($2 > 7), 1, [
-      movdqa	16(%rsp), %xmm7
+      movdqu	16(%rsp), %xmm7
     ])
     ifelse(eval($2 > 6), 1, [
-      movdqa	0(%rsp), %xmm6
+      movdqu	0(%rsp), %xmm6
       add	[$]eval(8 + 16*($2 - 6)), %rsp
     ])
   ])
The Lpartial subfunction is entered with plain call instructions, so it
must return with a plain ret; the win64 epilogue should run only when
actually exiting the whole salsa20_crypt function.
---
 x86_64/salsa20-crypt.asm | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
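(Note, not part of the patch: W64_ENTRY and W64_EXIT expand to nothing
unless W64_ABI is yes, as the machine.m4 hunks above show, so the old
code happened to work on ELF, where the ret at .Lend simply used the
return address pushed by the call. On win64 the epilogue is not empty,
and that extra return address leaves %rsp 8 bytes below where W64_ENTRY
left it, so the epilogue's pops and xmm restores operate on the wrong
stack words and the final ret pops a bogus address. A sketch of the
broken flow, with illustrative W64_EXIT operands:)

	call	.Lpartial	C pushes a return address
	...
.Lpartial:
	...
	jmp	.Lend		C wrong: epilogue runs with the call's
				C return address still on the stack
.Lend:
	W64_EXIT(4, 9)		C restores from misplaced stack slots
	ret			C pops garbage instead of returning
				C to salsa20_crypt's caller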
diff --git a/x86_64/salsa20-crypt.asm b/x86_64/salsa20-crypt.asm
index 25b7e49..9d1b53d 100644
--- a/x86_64/salsa20-crypt.asm
+++ b/x86_64/salsa20-crypt.asm
@@ -224,10 +224,11 @@ PROLOGUE(nettle_salsa20_crypt)
 	shr	$16, XREG(T64)
 .Llt2:
 	test	$1, LENGTH
-	jz	.Lend
+	jz	.Lret
 	xor	(SRC, POS), LREG(T64)
 	mov	LREG(T64), (DST, POS)
-	jmp	.Lend
+.Lret:
+	ret
 
 EPILOGUE(nettle_salsa20_crypt)
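(For reference, the tail of the partial-block handler as it reads with
the hunk above applied:)

.Llt2:
	test	$1, LENGTH
	jz	.Lret
	xor	(SRC, POS), LREG(T64)
	mov	LREG(T64), (DST, POS)
.Lret:
	ret

The subroutine now pairs every call with a plain ret, and the win64
epilogue runs exactly once, at the function's real exit.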