I've spent an evening taking a closer look at Intel AES performance, after Torbjörn pointed out to me that the Intel AESNI instructions are highly pipelined on most or all processors supporting them. That means the processor can start execution of one or even two instructions per cycle, provided that they are independent, while it takes several cycles until the results become available to dependent instructions.
Out-of-order (OoO) execution can help run things in parallel, even if the instruction stream is locally a sequence of dependent instructions.
E.g., my main development machine has a Broadwell processor, which can issue one AESNI instruction per cycle, and I think the latency is 7 cycles. Encrypting one block needs 10 AESNI instructions (one per round; there's also an eleventh subkey applied with a plain xor, which can issue in parallel with another instruction in the same cycle).
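To make that concrete, here is a minimal C sketch of the single-block data path, using the AESNI intrinsics from <immintrin.h>. The rk array of 11 subkeys is a hypothetical stand-in for Nettle's key schedule, not actual Nettle API. Each aesenc depends on the previous one, so a lone block pays the full latency chain:

#include <immintrin.h>	/* AESNI intrinsics; compile with -maes */

/* Hypothetical layout, not Nettle's API: rk[0..10] hold the 11
   AES-128 subkeys. Each instruction depends on the previous one,
   so a single block costs the full 10-instruction latency chain. */
static __m128i
aes128_encrypt_block(const __m128i rk[11], __m128i b)
{
  int i;
  b = _mm_xor_si128(b, rk[0]);		/* initial xor round */
  for (i = 1; i < 10; i++)
    b = _mm_aesenc_si128(b, rk[i]);	/* 9 regular rounds */
  return _mm_aesenclast_si128(b, rk[10]); /* final round */
}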
When I benchmark, aes128 ECB runs at 10.2 cycles per block, which means that out-of-order execution is very successful, executing many iterations of the loop in parallel. CBC encrypt, which is inherently serial, runs *much* slower: I get 91 cycles per block, where the latency of just the AES encryption would be 71 cycles (10 AESNI instructions at 7 cycles each, if that latency figure is correct, plus one more cycle for the xor subkey: 10*7 + 1 = 71). That leaves the remaining 20 cycles for the CBC processing, which seems a bit slow in itself.
I'm considering rearranging the loops to interleave multiple blocks. See the code below, which implements 4-way interleaving (a drop-in replacement for x86_64/aesni/aes-encrypt-internal). If this approach turns out to be useful, it might be beneficial to extend it to 8-way interleaving: that would allow instructions to run nicely in parallel even without any out-of-order execution.
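For comparison with the assembly, a C intrinsics sketch of the same 4-way idea might look as follows (same hypothetical rk layout as above). Within each round the four aesenc are independent of each other, which is what lets them issue back to back:

/* Same hypothetical rk layout: process four blocks with each subkey
   load shared. The four aesenc within a round are independent, so
   the pipeline can work on all of them at once. */
static void
aes128_encrypt4(const __m128i rk[11], __m128i b[4])
{
  int i, j;
  for (j = 0; j < 4; j++)
    b[j] = _mm_xor_si128(b[j], rk[0]);
  for (i = 1; i < 10; i++)
    for (j = 0; j < 4; j++)
      b[j] = _mm_aesenc_si128(b[j], rk[i]);
  for (j = 0; j < 4; j++)
    b[j] = _mm_aesenclast_si128(b[j], rk[10]);
}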
However, the interleaved code makes no measurable difference to performance in my benchmarks on my machine. That's not too surprising: the old code already runs at close to 10 cycles per block, which is a hard limit set by the issue rate of the AESNI instructions, and it seems almost all other instructions already execute in parallel with them.
But it might be an improvement on other processors, or for applications that process a small number of blocks at a time (somewhere between CBC, which always does one block at a time, and the ECB benchmark, which does 10 KB, or 640 blocks, at a time), by making things easier for the processor's OoO machinery.
If you have any application benchmarks, it would be interesting if you could try out the interleaved version. I'm also thinking that maybe we should add benchmarks for various message sizes?
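As a starting point, a rough sketch of such a benchmark could look like this, timing a few message sizes with __rdtsc and the hypothetical aes128_encrypt4 helper sketched above. A real version should of course go through Nettle's public API and its existing benchmark infrastructure, and calibrate more carefully:

#include <stdio.h>
#include <x86intrin.h>	/* __rdtsc, __m128i */

#define ITERATIONS 10000

int
main(void)
{
  /* All-zero key schedule and plaintext; fine for pure timing. */
  static __m128i rk[11];
  static __m128i buf[640];
  /* Message sizes in blocks; multiples of 4 to keep the sketch simple. */
  static const unsigned sizes[] = { 4, 8, 16, 64, 640 };
  unsigned s, i, it;

  for (s = 0; s < sizeof(sizes) / sizeof(sizes[0]); s++)
    {
      unsigned blocks = sizes[s];
      unsigned long long start = __rdtsc();
      for (it = 0; it < ITERATIONS; it++)
	for (i = 0; i < blocks; i += 4)
	  aes128_encrypt4(rk, buf + i);
      printf("%4u blocks: %.2f cycles/block\n", blocks,
	     (double)(__rdtsc() - start) / ITERATIONS / blocks);
    }
  return 0;
}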
Regards, /Niels
C x86_64/aesni/aes-encrypt-internal.asm
ifelse(<
   Copyright (C) 2015, 2018 Niels Möller

   This file is part of GNU Nettle.

   GNU Nettle is free software: you can redistribute it and/or
   modify it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free
       Software Foundation; either version 2 of the License, or (at your
       option) any later version.

   or both in parallel, as here.

   GNU Nettle is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received copies of the GNU General Public License and
   the GNU Lesser General Public License along with this program.  If
   not, see http://www.gnu.org/licenses/.
>)
C Input argument
define(<ROUNDS>, <%rdi>)
define(<KEYS>,	<%rsi>)
C define(<TABLE>, <%rdx>) C Unused here
define(<LENGTH>,<%rcx>)
define(<DST>,	<%r8>)
define(<SRC>,	<%r9>)

define(<CNT>, <%r10>)
define(<TMP>, <%r10>)	C Can overlap CNT
define(<TAB>, <%rdx>)
define(<SUBKEY>, <%xmm0>)
define(<B0>, <%xmm1>)
define(<B1>, <%xmm2>)
define(<B2>, <%xmm3>)
define(<B3>, <%xmm4>)
.file "aes-encrypt-internal.asm"
C _aes_encrypt(unsigned rounds, const uint32_t *keys,
C	       const struct aes_table *T,
C	       size_t length, uint8_t *dst,
C	       uint8_t *src)
	.text
	ALIGN(16)
PROLOGUE(_nettle_aes_encrypt)
	W64_ENTRY(6, 5)
	shr	$4, LENGTH	C Byte count to block count
	test	LENGTH, LENGTH
	jz	.Lend
	C Each round uses 16 bytes of subkeys. We have an initial
	C xor round, rounds - 1 regular rounds, and one final round.
	C Adjust so that ROUNDS reflects the number of regular
	C rounds, KEYS points to the key for the final round, and
	C KEYS + ROUNDS points to the key for the first regular
	C round.
	dec	XREG(ROUNDS)
	shl	$4, XREG(ROUNDS)
	lea	16(ROUNDS, KEYS), KEYS
	neg	ROUNDS
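	C E.g., for AES-128, rounds = 10: ROUNDS ends up as -144,
	C KEYS points at the final (eleventh) subkey, and
	C KEYS + ROUNDS at the subkey for the first regular round.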
	jmp	.Loop_end
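	C Main loop: four blocks per iteration. Each subkey is
	C loaded once and shared; the four aesenc per round are
	C independent, so they can issue on consecutive cycles and
	C overlap in the pipeline.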
.Lloop_4w:
	mov	ROUNDS, CNT
	movups	-16(KEYS, ROUNDS), SUBKEY	C Subkey for initial xor round
	movups	(SRC), B0
	movups	16(SRC), B1
	movups	32(SRC), B2
	movups	48(SRC), B3
	pxor	SUBKEY, B0
	pxor	SUBKEY, B1
	pxor	SUBKEY, B2
	pxor	SUBKEY, B3
.Lround_loop_4w:
	movups	(KEYS, CNT), SUBKEY
	add	$16, CNT	C Sets ZF when CNT reaches zero
	aesenc	SUBKEY, B0
	aesenc	SUBKEY, B1
	aesenc	SUBKEY, B2
	aesenc	SUBKEY, B3
	jne	.Lround_loop_4w
	movups	(KEYS), SUBKEY
	aesenclast	SUBKEY, B0
	aesenclast	SUBKEY, B1
	aesenclast	SUBKEY, B2
	aesenclast	SUBKEY, B3

	movups	B0, (DST)
	movups	B1, 16(DST)
	movups	B2, 32(DST)
	movups	B3, 48(DST)

	add	$64, SRC
	add	$64, DST
	sub	$4, LENGTH
.Loop_end:
	cmp	$4, LENGTH
	jnc	.Lloop_4w
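	C Here 0--3 blocks remain; dispatch to the matching tail
	C code via .Ljmptab, a table of 32-bit offsets relative to
	C the table itself (position-independent).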
	lea	.Ljmptab(%rip), TAB
	movslq	(TAB, LENGTH, 4), TMP
	lea	(TAB, TMP), TMP
	jmp	*TMP
.Ltail3:
	mov	ROUNDS, CNT
	movups	-16(KEYS, ROUNDS), SUBKEY
	movups	(SRC), B0
	movups	16(SRC), B1
	movups	32(SRC), B2
	pxor	SUBKEY, B0
	pxor	SUBKEY, B1
	pxor	SUBKEY, B2

.Lround_loop_3w:
	movups	(KEYS, CNT), SUBKEY
	add	$16, CNT
	aesenc	SUBKEY, B0
	aesenc	SUBKEY, B1
	aesenc	SUBKEY, B2
	jne	.Lround_loop_3w

	movups	(KEYS), SUBKEY
	aesenclast	SUBKEY, B0
	aesenclast	SUBKEY, B1
	aesenclast	SUBKEY, B2

	movups	B0, (DST)
	movups	B1, 16(DST)
	movups	B2, 32(DST)
	jmp	.Lend
.Ltail2:
	mov	ROUNDS, CNT
	movups	-16(KEYS, ROUNDS), SUBKEY
	movups	(SRC), B0
	movups	16(SRC), B1
	pxor	SUBKEY, B0
	pxor	SUBKEY, B1

.Lround_loop_2w:
	movups	(KEYS, CNT), SUBKEY
	add	$16, CNT
	aesenc	SUBKEY, B0
	aesenc	SUBKEY, B1
	jne	.Lround_loop_2w

	movups	(KEYS), SUBKEY
	aesenclast	SUBKEY, B0
	aesenclast	SUBKEY, B1

	movups	B0, (DST)
	movups	B1, 16(DST)
	jmp	.Lend
.Ltail1:
	mov	ROUNDS, CNT
	movups	-16(KEYS, ROUNDS), SUBKEY
	movups	(SRC), B0
	pxor	SUBKEY, B0

.Lround_loop_1w:
	movups	(KEYS, CNT), SUBKEY
	add	$16, CNT
	aesenc	SUBKEY, B0
	jne	.Lround_loop_1w

	movups	(KEYS), SUBKEY
	aesenclast	SUBKEY, B0

	movups	B0, (DST)
.Lend:
	W64_EXIT(6, 5)
	ret
EPILOGUE(_nettle_aes_encrypt)
	C FIXME: Put in rodata?
	ALIGN(4)
.Ljmptab:
	.long	.Lend - .Ljmptab
	.long	.Ltail1 - .Ljmptab
	.long	.Ltail2 - .Ljmptab
	.long	.Ltail3 - .Ljmptab