I measured the latency and throughput of the vcipher/vncipher/vxor instructions for POWER8:

vcipher/vncipher: throughput 6 instructions per cycle, latency 0.91 clock cycles
vxor: throughput 6 instructions per cycle, latency 0.32 clock cycles

So the ideal option for POWER8 is processing 8 blocks; it has +12% performance over processing 4 blocks.
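To illustrate why the 8-block path helps, here is a rough C sketch of the same interleaving idea using GCC's POWER8 crypto built-ins (the function name and the fixed 8-block shape are just for illustration; the real implementation is the assembly below):

/* Rough sketch (not from the patch) of 8-way interleaving with GCC's
   POWER8 crypto built-ins.  Needs gcc -mcpu=power8; the function name
   and argument layout are invented, and the endianness/key-schedule
   handling that the assembly does with lxvd2x/vperm is omitted. */
#include <altivec.h>

typedef vector unsigned long long block_t;

static void
encrypt8(const block_t *round_keys, unsigned rounds, block_t s[8])
{
  unsigned i, b;

  /* Initial AddRoundKey for all eight blocks. */
  for (b = 0; b < 8; b++)
    s[b] = vec_xor(s[b], round_keys[0]);

  /* Middle rounds: eight independent dependency chains, so several
     vcipher instructions can be in flight at once. */
  for (i = 1; i < rounds; i++)
    for (b = 0; b < 8; b++)
      s[b] = __builtin_crypto_vcipher(s[b], round_keys[i]);

  /* Final round. */
  for (b = 0; b < 8; b++)
    s[b] = __builtin_crypto_vcipherlast(s[b], round_keys[rounds]);
}

With eight independent chains in flight, the core can overlap the latency of each vcipher instead of waiting for one block's round to finish before starting the next.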
---
 powerpc64/P8/aes-decrypt-internal.asm | 367 ++++++++++++++++++++++++++++++++++
 powerpc64/P8/aes-encrypt-internal.asm | 344 +++++++++++++++++++++++++++++++
 2 files changed, 711 insertions(+)
 create mode 100644 powerpc64/P8/aes-decrypt-internal.asm
 create mode 100644 powerpc64/P8/aes-encrypt-internal.asm
diff --git a/powerpc64/P8/aes-decrypt-internal.asm b/powerpc64/P8/aes-decrypt-internal.asm new file mode 100644 index 00000000..f5d64548 --- /dev/null +++ b/powerpc64/P8/aes-decrypt-internal.asm @@ -0,0 +1,367 @@ +C powerpc64/P8/aes-decrypt-internal.asm + +ifelse(< + Copyright (C) 2020 Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +>) + +C Register usage: + +define(<SP>, <1>) +define(<TOCP>, <2>) + +define(<ROUNDS>, <3>) +define(<KEYS>, <4>) +define(<LENGTH>, <6>) +define(<DST>, <7>) +define(<SRC>, <8>) + +define(<swap_mask>, <0>) + +define(<K>, <1>) +define(<S0>, <2>) +define(<S1>, <3>) +define(<S2>, <4>) +define(<S3>, <5>) +define(<S4>, <6>) +define(<S5>, <7>) +define(<S6>, <8>) +define(<S7>, <9>) + +define(<KX>, <33>) +define(<S0X>, <34>) +define(<S1X>, <35>) +define(<S2X>, <36>) +define(<S3X>, <37>) +define(<S4X>, <38>) +define(<S5X>, <39>) +define(<S6X>, <40>) +define(<S7X>, <41>) + +C ZERO vector register is used in place of RoundKey +C for vncipher instruction because the order of InvMixColumns +C and Xor processes are flipped in that instruction. +C The Xor process with RoundKey is executed afterward. 
+define(<ZERO>, <10>) + +.file "aes-decrypt-internal.asm" + +IF_LE(<.abiversion 2>) +.text + + C _aes_decrypt(unsigned rounds, const uint32_t *keys, + C const struct aes_table *T, + C size_t length, uint8_t *dst, + C uint8_t *src) + +define(<FUNC_ALIGN>, <5>) +PROLOGUE(_nettle_aes_decrypt) + vxor ZERO,ZERO,ZERO + + DATA_LOAD_VEC(swap_mask,.swap_mask,5) + + subi ROUNDS,ROUNDS,1 + srdi LENGTH,LENGTH,4 + + srdi 5,LENGTH,3 #8x loop count + cmpldi 5,0 + beq L4x + + std 25,-56(SP); + std 26,-48(SP); + std 27,-40(SP); + std 28,-32(SP); + std 29,-24(SP); + std 30,-16(SP); + std 31,-8(SP); + + li 25,0x10 + li 26,0x20 + li 27,0x30 + li 28,0x40 + li 29,0x50 + li 30,0x60 + li 31,0x70 + +.align 5 +Lx8_loop: + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + lxvd2x S1X,25,SRC + lxvd2x S2X,26,SRC + lxvd2x S3X,27,SRC + lxvd2x S4X,28,SRC + lxvd2x S5X,29,SRC + lxvd2x S6X,30,SRC + lxvd2x S7X,31,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L8x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vncipher S1,S1,ZERO + vncipher S2,S2,ZERO + vncipher S3,S3,ZERO + vncipher S4,S4,ZERO + vncipher S5,S5,ZERO + vncipher S6,S6,ZERO + vncipher S7,S7,ZERO + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + addi 10,10,0x10 + bdnz L8x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + vncipherlast S1,S1,K + vncipherlast S2,S2,K + vncipherlast S3,S3,K + vncipherlast S4,S4,K + vncipherlast S5,S5,K + vncipherlast S6,S6,K + vncipherlast S7,S7,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask>) + + stxvd2x S0X,0,DST + stxvd2x S1X,25,DST + stxvd2x S2X,26,DST + stxvd2x S3X,27,DST + stxvd2x S4X,28,DST + stxvd2x S5X,29,DST + stxvd2x S6X,30,DST + stxvd2x S7X,31,DST + + addi SRC,SRC,0x80 + addi DST,DST,0x80 + subic. 
5,5,1 + bne Lx8_loop + + ld 25,-56(SP); + ld 26,-48(SP); + ld 27,-40(SP); + ld 28,-32(SP); + ld 29,-24(SP); + ld 30,-16(SP); + ld 31,-8(SP); + + clrldi LENGTH,LENGTH,61 + +L4x: + srdi 5,LENGTH,2 + cmpldi 5,0 + beq L2x + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + addi 9,9,0x10 + lxvd2x S2X,9,SRC + addi 9,9,0x10 + lxvd2x S3X,9,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L4x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vncipher S1,S1,ZERO + vncipher S2,S2,ZERO + vncipher S3,S3,ZERO + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + addi 10,10,0x10 + bdnz L4x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + vncipherlast S1,S1,K + vncipherlast S2,S2,K + vncipherlast S3,S3,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask>) + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + addi 9,9,0x10 + stxvd2x S2X,9,DST + addi 9,9,0x10 + stxvd2x S3X,9,DST + + addi SRC,SRC,0x40 + addi DST,DST,0x40 + + clrldi LENGTH,LENGTH,62 + +L2x: + srdi 5,LENGTH,1 + cmpldi 5,0 + beq L1x + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L2x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vncipher S1,S1,ZERO + vxor S0,S0,K + vxor S1,S1,K + addi 10,10,0x10 + bdnz L2x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + vncipherlast S1,S1,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask>) + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + + addi SRC,SRC,0x20 + addi DST,DST,0x20 + + clrldi LENGTH,LENGTH,63 + +L1x: + cmpldi LENGTH,0 + beq Ldone + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask>) + + vxor S0,S0,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L1x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipher S0,S0,ZERO + vxor S0,S0,K + addi 10,10,0x10 + bdnz L1x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vncipherlast S0,S0,K + +IF_LE(<vperm S0,S0,S0,swap_mask>) + + stxvd2x S0X,0,DST + +Ldone: + blr +EPILOGUE(_nettle_aes_decrypt) + + .data + .align 4 +.swap_mask: +IF_LE(<.byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7>) +IF_BE(<.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12>) diff --git a/powerpc64/P8/aes-encrypt-internal.asm b/powerpc64/P8/aes-encrypt-internal.asm new file mode 100644 index 00000000..3e0fa6f0 --- /dev/null +++ b/powerpc64/P8/aes-encrypt-internal.asm @@ -0,0 +1,344 @@ +C powerpc64/P8/aes-encrypt-internal.asm + +ifelse(< + Copyright (C) 2020 Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. 
+ + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +>) + +C Register usage: + +define(<SP>, <1>) +define(<TOCP>, <2>) + +define(<ROUNDS>, <3>) +define(<KEYS>, <4>) +define(<LENGTH>, <6>) +define(<DST>, <7>) +define(<SRC>, <8>) + +define(<swap_mask>, <0>) + +define(<K>, <1>) +define(<S0>, <2>) +define(<S1>, <3>) +define(<S2>, <4>) +define(<S3>, <5>) +define(<S4>, <6>) +define(<S5>, <7>) +define(<S6>, <8>) +define(<S7>, <9>) + +define(<KX>, <33>) +define(<S0X>, <34>) +define(<S1X>, <35>) +define(<S2X>, <36>) +define(<S3X>, <37>) +define(<S4X>, <38>) +define(<S5X>, <39>) +define(<S6X>, <40>) +define(<S7X>, <41>) + +.file "aes-encrypt-internal.asm" + +IF_LE(<.abiversion 2>) +.text + + C _aes_encrypt(unsigned rounds, const uint32_t *keys, + C const struct aes_table *T, + C size_t length, uint8_t *dst, + C uint8_t *src) + +define(<FUNC_ALIGN>, <5>) +PROLOGUE(_nettle_aes_encrypt) + DATA_LOAD_VEC(swap_mask,.swap_mask,5) + + subi ROUNDS,ROUNDS,1 + srdi LENGTH,LENGTH,4 + + srdi 5,LENGTH,3 #8x loop count + cmpldi 5,0 + beq L4x + + std 25,-56(SP); + std 26,-48(SP); + std 27,-40(SP); + std 28,-32(SP); + std 29,-24(SP); + std 30,-16(SP); + std 31,-8(SP); + + li 25,0x10 + li 26,0x20 + li 27,0x30 + li 28,0x40 + li 29,0x50 + li 30,0x60 + li 31,0x70 + +.align 5 +Lx8_loop: + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + lxvd2x S1X,25,SRC + lxvd2x S2X,26,SRC + lxvd2x S3X,27,SRC + lxvd2x S4X,28,SRC + lxvd2x S5X,29,SRC + lxvd2x S6X,30,SRC + lxvd2x S7X,31,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + vxor S4,S4,K + vxor S5,S5,K + vxor S6,S6,K + vxor S7,S7,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L8x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + vcipher S1,S1,K + vcipher S2,S2,K + vcipher S3,S3,K + vcipher S4,S4,K + vcipher S5,S5,K + vcipher S6,S6,K + vcipher S7,S7,K + addi 10,10,0x10 + bdnz L8x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + vcipherlast S1,S1,K + vcipherlast S2,S2,K + vcipherlast S3,S3,K + vcipherlast S4,S4,K + vcipherlast S5,S5,K + vcipherlast S6,S6,K + vcipherlast S7,S7,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask + vperm S4,S4,S4,swap_mask + vperm S5,S5,S5,swap_mask + vperm S6,S6,S6,swap_mask + vperm S7,S7,S7,swap_mask>) + + stxvd2x S0X,0,DST + stxvd2x S1X,25,DST + stxvd2x S2X,26,DST + stxvd2x S3X,27,DST + stxvd2x S4X,28,DST + stxvd2x S5X,29,DST + stxvd2x S6X,30,DST + stxvd2x S7X,31,DST + + addi SRC,SRC,0x80 + addi DST,DST,0x80 + subic. 
5,5,1 + bne Lx8_loop + + ld 25,-56(SP); + ld 26,-48(SP); + ld 27,-40(SP); + ld 28,-32(SP); + ld 29,-24(SP); + ld 30,-16(SP); + ld 31,-8(SP); + + clrldi LENGTH,LENGTH,61 + +L4x: + srdi 5,LENGTH,2 + cmpldi 5,0 + beq L2x + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + addi 9,9,0x10 + lxvd2x S2X,9,SRC + addi 9,9,0x10 + lxvd2x S3X,9,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + vxor S2,S2,K + vxor S3,S3,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L4x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + vcipher S1,S1,K + vcipher S2,S2,K + vcipher S3,S3,K + addi 10,10,0x10 + bdnz L4x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + vcipherlast S1,S1,K + vcipherlast S2,S2,K + vcipherlast S3,S3,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask + vperm S2,S2,S2,swap_mask + vperm S3,S3,S3,swap_mask>) + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + addi 9,9,0x10 + stxvd2x S2X,9,DST + addi 9,9,0x10 + stxvd2x S3X,9,DST + + addi SRC,SRC,0x40 + addi DST,DST,0x40 + + clrldi LENGTH,LENGTH,62 + +L2x: + srdi 5,LENGTH,1 + cmpldi 5,0 + beq L1x + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + li 9,0x10 + lxvd2x S1X,9,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask>) + + vxor S0,S0,K + vxor S1,S1,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L2x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + vcipher S1,S1,K + addi 10,10,0x10 + bdnz L2x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + vcipherlast S1,S1,K + +IF_LE(<vperm S0,S0,S0,swap_mask + vperm S1,S1,S1,swap_mask>) + + stxvd2x S0X,0,DST + li 9,0x10 + stxvd2x S1X,9,DST + + addi SRC,SRC,0x20 + addi DST,DST,0x20 + + clrldi LENGTH,LENGTH,63 + +L1x: + cmpldi LENGTH,0 + beq Ldone + + lxvd2x KX,0,KEYS + vperm K,K,K,swap_mask + + lxvd2x S0X,0,SRC + +IF_LE(<vperm S0,S0,S0,swap_mask>) + + vxor S0,S0,K + + mtctr ROUNDS + li 10,0x10 +.align 5 +L1x_round_loop: + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipher S0,S0,K + addi 10,10,0x10 + bdnz L1x_round_loop + + lxvd2x KX,10,KEYS + vperm K,K,K,swap_mask + vcipherlast S0,S0,K + +IF_LE(<vperm S0,S0,S0,swap_mask>) + + stxvd2x S0X,0,DST + +Ldone: + blr +EPILOGUE(_nettle_aes_encrypt) + + .data + .align 4 +.swap_mask: +IF_LE(<.byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7>) +IF_BE(<.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12>)
Maamoun TK maamoun.tk@googlemail.com writes:
I measured the latency and throughput of the vcipher/vncipher/vxor instructions for POWER8:

vcipher/vncipher: throughput 6 instructions per cycle, latency 0.91 clock cycles
vxor: throughput 6 instructions per cycle, latency 0.32 clock cycles
Latency less than one cycle sounds wrong. Usually, simple ALU instructions like xor have a latency of exactly one cycle (i.e., when an instruction starts executing, with all inputs available, the result is available to dependent instructions exactly one cycle later), while deeply pipelined instructions, e.g., multiplication, can have a latency of several cycles but still a throughput of one or a few instructions per cycle.
See https://gmplib.org/~tege/x86-timing.pdf for background and lots of numbers for x86 processors.
So the ideal option for POWER8 is processing 8 blocks; it has +12% performance over processing 4 blocks.
Sounds reasonable to me.
powerpc64/P8/aes-decrypt-internal.asm | 367
I take it "P8" in the path is for power 8? Are the crypto extensions always available for power 8? If not, directory should be named differently.
To get going, I've merged this and the machine.m4 patch to a development branch. I'd like to do things stepwise, first do the minimal configure changes to get AES working (and maybe with default on, to get it exercised by the .gitlab-ci machinery), then add ghash and fat builds (not sure in which order). I wanted to also merge the README patch right away, but that failed due to line breaks in the email.
BTW, about fat tests, I'm considering adding a make target "check-fat" which will run make check with some different settings of NETTLE_FAT_OVERRIDE (platform specific, and determined by configure).
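For illustration, the library side of such an override could look roughly like this (a sketch only; the struct and function names are invented and this is not the actual fat-setup code):

/* Sketch: let NETTLE_FAT_OVERRIDE force a particular code path so
   "make check-fat" can exercise each implementation regardless of the
   hardware probe.  Names (ppc_features, get_ppc_features) are
   hypothetical. */
#include <stdlib.h>
#include <string.h>

struct ppc_features
{
  int have_crypto_ext;
};

static void
get_ppc_features(struct ppc_features *features)
{
  const char *s = getenv("NETTLE_FAT_OVERRIDE");

  if (s)
    {
      /* e.g. NETTLE_FAT_OVERRIDE=none or NETTLE_FAT_OVERRIDE=crypto_ext */
      features->have_crypto_ext = (strstr(s, "crypto_ext") != NULL);
      return;
    }

  /* No override: a real implementation would fall back to the runtime
     hardware probe here; this sketch just defaults to the C code. */
  features->have_crypto_ext = 0;
}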
Regards /Niels
nisse@lysator.liu.se (Niels Möller) writes:
To get going, I've merged this and the machine.m4 patch to a development branch. I'd like to do things stepwise, first do the minimal configure changes to get AES working (and maybe with default on, to get it exercised by the .gitlab-ci machinery),
Seems to pass tests! See https://gitlab.com/gnutls/nettle/-/jobs/647514000 and https://gitlab.com/gnutls/nettle/-/jobs/647514003
The branch is named "power-asm-wip".
Regards, /Niels
On Mon, Jul 20, 2020 at 8:41 PM Niels Möller nisse@lysator.liu.se wrote:
Latency less than one cycle sounds wrong. Usually, simple ALU
instructions like xor have a latency of exactly one cycle (i.e., when an instruction starts executing, with all inputs available, the result is available to dependent instructions exactly one cycle later), while deeply pipelined instructions, e.g., multiplication, can have a latency of several cycles but still a throughput of one or a few instructions per cycle.
I had the same concern. I measured the clock time from the start of the instruction's execution until the start of the next dependent instruction. I'm sure about the latency numbers, but not sure how to reconcile them with cycle counts.
I take it "P8" in the path is for power 8? Are the crypto extensions
always available for power 8? If not, directory should be named differently.
Yes, it stands for POWER8. It's the first processor that supports the crypto extensions, so tying the crypto extensions to POWER8 is fine.
To get going, I've merged this and the machine.m4 patch to a development
branch. I'd like to do things stepwise, first do the minimal configure changes to get AES working (and maybe with default on, to get it exercised by the .gitlab-ci machinery), then add ghash and fat builds (not sure in which order). I wanted to also merge the README patch right away, but that failed due to line breaks in the email.
Great, I will reupload the README file without incompatible line breaks.
BTW, about fat tests, I'm considering adding a make target "check-fat"
which will run make check with some different settings of NETTLE_FAT_OVERRIDE (platform specific, and determined by configure).
I can help implement this feature if you give me more details on how to go about it.
Regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
On Mon, Jul 20, 2020 at 8:41 PM Niels Möller nisse@lysator.liu.se wrote:
Latency less than one cycle sounds wrong.
I had the same concern. I measured the clock time from the start of the instruction's execution until the start of the next dependent instruction. I'm sure about the latency numbers, but not sure how to reconcile them with cycle counts.
You may need to have a *long* chain of dependent instructions to get an accurate measurement of latency.
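For instance (a rough sketch, not a polished benchmark): time a long chain where every vec_xor depends on the previous result, and compare with a loop of independent operations for throughput. Note that the POWER time base ticks much more slowly than the core clock, so time base ticks per operation are not cycles per operation without a conversion.

/* Sketch of a latency measurement: each vec_xor depends on the
   previous result, so elapsed time / N approximates the latency of a
   single vxor (in time base ticks, not core cycles).  Assumes GCC on
   POWER with <altivec.h>. */
#include <altivec.h>
#include <stdint.h>
#include <stdio.h>

#define N 100000000L

int
main(void)
{
  vector unsigned char a = vec_splats((unsigned char) 0x55);
  vector unsigned char b = vec_splats((unsigned char) 0xaa);
  uint64_t t0, t1;
  long i;

  t0 = __builtin_ppc_get_timebase();
  for (i = 0; i < N; i++)
    {
      a = vec_xor(a, b);
      /* Empty asm keeps the compiler from collapsing the chain. */
      __asm__ volatile("" : "+v"(a));
    }
  t1 = __builtin_ppc_get_timebase();

  printf("%.3f time base ticks per dependent vxor\n",
         (double) (t1 - t0) / N);
  return 0;
}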
I take it "P8" in the path is for power 8? Are the crypto extensions always available for power 8? If not, directory should be named differently.
Yes, it stands for POWER8. It's the first processor that supports the crypto extensions, so tying the crypto extensions to POWER8 is fine.
But in the patch for fat builds, you do the runtime check as
+ hwcap2 = getauxval(AT_HWCAP2);
+ features->have_crypto_ext =
+   (hwcap2 & PPC_FEATURE2_VEC_CRYPTO) == PPC_FEATURE2_VEC_CRYPTO ? 1 : 0;
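A self-contained version of that probe, for testing outside the library, might look like this (a sketch; the fallback #define is only there in case the libc headers don't expose the constant):

/* Sketch: standalone check for the POWER8 in-core crypto extension
   (vcipher, vncipher, vpmsumd, ...) via the AT_HWCAP2 auxiliary
   vector.  Assumes Linux with glibc's <sys/auxv.h>. */
#include <sys/auxv.h>
#include <stdio.h>

#ifndef PPC_FEATURE2_VEC_CRYPTO
#define PPC_FEATURE2_VEC_CRYPTO 0x02000000  /* value from the kernel headers */
#endif

int
main(void)
{
  unsigned long hwcap2 = getauxval(AT_HWCAP2);
  int have_crypto_ext = (hwcap2 & PPC_FEATURE2_VEC_CRYPTO) != 0;

  printf("vector crypto extension: %s\n",
         have_crypto_ext ? "available" : "not available");
  return 0;
}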
I think I would prefer to have a matching directory name in the source tree, e.g., powerpc64/crypto_ext.
Are the aes instructions and the ghash instructions (which I imagine also have non-cryptographic uses) part of the same extension?
BTW, about fat tests, I'm considering adding a make target "check-fat" which will run make check with some different settings of NETTLE_FAT_OVERRIDE (platform specific, and determined by configure).
I can help implementing this feature if you give me more details on how to go with it.
The main thing I'm unsure about is that I don't know what extensions the ci test machines can be expected to have. For cross tests, it shouldn't be an issue as long as qemu supports all extensions of interest. But for a native x86_64 fat build, do the test machines have, e.g., the "sha_ni" extension? If not, we'd need to find out, and prune what fat variants we test.
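One quick way to find out on a given x86_64 machine is a cpuid probe like the following (a sketch; the SHA extensions are reported in CPUID leaf 7, EBX bit 29, and show up as "sha_ni" in /proc/cpuinfo):

/* Sketch: check for the x86 SHA extensions on the machine running the
   tests.  Assumes gcc or clang, which both ship <cpuid.h>. */
#include <cpuid.h>
#include <stdio.h>

int
main(void)
{
  unsigned eax, ebx, ecx, edx;

  if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
    {
      printf("CPUID leaf 7 not available\n");
      return 1;
    }
  printf("sha_ni: %s\n", (ebx & (1u << 29)) ? "yes" : "no");
  return 0;
}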
Regards, /Niels
On Wed, Jul 22, 2020 at 6:04 PM Niels Möller nisse@lysator.liu.se wrote:
But in the patch for fat builds, you do the runtime check as
hwcap2 = getauxval(AT_HWCAP2);
features->have_crypto_ext =
(hwcap2 & PPC_FEATURE2_VEC_CRYPTO) == PPC_FEATURE2_VEC_CRYPTO ? 1 : 0;
I think I would prefer to have a matching directory name in the source tree, e.g., powerpc64/crypto_ext.
Are the aes instructions and the ghash instructions (which I imagine also has non-cryptographic uses) part of the same extension?
Yes, both are part of the same extension. I considered calling the directory "P8" for three reasons:

- POWER8 is the first processor that supports the crypto extensions
- I measured the throughput and latency of the instructions on POWER8
- The current implementations can be enhanced further for POWER9 and newer by using arch 3.00-specific instructions, which were introduced in POWER9, so we can call the directory of the new implementations "P9"
The main thing I'm unsure about is that I don't know what extensions the ci test machines can be expected to have. For cross tests, it shouldn't be an issue as long as qemu supports all extensions of interest. But for a native x86_64 fat build, do the test machines have, e.g., the "sha_ni" extension? If not, we'd need to find out, and prune what fat variants we test.
I tested on my fork; "sha_ni" is not supported on the GitLab CI machines.
Regards, Mamone
Maamoun TK maamoun.tk@googlemail.com writes:
Yes, both are part of the same extension. I considered calling the directory "P8" for three reasons:
- POWER8 is the first processor that supports the crypto extensions
- I measured the throughput and latency of the instructions on POWER8
- The current implementations can be enhanced further for POWER9 and newer
by using arch 3.00-specific instructions, which were introduced in POWER9, so we can call the directory of the new implementations "P9"
Ok, let's stay with that naming (but I'll consider changing to lowercase "p8", to match other directory names). If it turns out something more fine-grained is needed later, files can be moved around then.
Regards, /Niels
Sounds good.
Thank you, Mamone
On Fri, Jul 31, 2020 at 9:42 PM Niels Möller nisse@lysator.liu.se wrote:
Maamoun TK maamoun.tk@googlemail.com writes:
Yes, both are part of the same extension. I considered calling the directory "P8" for three reasons:
- POWER8 is the first processor that supports the crypto extensions
- I measured the throughput and latency of the instructions on POWER8
- The current implementations can be enhanced further for POWER9 and newer by using arch 3.00-specific instructions, which were introduced in POWER9, so we can call the directory of the new implementations "P9"
Ok, let's stay with that naming (but I'll consider changing to lowercase "p8", to match other directory names). If it turns out something more fine-grained is needed later, files can be moved around then.
Regards, /Niels
-- Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677. Internet email is subject to wholesale government surveillance.
On Mon, Jul 20, 2020 at 8:41 PM Niels Möller nisse@lysator.liu.se wrote:
then add ghash and fat builds (not sure in which order).
I forgot to mention that you can merge them at any order.
Regards, Mamone
nisse@lysator.liu.se (Niels Möller) writes:
BTW, about fat tests, I'm considering adding a make target "check-fat" which will run make check with some different settings of NETTLE_FAT_OVERRIDE (platform specific, and determined by configure).
I've added this now, with fairly solid coverage for ARM and less coverage for x86_64.
Regards, /Niels
I will add PPC to this check.
Thank you, Mamone
On Fri, Jul 31, 2020 at 8:56 PM Niels Möller nisse@lysator.liu.se wrote:
nisse@lysator.liu.se (Niels Möller) writes:
BTW, about fat tests, I'm considering adding a make target "check-fat" which will run make check with some different settings of NETTLE_FAT_OVERRIDE (platform specific, and determined by configure).
I've added this now, with fairly solid coverage for ARM and less coverage for x86_64.
Regards, /Niels
-- Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677. Internet email is subject to wholesale government surveillance.