Hi,
This series of patches adds powerpc64 assembly for the modp/redc functions for elliptic curves P192, P224, P256, P384, P521, X25519 and X448. It results in 15-30% performance improvements as measured on a POWER9 system using hogweed-benchmark.
I posted the modified code in the earlier email thread, but I think posting the patches as a separate series will make them easier to cherry-pick.
V2 changes: - Use actual register names when storing/restoring from stack - Drop m4 definitions which are not in use - Simplify C2 folding for P192 curve
Amitay Isaacs (2): ecc: Add powerpc64 assembly for ecc_192_modp ecc: Add powerpc64 assembly for ecc_224_modp
Martin Schwenke (4): ecc: Add powerpc64 assembly for ecc_384_modp ecc: Add powerpc64 assembly for ecc_521_modp ecc: Add powerpc64 assembly for ecc_25519_modp ecc: Add powerpc64 assembly for ecc_448_modp
powerpc64/ecc-curve25519-modp.asm | 101 +++++++++++++ powerpc64/ecc-curve448-modp.asm | 174 +++++++++++++++++++++++ powerpc64/ecc-secp192r1-modp.asm | 87 ++++++++++++ powerpc64/ecc-secp224r1-modp.asm | 123 ++++++++++++++++ powerpc64/ecc-secp384r1-modp.asm | 227 ++++++++++++++++++++++++++++++ powerpc64/ecc-secp521r1-modp.asm | 166 ++++++++++++++++++++++ 6 files changed, 878 insertions(+) create mode 100644 powerpc64/ecc-curve25519-modp.asm create mode 100644 powerpc64/ecc-curve448-modp.asm create mode 100644 powerpc64/ecc-secp192r1-modp.asm create mode 100644 powerpc64/ecc-secp224r1-modp.asm create mode 100644 powerpc64/ecc-secp384r1-modp.asm create mode 100644 powerpc64/ecc-secp521r1-modp.asm
Signed-off-by: Amitay Isaacs amitay@ozlabs.org --- powerpc64/ecc-secp192r1-modp.asm | 87 ++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 powerpc64/ecc-secp192r1-modp.asm
diff --git a/powerpc64/ecc-secp192r1-modp.asm b/powerpc64/ecc-secp192r1-modp.asm new file mode 100644 index 00000000..ee38ec60 --- /dev/null +++ b/powerpc64/ecc-secp192r1-modp.asm @@ -0,0 +1,87 @@ +C powerpc64/ecc-secp192r1-modp.asm + +ifelse(` + Copyright (C) 2021 Amitay Isaacs, IBM Corporation + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + + .file "ecc-secp192r1-modp.asm" + +define(`RP', `r4') +define(`XP', `r5') + +define(`T0', `r6') +define(`T1', `r7') +define(`T2', `r8') +define(`T3', `r9') +define(`C1', `r10') +define(`C2', `r11') + + C void ecc_secp192r1_modp (const struct ecc_modulo *m, mp_limb_t *rp) + .text +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_ecc_secp192r1_modp) + ld T0, 0(XP) + ld T1, 8(XP) + ld T2, 16(XP) + + li C1, 0 + li C2, 0 + + ld T3, 24(XP) + addc T0, T3, T0 + adde T1, T3, T1 + addze T2, T2 + addze C1, C1 + + ld T3, 32(XP) + addc T1, T3, T1 + adde T2, T3, T2 + addze C1, C1 + + ld T3, 40(XP) + addc T0, T3, T0 + adde T1, T3, T1 + adde T2, T3, T2 + addze C1, C1 + + addc T0, C1, T0 + adde T1, C1, T1 + addze T2, T2 + addze C2, C2 + + addc T0, C2, T0 + adde T1, C2, T1 + addze T2, T2 + + std T0, 0(RP) + std T1, 8(RP) + std T2, 16(RP) + + blr +EPILOGUE(_nettle_ecc_secp192r1_modp)
Signed-off-by: Amitay Isaacs amitay@ozlabs.org --- powerpc64/ecc-secp224r1-modp.asm | 123 +++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 powerpc64/ecc-secp224r1-modp.asm
diff --git a/powerpc64/ecc-secp224r1-modp.asm b/powerpc64/ecc-secp224r1-modp.asm new file mode 100644 index 00000000..e4bbf366 --- /dev/null +++ b/powerpc64/ecc-secp224r1-modp.asm @@ -0,0 +1,123 @@ +C powerpc64/ecc-secp224r1-modp.asm + +ifelse(` + Copyright (C) 2021 Amitay Isaacs, IBM Corporation + + Based on x86_64/ecc-secp224r1-modp.asm + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + + .file "ecc-secp224r1-modp.asm" + +define(`SP', `r1') + +define(`RP', `r4') +define(`XP', `r5') + +define(`T0', `r6') +define(`T1', `r7') +define(`H0', `r8') +define(`H1', `r9') +define(`H2', `r10') +define(`F0', `r11') +define(`F1', `r12') +define(`F2', `r14') +define(`T2', `r3') + + C void ecc_secp224r1_modp (const struct ecc_modulo *m, mp_limb_t *rp) + .text +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_ecc_secp224r1_modp) + std r14, -8(SP) + + ld H0, 48(XP) + ld H1, 56(XP) + C set (F2, F1, F0) <-- (H1, H0) << 32 + sldi F0, H0, 32 + srdi F1, H0, 32 + sldi T0, H1, 32 + srdi F2, H1, 32 + or F1, T0, F1 + + li H2, 0 + ld T0, 16(XP) + ld T1, 24(XP) + subfc T0, F0, T0 + subfe T1, F1, T1 + subfe H0, F2, H0 + addme H1, H1 + + ld T2, 32(XP) + addc H0, T2, H0 + ld T2, 40(XP) + adde H1, T2, H1 + addze H2, H2 + + C Set (F2, F1, F0) <-- (H2, H1, H0) << 32 + sldi F0, H0, 32 + srdi F1, H0, 32 + addc H0, T0, H0 + sldi T0, H1, 32 + srdi F2, H1, 32 + adde H1, T1, H1 + sldi T1, H2, 32 + addze H2, H2 + or F1, T0, F1 + or F2, T1, F2 + + ld T0, 0(XP) + ld T1, 8(XP) + subfc T0, F0, T0 + subfe T1, F1, T1 + subfe H0, F2, H0 + addme H1, H1 + addme H2, H2 + + srdi F0, H1, 32 + sldi F1, H2, 32 + or F0, F1, F0 + clrrdi F1, H1, 32 + mr F2, H2 + clrldi H1, H1, 32 + + subfc T0, F0, T0 + addme F1, F1 + addme F2, F2 + addc T1, F1, T1 + adde H0, F2, H0 + addze H1, H1 + + std T0, 0(RP) + std T1, 8(RP) + std H0, 16(RP) + std H1, 24(RP) + + ld r14, -8(SP) + + blr +EPILOGUE(_nettle_ecc_secp224r1_modp)
From: Martin Schwenke martin@meltin.net
Signed-off-by: Martin Schwenke martin@meltin.net Signed-off-by: Amitay Isaacs amitay@ozlabs.org Signed-off-by: Alastair D'Silva alastair@d-silva.org --- powerpc64/ecc-secp384r1-modp.asm | 227 +++++++++++++++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 powerpc64/ecc-secp384r1-modp.asm
diff --git a/powerpc64/ecc-secp384r1-modp.asm b/powerpc64/ecc-secp384r1-modp.asm new file mode 100644 index 00000000..d673bf1e --- /dev/null +++ b/powerpc64/ecc-secp384r1-modp.asm @@ -0,0 +1,227 @@ +C powerpc64/ecc-secp384r1-modp.asm + +ifelse(` + Copyright (C) 2021 Martin Schwenke, Amitay Isaacs & Alastair D´Silva, IBM Corporation + + Based on x86_64/ecc-secp256r1-redc.asm + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + + .file "ecc-secp384r1-modp.asm" + +C Register usage: + +define(`SP', `r1') + +define(`RP', `r4') +define(`XP', `r5') + +define(`D5', `r6') +define(`T0', `r7') +define(`T1', `r8') +define(`T2', `r9') +define(`T3', `r10') +define(`T4', `r11') +define(`T5', `r12') +define(`H0', `r14') +define(`H1', `r15') +define(`H2', `r16') +define(`H3', `r17') +define(`H4', `r18') +define(`H5', `r19') +define(`C2', `r3') +define(`C0', H5) C Overlap +define(`TMP', XP) C Overlap + + + C void ecc_secp384r1_modp (const struct ecc_modulo *m, mp_limb_t *rp, mp_limb_t *xp) + .text +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_ecc_secp384r1_modp) + + std r14, -48(SP) + std r15, -40(SP) + std r16, -32(SP) + std r17, -24(SP) + std r18, -16(SP) + std r19, -8(SP) + + C First get top 2 limbs, which need folding twice. + C B^10 = B^6 + B^4 + 2^32 (B-1)B^4. + C We handle the terms as follow: + C + C B^6: Folded immediatly. + C + C B^4: Delayed, added in in the next folding. + C + C 2^32(B-1) B^4: Low half limb delayed until the next + C folding. Top 1.5 limbs subtracted and shifter now, resulting + C in 2.5 limbs. The low limb saved in D5, high 1.5 limbs added + C in. 
+ + ld H4, 80(XP) + ld H5, 88(XP) + C Shift right 32 bits, into H1, H0 + srdi H1, H5, 32 + sldi D5, H5, 32 + srdi H0, H4, 32 + or H0, H0, D5 + + C H1 H0 + C - H1 H0 + C -------- + C H1 H0 D5 + subfic D5, H0, 0 + subfe H0, H1, H0 + addme H1, H1 + + li C2, 0 + addc H0, H4, H0 + adde H1, H5, H1 + addze C2, C2 + + C Add in to high part + ld T1, 48(XP) + ld T2, 56(XP) + addc H0, T1, H0 + adde H1, T2, H1 + addze C2, C2 C Do C2 later + + C +1 term + ld T0, 0(XP) + ld T1, 8(XP) + ld T2, 16(XP) + ld T3, 24(XP) + ld T4, 32(XP) + ld T5, 40(XP) + ld H2, 64(XP) + ld H3, 72(XP) + addc T0, H0, T0 + adde T1, H1, T1 + adde T2, H2, T2 + adde T3, H3, T3 + adde T4, H4, T4 + adde T5, H5, T5 + li C0, 0 + addze C0, C0 + + C +B^2 term + addc T2, H0, T2 + adde T3, H1, T3 + adde T4, H2, T4 + adde T5, H3, T5 + addze C0, C0 + + C Shift left, including low half of H4 + sldi H4, H4, 32 + srdi TMP, H3, 32 + or H4, TMP, H4 + + sldi H3, H3, 32 + srdi TMP, H2, 32 + or H3, TMP, H3 + + sldi H2, H2, 32 + srdi TMP, H1, 32 + or H2, TMP, H2 + + sldi H1, H1, 32 + srdi TMP, H0, 32 + or H1, TMP, H1 + + sldi H0, H0, 32 + + C H4 H3 H2 H1 H0 0 + C - H4 H3 H2 H1 H0 + C --------------- + C H4 H3 H2 H1 H0 TMP + + subfic TMP, H0, 0 + subfe H0, H1, H0 + subfe H1, H2, H1 + subfe H2, H3, H2 + subfe H3, H4, H3 + addme H4, H4 + + addc T0, TMP, T0 + adde T1, H0, T1 + adde T2, H1, T2 + adde T3, H2, T3 + adde T4, H3, T4 + adde T5, H4, T5 + addze C0, C0 + + C Remains to add in C2 and C0 + C Set H1, H0 = (2^96 - 2^32 + 1) C0 + sldi H1, C0, 32 + subfc H0, H1, C0 + addme H1, H1 + + C Set H3, H2 = (2^96 - 2^32 + 1) C2 + sldi H3, C2, 32 + subfc H2, H3, C2 + addme H3, H3 + addc H2, C0, H2 + + li C0, 0 + addc T0, H0, T0 + adde T1, H1, T1 + adde T2, H2, T2 + adde T3, H3, T3 + adde T4, C2, T4 + adde T5, D5, T5 C Value delayed from initial folding + addze C0, C0 + + C Final unlikely carry + sldi H1, C0, 32 + subfc H0, H1, C0 + addme H1, H1 + + addc T0, H0, T0 + adde T1, H1, T1 + adde T2, C0, T2 + addze T3, T3 + addze T4, T4 + addze 
T5, T5 + + std T0, 0(RP) + std T1, 8(RP) + std T2, 16(RP) + std T3, 24(RP) + std T4, 32(RP) + std T5, 40(RP) + + ld r14, -48(SP) + ld r15, -40(SP) + ld r16, -32(SP) + ld r17, -24(SP) + ld r18, -16(SP) + ld r19, -8(SP) + + blr +EPILOGUE(_nettle_ecc_secp384r1_modp)
From: Martin Schwenke martin@meltin.net
Signed-off-by: Martin Schwenke martin@meltin.net Signed-off-by: Alastair D'Silva alastair@d-silva.org --- powerpc64/ecc-secp521r1-modp.asm | 166 +++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 powerpc64/ecc-secp521r1-modp.asm
diff --git a/powerpc64/ecc-secp521r1-modp.asm b/powerpc64/ecc-secp521r1-modp.asm new file mode 100644 index 00000000..e989f9cf --- /dev/null +++ b/powerpc64/ecc-secp521r1-modp.asm @@ -0,0 +1,166 @@ +C powerpc64/ecc-secp521r1-modp.asm + +ifelse(` + Copyright (C) 2021 Martin Schwenke & Alastair D´Silva, IBM Corporation + + Based on x86_64/ecc-secp521r1-modp.asm + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + + .file "ecc-secp521r1-modp.asm" + +define(`SP', `r1') + +define(`RP', `r4') +define(`XP', `r5') + +define(`U0', `r6') +define(`U1', `r7') +define(`U2', `r8') +define(`U3', `r9') +define(`U4', `r10') +define(`U5', `r11') +define(`U6', `r12') +define(`U7', `r14') +define(`U8', `r15') +define(`U9', `r16') + +define(`T0', `r3') +define(`T1', `r17') + + + C void ecc_secp521r1_modp (const struct ecc_modulo *p, mp_limb_t *rp, mp_limb_t *xp) + .text +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_ecc_secp521r1_modp) + + std r14, -32(SP) + std r15, -24(SP) + std r16, -16(SP) + std r17, -8(SP) + + C Read top 17 limbs, shift left 55 bits + ld U1, 72(XP) + sldi U0, U1, 55 + srdi U1, U1, 9 + + ld T0, 80(XP) + srdi U2, T0, 9 + sldi T0, T0, 55 + or U1, T0, U1 + + ld T0, 88(XP) + srdi U3, T0, 9 + sldi T0, T0, 55 + or U2, T0, U2 + + ld T0, 96(XP) + srdi U4, T0, 9 + sldi T0, T0, 55 + or U3, T0, U3 + + ld T0, 104(XP) + srdi U5, T0, 9 + sldi T0, T0, 55 + or U4, T0, U4 + + ld T0, 112(XP) + srdi U6, T0, 9 + sldi T0, T0, 55 + or U5, T0, U5 + + ld T0, 120(XP) + srdi U7, T0, 9 + sldi T0, T0, 55 + or U6, T0, U6 + + ld T0, 128(XP) + srdi U8, T0, 9 + sldi T0, T0, 55 + or U7, T0, U7 + + ld T0, 136(XP) + srdi U9, T0, 9 + sldi T0, T0, 55 + or U8, T0, U8 + + ld T0, 0(XP) + ld T1, 8(XP) + addc U0, T0, U0 + adde U1, T1, U1 + ld T0, 16(XP) + ld T1, 24(XP) + adde U2, T0, U2 + adde U3, T1, U3 + ld T0, 32(XP) + ld T1, 40(XP) + adde U4, T0, U4 + adde U5, T1, U5 + ld T0, 48(XP) + ld T1, 56(XP) + adde U6, T0, U6 + adde U7, T1, U7 + ld T0, 64(XP) + adde U8, T0, U8 + addze U9, U9 + + C Top limbs are <U9, U8>. Keep low 9 bits of 8, and fold the + C top bits (at most 65 bits). + srdi T0, U8, 9 + andi. 
U8, U8, 0x1ff + srdi T1, U9, 9 + sldi U9, U9, 55 + or T0, U9, T0 + + addc U0, T0, U0 + adde U1, T1, U1 + addze U2, U2 + addze U3, U3 + addze U4, U4 + addze U5, U5 + addze U6, U6 + addze U7, U7 + addze U8, U8 + + std U0, 0(RP) + std U1, 8(RP) + std U2, 16(RP) + std U3, 24(RP) + std U4, 32(RP) + std U5, 40(RP) + std U6, 48(RP) + std U7, 56(RP) + std U8, 64(RP) + + ld r14, -32(SP) + ld r15, -24(SP) + ld r16, -16(SP) + ld r17, -8(SP) + + blr +EPILOGUE(_nettle_ecc_secp521r1_modp)
From: Martin Schwenke martin@meltin.net
Signed-off-by: Martin Schwenke martin@meltin.net Signed-off-by: Alastair D'Silva alastair@d-silva.org --- powerpc64/ecc-curve25519-modp.asm | 101 ++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 powerpc64/ecc-curve25519-modp.asm
diff --git a/powerpc64/ecc-curve25519-modp.asm b/powerpc64/ecc-curve25519-modp.asm new file mode 100644 index 00000000..8d87eeaf --- /dev/null +++ b/powerpc64/ecc-curve25519-modp.asm @@ -0,0 +1,101 @@ +C powerpc64/ecc-25519-modp.asm + +ifelse(` + Copyright (C) 2021 Martin Schwenke & Alastair D´Silva, IBM Corporation + + Based on x86_64/ecc-25519-modp.asm + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + + .file "ecc-25519-modp.asm" + +define(`RP', `r4') +define(`XP', `r5') + +define(`U0', `r6') C Overlaps unused modulo input +define(`U1', `r7') +define(`U2', `r8') +define(`U3', `r9') +define(`T0', `r10') +define(`T1', `r11') +define(`M', `r12') + +define(`UN', r3) + + C void ecc_curve25519_modp (const struct ecc_modulo *p, mp_limb_t *rp, mp_limb_t *xp) + .text +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_ecc_curve25519_modp) + + C First fold the limbs affecting bit 255 + ld UN, 56(XP) + li M, 38 + mulhdu T1, M, UN + mulld UN, M, UN + ld U3, 24(XP) + li T0, 0 + addc U3, UN, U3 + adde T0, T1, T0 + + ld UN, 40(XP) + mulhdu U2, M, UN + mulld UN, M, UN + + addc U3, U3, U3 + adde T0, T0, T0 + srdi U3, U3, 1 C Undo shift, clear high bit + + C Fold the high limb again, together with RP[5] + li T1, 19 + mulld T0, T1, T0 + ld U0, 0(XP) + ld U1, 8(XP) + ld T1, 16(XP) + addc U0, T0, U0 + adde U1, UN, U1 + ld T0, 32(XP) + adde U2, U2, T1 + addze U3, U3 + + mulhdu T1, M, T0 + mulld T0, M, T0 + addc U0, T0, U0 + adde U1, T1, U1 + std U0, 0(RP) + std U1, 8(RP) + + ld T0, 48(XP) + mulhdu T1, M, T0 + mulld UN, M, T0 + adde U2, UN, U2 + adde U3, T1, U3 + std U2, 16(RP) + std U3, 24(RP) + + blr +EPILOGUE(_nettle_ecc_curve25519_modp)
Amitay Isaacs amitay@ozlabs.org writes:
--- /dev/null +++ b/powerpc64/ecc-curve25519-modp.asm @@ -0,0 +1,101 @@ +C powerpc64/ecc-25519-modp.asm +define(`RP', `r4') +define(`XP', `r5')
+define(`U0', `r6') C Overlaps unused modulo input +define(`U1', `r7') +define(`U2', `r8') +define(`U3', `r9') +define(`T0', `r10') +define(`T1', `r11') +define(`M', `r12')
+define(`UN', r3)
The comment seems misplaced: it's UN / r3 that overlaps the unused modulo input, right?
- C void ecc_curve25519_modp (const struct ecc_modulo *p, mp_limb_t *rp, mp_limb_t *xp)
- .text
+define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_ecc_curve25519_modp)
- C First fold the limbs affecting bit 255
- ld UN, 56(XP)
- li M, 38
- mulhdu T1, M, UN
- mulld UN, M, UN
- ld U3, 24(XP)
- li T0, 0
- addc U3, UN, U3
- adde T0, T1, T0
- ld UN, 40(XP)
- mulhdu U2, M, UN
- mulld UN, M, UN
- addc U3, U3, U3
- adde T0, T0, T0
- srdi U3, U3, 1 C Undo shift, clear high bit
- C Fold the high limb again, together with RP[5]
- li T1, 19
- mulld T0, T1, T0
- ld U0, 0(XP)
- ld U1, 8(XP)
- ld T1, 16(XP)
- addc U0, T0, U0
- adde U1, UN, U1
- ld T0, 32(XP)
- adde U2, U2, T1
- addze U3, U3
- mulhdu T1, M, T0
- mulld T0, M, T0
- addc U0, T0, U0
- adde U1, T1, U1
- std U0, 0(RP)
- std U1, 8(RP)
- ld T0, 48(XP)
- mulhdu T1, M, T0
- mulld UN, M, T0
- adde U2, UN, U2
- adde U3, T1, U3
- std U2, 16(RP)
- std U3, 24(RP)
- blr
+EPILOGUE(_nettle_ecc_curve25519_modp)
Looks good. I must admit that the x86_64 version this is based on is not so easy to follow.
Regards, /Niels
From: Martin Schwenke martin@meltin.net
Signed-off-by: Martin Schwenke martin@meltin.net Signed-off-by: Amitay Isaacs amitay@gmail.com --- powerpc64/ecc-curve448-modp.asm | 174 ++++++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 powerpc64/ecc-curve448-modp.asm
diff --git a/powerpc64/ecc-curve448-modp.asm b/powerpc64/ecc-curve448-modp.asm new file mode 100644 index 00000000..42ed1eb1 --- /dev/null +++ b/powerpc64/ecc-curve448-modp.asm @@ -0,0 +1,174 @@ +C powerpc/ecc-curve448-modp.asm + +ifelse(` + Copyright (C) 2021 Martin Schwenke & Amitay Isaacs, IBM Corporation + + Based on x86_64/ecc-curve448-modp.asm + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + + .file "ecc-curve448-modp.asm" + +define(`SP', `r1') + +define(`RP', `r4') +define(`XP', `r5') + +define(`X0', `r3') +define(`X1', `r9') +define(`X2', `r10') +define(`X3', `r11') +define(`X4', `r12') +define(`X5', `r14') +define(`X6', `r15') +define(`X7', `r16') +define(`T0', `r6') +define(`T1', `r7') +define(`T2', `r8') +define(`TT', `r17') + +define(`LO', `TT') C Overlap + + C void ecc_curve448_modp (const struct ecc_modulo *p, mp_limb_t *rp, mp_limb_t *xp) + .text +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_ecc_curve448_modp) + + std r14, -32(SP) + std r15, -24(SP) + std r16, -16(SP) + std r17, -8(SP) + + C First load the values to be shifted by 32. 
+ ld T0, 88(XP) C use for X0, X1 + ld T1, 96(XP) C use for X2 + ld T2, 104(XP) C use for X3 + ld X4, 56(XP) + ld X5, 64(XP) + ld X6, 72(XP) + ld X7, 80(XP) + + C Multiply by 2^32 + sldi X0, T0, 32 + srdi LO, T0, 32 + sldi X1, T1, 32 + or X1, X1, LO + srdi LO, T1, 32 + sldi X2, T2, 32 + or X2, X2, LO + srdi LO, T2, 32 + sldi X3, X4, 32 + or X3, X3, LO + srdi LO, X4, 32 + sldi X4, X5, 32 + or X4, X4, LO + srdi LO, X5, 32 + sldi X5, X6, 32 + or X5, X5, LO + srdi LO, X6, 32 + sldi X6, X7, 32 + or X6, X6, LO + + srdi X7, X7, 32 + + C Multiply by 2 + addc T0, T0, T0 + adde T1, T1, T1 + adde T2, T2, T2 + addze X7, X7 + + C Main additions + ld TT, 56(XP) + addc X0, TT, X0 + ld TT, 64(XP) + adde X1, TT, X1 + ld TT, 72(XP) + adde X2, TT, X2 + ld TT, 80(XP) + adde X3, TT, X3 + adde X4, T0, X4 + adde X5, T1, X5 + adde X6, T2, X6 + addze X7, X7 + + ld T0, 0(XP) + addc X0, T0, X0 + ld T1, 8(XP) + adde X1, T1, X1 + ld T2, 16(XP) + adde X2, T2, X2 + ld TT, 24(XP) + adde X3, TT, X3 + ld T0, 32(XP) + adde X4, T0, X4 + ld T1, 40(XP) + adde X5, T1, X5 + ld T2, 48(XP) + adde X6, T2, X6 + addze X7, X7 + + C X7 wraparound + sldi T0, X7, 32 + srdi T1, X7, 32 + li T2, 0 + addc X0, X7, X0 + addze X1, X1 + addze X2, X2 + adde X3, T0, X3 + adde X4, T1, X4 + addze X5, X5 + addze X6, X6 + addze T2, T2 + + C Final carry wraparound. Carry T2 > 0 only if + C X6 is zero, so carry is absorbed. + sldi T0, T2, 32 + + addc X0, T2, X0 + addze X1, X1 + addze X2, X2 + adde X3, T0, X3 + addze X4, X4 + addze X5, X5 + addze X6, X6 + + std X0, 0(RP) + std X1, 8(RP) + std X2, 16(RP) + std X3, 24(RP) + std X4, 32(RP) + std X5, 40(RP) + std X6, 48(RP) + + ld r14, -32(SP) + ld r15, -24(SP) + ld r16, -16(SP) + ld r17, -8(SP) + + blr +EPILOGUE(_nettle_ecc_curve448_modp)
Amitay Isaacs amitay@ozlabs.org writes:
I posted the modified codes in the earlier email thread, but I think posting them as a separate series will make them easier to cherry pick.
Thanks!
V2 changes:
- Use actual register names when storing/restoring from stack
- Drop m4 definitions which are not in use
- Simplify C2 folding for P192 curve
Amitay Isaacs (2): ecc: Add powerpc64 assembly for ecc_192_modp ecc: Add powerpc64 assembly for ecc_224_modp
Martin Schwenke (4): ecc: Add powerpc64 assembly for ecc_384_modp ecc: Add powerpc64 assembly for ecc_521_modp ecc: Add powerpc64 assembly for ecc_25519_modp ecc: Add powerpc64 assembly for ecc_448_modp
I merged secp192, secp384, secp521 a few days ago. The other three, secp224, curve25519, curve448 look good too (with one very minor comment fix which I can take care of). I'll do some local testing, then merge to master-updates for a run of the ci system, including tests on ppc big-endian.
Regards, /Niels
nettle-bugs@lists.lysator.liu.se