nisse@lysator.liu.se (Niels Möller) writes:
> It could likely be sped up further by processing 2, 3 or 4 blocks in parallel.
I've given 2 blocks in parallel a try, but it's not quite working yet. My work-in-progress code is below.
When I test it on the gcc112 machine, it fails with an illegal instruction (SIGILL) on this line, close to function entry:
.globl _nettle_chacha_2core
.type _nettle_chacha_2core,%function
.align 5
_nettle_chacha_2core:
	addis	2,12,(.TOC.-_nettle_chacha_2core)@ha
	addi	2,2,(.TOC.-_nettle_chacha_2core)@l
	.localentry _nettle_chacha_2core, .-_nettle_chacha_2core
	li	r8, 0x30
	vspltisw v1, 1
=>	vextractuw v1, v1, 0
I don't understand, from the manual, what's wrong with this. The intention of this piece of code is just to construct the value {1, 0, 0, 0} in one of the vector registers. Maybe there's a better way to do that?
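One possibility I see: vextractuw was only added in Power ISA 3.0, so if gcc112 is a pre-POWER9 machine it would SIGILL regardless of how the operands are encoded (and the vnegw a bit further down would then have the same problem). If that's the case, the constant could be built from older instructions instead, e.g. this untested sketch using v2 as a scratch register:

	vspltisw v1, 1		C {1,1,1,1}
	vspltisw v2, 0		C {0,0,0,0}
	vsldoi	v1, v1, v2, 12	C {1,0,0,0}

Or it could simply be loaded with lxvw4x from memory, like the .Lcount1 constant at the end of the file.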
Regards, /Niels
C powerpc64/p7/chacha-core-internal.asm
ifelse(`
Copyright (C) 2020 Niels Möller and Torbjörn Granlund

This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version.
or
* the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received copies of the GNU General Public License and the GNU Lesser General Public License along with this program. If not, see http://www.gnu.org/licenses/.
')
C Register usage:
C Arguments
define(`DST', `r3')
define(`SRC', `r4')
define(`ROUNDS', `r5')
C State, even elements in X, odd elements in Y
define(`X0', `v0')
define(`X1', `v1')
define(`X2', `v2')
define(`X3', `v3')
define(`Y0', `v4')
define(`Y1', `v5')
define(`Y2', `v6')
define(`Y3', `v7')

define(`ROT16', `v8')
define(`ROT12', `v9')
define(`ROT8', `v10')
define(`ROT7', `v11')

C Original input state
define(`S0', `v12')
define(`S1', `v13')
define(`S2', `v14')
define(`S3', `v15')
define(`S3p1', `v16')
define(`T0', `v17')
	.text
C _chacha_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
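C Generates two consecutive output blocks (2 x 64 bytes) at dst, the
C second one with the block counter incremented by one.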
define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_chacha_2core)
	li	r8, 0x30	C offset for x3
	vspltisw X1, 1		C {1,1,...,1}
	vextractuw X1, X1, 0	C {1,0,...,0}

	lxvw4x	VSR(X3), r8, SRC
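	C The second block's counter is one more than the first's.
	C Compute the increment {1, carry, 0, 0}, where carry accounts
	C for wrap-around of the low counter word.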
	vnegw	X0, X1		C {-1,0,...,0}
	vcmpequw Y3, X3, X0
	vand	Y3, Y3, X1	C Counter carry out
	vsldoi	Y3, Y3, Y3, 12	C Move carry to second word
	vor	Y3, Y3, X1	C {1,carry,0,...,0}
.Lshared_entry:
	vadduwm Y3, Y3, X3
	li	r6, 0x10	C set up some...
	li	r7, 0x20	C ...useful...
	lxvw4x	VSR(X0), 0, SRC
	lxvw4x	VSR(X1), r6, SRC
	lxvw4x	VSR(X2), r7, SRC
	vor	S0, X0, X0
	vor	S1, X1, X1
	vor	S2, X2, X2
	vor	S3, X3, X3
	vor	S3p1, Y3, Y3
	vmrgow	Y0, X0, X0	C  1  1  3  3
	vmrgew	X0, X0, X0	C  0  0  2  2
	vmrgow	Y1, X1, X1	C  5  5  7  7
	vmrgew	X1, X1, X1	C  4  4  6  6
	vmrgow	Y2, X2, X2	C  9  9 11 11
	vmrgew	X2, X2, X2	C  8  8 10 10
	vmrgow	Y3, X3, S3p1	C 13 13 15 15
	vmrgew	X3, X3, S3p1	C 12 12 14 14
	vspltisw ROT16, -16	C -16 works, since vrlw uses only the low 5 bits
	vspltisw ROT12, 12
	vspltisw ROT8, 8
	vspltisw ROT7, 7
	srdi	ROUNDS, ROUNDS, 1
	mtctr	ROUNDS
.Loop:
	C Register layout (A is first block, B is second block)
	C
	C X0:  A0  B0  A2  B2	Y0:  A1  B1  A3  B3
	C X1:  A4  B4  A6  B6	Y1:  A5  B5  A7  B7
	C X2:  A8  B8 A10 B10	Y2:  A9  B9 A11 B11
	C X3: A12 B12 A14 B14	Y3: A13 B13 A15 B15
	vadduwm X0, X0, X1
	vadduwm Y0, Y0, Y1
	vxor	X3, X3, X0
	vxor	Y3, Y3, Y0
	vrlw	X3, X3, ROT16
	vrlw	Y3, Y3, ROT16

	vadduwm X2, X2, X3
	vadduwm Y2, Y2, Y3
	vxor	X1, X1, X2
	vxor	Y1, Y1, Y2
	vrlw	X1, X1, ROT12
	vrlw	Y1, Y1, ROT12

	vadduwm X0, X0, X1
	vadduwm Y0, Y0, Y1
	vxor	X3, X3, X0
	vxor	Y3, Y3, Y0
	vrlw	X3, X3, ROT8
	vrlw	Y3, Y3, ROT8

	vadduwm X2, X2, X3
	vadduwm Y2, Y2, Y3
	vxor	X1, X1, X2
	vxor	Y1, Y1, Y2
	vrlw	X1, X1, ROT7
	vrlw	Y1, Y1, ROT7
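	C Swap the halves of X1, X2, Y2 and Y3, lining up each diagonal
	C of the state in one vector lane for the second half round.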
	vsldoi	X1, X1, X1, 8
	vsldoi	X2, X2, X2, 8
	vsldoi	Y2, Y2, Y2, 8
	vsldoi	Y3, Y3, Y3, 8
	C Register layout:
	C
	C X0:  A0  B0  A2  B2	Y0:  A1  B1  A3  B3
	C Y1:  A5  B5  A7  B7	X1:  A6  B6  A4  B4	(X1 swapped)
	C X2: A10 B10  A8  B8	Y2: A11 B11  A9  B9	(X2, Y2 swapped)
	C Y3: A15 B15 A13 B13	X3: A12 B12 A14 B14	(Y3 swapped)
	vadduwm X0, X0, Y1
	vadduwm Y0, Y0, X1
	vxor	Y3, Y3, X0
	vxor	X3, X3, Y0
	vrlw	Y3, Y3, ROT16
	vrlw	X3, X3, ROT16

	vadduwm X2, X2, Y3
	vadduwm Y2, Y2, X3
	vxor	Y1, Y1, X2
	vxor	X1, X1, Y2
	vrlw	Y1, Y1, ROT12
	vrlw	X1, X1, ROT12

	vadduwm X0, X0, Y1
	vadduwm Y0, Y0, X1
	vxor	Y3, Y3, X0
	vxor	X3, X3, Y0
	vrlw	Y3, Y3, ROT8
	vrlw	X3, X3, ROT8

	vadduwm X2, X2, Y3
	vadduwm Y2, Y2, X3
	vxor	Y1, Y1, X2
	vxor	X1, X1, Y2
	vrlw	Y1, Y1, ROT7
	vrlw	X1, X1, ROT7

	C Undo the lane swaps before the next double round
	vsldoi	X1, X1, X1, 8
	vsldoi	X2, X2, X2, 8
	vsldoi	Y2, Y2, Y2, 8
	vsldoi	Y3, Y3, Y3, 8

	bdnz	.Loop
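	C De-interleave: gather the first block's words into T0, X0, X1,
	C X2, and the second block's words into Y0, Y1, Y2, Y3.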
	vmrgew	T0, X0, Y0
	vmrgow	Y0, X0, Y0

	vmrgew	X0, X1, Y1
	vmrgow	Y1, X1, Y1

	vmrgew	X1, X2, Y2
	vmrgow	Y2, X2, Y2

	vmrgew	X2, X3, Y3
	vmrgow	Y3, X3, Y3
	vadduwm T0, T0, S0
	vadduwm Y0, Y0, S0
	vadduwm X0, X0, S1
	vadduwm Y1, Y1, S1
	vadduwm X1, X1, S2
	vadduwm Y2, Y2, S2
	vadduwm X2, X2, S3
	vadduwm Y3, Y3, S3p1
	stxvw4x VSR(T0), 0, DST
	stxvw4x VSR(X0), r6, DST
	stxvw4x VSR(X1), r7, DST
	stxvw4x VSR(X2), r8, DST

	addi	DST, DST, 64

	stxvw4x VSR(Y0), 0, DST
	stxvw4x VSR(Y1), r6, DST
	stxvw4x VSR(Y2), r7, DST
	stxvw4x VSR(Y3), r8, DST
	blr
EPILOGUE(_nettle_chacha_2core)
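C For the variant with a 32-bit counter, no carry can propagate into
C word 13, so the increment for the second block is just {1,0,...,0}.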
define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_chacha_2core32)
	li	r8, 0x30	C offset for x3
	vspltisw Y3, 1		C {1,1,...,1}
	vextractuw Y3, Y3, 0	C {1,0,...,0}
	lxvw4x	VSR(X3), r8, SRC
	b	.Lshared_entry
EPILOGUE(_nettle_chacha_2core32)
	.data
	.align	4
.Lcount1:
	.int	1,0,0,0