PPC chacha

24 Sep 2020


      I'm trying to learn a bit of ppc assembly. Below is an implementation of
_chacha_core. Seems to work, when tested on gcc112.fsffrance.org (just
put the file in the powerpc64 directory and reconfigure). This machine
is little-endian, I haven't yet tested on big-endian.
Unfortunately I don't get any accurate benchmark numbers on that
machine, but I think speedup may be on the order of 50%. It could likely
be sped up further by processing 2, 3 or 4 blocks in parallel, similar to
recent improvements for arm and x86_64. I'd like to do that after the
simpler single-block function is properly merged.
I'm not sure where it fits under powerpc64. The code doesn't need any
cryptographic extensions, but it depends on vector instructions as well
as VSX registers (for the unaligned load and store instructions). So I'd
need advice both on the directory hierarchy and compile time
configuration, and appropriate runtime tests for fat builds.
Comments on the code highly appreciated! It's the first ppc code I've
written, and the reference manual isn't that easy to navigate. The
vector instructions seem very nice to work with, and make for a shorter
QROUND than both x86_64 SSE and ARM Neon (these suffer a bit from a
missing vector rotate instruction).
Help with additional benchmarking would also be useful.
Regards,
/Niels
C powerpc64/chacha-core-internal.asm
ifelse(`
   Copyright (C) 2020 Niels Möller and Torbjörn Granlund
   This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
   modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.
or
* the GNU General Public License as published by the Free
       Software Foundation; either version 2 of the License, or (at your
       option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.
You should have received copies of the GNU General Public License and
   the GNU Lesser General Public License along with this program.  If
   not, see http://www.gnu.org/licenses/.
')
C Register usage
C
C Arguments, in the order of the C prototype (dst, src, rounds)
define(`DST',    `r3')
define(`SRC',    `r4')
define(`ROUNDS', `r5')

C Working state: each of X0..X3 holds one four-word row of the state
define(`X0', `v0')
define(`X1', `v1')
define(`X2', `v2')
define(`X3', `v3')

C Rotate counts used by QROUND, one vector per rotation amount
define(`ROT16', `v4')
define(`ROT12', `v5')
define(`ROT8',  `v6')
define(`ROT7',  `v7')

C Copy of the input state, added back to X0..X3 after the last round
define(`S0', `v8')
define(`S1', `v9')
define(`S2', `v10')
define(`S3', `v11')
C QSTEP(A, B, D, ROT)
C One add-xor-rotate step, applied to all four 32-bit words of each
C vector at once:
C	A += B, D ^= A, D = D lrot ROT
define(`QSTEP', `
	vadduwm	$1, $1, $2
	vxor	$3, $3, $1
	vrlw	$3, $3, $4
')

C QROUND(X0, X1, X2, X3)
C Four chained QSTEPs. Every step consumes the result of the previous
C one, so the order below must not be changed.
define(`QROUND', `
	QSTEP($1, $2, $4, ROT16)	C x0 += x1, x3 ^= x0, x3 lrot 16
	QSTEP($3, $4, $2, ROT12)	C x2 += x3, x1 ^= x2, x1 lrot 12
	QSTEP($1, $2, $4, ROT8)	C x0 += x1, x3 ^= x0, x3 lrot 8
	QSTEP($3, $4, $2, ROT7)	C x2 += x3, x1 ^= x2, x1 lrot 7
')
C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
C
C In:	DST (r3)    = output, 16 words (64 bytes)
C	SRC (r4)    = input state, 16 words (64 bytes)
C	ROUNDS (r5) = number of rounds
C
C The state is loaded as four row vectors, ROUNDS/2 loop iterations are
C run (each one is a QROUND on the columns followed by a QROUND on the
C diagonals), the input state is added back in, and the result is
C stored at DST.
C
C Clobbers r5-r8, v0-v11 and CTR. Uses no stack.
C
C NOTE(review): an odd ROUNDS is rounded down, and ROUNDS below 2 puts
C zero in CTR, so the loop would run until CTR wraps all the way around.
C Presumably callers always pass an even count of at least 2 -- confirm.
C NOTE(review): per the accompanying message this has only been run on
C a little-endian machine; big-endian behaviour is unverified.
	.text
	.align 4
PROLOGUE(_nettle_chacha_core)
	C Byte offsets of rows 1, 2 and 3 within the 64-byte state
	li	r6, 0x10
	li	r7, 0x20
	li	r8, 0x30

	C Each loop iteration performs two rounds, so count ROUNDS/2
	srdi	ROUNDS, ROUNDS, 1
	mtctr	ROUNDS

	C Rotate counts, splatted into every word. The vspltisw immediate
	C is a 5-bit signed value, so 16 is out of range; vrlw only uses
	C the low five bits of each count, and -16 has the same low five
	C bits as 16.
	vspltisw ROT16, -16
	vspltisw ROT12, 12
	vspltisw ROT8, 8
	vspltisw ROT7, 7

	C Load the four rows with the VSX load, which accepts unaligned
	C addresses. VSR() is defined elsewhere; presumably it maps a
	C vector register name to its VSX register -- verify.
	lxvw4x	VSR(X0), 0, SRC
	lxvw4x	VSR(X1), r6, SRC
	lxvw4x	VSR(X2), r7, SRC
	lxvw4x	VSR(X3), r8, SRC

	C Keep a copy of the input for the final addition (vor a, b, b
	C is a plain register move)
	vor	S0, X0, X0
	vor	S1, X1, X1
	vor	S2, X2, X2
	vor	S3, X3, X3

.Lround_pair:
	C Column round
	QROUND(X0, X1, X2, X3)

	C Rotate rows 1..3 by one, two and three words, so that the
	C diagonals line up as columns:
	C	 0  1  2  3
	C	 5  6  7  4  <<< 1
	C	10 11  8  9  <<< 2
	C	15 12 13 14  <<< 3
	vsldoi	X1, X1, X1, 4
	vsldoi	X2, X2, X2, 8
	vsldoi	X3, X3, X3, 12

	C Diagonal round
	QROUND(X0, X1, X2, X3)

	C Undo the row rotation; the byte counts are the complements
	C (mod 16) of the ones above
	vsldoi	X1, X1, X1, 12
	vsldoi	X2, X2, X2, 8
	vsldoi	X3, X3, X3, 4

	bdnz	.Lround_pair

	C Add the original input state back in, word by word
	vadduwm	X0, X0, S0
	vadduwm	X1, X1, S1
	vadduwm	X2, X2, S2
	vadduwm	X3, X3, S3

	C Store the result, again with the unaligned-capable VSX form
	stxvw4x	VSR(X0), 0, DST
	stxvw4x	VSR(X1), r6, DST
	stxvw4x	VSR(X2), r7, DST
	stxvw4x	VSR(X3), r8, DST

	blr
EPILOGUE(_nettle_chacha_core)
-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

2008

2007

2006

2005

2004

2003

2002

PPC chacha