7 files changed, 388 insertions, 17 deletions
diff --git a/ChangeLog b/ChangeLog
index f123ba19..d47c138e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,16 @@
 2020-11-30  Niels Möller  <nisse@lysator.liu.se>
 
+	* chacha-crypt.c (_nettle_chacha_crypt_4core)
+	(_nettle_chacha_crypt32_4core): New functions.
+	* chacha-internal.h: Add prototypes for _nettle_chacha_4core and
+	related functions.
+	* configure.ac (asm_nettle_optional_list): Add chacha-4core.asm.
+	* powerpc64/fat/chacha-4core.asm: New file.
+	* powerpc64/p7/chacha-4core.asm: New file.
+	* fat-ppc.c (fat_init): When altivec is available, use
+	_nettle_chacha_crypt_4core and _nettle_chacha_crypt32_4core
+	instead of _2core variants.
+
 	* chacha-crypt.c (_nettle_chacha_crypt32_3core): Fix bug in
 	handling of counter; this function should not propagate any carry.
 
diff --git a/chacha-crypt.c b/chacha-crypt.c
index a13898f1..d3af5f58 100644
--- a/chacha-crypt.c
+++ b/chacha-crypt.c
@@ -54,17 +54,60 @@
 
 #define CHACHA_ROUNDS 20
 
-#if HAVE_NATIVE_chacha_3core
+#if HAVE_NATIVE_chacha_4core
+#define _nettle_chacha_crypt_4core chacha_crypt
+#define _nettle_chacha_crypt32_4core chacha_crypt32
+#elif HAVE_NATIVE_chacha_3core
 #define _nettle_chacha_crypt_3core chacha_crypt
 #define _nettle_chacha_crypt32_3core chacha_crypt32
-#elif HAVE_NATIVE_chacha_2core
-#define _nettle_chacha_crypt_2core chacha_crypt
-#define _nettle_chacha_crypt32_2core chacha_crypt32
-#elif !(HAVE_NATIVE_fat_chacha_3core || HAVE_NATIVE_fat_chacha_2core)
+#elif !(HAVE_NATIVE_fat_chacha_4core || HAVE_NATIVE_fat_chacha_3core)
 #define _nettle_chacha_crypt_1core chacha_crypt
 #define _nettle_chacha_crypt32_1core chacha_crypt32
 #endif
 
+#if HAVE_NATIVE_chacha_4core || HAVE_NATIVE_fat_chacha_4core
+void
+_nettle_chacha_crypt_4core(struct chacha_ctx *ctx,
+			   size_t length,
+			   uint8_t *dst,
+			   const uint8_t *src)
+{
+  uint32_t x[4*_CHACHA_STATE_LENGTH];
+
+  if (!length)
+    return;
+
+  while (length > 2*CHACHA_BLOCK_SIZE)
+    {
+      _nettle_chacha_4core (x, ctx->state, CHACHA_ROUNDS);
+      ctx->state[12] += 4;
+      ctx->state[13] += (ctx->state[12] < 4);
+      if (length <= 4*CHACHA_BLOCK_SIZE)
+	{
+	  memxor3 (dst, src, x, length);
+	  return;
+	}
+      memxor3 (dst, src, x, 4*CHACHA_BLOCK_SIZE);
+
+      length -= 4*CHACHA_BLOCK_SIZE;
+      dst += 4*CHACHA_BLOCK_SIZE;
+      src += 4*CHACHA_BLOCK_SIZE;
+    }
+  if (length > CHACHA_BLOCK_SIZE)
+    {
+      _nettle_chacha_2core (x, ctx->state, CHACHA_ROUNDS);
+      ctx->state[12] += 2;
+      ctx->state[13] += (ctx->state[12] < 2);
+    }
+  else
+    {
+      _nettle_chacha_core (x, ctx->state, CHACHA_ROUNDS);
+      ctx->state[13] += (++ctx->state[12] == 0);
+    }
+  memxor3 (dst, src, x, length);
+}
+#endif
+
 #if HAVE_NATIVE_chacha_3core || HAVE_NATIVE_fat_chacha_3core
 void
 _nettle_chacha_crypt_3core(struct chacha_ctx *ctx,
@@ -108,7 +151,7 @@ _nettle_chacha_crypt_3core(struct chacha_ctx *ctx,
 }
 #endif
 
-#if HAVE_NATIVE_chacha_2core || HAVE_NATIVE_fat_chacha_2core
+#if 0
 void
 _nettle_chacha_crypt_2core(struct chacha_ctx *ctx,
 			   size_t length,
@@ -143,7 +186,7 @@ _nettle_chacha_crypt_2core(struct chacha_ctx *ctx,
 }
 #endif
 
-#if !(HAVE_NATIVE_chacha_3core || HAVE_NATIVE_chacha_2core)
+#if !(HAVE_NATIVE_chacha_4core || HAVE_NATIVE_chacha_3core)
 void
 _nettle_chacha_crypt_1core(struct chacha_ctx *ctx,
 			   size_t length,
@@ -177,6 +220,47 @@ _nettle_chacha_crypt_1core(struct chacha_ctx *ctx,
 }
 #endif
 
+#if HAVE_NATIVE_chacha_4core || HAVE_NATIVE_fat_chacha_4core
+void
+_nettle_chacha_crypt32_4core(struct chacha_ctx *ctx,
+			     size_t length,
+			     uint8_t *dst,
+			     const uint8_t *src)
+{
+  uint32_t x[4*_CHACHA_STATE_LENGTH];
+
+  if (!length)
+    return;
+
+  while (length > 2*CHACHA_BLOCK_SIZE)
+    {
+      _nettle_chacha_4core32 (x, ctx->state, CHACHA_ROUNDS);
+      ctx->state[12] += 4;
+      if (length <= 4*CHACHA_BLOCK_SIZE)
+	{
+	  memxor3 (dst, src, x, length);
+	  return;
+	}
+      memxor3 (dst, src, x, 4*CHACHA_BLOCK_SIZE);
+
+      length -= 4*CHACHA_BLOCK_SIZE;
+      dst += 4*CHACHA_BLOCK_SIZE;
+      src += 4*CHACHA_BLOCK_SIZE;
+    }
+  if (length > CHACHA_BLOCK_SIZE)
+    {
+      _nettle_chacha_2core32 (x, ctx->state, CHACHA_ROUNDS);
+      ctx->state[12] += 2;
+    }
+  else
+    {
+      _nettle_chacha_core (x, ctx->state, CHACHA_ROUNDS);
+      ++ctx->state[12];
+    }
+  memxor3 (dst, src, x, length);
+}
+#endif
+
 #if HAVE_NATIVE_chacha_3core || HAVE_NATIVE_fat_chacha_3core
 void
 _nettle_chacha_crypt32_3core(struct chacha_ctx *ctx,
@@ -218,7 +302,7 @@ _nettle_chacha_crypt32_3core(struct chacha_ctx *ctx,
 }
 #endif
 
-#if HAVE_NATIVE_chacha_2core || HAVE_NATIVE_fat_chacha_2core
+#if 0
 void
 _nettle_chacha_crypt32_2core(struct chacha_ctx *ctx,
 			     size_t length,
@@ -252,7 +336,7 @@ _nettle_chacha_crypt32_2core(struct chacha_ctx *ctx,
 }
 #endif
 
-#if !(HAVE_NATIVE_chacha_3core || HAVE_NATIVE_chacha_2core)
+#if !(HAVE_NATIVE_chacha_4core || HAVE_NATIVE_chacha_3core)
 void
 _nettle_chacha_crypt32_1core(struct chacha_ctx *ctx,
 			     size_t length,
diff --git a/chacha-internal.h b/chacha-internal.h
index d92a6779..897fdc16 100644
--- a/chacha-internal.h
+++ b/chacha-internal.h
@@ -56,19 +56,25 @@ void
 _nettle_chacha_3core32(uint32_t *dst, const uint32_t *src, unsigned rounds);
 
 void
+_nettle_chacha_4core(uint32_t *dst, const uint32_t *src, unsigned rounds);
+
+void
+_nettle_chacha_4core32(uint32_t *dst, const uint32_t *src, unsigned rounds);
+
+void
 _nettle_chacha_crypt_1core(struct chacha_ctx *ctx,
 			   size_t length,
 			   uint8_t *dst,
 			   const uint8_t *src);
 
 void
-_nettle_chacha_crypt_2core(struct chacha_ctx *ctx,
+_nettle_chacha_crypt_3core(struct chacha_ctx *ctx,
 			   size_t length,
 			   uint8_t *dst,
 			   const uint8_t *src);
 
 void
-_nettle_chacha_crypt_3core(struct chacha_ctx *ctx,
+_nettle_chacha_crypt_4core(struct chacha_ctx *ctx,
 			   size_t length,
 			   uint8_t *dst,
 			   const uint8_t *src);
@@ -80,13 +86,13 @@ _nettle_chacha_crypt32_1core(struct chacha_ctx *ctx,
 			     const uint8_t *src);
 
 void
-_nettle_chacha_crypt32_2core(struct chacha_ctx *ctx,
+_nettle_chacha_crypt32_3core(struct chacha_ctx *ctx,
 			     size_t length,
 			     uint8_t *dst,
 			     const uint8_t *src);
 
 void
-_nettle_chacha_crypt32_3core(struct chacha_ctx *ctx,
+_nettle_chacha_crypt32_4core(struct chacha_ctx *ctx,
 			     size_t length,
 			     uint8_t *dst,
 			     const uint8_t *src);
diff --git a/configure.ac b/configure.ac
index 6fafaa77..776a9a61 100644
--- a/configure.ac
+++ b/configure.ac
@@ -499,8 +499,9 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
 # Assembler files which generate additional object files if they are used.
 asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm \
   aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \
-  chacha-2core.asm chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \
-  salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \
+  chacha-2core.asm chacha-3core.asm chacha-4core.asm chacha-core-internal-2.asm \
+  salsa20-2core.asm salsa20-core-internal-2.asm \
+  sha1-compress-2.asm sha256-compress-2.asm \
   sha3-permute-2.asm sha512-compress-2.asm \
   umac-nh-n-2.asm umac-nh-2.asm"
 
@@ -609,8 +610,10 @@ AH_VERBATIM([HAVE_NATIVE],
 #undef HAVE_NATIVE_chacha_core
 #undef HAVE_NATIVE_chacha_2core
 #undef HAVE_NATIVE_chacha_3core
+#undef HAVE_NATIVE_chacha_4core
 #undef HAVE_NATIVE_fat_chacha_2core
 #undef HAVE_NATIVE_fat_chacha_3core
+#undef HAVE_NATIVE_fat_chacha_4core
 #undef HAVE_NATIVE_ecc_curve25519_modp
 #undef HAVE_NATIVE_ecc_curve448_modp
 #undef HAVE_NATIVE_ecc_secp192r1_modp
diff --git a/fat-ppc.c b/fat-ppc.c
index 8d4a703d..847af14f 100644
--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -214,8 +214,8 @@ fat_init (void)
       if (verbose)
 	fprintf (stderr, "libnettle: enabling altivec code.\n");
       _nettle_chacha_core_vec = _nettle_chacha_core_altivec;
-      nettle_chacha_crypt_vec = _nettle_chacha_crypt_2core;
-      nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_2core;
+      nettle_chacha_crypt_vec = _nettle_chacha_crypt_4core;
+      nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_4core;
     }
   else
     {
diff --git a/powerpc64/fat/chacha-4core.asm b/powerpc64/fat/chacha-4core.asm
new file mode 100644
index 00000000..bd6be1be
--- /dev/null
+++ b/powerpc64/fat/chacha-4core.asm
@@ -0,0 +1,36 @@
+C powerpc64/fat/chacha-4core.asm
+
+
+ifelse(`
+   Copyright (C) 2020 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_fat_chacha_4core) picked up by configure
+
+include_src(`powerpc64/p7/chacha-4core.asm')
diff --git a/powerpc64/p7/chacha-4core.asm b/powerpc64/p7/chacha-4core.asm
new file mode 100644
index 00000000..49a801be
--- /dev/null
+++ b/powerpc64/p7/chacha-4core.asm
@@ -0,0 +1,231 @@
+C powerpc64/p7/chacha-4core.asm
+
+ifelse(`
+   Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+define(`SP', `r1')
+define(`TOCP', `r2')
+
+C Arguments
+define(`DST', `r3')
+define(`SRC', `r4')
+define(`ROUNDS', `r5')
+
+C Working state in v0,...,v15
+
+define(`ROT16', v16)
+define(`ROT12', v17)
+define(`ROT8',	v18)
+define(`ROT7',	v19)
+
+C During the loop, used to save the original values for last 4 words
+C of each block. Also used as temporaries for transpose.
+define(`T0', `v20')
+define(`T1', `v21')
+define(`T2', `v22')
+define(`T3', `v23')
+
+C Quarter round, done lane-wise on four vector registers
+define(`QR',`
+	vadduwm $1, $1, $2
+	vxor	$4, $4, $1
+	vrlw	$4, $4, ROT16
+	vadduwm $3, $3, $4
+	vxor	$2, $2, $3
+	vrlw	$2, $2, ROT12
+	vadduwm $1, $1, $2
+	vxor	$4, $4, $1
+	vrlw	$4, $4, ROT8
+	vadduwm $3, $3, $4
+	vxor	$2, $2, $3
+	vrlw	$2, $2, ROT7
+ ')
+
+define(`TRANSPOSE',`
+	vmrghw	T0, $1, $3	C A0 A2 B0 B2
+	vmrghw	T1, $2, $4	C A1 A3 B1 B3
+	vmrglw	T2, $1, $3	C C0 C2 D0 D2
+	vmrglw	T3, $2, $4	C C1 C3 D1 D3
+
+	vmrghw	$1, T0, T1	C A0 A1 A2 A3
+	vmrglw	$2, T0, T1	C B0 B1 B2 B3
+	vmrghw	$3, T2, T3	C C0 C1 C2 C3
+	vmrglw	$4, T2, T3	C D0 D1 D2 D3
+')
+
+	C _chacha_4core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_chacha_4core)
+
+	vspltisw T2, 1		C Apply counter carries
+
+.Lshared_entry:
+
+	li	r6, 0x10	C set up some...
+	li	r7, 0x20	C ...useful...
+	li	r8, 0x30	C ...offsets
+
+	addi	SP, SP, -0x40	C Save callee-save registers
+	stvx	v20, 0, SP
+	stvx	v21, r6, SP
+	stvx	v22, r7, SP
+	stvx	v23, r8, SP
+
+	vspltisw ROT16, -16	C vrlw uses only the low 5 bits, so -16 rotates by 16
+	vspltisw ROT12, 12
+	vspltisw ROT8, 8
+	vspltisw ROT7, 7
+
+C Load state and splat
+	lxvw4x	VSR(v0),  0, SRC	C "expa ..."
+	lxvw4x	VSR(v4),  r6, SRC	C key
+	lxvw4x	VSR(v8),  r7, SRC	C key
+	lxvw4x	VSR(v12), r8, SRC	C cnt and nonce
+
+	vspltw	v1, v0, 1
+	vspltw	v2, v0, 2
+	vspltw	v3, v0, 3
+	vspltw	v0, v0, 0
+	vspltw	v5, v4, 1
+	vspltw	v6, v4, 2
+	vspltw	v7, v4, 3
+	vspltw	v4, v4, 0
+	vspltw	v9,  v8, 1
+	vspltw	v10, v8, 2
+	vspltw	v11, v8, 3
+	vspltw	v8,  v8, 0
+	vspltw	v13, v12, 1
+	vspltw	v14, v12, 2
+	vspltw	v15, v12, 3
+	vspltw	v12, v12, 0
+
+	ld	r9, .Lcnts@got(r2)
+	lxvw4x	VSR(T0), 0, r9	C increments
+	vaddcuw	T1, v12, T0	C compute carry-out
+	vadduwm	v12, v12, T0	C low adds
+	vand	T1, T1, T2	C discard carries for 32-bit counter variant
+	vadduwm	v13, v13, T1	C apply carries
+
+	C Save all 4x4 of the last words.
+	vor	T0, v12, v12
+	vor	T1, v13, v13
+	vor	T2, v14, v14
+	vor	T3, v15, v15
+
+	srdi	ROUNDS, ROUNDS, 1
+	mtctr	ROUNDS
+.Loop:
+	QR(v0, v4,  v8, v12)
+	QR(v1, v5,  v9, v13)
+	QR(v2, v6, v10, v14)
+	QR(v3, v7, v11, v15)
+	QR(v0, v5, v10, v15)
+	QR(v1, v6, v11, v12)
+	QR(v2, v7,  v8, v13)
+	QR(v3, v4,  v9, v14)
+	bdnz	.Loop
+
+	C Add in saved original words, including counters, before
+	C transpose.
+	vadduwm	v12, v12, T0
+	vadduwm	v13, v13, T1
+	vadduwm v14, v14, T2
+	vadduwm	v15, v15, T3
+
+	TRANSPOSE(v0, v1,v2, v3)
+	TRANSPOSE(v4, v5, v6, v7)
+	TRANSPOSE(v8, v9, v10, v11)
+	TRANSPOSE(v12, v13, v14, v15)
+
+	lxvw4x	VSR(T0),  0, SRC
+	lxvw4x	VSR(T1), r6, SRC
+	lxvw4x	VSR(T2), r7, SRC
+
+	vadduwm	v0, v0, T0
+	vadduwm	v1, v1, T0
+	vadduwm	v2, v2, T0
+	vadduwm	v3, v3, T0
+
+	vadduwm	v4, v4, T1
+	vadduwm	v5, v5, T1
+	vadduwm	v6, v6, T1
+	vadduwm	v7, v7, T1
+
+	vadduwm	v8, v8, T2
+	vadduwm	v9, v9, T2
+	vadduwm	v10, v10, T2
+	vadduwm	v11, v11, T2
+
+	stxvw4x	VSR(v0), 0, DST
+	stxvw4x	VSR(v4), r6, DST
+	stxvw4x	VSR(v8), r7, DST
+	stxvw4x	VSR(v12), r8, DST
+
+	addi	DST, DST, 64
+
+	stxvw4x	VSR(v1), 0, DST
+	stxvw4x	VSR(v5), r6, DST
+	stxvw4x	VSR(v9), r7, DST
+	stxvw4x	VSR(v13), r8, DST
+
+	addi	DST, DST, 64
+
+	stxvw4x	VSR(v2), 0, DST
+	stxvw4x	VSR(v6), r6, DST
+	stxvw4x	VSR(v10), r7, DST
+	stxvw4x	VSR(v14), r8, DST
+
+	addi	DST, DST, 64
+
+	stxvw4x	VSR(v3), 0, DST
+	stxvw4x	VSR(v7), r6, DST
+	stxvw4x	VSR(v11), r7, DST
+	stxvw4x	VSR(v15), r8, DST
+
+	C Restore callee-save registers
+	lvx	v20, 0, SP
+	lvx	v21, r6, SP
+	lvx	v22, r7, SP
+	lvx	v23, r8, SP
+	addi	SP, SP, 0x40
+
+	blr
+EPILOGUE(_nettle_chacha_4core)
+
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_chacha_4core32)
+	vspltisw T2, 0		C Ignore counter carries
+	b	.Lshared_entry
+EPILOGUE(_nettle_chacha_4core32)
+
+	.section .rodata
+	ALIGN(16)
+.Lcnts: .long	0,1,2,3		C increments