From 6a0bb9ab7f886087d7edb0725c90485086a1c0b4 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Wed, 30 Dec 2020 17:46:04 +0200 Subject: Add s390x/zSeries implementation of ChaCha20 * cipher/Makefile.am: Add 'asm-common-s390x.h' and 'chacha20-s390x.S'. * cipher/asm-common-s390x.h: New. * cipher/chacha20-s390x.S: New. * cipher/chacha20.c (USE_S390X_VX): New. (CHACHA20_context_t): Change 'use_*' bit-field to unsigned type; Add 'use_s390x'. (_gcry_chacha20_s390x_vx_blocks8) (_gcry_chacha20_s390x_vx_blocks4_2_1): New. (chacha20_do_setkey): Add HW feature detect for s390x/VX. (chacha20_blocks, do_chacha20_encrypt_stream_tail): Add s390x/VX code-path. * configure.ac: Add 'chacha20-s390x.lo'. -- Patch adds VX vector instruction set accelerated ChaCha20 implementation for zSeries. Benchmark on z15 (4504 Mhz): Before: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 2.62 ns/B 364.0 MiB/s 11.80 c/B STREAM dec | 2.62 ns/B 363.8 MiB/s 11.81 c/B After (~5x faster): CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 0.505 ns/B 1888 MiB/s 2.28 c/B STREAM dec | 0.506 ns/B 1887 MiB/s 2.28 c/B GnuPG-bug-id: 5201 Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 3 +- cipher/asm-common-s390x.h | 90 +++++ cipher/chacha20-s390x.S | 888 ++++++++++++++++++++++++++++++++++++++++++++++ cipher/chacha20.c | 53 ++- configure.ac | 7 +- 5 files changed, 1034 insertions(+), 7 deletions(-) create mode 100644 cipher/asm-common-s390x.h create mode 100644 cipher/chacha20-s390x.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index c445e590..3234bcb2 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -71,6 +71,7 @@ libcipher_la_SOURCES = \ EXTRA_libcipher_la_SOURCES = \ asm-common-aarch64.h \ asm-common-amd64.h \ + asm-common-s390x.h \ asm-inline-s390x.h \ asm-poly1305-aarch64.h \ asm-poly1305-amd64.h \ @@ -79,7 +80,7 @@ EXTRA_libcipher_la_SOURCES = \ cast5.c cast5-amd64.S cast5-arm.S \ chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \ chacha20-armv7-neon.S chacha20-aarch64.S \ - chacha20-ppc.c \ + chacha20-ppc.c chacha20-s390x.S \ crc.c crc-intel-pclmul.c crc-armv8-ce.c \ crc-armv8-aarch64-ce.S \ crc-ppc.c \ diff --git a/cipher/asm-common-s390x.h b/cipher/asm-common-s390x.h new file mode 100644 index 00000000..b3a996cd --- /dev/null +++ b/cipher/asm-common-s390x.h @@ -0,0 +1,90 @@ +/* asm-common-s390x.h - Common macros for zSeries assembly + * + * Copyright (C) 2020 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#ifndef GCRY_ASM_COMMON_S390X_H +#define GCRY_ASM_COMMON_S390X_H + +#include + +#ifdef HAVE_GCC_ASM_ELF_DIRECTIVES +# define ELF(...) __VA_ARGS__ +#else +# define ELF(...) /*_*/ +#endif + +#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES +/* CFI directives to emit DWARF stack unwinding information. 
*/ +# define CFI_STARTPROC() .cfi_startproc +# define CFI_ENDPROC() .cfi_endproc +# define CFI_REMEMBER_STATE() .cfi_remember_state +# define CFI_RESTORE_STATE() .cfi_restore_state +# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off +# define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off +# define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg +# define CFI_REGISTER(ro,rn) .cfi_register ro, rn +# define CFI_RESTORE(reg) .cfi_restore reg + +/* CFA expressions are used for pointing CFA and registers to + * SP relative offsets. */ +# define DW_REGNO_SP 15 + +/* Fixed length encoding used for integers for now. */ +# define DW_SLEB128_7BIT(value) \ + 0x00|((value) & 0x7f) +# define DW_SLEB128_28BIT(value) \ + 0x80|((value)&0x7f), \ + 0x80|(((value)>>7)&0x7f), \ + 0x80|(((value)>>14)&0x7f), \ + 0x00|(((value)>>21)&0x7f) + +# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) \ + .cfi_escape \ + 0x0f, /* DW_CFA_def_cfa_expression */ \ + DW_SLEB128_7BIT(11), /* length */ \ + 0x7f, /* DW_OP_breg15, rsp + constant */ \ + DW_SLEB128_28BIT(rsp_offs), \ + 0x06, /* DW_OP_deref */ \ + 0x23, /* DW_OP_plus_constu */ \ + DW_SLEB128_28BIT((cfa_depth)+160) + +# define CFI_REG_ON_STACK(regno,rsp_offs) \ + .cfi_escape \ + 0x10, /* DW_CFA_expression */ \ + DW_SLEB128_7BIT(regno), \ + DW_SLEB128_7BIT(5), /* length */ \ + 0x7f, /* DW_OP_breg15, rsp + constant */ \ + DW_SLEB128_28BIT(rsp_offs) + +#else +# define CFI_STARTPROC() +# define CFI_ENDPROC() +# define CFI_REMEMBER_STATE() +# define CFI_RESTORE_STATE() +# define CFI_ADJUST_CFA_OFFSET(off) +# define CFI_REL_OFFSET(reg,off) +# define CFI_DEF_CFA_REGISTER(reg) +# define CFI_REGISTER(ro,rn) +# define CFI_RESTORE(reg) + +# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) +# define CFI_REG_ON_STACK(reg,rsp_offs) +#endif + +#endif /* GCRY_ASM_COMMON_AMD64_H */ diff --git a/cipher/chacha20-s390x.S b/cipher/chacha20-s390x.S new file mode 100644 index 00000000..2cd38330 --- /dev/null +++ b/cipher/chacha20-s390x.S @@ -0,0 +1,888 @@ +/* chacha20-s390x.S - zSeries implementation of ChaCha20 cipher + * + * Copyright (C) 2020 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9 +#include +#if defined(HAVE_GCC_INLINE_ASM_S390X_VX) + +#include "asm-common-s390x.h" + +.machine "z13+vx" +.text + +.balign 16 +.Lconsts: +.Lwordswap: + .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 +.Lbswap128: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +.Lbswap32: + .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 +.Lone: + .long 0, 0, 0, 1 +.Ladd_counter_0123: + .long 0, 1, 2, 3 +.Ladd_counter_4567: + .long 4, 5, 6, 7 + +/* register macros */ +#define INPUT %r2 +#define DST %r3 +#define SRC %r4 +#define NBLKS %r0 +#define ROUND %r1 + +/* stack structure */ + +#define STACK_FRAME_STD (8 * 16 + 8 * 4) +#define STACK_FRAME_F8_F15 (8 * 8) +#define STACK_FRAME_Y0_Y15 (16 * 16) +#define STACK_FRAME_CTR (4 * 16) +#define STACK_FRAME_PARAMS (6 * 8) + +#define STACK_MAX (STACK_FRAME_STD + STACK_FRAME_F8_F15 + \ + STACK_FRAME_Y0_Y15 + STACK_FRAME_CTR + \ + STACK_FRAME_PARAMS) + +#define STACK_F8 (STACK_MAX - STACK_FRAME_F8_F15) +#define STACK_F9 (STACK_F8 + 8) +#define STACK_F10 (STACK_F9 + 8) +#define STACK_F11 (STACK_F10 + 8) +#define STACK_F12 (STACK_F11 + 8) +#define STACK_F13 (STACK_F12 + 8) +#define STACK_F14 (STACK_F13 + 8) +#define STACK_F15 (STACK_F14 + 8) +#define STACK_Y0_Y15 (STACK_F8 - STACK_FRAME_Y0_Y15) +#define STACK_CTR (STACK_Y0_Y15 - STACK_FRAME_CTR) +#define STACK_INPUT (STACK_CTR - STACK_FRAME_PARAMS) +#define STACK_DST (STACK_INPUT + 8) +#define STACK_SRC (STACK_DST + 8) +#define STACK_NBLKS (STACK_SRC + 8) +#define STACK_POCTX (STACK_NBLKS + 8) +#define STACK_POSRC (STACK_POCTX + 8) + +#define STACK_G0_H3 STACK_Y0_Y15 + +/* vector registers */ +#define A0 %v0 +#define A1 %v1 +#define A2 %v2 +#define A3 %v3 + +#define B0 %v4 +#define B1 %v5 +#define B2 %v6 +#define B3 %v7 + +#define C0 %v8 +#define C1 %v9 +#define C2 %v10 +#define C3 %v11 + +#define D0 %v12 +#define D1 %v13 +#define D2 %v14 +#define D3 %v15 + +#define E0 %v16 +#define E1 %v17 +#define E2 %v18 +#define E3 %v19 + +#define F0 %v20 +#define F1 %v21 +#define F2 %v22 +#define F3 %v23 + +#define G0 %v24 +#define G1 %v25 +#define G2 %v26 +#define G3 %v27 + +#define H0 %v28 +#define H1 %v29 +#define H2 %v30 +#define H3 %v31 + +#define IO0 E0 +#define IO1 E1 +#define IO2 E2 +#define IO3 E3 +#define IO4 F0 +#define IO5 F1 +#define IO6 F2 +#define IO7 F3 + +#define S0 G0 +#define S1 G1 +#define S2 G2 +#define S3 G3 + +#define TMP0 H0 +#define TMP1 H1 +#define TMP2 H2 +#define TMP3 H3 + +#define X0 A0 +#define X1 A1 +#define X2 A2 +#define X3 A3 +#define X4 B0 +#define X5 B1 +#define X6 B2 +#define X7 B3 +#define X8 C0 +#define X9 C1 +#define X10 C2 +#define X11 C3 +#define X12 D0 +#define X13 D1 +#define X14 D2 +#define X15 D3 + +#define Y0 E0 +#define Y1 E1 +#define Y2 E2 +#define Y3 E3 +#define Y4 F0 +#define Y5 F1 +#define Y6 F2 +#define Y7 F3 +#define Y8 G0 +#define Y9 G1 +#define Y10 G2 +#define Y11 G3 +#define Y12 H0 +#define Y13 H1 +#define Y14 H2 +#define Y15 H3 + +/********************************************************************** + helper macros + **********************************************************************/ + +#define _ /*_*/ + +#define CLEAR(x,...) 
vzero x; + +#define START_STACK(last_r) \ + lgr %r0, %r15; \ + lghi %r1, ~15; \ + stmg %r6, last_r, 6 * 8(%r15); \ + aghi %r0, -STACK_MAX; \ + ngr %r0, %r1; \ + lgr %r1, %r15; \ + CFI_DEF_CFA_REGISTER(1); \ + lgr %r15, %r0; \ + stg %r1, 0(%r15); \ + CFI_CFA_ON_STACK(0, 0); \ + std %f8, STACK_F8(%r15); \ + std %f9, STACK_F9(%r15); \ + std %f10, STACK_F10(%r15); \ + std %f11, STACK_F11(%r15); \ + std %f12, STACK_F12(%r15); \ + std %f13, STACK_F13(%r15); \ + std %f14, STACK_F14(%r15); \ + std %f15, STACK_F15(%r15); + +#define END_STACK(last_r) \ + lg %r1, 0(%r15); \ + ld %f8, STACK_F8(%r15); \ + ld %f9, STACK_F9(%r15); \ + ld %f10, STACK_F10(%r15); \ + ld %f11, STACK_F11(%r15); \ + ld %f12, STACK_F12(%r15); \ + ld %f13, STACK_F13(%r15); \ + ld %f14, STACK_F14(%r15); \ + ld %f15, STACK_F15(%r15); \ + lmg %r6, last_r, 6 * 8(%r1); \ + lgr %r15, %r1; \ + CFI_DEF_CFA_REGISTER(DW_REGNO_SP); + +#define PLUS(dst,src) \ + vaf dst, dst, src; + +#define XOR(dst,src) \ + vx dst, dst, src; + +#define ROTATE(v1,c) \ + verllf v1, v1, (c)(0); + +#define WORD_ROTATE(v1,s) \ + vsldb v1, v1, v1, ((s) * 4); + +#define DST_1(OPER, I, J) \ + OPER(A##I, J); + +#define DST_2(OPER, I, J) \ + OPER(A##I, J); OPER(B##I, J); + +#define DST_4(OPER, I, J) \ + OPER(A##I, J); OPER(B##I, J); OPER(C##I, J); OPER(D##I, J); + +#define DST_8(OPER, I, J) \ + OPER(A##I, J); OPER(B##I, J); OPER(C##I, J); OPER(D##I, J); \ + OPER(E##I, J); OPER(F##I, J); OPER(G##I, J); OPER(H##I, J); + +#define DST_SRC_1(OPER, I, J) \ + OPER(A##I, A##J); + +#define DST_SRC_2(OPER, I, J) \ + OPER(A##I, A##J); OPER(B##I, B##J); + +#define DST_SRC_4(OPER, I, J) \ + OPER(A##I, A##J); OPER(B##I, B##J); OPER(C##I, C##J); \ + OPER(D##I, D##J); + +#define DST_SRC_8(OPER, I, J) \ + OPER(A##I, A##J); OPER(B##I, B##J); OPER(C##I, C##J); \ + OPER(D##I, D##J); OPER(E##I, E##J); OPER(F##I, F##J); \ + OPER(G##I, G##J); OPER(H##I, H##J); + +/********************************************************************** + round macros + **********************************************************************/ + +#define QUARTERROUND4_POLY(wrot_1,wrot_2,wrot_3,op1,op2) \ + op1; DST_SRC_1(PLUS, 0, 1); DST_SRC_1(XOR, 3, 0); DST_1(ROTATE, 3, 16); \ + DST_SRC_1(PLUS, 2, 3); DST_SRC_1(XOR, 1, 2); DST_1(ROTATE, 1, 12); \ + DST_SRC_1(PLUS, 0, 1); DST_SRC_1(XOR, 3, 0); DST_1(ROTATE, 3, 8); \ + op2; DST_SRC_1(PLUS, 2, 3); DST_SRC_1(XOR, 1, 2); DST_1(ROTATE, 1, 7); \ + DST_1(WORD_ROTATE, 3, wrot_3); \ + DST_1(WORD_ROTATE, 2, wrot_2); \ + DST_1(WORD_ROTATE, 1, wrot_1); + +#define QUARTERROUND4(wrot_1,wrot_2,wrot_3) \ + QUARTERROUND4_POLY(wrot_1,wrot_2,wrot_3,,) + +#define QUARTERROUND4_2_POLY(wrot_1,wrot_2,wrot_3,op1,op2,op3,op4) \ + op1; DST_SRC_2(PLUS, 0, 1); DST_SRC_2(XOR, 3, 0); DST_2(ROTATE, 3, 16); \ + DST_SRC_2(PLUS, 2, 3); op2; DST_SRC_2(XOR, 1, 2); DST_2(ROTATE, 1, 12); \ + DST_SRC_2(PLUS, 0, 1); DST_SRC_2(XOR, 3, 0); op3; DST_2(ROTATE, 3, 8); \ + DST_SRC_2(PLUS, 2, 3); DST_SRC_2(XOR, 1, 2); DST_2(ROTATE, 1, 7); op4; \ + DST_2(WORD_ROTATE, 3, wrot_3); \ + DST_2(WORD_ROTATE, 2, wrot_2); \ + DST_2(WORD_ROTATE, 1, wrot_1); + +#define QUARTERROUND4_2(wrot_1,wrot_2,wrot_3) \ + QUARTERROUND4_2_POLY(wrot_1,wrot_2,wrot_3,,,,) + +#define QUARTERROUND4_4_POLY(wrot_1,wrot_2,wrot_3,op1,op2,op3,op4,op5,op6) \ + DST_SRC_4(PLUS, 0, 1); DST_SRC_4(XOR, 3, 0); op1; DST_4(ROTATE, 3, 16); \ + DST_SRC_4(PLUS, 2, 3); op2; DST_SRC_4(XOR, 1, 2); DST_4(ROTATE, 1, 12); \ + op3; DST_SRC_4(PLUS, 0, 1); DST_SRC_4(XOR, 3, 0); op4; DST_4(ROTATE, 3, 8); \ + DST_SRC_4(PLUS, 2, 3); op5; DST_SRC_4(XOR, 1, 2); 
DST_4(ROTATE, 1, 7); \ + op6; \ + DST_4(WORD_ROTATE, 3, wrot_3); \ + DST_4(WORD_ROTATE, 2, wrot_2); \ + DST_4(WORD_ROTATE, 1, wrot_1); + +#define QUARTERROUND4_4(wrot_1,wrot_2,wrot_3) \ + QUARTERROUND4_4_POLY(wrot_1,wrot_2,wrot_3,,,,,,) + +/********************************************************************** + 4-way && 2-way && 1-way chacha20 ("horizontal") + **********************************************************************/ + +.balign 8 +.globl _gcry_chacha20_s390x_vx_blocks4_2_1 +ELF(.type _gcry_chacha20_s390x_vx_blocks4_2_1,@function;) + +_gcry_chacha20_s390x_vx_blocks4_2_1: + /* input: + * %r2: input + * %r3: dst + * %r4: src + * %r5: nblks + */ + CFI_STARTPROC(); + + START_STACK(%r7); + lgr NBLKS, %r5; + + /* Load constants. */ + larl %r7, .Lconsts; + vl TMP0, (.Lwordswap - .Lconsts)(%r7); + vl TMP1, (.Lone - .Lconsts)(%r7); + vl TMP2, (.Lbswap128 - .Lconsts)(%r7); + + /* Load state. */ + vlm S0, S3, 0(INPUT); + vperm S0, S0, S0, TMP0; + vperm S1, S1, S1, TMP0; + vperm S2, S2, S2, TMP0; + vperm S3, S3, S3, TMP0; + + clgijl NBLKS, 4, .Lloop2; + +.balign 4 +.Lloop4: + /* Process four chacha20 blocks. */ + vlr TMP3, S3; + lghi ROUND, (20 / 2); + vlr A0, S0; + vlr A1, S1; + vlr A2, S2; + vlr A3, TMP3; + vag TMP3, TMP3, TMP1; + vlr B0, S0; + vlr B1, S1; + vlr B2, S2; + vlr B3, TMP3; + vag TMP3, TMP3, TMP1; + vlr C0, S0; + vlr C1, S1; + vlr C2, S2; + vlr C3, TMP3; + vlr D0, S0; + vlr D1, S1; + vlr D2, S2; + vag D3, TMP3, TMP1; + + slgfi NBLKS, 4; + +.balign 4 +.Lround2_4: + QUARTERROUND4_4(3, 2, 1); + QUARTERROUND4_4(1, 2, 3); + brctg ROUND, .Lround2_4; + + vlm IO0, IO7, 0(SRC); + + PLUS(A0, S0); + PLUS(A1, S1); + PLUS(A2, S2); + PLUS(A3, S3); + vag S3, S3, TMP1; /* Update counter. */ + PLUS(B0, S0); + PLUS(B1, S1); + PLUS(B2, S2); + PLUS(B3, S3); + vag S3, S3, TMP1; /* Update counter. */ + vperm A0, A0, A0, TMP2; + vperm A1, A1, A1, TMP2; + vperm A2, A2, A2, TMP2; + vperm A3, A3, A3, TMP2; + vperm B0, B0, B0, TMP2; + vperm B1, B1, B1, TMP2; + vperm B2, B2, B2, TMP2; + vperm B3, B3, B3, TMP2; + PLUS(C0, S0); + PLUS(C1, S1); + PLUS(C2, S2); + PLUS(C3, S3); + vag S3, S3, TMP1; /* Update counter. */ + PLUS(D0, S0); + PLUS(D1, S1); + PLUS(D2, S2); + PLUS(D3, S3); + vag S3, S3, TMP1; /* Update counter. */ + vperm C0, C0, C0, TMP2; + vperm C1, C1, C1, TMP2; + vperm C2, C2, C2, TMP2; + vperm C3, C3, C3, TMP2; + vperm D0, D0, D0, TMP2; + vperm D1, D1, D1, TMP2; + vperm D2, D2, D2, TMP2; + vperm D3, D3, D3, TMP2; + + XOR(IO0, A0); + XOR(IO1, A1); + XOR(IO2, A2); + XOR(IO3, A3); + XOR(IO4, B0); + XOR(IO5, B1); + XOR(IO6, B2); + XOR(IO7, B3); + vlm A0, B3, 128(SRC); + vstm IO0, IO7, 0(DST); + XOR(A0, C0); + XOR(A1, C1); + XOR(A2, C2); + XOR(A3, C3); + XOR(B0, D0); + XOR(B1, D1); + XOR(B2, D2); + XOR(B3, D3); + vstm A0, B3, 128(DST); + + aghi SRC, 256; + aghi DST, 256; + + clgijhe NBLKS, 4, .Lloop4; + + CLEAR(C0); + CLEAR(C1); + CLEAR(C2); + CLEAR(C3); + CLEAR(D0); + CLEAR(D1); + CLEAR(D2); + CLEAR(D3); + +.balign 4 +.Lloop2: + clgijl NBLKS, 2, .Lloop1; + + /* Process two chacha20 blocks. */ + lghi ROUND, (20 / 2); + vlr A0, S0; + vlr A1, S1; + vlr A2, S2; + vlr A3, S3; + vlr B0, S0; + vlr B1, S1; + vlr B2, S2; + vag B3, S3, TMP1; + + slgfi NBLKS, 2; + +.balign 4 +.Lround2_2: + QUARTERROUND4_2(3, 2, 1); + QUARTERROUND4_2(1, 2, 3); + brctg ROUND, .Lround2_2; + + vlm IO0, IO7, 0(SRC); + + PLUS(A0, S0); + PLUS(A1, S1); + PLUS(A2, S2); + PLUS(A3, S3); + vag S3, S3, TMP1; /* Update counter. */ + PLUS(B0, S0); + PLUS(B1, S1); + PLUS(B2, S2); + PLUS(B3, S3); + vag S3, S3, TMP1; /* Update counter. 
*/ + vperm A0, A0, A0, TMP2; + vperm A1, A1, A1, TMP2; + vperm A2, A2, A2, TMP2; + vperm A3, A3, A3, TMP2; + vperm B0, B0, B0, TMP2; + vperm B1, B1, B1, TMP2; + vperm B2, B2, B2, TMP2; + vperm B3, B3, B3, TMP2; + + XOR(IO0, A0); + XOR(IO1, A1); + XOR(IO2, A2); + XOR(IO3, A3); + XOR(IO4, B0); + XOR(IO5, B1); + XOR(IO6, B2); + XOR(IO7, B3); + vstm IO0, IO7, 0(DST); + + aghi SRC, 128; + aghi DST, 128; + + clgijhe NBLKS, 2, .Lloop2; + + CLEAR(B0); + CLEAR(B1); + CLEAR(B2); + CLEAR(B3); + +.balign 4 +.Lloop1: + clgijl NBLKS, 1, .Ldone; + + /* Process one chacha20 block.*/ + lghi ROUND, (20 / 2); + vlr A0, S0; + vlr A1, S1; + vlr A2, S2; + vlr A3, S3; + + slgfi NBLKS, 1; + +.balign 4 +.Lround2_1: + QUARTERROUND4(3, 2, 1); + QUARTERROUND4(1, 2, 3); + brct ROUND, .Lround2_1; + + vlm IO0, IO3, 0(SRC); + + PLUS(A0, S0); + PLUS(A1, S1); + PLUS(A2, S2); + PLUS(A3, S3); + vag S3, S3, TMP1; /* Update counter. */ + + vperm A0, A0, A0, TMP2; + vperm A1, A1, A1, TMP2; + vperm A2, A2, A2, TMP2; + vperm A3, A3, A3, TMP2; + XOR(IO0, A0); + XOR(IO1, A1); + XOR(IO2, A2); + XOR(IO3, A3); + vstm IO0, IO3, 0(DST); + + aghi SRC, 64; + aghi DST, 64; + + clgijhe NBLKS, 1, .Lloop1; + +.balign 4 +.Ldone: + /* Store counter. */ + vperm S3, S3, S3, TMP0; + vst S3, (48)(INPUT); + + /* Clear the used vector registers. */ + CLEAR(A0); + CLEAR(A1); + CLEAR(A2); + CLEAR(A3); + CLEAR(IO0); + CLEAR(IO1); + CLEAR(IO2); + CLEAR(IO3); + CLEAR(IO4); + CLEAR(IO5); + CLEAR(IO6); + CLEAR(IO7); + CLEAR(TMP0); + CLEAR(TMP1); + CLEAR(TMP2); + + END_STACK(%r7); + xgr %r2, %r2; + br %r14; + CFI_ENDPROC(); +ELF(.size _gcry_chacha20_s390x_vx_blocks4_2_1, + .-_gcry_chacha20_s390x_vx_blocks4_2_1;) + +/********************************************************************** + 8-way chacha20 ("vertical") + **********************************************************************/ + +#define QUARTERROUND4_V8_POLY(x0,x1,x2,x3,x4,x5,x6,x7,\ + x8,x9,x10,x11,x12,x13,x14,x15,\ + y0,y1,y2,y3,y4,y5,y6,y7,\ + y8,y9,y10,y11,y12,y13,y14,y15,\ + op1,op2,op3,op4,op5,op6,op7,op8,\ + op9,op10,op11,op12) \ + op1; \ + PLUS(x0, x1); PLUS(x4, x5); \ + PLUS(x8, x9); PLUS(x12, x13); \ + PLUS(y0, y1); PLUS(y4, y5); \ + PLUS(y8, y9); PLUS(y12, y13); \ + op2; \ + XOR(x3, x0); XOR(x7, x4); \ + XOR(x11, x8); XOR(x15, x12); \ + XOR(y3, y0); XOR(y7, y4); \ + XOR(y11, y8); XOR(y15, y12); \ + op3; \ + ROTATE(x3, 16); ROTATE(x7, 16); \ + ROTATE(x11, 16); ROTATE(x15, 16); \ + ROTATE(y3, 16); ROTATE(y7, 16); \ + ROTATE(y11, 16); ROTATE(y15, 16); \ + op4; \ + PLUS(x2, x3); PLUS(x6, x7); \ + PLUS(x10, x11); PLUS(x14, x15); \ + PLUS(y2, y3); PLUS(y6, y7); \ + PLUS(y10, y11); PLUS(y14, y15); \ + op5; \ + XOR(x1, x2); XOR(x5, x6); \ + XOR(x9, x10); XOR(x13, x14); \ + XOR(y1, y2); XOR(y5, y6); \ + XOR(y9, y10); XOR(y13, y14); \ + op6; \ + ROTATE(x1,12); ROTATE(x5,12); \ + ROTATE(x9,12); ROTATE(x13,12); \ + ROTATE(y1,12); ROTATE(y5,12); \ + ROTATE(y9,12); ROTATE(y13,12); \ + op7; \ + PLUS(x0, x1); PLUS(x4, x5); \ + PLUS(x8, x9); PLUS(x12, x13); \ + PLUS(y0, y1); PLUS(y4, y5); \ + PLUS(y8, y9); PLUS(y12, y13); \ + op8; \ + XOR(x3, x0); XOR(x7, x4); \ + XOR(x11, x8); XOR(x15, x12); \ + XOR(y3, y0); XOR(y7, y4); \ + XOR(y11, y8); XOR(y15, y12); \ + op9; \ + ROTATE(x3,8); ROTATE(x7,8); \ + ROTATE(x11,8); ROTATE(x15,8); \ + ROTATE(y3,8); ROTATE(y7,8); \ + ROTATE(y11,8); ROTATE(y15,8); \ + op10; \ + PLUS(x2, x3); PLUS(x6, x7); \ + PLUS(x10, x11); PLUS(x14, x15); \ + PLUS(y2, y3); PLUS(y6, y7); \ + PLUS(y10, y11); PLUS(y14, y15); \ + op11; \ + XOR(x1, x2); XOR(x5, x6); \ + XOR(x9, x10); XOR(x13, 
x14); \ + XOR(y1, y2); XOR(y5, y6); \ + XOR(y9, y10); XOR(y13, y14); \ + op12; \ + ROTATE(x1,7); ROTATE(x5,7); \ + ROTATE(x9,7); ROTATE(x13,7); \ + ROTATE(y1,7); ROTATE(y5,7); \ + ROTATE(y9,7); ROTATE(y13,7); + +#define QUARTERROUND4_V8(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,\ + y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15) \ + QUARTERROUND4_V8_POLY(x0,x1,x2,x3,x4,x5,x6,x7,\ + x8,x9,x10,x11,x12,x13,x14,x15,\ + y0,y1,y2,y3,y4,y5,y6,y7,\ + y8,y9,y10,y11,y12,y13,y14,y15,\ + ,,,,,,,,,,,) + +#define TRANSPOSE_4X4_2(v0,v1,v2,v3,va,vb,vc,vd,tmp0,tmp1,tmp2,tmpa,tmpb,tmpc) \ + vmrhf tmp0, v0, v1; \ + vmrhf tmp1, v2, v3; \ + vmrlf tmp2, v0, v1; \ + vmrlf v3, v2, v3; \ + vmrhf tmpa, va, vb; \ + vmrhf tmpb, vc, vd; \ + vmrlf tmpc, va, vb; \ + vmrlf vd, vc, vd; \ + vpdi v0, tmp0, tmp1, 0; \ + vpdi v1, tmp0, tmp1, 5; \ + vpdi v2, tmp2, v3, 0; \ + vpdi v3, tmp2, v3, 5; \ + vpdi va, tmpa, tmpb, 0; \ + vpdi vb, tmpa, tmpb, 5; \ + vpdi vc, tmpc, vd, 0; \ + vpdi vd, tmpc, vd, 5; + +.balign 8 +.globl _gcry_chacha20_s390x_vx_blocks8 +ELF(.type _gcry_chacha20_s390x_vx_blocks8,@function;) + +_gcry_chacha20_s390x_vx_blocks8: + /* input: + * %r2: input + * %r3: dst + * %r4: src + * %r5: nblks (multiple of 8) + */ + CFI_STARTPROC(); + + START_STACK(%r8); + lgr NBLKS, %r5; + + larl %r7, .Lconsts; + + /* Load counter. */ + lg %r8, (12 * 4)(INPUT); + rllg %r8, %r8, 32; + +.balign 4 + /* Process eight chacha20 blocks per loop. */ +.Lloop8: + vlm Y0, Y3, 0(INPUT); + + slgfi NBLKS, 8; + lghi ROUND, (20 / 2); + + /* Construct counter vectors X12/X13 & Y12/Y13. */ + vl X4, (.Ladd_counter_0123 - .Lconsts)(%r7); + vl Y4, (.Ladd_counter_4567 - .Lconsts)(%r7); + vrepf Y12, Y3, 0; + vrepf Y13, Y3, 1; + vaccf X5, Y12, X4; + vaccf Y5, Y12, Y4; + vaf X12, Y12, X4; + vaf Y12, Y12, Y4; + vaf X13, Y13, X5; + vaf Y13, Y13, Y5; + + vrepf X0, Y0, 0; + vrepf X1, Y0, 1; + vrepf X2, Y0, 2; + vrepf X3, Y0, 3; + vrepf X4, Y1, 0; + vrepf X5, Y1, 1; + vrepf X6, Y1, 2; + vrepf X7, Y1, 3; + vrepf X8, Y2, 0; + vrepf X9, Y2, 1; + vrepf X10, Y2, 2; + vrepf X11, Y2, 3; + vrepf X14, Y3, 2; + vrepf X15, Y3, 3; + + /* Store counters for blocks 0-7. */ + vstm X12, X13, (STACK_CTR + 0 * 16)(%r15); + vstm Y12, Y13, (STACK_CTR + 2 * 16)(%r15); + + vlr Y0, X0; + vlr Y1, X1; + vlr Y2, X2; + vlr Y3, X3; + vlr Y4, X4; + vlr Y5, X5; + vlr Y6, X6; + vlr Y7, X7; + vlr Y8, X8; + vlr Y9, X9; + vlr Y10, X10; + vlr Y11, X11; + vlr Y14, X14; + vlr Y15, X15; + + /* Update and store counter. */ + agfi %r8, 8; + rllg %r5, %r8, 32; + stg %r5, (12 * 4)(INPUT); + +.balign 4 +.Lround2_8: + QUARTERROUND4_V8(X0, X4, X8, X12, X1, X5, X9, X13, + X2, X6, X10, X14, X3, X7, X11, X15, + Y0, Y4, Y8, Y12, Y1, Y5, Y9, Y13, + Y2, Y6, Y10, Y14, Y3, Y7, Y11, Y15); + QUARTERROUND4_V8(X0, X5, X10, X15, X1, X6, X11, X12, + X2, X7, X8, X13, X3, X4, X9, X14, + Y0, Y5, Y10, Y15, Y1, Y6, Y11, Y12, + Y2, Y7, Y8, Y13, Y3, Y4, Y9, Y14); + brctg ROUND, .Lround2_8; + + /* Store blocks 4-7. */ + vstm Y0, Y15, STACK_Y0_Y15(%r15); + + /* Load counters for blocks 0-3. */ + vlm Y0, Y1, (STACK_CTR + 0 * 16)(%r15); + + lghi ROUND, 1; + j .Lfirst_output_4blks_8; + +.balign 4 +.Lsecond_output_4blks_8: + /* Load blocks 4-7. */ + vlm X0, X15, STACK_Y0_Y15(%r15); + + /* Load counters for blocks 4-7. */ + vlm Y0, Y1, (STACK_CTR + 2 * 16)(%r15); + + lghi ROUND, 0; + +.balign 4 + /* Output four chacha20 blocks per loop. 
*/ +.Lfirst_output_4blks_8: + vlm Y12, Y15, 0(INPUT); + PLUS(X12, Y0); + PLUS(X13, Y1); + vrepf Y0, Y12, 0; + vrepf Y1, Y12, 1; + vrepf Y2, Y12, 2; + vrepf Y3, Y12, 3; + vrepf Y4, Y13, 0; + vrepf Y5, Y13, 1; + vrepf Y6, Y13, 2; + vrepf Y7, Y13, 3; + vrepf Y8, Y14, 0; + vrepf Y9, Y14, 1; + vrepf Y10, Y14, 2; + vrepf Y11, Y14, 3; + vrepf Y14, Y15, 2; + vrepf Y15, Y15, 3; + PLUS(X0, Y0); + PLUS(X1, Y1); + PLUS(X2, Y2); + PLUS(X3, Y3); + PLUS(X4, Y4); + PLUS(X5, Y5); + PLUS(X6, Y6); + PLUS(X7, Y7); + PLUS(X8, Y8); + PLUS(X9, Y9); + PLUS(X10, Y10); + PLUS(X11, Y11); + PLUS(X14, Y14); + PLUS(X15, Y15); + + vl Y15, (.Lbswap32 - .Lconsts)(%r7); + TRANSPOSE_4X4_2(X0, X1, X2, X3, X4, X5, X6, X7, + Y9, Y10, Y11, Y12, Y13, Y14); + TRANSPOSE_4X4_2(X8, X9, X10, X11, X12, X13, X14, X15, + Y9, Y10, Y11, Y12, Y13, Y14); + + vlm Y0, Y14, 0(SRC); + vperm X0, X0, X0, Y15; + vperm X1, X1, X1, Y15; + vperm X2, X2, X2, Y15; + vperm X3, X3, X3, Y15; + vperm X4, X4, X4, Y15; + vperm X5, X5, X5, Y15; + vperm X6, X6, X6, Y15; + vperm X7, X7, X7, Y15; + vperm X8, X8, X8, Y15; + vperm X9, X9, X9, Y15; + vperm X10, X10, X10, Y15; + vperm X11, X11, X11, Y15; + vperm X12, X12, X12, Y15; + vperm X13, X13, X13, Y15; + vperm X14, X14, X14, Y15; + vperm X15, X15, X15, Y15; + vl Y15, (15 * 16)(SRC); + + XOR(Y0, X0); + XOR(Y1, X4); + XOR(Y2, X8); + XOR(Y3, X12); + XOR(Y4, X1); + XOR(Y5, X5); + XOR(Y6, X9); + XOR(Y7, X13); + XOR(Y8, X2); + XOR(Y9, X6); + XOR(Y10, X10); + XOR(Y11, X14); + XOR(Y12, X3); + XOR(Y13, X7); + XOR(Y14, X11); + XOR(Y15, X15); + vstm Y0, Y15, 0(DST); + + aghi SRC, 256; + aghi DST, 256; + + clgije ROUND, 1, .Lsecond_output_4blks_8; + + clgijhe NBLKS, 8, .Lloop8; + + /* Clear the used vector registers. */ + DST_8(CLEAR, 0, _); + DST_8(CLEAR, 1, _); + DST_8(CLEAR, 2, _); + DST_8(CLEAR, 3, _); + + /* Clear sensitive data in stack. */ + vlm Y0, Y15, STACK_Y0_Y15(%r15); + vlm Y0, Y3, STACK_CTR(%r15); + + END_STACK(%r8); + xgr %r2, %r2; + br %r14; + CFI_ENDPROC(); +ELF(.size _gcry_chacha20_s390x_vx_blocks8, + .-_gcry_chacha20_s390x_vx_blocks8;) + +#endif /*HAVE_GCC_INLINE_ASM_S390X_VX*/ +#endif /*__s390x__*/ diff --git a/cipher/chacha20.c b/cipher/chacha20.c index c5967b6f..7b283080 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -97,6 +97,14 @@ # endif #endif +/* USE_S390X_VX indicates whether to enable zSeries code. */ +#undef USE_S390X_VX +#if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9 +# if defined(HAVE_GCC_INLINE_ASM_S390X_VX) +# define USE_S390X_VX 1 +# endif /* USE_S390X_VX */ +#endif + /* Assembly implementations use SystemV ABI, ABI conversion and additional * stack to store XMM6-XMM15 needed on Win64. */ #undef ASM_FUNC_ABI @@ -113,10 +121,11 @@ typedef struct CHACHA20_context_s u32 input[16]; unsigned char pad[CHACHA20_BLOCK_SIZE]; unsigned int unused; /* bytes in the pad. 
*/ - int use_ssse3:1; - int use_avx2:1; - int use_neon:1; - int use_ppc:1; + unsigned int use_ssse3:1; + unsigned int use_avx2:1; + unsigned int use_neon:1; + unsigned int use_ppc:1; + unsigned int use_s390x:1; } CHACHA20_context_t; @@ -168,10 +177,20 @@ unsigned int _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, unsigned int _gcry_chacha20_poly1305_ppc8_blocks4( u32 *state, byte *dst, const byte *src, size_t nblks, POLY1305_STATE *st, const byte *poly1305_src); -#endif +#endif /* SIZEOF_UNSIGNED_LONG == 8 */ #endif /* USE_PPC_VEC */ +#ifdef USE_S390X_VX + +unsigned int _gcry_chacha20_s390x_vx_blocks8(u32 *state, byte *dst, + const byte *src, size_t nblks); + +unsigned int _gcry_chacha20_s390x_vx_blocks4_2_1(u32 *state, byte *dst, + const byte *src, size_t nblks); + +#endif /* USE_S390X_VX */ + #ifdef USE_ARMV7_NEON unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst, @@ -311,6 +330,13 @@ chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src, } #endif +#ifdef USE_S390X_VX + if (ctx->use_s390x) + { + return _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, dst, src, nblks); + } +#endif + return do_chacha20_blocks (ctx->input, dst, src, nblks); } @@ -438,6 +464,9 @@ chacha20_do_setkey (CHACHA20_context_t *ctx, #ifdef USE_PPC_VEC ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0; #endif +#ifdef USE_S390X_VX + ctx->use_s390x = (features & HWF_S390X_VX) != 0; +#endif (void)features; @@ -538,6 +567,20 @@ do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf, } #endif +#ifdef USE_S390X_VX + if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 8) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 8; + nburn = _gcry_chacha20_s390x_vx_blocks8(ctx->input, outbuf, inbuf, + nblocks); + burn = nburn > burn ? nburn : burn; + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } +#endif + if (length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; diff --git a/configure.ac b/configure.ac index dc91e6b5..c97d050e 100644 --- a/configure.ac +++ b/configure.ac @@ -2067,7 +2067,8 @@ AC_CACHE_CHECK([whether GCC inline assembler supports zSeries vector instruction AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void testfunc(void) { - asm volatile ("vx %%v0, %%v1, %%v31\n\t" + asm volatile (".machine \"z13+vx\"\n\t" + "vx %%v0, %%v1, %%v31\n\t" "verllf %%v11, %%v11, (16)(0)\n\t" : : @@ -2662,6 +2663,10 @@ if test "$found" = "1" ; then # Build with the ppc8 vector implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ppc.lo" ;; + s390x-*-*) + # Build with the s390x/zSeries vector implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-s390x.lo" + ;; esac if test x"$neonsupport" = xyes ; then -- cgit v1.2.1
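Reviewer note (not part of the commit): for readers unfamiliar with the VX macros above, PLUS maps to 'vaf' (vector add word), XOR to 'vx', ROTATE to 'verllf' (vector element rotate left logical) and WORD_ROTATE to 'vsldb', so each QUARTERROUND4 invocation is the standard ChaCha20 quarter-round applied to whole vectors of 32-bit words. A minimal C reference of that quarter-round, for comparison only (textbook definition, not code taken from the patch):

    #include <stdint.h>

    static inline uint32_t rol32 (uint32_t v, int c)
    {
      return (v << c) | (v >> (32 - c));
    }

    /* One ChaCha20 quarter-round; the asm applies the same add/xor/rotate
     * sequence to four (or eight) 32-bit lanes per instruction. */
    static void quarterround (uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
    {
      *a += *b; *d ^= *a; *d = rol32 (*d, 16);
      *c += *d; *b ^= *c; *b = rol32 (*b, 12);
      *a += *b; *d ^= *a; *d = rol32 (*d, 8);
      *c += *d; *b ^= *c; *b = rol32 (*b, 7);
    }

In the 4/2/1-block ("horizontal") path each vector register holds one row of a block, so WORD_ROTATE rotates rows 1-3 between the column and diagonal half-rounds; in the 8-block ("vertical") path each register holds one state word across four blocks, so no in-register shuffle is needed and TRANSPOSE_4X4_2 restores per-block word order only at the end.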
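A second note on the 8-way path: the per-lane 64-bit block counters are built from state words 12/13, with 'vaccf' providing the carry from the low counter word into the high one ('.Ladd_counter_0123' adds 0..3 to the low word, '.Ladd_counter_4567' adds 4..7). Sketched in C for the first four lanes; the function and variable names are illustrative, and word 12 is assumed to be the low half of the 64-bit counter, as in the existing chacha20.c code:

    #include <stdint.h>

    /* Illustrative only: how the asm forms the counter lanes for blocks 0..3.
     * 'state' is the 16-word ChaCha20 state; word 12 is assumed to be the low
     * half of the 64-bit block counter, word 13 the high half. */
    static void make_counter_lanes (const uint32_t state[16],
                                    uint32_t x12[4], uint32_t x13[4])
    {
      for (int i = 0; i < 4; i++)
        {
          x12[i] = state[12] + (uint32_t)i;          /* vaf with .Ladd_counter_0123 */
          x13[i] = state[13] + (x12[i] < state[12]); /* vaccf supplies this carry */
        }
    }

The scalar counter kept in the context is advanced by eight per loop iteration with the 'lg/rllg/agfi/stg' sequence: the 'rllg' by 32 makes word 12 the low half of the 64-bit register so that a single add carries into word 13, and the rotated value is stored back, which restores the original word order in memory.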