From 6a0bb9ab7f886087d7edb0725c90485086a1c0b4 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Wed, 30 Dec 2020 17:46:04 +0200 Subject: Add s390x/zSeries implementation of ChaCha20 * cipher/Makefile.am: Add 'asm-common-s390x.h' and 'chacha20-s390x.S'. * cipher/asm-common-s390x.h: New. * cipher/chacha20-s390x.S: New. * cipher/chacha20.c (USE_S390X_VX): New. (CHACHA20_context_t): Change 'use_*' bit-field to unsigned type; Add 'use_s390x'. (_gcry_chacha20_s390x_vx_blocks8) (_gcry_chacha20_s390x_vx_blocks4_2_1): New. (chacha20_do_setkey): Add HW feature detect for s390x/VX. (chacha20_blocks, do_chacha20_encrypt_stream_tail): Add s390x/VX code-path. * configure.ac: Add 'chacha20-s390x.lo'. -- Patch adds VX vector instruction set accelerated ChaCha20 implementation for zSeries. Benchmark on z15 (4504 Mhz): Before: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 2.62 ns/B 364.0 MiB/s 11.80 c/B STREAM dec | 2.62 ns/B 363.8 MiB/s 11.81 c/B After (~5x faster): CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 0.505 ns/B 1888 MiB/s 2.28 c/B STREAM dec | 0.506 ns/B 1887 MiB/s 2.28 c/B GnuPG-bug-id: 5201 Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 3 +- cipher/asm-common-s390x.h | 90 +++++ cipher/chacha20-s390x.S | 888 ++++++++++++++++++++++++++++++++++++++++++++++ cipher/chacha20.c | 53 ++- configure.ac | 7 +- 5 files changed, 1034 insertions(+), 7 deletions(-) create mode 100644 cipher/asm-common-s390x.h create mode 100644 cipher/chacha20-s390x.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index c445e590..3234bcb2 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -71,6 +71,7 @@ libcipher_la_SOURCES = \ EXTRA_libcipher_la_SOURCES = \ asm-common-aarch64.h \ asm-common-amd64.h \ + asm-common-s390x.h \ asm-inline-s390x.h \ asm-poly1305-aarch64.h \ asm-poly1305-amd64.h \ @@ -79,7 +80,7 @@ EXTRA_libcipher_la_SOURCES = \ cast5.c cast5-amd64.S cast5-arm.S \ chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \ chacha20-armv7-neon.S chacha20-aarch64.S \ - chacha20-ppc.c \ + chacha20-ppc.c chacha20-s390x.S \ crc.c crc-intel-pclmul.c crc-armv8-ce.c \ crc-armv8-aarch64-ce.S \ crc-ppc.c \ diff --git a/cipher/asm-common-s390x.h b/cipher/asm-common-s390x.h new file mode 100644 index 00000000..b3a996cd --- /dev/null +++ b/cipher/asm-common-s390x.h @@ -0,0 +1,90 @@ +/* asm-common-s390x.h - Common macros for zSeries assembly + * + * Copyright (C) 2020 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#ifndef GCRY_ASM_COMMON_S390X_H +#define GCRY_ASM_COMMON_S390X_H + +#include + +#ifdef HAVE_GCC_ASM_ELF_DIRECTIVES +# define ELF(...) __VA_ARGS__ +#else +# define ELF(...) /*_*/ +#endif + +#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES +/* CFI directives to emit DWARF stack unwinding information. 
*/ +# define CFI_STARTPROC() .cfi_startproc +# define CFI_ENDPROC() .cfi_endproc +# define CFI_REMEMBER_STATE() .cfi_remember_state +# define CFI_RESTORE_STATE() .cfi_restore_state +# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off +# define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off +# define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg +# define CFI_REGISTER(ro,rn) .cfi_register ro, rn +# define CFI_RESTORE(reg) .cfi_restore reg + +/* CFA expressions are used for pointing CFA and registers to + * SP relative offsets. */ +# define DW_REGNO_SP 15 + +/* Fixed length encoding used for integers for now. */ +# define DW_SLEB128_7BIT(value) \ + 0x00|((value) & 0x7f) +# define DW_SLEB128_28BIT(value) \ + 0x80|((value)&0x7f), \ + 0x80|(((value)>>7)&0x7f), \ + 0x80|(((value)>>14)&0x7f), \ + 0x00|(((value)>>21)&0x7f) + +# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) \ + .cfi_escape \ + 0x0f, /* DW_CFA_def_cfa_expression */ \ + DW_SLEB128_7BIT(11), /* length */ \ + 0x7f, /* DW_OP_breg15, rsp + constant */ \ + DW_SLEB128_28BIT(rsp_offs), \ + 0x06, /* DW_OP_deref */ \ + 0x23, /* DW_OP_plus_constu */ \ + DW_SLEB128_28BIT((cfa_depth)+160) + +# define CFI_REG_ON_STACK(regno,rsp_offs) \ + .cfi_escape \ + 0x10, /* DW_CFA_expression */ \ + DW_SLEB128_7BIT(regno), \ + DW_SLEB128_7BIT(5), /* length */ \ + 0x7f, /* DW_OP_breg15, rsp + constant */ \ + DW_SLEB128_28BIT(rsp_offs) + +#else +# define CFI_STARTPROC() +# define CFI_ENDPROC() +# define CFI_REMEMBER_STATE() +# define CFI_RESTORE_STATE() +# define CFI_ADJUST_CFA_OFFSET(off) +# define CFI_REL_OFFSET(reg,off) +# define CFI_DEF_CFA_REGISTER(reg) +# define CFI_REGISTER(ro,rn) +# define CFI_RESTORE(reg) + +# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) +# define CFI_REG_ON_STACK(reg,rsp_offs) +#endif + +#endif /* GCRY_ASM_COMMON_AMD64_H */ diff --git a/cipher/chacha20-s390x.S b/cipher/chacha20-s390x.S new file mode 100644 index 00000000..2cd38330 --- /dev/null +++ b/cipher/chacha20-s390x.S @@ -0,0 +1,888 @@ +/* chacha20-s390x.S - zSeries implementation of ChaCha20 cipher + * + * Copyright (C) 2020 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9 +#include +#if defined(HAVE_GCC_INLINE_ASM_S390X_VX) + +#include "asm-common-s390x.h" + +.machine "z13+vx" +.text + +.balign 16 +.Lconsts: +.Lwordswap: + .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 +.Lbswap128: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +.Lbswap32: + .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 +.Lone: + .long 0, 0, 0, 1 +.Ladd_counter_0123: + .long 0, 1, 2, 3 +.Ladd_counter_4567: + .long 4, 5, 6, 7 + +/* register macros */ +#define INPUT %r2 +#define DST %r3 +#define SRC %r4 +#define NBLKS %r0 +#define ROUND %r1 + +/* stack structure */ + +#define STACK_FRAME_STD (8 * 16 + 8 * 4) +#define STACK_FRAME_F8_F15 (8 * 8) +#define STACK_FRAME_Y0_Y15 (16 * 16) +#define STACK_FRAME_CTR (4 * 16) +#define STACK_FRAME_PARAMS (6 * 8) + +#define STACK_MAX (STACK_FRAME_STD + STACK_FRAME_F8_F15 + \ + STACK_FRAME_Y0_Y15 + STACK_FRAME_CTR + \ + STACK_FRAME_PARAMS) + +#define STACK_F8 (STACK_MAX - STACK_FRAME_F8_F15) +#define STACK_F9 (STACK_F8 + 8) +#define STACK_F10 (STACK_F9 + 8) +#define STACK_F11 (STACK_F10 + 8) +#define STACK_F12 (STACK_F11 + 8) +#define STACK_F13 (STACK_F12 + 8) +#define STACK_F14 (STACK_F13 + 8) +#define STACK_F15 (STACK_F14 + 8) +#define STACK_Y0_Y15 (STACK_F8 - STACK_FRAME_Y0_Y15) +#define STACK_CTR (STACK_Y0_Y15 - STACK_FRAME_CTR) +#define STACK_INPUT (STACK_CTR - STACK_FRAME_PARAMS) +#define STACK_DST (STACK_INPUT + 8) +#define STACK_SRC (STACK_DST + 8) +#define STACK_NBLKS (STACK_SRC + 8) +#define STACK_POCTX (STACK_NBLKS + 8) +#define STACK_POSRC (STACK_POCTX + 8) + +#define STACK_G0_H3 STACK_Y0_Y15 + +/* vector registers */ +#define A0 %v0 +#define A1 %v1 +#define A2 %v2 +#define A3 %v3 + +#define B0 %v4 +#define B1 %v5 +#define B2 %v6 +#define B3 %v7 + +#define C0 %v8 +#define C1 %v9 +#define C2 %v10 +#define C3 %v11 + +#define D0 %v12 +#define D1 %v13 +#define D2 %v14 +#define D3 %v15 + +#define E0 %v16 +#define E1 %v17 +#define E2 %v18 +#define E3 %v19 + +#define F0 %v20 +#define F1 %v21 +#define F2 %v22 +#define F3 %v23 + +#define G0 %v24 +#define G1 %v25 +#define G2 %v26 +#define G3 %v27 + +#define H0 %v28 +#define H1 %v29 +#define H2 %v30 +#define H3 %v31 + +#define IO0 E0 +#define IO1 E1 +#define IO2 E2 +#define IO3 E3 +#define IO4 F0 +#define IO5 F1 +#define IO6 F2 +#define IO7 F3 + +#define S0 G0 +#define S1 G1 +#define S2 G2 +#define S3 G3 + +#define TMP0 H0 +#define TMP1 H1 +#define TMP2 H2 +#define TMP3 H3 + +#define X0 A0 +#define X1 A1 +#define X2 A2 +#define X3 A3 +#define X4 B0 +#define X5 B1 +#define X6 B2 +#define X7 B3 +#define X8 C0 +#define X9 C1 +#define X10 C2 +#define X11 C3 +#define X12 D0 +#define X13 D1 +#define X14 D2 +#define X15 D3 + +#define Y0 E0 +#define Y1 E1 +#define Y2 E2 +#define Y3 E3 +#define Y4 F0 +#define Y5 F1 +#define Y6 F2 +#define Y7 F3 +#define Y8 G0 +#define Y9 G1 +#define Y10 G2 +#define Y11 G3 +#define Y12 H0 +#define Y13 H1 +#define Y14 H2 +#define Y15 H3 + +/********************************************************************** + helper macros + **********************************************************************/ + +#define _ /*_*/ + +#define CLEAR(x,...) 
vzero x; + +#define START_STACK(last_r) \ + lgr %r0, %r15; \ + lghi %r1, ~15; \ + stmg %r6, last_r, 6 * 8(%r15); \ + aghi %r0, -STACK_MAX; \ + ngr %r0, %r1; \ + lgr %r1, %r15; \ + CFI_DEF_CFA_REGISTER(1); \ + lgr %r15, %r0; \ + stg %r1, 0(%r15); \ + CFI_CFA_ON_STACK(0, 0); \ + std %f8, STACK_F8(%r15); \ + std %f9, STACK_F9(%r15); \ + std %f10, STACK_F10(%r15); \ + std %f11, STACK_F11(%r15); \ + std %f12, STACK_F12(%r15); \ + std %f13, STACK_F13(%r15); \ + std %f14, STACK_F14(%r15); \ + std %f15, STACK_F15(%r15); + +#define END_STACK(last_r) \ + lg %r1, 0(%r15); \ + ld %f8, STACK_F8(%r15); \ + ld %f9, STACK_F9(%r15); \ + ld %f10, STACK_F10(%r15); \ + ld %f11, STACK_F11(%r15); \ + ld %f12, STACK_F12(%r15); \ + ld %f13, STACK_F13(%r15); \ + ld %f14, STACK_F14(%r15); \ + ld %f15, STACK_F15(%r15); \ + lmg %r6, last_r, 6 * 8(%r1); \ + lgr %r15, %r1; \ + CFI_DEF_CFA_REGISTER(DW_REGNO_SP); + +#define PLUS(dst,src) \ + vaf dst, dst, src; + +#define XOR(dst,src) \ + vx dst, dst, src; + +#define ROTATE(v1,c) \ + verllf v1, v1, (c)(0); + +#define WORD_ROTATE(v1,s) \ + vsldb v1, v1, v1, ((s) * 4); + +#define DST_1(OPER, I, J) \ + OPER(A##I, J); + +#define DST_2(OPER, I, J) \ + OPER(A##I, J); OPER(B##I, J); + +#define DST_4(OPER, I, J) \ + OPER(A##I, J); OPER(B##I, J); OPER(C##I, J); OPER(D##I, J); + +#define DST_8(OPER, I, J) \ + OPER(A##I, J); OPER(B##I, J); OPER(C##I, J); OPER(D##I, J); \ + OPER(E##I, J); OPER(F##I, J); OPER(G##I, J); OPER(H##I, J); + +#define DST_SRC_1(OPER, I, J) \ + OPER(A##I, A##J); + +#define DST_SRC_2(OPER, I, J) \ + OPER(A##I, A##J); OPER(B##I, B##J); + +#define DST_SRC_4(OPER, I, J) \ + OPER(A##I, A##J); OPER(B##I, B##J); OPER(C##I, C##J); \ + OPER(D##I, D##J); + +#define DST_SRC_8(OPER, I, J) \ + OPER(A##I, A##J); OPER(B##I, B##J); OPER(C##I, C##J); \ + OPER(D##I, D##J); OPER(E##I, E##J); OPER(F##I, F##J); \ + OPER(G##I, G##J); OPER(H##I, H##J); + +/********************************************************************** + round macros + **********************************************************************/ + +#define QUARTERROUND4_POLY(wrot_1,wrot_2,wrot_3,op1,op2) \ + op1; DST_SRC_1(PLUS, 0, 1); DST_SRC_1(XOR, 3, 0); DST_1(ROTATE, 3, 16); \ + DST_SRC_1(PLUS, 2, 3); DST_SRC_1(XOR, 1, 2); DST_1(ROTATE, 1, 12); \ + DST_SRC_1(PLUS, 0, 1); DST_SRC_1(XOR, 3, 0); DST_1(ROTATE, 3, 8); \ + op2; DST_SRC_1(PLUS, 2, 3); DST_SRC_1(XOR, 1, 2); DST_1(ROTATE, 1, 7); \ + DST_1(WORD_ROTATE, 3, wrot_3); \ + DST_1(WORD_ROTATE, 2, wrot_2); \ + DST_1(WORD_ROTATE, 1, wrot_1); + +#define QUARTERROUND4(wrot_1,wrot_2,wrot_3) \ + QUARTERROUND4_POLY(wrot_1,wrot_2,wrot_3,,) + +#define QUARTERROUND4_2_POLY(wrot_1,wrot_2,wrot_3,op1,op2,op3,op4) \ + op1; DST_SRC_2(PLUS, 0, 1); DST_SRC_2(XOR, 3, 0); DST_2(ROTATE, 3, 16); \ + DST_SRC_2(PLUS, 2, 3); op2; DST_SRC_2(XOR, 1, 2); DST_2(ROTATE, 1, 12); \ + DST_SRC_2(PLUS, 0, 1); DST_SRC_2(XOR, 3, 0); op3; DST_2(ROTATE, 3, 8); \ + DST_SRC_2(PLUS, 2, 3); DST_SRC_2(XOR, 1, 2); DST_2(ROTATE, 1, 7); op4; \ + DST_2(WORD_ROTATE, 3, wrot_3); \ + DST_2(WORD_ROTATE, 2, wrot_2); \ + DST_2(WORD_ROTATE, 1, wrot_1); + +#define QUARTERROUND4_2(wrot_1,wrot_2,wrot_3) \ + QUARTERROUND4_2_POLY(wrot_1,wrot_2,wrot_3,,,,) + +#define QUARTERROUND4_4_POLY(wrot_1,wrot_2,wrot_3,op1,op2,op3,op4,op5,op6) \ + DST_SRC_4(PLUS, 0, 1); DST_SRC_4(XOR, 3, 0); op1; DST_4(ROTATE, 3, 16); \ + DST_SRC_4(PLUS, 2, 3); op2; DST_SRC_4(XOR, 1, 2); DST_4(ROTATE, 1, 12); \ + op3; DST_SRC_4(PLUS, 0, 1); DST_SRC_4(XOR, 3, 0); op4; DST_4(ROTATE, 3, 8); \ + DST_SRC_4(PLUS, 2, 3); op5; DST_SRC_4(XOR, 1, 2); 
DST_4(ROTATE, 1, 7); \ + op6; \ + DST_4(WORD_ROTATE, 3, wrot_3); \ + DST_4(WORD_ROTATE, 2, wrot_2); \ + DST_4(WORD_ROTATE, 1, wrot_1); + +#define QUARTERROUND4_4(wrot_1,wrot_2,wrot_3) \ + QUARTERROUND4_4_POLY(wrot_1,wrot_2,wrot_3,,,,,,) + +/********************************************************************** + 4-way && 2-way && 1-way chacha20 ("horizontal") + **********************************************************************/ + +.balign 8 +.globl _gcry_chacha20_s390x_vx_blocks4_2_1 +ELF(.type _gcry_chacha20_s390x_vx_blocks4_2_1,@function;) + +_gcry_chacha20_s390x_vx_blocks4_2_1: + /* input: + * %r2: input + * %r3: dst + * %r4: src + * %r5: nblks + */ + CFI_STARTPROC(); + + START_STACK(%r7); + lgr NBLKS, %r5; + + /* Load constants. */ + larl %r7, .Lconsts; + vl TMP0, (.Lwordswap - .Lconsts)(%r7); + vl TMP1, (.Lone - .Lconsts)(%r7); + vl TMP2, (.Lbswap128 - .Lconsts)(%r7); + + /* Load state. */ + vlm S0, S3, 0(INPUT); + vperm S0, S0, S0, TMP0; + vperm S1, S1, S1, TMP0; + vperm S2, S2, S2, TMP0; + vperm S3, S3, S3, TMP0; + + clgijl NBLKS, 4, .Lloop2; + +.balign 4 +.Lloop4: + /* Process four chacha20 blocks. */ + vlr TMP3, S3; + lghi ROUND, (20 / 2); + vlr A0, S0; + vlr A1, S1; + vlr A2, S2; + vlr A3, TMP3; + vag TMP3, TMP3, TMP1; + vlr B0, S0; + vlr B1, S1; + vlr B2, S2; + vlr B3, TMP3; + vag TMP3, TMP3, TMP1; + vlr C0, S0; + vlr C1, S1; + vlr C2, S2; + vlr C3, TMP3; + vlr D0, S0; + vlr D1, S1; + vlr D2, S2; + vag D3, TMP3, TMP1; + + slgfi NBLKS, 4; + +.balign 4 +.Lround2_4: + QUARTERROUND4_4(3, 2, 1); + QUARTERROUND4_4(1, 2, 3); + brctg ROUND, .Lround2_4; + + vlm IO0, IO7, 0(SRC); + + PLUS(A0, S0); + PLUS(A1, S1); + PLUS(A2, S2); + PLUS(A3, S3); + vag S3, S3, TMP1; /* Update counter. */ + PLUS(B0, S0); + PLUS(B1, S1); + PLUS(B2, S2); + PLUS(B3, S3); + vag S3, S3, TMP1; /* Update counter. */ + vperm A0, A0, A0, TMP2; + vperm A1, A1, A1, TMP2; + vperm A2, A2, A2, TMP2; + vperm A3, A3, A3, TMP2; + vperm B0, B0, B0, TMP2; + vperm B1, B1, B1, TMP2; + vperm B2, B2, B2, TMP2; + vperm B3, B3, B3, TMP2; + PLUS(C0, S0); + PLUS(C1, S1); + PLUS(C2, S2); + PLUS(C3, S3); + vag S3, S3, TMP1; /* Update counter. */ + PLUS(D0, S0); + PLUS(D1, S1); + PLUS(D2, S2); + PLUS(D3, S3); + vag S3, S3, TMP1; /* Update counter. */ + vperm C0, C0, C0, TMP2; + vperm C1, C1, C1, TMP2; + vperm C2, C2, C2, TMP2; + vperm C3, C3, C3, TMP2; + vperm D0, D0, D0, TMP2; + vperm D1, D1, D1, TMP2; + vperm D2, D2, D2, TMP2; + vperm D3, D3, D3, TMP2; + + XOR(IO0, A0); + XOR(IO1, A1); + XOR(IO2, A2); + XOR(IO3, A3); + XOR(IO4, B0); + XOR(IO5, B1); + XOR(IO6, B2); + XOR(IO7, B3); + vlm A0, B3, 128(SRC); + vstm IO0, IO7, 0(DST); + XOR(A0, C0); + XOR(A1, C1); + XOR(A2, C2); + XOR(A3, C3); + XOR(B0, D0); + XOR(B1, D1); + XOR(B2, D2); + XOR(B3, D3); + vstm A0, B3, 128(DST); + + aghi SRC, 256; + aghi DST, 256; + + clgijhe NBLKS, 4, .Lloop4; + + CLEAR(C0); + CLEAR(C1); + CLEAR(C2); + CLEAR(C3); + CLEAR(D0); + CLEAR(D1); + CLEAR(D2); + CLEAR(D3); + +.balign 4 +.Lloop2: + clgijl NBLKS, 2, .Lloop1; + + /* Process two chacha20 blocks. */ + lghi ROUND, (20 / 2); + vlr A0, S0; + vlr A1, S1; + vlr A2, S2; + vlr A3, S3; + vlr B0, S0; + vlr B1, S1; + vlr B2, S2; + vag B3, S3, TMP1; + + slgfi NBLKS, 2; + +.balign 4 +.Lround2_2: + QUARTERROUND4_2(3, 2, 1); + QUARTERROUND4_2(1, 2, 3); + brctg ROUND, .Lround2_2; + + vlm IO0, IO7, 0(SRC); + + PLUS(A0, S0); + PLUS(A1, S1); + PLUS(A2, S2); + PLUS(A3, S3); + vag S3, S3, TMP1; /* Update counter. */ + PLUS(B0, S0); + PLUS(B1, S1); + PLUS(B2, S2); + PLUS(B3, S3); + vag S3, S3, TMP1; /* Update counter. 
*/ + vperm A0, A0, A0, TMP2; + vperm A1, A1, A1, TMP2; + vperm A2, A2, A2, TMP2; + vperm A3, A3, A3, TMP2; + vperm B0, B0, B0, TMP2; + vperm B1, B1, B1, TMP2; + vperm B2, B2, B2, TMP2; + vperm B3, B3, B3, TMP2; + + XOR(IO0, A0); + XOR(IO1, A1); + XOR(IO2, A2); + XOR(IO3, A3); + XOR(IO4, B0); + XOR(IO5, B1); + XOR(IO6, B2); + XOR(IO7, B3); + vstm IO0, IO7, 0(DST); + + aghi SRC, 128; + aghi DST, 128; + + clgijhe NBLKS, 2, .Lloop2; + + CLEAR(B0); + CLEAR(B1); + CLEAR(B2); + CLEAR(B3); + +.balign 4 +.Lloop1: + clgijl NBLKS, 1, .Ldone; + + /* Process one chacha20 block.*/ + lghi ROUND, (20 / 2); + vlr A0, S0; + vlr A1, S1; + vlr A2, S2; + vlr A3, S3; + + slgfi NBLKS, 1; + +.balign 4 +.Lround2_1: + QUARTERROUND4(3, 2, 1); + QUARTERROUND4(1, 2, 3); + brct ROUND, .Lround2_1; + + vlm IO0, IO3, 0(SRC); + + PLUS(A0, S0); + PLUS(A1, S1); + PLUS(A2, S2); + PLUS(A3, S3); + vag S3, S3, TMP1; /* Update counter. */ + + vperm A0, A0, A0, TMP2; + vperm A1, A1, A1, TMP2; + vperm A2, A2, A2, TMP2; + vperm A3, A3, A3, TMP2; + XOR(IO0, A0); + XOR(IO1, A1); + XOR(IO2, A2); + XOR(IO3, A3); + vstm IO0, IO3, 0(DST); + + aghi SRC, 64; + aghi DST, 64; + + clgijhe NBLKS, 1, .Lloop1; + +.balign 4 +.Ldone: + /* Store counter. */ + vperm S3, S3, S3, TMP0; + vst S3, (48)(INPUT); + + /* Clear the used vector registers. */ + CLEAR(A0); + CLEAR(A1); + CLEAR(A2); + CLEAR(A3); + CLEAR(IO0); + CLEAR(IO1); + CLEAR(IO2); + CLEAR(IO3); + CLEAR(IO4); + CLEAR(IO5); + CLEAR(IO6); + CLEAR(IO7); + CLEAR(TMP0); + CLEAR(TMP1); + CLEAR(TMP2); + + END_STACK(%r7); + xgr %r2, %r2; + br %r14; + CFI_ENDPROC(); +ELF(.size _gcry_chacha20_s390x_vx_blocks4_2_1, + .-_gcry_chacha20_s390x_vx_blocks4_2_1;) + +/********************************************************************** + 8-way chacha20 ("vertical") + **********************************************************************/ + +#define QUARTERROUND4_V8_POLY(x0,x1,x2,x3,x4,x5,x6,x7,\ + x8,x9,x10,x11,x12,x13,x14,x15,\ + y0,y1,y2,y3,y4,y5,y6,y7,\ + y8,y9,y10,y11,y12,y13,y14,y15,\ + op1,op2,op3,op4,op5,op6,op7,op8,\ + op9,op10,op11,op12) \ + op1; \ + PLUS(x0, x1); PLUS(x4, x5); \ + PLUS(x8, x9); PLUS(x12, x13); \ + PLUS(y0, y1); PLUS(y4, y5); \ + PLUS(y8, y9); PLUS(y12, y13); \ + op2; \ + XOR(x3, x0); XOR(x7, x4); \ + XOR(x11, x8); XOR(x15, x12); \ + XOR(y3, y0); XOR(y7, y4); \ + XOR(y11, y8); XOR(y15, y12); \ + op3; \ + ROTATE(x3, 16); ROTATE(x7, 16); \ + ROTATE(x11, 16); ROTATE(x15, 16); \ + ROTATE(y3, 16); ROTATE(y7, 16); \ + ROTATE(y11, 16); ROTATE(y15, 16); \ + op4; \ + PLUS(x2, x3); PLUS(x6, x7); \ + PLUS(x10, x11); PLUS(x14, x15); \ + PLUS(y2, y3); PLUS(y6, y7); \ + PLUS(y10, y11); PLUS(y14, y15); \ + op5; \ + XOR(x1, x2); XOR(x5, x6); \ + XOR(x9, x10); XOR(x13, x14); \ + XOR(y1, y2); XOR(y5, y6); \ + XOR(y9, y10); XOR(y13, y14); \ + op6; \ + ROTATE(x1,12); ROTATE(x5,12); \ + ROTATE(x9,12); ROTATE(x13,12); \ + ROTATE(y1,12); ROTATE(y5,12); \ + ROTATE(y9,12); ROTATE(y13,12); \ + op7; \ + PLUS(x0, x1); PLUS(x4, x5); \ + PLUS(x8, x9); PLUS(x12, x13); \ + PLUS(y0, y1); PLUS(y4, y5); \ + PLUS(y8, y9); PLUS(y12, y13); \ + op8; \ + XOR(x3, x0); XOR(x7, x4); \ + XOR(x11, x8); XOR(x15, x12); \ + XOR(y3, y0); XOR(y7, y4); \ + XOR(y11, y8); XOR(y15, y12); \ + op9; \ + ROTATE(x3,8); ROTATE(x7,8); \ + ROTATE(x11,8); ROTATE(x15,8); \ + ROTATE(y3,8); ROTATE(y7,8); \ + ROTATE(y11,8); ROTATE(y15,8); \ + op10; \ + PLUS(x2, x3); PLUS(x6, x7); \ + PLUS(x10, x11); PLUS(x14, x15); \ + PLUS(y2, y3); PLUS(y6, y7); \ + PLUS(y10, y11); PLUS(y14, y15); \ + op11; \ + XOR(x1, x2); XOR(x5, x6); \ + XOR(x9, x10); XOR(x13, 
x14); \ + XOR(y1, y2); XOR(y5, y6); \ + XOR(y9, y10); XOR(y13, y14); \ + op12; \ + ROTATE(x1,7); ROTATE(x5,7); \ + ROTATE(x9,7); ROTATE(x13,7); \ + ROTATE(y1,7); ROTATE(y5,7); \ + ROTATE(y9,7); ROTATE(y13,7); + +#define QUARTERROUND4_V8(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,\ + y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15) \ + QUARTERROUND4_V8_POLY(x0,x1,x2,x3,x4,x5,x6,x7,\ + x8,x9,x10,x11,x12,x13,x14,x15,\ + y0,y1,y2,y3,y4,y5,y6,y7,\ + y8,y9,y10,y11,y12,y13,y14,y15,\ + ,,,,,,,,,,,) + +#define TRANSPOSE_4X4_2(v0,v1,v2,v3,va,vb,vc,vd,tmp0,tmp1,tmp2,tmpa,tmpb,tmpc) \ + vmrhf tmp0, v0, v1; \ + vmrhf tmp1, v2, v3; \ + vmrlf tmp2, v0, v1; \ + vmrlf v3, v2, v3; \ + vmrhf tmpa, va, vb; \ + vmrhf tmpb, vc, vd; \ + vmrlf tmpc, va, vb; \ + vmrlf vd, vc, vd; \ + vpdi v0, tmp0, tmp1, 0; \ + vpdi v1, tmp0, tmp1, 5; \ + vpdi v2, tmp2, v3, 0; \ + vpdi v3, tmp2, v3, 5; \ + vpdi va, tmpa, tmpb, 0; \ + vpdi vb, tmpa, tmpb, 5; \ + vpdi vc, tmpc, vd, 0; \ + vpdi vd, tmpc, vd, 5; + +.balign 8 +.globl _gcry_chacha20_s390x_vx_blocks8 +ELF(.type _gcry_chacha20_s390x_vx_blocks8,@function;) + +_gcry_chacha20_s390x_vx_blocks8: + /* input: + * %r2: input + * %r3: dst + * %r4: src + * %r5: nblks (multiple of 8) + */ + CFI_STARTPROC(); + + START_STACK(%r8); + lgr NBLKS, %r5; + + larl %r7, .Lconsts; + + /* Load counter. */ + lg %r8, (12 * 4)(INPUT); + rllg %r8, %r8, 32; + +.balign 4 + /* Process eight chacha20 blocks per loop. */ +.Lloop8: + vlm Y0, Y3, 0(INPUT); + + slgfi NBLKS, 8; + lghi ROUND, (20 / 2); + + /* Construct counter vectors X12/X13 & Y12/Y13. */ + vl X4, (.Ladd_counter_0123 - .Lconsts)(%r7); + vl Y4, (.Ladd_counter_4567 - .Lconsts)(%r7); + vrepf Y12, Y3, 0; + vrepf Y13, Y3, 1; + vaccf X5, Y12, X4; + vaccf Y5, Y12, Y4; + vaf X12, Y12, X4; + vaf Y12, Y12, Y4; + vaf X13, Y13, X5; + vaf Y13, Y13, Y5; + + vrepf X0, Y0, 0; + vrepf X1, Y0, 1; + vrepf X2, Y0, 2; + vrepf X3, Y0, 3; + vrepf X4, Y1, 0; + vrepf X5, Y1, 1; + vrepf X6, Y1, 2; + vrepf X7, Y1, 3; + vrepf X8, Y2, 0; + vrepf X9, Y2, 1; + vrepf X10, Y2, 2; + vrepf X11, Y2, 3; + vrepf X14, Y3, 2; + vrepf X15, Y3, 3; + + /* Store counters for blocks 0-7. */ + vstm X12, X13, (STACK_CTR + 0 * 16)(%r15); + vstm Y12, Y13, (STACK_CTR + 2 * 16)(%r15); + + vlr Y0, X0; + vlr Y1, X1; + vlr Y2, X2; + vlr Y3, X3; + vlr Y4, X4; + vlr Y5, X5; + vlr Y6, X6; + vlr Y7, X7; + vlr Y8, X8; + vlr Y9, X9; + vlr Y10, X10; + vlr Y11, X11; + vlr Y14, X14; + vlr Y15, X15; + + /* Update and store counter. */ + agfi %r8, 8; + rllg %r5, %r8, 32; + stg %r5, (12 * 4)(INPUT); + +.balign 4 +.Lround2_8: + QUARTERROUND4_V8(X0, X4, X8, X12, X1, X5, X9, X13, + X2, X6, X10, X14, X3, X7, X11, X15, + Y0, Y4, Y8, Y12, Y1, Y5, Y9, Y13, + Y2, Y6, Y10, Y14, Y3, Y7, Y11, Y15); + QUARTERROUND4_V8(X0, X5, X10, X15, X1, X6, X11, X12, + X2, X7, X8, X13, X3, X4, X9, X14, + Y0, Y5, Y10, Y15, Y1, Y6, Y11, Y12, + Y2, Y7, Y8, Y13, Y3, Y4, Y9, Y14); + brctg ROUND, .Lround2_8; + + /* Store blocks 4-7. */ + vstm Y0, Y15, STACK_Y0_Y15(%r15); + + /* Load counters for blocks 0-3. */ + vlm Y0, Y1, (STACK_CTR + 0 * 16)(%r15); + + lghi ROUND, 1; + j .Lfirst_output_4blks_8; + +.balign 4 +.Lsecond_output_4blks_8: + /* Load blocks 4-7. */ + vlm X0, X15, STACK_Y0_Y15(%r15); + + /* Load counters for blocks 4-7. */ + vlm Y0, Y1, (STACK_CTR + 2 * 16)(%r15); + + lghi ROUND, 0; + +.balign 4 + /* Output four chacha20 blocks per loop. 
*/ +.Lfirst_output_4blks_8: + vlm Y12, Y15, 0(INPUT); + PLUS(X12, Y0); + PLUS(X13, Y1); + vrepf Y0, Y12, 0; + vrepf Y1, Y12, 1; + vrepf Y2, Y12, 2; + vrepf Y3, Y12, 3; + vrepf Y4, Y13, 0; + vrepf Y5, Y13, 1; + vrepf Y6, Y13, 2; + vrepf Y7, Y13, 3; + vrepf Y8, Y14, 0; + vrepf Y9, Y14, 1; + vrepf Y10, Y14, 2; + vrepf Y11, Y14, 3; + vrepf Y14, Y15, 2; + vrepf Y15, Y15, 3; + PLUS(X0, Y0); + PLUS(X1, Y1); + PLUS(X2, Y2); + PLUS(X3, Y3); + PLUS(X4, Y4); + PLUS(X5, Y5); + PLUS(X6, Y6); + PLUS(X7, Y7); + PLUS(X8, Y8); + PLUS(X9, Y9); + PLUS(X10, Y10); + PLUS(X11, Y11); + PLUS(X14, Y14); + PLUS(X15, Y15); + + vl Y15, (.Lbswap32 - .Lconsts)(%r7); + TRANSPOSE_4X4_2(X0, X1, X2, X3, X4, X5, X6, X7, + Y9, Y10, Y11, Y12, Y13, Y14); + TRANSPOSE_4X4_2(X8, X9, X10, X11, X12, X13, X14, X15, + Y9, Y10, Y11, Y12, Y13, Y14); + + vlm Y0, Y14, 0(SRC); + vperm X0, X0, X0, Y15; + vperm X1, X1, X1, Y15; + vperm X2, X2, X2, Y15; + vperm X3, X3, X3, Y15; + vperm X4, X4, X4, Y15; + vperm X5, X5, X5, Y15; + vperm X6, X6, X6, Y15; + vperm X7, X7, X7, Y15; + vperm X8, X8, X8, Y15; + vperm X9, X9, X9, Y15; + vperm X10, X10, X10, Y15; + vperm X11, X11, X11, Y15; + vperm X12, X12, X12, Y15; + vperm X13, X13, X13, Y15; + vperm X14, X14, X14, Y15; + vperm X15, X15, X15, Y15; + vl Y15, (15 * 16)(SRC); + + XOR(Y0, X0); + XOR(Y1, X4); + XOR(Y2, X8); + XOR(Y3, X12); + XOR(Y4, X1); + XOR(Y5, X5); + XOR(Y6, X9); + XOR(Y7, X13); + XOR(Y8, X2); + XOR(Y9, X6); + XOR(Y10, X10); + XOR(Y11, X14); + XOR(Y12, X3); + XOR(Y13, X7); + XOR(Y14, X11); + XOR(Y15, X15); + vstm Y0, Y15, 0(DST); + + aghi SRC, 256; + aghi DST, 256; + + clgije ROUND, 1, .Lsecond_output_4blks_8; + + clgijhe NBLKS, 8, .Lloop8; + + /* Clear the used vector registers. */ + DST_8(CLEAR, 0, _); + DST_8(CLEAR, 1, _); + DST_8(CLEAR, 2, _); + DST_8(CLEAR, 3, _); + + /* Clear sensitive data in stack. */ + vlm Y0, Y15, STACK_Y0_Y15(%r15); + vlm Y0, Y3, STACK_CTR(%r15); + + END_STACK(%r8); + xgr %r2, %r2; + br %r14; + CFI_ENDPROC(); +ELF(.size _gcry_chacha20_s390x_vx_blocks8, + .-_gcry_chacha20_s390x_vx_blocks8;) + +#endif /*HAVE_GCC_INLINE_ASM_S390X_VX*/ +#endif /*__s390x__*/ diff --git a/cipher/chacha20.c b/cipher/chacha20.c index c5967b6f..7b283080 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -97,6 +97,14 @@ # endif #endif +/* USE_S390X_VX indicates whether to enable zSeries code. */ +#undef USE_S390X_VX +#if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9 +# if defined(HAVE_GCC_INLINE_ASM_S390X_VX) +# define USE_S390X_VX 1 +# endif /* USE_S390X_VX */ +#endif + /* Assembly implementations use SystemV ABI, ABI conversion and additional * stack to store XMM6-XMM15 needed on Win64. */ #undef ASM_FUNC_ABI @@ -113,10 +121,11 @@ typedef struct CHACHA20_context_s u32 input[16]; unsigned char pad[CHACHA20_BLOCK_SIZE]; unsigned int unused; /* bytes in the pad. 
*/ - int use_ssse3:1; - int use_avx2:1; - int use_neon:1; - int use_ppc:1; + unsigned int use_ssse3:1; + unsigned int use_avx2:1; + unsigned int use_neon:1; + unsigned int use_ppc:1; + unsigned int use_s390x:1; } CHACHA20_context_t; @@ -168,10 +177,20 @@ unsigned int _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, unsigned int _gcry_chacha20_poly1305_ppc8_blocks4( u32 *state, byte *dst, const byte *src, size_t nblks, POLY1305_STATE *st, const byte *poly1305_src); -#endif +#endif /* SIZEOF_UNSIGNED_LONG == 8 */ #endif /* USE_PPC_VEC */ +#ifdef USE_S390X_VX + +unsigned int _gcry_chacha20_s390x_vx_blocks8(u32 *state, byte *dst, + const byte *src, size_t nblks); + +unsigned int _gcry_chacha20_s390x_vx_blocks4_2_1(u32 *state, byte *dst, + const byte *src, size_t nblks); + +#endif /* USE_S390X_VX */ + #ifdef USE_ARMV7_NEON unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst, @@ -311,6 +330,13 @@ chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src, } #endif +#ifdef USE_S390X_VX + if (ctx->use_s390x) + { + return _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, dst, src, nblks); + } +#endif + return do_chacha20_blocks (ctx->input, dst, src, nblks); } @@ -438,6 +464,9 @@ chacha20_do_setkey (CHACHA20_context_t *ctx, #ifdef USE_PPC_VEC ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0; #endif +#ifdef USE_S390X_VX + ctx->use_s390x = (features & HWF_S390X_VX) != 0; +#endif (void)features; @@ -538,6 +567,20 @@ do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf, } #endif +#ifdef USE_S390X_VX + if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 8) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 8; + nburn = _gcry_chacha20_s390x_vx_blocks8(ctx->input, outbuf, inbuf, + nblocks); + burn = nburn > burn ? nburn : burn; + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } +#endif + if (length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; diff --git a/configure.ac b/configure.ac index dc91e6b5..c97d050e 100644 --- a/configure.ac +++ b/configure.ac @@ -2067,7 +2067,8 @@ AC_CACHE_CHECK([whether GCC inline assembler supports zSeries vector instruction AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void testfunc(void) { - asm volatile ("vx %%v0, %%v1, %%v31\n\t" + asm volatile (".machine \"z13+vx\"\n\t" + "vx %%v0, %%v1, %%v31\n\t" "verllf %%v11, %%v11, (16)(0)\n\t" : : @@ -2662,6 +2663,10 @@ if test "$found" = "1" ; then # Build with the ppc8 vector implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ppc.lo" ;; + s390x-*-*) + # Build with the s390x/zSeries vector implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-s390x.lo" + ;; esac if test x"$neonsupport" = xyes ; then -- cgit v1.2.1
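Reviewer note (not part of the commit): for readers unfamiliar with the VX macros above, PLUS maps to 'vaf' (vector add word), XOR to 'vx', ROTATE to 'verllf' (vector element rotate left logical) and WORD_ROTATE to 'vsldb', so each QUARTERROUND4 invocation is the standard ChaCha20 quarter-round applied to whole vectors of 32-bit words. A minimal C reference of that quarter-round, for comparison only (textbook definition, not code taken from the patch):

    #include <stdint.h>

    static inline uint32_t rol32 (uint32_t v, int c)
    {
      return (v << c) | (v >> (32 - c));
    }

    /* One ChaCha20 quarter-round; the asm applies the same add/xor/rotate
     * sequence to four (or eight) 32-bit lanes per instruction. */
    static void quarterround (uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
    {
      *a += *b; *d ^= *a; *d = rol32 (*d, 16);
      *c += *d; *b ^= *c; *b = rol32 (*b, 12);
      *a += *b; *d ^= *a; *d = rol32 (*d, 8);
      *c += *d; *b ^= *c; *b = rol32 (*b, 7);
    }

In the 4/2/1-block ("horizontal") path each vector register holds one row of a block, so WORD_ROTATE rotates rows 1-3 between the column and diagonal half-rounds; in the 8-block ("vertical") path each register holds one state word across four blocks, so no in-register shuffle is needed and TRANSPOSE_4X4_2 restores per-block word order only at the end.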
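A second note on the 8-way path: the per-lane 64-bit block counters are built from state words 12/13, with 'vaccf' providing the carry from the low counter word into the high one ('.Ladd_counter_0123' adds 0..3 to the low word, '.Ladd_counter_4567' adds 4..7). Sketched in C for the first four lanes; the function and variable names are illustrative, and word 12 is assumed to be the low half of the 64-bit counter, as in the existing chacha20.c code:

    #include <stdint.h>

    /* Illustrative only: how the asm forms the counter lanes for blocks 0..3.
     * 'state' is the 16-word ChaCha20 state; word 12 is assumed to be the low
     * half of the 64-bit block counter, word 13 the high half. */
    static void make_counter_lanes (const uint32_t state[16],
                                    uint32_t x12[4], uint32_t x13[4])
    {
      for (int i = 0; i < 4; i++)
        {
          x12[i] = state[12] + (uint32_t)i;          /* vaf with .Ladd_counter_0123 */
          x13[i] = state[13] + (x12[i] < state[12]); /* vaccf supplies this carry */
        }
    }

The scalar counter kept in the context is advanced by eight per loop iteration with the 'lg/rllg/agfi/stg' sequence: the 'rllg' by 32 makes word 12 the low half of the 64-bit register so that a single add carries into word 13, and the rotated value is stored back, which restores the original word order in memory.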