108 files changed, 5011 insertions, 606 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index ed15456f..0f10d9fd 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -144,8 +144,8 @@ Debian.cross.x86:
   - apt-get update -q
   # remove any previously installed nettle headers to avoid conflicts
   - for arch in armhf arm64 ppc64el;do apt-get remove -y nettle-dev:$arch;done
-  - if [ "$host" == "powerpc64-linux-gnu" ];then apt-get update && apt-get install -y gcc-$host g++-$host && export QEMU_LD_PREFIX=/usr/$host EXTRA_CONFIGURE_FLAGS='--enable-mini-gmp';fi
-  - if [ "$host" == "powerpc64le-linux-gnu" ];then apt-get update && apt-get install -y gcc-$host g++-$host libgmp-dev:ppc64el && export QEMU_LD_PREFIX=/usr/$host;fi
+  - if [ "$host" == "powerpc64-linux-gnu" ];then apt-get install -y software-properties-common && add-apt-repository "deb http://deb.debian.org/debian bullseye-backports main" && apt-get update && apt-get install -y -t bullseye-backports binfmt-support qemu-user && apt-get install -y gcc-$host g++-$host && export QEMU_LD_PREFIX=/usr/$host EXTRA_CONFIGURE_FLAGS='--enable-mini-gmp';fi
+  - if [ "$host" == "powerpc64le-linux-gnu" ];then apt-get install -y software-properties-common && add-apt-repository "deb http://deb.debian.org/debian bullseye-backports main" && apt-get update && apt-get install -y -t bullseye-backports binfmt-support qemu-user && apt-get install -y gcc-$host g++-$host libgmp-dev:ppc64el && export QEMU_LD_PREFIX=/usr/$host;fi
   - if [ "$host" == "s390x-linux-gnu" ];then apt-get update && apt-get install -y gcc-$host g++-$host libgmp-dev:s390x && export EXTRA_CONFIGURE_FLAGS='--disable-assembler';fi
   script:
   - build=$(dpkg-architecture -qDEB_HOST_GNU_TYPE)
diff --git a/AUTHORS b/AUTHORS
index 2caaf446..c4547b94 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -96,7 +96,7 @@ Amos Jeffries		Implementation of base64url encoding.
 Daiki Ueno		Implementation of RSA-PSS signatures,
 			curve448, shake256, ed448-shake256 signatures,
 			chacha functions for 32-bit nonce, struct
-			nettle_mac interface.
+			nettle_mac interface, siv-gcm.
 
 Dmitry Baryshkov	CFB and CFB8 modes, CMAC64. gosthash94cp and
 			Streebog hash functions, GOST DSA signatures
@@ -121,10 +121,12 @@ Mamone Tarsha Kurdi	Powerpc64 assembly and fat build setup,
 
 Nicolas Mora		RFC 3394 keywrap.
 
-Tianjia Zhang		SM3 hash function.
+Tianjia Zhang		SM3 hash function, SM4 block cipher.
 
 Amitay Isaacs		Powerpc64 assembly for secp192r1, secp224r1
 			and secp256r1.
 
 Martin Schwenke		Powerpc64 assembly for secp384r1, secp521r1,
 			curve25519 and curve448.
+
+Zoltan Fridrich		Balloon password hashing.
diff --git a/ChangeLog b/ChangeLog
index 7ce9d354..f1e5537d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,190 @@
+2022-11-09  Niels Möller  <nisse@lysator.liu.se>
+
+	From Mamone Tarsha:
+	* powerpc64/p9/poly1305-blocks.asm: New file, multi-block radix
+	2^44 implementation. Benchmarked to give a speedup of 3.2 times on
+	Power9.
+	* powerpc64/p9/poly1305.m4 (DEFINES_BLOCK_R64, BLOCK_R64): New
+	file, new macros.
+	* powerpc64/p9/poly1305-internal.asm: Use BLOCK_R64 macro.
+	* powerpc64/machine.m4 (INC_GPR, INC_VR): New macros.
+	* powerpc64/fat/poly1305-blocks.asm: New file.
+	* poly1305-update.c: Check HAVE_NATIVE_fat_poly1305_blocks, and
+	define _nettle_poly1305_blocks_c when needed.
+	* fat-ppc.c: Fat setup for _nettle_poly1305_blocks.
+
+2022-11-07  Niels Möller  <nisse@lysator.liu.se>
+
+	* configure.ac (ASM_FLAGS): New configure environment variable.
+	* aclocal.m4 (GMP_TRY_ASSEMBLE): Use $ASM_FLAGS.
+	* config.make.in (ASM_FLAGS): Add substitution.
+	* Makefile.in: Use $(ASM_FLAGS) when compiling .asm files.
+
+2022-10-31  Niels Möller  <nisse@lysator.liu.se>
+
+	* configure.ac (asm_file_list): Add HAVE_NATIVE_poly1305_blocks.
+	(asm_nettle_optional_list): Add poly1305-blocks.asm.
+	* x86_64/poly1305-blocks.asm: New file.
+
+	* md-internal.h (MD_FILL_OR_RETURN_INDEX): New macro.
+	* poly1305-update.c (_nettle_poly1305_update): New file and
+	function.
+	* poly1305-internal.h: Declare _nettle_poly1305_blocks and
+	_nettle_poly1305_update.
+	* chacha-poly1305.c (poly1305_update): Use _nettle_poly1305_update.
+	* poly1305-aes.c (poly1305_aes_update): Likewise.
+	* Makefile.in (nettle_SOURCES): Add poly1305-update.c.
+
+2022-10-13  Niels Möller  <nisse@lysator.liu.se>
+
+	* gmp-glue.c (mpn_sec_tabselect) [NETTLE_USE_MINI_GMP]: Add back
+	here, to support mini-gmp builds. Updated signature to be
+	compatible with the gmp version.
+	* gmp-glue.h: Add declaration.
+
+2022-10-11  Niels Möller  <nisse@lysator.liu.se>
+
+	* sec-tabselect.c (sec_tabselect): Delete file and function. All
+	callers updated to use gmp's mpn_sec_tabselect instead, which is
+	implemented in assembly on many platforms.
+
+2022-10-02  Niels Möller  <nisse@lysator.liu.se>
+
+	* examples/ecc-benchmark.c (bench_curve): Add benchmarking of
+	modulo q inversion.
+
+2022-09-29  Niels Möller  <nisse@lysator.liu.se>
+
+	* ecc-ecdsa-verify.c (ecc_ecdsa_verify): Call ecc_mul_g and ecc_mul_a directly, not via
+	function pointers.
+	(ecc_ecdsa_verify_itch): Use ECC_MUL_A_ITCH
+	rather than ecc->mul_itch.
+	* ecc-gostdsa-verify.c (ecc_gostdsa_verify_itch)
+	(ecc_gostdsa_verify): Analogous changes.
+
+	* ecc-ecdsa-sign.c (ecc_ecdsa_sign): Call ecc_mul_g and ecc_j_to_a
+	directly, not via function pointers.
+	(ecc_ecdsa_sign_itch): Use ECC_MUL_G_ITCH rather than
+	ecc->mul_g_itch.
+	* ecc-gostdsa-sign.c (ecc_gostdsa_sign_itch, ecc_gostdsa_sign):
+	Analogous changes.
+
+2022-09-28  Niels Möller  <nisse@lysator.liu.se>
+
+	* testsuite/meta-hash-test.c (test_main): Add check of
+	NETTLE_MAX_HASH_BLOCK_SIZE.
+	* nettle-internal.h (NETTLE_MAX_HASH_BLOCK_SIZE): Increase to 144,
+	to accommodate sha3_224.
+	* testsuite/meta-cipher-test.c (test_main): Check that cipher
+	metadata doesn't exceed NETTLE_MAX_CIPHER_BLOCK_SIZE or
+	NETTLE_MAX_CIPHER_KEY_SIZE.
+
+	From Daiki Ueno:
+	* siv-gcm.c (siv_gcm_encrypt_message, siv_gcm_decrypt_message):
+	New file, implementation of SIV-GCM.
+	* siv-gcm.h (SIV_GCM_BLOCK_SIZE, SIV_GCM_DIGEST_SIZE)
+	(SIV_GCM_NONCE_SIZE): New header file, new constants and
+	declarations.
+	* siv-gcm-aes128.c (siv_gcm_aes128_encrypt_message)
+	(siv_gcm_aes128_decrypt_message): New file and functions.
+	* siv-gcm-aes256.c (siv_gcm_aes256_encrypt_message)
+	(siv_gcm_aes256_decrypt_message): Likewise.
+	* siv-ghash-set-key.c (_siv_ghash_set_key): New file, new internal
+	function.
+	* siv-ghash-update.c (_siv_ghash_update): Likewise.
+	* block-internal.h (block16_bswap): New inline function.
+	* bswap-internal.h (bswap64_if_be): New macro.
+	* nettle-internal.h (NETTLE_MAX_CIPHER_KEY_SIZE): New constant.
+	* Makefile.in (nettle_SOURCES): Add new source files.
+	(HEADERS): Add siv-gcm.h.
+	* testsuite/siv-gcm-test.c: New tests.
+	* testsuite/Makefile.in (TS_NETTLE_SOURCES): Add siv-gcm-test.c.
+	* nettle.texinfo (SIV-GCM): Documentation.
+
+	From Zoltan Fridrich:
+	* balloon.c (balloon, balloon_itch): Implementation of balloon
+	password hash.
+	* balloon.h: New header file.
+	* balloon-sha1.c (balloon_sha1): New file and function.
+	* balloon-sha256.c (balloon_sha256): Likewise.
+	* balloon-sha384.c (balloon_sha384): Likewise.
+	* balloon-sha512.c (balloon_sha512): Likewise.
+	* Makefile.in (nettle_SOURCES): Add balloon source files.
+	(HEADERS): Add balloon.h.
+	* testsuite/balloon-test.c: New tests.
+	* testsuite/Makefile.in (TS_NETTLE_SOURCES): Add balloon-test.c.
+
+2022-09-14  Niels Möller  <nisse@lysator.liu.se>
+
+	* ecc-nonsec-add-jjj.c (ecc_nonsec_add_jjj): New file and
+	function.
+	* ecc-internal.h: Declare it.
+	* Makefile.in (hogweed_SOURCES): Add ecc-nonsec-add-jjj.c.
+	* testsuite/ecc-add-test.c (test_main): Add tests for ecc_nonsec_add_jjj.
+
+	* ecc-ecdsa-verify.c (ecc_ecdsa_verify): Use ecc_nonsec_add_jjj,
+	to produce correct result in a corner case where point addition
+	needs to use point duplication. Also use ecc_j_to_a rather than
+	ecc->h_to_a, since ecdsa supports only weierstrass curves.
+	* ecc-gostdsa-verify.c (ecc_gostdsa_verify): Analogous change.
+
+	* testsuite/ecdsa-verify-test.c (test_main): Add corresponding test.
+	* testsuite/ecdsa-sign-test.c (test_main): And a test producing
+	the problematic signature.
+
+2022-09-08  Niels Möller  <nisse@lysator.liu.se>
+
+	* eccdata.c (string_toupper): New utility function.
+	(output_modulo): Move more of the per-modulo output here.
+	(output_curve): Remove corresponding code.
+
+2022-08-31  Niels Möller  <nisse@lysator.liu.se>
+
+	* bswap-internal.h (nettle_bswap64, nettle_bswap32)
+	(bswap64_if_le): New header file, new inline functions/macros.
+	* gcm.c (gcm_hash_sizes): Use bswap64_if_le, and bswap-internal.h,
+	replacing local definition of bswap_if_le.
+	* nist-keywrap.c (nist_keywrap16): Likewise.
+	* blowfish-bcrypt.c (swap32): Renamed function, to...
+	(bswap32_if_le): ...new name, rewritten to use nettle_bswap32.
+	Update call sites.
+	* Makefile.in (DISTFILES): Add bswap-internal.h.
+
+2022-08-18  Niels Möller  <nisse@lysator.liu.se>
+
+	* Makefile.in (HEADERS): Add sm4.h.
+
+	From Tianjia Zhang: SM4 block cipher.
+	* sm4.c: New file.
+	* sm4.h: New file.
+	* sm4-meta.c: New file.
+	* gcm-sm4.c: New file.
+	* gcm-sm4-meta.c: New file.
+	* nettle.texinfo: Document SM4.
+	* testsuite/gcm-test.c (test_main): Add SM4 tests.
+	* testsuite/sm4-test.c: New file.
+
+	* configure.ac (ABI): Change mips abi check to apply only to mips64.
+
+2022-08-17  Niels Möller  <nisse@lysator.liu.se>
+
+	* testsuite/testutils.c (mpz_urandomm) [NETTLE_USE_MINI_GMP]: New
+	fallback definition when building with mini-gmp.
+
+2022-08-16  Niels Möller  <nisse@lysator.liu.se>
+
+	* ecc-mod-arith.c (ecc_mod_sub): Ensure that if inputs are in the
+	range 0 <= a, b < 2m, then output is in the same range.
+	* eccdata.c (output_curve): New outputs ecc_Bm2p and ecc_Bm2q.
+	* ecc-internal.h (struct ecc_modulo): New member Bm2m (B^size -
+	2m), needed by ecc_mod_sub. Update all curves.
+	* testsuite/ecc-mod-arith-test.c: New tests for ecc_mod_add and
+	ecc_mod_sub.
+
+	* eccdata.c (output_modulo): Output the limb size, delete return
+	value.
+	(output_curve): Update calls to output_modulo, other minor cleanup.
+
 2022-08-07  Niels Möller  <nisse@lysator.liu.se>
 
 	Delete all arcfour assembly code.
@@ -8,6 +195,15 @@
 	* x86/arcfour-crypt.asm: Deleted.
 	* asm.m4: Delete arcfour structure offsets.
 
+2022-08-07  Niels Möller  <nisse@lysator.liu.se>
+
+	Based on patch from Corentin Labbe:
+	* nettle.texinfo: Document sha256_compress, sha512_compress,
+	md5_compress and sha1_compress.
+
+	* configure.ac: Refer to nettle-types.h, rather than arcfour.c,
+	for AC_CONFIG_SRCDIR.
+
 2022-08-05  Niels Möller  <nisse@lysator.liu.se>
 
 	* nettle-internal.h: Include stdlib.h, fix alloca warnings on BSD.
@@ -23,6 +219,48 @@
 	* aclocal.m4 (LSH_CCPIC): Use proper PIC flag for *BSD OS's.
 	* blowfish-bcrypt.c (swap32): Eliminate conflict with OpenBSD's swap32 macro.
 
+2022-07-29  Niels Möller  <nisse@lysator.liu.se>
+
+	* s390x/msa_x1/sha256-compress-n.asm: New file, replacing...
+	* s390x/msa_x1/sha256-compress.asm: ...deleted file.
+	* s390x/fat/sha256-compress-n-2.asm: New file, replacing...
+	* s390x/fat/sha256-compress-2.asm: ...deleted file.
+	* fat-s390x.c: Update fat setup.
+
+2022-07-26  Niels Möller  <nisse@lysator.liu.se>
+
+	* arm/v6/sha256-compress-n.asm: New file, replacing...
+	* arm/v6/sha256-compress.asm: ...deleted file.
+	* arm/fat/sha256-compress-n-2.asm: New file, replacing...
+	* arm/fat/sha256-compress-2.asm: ...deleted file.
+	* fat-arm.c: Update fat setup.
+
+2022-07-11  Niels Möller  <nisse@lysator.liu.se>
+
+	* arm64/crypto/sha256-compress-n.asm: New file, replacing...
+	* arm64/crypto/sha256-compress.asm: ...deleted file.
+	* arm64/fat/sha256-compress-n-2.asm: New file, replacing...
+	* arm64/fat/sha256-compress-2.asm: ...deleted file.
+	* fat-arm64.c: Update fat setup.
+
+2022-07-05  Niels Möller  <nisse@lysator.liu.se>
+
+	* md-internal.h (MD_FILL_OR_RETURN): New file, new macro.
+	* sha256-compress-n.c (_nettle_sha256_compress_n): New file and
+	function, replacing...
+	* sha256-compress.c (_nettle_sha256_compress): ...deleted file and
+	function.
+	* sha2-internal.h (_nettle_sha256_compress_n): Declare new function.
+	* sha256.c (sha256_compress): Update to use
+	_nettle_sha256_compress_n and MD_FILL_OR_RETURN.
+	* x86_64/sha256-compress-n.asm: New file, replacing...
+	* x86_64/sha256-compress.asm: ...deleted file.
+	* x86_64/sha_ni/sha256-compress-n.asm: New file, replacing...
+	* x86_64/sha_ni/sha256-compress.asm: ...deleted file.
+	* fat-setup.h (sha256_compress_n_func): New typedef, replacing...
+	(sha256_compress_func): ... deleted typedef.
+	* fat-x86_64.c: Update fat setup.
+
 2022-06-20  Niels Möller  <nisse@lysator.liu.se>
 
 	* testsuite/sha1-test.c (test_sha1_compress): New function.
diff --git a/Makefile.in b/Makefile.in
index 4b4672fa..cd4993e8 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -83,6 +83,8 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c aes-decrypt-table.c \
 		 nist-keywrap.c \
 		 arcfour.c \
 		 arctwo.c arctwo-meta.c blowfish.c blowfish-bcrypt.c \
+		 balloon.c balloon-sha1.c balloon-sha256.c \
+		 balloon-sha384.c balloon-sha512.c \
 		 base16-encode.c base16-decode.c base16-meta.c \
 		 base64-encode.c base64-decode.c base64-meta.c \
 		 base64url-encode.c base64url-decode.c base64url-meta.c \
@@ -100,18 +102,22 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c aes-decrypt-table.c \
 		 cbc.c cbc-aes128-encrypt.c cbc-aes192-encrypt.c cbc-aes256-encrypt.c \
 		 ccm.c ccm-aes128.c ccm-aes192.c ccm-aes256.c cfb.c \
 		 siv-cmac.c siv-cmac-aes128.c siv-cmac-aes256.c \
+		 siv-gcm.c siv-gcm-aes128.c siv-gcm-aes256.c \
 		 cnd-memcpy.c \
 		 chacha-crypt.c chacha-core-internal.c \
 		 chacha-poly1305.c chacha-poly1305-meta.c \
 		 chacha-set-key.c chacha-set-nonce.c \
 		 ctr.c ctr16.c des.c des3.c \
 		 eax.c eax-aes128.c eax-aes128-meta.c \
-		 ghash-set-key.c ghash-update.c gcm.c gcm-aes.c \
+		 ghash-set-key.c ghash-update.c \
+		 siv-ghash-set-key.c siv-ghash-update.c \
+		 gcm.c gcm-aes.c \
 		 gcm-aes128.c gcm-aes128-meta.c \
 		 gcm-aes192.c gcm-aes192-meta.c \
 		 gcm-aes256.c gcm-aes256-meta.c \
 		 gcm-camellia128.c gcm-camellia128-meta.c \
 		 gcm-camellia256.c gcm-camellia256-meta.c \
+		 gcm-sm4.c gcm-sm4-meta.c \
 		 cmac.c cmac64.c cmac-aes128.c cmac-aes256.c cmac-des3.c \
 		 cmac-aes128-meta.c cmac-aes256-meta.c cmac-des3-meta.c \
 		 gost28147.c gosthash94.c gosthash94-meta.c \
@@ -130,7 +136,7 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c aes-decrypt-table.c \
 		 nettle-meta-ciphers.c nettle-meta-hashes.c nettle-meta-macs.c \
 		 pbkdf2.c pbkdf2-hmac-gosthash94.c pbkdf2-hmac-sha1.c \
 		 pbkdf2-hmac-sha256.c pbkdf2-hmac-sha384.c pbkdf2-hmac-sha512.c \
-		 poly1305-aes.c poly1305-internal.c \
+		 poly1305-aes.c poly1305-internal.c poly1305-update.c \
 		 realloc.c \
 		 ripemd160.c ripemd160-compress.c ripemd160-meta.c \
 		 salsa20-core-internal.c salsa20-crypt-internal.c \
@@ -138,7 +144,7 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c aes-decrypt-table.c \
 		 salsa20-set-nonce.c \
 		 salsa20-128-set-key.c salsa20-256-set-key.c \
 		 sha1.c sha1-compress.c sha1-meta.c \
-		 sha256.c sha256-compress.c sha224-meta.c sha256-meta.c \
+		 sha256.c sha256-compress-n.c sha224-meta.c sha256-meta.c \
 		 sha512.c sha512-compress.c sha384-meta.c sha512-meta.c \
 		 sha512-224-meta.c sha512-256-meta.c \
 		 sha3.c sha3-permute.c \
@@ -150,6 +156,7 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c aes-decrypt-table.c \
 		 serpent-meta.c \
 		 streebog.c streebog-meta.c \
 		 twofish.c twofish-meta.c \
+		 sm4.c sm4-meta.c \
 		 umac-nh.c umac-nh-n.c umac-l2.c umac-l3.c \
 		 umac-poly64.c umac-poly128.c umac-set-key.c \
 		 umac32.c umac64.c umac96.c umac128.c \
@@ -187,7 +194,7 @@ hogweed_SOURCES = sexp.c sexp-format.c \
 		  dsa2sexp.c sexp2dsa.c \
 		  pgp-encode.c rsa2openpgp.c \
 		  der-iterator.c der2rsa.c der2dsa.c \
-		  sec-add-1.c sec-sub-1.c sec-tabselect.c \
+		  sec-add-1.c sec-sub-1.c \
 		  gmp-glue.c cnd-copy.c \
 		  ecc-mod.c ecc-mod-inv.c \
 		  ecc-mod-arith.c ecc-pp1-redc.c ecc-pm1-redc.c \
@@ -196,7 +203,7 @@ hogweed_SOURCES = sexp.c sexp-format.c \
 		  ecc-secp192r1.c ecc-secp224r1.c ecc-secp256r1.c \
 		  ecc-secp384r1.c ecc-secp521r1.c \
 		  ecc-size.c ecc-j-to-a.c ecc-a-to-j.c \
-		  ecc-dup-jj.c ecc-add-jja.c ecc-add-jjj.c \
+		  ecc-dup-jj.c ecc-add-jja.c ecc-add-jjj.c ecc-nonsec-add-jjj.c \
 		  ecc-eh-to-a.c \
 		  ecc-dup-eh.c ecc-add-eh.c ecc-add-ehh.c \
 		  ecc-dup-th.c ecc-add-th.c ecc-add-thh.c \
@@ -218,7 +225,7 @@ hogweed_SOURCES = sexp.c sexp-format.c \
 
 OPT_SOURCES = fat-arm.c fat-arm64.c fat-ppc.c fat-s390x.c fat-x86_64.c mini-gmp.c
 
-HEADERS = aes.h arcfour.h arctwo.h asn1.h blowfish.h \
+HEADERS = aes.h arcfour.h arctwo.h asn1.h blowfish.h balloon.h \
 	  base16.h base64.h bignum.h buffer.h camellia.h cast128.h \
 	  cbc.h ccm.h cfb.h chacha.h chacha-poly1305.h ctr.h \
 	  curve25519.h curve448.h des.h dsa.h dsa-compat.h eax.h \
@@ -226,15 +233,15 @@ HEADERS = aes.h arcfour.h arctwo.h asn1.h blowfish.h \
 	  gcm.h gostdsa.h gosthash94.h hmac.h \
 	  knuth-lfib.h hkdf.h \
 	  macros.h \
-	  cmac.h siv-cmac.h \
+	  cmac.h siv-cmac.h siv-gcm.h \
 	  md2.h md4.h \
 	  md5.h md5-compat.h \
 	  memops.h memxor.h \
 	  nettle-meta.h nettle-types.h \
 	  pbkdf2.h \
 	  pgp.h pkcs1.h pss.h pss-mgf1.h realloc.h ripemd160.h rsa.h \
-	  salsa20.h sexp.h \
-	  serpent.h sha.h sha1.h sha2.h sha3.h sm3.h streebog.h twofish.h \
+	  salsa20.h sexp.h serpent.h \
+	  sha.h sha1.h sha2.h sha3.h sm3.h sm4.h streebog.h twofish.h \
 	  umac.h yarrow.h xts.h poly1305.h nist-keywrap.h
 
 INSTALL_HEADERS = $(HEADERS) version.h @IF_MINI_GMP@ mini-gmp.h
@@ -257,10 +264,11 @@ DISTFILES = $(SOURCES) $(HEADERS) getopt.h getopt_int.h \
 	INSTALL NEWS ChangeLog \
 	nettle.pc.in hogweed.pc.in \
 	desdata.stamp $(des_headers) descore.README \
-	aes-internal.h block-internal.h blowfish-internal.h camellia-internal.h \
+	aes-internal.h block-internal.h blowfish-internal.h bswap-internal.h \
+	camellia-internal.h \
 	ghash-internal.h gost28147-internal.h poly1305-internal.h \
 	serpent-internal.h cast128_sboxes.h desinfo.h desCode.h \
-	ripemd160-internal.h sha2-internal.h \
+	ripemd160-internal.h md-internal.h sha2-internal.h \
 	memxor-internal.h nettle-internal.h nettle-write.h \
 	ctr-internal.h chacha-internal.h sha3-internal.h \
 	salsa20-internal.h umac-internal.h hogweed-internal.h \
@@ -290,7 +298,7 @@ libhogweed.a: $(hogweed_OBJS)
 
 %.$(OBJEXT): %.asm $(srcdir)/m4-utils.m4 $(srcdir)/asm.m4 config.m4 machine.m4
 	$(M4) $(srcdir)/m4-utils.m4 $(srcdir)/asm.m4 config.m4 machine.m4 $< >$*.s
-	$(COMPILE) -c $*.s
+	$(COMPILE) $(ASM_FLAGS) -c $*.s
 
 %.$(OBJEXT): %.c
 	$(COMPILE) -c $< \
diff --git a/aclocal.m4 b/aclocal.m4
index a398d346..c87c3fa8 100644
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -302,7 +302,7 @@ AC_DEFUN([GMP_TRY_ASSEMBLE],
 [cat >conftest.s <<EOF
 [$1]
 EOF
-gmp_assemble="$CC $CFLAGS $CPPFLAGS -c conftest.s >conftest.out 2>&1"
+gmp_assemble="$CC $CFLAGS $CPPFLAGS $ASM_FLAGS -c conftest.s >conftest.out 2>&1"
 if AC_TRY_EVAL(gmp_assemble); then
   cat conftest.out >&AC_FD_CC
   ifelse([$2],,:,[$2])
@@ -563,7 +563,7 @@ dnl  Determine whether the assembler takes powerpc registers with an "r" as
 dnl  in "r6", or as plain "6".  The latter is standard, but NeXT, Rhapsody,
 dnl  and MacOS-X require the "r" forms.
 dnl
-dnl  See also mpn/powerpc32/powerpc-defs.m4 which uses the result of this
+dnl  See also powerpc64/machine.m4 which uses the result of this
 dnl  test.
 
 AC_DEFUN([GMP_ASM_POWERPC_R_REGISTERS],
diff --git a/arm/fat/sha256-compress-2.asm b/arm/fat/sha256-compress-n-2.asm
index 36d55e4b..8834d93d 100644
--- a/arm/fat/sha256-compress-2.asm
+++ b/arm/fat/sha256-compress-n-2.asm
@@ -1,4 +1,4 @@
-C arm/fat/sha256-compress-2.asm
+C arm/fat/sha256-compress-n-2.asm
 
 
 ifelse(`
@@ -31,7 +31,7 @@ ifelse(`
    not, see http://www.gnu.org/licenses/.
 ')
 
-dnl PROLOGUE(_nettle_sha256_compress) picked up by configure
+dnl PROLOGUE(_nettle_sha256_compress_n) picked up by configure
 
 define(`fat_transform', `$1_armv6')
-include_src(`arm/v6/sha256-compress.asm')
+include_src(`arm/v6/sha256-compress-n.asm')
diff --git a/arm/v6/sha256-compress.asm b/arm/v6/sha256-compress-n.asm
index 3c021284..bf225bd8 100644
--- a/arm/v6/sha256-compress.asm
+++ b/arm/v6/sha256-compress-n.asm
@@ -1,7 +1,7 @@
-C arm/v6/sha256-compress.asm
+C arm/v6/sha256-compress-n.asm
 
 ifelse(`
-   Copyright (C) 2013 Niels Möller
+   Copyright (C) 2013, 2022 Niels Möller
 
    This file is part of GNU Nettle.
 
@@ -30,13 +30,14 @@ ifelse(`
    not, see http://www.gnu.org/licenses/.
 ')
 
-	.file "sha256-compress.asm"
+	.file "sha256-compress-n.asm"
 	.arch armv6
 
 define(`STATE', `r0')
-define(`INPUT', `r1')
-define(`K', `r2')
-define(`SA', `r3')
+define(`K', `r1')
+define(`BLOCKS', `r2')
+define(`INPUT', `r3')
+define(`SA', `r2')	C Overlap BLOCKS
 define(`SB', `r4')
 define(`SC', `r5')
 define(`SD', `r6')
@@ -45,12 +46,12 @@ define(`SF', `r8')
 define(`SG', `r10')
 define(`SH', `r11')
 define(`T0', `r12')
-define(`T1', `r1')	C Overlap INPUT
+define(`T1', `r3')	C Overlap INPUT
 define(`COUNT', `r0')	C Overlap STATE
 define(`W', `r14')
 
-C Used for data load
-define(`I0', `r3')
+C Used for data load. Must not clobber STATE (r0), K (r1) or INPUT (r3)
+define(`I0', `r2')
 define(`I1', `r4')
 define(`I2', `r5')
 define(`I3', `r6')
@@ -88,7 +89,7 @@ C S1(E) = E<<<26 ^ E<<<21 ^ E<<<7
 C S0(A) = A<<<30 ^ A<<<19 ^ A<<<10
 C Choice (E, F, G) = G^(E&(F^G))
 C Majority (A,B,C) = (A&B) + (C&(A^B))
-	
+
 define(`ROUND', `
 	ror	T0, $5, #6
 	eor	T0, T0, $5, ror #11
@@ -117,16 +118,31 @@ define(`NOEXPN', `
 	ldr	W, [sp, + $1]
 	add	$1, $1, #4
 ')
-	C void
-	C _nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k)
-
 	.text
 	.align 2
 
-PROLOGUE(_nettle_sha256_compress)
-	push	{r4,r5,r6,r7,r8,r10,r11,r14}
-	sub	sp, sp, #68
-	str	STATE, [sp, #+64]
+define(`SHIFT_OFFSET', 64)
+define(`INPUT_OFFSET', 68)
+define(`I0_OFFSET', 72)
+define(`STATE_OFFSET', 76)
+define(`K_OFFSET', 80)
+define(`BLOCKS_OFFSET', 84)
+
+	C const uint8_t *
+	C _nettle_sha256_compress_n(uint32_t *state, const uint32_t *k,
+	C                           size_t blocks, const uint8_t *input)
+
+PROLOGUE(_nettle_sha256_compress_n)
+	cmp	BLOCKS, #0
+	bne	.Lwork
+
+	mov	r0, INPUT
+	bx lr
+
+.Lwork:
+	C Also save STATE (r0), K (r1) and BLOCKS (r2)
+	push	{r0,r1,r2,r4,r5,r6,r7,r8,r10,r11,r12,r14}
+	sub	sp, sp, #STATE_OFFSET
 
 	C Load data up front, since we don't have enough registers
 	C to load and shift on-the-fly
@@ -144,6 +160,9 @@ IF_BE(`	lsr	I1, T0, SHIFT')
 	C because there is no rotate left
 IF_BE(`	rsb	SHIFT, SHIFT, #32')
 
+	str	SHIFT, [sp, #SHIFT_OFFSET]
+
+.Loop_block:
 	mov	DST, sp
 	mov	ILEFT, #4
 .Lcopy:
@@ -164,7 +183,12 @@ IF_LE(`	rev	I3, I3')
 	stm	DST!, {I0,I1,I2,I3}
 	mov	I0, I4	
 	bne	.Lcopy
-	
+
+	str	INPUT, [sp, #INPUT_OFFSET]
+	str	I0, [sp, #I0_OFFSET]
+
+	C Process block, with input at sp, expanded on the fly
+
 	ldm	STATE, {SA,SB,SC,SD,SE,SF,SG,SH}
 
 	mov	COUNT,#0
@@ -203,20 +227,40 @@ IF_LE(`	rev	I3, I3')
 	EXPN(15) ROUND(SB,SC,SD,SE,SF,SG,SH,SA)
 	bne	.Loop2
 
-	ldr	STATE, [sp, #+64]
+	ldr	STATE, [sp, #STATE_OFFSET]
 	C No longer needed registers
-	ldm	STATE, {r1,r2,r12,r14}
-	add	SA, SA, r1
-	add	SB, SB, r2
-	add	SC, SC, r12
-	add	SD, SD, r14
+	ldm	STATE, {K, T1, T0, W}
+	add	SA, SA, K
+	add	SB, SB, T1
+	add	SC, SC, T0
+	add	SD, SD, W
 	stm	STATE!, {SA,SB,SC,SD}
-	ldm	STATE, {r1,r2,r12,r14}
-	add	SE, SE, r1
-	add	SF, SF, r2
-	add	SG, SG, r12
-	add	SH, SH, r14
-	stm	STATE!, {SE,SF,SG,SH}
-	add	sp, sp, #68
-	pop	{r4,r5,r6,r7,r8,r10,r11,pc}
-EPILOGUE(_nettle_sha256_compress)
+	ldm	STATE, {K, T1, T0, W}
+	add	SE, SE, K
+	add	SF, SF, T1
+	add	SG, SG, T0
+	add	SH, SH, W
+	stm	STATE, {SE,SF,SG,SH}
+	sub	STATE, STATE, #16
+
+	ldr	BLOCKS, [sp, #BLOCKS_OFFSET]
+	subs	BLOCKS, BLOCKS, #1
+	str	BLOCKS, [sp, #BLOCKS_OFFSET]
+
+	ldr	SHIFT, [sp, #SHIFT_OFFSET]
+	ldr	K, [sp, #K_OFFSET]
+	ldr	INPUT, [sp, #INPUT_OFFSET]
+	ldr	I0, [sp, #I0_OFFSET]
+
+	bne	.Loop_block
+
+	C Restore input pointer adjustment
+IF_BE(`	rsbs	SHIFT, SHIFT, #32')
+IF_LE(` cmp	SHIFT, #0')
+	subne	INPUT, INPUT, #4
+	orr	r0, INPUT, SHIFT, lsr #3
+
+	C Discard saved STATE, K and BLOCKS.
+	add	sp, sp, #STATE_OFFSET + 12
+	pop	{r4,r5,r6,r7,r8,r10,r11,r12,pc}
+EPILOGUE(_nettle_sha256_compress_n)
diff --git a/arm64/crypto/sha256-compress.asm b/arm64/crypto/sha256-compress-n.asm
index 2bddea05..447dc590 100644
--- a/arm64/crypto/sha256-compress.asm
+++ b/arm64/crypto/sha256-compress-n.asm
@@ -1,4 +1,4 @@
-C arm64/crypto/sha256-compress.asm
+C arm64/crypto/sha256-compress-n.asm
 
 ifelse(`
    Copyright (C) 2021 Mamone Tarsha
@@ -37,7 +37,7 @@ C SHA256H2: SHA256 hash update (part 2)
 C SHA256SU0: SHA256 schedule update 0
 C SHA256SU1: SHA256 schedule update 1
 
-.file "sha256-compress.asm"
+.file "sha256-compress-n.asm"
 .arch armv8-a+crypto
 
 .text
@@ -45,8 +45,9 @@ C SHA256SU1: SHA256 schedule update 1
 C Register usage:
 
 define(`STATE', `x0')
-define(`INPUT', `x1')
-define(`K', `x2')
+define(`K', `x1')
+define(`BLOCKS', `x2')
+define(`INPUT', `x3')
 
 define(`MSG0', `v0')
 define(`MSG1', `v1')
@@ -59,19 +60,23 @@ define(`TMP', `v7')
 define(`STATE0_SAVED', `v16')
 define(`STATE1_SAVED', `v17')
 
-C void 
-C _nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k)
+C const uint8_t *
+C _nettle_sha256_compress_n(uint32_t *state, const uint32_t *k,
+C                           size_t blocks, const uint8_t *input)
+
+PROLOGUE(_nettle_sha256_compress_n)
+    cbz            BLOCKS, .Lend
 
-PROLOGUE(_nettle_sha256_compress)
     C Load state
     ld1            {STATE0.4s,STATE1.4s},[STATE]
 
+.Loop:
     C Save state
     mov            STATE0_SAVED.16b,STATE0.16b
     mov            STATE1_SAVED.16b,STATE1.16b
 
     C Load message
-    ld1            {MSG0.16b,MSG1.16b,MSG2.16b,MSG3.16b},[INPUT]
+    ld1            {MSG0.16b,MSG1.16b,MSG2.16b,MSG3.16b},[INPUT],#64
     
     C Reverse for little endian
     rev32          MSG0.16b,MSG0.16b
@@ -217,9 +222,13 @@ PROLOGUE(_nettle_sha256_compress)
     C Combine state
     add            STATE0.4s,STATE0.4s,STATE0_SAVED.4s
     add            STATE1.4s,STATE1.4s,STATE1_SAVED.4s
-	
+    subs           BLOCKS, BLOCKS, #1
+    sub            K, K, #240
+    b.ne           .Loop
+
     C Store state
     st1            {STATE0.4s,STATE1.4s},[STATE]
-
+.Lend:
+    mov            x0, INPUT
     ret
-EPILOGUE(_nettle_sha256_compress)
+EPILOGUE(_nettle_sha256_compress_n)
diff --git a/arm64/fat/sha256-compress-2.asm b/arm64/fat/sha256-compress-n-2.asm
index 67590794..2f70686e 100644
--- a/arm64/fat/sha256-compress-2.asm
+++ b/arm64/fat/sha256-compress-n-2.asm
@@ -1,4 +1,4 @@
-C arm64/fat/sha256-compress-2.asm
+C arm64/fat/sha256-compress-n-2.asm
 
 
 ifelse(`
@@ -31,7 +31,7 @@ ifelse(`
    not, see http://www.gnu.org/licenses/.
 ')
 
-dnl PROLOGUE(_nettle_sha256_compress) picked up by configure
+dnl PROLOGUE(_nettle_sha256_compress_n) picked up by configure
 
 define(`fat_transform', `$1_arm64')
-include_src(`arm64/crypto/sha256-compress.asm')
+include_src(`arm64/crypto/sha256-compress-n.asm')
diff --git a/balloon-sha1.c b/balloon-sha1.c
new file mode 100644
index 00000000..71c86e1d
--- /dev/null
+++ b/balloon-sha1.c
@@ -0,0 +1,55 @@
+/* balloon-sha1.c
+
+   Balloon password-hashing algorithm.
+
+   Copyright (C) 2022 Zoltan Fridrich
+   Copyright (C) 2022 Red Hat, Inc.
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "balloon.h"
+#include "sha1.h"
+
+void
+balloon_sha1(size_t s_cost, size_t t_cost,
+             size_t passwd_length, const uint8_t *passwd,
+             size_t salt_length, const uint8_t *salt,
+             uint8_t *scratch, uint8_t *dst)
+{
+  struct sha1_ctx ctx;
+  sha1_init(&ctx);
+  balloon(&ctx,
+          (nettle_hash_update_func*)sha1_update,
+          (nettle_hash_digest_func*)sha1_digest,
+          SHA1_DIGEST_SIZE, s_cost, t_cost,
+          passwd_length, passwd, salt_length, salt, scratch, dst);
+}
diff --git a/balloon-sha256.c b/balloon-sha256.c
new file mode 100644
index 00000000..fe31a691
--- /dev/null
+++ b/balloon-sha256.c
@@ -0,0 +1,55 @@
+/* balloon-sha256.c
+
+   Balloon password-hashing algorithm.
+
+   Copyright (C) 2022 Zoltan Fridrich
+   Copyright (C) 2022 Red Hat, Inc.
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "balloon.h"
+#include "sha2.h"
+
+void
+balloon_sha256(size_t s_cost, size_t t_cost,
+               size_t passwd_length, const uint8_t *passwd,
+               size_t salt_length, const uint8_t *salt,
+               uint8_t *scratch, uint8_t *dst)
+{
+  struct sha256_ctx ctx;
+  sha256_init(&ctx);
+  balloon(&ctx,
+          (nettle_hash_update_func*)sha256_update,
+          (nettle_hash_digest_func*)sha256_digest,
+          SHA256_DIGEST_SIZE, s_cost, t_cost,
+          passwd_length, passwd, salt_length, salt, scratch, dst);
+}
diff --git a/balloon-sha384.c b/balloon-sha384.c
new file mode 100644
index 00000000..68294496
--- /dev/null
+++ b/balloon-sha384.c
@@ -0,0 +1,55 @@
+/* balloon-sha384.c
+
+   Balloon password-hashing algorithm.
+
+   Copyright (C) 2022 Zoltan Fridrich
+   Copyright (C) 2022 Red Hat, Inc.
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "balloon.h"
+#include "sha2.h"
+
+void
+balloon_sha384(size_t s_cost, size_t t_cost,
+               size_t passwd_length, const uint8_t *passwd,
+               size_t salt_length, const uint8_t *salt,
+               uint8_t *scratch, uint8_t *dst)
+{
+  /* SHA-384 instantiation: every argument is forwarded to balloon(). */
+  nettle_hash_update_func *update = (nettle_hash_update_func *) sha384_update;
+  nettle_hash_digest_func *digest = (nettle_hash_digest_func *) sha384_digest;
+  struct sha384_ctx hash_ctx;
+  sha384_init(&hash_ctx);
+  balloon(&hash_ctx, update, digest, SHA384_DIGEST_SIZE, s_cost, t_cost,
+          passwd_length, passwd, salt_length, salt, scratch, dst);
+}
diff --git a/balloon-sha512.c b/balloon-sha512.c
new file mode 100644
index 00000000..f19f8aa0
--- /dev/null
+++ b/balloon-sha512.c
@@ -0,0 +1,55 @@
+/* balloon-sha512.c
+
+   Balloon password-hashing algorithm.
+
+   Copyright (C) 2022 Zoltan Fridrich
+   Copyright (C) 2022 Red Hat, Inc.
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "balloon.h"
+#include "sha2.h"
+
+void
+balloon_sha512(size_t s_cost, size_t t_cost,
+               size_t passwd_length, const uint8_t *passwd,
+               size_t salt_length, const uint8_t *salt,
+               uint8_t *scratch, uint8_t *dst)
+{
+  /* SHA-512 instantiation: every argument is forwarded to balloon(). */
+  nettle_hash_update_func *update = (nettle_hash_update_func *) sha512_update;
+  nettle_hash_digest_func *digest = (nettle_hash_digest_func *) sha512_digest;
+  struct sha512_ctx hash_ctx;
+  sha512_init(&hash_ctx);
+  balloon(&hash_ctx, update, digest, SHA512_DIGEST_SIZE, s_cost, t_cost,
+          passwd_length, passwd, salt_length, salt, scratch, dst);
+}
diff --git a/balloon.c b/balloon.c
new file mode 100644
index 00000000..c744160a
--- /dev/null
+++ b/balloon.c
@@ -0,0 +1,149 @@
+/* balloon.c
+
+   Balloon password-hashing algorithm.
+
+   Copyright (C) 2022 Zoltan Fridrich
+   Copyright (C) 2022 Red Hat, Inc.
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+/* For a description of the algorithm, see:
+ * Boneh, D., Corrigan-Gibbs, H., Schechter, S. (2017, May 12). Balloon Hashing:
+ * A Memory-Hard Function Providing Provable Protection Against Sequential Attacks.
+ * Retrieved Sep 1, 2022, from https://eprint.iacr.org/2016/027.pdf
+ */
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <string.h>
+
+#include "balloon.h"
+#include "macros.h"
+
+#define DELTA 3
+
+/* Feeds the hash the counter CNT as 8 little-endian bytes, then the
+   inputs A and B (each one skipped when NULL or empty), and writes
+   DIGEST_SIZE bytes of digest to DST. */
+static void
+hash(void *ctx,
+     nettle_hash_update_func *update, nettle_hash_digest_func *digest,
+     size_t digest_size, uint64_t cnt,
+     size_t a_len, const uint8_t *a, size_t b_len, const uint8_t *b,
+     uint8_t *dst)
+{
+  uint8_t counter[8];
+  LE_WRITE_UINT64(counter, cnt);
+  update(ctx, sizeof(counter), counter);
+  if (a != NULL && a_len != 0)
+    update(ctx, a_len, a);
+  if (b != NULL && b_len != 0)
+    update(ctx, b_len, b);
+  digest(ctx, digest_size, dst);
+}
+
+/* Hashes the three indices I, J and K, each encoded as 8 little-endian
+   bytes, and writes DIGEST_SIZE bytes of digest to DST. */
+static void
+hash_ints(void *ctx,
+          nettle_hash_update_func *update, nettle_hash_digest_func *digest,
+          size_t digest_size, uint64_t i, uint64_t j, uint64_t k,
+          uint8_t *dst)
+{
+  uint8_t encoded[3 * 8];
+  LE_WRITE_UINT64(encoded + 0, i);
+  LE_WRITE_UINT64(encoded + 8, j);
+  LE_WRITE_UINT64(encoded + 16, k);
+  update(ctx, sizeof(encoded), encoded);
+  digest(ctx, digest_size, dst);
+}
+
+/* Interprets BLOCK as a LENGTH-byte integer stored in little endian
+ * format and returns its value modulo MOD, which must be non-zero.
+ * The accumulator is 64 bits wide, so the shift below cannot wrap for
+ * any MOD up to 2^56, also when size_t is only 32 bits.
+ */
+static size_t
+block_to_int(size_t length, const uint8_t *block, size_t mod)
+{
+  uint64_t r = 0;
+  size_t i = length;
+  while (i--)
+    r = ((r << 8) + block[i]) % mod;
+  return (size_t) r;
+}
+
+/* Generic Balloon.  SCRATCH is used as one temporary block followed by
+   S_COST buffer blocks of DIGEST_SIZE bytes each.  The first buffer
+   block is always written, so S_COST must be at least 1. */
+void
+balloon(void *hash_ctx,
+        nettle_hash_update_func *update, nettle_hash_digest_func *digest,
+        size_t digest_size, size_t s_cost, size_t t_cost,
+        size_t passwd_length, const uint8_t *passwd,
+        size_t salt_length, const uint8_t *salt,
+        uint8_t *scratch, uint8_t *dst)
+{
+  const size_t bs = digest_size;
+  uint8_t *const tmp = scratch;
+  uint8_t *const buf = scratch + bs;
+  size_t round, m, n, counter = 0;
+  /* Expand: fill the buffer with a hash chain seeded by passwd and salt. */
+  hash(hash_ctx, update, digest, bs,
+       counter++, passwd_length, passwd, salt_length, salt, buf);
+  for (m = 1; m < s_cost; m++)
+    hash(hash_ctx, update, digest, bs,
+         counter++, bs, buf + (m - 1) * bs, 0, NULL, buf + m * bs);
+
+  /* Mix: each block absorbs its predecessor and DELTA hash-selected blocks. */
+  for (round = 0; round < t_cost; round++)
+    for (m = 0; m < s_cost; m++)
+      {
+        uint8_t *const cur = buf + m * bs;
+        hash(hash_ctx, update, digest, bs, counter++,
+             bs, buf + (m ? m - 1 : s_cost - 1) * bs, bs, cur, cur);
+        for (n = 0; n < DELTA; n++)
+          {
+            hash_ints(hash_ctx, update, digest, bs, round, m, n, tmp);
+            hash(hash_ctx, update, digest, bs,
+                 counter++, salt_length, salt, bs, tmp, tmp);
+            hash(hash_ctx, update, digest, bs, counter++, bs, cur,
+                 bs, buf + block_to_int(bs, tmp, s_cost) * bs, cur);
+          }
+      }
+  /* Extract: the last buffer block is the result. */
+  memcpy(dst, buf + (s_cost - 1) * bs, bs);
+}
+
+size_t
+balloon_itch(size_t digest_size, size_t s_cost)
+{
+  return digest_size + s_cost * digest_size; /* temporary block + buffer */
+}
diff --git a/balloon.h b/balloon.h
new file mode 100644
index 00000000..9c021925
--- /dev/null
+++ b/balloon.h
@@ -0,0 +1,98 @@
+/* balloon.h
+
+   Balloon password-hashing algorithm.
+
+   Copyright (C) 2022 Zoltan Fridrich
+   Copyright (C) 2022 Red Hat, Inc.
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+/* For a description of the algorithm, see:
+ * Boneh, D., Corrigan-Gibbs, H., Schechter, S. (2017, May 12). Balloon Hashing:
+ * A Memory-Hard Function Providing Provable Protection Against Sequential Attacks.
+ * Retrieved Sep 1, 2022, from https://eprint.iacr.org/2016/027.pdf
+ */
+
+#ifndef NETTLE_BALLOON_H_INCLUDED
+#define NETTLE_BALLOON_H_INCLUDED
+
+#include "nettle-types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Name mangling */
+#define balloon nettle_balloon
+#define balloon_itch nettle_balloon_itch
+#define balloon_sha1 nettle_balloon_sha1
+#define balloon_sha256 nettle_balloon_sha256
+#define balloon_sha384 nettle_balloon_sha384
+#define balloon_sha512 nettle_balloon_sha512
+
+void
+balloon(void *hash_ctx,
+        nettle_hash_update_func *update,
+        nettle_hash_digest_func *digest,
+        size_t digest_size, size_t s_cost, size_t t_cost, /* blocks, rounds */
+        size_t passwd_length, const uint8_t *passwd,
+        size_t salt_length, const uint8_t *salt,
+        uint8_t *scratch, uint8_t *dst); /* dst: digest_size bytes */
+
+size_t
+balloon_itch(size_t digest_size, size_t s_cost); /* scratch size, bytes */
+
+void
+balloon_sha1(size_t s_cost, size_t t_cost,
+             size_t passwd_length, const uint8_t *passwd,
+             size_t salt_length, const uint8_t *salt,
+             uint8_t *scratch, uint8_t *dst);
+
+void
+balloon_sha256(size_t s_cost, size_t t_cost,
+               size_t passwd_length, const uint8_t *passwd,
+               size_t salt_length, const uint8_t *salt,
+               uint8_t *scratch, uint8_t *dst);
+
+void
+balloon_sha384(size_t s_cost, size_t t_cost,
+               size_t passwd_length, const uint8_t *passwd,
+               size_t salt_length, const uint8_t *salt,
+               uint8_t *scratch, uint8_t *dst);
+
+void
+balloon_sha512(size_t s_cost, size_t t_cost,
+               size_t passwd_length, const uint8_t *passwd,
+               size_t salt_length, const uint8_t *salt,
+               uint8_t *scratch, uint8_t *dst);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* NETTLE_BALLOON_H_INCLUDED */
diff --git a/block-internal.h b/block-internal.h
index d7b0c315..e9c26ff6 100644
--- a/block-internal.h
+++ b/block-internal.h
@@ -40,6 +40,7 @@
 #include <assert.h>
 
 #include "nettle-types.h"
+#include "bswap-internal.h"
 #include "memxor.h"
 
 static inline void
@@ -197,4 +198,15 @@ block16_mulx_ghash (union nettle_block16 *r,
 }
 #endif /* ! WORDS_BIGENDIAN */
 
+/* Stores in R the 16 bytes of X in reversed order.  Both halves of X
+   are read before R is written, so R and X may be the same block.  */
+static inline void
+block16_bswap (union nettle_block16 *r, const union nettle_block16 *x)
+{
+  uint64_t second = nettle_bswap64 (x->u64[0]);
+  uint64_t first = nettle_bswap64 (x->u64[1]);
+  r->u64[0] = first;
+  r->u64[1] = second;
+}
+
 #endif /* NETTLE_BLOCK_INTERNAL_H_INCLUDED */
diff --git a/blowfish-bcrypt.c b/blowfish-bcrypt.c
index 800d1468..08b1e32e 100644
--- a/blowfish-bcrypt.c
+++ b/blowfish-bcrypt.c
@@ -42,7 +42,7 @@
 #include "blowfish.h"
 #include "blowfish-internal.h"
 #include "base64.h"
-
+#include "bswap-internal.h"
 #include "macros.h"
 
 #define CRYPTPLEN 7
@@ -149,19 +149,16 @@ static uint32_t magic_w[6] = {
   0x64657253, 0x63727944, 0x6F756274
 };
 
-/* conflicts with OpenBSD's swap32 macro */
-#undef swap32
-
-static void swap32(uint32_t *x, int count)
+#if WORDS_BIGENDIAN
+#define bswap32_if_le(x, n)
+#else
+static void bswap32_if_le (uint32_t *x, unsigned n)
 {
-#if !WORDS_BIGENDIAN
-  do {
-    uint32_t tmp = *x;
-    tmp = (tmp << 16) | (tmp >> 16);
-    *x++ = ((tmp & 0x00FF00FF) << 8) | ((tmp >> 8) & 0x00FF00FF);
-  } while (--count);
-#endif
+  unsigned i;
+  for (i = 0; i < n; i++)
+    x[i] = nettle_bswap32 (x[i]);
 }
+#endif
 
 static void set_xkey(size_t lenkey, const uint8_t *key,
                      bf_key expanded, bf_key initial,
@@ -343,7 +340,7 @@ static int ibcrypt(uint8_t *dst,
   else if (lenscheme < HASHOFFSET)
     return 0;
   memcpy(psalt, data.binary.salt, BLOWFISH_BCRYPT_BINSALT_SIZE);
-  swap32(data.binary.salt, 4);
+  bswap32_if_le (data.binary.salt, 4);
 
   if (log2rounds < minlog2rounds || log2rounds > 31)
     return 0;
@@ -448,7 +445,7 @@ static int ibcrypt(uint8_t *dst,
   dst = (uint8_t*)
         encode_radix64((char*) dst, BLOWFISH_BCRYPT_BINSALT_SIZE, psalt) - 1;
 
-  swap32(data.binary.output, 6);
+  bswap32_if_le (data.binary.output, 6);
 /* This has to be bug-compatible with the original implementation, so
    only encode 23 of the 24 bytes. */
   encode_radix64((char*) dst, 23, (uint8_t *) data.binary.output);
diff --git a/bswap-internal.h b/bswap-internal.h
new file mode 100644
index 00000000..b9923f99
--- /dev/null
+++ b/bswap-internal.h
@@ -0,0 +1,77 @@
+/* bswap-internal.h
+
+   Copyright (C) 2022 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#ifndef NETTLE_BSWAP_INTERNAL_H_INCLUDED
+#define NETTLE_BSWAP_INTERNAL_H_INCLUDED
+
+#include "nettle-types.h"
+
+/* Note that these definitions depend on config.h, which should be
+   included first. */
+
+#if HAVE_BUILTIN_BSWAP64
+/* Compiler builtins; bswap32 is assumed to exist whenever bswap64 does. */
+#define nettle_bswap64 __builtin_bswap64
+#define nettle_bswap32 __builtin_bswap32
+#else
+/* Portable fallbacks: swap progressively smaller halves. */
+static inline uint64_t
+nettle_bswap64 (uint64_t x)
+{
+  const uint64_t mask16 = UINT64_C (0x0000ffff0000ffff);
+  const uint64_t mask8 = UINT64_C (0x00ff00ff00ff00ff);
+  x = (x << 32) | (x >> 32);
+  x = ((x & mask16) << 16) | ((x >> 16) & mask16);
+  x = ((x & mask8) << 8) | ((x >> 8) & mask8);
+  return x;
+}
+
+static inline uint32_t
+nettle_bswap32 (uint32_t x)
+{
+  x = (x >> 16) | (x << 16);
+  return ((x >> 8) & 0x00FF00FF) | ((x & 0x00FF00FF) << 8);
+}
+#endif
+
+/* bswap64_if_le swaps bytes only on little-endian hosts, and
+   bswap64_if_be only on big-endian hosts; on the other kind of
+   host each of them leaves its argument unchanged.  Both depend on
+   WORDS_BIGENDIAN from config.h. */
+#if WORDS_BIGENDIAN
+# define bswap64_if_le(x) (x)
+# define bswap64_if_be nettle_bswap64
+#else
+# define bswap64_if_le nettle_bswap64
+# define bswap64_if_be(x) (x)
+#endif
+
+#endif /* NETTLE_BSWAP_INTERNAL_H_INCLUDED */
diff --git a/chacha-poly1305.c b/chacha-poly1305.c
index 7a423e1e..ea8b2952 100644
--- a/chacha-poly1305.c
+++ b/chacha-poly1305.c
@@ -97,7 +97,8 @@ static void
 poly1305_update (struct chacha_poly1305_ctx *ctx,
 		 size_t length, const uint8_t *data)
 {
-  MD_UPDATE (ctx, length, data, COMPRESS, (void) 0);
+  ctx->index = _nettle_poly1305_update (&(ctx)->poly1305,
+					ctx->block, ctx->index, length, data);
 }
 
 static void
diff --git a/config.make.in b/config.make.in
index f8e1f74e..6aec7c73 100644
--- a/config.make.in
+++ b/config.make.in
@@ -8,6 +8,7 @@ CCPIC = @CCPIC@
 CPPFLAGS = @CPPFLAGS@
 DEFS = @DEFS@
 LDFLAGS = @LDFLAGS@
+ASM_FLAGS = @ASM_FLAGS@
 LIBS = @LIBS@
 LIBOBJS = @LIBOBJS@
 EMULATOR = @EMULATOR@
diff --git a/configure.ac b/configure.ac
index 73ce5764..92536fb0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -4,7 +4,7 @@ dnl Process this file with autoconf to produce a configure script.
 
 AC_INIT([nettle], [3.8], [nettle-bugs@lists.lysator.liu.se])
 AC_PREREQ(2.61)
-AC_CONFIG_SRCDIR([arcfour.c])
+AC_CONFIG_SRCDIR([nettle-types.h])
 # Needed to stop autoconf from looking for files in parent directories.
 AC_CONFIG_AUX_DIR([.])
 
@@ -121,6 +121,8 @@ AC_ARG_ENABLE(mini-gmp,
   AC_HELP_STRING([--enable-mini-gmp], [Enable mini-gmp, used instead of libgmp.]),,
   [enable_mini_gmp=no])
 
+AC_ARG_VAR(ASM_FLAGS, [Extra flags for processing assembly source files])
+
 if test "x$enable_mini_gmp" = xyes ; then
   NETTLE_USE_MINI_GMP=1
   HOGWEED_EXTRA_SYMBOLS="mpz_*;gmp_*;mpn_*;mp_*;"
@@ -345,7 +347,7 @@ case "$host_cpu" in
       ABI=64
     ])
     ;;
-  *mips*)
+  *mips64*)
     AC_TRY_COMPILE([
 #if defined(__mips64) || defined(__mips64__) || (defined(__sgi) && defined(__LP64__))
 #error 64-bit mips
@@ -598,7 +600,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
 		chacha-core-internal.asm \
 		salsa20-crypt.asm salsa20-core-internal.asm \
 		serpent-encrypt.asm serpent-decrypt.asm \
-		sha1-compress.asm sha256-compress.asm sha512-compress.asm \
+		sha1-compress.asm sha256-compress-n.asm sha512-compress.asm \
 		sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
 
 # Assembler files which generate additional object files if they are used.
@@ -612,10 +614,10 @@ asm_nettle_optional_list="cpuid.asm cpu-facility.asm \
   aes256-encrypt-2.asm aes256-decrypt-2.asm \
   cbc-aes128-encrypt-2.asm cbc-aes192-encrypt-2.asm cbc-aes256-encrypt-2.asm \
   chacha-2core.asm chacha-3core.asm chacha-4core.asm chacha-core-internal-2.asm \
-  poly1305-internal-2.asm \
+  poly1305-blocks.asm poly1305-internal-2.asm \
   ghash-set-key-2.asm ghash-update-2.asm \
   salsa20-2core.asm salsa20-core-internal-2.asm \
-  sha1-compress-2.asm sha256-compress-2.asm \
+  sha1-compress-2.asm sha256-compress-n-2.asm \
   sha3-permute-2.asm sha512-compress-2.asm \
   umac-nh-n-2.asm umac-nh-2.asm"
 
@@ -762,13 +764,15 @@ AH_VERBATIM([HAVE_NATIVE],
 #undef HAVE_NATIVE_poly1305_set_key
 #undef HAVE_NATIVE_poly1305_block
 #undef HAVE_NATIVE_poly1305_digest
+#undef HAVE_NATIVE_poly1305_blocks
+#undef HAVE_NATIVE_fat_poly1305_blocks
 #undef HAVE_NATIVE_ghash_set_key
 #undef HAVE_NATIVE_ghash_update
 #undef HAVE_NATIVE_salsa20_core
 #undef HAVE_NATIVE_salsa20_2core
 #undef HAVE_NATIVE_fat_salsa20_2core
 #undef HAVE_NATIVE_sha1_compress
-#undef HAVE_NATIVE_sha256_compress
+#undef HAVE_NATIVE_sha256_compress_n
 #undef HAVE_NATIVE_sha512_compress
 #undef HAVE_NATIVE_sha3_permute
 #undef HAVE_NATIVE_umac_nh
diff --git a/ecc-curve25519.c b/ecc-curve25519.c
index 56abcf23..539bff22 100644
--- a/ecc-curve25519.c
+++ b/ecc-curve25519.c
@@ -266,6 +266,7 @@ const struct ecc_curve _nettle_curve25519 =
     ecc_p,
     ecc_Bmodp,
     ecc_Bmodp_shifted,
+    ecc_Bm2p,
     NULL,
     ecc_pp1h,
 
@@ -287,6 +288,7 @@ const struct ecc_curve _nettle_curve25519 =
     ecc_q,
     ecc_Bmodq,  
     ecc_mBmodq_shifted, /* Use q - 2^{252} instead. */
+    ecc_Bm2q,
     NULL,
     ecc_qp1h,
 
diff --git a/ecc-curve448.c b/ecc-curve448.c
index 1bd4e11f..daef56cc 100644
--- a/ecc-curve448.c
+++ b/ecc-curve448.c
@@ -220,6 +220,7 @@ const struct ecc_curve _nettle_curve448 =
     ecc_p,
     ecc_Bmodp,
     ecc_Bmodp_shifted,
+    ecc_Bm2p,
     NULL,
     ecc_pp1h,
 
@@ -241,6 +242,7 @@ const struct ecc_curve _nettle_curve448 =
     ecc_q,
     ecc_Bmodq,
     ecc_Bmodq_shifted,
+    ecc_Bm2q,
     NULL,
     ecc_qp1h,
 
diff --git a/ecc-ecdsa-sign.c b/ecc-ecdsa-sign.c
index 4adee1d1..6a41c14c 100644
--- a/ecc-ecdsa-sign.c
+++ b/ecc-ecdsa-sign.c
@@ -46,9 +46,9 @@
 mp_size_t
 ecc_ecdsa_sign_itch (const struct ecc_curve *ecc)
 {
-  /* Needs 3*ecc->p.size + scratch for ecc->mul_g. Currently same for
-     ecc_mul_g. */
-  assert (ecc->p.size + ecc->p.invert_itch <= 3*ecc->p.size + ecc->mul_g_itch);
+  /* Needs 3*ecc->p.size + scratch for ecc_mul_g. */
+  assert (ecc->p.size + ecc->p.invert_itch
+	  <= 3*ecc->p.size + ECC_MUL_G_ITCH (ecc->p.size));
   return ECC_ECDSA_SIGN_ITCH (ecc->p.size);
 }
 
@@ -79,9 +79,9 @@ ecc_ecdsa_sign (const struct ecc_curve *ecc,
      4. s2 <-- (h + z*s1)/k mod q.
   */
 
-  ecc->mul_g (ecc, P, kp, P + 3*ecc->p.size);
+  ecc_mul_g (ecc, P, kp, P + 3*ecc->p.size);
   /* x coordinate only, modulo q */
-  ecc->h_to_a (ecc, 2, rp, P, P + 3*ecc->p.size);
+  ecc_j_to_a (ecc, 2, rp, P, P + 3*ecc->p.size);
 
   /* Invert k, uses up to 7 * ecc->p.size including scratch (for secp384). */
   ecc->q.invert (&ecc->q, kinv, kp, tp);
diff --git a/ecc-ecdsa-verify.c b/ecc-ecdsa-verify.c
index f3b112b0..9e324ea2 100644
--- a/ecc-ecdsa-verify.c
+++ b/ecc-ecdsa-verify.c
@@ -53,8 +53,8 @@ ecdsa_in_range (const struct ecc_curve *ecc, const mp_limb_t *xp)
 mp_size_t
 ecc_ecdsa_verify_itch (const struct ecc_curve *ecc)
 {
-  /* Largest storage need is for the ecc->mul call. */
-  return 5*ecc->p.size + ecc->mul_itch;
+  /* Largest storage need is for the ecc_mul_a call. */
+  return 5*ecc->p.size + ECC_MUL_A_ITCH (ecc->p.size);
 }
 
 /* FIXME: Use faster primitives, not requiring side-channel silence. */
@@ -107,35 +107,23 @@ ecc_ecdsa_verify (const struct ecc_curve *ecc,
   /* u2 = r / s, P2 = u2 * Y */
   ecc_mod_mul_canonical (&ecc->q, u2, rp, sinv, u2);
 
-   /* Total storage: 5*ecc->p.size + ecc->mul_itch */
-  ecc->mul (ecc, P2, u2, pp, u2 + ecc->p.size);
+   /* Total storage: 5*ecc->p.size + ECC_MUL_A_ITCH */
+  ecc_mul_a (ecc, P2, u2, pp, u2 + ecc->p.size);
 
   /* u = 0 can happen only if h = 0 or h = q, which is extremely
      unlikely. */
   if (!mpn_zero_p (u1, ecc->p.size))
     {
-      /* Total storage: 7*ecc->p.size + ecc->mul_g_itch (ecc->p.size) */
-      ecc->mul_g (ecc, P1, u1, P1 + 3*ecc->p.size);
-
-      /* NOTE: ecc_add_jjj and/or ecc_j_to_a will produce garbage in
-	 case u1 G = +/- u2 V. However, anyone who gets his or her
-	 hands on a signature where this happens during verification,
-	 can also get the private key as z = +/- u1 / u_2 (mod q). And
-	 then it doesn't matter very much if verification of
-	 signatures with that key succeeds or fails.
-
-	 u1 G = - u2 V can never happen for a correctly generated
-	 signature, since it implies k = 0.
-
-	 u1 G = u2 V is possible, if we are unlucky enough to get h /
-	 s_1 = z. Hitting that is about as unlikely as finding the
-	 private key by guessing.
-       */
-      /* Total storage: 6*ecc->p.size + ecc->add_hhh_itch */
-      ecc->add_hhh (ecc, P2, P2, P1, P1 + 3*ecc->p.size);
+      /* Total storage: 7*ecc->p.size + ECC_MUL_G_ITCH */
+      ecc_mul_g (ecc, P1, u1, P1 + 3*ecc->p.size);
+
+      /* Total storage: 6*ecc->p.size + ECC_ADD_JJJ_ITCH */
+      if (!ecc_nonsec_add_jjj (ecc, P2, P2, P1, P1 + 3*ecc->p.size))
+	/* Infinity point, not a valid signature. */
+	return 0;
     }
   /* x coordinate only, modulo q */
-  ecc->h_to_a (ecc, 2, P1, P2, P1 + 3*ecc->p.size);
+  ecc_j_to_a (ecc, 2, P1, P2, P1 + 3*ecc->p.size);
 
   return (mpn_cmp (rp, P1, ecc->p.size) == 0);
 #undef P2
diff --git a/ecc-gost-gc256b.c b/ecc-gost-gc256b.c
index 0cf753e4..df9cbb58 100644
--- a/ecc-gost-gc256b.c
+++ b/ecc-gost-gc256b.c
@@ -71,6 +71,7 @@ const struct ecc_curve _nettle_gost_gc256b =
     ecc_p,
     ecc_Bmodp,
     ecc_Bmodp_shifted,
+    ecc_Bm2p,
     ecc_redc_ppm1,
 
     ecc_pp1h,
@@ -92,6 +93,7 @@ const struct ecc_curve _nettle_gost_gc256b =
     ecc_q,
     ecc_Bmodq,
     ecc_Bmodq_shifted,
+    ecc_Bm2q,
     NULL,
     ecc_qp1h,
 
diff --git a/ecc-gost-gc512a.c b/ecc-gost-gc512a.c
index 338ed001..3807b57e 100644
--- a/ecc-gost-gc512a.c
+++ b/ecc-gost-gc512a.c
@@ -71,6 +71,7 @@ const struct ecc_curve _nettle_gost_gc512a =
     ecc_p,
     ecc_Bmodp,
     ecc_Bmodp_shifted,
+    ecc_Bm2p,
     ecc_redc_ppm1,
 
     ecc_pp1h,
@@ -92,6 +93,7 @@ const struct ecc_curve _nettle_gost_gc512a =
     ecc_q,
     ecc_Bmodq,
     ecc_Bmodq_shifted,
+    ecc_Bm2q,
     NULL,
     ecc_qp1h,
 
diff --git a/ecc-gostdsa-sign.c b/ecc-gostdsa-sign.c
index c924122c..491a2281 100644
--- a/ecc-gostdsa-sign.c
+++ b/ecc-gostdsa-sign.c
@@ -45,8 +45,7 @@
 mp_size_t
 ecc_gostdsa_sign_itch (const struct ecc_curve *ecc)
 {
-  /* Needs 3*ecc->p.size + scratch for ecc->mul_g. Currently same for
-     ecc_mul_g. */
+  /* Needs 3*ecc->p.size + scratch for ecc_mul_g. */
   return ECC_GOSTDSA_SIGN_ITCH (ecc->p.size);
 }
 
@@ -75,9 +74,9 @@ ecc_gostdsa_sign (const struct ecc_curve *ecc,
      4. s <-- (r*z + k*h) mod q.
   */
 
-  ecc->mul_g (ecc, P, kp, P + 3*ecc->p.size);
+  ecc_mul_g (ecc, P, kp, P + 3*ecc->p.size);
   /* x coordinate only, modulo q */
-  ecc->h_to_a (ecc, 2, rp, P, P + 3*ecc->p.size);
+  ecc_j_to_a (ecc, 2, rp, P, P + 3*ecc->p.size);
 
   /* Process hash digest */
   gost_hash (&ecc->q, hp, length, digest);
diff --git a/ecc-gostdsa-verify.c b/ecc-gostdsa-verify.c
index fcdd4644..0570af7e 100644
--- a/ecc-gostdsa-verify.c
+++ b/ecc-gostdsa-verify.c
@@ -52,8 +52,8 @@ ecdsa_in_range (const struct ecc_curve *ecc, const mp_limb_t *xp)
 mp_size_t
 ecc_gostdsa_verify_itch (const struct ecc_curve *ecc)
 {
-  /* Largest storage need is for the ecc->mul call. */
-  return 5*ecc->p.size + ecc->mul_itch;
+  /* Largest storage need is for the ecc_mul_a call. */
+  return 5*ecc->p.size + ECC_MUL_A_ITCH (ecc->p.size);
 }
 
 /* FIXME: Use faster primitives, not requiring side-channel silence. */
@@ -108,17 +108,18 @@ ecc_gostdsa_verify (const struct ecc_curve *ecc,
   mpn_sub_n (hp, ecc->q.m, rp, ecc->p.size);
   ecc_mod_mul_canonical (&ecc->q, z2, hp, vp, z2);
 
-   /* Total storage: 5*ecc->p.size + ecc->mul_itch */
-  ecc->mul (ecc, P2, z2, pp, z2 + ecc->p.size);
+   /* Total storage: 5*ecc->p.size + ECC_MUL_A_ITCH */
+  ecc_mul_a (ecc, P2, z2, pp, z2 + ecc->p.size);
 
-  /* Total storage: 7*ecc->p.size + ecc->mul_g_itch (ecc->p.size) */
-  ecc->mul_g (ecc, P1, z1, P1 + 3*ecc->p.size);
+  /* Total storage: 7*ecc->p.size + ECC_MUL_G_ITCH */
+  ecc_mul_g (ecc, P1, z1, P1 + 3*ecc->p.size);
 
-  /* Total storage: 6*ecc->p.size + ecc->add_hhh_itch */
-  ecc->add_hhh (ecc, P1, P1, P2, P1 + 3*ecc->p.size);
+  /* Total storage: 6*ecc->p.size + ECC_ADD_JJJ_ITCH */
+  if (!ecc_nonsec_add_jjj (ecc, P1, P1, P2, P1 + 3*ecc->p.size))
+    return 0;
 
   /* x coordinate only, modulo q */
-  ecc->h_to_a (ecc, 2, P2, P1, P1 + 3*ecc->p.size);
+  ecc_j_to_a (ecc, 2, P2, P1, P1 + 3*ecc->p.size);
 
   return (mpn_cmp (rp, P2, ecc->p.size) == 0);
 #undef P2
diff --git a/ecc-internal.h b/ecc-internal.h
index 2ea553b5..be02de5f 100644
--- a/ecc-internal.h
+++ b/ecc-internal.h
@@ -66,6 +66,7 @@
 #define ecc_dup_jj _nettle_ecc_dup_jj
 #define ecc_add_jja _nettle_ecc_add_jja
 #define ecc_add_jjj _nettle_ecc_add_jjj
+#define ecc_nonsec_add_jjj _nettle_ecc_nonsec_add_jjj
 #define ecc_dup_eh _nettle_ecc_dup_eh
 #define ecc_add_eh _nettle_ecc_add_eh
 #define ecc_add_ehh _nettle_ecc_add_ehh
@@ -80,7 +81,6 @@
 #define cnd_copy _nettle_cnd_copy
 #define sec_add_1 _nettle_sec_add_1
 #define sec_sub_1 _nettle_sec_sub_1
-#define sec_tabselect _nettle_sec_tabselect
 #define sec_modinv _nettle_sec_modinv
 #define curve25519_eh_to_x _nettle_curve25519_eh_to_x
 #define curve448_eh_to_x _nettle_curve448_eh_to_x
@@ -174,8 +174,14 @@ struct ecc_modulo
   /* B^size mod m. Expected to have at least 32 leading zeros
      (equality for secp_256r1). */
   const mp_limb_t *B;
-  /* 2^{bit_size} - m, same value as above, but shifted. */
+  /* 2^{bit_size} - m. When different from B above, for numbers of
+     interest, usually B has trailing zeros and this is B shifted
+     right. */
   const mp_limb_t *B_shifted;
+  /* For ecc_mod_sub: B^size - 2m, if that doesn't underflow.
+     Otherwise, same as B */
+  const mp_limb_t *Bm2m;
+
   /* m +/- 1, for redc, excluding redc_size low limbs. */
   const mp_limb_t *redc_mpm1;
   /* (m+1)/2 */
@@ -258,6 +264,8 @@ ecc_mod_equal_p (const struct ecc_modulo *m, const mp_limb_t *a,
 void
 ecc_mod_add (const struct ecc_modulo *m, mp_limb_t *rp,
 	     const mp_limb_t *ap, const mp_limb_t *bp);
+
+/* If inputs are in the range 0 <= a, b < 2m, then so is the output. */
 void
 ecc_mod_sub (const struct ecc_modulo *m, mp_limb_t *rp,
 	     const mp_limb_t *ap, const mp_limb_t *bp);
@@ -382,6 +390,14 @@ ecc_add_jjj (const struct ecc_curve *ecc,
 	     mp_limb_t *r, const mp_limb_t *p, const mp_limb_t *q,
 	     mp_limb_t *scratch);
 
+/* Variant that handles the checks for the special cases P = ±Q.
+   Returns 1 on success, 0 if result is infinite. Not side-channel
+   silent, so must not be used with secret inputs. */
+int
+ecc_nonsec_add_jjj (const struct ecc_curve *ecc,
+		    mp_limb_t *r, const mp_limb_t *p, const mp_limb_t *q,
+		    mp_limb_t *scratch);
+
 /* Point doubling on a twisted Edwards curve, with homogeneous
    cooordinates. */
 void
@@ -458,11 +474,6 @@ mp_limb_t
 sec_sub_1 (mp_limb_t *rp, mp_limb_t *ap, mp_size_t n, mp_limb_t b);
 
 void
-sec_tabselect (mp_limb_t *rp, mp_size_t rn,
-	       const mp_limb_t *table, unsigned tn,
-	       unsigned k);
-
-void
 curve25519_eh_to_x (mp_limb_t *xp, const mp_limb_t *p,
 		    mp_limb_t *scratch);
 
diff --git a/ecc-mod-arith.c b/ecc-mod-arith.c
index 310cbb1d..d0137864 100644
--- a/ecc-mod-arith.c
+++ b/ecc-mod-arith.c
@@ -85,7 +85,20 @@ ecc_mod_sub (const struct ecc_modulo *m, mp_limb_t *rp,
 {
   mp_limb_t cy;
   cy = mpn_sub_n (rp, ap, bp, m->size);
-  cy = mpn_cnd_sub_n (cy, rp, rp, m->B, m->size);
+  /* The adjustments for this function work differently depending on
+     the value of the most significant bit of m.
+
+     If m has a most significant bit of zero, then the first
+     adjustment step conditionally adds 2m. If in addition, inputs are
+     in the 0 <= a,b < 2m range, then the first adjustment guarantees
+     that result is in that same range. The second adjustment step is
+     needed only if b > 2m, it then ensures output is correct modulo
+     m, but nothing more.
+
+     If m has a most significant bit of one, Bm2m and B are the same,
+     and this function works analogously to ecc_mod_add.
+   */
+  cy = mpn_cnd_sub_n (cy, rp, rp, m->Bm2m, m->size);
   cy = mpn_cnd_sub_n (cy, rp, rp, m->B, m->size);
   assert (cy == 0);  
 }
diff --git a/ecc-mul-a-eh.c b/ecc-mul-a-eh.c
index 1eb3efcc..980fec3f 100644
--- a/ecc-mul-a-eh.c
+++ b/ecc-mul-a-eh.c
@@ -140,7 +140,7 @@ ecc_mul_a_eh (const struct ecc_curve *ecc,
 
   assert (bits < TABLE_SIZE);
 
-  sec_tabselect (r, 3*ecc->p.size, table, TABLE_SIZE, bits);
+  mpn_sec_tabselect (r, table, 3*ecc->p.size, TABLE_SIZE, bits);
 
   for (;;)
     {
@@ -166,7 +166,7 @@ ecc_mul_a_eh (const struct ecc_curve *ecc,
 	ecc->dup (ecc, r, r, scratch_out);
 
       bits &= TABLE_MASK;
-      sec_tabselect (tp, 3*ecc->p.size, table, TABLE_SIZE, bits);
+      mpn_sec_tabselect (tp, table, 3*ecc->p.size, TABLE_SIZE, bits);
       ecc->add_hhh (ecc, r, r, tp, scratch_out);
     }
 #undef table
diff --git a/ecc-mul-a.c b/ecc-mul-a.c
index cb9c7d41..8e1355eb 100644
--- a/ecc-mul-a.c
+++ b/ecc-mul-a.c
@@ -144,7 +144,7 @@ ecc_mul_a (const struct ecc_curve *ecc,
 
   assert (bits < TABLE_SIZE);
 
-  sec_tabselect (r, 3*ecc->p.size, table, TABLE_SIZE, bits);
+  mpn_sec_tabselect (r, table, 3*ecc->p.size, TABLE_SIZE, bits);
   is_zero = (bits == 0);
 
   for (;;)
@@ -171,7 +171,7 @@ ecc_mul_a (const struct ecc_curve *ecc,
 	ecc_dup_jj (ecc, r, r, scratch_out);
 
       bits &= TABLE_MASK;
-      sec_tabselect (tp, 3*ecc->p.size, table, TABLE_SIZE, bits);
+      mpn_sec_tabselect (tp, table, 3*ecc->p.size, TABLE_SIZE, bits);
       cnd_copy (is_zero, r, tp, 3*ecc->p.size);
       ecc_add_jjj (ecc, tp, tp, r, scratch_out);
 
diff --git a/ecc-mul-g-eh.c b/ecc-mul-g-eh.c
index 8b3ca8f8..57df1c6d 100644
--- a/ecc-mul-g-eh.c
+++ b/ecc-mul-g-eh.c
@@ -88,10 +88,10 @@ ecc_mul_g_eh (const struct ecc_curve *ecc, mp_limb_t *r,
 	      shift = bit_index % GMP_NUMB_BITS;
 	      bits = (bits << 1) | ((np[limb_index] >> shift) & 1);
 	    }
-	  sec_tabselect (tp, 2*ecc->p.size,
-			 (ecc->pippenger_table
-			  + (2*ecc->p.size * (mp_size_t) j << c)),
-			 1<<c, bits);
+	  mpn_sec_tabselect (tp,
+			     (ecc->pippenger_table
+			      + (2*ecc->p.size * (mp_size_t) j << c)),
+			      2*ecc->p.size, 1<<c, bits);
 
 	  ecc->add_hh (ecc, r, r, tp, scratch_out);
 	}
diff --git a/ecc-mul-g.c b/ecc-mul-g.c
index dcc7c3ea..677a37e7 100644
--- a/ecc-mul-g.c
+++ b/ecc-mul-g.c
@@ -88,10 +88,10 @@ ecc_mul_g (const struct ecc_curve *ecc, mp_limb_t *r,
 	      shift = bit_index % GMP_NUMB_BITS;
 	      bits = (bits << 1) | ((np[limb_index] >> shift) & 1);
 	    }
-	  sec_tabselect (tp, 2*ecc->p.size,
-			 (ecc->pippenger_table
-			  + (2*ecc->p.size * (mp_size_t) j << c)),
-			 1<<c, bits);
+	  mpn_sec_tabselect (tp,
+			     (ecc->pippenger_table
+			      + (2*ecc->p.size * (mp_size_t) j << c)),
+			      2*ecc->p.size, 1<<c, bits);
 	  cnd_copy (is_zero, r, tp, 2*ecc->p.size);
 	  cnd_copy (is_zero, r + 2*ecc->p.size, ecc->unit, ecc->p.size);
 	  
diff --git a/ecc-nonsec-add-jjj.c b/ecc-nonsec-add-jjj.c
new file mode 100644
index 00000000..439c0a52
--- /dev/null
+++ b/ecc-nonsec-add-jjj.c
@@ -0,0 +1,162 @@
+/* ecc-nonsec-add-jjj.c
+
+   Copyright (C) 2013, 2022 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+/* Development of Nettle's ECC support was funded by the .SE Internet Fund. */
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "ecc.h"
+#include "ecc-internal.h"
+
+/* Similar to ecc_add_jjj, but checks if x coordinates are equal (H =
+   0) below, and if so, performs doubling if also y coordinates are
+   equal, or returns 0 (failure) indicating that the result is the
+   infinity point. */
+int
+ecc_nonsec_add_jjj (const struct ecc_curve *ecc,
+		    mp_limb_t *r, const mp_limb_t *p, const mp_limb_t *q,
+		    mp_limb_t *scratch)
+{
+#define x1 p
+#define y1 (p + ecc->p.size)
+#define z1 (p + 2*ecc->p.size)
+
+#define x2 q
+#define y2 (q + ecc->p.size)
+#define z2 (q + 2*ecc->p.size)
+
+#define x3 r
+#define y3 (r + ecc->p.size)
+#define z3 (r + 2*ecc->p.size)
+  /* Formulas, from djb,
+     http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl:
+
+     Computation		Operation	Live variables
+
+      Z1Z1 = Z1^2		sqr		Z1Z1
+      Z2Z2 = Z2^2		sqr		Z1Z1, Z2Z2
+      U1 = X1*Z2Z2		mul		Z1Z1, Z2Z2, U1
+      U2 = X2*Z1Z1		mul		Z1Z1, Z2Z2, U1, U2
+      H = U2-U1					Z1Z1, Z2Z2, U1, H
+      Z3 = ((Z1+Z2)^2-Z1Z1-Z2Z2)*H sqr, mul	Z1Z1, Z2Z2, U1, H
+      S1 = Y1*Z2*Z2Z2		mul, mul	Z1Z1, U1, H, S1
+      S2 = Y2*Z1*Z1Z1		mul, mul	U1, H, S1, S2
+      W = 2*(S2-S1)	(djb: r)		U1, H, S1, W
+      I = (2*H)^2		sqr		U1, H, S1, W, I
+      J = H*I			mul		U1, S1, W, J, V
+      V = U1*I			mul		S1, W, J, V
+      X3 = W^2-J-2*V		sqr		S1, W, J, V
+      Y3 = W*(V-X3)-2*S1*J	mul, mul
+  */
+
+#define h scratch
+#define z1z1 (scratch + ecc->p.size)
+#define z2z2 z1z1
+#define z1z2 (scratch + 2*ecc->p.size)
+
+#define w (scratch + ecc->p.size)
+#define i (scratch + 2*ecc->p.size)
+#define j h
+#define v i
+
+#define tp  (scratch + 3*ecc->p.size)
+
+  ecc_mod_sqr (&ecc->p, z2z2, z2, tp);		/* z2z2 */
+  /* Store u1 at x3 */
+  ecc_mod_mul (&ecc->p, x3, x1, z2z2, tp);	/* z2z2 */
+
+  ecc_mod_add (&ecc->p, z1z2, z1, z2);		/* z2z2, z1z2 */
+  ecc_mod_sqr (&ecc->p, z1z2, z1z2, tp);
+  ecc_mod_sub (&ecc->p, z1z2, z1z2, z2z2);	/* z2z2, z1z2 */
+
+  /* Do s1 early, store at y3 */
+  ecc_mod_mul (&ecc->p, z2z2, z2z2, z2, tp);	/* z2z2, z1z2 */
+  ecc_mod_mul (&ecc->p, y3, z2z2, y1, tp);	/* z1z2 */
+
+  ecc_mod_sqr (&ecc->p, z1z1, z1, tp);		/* z1z1, z1z2 */
+  ecc_mod_sub (&ecc->p, z1z2, z1z2, z1z1);
+  ecc_mod_mul (&ecc->p, h, x2, z1z1, tp);	/* z1z1, z1z2, h */
+  ecc_mod_sub (&ecc->p, h, h, x3);
+
+  /* z1^3 */
+  ecc_mod_mul (&ecc->p, z1z1, z1z1, z1, tp);
+
+  /* z3 <-- h z1 z2 delayed until now, since that may clobber z1. */
+  ecc_mod_mul (&ecc->p, z3, z1z2, h, tp);	/* z1z1, h */
+  /* w = 2 (s2 - s1) */
+  ecc_mod_mul (&ecc->p, w, z1z1, y2, tp);	/* h, w */
+  ecc_mod_sub (&ecc->p, w, w, y3);
+
+  /* Note that use of ecc_mod_zero_p depends on 0 <= h,w < 2p. */
+  if (ecc_mod_zero_p (&ecc->p, h))
+    {
+      /* X1 == X2 */
+      if (ecc_mod_zero_p (&ecc->p, w)) {
+	/* Y1 == Y2. Do point duplication. Note that q input is
+	   unclobbered, and that scratch need is smaller. Implies some
+	   unnecessary recomputation, but performance is not so
+	   important for this very unlikely corner case. */
+	ecc_dup_jj (ecc, r, q, scratch);
+	return 1;
+      }
+
+      /* We must have Y1 == -Y2, and then the result is the infinity
+	 point. */
+      mpn_zero (r, 3*ecc->p.size);
+      return 0;
+    }
+  ecc_mod_add (&ecc->p, w, w, w);
+
+  /* i = (2h)^2 */
+  ecc_mod_add (&ecc->p, i, h, h);		/* h, w, i */
+  ecc_mod_sqr (&ecc->p, i, i, tp);
+
+  /* j and h can overlap */
+  ecc_mod_mul (&ecc->p, j, h, i, tp);		/* j, w, i */
+
+  /* v and i can overlap */
+  ecc_mod_mul (&ecc->p, v, x3, i, tp);		/* j, w, v */
+
+  /* x3 <-- w^2 - j - 2v */
+  ecc_mod_sqr (&ecc->p, x3, w, tp);
+  ecc_mod_sub (&ecc->p, x3, x3, j);
+  ecc_mod_submul_1 (&ecc->p, x3, v, 2);
+
+  /* y3 <-- w (v - x3) - 2 s1 j */
+  ecc_mod_mul (&ecc->p, j, j, y3, tp);
+  ecc_mod_sub (&ecc->p, v, v, x3);
+  ecc_mod_mul (&ecc->p, y3, v, w, tp);
+  ecc_mod_submul_1 (&ecc->p, y3, j, 2);
+
+  return 1;
+}
diff --git a/ecc-secp192r1.c b/ecc-secp192r1.c
index 391ba528..4a07bca3 100644
--- a/ecc-secp192r1.c
+++ b/ecc-secp192r1.c
@@ -247,7 +247,8 @@ const struct ecc_curve _nettle_secp_192r1 =
 
     ecc_p,
     ecc_Bmodp,
-    ecc_Bmodp_shifted,    
+    ecc_Bmodp_shifted,
+    ecc_Bm2p,
     ecc_redc_ppm1,
     ecc_pp1h,
 
@@ -269,6 +270,7 @@ const struct ecc_curve _nettle_secp_192r1 =
     ecc_q,
     ecc_Bmodq,
     ecc_Bmodq_shifted,
+    ecc_Bm2q,
     NULL,
     ecc_qp1h,
 
diff --git a/ecc-secp224r1.c b/ecc-secp224r1.c
index bb321298..b2a335ec 100644
--- a/ecc-secp224r1.c
+++ b/ecc-secp224r1.c
@@ -223,6 +223,7 @@ const struct ecc_curve _nettle_secp_224r1 =
     ecc_p,
     ecc_Bmodp,
     ecc_Bmodp_shifted,
+    ecc_Bm2p,
     ecc_redc_ppm1,
     ecc_pp1h,
 
@@ -244,6 +245,7 @@ const struct ecc_curve _nettle_secp_224r1 =
     ecc_q,
     ecc_Bmodq,
     ecc_Bmodq_shifted,
+    ecc_Bm2q,
     NULL,
     ecc_qp1h,
 
diff --git a/ecc-secp256r1.c b/ecc-secp256r1.c
index e1a14b90..4848dfe3 100644
--- a/ecc-secp256r1.c
+++ b/ecc-secp256r1.c
@@ -343,6 +343,7 @@ const struct ecc_curve _nettle_secp_256r1 =
     ecc_p,
     ecc_Bmodp,
     ecc_Bmodp_shifted,
+    ecc_Bm2p,
     ecc_redc_ppm1,
     ecc_pp1h,
 
@@ -364,6 +365,7 @@ const struct ecc_curve _nettle_secp_256r1 =
     ecc_q,
     ecc_Bmodq,
     ecc_Bmodq_shifted,
+    ecc_Bm2q,
     NULL,
     ecc_qp1h,
 
diff --git a/ecc-secp384r1.c b/ecc-secp384r1.c
index 39716dff..abac5e6d 100644
--- a/ecc-secp384r1.c
+++ b/ecc-secp384r1.c
@@ -314,6 +314,7 @@ const struct ecc_curve _nettle_secp_384r1 =
     ecc_p,
     ecc_Bmodp,
     ecc_Bmodp_shifted,
+    ecc_Bm2p,
     ecc_redc_ppm1,
     ecc_pp1h,
 
@@ -335,6 +336,7 @@ const struct ecc_curve _nettle_secp_384r1 =
     ecc_q,
     ecc_Bmodq,
     ecc_Bmodq_shifted,
+    ecc_Bm2q,
     NULL,
     ecc_qp1h,
 
diff --git a/ecc-secp521r1.c b/ecc-secp521r1.c
index 24d0b53a..8ab7b4bf 100644
--- a/ecc-secp521r1.c
+++ b/ecc-secp521r1.c
@@ -169,6 +169,7 @@ const struct ecc_curve _nettle_secp_521r1 =
     ecc_p,
     ecc_Bmodp,
     ecc_Bmodp_shifted,
+    ecc_Bm2p,
     ecc_redc_ppm1,
     ecc_pp1h,
 
@@ -190,6 +191,7 @@ const struct ecc_curve _nettle_secp_521r1 =
     ecc_q,
     ecc_Bmodq,
     ecc_Bmodq_shifted,
+    ecc_Bm2q,
     NULL,
     ecc_qp1h,
 
diff --git a/eccdata.c b/eccdata.c
index a7e7e18a..e0726e8d 100644
--- a/eccdata.c
+++ b/eccdata.c
@@ -71,6 +71,7 @@ struct ecc_curve
 
   /* Prime */
   mpz_t p;
+  /* Curve constant */
   mpz_t b;
 
   /* Curve order */
@@ -626,15 +627,15 @@ ecc_curve_init (struct ecc_curve *ecc, const char *curve)
 
 	   x^2 + y^2 = 1 + (121665/121666) x^2 y^2 (mod p).
 
-	   -x^2 + y^2 = 1 - (121665/121666) x^2 y^2, with p = 2^{255} - 19.
+	 But instead of using this curve, we use a twisted curve, following RFC 7748,
+
+	   -x^2 + y^2 = 1 - (121665/121666) x^2 y^2 (mod p)
+
+         (this is possible because -1 is a square modulo p).
 
 	 The generator is
 	   x = 0x216936d3cd6e53fec0a4e231fdd6dc5c692cc7609525a7b2c9562d608f25d51a
            y = 0x6666666666666666666666666666666666666666666666666666666666666658
-
-	 Also birationally equivalent to the curve25519 Montgomery curve,
-
-	   y^2 = x^3 + 486662 x^2 + x (mod p)
       */
       ecc_curve_init_str (ecc, ECC_TYPE_TWISTED_EDWARDS,
 			  "7fffffffffffffffffffffffffffffff"
@@ -1151,98 +1152,99 @@ output_point (const struct ecc_curve *ecc,
   mpz_clear (t);
 }
 
-static unsigned
-output_modulo (const char *name, const mpz_t x,
-	       unsigned size, unsigned bits_per_limb)
+static void
+string_toupper (char *buf, size_t size, const char *s)
 {
-  mpz_t mod;
-  unsigned bits;
-
-  mpz_init (mod);
-
-  mpz_setbit (mod, bits_per_limb * size);
-  mpz_mod (mod, mod, x);
-
-  bits = mpz_sizeinbase (mod, 2);
-  output_bignum (name, mod, size, bits_per_limb);
-  
-  mpz_clear (mod);
-  return bits;
+  size_t i;
+  for (i = 0; i < size; i++)
+    {
+      buf[i] = toupper ((unsigned char) s[i]); /* negative arg is undefined */
+      if (!buf[i])
+	return;
+    }
+  fprintf (stderr, "string '%s' too large for buffer of size %u.\n",
+	   s, (unsigned) size);
+  abort();
 }
 
 static void
-output_curve (const struct ecc_curve *ecc, unsigned bits_per_limb)
+output_modulo (const char *name, const mpz_t x,
+	       unsigned size, unsigned bits_per_limb)
 {
-  unsigned limb_size = (ecc->bit_size + bits_per_limb - 1)/bits_per_limb;
-  unsigned i;
-  unsigned bits;
-  int redc_limbs;
+  unsigned bit_size;
+  int shift;
+  char buf[20];
   mpz_t t;
-  mpz_t z;
+
+  snprintf (buf, sizeof (buf), "ecc_%s", name);
+  output_bignum (buf, x, size, bits_per_limb);
 
   mpz_init (t);
-  mpz_init (z);
 
-  printf ("/* For NULL. */\n#include <stddef.h>\n");
+  mpz_setbit (t, bits_per_limb * size);
+  mpz_mod (t, t, x);
 
-  printf ("#define ECC_LIMB_SIZE %u\n", limb_size);
-  printf ("#define ECC_PIPPENGER_K %u\n", ecc->pippenger_k);
-  printf ("#define ECC_PIPPENGER_C %u\n", ecc->pippenger_c);
-
-  output_bignum ("ecc_p", ecc->p, limb_size, bits_per_limb);
-  output_bignum ("ecc_b", ecc->b, limb_size, bits_per_limb);
-  output_bignum ("ecc_q", ecc->q, limb_size, bits_per_limb);
+  snprintf (buf, sizeof (buf), "ecc_Bmod%s", name);
+  output_bignum (buf, t, size, bits_per_limb);
   
-  bits = output_modulo ("ecc_Bmodp", ecc->p, limb_size, bits_per_limb);
-  printf ("#define ECC_BMODP_SIZE %u\n",
-	  (bits + bits_per_limb - 1) / bits_per_limb);
-  bits = output_modulo ("ecc_Bmodq", ecc->q, limb_size, bits_per_limb);
-  printf ("#define ECC_BMODQ_SIZE %u\n",
-	  (bits + bits_per_limb - 1) / bits_per_limb);
-  bits = mpz_sizeinbase (ecc->q, 2);
-  if (bits < ecc->bit_size)
+  string_toupper (buf, sizeof (buf), name);
+  printf ("#define ECC_BMOD%s_SIZE %u\n", buf,
+	  (unsigned) ((mpz_sizeinbase (t, 2) + bits_per_limb - 1)
+		      / bits_per_limb));
+
+  bit_size = mpz_sizeinbase (x, 2);
+
+  shift = size * bits_per_limb - bit_size;
+  assert (shift >= 0);
+  if (shift > 0)
     {
-      /* for curve25519, with q = 2^k + q', with a much smaller q' */
-      unsigned mbits;
-      unsigned shift;
+      mpz_set_ui (t, 0);
+      mpz_setbit (t, size * bits_per_limb);
+      mpz_submul_ui (t, x, 2);
 
-      /* Shift to align the one bit at B */
-      shift = bits_per_limb * limb_size + 1 - bits;
-      
-      mpz_set (t, ecc->q);
-      mpz_clrbit (t, bits-1);
-      mbits = mpz_sizeinbase (t, 2);
+      snprintf (buf, sizeof (buf), "ecc_Bm2%s", name);
+      output_bignum (buf, t, size, bits_per_limb);
 
-      /* The shifted value must be a limb smaller than q. */
-      if (mbits + shift + bits_per_limb <= bits)
+      if (bit_size == 253)
 	{
+	  /* For curve25519, with q = 2^k + q', with a much smaller q' */
+	  unsigned mbits;
+	  unsigned shift;
+
+	  /* Shift to align the one bit at B */
+	  shift = bits_per_limb * size + 1 - bit_size;
+
+	  mpz_set (t, x);
+	  mpz_clrbit (t, bit_size-1);
+	  mbits = mpz_sizeinbase (t, 2);
+
+	  /* The shifted value must be a limb smaller than q. */
+	  assert (mbits + shift + bits_per_limb <= bit_size);
+
 	  /* q of the form 2^k + q', with q' a limb smaller */
 	  mpz_mul_2exp (t, t, shift);
-	  output_bignum ("ecc_mBmodq_shifted", t, limb_size, bits_per_limb);
-	}
-    }
+	  snprintf (buf, sizeof (buf), "ecc_mBmod%s_shifted", name);
 
-  if (ecc->bit_size < limb_size * bits_per_limb)
-    {
-      int shift;
+	  output_bignum (buf, t, size, bits_per_limb);
+	}
+      else
+	{
+	  mpz_set_ui (t, 0);
+	  mpz_setbit (t, bit_size);
+	  mpz_sub (t, t, x);
 
-      mpz_set_ui (t, 0);
-      mpz_setbit (t, ecc->bit_size);
-      mpz_sub (t, t, ecc->p);      
-      output_bignum ("ecc_Bmodp_shifted", t, limb_size, bits_per_limb);
+	  snprintf (buf, sizeof (buf), "ecc_Bmod%s_shifted", name);
+	  output_bignum (buf, t, size, bits_per_limb);
 
-      shift = limb_size * bits_per_limb - ecc->bit_size;
-      if (shift > 0)
-	{
 	  /* Check condition for reducing hi limbs. If s is the
 	     normalization shift and n is the bit size (so that s + n
-	     = limb_size * bite_per_limb), then we need
+	     = limb_size * bits_per_limb), then we need
 
-	       (2^n - 1) + (2^s - 1) (2^n - p) < 2p
+	     (2^n - 1) + (2^s - 1) (2^n - p) < 2p
 
 	     or equivalently,
 
-	       2^s (2^n - p) <= p
+	     2^s (2^n - p) <= p
 
 	     To a allow a carry limb to be added in at the same time,
 	     substitute s+1 for s.
@@ -1250,26 +1252,45 @@ output_curve (const struct ecc_curve *ecc, unsigned bits_per_limb)
 	  /* FIXME: For ecdsa verify, we actually need the stricter
 	     inequality < 2 q. */
 	  mpz_mul_2exp (t, t, shift + 1);
-	  if (mpz_cmp (t, ecc->p) > 0)
+	  if (mpz_cmp (t, x) > 0)
 	    {
-	      fprintf (stderr, "Reduction condition failed for %u-bit curve.\n",
-		       ecc->bit_size);
+	      fprintf (stderr, "Reduction condition failed for %u-bit %s.\n",
+		       bit_size, name);
 	      exit (EXIT_FAILURE);
 	    }
 	}
     }
   else
-    printf ("#define ecc_Bmodp_shifted ecc_Bmodp\n");
-
-  if (bits < limb_size * bits_per_limb)
     {
-      mpz_set_ui (t, 0);
-      mpz_setbit (t, bits);
-      mpz_sub (t, t, ecc->q);      
-      output_bignum ("ecc_Bmodq_shifted", t, limb_size, bits_per_limb);      
+      printf ("#define ecc_Bm2%s ecc_Bmod%s\n", name, name);
+      printf ("#define ecc_Bmod%s_shifted ecc_Bmod%s\n", name, name);
     }
-  else
-    printf ("#define ecc_Bmodq_shifted ecc_Bmodq\n");
+
+  mpz_clear (t);
+}
+
+static void
+output_curve (const struct ecc_curve *ecc, unsigned bits_per_limb)
+{
+  unsigned limb_size = (ecc->bit_size + bits_per_limb - 1)/bits_per_limb;
+  unsigned i;
+  int redc_limbs;
+  mpz_t t;
+  mpz_t z;
+
+  mpz_init (t);
+  mpz_init (z);
+
+  printf ("/* For NULL. */\n#include <stddef.h>\n");
+
+  printf ("#define ECC_LIMB_SIZE %u\n", limb_size);
+  printf ("#define ECC_PIPPENGER_K %u\n", ecc->pippenger_k);
+  printf ("#define ECC_PIPPENGER_C %u\n", ecc->pippenger_c);
+
+  output_modulo ("p", ecc->p, limb_size, bits_per_limb);
+  output_modulo ("q", ecc->q, limb_size, bits_per_limb);
+
+  output_bignum ("ecc_b", ecc->b, limb_size, bits_per_limb);
 
   mpz_add_ui (t, ecc->p, 1);
   mpz_fdiv_q_2exp (t, t, 1);
diff --git a/examples/ecc-benchmark.c b/examples/ecc-benchmark.c
index 3ab269c7..7e857f80 100644
--- a/examples/ecc-benchmark.c
+++ b/examples/ecc-benchmark.c
@@ -159,11 +159,17 @@ bench_modq (void *p)
 }
 
 static void
-bench_modinv (void *p)
+bench_pinv (void *p)
 {
   struct ecc_ctx *ctx = (struct ecc_ctx *) p;
   ctx->ecc->p.invert (&ctx->ecc->p, ctx->rp, ctx->ap, ctx->tp);
 }
+static void
+bench_qinv (void *p)
+{
+  struct ecc_ctx *ctx = (struct ecc_ctx *) p; /* time inversion mod q */
+  ctx->ecc->q.invert (&ctx->ecc->q, ctx->rp, ctx->ap, ctx->tp);
+}
 
 #if !NETTLE_USE_MINI_GMP
 static void
@@ -239,7 +245,7 @@ static void
 bench_curve (const struct ecc_curve *ecc)
 {
   struct ecc_ctx ctx;  
-  double modp, reduce, modq, modinv, modinv_gcd, modinv_powm,
+  double modp, reduce, modq, pinv, qinv, modinv_gcd, modinv_powm,
     dup_hh, add_hh, add_hhh,
     mul_g, mul_a;
 
@@ -277,7 +283,8 @@ bench_curve (const struct ecc_curve *ecc)
 
   modq = time_function (bench_modq, &ctx);
 
-  modinv = time_function (bench_modinv, &ctx);
+  pinv = time_function (bench_pinv, &ctx);
+  qinv = time_function (bench_qinv, &ctx);
 #if !NETTLE_USE_MINI_GMP
   modinv_gcd = time_function (bench_modinv_gcd, &ctx);
 #else
@@ -299,9 +306,9 @@ bench_curve (const struct ecc_curve *ecc)
   free (ctx.bp);
   free (ctx.tp);
 
-  printf ("%4d %6.4f %6.4f %6.4f %6.2f %6.3f %6.2f %6.3f %6.3f %6.3f %6.1f %6.1f\n",
+  printf ("%4d %6.4f %6.4f %6.4f %6.2f %6.2f %6.3f %6.2f %6.3f %6.3f %6.3f %6.1f %6.1f\n",
 	  ecc->p.bit_size, 1e6 * modp, 1e6 * reduce, 1e6 * modq,
-	  1e6 * modinv, 1e6 * modinv_gcd, 1e6 * modinv_powm,
+	  1e6 * pinv, 1e6 * qinv, 1e6 * modinv_gcd, 1e6 * modinv_powm,
 	  1e6 * dup_hh, 1e6 * add_hh, 1e6 * add_hhh,
 	  1e6 * mul_g, 1e6 * mul_a);
 }
@@ -326,8 +333,8 @@ main (int argc UNUSED, char **argv UNUSED)
   unsigned i;
 
   time_init();
-  printf ("%4s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s (us)\n",
-	  "size", "modp", "reduce", "modq", "modinv", "mi_gcd", "mi_pow",
+  printf ("%4s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s (us)\n",
+	  "size", "modp", "reduce", "modq", "pinv", "qinv", "mi_gcd", "mi_pow",
 	  "dup_hh", "add_hh", "ad_hhh",
 	  "mul_g", "mul_a");
   for (i = 0; i < numberof (curves); i++)
diff --git a/examples/nettle-benchmark.c b/examples/nettle-benchmark.c
index ba5dd284..802a7234 100644
--- a/examples/nettle-benchmark.c
+++ b/examples/nettle-benchmark.c
@@ -63,6 +63,7 @@
 #include "sha1.h"
 #include "sha2.h"
 #include "sha3.h"
+#include "sm4.h"
 #include "twofish.h"
 #include "umac.h"
 #include "cmac.h"
@@ -926,6 +927,7 @@ main(int argc, char **argv)
       &nettle_des3,
       &nettle_serpent256,
       &nettle_twofish128, &nettle_twofish192, &nettle_twofish256,
+      &nettle_sm4,
       NULL
     };
 
diff --git a/fat-arm.c b/fat-arm.c
index 56647404..8133ca69 100644
--- a/fat-arm.c
+++ b/fat-arm.c
@@ -153,9 +153,9 @@ DECLARE_FAT_FUNC(nettle_sha1_compress, sha1_compress_func)
 DECLARE_FAT_FUNC_VAR(sha1_compress, sha1_compress_func, c)
 DECLARE_FAT_FUNC_VAR(sha1_compress, sha1_compress_func, armv6)
 
-DECLARE_FAT_FUNC(_nettle_sha256_compress, sha256_compress_func)
-DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, c)
-DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, armv6)
+DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, armv6)
 
 DECLARE_FAT_FUNC(_nettle_sha512_compress, sha512_compress_func)
 DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, c)
@@ -202,7 +202,7 @@ fat_init (void)
       _nettle_aes_encrypt_vec = _nettle_aes_encrypt_armv6;
       _nettle_aes_decrypt_vec = _nettle_aes_decrypt_armv6;
       nettle_sha1_compress_vec = _nettle_sha1_compress_armv6;
-      _nettle_sha256_compress_vec = _nettle_sha256_compress_armv6;
+      _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_armv6;
     }
   else
     {
@@ -211,7 +211,7 @@ fat_init (void)
       _nettle_aes_encrypt_vec = _nettle_aes_encrypt_arm;
       _nettle_aes_decrypt_vec = _nettle_aes_decrypt_arm;
       nettle_sha1_compress_vec = _nettle_sha1_compress_c;
-      _nettle_sha256_compress_vec = _nettle_sha256_compress_c;
+      _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_c;
     }
   if (features.have_neon)
     {
@@ -263,9 +263,10 @@ DEFINE_FAT_FUNC(nettle_sha1_compress, void,
 		(uint32_t *state, const uint8_t *input),
 		(state, input))
 
-DEFINE_FAT_FUNC(_nettle_sha256_compress, void,
-		(uint32_t *state, const uint8_t *input, const uint32_t *k),
-		(state, input, k))
+DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *,
+		(uint32_t *state, const uint32_t *k,
+		 size_t blocks, const uint8_t *input),
+		(state, k, blocks, input))
 
 DEFINE_FAT_FUNC(_nettle_sha512_compress, void,
 		(uint64_t *state, const uint8_t *input, const uint64_t *k),
diff --git a/fat-arm64.c b/fat-arm64.c
index f2b8493d..aec99f66 100644
--- a/fat-arm64.c
+++ b/fat-arm64.c
@@ -178,9 +178,9 @@ DECLARE_FAT_FUNC(nettle_sha1_compress, sha1_compress_func)
 DECLARE_FAT_FUNC_VAR(sha1_compress, sha1_compress_func, c)
 DECLARE_FAT_FUNC_VAR(sha1_compress, sha1_compress_func, arm64)
 
-DECLARE_FAT_FUNC(_nettle_sha256_compress, sha256_compress_func)
-DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, c)
-DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, arm64)
+DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, arm64)
 
 static void CONSTRUCTOR
 fat_init (void)
@@ -250,11 +250,11 @@ fat_init (void)
     {
       if (verbose)
 	fprintf (stderr, "libnettle: enabling hardware-accelerated sha256 compress code.\n");
-      _nettle_sha256_compress_vec = _nettle_sha256_compress_arm64;
+      _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_arm64;
     }
   else
     {
-      _nettle_sha256_compress_vec = _nettle_sha256_compress_c;
+      _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_c;
     }
 }
 
@@ -297,6 +297,7 @@ DEFINE_FAT_FUNC(nettle_sha1_compress, void,
 		(uint32_t *state, const uint8_t *input),
 		(state, input))
 
-DEFINE_FAT_FUNC(_nettle_sha256_compress, void,
-		(uint32_t *state, const uint8_t *input, const uint32_t *k),
-		(state, input, k))
+DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *,
+		(uint32_t *state, const uint32_t *k,
+		 size_t blocks, const uint8_t *input),
+		(state, k, blocks, input))
diff --git a/fat-ppc.c b/fat-ppc.c
index 7569e44d..b95365f6 100644
--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -195,6 +195,11 @@ DECLARE_FAT_FUNC(_nettle_poly1305_digest, poly1305_digest_func)
 DECLARE_FAT_FUNC_VAR(poly1305_digest, poly1305_digest_func, c)
 DECLARE_FAT_FUNC_VAR(poly1305_digest, poly1305_digest_func, ppc64)
 
+DECLARE_FAT_FUNC(_nettle_poly1305_blocks, poly1305_blocks_func)
+DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, c)
+DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, ppc64)
+
+
 static void CONSTRUCTOR
 fat_init (void)
 {
@@ -251,12 +256,14 @@ fat_init (void)
       _nettle_poly1305_set_key_vec = _nettle_poly1305_set_key_ppc64;
     _nettle_poly1305_block_vec = _nettle_poly1305_block_ppc64;
     _nettle_poly1305_digest_vec = _nettle_poly1305_digest_ppc64;
+    _nettle_poly1305_blocks_vec = _nettle_poly1305_blocks_ppc64;
     }
   else
     {
       _nettle_poly1305_set_key_vec = _nettle_poly1305_set_key_c;
     _nettle_poly1305_block_vec = _nettle_poly1305_block_c;
     _nettle_poly1305_digest_vec = _nettle_poly1305_digest_c;
+    _nettle_poly1305_blocks_vec = _nettle_poly1305_blocks_c;
     }
 }
 
@@ -315,3 +322,9 @@ DEFINE_FAT_FUNC(_nettle_poly1305_digest, void,
 		(struct poly1305_ctx *ctx,
      union nettle_block16 *s),
 		(ctx, s))
+
+DEFINE_FAT_FUNC(_nettle_poly1305_blocks, const uint8_t *,
+		(struct poly1305_ctx *ctx,
+     size_t blocks,
+		 const uint8_t *m),
+		(ctx, blocks, m))
diff --git a/fat-s390x.c b/fat-s390x.c
index fa026018..1bbd8e16 100644
--- a/fat-s390x.c
+++ b/fat-s390x.c
@@ -254,9 +254,9 @@ DECLARE_FAT_FUNC(nettle_sha1_compress, sha1_compress_func)
 DECLARE_FAT_FUNC_VAR(sha1_compress, sha1_compress_func, c)
 DECLARE_FAT_FUNC_VAR(sha1_compress, sha1_compress_func, s390x)
 
-DECLARE_FAT_FUNC(_nettle_sha256_compress, sha256_compress_func)
-DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, c)
-DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, s390x)
+DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, s390x)
 
 DECLARE_FAT_FUNC(_nettle_sha512_compress, sha512_compress_func)
 DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, c)
@@ -398,11 +398,11 @@ fat_init (void)
   {
     if (verbose)
       fprintf (stderr, "libnettle: enabling hardware accelerated SHA256 compress code.\n");
-    _nettle_sha256_compress_vec = _nettle_sha256_compress_s390x;
+    _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_s390x;
   }
   else
   {
-    _nettle_sha256_compress_vec = _nettle_sha256_compress_c;
+    _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_c;
   }
 
   /* SHA512 */
@@ -495,9 +495,10 @@ DEFINE_FAT_FUNC(nettle_sha1_compress, void,
 		(state, input))
 
 /* SHA256 */
-DEFINE_FAT_FUNC(_nettle_sha256_compress, void,
-		(uint32_t *state, const uint8_t *input, const uint32_t *k),
-		(state, input, k))
+DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *,
+		(uint32_t *state, const uint32_t *k,
+		 size_t blocks, const uint8_t *input),
+		(state, k, blocks, input))
 
 /* SHA512 */
 DEFINE_FAT_FUNC(_nettle_sha512_compress, void,
diff --git a/fat-setup.h b/fat-setup.h
index ad3c10f0..6bf3e2fa 100644
--- a/fat-setup.h
+++ b/fat-setup.h
@@ -178,7 +178,9 @@ typedef void salsa20_crypt_func (struct salsa20_ctx *ctx, unsigned rounds,
 				 const uint8_t *src);
 
 typedef void sha1_compress_func(uint32_t *state, const uint8_t *input);
-typedef void sha256_compress_func(uint32_t *state, const uint8_t *input, const uint32_t *k);
+typedef const uint8_t *
+sha256_compress_n_func(uint32_t *state, const uint32_t *k,
+		       size_t blocks, const uint8_t *input);
 
 struct sha3_state;
 typedef void sha3_permute_func (struct sha3_state *state);
@@ -201,6 +203,8 @@ typedef void poly1305_set_key_func(struct poly1305_ctx *ctx, const uint8_t *key)
 typedef void poly1305_digest_func(struct poly1305_ctx *ctx, union nettle_block16 *s);
 typedef void poly1305_block_func(struct poly1305_ctx *ctx, const uint8_t *m,
 			     unsigned high);
+typedef const uint8_t * poly1305_blocks_func(struct poly1305_ctx *ctx, size_t blocks,
+			     const uint8_t *m);
 
 struct aes128_ctx;
 typedef void aes128_set_key_func (struct aes128_ctx *ctx, const uint8_t *key);
diff --git a/fat-x86_64.c b/fat-x86_64.c
index 47cf78ae..0a2fedf4 100644
--- a/fat-x86_64.c
+++ b/fat-x86_64.c
@@ -155,9 +155,9 @@ DECLARE_FAT_FUNC(nettle_sha1_compress, sha1_compress_func)
 DECLARE_FAT_FUNC_VAR(sha1_compress, sha1_compress_func, x86_64)
 DECLARE_FAT_FUNC_VAR(sha1_compress, sha1_compress_func, sha_ni)
 
-DECLARE_FAT_FUNC(_nettle_sha256_compress, sha256_compress_func)
-DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, x86_64)
-DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, sha_ni)
+DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, x86_64)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, sha_ni)
 
 DECLARE_FAT_FUNC(_nettle_ghash_set_key, ghash_set_key_func)
 DECLARE_FAT_FUNC_VAR(ghash_set_key, ghash_set_key_func, c)
@@ -228,14 +228,14 @@ fat_init (void)
       if (verbose)
 	fprintf (stderr, "libnettle: using sha_ni instructions.\n");
       nettle_sha1_compress_vec = _nettle_sha1_compress_sha_ni;
-      _nettle_sha256_compress_vec = _nettle_sha256_compress_sha_ni;
+      _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_sha_ni;
     }
   else
     {
       if (verbose)
 	fprintf (stderr, "libnettle: not using sha_ni instructions.\n");
       nettle_sha1_compress_vec = _nettle_sha1_compress_x86_64;
-      _nettle_sha256_compress_vec = _nettle_sha256_compress_x86_64;
+      _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_x86_64;
     }
 
   if (features.have_pclmul)
@@ -315,9 +315,10 @@ DEFINE_FAT_FUNC(nettle_sha1_compress, void,
 		(uint32_t *state, const uint8_t *input),
 		(state, input))
 
-DEFINE_FAT_FUNC(_nettle_sha256_compress, void,
-		(uint32_t *state, const uint8_t *input, const uint32_t *k),
-		(state, input, k))
+DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *,
+		(uint32_t *state, const uint32_t *k,
+		 size_t blocks, const uint8_t *input),
+		(state, k, blocks, input))
 
 DEFINE_FAT_FUNC(_nettle_ghash_set_key, void,
 		(struct gcm_key *ctx, const union nettle_block16 *key),
diff --git a/sec-tabselect.c b/gcm-sm4-meta.c
index e6bf2282..090460d3 100644
--- a/sec-tabselect.c
+++ b/gcm-sm4-meta.c
@@ -1,6 +1,6 @@
-/* sec-tabselect.c
+/* gcm-sm4-meta.c
 
-   Copyright (C) 2013 Niels Möller
+   Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 
    This file is part of GNU Nettle.
 
@@ -29,34 +29,32 @@
    not, see http://www.gnu.org/licenses/.
 */
 
-/* Development of Nettle's ECC support was funded by the .SE Internet Fund. */
-
 #if HAVE_CONFIG_H
 # include "config.h"
 #endif
 
 #include <assert.h>
 
-#include "ecc-internal.h"
+#include "nettle-meta.h"
+
+#include "gcm.h"
 
-/* Copy the k'th element of the table out tn elements, each of size
-   rn. Always read complete table. Similar to gmp's mpn_tabselect. */
-/* FIXME: Should we need to volatile declare anything? */
-void
-sec_tabselect (mp_limb_t *rp, mp_size_t rn,
-	       const mp_limb_t *table, unsigned tn,
-	       unsigned k)
+static nettle_set_key_func gcm_sm4_set_nonce_wrapper;
+static void
+gcm_sm4_set_nonce_wrapper (void *ctx, const uint8_t *nonce)
 {
-  const mp_limb_t *end = table + tn * rn;
-  const mp_limb_t *p;
-  mp_size_t i;
-  
-  assert (k < tn);
-  mpn_zero (rp, rn);
-  for (p = table; p < end; p += rn, k--)
-    {
-      mp_limb_t mask = - (mp_limb_t) (k == 0);
-      for (i = 0; i < rn; i++)
-	rp[i] += mask & p[i];
-    }
+  gcm_sm4_set_iv (ctx, GCM_IV_SIZE, nonce);
 }
+
+const struct nettle_aead nettle_gcm_sm4 =
+  { "gcm_sm4", sizeof(struct gcm_sm4_ctx),
+    GCM_BLOCK_SIZE, SM4_KEY_SIZE,
+    GCM_IV_SIZE, GCM_DIGEST_SIZE,
+    (nettle_set_key_func *) gcm_sm4_set_key,
+    (nettle_set_key_func *) gcm_sm4_set_key,
+    gcm_sm4_set_nonce_wrapper,
+    (nettle_hash_update_func *) gcm_sm4_update,
+    (nettle_crypt_func *) gcm_sm4_encrypt,
+    (nettle_crypt_func *) gcm_sm4_decrypt,
+    (nettle_hash_digest_func *) gcm_sm4_digest,
+  };
diff --git a/gcm-sm4.c b/gcm-sm4.c
new file mode 100644
index 00000000..19d91ae9
--- /dev/null
+++ b/gcm-sm4.c
@@ -0,0 +1,81 @@
+/* gcm-sm4.c
+
+   Galois counter mode using SM4 as the underlying cipher.
+
+   Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <assert.h>
+
+#include "gcm.h"
+
+void
+gcm_sm4_set_key(struct gcm_sm4_ctx *ctx, const uint8_t *key)
+{
+  GCM_SET_KEY(ctx, sm4_set_encrypt_key, sm4_crypt, key);
+}
+
+void
+gcm_sm4_set_iv(struct gcm_sm4_ctx *ctx,
+	       size_t length, const uint8_t *iv)
+{
+  GCM_SET_IV (ctx, length, iv);
+}
+
+void
+gcm_sm4_update(struct gcm_sm4_ctx *ctx,
+	       size_t length, const uint8_t *data)
+{
+  GCM_UPDATE (ctx, length, data);
+}
+
+void
+gcm_sm4_encrypt(struct gcm_sm4_ctx *ctx,
+		size_t length, uint8_t *dst, const uint8_t *src)
+{
+  GCM_ENCRYPT(ctx, sm4_crypt, length, dst, src);
+}
+
+void
+gcm_sm4_decrypt(struct gcm_sm4_ctx *ctx,
+		size_t length, uint8_t *dst, const uint8_t *src)
+{
+  GCM_DECRYPT(ctx, sm4_crypt, length, dst, src);
+}
+
+void
+gcm_sm4_digest(struct gcm_sm4_ctx *ctx,
+	       size_t length, uint8_t *digest)
+{
+  GCM_DIGEST(ctx, sm4_crypt, length, digest);
+}
diff --git a/gcm.c b/gcm.c
index 5de8abb2..1e015b9d 100644
--- a/gcm.c
+++ b/gcm.c
@@ -55,25 +55,7 @@
 #include "macros.h"
 #include "ctr-internal.h"
 #include "block-internal.h"
-
-/* FIXME: Duplicated in nist-keywrap.c */
-#if WORDS_BIGENDIAN
-#define bswap_if_le(x) (x)
-#elif HAVE_BUILTIN_BSWAP64
-#define bswap_if_le(x) (__builtin_bswap64 (x))
-#else
-static uint64_t
-bswap_if_le (uint64_t x)
-{
-  x = ((x >> 32) & UINT64_C (0xffffffff))
-    | ((x & UINT64_C (0xffffffff)) << 32);
-  x = ((x >> 16) & UINT64_C (0xffff0000ffff))
-    | ((x & UINT64_C (0xffff0000ffff)) << 16);
-  x = ((x >> 8) & UINT64_C (0xff00ff00ff00ff))
-    | ((x & UINT64_C (0xff00ff00ff00ff)) << 8);
-  return x;
-}
-#endif
+#include "bswap-internal.h"
 
 /* Initialization of GCM.
  * @ctx: The context of GCM
@@ -115,8 +97,8 @@ gcm_hash_sizes(const struct gcm_key *key, union nettle_block16 *x,
   data_size *= 8;
   auth_size *= 8;
 
-  buffer.u64[0] = bswap_if_le (auth_size);
-  buffer.u64[1] = bswap_if_le (data_size);
+  buffer.u64[0] = bswap64_if_le (auth_size);
+  buffer.u64[1] = bswap64_if_le (data_size);
 
   _ghash_update (key, x, 1, buffer.b);
 }
diff --git a/gcm.h b/gcm.h
index 96578530..39af5ab0 100644
--- a/gcm.h
+++ b/gcm.h
@@ -40,6 +40,7 @@
 
 #include "aes.h"
 #include "camellia.h"
+#include "sm4.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -95,6 +96,13 @@ extern "C" {
 #define gcm_camellia256_decrypt nettle_gcm_camellia256_decrypt
 #define gcm_camellia256_digest nettle_gcm_camellia256_digest
 
+#define gcm_sm4_set_key nettle_gcm_sm4_set_key
+#define gcm_sm4_set_iv nettle_gcm_sm4_set_iv
+#define gcm_sm4_update nettle_gcm_sm4_update
+#define gcm_sm4_encrypt nettle_gcm_sm4_encrypt
+#define gcm_sm4_decrypt nettle_gcm_sm4_decrypt
+#define gcm_sm4_digest nettle_gcm_sm4_digest
+
 #define GCM_BLOCK_SIZE 16
 #define GCM_IV_SIZE (GCM_BLOCK_SIZE - 4)
 #define GCM_DIGEST_SIZE 16
@@ -322,7 +330,22 @@ void gcm_camellia256_decrypt(struct gcm_camellia256_ctx *ctx,
 void gcm_camellia256_digest(struct gcm_camellia256_ctx *ctx,
 			    size_t length, uint8_t *digest);
 
-  
+
+struct gcm_sm4_ctx GCM_CTX(struct sm4_ctx);
+
+void gcm_sm4_set_key(struct gcm_sm4_ctx *ctx, const uint8_t *key);
+void gcm_sm4_set_iv(struct gcm_sm4_ctx *ctx,
+		    size_t length, const uint8_t *iv);
+void gcm_sm4_update(struct gcm_sm4_ctx *ctx,
+		    size_t length, const uint8_t *data);
+void gcm_sm4_encrypt(struct gcm_sm4_ctx *ctx,
+		     size_t length, uint8_t *dst, const uint8_t *src);
+void gcm_sm4_decrypt(struct gcm_sm4_ctx *ctx,
+		     size_t length, uint8_t *dst, const uint8_t *src);
+void gcm_sm4_digest(struct gcm_sm4_ctx *ctx,
+		    size_t length, uint8_t *digest);
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ghash-internal.h b/ghash-internal.h
index 97dff024..2504dc09 100644
--- a/ghash-internal.h
+++ b/ghash-internal.h
@@ -38,6 +38,8 @@
 /* Name mangling */
 #define _ghash_set_key _nettle_ghash_set_key
 #define _ghash_update _nettle_ghash_update
+#define _siv_ghash_set_key _nettle_siv_ghash_set_key
+#define _siv_ghash_update _nettle_siv_ghash_update
 
 #ifdef __cplusplus
 extern "C" {
@@ -58,6 +60,17 @@ const uint8_t *
 _ghash_update (const struct gcm_key *ctx, union nettle_block16 *state,
 	       size_t blocks, const uint8_t *data);
 
+/* Expands KEY as needed, for corresponding _siv_ghash_update */
+void
+_siv_ghash_set_key (struct gcm_key *ctx, const union nettle_block16 *key);
+
+/* Updates STATE by hashing DATA, which must be an integral number of
+   blocks. For convenience, returns a pointer to the end of the
+   data. */
+const uint8_t *
+_siv_ghash_update (const struct gcm_key *ctx, union nettle_block16 *state,
+		 size_t blocks, const uint8_t *data);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/gmp-glue.c b/gmp-glue.c
index e75d678b..ffce6c30 100644
--- a/gmp-glue.c
+++ b/gmp-glue.c
@@ -99,6 +99,26 @@ mpn_cnd_swap (mp_limb_t cnd, volatile mp_limb_t *ap, volatile mp_limb_t *bp, mp_
     }
 }
 
+/* Copy the k'th element out of a table of tn elements, each of size
+   rn. Always read complete table. Similar to gmp's mpn_tabselect. */
+void
+mpn_sec_tabselect (volatile mp_limb_t *rp, volatile const mp_limb_t *table,
+		   mp_size_t rn, unsigned tn, unsigned k)
+{
+  volatile const mp_limb_t *end = table + tn * rn;
+  volatile const mp_limb_t *p;
+  mp_size_t i;
+
+  assert (k < tn);
+  for (p = table; p < end; p += rn, k--)
+    {
+      mp_limb_t mask = - (mp_limb_t) (k == 0);
+      for (i = 0; i < rn; i++)
+	rp[i] = (~mask & rp[i]) | (mask & p[i]);
+    }
+}
+
+
 #endif /* NETTLE_USE_MINI_GMP */
 
 int
diff --git a/gmp-glue.h b/gmp-glue.h
index bc6dbf16..dc0ede2a 100644
--- a/gmp-glue.h
+++ b/gmp-glue.h
@@ -66,6 +66,10 @@ mpn_cnd_sub_n (mp_limb_t cnd, mp_limb_t *rp,
 
 void
 mpn_cnd_swap (mp_limb_t cnd, volatile mp_limb_t *ap, volatile mp_limb_t *bp, mp_size_t n);
+
+void
+mpn_sec_tabselect (volatile mp_limb_t *rp, volatile const mp_limb_t *table,
+		   mp_size_t rn, unsigned tn, unsigned k);
 #endif
 
 /* Side-channel silent variant of mpn_zero_p. */
diff --git a/md-internal.h b/md-internal.h
new file mode 100644
index 00000000..a97b7b90
--- /dev/null
+++ b/md-internal.h
@@ -0,0 +1,70 @@
+/* md-internal.h
+
+   Copyright (C) 2001, 2010, 2022 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#ifndef NETTLE_MD_INTERNAL_H_INCLUDED
+#define NETTLE_MD_INTERNAL_H_INCLUDED
+
+#include <string.h>
+
+/* Internal helper macros for Merkle-Damgård hash functions. Assumes the context
+   struct includes the following fields:
+
+     uint8_t block[...];		// Buffer holding one block
+     unsigned int index;		// Index into block
+*/
+
+#define MD_FILL_OR_RETURN(ctx, length, data)			\
+  do {								\
+    unsigned __md_left = sizeof((ctx)->block) - (ctx)->index;	\
+    if ((length) < __md_left)					\
+      {								\
+	memcpy((ctx)->block + (ctx)->index, (data), (length));	\
+	(ctx)->index += (length);				\
+	return;							\
+      }								\
+    memcpy((ctx)->block + (ctx)->index, (data), __md_left);	\
+    (data) += __md_left;					\
+    (length) -= __md_left;					\
+  } while(0)
+
+#define MD_FILL_OR_RETURN_INDEX(block_size, block, index, length, data)	\
+  do {	/* Top up BLOCK from DATA, or buffer DATA and return new index. */	\
+    unsigned __md_left = (block_size) - (index); /* Free bytes in BLOCK. */ \
+    if ((length) < __md_left)	/* DATA cannot complete the block. */	\
+      {									\
+	memcpy((block) + (index), (data), (length));			\
+	return (index) + (length);	/* Exits the enclosing function. */ \
+      }									\
+    memcpy((block) + (index), (data), __md_left); /* BLOCK is now full. */ \
+    (data) += __md_left;	/* Caller's DATA/LENGTH lvalues are advanced. */ \
+    (length) -= __md_left;						\
+  } while(0)
+#endif /* NETTLE_MD_INTERNAL_H_INCLUDED */
diff --git a/nettle-internal.h b/nettle-internal.h
index 92416400..bf906c88 100644
--- a/nettle-internal.h
+++ b/nettle-internal.h
@@ -74,12 +74,13 @@
   do { assert((size_t)(size) <= (sizeof(name))); } while (0)
 #endif 
 
-/* Arbitrary limits which apply to systems that don't have alloca */
-#define NETTLE_MAX_HASH_BLOCK_SIZE 128
+/* Limits that apply to systems that don't have alloca */
+#define NETTLE_MAX_HASH_BLOCK_SIZE 144  /* For sha3_224*/
 #define NETTLE_MAX_HASH_DIGEST_SIZE 64
 #define NETTLE_MAX_HASH_CONTEXT_SIZE (sizeof(struct sha3_224_ctx))
 #define NETTLE_MAX_SEXP_ASSOC 17
 #define NETTLE_MAX_CIPHER_BLOCK_SIZE 32
+#define NETTLE_MAX_CIPHER_KEY_SIZE 32
 
 /* Doesn't quite fit with the other algorithms, because of the weak
  * keys. Weak keys are not reported, the functions will simply crash
diff --git a/nettle-meta-aeads.c b/nettle-meta-aeads.c
index c99cc465..78f38a3c 100644
--- a/nettle-meta-aeads.c
+++ b/nettle-meta-aeads.c
@@ -43,6 +43,7 @@ const struct nettle_aead * const _nettle_aeads[] = {
   &nettle_gcm_aes256,
   &nettle_gcm_camellia128,
   &nettle_gcm_camellia256,
+  &nettle_gcm_sm4,
   &nettle_eax_aes128,
   &nettle_chacha_poly1305,
   NULL
diff --git a/nettle-meta-ciphers.c b/nettle-meta-ciphers.c
index 49cb47a7..f8d691cf 100644
--- a/nettle-meta-ciphers.c
+++ b/nettle-meta-ciphers.c
@@ -54,6 +54,7 @@ const struct nettle_cipher * const _nettle_ciphers[] = {
   &nettle_arctwo64,
   &nettle_arctwo128,
   &nettle_arctwo_gutmann128,
+  &nettle_sm4,
   NULL
 };
 
diff --git a/nettle-meta.h b/nettle-meta.h
index d684947e..19dc96c5 100644
--- a/nettle-meta.h
+++ b/nettle-meta.h
@@ -89,6 +89,8 @@ extern const struct nettle_cipher nettle_arctwo64;
 extern const struct nettle_cipher nettle_arctwo128;
 extern const struct nettle_cipher nettle_arctwo_gutmann128;
 
+extern const struct nettle_cipher nettle_sm4;
+
 struct nettle_hash
 {
   const char *name;
@@ -198,6 +200,7 @@ extern const struct nettle_aead nettle_gcm_aes192;
 extern const struct nettle_aead nettle_gcm_aes256;
 extern const struct nettle_aead nettle_gcm_camellia128;
 extern const struct nettle_aead nettle_gcm_camellia256;
+extern const struct nettle_aead nettle_gcm_sm4;
 extern const struct nettle_aead nettle_eax_aes128;
 extern const struct nettle_aead nettle_chacha_poly1305;
 
diff --git a/nettle.texinfo b/nettle.texinfo
index 69f9bcaf..767ae718 100644
--- a/nettle.texinfo
+++ b/nettle.texinfo
@@ -105,6 +105,7 @@ Cipher functions
 * DES3::
 * Salsa20::
 * Serpent::
+* SM4::
 * Twofish::
 * nettle_cipher abstraction::
 
@@ -122,6 +123,7 @@ Authenticated encryption with associated data
 * CCM::
 * ChaCha-Poly1305::
 * SIV-CMAC::
+* SIV-GCM::
 * nettle_aead abstraction::
 
 Keyed Hash Functions
@@ -442,6 +444,14 @@ This function also resets the context in the same way as
 @code{sha256_init}.
 @end deftypefun
 
+@deftypefun void sha256_compress (uint32_t *@var{state}, const uint8_t *@var{input})
+Perform a raw SHA256 compress on SHA256_BLOCK_SIZE bytes from @var{input}
+using @var{state} as IV (an array of 8 uint32_t). The output is stored in @var{state}.
+This function provides access to the underlying compression function,
+for the rare applications that need that (e.g., using different IV from
+standard SHA256).
+@end deftypefun
+
 Earlier versions of nettle defined SHA256 in the header file
 @file{<nettle/sha.h>}, which is now deprecated, but kept for
 compatibility.
@@ -522,6 +532,14 @@ This function also resets the context in the same way as
 @code{sha512_init}.
 @end deftypefun
 
+@deftypefun void sha512_compress (uint64_t *@var{state}, const uint8_t *@var{input})
+Perform a raw SHA512 compress on SHA512_BLOCK_SIZE bytes from
+@var{input} using @var{state} as IV (an array of 8 uint64_t). The output is stored in @var{state}.
+This function provides access to the underlying compression function,
+for the rare applications that need that (e.g., using different IV from
+standard SHA512).
+@end deftypefun
+
 @subsubsection @acronym{SHA384 and other variants of SHA512}
 
 Several variants of SHA512 have been defined, with a different initial
@@ -929,6 +947,14 @@ This function also resets the context in the same way as
 @code{md5_init}.
 @end deftypefun
 
+@deftypefun void md5_compress (uint32_t *@var{state}, const uint8_t *@var{input})
+Perform a raw MD5 compress on MD5_BLOCK_SIZE bytes from @var{input}
+using @var{state} as IV (an array of 4 uint32_t). The output is stored in @var{state}.
+This function provides access to the underlying compression function,
+for the rare applications that need that (e.g., using different IV from
+standard MD5).
+@end deftypefun
+
 The normal way to use MD5 is to call the functions in order: First
 @code{md5_init}, then @code{md5_update} zero or more times, and finally
 @code{md5_digest}. After @code{md5_digest}, the context is reset to
@@ -1083,6 +1109,13 @@ This function also resets the context in the same way as
 @code{sha1_init}.
 @end deftypefun
 
+@deftypefun void sha1_compress (uint32_t *@var{state}, const uint8_t *@var{input})
+Perform a raw SHA1 compress on SHA1_BLOCK_SIZE bytes from @var{input}
+using @var{state} as IV (an array of 5 uint32_t). The output is stored in @var{state}.
+This function provides access to the underlying compression function,
+for the rare applications that need that (e.g., using different IV from
+standard SHA1).
+@end deftypefun
 
 @subsubsection @acronym{GOSTHASH94 and GOSTHASH94CP}
 @cindex GOST hash
@@ -1292,6 +1325,7 @@ decryption.
 * DES3::
 * Salsa20::
 * Serpent::
+* SM4::
 * Twofish::
 * nettle_cipher abstraction::
 @end menu
@@ -2114,6 +2148,42 @@ in any other way.
 Analogous to @code{serpent_encrypt}
 @end deftypefun
 
+@node SM4
+@subsection SM4
+@cindex SM4
+
+SM4 is a block cipher standard adopted by the government of the People's
+Republic of China, and it was issued by the State Cryptography Administration
+on March 21, 2012. The standard is GM/T 0002-2012 "SM4 block cipher algorithm".
+Nettle defines it in @file{<nettle/sm4.h>}.
+
+@deftp {Context struct} {struct sm4_ctx}
+@end deftp
+
+@defvr Constant SM4_BLOCK_SIZE
+The SM4 block-size, 16.
+@end defvr
+
+@defvr Constant SM4_KEY_SIZE
+Default SM4 key size, 16.
+@end defvr
+
+@deftypefun void sm4_set_encrypt_key (struct sm4_ctx *@var{ctx}, const uint8_t *@var{key})
+Initialize the cipher. The function is used for encryption.
+@end deftypefun
+
+@deftypefun void sm4_set_decrypt_key (struct sm4_ctx *@var{ctx}, const uint8_t *@var{key})
+Initialize the cipher. The function is used for decryption.
+@end deftypefun
+
+@deftypefun void sm4_crypt (const struct sm4_ctx *@var{ctx}, size_t @var{length}, uint8_t *@var{dst}, const uint8_t *@var{src})
+Encrypts or decrypts data. @var{length} must be an integral multiple of the
+block size. If it is more than one block, the data is processed in ECB
+mode. @code{src} and @code{dst} may be equal, but they must not overlap
+in any other way. The same function is used for both encryption and
+decryption.
+@end deftypefun
+
 @node Twofish
 @subsection Twofish
 @cindex Twofish
@@ -2811,6 +2881,7 @@ more adventurous alternative, in particular if performance is important.
 * CCM::
 * ChaCha-Poly1305::
 * SIV-CMAC::
+* SIV-GCM::
 * nettle_aead abstraction::
 @end menu
 
@@ -3227,6 +3298,44 @@ that @var{length} is @code{GCM_DIGEST_SIZE}, but if you provide a smaller
 value, only the first @var{length} octets of the digest are written.
 @end deftypefun
 
+@subsubsection @acronym{GCM}-SM4 interface
+
+The following functions implement the case of @acronym{GCM} using
+SM4 as the underlying cipher.
+
+@deftp {Context struct} {struct gcm_sm4_ctx}
+Context structs, defined using @code{GCM_CTX}.
+@end deftp
+
+@deftypefun void gcm_sm4_set_key (struct gcm_sm4_ctx *@var{ctx}, const uint8_t *@var{key})
+Initializes @var{ctx} using the given key.
+@end deftypefun
+
+@deftypefun void gcm_sm4_set_iv (struct gcm_sm4_ctx *@var{ctx}, size_t @var{length}, const uint8_t *@var{iv})
+Initializes the per-message state, using the given @acronym{IV}.
+@end deftypefun
+
+@deftypefun void gcm_sm4_update (struct gcm_sm4_ctx *@var{ctx}, size_t @var{length}, const uint8_t *@var{data})
+Provides associated data to be authenticated. If used, must be called
+before @code{gcm_sm4_encrypt} or @code{gcm_sm4_decrypt}. All but the
+last call for each message @emph{must} use a length that is a multiple
+of the block size.
+@end deftypefun
+
+@deftypefun void gcm_sm4_encrypt (struct gcm_sm4_ctx *@var{ctx}, size_t @var{length}, uint8_t *@var{dst}, const uint8_t *@var{src})
+@deftypefunx void gcm_sm4_decrypt (struct gcm_sm4_ctx *@var{ctx}, size_t @var{length}, uint8_t *@var{dst}, const uint8_t *@var{src})
+Encrypts or decrypts the data of a message. All but the last call for
+each message @emph{must} use a length that is a multiple of the block
+size.
+@end deftypefun
+
+@deftypefun void gcm_sm4_digest (struct gcm_sm4_ctx *@var{ctx}, size_t @var{length}, uint8_t *@var{digest})
+Extracts the message digest (also known as the ``authentication tag''). This is
+the final operation when processing a message. It's strongly recommended
+that @var{length} is @code{GCM_DIGEST_SIZE}, but if you provide a smaller
+value, only the first @var{length} octets of the digest are written.
+@end deftypefun
+
 @node CCM
 @subsection Counter with CBC-MAC mode
 
@@ -3626,6 +3735,95 @@ are equal, this will return 1 indicating a valid and authenticated
 message. Otherwise, this function will return zero.
 @end deftypefun
 
+@node SIV-GCM
+@subsection SIV-GCM
+
+@acronym{SIV-GCM}, described in @cite{RFC 8452}, is an @acronym{AEAD}
+construction similar to @acronym{AES-GCM}, but provides protection against
+accidental nonce misuse like @acronym{SIV-CMAC} mode.
+
+It is constructed on top of a block cipher which must have a block size of 128
+bits and a nonce size of 12 bytes. Nettle's support for @acronym{SIV-GCM}
+consists of a message encryption and authentication interface, for
+@acronym{SIV-GCM} using AES as the underlying block cipher.  These
+interfaces are defined in @file{<nettle/siv-gcm.h>}.
+
+Unlike other @acronym{AEAD} modes, in @acronym{SIV-GCM} the tag is calculated
+over the encoded additional authentication data and plaintext instead of the
+ciphertext.
+
+@subsubsection General interface
+
+@defvr Constant SIV_GCM_BLOCK_SIZE
+@acronym{SIV-GCM}'s block size, 16.
+@end defvr
+
+@defvr Constant SIV_GCM_DIGEST_SIZE
+Size of the @acronym{SIV-GCM} digest for tags, 16.
+@end defvr
+
+@defvr Constant SIV_GCM_NONCE_SIZE
+Size of the @acronym{SIV-GCM} nonce, 12.
+@end defvr
+
+@deftypefun void siv_gcm_encrypt_message (const struct nettle_cipher *@var{nc}, const void *@var{ctx}, void *@var{ctr_ctx}, size_t @var{nlength}, const uint8_t *@var{nonce}, size_t @var{alength}, const uint8_t *@var{adata}, size_t @var{clength}, uint8_t *@var{dst}, const uint8_t *@var{src})
+Computes the message digest from the @var{adata} and @var{src}
+parameters, encrypts the plaintext from @var{src}, appends the
+authentication tag to the ciphertext and outputs it to @var{dst}.  The
+@var{clength} variable must be equal to the length of @var{src} plus
+@code{SIV_GCM_DIGEST_SIZE}.
+@end deftypefun
+
+@deftypefun int siv_gcm_decrypt_message (const struct nettle_cipher *@var{nc}, const void *@var{ctx}, void *@var{ctr_ctx}, size_t @var{nlength}, const uint8_t *@var{nonce}, size_t @var{alength}, const uint8_t *@var{adata}, size_t @var{mlength}, uint8_t *@var{dst}, const uint8_t *@var{src})
+Decrypts the ciphertext from @var{src}, outputs the plaintext to
+@var{dst}, recalculates the initialization vector from @var{adata} and the
+plaintext. If the values of the received and calculated initialization vector
+are equal, this will return 1 indicating a valid and authenticated
+message. Otherwise, this function will return zero.
+@end deftypefun
+
+In the above interface, @var{nc} must point to a cipher that works
+with a 16-byte block size and a key size that is a multiple of
+8 bytes.  The @var{ctx} context structure must be initialized for
+encryption mode using a set-key function, before using any of the
+functions in this interface.  While the @var{ctr_ctx} context
+structure must have the same size as @var{ctx}, it does not need to be
+initialized before calling those functions as it is used as working
+storage.  These structures can point to the same area; in that case
+the contents of *@var{ctx} are destroyed by the call.
+
+For convenience, Nettle provides wrapper functions that work with
+@acronym{AES} described in the following section.
+
+@subsubsection @acronym{SIV-GCM}-@acronym{AES} interface
+
+The @acronym{SIV-GCM} functions provide an API for using @acronym{SIV-GCM}
+mode with the @acronym{AES} block ciphers. The parameters all have the same
+meaning as in the general interface, except that the @var{nc}, @var{ctx}, and
+@var{ctr_ctx} parameters are replaced with an @acronym{AES} context
+structure. The @acronym{AES} context structure must be initialized for
+encryption mode using a set-key function, before using any of the functions in
+this interface.
+
+@deftypefun void siv_gcm_aes128_encrypt_message (const struct aes128_ctx *@var{ctx}, size_t @var{nlength}, const uint8_t *@var{nonce}, size_t @var{alength}, const uint8_t *@var{adata}, size_t @var{clength}, uint8_t *@var{dst}, const uint8_t *@var{src})
+@deftypefunx void siv_gcm_aes256_encrypt_message (const struct aes256_ctx *@var{ctx}, size_t @var{nlength}, const uint8_t *@var{nonce}, size_t @var{alength}, const uint8_t *@var{adata}, size_t @var{clength}, uint8_t *@var{dst}, const uint8_t *@var{src})
+Computes the message digest from the @var{adata} and @var{src}
+parameters, encrypts the plaintext from @var{src}, appends the
+authentication tag to the ciphertext and outputs it to @var{dst}.
+The @var{clength} variable must be equal to the length of @var{src}
+plus @code{SIV_GCM_DIGEST_SIZE}.
+
+@end deftypefun
+
+@deftypefun int siv_gcm_aes128_decrypt_message (const struct aes128_ctx *@var{ctx}, size_t @var{nlength}, const uint8_t *@var{nonce}, size_t @var{alength}, const uint8_t *@var{adata}, size_t @var{mlength}, uint8_t *@var{dst}, const uint8_t *@var{src})
+@deftypefunx int siv_gcm_aes256_decrypt_message (const struct aes256_ctx *@var{ctx}, size_t @var{nlength}, const uint8_t *@var{nonce}, size_t @var{alength}, const uint8_t *@var{adata}, size_t @var{mlength}, uint8_t *@var{dst}, const uint8_t *@var{src})
+Decrypts the ciphertext from @var{src}, outputs the plaintext to
+@var{dst}, recalculates the initialization vector from @var{adata} and the
+plaintext. If the values of the received and calculated initialization vector
+are equal, this will return 1 indicating a valid and authenticated
+message. Otherwise, this function will return zero.
+@end deftypefun
+
 @node nettle_aead abstraction
 @subsection The @code{struct nettle_aead} abstraction
 @cindex nettle_aead
@@ -4348,6 +4546,81 @@ salt @var{salt} of length @var{salt_length}, with iteration counter
 room for at least @var{length} octets.
 @end deftypefun
 
+
+@subsection @acronym{BALLOON}
+@cindex Balloon password-hashing algorithm
+Balloon is a memory-hard password-hashing algorithm.  An in-depth description
+of the algorithm and its properties can be found in an online research paper:
+Boneh, D., Corrigan-Gibbs, H., Schechter, S. (2017, May 12). Balloon Hashing:
+A Memory-Hard Function Providing Provable Protection Against Sequential Attacks.
+Retrieved Sep 1, 2022, from @url{https://eprint.iacr.org/2016/027.pdf}
+
+Nettle's definition of the @acronym{BALLOON} algorithm can be found in
+@file{<nettle/balloon.h>}.  There is a general @acronym{BALLOON} function where
+the user can specify the desired hash algorithm that will be used by the function.
+There are also concrete, more user-friendly functions that use common hash algorithms
+like SHA1, SHA256, SHA384 and SHA512.  There is also a utility function which helps to
+determine the size of the working buffer that must be provided as one of the inputs.
+
+Each @acronym{BALLOON} function takes as an input a password and a salt of arbitrary
+lengths, a time and a space parameter, and a scratch buffer.  The space parameter
+@var{s_cost} determines how many blocks of working space the algorithm will require
+during its computation.  It is common to set @var{s_cost} to a high value in order
+to increase the cost of hardware accelerators built by the adversary.  The time
+parameter @var{t_cost} determines the number of rounds of computation that the algorithm
+will perform. This can be used to further increase the cost of computation without raising
+the memory requirement.  Scratch buffer @var{scratch} is a user allocated working space
+required by the algorithm.  To determine the required size of the scratch buffer use the
+utility function @code{balloon_itch}.  Output of @acronym{BALLOON} algorithm will be
+written into the output buffer @var{dst} that has to be at least @var{digest_size} bytes
+long.  Note that it is safe to use the same buffer for both @var{scratch} and @var{dst}.
+Next follows the description of the general @acronym{BALLOON} function.
+
+@deftypefun void balloon (void *@var{hash_ctx}, nettle_hash_update_func *@var{update}, nettle_hash_digest_func *@var{digest}, size_t @var{digest_size}, size_t @var{s_cost}, size_t @var{t_cost}, size_t @var{passwd_length}, const uint8_t *@var{passwd}, size_t @var{salt_length}, const uint8_t *@var{salt}, uint8_t *@var{scratch}, uint8_t *@var{dst})
+Compute hash of given password @var{passwd} of length @var{passwd_length} salted
+with @var{salt} of length @var{salt_length} and write @var{digest_size} bytes into
+the output buffer @var{dst}.  Parameter @var{hash_ctx} is a context for the
+underlying hash function, which must be initialized by the caller.  @var{update}
+and @var{digest} are the update and digest functions of the chosen hash algorithm.
+@var{digest_size} is the digest size of the chosen hash algorithm and determines
+the size of the output.
+@end deftypefun
+
+@deftypefun size_t balloon_itch (size_t @var{digest_size}, size_t @var{s_cost})
+Compute the size of the scratch buffer @var{scratch}.  @var{digest_size} is the
+digest size of the chosen hash algorithm.  @var{s_cost} is the space parameter
+used by the @code{balloon} function.
+@end deftypefun
+
+@subsection Concrete @acronym{BALLOON} functions
+Here follows a list of the specialized @acronym{BALLOON} functions, which are
+more user-friendly variants of the general function.
+
+@subsubsection @acronym{BALLOON-SHA1}
+
+@deftypefun void balloon_sha1 (size_t @var{s_cost}, size_t @var{t_cost}, size_t @var{passwd_length}, const uint8_t *@var{passwd}, size_t @var{salt_length}, const uint8_t *@var{salt}, uint8_t *@var{scratch}, uint8_t *@var{dst})
+@acronym{BALLOON} algorithm using SHA1 as the underlying hash function.
+@end deftypefun
+
+@subsubsection @acronym{BALLOON-SHA256}
+
+@deftypefun void balloon_sha256 (size_t @var{s_cost}, size_t @var{t_cost}, size_t @var{passwd_length}, const uint8_t *@var{passwd}, size_t @var{salt_length}, const uint8_t *@var{salt}, uint8_t *@var{scratch}, uint8_t *@var{dst})
+@acronym{BALLOON} algorithm using SHA256 as the underlying hash function.
+@end deftypefun
+
+@subsubsection @acronym{BALLOON-SHA384}
+
+@deftypefun void balloon_sha384 (size_t @var{s_cost}, size_t @var{t_cost}, size_t @var{passwd_length}, const uint8_t *@var{passwd}, size_t @var{salt_length}, const uint8_t *@var{salt}, uint8_t *@var{scratch}, uint8_t *@var{dst})
+@acronym{BALLOON} algorithm using SHA384 as the underlying hash function.
+@end deftypefun
+
+@subsubsection @acronym{BALLOON-SHA512}
+
+@deftypefun void balloon_sha512 (size_t @var{s_cost}, size_t @var{t_cost}, size_t @var{passwd_length}, const uint8_t *@var{passwd}, size_t @var{salt_length}, const uint8_t *@var{salt}, uint8_t *@var{scratch}, uint8_t *@var{dst})
+@acronym{BALLOON} algorithm using SHA512 as the underlying hash function.
+@end deftypefun
+
+
 @node Public-key algorithms
 @section Public-key algorithms
 
diff --git a/nist-keywrap.c b/nist-keywrap.c
index 8fdd9335..2aca8423 100644
--- a/nist-keywrap.c
+++ b/nist-keywrap.c
@@ -44,24 +44,7 @@
 #include "nist-keywrap.h"
 #include "memops.h"
 #include "macros.h"
-
-#if WORDS_BIGENDIAN
-#define bswap_if_le(x) (x)
-#elif HAVE_BUILTIN_BSWAP64
-#define bswap_if_le(x) (__builtin_bswap64 (x))
-#else
-static uint64_t
-bswap_if_le (uint64_t x)
-{
-  x = ((x >> 32) & UINT64_C (0xffffffff))
-    | ((x & UINT64_C (0xffffffff)) << 32);
-  x = ((x >> 16) & UINT64_C (0xffff0000ffff))
-    | ((x & UINT64_C (0xffff0000ffff)) << 16);
-  x = ((x >> 8) & UINT64_C (0xff00ff00ff00ff))
-    | ((x & UINT64_C (0xff00ff00ff00ff)) << 8);
-  return x;
-}
-#endif
+#include "bswap-internal.h"
 
 void
 nist_keywrap16 (const void *ctx, nettle_cipher_func *encrypt,
@@ -94,7 +77,7 @@ nist_keywrap16 (const void *ctx, nettle_cipher_func *encrypt,
 	  encrypt (ctx, 16, B.b, I.b);
 
 	  /* A = MSB(64, B) ^ t where t = (n*j)+i */
-	  A.u64 = B.u64[0] ^ bswap_if_le ((n * j) + (i + 1));
+	  A.u64 = B.u64[0] ^ bswap64_if_le ((n * j) + (i + 1));
 
 	  /* R[i] = LSB(64, B) */
 	  memcpy (R + (i * 8), B.b + 8, 8);
@@ -129,7 +112,7 @@ nist_keyunwrap16 (const void *ctx, nettle_cipher_func *decrypt,
       for (i = n - 1; i >= 0; i--)
 	{
 	  /* B = AES-1(K, (A ^ t) | R[i]) where t = n*j+i */
-	  I.u64[0] = A.u64 ^ bswap_if_le ((n * j) + (i + 1));
+	  I.u64[0] = A.u64 ^ bswap64_if_le ((n * j) + (i + 1));
 	  memcpy (I.b + 8, R + (i * 8), 8);
 	  decrypt (ctx, 16, B.b, I.b);
 
diff --git a/poly1305-aes.c b/poly1305-aes.c
index a4050254..374d5a78 100644
--- a/poly1305-aes.c
+++ b/poly1305-aes.c
@@ -56,13 +56,12 @@ poly1305_aes_set_nonce (struct poly1305_aes_ctx *ctx,
   memcpy (ctx->nonce, nonce, POLY1305_AES_NONCE_SIZE);
 }
 
-#define COMPRESS(ctx, data) _nettle_poly1305_block(&(ctx)->pctx, (data), 1)
-
 void
 poly1305_aes_update (struct poly1305_aes_ctx *ctx,
 		     size_t length, const uint8_t *data)
 {
-  MD_UPDATE (ctx, length, data, COMPRESS, (void) 0);
+  ctx->index = _nettle_poly1305_update (&(ctx)->pctx,
+					ctx->block, ctx->index, length, data);
 }
 
 void
diff --git a/poly1305-internal.h b/poly1305-internal.h
index 9932d524..a6afd466 100644
--- a/poly1305-internal.h
+++ b/poly1305-internal.h
@@ -53,7 +53,15 @@ void _nettle_poly1305_digest (struct poly1305_ctx *ctx, union nettle_block16 *s)
 /* Process one block. */
 void _nettle_poly1305_block (struct poly1305_ctx *ctx, const uint8_t *m,
 			     unsigned high);
-
+/* Updates CTX by hashing M, which must consist of BLOCKS complete
+   blocks. For convenience, returns a pointer to the end of the data.
+   Every block is processed as by _nettle_poly1305_block with high = 1. */
+const uint8_t *
+_nettle_poly1305_blocks (struct poly1305_ctx *ctx, size_t blocks, const uint8_t *m);
+
+unsigned
+_nettle_poly1305_update (struct poly1305_ctx *ctx, uint8_t *buffer, unsigned index,
+			 size_t length, const uint8_t *m);
 #ifdef __cplusplus
 }
 #endif
diff --git a/poly1305-update.c b/poly1305-update.c
new file mode 100644
index 00000000..15ee3231
--- /dev/null
+++ b/poly1305-update.c
@@ -0,0 +1,78 @@
+/* poly1305-update.c
+
+   Copyright (C) 2022 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#if HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "poly1305.h"
+#include "poly1305-internal.h"
+#include "md-internal.h"
+
+#if HAVE_NATIVE_fat_poly1305_blocks
+const uint8_t *
+_nettle_poly1305_blocks_c(struct poly1305_ctx *ctx,
+			   size_t blocks, const uint8_t *m);
+
+const uint8_t *
+_nettle_poly1305_blocks_c(struct poly1305_ctx *ctx,
+			   size_t blocks, const uint8_t *m)
+{
+  for (; blocks; blocks--, m += POLY1305_BLOCK_SIZE)
+    _nettle_poly1305_block(ctx, m, 1);
+  return m;
+}
+#endif
+
+unsigned
+_nettle_poly1305_update (struct poly1305_ctx *ctx,
+			 uint8_t *block, unsigned index,
+			 size_t length, const uint8_t *m)
+{
+  if (index > 0)
+    {
+      /* Try to fill partial block */
+      MD_FILL_OR_RETURN_INDEX (POLY1305_BLOCK_SIZE, block, index,
+			       length, m);
+      _nettle_poly1305_block(ctx, block, 1);
+    }
+#if HAVE_NATIVE_poly1305_blocks
+  m = _nettle_poly1305_blocks (ctx, length >> 4, m);
+  length &= 15;
+#else
+  for (; length >= POLY1305_BLOCK_SIZE;
+       length -= POLY1305_BLOCK_SIZE, m += POLY1305_BLOCK_SIZE)
+    _nettle_poly1305_block (ctx, m, 1);
+#endif
+
+  memcpy (block, m, length);
+  return length;
+}
diff --git a/powerpc64/fat/poly1305-blocks.asm b/powerpc64/fat/poly1305-blocks.asm
new file mode 100644
index 00000000..9efef0a0
--- /dev/null
+++ b/powerpc64/fat/poly1305-blocks.asm
@@ -0,0 +1,38 @@
+C powerpc64/fat/poly1305-blocks.asm
+
+ifelse(`
+   Copyright (C) 2022 Mamone Tarsha
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl picked up by configure
+dnl PROLOGUE(_nettle_poly1305_blocks)
+dnl PROLOGUE(_nettle_fat_poly1305_blocks)
+
+define(`fat_transform', `$1_ppc64')
+include_src(`powerpc64/p9/poly1305-blocks.asm')
diff --git a/powerpc64/machine.m4 b/powerpc64/machine.m4
index b59f0863..8f28f295 100644
--- a/powerpc64/machine.m4
+++ b/powerpc64/machine.m4
@@ -51,3 +51,15 @@ forloop(i,0,63,`deflit(`vs'i,i)')
 forloop(i,0,31,`deflit(`f'i,i)')
 forloop(i,0,7, `deflit(`cr'i,i)')
 ')
+
+C Increase index of general-purpose register by specific value
+C INC_GPR(GPR, INC)
+define(`INC_GPR',`ifelse(substr($1,0,1),`r',
+``r'eval($2+substr($1,1,len($1)))',
+`eval($2+$1)')')
+
+C Increase index of vector register by specific value
+C INC_VR(VR, INC)
+define(`INC_VR',`ifelse(substr($1,0,1),`v',
+``v'eval($2+substr($1,1,len($1)))',
+`eval($2+$1)')')
diff --git a/powerpc64/p7/chacha-2core.asm b/powerpc64/p7/chacha-2core.asm
index d5935263..ec20b4a5 100644
--- a/powerpc64/p7/chacha-2core.asm
+++ b/powerpc64/p7/chacha-2core.asm
@@ -60,6 +60,9 @@ define(`S3p1', `v16')
 
 define(`T0', `v17')
 
+define(`EW_MASK', `v18')
+define(`OW_MASK', `v19')
+
 	.text
 	C _chacha_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
 
@@ -78,6 +81,9 @@ PROLOGUE(_nettle_chacha_2core)
 	vor	Y3, Y3, X1
 
 .Lshared_entry:
+	DATA_LOAD_VEC(EW_MASK,.even_word_mask,r6)
+	DATA_LOAD_VEC(OW_MASK,.odd_word_mask,r6)
+	
 	vadduwm	Y3, Y3, X3
 
 	li	r6, 0x10	C set up some...
@@ -92,14 +98,14 @@ PROLOGUE(_nettle_chacha_2core)
 	vor	S3, X3, X3
 	vor	S3p1, Y3, Y3
 
-	vmrgow	Y0, X0, X0	C  1  1  3  3
-	vmrgew	X0, X0, X0	C  0  0  2  2
-	vmrgow	Y1, X1, X1	C  5  5  7  7
-	vmrgew	X1, X1, X1	C  4  4  6  6
-	vmrgow	Y2, X2, X2	C  9  9 11 11
-	vmrgew	X2, X2, X2	C  8  8 10 10
-	vmrgow	Y3, X3, S3p1	C 13 13 15 15
-	vmrgew	X3, X3, S3p1	C 12 12 14 14
+	vperm	Y0, X0, X0, OW_MASK	C  1  1  3  3
+	vperm	X0, X0, X0, EW_MASK	C  0  0  2  2
+	vperm	Y1, X1, X1, OW_MASK	C  5  5  7  7
+	vperm	X1, X1, X1, EW_MASK	C  4  4  6  6
+	vperm	Y2, X2, X2, OW_MASK	C  9  9 11 11
+	vperm	X2, X2, X2, EW_MASK	C  8  8 10 10
+	vperm	Y3, X3, S3p1, OW_MASK	C 13 13 15 15
+	vperm	X3, X3, S3p1, EW_MASK	C 12 12 14 14
 
 	vspltisw ROT16, -16	C -16 instead of 16 actually works!
 	vspltisw ROT12, 12
@@ -189,17 +195,17 @@ C Y3  A15 B15 A13 B13  X3  A12 B12 A14 B14 (Y3 swapped)
 
 	bdnz	.Loop
 
-	vmrgew	T0, X0, Y0
-	vmrgow	Y0, X0, Y0
+	vperm	T0, X0, Y0, EW_MASK
+	vperm	Y0, X0, Y0, OW_MASK
 
-	vmrgew	X0, X1, Y1
-	vmrgow	Y1, X1, Y1
+	vperm	X0, X1, Y1, EW_MASK
+	vperm	Y1, X1, Y1, OW_MASK
 
-	vmrgew	X1, X2, Y2
-	vmrgow	Y2, X2, Y2
+	vperm	X1, X2, Y2, EW_MASK
+	vperm	Y2, X2, Y2, OW_MASK
 
-	vmrgew	X2, X3, Y3
-	vmrgow	Y3, X3, Y3
+	vperm	X2, X3, Y3, EW_MASK
+	vperm	Y3, X3, Y3, OW_MASK
 
 	vadduwm T0, T0, S0
 	vadduwm Y0, Y0, S0
@@ -251,6 +257,15 @@ PROLOGUE(_nettle_chacha_2core32)
 	b	.Lshared_entry
 EPILOGUE(_nettle_chacha_2core32)
 
+.rodata
+.align 4
+.even_word_mask:
+IF_LE(`.byte 27,26,25,24,11,10,9,8,19,18,17,16,3,2,1,0')
+IF_BE(`.byte 0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27')
+.odd_word_mask:
+IF_LE(`.byte 31,30,29,28,15,14,13,12,23,22,21,20,7,6,5,4')
+IF_BE(`.byte 4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31')
+
 divert(-1)
 define core2state
 p/x $vs32.v4_int32
diff --git a/powerpc64/p9/poly1305-blocks.asm b/powerpc64/p9/poly1305-blocks.asm
new file mode 100644
index 00000000..90e3df7b
--- /dev/null
+++ b/powerpc64/p9/poly1305-blocks.asm
@@ -0,0 +1,434 @@
+C powerpc64/p9/poly1305-blocks.asm
+
+ifelse(`
+   Copyright (C) 2013, 2022 Niels Möller
+   Copyright (C) 2022 Mamone Tarsha
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+include_src(`powerpc64/p9/poly1305.m4')
+
+C Register usage:
+
+define(`SP', `r1')
+define(`TOCP', `r2')
+
+C Arguments
+define(`CTX', `r3')
+define(`BLOCKS', `r4')
+define(`DATA', `r5')
+
+define(`PADBYTE', `r6') C Padding byte register
+
+define(`DEFINES_BLOCK_R44', `
+	define(`R0', `v0')
+	define(`R1', `v1')
+	define(`R2', `v2')
+	define(`S1', `v3')
+	define(`S2', `v4')
+	define(`H0', `v5')
+	define(`H1', `v6')
+	define(`H2', `v7')
+
+	define(`R3', `v8')
+	define(`R4', `v9')
+	define(`R5', `v10')
+	define(`S4', `v11')
+	define(`S5', `v12')
+
+	define(`T0', `v13')
+	define(`T1', `v14')
+	define(`T2', `v15')
+	define(`T3', `v16')
+	define(`T4', `v17')
+	define(`T5', `v18')
+	define(`TMP', `v19')
+	define(`TMP2', `v20')
+
+	define(`ZERO', `v21')
+	define(`MASK44', `v22')
+	define(`MASK42L', `v23')
+	define(`MASK44L', `v24')
+	define(`T4PAD', `v25')
+	define(`D40', `v26')
+	define(`D20', `v27')
+	define(`D24', `v28')
+	define(`D44', `v29')
+	define(`D2', `v30')
+	define(`D4', `v31')
+	')
+
+C Compute S_1 = 20 * R_1 and S_2 = 20 * R_2
+C COMPUTE_S(S1, S2, R1, R2)
+define(`COMPUTE_S', `
+	vsld		$1, $3, D2
+	vsld		$2, $4, D2
+	vaddudm		$1, $1, $3
+	vaddudm		$2, $2, $4
+	vsld		$1, $1, D2
+	vsld		$2, $2, D2
+	')
+
+C Convert two-part radix 2^64 to three-part radix 2^44 of four blocks
+C R64_TO_R44_4B(VR0, VR1, VR2, VR3, VR4, VR5)
+define(`R64_TO_R44_4B', `
+	vsrd		$3, $2, D24
+	vsrd		$6, $5, D24
+	vsrd		TMP, $1, D44
+	vsrd		TMP2, $4, D44
+	vsld		$2, $2, D20
+	vsld		$5, $5, D20
+	vor			$2, $2, TMP
+	vor			$5, $5, TMP2
+	vand		$1, $1, MASK44
+	vand		$4, $4, MASK44
+	vand		$2, $2, MASK44
+	vand		$5, $5, MASK44
+	')
+
+C T_0 = R_0 H_0 + S_2 H_1 + S_1 H_2
+C T_1 = R_1 H_0 + R_0 H_1 + S_2 H_2
+C T_2 = R_2 H_0 + R_1 H_1 + R_0 H_2
+C MUL(T0, T1, T2, H0, H1, H2)
+define(`MUL', `
+	vmsumudm	$1, $4, R0, ZERO
+	vmsumudm	$2, $4, R1, ZERO
+	vmsumudm	$3, $4, R2, ZERO
+
+	vmsumudm	$1, $5, S2, $1
+	vmsumudm	$2, $5, R0, $2
+	vmsumudm	$3, $5, R1, $3
+
+	vmsumudm	$1, $6, S1, $1
+	vmsumudm	$2, $6, S2, $2
+	vmsumudm	$3, $6, R0, $3
+	')
+
+C Apply the equations above to four blocks
+C Each two successive blocks are interleaved horizontally
+C MUL_4B(T0, T1, T2, H0, H1, H2, H3, H4, H5)
+define(`MUL_4B', `
+	vmsumudm	$1, $7, R0, ZERO
+	vmsumudm	$2, $7, R1, ZERO
+	vmsumudm	$3, $7, R2, ZERO
+
+	vmsumudm	$1, $8, S2, $1
+	vmsumudm	$2, $8, R0, $2
+	vmsumudm	$3, $8, R1, $3
+
+	vmsumudm	$1, $9, S1, $1
+	vmsumudm	$2, $9, S2, $2
+	vmsumudm	$3, $9, R0, $3
+
+	vmsumudm	$1, $4, R3, $1
+	vmsumudm	$2, $4, R4, $2
+	vmsumudm	$3, $4, R5, $3
+
+	vmsumudm	$1, $5, S5, $1
+	vmsumudm	$2, $5, R3, $2
+	vmsumudm	$3, $5, R4, $3
+
+	vmsumudm	$1, $6, S4, $1
+	vmsumudm	$2, $6, S5, $2
+	vmsumudm	$3, $6, R3, $3
+	')
+
+C Reduction phase of two interleaved chains
+C RED(H0, H1, H2, T0, T1, T2)
+define(`RED', `
+	vand		$1, $4, MASK44L
+	vsro		$4, $4, D40
+	vsrd		$4, $4, D4
+	vadduqm		$5, $5, $4
+	vand		$2, $5, MASK44L
+	vsro		$5, $5, D40
+	vsrd		$5, $5, D4
+	vadduqm		$6, $6, $5
+	vand		$3, $6, MASK42L
+	vsro		$6, $6, D40
+	vsrd		$6, $6, D2
+	vadduqm		$1, $1, $6
+	vsld		$6, $6, D2
+	vadduqm		$1, $1, $6
+	vsrd		TMP, $1, D44
+	vand		$1, $1, MASK44L
+	vadduqm		$2, $2, TMP
+	')
+
+.text
+
+C const uint8_t *_nettle_poly1305_blocks(struct poly1305_ctx *ctx,
+C 				size_t blocks, const uint8_t *data)
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_poly1305_blocks)
+	C Save non-volatile registers (r31 and v20-v31)
+	std		r31,-8(SP)
+	stxv	VSR(v31),-32(SP)
+	stxv	VSR(v30),-48(SP)
+	stxv	VSR(v29),-64(SP)
+	stxv	VSR(v28),-80(SP)
+	stxv	VSR(v27),-96(SP)
+	stxv	VSR(v26),-112(SP)
+	stxv	VSR(v25),-128(SP)
+	stxv	VSR(v24),-144(SP)
+	stxv	VSR(v23),-160(SP)
+	stxv	VSR(v22),-176(SP)
+	stxv	VSR(v21),-192(SP)
+	stxv	VSR(v20),-208(SP)
+
+	C Initialize padding byte register
+	li		PADBYTE, 1
+
+C Process blocks four at a time when there are at least POLY1305_BLOCK_THRESHOLD
+	DEFINES_BLOCK_R44()
+	cmpldi	BLOCKS, POLY1305_BLOCK_THRESHOLD
+	blt		Ldata_r64
+	srdi	r9, BLOCKS, 2
+	andi.	BLOCKS, BLOCKS, 3
+	mtctr	r9
+
+	C Initialize constants
+
+	vxor 		ZERO, ZERO, ZERO
+	vspltisb	D2, 2
+	vspltisb	D4, 4
+	addis		r9, TOCP, .mask44@got@ha
+	ld			r9, .mask44@got@l(r9)
+	lxvd2x		VSR(MASK44), 0, r9
+	addi		r9, r9, 16
+	lxvd2x		VSR(MASK42L), 0, r9
+	addi		r9, r9, 16
+	lxvd2x		VSR(D40), 0, r9
+	addi		r9, r9, 16
+	lxvd2x		VSR(D20), 0, r9
+	addi		r9, r9, 16
+	lxvd2x		VSR(D24), 0, r9
+	addi		r9, r9, 16
+	lxvd2x		VSR(D44), 0, r9
+	xxmrghd		VSR(MASK44L), VSR(ZERO), VSR(MASK44)
+
+	sldi		r10, PADBYTE, 40
+	mtvsrdd		VSR(T4PAD), r10, r10
+
+	C Load key of radix 2^44
+	lxsd		R0, 0(CTX)
+	lxsd		R1, 8(CTX)
+	vsrd		R2, R1, D24
+	vsrd		TMP, R0, D44
+	vsld		R1, R1, D20
+	vor			R1, R1, TMP
+	vand		R0, R0, MASK44
+	vand		R1, R1, MASK44
+	xxmrghd		VSR(R0), VSR(R0), VSR(ZERO)
+	xxmrghd		VSR(R1), VSR(R1), VSR(ZERO)
+	xxmrghd		VSR(R2), VSR(R2), VSR(ZERO)
+
+	COMPUTE_S(S1, S2, R1, R2)
+
+	C Calculate R^2 = R R
+
+	MUL(T0, T1, T2, R0, R1, R2)
+	RED(H0, H1, H2, T0, T1, T2)
+	xxpermdi	VSR(R0), VSR(R0), VSR(H0), 0b01
+	xxpermdi	VSR(R1), VSR(R1), VSR(H1), 0b01
+	xxpermdi	VSR(R2), VSR(R2), VSR(H2), 0b01
+
+	COMPUTE_S(S1, S2, R1, R2)
+
+	C Calculate R^3 = R^2 R
+
+	xxmrghd		VSR(R3), VSR(ZERO), VSR(R0)
+	xxmrghd		VSR(R4), VSR(ZERO), VSR(R1)
+	xxmrghd		VSR(R5), VSR(ZERO), VSR(R2)
+
+	MUL(T0, T1, T2, R3, R4, R5)
+	RED(H0, H1, H2, T0, T1, T2)
+
+	C Calculate R^4 = R^2 R^2
+
+	xxmrgld		VSR(R3), VSR(ZERO), VSR(R0)
+	xxmrgld		VSR(R4), VSR(ZERO), VSR(R1)
+	xxmrgld		VSR(R5), VSR(ZERO), VSR(R2)
+
+	MUL(T0, T1, T2, R3, R4, R5)
+	RED(R3, R4, R5, T0, T1, T2)
+	xxmrgld		VSR(R3), VSR(H0), VSR(R3)
+	xxmrgld		VSR(R4), VSR(H1), VSR(R4)
+	xxmrgld		VSR(R5), VSR(H2), VSR(R5)
+
+	COMPUTE_S(S4, S5, R4, R5)
+
+	C Load state
+	ld			r7, 32(CTX)
+	ld			r8, 40(CTX)
+	ld			r31, 48(CTX)
+
+	C Fold high part of H2
+	srdi		r9, r31, 2
+	sldi		r10, r9, 2
+	add			r10, r10, r9
+	andi.		r31, r31, 3
+	li			r9, 0
+	addc		r7, r7, r10
+	adde		r8, r8, r9
+	adde		r31, r31, r9
+
+	mtvsrdd		VSR(H0), 0, r7
+	mtvsrdd		VSR(H1), 0, r8
+	mtvsrdd		VSR(H2), 0, r31
+
+	C Convert state of radix 2^64 to 2^44
+	vsrd		TMP, H1, D24
+	vsld		H2, H2, D40
+	vor			H2, H2, TMP
+	vsrd		TMP2, H0, D44
+	vsld		H1, H1, D20
+	vor			H1, H1, TMP2
+	vand		H0, H0, MASK44
+	vand		H1, H1, MASK44
+
+	li			r8, 0x10
+	li			r9, 0x20
+	li			r10, 0x30
+L4B_loop:
+	C Load four blocks
+	lxvd2x		VSR(T3), 0, DATA
+	lxvd2x		VSR(T4), r8, DATA
+	lxvd2x		VSR(T5), r9, DATA
+	lxvd2x		VSR(TMP), r10, DATA
+IF_BE(`
+	xxbrd		VSR(T3), VSR(T3)
+	xxbrd		VSR(T4), VSR(T4)
+	xxbrd		VSR(T5), VSR(T5)
+	xxbrd		VSR(TMP), VSR(TMP)
+')
+	C Permute blocks in little-endian and line each two successive
+	C blocks horizontally
+	xxmrghd		VSR(T0), VSR(T4), VSR(T3)
+	xxmrgld		VSR(T1), VSR(T4), VSR(T3)
+	xxmrghd		VSR(T3), VSR(TMP), VSR(T5)
+	xxmrgld		VSR(T4), VSR(TMP), VSR(T5)
+	R64_TO_R44_4B(T0, T1, T2, T3, T4, T5)
+	vor			T2, T2, T4PAD
+	vor			T5, T5, T4PAD
+
+	C Combine first block with previous state
+	vaddudm		H0, H0, T0
+	vaddudm		H1, H1, T1
+	vaddudm		H2, H2, T2
+
+	MUL_4B(T0, T1, T2, H0, H1, H2, T3, T4, T5)
+	RED(H0, H1, H2, T0, T1, T2)
+
+	addi		DATA, DATA, 64
+	bdnz		L4B_loop
+
+	C Moving carry
+	vsrd		TMP, H1, D44
+	vaddudm		H2, H2, TMP
+	vsrd		TMP2, H2, D40
+	vsrd		TMP2, TMP2, D2
+	vsld		TMP, TMP2, D2
+	vand		H1, H1, MASK44
+	vaddudm		TMP2, TMP2, TMP
+	vaddudm		H0, H0, TMP2
+	vsrd		TMP, H0, D44
+	vaddudm		H1, H1, TMP
+	vand		H2, H2, MASK42L
+	vand		H0, H0, MASK44
+
+	C Convert state of radix 2^44 to 2^64
+	vsld		TMP, H1, D44
+	vor			H0, H0, TMP
+	vsrd		H1, H1, D20
+	vsld		TMP2, H2, D24
+	vor			H1, H1, TMP2
+	vsrd		H2, H2, D40
+
+	xxswapd		VSR(H0), VSR(H0)
+	xxswapd		VSR(H1), VSR(H1)
+	xxswapd		VSR(H2), VSR(H2)
+
+	C Store state
+	stxsd		H0, 32(CTX)
+	stxsd		H1, 40(CTX)
+	stxsd		H2, 48(CTX)
+
+Ldata_r64:
+	cmpldi	BLOCKS, 0
+	beq		Ldone
+	mtctr	BLOCKS
+	mr			r4, PADBYTE
+	ld			r6, P1305_H0 (CTX)
+	ld			r7, P1305_H1 (CTX)
+	ld			r8, P1305_H2 (CTX)
+L1B_loop:
+	BLOCK_R64(CTX,DATA,r4,r6,v0)
+	mfvsrld		r6, VSR(v0)
+	mfvsrld		r7, VSR(v1)
+	mfvsrd		r8, VSR(v1)
+	addi	DATA, DATA, 16
+	bdnz	L1B_loop
+	std		r6, P1305_H0 (CTX)
+	std		r7, P1305_H1 (CTX)
+	std		r8, P1305_H2 (CTX)
+
+Ldone:
+	C Restore non-volatile registers (r31 and v20-v31)
+	ld		r31, -8(SP)
+	lxv		VSR(v31),-32(SP)
+	lxv		VSR(v30),-48(SP)
+	lxv		VSR(v29),-64(SP)
+	lxv		VSR(v28),-80(SP)
+	lxv		VSR(v27),-96(SP)
+	lxv		VSR(v26),-112(SP)
+	lxv		VSR(v25),-128(SP)
+	lxv		VSR(v24),-144(SP)
+	lxv		VSR(v23),-160(SP)
+	lxv		VSR(v22),-176(SP)
+	lxv		VSR(v21),-192(SP)
+	lxv		VSR(v20),-208(SP)
+
+	mr		r3, DATA
+
+	blr
+EPILOGUE(_nettle_poly1305_blocks)
+
+.rodata
+.align 4
+.mask44:
+.quad 0x00000FFFFFFFFFFF,0x00000FFFFFFFFFFF
+.mask42l:
+.quad 0x0000000000000000,0x000003FFFFFFFFFF
+.d40:
+.quad 0x0000000000000028,0x0000000000000028
+.d20:
+.quad 0x0000000000000014,0x0000000000000014
+.d24:
+.quad 0x0000000000000018,0x0000000000000018
+.d44:
+.quad 0x000000000000002C,0x000000000000002C
diff --git a/powerpc64/p9/poly1305-internal.asm b/powerpc64/p9/poly1305-internal.asm
index a082fed2..c23e16fd 100644
--- a/powerpc64/p9/poly1305-internal.asm
+++ b/powerpc64/p9/poly1305-internal.asm
@@ -30,6 +30,8 @@ ifelse(`
    not, see http://www.gnu.org/licenses/.
 ')
 
+include_src(`powerpc64/p9/poly1305.m4')
+
 C Register usage:
 
 define(`SP', `r1')
@@ -37,36 +39,8 @@ define(`TOCP', `r2')
 
 C Argments
 define(`CTX', `r3')
-define(`M', `r4')
-define(`M128', `r5')
-
-C Working state
-define(`H0', `r6')
-define(`H1', `r7')
-define(`H2', `r8')
-define(`T0', `r9')
-define(`T1', `r10')
-define(`T2', `r8')
-define(`T2A', `r9')
-define(`T2S', `r10')
-define(`IDX', `r6')
-define(`RZ', `r7')
-
-define(`ZERO', `v0')
-define(`F0', `v1')
-define(`F1', `v2')
-define(`F0S', `v3')
-define(`T', `v4')
-
-define(`R', `v5')
-define(`S', `v6')
-
-define(`T00', `v7')
-define(`T10', `v8')
-define(`T11', `v9')
-define(`MU0', `v10')
-define(`MU1', `v11')
-define(`TMP', `v12')
+define(`DATA', `r4')
+define(`PADBYTE', `r5') C Padding byte register
 
 .text
 
@@ -114,59 +88,17 @@ EPILOGUE(_nettle_poly1305_set_key)
 C void _nettle_poly1305_block(struct poly1305_ctx *ctx, const uint8_t *m, unsigned m128)
 define(`FUNC_ALIGN', `5')
 PROLOGUE(_nettle_poly1305_block)
-	ld			H0, P1305_H0 (CTX)
-	ld			H1, P1305_H1 (CTX)
-	ld			H2, P1305_H2 (CTX)
-IF_LE(`
-	ld			T0, 0(M)
-	ld			T1, 8(M)
-')
-IF_BE(`
-	ldbrx		T0, 0, M
-	addi		M, M, 8
-	ldbrx		T0, 0, M
-')
-
-	addc		T0, T0, H0
-	adde		T1, T1, H1
-	adde		T2, M128, H2
-
-	mtvsrdd		VSR(T), T0, T1
-
-	li			IDX, P1305_S0
-	lxvd2x		VSR(R), 0, CTX
-	lxvd2x		VSR(S), IDX, CTX
-
-	andi.		T2A, T2, 3
-	srdi		T2S, T2, 2
-
-	li			RZ, 0
-	vxor		ZERO, ZERO, ZERO
-
-	xxpermdi	VSR(MU0), VSR(R), VSR(S), 0b01
-	xxswapd		VSR(MU1), VSR(R)
-
-	mtvsrdd		VSR(T11), 0, T2A
-	mtvsrdd		VSR(T00), T2S, RZ
-	mtvsrdd		VSR(T10), 0, T2
-
-	vmsumudm	F0, T, MU0, ZERO
-	vmsumudm	F1, T, MU1, ZERO
-	vmsumudm	TMP, T11, MU1, ZERO
-
-	vmsumudm	F0, T00, S, F0
-	vmsumudm	F1, T10, MU0, F1
+	ld			r6, P1305_H0 (CTX)
+	ld			r7, P1305_H1 (CTX)
+	ld			r8, P1305_H2 (CTX)
 
-	xxmrgld		VSR(TMP), VSR(TMP), VSR(ZERO)
-	xxswapd		VSR(F0S), VSR(F0)
-	vadduqm		F1, F1, TMP
-	stxsd		F0S, P1305_H0 (CTX)
+	BLOCK_R64(CTX,DATA,PADBYTE,r6,v0)
 
-	li			IDX, P1305_H1
-	xxmrghd		VSR(F0), VSR(ZERO), VSR(F0)
-	vadduqm		F1, F1, F0
-	xxswapd		VSR(F1), VSR(F1)
-	stxvd2x		VSR(F1), IDX, CTX
+	li			r10, P1305_H1
+	xxswapd		VSR(v0), VSR(v0)
+	xxswapd		VSR(v1), VSR(v1)
+	stxsd		v0, P1305_H0 (CTX)
+	stxvd2x		VSR(v1), r10, CTX
 
 	blr
 EPILOGUE(_nettle_poly1305_block)
diff --git a/powerpc64/p9/poly1305.m4 b/powerpc64/p9/poly1305.m4
new file mode 100644
index 00000000..13a57e83
--- /dev/null
+++ b/powerpc64/p9/poly1305.m4
@@ -0,0 +1,102 @@
+C Minimum number of blocks needed to take the parallel code path,
+C which processes blocks four at a time
+define(`POLY1305_BLOCK_THRESHOLD', `12')
+
+C DEFINES_BLOCK_R64(GPR0, VR0)
+define(`DEFINES_BLOCK_R64', `
+	define(`H0', `$1')
+	define(`H1', `INC_GPR($1,1)')
+	define(`H2', `INC_GPR($1,2)')
+
+	define(`T0', `INC_GPR($1,3)')
+	define(`T1', `INC_GPR($1,4)')
+	define(`T2', `H2')
+	define(`T2A', `INC_GPR($1,3)')
+	define(`T2S', `INC_GPR($1,4)')
+	define(`RZ', `H0')
+	define(`IDX', `INC_GPR($1,4)')
+
+	define(`F0', `$2')
+	define(`F1', `INC_VR($2,1)')
+
+	define(`ZERO', `INC_VR($2,2)')
+	define(`F0S', `INC_VR($2,3)')
+	define(`F11', `INC_VR($2,4)')
+	define(`T', `INC_VR($2,5)')
+
+	define(`R', `INC_VR($2,6)')
+	define(`S', `INC_VR($2,7)')
+
+	define(`T00', `INC_VR($2,8)')
+	define(`T10', `INC_VR($2,9)')
+	define(`T11', `INC_VR($2,10)')
+	define(`MU0', `INC_VR($2,11)')
+	define(`MU1', `INC_VR($2,12)')
+	')
+
+C CTX is the address of context where key and pre-computed values are stored
+C DATA is the address of input block
+C PADBYTE is padding byte for input block
+C GPR0 is the first of a run of consecutive general-purpose registers
+C used by the macro, laid out as follows
+C GPR0, GPR1, GPR2 are inputs holding the previous state in radix 2^64
+C GPR3, GPR4 are temporary registers
+C VR0 is the first of a run of consecutive vector registers used by
+C the macro, laid out as follows
+C VR0, VR1 are outputs holding the resulting state in radix 2^64, ordered as
+C (low 64 bits of VR0) + (low 64 bits of VR1) + (high 64 bits of VR1)
+C VR2..VR12 are temporary registers
+C BLOCK_R64(CTX, DATA, PADBYTE, GPR0, VR0)
+define(`BLOCK_R64', `
+	DEFINES_BLOCK_R64($4,$5)
+	C Load 128-bit input block
+IF_LE(`
+	ld			T0, 0($2)
+	ld			T1, 8($2)
+')
+IF_BE(`
+	li			IDX, 8
+	ldbrx		T1, IDX, $2
+	ldbrx		T0, 0, $2
+')
+	C Combine state with input block, latter is padded to 17-bytes
+	C by low-order byte of PADBYTE register
+	addc		T0, T0, H0
+	adde		T1, T1, H1
+	adde		T2, $3, H2
+
+	mtvsrdd		VSR(T), T0, T1
+
+	C Load key and pre-computed values
+	li			IDX, 16
+	lxvd2x		VSR(R), 0, $1
+	lxvd2x		VSR(S), IDX, $1
+
+	andi.		T2A, T2, 3
+	srdi		T2S, T2, 2
+
+	li			RZ, 0
+	vxor		ZERO, ZERO, ZERO
+
+	xxpermdi	VSR(MU0), VSR(R), VSR(S), 0b01
+	xxswapd		VSR(MU1), VSR(R)
+
+	mtvsrdd		VSR(T11), 0, T2A
+	mtvsrdd		VSR(T00), T2S, RZ
+	mtvsrdd		VSR(T10), 0, T2
+
+	C Multiply key by combined state and block
+	vmsumudm	F0, T, MU0, ZERO
+	vmsumudm	F1, T, MU1, ZERO
+	vmsumudm	F11, T11, MU1, ZERO
+
+	vmsumudm	F0, T00, S, F0
+	vmsumudm	F1, T10, MU0, F1
+
+	C Product addition
+	xxmrgld		VSR(F11), VSR(F11), VSR(ZERO)
+	vadduqm		F1, F1, F11
+
+	xxmrghd		VSR(F0S), VSR(ZERO), VSR(F0)
+	vadduqm		F1, F1, F0S
+	')
diff --git a/s390x/fat/sha256-compress-2.asm b/s390x/fat/sha256-compress-n-2.asm
index f4b16181..06fb1014 100644
--- a/s390x/fat/sha256-compress-2.asm
+++ b/s390x/fat/sha256-compress-n-2.asm
@@ -1,4 +1,4 @@
-C s390x/fat/sha256-compress-2.asm
+C s390x/fat/sha256-compress-n-2.asm
 
 ifelse(`
    Copyright (C) 2021 Mamone Tarsha
@@ -30,7 +30,7 @@ ifelse(`
    not, see http://www.gnu.org/licenses/.
 ')
 
-dnl PROLOGUE(_nettle_sha256_compress) picked up by configure
+dnl PROLOGUE(_nettle_sha256_compress_n) picked up by configure
 
 define(`fat_transform', `$1_s390x')
-include_src(`s390x/msa_x1/sha256-compress.asm')
+include_src(`s390x/msa_x1/sha256-compress-n.asm')
diff --git a/s390x/msa_x1/sha256-compress.asm b/s390x/msa_x1/sha256-compress-n.asm
index 9a9511fb..51539927 100644
--- a/s390x/msa_x1/sha256-compress.asm
+++ b/s390x/msa_x1/sha256-compress-n.asm
@@ -1,7 +1,7 @@
-C s390x/msa_x1/sha256-compress.asm
+C s390x/msa_x1/sha256-compress-n.asm
 
 ifelse(`
-   Copyright (C) 2021 Mamone Tarsha
+   Copyright (C) 2021, 2022 Mamone Tarsha, Niels Möller
    This file is part of GNU Nettle.
 
    GNU Nettle is free software: you can redistribute it and/or
@@ -56,25 +56,23 @@ C |----------------------------------------------|
 C |                 H7 (4 bytes)                 |
 C *----------------------------------------------*
 
-.file "sha256-compress.asm"
+.file "sha256-compress-n.asm"
 
 .text
 
 C SHA function code
 define(`SHA256_FUNCTION_CODE', `2')
-C Size of block
-define(`SHA256_BLOCK_SIZE', `64')
 
-C void 
-C _nettle_sha256_compress(uint32_t *state, const uint8_t *input,
-C                         const uint32_t *k)
+C const uint8_t *
+C _nettle_sha256_compress_n(uint32_t *state, const uint32_t *k,
+C			    size_t blocks, const uint8_t *input)
 
-PROLOGUE(_nettle_sha256_compress)
+PROLOGUE(_nettle_sha256_compress_n)
     lghi           %r0,SHA256_FUNCTION_CODE      C SHA-256 Function Code
     lgr            %r1,%r2
-    lgr            %r4,%r3
-    lghi           %r5,SHA256_BLOCK_SIZE
-1:  .long   0xb93e0004                           C kimd %r0,%r4. perform KIMD-SHA operation on data
+    lgr            %r2, %r5
+    sllg	   %r3, %r4, 6                   C 64 * block size
+1:  .long   0xb93e0002                           C kimd %r0,%r2. perform KIMD-SHA operation on data
     brc            1,1b
     br             RA
-EPILOGUE(_nettle_sha256_compress)
+EPILOGUE(_nettle_sha256_compress_n)
diff --git a/sha2-internal.h b/sha2-internal.h
index 40f25a5f..93080bee 100644
--- a/sha2-internal.h
+++ b/sha2-internal.h
@@ -39,8 +39,9 @@
 /* Internal compression function. STATE points to 8 uint32_t words,
    DATA points to 64 bytes of input data, possibly unaligned, and K
    points to the table of constants. */
-void
-_nettle_sha256_compress(uint32_t *state, const uint8_t *data, const uint32_t *k);
+const uint8_t *
+_nettle_sha256_compress_n(uint32_t *state, const uint32_t *k,
+			  size_t blocks, const uint8_t *data);
 
 /* Internal compression function. STATE points to 8 uint64_t words,
    DATA points to 128 bytes of input data, possibly unaligned, and K
diff --git a/sha256-compress.c b/sha256-compress-n.c
index cf17e3e1..d135d14f 100644
--- a/sha256-compress.c
+++ b/sha256-compress-n.c
@@ -1,8 +1,8 @@
-/* sha256-compress.c
+/* sha256-compress-n.c
 
    The compression function of the sha256 hash function.
 
-   Copyright (C) 2001, 2010 Niels Möller
+   Copyright (C) 2001, 2010, 2022 Niels Möller
 
    This file is part of GNU Nettle.
 
@@ -118,26 +118,19 @@
   } while (0)
 
 /* For fat builds */
-#if HAVE_NATIVE_sha256_compress
-void
-_nettle_sha256_compress_c(uint32_t *state, const uint8_t *input, const uint32_t *k);
-#define _nettle_sha256_compress _nettle_sha256_compress_c
+#if HAVE_NATIVE_sha256_compress_n
+const uint8_t *
+_nettle_sha256_compress_n_c(uint32_t *state, const uint32_t *table,
+			    size_t blocks, const uint8_t *input);
+#define _nettle_sha256_compress_n _nettle_sha256_compress_n_c
 #endif
 
-void
-_nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k)
+const uint8_t *
+_nettle_sha256_compress_n(uint32_t *state, const uint32_t *table,
+			  size_t blocks, const uint8_t *input)
 {
-  uint32_t data[SHA256_DATA_LENGTH];
   uint32_t A, B, C, D, E, F, G, H;     /* Local vars */
-  unsigned i;
-  uint32_t *d;
 
-  for (i = 0; i < SHA256_DATA_LENGTH; i++, input+= 4)
-    {
-      data[i] = READ_UINT32(input);
-    }
-
-  /* Set up first buffer and local data buffer */
   A = state[0];
   B = state[1];
   C = state[2];
@@ -146,55 +139,68 @@ _nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k
   F = state[5];
   G = state[6];
   H = state[7];
-  
-  /* Heavy mangling */
-  /* First 16 subrounds that act on the original data */
 
-  DEBUG(-1);
-  for (i = 0, d = data; i<16; i+=8, k += 8, d+= 8)
+  for (; blocks > 0; blocks--)
     {
-      ROUND(A, B, C, D, E, F, G, H, k[0], d[0]); DEBUG(i);
-      ROUND(H, A, B, C, D, E, F, G, k[1], d[1]); DEBUG(i+1);
-      ROUND(G, H, A, B, C, D, E, F, k[2], d[2]);
-      ROUND(F, G, H, A, B, C, D, E, k[3], d[3]);
-      ROUND(E, F, G, H, A, B, C, D, k[4], d[4]);
-      ROUND(D, E, F, G, H, A, B, C, k[5], d[5]);
-      ROUND(C, D, E, F, G, H, A, B, k[6], d[6]); DEBUG(i+6);
-      ROUND(B, C, D, E, F, G, H, A, k[7], d[7]); DEBUG(i+7);
-    }
+      uint32_t data[SHA256_DATA_LENGTH];
+      unsigned i;
+      const uint32_t *k;
+      uint32_t *d;
+      for (i = 0; i < SHA256_DATA_LENGTH; i++, input+= 4)
+	{
+	  data[i] = READ_UINT32(input);
+	}
+
+      /* Heavy mangling */
+      /* First 16 subrounds that act on the original data */
+
+      DEBUG(-1);
+      for (i = 0, d = data, k = table; i<16; i+=8, k += 8, d+= 8)
+	{
+	  ROUND(A, B, C, D, E, F, G, H, k[0], d[0]); DEBUG(i);
+	  ROUND(H, A, B, C, D, E, F, G, k[1], d[1]); DEBUG(i+1);
+	  ROUND(G, H, A, B, C, D, E, F, k[2], d[2]);
+	  ROUND(F, G, H, A, B, C, D, E, k[3], d[3]);
+	  ROUND(E, F, G, H, A, B, C, D, k[4], d[4]);
+	  ROUND(D, E, F, G, H, A, B, C, k[5], d[5]);
+	  ROUND(C, D, E, F, G, H, A, B, k[6], d[6]); DEBUG(i+6);
+	  ROUND(B, C, D, E, F, G, H, A, k[7], d[7]); DEBUG(i+7);
+	}
   
-  for (; i<64; i += 16, k+= 16)
-    {
-      ROUND(A, B, C, D, E, F, G, H, k[ 0], EXPAND(data,  0)); DEBUG(i);
-      ROUND(H, A, B, C, D, E, F, G, k[ 1], EXPAND(data,  1)); DEBUG(i+1);
-      ROUND(G, H, A, B, C, D, E, F, k[ 2], EXPAND(data,  2)); DEBUG(i+2);
-      ROUND(F, G, H, A, B, C, D, E, k[ 3], EXPAND(data,  3)); DEBUG(i+3);
-      ROUND(E, F, G, H, A, B, C, D, k[ 4], EXPAND(data,  4)); DEBUG(i+4);
-      ROUND(D, E, F, G, H, A, B, C, k[ 5], EXPAND(data,  5)); DEBUG(i+5);
-      ROUND(C, D, E, F, G, H, A, B, k[ 6], EXPAND(data,  6)); DEBUG(i+6);
-      ROUND(B, C, D, E, F, G, H, A, k[ 7], EXPAND(data,  7)); DEBUG(i+7);
-      ROUND(A, B, C, D, E, F, G, H, k[ 8], EXPAND(data,  8)); DEBUG(i+8);
-      ROUND(H, A, B, C, D, E, F, G, k[ 9], EXPAND(data,  9)); DEBUG(i+9);
-      ROUND(G, H, A, B, C, D, E, F, k[10], EXPAND(data, 10)); DEBUG(i+10);
-      ROUND(F, G, H, A, B, C, D, E, k[11], EXPAND(data, 11)); DEBUG(i+11);
-      ROUND(E, F, G, H, A, B, C, D, k[12], EXPAND(data, 12)); DEBUG(i+12);
-      ROUND(D, E, F, G, H, A, B, C, k[13], EXPAND(data, 13)); DEBUG(i+13);
-      ROUND(C, D, E, F, G, H, A, B, k[14], EXPAND(data, 14)); DEBUG(i+14);
-      ROUND(B, C, D, E, F, G, H, A, k[15], EXPAND(data, 15)); DEBUG(i+15);
-    }
-
-  /* Update state */
-  state[0] += A;
-  state[1] += B;
-  state[2] += C;
-  state[3] += D;
-  state[4] += E;
-  state[5] += F;
-  state[6] += G;
-  state[7] += H;
+      for (; i<64; i += 16, k+= 16)
+	{
+	  ROUND(A, B, C, D, E, F, G, H, k[ 0], EXPAND(data,  0)); DEBUG(i);
+	  ROUND(H, A, B, C, D, E, F, G, k[ 1], EXPAND(data,  1)); DEBUG(i+1);
+	  ROUND(G, H, A, B, C, D, E, F, k[ 2], EXPAND(data,  2)); DEBUG(i+2);
+	  ROUND(F, G, H, A, B, C, D, E, k[ 3], EXPAND(data,  3)); DEBUG(i+3);
+	  ROUND(E, F, G, H, A, B, C, D, k[ 4], EXPAND(data,  4)); DEBUG(i+4);
+	  ROUND(D, E, F, G, H, A, B, C, k[ 5], EXPAND(data,  5)); DEBUG(i+5);
+	  ROUND(C, D, E, F, G, H, A, B, k[ 6], EXPAND(data,  6)); DEBUG(i+6);
+	  ROUND(B, C, D, E, F, G, H, A, k[ 7], EXPAND(data,  7)); DEBUG(i+7);
+	  ROUND(A, B, C, D, E, F, G, H, k[ 8], EXPAND(data,  8)); DEBUG(i+8);
+	  ROUND(H, A, B, C, D, E, F, G, k[ 9], EXPAND(data,  9)); DEBUG(i+9);
+	  ROUND(G, H, A, B, C, D, E, F, k[10], EXPAND(data, 10)); DEBUG(i+10);
+	  ROUND(F, G, H, A, B, C, D, E, k[11], EXPAND(data, 11)); DEBUG(i+11);
+	  ROUND(E, F, G, H, A, B, C, D, k[12], EXPAND(data, 12)); DEBUG(i+12);
+	  ROUND(D, E, F, G, H, A, B, C, k[13], EXPAND(data, 13)); DEBUG(i+13);
+	  ROUND(C, D, E, F, G, H, A, B, k[14], EXPAND(data, 14)); DEBUG(i+14);
+	  ROUND(B, C, D, E, F, G, H, A, k[15], EXPAND(data, 15)); DEBUG(i+15);
+	}
+
+      /* Update state */
+      state[0] = A = state[0] + A;
+      state[1] = B = state[1] + B;
+      state[2] = C = state[2] + C;
+      state[3] = D = state[3] + D;
+      state[4] = E = state[4] + E;
+      state[5] = F = state[5] + F;
+      state[6] = G = state[6] + G;
+      state[7] = H = state[7] + H;
 #if SHA256_DEBUG
-  fprintf(stderr, "99: %8x %8x %8x %8x %8x %8x %8x %8x\n",
-	  state[0], state[1], state[2], state[3],
-	  state[4], state[5], state[6], state[7]);
+      fprintf(stderr, "99: %8x %8x %8x %8x %8x %8x %8x %8x\n",
+	      state[0], state[1], state[2], state[3],
+	      state[4], state[5], state[6], state[7]);
 #endif
+    }
+  return input;
 }
diff --git a/sha256.c b/sha256.c
index 3872ca6f..0c9c21a0 100644
--- a/sha256.c
+++ b/sha256.c
@@ -46,6 +46,7 @@
 #include "sha2-internal.h"
 
 #include "macros.h"
+#include "md-internal.h"
 #include "nettle-write.h"
 
 /* Generated by the shadata program. */
@@ -70,6 +71,12 @@ K[64] =
   0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL, 
 };
 
+void
+sha256_compress(uint32_t *state, const uint8_t *input)
+{
+  _nettle_sha256_compress_n(state, K, 1, input); /* block count 1, table K */
+}
+
 #define COMPRESS(ctx, data) (sha256_compress((ctx)->state, (data)))
 
 /* Initialize the SHA values */
@@ -97,7 +104,22 @@ void
 sha256_update(struct sha256_ctx *ctx,
 	      size_t length, const uint8_t *data)
 {
-  MD_UPDATE (ctx, length, data, COMPRESS, ctx->count++);
+  size_t blocks;
+  if (ctx->index > 0)
+    {
+      /* Try to fill partial block (presumably returns if DATA is too short) */
+      MD_FILL_OR_RETURN (ctx, length, data);
+      sha256_compress (ctx->state, ctx->block);
+      ctx->count++;
+    }
+
+  blocks = length >> 6; /* number of whole 64-byte blocks left in DATA */
+  data = _nettle_sha256_compress_n (ctx->state, K, blocks, data);
+  ctx->count += blocks;
+  length &= 63; /* bytes left over, less than one block */
+
+  memcpy (ctx->block, data, length); /* keep the leftover in ctx->block */
+  ctx->index = length;
 }
 
 static void
@@ -161,9 +183,3 @@ sha224_digest(struct sha256_ctx *ctx,
   sha256_write_digest(ctx, length, digest);
   sha224_init(ctx);
 }
-
-void
-sha256_compress(uint32_t *state, const uint8_t *input)
-{
-  _nettle_sha256_compress(state, input, K);
-}
diff --git a/siv-gcm-aes128.c b/siv-gcm-aes128.c
new file mode 100644
index 00000000..4317d3d8
--- /dev/null
+++ b/siv-gcm-aes128.c
@@ -0,0 +1,65 @@
+/* siv-gcm-aes128.c
+
+   AES-GCM-SIV, RFC8452
+
+   Copyright (C) 2022 Red Hat, Inc.
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "nettle-meta.h"
+#include "siv-gcm.h"
+
+void
+siv_gcm_aes128_encrypt_message (const struct aes128_ctx *ctx,
+				size_t nlength, const uint8_t *nonce,
+				size_t alength, const uint8_t *adata,
+				size_t clength, uint8_t *dst, const uint8_t *src)
+{
+  struct aes128_ctx message_ctx; /* keyed per message by the generic code */
+
+  siv_gcm_encrypt_message (&nettle_aes128, ctx, &message_ctx,
+			   nlength, nonce, alength, adata,
+			   clength, dst, src);
+}
+
+int
+siv_gcm_aes128_decrypt_message (const struct aes128_ctx *ctx,
+				size_t nlength, const uint8_t *nonce,
+				size_t alength, const uint8_t *adata,
+				size_t mlength, uint8_t *dst, const uint8_t *src)
+{
+  struct aes128_ctx message_ctx; /* keyed per message by the generic code */
+
+  return siv_gcm_decrypt_message (&nettle_aes128, ctx, &message_ctx,
+				  nlength, nonce, alength, adata,
+				  mlength, dst, src);
+}
diff --git a/siv-gcm-aes256.c b/siv-gcm-aes256.c
new file mode 100644
index 00000000..70bf3f35
--- /dev/null
+++ b/siv-gcm-aes256.c
@@ -0,0 +1,65 @@
+/* siv-gcm-aes256.c
+
+   AES-GCM-SIV, RFC8452
+
+   Copyright (C) 2022 Red Hat, Inc.
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "nettle-meta.h"
+#include "siv-gcm.h"
+
+void
+siv_gcm_aes256_encrypt_message (const struct aes256_ctx *ctx,
+				size_t nlength, const uint8_t *nonce,
+				size_t alength, const uint8_t *adata,
+				size_t clength, uint8_t *dst, const uint8_t *src)
+{
+  struct aes256_ctx message_ctx; /* keyed per message by the generic code */
+
+  siv_gcm_encrypt_message (&nettle_aes256, ctx, &message_ctx,
+			   nlength, nonce, alength, adata,
+			   clength, dst, src);
+}
+
+int
+siv_gcm_aes256_decrypt_message (const struct aes256_ctx *ctx,
+				size_t nlength, const uint8_t *nonce,
+				size_t alength, const uint8_t *adata,
+				size_t mlength, uint8_t *dst, const uint8_t *src)
+{
+  struct aes256_ctx message_ctx; /* keyed per message by the generic code */
+
+  return siv_gcm_decrypt_message (&nettle_aes256, ctx, &message_ctx,
+				  nlength, nonce, alength, adata,
+				  mlength, dst, src);
+}
diff --git a/siv-gcm.c b/siv-gcm.c
new file mode 100644
index 00000000..332a7439
--- /dev/null
+++ b/siv-gcm.c
@@ -0,0 +1,229 @@
+/* siv-gcm.c
+
+   AES-GCM-SIV, RFC8452
+
+   Copyright (C) 2022 Red Hat, Inc.
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "siv-gcm.h"
+#include "ghash-internal.h"
+#include "block-internal.h"
+#include "nettle-internal.h"
+#include "macros.h"
+#include "memops.h"
+#include "ctr-internal.h"
+#include <string.h>
+
+#define MIN(a,b) (((a) < (b)) ? (a) : (b)) /* one argument is evaluated twice */
+
+static void
+siv_gcm_derive_keys (const void *ctx,
+		     nettle_cipher_func *f,
+		     size_t key_size,
+		     size_t nlength, const uint8_t *nonce,
+		     union nettle_block16 *auth_key,
+		     uint8_t *encryption_key)
+{
+  union nettle_block16 block;
+  union nettle_block16 out;
+  size_t i;
+  /* Input block: running index in byte 0, nonce copied in from byte 4.  */
+  block16_zero (&block);
+  memcpy (block.b + 4, nonce, MIN(nlength, SIV_GCM_NONCE_SIZE));
+
+  f (ctx, SIV_GCM_BLOCK_SIZE, out.b, block.b);
+  auth_key->u64[0] = out.u64[0]; /* index 0: first half of AUTH_KEY */
+
+  block.b[0] = 1;
+  f (ctx, SIV_GCM_BLOCK_SIZE, out.b, block.b);
+  auth_key->u64[1] = out.u64[0]; /* index 1: second half; out.u64[1] is unused */
+  /* Only byte 0 is incremented below, so the last index must fit in it.  */
+  assert (key_size % 8 == 0 && key_size / 8 + 2 <= UINT8_MAX);
+
+  for (i = 0; i < key_size; i += 8)
+    {
+      block.b[0]++; /* indices 2, 3, ... */
+      f (ctx, SIV_GCM_BLOCK_SIZE, out.b, block.b);
+      memcpy (encryption_key + i, out.b, 8); /* first 8 bytes of each output */
+    }
+}
+
+static nettle_fill16_func siv_gcm_fill;
+
+static void
+siv_gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
+{
+  /* Bytes 0..3 of CTR hold a 32-bit counter (LE_READ_UINT32 layout); the
+     remaining bytes are copied unchanged into every generated block.  */
+  uint32_t count = LE_READ_UINT32(ctr);
+
+  for (; blocks > 0; blocks--, buffer++, count++)
+    {
+      LE_WRITE_UINT32(buffer->b, count);
+      memcpy(buffer->b + 4, ctr + 4, SIV_GCM_BLOCK_SIZE - 4);
+    }
+
+  LE_WRITE_UINT32(ctr, count); /* leave CTR at the next unused counter value */
+}
+
+static void
+siv_ghash_pad_update (struct gcm_key *ctx,
+		      union nettle_block16 *state,
+		      size_t length, const uint8_t *data)
+{
+  /* Absorb DATA into STATE with _siv_ghash_update.  A trailing partial
+     block, if any, is extended with zero bytes to a full block first.  */
+  size_t whole = length / SIV_GCM_BLOCK_SIZE;
+  size_t tail = length % SIV_GCM_BLOCK_SIZE;
+
+  if (whole > 0)
+    data = _siv_ghash_update (ctx, state, whole, data);
+
+  if (tail > 0)
+    {
+      uint8_t padded[SIV_GCM_BLOCK_SIZE];
+
+      memcpy (padded, data, tail);
+      memset (padded + tail, 0, SIV_GCM_BLOCK_SIZE - tail);
+      _siv_ghash_update (ctx, state, 1, padded);
+    }
+}
+
+static void
+siv_gcm_authenticate (const void *ctx,
+		      const struct nettle_cipher *nc,
+		      const union nettle_block16 *authentication_key,
+		      const uint8_t *nonce,
+		      size_t alength, const uint8_t *adata,
+		      size_t mlength, const uint8_t *mdata,
+		      uint8_t *tag)
+{
+  union nettle_block16 state;
+  struct gcm_key siv_ghash_key;
+  union nettle_block16 block;
+  /* Hash ADATA, MDATA and a length block, then encrypt the result to TAG.  */
+  _siv_ghash_set_key (&siv_ghash_key, authentication_key);
+
+  block16_zero (&state);
+  siv_ghash_pad_update (&siv_ghash_key, &state, alength, adata);
+  siv_ghash_pad_update (&siv_ghash_key, &state, mlength, mdata);
+  /* Length block: both lengths expressed in bits.  */
+  block.u64[0] = bswap64_if_be (alength * 8);
+  block.u64[1] = bswap64_if_be (mlength * 8);
+
+  _siv_ghash_update (&siv_ghash_key, &state, 1, block.b);
+  block16_bswap (&state, &state);
+
+  memxor (state.b, nonce, SIV_GCM_NONCE_SIZE); /* nonce covers the leading bytes only */
+  state.b[15] &= 0x7f; /* clear the top bit of the last byte */
+  nc->encrypt (ctx, SIV_GCM_BLOCK_SIZE, tag, state.b); /* TAG is one cipher block */
+}
+
+void
+siv_gcm_encrypt_message (const struct nettle_cipher *nc,
+			 const void *ctx,
+			 void *ctr_ctx,
+			 size_t nlength, const uint8_t *nonce,
+			 size_t alength, const uint8_t *adata,
+			 size_t clength, uint8_t *dst, const uint8_t *src)
+{
+  union nettle_block16 authentication_key;
+  TMP_DECL(encryption_key, uint8_t, NETTLE_MAX_CIPHER_KEY_SIZE);
+  uint8_t ctr[SIV_GCM_DIGEST_SIZE];
+  uint8_t *tag;
+
+  assert (clength >= SIV_GCM_DIGEST_SIZE);
+  assert (nlength == SIV_GCM_NONCE_SIZE);
+  /* The tag ends DST; form the pointer only once CLENGTH is known to cover it.  */
+  tag = dst + clength - SIV_GCM_DIGEST_SIZE;
+
+  TMP_ALLOC(encryption_key, nc->key_size);
+  siv_gcm_derive_keys (ctx, nc->encrypt, nc->key_size, nlength, nonce,
+		       &authentication_key, encryption_key);
+
+  /* Calculate authentication tag, using CTR_CTX keyed with the derived key.  */
+  nc->set_encrypt_key (ctr_ctx, encryption_key);
+
+  siv_gcm_authenticate (ctr_ctx, nc,
+			&authentication_key,
+			nonce, alength, adata,
+			clength - SIV_GCM_DIGEST_SIZE, src,
+			tag);
+
+  /* Encrypt the plaintext.  The initial counter block is the tag with
+     the most significant bit of the last byte set to one.  */
+  memcpy (ctr, tag, SIV_GCM_DIGEST_SIZE);
+  ctr[15] |= 0x80;
+  _nettle_ctr_crypt16 (ctr_ctx, nc->encrypt, siv_gcm_fill, ctr,
+		       clength - SIV_GCM_DIGEST_SIZE, dst, src);
+}
+
+int
+siv_gcm_decrypt_message (const struct nettle_cipher *nc,
+			 const void *ctx,
+			 void *ctr_ctx,
+			 size_t nlength, const uint8_t *nonce,
+			 size_t alength, const uint8_t *adata,
+			 size_t mlength, uint8_t *dst, const uint8_t *src)
+{
+  union nettle_block16 authentication_key;
+  TMP_DECL(encryption_key, uint8_t, NETTLE_MAX_CIPHER_KEY_SIZE);
+  union nettle_block16 state;
+  uint8_t tag[SIV_GCM_DIGEST_SIZE];
+  /* SRC is read as MLENGTH bytes followed by SIV_GCM_DIGEST_SIZE tag bytes.  */
+  assert (nlength == SIV_GCM_NONCE_SIZE);
+
+  TMP_ALLOC(encryption_key, nc->key_size);
+  siv_gcm_derive_keys (ctx, nc->encrypt, nc->key_size, nlength, nonce,
+		       &authentication_key, encryption_key);
+
+  memcpy (state.b, src + mlength, SIV_GCM_DIGEST_SIZE); /* received tag */
+  /* The initial counter block is the tag with the most significant
+     bit of the last byte set to one.  */
+  state.b[15] |= 0x80;
+
+  /* Decrypt the ciphertext.  */
+  nc->set_encrypt_key (ctr_ctx, encryption_key);
+
+  _nettle_ctr_crypt16 (ctr_ctx, nc->encrypt, siv_gcm_fill, state.b,
+		       mlength, dst, src);
+
+  /* Calculate authentication tag.  */
+  siv_gcm_authenticate (ctr_ctx, nc,
+			&authentication_key,
+			nonce, alength, adata,
+			mlength, dst,
+			tag);
+  /* NOTE(review): DST was already written above, whatever this returns.  */
+  return memeql_sec (tag, src + mlength, SIV_GCM_DIGEST_SIZE);
+}
diff --git a/siv-gcm.h b/siv-gcm.h
new file mode 100644
index 00000000..1a9e3084
--- /dev/null
+++ b/siv-gcm.h
@@ -0,0 +1,107 @@
+/* siv-gcm.h
+
+   AES-GCM-SIV, RFC8452
+
+   Copyright (C) 2022 Red Hat, Inc.
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#ifndef NETTLE_SIV_GCM_H_INCLUDED
+#define NETTLE_SIV_GCM_H_INCLUDED
+
+#include "nettle-types.h"
+#include "nettle-meta.h"
+#include "aes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Name mangling */
+#define siv_gcm_encrypt_message nettle_siv_gcm_encrypt_message
+#define siv_gcm_decrypt_message nettle_siv_gcm_decrypt_message
+#define siv_gcm_aes128_encrypt_message nettle_siv_gcm_aes128_encrypt_message
+#define siv_gcm_aes128_decrypt_message nettle_siv_gcm_aes128_decrypt_message
+#define siv_gcm_aes256_encrypt_message nettle_siv_gcm_aes256_encrypt_message
+#define siv_gcm_aes256_decrypt_message nettle_siv_gcm_aes256_decrypt_message
+
+/* For AES-GCM-SIV, the block size of the underlying cipher shall be 128 bits. */
+#define SIV_GCM_BLOCK_SIZE 16
+#define SIV_GCM_DIGEST_SIZE 16
+#define SIV_GCM_NONCE_SIZE 12
+
+/* Generic interface.  NC must be a block cipher with 128-bit block
+   size, and keysize that is a multiple of 64 bits, such as AES-128 or
+   AES-256.  */
+void
+siv_gcm_encrypt_message (const struct nettle_cipher *nc,
+			 const void *ctx,
+			 void *ctr_ctx,
+			 size_t nlength, const uint8_t *nonce,
+			 size_t alength, const uint8_t *adata,
+			 size_t clength, uint8_t *dst, const uint8_t *src);
+
+int
+siv_gcm_decrypt_message (const struct nettle_cipher *nc,
+			 const void *ctx,
+			 void *ctr_ctx,
+			 size_t nlength, const uint8_t *nonce,
+			 size_t alength, const uint8_t *adata,
+			 size_t mlength, uint8_t *dst, const uint8_t *src);
+
+/* AEAD_AES_128_GCM_SIV */
+void
+siv_gcm_aes128_encrypt_message (const struct aes128_ctx *ctx,
+				size_t nlength, const uint8_t *nonce,
+				size_t alength, const uint8_t *adata,
+				size_t clength, uint8_t *dst, const uint8_t *src);
+
+int
+siv_gcm_aes128_decrypt_message (const struct aes128_ctx *ctx,
+				size_t nlength, const uint8_t *nonce,
+				size_t alength, const uint8_t *adata,
+				size_t mlength, uint8_t *dst, const uint8_t *src);
+
+/* AEAD_AES_256_GCM_SIV */
+void
+siv_gcm_aes256_encrypt_message (const struct aes256_ctx *ctx,
+				size_t nlength, const uint8_t *nonce,
+				size_t alength, const uint8_t *adata,
+				size_t clength, uint8_t *dst, const uint8_t *src);
+
+int
+siv_gcm_aes256_decrypt_message (const struct aes256_ctx *ctx,
+				size_t nlength, const uint8_t *nonce,
+				size_t alength, const uint8_t *adata,
+				size_t mlength, uint8_t *dst, const uint8_t *src);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* NETTLE_SIV_GCM_H_INCLUDED */
diff --git a/siv-ghash-set-key.c b/siv-ghash-set-key.c
new file mode 100644
index 00000000..b13d7495
--- /dev/null
+++ b/siv-ghash-set-key.c
@@ -0,0 +1,52 @@
+/* siv-ghash-set-key.c
+
+   POLYVAL implementation for AES-GCM-SIV, based on GHASH
+
+   Copyright (C) 2011 Katholieke Universiteit Leuven
+   Copyright (C) 2011, 2013, 2018, 2022 Niels Möller
+   Copyright (C) 2018, 2022 Red Hat, Inc.
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "ghash-internal.h"
+#include "block-internal.h"
+
+void
+_siv_ghash_set_key (struct gcm_key *ctx, const union nettle_block16 *key)
+{
+  /* Adapt KEY for _ghash_set_key: block16_bswap, then block16_mulx_ghash.  */
+  union nettle_block16 adapted;
+
+  block16_bswap (&adapted, key);
+  block16_mulx_ghash (&adapted, &adapted);
+  _ghash_set_key (ctx, &adapted);
+}
diff --git a/siv-ghash-update.c b/siv-ghash-update.c
new file mode 100644
index 00000000..21ce5c6e
--- /dev/null
+++ b/siv-ghash-update.c
@@ -0,0 +1,65 @@
+/* siv-ghash-update.c
+
+   POLYVAL implementation for AES-GCM-SIV, based on GHASH
+
+   Copyright (C) 2011 Katholieke Universiteit Leuven
+   Copyright (C) 2011, 2013, 2018, 2022 Niels Möller
+   Copyright (C) 2018, 2022 Red Hat, Inc.
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "ghash-internal.h"
+#include "block-internal.h"
+#include "macros.h"
+
+const uint8_t *
+_siv_ghash_update (const struct gcm_key *ctx, union nettle_block16 *state,
+		   size_t blocks, const uint8_t *data)
+{
+  /* Hash BLOCKS blocks one at a time, exchanging the two 64-bit halves
+     of each; returns DATA advanced past the input consumed.  */
+  while (blocks-- > 0)
+    {
+      union nettle_block16 swapped;
+#if WORDS_BIGENDIAN
+      swapped.u64[1] = LE_READ_UINT64(data);
+      swapped.u64[0] = LE_READ_UINT64(data + 8);
+#else
+      swapped.u64[1] = READ_UINT64(data);
+      swapped.u64[0] = READ_UINT64(data + 8);
+#endif
+      _ghash_update (ctx, state, 1, swapped.b);
+      data += GCM_BLOCK_SIZE;
+    }
+  return data;
+}
+
diff --git a/sm4-meta.c b/sm4-meta.c
new file mode 100644
index 00000000..d7234984
--- /dev/null
+++ b/sm4-meta.c
@@ -0,0 +1,49 @@
+/* sm4-meta.c
+
+   Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "nettle-meta.h"
+
+#include "sm4.h"
+
+const struct nettle_cipher nettle_sm4 = {
+  "sm4",
+  sizeof(struct sm4_ctx),
+  SM4_BLOCK_SIZE,
+  SM4_KEY_SIZE,
+  (nettle_set_key_func *) sm4_set_encrypt_key,
+  (nettle_set_key_func *) sm4_set_decrypt_key,
+  (nettle_cipher_func *) sm4_crypt, /* one routine for both directions; */
+  (nettle_cipher_func *) sm4_crypt  /* the set-key function used decides */
+};
diff --git a/sm4.c b/sm4.c
new file mode 100644
index 00000000..7b3c049a
--- /dev/null
+++ b/sm4.c
@@ -0,0 +1,223 @@
+/* sm4.c
+
+   Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <assert.h>
+#include <string.h>
+
+#include "sm4.h"
+
+#include "macros.h"
+
+
+static const uint32_t fk[4] = /* XORed into the four key words by sm4_set_key */
+{
+  0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc
+};
+
+static const uint32_t ck[32] = /* one constant per round-key step in sm4_set_key */
+{
+  0x00070e15, 0x1c232a31, 0x383f464d, 0x545b6269,
+  0x70777e85, 0x8c939aa1, 0xa8afb6bd, 0xc4cbd2d9,
+  0xe0e7eef5, 0xfc030a11, 0x181f262d, 0x343b4249,
+  0x50575e65, 0x6c737a81, 0x888f969d, 0xa4abb2b9,
+  0xc0c7ced5, 0xdce3eaf1, 0xf8ff060d, 0x141b2229,
+  0x30373e45, 0x4c535a61, 0x686f767d, 0x848b9299,
+  0xa0a7aeb5, 0xbcc3cad1, 0xd8dfe6ed, 0xf4fb0209,
+  0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279
+};
+
+static const uint8_t sbox[256] = /* byte substitution used by sm4_t_non_lin_sub */
+{
+  0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7,
+  0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05,
+  0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3,
+  0xaa, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99,
+  0x9c, 0x42, 0x50, 0xf4, 0x91, 0xef, 0x98, 0x7a,
+  0x33, 0x54, 0x0b, 0x43, 0xed, 0xcf, 0xac, 0x62,
+  0xe4, 0xb3, 0x1c, 0xa9, 0xc9, 0x08, 0xe8, 0x95,
+  0x80, 0xdf, 0x94, 0xfa, 0x75, 0x8f, 0x3f, 0xa6,
+  0x47, 0x07, 0xa7, 0xfc, 0xf3, 0x73, 0x17, 0xba,
+  0x83, 0x59, 0x3c, 0x19, 0xe6, 0x85, 0x4f, 0xa8,
+  0x68, 0x6b, 0x81, 0xb2, 0x71, 0x64, 0xda, 0x8b,
+  0xf8, 0xeb, 0x0f, 0x4b, 0x70, 0x56, 0x9d, 0x35,
+  0x1e, 0x24, 0x0e, 0x5e, 0x63, 0x58, 0xd1, 0xa2,
+  0x25, 0x22, 0x7c, 0x3b, 0x01, 0x21, 0x78, 0x87,
+  0xd4, 0x00, 0x46, 0x57, 0x9f, 0xd3, 0x27, 0x52,
+  0x4c, 0x36, 0x02, 0xe7, 0xa0, 0xc4, 0xc8, 0x9e,
+  0xea, 0xbf, 0x8a, 0xd2, 0x40, 0xc7, 0x38, 0xb5,
+  0xa3, 0xf7, 0xf2, 0xce, 0xf9, 0x61, 0x15, 0xa1,
+  0xe0, 0xae, 0x5d, 0xa4, 0x9b, 0x34, 0x1a, 0x55,
+  0xad, 0x93, 0x32, 0x30, 0xf5, 0x8c, 0xb1, 0xe3,
+  0x1d, 0xf6, 0xe2, 0x2e, 0x82, 0x66, 0xca, 0x60,
+  0xc0, 0x29, 0x23, 0xab, 0x0d, 0x53, 0x4e, 0x6f,
+  0xd5, 0xdb, 0x37, 0x45, 0xde, 0xfd, 0x8e, 0x2f,
+  0x03, 0xff, 0x6a, 0x72, 0x6d, 0x6c, 0x5b, 0x51,
+  0x8d, 0x1b, 0xaf, 0x92, 0xbb, 0xdd, 0xbc, 0x7f,
+  0x11, 0xd9, 0x5c, 0x41, 0x1f, 0x10, 0x5a, 0xd8,
+  0x0a, 0xc1, 0x31, 0x88, 0xa5, 0xcd, 0x7b, 0xbd,
+  0x2d, 0x74, 0xd0, 0x12, 0xb8, 0xe5, 0xb4, 0xb0,
+  0x89, 0x69, 0x97, 0x4a, 0x0c, 0x96, 0x77, 0x7e,
+  0x65, 0xb9, 0xf1, 0x09, 0xc5, 0x6e, 0xc6, 0x84,
+  0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d, 0x20,
+  0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48
+};
+
+static uint32_t
+sm4_t_non_lin_sub(uint32_t x)
+{
+  /* Substitute each of the four bytes of X through sbox, keeping
+     every byte in its original position within the word.  */
+  const uint32_t b0 = sbox[x & 0xff];
+  const uint32_t b1 = sbox[(x >> 8) & 0xff];
+  const uint32_t b2 = sbox[(x >> 16) & 0xff];
+  const uint32_t b3 = sbox[(x >> 24) & 0xff];
+
+  return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
+}
+
+static uint32_t
+sm4_key_lin_sub(uint32_t x)
+{
+  return x ^ ROTL32(13, x) ^ ROTL32(23, x); /* linear step of sm4_key_sub */
+}
+
+static uint32_t
+sm4_enc_lin_sub(uint32_t x)
+{
+  return x ^ ROTL32(2, x) ^ ROTL32(10, x) ^ ROTL32(18, x) ^ ROTL32(24, x); /* linear step of sm4_enc_sub */
+}
+
+static uint32_t
+sm4_key_sub(uint32_t x)
+{
+  return sm4_key_lin_sub(sm4_t_non_lin_sub(x)); /* sbox bytes first, then the linear step */
+}
+
+static uint32_t
+sm4_enc_sub(uint32_t x)
+{
+  return sm4_enc_lin_sub(sm4_t_non_lin_sub(x)); /* sbox bytes first, then the linear step */
+}
+
+static uint32_t
+sm4_round(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t rk)
+{
+  return x0 ^ sm4_enc_sub(x1 ^ x2 ^ x3 ^ rk); /* x1, x2, x3 enter symmetrically */
+}
+
+static void
+sm4_set_key(struct sm4_ctx *ctx, const uint8_t *key, int encrypt)
+{
+  /* Fill ctx->rkey with the 32 round keys derived from KEY.  With
+     ENCRYPT non-zero, round key r is stored at rkey[r]; otherwise it
+     is stored at rkey[31 - r], i.e. in the opposite order.  */
+  uint32_t w[4];
+  unsigned r;
+
+  /* Starting state: the four 32-bit words read from KEY, each XORed
+     with the corresponding fk constant.  */
+  for (r = 0; r < 4; r++)
+    w[r] = READ_UINT32(key + 4 * r) ^ fk[r];
+
+  for (r = 0; r < 32; r++)
+    {
+      /* Step r rewrites word r mod 4 from the other three words and
+         ck[r]; the rewritten word is round key r.  */
+      const unsigned j = r % 4;
+      const uint32_t others = w[(j + 1) % 4] ^ w[(j + 2) % 4] ^ w[(j + 3) % 4];
+
+      w[j] ^= sm4_key_sub(others ^ ck[r]);
+
+      /* Pick the slot according to the requested direction.  */
+      if (encrypt)
+        {
+          ctx->rkey[r] = w[j];
+        }
+      else
+        {
+          ctx->rkey[31 - r] = w[j];
+        }
+    }
+}
+
+void
+sm4_set_encrypt_key(struct sm4_ctx *ctx, const uint8_t *key)
+{
+  sm4_set_key(ctx, key, 1); /* 1: round keys stored in generation order */
+}
+
+void
+sm4_set_decrypt_key(struct sm4_ctx *ctx, const uint8_t *key)
+{
+  sm4_set_key(ctx, key, 0); /* 0: round keys stored in reverse order */
+}
+
+void
+sm4_crypt(const struct sm4_ctx *context,
+	  size_t length,
+	  uint8_t *dst,
+	  const uint8_t *src)
+{
+  /* Process LENGTH bytes (a multiple of SM4_BLOCK_SIZE) from SRC to
+     DST, block by block, using the round keys in the order they are
+     stored in CONTEXT.  Each block is read completely before any of
+     its output is written.  */
+  const uint32_t *rk = context->rkey;
+  size_t remaining;
+
+  assert( !(length % SM4_BLOCK_SIZE) );
+
+  for (remaining = length; remaining > 0; remaining -= SM4_BLOCK_SIZE)
+    {
+      uint32_t x[4];
+      unsigned r;
+
+      for (r = 0; r < 4; r++)
+        x[r] = READ_UINT32(src + 4 * r);
+
+      /* Round r replaces word r mod 4, combining it with the other
+         three words and rk[r].  */
+      for (r = 0; r < 32; r++)
+        x[r % 4] = sm4_round(x[r % 4], x[(r + 1) % 4],
+                             x[(r + 2) % 4], x[(r + 3) % 4], rk[r]);
+
+      /* The four words are written out in reverse order.  */
+      for (r = 0; r < 4; r++)
+        WRITE_UINT32(dst + 4 * r, x[3 - r]);
+
+      src += SM4_BLOCK_SIZE;
+      dst += SM4_BLOCK_SIZE;
+    }
+}
diff --git a/sm4.h b/sm4.h
new file mode 100644
index 00000000..608eb3f3
--- /dev/null
+++ b/sm4.h
@@ -0,0 +1,69 @@
+/* sm4.h
+
+   Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#ifndef NETTLE_SM4_H_INCLUDED
+#define NETTLE_SM4_H_INCLUDED
+
+#include "nettle-types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Name mangling */
+#define sm4_set_encrypt_key nettle_sm4_set_encrypt_key
+#define sm4_set_decrypt_key nettle_sm4_set_decrypt_key
+#define sm4_crypt nettle_sm4_crypt
+
+#define SM4_BLOCK_SIZE 16
+#define SM4_KEY_SIZE 16
+
+struct sm4_ctx
+{
+  uint32_t rkey[32]; /* round keys, in the order sm4_crypt applies them */
+};
+
+void
+sm4_set_encrypt_key(struct sm4_ctx *ctx, const uint8_t *key);
+
+void
+sm4_set_decrypt_key(struct sm4_ctx *ctx, const uint8_t *key);
+
+void
+sm4_crypt(const struct sm4_ctx *context,
+	  size_t length, uint8_t *dst,
+	  const uint8_t *src);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* NETTLE_SM4_H_INCLUDED */
diff --git a/testsuite/.gitignore b/testsuite/.gitignore
index ca41472e..8c91d1af 100644
--- a/testsuite/.gitignore
+++ b/testsuite/.gitignore
@@ -4,6 +4,7 @@
 /aes-keywrap-test
 /arcfour-test
 /arctwo-test
+/balloon-test
 /base16-test
 /base64-test
 /bignum-test
@@ -98,6 +99,7 @@
 /sha512-256-test
 /sha512-test
 /sm3-test
+/sm4-test
 /streebog-test
 /twofish-test
 /umac-test
@@ -106,6 +108,7 @@
 /xts-test
 /cmac-test
 /siv-test
+/siv-gcm-test
 /bcrypt-test
 /ed448-test
 /shake256-test
diff --git a/testsuite/Makefile.in b/testsuite/Makefile.in
index 6734d3e6..025ab72d 100644
--- a/testsuite/Makefile.in
+++ b/testsuite/Makefile.in
@@ -11,7 +11,7 @@ PRE_CPPFLAGS = -I.. -I$(top_srcdir)
 PRE_LDFLAGS = -L..
 
 TS_NETTLE_SOURCES = aes-test.c aes-keywrap-test.c arcfour-test.c arctwo-test.c \
-		    blowfish-test.c bcrypt-test.c cast128-test.c \
+		    balloon-test.c blowfish-test.c bcrypt-test.c cast128-test.c \
 	            base16-test.c base64-test.c \
 		    camellia-test.c chacha-test.c \
 		    cnd-memcpy-test.c \
@@ -24,11 +24,11 @@ TS_NETTLE_SOURCES = aes-test.c aes-keywrap-test.c arcfour-test.c arctwo-test.c \
 		    sha384-test.c sha512-test.c sha512-224-test.c sha512-256-test.c \
 		    sha3-permute-test.c sha3-224-test.c sha3-256-test.c \
 		    sha3-384-test.c sha3-512-test.c \
-		    shake256-test.c streebog-test.c sm3-test.c \
+		    shake256-test.c streebog-test.c sm3-test.c sm4-test.c \
 		    serpent-test.c twofish-test.c version-test.c \
 		    knuth-lfib-test.c \
 		    cbc-test.c cfb-test.c ctr-test.c gcm-test.c eax-test.c ccm-test.c \
-		    cmac-test.c siv-test.c \
+		    cmac-test.c siv-test.c siv-gcm-test.c \
 		    poly1305-test.c chacha-poly1305-test.c \
 		    hmac-test.c umac-test.c \
 		    meta-hash-test.c meta-cipher-test.c\
@@ -47,8 +47,8 @@ TS_HOGWEED_SOURCES = sexp-test.c sexp-format-test.c \
 		     rsa-compute-root-test.c \
 		     dsa-test.c dsa-keygen-test.c \
 		     curve25519-dh-test.c curve448-dh-test.c \
-		     ecc-mod-test.c ecc-modinv-test.c ecc-redc-test.c \
-		     ecc-sqrt-test.c \
+		     ecc-mod-arith-test.c ecc-mod-test.c ecc-modinv-test.c \
+		     ecc-redc-test.c ecc-sqrt-test.c \
 		     ecc-dup-test.c ecc-add-test.c \
 		     ecc-mul-g-test.c ecc-mul-a-test.c \
 		     ecdsa-sign-test.c ecdsa-verify-test.c \
diff --git a/testsuite/balloon-test.c b/testsuite/balloon-test.c
new file mode 100644
index 00000000..ad63c7a0
--- /dev/null
+++ b/testsuite/balloon-test.c
@@ -0,0 +1,135 @@
+/* balloon-test.c
+
+   Copyright (C) 2022 Zoltan Fridrich
+   Copyright (C) 2022 Red Hat, Inc.
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#include "testutils.h"
+#include "balloon.h"
+
+static void
+test_balloon(const struct nettle_hash *alg,
+             size_t password_len, const char *password,
+             size_t salt_len, const char *salt,
+             unsigned s_cost, unsigned t_cost,
+             const struct tstring *expected)
+{
+  /* Run the generic balloon() with the callbacks of ALG; FAIL unless the digest in buf equals EXPECTED. */
+  void *ctx = xalloc(alg->context_size);
+  uint8_t *buf = xalloc(balloon_itch(alg->digest_size, s_cost));
+
+  /* MEMEQ below reads digest_size bytes of expected->data; reject mis-sized vectors up front. */
+  ASSERT(expected->length == alg->digest_size);
+  alg->init(ctx);
+  balloon(ctx, alg->update, alg->digest, alg->digest_size,
+          s_cost, t_cost, password_len, (const uint8_t *)password,
+          salt_len, (const uint8_t *)salt, buf, buf);
+  if (!MEMEQ(alg->digest_size, buf, expected->data))
+    {
+      fprintf(stderr, "test_balloon: result doesn't match the expectation:\nOutput: ");
+      print_hex(alg->digest_size, buf);
+      fprintf(stderr, "\nExpected:");
+      tstring_print_hex(expected);
+      fprintf(stderr, "\n");
+      FAIL();
+    }
+  free(ctx);
+  free(buf);
+}
+
+static void
+test_balloon_sha(const struct nettle_hash *alg,
+                 size_t password_len, const char *password,
+                 size_t salt_len, const char *salt,
+                 unsigned s_cost, unsigned t_cost,
+                 const struct tstring *expected)
+{
+  /* Run the balloon_sha* wrapper matching ALG; FAIL unless the digest in buf equals EXPECTED. */
+  uint8_t *buf = xalloc(balloon_itch(alg->digest_size, s_cost));
+  /* MEMEQ below reads digest_size bytes of expected->data; reject mis-sized vectors up front. */
+  ASSERT(expected->length == alg->digest_size);
+  if (alg == &nettle_sha1)
+    balloon_sha1(s_cost, t_cost, password_len, (const uint8_t *)password,
+                 salt_len, (const uint8_t *)salt, buf, buf);
+  else if (alg == &nettle_sha256)
+    balloon_sha256(s_cost, t_cost, password_len, (const uint8_t *)password,
+                   salt_len, (const uint8_t *)salt, buf, buf);
+  else if (alg == &nettle_sha384)
+    balloon_sha384(s_cost, t_cost, password_len, (const uint8_t *)password,
+                   salt_len, (const uint8_t *)salt, buf, buf);
+  else if (alg == &nettle_sha512)
+    balloon_sha512(s_cost, t_cost, password_len, (const uint8_t *)password,
+                   salt_len, (const uint8_t *)salt, buf, buf);
+  else
+    {
+      /* Only the four hashes above have a dedicated wrapper in this test. */
+      fprintf(stderr, "test_balloon_sha: bad test\n");
+      FAIL();
+    }
+
+  if (!MEMEQ(alg->digest_size, buf, expected->data))
+    {
+      fprintf(stderr, "test_balloon_sha: result doesn't match the expectation:\nOutput: ");
+      print_hex(alg->digest_size, buf);
+      fprintf(stderr, "\nExpected:");
+      tstring_print_hex(expected);
+      fprintf(stderr, "\n");
+      FAIL();
+    }
+}
+
+/* Checks the generic balloon() and the balloon_sha* wrappers. Test vectors are taken from:
+ * <https://github.com/nachonavarro/balloon-hashing>
+ * <https://github.com/RustCrypto/password-hashes/tree/master/balloon-hash>
+ */
+void
+test_main(void)
+{
+  test_balloon(&nettle_sha256, 8, "hunter42", 11, "examplesalt", 1024, 3,
+               SHEX("716043dff777b44aa7b88dcbab12c078abecfac9d289c5b5195967aa63440dfb"));
+  test_balloon(&nettle_sha256, 0, "", 4, "salt", 3, 3,
+               SHEX("5f02f8206f9cd212485c6bdf85527b698956701ad0852106f94b94ee94577378"));
+  test_balloon(&nettle_sha256, 8, "password", 0, "", 3, 3,
+               SHEX("20aa99d7fe3f4df4bd98c655c5480ec98b143107a331fd491deda885c4d6a6cc"));
+  test_balloon(&nettle_sha256, 1, "", 1, "", 3, 3, /* length 1 with an empty literal: the one byte passed is its terminating NUL */
+               SHEX("4fc7e302ffa29ae0eac31166cee7a552d1d71135f4e0da66486fb68a749b73a4"));
+  test_balloon(&nettle_sha256, 8, "password", 4, "salt", 1, 1,
+               SHEX("eefda4a8a75b461fa389c1dcfaf3e9dfacbc26f81f22e6f280d15cc18c417545"));
+
+  test_balloon_sha(&nettle_sha1, 8, "password", 4, "salt", 3, 3,
+                   SHEX("99393c091fdd3136f85864099ec49a439dcacc21"));
+  test_balloon_sha(&nettle_sha256, 8, "password", 4, "salt", 3, 3,
+                   SHEX("a4df347f5a312e8b2b14c32164f61a81758c807f1bdcda44f4930e2b80ab2154"));
+  test_balloon_sha(&nettle_sha384, 8, "password", 4, "salt", 3, 3,
+                   SHEX("78da235f7d0f84aba98b50a432fa6c8f7f3ecb7ea0858cfb316c7e5356aae6c8"
+                        "d7e7b3924c54c4ed71a3d0d68cb0ad68"));
+  test_balloon_sha(&nettle_sha512, 8, "password", 4, "salt", 3, 3,
+                   SHEX("9baf289dfa42990f4b189d96d4ede0f2610ba71fb644169427829d696f6866d8"
+                        "7af41eb68f9e14fd4b1f1a7ce4832f1ed6117c16e8eae753f9e1d054a7c0a7eb"));
+}
diff --git a/testsuite/ecc-add-test.c b/testsuite/ecc-add-test.c
index 6f58a3bb..4793a4bf 100644
--- a/testsuite/ecc-add-test.c
+++ b/testsuite/ecc-add-test.c
@@ -19,6 +19,24 @@ test_main (void)
 
       test_ecc_get_g (i, g);
 
+      ecc->dup (ecc, g2, g, scratch);
+      test_ecc_mul_h (i, 2, g2);
+
+      ecc->add_hhh (ecc, g3, g, g2, scratch);
+      test_ecc_mul_h (i, 3, g3);
+
+      ecc->add_hhh (ecc, g3, g2, g, scratch);
+      test_ecc_mul_h (i, 3, g3);
+
+      ecc->add_hhh (ecc, p, g, g3, scratch);
+      test_ecc_mul_h (i, 4, p);
+
+      ecc->add_hhh (ecc, p, g3, g, scratch);
+      test_ecc_mul_h (i, 4, p);
+
+      ecc->dup (ecc, p, g2, scratch);
+      test_ecc_mul_h (i, 4, p);
+
       if (ecc->p.bit_size == 255 || ecc->p.bit_size == 448)
 	{
 	  mp_limb_t *z = xalloc_limbs (ecc_size_j (ecc));
@@ -49,24 +67,20 @@ test_main (void)
 
 	  free (z);
 	}
+      else
+	{
+	  ASSERT (ecc_nonsec_add_jjj (ecc, g2, g, g, scratch));
+	  test_ecc_mul_h (i, 2, g2);
 
-      ecc->dup (ecc, g2, g, scratch);
-      test_ecc_mul_h (i, 2, g2);
-
-      ecc->add_hhh (ecc, g3, g, g2, scratch);
-      test_ecc_mul_h (i, 3, g3);
-
-      ecc->add_hhh (ecc, g3, g2, g, scratch);
-      test_ecc_mul_h (i, 3, g3);
-
-      ecc->add_hhh (ecc, p, g, g3, scratch);
-      test_ecc_mul_h (i, 4, p);
+	  ASSERT (ecc_nonsec_add_jjj (ecc, g3, g2, g, scratch));
+	  test_ecc_mul_h (i, 3, g3);
 
-      ecc->add_hhh (ecc, p, g3, g, scratch);
-      test_ecc_mul_h (i, 4, p);
+	  ASSERT (ecc_nonsec_add_jjj (ecc, p, g, g3, scratch));
+	  test_ecc_mul_h (i, 4, p);
 
-      ecc->dup (ecc, p, g2, scratch);
-      test_ecc_mul_h (i, 4, p);
+	  ASSERT (ecc_nonsec_add_jjj (ecc, p, g2, g2, scratch));
+	  test_ecc_mul_h (i, 4, p);
+	}
 
       free (g);
       free (g2);
diff --git a/testsuite/ecc-mod-arith-test.c b/testsuite/ecc-mod-arith-test.c
new file mode 100644
index 00000000..14b3bd1c
--- /dev/null
+++ b/testsuite/ecc-mod-arith-test.c
@@ -0,0 +1,160 @@
+#include "testutils.h"
+
+#define MAX_SIZE (1 + 521 / GMP_NUMB_BITS)
+#define COUNT 50000
+
+static void
+test_add(const char *name, /* label printed in the failure message */
+	 const struct ecc_modulo *m,
+	 const mpz_t az, const mpz_t bz)
+{
+  mp_limb_t a[MAX_SIZE];
+  mp_limb_t b[MAX_SIZE];
+  mp_limb_t t[MAX_SIZE];
+  mpz_t mz;
+  mpz_t tz;
+  mpz_t ref;
+  /* Check ecc_mod_add against mpz arithmetic: the reference is (az + bz) mod m.  */
+  mpz_init (ref);
+  mpz_add (ref, az, bz);
+  mpz_mod (ref, ref, mpz_roinit_n (mz, m->m, m->size));
+  /* Same operands as m->size-limb arrays, added with ecc_mod_add.  */
+  mpz_limbs_copy (a, az, m->size);
+  mpz_limbs_copy (b, bz, m->size);
+  ecc_mod_add (m, t, a, b);
+  /* Only congruence modulo m is checked; t need not equal ref limb for limb.  */
+  if (!mpz_congruent_p (ref, mpz_roinit_n (tz, t, m->size), mz))
+    {
+      fprintf (stderr, "ecc_mod_add %s failed: bit_size = %u\n",
+	       name, m->bit_size);
+
+      fprintf (stderr, "a   = ");
+      mpn_out_str (stderr, 16, a, m->size);
+      fprintf (stderr, "\nb   = ");
+      mpn_out_str (stderr, 16, b, m->size);
+      fprintf (stderr, "\nt   = ");
+      mpn_out_str (stderr, 16, t, m->size);
+      fprintf (stderr, " (bad)\nref = ");
+      mpz_out_str (stderr, 16, ref);
+      fprintf (stderr, "\n");
+      abort ();
+    }
+  mpz_clear (ref);
+}
+
+static void
+test_sub(const char *name, /* label printed in the failure messages */
+	 const struct ecc_modulo *m,
+	 /* If range is non-null, also check that the result t satisfies t < range. */
+	 const mp_limb_t *range,
+	 const mpz_t az, const mpz_t bz)
+{
+  mp_limb_t a[MAX_SIZE];
+  mp_limb_t b[MAX_SIZE];
+  mp_limb_t t[MAX_SIZE];
+  mpz_t mz;
+  mpz_t tz;
+  mpz_t ref;
+  /* Check ecc_mod_sub against mpz arithmetic: the reference is (az - bz) mod m.  */
+  mpz_init (ref);
+  mpz_sub (ref, az, bz);
+  mpz_mod (ref, ref, mpz_roinit_n (mz, m->m, m->size));
+  /* Same operands as m->size-limb arrays, subtracted with ecc_mod_sub.  */
+  mpz_limbs_copy (a, az, m->size);
+  mpz_limbs_copy (b, bz, m->size);
+  ecc_mod_sub (m, t, a, b);
+  /* Only congruence modulo m is checked; t need not equal ref limb for limb.  */
+  if (!mpz_congruent_p (ref, mpz_roinit_n (tz, t, m->size), mz))
+    {
+      fprintf (stderr, "ecc_mod_sub %s failed: bit_size = %u\n",
+	       name, m->bit_size);
+
+      fprintf (stderr, "a   = ");
+      mpn_out_str (stderr, 16, a, m->size);
+      fprintf (stderr, "\nb   = ");
+      mpn_out_str (stderr, 16, b, m->size);
+      fprintf (stderr, "\nt   = ");
+      mpn_out_str (stderr, 16, t, m->size);
+      fprintf (stderr, " (bad)\nref = ");
+      mpz_out_str (stderr, 16, ref);
+      fprintf (stderr, "\n");
+      abort ();
+    }
+  /* Optional bound: compare t with RANGE over m->size limbs and abort when t >= range.  */
+  if (range && mpn_cmp (t, range, m->size) >= 0)
+    {
+      fprintf (stderr, "ecc_mod_sub %s out of range: bit_size = %u\n",
+	       name, m->bit_size);
+
+      fprintf (stderr, "a   = ");
+      mpn_out_str (stderr, 16, a, m->size);
+      fprintf (stderr, "\nb   = ");
+      mpn_out_str (stderr, 16, b, m->size);
+      fprintf (stderr, "\nt   = ");
+      mpn_out_str (stderr, 16, t, m->size);
+      fprintf (stderr, " \nrange = ");
+      mpn_out_str (stderr, 16, range, m->size);
+      fprintf (stderr, "\n");
+      abort ();
+    }
+  mpz_clear (ref);
+}
+
+static void
+test_modulo (gmp_randstate_t rands, const char *name,
+	     const struct ecc_modulo *m, unsigned count)
+{
+  mpz_t a, b;
+  unsigned j;
+  /* Run test_add and test_sub on COUNT random operand pairs for modulus M.  */
+  mpz_init (a);
+  mpz_init (b);
+  /* Operands are drawn over all m->size limbs, so they are not limited to values below m.  */
+  for (j = 0; j < count; j++)
+    {
+      if (j & 1) /* odd j: mpz_rrandomb operands; even j: mpz_urandomb operands */
+	{
+	  mpz_rrandomb (a, rands, m->size * GMP_NUMB_BITS);
+	  mpz_rrandomb (b, rands, m->size * GMP_NUMB_BITS);
+	}
+      else
+	{
+	  mpz_urandomb (a, rands, m->size * GMP_NUMB_BITS);
+	  mpz_urandomb (b, rands, m->size * GMP_NUMB_BITS);
+	}
+      test_add (name, m, a, b);
+      test_sub (name, m, NULL, a, b);
+    }
+  if (m->bit_size < m->size * GMP_NUMB_BITS) /* spare top bit, so m shifted left by one still fits in m->size limbs */
+    {
+      mp_limb_t two_p[MAX_SIZE];
+      mpn_lshift (two_p, m->m, m->size, 1);
+      mpz_t range;
+      mpz_roinit_n (range, two_p, m->size);
+      mpz_urandomm (a, rands, range); /* operands below 2*m */
+      mpz_urandomm (b, rands, range);
+      test_sub (name, m, two_p, a, b); /* NOTE(review): this bounded check runs once per modulus, outside the count loop -- confirm intended */
+    }
+  mpz_clear (a);
+  mpz_clear (b);
+}
+
+void
+test_main (void)
+{
+  gmp_randstate_t rands;
+  unsigned count = COUNT;
+  unsigned i;
+  /* Run test_modulo on the p and q moduli of every curve listed in ecc_curves.  */
+  gmp_randinit_default (rands);
+  /* 20x the iterations when test_randomize returns nonzero (presumably a randomized run -- confirm in testutils).  */
+  if (test_randomize(rands))
+    count *= 20;
+
+  for (i = 0; ecc_curves[i]; i++)
+    {
+      test_modulo (rands, "p", &ecc_curves[i]->p, count);
+      test_modulo (rands, "q", &ecc_curves[i]->q, count);
+    }
+  gmp_randclear (rands);
+}
diff --git a/testsuite/ecdsa-sign-test.c b/testsuite/ecdsa-sign-test.c
index c79493ae..b8a100b6 100644
--- a/testsuite/ecdsa-sign-test.c
+++ b/testsuite/ecdsa-sign-test.c
@@ -77,6 +77,18 @@ test_main (void)
 	      "3a41e1423b1853e8aa89747b1f987364"
 	      "44705d6d6d8371ea1f578f2e"); /* s */
 
+  /* Produce a signature where verify operation results in a point duplication. */
+  test_ecdsa (&_nettle_secp_256r1,
+	      "1", /* Private key */
+	      "01010101010101010101010101010101"
+	      "01010101010101010101010101010101", /* nonce */
+	      SHEX("6ff03b949241ce1dadd43519e6960e0a"
+		   "85b41a69a05c328103aa2bce1594ca16"), /* hash */
+	      "6ff03b949241ce1dadd43519e6960e0a"
+	      "85b41a69a05c328103aa2bce1594ca16", /* r */
+	      "53f097727a0e0dc284a0daa0da0ab77d"
+	      "5792ae67ed075d1f8d5bda0f853fa093"); /* s */
+
   /* Test cases for the smaller groups, verified with a
      proof-of-concept implementation done for Yubico AB. */
   test_ecdsa (&_nettle_secp_192r1,
diff --git a/testsuite/ecdsa-verify-test.c b/testsuite/ecdsa-verify-test.c
index 8110c64d..8d527000 100644
--- a/testsuite/ecdsa-verify-test.c
+++ b/testsuite/ecdsa-verify-test.c
@@ -109,6 +109,21 @@ test_main (void)
 	      "952800792ed19341fdeeec047f2514f3b0f150d6066151fb", /* r */
 	      "ec5971222014878b50d7a19d8954bc871e7e65b00b860ffb"); /* s */
 
+  /* Test case provided by Guido Vranken, from oss-fuzz. Triggers
+     point duplication in the verify operation by using private key =
+     1 (public key = generator) and hash = r. */
+  test_ecdsa (&_nettle_secp_256r1,
+	      "6B17D1F2E12C4247F8BCE6E563A440F2"
+	      "77037D812DEB33A0F4A13945D898C296", /* x */
+	      "4FE342E2FE1A7F9B8EE7EB4A7C0F9E16"
+	      "2BCE33576B315ECECBB6406837BF51F5", /* y */
+	      SHEX("6ff03b949241ce1dadd43519e6960e0a"
+		   "85b41a69a05c328103aa2bce1594ca16"), /* hash */
+	      "6ff03b949241ce1dadd43519e6960e0a"
+	      "85b41a69a05c328103aa2bce1594ca16", /* r */
+	      "53f097727a0e0dc284a0daa0da0ab77d"
+	      "5792ae67ed075d1f8d5bda0f853fa093"); /* s */
+
   /* From RFC 4754 */
   test_ecdsa (&_nettle_secp_256r1,
 	      "2442A5CC 0ECD015F A3CA31DC 8E2BBC70"
diff --git a/testsuite/gcm-test.c b/testsuite/gcm-test.c
index 8955e9b8..d70cdd1e 100644
--- a/testsuite/gcm-test.c
+++ b/testsuite/gcm-test.c
@@ -577,6 +577,24 @@ test_main(void)
 		 "16aedbf5a0de6a57 a637b39b"),	/* iv */
 	    SHEX("5791883f822013f8bd136fc36fb9946b"));	/* tag */
 
+  /*
+   * GCM-SM4 Test Vectors from
+   * https://datatracker.ietf.org/doc/html/rfc8998
+   */
+  test_aead(&nettle_gcm_sm4, NULL,
+	    SHEX("0123456789ABCDEFFEDCBA9876543210"),
+	    SHEX("FEEDFACEDEADBEEFFEEDFACEDEADBEEFABADDAD2"),
+	    SHEX("AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBB"
+	         "CCCCCCCCCCCCCCCCDDDDDDDDDDDDDDDD"
+	         "EEEEEEEEEEEEEEEEFFFFFFFFFFFFFFFF"
+	         "EEEEEEEEEEEEEEEEAAAAAAAAAAAAAAAA"),
+	    SHEX("17F399F08C67D5EE19D0DC9969C4BB7D"
+	         "5FD46FD3756489069157B282BB200735"
+	         "D82710CA5C22F0CCFA7CBF93D496AC15"
+	         "A56834CBCF98C397B4024A2691233B8D"),
+	    SHEX("00001234567800000000ABCD"),
+	    SHEX("83DE3541E4C2B58177E065A9BF7B62EC"));
+
   /* Test gcm_hash, with varying message size, keys and iv all zero.
      Not compared to any other implementation. */
   test_gcm_hash (SDATA("a"),
diff --git a/testsuite/meta-aead-test.c b/testsuite/meta-aead-test.c
index 1fcede40..ceeca227 100644
--- a/testsuite/meta-aead-test.c
+++ b/testsuite/meta-aead-test.c
@@ -8,6 +8,7 @@ const char* aeads[] = {
   "gcm_aes256",
   "gcm_camellia128",
   "gcm_camellia256",
+  "gcm_sm4",
   "eax_aes128",
   "chacha_poly1305",
 };
diff --git a/testsuite/meta-cipher-test.c b/testsuite/meta-cipher-test.c
index f949fd76..912fac5a 100644
--- a/testsuite/meta-cipher-test.c
+++ b/testsuite/meta-cipher-test.c
@@ -1,5 +1,6 @@
 #include "testutils.h"
 #include "nettle-meta.h"
+#include "nettle-internal.h"
 
 const char* ciphers[] = {
   "aes128",
@@ -18,7 +19,8 @@ const char* ciphers[] = {
   "serpent256",
   "twofish128",
   "twofish192",
-  "twofish256"
+  "twofish256",
+  "sm4"
 };
 
 void
@@ -34,8 +36,11 @@ test_main(void)
     ASSERT(NULL != nettle_ciphers[j]); /* make sure we found a matching cipher */
   }
   j = 0;
-  while (NULL != nettle_ciphers[j])
-    j++;
+  for (j = 0; NULL != nettle_ciphers[j]; j++)
+    {
+      ASSERT(nettle_ciphers[j]->block_size <= NETTLE_MAX_CIPHER_BLOCK_SIZE);
+      ASSERT(nettle_ciphers[j]->key_size <= NETTLE_MAX_CIPHER_KEY_SIZE);
+    }
   ASSERT(j == count); /* we are not missing testing any ciphers */
 }
   
diff --git a/testsuite/meta-hash-test.c b/testsuite/meta-hash-test.c
index 3aed43fc..6a15e7db 100644
--- a/testsuite/meta-hash-test.c
+++ b/testsuite/meta-hash-test.c
@@ -36,6 +36,7 @@ test_main(void)
   }
 
   for (i = 0; NULL != nettle_hashes[i]; i++) {
+    ASSERT(nettle_hashes[i]->block_size <= NETTLE_MAX_HASH_BLOCK_SIZE);
     ASSERT(nettle_hashes[i]->digest_size <= NETTLE_MAX_HASH_DIGEST_SIZE);
     ASSERT(nettle_hashes[i]->context_size <= NETTLE_MAX_HASH_CONTEXT_SIZE);
   }
diff --git a/testsuite/siv-gcm-test.c b/testsuite/siv-gcm-test.c
new file mode 100644
index 00000000..eba03f23
--- /dev/null
+++ b/testsuite/siv-gcm-test.c
@@ -0,0 +1,731 @@
+/* siv-gcm-test.c
+
+   Self-test and vectors for AES-GCM-SIV mode ciphers
+
+   Copyright (C) 2022 Red Hat, Inc.
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+/* The test vectors have been collected from the following standards:
+ * RFC8452
+ */
+
+#include "testutils.h"
+#include "ghash-internal.h"
+#include "block-internal.h"
+#include "aes.h"
+#include "siv-gcm.h"
+
+/* Generic signatures for one-shot AEAD message functions; the test_siv_gcm_aes* macros cast the AES-specific functions to these. */
+typedef void
+nettle_encrypt_message_func(void *ctx,
+			    size_t nlength, const uint8_t *nonce,
+			    size_t alength, const uint8_t *adata,
+			    size_t clength, uint8_t *dst, const uint8_t *src); /* the test passes the ciphertext length as clength */
+
+typedef int
+nettle_decrypt_message_func(void *ctx,
+			    size_t nlength, const uint8_t *nonce,
+			    size_t alength, const uint8_t *adata,
+			    size_t mlength, uint8_t *dst, const uint8_t *src); /* the test passes the cleartext length as mlength */
+
+static void
+test_compare_results (const char *name, /* label printed in the failure messages */
+		      const struct tstring *adata, /* only printed in the failure output */
+		      /* Expected results; their lengths also size both comparisons. */
+		      const struct tstring *e_clear,
+		      const struct tstring *e_cipher,
+		      /* Actual results, compared byte for byte against the expected data. */
+		      const void *clear,
+		      const void *cipher)
+{
+  if (!MEMEQ(e_cipher->length, e_cipher->data, cipher)) /* encryption output vs. expected ciphertext */
+    {
+      fprintf (stderr, "%s: encryption failed\nAdata: ", name);
+      tstring_print_hex (adata);
+      fprintf (stderr, "\nInput: ");
+      tstring_print_hex (e_clear);
+      fprintf (stderr, "\nOutput: ");
+      print_hex (e_cipher->length, cipher);
+      fprintf (stderr, "\nExpected:");
+      tstring_print_hex (e_cipher);
+      fprintf (stderr, "\n");
+      FAIL();
+    }
+  if (!MEMEQ(e_clear->length, e_clear->data, clear)) /* decryption output vs. expected cleartext */
+    {
+      fprintf (stderr, "%s decrypt failed:\nAdata:", name);
+      tstring_print_hex (adata);
+      fprintf (stderr, "\nInput: ");
+      tstring_print_hex (e_cipher);
+      fprintf (stderr, "\nOutput: ");
+      print_hex (e_clear->length, clear);
+      fprintf (stderr, "\nExpected:");
+      tstring_print_hex (e_clear);
+      fprintf (stderr, "\n");
+      FAIL();
+    }
+} /* test_compare_results */
+
+static void
+test_cipher_siv_gcm (const char *name,
+		     nettle_set_key_func *siv_gcm_set_key,
+		     nettle_encrypt_message_func *siv_gcm_encrypt,
+		     nettle_decrypt_message_func *siv_gcm_decrypt,
+		     size_t context_size, size_t key_size,
+		     const struct tstring *key,
+		     const struct tstring *nonce,
+		     const struct tstring *authdata,
+		     const struct tstring *cleartext,
+		     const struct tstring *ciphertext)
+{
+  void *ctx = xalloc (context_size);
+  uint8_t *en_data;
+  uint8_t *de_data;
+  int ret;
+  /* Check one vector in both directions with the supplied functions, then check that tampering is rejected. */
+  ASSERT (key->length == key_size);
+  ASSERT (cleartext->length + SIV_GCM_DIGEST_SIZE == ciphertext->length);
+
+  de_data = xalloc (cleartext->length);
+  en_data = xalloc (ciphertext->length);
+
+  /* Clear both outputs so the comparison only sees bytes written by the calls below. */
+  memset (de_data, 0, cleartext->length);
+  memset (en_data, 0, ciphertext->length);
+
+  siv_gcm_set_key (ctx, key->data);
+  siv_gcm_encrypt (ctx, nonce->length, nonce->data,
+		   authdata->length, authdata->data,
+		   ciphertext->length, en_data, cleartext->data);
+  ret = siv_gcm_decrypt (ctx, nonce->length, nonce->data,
+			 authdata->length, authdata->data,
+			 cleartext->length, de_data, ciphertext->data);
+
+  if (ret != 1)
+    {
+      fprintf (stderr, "siv_gcm_decrypt_message failed to validate message\n");
+      FAIL();
+    }
+  test_compare_results (name, authdata,
+			cleartext, ciphertext, de_data, en_data);
+
+  /* Ensure that we can detect corrupted message or tag data. */
+  en_data[0] ^= 1;
+  ret = siv_gcm_decrypt (ctx, nonce->length, nonce->data,
+			 authdata->length, authdata->data,
+			 cleartext->length, de_data, en_data);
+  if (ret != 0)
+    {
+      fprintf (stderr, "siv_gcm_decrypt_message failed to detect corrupted message\n");
+      FAIL();
+    }
+
+  /* Ensure we can detect corrupted adata: undo the bit flip, then drop the last adata byte. */
+  if (authdata->length)
+    {
+      en_data[0] ^= 1;
+      ret = siv_gcm_decrypt (ctx, nonce->length, nonce->data,
+			     authdata->length - 1, authdata->data,
+			     cleartext->length, de_data, en_data);
+      if (ret != 0)
+	{
+	  fprintf (stderr, "siv_gcm_decrypt_message failed to detect corrupted adata\n");
+	  FAIL();
+	}
+    }
+
+  free (ctx);
+  free (en_data);
+  free (de_data);
+}
+/* Run test_cipher_siv_gcm with the AES-128 key setup and the siv_gcm_aes128_* message functions. */
+#define test_siv_gcm_aes128(name, key, nonce, authdata, cleartext, ciphertext) \
+  test_cipher_siv_gcm(name, (nettle_set_key_func*)aes128_set_encrypt_key, \
+		      (nettle_encrypt_message_func*)siv_gcm_aes128_encrypt_message, \
+		      (nettle_decrypt_message_func*)siv_gcm_aes128_decrypt_message, \
+		      sizeof(struct aes128_ctx), AES128_KEY_SIZE,	\
+		      key, nonce, authdata, cleartext, ciphertext)
+/* Run test_cipher_siv_gcm with the AES-256 key setup and the siv_gcm_aes256_* message functions. */
+#define test_siv_gcm_aes256(name, key, nonce, authdata, cleartext, ciphertext) \
+  test_cipher_siv_gcm(name, (nettle_set_key_func*)aes256_set_encrypt_key, \
+		      (nettle_encrypt_message_func*)siv_gcm_aes256_encrypt_message, \
+		      (nettle_decrypt_message_func*)siv_gcm_aes256_decrypt_message, \
+		      sizeof(struct aes256_ctx), AES256_KEY_SIZE,	\
+		      key, nonce, authdata, cleartext, ciphertext)
+
+static void
+test_polyval_internal (const struct tstring *key,
+		       const struct tstring *message,
+		       const struct tstring *digest)
+{
+  ASSERT (key->length == GCM_BLOCK_SIZE);
+  ASSERT (message->length % GCM_BLOCK_SIZE == 0);
+  ASSERT (digest->length == GCM_BLOCK_SIZE);
+  struct gcm_key gcm_key;
+  union nettle_block16 state;
+  /* Check the internal _siv_ghash_* helpers on one vector; the key block is staged in state first. */
+  memcpy (state.b, key->data, GCM_BLOCK_SIZE);
+  _siv_ghash_set_key (&gcm_key, &state);
+  /* Start from an all-zero state, feed every whole block of MESSAGE, then apply block16_bswap (presumably a byte-order swap -- confirm in block-internal.h). */
+  block16_zero (&state);
+  _siv_ghash_update (&gcm_key, &state, message->length / GCM_BLOCK_SIZE, message->data);
+  block16_bswap (&state, &state);
+  /* The swapped state must equal the expected digest block. */
+  if (!MEMEQ(GCM_BLOCK_SIZE, state.b, digest->data))
+    {
+      fprintf (stderr, "POLYVAL failed\n");
+      fprintf (stderr, "Key: ");
+      tstring_print_hex (key);
+      fprintf (stderr, "\nMessage: ");
+      tstring_print_hex (message);
+      fprintf (stderr, "\nOutput: ");
+      print_hex (GCM_BLOCK_SIZE, state.b);
+      fprintf (stderr, "\nExpected:");
+      tstring_print_hex (digest);
+      fprintf (stderr, "\n");
+      FAIL();
+    }
+}
+
+void
+test_main(void)
+{
+  /* RFC8452, Appendix A.  */
+  test_polyval_internal (SHEX("25629347589242761d31f826ba4b757b"),
+			 SHEX("4f4f95668c83dfb6401762bb2d01a262"
+			      "d1a24ddd2721d006bbe45f20d3c9f362"),
+			 SHEX("f7a3b47b846119fae5b7866cf5e5b77e"));
+
+  /* RFC8452, Appendix C.1.  */
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX(""),
+		       SHEX(""),
+		       SHEX("dc20e2d83f25705bb49e439eca56de25"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV 1",
+		       SHEX("01000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX(""),
+		       SHEX("0100000000000000"),
+		       SHEX("b5d839330ac7b786578782fff6013b81"
+			    "5b287c22493a364c"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX(""),
+		       SHEX("010000000000000000000000"),
+		       SHEX("7323ea61d05932260047d942a4978db3"
+			    "57391a0bc4fdec8b0d106639"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX(""),
+		       SHEX("01000000000000000000000000000000"),
+		       SHEX("743f7c8077ab25f8624e2e948579cf77"
+			    "303aaf90f6fe21199c6068577437a0c4"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX(""),
+		       SHEX("01000000000000000000000000000000"
+			    "02000000000000000000000000000000"),
+		       SHEX("84e07e62ba83a6585417245d7ec413a9"
+			    "fe427d6315c09b57ce45f2e3936a9445"
+			    "1a8e45dcd4578c667cd86847bf6155ff"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX(""),
+		       SHEX("01000000000000000000000000000000"
+		            "02000000000000000000000000000000"
+		            "03000000000000000000000000000000"),
+		       SHEX("3fd24ce1f5a67b75bf2351f181a475c7"
+		            "b800a5b4d3dcf70106b1eea82fa1d64d"
+		            "f42bf7226122fa92e17a40eeaac1201b"
+		            "5e6e311dbf395d35b0fe39c2714388f8"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX(""),
+		       SHEX("01000000000000000000000000000000"
+		            "02000000000000000000000000000000"
+		            "03000000000000000000000000000000"
+		            "04000000000000000000000000000000"),
+		       SHEX("2433668f1058190f6d43e360f4f35cd8"
+		            "e475127cfca7028ea8ab5c20f7ab2af0"
+		            "2516a2bdcbc08d521be37ff28c152bba"
+		            "36697f25b4cd169c6590d1dd39566d3f"
+		            "8a263dd317aa88d56bdf3936dba75bb8"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX("01"),
+		       SHEX("0200000000000000"),
+		       SHEX("1e6daba35669f4273b0a1a2560969cdf"
+		            "790d99759abd1508"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX("01"),
+		       SHEX("020000000000000000000000"),
+		       SHEX("296c7889fd99f41917f4462008299c51"
+		            "02745aaa3a0c469fad9e075a"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX("01"),
+		       SHEX("02000000000000000000000000000000"),
+		       SHEX("e2b0c5da79a901c1745f700525cb335b"
+		            "8f8936ec039e4e4bb97ebd8c4457441f"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX("01"),
+		       SHEX("02000000000000000000000000000000"
+		            "03000000000000000000000000000000"),
+		       SHEX("620048ef3c1e73e57e02bb8562c416a3"
+		            "19e73e4caac8e96a1ecb2933145a1d71"
+		            "e6af6a7f87287da059a71684ed3498e1"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX("01"),
+		       SHEX("02000000000000000000000000000000"
+		            "03000000000000000000000000000000"
+		            "04000000000000000000000000000000"),
+		       SHEX("50c8303ea93925d64090d07bd109dfd9"
+		            "515a5a33431019c17d93465999a8b005"
+		            "3201d723120a8562b838cdff25bf9d1e"
+		            "6a8cc3865f76897c2e4b245cf31c51f2"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX("01"),
+		       SHEX("02000000000000000000000000000000"
+		            "03000000000000000000000000000000"
+		            "04000000000000000000000000000000"
+		            "05000000000000000000000000000000"),
+		       SHEX("2f5c64059db55ee0fb847ed513003746"
+		            "aca4e61c711b5de2e7a77ffd02da42fe"
+		            "ec601910d3467bb8b36ebbaebce5fba3"
+		            "0d36c95f48a3e7980f0e7ac299332a80"
+		            "cdc46ae475563de037001ef84ae21744"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX("010000000000000000000000"),
+		       SHEX("02000000"),
+		       SHEX("a8fe3e8707eb1f84fb28f8cb73de8e99"
+		            "e2f48a14"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX("01000000000000000000000000000000"
+		            "0200"),
+		       SHEX("03000000000000000000000000000000"
+		            "04000000"),
+		       SHEX("6bb0fecf5ded9b77f902c7d5da236a43"
+		            "91dd029724afc9805e976f451e6d87f6"
+		            "fe106514"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX("01000000000000000000000000000000"
+		            "02000000"),
+		       SHEX("03000000000000000000000000000000"
+		            "0400"),
+		       SHEX("44d0aaf6fb2f1f34add5e8064e83e12a"
+		            "2adabff9b2ef00fb47920cc72a0c0f13"
+		            "b9fd"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("e66021d5eb8e4f4066d4adb9c33560e4"),
+		       SHEX("f46e44bb3da0015c94f70887"),
+		       SHEX(""),
+		       SHEX(""),
+		       SHEX("a4194b79071b01a87d65f706e3949578"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("36864200e0eaf5284d884a0e77d31646"),
+		       SHEX("bae8e37fc83441b16034566b"),
+		       SHEX("46bb91c3c5"),
+		       SHEX("7a806c"),
+		       SHEX("af60eb711bd85bc1e4d3e0a462e074ee"
+		            "a428a8"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("aedb64a6c590bc84d1a5e269e4b47801"),
+		       SHEX("afc0577e34699b9e671fdd4f"),
+		       SHEX("fc880c94a95198874296"),
+		       SHEX("bdc66f146545"),
+		       SHEX("bb93a3e34d3cd6a9c45545cfc11f03ad"
+		            "743dba20f966"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("d5cc1fd161320b6920ce07787f86743b"),
+		       SHEX("275d1ab32f6d1f0434d8848c"),
+		       SHEX("046787f3ea22c127aaf195d1894728"),
+		       SHEX("1177441f195495860f"),
+		       SHEX("4f37281f7ad12949d01d02fd0cd174c8"
+		            "4fc5dae2f60f52fd2b"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("b3fed1473c528b8426a582995929a149"),
+		       SHEX("9e9ad8780c8d63d0ab4149c0"),
+		       SHEX("c9882e5386fd9f92ec489c8fde2be2cf"
+		            "97e74e93"),
+		       SHEX("9f572c614b4745914474e7c7"),
+		       SHEX("f54673c5ddf710c745641c8bc1dc2f87"
+		            "1fb7561da1286e655e24b7b0"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("2d4ed87da44102952ef94b02b805249b"),
+		       SHEX("ac80e6f61455bfac8308a2d4"),
+		       SHEX("2950a70d5a1db2316fd568378da107b5"
+		            "2b0da55210cc1c1b0a"),
+		       SHEX("0d8c8451178082355c9e940fea2f58"),
+		       SHEX("c9ff545e07b88a015f05b274540aa183"
+		            "b3449b9f39552de99dc214a1190b0b"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("bde3b2f204d1e9f8b06bc47f9745b3d1"),
+		       SHEX("ae06556fb6aa7890bebc18fe"),
+		       SHEX("1860f762ebfbd08284e421702de0de18"
+		            "baa9c9596291b08466f37de21c7f"),
+		       SHEX("6b3db4da3d57aa94842b9803a96e07fb"
+		            "6de7"),
+		       SHEX("6298b296e24e8cc35dce0bed484b7f30"
+		            "d5803e377094f04709f64d7b985310a4"
+		            "db84"));
+
+  test_siv_gcm_aes128 ("AEAD_AES_128_GCM_SIV",
+		       SHEX("f901cfe8a69615a93fdf7a98cad48179"),
+		       SHEX("6245709fb18853f68d833640"),
+		       SHEX("7576f7028ec6eb5ea7e298342a94d4b2"
+		            "02b370ef9768ec6561c4fe6b7e7296fa"
+		            "859c21"),
+		       SHEX("e42a3c02c25b64869e146d7b233987bd"
+		            "dfc240871d"),
+		       SHEX("391cc328d484a4f46406181bcd62efd9"
+		            "b3ee197d052d15506c84a9edd65e13e9"
+		            "d24a2a6e70"));
+
+  /* RFC8452, Appendix C.2.  */
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"
+			    "00000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX(""),
+		       SHEX(""),
+		       SHEX("07f5f4169bbf55a8400cd47ea6fd400f"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"
+			    "00000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX(""),
+		       SHEX("0100000000000000"),
+		       SHEX("c2ef328e5c71c83b843122130f7364b7"
+			    "61e0b97427e3df28"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"
+			    "00000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX(""),
+		       SHEX("010000000000000000000000"),
+		       SHEX("9aab2aeb3faa0a34aea8e2b18ca50da9"
+			    "ae6559e48fd10f6e5c9ca17e"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"
+		            "00000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX(""),
+		       SHEX("01000000000000000000000000000000"),
+		       SHEX("85a01b63025ba19b7fd3ddfc033b3e76"
+		            "c9eac6fa700942702e90862383c6c366"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"
+		            "00000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX(""),
+		       SHEX("01000000000000000000000000000000"
+		            "02000000000000000000000000000000"),
+		       SHEX("4a6a9db4c8c6549201b9edb53006cba8"
+		            "21ec9cf850948a7c86c68ac7539d027f"
+		            "e819e63abcd020b006a976397632eb5d"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"
+		            "00000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX(""),
+		       SHEX("01000000000000000000000000000000"
+		            "02000000000000000000000000000000"
+		            "03000000000000000000000000000000"),
+		       SHEX("c00d121893a9fa603f48ccc1ca3c57ce"
+		            "7499245ea0046db16c53c7c66fe717e3"
+		            "9cf6c748837b61f6ee3adcee17534ed5"
+		            "790bc96880a99ba804bd12c0e6a22cc4"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"
+		            "00000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX(""),
+		       SHEX("01000000000000000000000000000000"
+		            "02000000000000000000000000000000"
+		            "03000000000000000000000000000000"
+		            "04000000000000000000000000000000"),
+		       SHEX("c2d5160a1f8683834910acdafc41fbb1"
+		            "632d4a353e8b905ec9a5499ac34f96c7"
+		            "e1049eb080883891a4db8caaa1f99dd0"
+		            "04d80487540735234e3744512c6f90ce"
+		            "112864c269fc0d9d88c61fa47e39aa08"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"
+		            "00000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX("01"),
+		       SHEX("0200000000000000"),
+		       SHEX("1de22967237a813291213f267e3b452f"
+		            "02d01ae33e4ec854"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"
+		            "00000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX("01"),
+		       SHEX("020000000000000000000000"),
+		       SHEX("163d6f9cc1b346cd453a2e4cc1a4a19a"
+		            "e800941ccdc57cc8413c277f"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"
+		            "00000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX("01"),
+		       SHEX("02000000000000000000000000000000"),
+		       SHEX("c91545823cc24f17dbb0e9e807d5ec17"
+		            "b292d28ff61189e8e49f3875ef91aff7"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"
+		            "00000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX("01"),
+		       SHEX("02000000000000000000000000000000"
+		            "03000000000000000000000000000000"),
+		       SHEX("07dad364bfc2b9da89116d7bef6daaaf"
+		            "6f255510aa654f920ac81b94e8bad365"
+		            "aea1bad12702e1965604374aab96dbbc"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"
+		            "00000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX("01"),
+		       SHEX("02000000000000000000000000000000"
+		            "03000000000000000000000000000000"
+		            "04000000000000000000000000000000"),
+		       SHEX("c67a1f0f567a5198aa1fcc8e3f213143"
+		            "36f7f51ca8b1af61feac35a86416fa47"
+		            "fbca3b5f749cdf564527f2314f42fe25"
+		            "03332742b228c647173616cfd44c54eb"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"
+		            "00000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX("01"),
+		       SHEX("02000000000000000000000000000000"
+		            "03000000000000000000000000000000"
+		            "04000000000000000000000000000000"
+		            "05000000000000000000000000000000"),
+		       SHEX("67fd45e126bfb9a79930c43aad2d3696"
+		            "7d3f0e4d217c1e551f59727870beefc9"
+		            "8cb933a8fce9de887b1e40799988db1f"
+		            "c3f91880ed405b2dd298318858467c89"
+		            "5bde0285037c5de81e5b570a049b62a0"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"
+		            "00000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX("010000000000000000000000"),
+		       SHEX("02000000"),
+		       SHEX("22b3f4cd1835e517741dfddccfa07fa4"
+		            "661b74cf"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"
+		            "00000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX("01000000000000000000000000000000"
+		            "0200"),
+		       SHEX("03000000000000000000000000000000"
+		            "04000000"),
+		       SHEX("43dd0163cdb48f9fe3212bf61b201976"
+		            "067f342bb879ad976d8242acc188ab59"
+		            "cabfe307"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("01000000000000000000000000000000"
+		            "00000000000000000000000000000000"),
+		       SHEX("030000000000000000000000"),
+		       SHEX("01000000000000000000000000000000"
+		            "02000000"),
+		       SHEX("03000000000000000000000000000000"
+		            "0400"),
+		       SHEX("462401724b5ce6588d5a54aae5375513"
+		            "a075cfcdf5042112aa29685c912fc205"
+		            "6543"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("e66021d5eb8e4f4066d4adb9c33560e4"
+		            "f46e44bb3da0015c94f7088736864200"),
+		       SHEX("e0eaf5284d884a0e77d31646"),
+		       SHEX(""),
+		       SHEX(""),
+		       SHEX("169fbb2fbf389a995f6390af22228a62"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("bae8e37fc83441b16034566b7a806c46"
+		            "bb91c3c5aedb64a6c590bc84d1a5e269"),
+		       SHEX("e4b47801afc0577e34699b9e"),
+		       SHEX("4fbdc66f14"),
+		       SHEX("671fdd"),
+		       SHEX("0eaccb93da9bb81333aee0c785b240d3"
+		            "19719d"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("6545fc880c94a95198874296d5cc1fd1"
+		            "61320b6920ce07787f86743b275d1ab3"),
+		       SHEX("2f6d1f0434d8848c1177441f"),
+		       SHEX("6787f3ea22c127aaf195"),
+		       SHEX("195495860f04"),
+		       SHEX("a254dad4f3f96b62b84dc40c84636a5e"
+		            "c12020ec8c2c"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("d1894728b3fed1473c528b8426a58299"
+		            "5929a1499e9ad8780c8d63d0ab4149c0"),
+		       SHEX("9f572c614b4745914474e7c7"),
+		       SHEX("489c8fde2be2cf97e74e932d4ed87d"),
+		       SHEX("c9882e5386fd9f92ec"),
+		       SHEX("0df9e308678244c44bc0fd3dc6628dfe"
+		            "55ebb0b9fb2295c8c2"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("a44102952ef94b02b805249bac80e6f6"
+		            "1455bfac8308a2d40d8c845117808235"),
+		       SHEX("5c9e940fea2f582950a70d5a"),
+		       SHEX("0da55210cc1c1b0abde3b2f204d1e9f8"
+		            "b06bc47f"),
+		       SHEX("1db2316fd568378da107b52b"),
+		       SHEX("8dbeb9f7255bf5769dd56692404099c2"
+		            "587f64979f21826706d497d5"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("9745b3d1ae06556fb6aa7890bebc18fe"
+		            "6b3db4da3d57aa94842b9803a96e07fb"),
+		       SHEX("6de71860f762ebfbd08284e4"),
+		       SHEX("f37de21c7ff901cfe8a69615a93fdf7a"
+		            "98cad481796245709f"),
+		       SHEX("21702de0de18baa9c9596291b08466"),
+		       SHEX("793576dfa5c0f88729a7ed3c2f1bffb3"
+		            "080d28f6ebb5d3648ce97bd5ba67fd"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("b18853f68d833640e42a3c02c25b6486"
+		            "9e146d7b233987bddfc240871d7576f7"),
+		       SHEX("028ec6eb5ea7e298342a94d4"),
+		       SHEX("9c2159058b1f0fe91433a5bdc20e214e"
+		            "ab7fecef4454a10ef0657df21ac7"),
+		       SHEX("b202b370ef9768ec6561c4fe6b7e7296"
+		            "fa85"),
+		       SHEX("857e16a64915a787637687db4a951963"
+		            "5cdd454fc2a154fea91f8363a39fec7d"
+		            "0a49"));
+
+  test_siv_gcm_aes256 ("AEAD_AES_256_GCM_SIV",
+		       SHEX("3c535de192eaed3822a2fbbe2ca9dfc8"
+		            "8255e14a661b8aa82cc54236093bbc23"),
+		       SHEX("688089e55540db1872504e1c"),
+		       SHEX("734320ccc9d9bbbb19cb81b2af4ecbc3"
+		            "e72834321f7aa0f70b7282b4f33df23f"
+		            "167541"),
+		       SHEX("ced532ce4159b035277d4dfbb7db6296"
+		            "8b13cd4eec"),
+		       SHEX("626660c26ea6612fb17ad91e8e767639"
+		            "edd6c9faee9d6c7029675b89eaf4ba1d"
+		            "ed1a286594"));
+
+  /* RFC8452, Appendix C.3.  */
+  test_siv_gcm_aes256 ("Counter wrap",
+		       SHEX("00000000000000000000000000000000"
+		            "00000000000000000000000000000000"),
+		       SHEX("000000000000000000000000"),
+		       SHEX(""),
+		       SHEX("00000000000000000000000000000000"
+		            "4db923dc793ee6497c76dcc03a98e108"),
+		       SHEX("f3f80f2cf0cb2dd9c5984fcda908456c"
+		            "c537703b5ba70324a6793a7bf218d3ea"
+		            "ffffffff000000000000000000000000"));
+
+  test_siv_gcm_aes256 ("Counter wrap",
+		       SHEX("00000000000000000000000000000000"
+		            "00000000000000000000000000000000"),
+		       SHEX("000000000000000000000000"),
+		       SHEX(""),
+		       SHEX("eb3640277c7ffd1303c7a542d02d3e4c"
+		            "0000000000000000"),
+		       SHEX("18ce4f0b8cb4d0cac65fea8f79257b20"
+		            "888e53e72299e56dffffffff00000000"
+		            "0000000000000000"));
+}
diff --git a/testsuite/sm4-test.c b/testsuite/sm4-test.c
new file mode 100644
index 00000000..97d9d58a
--- /dev/null
+++ b/testsuite/sm4-test.c
@@ -0,0 +1,19 @@
+#include "testutils.h"
+#include "sm4.h"
+
+void
+test_main(void)
+{
+  /* Two test vectors, each field 32 hex digits (one 16-byte block), from:
+   * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+   */
+  test_cipher(&nettle_sm4,
+	      SHEX("0123456789ABCDEF FEDCBA9876543210"), /* presumably the key -- confirm against test_cipher */
+	      SHEX("0123456789ABCDEF FEDCBA9876543210"), /* presumably the cleartext (same bytes as above) */
+	      SHEX("681EDF34D206965E 86B3E94F536E4246")); /* presumably the expected ciphertext */
+
+  test_cipher(&nettle_sm4,
+	      SHEX("FEDCBA9876543210 0123456789ABCDEF"), /* same argument order as the first vector */
+	      SHEX("0001020304050607 08090A0B0C0D0E0F"),
+	      SHEX("F766678F13F01ADE AC1B3EA955ADB594"));
+}
diff --git a/testsuite/testutils.c b/testsuite/testutils.c
index 0d91d8ef..39c6bece 100644
--- a/testsuite/testutils.c
+++ b/testsuite/testutils.c
@@ -1109,6 +1109,13 @@ mpz_urandomb (mpz_t r, struct knuth_lfib_ctx *ctx, mp_bitcnt_t bits)
   nettle_mpz_set_str_256_u (r, bytes, buf);
   free (buf);
 }
+void
+mpz_urandomm (mpz_t r, struct knuth_lfib_ctx *ctx, const mpz_t n)
+{ /* Variant for the NETTLE_USE_MINI_GMP build: sets r to a random value reduced mod n. */
+  /* Draw 30 bits more than the bit size of n, to make the reduced result almost unbiased. */
+  mpz_urandomb(r, ctx, mpz_sizeinbase(n, 2) + 30);
+  mpz_mod(r, r, n); /* NOTE(review): n is not checked for zero here -- presumably callers pass n > 0 */
+}
 #else /* !NETTLE_USE_MINI_GMP */
 static void
 get_random_seed(mpz_t seed)
diff --git a/testsuite/testutils.h b/testsuite/testutils.h
index 3e239787..00555b3a 100644
--- a/testsuite/testutils.h
+++ b/testsuite/testutils.h
@@ -164,8 +164,10 @@ typedef struct knuth_lfib_ctx gmp_randstate_t[1];
 void gmp_randinit_default (struct knuth_lfib_ctx *ctx);
 #define gmp_randclear(state)
 void mpz_urandomb (mpz_t r, struct knuth_lfib_ctx *ctx, mp_bitcnt_t bits);
+void mpz_urandomm (mpz_t r, struct knuth_lfib_ctx *ctx, const mpz_t n);
 /* This is cheating */
 #define mpz_rrandomb mpz_urandomb
+#define mpz_rrandomm mpz_urandomm
 static inline int
 test_randomize (gmp_randstate_t rands UNUSED) { return 0; }
 #else /* !NETTLE_USE_MINI_GMP */
diff --git a/x86_64/fat/sha256-compress-2.asm b/x86_64/fat/sha256-compress-n-2.asm
index 996cf8c5..60f7c8f6 100644
--- a/x86_64/fat/sha256-compress-2.asm
+++ b/x86_64/fat/sha256-compress-n-2.asm
@@ -1,4 +1,4 @@
-C x86_64/fat/sha256-compress-2.asm
+C x86_64/fat/sha256-compress-n-2.asm
 
 ifelse(`
    Copyright (C) 2018 Niels Möller
@@ -31,4 +31,4 @@ ifelse(`
 ')
 
 define(`fat_transform', `$1_sha_ni')
-include_src(`x86_64/sha_ni/sha256-compress.asm')
+include_src(`x86_64/sha_ni/sha256-compress-n.asm')
diff --git a/x86_64/fat/sha256-compress.asm b/x86_64/fat/sha256-compress-n.asm
index 2aaeb5e8..fc358858 100644
--- a/x86_64/fat/sha256-compress.asm
+++ b/x86_64/fat/sha256-compress-n.asm
@@ -1,4 +1,4 @@
-C x86_64/fat/sha256-compress.asm
+C x86_64/fat/sha256-compress-n.asm
 
 ifelse(`
    Copyright (C) 2018 Niels Möller
@@ -31,4 +31,4 @@ ifelse(`
 ')
 
 define(`fat_transform', `$1_x86_64')
-include_src(`x86_64/sha256-compress.asm')
+include_src(`x86_64/sha256-compress-n.asm')
diff --git a/x86_64/poly1305-blocks.asm b/x86_64/poly1305-blocks.asm
new file mode 100644
index 00000000..63bfed3e
--- /dev/null
+++ b/x86_64/poly1305-blocks.asm
@@ -0,0 +1,128 @@
+C x86_64/poly1305-blocks.asm
+
+ifelse(`
+   Copyright (C) 2022 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+	.file "poly1305-blocks.asm"
+
+define(`CTX', `%rdi') C First argument to all functions
+define(`BLOCKS', `%rsi')
+define(`MP_PARAM', `%rdx')	C Moved to MP, to not collide with mul instruction.
+
+define(`MP', `%r8')		C May clobber, both with unix and windows conventions.
+define(`T0', `%rbx')
+define(`T1', `%rcx')
+define(`H0', `%rbp')
+define(`H1', `%r9')
+define(`H2', `%r10')
+define(`F0', `%r11')
+define(`F1', `%r12')
+
+C const uint8_t *
+C _nettle_poly1305_blocks (struct poly1305_ctx *ctx, size_t blocks, const uint8_t *m)
+
+PROLOGUE(_nettle_poly1305_blocks)
+	W64_ENTRY(3, 0)
+	mov	MP_PARAM, MP		C Free %rdx, which mul overwrites
+	test	BLOCKS, BLOCKS
+	jz	.Lend			C Zero blocks: state is not touched, m is returned as is
+
+	push 	%rbx			C Callee-saved register, used as T0
+	push 	%rbp			C Callee-saved register, used as H0
+	push	%r12			C Callee-saved register, used as F1
+	mov	P1305_H0 (CTX), H0	C Load the three state words once; kept in registers across blocks
+	mov	P1305_H1 (CTX), H1
+	mov	P1305_H2 (CTX), H2
+	ALIGN(16)
+.Loop:
+	mov	(MP), T0		C Read one 16-byte block as two 64-bit words
+	mov	8(MP), T1
+	add	$16, MP			C MP now points at the next block
+
+	add	H0, T0			C (H2, T1, T0) = state + block, carry propagated upwards
+	adc	H1, T1
+	adc	$1, H2			C The extra 1 is presumably the 2^128 padding bit of a full block
+
+	mov	P1305_R1 (CTX), %rax
+	mul	T0			C R1*T0
+	mov	%rax, F0		C F1:F0 collects the products added one word up
+	mov	%rdx, F1
+
+	mov	T0, %rax		C Last use of T0 input
+	mov	P1305_R0 (CTX), T0
+	mul	T0			C R0*T0
+	mov	%rax, H0		C H1:H0 collects the low-order products
+	mov	%rdx, H1
+
+	mov	T1, %rax
+	mul	T0			C R0*T1
+	add	%rax, F0
+	adc	%rdx, F1
+
+	mov	P1305_S1 (CTX), T0
+	mov	T1, %rax		C Last use of T1 input
+	mul	T0			C S1*T1
+	add	%rax, H0
+	adc	%rdx, H1
+
+	mov	H2, %rax
+	mul	T0			C S1*H2
+	add	%rax, F0
+	adc	%rdx, F1
+
+	mov	H2, T0			C Split H2: the low two bits stay in H2, the rest goes to T0
+	and	$3, H2
+
+	shr	$2, T0
+	mov	P1305_S0 (CTX), %rax
+	mul	T0			C S0*(H2 >> 2)
+	add	%rax, H0
+	adc	%rdx, H1
+
+	imul	P1305_R0 (CTX), H2	C R0*(H2 & 3)
+	add 	F0, H1			C Add F1:F0 one word up, into H2:H1
+	adc	F1, H2
+
+	dec	BLOCKS
+	jnz	.Loop
+
+	mov	H0, P1305_H0 (CTX)	C Write the updated state back to the context
+	mov	H1, P1305_H1 (CTX)
+	mov	H2, P1305_H2 (CTX)
+
+	pop	%r12			C Restore in reverse order of the pushes
+	pop	%rbp
+	pop 	%rbx
+
+.Lend:
+	mov	MP, %rax		C Return value: m advanced by 16 bytes per processed block
+	W64_EXIT(3, 0)
+	ret
+EPILOGUE(_nettle_poly1305_blocks)
diff --git a/x86_64/poly1305-internal.asm b/x86_64/poly1305-internal.asm
index ef2f38e4..7ce415a4 100644
--- a/x86_64/poly1305-internal.asm
+++ b/x86_64/poly1305-internal.asm
@@ -106,7 +106,7 @@ PROLOGUE(_nettle_poly1305_block)
 	adc	P1305_H2 (CTX), T2
 
 	mov	P1305_R1 (CTX), %rax
-	mul	T0			C R1 T0
+	mul	T0			C R1*T0
 	mov	%rax, F0
 	mov	%rdx, F1
 
diff --git a/x86_64/sha256-compress.asm b/x86_64/sha256-compress-n.asm
index 5ed669b1..e10d260c 100644
--- a/x86_64/sha256-compress.asm
+++ b/x86_64/sha256-compress-n.asm
@@ -1,7 +1,7 @@
-C x86_64/sha256-compress.asm
+C x86_64/sha256-compress-n.asm
 
 ifelse(`
-   Copyright (C) 2013 Niels Möller
+   Copyright (C) 2013, 2022 Niels Möller
 
    This file is part of GNU Nettle.
 
@@ -30,21 +30,24 @@ ifelse(`
    not, see http://www.gnu.org/licenses/.
 ')
 
-	.file "sha256-compress.asm"
+	.file "sha256-compress-n.asm"
 define(`STATE', `%rdi')
-define(`INPUT', `%rsi')
-define(`K', `%rdx')
+define(`K', `%rsi')
+define(`BLOCKS', `%rdx')
+define(`INPUT', `%rcx')
+define(`STATE_SAVED', `64(%rsp)')
+
 define(`SA', `%eax')
 define(`SB', `%ebx')
-define(`SC', `%ecx')
+define(`SC', `%ebp')
 define(`SD', `%r8d')
 define(`SE', `%r9d')
 define(`SF', `%r10d')
 define(`SG', `%r11d')
 define(`SH', `%r12d')
 define(`T0', `%r13d')
-define(`T1', `%edi')	C Overlap STATE
-define(`COUNT', `%r14')
+define(`T1', `%r14d')
+define(`COUNT', `%rdi')	C Overlap STATE
 define(`W', `%r15d')
 
 define(`EXPN', `
@@ -123,18 +126,21 @@ define(`NOEXPN', `
 	movl	W, OFFSET($1)(%rsp, COUNT, 4)
 ')
 
-	C void
-	C _nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k)
+	C const uint8_t *
+	C _nettle_sha256_compress_n(uint32_t *state, const uint32_t *k,
+	C                           size_t blocks, const uint8_t *input)
 
 	.text
 	ALIGN(16)
 
-PROLOGUE(_nettle_sha256_compress)
+PROLOGUE(_nettle_sha256_compress_n)
-	W64_ENTRY(3, 0)
+	W64_ENTRY(4, 0)		C Four arguments now: INPUT arrives as the fourth
+	test	BLOCKS, BLOCKS
+	jz	.Lend		C Zero blocks: leave state alone, return input unchanged
 
 	sub	$120, %rsp
-	mov	%rbx, 64(%rsp)
-	mov	STATE, 72(%rsp)	C Save state, to free a register
+	mov	STATE, STATE_SAVED	C Save state, to free a register
+	mov	%rbx, 72(%rsp)
 	mov	%rbp, 80(%rsp)
 	mov	%r12, 88(%rsp)
 	mov	%r13, 96(%rsp)
@@ -149,7 +155,9 @@ PROLOGUE(_nettle_sha256_compress)
 	movl	20(STATE), SF
 	movl	24(STATE), SG
 	movl	28(STATE), SH
-	xor	COUNT, COUNT
+
+.Loop_block:
+	xorl	XREG(COUNT), XREG(COUNT)	C COUNT shares its register with STATE; STATE is reloaded from STATE_SAVED after the rounds
 	ALIGN(16)
 
 .Loop1:
@@ -161,8 +169,8 @@ PROLOGUE(_nettle_sha256_compress)
 	NOEXPN(5) ROUND(SD,SE,SF,SG,SH,SA,SB,SC,5)
 	NOEXPN(6) ROUND(SC,SD,SE,SF,SG,SH,SA,SB,6)
 	NOEXPN(7) ROUND(SB,SC,SD,SE,SF,SG,SH,SA,7)
-	add	$8, COUNT
-	cmp	$16, COUNT
+	addl	$8, XREG(COUNT)
+	cmpl	$16, XREG(COUNT)
 	jne	.Loop1
 
 .Loop2:
@@ -182,22 +190,35 @@ PROLOGUE(_nettle_sha256_compress)
 	EXPN(13) ROUND(SD,SE,SF,SG,SH,SA,SB,SC,13)
 	EXPN(14) ROUND(SC,SD,SE,SF,SG,SH,SA,SB,14)
 	EXPN(15) ROUND(SB,SC,SD,SE,SF,SG,SH,SA,15)
-	add	$16, COUNT
-	cmp	$64, COUNT
+	addl	$16, XREG(COUNT)
+	cmpl	$64, XREG(COUNT)
 	jne	.Loop2
 
-	mov	72(%rsp), STATE
-
-	addl	SA, (STATE)
-	addl	SB, 4(STATE)
-	addl	SC, 8(STATE)
-	addl	SD, 12(STATE)
-	addl	SE, 16(STATE)
-	addl	SF, 20(STATE)
-	addl	SG, 24(STATE)
-	addl	SH, 28(STATE)
-
-	mov	64(%rsp), %rbx
+	mov	STATE_SAVED, STATE
+
+	addl	(STATE), SA	C Add the previous state into the registers, so they are ready for the next block
+	addl	4(STATE), SB
+	addl	8(STATE), SC
+	addl	12(STATE), SD
+	addl	16(STATE), SE
+	addl	20(STATE), SF
+	addl	24(STATE), SG
+	addl	28(STATE), SH
+
+	movl	SA, (STATE)	C Store the updated state after every block
+	movl	SB, 4(STATE)
+	movl	SC, 8(STATE)
+	movl	SD, 12(STATE)
+	movl	SE, 16(STATE)
+	movl	SF, 20(STATE)
+	movl	SG, 24(STATE)
+	movl	SH, 28(STATE)
+
+	add	$64, INPUT	C Advance to the next 64-byte block
+	dec	BLOCKS
+	jnz	.Loop_block
+
+	mov	72(%rsp), %rbx
 	mov	80(%rsp), %rbp
 	mov	88(%rsp), %r12
 	mov	96(%rsp), %r13
@@ -205,6 +226,8 @@ PROLOGUE(_nettle_sha256_compress)
 	mov	112(%rsp),%r15
 
 	add	$120, %rsp
+.Lend:
+	mov	INPUT, %rax	C Return value: input pointer past the processed blocks
-	W64_EXIT(3, 0)
+	W64_EXIT(4, 0)		C Must match the W64_ENTRY arguments
 	ret
-EPILOGUE(_nettle_sha256_compress)
+EPILOGUE(_nettle_sha256_compress_n)
diff --git a/x86_64/sha_ni/sha256-compress.asm b/x86_64/sha_ni/sha256-compress-n.asm
index 00bd3cd3..005909df 100644
--- a/x86_64/sha_ni/sha256-compress.asm
+++ b/x86_64/sha_ni/sha256-compress-n.asm
@@ -1,7 +1,7 @@
-C x86_64/sha_ni/sha256-compress.asm
+C x86_64/sha_ni/sha256-compress-n.asm
 
 ifelse(`
-   Copyright (C) 2018 Niels Möller
+   Copyright (C) 2018, 2022 Niels Möller
 
    This file is part of GNU Nettle.
 
@@ -30,10 +30,11 @@ ifelse(`
    not, see http://www.gnu.org/licenses/.
 ')
 
-	.file "sha256-compress.asm"
+	.file "sha256-compress-n.asm"
 define(`STATE', `%rdi')
-define(`INPUT', `%rsi')
-define(`K', `%rdx')
+define(`K', `%rsi')
+define(`BLOCKS', `%rdx')
+define(`INPUT', `%rcx')
 
 define(`MSGK',`%xmm0')	C Implicit operand of sha256rnds2
 define(`MSG0',`%xmm1')
@@ -45,7 +46,7 @@ define(`CDGH',`%xmm6')
 define(`ABEF_ORIG',`%xmm7')
 define(`CDGH_ORIG', `%xmm8')
 define(`SWAP_MASK',`%xmm9')
-define(`TMP', `%xmm9')	C Overlaps SWAP_MASK
+define(`TMP', `%xmm10')
 
 C QROUND(M0, M1, M2, M3, R)
 define(`QROUND', `
@@ -69,15 +70,19 @@ define(`TRANSPOSE', `
 	punpcklqdq $1, $3
 ')
 
-	C void
-	C _nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k)
+	C const uint8_t *
+	C _nettle_sha256_compress_n(uint32_t *state, const uint32_t *k,
+	C                           size_t blocks, const uint8_t *input)
 
 	.text
 	ALIGN(16)
 .Lswap_mask:
 	.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12
-PROLOGUE(_nettle_sha256_compress)
-	W64_ENTRY(3, 10)
+PROLOGUE(_nettle_sha256_compress_n)
+	W64_ENTRY(4, 11)
+	test	BLOCKS, BLOCKS
+	jz	.Lend
+
 	movups	(STATE), TMP
 	movups	16(STATE), ABEF
 
@@ -88,12 +93,13 @@ PROLOGUE(_nettle_sha256_compress)
 
 	movdqa	.Lswap_mask(%rip), SWAP_MASK
 
-	movdqa	ABEF, ABEF_ORIG
-	movdqa	CDGH, CDGH_ORIG
-
+.Loop:
 	movups	(INPUT), MSG0
 	pshufb	SWAP_MASK, MSG0
 
+	movdqa	ABEF, ABEF_ORIG
+	movdqa	CDGH, CDGH_ORIG
+
 	movdqa	(K), MSGK
 	paddd	MSG0, MSGK
 	sha256rnds2 ABEF, CDGH		C Round 0-1
@@ -163,6 +169,10 @@ PROLOGUE(_nettle_sha256_compress)
 	paddd ABEF_ORIG, ABEF
 	paddd CDGH_ORIG, CDGH
 
+	add	$64, INPUT
+	dec	BLOCKS
+	jnz	.Loop
+
 	TRANSPOSE(ABEF, CDGH, TMP)
 
 	pshufd	$0x1b, CDGH, CDGH
@@ -170,6 +180,8 @@ PROLOGUE(_nettle_sha256_compress)
 	movups	CDGH, 0(STATE)
 	movups	TMP, 16(STATE)
 
-	W64_EXIT(3, 10)
+.Lend:
+	mov	INPUT, %rax
+	W64_EXIT(4, 11)
 	ret
-EPILOGUE(_nettle_sha256_compress)
+EPILOGUE(_nettle_sha256_compress_n)