summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNiels Möller <nisse@lysator.liu.se>2022-07-05 19:47:39 +0200
committerNiels Möller <nisse@lysator.liu.se>2022-07-05 19:47:39 +0200
commitfad1bf50a5087a5b88a5500965ba3959d11f997f (patch)
tree428299e99c30bb24ce2bf048e5a887ac2dbc8eca
parent12bb2223428be4326c580b1b1f8e9916a2839cb1 (diff)
downloadnettle-fad1bf50a5087a5b88a5500965ba3959d11f997f.tar.gz
Implement _nettle_sha256_compress_n, C and x86_64 asm
-rw-r--r--ChangeLog14
-rw-r--r--Makefile.in2
-rw-r--r--configure.ac6
-rw-r--r--fat-setup.h4
-rw-r--r--fat-x86_64.c17
-rw-r--r--sha2-internal.h5
-rw-r--r--sha256-compress-n.c (renamed from sha256-compress.c)123
-rw-r--r--sha256.c30
-rw-r--r--x86_64/fat/sha256-compress-n-2.asm (renamed from x86_64/fat/sha256-compress-2.asm)4
-rw-r--r--x86_64/fat/sha256-compress-n.asm (renamed from x86_64/fat/sha256-compress.asm)4
-rw-r--r--x86_64/sha256-compress-n.asm (renamed from x86_64/sha256-compress.asm)85
-rw-r--r--x86_64/sha_ni/sha256-compress-n.asm (renamed from x86_64/sha_ni/sha256-compress.asm)42
12 files changed, 205 insertions, 131 deletions
diff --git a/ChangeLog b/ChangeLog
index 00d7b2c8..643c38b4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,20 @@
2022-07-05 Niels Möller <nisse@lysator.liu.se>
* md-internal.h (MD_FILL_OR_RETURN): New file, new macro.
+ * sha256-compress-n.c (_nettle_sha256_compress_n): New file and
+ function, replacing...
+ * sha256-compress.c (_nettle_sha256_compress): ...deleted file and
+ function.
+ * sha2-internal.h (_nettle_sha256_compress_n): Declare new function.
+ * sha256.c (sha256_compress): Update to use
+ _nettle_sha256_compress_n and MD_FILL_OR_RETURN.
+ * x86_64/sha256-compress-n.asm: New file, replacing...
+ * x86_64/sha256-compress.asm: ...deleted file.
+ * x86_64/sha_ni/sha256-compress-n.asm: New file, replacing...
+ * x86_64/sha_ni/sha256-compress.asm: ...deleted file.
+ * fat-setup.h (sha256_compress_n_func): New typedef, replacing...
+ (sha256_compress_func): ... deleted typedef.
+ * fat-x86_64.c: Update fat setup.
2022-06-20 Niels Möller <nisse@lysator.liu.se>
diff --git a/Makefile.in b/Makefile.in
index ba536407..64027d4d 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -138,7 +138,7 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c aes-decrypt-table.c \
salsa20-set-nonce.c \
salsa20-128-set-key.c salsa20-256-set-key.c \
sha1.c sha1-compress.c sha1-meta.c \
- sha256.c sha256-compress.c sha224-meta.c sha256-meta.c \
+ sha256.c sha256-compress-n.c sha224-meta.c sha256-meta.c \
sha512.c sha512-compress.c sha384-meta.c sha512-meta.c \
sha512-224-meta.c sha512-256-meta.c \
sha3.c sha3-permute.c \
diff --git a/configure.ac b/configure.ac
index 73c6fc21..cb30dfb3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -591,7 +591,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
chacha-core-internal.asm \
salsa20-crypt.asm salsa20-core-internal.asm \
serpent-encrypt.asm serpent-decrypt.asm \
- sha1-compress.asm sha256-compress.asm sha512-compress.asm \
+ sha1-compress.asm sha256-compress-n.asm sha512-compress.asm \
sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
# Assembler files which generate additional object files if they are used.
@@ -607,7 +607,7 @@ asm_nettle_optional_list="cpuid.asm cpu-facility.asm \
chacha-2core.asm chacha-3core.asm chacha-4core.asm chacha-core-internal-2.asm \
ghash-set-key-2.asm ghash-update-2.asm \
salsa20-2core.asm salsa20-core-internal-2.asm \
- sha1-compress-2.asm sha256-compress-2.asm \
+ sha1-compress-2.asm sha256-compress-n-2.asm \
sha3-permute-2.asm sha512-compress-2.asm \
umac-nh-n-2.asm umac-nh-2.asm"
@@ -757,7 +757,7 @@ AH_VERBATIM([HAVE_NATIVE],
#undef HAVE_NATIVE_salsa20_2core
#undef HAVE_NATIVE_fat_salsa20_2core
#undef HAVE_NATIVE_sha1_compress
-#undef HAVE_NATIVE_sha256_compress
+#undef HAVE_NATIVE_sha256_compress_n
#undef HAVE_NATIVE_sha512_compress
#undef HAVE_NATIVE_sha3_permute
#undef HAVE_NATIVE_umac_nh
diff --git a/fat-setup.h b/fat-setup.h
index e77cce02..70bc2687 100644
--- a/fat-setup.h
+++ b/fat-setup.h
@@ -178,7 +178,9 @@ typedef void salsa20_crypt_func (struct salsa20_ctx *ctx, unsigned rounds,
const uint8_t *src);
typedef void sha1_compress_func(uint32_t *state, const uint8_t *input);
-typedef void sha256_compress_func(uint32_t *state, const uint8_t *input, const uint32_t *k);
+typedef const uint8_t *
+sha256_compress_n_func(uint32_t *state, const uint32_t *k,
+ size_t blocks, const uint8_t *input);
struct sha3_state;
typedef void sha3_permute_func (struct sha3_state *state);
diff --git a/fat-x86_64.c b/fat-x86_64.c
index 47cf78ae..0a2fedf4 100644
--- a/fat-x86_64.c
+++ b/fat-x86_64.c
@@ -155,9 +155,9 @@ DECLARE_FAT_FUNC(nettle_sha1_compress, sha1_compress_func)
DECLARE_FAT_FUNC_VAR(sha1_compress, sha1_compress_func, x86_64)
DECLARE_FAT_FUNC_VAR(sha1_compress, sha1_compress_func, sha_ni)
-DECLARE_FAT_FUNC(_nettle_sha256_compress, sha256_compress_func)
-DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, x86_64)
-DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, sha_ni)
+DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, x86_64)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, sha_ni)
DECLARE_FAT_FUNC(_nettle_ghash_set_key, ghash_set_key_func)
DECLARE_FAT_FUNC_VAR(ghash_set_key, ghash_set_key_func, c)
@@ -228,14 +228,14 @@ fat_init (void)
if (verbose)
fprintf (stderr, "libnettle: using sha_ni instructions.\n");
nettle_sha1_compress_vec = _nettle_sha1_compress_sha_ni;
- _nettle_sha256_compress_vec = _nettle_sha256_compress_sha_ni;
+ _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_sha_ni;
}
else
{
if (verbose)
fprintf (stderr, "libnettle: not using sha_ni instructions.\n");
nettle_sha1_compress_vec = _nettle_sha1_compress_x86_64;
- _nettle_sha256_compress_vec = _nettle_sha256_compress_x86_64;
+ _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_x86_64;
}
if (features.have_pclmul)
@@ -315,9 +315,10 @@ DEFINE_FAT_FUNC(nettle_sha1_compress, void,
(uint32_t *state, const uint8_t *input),
(state, input))
-DEFINE_FAT_FUNC(_nettle_sha256_compress, void,
- (uint32_t *state, const uint8_t *input, const uint32_t *k),
- (state, input, k))
+DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *,
+ (uint32_t *state, const uint32_t *k,
+ size_t blocks, const uint8_t *input),
+ (state, k, blocks, input))
DEFINE_FAT_FUNC(_nettle_ghash_set_key, void,
(struct gcm_key *ctx, const union nettle_block16 *key),
diff --git a/sha2-internal.h b/sha2-internal.h
index 40f25a5f..93080bee 100644
--- a/sha2-internal.h
+++ b/sha2-internal.h
@@ -39,8 +39,9 @@
/* Internal compression function. STATE points to 8 uint32_t words,
DATA points to 64 bytes of input data, possibly unaligned, and K
points to the table of constants. */
-void
-_nettle_sha256_compress(uint32_t *state, const uint8_t *data, const uint32_t *k);
+const uint8_t *
+_nettle_sha256_compress_n(uint32_t *state, const uint32_t *k,
+ size_t blocks, const uint8_t *data);
/* Internal compression function. STATE points to 8 uint64_t words,
DATA points to 128 bytes of input data, possibly unaligned, and K
diff --git a/sha256-compress.c b/sha256-compress-n.c
index cf17e3e1..1e40cb1d 100644
--- a/sha256-compress.c
+++ b/sha256-compress-n.c
@@ -1,8 +1,8 @@
-/* sha256-compress.c
+/* sha256-compress-n.c
The compression function of the sha256 hash function.
- Copyright (C) 2001, 2010 Niels Möller
+ Copyright (C) 2001, 2010, 2022 Niels Möller
This file is part of GNU Nettle.
@@ -124,20 +124,12 @@ _nettle_sha256_compress_c(uint32_t *state, const uint8_t *input, const uint32_t
#define _nettle_sha256_compress _nettle_sha256_compress_c
#endif
-void
-_nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k)
+const uint8_t *
+_nettle_sha256_compress_n(uint32_t *state, const uint32_t *table,
+ size_t blocks, const uint8_t *input)
{
- uint32_t data[SHA256_DATA_LENGTH];
uint32_t A, B, C, D, E, F, G, H; /* Local vars */
- unsigned i;
- uint32_t *d;
- for (i = 0; i < SHA256_DATA_LENGTH; i++, input+= 4)
- {
- data[i] = READ_UINT32(input);
- }
-
- /* Set up first buffer and local data buffer */
A = state[0];
B = state[1];
C = state[2];
@@ -146,55 +138,68 @@ _nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k
F = state[5];
G = state[6];
H = state[7];
-
- /* Heavy mangling */
- /* First 16 subrounds that act on the original data */
- DEBUG(-1);
- for (i = 0, d = data; i<16; i+=8, k += 8, d+= 8)
+ for (; blocks > 0; blocks--)
{
- ROUND(A, B, C, D, E, F, G, H, k[0], d[0]); DEBUG(i);
- ROUND(H, A, B, C, D, E, F, G, k[1], d[1]); DEBUG(i+1);
- ROUND(G, H, A, B, C, D, E, F, k[2], d[2]);
- ROUND(F, G, H, A, B, C, D, E, k[3], d[3]);
- ROUND(E, F, G, H, A, B, C, D, k[4], d[4]);
- ROUND(D, E, F, G, H, A, B, C, k[5], d[5]);
- ROUND(C, D, E, F, G, H, A, B, k[6], d[6]); DEBUG(i+6);
- ROUND(B, C, D, E, F, G, H, A, k[7], d[7]); DEBUG(i+7);
- }
+ uint32_t data[SHA256_DATA_LENGTH];
+ unsigned i;
+ const uint32_t *k;
+ uint32_t *d;
+ for (i = 0; i < SHA256_DATA_LENGTH; i++, input+= 4)
+ {
+ data[i] = READ_UINT32(input);
+ }
+
+ /* Heavy mangling */
+ /* First 16 subrounds that act on the original data */
+
+ DEBUG(-1);
+ for (i = 0, d = data, k = table; i<16; i+=8, k += 8, d+= 8)
+ {
+ ROUND(A, B, C, D, E, F, G, H, k[0], d[0]); DEBUG(i);
+ ROUND(H, A, B, C, D, E, F, G, k[1], d[1]); DEBUG(i+1);
+ ROUND(G, H, A, B, C, D, E, F, k[2], d[2]);
+ ROUND(F, G, H, A, B, C, D, E, k[3], d[3]);
+ ROUND(E, F, G, H, A, B, C, D, k[4], d[4]);
+ ROUND(D, E, F, G, H, A, B, C, k[5], d[5]);
+ ROUND(C, D, E, F, G, H, A, B, k[6], d[6]); DEBUG(i+6);
+ ROUND(B, C, D, E, F, G, H, A, k[7], d[7]); DEBUG(i+7);
+ }
- for (; i<64; i += 16, k+= 16)
- {
- ROUND(A, B, C, D, E, F, G, H, k[ 0], EXPAND(data, 0)); DEBUG(i);
- ROUND(H, A, B, C, D, E, F, G, k[ 1], EXPAND(data, 1)); DEBUG(i+1);
- ROUND(G, H, A, B, C, D, E, F, k[ 2], EXPAND(data, 2)); DEBUG(i+2);
- ROUND(F, G, H, A, B, C, D, E, k[ 3], EXPAND(data, 3)); DEBUG(i+3);
- ROUND(E, F, G, H, A, B, C, D, k[ 4], EXPAND(data, 4)); DEBUG(i+4);
- ROUND(D, E, F, G, H, A, B, C, k[ 5], EXPAND(data, 5)); DEBUG(i+5);
- ROUND(C, D, E, F, G, H, A, B, k[ 6], EXPAND(data, 6)); DEBUG(i+6);
- ROUND(B, C, D, E, F, G, H, A, k[ 7], EXPAND(data, 7)); DEBUG(i+7);
- ROUND(A, B, C, D, E, F, G, H, k[ 8], EXPAND(data, 8)); DEBUG(i+8);
- ROUND(H, A, B, C, D, E, F, G, k[ 9], EXPAND(data, 9)); DEBUG(i+9);
- ROUND(G, H, A, B, C, D, E, F, k[10], EXPAND(data, 10)); DEBUG(i+10);
- ROUND(F, G, H, A, B, C, D, E, k[11], EXPAND(data, 11)); DEBUG(i+11);
- ROUND(E, F, G, H, A, B, C, D, k[12], EXPAND(data, 12)); DEBUG(i+12);
- ROUND(D, E, F, G, H, A, B, C, k[13], EXPAND(data, 13)); DEBUG(i+13);
- ROUND(C, D, E, F, G, H, A, B, k[14], EXPAND(data, 14)); DEBUG(i+14);
- ROUND(B, C, D, E, F, G, H, A, k[15], EXPAND(data, 15)); DEBUG(i+15);
- }
-
- /* Update state */
- state[0] += A;
- state[1] += B;
- state[2] += C;
- state[3] += D;
- state[4] += E;
- state[5] += F;
- state[6] += G;
- state[7] += H;
+ for (; i<64; i += 16, k+= 16)
+ {
+ ROUND(A, B, C, D, E, F, G, H, k[ 0], EXPAND(data, 0)); DEBUG(i);
+ ROUND(H, A, B, C, D, E, F, G, k[ 1], EXPAND(data, 1)); DEBUG(i+1);
+ ROUND(G, H, A, B, C, D, E, F, k[ 2], EXPAND(data, 2)); DEBUG(i+2);
+ ROUND(F, G, H, A, B, C, D, E, k[ 3], EXPAND(data, 3)); DEBUG(i+3);
+ ROUND(E, F, G, H, A, B, C, D, k[ 4], EXPAND(data, 4)); DEBUG(i+4);
+ ROUND(D, E, F, G, H, A, B, C, k[ 5], EXPAND(data, 5)); DEBUG(i+5);
+ ROUND(C, D, E, F, G, H, A, B, k[ 6], EXPAND(data, 6)); DEBUG(i+6);
+ ROUND(B, C, D, E, F, G, H, A, k[ 7], EXPAND(data, 7)); DEBUG(i+7);
+ ROUND(A, B, C, D, E, F, G, H, k[ 8], EXPAND(data, 8)); DEBUG(i+8);
+ ROUND(H, A, B, C, D, E, F, G, k[ 9], EXPAND(data, 9)); DEBUG(i+9);
+ ROUND(G, H, A, B, C, D, E, F, k[10], EXPAND(data, 10)); DEBUG(i+10);
+ ROUND(F, G, H, A, B, C, D, E, k[11], EXPAND(data, 11)); DEBUG(i+11);
+ ROUND(E, F, G, H, A, B, C, D, k[12], EXPAND(data, 12)); DEBUG(i+12);
+ ROUND(D, E, F, G, H, A, B, C, k[13], EXPAND(data, 13)); DEBUG(i+13);
+ ROUND(C, D, E, F, G, H, A, B, k[14], EXPAND(data, 14)); DEBUG(i+14);
+ ROUND(B, C, D, E, F, G, H, A, k[15], EXPAND(data, 15)); DEBUG(i+15);
+ }
+
+ /* Update state */
+ state[0] = A = state[0] + A;
+ state[1] = B = state[1] + B;
+ state[2] = C = state[2] + C;
+ state[3] = D = state[3] + D;
+ state[4] = E = state[4] + E;
+ state[5] = F = state[5] + F;
+ state[6] = G = state[6] + G;
+ state[7] = H = state[7] + H;
#if SHA256_DEBUG
- fprintf(stderr, "99: %8x %8x %8x %8x %8x %8x %8x %8x\n",
- state[0], state[1], state[2], state[3],
- state[4], state[5], state[6], state[7]);
+ fprintf(stderr, "99: %8x %8x %8x %8x %8x %8x %8x %8x\n",
+ state[0], state[1], state[2], state[3],
+ state[4], state[5], state[6], state[7]);
#endif
+ }
+ return input;
}
diff --git a/sha256.c b/sha256.c
index 3872ca6f..0c9c21a0 100644
--- a/sha256.c
+++ b/sha256.c
@@ -46,6 +46,7 @@
#include "sha2-internal.h"
#include "macros.h"
+#include "md-internal.h"
#include "nettle-write.h"
/* Generated by the shadata program. */
@@ -70,6 +71,12 @@ K[64] =
0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL,
};
+void
+sha256_compress(uint32_t *state, const uint8_t *input)
+{
+ _nettle_sha256_compress_n(state, K, 1, input);
+}
+
#define COMPRESS(ctx, data) (sha256_compress((ctx)->state, (data)))
/* Initialize the SHA values */
@@ -97,7 +104,22 @@ void
sha256_update(struct sha256_ctx *ctx,
size_t length, const uint8_t *data)
{
- MD_UPDATE (ctx, length, data, COMPRESS, ctx->count++);
+ size_t blocks;
+ if (ctx->index > 0)
+ {
+ /* Try to fill partial block */
+ MD_FILL_OR_RETURN (ctx, length, data);
+ sha256_compress (ctx->state, ctx->block);
+ ctx->count++;
+ }
+
+ blocks = length >> 6;
+ data = _nettle_sha256_compress_n (ctx->state, K, blocks, data);
+ ctx->count += blocks;
+ length &= 63;
+
+ memcpy (ctx->block, data, length);
+ ctx->index = length;
}
static void
@@ -161,9 +183,3 @@ sha224_digest(struct sha256_ctx *ctx,
sha256_write_digest(ctx, length, digest);
sha224_init(ctx);
}
-
-void
-sha256_compress(uint32_t *state, const uint8_t *input)
-{
- _nettle_sha256_compress(state, input, K);
-}
diff --git a/x86_64/fat/sha256-compress-2.asm b/x86_64/fat/sha256-compress-n-2.asm
index 996cf8c5..60f7c8f6 100644
--- a/x86_64/fat/sha256-compress-2.asm
+++ b/x86_64/fat/sha256-compress-n-2.asm
@@ -1,4 +1,4 @@
-C x86_64/fat/sha256-compress-2.asm
+C x86_64/fat/sha256-compress-n-2.asm
ifelse(`
Copyright (C) 2018 Niels Möller
@@ -31,4 +31,4 @@ ifelse(`
')
define(`fat_transform', `$1_sha_ni')
-include_src(`x86_64/sha_ni/sha256-compress.asm')
+include_src(`x86_64/sha_ni/sha256-compress-n.asm')
diff --git a/x86_64/fat/sha256-compress.asm b/x86_64/fat/sha256-compress-n.asm
index 2aaeb5e8..fc358858 100644
--- a/x86_64/fat/sha256-compress.asm
+++ b/x86_64/fat/sha256-compress-n.asm
@@ -1,4 +1,4 @@
-C x86_64/fat/sha256-compress.asm
+C x86_64/fat/sha256-compress-n.asm
ifelse(`
Copyright (C) 2018 Niels Möller
@@ -31,4 +31,4 @@ ifelse(`
')
define(`fat_transform', `$1_x86_64')
-include_src(`x86_64/sha256-compress.asm')
+include_src(`x86_64/sha256-compress-n.asm')
diff --git a/x86_64/sha256-compress.asm b/x86_64/sha256-compress-n.asm
index 5ed669b1..e10d260c 100644
--- a/x86_64/sha256-compress.asm
+++ b/x86_64/sha256-compress-n.asm
@@ -1,7 +1,7 @@
-C x86_64/sha256-compress.asm
+C x86_64/sha256-compress-n.asm
ifelse(`
- Copyright (C) 2013 Niels Möller
+ Copyright (C) 2013, 2022 Niels Möller
This file is part of GNU Nettle.
@@ -30,21 +30,24 @@ ifelse(`
not, see http://www.gnu.org/licenses/.
')
- .file "sha256-compress.asm"
+ .file "sha256-compress-n.asm"
define(`STATE', `%rdi')
-define(`INPUT', `%rsi')
-define(`K', `%rdx')
+define(`K', `%rsi')
+define(`BLOCKS', `%rdx')
+define(`INPUT', `%rcx')
+define(`STATE_SAVED', `64(%rsp)')
+
define(`SA', `%eax')
define(`SB', `%ebx')
-define(`SC', `%ecx')
+define(`SC', `%ebp')
define(`SD', `%r8d')
define(`SE', `%r9d')
define(`SF', `%r10d')
define(`SG', `%r11d')
define(`SH', `%r12d')
define(`T0', `%r13d')
-define(`T1', `%edi') C Overlap STATE
-define(`COUNT', `%r14')
+define(`T1', `%r14d')
+define(`COUNT', `%rdi') C Overlap STATE
define(`W', `%r15d')
define(`EXPN', `
@@ -123,18 +126,21 @@ define(`NOEXPN', `
movl W, OFFSET($1)(%rsp, COUNT, 4)
')
- C void
- C _nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k)
+ C const uint8_t *
+ C _nettle_sha256_compress_n(uint32_t *state, const uint32_t *k,
+ C size_t blocks, const uint8_t *input)
.text
ALIGN(16)
-PROLOGUE(_nettle_sha256_compress)
+PROLOGUE(_nettle_sha256_compress_n)
W64_ENTRY(3, 0)
+ test BLOCKS, BLOCKS
+ jz .Lend
sub $120, %rsp
- mov %rbx, 64(%rsp)
- mov STATE, 72(%rsp) C Save state, to free a register
+ mov STATE, STATE_SAVED C Save state, to free a register
+ mov %rbx, 72(%rsp)
mov %rbp, 80(%rsp)
mov %r12, 88(%rsp)
mov %r13, 96(%rsp)
@@ -149,7 +155,9 @@ PROLOGUE(_nettle_sha256_compress)
movl 20(STATE), SF
movl 24(STATE), SG
movl 28(STATE), SH
- xor COUNT, COUNT
+
+.Loop_block:
+ xorl XREG(COUNT), XREG(COUNT)
ALIGN(16)
.Loop1:
@@ -161,8 +169,8 @@ PROLOGUE(_nettle_sha256_compress)
NOEXPN(5) ROUND(SD,SE,SF,SG,SH,SA,SB,SC,5)
NOEXPN(6) ROUND(SC,SD,SE,SF,SG,SH,SA,SB,6)
NOEXPN(7) ROUND(SB,SC,SD,SE,SF,SG,SH,SA,7)
- add $8, COUNT
- cmp $16, COUNT
+ addl $8, XREG(COUNT)
+ cmpl $16, XREG(COUNT)
jne .Loop1
.Loop2:
@@ -182,22 +190,35 @@ PROLOGUE(_nettle_sha256_compress)
EXPN(13) ROUND(SD,SE,SF,SG,SH,SA,SB,SC,13)
EXPN(14) ROUND(SC,SD,SE,SF,SG,SH,SA,SB,14)
EXPN(15) ROUND(SB,SC,SD,SE,SF,SG,SH,SA,15)
- add $16, COUNT
- cmp $64, COUNT
+ addl $16, XREG(COUNT)
+ cmpl $64, XREG(COUNT)
jne .Loop2
- mov 72(%rsp), STATE
-
- addl SA, (STATE)
- addl SB, 4(STATE)
- addl SC, 8(STATE)
- addl SD, 12(STATE)
- addl SE, 16(STATE)
- addl SF, 20(STATE)
- addl SG, 24(STATE)
- addl SH, 28(STATE)
-
- mov 64(%rsp), %rbx
+ mov STATE_SAVED, STATE
+
+ addl (STATE), SA
+ addl 4(STATE), SB
+ addl 8(STATE), SC
+ addl 12(STATE), SD
+ addl 16(STATE), SE
+ addl 20(STATE), SF
+ addl 24(STATE), SG
+ addl 28(STATE), SH
+
+ movl SA, (STATE)
+ movl SB, 4(STATE)
+ movl SC, 8(STATE)
+ movl SD, 12(STATE)
+ movl SE, 16(STATE)
+ movl SF, 20(STATE)
+ movl SG, 24(STATE)
+ movl SH, 28(STATE)
+
+ add $64, INPUT
+ dec BLOCKS
+ jnz .Loop_block
+
+ mov 72(%rsp), %rbx
mov 80(%rsp), %rbp
mov 88(%rsp), %r12
mov 96(%rsp), %r13
@@ -205,6 +226,8 @@ PROLOGUE(_nettle_sha256_compress)
mov 112(%rsp),%r15
add $120, %rsp
+.Lend:
+ mov INPUT, %rax
W64_EXIT(3, 0)
ret
-EPILOGUE(_nettle_sha256_compress)
+EPILOGUE(_nettle_sha256_compress_n)
diff --git a/x86_64/sha_ni/sha256-compress.asm b/x86_64/sha_ni/sha256-compress-n.asm
index 00bd3cd3..005909df 100644
--- a/x86_64/sha_ni/sha256-compress.asm
+++ b/x86_64/sha_ni/sha256-compress-n.asm
@@ -1,7 +1,7 @@
-C x86_64/sha_ni/sha256-compress.asm
+C x86_64/sha_ni/sha256-compress-n.asm
ifelse(`
- Copyright (C) 2018 Niels Möller
+ Copyright (C) 2018, 2022 Niels Möller
This file is part of GNU Nettle.
@@ -30,10 +30,11 @@ ifelse(`
not, see http://www.gnu.org/licenses/.
')
- .file "sha256-compress.asm"
+ .file "sha256-compress-n.asm"
define(`STATE', `%rdi')
-define(`INPUT', `%rsi')
-define(`K', `%rdx')
+define(`K', `%rsi')
+define(`BLOCKS', `%rdx')
+define(`INPUT', `%rcx')
define(`MSGK',`%xmm0') C Implicit operand of sha256rnds2
define(`MSG0',`%xmm1')
@@ -45,7 +46,7 @@ define(`CDGH',`%xmm6')
define(`ABEF_ORIG',`%xmm7')
define(`CDGH_ORIG', `%xmm8')
define(`SWAP_MASK',`%xmm9')
-define(`TMP', `%xmm9') C Overlaps SWAP_MASK
+define(`TMP', `%xmm10')
C QROUND(M0, M1, M2, M3, R)
define(`QROUND', `
@@ -69,15 +70,19 @@ define(`TRANSPOSE', `
punpcklqdq $1, $3
')
- C void
- C _nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k)
+ C const uint8_t *
+ C _nettle_sha256_compress_n(uint32_t *state, const uint32_t *k,
+ C size_t blocks, const uint8_t *input)
.text
ALIGN(16)
.Lswap_mask:
.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12
-PROLOGUE(_nettle_sha256_compress)
- W64_ENTRY(3, 10)
+PROLOGUE(_nettle_sha256_compress_n)
+ W64_ENTRY(4, 11)
+ test BLOCKS, BLOCKS
+ jz .Lend
+
movups (STATE), TMP
movups 16(STATE), ABEF
@@ -88,12 +93,13 @@ PROLOGUE(_nettle_sha256_compress)
movdqa .Lswap_mask(%rip), SWAP_MASK
- movdqa ABEF, ABEF_ORIG
- movdqa CDGH, CDGH_ORIG
-
+.Loop:
movups (INPUT), MSG0
pshufb SWAP_MASK, MSG0
+ movdqa ABEF, ABEF_ORIG
+ movdqa CDGH, CDGH_ORIG
+
movdqa (K), MSGK
paddd MSG0, MSGK
sha256rnds2 ABEF, CDGH C Round 0-1
@@ -163,6 +169,10 @@ PROLOGUE(_nettle_sha256_compress)
paddd ABEF_ORIG, ABEF
paddd CDGH_ORIG, CDGH
+ add $64, INPUT
+ dec BLOCKS
+ jnz .Loop
+
TRANSPOSE(ABEF, CDGH, TMP)
pshufd $0x1b, CDGH, CDGH
@@ -170,6 +180,8 @@ PROLOGUE(_nettle_sha256_compress)
movups CDGH, 0(STATE)
movups TMP, 16(STATE)
- W64_EXIT(3, 10)
+.Lend:
+ mov INPUT, %rax
+ W64_EXIT(4, 11)
ret
-EPILOGUE(_nettle_sha256_compress)
+EPILOGUE(_nettle_sha256_compress_n)