author    Jussi Kivilinna <jussi.kivilinna@iki.fi>    2021-01-20 21:55:01 +0200
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>    2021-01-26 19:41:58 +0200
commit    9f49e806f9506533236fd44b17f17b85961b20f1
tree      cd6b3ad4996c8a76200831fc3a661bdfe6da98fe /cipher/sha256-ssse3-amd64.S
parent    393bd6c3d1aa2b2a1b05be0e2d7fb2514e6c5ad0
sha512/sha256: remove assembler macros from AMD64 implementations
* configure.ac (gcry_cv_gcc_platform_as_ok_for_intel_syntax): Remove
assembler macro check from Intel syntax assembly support check.
* cipher/sha256-avx-amd64.S: Replace assembler macros with C
preprocessor counterparts.
* cipher/sha256-avx2-bmi2-amd64.S: Ditto.
* cipher/sha256-ssse3-amd64.S: Ditto.
* cipher/sha512-avx-amd64.S: Ditto.
* cipher/sha512-avx2-bmi2-amd64.S: Ditto.
* cipher/sha512-ssse3-amd64.S: Ditto.
--

Removing GNU assembler macros allows building these implementations
with clang.

GnuPG-bug-id: 5255
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/sha256-ssse3-amd64.S')
-rw-r--r--   cipher/sha256-ssse3-amd64.S   529
1 file changed, 252 insertions(+), 277 deletions(-)
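
The conversion follows a single mechanical pattern: each GNU assembler .macro/.endm
pair becomes a C preprocessor #define whose body is one logical line, with the
instructions separated by semicolons and joined by backslash continuations, and each
call site switches from space-separated operands to a parenthesized argument list.
A condensed before/after sketch of the addm helper, taken from the hunks below
(Intel syntax):

    /* before: GNU assembler macro (rejected by clang's integrated assembler) */
    .macro addm p1 p2
        add \p2, \p1        /* reg += [mem] */
        mov \p1, \p2        /* [mem] = reg  */
    .endm
        addm [4*0 + CTX], a

    /* after: C preprocessor macro, expanded before the assembler ever runs */
    #define addm(p1, p2) \
        add p2, p1; \
        mov p1, p2;
        addm([4*0 + CTX], a)

The one construct with no direct preprocessor equivalent is symbol rotation
(rotate_Xs, ROTATE_ARGS), which re-binds the symbols X0...X3 and a...h between
expansions; the patch makes that rotation explicit instead, passing the already
rotated register names as arguments at every FOUR_ROUNDS_AND_SCHED and DO_ROUND
call site.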
diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S
index 0fb94c1b..098b0eb6 100644
--- a/cipher/sha256-ssse3-amd64.S
+++ b/cipher/sha256-ssse3-amd64.S
@@ -70,58 +70,56 @@
/* addm [mem], reg
* Add reg to mem using reg-mem add and store */
-.macro addm p1 p2
- add \p2, \p1
- mov \p1, \p2
-.endm
+#define addm(p1, p2) \
+ add p2, p1; \
+ mov p1, p2;
/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
/* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
* Load xmm with mem and byte swap each dword */
-.macro COPY_XMM_AND_BSWAP p1 p2 p3
- MOVDQ \p1, \p2
- pshufb \p1, \p3
-.endm
+#define COPY_XMM_AND_BSWAP(p1, p2, p3) \
+ MOVDQ p1, p2; \
+ pshufb p1, p3;
/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
-X0 = xmm4
-X1 = xmm5
-X2 = xmm6
-X3 = xmm7
+#define X0 xmm4
+#define X1 xmm5
+#define X2 xmm6
+#define X3 xmm7
-XTMP0 = xmm0
-XTMP1 = xmm1
-XTMP2 = xmm2
-XTMP3 = xmm3
-XTMP4 = xmm8
-XFER = xmm9
+#define XTMP0 xmm0
+#define XTMP1 xmm1
+#define XTMP2 xmm2
+#define XTMP3 xmm3
+#define XTMP4 xmm8
+#define XFER xmm9
-SHUF_00BA = xmm10 /* shuffle xBxA -> 00BA */
-SHUF_DC00 = xmm11 /* shuffle xDxC -> DC00 */
-BYTE_FLIP_MASK = xmm12
+#define SHUF_00BA xmm10 /* shuffle xBxA -> 00BA */
+#define SHUF_DC00 xmm11 /* shuffle xDxC -> DC00 */
+#define BYTE_FLIP_MASK xmm12
-NUM_BLKS = rdx /* 3rd arg */
-CTX = rsi /* 2nd arg */
-INP = rdi /* 1st arg */
+#define NUM_BLKS rdx /* 3rd arg */
+#define CTX rsi /* 2nd arg */
+#define INP rdi /* 1st arg */
-SRND = rdi /* clobbers INP */
-c = ecx
-d = r8d
-e = edx
+#define SRND rdi /* clobbers INP */
+#define c ecx
+#define d r8d
+#define e edx
-TBL = rbp
-a = eax
-b = ebx
+#define TBL rbp
+#define a eax
+#define b ebx
-f = r9d
-g = r10d
-h = r11d
+#define f r9d
+#define g r10d
+#define h r11d
-y0 = r13d
-y1 = r14d
-y2 = r15d
+#define y0 r13d
+#define y1 r14d
+#define y2 r15d
@@ -138,230 +136,207 @@ y2 = r15d
#define _XMM_SAVE (_XFER + _XFER_SIZE + _ALIGN_SIZE)
#define STACK_SIZE (_XMM_SAVE + _XMM_SAVE_SIZE)
-/* rotate_Xs
- * Rotate values of symbols X0...X3 */
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-/* ROTATE_ARGS
- * Rotate values of symbols a...h */
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
- /* compute s0 four at a time and s1 two at a time
- * compute W[-16] + W[-7] 4 at a time */
- movdqa XTMP0, X3
- mov y0, e /* y0 = e */
- ror y0, (25-11) /* y0 = e >> (25-11) */
- mov y1, a /* y1 = a */
- palignr XTMP0, X2, 4 /* XTMP0 = W[-7] */
- ror y1, (22-13) /* y1 = a >> (22-13) */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- mov y2, f /* y2 = f */
- ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- movdqa XTMP1, X1
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- xor y2, g /* y2 = f^g */
- paddd XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- /* compute s0 */
- palignr XTMP1, X0, 4 /* XTMP1 = W[-15] */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- movdqa XTMP2, XTMP1 /* XTMP2 = W[-15] */
- ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, y0 /* y2 = S1 + CH */
- add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */
- movdqa XTMP3, XTMP1 /* XTMP3 = W[-15] */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- pslld XTMP1, (32-7)
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- psrld XTMP2, 7
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- por XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+
+#define FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ /* compute s0 four at a time and s1 two at a time */; \
+ /* compute W[-16] + W[-7] 4 at a time */; \
+ movdqa XTMP0, X3; \
+ mov y0, e /* y0 = e */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ palignr XTMP0, X2, 4 /* XTMP0 = W[-7] */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ movdqa XTMP1, X1; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ xor y2, g /* y2 = f^g */; \
+ paddd XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ /* compute s0 */; \
+ palignr XTMP1, X0, 4 /* XTMP1 = W[-15] */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ movdqa XTMP2, XTMP1 /* XTMP2 = W[-15] */; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */; \
+ movdqa XTMP3, XTMP1 /* XTMP3 = W[-15] */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ pslld XTMP1, (32-7); \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ psrld XTMP2, 7; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ por XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
-ROTATE_ARGS
- movdqa XTMP2, XTMP3 /* XTMP2 = W[-15] */
- mov y0, e /* y0 = e */
- mov y1, a /* y1 = a */
- movdqa XTMP4, XTMP3 /* XTMP4 = W[-15] */
- ror y0, (25-11) /* y0 = e >> (25-11) */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- mov y2, f /* y2 = f */
- ror y1, (22-13) /* y1 = a >> (22-13) */
- pslld XTMP3, (32-18)
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- xor y2, g /* y2 = f^g */
- psrld XTMP2, 18
- ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- pxor XTMP1, XTMP3
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- psrld XTMP4, 3 /* XTMP4 = W[-15] >> 3 */
- add y2, y0 /* y2 = S1 + CH */
- add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */
- ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- pxor XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- pxor XTMP1, XTMP4 /* XTMP1 = s0 */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- /* compute low s1 */
- pshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- paddd XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ movdqa XTMP2, XTMP3 /* XTMP2 = W[-15] */; \
+ mov y0, e /* y0 = e */; \
+ mov y1, a /* y1 = a */; \
+ movdqa XTMP4, XTMP3 /* XTMP4 = W[-15] */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ pslld XTMP3, (32-18); \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ xor y2, g /* y2 = f^g */; \
+ psrld XTMP2, 18; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ pxor XTMP1, XTMP3; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ psrld XTMP4, 3 /* XTMP4 = W[-15] >> 3 */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ pxor XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ pxor XTMP1, XTMP4 /* XTMP1 = s0 */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ /* compute low s1 */; \
+ pshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ paddd XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
-ROTATE_ARGS
- movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {BBAA} */
- mov y0, e /* y0 = e */
- mov y1, a /* y1 = a */
- ror y0, (25-11) /* y0 = e >> (25-11) */
- movdqa XTMP4, XTMP2 /* XTMP4 = W[-2] {BBAA} */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- ror y1, (22-13) /* y1 = a >> (22-13) */
- mov y2, f /* y2 = f */
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */
- xor y2, g /* y2 = f^g */
- psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- psrld XTMP4, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */
- ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- pxor XTMP2, XTMP3
- add y2, y0 /* y2 = S1 + CH */
- ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */
- pxor XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- pshufb XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- paddd XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- /* compute high s1 */
- pshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {BBAA} */; \
+ mov y0, e /* y0 = e */; \
+ mov y1, a /* y1 = a */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ movdqa XTMP4, XTMP2 /* XTMP4 = W[-2] {BBAA} */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ mov y2, f /* y2 = f */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */; \
+ xor y2, g /* y2 = f^g */; \
+ psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ psrld XTMP4, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ pxor XTMP2, XTMP3; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */; \
+ pxor XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ pshufb XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ paddd XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ /* compute high s1 */; \
+ pshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
-ROTATE_ARGS
- movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {DDCC} */
- mov y0, e /* y0 = e */
- ror y0, (25-11) /* y0 = e >> (25-11) */
- mov y1, a /* y1 = a */
- movdqa X0, XTMP2 /* X0 = W[-2] {DDCC} */
- ror y1, (22-13) /* y1 = a >> (22-13) */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- mov y2, f /* y2 = f */
- ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- xor y2, g /* y2 = f^g */
- psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- psrld X0, 10 /* X0 = W[-2] >> 10 {DDCC} */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- pxor XTMP2, XTMP3
- ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, y0 /* y2 = S1 + CH */
- add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */
- pxor X0, XTMP2 /* X0 = s1 {xDxC} */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- pshufb X0, SHUF_DC00 /* X0 = s1 {DC00} */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- paddd X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {DDCC} */; \
+ mov y0, e /* y0 = e */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ movdqa X0, XTMP2 /* X0 = W[-2] {DDCC} */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ xor y2, g /* y2 = f^g */; \
+ psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ psrld X0, 10 /* X0 = W[-2] >> 10 {DDCC} */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ pxor XTMP2, XTMP3; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */; \
+ pxor X0, XTMP2 /* X0 = s1 {xDxC} */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ pshufb X0, SHUF_DC00 /* X0 = s1 {DC00} */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ paddd X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
-ROTATE_ARGS
-rotate_Xs
-.endm
+#define FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h); \
+ FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, h, a, b, c, d, e, f, g); \
+ FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, g, h, a, b, c, d, e, f); \
+ FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, f, g, h, a, b, c, d, e);
/* input is [rsp + _XFER + %1 * 4] */
-.macro DO_ROUND i1
- mov y0, e /* y0 = e */
- ror y0, (25-11) /* y0 = e >> (25-11) */
- mov y1, a /* y1 = a */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- ror y1, (22-13) /* y1 = a >> (22-13) */
- mov y2, f /* y2 = f */
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- xor y2, g /* y2 = f^g */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- and y2, e /* y2 = (f^g)&e */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- add y2, y0 /* y2 = S1 + CH */
- ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, [rsp + _XFER + \i1 * 4] /* y2 = k + w + S1 + CH */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define DO_ROUND(i1, a, b, c, d, e, f, g, h) \
+ mov y0, e /* y0 = e */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ mov y2, f /* y2 = f */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ xor y2, g /* y2 = f^g */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, [rsp + _XFER + i1 * 4] /* y2 = k + w + S1 + CH */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
- ROTATE_ARGS
-.endm
/*
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -414,10 +389,10 @@ _gcry_sha256_transform_amd64_ssse3:
lea TBL, [.LK256 ADD_RIP]
/* byte swap first 16 dwords */
- COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP(X0, [INP + 0*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X1, [INP + 1*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X2, [INP + 2*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X3, [INP + 3*16], BYTE_FLIP_MASK)
mov [rsp + _INP], INP
@@ -428,23 +403,23 @@ _gcry_sha256_transform_amd64_ssse3:
movdqa XFER, [TBL + 0*16]
paddd XFER, X0
movdqa [rsp + _XFER], XFER
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h)
movdqa XFER, [TBL + 1*16]
- paddd XFER, X0
+ paddd XFER, X1
movdqa [rsp + _XFER], XFER
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X1, X2, X3, X0, e, f, g, h, a, b, c, d)
movdqa XFER, [TBL + 2*16]
- paddd XFER, X0
+ paddd XFER, X2
movdqa [rsp + _XFER], XFER
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X2, X3, X0, X1, a, b, c, d, e, f, g, h)
movdqa XFER, [TBL + 3*16]
- paddd XFER, X0
+ paddd XFER, X3
movdqa [rsp + _XFER], XFER
add TBL, 4*16
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X3, X0, X1, X2, e, f, g, h, a, b, c, d)
sub SRND, 1
jne .Loop1
@@ -453,17 +428,17 @@ _gcry_sha256_transform_amd64_ssse3:
.Loop2:
paddd X0, [TBL + 0*16]
movdqa [rsp + _XFER], X0
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
+ DO_ROUND(0, a, b, c, d, e, f, g, h)
+ DO_ROUND(1, h, a, b, c, d, e, f, g)
+ DO_ROUND(2, g, h, a, b, c, d, e, f)
+ DO_ROUND(3, f, g, h, a, b, c, d, e)
paddd X1, [TBL + 1*16]
movdqa [rsp + _XFER], X1
add TBL, 2*16
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
+ DO_ROUND(0, e, f, g, h, a, b, c, d)
+ DO_ROUND(1, d, e, f, g, h, a, b, c)
+ DO_ROUND(2, c, d, e, f, g, h, a, b)
+ DO_ROUND(3, b, c, d, e, f, g, h, a)
movdqa X0, X2
movdqa X1, X3
@@ -471,14 +446,14 @@ _gcry_sha256_transform_amd64_ssse3:
sub SRND, 1
jne .Loop2
- addm [4*0 + CTX],a
- addm [4*1 + CTX],b
- addm [4*2 + CTX],c
- addm [4*3 + CTX],d
- addm [4*4 + CTX],e
- addm [4*5 + CTX],f
- addm [4*6 + CTX],g
- addm [4*7 + CTX],h
+ addm([4*0 + CTX],a)
+ addm([4*1 + CTX],b)
+ addm([4*2 + CTX],c)
+ addm([4*3 + CTX],d)
+ addm([4*4 + CTX],e)
+ addm([4*5 + CTX],f)
+ addm([4*6 + CTX],g)
+ addm([4*7 + CTX],h)
mov INP, [rsp + _INP]
add INP, 64