author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2021-01-20 21:55:01 +0200
---|---|---
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2021-01-26 19:41:58 +0200
commit | 9f49e806f9506533236fd44b17f17b85961b20f1 | (patch)
tree | cd6b3ad4996c8a76200831fc3a661bdfe6da98fe | /cipher/sha256-ssse3-amd64.S
parent | 393bd6c3d1aa2b2a1b05be0e2d7fb2514e6c5ad0 | (diff)
download | libgcrypt-9f49e806f9506533236fd44b17f17b85961b20f1.tar.gz |
sha512/sha256: remove assembler macros from AMD64 implementations
* configure.ac (gcry_cv_gcc_platform_as_ok_for_intel_syntax): Remove
assembler macro check from Intel syntax assembly support check.
* cipher/sha256-avx-amd64.S: Replace assembler macros with C
preprocessor counterparts.
* cipher/sha256-avx2-bmi2-amd64.S: Ditto.
* cipher/sha256-ssse3-amd64.S: Ditto.
* cipher/sha512-avx-amd64.S: Ditto.
* cipher/sha512-avx2-bmi2-amd64.S: Ditto.
* cipher/sha512-ssse3-amd64.S: Ditto.
--
Removing GNU assembler macros allows building these implementations with
clang.
GnuPG-bug-id: 5255
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
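
The conversion is mechanical: each GNU assembler `.macro` becomes a function-like C preprocessor macro whose body is folded onto one logical line, with `;` terminating each instruction and `\` continuing the line. Since the `.S` files are run through the C preprocessor before they reach the assembler, the expanded source no longer depends on assembler macro support at all. A minimal sketch of the pattern, taken from the `addm` helper in this file:

```asm
/* Before: GNU as macro syntax -- the construct that kept clang from
 * building these implementations (GnuPG-bug-id 5255). */
.macro addm p1 p2
    add \p2, \p1
    mov \p1, \p2
.endm

/* After: a cpp macro, expanded before the assembler ever runs.
 * Instructions are separated by ';' on one logical line. */
#define addm(p1, p2) \
    add p2, p1; \
    mov p1, p2;
```

Call sites change from `addm [4*0 + CTX],a` to `addm([4*0 + CTX],a)`, as the diff below shows.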
Diffstat (limited to 'cipher/sha256-ssse3-amd64.S')
-rw-r--r-- | cipher/sha256-ssse3-amd64.S | 529
1 file changed, 252 insertions, 277 deletions
```diff
diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S
index 0fb94c1b..098b0eb6 100644
--- a/cipher/sha256-ssse3-amd64.S
+++ b/cipher/sha256-ssse3-amd64.S
@@ -70,58 +70,56 @@
 /* addm [mem], reg
  * Add reg to mem using reg-mem add and store */
-.macro addm p1 p2
- add \p2, \p1
- mov \p1, \p2
-.endm
+#define addm(p1, p2) \
+ add p2, p1; \
+ mov p1, p2;

 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/

 /* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
  * Load xmm with mem and byte swap each dword */
-.macro COPY_XMM_AND_BSWAP p1 p2 p3
- MOVDQ \p1, \p2
- pshufb \p1, \p3
-.endm
+#define COPY_XMM_AND_BSWAP(p1, p2, p3) \
+ MOVDQ p1, p2; \
+ pshufb p1, p3;

 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/

-X0 = xmm4
-X1 = xmm5
-X2 = xmm6
-X3 = xmm7
+#define X0 xmm4
+#define X1 xmm5
+#define X2 xmm6
+#define X3 xmm7

-XTMP0 = xmm0
-XTMP1 = xmm1
-XTMP2 = xmm2
-XTMP3 = xmm3
-XTMP4 = xmm8
-XFER = xmm9
+#define XTMP0 xmm0
+#define XTMP1 xmm1
+#define XTMP2 xmm2
+#define XTMP3 xmm3
+#define XTMP4 xmm8
+#define XFER xmm9

-SHUF_00BA = xmm10 /* shuffle xBxA -> 00BA */
-SHUF_DC00 = xmm11 /* shuffle xDxC -> DC00 */
-BYTE_FLIP_MASK = xmm12
+#define SHUF_00BA xmm10 /* shuffle xBxA -> 00BA */
+#define SHUF_DC00 xmm11 /* shuffle xDxC -> DC00 */
+#define BYTE_FLIP_MASK xmm12

-NUM_BLKS = rdx /* 3rd arg */
-CTX = rsi /* 2nd arg */
-INP = rdi /* 1st arg */
+#define NUM_BLKS rdx /* 3rd arg */
+#define CTX rsi /* 2nd arg */
+#define INP rdi /* 1st arg */

-SRND = rdi /* clobbers INP */
-c = ecx
-d = r8d
-e = edx
+#define SRND rdi /* clobbers INP */
+#define c ecx
+#define d r8d
+#define e edx

-TBL = rbp
-a = eax
-b = ebx
+#define TBL rbp
+#define a eax
+#define b ebx

-f = r9d
-g = r10d
-h = r11d
+#define f r9d
+#define g r10d
+#define h r11d

-y0 = r13d
-y1 = r14d
-y2 = r15d
+#define y0 r13d
+#define y1 r14d
+#define y2 r15d
@@ -138,230 +136,207 @@ y2 = r15d
 #define _XMM_SAVE (_XFER + _XFER_SIZE + _ALIGN_SIZE)
 #define STACK_SIZE (_XMM_SAVE + _XMM_SAVE_SIZE)

-/* rotate_Xs
- * Rotate values of symbols X0...X3 */
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-/* ROTATE_ARGS
- * Rotate values of symbols a...h */
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
- /* compute s0 four at a time and s1 two at a time
-  * compute W[-16] + W[-7] 4 at a time */
- movdqa XTMP0, X3
- mov y0, e /* y0 = e */
- ror y0, (25-11) /* y0 = e >> (25-11) */
- mov y1, a /* y1 = a */
- palignr XTMP0, X2, 4 /* XTMP0 = W[-7] */
- ror y1, (22-13) /* y1 = a >> (22-13) */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- mov y2, f /* y2 = f */
- ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- movdqa XTMP1, X1
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- xor y2, g /* y2 = f^g */
- paddd XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- /* compute s0 */
- palignr XTMP1, X0, 4 /* XTMP1 = W[-15] */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- movdqa XTMP2, XTMP1 /* XTMP2 = W[-15] */
- ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, y0 /* y2 = S1 + CH */
- add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */
- movdqa XTMP3, XTMP1 /* XTMP3 = W[-15] */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- pslld XTMP1, (32-7)
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- psrld XTMP2, 7
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- por XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+
+#define FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ /* compute s0 four at a time and s1 two at a time */; \
+ /* compute W[-16] + W[-7] 4 at a time */; \
+ movdqa XTMP0, X3; \
+ mov y0, e /* y0 = e */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ palignr XTMP0, X2, 4 /* XTMP0 = W[-7] */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ movdqa XTMP1, X1; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ xor y2, g /* y2 = f^g */; \
+ paddd XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ /* compute s0 */; \
+ palignr XTMP1, X0, 4 /* XTMP1 = W[-15] */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ movdqa XTMP2, XTMP1 /* XTMP2 = W[-15] */; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */; \
+ movdqa XTMP3, XTMP1 /* XTMP3 = W[-15] */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ pslld XTMP1, (32-7); \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ psrld XTMP2, 7; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ por XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
 lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
-ROTATE_ARGS
- movdqa XTMP2, XTMP3 /* XTMP2 = W[-15] */
- mov y0, e /* y0 = e */
- mov y1, a /* y1 = a */
- movdqa XTMP4, XTMP3 /* XTMP4 = W[-15] */
- ror y0, (25-11) /* y0 = e >> (25-11) */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- mov y2, f /* y2 = f */
- ror y1, (22-13) /* y1 = a >> (22-13) */
- pslld XTMP3, (32-18)
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- xor y2, g /* y2 = f^g */
- psrld XTMP2, 18
- ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- pxor XTMP1, XTMP3
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- psrld XTMP4, 3 /* XTMP4 = W[-15] >> 3 */
- add y2, y0 /* y2 = S1 + CH */
- add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */
- ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- pxor XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- pxor XTMP1, XTMP4 /* XTMP1 = s0 */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- /* compute low s1 */
- pshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- paddd XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ movdqa XTMP2, XTMP3 /* XTMP2 = W[-15] */; \
+ mov y0, e /* y0 = e */; \
+ mov y1, a /* y1 = a */; \
+ movdqa XTMP4, XTMP3 /* XTMP4 = W[-15] */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ pslld XTMP3, (32-18); \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ xor y2, g /* y2 = f^g */; \
+ psrld XTMP2, 18; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ pxor XTMP1, XTMP3; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ psrld XTMP4, 3 /* XTMP4 = W[-15] >> 3 */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ pxor XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ pxor XTMP1, XTMP4 /* XTMP1 = s0 */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ /* compute low s1 */; \
+ pshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ paddd XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
 lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
-ROTATE_ARGS
- movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {BBAA} */
- mov y0, e /* y0 = e */
- mov y1, a /* y1 = a */
- ror y0, (25-11) /* y0 = e >> (25-11) */
- movdqa XTMP4, XTMP2 /* XTMP4 = W[-2] {BBAA} */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- ror y1, (22-13) /* y1 = a >> (22-13) */
- mov y2, f /* y2 = f */
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */
- xor y2, g /* y2 = f^g */
- psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- psrld XTMP4, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */
- ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- pxor XTMP2, XTMP3
- add y2, y0 /* y2 = S1 + CH */
- ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */
- pxor XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- pshufb XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- paddd XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- /* compute high s1 */
- pshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {BBAA} */; \
+ mov y0, e /* y0 = e */; \
+ mov y1, a /* y1 = a */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ movdqa XTMP4, XTMP2 /* XTMP4 = W[-2] {BBAA} */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ mov y2, f /* y2 = f */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */; \
+ xor y2, g /* y2 = f^g */; \
+ psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ psrld XTMP4, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ pxor XTMP2, XTMP3; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */; \
+ pxor XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ pshufb XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ paddd XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ /* compute high s1 */; \
+ pshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
 lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
-ROTATE_ARGS
- movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {DDCC} */
- mov y0, e /* y0 = e */
- ror y0, (25-11) /* y0 = e >> (25-11) */
- mov y1, a /* y1 = a */
- movdqa X0, XTMP2 /* X0 = W[-2] {DDCC} */
- ror y1, (22-13) /* y1 = a >> (22-13) */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- mov y2, f /* y2 = f */
- ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- xor y2, g /* y2 = f^g */
- psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- psrld X0, 10 /* X0 = W[-2] >> 10 {DDCC} */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- pxor XTMP2, XTMP3
- ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, y0 /* y2 = S1 + CH */
- add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */
- pxor X0, XTMP2 /* X0 = s1 {xDxC} */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- pshufb X0, SHUF_DC00 /* X0 = s1 {DC00} */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- paddd X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {DDCC} */; \
+ mov y0, e /* y0 = e */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ movdqa X0, XTMP2 /* X0 = W[-2] {DDCC} */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ xor y2, g /* y2 = f^g */; \
+ psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ psrld X0, 10 /* X0 = W[-2] >> 10 {DDCC} */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ pxor XTMP2, XTMP3; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */; \
+ pxor X0, XTMP2 /* X0 = s1 {xDxC} */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ pshufb X0, SHUF_DC00 /* X0 = s1 {DC00} */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ paddd X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
 lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
-ROTATE_ARGS
-rotate_Xs
-.endm
+#define FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h); \
+ FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, h, a, b, c, d, e, f, g); \
+ FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, g, h, a, b, c, d, e, f); \
+ FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, f, g, h, a, b, c, d, e);

 /* input is [rsp + _XFER + %1 * 4] */
-.macro DO_ROUND i1
- mov y0, e /* y0 = e */
- ror y0, (25-11) /* y0 = e >> (25-11) */
- mov y1, a /* y1 = a */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- ror y1, (22-13) /* y1 = a >> (22-13) */
- mov y2, f /* y2 = f */
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- xor y2, g /* y2 = f^g */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- and y2, e /* y2 = (f^g)&e */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- add y2, y0 /* y2 = S1 + CH */
- ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, [rsp + _XFER + \i1 * 4] /* y2 = k + w + S1 + CH */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define DO_ROUND(i1, a, b, c, d, e, f, g, h) \
+ mov y0, e /* y0 = e */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ mov y2, f /* y2 = f */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ xor y2, g /* y2 = f^g */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, [rsp + _XFER + i1 * 4] /* y2 = k + w + S1 + CH */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
 lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
- ROTATE_ARGS
-.endm

 /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -414,10 +389,10 @@ _gcry_sha256_transform_amd64_ssse3:
 lea TBL, [.LK256 ADD_RIP]

 /* byte swap first 16 dwords */
- COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP(X0, [INP + 0*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X1, [INP + 1*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X2, [INP + 2*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X3, [INP + 3*16], BYTE_FLIP_MASK)

 mov [rsp + _INP], INP
@@ -428,23 +403,23 @@ _gcry_sha256_transform_amd64_ssse3:
 movdqa XFER, [TBL + 0*16]
 paddd XFER, X0
 movdqa [rsp + _XFER], XFER
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h)

 movdqa XFER, [TBL + 1*16]
- paddd XFER, X0
+ paddd XFER, X1
 movdqa [rsp + _XFER], XFER
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X1, X2, X3, X0, e, f, g, h, a, b, c, d)

 movdqa XFER, [TBL + 2*16]
- paddd XFER, X0
+ paddd XFER, X2
 movdqa [rsp + _XFER], XFER
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X2, X3, X0, X1, a, b, c, d, e, f, g, h)

 movdqa XFER, [TBL + 3*16]
- paddd XFER, X0
+ paddd XFER, X3
 movdqa [rsp + _XFER], XFER
 add TBL, 4*16
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X3, X0, X1, X2, e, f, g, h, a, b, c, d)

 sub SRND, 1
 jne .Loop1
@@ -453,17 +428,17 @@ _gcry_sha256_transform_amd64_ssse3:
 .Loop2:
 paddd X0, [TBL + 0*16]
 movdqa [rsp + _XFER], X0
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
+ DO_ROUND(0, a, b, c, d, e, f, g, h)
+ DO_ROUND(1, h, a, b, c, d, e, f, g)
+ DO_ROUND(2, g, h, a, b, c, d, e, f)
+ DO_ROUND(3, f, g, h, a, b, c, d, e)
 paddd X1, [TBL + 1*16]
 movdqa [rsp + _XFER], X1
 add TBL, 2*16
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
+ DO_ROUND(0, e, f, g, h, a, b, c, d)
+ DO_ROUND(1, d, e, f, g, h, a, b, c)
+ DO_ROUND(2, c, d, e, f, g, h, a, b)
+ DO_ROUND(3, b, c, d, e, f, g, h, a)

 movdqa X0, X2
 movdqa X1, X3
@@ -471,14 +446,14 @@ _gcry_sha256_transform_amd64_ssse3:
 sub SRND, 1
 jne .Loop2

- addm [4*0 + CTX],a
- addm [4*1 + CTX],b
- addm [4*2 + CTX],c
- addm [4*3 + CTX],d
- addm [4*4 + CTX],e
- addm [4*5 + CTX],f
- addm [4*6 + CTX],g
- addm [4*7 + CTX],h
+ addm([4*0 + CTX],a)
+ addm([4*1 + CTX],b)
+ addm([4*2 + CTX],c)
+ addm([4*3 + CTX],d)
+ addm([4*4 + CTX],e)
+ addm([4*5 + CTX],f)
+ addm([4*6 + CTX],g)
+ addm([4*7 + CTX],h)

 mov INP, [rsp + _INP]
 add INP, 64
```
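
The one non-mechanical part of the conversion is the old `ROTATE_ARGS`/`rotate_Xs` trick, which reassigned the symbols `a`...`h` (and `X0`...`X3`) after every round so that an identical macro body could be reused. A C preprocessor macro cannot reassign symbols, so the rotation moves to the call site: the round macros now take the registers as parameters, and each successive call permutes the argument list by one position. The equivalence, sketched on the `DO_ROUND` sequence from `.Loop2` above:

```asm
/* Old scheme (GNU as): one body, symbols rotated between rounds.
 *   DO_ROUND 0
 *   ROTATE_ARGS        ; TMP_ = h; h = g; g = f; ...; a = TMP_
 *   DO_ROUND 1
 *
 * New scheme (cpp): the rotation is spelled out in the arguments.
 * In round 1 the parameter 'h' is bound to register g, so g receives
 * the round's output -- exactly what ROTATE_ARGS used to arrange. */
DO_ROUND(0, a, b, c, d, e, f, g, h)
DO_ROUND(1, h, a, b, c, d, e, f, g)
DO_ROUND(2, g, h, a, b, c, d, e, f)
DO_ROUND(3, f, g, h, a, b, c, d, e)
```

Each call shifts the mapping by one position, and after eight rounds it is the identity again, which is why the `.Loop1` and `.Loop2` bodies unroll a multiple of eight rounds before branching back. The `FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, ...)` call sites rotate the message-schedule registers the same way, replacing `rotate_Xs`.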