author     Jussi Kivilinna <jussi.kivilinna@iki.fi>  2021-01-20 21:55:01 +0200
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>  2021-01-26 19:41:58 +0200
commit     9f49e806f9506533236fd44b17f17b85961b20f1 (patch)
tree       cd6b3ad4996c8a76200831fc3a661bdfe6da98fe
parent     393bd6c3d1aa2b2a1b05be0e2d7fb2514e6c5ad0 (diff)
download   libgcrypt-9f49e806f9506533236fd44b17f17b85961b20f1.tar.gz
sha512/sha256: remove assembler macros from AMD64 implementations
* configure.ac (gcry_cv_gcc_platform_as_ok_for_intel_syntax): Remove
assembler macro check from Intel syntax assembly support check.
* cipher/sha256-avx-amd64.S: Replace assembler macros with C
preprocessor counterparts.
* cipher/sha256-avx2-bmi2-amd64.S: Ditto.
* cipher/sha256-ssse3-amd64.S: Ditto.
* cipher/sha512-avx-amd64.S: Ditto.
* cipher/sha512-avx2-bmi2-amd64.S: Ditto.
* cipher/sha512-ssse3-amd64.S: Ditto.
--
Removing GNU assembler macros allows building these implementations with
clang.
GnuPG-bug-id: 5255
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
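
The change is the same mechanical rewrite in every hunk below: each GNU as ".macro"/".endm" definition becomes a function-like C preprocessor macro, "\param" references become plain parameter names, and the instructions are joined into a single backslash-continued statement list separated by semicolons. A minimal before/after sketch of the pattern, taken verbatim from the ROR helper in the first hunk of cipher/sha256-avx-amd64.S:

    /* Before: GNU assembler macro, which blocked building with clang */
    .macro ROR p1 p2
        /* shld is faster than ror on Intel Sandybridge */
        shld \p1, \p1, (32 - \p2)
    .endm

    /* After: function-like cpp macro, expanded before the assembler runs */
    #define ROR(p1, p2) \
        /* shld is faster than ror on Intel Sandybridge */ \
        shld p1, p1, (32 - p2);

Call sites change accordingly, e.g. "ROR y0, (25-11)" becomes "ROR( y0, (25-11))". Because a cpp macro expands to one logical line, comments inside the bodies must stay in "/* */" form and every instruction needs an explicit ";" separator.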
-rw-r--r--  cipher/sha256-avx-amd64.S       | 516
-rw-r--r--  cipher/sha256-avx2-bmi2-amd64.S | 421
-rw-r--r--  cipher/sha256-ssse3-amd64.S     | 529
-rw-r--r--  cipher/sha512-avx-amd64.S       | 456
-rw-r--r--  cipher/sha512-avx2-bmi2-amd64.S | 498
-rw-r--r--  cipher/sha512-ssse3-amd64.S     | 455
-rw-r--r--  configure.ac                    |  20
7 files changed, 1387 insertions(+), 1508 deletions(-)
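
The one non-mechanical part of the conversion is the round rotation. The old code rotated the working variables a..h (and the schedule registers X0..X3) between rounds by reassigning assembler symbols in .macro ROTATE_ARGS / rotate_Xs, which has no preprocessor equivalent. The converted round macros instead take the state as explicit parameters, and each call site passes the list rotated by one position, as in these final rounds of the SHA-256 AVX implementation from the diff below:

    DO_ROUND(0, a, b, c, d, e, f, g, h)
    DO_ROUND(1, h, a, b, c, d, e, f, g)
    DO_ROUND(2, g, h, a, b, c, d, e, f)
    DO_ROUND(3, f, g, h, a, b, c, d, e)

The emitted instructions are unchanged; only the binding of the symbolic names a..h to registers moves from assembly time to preprocessing time.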
diff --git a/cipher/sha256-avx-amd64.S b/cipher/sha256-avx-amd64.S
index 77143ff0..ec945f84 100644
--- a/cipher/sha256-avx-amd64.S
+++ b/cipher/sha256-avx-amd64.S
@@ -65,67 +65,64 @@
 #define VMOVDQ vmovdqu /* assume buffers not aligned */

-.macro ROR p1 p2
- /* shld is faster than ror on Intel Sandybridge */
- shld \p1, \p1, (32 - \p2)
-.endm
+#define ROR(p1, p2) \
+ /* shld is faster than ror on Intel Sandybridge */ \
+ shld p1, p1, (32 - p2);

 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros*/

 /* addm [mem], reg
  * Add reg to mem using reg-mem add and store */
-.macro addm p1 p2
- add \p2, \p1
- mov \p1, \p2
-.endm
+#define addm(p1, p2) \
+ add p2, p1; \
+ mov p1, p2;

 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/

 /* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
  * Load xmm with mem and byte swap each dword */
-.macro COPY_XMM_AND_BSWAP p1 p2 p3
- VMOVDQ \p1, \p2
- vpshufb \p1, \p1, \p3
-.endm
+#define COPY_XMM_AND_BSWAP(p1, p2, p3) \
+ VMOVDQ p1, p2; \
+ vpshufb p1, p1, p3;

 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/

-X0 = xmm4
-X1 = xmm5
-X2 = xmm6
-X3 = xmm7
+#define X0 xmm4
+#define X1 xmm5
+#define X2 xmm6
+#define X3 xmm7

-XTMP0 = xmm0
-XTMP1 = xmm1
-XTMP2 = xmm2
-XTMP3 = xmm3
-XTMP4 = xmm8
-XFER = xmm9
+#define XTMP0 xmm0
+#define XTMP1 xmm1
+#define XTMP2 xmm2
+#define XTMP3 xmm3
+#define XTMP4 xmm8
+#define XFER xmm9

-SHUF_00BA = xmm10 /* shuffle xBxA -> 00BA */
-SHUF_DC00 = xmm11 /* shuffle xDxC -> DC00 */
-BYTE_FLIP_MASK = xmm12
+#define SHUF_00BA xmm10 /* shuffle xBxA -> 00BA */
+#define SHUF_DC00 xmm11 /* shuffle xDxC -> DC00 */
+#define BYTE_FLIP_MASK xmm12

-NUM_BLKS = rdx /* 3rd arg */
-CTX = rsi /* 2nd arg */
-INP = rdi /* 1st arg */
+#define NUM_BLKS rdx /* 3rd arg */
+#define CTX rsi /* 2nd arg */
+#define INP rdi /* 1st arg */

-SRND = rdi /* clobbers INP */
-c = ecx
-d = r8d
-e = edx
+#define SRND rdi /* clobbers INP */
+#define c ecx
+#define d r8d
+#define e edx

-TBL = rbp
-a = eax
-b = ebx
+#define TBL rbp
+#define a eax
+#define b ebx

-f = r9d
-g = r10d
-h = r11d
+#define f r9d
+#define g r10d
+#define h r11d

-y0 = r13d
-y1 = r14d
-y2 = r15d
+#define y0 r13d
+#define y1 r14d
+#define y2 r15d

@@ -142,220 +139,197 @@ y2 = r15d
 #define _XMM_SAVE (_XFER + _XFER_SIZE + _ALIGN_SIZE)
 #define STACK_SIZE (_XMM_SAVE + _XMM_SAVE_SIZE)

-/* rotate_Xs
- * Rotate values of symbols X0...X3 */
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-/* ROTATE_ARGS
- * Rotate values of symbols a...h */
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
- /* compute s0 four at a time and s1 two at a time
-  * compute W[-16] + W[-7] 4 at a time */
- mov y0, e /* y0 = e */
- ROR y0, (25-11) /* y0 = e >> (25-11) */
- mov y1, a /* y1 = a */
- vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */
- ROR y1, (22-13) /* y1 = a >> (22-13) */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- mov y2, f /* y2 = f */
- ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- xor y2, g /* y2 = f^g */
- vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- /* compute s0 */
- vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, y0 /* y2 = S1 + CH */
- add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- vpslld XTMP2, XTMP1, (32-7)
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- vpsrld XTMP3, XTMP1, 7
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- vpor XTMP3, XTMP3, XTMP2 /* XTMP1 = W[-15] ror 7 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+
+#define FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ /* compute s0 four at a time and s1 two at a time */; \
+ /* compute W[-16] + W[-7] 4 at a time */; \
+ mov y0, e /* y0 = e */; \
+ ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */; \
+ ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ xor y2, g /* y2 = f^g */; \
+ vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ /* compute s0 */; \
+ vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ vpslld XTMP2, XTMP1, (32-7); \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ vpsrld XTMP3, XTMP1, 7; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ vpor XTMP3, XTMP3, XTMP2 /* XTMP1 = W[-15] ror 7 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
 lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */

-ROTATE_ARGS
- mov y0, e /* y0 = e */
- mov y1, a /* y1 = a */
- ROR y0, (25-11) /* y0 = e >> (25-11) */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- mov y2, f /* y2 = f */
- ROR y1, (22-13) /* y1 = a >> (22-13) */
- vpslld XTMP2, XTMP1, (32-18)
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- xor y2, g /* y2 = f^g */
- vpsrld XTMP4, XTMP1, 18
- ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- vpxor XTMP4, XTMP4, XTMP3
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- vpsrld XTMP1, XTMP1, 3 /* XTMP4 = W[-15] >> 3 */
- add y2, y0 /* y2 = S1 + CH */
- add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */
- ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- vpxor XTMP1, XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- vpxor XTMP1, XTMP1, XTMP4 /* XTMP1 = s0 */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- /* compute low s1 */
- vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ mov y0, e /* y0 = e */; \
+ mov y1, a /* y1 = a */; \
+ ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \
+ vpslld XTMP2, XTMP1, (32-18); \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ xor y2, g /* y2 = f^g */; \
+ vpsrld XTMP4, XTMP1, 18; \
+ ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ vpxor XTMP4, XTMP4, XTMP3; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ vpsrld XTMP1, XTMP1, 3 /* XTMP4 = W[-15] >> 3 */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */; \
+ ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ vpxor XTMP1, XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ vpxor XTMP1, XTMP1, XTMP4 /* XTMP1 = s0 */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ /* compute low s1 */; \
+ vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
 lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */

-ROTATE_ARGS
- mov y0, e /* y0 = e */
- mov y1, a /* y1 = a */
- ROR y0, (25-11) /* y0 = e >> (25-11) */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- ROR y1, (22-13) /* y1 = a >> (22-13) */
- mov y2, f /* y2 = f */
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- vpsrlq XTMP3, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */
- xor y2, g /* y2 = f^g */
- vpsrlq XTMP4, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- vpsrld XTMP2, XTMP2, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */
- ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- vpxor XTMP2, XTMP2, XTMP3
- add y2, y0 /* y2 = S1 + CH */
- ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */
- vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- /* compute high s1 */
- vpshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ mov y0, e /* y0 = e */; \
+ mov y1, a /* y1 = a */; \
+ ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \
+ mov y2, f /* y2 = f */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ vpsrlq XTMP3, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */; \
+ xor y2, g /* y2 = f^g */; \
+ vpsrlq XTMP4, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ vpsrld XTMP2, XTMP2, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */; \
+ ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ vpxor XTMP2, XTMP2, XTMP3; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */; \
+ vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ /* compute high s1 */; \
+ vpshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
 lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */

-ROTATE_ARGS
- mov y0, e /* y0 = e */
- ROR y0, (25-11) /* y0 = e >> (25-11) */
- mov y1, a /* y1 = a */
- ROR y1, (22-13) /* y1 = a >> (22-13) */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- mov y2, f /* y2 = f */
- ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- vpsrlq XTMP3, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- xor y2, g /* y2 = f^g */
- vpsrlq X0, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- vpsrld XTMP2, XTMP2, 10 /* X0 = W[-2] >> 10 {DDCC} */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- vpxor XTMP2, XTMP2, XTMP3
- ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, y0 /* y2 = S1 + CH */
- add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */
- vpxor X0, X0, XTMP2 /* X0 = s1 {xDxC} */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- vpshufb X0, X0, SHUF_DC00 /* X0 = s1 {DC00} */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- vpaddd X0, X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ mov y0, e /* y0 = e */; \
+ ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ vpsrlq XTMP3, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ xor y2, g /* y2 = f^g */; \
+ vpsrlq X0, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ vpsrld XTMP2, XTMP2, 10 /* X0 = W[-2] >> 10 {DDCC} */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ vpxor XTMP2, XTMP2, XTMP3; \
+ ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */; \
+ vpxor X0, X0, XTMP2 /* X0 = s1 {xDxC} */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ vpshufb X0, X0, SHUF_DC00 /* X0 = s1 {DC00} */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ vpaddd X0, X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
 lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */

-ROTATE_ARGS
-rotate_Xs
-.endm
+#define FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h); \
+ FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, h, a, b, c, d, e, f, g); \
+ FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, g, h, a, b, c, d, e, f); \
+ FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, f, g, h, a, b, c, d, e);

 /* input is [rsp + _XFER + %1 * 4] */
-.macro DO_ROUND i1
- mov y0, e /* y0 = e */
- ROR y0, (25-11) /* y0 = e >> (25-11) */
- mov y1, a /* y1 = a */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- ROR y1, (22-13) /* y1 = a >> (22-13) */
- mov y2, f /* y2 = f */
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- xor y2, g /* y2 = f^g */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- and y2, e /* y2 = (f^g)&e */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- add y2, y0 /* y2 = S1 + CH */
- ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, [rsp + _XFER + \i1 * 4] /* y2 = k + w + S1 + CH */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define DO_ROUND(i1, a, b, c, d, e, f, g, h) \
+ mov y0, e /* y0 = e */; \
+ ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \
+ mov y2, f /* y2 = f */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ xor y2, g /* y2 = f^g */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, [rsp + _XFER + i1 * 4] /* y2 = k + w + S1 + CH */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
 lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
- ROTATE_ARGS
-.endm

 /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -410,10 +384,10 @@ _gcry_sha256_transform_amd64_avx:
 lea TBL, [.LK256 ADD_RIP]

 /* byte swap first 16 dwords */
- COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP(X0, [INP + 0*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X1, [INP + 1*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X2, [INP + 2*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X3, [INP + 3*16], BYTE_FLIP_MASK)

 mov [rsp + _INP], INP

@@ -423,20 +397,20 @@ _gcry_sha256_transform_amd64_avx:
 .Loop1:
 vpaddd XFER, X0, [TBL + 0*16]
 vmovdqa [rsp + _XFER], XFER
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h)

- vpaddd XFER, X0, [TBL + 1*16]
+ vpaddd XFER, X1, [TBL + 1*16]
 vmovdqa [rsp + _XFER], XFER
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X1, X2, X3, X0, e, f, g, h, a, b, c, d)

- vpaddd XFER, X0, [TBL + 2*16]
+ vpaddd XFER, X2, [TBL + 2*16]
 vmovdqa [rsp + _XFER], XFER
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X2, X3, X0, X1, a, b, c, d, e, f, g, h)

- vpaddd XFER, X0, [TBL + 3*16]
+ vpaddd XFER, X3, [TBL + 3*16]
 vmovdqa [rsp + _XFER], XFER
 add TBL, 4*16
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X3, X0, X1, X2, e, f, g, h, a, b, c, d)

 sub SRND, 1
 jne .Loop1

@@ -445,17 +419,17 @@
 .Loop2:
 vpaddd X0, X0, [TBL + 0*16]
 vmovdqa [rsp + _XFER], X0
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
+ DO_ROUND(0, a, b, c, d, e, f, g, h)
+ DO_ROUND(1, h, a, b, c, d, e, f, g)
+ DO_ROUND(2, g, h, a, b, c, d, e, f)
+ DO_ROUND(3, f, g, h, a, b, c, d, e)

 vpaddd X1, X1, [TBL + 1*16]
 vmovdqa [rsp + _XFER], X1
 add TBL, 2*16
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
+ DO_ROUND(0, e, f, g, h, a, b, c, d)
+ DO_ROUND(1, d, e, f, g, h, a, b, c)
+ DO_ROUND(2, c, d, e, f, g, h, a, b)
+ DO_ROUND(3, b, c, d, e, f, g, h, a)

 vmovdqa X0, X2
 vmovdqa X1, X3

@@ -463,14 +437,14 @@
 sub SRND, 1
 jne .Loop2

- addm [4*0 + CTX],a
- addm [4*1 + CTX],b
- addm [4*2 + CTX],c
- addm [4*3 + CTX],d
- addm [4*4 + CTX],e
- addm [4*5 + CTX],f
- addm [4*6 + CTX],g
- addm [4*7 + CTX],h
+ addm([4*0 + CTX],a)
+ addm([4*1 + CTX],b)
+ addm([4*2 + CTX],c)
+ addm([4*3 + CTX],d)
+ addm([4*4 + CTX],e)
+ addm([4*5 + CTX],f)
+ addm([4*6 + CTX],g)
+ addm([4*7 + CTX],h)

 mov INP, [rsp + _INP]
 add INP, 64

diff --git a/cipher/sha256-avx2-bmi2-amd64.S b/cipher/sha256-avx2-bmi2-amd64.S
index 52be1a07..faefba17 100644
--- a/cipher/sha256-avx2-bmi2-amd64.S
+++ b/cipher/sha256-avx2-bmi2-amd64.S
@@ -70,226 +70,171 @@
 /* addm [mem], reg */
 /* Add reg to mem using reg-mem add and store */
-.macro addm p1 p2
- add \p2, \p1
- mov \p1, \p2
-.endm
+#define addm(p1, p2) \
+ add p2, p1; \
+ mov p1, p2;

 /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */

-X0 = ymm4
-X1 = ymm5
-X2 = ymm6
-X3 = ymm7
+#define X0 ymm4
+#define X1 ymm5
+#define X2 ymm6
+#define X3 ymm7

 /* XMM versions of above */
-XWORD0 = xmm4
-XWORD1 = xmm5
-XWORD2 = xmm6
-XWORD3 = xmm7
-
-XTMP0 = ymm0
-XTMP1 = ymm1
-XTMP2 = ymm2
-XTMP3 = ymm3
-XTMP4 = ymm8
-XFER = ymm9
-XTMP5 = ymm11
-
-SHUF_00BA = ymm10 /* shuffle xBxA -> 00BA */
-SHUF_DC00 = ymm12 /* shuffle xDxC -> DC00 */
-BYTE_FLIP_MASK = ymm13
-
-X_BYTE_FLIP_MASK = xmm13 /* XMM version of BYTE_FLIP_MASK */
-
-NUM_BLKS = rdx /* 3rd arg */
-CTX = rsi /* 2nd arg */
-INP = rdi /* 1st arg */
-c = ecx
-d = r8d
-e = edx /* clobbers NUM_BLKS */
-y3 = edi /* clobbers INP */
-
-TBL = rbp
-SRND = CTX /* SRND is same register as CTX */
-
-a = eax
-b = ebx
-f = r9d
-g = r10d
-h = r11d
-old_h = r11d
-
-T1 = r12d
-y0 = r13d
-y1 = r14d
-y2 = r15d
-
-
-_XFER_SIZE = 2*64*4 /* 2 blocks, 64 rounds, 4 bytes/round */
-_XMM_SAVE_SIZE = 0
-_INP_END_SIZE = 8
-_INP_SIZE = 8
-_CTX_SIZE = 8
-_RSP_SIZE = 8
-
-_XFER = 0
-_XMM_SAVE = _XFER + _XFER_SIZE
-_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
-_INP = _INP_END + _INP_END_SIZE
-_CTX = _INP + _INP_SIZE
-_RSP = _CTX + _CTX_SIZE
-STACK_SIZE = _RSP + _RSP_SIZE
-
-/* rotate_Xs */
-/* Rotate values of symbols X0...X3 */
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-/* ROTATE_ARGS */
-/* Rotate values of symbols a...h */
-.macro ROTATE_ARGS
-old_h = h
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro ONE_ROUND_PART1 XFER
- /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]);
-  * d += h;
-  * h += Sum0 (a) + Maj (a, b, c);
-  *
-  * Ch(x, y, z) => ((x & y) + (~x & z))
-  * Maj(x, y, z) => ((x & y) + (z & (x ^ y)))
-  */
-
- mov y3, e
- add h, [\XFER]
- and y3, f
- rorx y0, e, 25
- rorx y1, e, 11
+#define XWORD0 xmm4
+#define XWORD1 xmm5
+#define XWORD2 xmm6
+#define XWORD3 xmm7
+
+#define XTMP0 ymm0
+#define XTMP1 ymm1
+#define XTMP2 ymm2
+#define XTMP3 ymm3
+#define XTMP4 ymm8
+#define XFER ymm9
+#define XTMP5 ymm11
+
+#define SHUF_00BA ymm10 /* shuffle xBxA -> 00BA */
+#define SHUF_DC00 ymm12 /* shuffle xDxC -> DC00 */
+#define BYTE_FLIP_MASK ymm13
+
+#define X_BYTE_FLIP_MASK xmm13 /* XMM version of BYTE_FLIP_MASK */
+
+#define NUM_BLKS rdx /* 3rd arg */
+#define CTX rsi /* 2nd arg */
+#define INP rdi /* 1st arg */
+#define c ecx
+#define d r8d
+#define e edx /* clobbers NUM_BLKS */
+#define y3 edi /* clobbers INP */
+
+#define TBL rbp
+#define SRND CTX /* SRND is same register as CTX */
+
+#define a eax
+#define b ebx
+#define f r9d
+#define g r10d
+#define h r11d
+#define old_h r11d
+
+#define T1 r12d
+#define y0 r13d
+#define y1 r14d
+#define y2 r15d
+
+
+#define _XFER_SIZE 2*64*4 /* 2 blocks, 64 rounds, 4 bytes/round */
+#define _XMM_SAVE_SIZE 0
+#define _INP_END_SIZE 8
+#define _INP_SIZE 8
+#define _CTX_SIZE 8
+#define _RSP_SIZE 8
+
+#define _XFER 0
+#define _XMM_SAVE _XFER + _XFER_SIZE
+#define _INP_END _XMM_SAVE + _XMM_SAVE_SIZE
+#define _INP _INP_END + _INP_END_SIZE
+#define _CTX _INP + _INP_SIZE
+#define _RSP _CTX + _CTX_SIZE
+#define STACK_SIZE _RSP + _RSP_SIZE
+
+#define ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h) \
+ /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); */ \
+ /* d += h; */ \
+ /* h += Sum0 (a) + Maj (a, b, c); */ \
+ \
+ /* Ch(x, y, z) => ((x & y) + (~x & z)) */ \
+ /* Maj(x, y, z) => ((x & y) + (z & (x ^ y))) */ \
+ \
+ mov y3, e; \
+ add h, [XFERIN]; \
+ and y3, f; \
+ rorx y0, e, 25; \
+ rorx y1, e, 11; \
+ lea h, [h + y3]; \
+ andn y3, e, g; \
+ rorx T1, a, 13; \
+ xor y0, y1; \
 lea h, [h + y3]
- andn y3, e, g
- rorx T1, a, 13
- xor y0, y1
- lea h, [h + y3]
-.endm
-.macro ONE_ROUND_PART2
- rorx y2, a, 22
- rorx y1, e, 6
- mov y3, a
- xor T1, y2
- xor y0, y1
- xor y3, b
- lea h, [h + y0]
- mov y0, a
- rorx y2, a, 2
- add d, h
- and y3, c
- xor T1, y2
- lea h, [h + y3]
- lea h, [h + T1]
- and y0, b
- lea h, [h + y0]
-.endm
-
-.macro ONE_ROUND XFER
- ONE_ROUND_PART1 \XFER
- ONE_ROUND_PART2
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED XFER, XFEROUT
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
- vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */
- vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16]; y1 = (e >> 6); S1 */
- vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */
- vpsrld XTMP2, XTMP1, 7
- vpslld XTMP3, XTMP1, (32-7)
- vpor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 */
- vpsrld XTMP2, XTMP1,18
-
- ONE_ROUND 0*4+\XFER
- ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
- vpsrld XTMP4, XTMP1, 3 /* XTMP4 = W[-15] >> 3 */
- vpslld XTMP1, XTMP1, (32-18)
- vpxor XTMP3, XTMP3, XTMP1
- vpxor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 */
- vpxor XTMP1, XTMP3, XTMP4 /* XTMP1 = s0 */
- vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */
- vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */
- vpsrld XTMP4, XTMP2, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */
-
- ONE_ROUND 1*4+\XFER
- ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
- vpsrlq XTMP3, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */
- vpsrlq XTMP2, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */
- vpxor XTMP2, XTMP2, XTMP3
- vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */
- vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */
- vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */
- vpshufd XTMP2, XTMP0, 0b1010000 /* XTMP2 = W[-2] {DDCC} */
-
- ONE_ROUND 2*4+\XFER
- ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
- vpsrld XTMP5, XTMP2, 10 /* XTMP5 = W[-2] >> 10 {DDCC} */
- vpsrlq XTMP3, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */
- vpsrlq XTMP2, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */
- vpxor XTMP2, XTMP2, XTMP3
- vpxor XTMP5, XTMP5, XTMP2 /* XTMP5 = s1 {xDxC} */
- vpshufb XTMP5, XTMP5, SHUF_DC00 /* XTMP5 = s1 {DC00} */
- vpaddd X0, XTMP5, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */
- vpaddd XFER, X0, [TBL + \XFEROUT]
-
- ONE_ROUND_PART1 3*4+\XFER
- vmovdqa [rsp + _XFER + \XFEROUT], XFER
- ONE_ROUND_PART2
- ROTATE_ARGS
- rotate_Xs
-.endm
-
-.macro DO_4ROUNDS XFER
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
- ONE_ROUND 0*4+\XFER
- ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
- ONE_ROUND 1*4+\XFER
- ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
- ONE_ROUND 2*4+\XFER
- ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */
+#define ONE_ROUND_PART2(a, b, c, d, e, f, g, h) \
+ rorx y2, a, 22; \
+ rorx y1, e, 6; \
+ mov y3, a; \
+ xor T1, y2; \
+ xor y0, y1; \
+ xor y3, b; \
+ lea h, [h + y0]; \
+ mov y0, a; \
+ rorx y2, a, 2; \
+ add d, h; \
+ and y3, c; \
+ xor T1, y2; \
+ lea h, [h + y3]; \
+ lea h, [h + T1]; \
+ and y0, b; \
+ lea h, [h + y0]
- ONE_ROUND 3*4+\XFER
- ROTATE_ARGS
-.endm
+#define ONE_ROUND(XFER, a, b, c, d, e, f, g, h) \
+ ONE_ROUND_PART1(XFER, a, b, c, d, e, f, g, h); \
+ ONE_ROUND_PART2(a, b, c, d, e, f, g, h)
+
+#define FOUR_ROUNDS_AND_SCHED(XFERIN, XFEROUT, X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+ vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */; \
+ vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16]; y1 = (e >> 6); S1 */; \
+ vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */; \
+ vpsrld XTMP2, XTMP1, 7; \
+ vpslld XTMP3, XTMP1, (32-7); \
+ vpor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 */; \
+ vpsrld XTMP2, XTMP1,18; \
+ \
+ ONE_ROUND(0*4+XFERIN, a, b, c, d, e, f, g, h); \
+ \
+ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+ vpsrld XTMP4, XTMP1, 3 /* XTMP4 = W[-15] >> 3 */; \
+ vpslld XTMP1, XTMP1, (32-18); \
+ vpxor XTMP3, XTMP3, XTMP1; \
+ vpxor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 */; \
+ vpxor XTMP1, XTMP3, XTMP4 /* XTMP1 = s0 */; \
+ vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */; \
+ vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */; \
+ vpsrld XTMP4, XTMP2, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */; \
+ \
+ ONE_ROUND(1*4+XFERIN, h, a, b, c, d, e, f, g); \
+ \
+ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+ vpsrlq XTMP3, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */; \
+ vpsrlq XTMP2, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */; \
+ vpxor XTMP2, XTMP2, XTMP3; \
+ vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */; \
+ vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */; \
+ vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */; \
+ vpshufd XTMP2, XTMP0, 0b1010000 /* XTMP2 = W[-2] {DDCC} */; \
+ \
+ ONE_ROUND(2*4+XFERIN, g, h, a, b, c, d, e, f); \
+ \
+ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+ vpsrld XTMP5, XTMP2, 10 /* XTMP5 = W[-2] >> 10 {DDCC} */; \
+ vpsrlq XTMP3, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */; \
+ vpsrlq XTMP2, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */; \
+ vpxor XTMP2, XTMP2, XTMP3; \
+ vpxor XTMP5, XTMP5, XTMP2 /* XTMP5 = s1 {xDxC} */; \
+ vpshufb XTMP5, XTMP5, SHUF_DC00 /* XTMP5 = s1 {DC00} */; \
+ vpaddd X0, XTMP5, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */; \
+ vpaddd XFER, X0, [TBL + XFEROUT]; \
+ \
+ ONE_ROUND_PART1(3*4+XFERIN, f, g, h, a, b, c, d, e); \
+ vmovdqa [rsp + _XFER + XFEROUT], XFER; \
+ ONE_ROUND_PART2(f, g, h, a, b, c, d, e);
+
+#define DO_4ROUNDS(XFERIN, a, b, c, d, e, f, g, h) \
+ ONE_ROUND(0*4+XFERIN, a, b, c, d, e, f, g, h); \
+ ONE_ROUND(1*4+XFERIN, h, a, b, c, d, e, f, g); \
+ ONE_ROUND(2*4+XFERIN, g, h, a, b, c, d, e, f); \
+ ONE_ROUND(3*4+XFERIN, f, g, h, a, b, c, d, e)

 /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -391,32 +336,32 @@ _gcry_sha256_transform_amd64_avx2:
 .align 16
 .Loop1:
- FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 0*32, SRND + 4*32
- FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 1*32, SRND + 5*32
- FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 2*32, SRND + 6*32
- FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 3*32, SRND + 7*32
+ FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 0*32, SRND + 4*32, X0, X1, X2, X3, a, b, c, d, e, f, g, h)
+ FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 1*32, SRND + 5*32, X1, X2, X3, X0, e, f, g, h, a, b, c, d)
+ FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 2*32, SRND + 6*32, X2, X3, X0, X1, a, b, c, d, e, f, g, h)
+ FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 3*32, SRND + 7*32, X3, X0, X1, X2, e, f, g, h, a, b, c, d)

 add SRND, 4*32
 cmp SRND, 3 * 4*32
 jb .Loop1

 /* ; Do last 16 rounds with no scheduling */
- DO_4ROUNDS rsp + _XFER + (3*4*32 + 0*32)
- DO_4ROUNDS rsp + _XFER + (3*4*32 + 1*32)
- DO_4ROUNDS rsp + _XFER + (3*4*32 + 2*32)
- DO_4ROUNDS rsp + _XFER + (3*4*32 + 3*32)
+ DO_4ROUNDS(rsp + _XFER + (3*4*32 + 0*32), a, b, c, d, e, f, g, h)
+ DO_4ROUNDS(rsp + _XFER + (3*4*32 + 1*32), e, f, g, h, a, b, c, d)
+ DO_4ROUNDS(rsp + _XFER + (3*4*32 + 2*32), a, b, c, d, e, f, g, h)
+ DO_4ROUNDS(rsp + _XFER + (3*4*32 + 3*32), e, f, g, h, a, b, c, d)

 mov CTX, [rsp + _CTX]
 mov INP, [rsp + _INP]

- addm [4*0 + CTX],a
- addm [4*1 + CTX],b
- addm [4*2 + CTX],c
- addm [4*3 + CTX],d
- addm [4*4 + CTX],e
- addm [4*5 + CTX],f
- addm [4*6 + CTX],g
- addm [4*7 + CTX],h
+ addm([4*0 + CTX],a)
+ addm([4*1 + CTX],b)
+ addm([4*2 + CTX],c)
+ addm([4*3 + CTX],d)
+ addm([4*4 + CTX],e)
+ addm([4*5 + CTX],f)
+ addm([4*6 + CTX],g)
+ addm([4*7 + CTX],h)

 cmp INP, [rsp + _INP_END]
 ja .Ldone_hash

@@ -425,8 +370,8 @@ _gcry_sha256_transform_amd64_avx2:
 xor SRND, SRND
 .align 16
 .Loop3:
- DO_4ROUNDS rsp + _XFER + SRND + 0*32 + 16
- DO_4ROUNDS rsp + _XFER + SRND + 1*32 + 16
+ DO_4ROUNDS(rsp + _XFER + SRND + 0*32 + 16, a, b, c, d, e, f, g, h)
+ DO_4ROUNDS(rsp + _XFER + SRND + 1*32 + 16, e, f, g, h, a, b, c, d)
 add SRND, 2*32
 cmp SRND, 4 * 4*32
 jb .Loop3

@@ -435,14 +380,14 @@ _gcry_sha256_transform_amd64_avx2:
 mov INP, [rsp + _INP]
 add INP, 64

- addm [4*0 + CTX],a
- addm [4*1 + CTX],b
- addm [4*2 + CTX],c
- addm [4*3 + CTX],d
- addm [4*4 + CTX],e
- addm [4*5 + CTX],f
- addm [4*6 + CTX],g
- addm [4*7 + CTX],h
+ addm([4*0 + CTX],a)
+ addm([4*1 + CTX],b)
+ addm([4*2 + CTX],c)
+ addm([4*3 + CTX],d)
+ addm([4*4 + CTX],e)
+ addm([4*5 + CTX],f)
+ addm([4*6 + CTX],g)
+ addm([4*7 + CTX],h)

 cmp INP, [rsp + _INP_END]
 jb .Loop0

diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S
index 0fb94c1b..098b0eb6 100644
--- a/cipher/sha256-ssse3-amd64.S
+++ b/cipher/sha256-ssse3-amd64.S
@@ -70,58 +70,56 @@
 /* addm [mem], reg
  * Add reg to mem using reg-mem add and store */
-.macro addm p1 p2
- add \p2, \p1
- mov \p1, \p2
-.endm
+#define addm(p1, p2) \
+ add p2, p1; \
+ mov p1, p2;

 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/

 /* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
  * Load xmm with mem and byte swap each dword */
-.macro COPY_XMM_AND_BSWAP p1 p2 p3
- MOVDQ \p1, \p2
- pshufb \p1, \p3
-.endm
+#define COPY_XMM_AND_BSWAP(p1, p2, p3) \
+ MOVDQ p1, p2; \
+ pshufb p1, p3;

 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/

-X0 = xmm4
-X1 = xmm5
-X2 = xmm6
-X3 = xmm7
+#define X0 xmm4
+#define X1 xmm5
+#define X2 xmm6
+#define X3 xmm7

-XTMP0 = xmm0
-XTMP1 = xmm1
-XTMP2 = xmm2
-XTMP3 = xmm3
-XTMP4 = xmm8
-XFER = xmm9
+#define XTMP0 xmm0
+#define XTMP1 xmm1
+#define XTMP2 xmm2
+#define XTMP3 xmm3
+#define XTMP4 xmm8
+#define XFER xmm9

-SHUF_00BA = xmm10 /* shuffle xBxA -> 00BA */
-SHUF_DC00 = xmm11 /* shuffle xDxC -> DC00 */
-BYTE_FLIP_MASK = xmm12
+#define SHUF_00BA xmm10 /* shuffle xBxA -> 00BA */
+#define SHUF_DC00 xmm11 /* shuffle xDxC -> DC00 */
+#define BYTE_FLIP_MASK xmm12

-NUM_BLKS = rdx /* 3rd arg */
-CTX = rsi /* 2nd arg */
-INP = rdi /* 1st arg */
+#define NUM_BLKS rdx /* 3rd arg */
+#define CTX rsi /* 2nd arg */
+#define INP rdi /* 1st arg */

-SRND = rdi /* clobbers INP */
-c = ecx
-d = r8d
-e = edx
+#define SRND rdi /* clobbers INP */
+#define c ecx
+#define d r8d
+#define e edx

-TBL = rbp
-a = eax
-b = ebx
+#define TBL rbp
+#define a eax
+#define b ebx

-f = r9d
-g = r10d
-h = r11d
+#define f r9d
+#define g r10d
+#define h r11d

-y0 = r13d
-y1 = r14d
-y2 = r15d
+#define y0 r13d
+#define y1 r14d
+#define y2 r15d

@@ -138,230 +136,207 @@ y2 = r15d
 #define _XMM_SAVE (_XFER + _XFER_SIZE + _ALIGN_SIZE)
 #define STACK_SIZE (_XMM_SAVE + _XMM_SAVE_SIZE)

-/* rotate_Xs
- * Rotate values of symbols X0...X3 */
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-/* ROTATE_ARGS
- * Rotate values of symbols a...h */
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
- /* compute s0 four at a time and s1 two at a time
-  * compute W[-16] + W[-7] 4 at a time */
- movdqa XTMP0, X3
- mov y0, e /* y0 = e */
- ror y0, (25-11) /* y0 = e >> (25-11) */
- mov y1, a /* y1 = a */
- palignr XTMP0, X2, 4 /* XTMP0 = W[-7] */
- ror y1, (22-13) /* y1 = a >> (22-13) */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- mov y2, f /* y2 = f */
- ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- movdqa XTMP1, X1
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- xor y2, g /* y2 = f^g */
- paddd XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- /* compute s0 */
- palignr XTMP1, X0, 4 /* XTMP1 = W[-15] */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- movdqa XTMP2, XTMP1 /* XTMP2 = W[-15] */
- ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, y0 /* y2 = S1 + CH */
- add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */
- movdqa XTMP3, XTMP1 /* XTMP3 = W[-15] */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- pslld XTMP1, (32-7)
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- psrld XTMP2, 7
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- por XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+
+#define FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ /* compute s0 four at a time and s1 two at a time */; \
+ /* compute W[-16] + W[-7] 4 at a time */; \
+ movdqa XTMP0, X3; \
+ mov y0, e /* y0 = e */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ palignr XTMP0, X2, 4 /* XTMP0 = W[-7] */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ movdqa XTMP1, X1; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ xor y2, g /* y2 = f^g */; \
+ paddd XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ /* compute s0 */; \
+ palignr XTMP1, X0, 4 /* XTMP1 = W[-15] */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ movdqa XTMP2, XTMP1 /* XTMP2 = W[-15] */; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */; \
+ movdqa XTMP3, XTMP1 /* XTMP3 = W[-15] */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ pslld XTMP1, (32-7); \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ psrld XTMP2, 7; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ por XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
 lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */

-ROTATE_ARGS
- movdqa XTMP2, XTMP3 /* XTMP2 = W[-15] */
- mov y0, e /* y0 = e */
- mov y1, a /* y1 = a */
- movdqa XTMP4, XTMP3 /* XTMP4 = W[-15] */
- ror y0, (25-11) /* y0 = e >> (25-11) */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- mov y2, f /* y2 = f */
- ror y1, (22-13) /* y1 = a >> (22-13) */
- pslld XTMP3, (32-18)
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- xor y2, g /* y2 = f^g */
- psrld XTMP2, 18
- ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- pxor XTMP1, XTMP3
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- psrld XTMP4, 3 /* XTMP4 = W[-15] >> 3 */
- add y2, y0 /* y2 = S1 + CH */
- add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */
- ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- pxor XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- pxor XTMP1, XTMP4 /* XTMP1 = s0 */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- /* compute low s1 */
- pshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- paddd XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ movdqa XTMP2, XTMP3 /* XTMP2 = W[-15] */; \
+ mov y0, e /* y0 = e */; \
+ mov y1, a /* y1 = a */; \
+ movdqa XTMP4, XTMP3 /* XTMP4 = W[-15] */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ pslld XTMP3, (32-18); \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ xor y2, g /* y2 = f^g */; \
+ psrld XTMP2, 18; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ pxor XTMP1, XTMP3; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ psrld XTMP4, 3 /* XTMP4 = W[-15] >> 3 */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ pxor XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ pxor XTMP1, XTMP4 /* XTMP1 = s0 */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ /* compute low s1 */; \
+ pshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ paddd XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
 lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */

-ROTATE_ARGS
- movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {BBAA} */
- mov y0, e /* y0 = e */
- mov y1, a /* y1 = a */
- ror y0, (25-11) /* y0 = e >> (25-11) */
- movdqa XTMP4, XTMP2 /* XTMP4 = W[-2] {BBAA} */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- ror y1, (22-13) /* y1 = a >> (22-13) */
- mov y2, f /* y2 = f */
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */
- xor y2, g /* y2 = f^g */
- psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- psrld XTMP4, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */
- ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- pxor XTMP2, XTMP3
- add y2, y0 /* y2 = S1 + CH */
- ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */
- pxor XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- pshufb XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- paddd XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- /* compute high s1 */
- pshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {BBAA} */; \
+ mov y0, e /* y0 = e */; \
+ mov y1, a /* y1 = a */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ movdqa XTMP4, XTMP2 /* XTMP4 = W[-2] {BBAA} */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ mov y2, f /* y2 = f */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */; \
+ xor y2, g /* y2 = f^g */; \
+ psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ psrld XTMP4, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ pxor XTMP2, XTMP3; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */; \
+ pxor XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ pshufb XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ paddd XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ /* compute high s1 */; \
+ pshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
 lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */

-ROTATE_ARGS
- movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {DDCC} */
- mov y0, e /* y0 = e */
- ror y0, (25-11) /* y0 = e >> (25-11) */
- mov y1, a /* y1 = a */
- movdqa X0, XTMP2 /* X0 = W[-2] {DDCC} */
- ror y1, (22-13) /* y1 = a >> (22-13) */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- mov y2, f /* y2 = f */
- ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- xor y2, g /* y2 = f^g */
- psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- psrld X0, 10 /* X0 = W[-2] >> 10 {DDCC} */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- pxor XTMP2, XTMP3
- ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, y0 /* y2 = S1 + CH */
- add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */
- pxor X0, XTMP2 /* X0 = s1 {xDxC} */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- pshufb X0, SHUF_DC00 /* X0 = s1 {DC00} */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- paddd X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {DDCC} */; \
+ mov y0, e /* y0 = e */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ movdqa X0, XTMP2 /* X0 = W[-2] {DDCC} */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ xor y2, g /* y2 = f^g */; \
+ psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ psrld X0, 10 /* X0 = W[-2] >> 10 {DDCC} */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ pxor XTMP2, XTMP3; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */; \
+ pxor X0, XTMP2 /* X0 = s1 {xDxC} */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ pshufb X0, SHUF_DC00 /* X0 = s1 {DC00} */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ paddd X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
 lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */

-ROTATE_ARGS
-rotate_Xs
-.endm
+#define FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h); \
+ FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, h, a, b, c, d, e, f, g); \
+ FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, g, h, a, b, c, d, e, f); \
+ FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, f, g, h, a, b, c, d, e);

 /* input is [rsp + _XFER + %1 * 4] */
-.macro DO_ROUND i1
- mov y0, e /* y0 = e */
- ror y0, (25-11) /* y0 = e >> (25-11) */
- mov y1, a /* y1 = a */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- ror y1, (22-13) /* y1 = a >> (22-13) */
- mov y2, f /* y2 = f */
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- xor y2, g /* y2 = f^g */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- and y2, e /* y2 = (f^g)&e */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- add y2, y0 /* y2 = S1 + CH */
- ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, [rsp + _XFER + \i1 * 4] /* y2 = k + w + S1 + CH */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define DO_ROUND(i1, a, b, c, d, e, f, g, h) \
+ mov y0, e /* y0 = e */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ mov y2, f /* y2 = f */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ xor y2, g /* y2 = f^g */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, [rsp + _XFER + i1 * 4] /* y2 = k + w + S1 + CH */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
 lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
- ROTATE_ARGS
-.endm

 /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -414,10 +389,10 @@ _gcry_sha256_transform_amd64_ssse3:
 lea TBL, [.LK256 ADD_RIP]

 /* byte swap first 16 dwords */
- COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP(X0, [INP + 0*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X1, [INP + 1*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X2, [INP + 2*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X3, [INP + 3*16], BYTE_FLIP_MASK)

 mov [rsp + _INP], INP

@@ -428,23 +403,23 @@ _gcry_sha256_transform_amd64_ssse3:
 movdqa XFER, [TBL + 0*16]
 paddd XFER, X0
 movdqa [rsp + _XFER], XFER
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h)

 movdqa XFER, [TBL + 1*16]
- paddd XFER, X0
+ paddd XFER, X1
 movdqa [rsp + _XFER], XFER
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X1, X2, X3, X0, e, f, g, h, a, b, c, d)

 movdqa XFER, [TBL + 2*16]
- paddd XFER, X0
+ paddd XFER, X2
 movdqa [rsp + _XFER], XFER
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X2, X3, X0, X1, a, b, c, d, e, f, g, h)

 movdqa XFER, [TBL + 3*16]
- paddd XFER, X0
+ paddd XFER, X3
 movdqa [rsp + _XFER], XFER
 add TBL, 4*16
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X3, X0, X1, X2, e, f, g, h, a, b, c, d)

 sub SRND, 1
 jne .Loop1

@@ -453,17 +428,17 @@ _gcry_sha256_transform_amd64_ssse3:
 .Loop2:
 paddd X0, [TBL + 0*16]
 movdqa [rsp + _XFER], X0
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
+ DO_ROUND(0, a, b, c, d, e, f, g, h)
+ DO_ROUND(1, h, a, b, c, d, e, f, g)
+ DO_ROUND(2, g, h, a, b, c, d, e, f)
+ DO_ROUND(3, f, g, h, a, b, c, d, e)

 paddd X1, [TBL + 1*16]
 movdqa [rsp + _XFER], X1
 add TBL, 2*16
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
+ DO_ROUND(0, e, f, g, h, a, b, c, d)
+ DO_ROUND(1, d, e, f, g, h, a, b, c)
+ DO_ROUND(2, c, d, e, f, g, h, a, b)
+ DO_ROUND(3, b, c, d, e, f, g, h, a)

 movdqa X0, X2
 movdqa X1, X3

@@ -471,14 +446,14 @@ _gcry_sha256_transform_amd64_ssse3:
 sub SRND, 1
 jne .Loop2

- addm [4*0 + CTX],a
- addm [4*1 + CTX],b
- addm [4*2 + CTX],c
- addm [4*3 + CTX],d
- addm [4*4 + CTX],e
- addm [4*5 + CTX],f
- addm [4*6 + CTX],g
- addm [4*7 + CTX],h
+ addm([4*0 + CTX],a)
+ addm([4*1 + CTX],b)
+ addm([4*2 + CTX],c)
+ addm([4*3 + CTX],d)
+ addm([4*4 + CTX],e)
+ addm([4*5 + CTX],f)
+ addm([4*6 + CTX],g)
+ addm([4*7 + CTX],h)

 mov INP, [rsp + _INP]
 add INP, 64

diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S
index 991fd639..75f7b070 100644
--- a/cipher/sha512-avx-amd64.S
+++ b/cipher/sha512-avx-amd64.S
@@ -53,32 +53,32 @@
 .text

 /* Virtual Registers */
-msg = rdi /* ARG1 */
-digest = rsi /* ARG2 */
-msglen = rdx /* ARG3 */
-T1 = rcx
-T2 = r8
-a_64 = r9
-b_64 = r10
-c_64 = r11
-d_64 = r12
-e_64 = r13
-f_64 = r14
-g_64 = r15
-h_64 = rbx
-tmp0 = rax
+#define msg rdi /* ARG1 */
+#define digest rsi /* ARG2 */
+#define msglen rdx /* ARG3 */
+#define T1 rcx
+#define T2 r8
+#define a_64 r9
+#define b_64 r10
+#define c_64 r11
+#define d_64 r12
+#define e_64 r13
+#define f_64 r14
+#define g_64 r15
+#define h_64 rbx
+#define tmp0 rax

 /* ; Local variables (stack frame)
    ; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP */
-frame_W = 0 /* Message Schedule */
-frame_W_size = (80 * 8)
-frame_WK = ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */
-frame_WK_size = (2 * 8)
-frame_GPRSAVE = ((frame_WK) + (frame_WK_size))
-frame_GPRSAVE_size = (5 * 8)
-frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size))
+#define frame_W 0 /* Message Schedule */
+#define frame_W_size (80 * 8)
+#define frame_WK ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */
+#define frame_WK_size (2 * 8)
+#define frame_GPRSAVE ((frame_WK) + (frame_WK_size))
+#define frame_GPRSAVE_size (5 * 8)
+#define frame_size ((frame_GPRSAVE) + (frame_GPRSAVE_size))

 /* Useful QWORD "arrays" for simpler memory references */

@@ -90,162 +90,151 @@ frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size))
 /* MSG, DIGEST, K_t, W_t are arrays */
 /* WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even */

-.macro RotateState
- /* Rotate symbles a..h right */
- __TMP = h_64
- h_64 = g_64
- g_64 = f_64
- f_64 = e_64
- e_64 = d_64
- d_64 = c_64
- c_64 = b_64
- b_64 = a_64
- a_64 = __TMP
-.endm
-
-.macro RORQ p1 p2
- /* shld is faster than ror on Intel Sandybridge */
- shld \p1, \p1, (64 - \p2)
-.endm
-
-.macro SHA512_Round t
- /* Compute Round %%t */
- mov T1, f_64 /* T1 = f */
- mov tmp0, e_64 /* tmp = e */
- xor T1, g_64 /* T1 = f ^ g */
- RORQ tmp0, 23 /* 41 ; tmp = e ror 23 */
- and T1, e_64 /* T1 = (f ^ g) & e */
- xor tmp0, e_64 /* tmp = (e ror 23) ^ e */
- xor T1, g_64 /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */
- add T1, [WK_2(\t)] /* W[t] + K[t] from message scheduler */
- RORQ tmp0, 4 /* 18 ; tmp = ((e ror 23) ^ e) ror 4 */
- xor tmp0, e_64 /* tmp = (((e ror 23) ^ e) ror 4) ^ e */
- mov T2, a_64 /* T2 = a */
- add T1, h_64 /* T1 = CH(e,f,g) + W[t] + K[t] + h */
- RORQ tmp0, 14 /* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */
- add T1, tmp0 /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */
- mov tmp0, a_64 /* tmp = a */
- xor T2, c_64 /* T2 = a ^ c */
- and tmp0, c_64 /* tmp = a & c */
- and T2, b_64 /* T2 = (a ^ c) & b */
- xor T2, tmp0 /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */
- mov tmp0, a_64 /* tmp = a */
- RORQ tmp0, 5 /* 39 ; tmp = a ror 5 */
- xor tmp0, a_64 /* tmp = (a ror 5) ^ a */
- add d_64, T1 /* e(next_state) = d + T1 */
- RORQ tmp0, 6 /* 34 ; tmp = ((a ror 5) ^ a) ror 6 */
- xor tmp0, a_64 /* tmp = (((a ror 5) ^ a) ror 6) ^ a */
- lea h_64, [T1 + T2] /* a(next_state) = T1 + Maj(a,b,c) */
- RORQ tmp0, 28 /* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */
- add h_64, tmp0 /* a(next_state) = T1 + Maj(a,b,c) S0(a) */
- RotateState
-.endm
-
-.macro SHA512_2Sched_2Round_avx t
-/* ; Compute rounds %%t-2 and %%t-1
-   ; Compute message schedule QWORDS %%t and %%t+1
-
-   ; Two rounds are computed based on the values for K[t-2]+W[t-2] and
-   ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
-   ; scheduler.
-   ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
-   ; They are then added to their respective SHA512 constants at
-   ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
-   ; For brievity, the comments following vectored instructions only refer to
-   ; the first of a pair of QWORDS.
-   ; Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
-   ; The computation of the message schedule and the rounds are tightly
-   ; stitched to take advantage of instruction-level parallelism.
-   ; For clarity, integer instructions (for the rounds calculation) are indented
-   ; by one tab. Vectored instructions (for the message scheduler) are indented
-   ; by two tabs. */
-
- vmovdqa xmm4, [W_t(\t-2)] /* XMM4 = W[t-2] */
- vmovdqu xmm5, [W_t(\t-15)] /* XMM5 = W[t-15] */
- mov T1, f_64
- vpsrlq xmm0, xmm4, 61 /* XMM0 = W[t-2]>>61 */
- mov tmp0, e_64
- vpsrlq xmm6, xmm5, 1 /* XMM6 = W[t-15]>>1 */
- xor T1, g_64
- RORQ tmp0, 23 /* 41 */
- vpsrlq xmm1, xmm4, 19 /* XMM1 = W[t-2]>>19 */
- and T1, e_64
- xor tmp0, e_64
- vpxor xmm0, xmm0, xmm1 /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 */
- xor T1, g_64
- add T1, [WK_2(\t)];
- vpsrlq xmm7, xmm5, 8 /* XMM7 = W[t-15]>>8 */
- RORQ tmp0, 4 /* 18 */
- vpsrlq xmm2, xmm4, 6 /* XMM2 = W[t-2]>>6 */
- xor tmp0, e_64
- mov T2, a_64
- add T1, h_64
- vpxor xmm6, xmm6, xmm7 /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 */
- RORQ tmp0, 14 /* 14 */
- add T1, tmp0
- vpsrlq xmm8, xmm5, 7 /* XMM8 = W[t-15]>>7 */
- mov tmp0, a_64
- xor T2, c_64
- vpsllq xmm3, xmm4, (64-61) /* XMM3 = W[t-2]<<3 */
- and tmp0, c_64
- and T2, b_64
- vpxor xmm2, xmm2, xmm3 /* XMM2 = W[t-2]>>6 ^ W[t-2]<<3 */
- xor T2, tmp0
- mov tmp0, a_64
- vpsllq xmm9, xmm5, (64-1) /* XMM9 = W[t-15]<<63 */
- RORQ tmp0, 5 /* 39 */
- vpxor xmm8, xmm8, xmm9 /* XMM8 = W[t-15]>>7 ^ W[t-15]<<63 */
- xor tmp0, a_64
- add d_64, T1
- RORQ tmp0, 6 /* 34 */
- xor tmp0, a_64
- vpxor xmm6, xmm6, xmm8 /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63 */
- lea h_64, [T1 + T2]
- RORQ tmp0, 28 /* 28 */
- vpsllq xmm4, xmm4, (64-19) /* XMM4 = W[t-2]<<25 */
- add h_64, tmp0
- RotateState
- vpxor xmm0, xmm0, xmm4 /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<25 */
- mov T1, f_64
- vpxor xmm0, xmm0, xmm2 /* XMM0 = s1(W[t-2]) */
- mov tmp0, e_64
- xor T1, g_64
- vpaddq xmm0, xmm0, [W_t(\t-16)] /* XMM0 = s1(W[t-2]) + W[t-16] */
- vmovdqu xmm1, [W_t(\t- 7)] /* XMM1 = W[t-7] */
- RORQ tmp0, 23 /* 41 */
- and T1, e_64
- xor tmp0, e_64
- xor T1, g_64
- vpsllq xmm5, xmm5, (64-8) /* XMM5 = W[t-15]<<56 */
- add T1, [WK_2(\t+1)]
- vpxor xmm6, xmm6, xmm5 /* XMM6 = s0(W[t-15]) */
- RORQ tmp0, 4 /* 18 */
- vpaddq xmm0, xmm0, xmm6 /* XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) */
- xor tmp0, e_64
- vpaddq xmm0, xmm0, xmm1 /* XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */
- mov T2, a_64
- add T1, h_64
- RORQ tmp0, 14 /* 14 */
- add T1, tmp0
- vmovdqa [W_t(\t)], xmm0 /* Store W[t] */
- vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */
- vmovdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] for next rounds */
- mov tmp0, a_64
- xor T2, c_64
- and tmp0, c_64
- and T2, b_64
- xor T2, tmp0
- mov tmp0, a_64
- RORQ tmp0, 5 /* 39 */
- xor tmp0, a_64
- add d_64, T1
- RORQ tmp0, 6 /* 34 */
- xor tmp0, a_64
- lea h_64, [T1 + T2]
- RORQ tmp0, 28 /* 28 */
- add h_64, tmp0
- RotateState
-.endm
+#define RORQ(p1, p2) \
+ /* shld is faster than ror on Intel Sandybridge */ \
+ shld p1, p1, (64 - p2)
+
+#define SHA512_Round(t, a, b, c, d, e, f, g, h) \
+ /* Compute Round %%t */; \
+ mov T1, f /* T1 = f */; \
+ mov tmp0, e /* tmp = e */; \
+ xor T1, g /* T1 = f ^ g */; \
+ RORQ( tmp0, 23) /* 41 ; tmp = e ror 23 */; \
+ and T1, e /* T1 = (f ^ g) & e */; \
+ xor tmp0, e /* tmp = (e ror 23) ^ e */; \
+ xor T1, g /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */; \
+ add T1, [WK_2(t)] /* W[t] + K[t] from message scheduler */; \
+ RORQ( tmp0, 4) /* 18 ; tmp = ((e ror 23) ^ e) ror 4 */; \
+ xor tmp0, e /* tmp = (((e ror 23) ^ e) ror 4) ^ e */; \
+ mov T2, a /* T2 = a */; \
+ add T1, h /* T1 = CH(e,f,g) + W[t] + K[t] + h */; \
+ RORQ( tmp0, 14) /* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */; \
+ add T1, tmp0 /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */; \
+ mov tmp0, a /* tmp = a */; \
+ xor T2, c /* T2 = a ^ c */; \
+ and tmp0, c /* tmp = a &
c */; \ + and T2, b /* T2 = (a ^ c) & b */; \ + xor T2, tmp0 /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */; \ + mov tmp0, a /* tmp = a */; \ + RORQ( tmp0, 5) /* 39 ; tmp = a ror 5 */; \ + xor tmp0, a /* tmp = (a ror 5) ^ a */; \ + add d, T1 /* e(next_state) = d + T1 */; \ + RORQ( tmp0, 6) /* 34 ; tmp = ((a ror 5) ^ a) ror 6 */; \ + xor tmp0, a /* tmp = (((a ror 5) ^ a) ror 6) ^ a */; \ + lea h, [T1 + T2] /* a(next_state) = T1 + Maj(a,b,c) */; \ + RORQ( tmp0, 28) /* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */; \ + add h, tmp0 /* a(next_state) = T1 + Maj(a,b,c) S0(a) */ + +#define SHA512_2Sched_2Round_avx_PART1(t, a, b, c, d, e, f, g, h) \ + /* \ + ; Compute rounds %%t-2 and %%t-1 \ + ; Compute message schedule QWORDS %%t and %%t+1 \ + ; \ + ; Two rounds are computed based on the values for K[t-2]+W[t-2] and \ + ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message \ + ; scheduler. \ + ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. \ + ; They are then added to their respective SHA512 constants at \ + ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] \ + ; For brievity, the comments following vectored instructions only refer to \ + ; the first of a pair of QWORDS. \ + ; Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} \ + ; The computation of the message schedule and the rounds are tightly \ + ; stitched to take advantage of instruction-level parallelism. \ + ; For clarity, integer instructions (for the rounds calculation) are indented \ + ; by one tab. Vectored instructions (for the message scheduler) are indented \ + ; by two tabs. \ + */ \ + \ + vmovdqa xmm4, [W_t(t-2)] /* XMM4 = W[t-2] */; \ + vmovdqu xmm5, [W_t(t-15)] /* XMM5 = W[t-15] */; \ + mov T1, f; \ + vpsrlq xmm0, xmm4, 61 /* XMM0 = W[t-2]>>61 */; \ + mov tmp0, e; \ + vpsrlq xmm6, xmm5, 1 /* XMM6 = W[t-15]>>1 */; \ + xor T1, g; \ + RORQ( tmp0, 23) /* 41 */; \ + vpsrlq xmm1, xmm4, 19 /* XMM1 = W[t-2]>>19 */; \ + and T1, e; \ + xor tmp0, e; \ + vpxor xmm0, xmm0, xmm1 /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 */; \ + xor T1, g; \ + add T1, [WK_2(t)]; \ + vpsrlq xmm7, xmm5, 8 /* XMM7 = W[t-15]>>8 */; \ + RORQ( tmp0, 4) /* 18 */; \ + vpsrlq xmm2, xmm4, 6 /* XMM2 = W[t-2]>>6 */; \ + xor tmp0, e; \ + mov T2, a; \ + add T1, h; \ + vpxor xmm6, xmm6, xmm7 /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 */; \ + RORQ( tmp0, 14) /* 14 */; \ + add T1, tmp0; \ + vpsrlq xmm8, xmm5, 7 /* XMM8 = W[t-15]>>7 */; \ + mov tmp0, a; \ + xor T2, c; \ + vpsllq xmm3, xmm4, (64-61) /* XMM3 = W[t-2]<<3 */; \ + and tmp0, c; \ + and T2, b; \ + vpxor xmm2, xmm2, xmm3 /* XMM2 = W[t-2]>>6 ^ W[t-2]<<3 */; \ + xor T2, tmp0; \ + mov tmp0, a; \ + vpsllq xmm9, xmm5, (64-1) /* XMM9 = W[t-15]<<63 */; \ + RORQ( tmp0, 5) /* 39 */; \ + vpxor xmm8, xmm8, xmm9 /* XMM8 = W[t-15]>>7 ^ W[t-15]<<63 */; \ + xor tmp0, a; \ + add d, T1; \ + RORQ( tmp0, 6) /* 34 */; \ + xor tmp0, a; \ + vpxor xmm6, xmm6, xmm8 /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63 */; \ + lea h, [T1 + T2]; \ + RORQ( tmp0, 28) /* 28 */; \ + vpsllq xmm4, xmm4, (64-19) /* XMM4 = W[t-2]<<25 */; \ + add h, tmp0 + +#define SHA512_2Sched_2Round_avx_PART2(t, a, b, c, d, e, f, g, h) \ + vpxor xmm0, xmm0, xmm4 /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<25 */; \ + mov T1, f; \ + vpxor xmm0, xmm0, xmm2 /* XMM0 = s1(W[t-2]) */; \ + mov tmp0, e; \ + xor T1, g; \ + vpaddq xmm0, xmm0, [W_t(t-16)] /* XMM0 = s1(W[t-2]) + W[t-16] */; \ + vmovdqu xmm1, [W_t(t- 7)] /* XMM1 = W[t-7] */; \ + RORQ( tmp0, 23) /* 41 */; \ + and T1, e; \ + xor tmp0, e; \ + xor T1, g; \ + vpsllq 
xmm5, xmm5, (64-8) /* XMM5 = W[t-15]<<56 */; \ + add T1, [WK_2(t+1)]; \ + vpxor xmm6, xmm6, xmm5 /* XMM6 = s0(W[t-15]) */; \ + RORQ( tmp0, 4) /* 18 */; \ + vpaddq xmm0, xmm0, xmm6 /* XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) */; \ + xor tmp0, e; \ + vpaddq xmm0, xmm0, xmm1 /* XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */; \ + mov T2, a; \ + add T1, h; \ + RORQ( tmp0, 14) /* 14 */; \ + add T1, tmp0; \ + vmovdqa [W_t(t)], xmm0 /* Store W[t] */; \ + vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \ + vmovdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] for next rounds */; \ + mov tmp0, a; \ + xor T2, c; \ + and tmp0, c; \ + and T2, b; \ + xor T2, tmp0; \ + mov tmp0, a; \ + RORQ( tmp0, 5) /* 39 */; \ + xor tmp0, a; \ + add d, T1; \ + RORQ( tmp0, 6) /* 34 */; \ + xor tmp0, a; \ + lea h, [T1 + T2]; \ + RORQ( tmp0, 28) /* 28 */; \ + add h, tmp0 + +#define SHA512_2Sched_2Round_avx(t, a, b, c, d, e, f, g, h) \ + SHA512_2Sched_2Round_avx_PART1(t, a, b, c, d, e, f, g, h); \ + SHA512_2Sched_2Round_avx_PART2(t, h, a, b, c, d, e, f, g) /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -295,37 +284,77 @@ _gcry_sha512_transform_amd64_avx: mov g_64, [DIGEST(6)] mov h_64, [DIGEST(7)] - t = 0 - .rept 80/2 + 1 - /* (80 rounds) / (2 rounds/iteration) + (1 iteration) */ - /* +1 iteration because the scheduler leads hashing by 1 iteration */ - .if t < 2 - /* BSWAP 2 QWORDS */ - vmovdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP] - vmovdqu xmm0, [MSG(t)] - vpshufb xmm0, xmm0, xmm1 /* BSWAP */ - vmovdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ - vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */ - vmovdqa [WK_2(t)], xmm0 /* Store into WK for rounds */ - .elseif t < 16 - /* BSWAP 2 QWORDS, Compute 2 Rounds */ - vmovdqu xmm0, [MSG(t)] - vpshufb xmm0, xmm0, xmm1 /* BSWAP */ - SHA512_Round (t - 2) /* Round t-2 */ - vmovdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ - vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */ - SHA512_Round (t - 1) /* Round t-1 */ - vmovdqa [WK_2(t)], xmm0 /* W[t]+K[t] into WK */ - .elseif t < 79 - /* Schedule 2 QWORDS; Compute 2 Rounds */ - SHA512_2Sched_2Round_avx t - .else - /* Compute 2 Rounds */ - SHA512_Round (t - 2) - SHA512_Round (t - 1) - .endif - t = ((t)+2) - .endr + /* BSWAP 2 QWORDS */ + vmovdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP] + vmovdqu xmm0, [MSG(0)] + vpshufb xmm0, xmm0, xmm1 /* BSWAP */ + vmovdqa [W_t(0)], xmm0 /* Store Scheduled Pair */ + vpaddq xmm0, xmm0, [K_t(0)] /* Compute W[t]+K[t] */ + vmovdqa [WK_2(0)], xmm0 /* Store into WK for rounds */ + + #define T_2_14(t, a, b, c, d, e, f, g, h) \ + /* BSWAP 2 QWORDS, Compute 2 Rounds */; \ + vmovdqu xmm0, [MSG(t)]; \ + vpshufb xmm0, xmm0, xmm1 /* BSWAP */; \ + SHA512_Round(((t) - 2), a##_64, b##_64, c##_64, d##_64, \ + e##_64, f##_64, g##_64, h##_64); \ + vmovdqa [W_t(t)], xmm0 /* Store Scheduled Pair */; \ + vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \ + SHA512_Round(((t) - 1), h##_64, a##_64, b##_64, c##_64, \ + d##_64, e##_64, f##_64, g##_64); \ + vmovdqa [WK_2(t)], xmm0 /* W[t]+K[t] into WK */ + + #define T_16_78(t, a, b, c, d, e, f, g, h) \ + SHA512_2Sched_2Round_avx((t), a##_64, b##_64, c##_64, d##_64, \ + e##_64, f##_64, g##_64, h##_64) + + #define T_80(t, a, b, c, d, e, f, g, h) \ + /* Compute 2 Rounds */; \ + SHA512_Round((t - 2), a##_64, b##_64, c##_64, d##_64, \ + e##_64, f##_64, g##_64, h##_64); \ + SHA512_Round((t - 1), h##_64, a##_64, b##_64, c##_64, \ + d##_64, e##_64, f##_64, g##_64) + + T_2_14(2, a, b, c, d, e, f, g, h) + T_2_14(4, g, h, a, b, c, d, e, f) + T_2_14(6, e, 
f, g, h, a, b, c, d) + T_2_14(8, c, d, e, f, g, h, a, b) + T_2_14(10, a, b, c, d, e, f, g, h) + T_2_14(12, g, h, a, b, c, d, e, f) + T_2_14(14, e, f, g, h, a, b, c, d) + T_16_78(16, c, d, e, f, g, h, a, b) + T_16_78(18, a, b, c, d, e, f, g, h) + T_16_78(20, g, h, a, b, c, d, e, f) + T_16_78(22, e, f, g, h, a, b, c, d) + T_16_78(24, c, d, e, f, g, h, a, b) + T_16_78(26, a, b, c, d, e, f, g, h) + T_16_78(28, g, h, a, b, c, d, e, f) + T_16_78(30, e, f, g, h, a, b, c, d) + T_16_78(32, c, d, e, f, g, h, a, b) + T_16_78(34, a, b, c, d, e, f, g, h) + T_16_78(36, g, h, a, b, c, d, e, f) + T_16_78(38, e, f, g, h, a, b, c, d) + T_16_78(40, c, d, e, f, g, h, a, b) + T_16_78(42, a, b, c, d, e, f, g, h) + T_16_78(44, g, h, a, b, c, d, e, f) + T_16_78(46, e, f, g, h, a, b, c, d) + T_16_78(48, c, d, e, f, g, h, a, b) + T_16_78(50, a, b, c, d, e, f, g, h) + T_16_78(52, g, h, a, b, c, d, e, f) + T_16_78(54, e, f, g, h, a, b, c, d) + T_16_78(56, c, d, e, f, g, h, a, b) + T_16_78(58, a, b, c, d, e, f, g, h) + T_16_78(60, g, h, a, b, c, d, e, f) + T_16_78(62, e, f, g, h, a, b, c, d) + T_16_78(64, c, d, e, f, g, h, a, b) + T_16_78(66, a, b, c, d, e, f, g, h) + T_16_78(68, g, h, a, b, c, d, e, f) + T_16_78(70, e, f, g, h, a, b, c, d) + T_16_78(72, c, d, e, f, g, h, a, b) + T_16_78(74, a, b, c, d, e, f, g, h) + T_16_78(76, g, h, a, b, c, d, e, f) + T_16_78(78, e, f, g, h, a, b, c, d) + T_80(80, c, d, e, f, g, h, a, b) /* Update digest */ add [DIGEST(0)], a_64 @@ -357,11 +386,12 @@ _gcry_sha512_transform_amd64_avx: vzeroall /* Burn stack */ - t = 0 - .rept frame_W_size / 32 - vmovups [rsp + frame_W + (t) * 32], ymm0 - t = ((t)+1) - .endr + mov eax, 0 +.Lerase_stack: + vmovdqu [rsp + rax], ymm0 + add eax, 32 + cmp eax, frame_W_size + jne .Lerase_stack vmovdqu [rsp + frame_WK], xmm0 xor eax, eax diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S index 3b28ab6c..7f119e6c 100644 --- a/cipher/sha512-avx2-bmi2-amd64.S +++ b/cipher/sha512-avx2-bmi2-amd64.S @@ -56,46 +56,45 @@ .text /* Virtual Registers */ -Y_0 = ymm4 -Y_1 = ymm5 -Y_2 = ymm6 -Y_3 = ymm7 - -YTMP0 = ymm0 -YTMP1 = ymm1 -YTMP2 = ymm2 -YTMP3 = ymm3 -YTMP4 = ymm8 -XFER = YTMP0 - -BYTE_FLIP_MASK = ymm9 -MASK_YMM_LO = ymm10 -MASK_YMM_LOx = xmm10 - -INP = rdi /* 1st arg */ -CTX = rsi /* 2nd arg */ -NUM_BLKS = rdx /* 3rd arg */ -c = rcx -d = r8 -e = rdx -y3 = rdi - -TBL = rbp - -a = rax -b = rbx - -f = r9 -g = r10 -h = r11 -old_h = rax - -T1 = r12 -y0 = r13 -y1 = r14 -y2 = r15 - -y4 = r12 +#define Y_0 ymm4 +#define Y_1 ymm5 +#define Y_2 ymm6 +#define Y_3 ymm7 + +#define YTMP0 ymm0 +#define YTMP1 ymm1 +#define YTMP2 ymm2 +#define YTMP3 ymm3 +#define YTMP4 ymm8 +#define XFER YTMP0 + +#define BYTE_FLIP_MASK ymm9 +#define MASK_YMM_LO ymm10 +#define MASK_YMM_LOx xmm10 + +#define INP rdi /* 1st arg */ +#define CTX rsi /* 2nd arg */ +#define NUM_BLKS rdx /* 3rd arg */ +#define c rcx +#define d r8 +#define e rdx +#define y3 rdi + +#define TBL rbp + +#define a rax +#define b rbx + +#define f r9 +#define g r10 +#define h r11 + +#define T1 r12 +#define y0 r13 +#define y1 r14 +#define y2 r15 + +#define y4 r12 /* Local variables (stack frame) */ #define frame_XFER 0 @@ -116,218 +115,153 @@ y4 = r12 /* addm [mem], reg */ /* Add reg to mem using reg-mem add and store */ -.macro addm p1 p2 - add \p2, \p1 - mov \p1, \p2 -.endm +#define addm(p1, p2) \ + add p2, p1; \ + mov p1, p2; /* COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask */ /* Load ymm with mem and byte swap each dword */ -.macro COPY_YMM_AND_BSWAP p1 p2 p3 - VMOVDQ \p1, \p2 - vpshufb \p1, \p1, 
\p3 -.endm -/* rotate_Ys */ -/* Rotate values of symbols Y0...Y3 */ -.macro rotate_Ys - __Y_ = Y_0 - Y_0 = Y_1 - Y_1 = Y_2 - Y_2 = Y_3 - Y_3 = __Y_ -.endm - -/* RotateState */ -.macro RotateState - /* Rotate symbles a..h right */ - old_h = h - __TMP_ = h - h = g - g = f - f = e - e = d - d = c - c = b - b = a - a = __TMP_ -.endm +#define COPY_YMM_AND_BSWAP(p1, p2, p3) \ + VMOVDQ p1, p2; \ + vpshufb p1, p1, p3 /* %macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL */ /* YDST = {YSRC1, YSRC2} >> RVAL*8 */ -.macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL - vperm2f128 \YDST, \YSRC1, \YSRC2, 0x3 /* YDST = {YS1_LO, YS2_HI} */ - vpalignr \YDST, \YDST, \YSRC2, \RVAL /* YDST = {YDS1, YS2} >> RVAL*8 */ -.endm - -.macro ONE_ROUND_PART1 XFER - /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); - * d += h; - * h += Sum0 (a) + Maj (a, b, c); - * - * Ch(x, y, z) => ((x & y) + (~x & z)) - * Maj(x, y, z) => ((x & y) + (z & (x ^ y))) - */ - - mov y3, e - add h, [\XFER] - and y3, f - rorx y0, e, 41 - rorx y1, e, 18 +#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \ + vperm2i128 YDST, YSRC1, YSRC2, 0x3 /* YDST = {YS1_LO, YS2_HI} */; \ + vpalignr YDST, YDST, YSRC2, RVAL /* YDST = {YDS1, YS2} >> RVAL*8 */ + +#define ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h) \ + /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); \ + * d += h; \ + * h += Sum0 (a) + Maj (a, b, c); \ + * \ + * Ch(x, y, z) => ((x & y) + (~x & z)) \ + * Maj(x, y, z) => ((x & y) + (z & (x ^ y))) \ + */ \ + \ + mov y3, e; \ + add h, [XFERIN]; \ + and y3, f; \ + rorx y0, e, 41; \ + rorx y1, e, 18; \ + lea h, [h + y3]; \ + andn y3, e, g; \ + rorx T1, a, 34; \ + xor y0, y1; \ lea h, [h + y3] - andn y3, e, g - rorx T1, a, 34 - xor y0, y1 - lea h, [h + y3] -.endm -.macro ONE_ROUND_PART2 - rorx y2, a, 39 - rorx y1, e, 14 - mov y3, a - xor T1, y2 - xor y0, y1 - xor y3, b - lea h, [h + y0] - mov y0, a - rorx y2, a, 28 - add d, h - and y3, c - xor T1, y2 - lea h, [h + y3] - lea h, [h + T1] - and y0, b - lea h, [h + y0] -.endm - -.macro ONE_ROUND XFER - ONE_ROUND_PART1 \XFER - ONE_ROUND_PART2 -.endm - -.macro FOUR_ROUNDS_AND_SCHED X -/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - - /* Extract w[t-7] */ - MY_VPALIGNR YTMP0, Y_3, Y_2, 8 /* YTMP0 = W[-7] */ - /* Calculate w[t-16] + w[t-7] */ - vpaddq YTMP0, YTMP0, Y_0 /* YTMP0 = W[-7] + W[-16] */ - /* Extract w[t-15] */ - MY_VPALIGNR YTMP1, Y_1, Y_0, 8 /* YTMP1 = W[-15] */ - - /* Calculate sigma0 */ - - /* Calculate w[t-15] ror 1 */ - vpsrlq YTMP2, YTMP1, 1 - vpsllq YTMP3, YTMP1, (64-1) - vpor YTMP3, YTMP3, YTMP2 /* YTMP3 = W[-15] ror 1 */ - /* Calculate w[t-15] shr 7 */ - vpsrlq YTMP4, YTMP1, 7 /* YTMP4 = W[-15] >> 7 */ - - ONE_ROUND rsp+frame_XFER+0*8+\X*32 - RotateState - -/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - -/*;;;;;;;;;;;;;;;;;;;;;;;;; */ - - /* Calculate w[t-15] ror 8 */ - vpsrlq YTMP2, YTMP1, 8 - vpsllq YTMP1, YTMP1, (64-8) - vpor YTMP1, YTMP1, YTMP2 /* YTMP1 = W[-15] ror 8 */ - /* XOR the three components */ - vpxor YTMP3, YTMP3, YTMP4 /* YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 */ - vpxor YTMP1, YTMP3, YTMP1 /* YTMP1 = s0 */ - - - /* Add three components, w[t-16], w[t-7] and sigma0 */ - vpaddq YTMP0, YTMP0, YTMP1 /* YTMP0 = W[-16] + W[-7] + s0 */ - /* Move to appropriate lanes for calculating w[16] and w[17] */ - vperm2f128 Y_0, YTMP0, YTMP0, 0x0 /* Y_0 = W[-16] + W[-7] + s0 {BABA} */ - /* Move to appropriate lanes for calculating w[18] and w[19] */ - vpand YTMP0, YTMP0, MASK_YMM_LO /* YTMP0 = W[-16] + W[-7] + s0 {DC00} 
*/ - - /* Calculate w[16] and w[17] in both 128 bit lanes */ - - /* Calculate sigma1 for w[16] and w[17] on both 128 bit lanes */ - vperm2f128 YTMP2, Y_3, Y_3, 0x11 /* YTMP2 = W[-2] {BABA} */ - vpsrlq YTMP4, YTMP2, 6 /* YTMP4 = W[-2] >> 6 {BABA} */ - - ONE_ROUND rsp+frame_XFER+1*8+\X*32 - RotateState - -/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - -/*;;;;;;;;;;;;;;;;;;;;;;;;; */ +#define ONE_ROUND_PART2(a, b, c, d, e, f, g, h) \ + rorx y2, a, 39; \ + rorx y1, e, 14; \ + mov y3, a; \ + xor T1, y2; \ + xor y0, y1; \ + xor y3, b; \ + lea h, [h + y0]; \ + mov y0, a; \ + rorx y2, a, 28; \ + add d, h; \ + and y3, c; \ + xor T1, y2; \ + lea h, [h + y3]; \ + lea h, [h + T1]; \ + and y0, b; \ + lea h, [h + y0] - vpsrlq YTMP3, YTMP2, 19 /* YTMP3 = W[-2] >> 19 {BABA} */ - vpsllq YTMP1, YTMP2, (64-19) /* YTMP1 = W[-2] << 19 {BABA} */ - vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {BABA} */ - vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} */ - vpsrlq YTMP3, YTMP2, 61 /* YTMP3 = W[-2] >> 61 {BABA} */ - vpsllq YTMP1, YTMP2, (64-61) /* YTMP1 = W[-2] << 61 {BABA} */ - vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {BABA} */ - vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} */ - - /* Add sigma1 to the other compunents to get w[16] and w[17] */ - vpaddq Y_0, Y_0, YTMP4 /* Y_0 = {W[1], W[0], W[1], W[0]} */ - - /* Calculate sigma1 for w[18] and w[19] for upper 128 bit lane */ - vpsrlq YTMP4, Y_0, 6 /* YTMP4 = W[-2] >> 6 {DC--} */ - - ONE_ROUND rsp+frame_XFER+2*8+\X*32 - RotateState - -/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - -/*;;;;;;;;;;;;;;;;;;;;;;;;; */ - - vpsrlq YTMP3, Y_0, 19 /* YTMP3 = W[-2] >> 19 {DC--} */ - vpsllq YTMP1, Y_0, (64-19) /* YTMP1 = W[-2] << 19 {DC--} */ - vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {DC--} */ - vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} */ - vpsrlq YTMP3, Y_0, 61 /* YTMP3 = W[-2] >> 61 {DC--} */ - vpsllq YTMP1, Y_0, (64-61) /* YTMP1 = W[-2] << 61 {DC--} */ - vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {DC--} */ - vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} */ - - /* Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] */ - vpaddq YTMP2, YTMP0, YTMP4 /* YTMP2 = {W[3], W[2], --, --} */ - - /* Form w[19, w[18], w17], w[16] */ - vpblendd Y_0, Y_0, YTMP2, 0xF0 /* Y_0 = {W[3], W[2], W[1], W[0]} */ - - ONE_ROUND_PART1 rsp+frame_XFER+3*8+\X*32 - vpaddq XFER, Y_0, [TBL + (4+\X)*32] - vmovdqa [rsp + frame_XFER + \X*32], XFER - ONE_ROUND_PART2 - RotateState - rotate_Ys -.endm - -.macro DO_4ROUNDS X - -/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - - ONE_ROUND rsp+frame_XFER+0*8+\X*32 - RotateState - -/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - - ONE_ROUND rsp+frame_XFER+1*8+\X*32 - RotateState - -/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - - ONE_ROUND rsp+frame_XFER+2*8+\X*32 - RotateState - -/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - - ONE_ROUND rsp+frame_XFER+3*8+\X*32 - RotateState - -.endm +#define ONE_ROUND(XFERIN, a, b, c, d, e, f, g, h) \ + ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h); \ + ONE_ROUND_PART2(a, b, c, d, e, f, g, h) + +#define FOUR_ROUNDS_AND_SCHED(X, 
Y_0, Y_1, Y_2, Y_3, a, b, c, d, e, f, g, h) \ + /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ + /* Extract w[t-7] */; \ + MY_VPALIGNR( YTMP0, Y_3, Y_2, 8) /* YTMP0 = W[-7] */; \ + /* Calculate w[t-16] + w[t-7] */; \ + vpaddq YTMP0, YTMP0, Y_0 /* YTMP0 = W[-7] + W[-16] */; \ + /* Extract w[t-15] */; \ + MY_VPALIGNR( YTMP1, Y_1, Y_0, 8) /* YTMP1 = W[-15] */; \ + \ + /* Calculate sigma0 */; \ + \ + /* Calculate w[t-15] ror 1 */; \ + vpsrlq YTMP2, YTMP1, 1; \ + vpsllq YTMP3, YTMP1, (64-1); \ + vpor YTMP3, YTMP3, YTMP2 /* YTMP3 = W[-15] ror 1 */; \ + /* Calculate w[t-15] shr 7 */; \ + vpsrlq YTMP4, YTMP1, 7 /* YTMP4 = W[-15] >> 7 */; \ + \ + ONE_ROUND(rsp+frame_XFER+0*8+X*32, a, b, c, d, e, f, g, h); \ + \ + /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ + /* Calculate w[t-15] ror 8 */; \ + vpsrlq YTMP2, YTMP1, 8; \ + vpsllq YTMP1, YTMP1, (64-8); \ + vpor YTMP1, YTMP1, YTMP2 /* YTMP1 = W[-15] ror 8 */; \ + /* XOR the three components */; \ + vpxor YTMP3, YTMP3, YTMP4 /* YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 */; \ + vpxor YTMP1, YTMP3, YTMP1 /* YTMP1 = s0 */; \ + \ + /* Add three components, w[t-16], w[t-7] and sigma0 */; \ + vpaddq YTMP0, YTMP0, YTMP1 /* YTMP0 = W[-16] + W[-7] + s0 */; \ + /* Move to appropriate lanes for calculating w[16] and w[17] */; \ + vperm2i128 Y_0, YTMP0, YTMP0, 0x0 /* Y_0 = W[-16] + W[-7] + s0 {BABA} */; \ + /* Move to appropriate lanes for calculating w[18] and w[19] */; \ + vpand YTMP0, YTMP0, MASK_YMM_LO /* YTMP0 = W[-16] + W[-7] + s0 {DC00} */; \ + \ + /* Calculate w[16] and w[17] in both 128 bit lanes */; \ + \ + /* Calculate sigma1 for w[16] and w[17] on both 128 bit lanes */; \ + vperm2i128 YTMP2, Y_3, Y_3, 0x11 /* YTMP2 = W[-2] {BABA} */; \ + vpsrlq YTMP4, YTMP2, 6 /* YTMP4 = W[-2] >> 6 {BABA} */; \ + \ + ONE_ROUND(rsp+frame_XFER+1*8+X*32, h, a, b, c, d, e, f, g); \ + \ + /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ + vpsrlq YTMP3, YTMP2, 19 /* YTMP3 = W[-2] >> 19 {BABA} */; \ + vpsllq YTMP1, YTMP2, (64-19) /* YTMP1 = W[-2] << 19 {BABA} */; \ + vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {BABA} */; \ + vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} */; \ + vpsrlq YTMP3, YTMP2, 61 /* YTMP3 = W[-2] >> 61 {BABA} */; \ + vpsllq YTMP1, YTMP2, (64-61) /* YTMP1 = W[-2] << 61 {BABA} */; \ + vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {BABA} */; \ + vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} */; \ + \ + /* Add sigma1 to the other compunents to get w[16] and w[17] */; \ + vpaddq Y_0, Y_0, YTMP4 /* Y_0 = {W[1], W[0], W[1], W[0]} */; \ + \ + /* Calculate sigma1 for w[18] and w[19] for upper 128 bit lane */; \ + vpsrlq YTMP4, Y_0, 6 /* YTMP4 = W[-2] >> 6 {DC--} */; \ + \ + ONE_ROUND(rsp+frame_XFER+2*8+X*32, g, h, a, b, c, d, e, f); \ + \ + /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ + vpsrlq YTMP3, Y_0, 19 /* YTMP3 = W[-2] >> 19 {DC--} */; \ + vpsllq YTMP1, Y_0, (64-19) /* YTMP1 = W[-2] << 19 {DC--} */; \ + vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {DC--} */; \ + vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} */; \ + vpsrlq YTMP3, Y_0, 61 /* YTMP3 = W[-2] >> 61 {DC--} */; \ + vpsllq YTMP1, Y_0, (64-61) /* YTMP1 = W[-2] << 61 {DC--} */; \ + vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {DC--} */; \ + vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] 
ror 61) ^ (W[-2] >> 6) {DC--} */; \ + \ + /* Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] */; \ + vpaddq YTMP2, YTMP0, YTMP4 /* YTMP2 = {W[3], W[2], --, --} */; \ + \ + /* Form w[19, w[18], w17], w[16] */; \ + vpblendd Y_0, Y_0, YTMP2, 0xF0 /* Y_0 = {W[3], W[2], W[1], W[0]} */; \ + \ + ONE_ROUND_PART1(rsp+frame_XFER+3*8+X*32, f, g, h, a, b, c, d, e); \ + vpaddq XFER, Y_0, [TBL + (4+X)*32]; \ + vmovdqa [rsp + frame_XFER + X*32], XFER; \ + ONE_ROUND_PART2(f, g, h, a, b, c, d, e) + +#define DO_4ROUNDS(X, a, b, c, d, e, f, g, h) \ + ONE_ROUND(rsp+frame_XFER+0*8+X*32, a, b, c, d, e, f, g, h); \ + ONE_ROUND(rsp+frame_XFER+1*8+X*32, h, a, b, c, d, e, f, g); \ + ONE_ROUND(rsp+frame_XFER+2*8+X*32, g, h, a, b, c, d, e, f); \ + ONE_ROUND(rsp+frame_XFER+3*8+X*32, f, g, h, a, b, c, d, e) /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -390,10 +324,10 @@ _gcry_sha512_transform_amd64_avx2: lea TBL,[.LK512 ADD_RIP] /*; byte swap first 16 dwords */ - COPY_YMM_AND_BSWAP Y_0, [INP + 0*32], BYTE_FLIP_MASK - COPY_YMM_AND_BSWAP Y_1, [INP + 1*32], BYTE_FLIP_MASK - COPY_YMM_AND_BSWAP Y_2, [INP + 2*32], BYTE_FLIP_MASK - COPY_YMM_AND_BSWAP Y_3, [INP + 3*32], BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP(Y_0, [INP + 0*32], BYTE_FLIP_MASK) + COPY_YMM_AND_BSWAP(Y_1, [INP + 1*32], BYTE_FLIP_MASK) + COPY_YMM_AND_BSWAP(Y_2, [INP + 2*32], BYTE_FLIP_MASK) + COPY_YMM_AND_BSWAP(Y_3, [INP + 3*32], BYTE_FLIP_MASK) add INP, 128 mov [rsp + frame_INP], INP @@ -408,20 +342,20 @@ _gcry_sha512_transform_amd64_avx2: vmovdqa [rsp + frame_XFER + 3*32], XFER /*; schedule 64 input dwords, by doing 12 rounds of 4 each */ - movq [rsp + frame_SRND],4 + mov qword ptr [rsp + frame_SRND], 4 .align 16 .Loop0: - FOUR_ROUNDS_AND_SCHED 0 - FOUR_ROUNDS_AND_SCHED 1 - FOUR_ROUNDS_AND_SCHED 2 - FOUR_ROUNDS_AND_SCHED 3 + FOUR_ROUNDS_AND_SCHED(0, Y_0, Y_1, Y_2, Y_3, a, b, c, d, e, f, g, h) + FOUR_ROUNDS_AND_SCHED(1, Y_1, Y_2, Y_3, Y_0, e, f, g, h, a, b, c, d) + FOUR_ROUNDS_AND_SCHED(2, Y_2, Y_3, Y_0, Y_1, a, b, c, d, e, f, g, h) + FOUR_ROUNDS_AND_SCHED(3, Y_3, Y_0, Y_1, Y_2, e, f, g, h, a, b, c, d) add TBL, 4*32 - subq [rsp + frame_SRND], 1 + sub qword ptr [rsp + frame_SRND], 1 jne .Loop0 - subq [rsp + frame_NBLKS], 1 + sub qword ptr [rsp + frame_NBLKS], 1 je .Ldone_hash mov INP, [rsp + frame_INP] @@ -429,62 +363,62 @@ _gcry_sha512_transform_amd64_avx2: lea TBL,[.LK512 ADD_RIP] /* load next block and byte swap */ - COPY_YMM_AND_BSWAP Y_0, [INP + 0*32], BYTE_FLIP_MASK - COPY_YMM_AND_BSWAP Y_1, [INP + 1*32], BYTE_FLIP_MASK - COPY_YMM_AND_BSWAP Y_2, [INP + 2*32], BYTE_FLIP_MASK - COPY_YMM_AND_BSWAP Y_3, [INP + 3*32], BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP(Y_0, [INP + 0*32], BYTE_FLIP_MASK) + COPY_YMM_AND_BSWAP(Y_1, [INP + 1*32], BYTE_FLIP_MASK) + COPY_YMM_AND_BSWAP(Y_2, [INP + 2*32], BYTE_FLIP_MASK) + COPY_YMM_AND_BSWAP(Y_3, [INP + 3*32], BYTE_FLIP_MASK) add INP, 128 mov [rsp + frame_INP], INP - DO_4ROUNDS 0 + DO_4ROUNDS(0, a, b, c, d, e, f, g, h) vpaddq XFER, Y_0, [TBL + 0*32] vmovdqa [rsp + frame_XFER + 0*32], XFER - DO_4ROUNDS 1 + DO_4ROUNDS(1, e, f, g, h, a, b, c, d) vpaddq XFER, Y_1, [TBL + 1*32] vmovdqa [rsp + frame_XFER + 1*32], XFER - DO_4ROUNDS 2 + DO_4ROUNDS(2, a, b, c, d, e, f, g, h) vpaddq XFER, Y_2, [TBL + 2*32] vmovdqa [rsp + frame_XFER + 2*32], XFER - DO_4ROUNDS 3 + DO_4ROUNDS(3, e, f, g, h, a, b, c, d) vpaddq XFER, Y_3, [TBL + 3*32] vmovdqa [rsp + frame_XFER + 3*32], XFER - addm [8*0 + CTX],a - addm [8*1 + CTX],b - addm [8*2 + CTX],c - addm [8*3 + CTX],d - addm 
[8*4 + CTX],e - addm [8*5 + CTX],f - addm [8*6 + CTX],g - addm [8*7 + CTX],h + addm([8*0 + CTX],a) + addm([8*1 + CTX],b) + addm([8*2 + CTX],c) + addm([8*3 + CTX],d) + addm([8*4 + CTX],e) + addm([8*5 + CTX],f) + addm([8*6 + CTX],g) + addm([8*7 + CTX],h) /*; schedule 64 input dwords, by doing 12 rounds of 4 each */ - movq [rsp + frame_SRND],4 + mov qword ptr [rsp + frame_SRND],4 jmp .Loop0 .Ldone_hash: vzeroall - DO_4ROUNDS 0 + DO_4ROUNDS(0, a, b, c, d, e, f, g, h) vmovdqa [rsp + frame_XFER + 0*32], ymm0 /* burn stack */ - DO_4ROUNDS 1 + DO_4ROUNDS(1, e, f, g, h, a, b, c, d) vmovdqa [rsp + frame_XFER + 1*32], ymm0 /* burn stack */ - DO_4ROUNDS 2 + DO_4ROUNDS(2, a, b, c, d, e, f, g, h) vmovdqa [rsp + frame_XFER + 2*32], ymm0 /* burn stack */ - DO_4ROUNDS 3 + DO_4ROUNDS(3, e, f, g, h, a, b, c, d) vmovdqa [rsp + frame_XFER + 3*32], ymm0 /* burn stack */ - addm [8*0 + CTX],a + addm([8*0 + CTX],a) xor eax, eax /* burn stack */ - addm [8*1 + CTX],b - addm [8*2 + CTX],c - addm [8*3 + CTX],d - addm [8*4 + CTX],e - addm [8*5 + CTX],f - addm [8*6 + CTX],g - addm [8*7 + CTX],h + addm([8*1 + CTX],b) + addm([8*2 + CTX],c) + addm([8*3 + CTX],d) + addm([8*4 + CTX],e) + addm([8*5 + CTX],f) + addm([8*6 + CTX],g) + addm([8*7 + CTX],h) /* Restore GPRs */ mov rbp, [rsp + frame_GPRSAVE + 8 * 0] diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S index 39bfe362..6a1328a6 100644 --- a/cipher/sha512-ssse3-amd64.S +++ b/cipher/sha512-ssse3-amd64.S @@ -56,32 +56,32 @@ .text /* Virtual Registers */ -msg = rdi /* ARG1 */ -digest = rsi /* ARG2 */ -msglen = rdx /* ARG3 */ -T1 = rcx -T2 = r8 -a_64 = r9 -b_64 = r10 -c_64 = r11 -d_64 = r12 -e_64 = r13 -f_64 = r14 -g_64 = r15 -h_64 = rbx -tmp0 = rax +#define msg rdi /* ARG1 */ +#define digest rsi /* ARG2 */ +#define msglen rdx /* ARG3 */ +#define T1 rcx +#define T2 r8 +#define a_64 r9 +#define b_64 r10 +#define c_64 r11 +#define d_64 r12 +#define e_64 r13 +#define f_64 r14 +#define g_64 r15 +#define h_64 rbx +#define tmp0 rax /* ; Local variables (stack frame) ; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP */ -frame_W = 0 /* Message Schedule */ -frame_W_size = (80 * 8) -frame_WK = ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */ -frame_WK_size = (2 * 8) -frame_GPRSAVE = ((frame_WK) + (frame_WK_size)) -frame_GPRSAVE_size = (5 * 8) -frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size)) +#define frame_W 0 /* Message Schedule */ +#define frame_W_size (80 * 8) +#define frame_WK ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */ +#define frame_WK_size (2 * 8) +#define frame_GPRSAVE ((frame_WK) + (frame_WK_size)) +#define frame_GPRSAVE_size (5 * 8) +#define frame_size ((frame_GPRSAVE) + (frame_GPRSAVE_size)) /* Useful QWORD "arrays" for simpler memory references */ @@ -93,161 +93,151 @@ frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size)) /* MSG, DIGEST, K_t, W_t are arrays */ /* WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even */ -.macro RotateState - /* Rotate symbles a..h right */ - __TMP = h_64 - h_64 = g_64 - g_64 = f_64 - f_64 = e_64 - e_64 = d_64 - d_64 = c_64 - c_64 = b_64 - b_64 = a_64 - a_64 = __TMP -.endm - -.macro SHA512_Round t - /* Compute Round %%t */ - mov T1, f_64 /* T1 = f */ - mov tmp0, e_64 /* tmp = e */ - xor T1, g_64 /* T1 = f ^ g */ - ror tmp0, 23 /* 41 ; tmp = e ror 23 */ - and T1, e_64 /* T1 = (f ^ g) & e */ - xor tmp0, e_64 /* tmp = (e ror 23) ^ e */ - xor T1, g_64 /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */ - add T1, [WK_2(\t)] /* W[t] + K[t] from message 
scheduler */ - ror tmp0, 4 /* 18 ; tmp = ((e ror 23) ^ e) ror 4 */ - xor tmp0, e_64 /* tmp = (((e ror 23) ^ e) ror 4) ^ e */ - mov T2, a_64 /* T2 = a */ - add T1, h_64 /* T1 = CH(e,f,g) + W[t] + K[t] + h */ - ror tmp0, 14 /* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */ - add T1, tmp0 /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */ - mov tmp0, a_64 /* tmp = a */ - xor T2, c_64 /* T2 = a ^ c */ - and tmp0, c_64 /* tmp = a & c */ - and T2, b_64 /* T2 = (a ^ c) & b */ - xor T2, tmp0 /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */ - mov tmp0, a_64 /* tmp = a */ - ror tmp0, 5 /* 39 ; tmp = a ror 5 */ - xor tmp0, a_64 /* tmp = (a ror 5) ^ a */ - add d_64, T1 /* e(next_state) = d + T1 */ - ror tmp0, 6 /* 34 ; tmp = ((a ror 5) ^ a) ror 6 */ - xor tmp0, a_64 /* tmp = (((a ror 5) ^ a) ror 6) ^ a */ - lea h_64, [T1 + T2] /* a(next_state) = T1 + Maj(a,b,c) */ - ror tmp0, 28 /* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */ - add h_64, tmp0 /* a(next_state) = T1 + Maj(a,b,c) S0(a) */ - RotateState -.endm - -.macro SHA512_2Sched_2Round_sse t -/* ; Compute rounds %%t-2 and %%t-1 - ; Compute message schedule QWORDS %%t and %%t+1 - - ; Two rounds are computed based on the values for K[t-2]+W[t-2] and - ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message - ; scheduler. - ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. - ; They are then added to their respective SHA512 constants at - ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] - ; For brievity, the comments following vectored instructions only refer to - ; the first of a pair of QWORDS. - ; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} - ; The computation of the message schedule and the rounds are tightly - ; stitched to take advantage of instruction-level parallelism. - ; For clarity, integer instructions (for the rounds calculation) are indented - ; by one tab. Vectored instructions (for the message scheduler) are indented - ; by two tabs. 
*/ - - mov T1, f_64 - movdqa xmm2, [W_t(\t-2)] /* XMM2 = W[t-2] */ - xor T1, g_64 - and T1, e_64 - movdqa xmm0, xmm2 /* XMM0 = W[t-2] */ - xor T1, g_64 - add T1, [WK_2(\t)] - movdqu xmm5, [W_t(\t-15)] /* XMM5 = W[t-15] */ - mov tmp0, e_64 - ror tmp0, 23 /* 41 */ - movdqa xmm3, xmm5 /* XMM3 = W[t-15] */ - xor tmp0, e_64 - ror tmp0, 4 /* 18 */ - psrlq xmm0, 61 - 19 /* XMM0 = W[t-2] >> 42 */ - xor tmp0, e_64 - ror tmp0, 14 /* 14 */ - psrlq xmm3, (8 - 7) /* XMM3 = W[t-15] >> 1 */ - add T1, tmp0 - add T1, h_64 - pxor xmm0, xmm2 /* XMM0 = (W[t-2] >> 42) ^ W[t-2] */ - mov T2, a_64 - xor T2, c_64 - pxor xmm3, xmm5 /* XMM3 = (W[t-15] >> 1) ^ W[t-15] */ - and T2, b_64 - mov tmp0, a_64 - psrlq xmm0, 19 - 6 /* XMM0 = ((W[t-2]>>42)^W[t-2])>>13 */ - and tmp0, c_64 - xor T2, tmp0 - psrlq xmm3, (7 - 1) /* XMM3 = ((W[t-15]>>1)^W[t-15])>>6 */ - mov tmp0, a_64 - ror tmp0, 5 /* 39 */ - pxor xmm0, xmm2 /* XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] */ - xor tmp0, a_64 - ror tmp0, 6 /* 34 */ - pxor xmm3, xmm5 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] */ - xor tmp0, a_64 - ror tmp0, 28 /* 28 */ - psrlq xmm0, 6 /* XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 */ - add T2, tmp0 - add d_64, T1 - psrlq xmm3, 1 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 */ - lea h_64, [T1 + T2] - RotateState - movdqa xmm1, xmm2 /* XMM1 = W[t-2] */ - mov T1, f_64 - xor T1, g_64 - movdqa xmm4, xmm5 /* XMM4 = W[t-15] */ - and T1, e_64 - xor T1, g_64 - psllq xmm1, (64 - 19) - (64 - 61) /* XMM1 = W[t-2] << 42 */ - add T1, [WK_2(\t+1)] - mov tmp0, e_64 - psllq xmm4, (64 - 1) - (64 - 8) /* XMM4 = W[t-15] << 7 */ - ror tmp0, 23 /* 41 */ - xor tmp0, e_64 - pxor xmm1, xmm2 /* XMM1 = (W[t-2] << 42)^W[t-2] */ - ror tmp0, 4 /* 18 */ - xor tmp0, e_64 - pxor xmm4, xmm5 /* XMM4 = (W[t-15]<<7)^W[t-15] */ - ror tmp0, 14 /* 14 */ - add T1, tmp0 - psllq xmm1, (64 - 61) /* XMM1 = ((W[t-2] << 42)^W[t-2])<<3 */ - add T1, h_64 - mov T2, a_64 - psllq xmm4, (64 - 8) /* XMM4 = ((W[t-15]<<7)^W[t-15])<<56 */ - xor T2, c_64 - and T2, b_64 - pxor xmm0, xmm1 /* XMM0 = s1(W[t-2]) */ - mov tmp0, a_64 - and tmp0, c_64 - movdqu xmm1, [W_t(\t- 7)] /* XMM1 = W[t-7] */ - xor T2, tmp0 - pxor xmm3, xmm4 /* XMM3 = s0(W[t-15]) */ - mov tmp0, a_64 - paddq xmm0, xmm3 /* XMM0 = s1(W[t-2]) + s0(W[t-15]) */ - ror tmp0, 5 /* 39 */ - paddq xmm0, [W_t(\t-16)] /* XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] */ - xor tmp0, a_64 - paddq xmm0, xmm1 /* XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */ - ror tmp0, 6 /* 34 */ - movdqa [W_t(\t)], xmm0 /* Store scheduled qwords */ - xor tmp0, a_64 - paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */ - ror tmp0, 28 /* 28 */ - movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] for next rounds */ - add T2, tmp0 - add d_64, T1 - lea h_64, [T1 + T2] - RotateState -.endm +#define SHA512_Round(t, a, b, c, d, e, f, g, h) \ + /* Compute Round %%t */; \ + mov T1, f /* T1 = f */; \ + mov tmp0, e /* tmp = e */; \ + xor T1, g /* T1 = f ^ g */; \ + ror tmp0, 23 /* 41 ; tmp = e ror 23 */; \ + and T1, e /* T1 = (f ^ g) & e */; \ + xor tmp0, e /* tmp = (e ror 23) ^ e */; \ + xor T1, g /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */; \ + add T1, [WK_2(t)] /* W[t] + K[t] from message scheduler */; \ + ror tmp0, 4 /* 18 ; tmp = ((e ror 23) ^ e) ror 4 */; \ + xor tmp0, e /* tmp = (((e ror 23) ^ e) ror 4) ^ e */; \ + mov T2, a /* T2 = a */; \ + add T1, h /* T1 = CH(e,f,g) + W[t] + K[t] + h */; \ + ror tmp0, 14 /* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */; \ + add T1, tmp0 /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */; \ + mov tmp0, a /* tmp = a */; \ + xor T2, c /* T2 = 
a ^ c */; \ + and tmp0, c /* tmp = a & c */; \ + and T2, b /* T2 = (a ^ c) & b */; \ + xor T2, tmp0 /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */; \ + mov tmp0, a /* tmp = a */; \ + ror tmp0, 5 /* 39 ; tmp = a ror 5 */; \ + xor tmp0, a /* tmp = (a ror 5) ^ a */; \ + add d, T1 /* e(next_state) = d + T1 */; \ + ror tmp0, 6 /* 34 ; tmp = ((a ror 5) ^ a) ror 6 */; \ + xor tmp0, a /* tmp = (((a ror 5) ^ a) ror 6) ^ a */; \ + lea h, [T1 + T2] /* a(next_state) = T1 + Maj(a,b,c) */; \ + ror tmp0, 28 /* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */; \ + add h, tmp0 /* a(next_state) = T1 + Maj(a,b,c) S0(a) */ + +#define SHA512_2Sched_2Round_sse_PART1(t, a, b, c, d, e, f, g, h) \ + /* \ + ; Compute rounds %%t-2 and %%t-1 \ + ; Compute message schedule QWORDS %%t and %%t+1 \ + ; \ + ; Two rounds are computed based on the values for K[t-2]+W[t-2] and \ + ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message \ + ; scheduler. \ + ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. \ + ; They are then added to their respective SHA512 constants at \ + ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] \ + ; For brievity, the comments following vectored instructions only refer to \ + ; the first of a pair of QWORDS. \ + ; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} \ + ; The computation of the message schedule and the rounds are tightly \ + ; stitched to take advantage of instruction-level parallelism. \ + ; For clarity, integer instructions (for the rounds calculation) are indented \ + ; by one tab. Vectored instructions (for the message scheduler) are indented \ + ; by two tabs. \ + */ \ + \ + mov T1, f; \ + movdqa xmm2, [W_t(t-2)] /* XMM2 = W[t-2] */; \ + xor T1, g; \ + and T1, e; \ + movdqa xmm0, xmm2 /* XMM0 = W[t-2] */; \ + xor T1, g; \ + add T1, [WK_2(t)]; \ + movdqu xmm5, [W_t(t-15)] /* XMM5 = W[t-15] */; \ + mov tmp0, e; \ + ror tmp0, 23 /* 41 */; \ + movdqa xmm3, xmm5 /* XMM3 = W[t-15] */; \ + xor tmp0, e; \ + ror tmp0, 4 /* 18 */; \ + psrlq xmm0, 61 - 19 /* XMM0 = W[t-2] >> 42 */; \ + xor tmp0, e; \ + ror tmp0, 14 /* 14 */; \ + psrlq xmm3, (8 - 7) /* XMM3 = W[t-15] >> 1 */; \ + add T1, tmp0; \ + add T1, h; \ + pxor xmm0, xmm2 /* XMM0 = (W[t-2] >> 42) ^ W[t-2] */; \ + mov T2, a; \ + xor T2, c; \ + pxor xmm3, xmm5 /* XMM3 = (W[t-15] >> 1) ^ W[t-15] */; \ + and T2, b; \ + mov tmp0, a; \ + psrlq xmm0, 19 - 6 /* XMM0 = ((W[t-2]>>42)^W[t-2])>>13 */; \ + and tmp0, c; \ + xor T2, tmp0; \ + psrlq xmm3, (7 - 1) /* XMM3 = ((W[t-15]>>1)^W[t-15])>>6 */; \ + mov tmp0, a; \ + ror tmp0, 5 /* 39 */; \ + pxor xmm0, xmm2 /* XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] */; \ + xor tmp0, a; \ + ror tmp0, 6 /* 34 */; \ + pxor xmm3, xmm5 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] */; \ + xor tmp0, a; \ + ror tmp0, 28 /* 28 */; \ + psrlq xmm0, 6 /* XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 */; \ + add T2, tmp0; \ + add d, T1; \ + psrlq xmm3, 1 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 */; \ + lea h, [T1 + T2] + +#define SHA512_2Sched_2Round_sse_PART2(t, a, b, c, d, e, f, g, h) \ + movdqa xmm1, xmm2 /* XMM1 = W[t-2] */; \ + mov T1, f; \ + xor T1, g; \ + movdqa xmm4, xmm5 /* XMM4 = W[t-15] */; \ + and T1, e; \ + xor T1, g; \ + psllq xmm1, (64 - 19) - (64 - 61) /* XMM1 = W[t-2] << 42 */; \ + add T1, [WK_2(t+1)]; \ + mov tmp0, e; \ + psllq xmm4, (64 - 1) - (64 - 8) /* XMM4 = W[t-15] << 7 */; \ + ror tmp0, 23 /* 41 */; \ + xor tmp0, e; \ + pxor xmm1, xmm2 /* XMM1 = (W[t-2] << 42)^W[t-2] */; \ + ror tmp0, 4 /* 18 */; \ + xor tmp0, e; \ + pxor xmm4, xmm5 /* 
XMM4 = (W[t-15]<<7)^W[t-15] */; \ + ror tmp0, 14 /* 14 */; \ + add T1, tmp0; \ + psllq xmm1, (64 - 61) /* XMM1 = ((W[t-2] << 42)^W[t-2])<<3 */; \ + add T1, h; \ + mov T2, a; \ + psllq xmm4, (64 - 8) /* XMM4 = ((W[t-15]<<7)^W[t-15])<<56 */; \ + xor T2, c; \ + and T2, b; \ + pxor xmm0, xmm1 /* XMM0 = s1(W[t-2]) */; \ + mov tmp0, a; \ + and tmp0, c; \ + movdqu xmm1, [W_t(t- 7)] /* XMM1 = W[t-7] */; \ + xor T2, tmp0; \ + pxor xmm3, xmm4 /* XMM3 = s0(W[t-15]) */; \ + mov tmp0, a; \ + paddq xmm0, xmm3 /* XMM0 = s1(W[t-2]) + s0(W[t-15]) */; \ + ror tmp0, 5 /* 39 */; \ + paddq xmm0, [W_t(t-16)] /* XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] */; \ + xor tmp0, a; \ + paddq xmm0, xmm1 /* XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */; \ + ror tmp0, 6 /* 34 */; \ + movdqa [W_t(t)], xmm0 /* Store scheduled qwords */; \ + xor tmp0, a; \ + paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \ + ror tmp0, 28 /* 28 */; \ + movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] for next rounds */; \ + add T2, tmp0; \ + add d, T1; \ + lea h, [T1 + T2] + +#define SHA512_2Sched_2Round_sse(t, a, b, c, d, e, f, g, h) \ + SHA512_2Sched_2Round_sse_PART1(t, a, b, c, d, e, f, g, h); \ + SHA512_2Sched_2Round_sse_PART2(t, h, a, b, c, d, e, f, g) /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -295,37 +285,77 @@ _gcry_sha512_transform_amd64_ssse3: mov g_64, [DIGEST(6)] mov h_64, [DIGEST(7)] - t = 0 - .rept 80/2 + 1 - /* (80 rounds) / (2 rounds/iteration) + (1 iteration) */ - /* +1 iteration because the scheduler leads hashing by 1 iteration */ - .if t < 2 - /* BSWAP 2 QWORDS */ - movdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP] - movdqu xmm0, [MSG(t)] - pshufb xmm0, xmm1 /* BSWAP */ - movdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ - paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */ - movdqa [WK_2(t)], xmm0 /* Store into WK for rounds */ - .elseif t < 16 - /* BSWAP 2 QWORDS; Compute 2 Rounds */ - movdqu xmm0, [MSG(t)] - pshufb xmm0, xmm1 /* BSWAP */ - SHA512_Round (t - 2) /* Round t-2 */ - movdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ - paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */ - SHA512_Round (t - 1) /* Round t-1 */ - movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] into WK */ - .elseif t < 79 - /* Schedule 2 QWORDS; Compute 2 Rounds */ - SHA512_2Sched_2Round_sse t - .else - /* Compute 2 Rounds */ - SHA512_Round (t - 2) - SHA512_Round (t - 1) - .endif - t = (t)+2 - .endr + /* BSWAP 2 QWORDS */ + movdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP] + movdqu xmm0, [MSG(0)] + pshufb xmm0, xmm1 /* BSWAP */ + movdqa [W_t(0)], xmm0 /* Store Scheduled Pair */ + paddq xmm0, [K_t(0)] /* Compute W[t]+K[t] */ + movdqa [WK_2(0)], xmm0 /* Store into WK for rounds */ + + #define T_2_14(t, a, b, c, d, e, f, g, h) \ + /* BSWAP 2 QWORDS; Compute 2 Rounds */; \ + movdqu xmm0, [MSG(t)]; \ + pshufb xmm0, xmm1 /* BSWAP */; \ + SHA512_Round(((t) - 2), a##_64, b##_64, c##_64, d##_64, \ + e##_64, f##_64, g##_64, h##_64); \ + movdqa [W_t(t)], xmm0 /* Store Scheduled Pair */; \ + paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \ + SHA512_Round(((t) - 1), h##_64, a##_64, b##_64, c##_64, \ + d##_64, e##_64, f##_64, g##_64); \ + movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] into WK */ + + #define T_16_78(t, a, b, c, d, e, f, g, h) \ + SHA512_2Sched_2Round_sse((t), a##_64, b##_64, c##_64, d##_64, \ + e##_64, f##_64, g##_64, h##_64) + + #define T_80(t, a, b, c, d, e, f, g, h) \ + /* Compute 2 Rounds */; \ + SHA512_Round((t - 2), a##_64, b##_64, c##_64, d##_64, \ + e##_64, f##_64, g##_64, h##_64); \ + SHA512_Round((t - 1), h##_64, a##_64, b##_64, c##_64, \ + 
d##_64, e##_64, f##_64, g##_64) + + T_2_14(2, a, b, c, d, e, f, g, h) + T_2_14(4, g, h, a, b, c, d, e, f) + T_2_14(6, e, f, g, h, a, b, c, d) + T_2_14(8, c, d, e, f, g, h, a, b) + T_2_14(10, a, b, c, d, e, f, g, h) + T_2_14(12, g, h, a, b, c, d, e, f) + T_2_14(14, e, f, g, h, a, b, c, d) + T_16_78(16, c, d, e, f, g, h, a, b) + T_16_78(18, a, b, c, d, e, f, g, h) + T_16_78(20, g, h, a, b, c, d, e, f) + T_16_78(22, e, f, g, h, a, b, c, d) + T_16_78(24, c, d, e, f, g, h, a, b) + T_16_78(26, a, b, c, d, e, f, g, h) + T_16_78(28, g, h, a, b, c, d, e, f) + T_16_78(30, e, f, g, h, a, b, c, d) + T_16_78(32, c, d, e, f, g, h, a, b) + T_16_78(34, a, b, c, d, e, f, g, h) + T_16_78(36, g, h, a, b, c, d, e, f) + T_16_78(38, e, f, g, h, a, b, c, d) + T_16_78(40, c, d, e, f, g, h, a, b) + T_16_78(42, a, b, c, d, e, f, g, h) + T_16_78(44, g, h, a, b, c, d, e, f) + T_16_78(46, e, f, g, h, a, b, c, d) + T_16_78(48, c, d, e, f, g, h, a, b) + T_16_78(50, a, b, c, d, e, f, g, h) + T_16_78(52, g, h, a, b, c, d, e, f) + T_16_78(54, e, f, g, h, a, b, c, d) + T_16_78(56, c, d, e, f, g, h, a, b) + T_16_78(58, a, b, c, d, e, f, g, h) + T_16_78(60, g, h, a, b, c, d, e, f) + T_16_78(62, e, f, g, h, a, b, c, d) + T_16_78(64, c, d, e, f, g, h, a, b) + T_16_78(66, a, b, c, d, e, f, g, h) + T_16_78(68, g, h, a, b, c, d, e, f) + T_16_78(70, e, f, g, h, a, b, c, d) + T_16_78(72, c, d, e, f, g, h, a, b) + T_16_78(74, a, b, c, d, e, f, g, h) + T_16_78(76, g, h, a, b, c, d, e, f) + T_16_78(78, e, f, g, h, a, b, c, d) + T_80(80, c, d, e, f, g, h, a, b) /* Update digest */ add [DIGEST(0)], a_64 @@ -362,11 +392,12 @@ _gcry_sha512_transform_amd64_ssse3: pxor xmm5, xmm5 /* Burn stack */ - t = 0 - .rept frame_W_size / 16 - movdqu [rsp + frame_W + (t) * 16], xmm0 - t = ((t)+1) - .endr + mov eax, 0 +.Lerase_stack: + movdqu [rsp + rax], xmm0 + add eax, 16 + cmp eax, frame_W_size + jne .Lerase_stack movdqu [rsp + frame_WK], xmm0 xor eax, eax diff --git a/configure.ac b/configure.ac index f7339a3e..e4a10b78 100644 --- a/configure.ac +++ b/configure.ac @@ -1741,21 +1741,11 @@ AC_CACHE_CHECK([whether GCC assembler is compatible for Intel syntax assembly im ".text\n\t" "actest:\n\t" "pxor xmm1, xmm7;\n\t" - /* Intel syntax implementation also use GAS macros, so check - * for them here. */ - "VAL_A = xmm4\n\t" - "VAL_B = xmm2\n\t" - ".macro SET_VAL_A p1\n\t" - " VAL_A = \\\\p1 \n\t" - ".endm\n\t" - ".macro SET_VAL_B p1\n\t" - " VAL_B = \\\\p1 \n\t" - ".endm\n\t" - "vmovdqa VAL_A, VAL_B;\n\t" - "SET_VAL_A eax\n\t" - "SET_VAL_B ebp\n\t" - "add VAL_A, VAL_B;\n\t" - "add VAL_B, 0b10101;\n\t" + "vperm2i128 ymm2, ymm3, ymm0, 1;\n\t" + "add eax, ebp;\n\t" + "rorx eax, ebp, 1;\n\t" + "sub eax, [esp + 4];\n\t" + "add dword ptr [esp + eax], 0b10101;\n\t" ".att_syntax prefix\n\t" );]], [ actest(); ])], [gcry_cv_gcc_platform_as_ok_for_intel_syntax=yes]) |
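
The conversion pattern above is the same everywhere in the patch: GNU as `.macro` bodies could reassign symbols between invocations (the removed RotateState/rotate_Xs/rotate_Ys helpers), which the C preprocessor cannot express, so each call site now names the rotated register list explicitly, and the permutation returns to the identity after eight rounds. Below is a minimal, compilable C toy of that call-site rotation; DO_ROUND here only prints the role assignment, and apart from the argument orders (copied from the sha256-ssse3 .Loop2 body) all names are illustrative, not from the patch:

    #include <stdio.h>

    /* Toy DO_ROUND: the real macro emits one round of assembly; this
     * one only reports which register plays the 'a' and 'h' roles, to
     * show that permuting the argument list replaces the old
     * RotateState symbol shuffling. */
    #define DO_ROUND(t, a, b, c, d, e, f, g, h) \
      printf("round %d: a=%s ... h=%s\n", (t), #a, #h)

    int main(void)
    {
      DO_ROUND(0, a, b, c, d, e, f, g, h);
      DO_ROUND(1, h, a, b, c, d, e, f, g);
      DO_ROUND(2, g, h, a, b, c, d, e, f);
      DO_ROUND(3, f, g, h, a, b, c, d, e);
      /* The next group continues with e, f, g, h, a, b, c, d and is
       * back to a..h after eight rounds, so the unrolled loop body
       * never changes between iterations. */
      return 0;
    }

The configure.ac hunk follows the same logic: with no GAS macros left to support, the Intel-syntax probe now only exercises constructs the implementations actually use (vperm2i128, rorx, dword ptr memory operands).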
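
For checking the round macros' comments, here is one SHA-512 round as plain C, using exactly the Ch/Maj forms and nested-rotate Sum chains the assembly claims. This is a minimal model under the usual FIPS 180-4 definitions; ror64 and sha512_round are illustrative names, not from the patch:

    #include <stdint.h>

    static inline uint64_t ror64(uint64_t x, unsigned n)
    {
      /* RORQ in the patch is "shld p1, p1, (64 - n)": shifting a
       * register left while filling from a copy of itself rotates
       * left by 64-n, i.e. rotates right by n ("shld is faster than
       * ror on Intel Sandybridge"). */
      return (x >> n) | (x << (64 - n));
    }

    /* One round; state s[0..7] = a..h, wk = W[t] + K[t] as stored at
     * WK_2 by the message scheduler. */
    static void sha512_round(uint64_t s[8], uint64_t wk)
    {
      uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
      uint64_t e = s[4], f = s[5], g = s[6], h = s[7];

      uint64_t ch   = ((f ^ g) & e) ^ g;        /* Ch(e,f,g)  */
      uint64_t maj  = ((a ^ c) & b) ^ (a & c);  /* Maj(a,b,c) */
      /* ror(ror(ror(e,23)^e,4)^e,14) == e ror 14 ^ e ror 18 ^ e ror 41 */
      uint64_t sum1 = ror64(ror64(ror64(e, 23) ^ e, 4) ^ e, 14);
      /* ror(ror(ror(a,5)^a,6)^a,28)  == a ror 28 ^ a ror 34 ^ a ror 39 */
      uint64_t sum0 = ror64(ror64(ror64(a, 5) ^ a, 6) ^ a, 28);

      uint64_t t1 = h + sum1 + ch + wk;
      uint64_t t2 = sum0 + maj;

      /* State rotation: done in the macros by permuting the register
       * arguments at each call site rather than by moving data. */
      s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
      s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
    }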
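
The `.rept`-based stack wipe had to be rewritten as well, since the preprocessor has no counted repetition; the new `.Lerase_stack` runtime loop is equivalent. Sketched in C under the patch's frame layout (frame_W and frame_W_size come from the patch; burn_w_schedule is an illustrative name; the store width is 16 bytes in the SSSE3/AVX versions, 32 in AVX2):

    #include <string.h>
    #include <stdint.h>

    #define frame_W      0
    #define frame_W_size (80 * 8)

    /* C model of .Lerase_stack: overwrite the whole 640-byte message
     * schedule on the stack before returning, so no message-derived
     * data survives the call.  xmm0/ymm0 hold zero at that point
     * (pxor/vzeroall), so the assembly stores zeros as memset does
     * here. */
    static void burn_w_schedule(uint8_t *rsp)
    {
      unsigned off;
      for (off = 0; off < frame_W_size; off += 16)
        memset(rsp + frame_W + off, 0, 16);
    }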