diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2021-01-20 21:55:01 +0200 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2021-01-26 19:41:58 +0200 |
commit | 9f49e806f9506533236fd44b17f17b85961b20f1 (patch) | |
tree | cd6b3ad4996c8a76200831fc3a661bdfe6da98fe /cipher/sha512-ssse3-amd64.S | |
parent | 393bd6c3d1aa2b2a1b05be0e2d7fb2514e6c5ad0 (diff) | |
download | libgcrypt-9f49e806f9506533236fd44b17f17b85961b20f1.tar.gz |
sha512/sha256: remove assembler macros from AMD64 implementations
* configure.ac (gcry_cv_gcc_platform_as_ok_for_intel_syntax): Remove
assembler macro check from Intel syntax assembly support check.
* cipher/sha256-avx-amd64.S: Replace assembler macros with C
preprocessor counterparts.
* cipher/sha256-avx2-bmi2-amd64.S: Ditto.
* cipher/sha256-ssse3-amd64.S: Ditto.
* cipher/sha512-avx-amd64.S: Ditto.
* cipher/sha512-avx2-bmi2-amd64.S: Ditto.
* cipher/sha512-ssse3-amd64.S: Ditto.
--
Removing GNU assembler macros allows building these implementations with
clang.
GnuPG-bug-id: 5255
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/sha512-ssse3-amd64.S')
-rw-r--r-- | cipher/sha512-ssse3-amd64.S | 455 |
1 file changed, 243 insertions, 212 deletions
diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S index 39bfe362..6a1328a6 100644 --- a/cipher/sha512-ssse3-amd64.S +++ b/cipher/sha512-ssse3-amd64.S @@ -56,32 +56,32 @@ .text /* Virtual Registers */ -msg = rdi /* ARG1 */ -digest = rsi /* ARG2 */ -msglen = rdx /* ARG3 */ -T1 = rcx -T2 = r8 -a_64 = r9 -b_64 = r10 -c_64 = r11 -d_64 = r12 -e_64 = r13 -f_64 = r14 -g_64 = r15 -h_64 = rbx -tmp0 = rax +#define msg rdi /* ARG1 */ +#define digest rsi /* ARG2 */ +#define msglen rdx /* ARG3 */ +#define T1 rcx +#define T2 r8 +#define a_64 r9 +#define b_64 r10 +#define c_64 r11 +#define d_64 r12 +#define e_64 r13 +#define f_64 r14 +#define g_64 r15 +#define h_64 rbx +#define tmp0 rax /* ; Local variables (stack frame) ; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP */ -frame_W = 0 /* Message Schedule */ -frame_W_size = (80 * 8) -frame_WK = ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */ -frame_WK_size = (2 * 8) -frame_GPRSAVE = ((frame_WK) + (frame_WK_size)) -frame_GPRSAVE_size = (5 * 8) -frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size)) +#define frame_W 0 /* Message Schedule */ +#define frame_W_size (80 * 8) +#define frame_WK ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */ +#define frame_WK_size (2 * 8) +#define frame_GPRSAVE ((frame_WK) + (frame_WK_size)) +#define frame_GPRSAVE_size (5 * 8) +#define frame_size ((frame_GPRSAVE) + (frame_GPRSAVE_size)) /* Useful QWORD "arrays" for simpler memory references */ @@ -93,161 +93,151 @@ frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size)) /* MSG, DIGEST, K_t, W_t are arrays */ /* WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even */ -.macro RotateState - /* Rotate symbles a..h right */ - __TMP = h_64 - h_64 = g_64 - g_64 = f_64 - f_64 = e_64 - e_64 = d_64 - d_64 = c_64 - c_64 = b_64 - b_64 = a_64 - a_64 = __TMP -.endm - -.macro SHA512_Round t - /* Compute Round %%t */ - mov T1, f_64 /* T1 = f */ - mov tmp0, e_64 /* tmp = e */ - 
xor T1, g_64 /* T1 = f ^ g */ - ror tmp0, 23 /* 41 ; tmp = e ror 23 */ - and T1, e_64 /* T1 = (f ^ g) & e */ - xor tmp0, e_64 /* tmp = (e ror 23) ^ e */ - xor T1, g_64 /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */ - add T1, [WK_2(\t)] /* W[t] + K[t] from message scheduler */ - ror tmp0, 4 /* 18 ; tmp = ((e ror 23) ^ e) ror 4 */ - xor tmp0, e_64 /* tmp = (((e ror 23) ^ e) ror 4) ^ e */ - mov T2, a_64 /* T2 = a */ - add T1, h_64 /* T1 = CH(e,f,g) + W[t] + K[t] + h */ - ror tmp0, 14 /* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */ - add T1, tmp0 /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */ - mov tmp0, a_64 /* tmp = a */ - xor T2, c_64 /* T2 = a ^ c */ - and tmp0, c_64 /* tmp = a & c */ - and T2, b_64 /* T2 = (a ^ c) & b */ - xor T2, tmp0 /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */ - mov tmp0, a_64 /* tmp = a */ - ror tmp0, 5 /* 39 ; tmp = a ror 5 */ - xor tmp0, a_64 /* tmp = (a ror 5) ^ a */ - add d_64, T1 /* e(next_state) = d + T1 */ - ror tmp0, 6 /* 34 ; tmp = ((a ror 5) ^ a) ror 6 */ - xor tmp0, a_64 /* tmp = (((a ror 5) ^ a) ror 6) ^ a */ - lea h_64, [T1 + T2] /* a(next_state) = T1 + Maj(a,b,c) */ - ror tmp0, 28 /* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */ - add h_64, tmp0 /* a(next_state) = T1 + Maj(a,b,c) S0(a) */ - RotateState -.endm - -.macro SHA512_2Sched_2Round_sse t -/* ; Compute rounds %%t-2 and %%t-1 - ; Compute message schedule QWORDS %%t and %%t+1 - - ; Two rounds are computed based on the values for K[t-2]+W[t-2] and - ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message - ; scheduler. - ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. - ; They are then added to their respective SHA512 constants at - ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] - ; For brievity, the comments following vectored instructions only refer to - ; the first of a pair of QWORDS. - ; Eg. 
XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} - ; The computation of the message schedule and the rounds are tightly - ; stitched to take advantage of instruction-level parallelism. - ; For clarity, integer instructions (for the rounds calculation) are indented - ; by one tab. Vectored instructions (for the message scheduler) are indented - ; by two tabs. */ - - mov T1, f_64 - movdqa xmm2, [W_t(\t-2)] /* XMM2 = W[t-2] */ - xor T1, g_64 - and T1, e_64 - movdqa xmm0, xmm2 /* XMM0 = W[t-2] */ - xor T1, g_64 - add T1, [WK_2(\t)] - movdqu xmm5, [W_t(\t-15)] /* XMM5 = W[t-15] */ - mov tmp0, e_64 - ror tmp0, 23 /* 41 */ - movdqa xmm3, xmm5 /* XMM3 = W[t-15] */ - xor tmp0, e_64 - ror tmp0, 4 /* 18 */ - psrlq xmm0, 61 - 19 /* XMM0 = W[t-2] >> 42 */ - xor tmp0, e_64 - ror tmp0, 14 /* 14 */ - psrlq xmm3, (8 - 7) /* XMM3 = W[t-15] >> 1 */ - add T1, tmp0 - add T1, h_64 - pxor xmm0, xmm2 /* XMM0 = (W[t-2] >> 42) ^ W[t-2] */ - mov T2, a_64 - xor T2, c_64 - pxor xmm3, xmm5 /* XMM3 = (W[t-15] >> 1) ^ W[t-15] */ - and T2, b_64 - mov tmp0, a_64 - psrlq xmm0, 19 - 6 /* XMM0 = ((W[t-2]>>42)^W[t-2])>>13 */ - and tmp0, c_64 - xor T2, tmp0 - psrlq xmm3, (7 - 1) /* XMM3 = ((W[t-15]>>1)^W[t-15])>>6 */ - mov tmp0, a_64 - ror tmp0, 5 /* 39 */ - pxor xmm0, xmm2 /* XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] */ - xor tmp0, a_64 - ror tmp0, 6 /* 34 */ - pxor xmm3, xmm5 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] */ - xor tmp0, a_64 - ror tmp0, 28 /* 28 */ - psrlq xmm0, 6 /* XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 */ - add T2, tmp0 - add d_64, T1 - psrlq xmm3, 1 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 */ - lea h_64, [T1 + T2] - RotateState - movdqa xmm1, xmm2 /* XMM1 = W[t-2] */ - mov T1, f_64 - xor T1, g_64 - movdqa xmm4, xmm5 /* XMM4 = W[t-15] */ - and T1, e_64 - xor T1, g_64 - psllq xmm1, (64 - 19) - (64 - 61) /* XMM1 = W[t-2] << 42 */ - add T1, [WK_2(\t+1)] - mov tmp0, e_64 - psllq xmm4, (64 - 1) - (64 - 8) /* XMM4 = W[t-15] << 7 */ - ror tmp0, 23 /* 41 */ - xor tmp0, e_64 - 
pxor xmm1, xmm2 /* XMM1 = (W[t-2] << 42)^W[t-2] */ - ror tmp0, 4 /* 18 */ - xor tmp0, e_64 - pxor xmm4, xmm5 /* XMM4 = (W[t-15]<<7)^W[t-15] */ - ror tmp0, 14 /* 14 */ - add T1, tmp0 - psllq xmm1, (64 - 61) /* XMM1 = ((W[t-2] << 42)^W[t-2])<<3 */ - add T1, h_64 - mov T2, a_64 - psllq xmm4, (64 - 8) /* XMM4 = ((W[t-15]<<7)^W[t-15])<<56 */ - xor T2, c_64 - and T2, b_64 - pxor xmm0, xmm1 /* XMM0 = s1(W[t-2]) */ - mov tmp0, a_64 - and tmp0, c_64 - movdqu xmm1, [W_t(\t- 7)] /* XMM1 = W[t-7] */ - xor T2, tmp0 - pxor xmm3, xmm4 /* XMM3 = s0(W[t-15]) */ - mov tmp0, a_64 - paddq xmm0, xmm3 /* XMM0 = s1(W[t-2]) + s0(W[t-15]) */ - ror tmp0, 5 /* 39 */ - paddq xmm0, [W_t(\t-16)] /* XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] */ - xor tmp0, a_64 - paddq xmm0, xmm1 /* XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */ - ror tmp0, 6 /* 34 */ - movdqa [W_t(\t)], xmm0 /* Store scheduled qwords */ - xor tmp0, a_64 - paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */ - ror tmp0, 28 /* 28 */ - movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] for next rounds */ - add T2, tmp0 - add d_64, T1 - lea h_64, [T1 + T2] - RotateState -.endm +#define SHA512_Round(t, a, b, c, d, e, f, g, h) \ + /* Compute Round %%t */; \ + mov T1, f /* T1 = f */; \ + mov tmp0, e /* tmp = e */; \ + xor T1, g /* T1 = f ^ g */; \ + ror tmp0, 23 /* 41 ; tmp = e ror 23 */; \ + and T1, e /* T1 = (f ^ g) & e */; \ + xor tmp0, e /* tmp = (e ror 23) ^ e */; \ + xor T1, g /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */; \ + add T1, [WK_2(t)] /* W[t] + K[t] from message scheduler */; \ + ror tmp0, 4 /* 18 ; tmp = ((e ror 23) ^ e) ror 4 */; \ + xor tmp0, e /* tmp = (((e ror 23) ^ e) ror 4) ^ e */; \ + mov T2, a /* T2 = a */; \ + add T1, h /* T1 = CH(e,f,g) + W[t] + K[t] + h */; \ + ror tmp0, 14 /* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */; \ + add T1, tmp0 /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */; \ + mov tmp0, a /* tmp = a */; \ + xor T2, c /* T2 = a ^ c */; \ + and tmp0, c /* tmp = a & c */; \ + and T2, b /* T2 = (a ^ c) & b */; \ 
+ xor T2, tmp0 /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */; \ + mov tmp0, a /* tmp = a */; \ + ror tmp0, 5 /* 39 ; tmp = a ror 5 */; \ + xor tmp0, a /* tmp = (a ror 5) ^ a */; \ + add d, T1 /* e(next_state) = d + T1 */; \ + ror tmp0, 6 /* 34 ; tmp = ((a ror 5) ^ a) ror 6 */; \ + xor tmp0, a /* tmp = (((a ror 5) ^ a) ror 6) ^ a */; \ + lea h, [T1 + T2] /* a(next_state) = T1 + Maj(a,b,c) */; \ + ror tmp0, 28 /* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */; \ + add h, tmp0 /* a(next_state) = T1 + Maj(a,b,c) S0(a) */ + +#define SHA512_2Sched_2Round_sse_PART1(t, a, b, c, d, e, f, g, h) \ + /* \ + ; Compute rounds %%t-2 and %%t-1 \ + ; Compute message schedule QWORDS %%t and %%t+1 \ + ; \ + ; Two rounds are computed based on the values for K[t-2]+W[t-2] and \ + ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message \ + ; scheduler. \ + ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. \ + ; They are then added to their respective SHA512 constants at \ + ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] \ + ; For brievity, the comments following vectored instructions only refer to \ + ; the first of a pair of QWORDS. \ + ; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} \ + ; The computation of the message schedule and the rounds are tightly \ + ; stitched to take advantage of instruction-level parallelism. \ + ; For clarity, integer instructions (for the rounds calculation) are indented \ + ; by one tab. Vectored instructions (for the message scheduler) are indented \ + ; by two tabs. 
\ + */ \ + \ + mov T1, f; \ + movdqa xmm2, [W_t(t-2)] /* XMM2 = W[t-2] */; \ + xor T1, g; \ + and T1, e; \ + movdqa xmm0, xmm2 /* XMM0 = W[t-2] */; \ + xor T1, g; \ + add T1, [WK_2(t)]; \ + movdqu xmm5, [W_t(t-15)] /* XMM5 = W[t-15] */; \ + mov tmp0, e; \ + ror tmp0, 23 /* 41 */; \ + movdqa xmm3, xmm5 /* XMM3 = W[t-15] */; \ + xor tmp0, e; \ + ror tmp0, 4 /* 18 */; \ + psrlq xmm0, 61 - 19 /* XMM0 = W[t-2] >> 42 */; \ + xor tmp0, e; \ + ror tmp0, 14 /* 14 */; \ + psrlq xmm3, (8 - 7) /* XMM3 = W[t-15] >> 1 */; \ + add T1, tmp0; \ + add T1, h; \ + pxor xmm0, xmm2 /* XMM0 = (W[t-2] >> 42) ^ W[t-2] */; \ + mov T2, a; \ + xor T2, c; \ + pxor xmm3, xmm5 /* XMM3 = (W[t-15] >> 1) ^ W[t-15] */; \ + and T2, b; \ + mov tmp0, a; \ + psrlq xmm0, 19 - 6 /* XMM0 = ((W[t-2]>>42)^W[t-2])>>13 */; \ + and tmp0, c; \ + xor T2, tmp0; \ + psrlq xmm3, (7 - 1) /* XMM3 = ((W[t-15]>>1)^W[t-15])>>6 */; \ + mov tmp0, a; \ + ror tmp0, 5 /* 39 */; \ + pxor xmm0, xmm2 /* XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] */; \ + xor tmp0, a; \ + ror tmp0, 6 /* 34 */; \ + pxor xmm3, xmm5 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] */; \ + xor tmp0, a; \ + ror tmp0, 28 /* 28 */; \ + psrlq xmm0, 6 /* XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 */; \ + add T2, tmp0; \ + add d, T1; \ + psrlq xmm3, 1 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 */; \ + lea h, [T1 + T2] + +#define SHA512_2Sched_2Round_sse_PART2(t, a, b, c, d, e, f, g, h) \ + movdqa xmm1, xmm2 /* XMM1 = W[t-2] */; \ + mov T1, f; \ + xor T1, g; \ + movdqa xmm4, xmm5 /* XMM4 = W[t-15] */; \ + and T1, e; \ + xor T1, g; \ + psllq xmm1, (64 - 19) - (64 - 61) /* XMM1 = W[t-2] << 42 */; \ + add T1, [WK_2(t+1)]; \ + mov tmp0, e; \ + psllq xmm4, (64 - 1) - (64 - 8) /* XMM4 = W[t-15] << 7 */; \ + ror tmp0, 23 /* 41 */; \ + xor tmp0, e; \ + pxor xmm1, xmm2 /* XMM1 = (W[t-2] << 42)^W[t-2] */; \ + ror tmp0, 4 /* 18 */; \ + xor tmp0, e; \ + pxor xmm4, xmm5 /* XMM4 = (W[t-15]<<7)^W[t-15] */; \ + ror tmp0, 14 /* 14 */; \ + add T1, tmp0; \ + psllq xmm1, 
(64 - 61) /* XMM1 = ((W[t-2] << 42)^W[t-2])<<3 */; \ + add T1, h; \ + mov T2, a; \ + psllq xmm4, (64 - 8) /* XMM4 = ((W[t-15]<<7)^W[t-15])<<56 */; \ + xor T2, c; \ + and T2, b; \ + pxor xmm0, xmm1 /* XMM0 = s1(W[t-2]) */; \ + mov tmp0, a; \ + and tmp0, c; \ + movdqu xmm1, [W_t(t- 7)] /* XMM1 = W[t-7] */; \ + xor T2, tmp0; \ + pxor xmm3, xmm4 /* XMM3 = s0(W[t-15]) */; \ + mov tmp0, a; \ + paddq xmm0, xmm3 /* XMM0 = s1(W[t-2]) + s0(W[t-15]) */; \ + ror tmp0, 5 /* 39 */; \ + paddq xmm0, [W_t(t-16)] /* XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] */; \ + xor tmp0, a; \ + paddq xmm0, xmm1 /* XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */; \ + ror tmp0, 6 /* 34 */; \ + movdqa [W_t(t)], xmm0 /* Store scheduled qwords */; \ + xor tmp0, a; \ + paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \ + ror tmp0, 28 /* 28 */; \ + movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] for next rounds */; \ + add T2, tmp0; \ + add d, T1; \ + lea h, [T1 + T2] + +#define SHA512_2Sched_2Round_sse(t, a, b, c, d, e, f, g, h) \ + SHA512_2Sched_2Round_sse_PART1(t, a, b, c, d, e, f, g, h); \ + SHA512_2Sched_2Round_sse_PART2(t, h, a, b, c, d, e, f, g) /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -295,37 +285,77 @@ _gcry_sha512_transform_amd64_ssse3: mov g_64, [DIGEST(6)] mov h_64, [DIGEST(7)] - t = 0 - .rept 80/2 + 1 - /* (80 rounds) / (2 rounds/iteration) + (1 iteration) */ - /* +1 iteration because the scheduler leads hashing by 1 iteration */ - .if t < 2 - /* BSWAP 2 QWORDS */ - movdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP] - movdqu xmm0, [MSG(t)] - pshufb xmm0, xmm1 /* BSWAP */ - movdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ - paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */ - movdqa [WK_2(t)], xmm0 /* Store into WK for rounds */ - .elseif t < 16 - /* BSWAP 2 QWORDS; Compute 2 Rounds */ - movdqu xmm0, [MSG(t)] - pshufb xmm0, xmm1 /* BSWAP */ - SHA512_Round (t - 2) /* Round t-2 */ - movdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ - paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] 
*/ - SHA512_Round (t - 1) /* Round t-1 */ - movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] into WK */ - .elseif t < 79 - /* Schedule 2 QWORDS; Compute 2 Rounds */ - SHA512_2Sched_2Round_sse t - .else - /* Compute 2 Rounds */ - SHA512_Round (t - 2) - SHA512_Round (t - 1) - .endif - t = (t)+2 - .endr + /* BSWAP 2 QWORDS */ + movdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP] + movdqu xmm0, [MSG(0)] + pshufb xmm0, xmm1 /* BSWAP */ + movdqa [W_t(0)], xmm0 /* Store Scheduled Pair */ + paddq xmm0, [K_t(0)] /* Compute W[t]+K[t] */ + movdqa [WK_2(0)], xmm0 /* Store into WK for rounds */ + + #define T_2_14(t, a, b, c, d, e, f, g, h) \ + /* BSWAP 2 QWORDS; Compute 2 Rounds */; \ + movdqu xmm0, [MSG(t)]; \ + pshufb xmm0, xmm1 /* BSWAP */; \ + SHA512_Round(((t) - 2), a##_64, b##_64, c##_64, d##_64, \ + e##_64, f##_64, g##_64, h##_64); \ + movdqa [W_t(t)], xmm0 /* Store Scheduled Pair */; \ + paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \ + SHA512_Round(((t) - 1), h##_64, a##_64, b##_64, c##_64, \ + d##_64, e##_64, f##_64, g##_64); \ + movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] into WK */ + + #define T_16_78(t, a, b, c, d, e, f, g, h) \ + SHA512_2Sched_2Round_sse((t), a##_64, b##_64, c##_64, d##_64, \ + e##_64, f##_64, g##_64, h##_64) + + #define T_80(t, a, b, c, d, e, f, g, h) \ + /* Compute 2 Rounds */; \ + SHA512_Round((t - 2), a##_64, b##_64, c##_64, d##_64, \ + e##_64, f##_64, g##_64, h##_64); \ + SHA512_Round((t - 1), h##_64, a##_64, b##_64, c##_64, \ + d##_64, e##_64, f##_64, g##_64) + + T_2_14(2, a, b, c, d, e, f, g, h) + T_2_14(4, g, h, a, b, c, d, e, f) + T_2_14(6, e, f, g, h, a, b, c, d) + T_2_14(8, c, d, e, f, g, h, a, b) + T_2_14(10, a, b, c, d, e, f, g, h) + T_2_14(12, g, h, a, b, c, d, e, f) + T_2_14(14, e, f, g, h, a, b, c, d) + T_16_78(16, c, d, e, f, g, h, a, b) + T_16_78(18, a, b, c, d, e, f, g, h) + T_16_78(20, g, h, a, b, c, d, e, f) + T_16_78(22, e, f, g, h, a, b, c, d) + T_16_78(24, c, d, e, f, g, h, a, b) + T_16_78(26, a, b, c, d, e, f, g, h) + T_16_78(28, g, h, a, b, 
c, d, e, f) + T_16_78(30, e, f, g, h, a, b, c, d) + T_16_78(32, c, d, e, f, g, h, a, b) + T_16_78(34, a, b, c, d, e, f, g, h) + T_16_78(36, g, h, a, b, c, d, e, f) + T_16_78(38, e, f, g, h, a, b, c, d) + T_16_78(40, c, d, e, f, g, h, a, b) + T_16_78(42, a, b, c, d, e, f, g, h) + T_16_78(44, g, h, a, b, c, d, e, f) + T_16_78(46, e, f, g, h, a, b, c, d) + T_16_78(48, c, d, e, f, g, h, a, b) + T_16_78(50, a, b, c, d, e, f, g, h) + T_16_78(52, g, h, a, b, c, d, e, f) + T_16_78(54, e, f, g, h, a, b, c, d) + T_16_78(56, c, d, e, f, g, h, a, b) + T_16_78(58, a, b, c, d, e, f, g, h) + T_16_78(60, g, h, a, b, c, d, e, f) + T_16_78(62, e, f, g, h, a, b, c, d) + T_16_78(64, c, d, e, f, g, h, a, b) + T_16_78(66, a, b, c, d, e, f, g, h) + T_16_78(68, g, h, a, b, c, d, e, f) + T_16_78(70, e, f, g, h, a, b, c, d) + T_16_78(72, c, d, e, f, g, h, a, b) + T_16_78(74, a, b, c, d, e, f, g, h) + T_16_78(76, g, h, a, b, c, d, e, f) + T_16_78(78, e, f, g, h, a, b, c, d) + T_80(80, c, d, e, f, g, h, a, b) /* Update digest */ add [DIGEST(0)], a_64 @@ -362,11 +392,12 @@ _gcry_sha512_transform_amd64_ssse3: pxor xmm5, xmm5 /* Burn stack */ - t = 0 - .rept frame_W_size / 16 - movdqu [rsp + frame_W + (t) * 16], xmm0 - t = ((t)+1) - .endr + mov eax, 0 +.Lerase_stack: + movdqu [rsp + rax], xmm0 + add eax, 16 + cmp eax, frame_W_size + jne .Lerase_stack movdqu [rsp + frame_WK], xmm0 xor eax, eax |