Diffstat (limited to 'cipher/sha512-avx-amd64.S')
-rw-r--r--  cipher/sha512-avx-amd64.S | 456
1 file changed, 243 insertions(+), 213 deletions(-)
diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S
index 991fd639..75f7b070 100644
--- a/cipher/sha512-avx-amd64.S
+++ b/cipher/sha512-avx-amd64.S
@@ -53,32 +53,32 @@
 .text
 
 /* Virtual Registers */
-msg = rdi /* ARG1 */
-digest = rsi /* ARG2 */
-msglen = rdx /* ARG3 */
-T1 = rcx
-T2 = r8
-a_64 = r9
-b_64 = r10
-c_64 = r11
-d_64 = r12
-e_64 = r13
-f_64 = r14
-g_64 = r15
-h_64 = rbx
-tmp0 = rax
+#define msg rdi /* ARG1 */
+#define digest rsi /* ARG2 */
+#define msglen rdx /* ARG3 */
+#define T1 rcx
+#define T2 r8
+#define a_64 r9
+#define b_64 r10
+#define c_64 r11
+#define d_64 r12
+#define e_64 r13
+#define f_64 r14
+#define g_64 r15
+#define h_64 rbx
+#define tmp0 rax
 
 /* ; Local variables (stack frame)
 ; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP */
-frame_W = 0 /* Message Schedule */
-frame_W_size = (80 * 8)
-frame_WK = ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */
-frame_WK_size = (2 * 8)
-frame_GPRSAVE = ((frame_WK) + (frame_WK_size))
-frame_GPRSAVE_size = (5 * 8)
-frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size))
+#define frame_W 0 /* Message Schedule */
+#define frame_W_size (80 * 8)
+#define frame_WK ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */
+#define frame_WK_size (2 * 8)
+#define frame_GPRSAVE ((frame_WK) + (frame_WK_size))
+#define frame_GPRSAVE_size (5 * 8)
+#define frame_size ((frame_GPRSAVE) + (frame_GPRSAVE_size))
 
 /* Useful QWORD "arrays" for simpler memory references */
@@ -90,162 +90,151 @@ frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size))
 /* MSG, DIGEST, K_t, W_t are arrays */
 /* WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even */
 
-.macro RotateState
-	/* Rotate symbles a..h right */
-	__TMP = h_64
-	h_64 = g_64
-	g_64 = f_64
-	f_64 = e_64
-	e_64 = d_64
-	d_64 = c_64
-	c_64 = b_64
-	b_64 = a_64
-	a_64 = __TMP
-.endm
-
-.macro RORQ p1 p2
-	/* shld is faster than ror on Intel Sandybridge */
-	shld	\p1, \p1, (64 - \p2)
-.endm
-
-.macro SHA512_Round t
-	/* Compute Round %%t */
-	mov	T1, f_64	/* T1 = f */
-	mov	tmp0, e_64	/* tmp = e */
-	xor	T1, g_64	/* T1 = f ^ g */
-	RORQ	tmp0, 23	/* 41 ; tmp = e ror 23 */
-	and	T1, e_64	/* T1 = (f ^ g) & e */
-	xor	tmp0, e_64	/* tmp = (e ror 23) ^ e */
-	xor	T1, g_64	/* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */
-	add	T1, [WK_2(\t)]	/* W[t] + K[t] from message scheduler */
-	RORQ	tmp0, 4		/* 18 ; tmp = ((e ror 23) ^ e) ror 4 */
-	xor	tmp0, e_64	/* tmp = (((e ror 23) ^ e) ror 4) ^ e */
-	mov	T2, a_64	/* T2 = a */
-	add	T1, h_64	/* T1 = CH(e,f,g) + W[t] + K[t] + h */
-	RORQ	tmp0, 14	/* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */
-	add	T1, tmp0	/* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */
-	mov	tmp0, a_64	/* tmp = a */
-	xor	T2, c_64	/* T2 = a ^ c */
-	and	tmp0, c_64	/* tmp = a & c */
-	and	T2, b_64	/* T2 = (a ^ c) & b */
-	xor	T2, tmp0	/* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */
-	mov	tmp0, a_64	/* tmp = a */
-	RORQ	tmp0, 5		/* 39 ; tmp = a ror 5 */
-	xor	tmp0, a_64	/* tmp = (a ror 5) ^ a */
-	add	d_64, T1	/* e(next_state) = d + T1 */
-	RORQ	tmp0, 6		/* 34 ; tmp = ((a ror 5) ^ a) ror 6 */
-	xor	tmp0, a_64	/* tmp = (((a ror 5) ^ a) ror 6) ^ a */
-	lea	h_64, [T1 + T2]	/* a(next_state) = T1 + Maj(a,b,c) */
-	RORQ	tmp0, 28	/* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */
-	add	h_64, tmp0	/* a(next_state) = T1 + Maj(a,b,c) S0(a) */
-	RotateState
-.endm
-
-.macro SHA512_2Sched_2Round_avx t
-/* ;	Compute rounds %%t-2 and %%t-1
-   ;	Compute message schedule QWORDS %%t and %%t+1
-   ;
-   ;	Two rounds are computed based on the values for K[t-2]+W[t-2] and
-   ;	K[t-1]+W[t-1] which were previously stored at WK_2 by the message
-   ;	scheduler.
-   ;	The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
-   ;	They are then added to their respective SHA512 constants at
-   ;	[K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
-   ;	For brievity, the comments following vectored instructions only refer to
-   ;	the first of a pair of QWORDS.
-   ;	Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
-   ;	The computation of the message schedule and the rounds are tightly
-   ;	stitched to take advantage of instruction-level parallelism.
-   ;	For clarity, integer instructions (for the rounds calculation) are indented
-   ;	by one tab. Vectored instructions (for the message scheduler) are indented
-   ;	by two tabs. */
-
-	vmovdqa	xmm4, [W_t(\t-2)]	/* XMM4 = W[t-2] */
-	vmovdqu	xmm5, [W_t(\t-15)]	/* XMM5 = W[t-15] */
-	mov	T1, f_64
-	vpsrlq	xmm0, xmm4, 61		/* XMM0 = W[t-2]>>61 */
-	mov	tmp0, e_64
-	vpsrlq	xmm6, xmm5, 1		/* XMM6 = W[t-15]>>1 */
-	xor	T1, g_64
-	RORQ	tmp0, 23		/* 41 */
-	vpsrlq	xmm1, xmm4, 19		/* XMM1 = W[t-2]>>19 */
-	and	T1, e_64
-	xor	tmp0, e_64
-	vpxor	xmm0, xmm0, xmm1	/* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 */
-	xor	T1, g_64
-	add	T1, [WK_2(\t)];
-	vpsrlq	xmm7, xmm5, 8		/* XMM7 = W[t-15]>>8 */
-	RORQ	tmp0, 4			/* 18 */
-	vpsrlq	xmm2, xmm4, 6		/* XMM2 = W[t-2]>>6 */
-	xor	tmp0, e_64
-	mov	T2, a_64
-	add	T1, h_64
-	vpxor	xmm6, xmm6, xmm7	/* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 */
-	RORQ	tmp0, 14		/* 14 */
-	add	T1, tmp0
-	vpsrlq	xmm8, xmm5, 7		/* XMM8 = W[t-15]>>7 */
-	mov	tmp0, a_64
-	xor	T2, c_64
-	vpsllq	xmm3, xmm4, (64-61)	/* XMM3 = W[t-2]<<3 */
-	and	tmp0, c_64
-	and	T2, b_64
-	vpxor	xmm2, xmm2, xmm3	/* XMM2 = W[t-2]>>6 ^ W[t-2]<<3 */
-	xor	T2, tmp0
-	mov	tmp0, a_64
-	vpsllq	xmm9, xmm5, (64-1)	/* XMM9 = W[t-15]<<63 */
-	RORQ	tmp0, 5			/* 39 */
-	vpxor	xmm8, xmm8, xmm9	/* XMM8 = W[t-15]>>7 ^ W[t-15]<<63 */
-	xor	tmp0, a_64
-	add	d_64, T1
-	RORQ	tmp0, 6			/* 34 */
-	xor	tmp0, a_64
-	vpxor	xmm6, xmm6, xmm8	/* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63 */
-	lea	h_64, [T1 + T2]
-	RORQ	tmp0, 28		/* 28 */
-	vpsllq	xmm4, xmm4, (64-19)	/* XMM4 = W[t-2]<<25 */
-	add	h_64, tmp0
-	RotateState
-	vpxor	xmm0, xmm0, xmm4	/* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<25 */
-	mov	T1, f_64
-	vpxor	xmm0, xmm0, xmm2	/* XMM0 = s1(W[t-2]) */
-	mov	tmp0, e_64
-	xor	T1, g_64
-	vpaddq	xmm0, xmm0, [W_t(\t-16)]	/* XMM0 = s1(W[t-2]) + W[t-16] */
-	vmovdqu	xmm1, [W_t(\t- 7)]	/* XMM1 = W[t-7] */
-	RORQ	tmp0, 23		/* 41 */
-	and	T1, e_64
-	xor	tmp0, e_64
-	xor	T1, g_64
-	vpsllq	xmm5, xmm5, (64-8)	/* XMM5 = W[t-15]<<56 */
-	add	T1, [WK_2(\t+1)]
-	vpxor	xmm6, xmm6, xmm5	/* XMM6 = s0(W[t-15]) */
-	RORQ	tmp0, 4			/* 18 */
-	vpaddq	xmm0, xmm0, xmm6	/* XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) */
-	xor	tmp0, e_64
-	vpaddq	xmm0, xmm0, xmm1	/* XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */
-	mov	T2, a_64
-	add	T1, h_64
-	RORQ	tmp0, 14		/* 14 */
-	add	T1, tmp0
-	vmovdqa	[W_t(\t)], xmm0		/* Store W[t] */
-	vpaddq	xmm0, xmm0, [K_t(t)]	/* Compute W[t]+K[t] */
-	vmovdqa	[WK_2(t)], xmm0		/* Store W[t]+K[t] for next rounds */
-	mov	tmp0, a_64
-	xor	T2, c_64
-	and	tmp0, c_64
-	and	T2, b_64
-	xor	T2, tmp0
-	mov	tmp0, a_64
-	RORQ	tmp0, 5			/* 39 */
-	xor	tmp0, a_64
-	add	d_64, T1
-	RORQ	tmp0, 6			/* 34 */
-	xor	tmp0, a_64
-	lea	h_64, [T1 + T2]
-	RORQ	tmp0, 28		/* 28 */
-	add	h_64, tmp0
-	RotateState
-.endm
+#define RORQ(p1, p2) \
+	/* shld is faster than ror on Intel Sandybridge */ \
+	shld p1, p1, (64 - p2)
+
+#define SHA512_Round(t, a, b, c, d, e, f, g, h) \
+	/* Compute Round %%t */; \
+	mov	T1, f		/* T1 = f */; \
+	mov	tmp0, e		/* tmp = e */; \
+	xor	T1, g		/* T1 = f ^ g */; \
+	RORQ(	tmp0, 23)	/* 41 ; tmp = e ror 23 */; \
+	and	T1, e		/* T1 = (f ^ g) & e */; \
+	xor	tmp0, e		/* tmp = (e ror 23) ^ e */; \
+	xor	T1, g		/* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */; \
+	add	T1, [WK_2(t)]	/* W[t] + K[t] from message scheduler */; \
+	RORQ(	tmp0, 4)	/* 18 ; tmp = ((e ror 23) ^ e) ror 4 */; \
+	xor	tmp0, e		/* tmp = (((e ror 23) ^ e) ror 4) ^ e */; \
+	mov	T2, a		/* T2 = a */; \
+	add	T1, h		/* T1 = CH(e,f,g) + W[t] + K[t] + h */; \
+	RORQ(	tmp0, 14)	/* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */; \
+	add	T1, tmp0	/* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */; \
+	mov	tmp0, a		/* tmp = a */; \
+	xor	T2, c		/* T2 = a ^ c */; \
+	and	tmp0, c		/* tmp = a & c */; \
+	and	T2, b		/* T2 = (a ^ c) & b */; \
+	xor	T2, tmp0	/* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */; \
+	mov	tmp0, a		/* tmp = a */; \
+	RORQ(	tmp0, 5)	/* 39 ; tmp = a ror 5 */; \
+	xor	tmp0, a		/* tmp = (a ror 5) ^ a */; \
+	add	d, T1		/* e(next_state) = d + T1 */; \
+	RORQ(	tmp0, 6)	/* 34 ; tmp = ((a ror 5) ^ a) ror 6 */; \
+	xor	tmp0, a		/* tmp = (((a ror 5) ^ a) ror 6) ^ a */; \
+	lea	h, [T1 + T2]	/* a(next_state) = T1 + Maj(a,b,c) */; \
+	RORQ(	tmp0, 28)	/* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */; \
+	add	h, tmp0		/* a(next_state) = T1 + Maj(a,b,c) S0(a) */
+
+#define SHA512_2Sched_2Round_avx_PART1(t, a, b, c, d, e, f, g, h) \
+	/* \
+	; Compute rounds %%t-2 and %%t-1 \
+	; Compute message schedule QWORDS %%t and %%t+1 \
+	; \
+	; Two rounds are computed based on the values for K[t-2]+W[t-2] and \
+	; K[t-1]+W[t-1] which were previously stored at WK_2 by the message \
+	; scheduler. \
+	; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. \
+	; They are then added to their respective SHA512 constants at \
+	; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] \
+	; For brievity, the comments following vectored instructions only refer to \
+	; the first of a pair of QWORDS. \
+	; Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} \
+	; The computation of the message schedule and the rounds are tightly \
+	; stitched to take advantage of instruction-level parallelism. \
+	; For clarity, integer instructions (for the rounds calculation) are indented \
+	; by one tab. Vectored instructions (for the message scheduler) are indented \
+	; by two tabs. \
+	*/ \
+	\
+	vmovdqa	xmm4, [W_t(t-2)]	/* XMM4 = W[t-2] */; \
+	vmovdqu	xmm5, [W_t(t-15)]	/* XMM5 = W[t-15] */; \
+	mov	T1, f; \
+	vpsrlq	xmm0, xmm4, 61		/* XMM0 = W[t-2]>>61 */; \
+	mov	tmp0, e; \
+	vpsrlq	xmm6, xmm5, 1		/* XMM6 = W[t-15]>>1 */; \
+	xor	T1, g; \
+	RORQ(	tmp0, 23)		/* 41 */; \
+	vpsrlq	xmm1, xmm4, 19		/* XMM1 = W[t-2]>>19 */; \
+	and	T1, e; \
+	xor	tmp0, e; \
+	vpxor	xmm0, xmm0, xmm1	/* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 */; \
+	xor	T1, g; \
+	add	T1, [WK_2(t)]; \
+	vpsrlq	xmm7, xmm5, 8		/* XMM7 = W[t-15]>>8 */; \
+	RORQ(	tmp0, 4)		/* 18 */; \
+	vpsrlq	xmm2, xmm4, 6		/* XMM2 = W[t-2]>>6 */; \
+	xor	tmp0, e; \
+	mov	T2, a; \
+	add	T1, h; \
+	vpxor	xmm6, xmm6, xmm7	/* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 */; \
+	RORQ(	tmp0, 14)		/* 14 */; \
+	add	T1, tmp0; \
+	vpsrlq	xmm8, xmm5, 7		/* XMM8 = W[t-15]>>7 */; \
+	mov	tmp0, a; \
+	xor	T2, c; \
+	vpsllq	xmm3, xmm4, (64-61)	/* XMM3 = W[t-2]<<3 */; \
+	and	tmp0, c; \
+	and	T2, b; \
+	vpxor	xmm2, xmm2, xmm3	/* XMM2 = W[t-2]>>6 ^ W[t-2]<<3 */; \
+	xor	T2, tmp0; \
+	mov	tmp0, a; \
+	vpsllq	xmm9, xmm5, (64-1)	/* XMM9 = W[t-15]<<63 */; \
+	RORQ(	tmp0, 5)		/* 39 */; \
+	vpxor	xmm8, xmm8, xmm9	/* XMM8 = W[t-15]>>7 ^ W[t-15]<<63 */; \
+	xor	tmp0, a; \
+	add	d, T1; \
+	RORQ(	tmp0, 6)		/* 34 */; \
+	xor	tmp0, a; \
+	vpxor	xmm6, xmm6, xmm8	/* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63 */; \
+	lea	h, [T1 + T2]; \
+	RORQ(	tmp0, 28)		/* 28 */; \
+	vpsllq	xmm4, xmm4, (64-19)	/* XMM4 = W[t-2]<<25 */; \
+	add	h, tmp0
+
+#define SHA512_2Sched_2Round_avx_PART2(t, a, b, c, d, e, f, g, h) \
+	vpxor	xmm0, xmm0, xmm4	/* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<25 */; \
+	mov	T1, f; \
+	vpxor	xmm0, xmm0, xmm2	/* XMM0 = s1(W[t-2]) */; \
+	mov	tmp0, e; \
+	xor	T1, g; \
+	vpaddq	xmm0, xmm0, [W_t(t-16)]	/* XMM0 = s1(W[t-2]) + W[t-16] */; \
+	vmovdqu	xmm1, [W_t(t- 7)]	/* XMM1 = W[t-7] */; \
+	RORQ(	tmp0, 23)		/* 41 */; \
+	and	T1, e; \
+	xor	tmp0, e; \
+	xor	T1, g; \
+	vpsllq	xmm5, xmm5, (64-8)	/* XMM5 = W[t-15]<<56 */; \
+	add	T1, [WK_2(t+1)]; \
+	vpxor	xmm6, xmm6, xmm5	/* XMM6 = s0(W[t-15]) */; \
+	RORQ(	tmp0, 4)		/* 18 */; \
+	vpaddq	xmm0, xmm0, xmm6	/* XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) */; \
+	xor	tmp0, e; \
+	vpaddq	xmm0, xmm0, xmm1	/* XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */; \
+	mov	T2, a; \
+	add	T1, h; \
+	RORQ(	tmp0, 14)		/* 14 */; \
+	add	T1, tmp0; \
+	vmovdqa	[W_t(t)], xmm0		/* Store W[t] */; \
+	vpaddq	xmm0, xmm0, [K_t(t)]	/* Compute W[t]+K[t] */; \
+	vmovdqa	[WK_2(t)], xmm0		/* Store W[t]+K[t] for next rounds */; \
+	mov	tmp0, a; \
+	xor	T2, c; \
+	and	tmp0, c; \
+	and	T2, b; \
+	xor	T2, tmp0; \
+	mov	tmp0, a; \
+	RORQ(	tmp0, 5)		/* 39 */; \
+	xor	tmp0, a; \
+	add	d, T1; \
+	RORQ(	tmp0, 6)		/* 34 */; \
+	xor	tmp0, a; \
+	lea	h, [T1 + T2]; \
+	RORQ(	tmp0, 28)		/* 28 */; \
+	add	h, tmp0
+
+#define SHA512_2Sched_2Round_avx(t, a, b, c, d, e, f, g, h) \
+	SHA512_2Sched_2Round_avx_PART1(t, a, b, c, d, e, f, g, h); \
+	SHA512_2Sched_2Round_avx_PART2(t, h, a, b, c, d, e, f, g)
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -295,37 +284,77 @@ _gcry_sha512_transform_amd64_avx:
 	mov	g_64, [DIGEST(6)]
 	mov	h_64, [DIGEST(7)]
 
-	t = 0
-	.rept 80/2 + 1
-	/* (80 rounds) / (2 rounds/iteration) + (1 iteration) */
-	/* +1 iteration because the scheduler leads hashing by 1 iteration */
-		.if t < 2
-			/* BSWAP 2 QWORDS */
-			vmovdqa	xmm1, [.LXMM_QWORD_BSWAP ADD_RIP]
-			vmovdqu	xmm0, [MSG(t)]
-			vpshufb	xmm0, xmm0, xmm1	/* BSWAP */
-			vmovdqa	[W_t(t)], xmm0		/* Store Scheduled Pair */
-			vpaddq	xmm0, xmm0, [K_t(t)]	/* Compute W[t]+K[t] */
-			vmovdqa	[WK_2(t)], xmm0		/* Store into WK for rounds */
-		.elseif t < 16
-			/* BSWAP 2 QWORDS, Compute 2 Rounds */
-			vmovdqu	xmm0, [MSG(t)]
-			vpshufb	xmm0, xmm0, xmm1	/* BSWAP */
-			SHA512_Round (t - 2)		/* Round t-2 */
-			vmovdqa	[W_t(t)], xmm0		/* Store Scheduled Pair */
-			vpaddq	xmm0, xmm0, [K_t(t)]	/* Compute W[t]+K[t] */
-			SHA512_Round (t - 1)		/* Round t-1 */
-			vmovdqa	[WK_2(t)], xmm0		/* W[t]+K[t] into WK */
-		.elseif t < 79
-			/* Schedule 2 QWORDS; Compute 2 Rounds */
-			SHA512_2Sched_2Round_avx t
-		.else
-			/* Compute 2 Rounds */
-			SHA512_Round (t - 2)
-			SHA512_Round (t - 1)
-		.endif
-		t = ((t)+2)
-	.endr
+	/* BSWAP 2 QWORDS */
+	vmovdqa	xmm1, [.LXMM_QWORD_BSWAP ADD_RIP]
+	vmovdqu	xmm0, [MSG(0)]
+	vpshufb	xmm0, xmm0, xmm1	/* BSWAP */
+	vmovdqa	[W_t(0)], xmm0		/* Store Scheduled Pair */
+	vpaddq	xmm0, xmm0, [K_t(0)]	/* Compute W[t]+K[t] */
+	vmovdqa	[WK_2(0)], xmm0		/* Store into WK for rounds */
+
+	#define T_2_14(t, a, b, c, d, e, f, g, h) \
+		/* BSWAP 2 QWORDS, Compute 2 Rounds */; \
+		vmovdqu	xmm0, [MSG(t)]; \
+		vpshufb	xmm0, xmm0, xmm1	/* BSWAP */; \
+		SHA512_Round(((t) - 2), a##_64, b##_64, c##_64, d##_64, \
+			     e##_64, f##_64, g##_64, h##_64); \
+		vmovdqa	[W_t(t)], xmm0		/* Store Scheduled Pair */; \
+		vpaddq	xmm0, xmm0, [K_t(t)]	/* Compute W[t]+K[t] */; \
+		SHA512_Round(((t) - 1), h##_64, a##_64, b##_64, c##_64, \
+			     d##_64, e##_64, f##_64, g##_64); \
+		vmovdqa	[WK_2(t)], xmm0		/* W[t]+K[t] into WK */
+
+	#define T_16_78(t, a, b, c, d, e, f, g, h) \
+		SHA512_2Sched_2Round_avx((t), a##_64, b##_64, c##_64, d##_64, \
+					 e##_64, f##_64, g##_64, h##_64)
+
+	#define T_80(t, a, b, c, d, e, f, g, h) \
+		/* Compute 2 Rounds */; \
+		SHA512_Round((t - 2), a##_64, b##_64, c##_64, d##_64, \
+			     e##_64, f##_64, g##_64, h##_64); \
+		SHA512_Round((t - 1), h##_64, a##_64, b##_64, c##_64, \
+			     d##_64, e##_64, f##_64, g##_64)
+
+	T_2_14(2, a, b, c, d, e, f, g, h)
+	T_2_14(4, g, h, a, b, c, d, e, f)
+	T_2_14(6, e, f, g, h, a, b, c, d)
+	T_2_14(8, c, d, e, f, g, h, a, b)
+	T_2_14(10, a, b, c, d, e, f, g, h)
+	T_2_14(12, g, h, a, b, c, d, e, f)
+	T_2_14(14, e, f, g, h, a, b, c, d)
+	T_16_78(16, c, d, e, f, g, h, a, b)
+	T_16_78(18, a, b, c, d, e, f, g, h)
+	T_16_78(20, g, h, a, b, c, d, e, f)
+	T_16_78(22, e, f, g, h, a, b, c, d)
+	T_16_78(24, c, d, e, f, g, h, a, b)
+	T_16_78(26, a, b, c, d, e, f, g, h)
+	T_16_78(28, g, h, a, b, c, d, e, f)
+	T_16_78(30, e, f, g, h, a, b, c, d)
+	T_16_78(32, c, d, e, f, g, h, a, b)
+	T_16_78(34, a, b, c, d, e, f, g, h)
+	T_16_78(36, g, h, a, b, c, d, e, f)
+	T_16_78(38, e, f, g, h, a, b, c, d)
+	T_16_78(40, c, d, e, f, g, h, a, b)
+	T_16_78(42, a, b, c, d, e, f, g, h)
+	T_16_78(44, g, h, a, b, c, d, e, f)
+	T_16_78(46, e, f, g, h, a, b, c, d)
+	T_16_78(48, c, d, e, f, g, h, a, b)
+	T_16_78(50, a, b, c, d, e, f, g, h)
+	T_16_78(52, g, h, a, b, c, d, e, f)
+	T_16_78(54, e, f, g, h, a, b, c, d)
+	T_16_78(56, c, d, e, f, g, h, a, b)
+	T_16_78(58, a, b, c, d, e, f, g, h)
+	T_16_78(60, g, h, a, b, c, d, e, f)
+	T_16_78(62, e, f, g, h, a, b, c, d)
+	T_16_78(64, c, d, e, f, g, h, a, b)
+	T_16_78(66, a, b, c, d, e, f, g, h)
+	T_16_78(68, g, h, a, b, c, d, e, f)
+	T_16_78(70, e, f, g, h, a, b, c, d)
+	T_16_78(72, c, d, e, f, g, h, a, b)
+	T_16_78(74, a, b, c, d, e, f, g, h)
+	T_16_78(76, g, h, a, b, c, d, e, f)
+	T_16_78(78, e, f, g, h, a, b, c, d)
+	T_80(80, c, d, e, f, g, h, a, b)
 
 	/* Update digest */
 	add	[DIGEST(0)], a_64
@@ -357,11 +386,12 @@ _gcry_sha512_transform_amd64_avx:
 	vzeroall
 
 	/* Burn stack */
-	t = 0
-	.rept frame_W_size / 32
-		vmovups [rsp + frame_W + (t) * 32], ymm0
-		t = ((t)+1)
-	.endr
+	mov eax, 0
+.Lerase_stack:
+	vmovdqu [rsp + rax], ymm0
+	add eax, 32
+	cmp eax, frame_W_size
+	jne .Lerase_stack
 	vmovdqu [rsp + frame_WK], xmm0
 	xor     eax, eax
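
For readers tracing the macro bodies above, the following is a minimal C sketch (not part of the patch; rotr64 and sha512_round are illustrative names) of what one SHA512_Round invocation computes, including the factorizations the assembly uses for CH, Maj, S0 and S1, and why RORQ(x, n) can be emitted as "shld x, x, (64 - n)": shifting a register into itself left by (64 - n) is a rotate left by (64 - n), which equals a rotate right by n.

	#include <stdint.h>

	/* Rotate right by n, 1 <= n <= 63; equivalent to the patch's
	 * RORQ(x, n) == "shld x, x, (64 - n)". */
	static inline uint64_t rotr64(uint64_t x, unsigned n)
	{
		return (x >> n) | (x << (64 - n));
	}

	/* One SHA-512 round plus the state rotation, mirroring the assembly:
	 *   CH(e,f,g)  = ((f ^ g) & e) ^ g
	 *   Maj(a,b,c) = ((a ^ c) & b) ^ (a & c)
	 *   S1(e) = ((((e ror 23) ^ e) ror 4) ^ e) ror 14
	 *         = (e ror 14) ^ (e ror 18) ^ (e ror 41)
	 *   S0(a) = ((((a ror 5) ^ a) ror 6) ^ a) ror 28
	 *         = (a ror 28) ^ (a ror 34) ^ (a ror 39)
	 * wk is the precomputed W[t] + K[t] that the scheduler stored at
	 * WK_2(t). */
	static void sha512_round(uint64_t s[8], uint64_t wk)
	{
		uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
		uint64_t e = s[4], f = s[5], g = s[6], h = s[7];
		uint64_t ch  = ((f ^ g) & e) ^ g;
		uint64_t maj = ((a ^ c) & b) ^ (a & c);
		uint64_t s1  = rotr64(rotr64(rotr64(e, 23) ^ e, 4) ^ e, 14);
		uint64_t s0  = rotr64(rotr64(rotr64(a, 5) ^ a, 6) ^ a, 28);
		uint64_t t1  = h + ch + wk + s1;

		/* The removed RotateState macro renamed assembler symbols to
		 * rotate the state; the new code instead permutes the a..h
		 * arguments at every T_2_14/T_16_78/T_80 call site. */
		s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
		s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + maj + s0;
	}

This also explains the unrolled call list: each macro invocation runs two rounds, so consecutive calls pass the a..h names rotated by two positions instead of relying on symbol reassignment, which plain #define macros cannot do.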
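The vector half of SHA512_2Sched_2Round_avx builds the message schedule two qwords at a time from shift/xor chains. As a reference (again an illustrative sketch, not the patch's code), the per-qword recurrence those vpsrlq/vpsllq/vpxor sequences implement is the standard SHA-512 schedule; note that 64-1 = 63 and 64-8 = 56 match the "<<63"/"<<56" comments, while the (64-19) shift is an immediate of 45 even though the inherited source comment labels it "<<25".

	#include <stdint.h>

	static inline uint64_t rotr64(uint64_t x, unsigned n)
	{
		return (x >> n) | (x << (64 - n));
	}

	/* Schedule qword W[t] for 16 <= t < 80, as computed by PART1/PART2:
	 *   s0(x) = (x ror 1)  ^ (x ror 8)  ^ (x >> 7)
	 *   s1(x) = (x ror 19) ^ (x ror 61) ^ (x >> 6)
	 *   W[t]  = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
	 * The rotates are assembled from paired right and left shifts, e.g.
	 * (x >> 1) ^ (x << 63) == x ror 1, since AVX has no 64-bit rotate. */
	static uint64_t sha512_schedule_w(const uint64_t w[80], unsigned t)
	{
		uint64_t s0 = rotr64(w[t - 15], 1) ^ rotr64(w[t - 15], 8)
			      ^ (w[t - 15] >> 7);
		uint64_t s1 = rotr64(w[t - 2], 19) ^ rotr64(w[t - 2], 61)
			      ^ (w[t - 2] >> 6);
		return s1 + w[t - 7] + s0 + w[t - 16];
	}

The assembly then adds K[t] immediately and parks the pair at WK_2(t), which is why the round code above only ever consumes a single precomputed wk value.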