diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2019-04-05 17:37:42 +0300 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2019-04-05 17:57:45 +0300 |
commit | f3d4bd90662faaedd37ce0dae1f9e7f91748e91e (patch) | |
tree | eb006660288eecc568a0c9594bd81797a7b1b4f4 /cipher | |
parent | b982900bfe6403e95a157271d8d811c9c573af9e (diff) | |
download | libgcrypt-f3d4bd90662faaedd37ce0dae1f9e7f91748e91e.tar.gz |
Burn stack in transform functions for SHA1 AMD64 implementations
* cipher/sha1-avx-amd64.S: Burn stack inside transform functions.
* cipher/sha1-avx-bmi2-amd64.S: Ditto.
* cipher/sha1-avx2-bmi2-amd64.S: Ditto.
* cipher/sha1-ssse3-amd64.S: Ditto.
--
This change reduces per-call overhead for SHA1.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher')
-rw-r--r-- | cipher/sha1-avx-amd64.S | 15 | ||||
-rw-r--r-- | cipher/sha1-avx-bmi2-amd64.S | 15 | ||||
-rw-r--r-- | cipher/sha1-avx2-bmi2-amd64.S | 47 | ||||
-rw-r--r-- | cipher/sha1-ssse3-amd64.S | 15 |
4 files changed, 52 insertions, 40 deletions
diff --git a/cipher/sha1-avx-amd64.S b/cipher/sha1-avx-amd64.S
index 143e4066..5d674c15 100644
--- a/cipher/sha1-avx-amd64.S
+++ b/cipher/sha1-avx-amd64.S
@@ -380,7 +380,7 @@ _gcry_sha1_transform_amd64_avx:
 .Lend:
   vzeroall;
 
-  /* Transform 64-79. */
+  /* Transform 64-79 + burn stack */
   R( b, c, d, e, a, F4, 64 );
   R( a, b, c, d, e, F4, 65 );
   R( e, a, b, c, d, F4, 66 );
@@ -393,12 +393,15 @@ _gcry_sha1_transform_amd64_avx:
   R( c, d, e, a, b, F4, 73 );
   R( b, c, d, e, a, F4, 74 );
   R( a, b, c, d, e, F4, 75 );
-  R( e, a, b, c, d, F4, 76 );
-  R( d, e, a, b, c, F4, 77 );
-  R( c, d, e, a, b, F4, 78 );
+  R( e, a, b, c, d, F4, 76 ); vmovdqa %xmm0, (0*16)(%rsp);
+  R( d, e, a, b, c, F4, 77 ); vmovdqa %xmm0, (1*16)(%rsp);
+  R( c, d, e, a, b, F4, 78 ); vmovdqa %xmm0, (2*16)(%rsp);
   addl state_h0(RSTATE), a;
   R( b, c, d, e, a, F4, 79 );
 
+  /* 16*4/16-1 = 3 */
+  vmovdqa %xmm0, (3*16)(%rsp);
+
   /* Update the chaining variables. */
   addl state_h3(RSTATE), d;
   addl state_h2(RSTATE), c;
@@ -416,8 +419,8 @@ _gcry_sha1_transform_amd64_avx:
   popq %rbp;
   popq %rbx;
 
-  /* burn_stack */
-  movl $(16*4 + 2*8 + 31), %eax;
+  /* stack already burned */
+  xorl %eax, %eax;
 
 .Lret:
   ret;
diff --git a/cipher/sha1-avx-bmi2-amd64.S b/cipher/sha1-avx-bmi2-amd64.S
index 79ea24ef..fe8901ef 100644
--- a/cipher/sha1-avx-bmi2-amd64.S
+++ b/cipher/sha1-avx-bmi2-amd64.S
@@ -387,7 +387,7 @@ _gcry_sha1_transform_amd64_avx_bmi2:
 .Lend:
   vzeroall;
 
-  /* Transform 64-79. */
+  /* Transform 64-79 + burn stack */
   R( b, c, d, e, a, F4, 64 );
   R( a, b, c, d, e, F4, 65 );
   R( e, a, b, c, d, F4, 66 );
@@ -400,14 +400,17 @@ _gcry_sha1_transform_amd64_avx_bmi2:
   R( c, d, e, a, b, F4, 73 );
   R( b, c, d, e, a, F4, 74 );
   R( a, b, c, d, e, F4, 75 );
-  R( e, a, b, c, d, F4, 76 );
-  R( d, e, a, b, c, F4, 77 );
-  R( c, d, e, a, b, F4, 78 );
+  R( e, a, b, c, d, F4, 76 ); vmovdqa %xmm0, (0*16)(%rsp);
+  R( d, e, a, b, c, F4, 77 ); vmovdqa %xmm0, (1*16)(%rsp);
+  R( c, d, e, a, b, F4, 78 ); vmovdqa %xmm0, (2*16)(%rsp);
   addl state_h0(RSTATE), a;
   R( b, c, d, e, a, F4, 79 );
   addl ne, a;
   xorl ne, ne;
 
+  /* 16*4/16-1 = 3 */
+  vmovdqa %xmm0, (3*16)(%rsp);
+
   /* Update the chaining variables. */
   addl state_h3(RSTATE), d;
   addl state_h2(RSTATE), c;
@@ -426,8 +429,8 @@ _gcry_sha1_transform_amd64_avx_bmi2:
   popq %rbp;
   popq %rbx;
 
-  /* burn_stack */
-  movl $(16*4 + 3*8 + 31), %eax;
+  /* stack already burned */
+  xorl %eax, %eax;
 
 .Lret:
   ret;
diff --git a/cipher/sha1-avx2-bmi2-amd64.S b/cipher/sha1-avx2-bmi2-amd64.S
index c666290f..2a2f21a5 100644
--- a/cipher/sha1-avx2-bmi2-amd64.S
+++ b/cipher/sha1-avx2-bmi2-amd64.S
@@ -504,7 +504,7 @@ _gcry_sha1_transform_amd64_avx2_bmi2:
 .Lend:
   vzeroall;
 
-  /* Transform 48-79 for block 2. */
+  /* Transform 48-79 for block 2 + burn stack */
   R( c, d, e, a, b, F3, 48, 1 );
   R( b, c, d, e, a, F3, 49, 1 );
   R( a, b, c, d, e, F3, 50, 1 );
@@ -517,30 +517,33 @@ _gcry_sha1_transform_amd64_avx2_bmi2:
   R( d, e, a, b, c, F3, 57, 1 );
   R( c, d, e, a, b, F3, 58, 1 );
   R( b, c, d, e, a, F3, 59, 1 );
-  R( a, b, c, d, e, F4, 60, 1 );
-  R( e, a, b, c, d, F4, 61, 1 );
-  R( d, e, a, b, c, F4, 62, 1 );
-  R( c, d, e, a, b, F4, 63, 1 );
-  R( b, c, d, e, a, F4, 64, 1 );
-  R( a, b, c, d, e, F4, 65, 1 );
-  R( e, a, b, c, d, F4, 66, 1 );
-  R( d, e, a, b, c, F4, 67, 1 );
-  R( c, d, e, a, b, F4, 68, 1 );
-  R( b, c, d, e, a, F4, 69, 1 );
-  R( a, b, c, d, e, F4, 70, 1 );
-  R( e, a, b, c, d, F4, 71, 1 );
-  R( d, e, a, b, c, F4, 72, 1 );
-  R( c, d, e, a, b, F4, 73, 1 );
-  R( b, c, d, e, a, F4, 74, 1 );
-  R( a, b, c, d, e, F4, 75, 1 );
-  R( e, a, b, c, d, F4, 76, 1 );
-  R( d, e, a, b, c, F4, 77, 1 );
-  R( c, d, e, a, b, F4, 78, 1 );
+  R( a, b, c, d, e, F4, 60, 1 ); vmovdqa %ymm0, (0*32)(%rsp);
+  R( e, a, b, c, d, F4, 61, 1 ); vmovdqa %ymm0, (1*32)(%rsp);
+  R( d, e, a, b, c, F4, 62, 1 ); vmovdqa %ymm0, (2*32)(%rsp);
+  R( c, d, e, a, b, F4, 63, 1 ); vmovdqa %ymm0, (3*32)(%rsp);
+  R( b, c, d, e, a, F4, 64, 1 ); vmovdqa %ymm0, (4*32)(%rsp);
+  R( a, b, c, d, e, F4, 65, 1 ); vmovdqa %ymm0, (5*32)(%rsp);
+  R( e, a, b, c, d, F4, 66, 1 ); vmovdqa %ymm0, (6*32)(%rsp);
+  R( d, e, a, b, c, F4, 67, 1 ); vmovdqa %ymm0, (7*32)(%rsp);
+  R( c, d, e, a, b, F4, 68, 1 ); vmovdqa %ymm0, (8*32)(%rsp);
+  R( b, c, d, e, a, F4, 69, 1 ); vmovdqa %ymm0, (9*32)(%rsp);
+  R( a, b, c, d, e, F4, 70, 1 ); vmovdqa %ymm0, (10*32)(%rsp);
+  R( e, a, b, c, d, F4, 71, 1 ); vmovdqa %ymm0, (11*32)(%rsp);
+  R( d, e, a, b, c, F4, 72, 1 ); vmovdqa %ymm0, (12*32)(%rsp);
+  R( c, d, e, a, b, F4, 73, 1 ); vmovdqa %ymm0, (13*32)(%rsp);
+  R( b, c, d, e, a, F4, 74, 1 ); vmovdqa %ymm0, (14*32)(%rsp);
+  R( a, b, c, d, e, F4, 75, 1 ); vmovdqa %ymm0, (15*32)(%rsp);
+  R( e, a, b, c, d, F4, 76, 1 ); vmovdqa %ymm0, (16*32)(%rsp);
+  R( d, e, a, b, c, F4, 77, 1 ); vmovdqa %ymm0, (17*32)(%rsp);
+  R( c, d, e, a, b, F4, 78, 1 ); vmovdqa %ymm0, (18*32)(%rsp);
   addl state_h0(RSTATE), a;
   R( b, c, d, e, a, F4, 79, 1 );
   addl ne, a;
   xorl ne, ne;
 
+  /* WK_STACK_WORDS*4/32-1 = 19 */
+  vmovdqa %ymm0, (19*32)(%rsp);
+
   /* Update the chaining variables. */
   addl state_h3(RSTATE), d;
   addl state_h2(RSTATE), c;
@@ -559,8 +562,8 @@ _gcry_sha1_transform_amd64_avx2_bmi2:
   popq %rbp;
   popq %rbx;
 
-  /* burn_stack */
-  movl $((WK_STACK_WORDS)*4 + 3*8 + 31), %eax;
+  /* stack already burned */
+  xorl %eax, %eax;
 
   ret;
 ELF(.size _gcry_sha1_transform_amd64_avx2_bmi2,
diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S
index 421bebec..fff14034 100644
--- a/cipher/sha1-ssse3-amd64.S
+++ b/cipher/sha1-ssse3-amd64.S
@@ -388,7 +388,7 @@ _gcry_sha1_transform_amd64_ssse3:
 
 .align 16
 .Lend:
-  /* Transform 64-79 + Clear XMM registers. */
+  /* Transform 64-79 + Clear XMM registers + Burn stack. */
   R( b, c, d, e, a, F4, 64 ); CLEAR_REG(BSWAP_REG);
   R( a, b, c, d, e, F4, 65 ); CLEAR_REG(Wtmp0);
   R( e, a, b, c, d, F4, 66 ); CLEAR_REG(Wtmp1);
@@ -401,12 +401,15 @@ _gcry_sha1_transform_amd64_ssse3:
   R( c, d, e, a, b, F4, 73 ); CLEAR_REG(W6);
   R( b, c, d, e, a, F4, 74 ); CLEAR_REG(W7);
   R( a, b, c, d, e, F4, 75 );
-  R( e, a, b, c, d, F4, 76 );
-  R( d, e, a, b, c, F4, 77 );
-  R( c, d, e, a, b, F4, 78 );
+  R( e, a, b, c, d, F4, 76 ); movdqa Wtmp0, (0*16)(%rsp);
+  R( d, e, a, b, c, F4, 77 ); movdqa Wtmp0, (1*16)(%rsp);
+  R( c, d, e, a, b, F4, 78 ); movdqa Wtmp0, (2*16)(%rsp);
   addl state_h0(RSTATE), a;
   R( b, c, d, e, a, F4, 79 );
 
+  /* 16*4/16-1 = 3 */
+  vmovdqa Wtmp0, (3*16)(%rsp);
+
   /* Update the chaining variables. */
   addl state_h3(RSTATE), d;
   addl state_h2(RSTATE), c;
@@ -424,8 +427,8 @@ _gcry_sha1_transform_amd64_ssse3:
   popq %rbp;
   popq %rbx;
 
-  /* burn_stack */
-  movl $(16*4 + 2*8 + 31), %eax;
+  /* stack already burned */
+  xorl %eax, %eax;
 
 .Lret:
   ret;

[NOTE(review): the line `vmovdqa Wtmp0, (3*16)(%rsp);` added to sha1-ssse3-amd64.S uses `vmovdqa`, an AVX instruction, inside the SSSE3-only implementation — the three preceding added lines in the same hunk use plain `movdqa`. Presumably `movdqa` was intended; verify against later libgcrypt history for a follow-up fix.]