diff options
author | Maamoun TK <maamoun.tk@googlemail.com> | 2023-03-24 00:23:09 +0200 |
---|---|---|
committer | Maamoun TK <maamoun.tk@googlemail.com> | 2023-03-24 00:23:09 +0200 |
commit | f0105607a70b298b06ad73a88f03f1bc28f1b020 (patch) | |
tree | 1107dd5f32f909886656102d9f749bb6eb0254e9 | |
parent | 775d6adb77a885616ef3a9fcbc4c087cad129f3d (diff) | |
download | nettle-f0105607a70b298b06ad73a88f03f1bc28f1b020.tar.gz |
[x86_64] Use 2-way GHASH pclmul update
-rw-r--r-- | x86_64/pclmul/ghash-set-key.asm | 35 | ||||
-rw-r--r-- | x86_64/pclmul/ghash-update.asm | 72 |
2 files changed, 91 insertions, 16 deletions
diff --git a/x86_64/pclmul/ghash-set-key.asm b/x86_64/pclmul/ghash-set-key.asm index 3a7a976a..863ee244 100644 --- a/x86_64/pclmul/ghash-set-key.asm +++ b/x86_64/pclmul/ghash-set-key.asm @@ -39,12 +39,15 @@ define(`BSWAP', `%xmm1') define(`H', `%xmm2') define(`D', `%xmm3') define(`T', `%xmm4') -define(`MASK', `%xmm5') +define(`R', `%xmm5') +define(`M', `%xmm6') +define(`F', `%xmm7') +define(`MASK', `%xmm7') C void _ghash_set_key (struct gcm_key *ctx, const union nettle_block16 *key) PROLOGUE(_nettle_ghash_set_key) - W64_ENTRY(2, 6) + W64_ENTRY(2, 8) movdqa .Lpolynomial(%rip), P movdqa .Lbswap(%rip), BSWAP movups (KEY), H @@ -63,11 +66,35 @@ PROLOGUE(_nettle_ghash_set_key) movups H, (CTX) C Set D = x^{-64} H = {H0, H1} + P1 H0 + movdqa H, T + pshufd $0x4e, H, D C Swap H0, H1 + pclmullqhqdq P, T + pxor T, D + movups D, 16(CTX) + + movdqa H, M + movdqa H, F + movdqa H, T + pclmulhqlqdq H, T C H0 * M1 + pclmulhqhqdq H, M C H1 * M1 + pclmullqlqdq D, F C D0 * M0 + pclmullqhqdq D, H C D1 * M0 + pxor T, F + pxor M, H + + pshufd $0x4e, F, T C Swap halves of F + pxor T, H + pclmullqhqdq P, F + pxor F, H + movups H, 32(CTX) + + C Set D2 = x^{-64} H^2 = {H0, H1} + P1 H0 pshufd $0x4e, H, D C Swap H0, H1 pclmullqhqdq P, H pxor H, D - movups D, 16(CTX) - W64_EXIT(2, 6) + movups D, 48(CTX) + + W64_EXIT(2, 8) ret EPILOGUE(_nettle_ghash_set_key) diff --git a/x86_64/pclmul/ghash-update.asm b/x86_64/pclmul/ghash-update.asm index 93cc002c..1c30e7d8 100644 --- a/x86_64/pclmul/ghash-update.asm +++ b/x86_64/pclmul/ghash-update.asm @@ -41,11 +41,16 @@ define(`P', `%xmm0') define(`BSWAP', `%xmm1') define(`H', `%xmm2') define(`D', `%xmm3') -define(`T', `%xmm4') - -define(`R', `%xmm5') -define(`M', `%xmm6') -define(`F', `%xmm7') +define(`H2', `%xmm4') +define(`D2', `%xmm5') +define(`T', `%xmm6') +define(`R', `%xmm7') +define(`M', `%xmm8') +define(`F', `%xmm9') +define(`T2', `%xmm10') +define(`R2', `%xmm11') +define(`M2', `%xmm12') +define(`F2', `%xmm13') C Use pclmulqdq, doing one 64x64 --> 127 bit carry-less multiplication, C with source operands being selected from the halves of two 128-bit registers. @@ -80,21 +85,66 @@ C registers left for temporaries. C size_t blocks, const uint8_t *data) PROLOGUE(_nettle_ghash_update) - W64_ENTRY(4, 8) + W64_ENTRY(4, 14) movdqa .Lpolynomial(%rip), P movdqa .Lbswap(%rip), BSWAP movups (CTX), H movups 16(CTX), D + movups 32(CTX), H2 + movups 48(CTX), D2 movups (X), R pshufb BSWAP, R - sub $1, BLOCKS - jc .Ldone + mov BLOCKS, %rax + shr $1, %rax + jz .L1_block .Loop: movups (DATA), M pshufb BSWAP, M -.Lblock: + pxor M, R + movdqa R, M + movdqa R, F + movdqa R, T + pclmullqlqdq D2, F C {D^2}0 * M1_0 + pclmullqhqdq D2, R C {D^2}1 * M1_0 + pclmulhqlqdq H2, T C {H^2}0 * M1_1 + pclmulhqhqdq H2, M C {H^2}1 * M1_1 + + + movups 16(DATA), M2 + pshufb BSWAP, M2 + movdqa M2, R2 + movdqa M2, F2 + movdqa M2, T2 + pclmullqlqdq D, F2 C D0 * M2_0 + pclmullqhqdq D, R2 C D1 * M2_0 + pclmulhqlqdq H, T2 C H0 * M2_1 + pclmulhqhqdq H, M2 C H1 * M2_1 + + pxor T, F + pxor M, R + pxor T2, F2 + pxor M2, R2 + + pxor F2, F + pxor R2, R + + pshufd $0x4e, F, T C Swap halves of F + pxor T, R + pclmullqhqdq P, F + pxor F, R + + add $32, DATA + dec %rax + jnz .Loop + +.L1_block: + and $1, BLOCKS + jz .Ldone + + movups (DATA), M + pshufb BSWAP, M pxor M, R movdqa R, M movdqa R, F @@ -112,14 +162,12 @@ PROLOGUE(_nettle_ghash_update) pxor F, R add $16, DATA - sub $1, BLOCKS - jnc .Loop .Ldone: pshufb BSWAP, R movups R, (X) mov DATA, %rax - W64_EXIT(4, 8) + W64_EXIT(4, 14) ret EPILOGUE(_nettle_ghash_update) |