summary refs log tree commit diff
diff options
context:
space:
mode:
author: Niels Möller <nisse@lysator.liu.se> 2023-04-02 14:52:45 +0200
committer: Niels Möller <nisse@lysator.liu.se> 2023-04-02 14:52:45 +0200
commit: 85a7772e7f6745cf7e58f0a481bf0ede9445d24b (patch)
tree: fe9084053515c14aaa8c14b74b75f13dfda238ee
parent: 8b45098af05dde848b98d99a336bc4740dbddc03 (diff)
parent: f0105607a70b298b06ad73a88f03f1bc28f1b020 (diff)
download: nettle-85a7772e7f6745cf7e58f0a481bf0ede9445d24b.tar.gz
Merge branch 'nettle-x86_ghash' into master
See https://git.lysator.liu.se/nettle/nettle/-/merge_requests/57
-rw-r--r-- x86_64/pclmul/ghash-set-key.asm 35
-rw-r--r-- x86_64/pclmul/ghash-update.asm 72
2 files changed, 91 insertions, 16 deletions
diff --git a/x86_64/pclmul/ghash-set-key.asm b/x86_64/pclmul/ghash-set-key.asm
index 3a7a976a..863ee244 100644
--- a/x86_64/pclmul/ghash-set-key.asm
+++ b/x86_64/pclmul/ghash-set-key.asm
@@ -39,12 +39,15 @@ define(`BSWAP', `%xmm1')
define(`H', `%xmm2')
define(`D', `%xmm3')
define(`T', `%xmm4')
-define(`MASK', `%xmm5')
+define(`R', `%xmm5')
+define(`M', `%xmm6')
+define(`F', `%xmm7')
+define(`MASK', `%xmm7')
C void _ghash_set_key (struct gcm_key *ctx, const union nettle_block16 *key)
PROLOGUE(_nettle_ghash_set_key)
- W64_ENTRY(2, 6)
+ W64_ENTRY(2, 8)
movdqa .Lpolynomial(%rip), P
movdqa .Lbswap(%rip), BSWAP
movups (KEY), H
@@ -63,11 +66,35 @@ PROLOGUE(_nettle_ghash_set_key)
movups H, (CTX)
C Set D = x^{-64} H = {H0, H1} + P1 H0
+ movdqa H, T
+ pshufd $0x4e, H, D C Swap H0, H1
+ pclmullqhqdq P, T
+ pxor T, D
+ movups D, 16(CTX)
+
+ movdqa H, M
+ movdqa H, F
+ movdqa H, T
+ pclmulhqlqdq H, T C H0 * M1
+ pclmulhqhqdq H, M C H1 * M1
+ pclmullqlqdq D, F C D0 * M0
+ pclmullqhqdq D, H C D1 * M0
+ pxor T, F
+ pxor M, H
+
+ pshufd $0x4e, F, T C Swap halves of F
+ pxor T, H
+ pclmullqhqdq P, F
+ pxor F, H
+ movups H, 32(CTX)
+
+ C Set D2 = x^{-64} H^2 = {{H^2}0, {H^2}1} + P1 {H^2}0 (register H now holds H^2)
pshufd $0x4e, H, D C Swap H0, H1
pclmullqhqdq P, H
pxor H, D
- movups D, 16(CTX)
- W64_EXIT(2, 6)
+ movups D, 48(CTX)
+
+ W64_EXIT(2, 8)
ret
EPILOGUE(_nettle_ghash_set_key)
diff --git a/x86_64/pclmul/ghash-update.asm b/x86_64/pclmul/ghash-update.asm
index 93cc002c..1c30e7d8 100644
--- a/x86_64/pclmul/ghash-update.asm
+++ b/x86_64/pclmul/ghash-update.asm
@@ -41,11 +41,16 @@ define(`P', `%xmm0')
define(`BSWAP', `%xmm1')
define(`H', `%xmm2')
define(`D', `%xmm3')
-define(`T', `%xmm4')
-
-define(`R', `%xmm5')
-define(`M', `%xmm6')
-define(`F', `%xmm7')
+define(`H2', `%xmm4')
+define(`D2', `%xmm5')
+define(`T', `%xmm6')
+define(`R', `%xmm7')
+define(`M', `%xmm8')
+define(`F', `%xmm9')
+define(`T2', `%xmm10')
+define(`R2', `%xmm11')
+define(`M2', `%xmm12')
+define(`F2', `%xmm13')
C Use pclmulqdq, doing one 64x64 --> 127 bit carry-less multiplication,
C with source operands being selected from the halves of two 128-bit registers.
@@ -80,21 +85,66 @@ C registers left for temporaries.
C size_t blocks, const uint8_t *data)
PROLOGUE(_nettle_ghash_update)
- W64_ENTRY(4, 8)
+ W64_ENTRY(4, 14)
movdqa .Lpolynomial(%rip), P
movdqa .Lbswap(%rip), BSWAP
movups (CTX), H
movups 16(CTX), D
+ movups 32(CTX), H2
+ movups 48(CTX), D2
movups (X), R
pshufb BSWAP, R
- sub $1, BLOCKS
- jc .Ldone
+ mov BLOCKS, %rax
+ shr $1, %rax
+ jz .L1_block
.Loop:
movups (DATA), M
pshufb BSWAP, M
-.Lblock:
+ pxor M, R
+ movdqa R, M
+ movdqa R, F
+ movdqa R, T
+ pclmullqlqdq D2, F C {D^2}0 * M1_0
+ pclmullqhqdq D2, R C {D^2}1 * M1_0
+ pclmulhqlqdq H2, T C {H^2}0 * M1_1
+ pclmulhqhqdq H2, M C {H^2}1 * M1_1
+
+
+ movups 16(DATA), M2
+ pshufb BSWAP, M2
+ movdqa M2, R2
+ movdqa M2, F2
+ movdqa M2, T2
+ pclmullqlqdq D, F2 C D0 * M2_0
+ pclmullqhqdq D, R2 C D1 * M2_0
+ pclmulhqlqdq H, T2 C H0 * M2_1
+ pclmulhqhqdq H, M2 C H1 * M2_1
+
+ pxor T, F
+ pxor M, R
+ pxor T2, F2
+ pxor M2, R2
+
+ pxor F2, F
+ pxor R2, R
+
+ pshufd $0x4e, F, T C Swap halves of F
+ pxor T, R
+ pclmullqhqdq P, F
+ pxor F, R
+
+ add $32, DATA
+ dec %rax
+ jnz .Loop
+
+.L1_block:
+ and $1, BLOCKS
+ jz .Ldone
+
+ movups (DATA), M
+ pshufb BSWAP, M
pxor M, R
movdqa R, M
movdqa R, F
@@ -112,14 +162,12 @@ PROLOGUE(_nettle_ghash_update)
pxor F, R
add $16, DATA
- sub $1, BLOCKS
- jnc .Loop
.Ldone:
pshufb BSWAP, R
movups R, (X)
mov DATA, %rax
- W64_EXIT(4, 8)
+ W64_EXIT(4, 14)
ret
EPILOGUE(_nettle_ghash_update)