aria-avx512: small optimization for aria_diff_m

* cipher/aria-gfni-avx512-amd64.S (aria_diff_m): Use 'vpternlogq' for
3-way XOR operation.
---

Using vpternlogq gives a small performance improvement on AMD Zen4. With
Intel Tiger Lake, the speed is the same as before.

Benchmark on AMD Ryzen 9 7900X (zen4, turbo-freq off):

Before:
 ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.203 ns/B      4703 MiB/s     0.953 c/B      4700
        ECB dec |     0.204 ns/B      4675 MiB/s     0.959 c/B      4700
        CTR enc |     0.207 ns/B      4609 MiB/s     0.973 c/B      4700
        CTR dec |     0.207 ns/B      4608 MiB/s     0.973 c/B      4700

After (~3% faster):
 ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.197 ns/B      4847 MiB/s     0.925 c/B      4700
        ECB dec |     0.197 ns/B      4852 MiB/s     0.924 c/B      4700
        CTR enc |     0.200 ns/B      4759 MiB/s     0.942 c/B      4700
        CTR dec |     0.200 ns/B      4772 MiB/s     0.939 c/B      4700

Cc: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
author: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2023-02-18 11:14:02 +0200
committer: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2023-02-22 20:27:56 +0200
commit: f359a3ec7e845aa446836bd47994fe18d6d41e08 (patch)
tree: ef2cea6fa0d247dc7276afec4be2150b0ce2b3ed
parent: 855f1551fd921ced652dc0c3c03601dfcd063f1c (diff)
download: libgcrypt-f359a3ec7e845aa446836bd47994fe18d6d41e08.tar.gz
1 files changed, 6 insertions, 10 deletions
diff --git a/cipher/aria-gfni-avx512-amd64.S b/cipher/aria-gfni-avx512-amd64.S
index 1076cf8b..0eaa2de8 100644
--- a/cipher/aria-gfni-avx512-amd64.S
+++ b/cipher/aria-gfni-avx512-amd64.S
@@ -406,21 +406,17 @@
 	vgf2p8affineinvqb $0, t2, y3, y3;		\
 	vgf2p8affineinvqb $0, t2, y7, y7;
 
-
 #define aria_diff_m(x0, x1, x2, x3,			\
 		    t0, t1, t2, t3)			\
 	/* T = rotr32(X, 8); */				\
 	/* X ^= T */					\
-	vpxorq x0, x3, t0;				\
-	vpxorq x1, x0, t1;				\
-	vpxorq x2, x1, t2;				\
-	vpxorq x3, x2, t3;				\
 	/* X = T ^ rotr(X, 16); */			\
-	vpxorq t2, x0, x0;				\
-	vpxorq x1, t3, t3;				\
-	vpxorq t0, x2, x2;				\
-	vpxorq t1, x3, x1;				\
-	vmovdqu64 t3, x3;
+	vmovdqa64 x0, t0;				\
+	vmovdqa64 x3, t3;				\
+	vpternlogq $0x96, x2, x1, x0;			\
+	vpternlogq $0x96, x2, x1, x3;			\
+	vpternlogq $0x96, t0, t3, x2;			\
+	vpternlogq $0x96, t0, t3, x1;
 
 #define aria_diff_word(x0, x1, x2, x3,			\
 		       x4, x5, x6, x7,			\
author	Jussi Kivilinna <jussi.kivilinna@iki.fi>	2023-02-18 11:14:02 +0200
committer	Jussi Kivilinna <jussi.kivilinna@iki.fi>	2023-02-22 20:27:56 +0200
commit	f359a3ec7e845aa446836bd47994fe18d6d41e08 (patch)
tree	ef2cea6fa0d247dc7276afec4be2150b0ce2b3ed
parent	855f1551fd921ced652dc0c3c03601dfcd063f1c (diff)
download	libgcrypt-f359a3ec7e845aa446836bd47994fe18d6d41e08.tar.gz