summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2023-02-18 11:14:02 +0200
committer: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2023-02-22 20:27:56 +0200
commit: f359a3ec7e845aa446836bd47994fe18d6d41e08 (patch)
tree: ef2cea6fa0d247dc7276afec4be2150b0ce2b3ed
parent: 855f1551fd921ced652dc0c3c03601dfcd063f1c (diff)
download: libgcrypt-f359a3ec7e845aa446836bd47994fe18d6d41e08.tar.gz
aria-avx512: small optimization for aria_diff_m
* cipher/aria-gfni-avx512-amd64.S (aria_diff_m): Use 'vpternlogq' for
3-way XOR operation.
---

Using vpternlogq gives a small performance improvement on AMD Zen4. With
Intel tiger-lake, speed is the same as before.

Benchmark on AMD Ryzen 9 7900X (zen4, turbo-freq off):

Before:
 ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.203 ns/B      4703 MiB/s     0.953 c/B      4700
        ECB dec |     0.204 ns/B      4675 MiB/s     0.959 c/B      4700
        CTR enc |     0.207 ns/B      4609 MiB/s     0.973 c/B      4700
        CTR dec |     0.207 ns/B      4608 MiB/s     0.973 c/B      4700

After (~3% faster):
 ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.197 ns/B      4847 MiB/s     0.925 c/B      4700
        ECB dec |     0.197 ns/B      4852 MiB/s     0.924 c/B      4700
        CTR enc |     0.200 ns/B      4759 MiB/s     0.942 c/B      4700
        CTR dec |     0.200 ns/B      4772 MiB/s     0.939 c/B      4700

Cc: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
-rw-r--r-- cipher/aria-gfni-avx512-amd64.S 16
1 files changed, 6 insertions, 10 deletions
diff --git a/cipher/aria-gfni-avx512-amd64.S b/cipher/aria-gfni-avx512-amd64.S
index 1076cf8b..0eaa2de8 100644
--- a/cipher/aria-gfni-avx512-amd64.S
+++ b/cipher/aria-gfni-avx512-amd64.S
@@ -406,21 +406,17 @@
vgf2p8affineinvqb $0, t2, y3, y3; \
vgf2p8affineinvqb $0, t2, y7, y7;
-
#define aria_diff_m(x0, x1, x2, x3, \
t0, t1, t2, t3) \
/* T = rotr32(X, 8); */ \
/* X ^= T */ \
- vpxorq x0, x3, t0; \
- vpxorq x1, x0, t1; \
- vpxorq x2, x1, t2; \
- vpxorq x3, x2, t3; \
/* X = T ^ rotr(X, 16); */ \
- vpxorq t2, x0, x0; \
- vpxorq x1, t3, t3; \
- vpxorq t0, x2, x2; \
- vpxorq t1, x3, x1; \
- vmovdqu64 t3, x3;
+ vmovdqa64 x0, t0; \
+ vmovdqa64 x3, t3; \
+ vpternlogq $0x96, x2, x1, x0; \
+ vpternlogq $0x96, x2, x1, x3; \
+ vpternlogq $0x96, t0, t3, x2; \
+ vpternlogq $0x96, t0, t3, x1;
#define aria_diff_word(x0, x1, x2, x3, \
x4, x5, x6, x7, \