summary | refs | log | tree | commit | diff
path: root/cipher
diff options
context:
space:
mode:
author Jussi Kivilinna <jussi.kivilinna@iki.fi> 2023-02-18 11:13:34 +0200
committer Jussi Kivilinna <jussi.kivilinna@iki.fi> 2023-02-22 20:27:56 +0200
commit 855f1551fd921ced652dc0c3c03601dfcd063f1c (patch)
tree 6c6deb2cb79c22885c21fae923692e5c5b4f8b43 /cipher
parent 45351e6474cbbe5baaa4c488222610edc417176e (diff)
download libgcrypt-855f1551fd921ced652dc0c3c03601dfcd063f1c.tar.gz
aria-avx: small optimization for aria_ark_8way
* cipher/aria-aesni-avx-amd64.S (aria_ark_8way): Use 'vmovd' for
loading key material and 'vpshufb' for broadcasting from byte
locations 3, 2, 1 and 0.
--

Benchmark on AMD Ryzen 9 7900X (zen4, turbo-freq off):

 Before (GFNI/AVX):
 ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.516 ns/B      1847 MiB/s      2.43 c/B      4700
        ECB dec |     0.519 ns/B      1839 MiB/s      2.44 c/B      4700
        CTR enc |     0.517 ns/B      1846 MiB/s      2.43 c/B      4700
        CTR dec |     0.518 ns/B      1843 MiB/s      2.43 c/B      4700

 After (GFNI/AVX, ~5% faster):
 ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.490 ns/B      1947 MiB/s      2.30 c/B      4700
        ECB dec |     0.490 ns/B      1946 MiB/s      2.30 c/B      4700
        CTR enc |     0.493 ns/B      1935 MiB/s      2.32 c/B      4700
        CTR dec |     0.493 ns/B      1934 MiB/s      2.32 c/B      4700

===

Benchmark on Intel Core i3-1115G4 (tiger-lake, turbo-freq off):

 Before (GFNI/AVX):
 ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.967 ns/B     986.6 MiB/s      2.89 c/B      2992
        ECB dec |     0.966 ns/B     987.1 MiB/s      2.89 c/B      2992
        CTR enc |     0.972 ns/B     980.8 MiB/s      2.91 c/B      2993
        CTR dec |     0.971 ns/B     982.5 MiB/s      2.90 c/B      2993

 After (GFNI/AVX, ~6% faster):
 ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.908 ns/B      1050 MiB/s      2.72 c/B      2992
        ECB dec |     0.903 ns/B      1056 MiB/s      2.70 c/B      2992
        CTR enc |     0.913 ns/B      1045 MiB/s      2.73 c/B      2992
        CTR dec |     0.910 ns/B      1048 MiB/s      2.72 c/B      2992

===

Benchmark on AMD Ryzen 7 5800X (zen3, turbo-freq off):

 Before (AESNI/AVX):
 ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.921 ns/B      1035 MiB/s      3.50 c/B      3800
        ECB dec |     0.922 ns/B      1034 MiB/s      3.50 c/B      3800
        CTR enc |     0.923 ns/B      1033 MiB/s      3.51 c/B      3800
        CTR dec |     0.923 ns/B      1033 MiB/s      3.51 c/B      3800

 After (AESNI/AVX, ~6% faster)
 ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.862 ns/B      1106 MiB/s      3.28 c/B      3800
        ECB dec |     0.862 ns/B      1106 MiB/s      3.28 c/B      3800
        CTR enc |     0.865 ns/B      1102 MiB/s      3.29 c/B      3800
        CTR dec |     0.865 ns/B      1103 MiB/s      3.29 c/B      3800

===

Benchmark on AMD EPYC 7642 (zen2):

 Before (AESNI/AVX):
 ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |      1.22 ns/B     784.5 MiB/s      4.01 c/B      3298
        ECB dec |      1.22 ns/B     784.8 MiB/s      4.00 c/B      3292
        CTR enc |      1.22 ns/B     780.1 MiB/s      4.03 c/B      3299
        CTR dec |      1.22 ns/B     779.1 MiB/s      4.04 c/B      3299

 After (AESNI/AVX, ~13% faster):
 ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |      1.07 ns/B     888.3 MiB/s      3.54 c/B      3299
        ECB dec |      1.08 ns/B     885.3 MiB/s      3.55 c/B      3299
        CTR enc |      1.07 ns/B     888.7 MiB/s      3.54 c/B      3298
        CTR dec |      1.07 ns/B     887.4 MiB/s      3.55 c/B      3299

===

Benchmark on Intel Core i5-6500 (skylake):

 Before (AESNI/AVX):
 ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |      1.24 ns/B     766.6 MiB/s      4.48 c/B      3598
        ECB dec |      1.25 ns/B     764.9 MiB/s      4.49 c/B      3598
        CTR enc |      1.25 ns/B     761.7 MiB/s      4.50 c/B      3598
        CTR dec |      1.25 ns/B     761.6 MiB/s      4.51 c/B      3598

 After (AESNI/AVX, ~2% faster):
 ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |      1.22 ns/B     780.0 MiB/s      4.40 c/B      3598
        ECB dec |      1.22 ns/B     779.6 MiB/s      4.40 c/B      3598
        CTR enc |      1.23 ns/B     776.6 MiB/s      4.42 c/B      3598
        CTR dec |      1.23 ns/B     776.6 MiB/s      4.42 c/B      3598

===

Benchmark on Intel Core i5-2450M (sandy-bridge, turbo-freq off):

 Before (AESNI/AVX):
 ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |      2.11 ns/B     452.7 MiB/s      5.25 c/B      2494
        ECB dec |      2.10 ns/B     454.5 MiB/s      5.23 c/B      2494
        CTR enc |      2.10 ns/B     453.2 MiB/s      5.25 c/B      2494
        CTR dec |      2.10 ns/B     453.2 MiB/s      5.25 c/B      2494

 After (AESNI/AVX, ~4% faster)
 ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |      2.00 ns/B     475.8 MiB/s      5.00 c/B      2494
        ECB dec |      2.00 ns/B     476.4 MiB/s      4.99 c/B      2494
        CTR enc |      2.01 ns/B     474.7 MiB/s      5.01 c/B      2494
        CTR dec |      2.01 ns/B     473.9 MiB/s      5.02 c/B      2494

Cc: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher')
-rw-r--r-- cipher/aria-aesni-avx-amd64.S 29
1 files changed, 15 insertions, 14 deletions
diff --git a/cipher/aria-aesni-avx-amd64.S b/cipher/aria-aesni-avx-amd64.S
index 45b0b4a4..2a88c1e7 100644
--- a/cipher/aria-aesni-avx-amd64.S
+++ b/cipher/aria-aesni-avx-amd64.S
@@ -357,27 +357,21 @@
t0, t1, t2, rk, \
idx, round) \
/* AddRoundKey */ \
- vbroadcastss ((round * 16) + idx + 0)(rk), t0; \
- vpsrld $24, t0, t2; \
- vpshufb t1, t2, t2; \
+ vmovd ((round * 16) + idx + 0)(rk), t0; \
+ vpshufb .Lthree_x16 rRIP, t0, t2; \
vpxor t2, x0, x0; \
- vpsrld $16, t0, t2; \
- vpshufb t1, t2, t2; \
+ vpshufb .Ltwo_x16 rRIP, t0, t2; \
vpxor t2, x1, x1; \
- vpsrld $8, t0, t2; \
- vpshufb t1, t2, t2; \
+ vpshufb .Lone_x16 rRIP, t0, t2; \
vpxor t2, x2, x2; \
vpshufb t1, t0, t2; \
vpxor t2, x3, x3; \
- vbroadcastss ((round * 16) + idx + 4)(rk), t0; \
- vpsrld $24, t0, t2; \
- vpshufb t1, t2, t2; \
+ vmovd ((round * 16) + idx + 4)(rk), t0; \
+ vpshufb .Lthree_x16 rRIP, t0, t2; \
vpxor t2, x4, x4; \
- vpsrld $16, t0, t2; \
- vpshufb t1, t2, t2; \
+ vpshufb .Ltwo_x16 rRIP, t0, t2; \
vpxor t2, x5, x5; \
- vpsrld $8, t0, t2; \
- vpshufb t1, t2, t2; \
+ vpshufb .Lone_x16 rRIP, t0, t2; \
vpxor t2, x6, x6; \
vpshufb t1, t0, t2; \
vpxor t2, x7, x7;
@@ -858,6 +852,13 @@ SECTION_RODATA
.Ltf_hi__x2__and__fwd_aff:
.octa 0x3F893781E95FE1576CDA64D2BA0CB204
+.Lthree_x16:
+ .byte 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+.Ltwo_x16:
+ .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+.Lone_x16:
+ .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+
.Lbige_addb_1:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
.Lbige_addb_2: