author    | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2023-02-21 20:59:50 +0200
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2023-02-22 21:13:36 +0200
commit    | dd4cb5d75c8e4e666db4352d999b2111b9ddb80d (patch)
tree      | dcf482faf1ba772efa424867dce7544966b12c34
parent    | b9a9755742c7bf7ca8c007d33f98aaa076a382c7 (diff)
download  | libgcrypt-dd4cb5d75c8e4e666db4352d999b2111b9ddb80d.tar.gz
camellia-aesni-avx: speed up for round key broadcasting
* cipher/camellia-aesni-avx-amd64.S (roundsm16, fls16): Broadcast
round key bytes directly with 'vpshufb'.
--
Benchmark on AMD Ryzen 9 7900X (turbo-freq off):
Before:
 CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.837 ns/B      1139 MiB/s      3.94 c/B      4700
        ECB dec |     0.839 ns/B      1137 MiB/s      3.94 c/B      4700
After (~3% faster):
 CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.808 ns/B      1180 MiB/s      3.80 c/B      4700
        ECB dec |     0.810 ns/B      1177 MiB/s      3.81 c/B      4700
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
-rw-r--r-- | cipher/camellia-aesni-avx-amd64.S | 89
1 files changed, 47 insertions, 42 deletions
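
The diff below replaces each two-instruction key-byte broadcast (vpsrldq to shift the wanted byte down to position 0, then vpshufb with an all-zero mask) with a single vpshufb whose constant mask holds the byte index in every lane (.Lbyte_ones ... .Lbyte_sevens). As a minimal C-intrinsics sketch of the two patterns, for illustration only (function names are hypothetical, SSSE3 assumed; compile with e.g. gcc -mssse3):

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <tmmintrin.h>  /* SSSE3: _mm_shuffle_epi8 (vpshufb) */

/* Old pattern: shift byte 3 of the round key down to byte 0 (vpsrldq),
 * then broadcast byte 0 with a pshufb whose mask is all zeroes.
 * Two dependent instructions per broadcast byte. */
static __m128i broadcast_byte3_old(__m128i key)
{
    __m128i t = _mm_srli_si128(key, 3);               /* vpsrldq $3 */
    return _mm_shuffle_epi8(t, _mm_setzero_si128());  /* vpshufb with zero mask */
}

/* New pattern: one pshufb with a constant mask whose every byte is the
 * wanted index, analogous to the .Lbyte_threes constant added by the patch. */
static __m128i broadcast_byte3_new(__m128i key)
{
    return _mm_shuffle_epi8(key, _mm_set1_epi8(3));   /* vpshufb .Lbyte_threes */
}

int main(void)
{
    /* 64-bit round key sits in the low half of an XMM register, as after vmovq. */
    __m128i key = _mm_set_epi64x(0, 0x0807060504030201ULL);
    uint8_t a[16], b[16];

    _mm_storeu_si128((__m128i *)a, broadcast_byte3_old(key));
    _mm_storeu_si128((__m128i *)b, broadcast_byte3_new(key));

    /* Both variants must yield sixteen copies of key byte 3 (0x04 here). */
    printf("old[0]=0x%02x new[0]=0x%02x equal=%d\n",
           a[0], b[0], memcmp(a, b, 16) == 0);
    return 0;
}

The single-vpshufb form removes one instruction per broadcast and breaks the dependency on the shifted intermediate, which is where the ~3% ECB speedup above comes from.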
diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S
index 5ec33b9b..76e62ea8 100644
--- a/cipher/camellia-aesni-avx-amd64.S
+++ b/cipher/camellia-aesni-avx-amd64.S
@@ -1,6 +1,6 @@
 /* camellia-avx-aesni-amd64.S - AES-NI/AVX implementation of Camellia cipher
  *
- * Copyright (C) 2013-2015,2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2013-2015,2020,2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -121,25 +121,14 @@
 	filter_8bit(x2, t2, t3, t7, t6); \
 	filter_8bit(x5, t2, t3, t7, t6); \
 	\
-	vpxor t6, t6, t6; \
 	vmovq key, t0; \
 	\
 	/* postfilter sbox 2 */ \
 	filter_8bit(x1, t4, t5, t7, t2); \
 	filter_8bit(x4, t4, t5, t7, t2); \
 	\
-	vpsrldq $5, t0, t5; \
-	vpsrldq $1, t0, t1; \
-	vpsrldq $2, t0, t2; \
-	vpsrldq $3, t0, t3; \
-	vpsrldq $4, t0, t4; \
-	vpshufb t6, t0, t0; \
-	vpshufb t6, t1, t1; \
-	vpshufb t6, t2, t2; \
-	vpshufb t6, t3, t3; \
-	vpshufb t6, t4, t4; \
-	vpsrldq $2, t5, t7; \
-	vpshufb t6, t7, t7; \
+	vpshufb .Lbyte_threes rRIP, t0, t3; \
+	vpshufb .Lbyte_twos rRIP, t0, t2; \
 	\
 	/* P-function */ \
 	vpxor x5, x0, x0; \
@@ -147,16 +136,23 @@
 	vpxor x7, x2, x2; \
 	vpxor x4, x3, x3; \
 	\
+	vpshufb .Lbyte_ones rRIP, t0, t1; \
+	vpshufb .Lbyte_sevens rRIP, t0, t7; \
+	\
 	vpxor x2, x4, x4; \
 	vpxor x3, x5, x5; \
 	vpxor x0, x6, x6; \
 	vpxor x1, x7, x7; \
 	\
+	vpshufb .Lbyte_sixs rRIP, t0, t6; \
+	vpshufb .Lbyte_fives rRIP, t0, t5; \
 	vpxor x7, x0, x0; \
 	vpxor x4, x1, x1; \
 	vpxor x5, x2, x2; \
 	vpxor x6, x3, x3; \
 	\
+	vpshufb .Lbyte_fours rRIP, t0, t4; \
+	\
 	vpxor x3, x4, x4; \
 	vpxor x0, x5, x5; \
 	vpxor x1, x6, x6; \
@@ -165,15 +161,14 @@
 	/* Add key material and result to CD (x becomes new CD) */ \
 	\
 	vpxor t3, x4, x4; \
+	vpxor t3, t3, t3; \
 	vpxor 0 * 16(mem_cd), x4, x4; \
 	\
+	vpshufb t3, t0, t0; \
+	\
 	vpxor t2, x5, x5; \
 	vpxor 1 * 16(mem_cd), x5, x5; \
 	\
-	vpsrldq $1, t5, t3; \
-	vpshufb t6, t5, t5; \
-	vpshufb t6, t3, t6; \
-	\
 	vpxor t1, x6, x6; \
 	vpxor 2 * 16(mem_cd), x6, x6; \
 	\
@@ -294,12 +289,9 @@
 	vpxor tt0, tt0, tt0; \
 	vmovd kll, t0; \
 	vpshufb tt0, t0, t3; \
-	vpsrldq $1, t0, t0; \
-	vpshufb tt0, t0, t2; \
-	vpsrldq $1, t0, t0; \
-	vpshufb tt0, t0, t1; \
-	vpsrldq $1, t0, t0; \
-	vpshufb tt0, t0, t0; \
+	vpshufb .Lbyte_ones rRIP, t0, t2; \
+	vpshufb .Lbyte_twos rRIP, t0, t1; \
+	vpshufb .Lbyte_threes rRIP, t0, t0; \
 	\
 	vpand l0, t0, t0; \
 	vpand l1, t1, t1; \
@@ -325,12 +317,9 @@
 	\
 	vmovd krr, t0; \
 	vpshufb tt0, t0, t3; \
-	vpsrldq $1, t0, t0; \
-	vpshufb tt0, t0, t2; \
-	vpsrldq $1, t0, t0; \
-	vpshufb tt0, t0, t1; \
-	vpsrldq $1, t0, t0; \
-	vpshufb tt0, t0, t0; \
+	vpshufb .Lbyte_ones rRIP, t0, t2; \
+	vpshufb .Lbyte_twos rRIP, t0, t1; \
+	vpshufb .Lbyte_threes rRIP, t0, t0; \
 	\
 	vpor 4 * 16(r), t0, t0; \
 	vpor 5 * 16(r), t1, t1; \
@@ -353,12 +342,9 @@
 	 */ \
 	vmovd krl, t0; \
 	vpshufb tt0, t0, t3; \
-	vpsrldq $1, t0, t0; \
-	vpshufb tt0, t0, t2; \
-	vpsrldq $1, t0, t0; \
-	vpshufb tt0, t0, t1; \
-	vpsrldq $1, t0, t0; \
-	vpshufb tt0, t0, t0; \
+	vpshufb .Lbyte_ones rRIP, t0, t2; \
+	vpshufb .Lbyte_twos rRIP, t0, t1; \
+	vpshufb .Lbyte_threes rRIP, t0, t0; \
 	\
 	vpand 0 * 16(r), t0, t0; \
 	vpand 1 * 16(r), t1, t1; \
@@ -384,12 +370,9 @@
 	\
 	vmovd klr, t0; \
 	vpshufb tt0, t0, t3; \
-	vpsrldq $1, t0, t0; \
-	vpshufb tt0, t0, t2; \
-	vpsrldq $1, t0, t0; \
-	vpshufb tt0, t0, t1; \
-	vpsrldq $1, t0, t0; \
-	vpshufb tt0, t0, t0; \
+	vpshufb .Lbyte_ones rRIP, t0, t2; \
+	vpshufb .Lbyte_twos rRIP, t0, t1; \
+	vpshufb .Lbyte_threes rRIP, t0, t0; \
 	\
 	vpor l4, t0, t0; \
 	vpor l5, t1, t1; \
@@ -637,6 +620,28 @@ _camellia_aesni_avx_data:
 	.long 0x80808080
 	.long 0x80808080
 
+.Lbyte_ones:
+	.quad 1 * 0x0101010101010101
+	.quad 1 * 0x0101010101010101
+.Lbyte_twos:
+	.quad 2 * 0x0101010101010101
+	.quad 2 * 0x0101010101010101
+.Lbyte_threes:
+	.quad 3 * 0x0101010101010101
+	.quad 3 * 0x0101010101010101
+.Lbyte_fours:
+	.quad 4 * 0x0101010101010101
+	.quad 4 * 0x0101010101010101
+.Lbyte_fives:
+	.quad 5 * 0x0101010101010101
+	.quad 5 * 0x0101010101010101
+.Lbyte_sixs:
+	.quad 6 * 0x0101010101010101
+	.quad 6 * 0x0101010101010101
+.Lbyte_sevens:
+	.quad 7 * 0x0101010101010101
+	.quad 7 * 0x0101010101010101
+
 /* For CTR-mode IV byteswap */
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0