diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2023-03-01 21:04:51 +0200 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2023-03-02 12:15:48 +0200 |
commit | 6c05c808e4e848964f67157e80f1835c5146e2bc (patch) | |
tree | 960a7317a3fdc81d76ca9fc4d4151d8ec1f55f1b /cipher/camellia-simd128.h | |
parent | 652598096325c2478d7d033585dadc13bec6fb1d (diff) | |
download | libgcrypt-6c05c808e4e848964f67157e80f1835c5146e2bc.tar.gz |
camellia-simd128: faster sbox filtering with uint8 right shift
* cipher/camellia-simd128.h (if_vpsrlb128)
(if_not_vpsrlb128): New.
(filter_8bit): Use 'vpsrlb128' when available on target
architecture (PowerPC and AArch64).
--
Benchmark on POWER9:
Before:
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 3.26 ns/B 292.8 MiB/s 7.49 c/B
ECB dec | 3.29 ns/B 290.0 MiB/s 7.56 c/B
After (~2% faster):
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 3.16 ns/B 301.4 MiB/s 7.28 c/B
ECB dec | 3.19 ns/B 298.7 MiB/s 7.34 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/camellia-simd128.h')
-rw-r--r-- | cipher/camellia-simd128.h | 11 |
1 files changed, 9 insertions, 2 deletions
diff --git a/cipher/camellia-simd128.h b/cipher/camellia-simd128.h
index 9cb7b987..6b44961f 100644
--- a/cipher/camellia-simd128.h
+++ b/cipher/camellia-simd128.h
@@ -91,6 +91,8 @@ asm_sbox_be(uint8x16_t b)
         o = (__m128i)vec_sld((uint8x16_t)a, \
                              (uint8x16_t)__tmp, (s) & 15);})
 
+#define if_vpsrlb128(...) __VA_ARGS__
+#define if_not_vpsrlb128(...) /*_*/
 #define vpsrl_byte_128(s, a, o) vpsrlb128(s, a, o)
 #define vpsll_byte_128(s, a, o) vpsllb128(s, a, o)
 
@@ -182,6 +184,8 @@ static const uint8x16_t shift_row =
         o = (__m128i)vextq_u8((uint8x16_t)__tmp, \
                               (uint8x16_t)a, (16 - (s)) & 15);})
 
+#define if_vpsrlb128(...) __VA_ARGS__
+#define if_not_vpsrlb128(...) /*_*/
 #define vpsrl_byte_128(s, a, o) vpsrlb128(s, a, o)
 #define vpsll_byte_128(s, a, o) vpsllb128(s, a, o)
 
@@ -253,6 +257,8 @@ static const uint8x16_t shift_row =
 #define vpsrldq128(s, a, o) (o = _mm_srli_si128(a, s))
 #define vpslldq128(s, a, o) (o = _mm_slli_si128(a, s))
 
+#define if_vpsrlb128(...) /*_*/
+#define if_not_vpsrlb128(...) __VA_ARGS__
 #define vpsrl_byte_128(s, a, o) vpsrld128(s, a, o)
 #define vpsll_byte_128(s, a, o) vpslld128(s, a, o)
 
@@ -309,8 +315,9 @@ static const uint8x16_t shift_row =
  **********************************************************************/
 #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
         vpand128(x, mask4bit, tmp0); \
-        vpandn128(x, mask4bit, x); \
-        vpsrl_byte_128(4, x, x); \
+        if_vpsrlb128(vpsrlb128(4, x, x)); \
+        if_not_vpsrlb128(vpandn128(x, mask4bit, x)); \
+        if_not_vpsrlb128(vpsrld128(4, x, x)); \
         \
         vpshufb128(tmp0, lo_t, tmp0); \
         vpshufb128(x, hi_t, x); \