summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2023-03-01 21:04:51 +0200
committer: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2023-03-02 12:15:48 +0200
commit: 6c05c808e4e848964f67157e80f1835c5146e2bc (patch)
tree: 960a7317a3fdc81d76ca9fc4d4151d8ec1f55f1b
parent: 652598096325c2478d7d033585dadc13bec6fb1d (diff)
download: libgcrypt-6c05c808e4e848964f67157e80f1835c5146e2bc.tar.gz
camellia-simd128: faster sbox filtering with uint8 right shift
* cipher/camellia-simd128.h (if_vpsrlb128) (if_not_vpsrlb128): New.
(filter_8bit): Use 'vpsrlb128' when available on target architecture
(PowerPC and AArch64).
--

Benchmark on POWER9:

Before:
 CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      3.26 ns/B     292.8 MiB/s      7.49 c/B
        ECB dec |      3.29 ns/B     290.0 MiB/s      7.56 c/B

After (~2% faster):
 CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      3.16 ns/B     301.4 MiB/s      7.28 c/B
        ECB dec |      3.19 ns/B     298.7 MiB/s      7.34 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
-rw-r--r-- cipher/camellia-simd128.h 11
1 files changed, 9 insertions, 2 deletions
diff --git a/cipher/camellia-simd128.h b/cipher/camellia-simd128.h
index 9cb7b987..6b44961f 100644
--- a/cipher/camellia-simd128.h
+++ b/cipher/camellia-simd128.h
@@ -91,6 +91,8 @@ asm_sbox_be(uint8x16_t b)
o = (__m128i)vec_sld((uint8x16_t)a, \
(uint8x16_t)__tmp, (s) & 15);})
+#define if_vpsrlb128(...) __VA_ARGS__
+#define if_not_vpsrlb128(...) /*_*/
#define vpsrl_byte_128(s, a, o) vpsrlb128(s, a, o)
#define vpsll_byte_128(s, a, o) vpsllb128(s, a, o)
@@ -182,6 +184,8 @@ static const uint8x16_t shift_row =
o = (__m128i)vextq_u8((uint8x16_t)__tmp, \
(uint8x16_t)a, (16 - (s)) & 15);})
+#define if_vpsrlb128(...) __VA_ARGS__
+#define if_not_vpsrlb128(...) /*_*/
#define vpsrl_byte_128(s, a, o) vpsrlb128(s, a, o)
#define vpsll_byte_128(s, a, o) vpsllb128(s, a, o)
@@ -253,6 +257,8 @@ static const uint8x16_t shift_row =
#define vpsrldq128(s, a, o) (o = _mm_srli_si128(a, s))
#define vpslldq128(s, a, o) (o = _mm_slli_si128(a, s))
+#define if_vpsrlb128(...) /*_*/
+#define if_not_vpsrlb128(...) __VA_ARGS__
#define vpsrl_byte_128(s, a, o) vpsrld128(s, a, o)
#define vpsll_byte_128(s, a, o) vpslld128(s, a, o)
@@ -309,8 +315,9 @@ static const uint8x16_t shift_row =
**********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
vpand128(x, mask4bit, tmp0); \
- vpandn128(x, mask4bit, x); \
- vpsrl_byte_128(4, x, x); \
+ if_vpsrlb128(vpsrlb128(4, x, x)); \
+ if_not_vpsrlb128(vpandn128(x, mask4bit, x)); \
+ if_not_vpsrlb128(vpsrld128(4, x, x)); \
\
vpshufb128(tmp0, lo_t, tmp0); \
vpshufb128(x, hi_t, x); \