From 3c98ae9cb60a8a72d3fa6641e59775f98ec78786 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Mon, 13 Mar 2023 19:52:43 +0200 Subject: camellia-gfni: use GFNI for uint8 right shift in FLS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * cipher/camellia-aesni-avx2-amd64.h (IF_GFNI, IF_NOT_GFNI): New. [CAMELLIA_GFNI_BUILD] (rol32_1_32): Add GFNI variant which uses vgf2p8affineqb for uint8 right shift by 7. (fls32): Load 'right shift by 7' bit-matrix on GFNI build. [CAMELLIA_GFNI_BUILD] (.Lright_shift_by_7): New. * cipher/camellia-gfni-avx512-amd64.S (clear_regs): Don't clear %k1. (rol32_1_64): Use vgf2p8affineqb for uint8 right shift by 7. (fls64): Adjust for rol32_1_64 changes. (.Lbyte_ones): Remove. (.Lright_shift_by_7): New. (_gcry_camellia_gfni_avx512_ctr_enc): Clear %k1 after use. -- Benchmark on Intel Core i3-1115G4: Before: CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.194 ns/B 4920 MiB/s 0.794 c/B 4096±4 ECB dec | 0.194 ns/B 4916 MiB/s 0.793 c/B 4089 After (~1.7% faster) CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.190 ns/B 5008 MiB/s 0.780 c/B 4096±3 ECB dec | 0.191 ns/B 5002 MiB/s 0.781 c/B 4096±3 Signed-off-by: Jussi Kivilinna --- cipher/camellia-aesni-avx2-amd64.h | 43 ++++++++++++++++++++++++++++++++++++- cipher/camellia-gfni-avx512-amd64.S | 37 +++++++++++++++---------------- 2 files changed, 61 insertions(+), 19 deletions(-) (limited to 'cipher') diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h index 003c4496..dff8b386 100644 --- a/cipher/camellia-aesni-avx2-amd64.h +++ b/cipher/camellia-aesni-avx2-amd64.h @@ -73,6 +73,14 @@ # define IF_VAES(...) #endif +#ifdef CAMELLIA_GFNI_BUILD +# define IF_GFNI(...) __VA_ARGS__ +# define IF_NOT_GFNI(...) +#else +# define IF_GFNI(...) +# define IF_NOT_GFNI(...) 
__VA_ARGS__ +#endif + /********************************************************************** GFNI helper macros and constants **********************************************************************/ @@ -459,6 +467,26 @@ * OUT: * v0..3: (IN <<< 1) */ +#ifdef CAMELLIA_GFNI_BUILD +#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, right_shift_by_7) \ + vgf2p8affineqb $0, right_shift_by_7, v0, t0; \ + vpaddb v0, v0, v0; \ + \ + vgf2p8affineqb $0, right_shift_by_7, v1, t1; \ + vpaddb v1, v1, v1; \ + \ + vgf2p8affineqb $0, right_shift_by_7, v2, t2; \ + vpaddb v2, v2, v2; \ + \ + vpor t0, v1, v1; \ + \ + vgf2p8affineqb $0, right_shift_by_7, v3, t0; \ + vpaddb v3, v3, v3; \ + \ + vpor t1, v2, v2; \ + vpor t2, v3, v3; \ + vpor t0, v0, v0; +#else #define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \ vpcmpgtb v0, zero, t0; \ vpaddb v0, v0, v0; \ @@ -481,6 +509,7 @@ vpor t1, v2, v2; \ vpor t2, v3, v3; \ vpor t0, v0, v0; +#endif /* * IN: @@ -496,7 +525,8 @@ * t0 &= ll; \ * lr ^= rol32(t0, 1); \ */ \ - vpxor tt0, tt0, tt0; \ + IF_NOT_GFNI(vpxor tt0, tt0, tt0); \ + IF_GFNI(vpbroadcastq .Lright_shift_by_7 rRIP, tt0); \ vpbroadcastb 0+kll, t3; \ vpbroadcastb 1+kll, t2; \ vpbroadcastb 2+kll, t1; \ @@ -867,6 +897,17 @@ ELF(.type FUNC_NAME(_constants),@object;) BV8(0, 0, 0, 1, 1, 1, 0, 0), BV8(0, 0, 0, 0, 0, 0, 0, 1)) +/* Bit-matrix for right shifting uint8_t values in vector by 7. 
*/ +.Lright_shift_by_7: + .quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0)) + #else /* CAMELLIA_GFNI_BUILD */ /* diff --git a/cipher/camellia-gfni-avx512-amd64.S b/cipher/camellia-gfni-avx512-amd64.S index b676379f..643eed3e 100644 --- a/cipher/camellia-gfni-avx512-amd64.S +++ b/cipher/camellia-gfni-avx512-amd64.S @@ -105,7 +105,6 @@ clear_vec4(%ymm19, %ymm23, %ymm27, %ymm31) #define clear_regs() \ - kxorq %k1, %k1, %k1; \ vzeroall; \ clear_zmm16_zmm31() @@ -307,22 +306,18 @@ * v0..3: (IN << 1) - * t0, t1, t2, zero: (IN >> 7) + * t0, t1, t2, t3: (IN >> 7) */ -#define rol32_1_64(v0, v1, v2, v3, t0, t1, t2, zero, one) \ - vpcmpltb zero, v0, %k1; \ +#define rol32_1_64(v0, v1, v2, v3, t0, t1, t2, t3, right_shift_by_7) \ + vgf2p8affineqb $0, right_shift_by_7, v0, t0; \ vpaddb v0, v0, v0; \ - vpaddb one, zero, t0{%k1}{z}; \ \ - vpcmpltb zero, v1, %k1; \ + vgf2p8affineqb $0, right_shift_by_7, v1, t1; \ vpaddb v1, v1, v1; \ - vpaddb one, zero, t1{%k1}{z}; \ \ - vpcmpltb zero, v2, %k1; \ + vgf2p8affineqb $0, right_shift_by_7, v2, t2; \ vpaddb v2, v2, v2; \ - vpaddb one, zero, t2{%k1}{z}; \ \ - vpcmpltb zero, v3, %k1; \ - vpaddb v3, v3, v3; \ - vpaddb one, zero, zero{%k1}{z}; + vgf2p8affineqb $0, right_shift_by_7, v3, t3; \ + vpaddb v3, v3, v3; /* * IN: @@ -338,8 +333,7 @@ * t0 &= ll; \ * lr ^= rol32(t0, 1); \ */ \ - vpbroadcastq .Lbyte_ones rRIP, tmp; \ - vpxor tt3##_y, tt3##_y, tt3##_y; \ + vpbroadcastq .Lright_shift_by_7 rRIP, tmp; \ vpbroadcastb 0+kll, t3; \ vpbroadcastb 1+kll, t2; \ vpbroadcastb 2+kll, t1; \ @@ -360,7 +354,6 @@ vmovdqu64 l6, l##_6; \ vpternlogq $0x96, tt3, t3, l7; \ vmovdqu64 l7, l##_7; \ - vpxor tt3##_y, tt3##_y, tt3##_y; \ \ /* \ * t2 = krr; \ @@ -399,7 +392,6 @@ vpternlogq $0x96, tt1, t1, r##_5; \ vpternlogq $0x96, tt0, t2, r##_6; \ vpternlogq $0x96, tt3, t3, r##_7; \ - vpxor
tt3##_y, tt3##_y, tt3##_y; \ \ /* \ * t0 = klr; \ @@ -596,9 +588,6 @@ ELF(.type _gcry_camellia_gfni_avx512__constants,@object;) .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -.Lbyte_ones: - .quad 0x0101010101010101 - /* Pre-filters and post-filters bit-matrixes for Camellia sboxes s1, s2, s3 * and s4. * See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48. @@ -663,6 +652,17 @@ ELF(.type _gcry_camellia_gfni_avx512__constants,@object;) BV8(0, 0, 0, 1, 1, 1, 0, 0), BV8(0, 0, 0, 0, 0, 0, 0, 1)) +/* Bit-matrix for right shifting uint8_t values in vector by 7. */ +.Lright_shift_by_7: + .quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0)) + /* CTR byte addition constants */ .align 64 .Lbige_addb_0_1: @@ -904,6 +904,7 @@ _gcry_camellia_gfni_avx512_ctr_enc: add_le128(%zmm2, %zmm6, %zmm24, %zmm25); /* +52... */ add_le128(%zmm1, %zmm5, %zmm24, %zmm25); /* +56... */ add_le128(%zmm0, %zmm4, %zmm24, %zmm25); /* +60... */ + kxorq %k1, %k1, %k1; .align 4 .Lload_ctr_done: -- cgit v1.2.1