-rw-r--r--   cipher/cipher-gcm-intel-pclmul.c | 809
-rw-r--r--   cipher/cipher-gcm.c              |  15
-rw-r--r--   cipher/cipher-internal.h         |  11
3 files changed, 724 insertions(+), 111 deletions(-)
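This patch adds a VPCLMUL/AVX2 GHASH path and reworks the key setup: _gcry_ghash_setup_intel_pclmul now computes only H¹…H⁴ eagerly and defers the larger H-power tables used by the 8-block and 16-block aggregated loops until an input large enough to need them arrives, tracking that state in the new hw_impl_flags bits. A minimal standalone C sketch of the resulting dispatch flow follows (flag names are taken from the patch; the context struct and loop bodies are illustrative placeholders, not libgcrypt internals):

/* Sketch only: mirrors the lazy-initialization flow introduced below. */
#include <stddef.h>

#define GCM_INTEL_USE_VPCLMUL_AVX2         (1 << 0)
#define GCM_INTEL_AGGR8_TABLE_INITIALIZED  (1 << 1)
#define GCM_INTEL_AGGR16_TABLE_INITIALIZED (1 << 2)

struct gcm_ctx { unsigned int hw_impl_flags; };   /* stand-in for c->u_mode.gcm */

/* Placeholder table builders; the real ones compute H^5..H^8 and H^9..H^16
   into gcm_table before setting the "done" bit. */
static void setup_aggr8 (struct gcm_ctx *c)
{ c->hw_impl_flags |= GCM_INTEL_AGGR8_TABLE_INITIALIZED; }
static void setup_aggr16 (struct gcm_ctx *c)
{ c->hw_impl_flags |= GCM_INTEL_AGGR16_TABLE_INITIALIZED; }

static size_t
ghash_blocks (struct gcm_ctx *c, size_t nblocks)
{
  if (nblocks >= 16 && (c->hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX2))
    {
      if (!(c->hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED))
        setup_aggr8 (c);
      if (!(c->hw_impl_flags & GCM_INTEL_AGGR16_TABLE_INITIALIZED))
        setup_aggr16 (c);
      while (nblocks >= 16)
        nblocks -= 16;            /* 16-block VPCLMUL/AVX2 aggregated loop */
    }
  if (nblocks >= 8)
    {
      if (!(c->hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED))
        setup_aggr8 (c);
      while (nblocks >= 8)
        nblocks -= 8;             /* 8-block PCLMUL aggregated loop */
    }
  return nblocks;                 /* remainder: 4-block and single-block tails */
}

The real code keeps this shape inside _gcry_ghash_intel_pclmul and _gcry_polyval_intel_pclmul, with the per-block loops written as inline PCLMUL/VPCLMUL assembly and the table builders implemented as ghash_setup_aggr8 and ghash_setup_aggr8/aggr16_avx2.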
diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c index daf807d0..b7324e8f 100644 --- a/cipher/cipher-gcm-intel-pclmul.c +++ b/cipher/cipher-gcm-intel-pclmul.c @@ -1,6 +1,6 @@ /* cipher-gcm-intel-pclmul.c - Intel PCLMUL accelerated Galois Counter Mode * implementation - * Copyright (C) 2013-2014,2019 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * Copyright (C) 2013-2014,2019,2022 Jussi Kivilinna <jussi.kivilinna@iki.fi> * * This file is part of Libgcrypt. * @@ -49,12 +49,18 @@ #define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE +#define GCM_INTEL_USE_VPCLMUL_AVX2 (1 << 0) +#define GCM_INTEL_AGGR8_TABLE_INITIALIZED (1 << 1) +#define GCM_INTEL_AGGR16_TABLE_INITIALIZED (1 << 2) + + /* Intel PCLMUL ghash based on white paper: "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis. */ -static ASM_FUNC_ATTR_INLINE void reduction(void) +static ASM_FUNC_ATTR_INLINE +void reduction(void) { /* input: <xmm1:xmm3> */ @@ -83,7 +89,8 @@ static ASM_FUNC_ATTR_INLINE void reduction(void) ::: "memory" ); } -static ASM_FUNC_ATTR_INLINE void gfmul_pclmul(void) +static ASM_FUNC_ATTR_INLINE +void gfmul_pclmul(void) { /* Input: XMM0 and XMM1, Output: XMM1. Input XMM0 stays unmodified. Input must be converted to little-endian. @@ -358,12 +365,12 @@ gfmul_pclmul_aggr4_le(const void *buf, const void *h_1, const void *h_table) \ "pshufd $78, %%xmm8, %%xmm11\n\t" \ "pshufd $78, %%xmm5, %%xmm7\n\t" \ - "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ \ - "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 4:b0+b1 */ \ + "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 2:a0+a1 */ \ + "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 2:b0+b1 */ \ "movdqa %%xmm8, %%xmm6\n\t" \ - "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 4:a0*b0 */ \ - "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 4:a1*b1 */ \ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 4:(a0+a1)*(b0+b1) */ \ + "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 2:a0*b0 */ \ + "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 2:a1*b1 */ \ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 2:(a0+a1)*(b0+b1) */ \ \ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4+5+6+7+8:a0*b0 */ \ "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4+5+6+7+8:a1*b1 */ \ @@ -371,16 +378,16 @@ gfmul_pclmul_aggr4_le(const void *buf, const void *h_1, const void *h_table) \ "pshufd $78, %%xmm0, %%xmm11\n\t" \ "pshufd $78, %%xmm2, %%xmm7\n\t" \ - "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */ \ - "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ \ + "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 1:a0+a1 */ \ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 1:b0+b1 */ \ "movdqa %%xmm0, %%xmm6\n\t" \ - "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ \ - "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ \ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ \ + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 1:a0*b0 */ \ + "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 1:a1*b1 */ \ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 1:(a0+a1)*(b0+b1) */ \ \ - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+3+4+5+6+7+8:a0*b0 */ \ - "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+3+4+5+6+7+8:a1*b1 */ \ - "pxor %%xmm7, %%xmm4\n\t"/* xmm4 holds 1+2+3+3+4+5+6+7+8:(a0+a1)*(b0+b1) */\ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+4+5+6+7+8:a0*b0 */ \ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+4+5+6+7+8:a1*b1 */ \ + "pxor %%xmm7, %%xmm4\n\t"/* xmm4 holds 
1+2+3+4+5+6+7+8:(a0+a1)*(b0+b1) */ \ \ /* aggregated reduction... */ \ "movdqa %%xmm3, %%xmm5\n\t" \ @@ -432,14 +439,409 @@ gfmul_pclmul_aggr8_le(const void *buf, const void *h_table) reduction(); } -#endif -static ASM_FUNC_ATTR_INLINE void gcm_lsh(void *h, unsigned int hoffs) +#ifdef GCM_USE_INTEL_VPCLMUL_AVX2 + +#define GFMUL_AGGR16_ASM_VPCMUL_AVX2(be_to_le) \ + /* perform clmul and merge results... */ \ + "vmovdqu 0*16(%[buf]), %%ymm5\n\t" \ + "vmovdqu 2*16(%[buf]), %%ymm2\n\t" \ + be_to_le("vpshufb %%ymm15, %%ymm5, %%ymm5\n\t") /* be => le */ \ + be_to_le("vpshufb %%ymm15, %%ymm2, %%ymm2\n\t") /* be => le */ \ + "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" \ + \ + "vpshufd $78, %%ymm0, %%ymm5\n\t" \ + "vpshufd $78, %%ymm1, %%ymm4\n\t" \ + "vpxor %%ymm0, %%ymm5, %%ymm5\n\t" /* ymm5 holds 15|16:a0+a1 */ \ + "vpxor %%ymm1, %%ymm4, %%ymm4\n\t" /* ymm4 holds 15|16:b0+b1 */ \ + "vpclmulqdq $0, %%ymm1, %%ymm0, %%ymm3\n\t" /* ymm3 holds 15|16:a0*b0 */ \ + "vpclmulqdq $17, %%ymm0, %%ymm1, %%ymm1\n\t" /* ymm1 holds 15|16:a1*b1 */ \ + "vpclmulqdq $0, %%ymm5, %%ymm4, %%ymm4\n\t" /* ymm4 holds 15|16:(a0+a1)*(b0+b1) */ \ + \ + "vmovdqu %[h1_h2], %%ymm0\n\t" \ + \ + "vpshufd $78, %%ymm13, %%ymm14\n\t" \ + "vpshufd $78, %%ymm2, %%ymm7\n\t" \ + "vpxor %%ymm13, %%ymm14, %%ymm14\n\t" /* ymm14 holds 13|14:a0+a1 */ \ + "vpxor %%ymm2, %%ymm7, %%ymm7\n\t" /* ymm7 holds 13|14:b0+b1 */ \ + "vpclmulqdq $0, %%ymm2, %%ymm13, %%ymm6\n\t" /* ymm6 holds 13|14:a0*b0 */ \ + "vpclmulqdq $17, %%ymm13, %%ymm2, %%ymm2\n\t" /* ymm2 holds 13|14:a1*b1 */ \ + "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 13|14:(a0+a1)*(b0+b1) */\ + \ + "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 13+15|14+16:a0*b0 */ \ + "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" /* ymm1 holds 13+15|14+16:a1*b1 */ \ + "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 13+15|14+16:(a0+a1)*(b0+b1) */ \ + \ + "vmovdqu 4*16(%[buf]), %%ymm5\n\t" \ + "vmovdqu 6*16(%[buf]), %%ymm2\n\t" \ + be_to_le("vpshufb %%ymm15, %%ymm5, %%ymm5\n\t") /* be => le */ \ + be_to_le("vpshufb %%ymm15, %%ymm2, %%ymm2\n\t") /* be => le */ \ + \ + "vpshufd $78, %%ymm12, %%ymm14\n\t" \ + "vpshufd $78, %%ymm5, %%ymm7\n\t" \ + "vpxor %%ymm12, %%ymm14, %%ymm14\n\t" /* ymm14 holds 11|12:a0+a1 */ \ + "vpxor %%ymm5, %%ymm7, %%ymm7\n\t" /* ymm7 holds 11|12:b0+b1 */ \ + "vpclmulqdq $0, %%ymm5, %%ymm12, %%ymm6\n\t" /* ymm6 holds 11|12:a0*b0 */ \ + "vpclmulqdq $17, %%ymm12, %%ymm5, %%ymm5\n\t" /* ymm5 holds 11|12:a1*b1 */ \ + "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 11|12:(a0+a1)*(b0+b1) */\ + \ + "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 11+13+15|12+14+16:a0*b0 */ \ + "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" /* ymm1 holds 11+13+15|12+14+16:a1*b1 */ \ + "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 11+13+15|12+14+16:(a0+a1)*(b0+b1) */\ + \ + "vpshufd $78, %%ymm11, %%ymm14\n\t" \ + "vpshufd $78, %%ymm2, %%ymm7\n\t" \ + "vpxor %%ymm11, %%ymm14, %%ymm14\n\t" /* ymm14 holds 9|10:a0+a1 */ \ + "vpxor %%ymm2, %%ymm7, %%ymm7\n\t" /* ymm7 holds 9|10:b0+b1 */ \ + "vpclmulqdq $0, %%ymm2, %%ymm11, %%ymm6\n\t" /* ymm6 holds 9|10:a0*b0 */ \ + "vpclmulqdq $17, %%ymm11, %%ymm2, %%ymm2\n\t" /* ymm2 holds 9|10:a1*b1 */ \ + "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 9|10:(a0+a1)*(b0+b1) */ \ + \ + "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 9+11+…+15|10+12+…+16:a0*b0 */ \ + "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" /* ymm1 holds 9+11+…+15|10+12+…+16:a1*b1 */ \ + "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 9+11+…+15|10+12+…+16:(a0+a1)*(b0+b1) */\ + \ + "vmovdqu 8*16(%[buf]), 
%%ymm5\n\t" \ + "vmovdqu 10*16(%[buf]), %%ymm2\n\t" \ + be_to_le("vpshufb %%ymm15, %%ymm5, %%ymm5\n\t") /* be => le */ \ + be_to_le("vpshufb %%ymm15, %%ymm2, %%ymm2\n\t") /* be => le */ \ + \ + "vpshufd $78, %%ymm10, %%ymm14\n\t" \ + "vpshufd $78, %%ymm5, %%ymm7\n\t" \ + "vpxor %%ymm10, %%ymm14, %%ymm14\n\t" /* ymm14 holds 7|8:a0+a1 */ \ + "vpxor %%ymm5, %%ymm7, %%ymm7\n\t" /* ymm7 holds 7|8:b0+b1 */ \ + "vpclmulqdq $0, %%ymm5, %%ymm10, %%ymm6\n\t" /* ymm6 holds 7|8:a0*b0 */ \ + "vpclmulqdq $17, %%ymm10, %%ymm5, %%ymm5\n\t" /* ymm5 holds 7|8:a1*b1 */ \ + "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 7|8:(a0+a1)*(b0+b1) */ \ + \ + "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 7+9+…+15|8+10+…+16:a0*b0 */ \ + "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" /* ymm1 holds 7+9+…+15|8+10+…+16:a1*b1 */ \ + "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 7+9+…+15|8+10+…+16:(a0+a1)*(b0+b1) */\ + \ + "vpshufd $78, %%ymm9, %%ymm14\n\t" \ + "vpshufd $78, %%ymm2, %%ymm7\n\t" \ + "vpxor %%ymm9, %%ymm14, %%ymm14\n\t" /* ymm14 holds 5|6:a0+a1 */ \ + "vpxor %%ymm2, %%ymm7, %%ymm7\n\t" /* ymm7 holds 5|6:b0+b1 */ \ + "vpclmulqdq $0, %%ymm2, %%ymm9, %%ymm6\n\t" /* ymm6 holds 5|6:a0*b0 */ \ + "vpclmulqdq $17, %%ymm9, %%ymm2, %%ymm2\n\t" /* ymm2 holds 5|6:a1*b1 */ \ + "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 5|6:(a0+a1)*(b0+b1) */ \ + \ + "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 5+7+…+15|6+8+…+16:a0*b0 */ \ + "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" /* ymm1 holds 5+7+…+15|6+8+…+16:a1*b1 */ \ + "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 5+7+…+15|6+8+…+16:(a0+a1)*(b0+b1) */\ + \ + "vmovdqu 12*16(%[buf]), %%ymm5\n\t" \ + "vmovdqu 14*16(%[buf]), %%ymm2\n\t" \ + be_to_le("vpshufb %%ymm15, %%ymm5, %%ymm5\n\t") /* be => le */ \ + be_to_le("vpshufb %%ymm15, %%ymm2, %%ymm2\n\t") /* be => le */ \ + \ + "vpshufd $78, %%ymm8, %%ymm14\n\t" \ + "vpshufd $78, %%ymm5, %%ymm7\n\t" \ + "vpxor %%ymm8, %%ymm14, %%ymm14\n\t" /* ymm14 holds 3|4:a0+a1 */ \ + "vpxor %%ymm5, %%ymm7, %%ymm7\n\t" /* ymm7 holds 3|4:b0+b1 */ \ + "vpclmulqdq $0, %%ymm5, %%ymm8, %%ymm6\n\t" /* ymm6 holds 3|4:a0*b0 */ \ + "vpclmulqdq $17, %%ymm8, %%ymm5, %%ymm5\n\t" /* ymm5 holds 3|4:a1*b1 */ \ + "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 3|4:(a0+a1)*(b0+b1) */ \ + \ + "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 3+5+…+15|4+6+…+16:a0*b0 */ \ + "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" /* ymm1 holds 3+5+…+15|4+6+…+16:a1*b1 */ \ + "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 3+5+…+15|4+6+…+16:(a0+a1)*(b0+b1) */\ + \ + "vpshufd $78, %%ymm0, %%ymm14\n\t" \ + "vpshufd $78, %%ymm2, %%ymm7\n\t" \ + "vpxor %%ymm0, %%ymm14, %%ymm14\n\t" /* ymm14 holds 1|2:a0+a1 */ \ + "vpxor %%ymm2, %%ymm7, %%ymm7\n\t" /* ymm7 holds 1|2:b0+b1 */ \ + "vpclmulqdq $0, %%ymm2, %%ymm0, %%ymm6\n\t" /* ymm6 holds 1|2:a0*b0 */ \ + "vpclmulqdq $17, %%ymm0, %%ymm2, %%ymm2\n\t" /* ymm2 holds 1|2:a1*b1 */ \ + "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 1|2:(a0+a1)*(b0+b1) */ \ + \ + "vmovdqu %[h15_h16], %%ymm0\n\t" \ + \ + "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 1+3+…+15|2+4+…+16:a0*b0 */ \ + "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" /* ymm1 holds 1+3+…+15|2+4+…+16:a1*b1 */ \ + "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 1+3+…+15|2+4+…+16:(a0+a1)*(b0+b1) */\ + \ + /* aggregated reduction... 
*/ \ + "vpxor %%ymm1, %%ymm3, %%ymm5\n\t" /* ymm5 holds a0*b0+a1*b1 */ \ + "vpxor %%ymm5, %%ymm4, %%ymm4\n\t" /* ymm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ \ + "vpslldq $8, %%ymm4, %%ymm5\n\t" \ + "vpsrldq $8, %%ymm4, %%ymm4\n\t" \ + "vpxor %%ymm5, %%ymm3, %%ymm3\n\t" \ + "vpxor %%ymm4, %%ymm1, %%ymm1\n\t" /* <ymm1:xmm3> holds the result of the \ + carry-less multiplication of ymm0 \ + by ymm1 */ \ + \ + /* first phase of the reduction */ \ + "vpsllq $1, %%ymm3, %%ymm6\n\t" /* packed right shifting << 63 */ \ + "vpxor %%ymm3, %%ymm6, %%ymm6\n\t" \ + "vpsllq $57, %%ymm3, %%ymm5\n\t" /* packed right shifting << 57 */ \ + "vpsllq $62, %%ymm6, %%ymm6\n\t" /* packed right shifting << 62 */ \ + "vpxor %%ymm5, %%ymm6, %%ymm6\n\t" /* xor the shifted versions */ \ + "vpshufd $0x6a, %%ymm6, %%ymm5\n\t" \ + "vpshufd $0xae, %%ymm6, %%ymm6\n\t" \ + "vpxor %%ymm5, %%ymm3, %%ymm3\n\t" /* first phase of the reduction complete */ \ + \ + /* second phase of the reduction */ \ + "vpxor %%ymm3, %%ymm1, %%ymm1\n\t" /* xor the shifted versions */ \ + "vpsrlq $1, %%ymm3, %%ymm3\n\t" /* packed left shifting >> 1 */ \ + "vpxor %%ymm3, %%ymm6, %%ymm6\n\t" \ + "vpsrlq $1, %%ymm3, %%ymm3\n\t" /* packed left shifting >> 2 */ \ + "vpxor %%ymm3, %%ymm1, %%ymm1\n\t" \ + "vpsrlq $5, %%ymm3, %%ymm3\n\t" /* packed left shifting >> 7 */ \ + "vpxor %%ymm3, %%ymm6, %%ymm6\n\t" \ + "vpxor %%ymm6, %%ymm1, %%ymm1\n\t" /* the result is in ymm1 */ \ + \ + /* merge 128-bit halves */ \ + "vextracti128 $1, %%ymm1, %%xmm2\n\t" \ + "vpxor %%xmm2, %%xmm1, %%xmm1\n\t" + +static ASM_FUNC_ATTR_INLINE void +gfmul_vpclmul_avx2_aggr16(const void *buf, const void *h_table, + const u64 *h1_h2_h15_h16) +{ + /* Input: + Hx: YMM0, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13 + bemask: YMM15 + Hash: XMM1 + Output: + Hash: XMM1 + Inputs YMM0, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13 and YMM15 stay + unmodified. + */ + asm volatile (GFMUL_AGGR16_ASM_VPCMUL_AVX2(be_to_le) + : + : [buf] "r" (buf), + [h_table] "r" (h_table), + [h1_h2] "m" (h1_h2_h15_h16[0]), + [h15_h16] "m" (h1_h2_h15_h16[4]) + : "memory" ); +} + +static ASM_FUNC_ATTR_INLINE void +gfmul_vpclmul_avx2_aggr16_le(const void *buf, const void *h_table, + const u64 *h1_h2_h15_h16) +{ + /* Input: + Hx: YMM0, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13 + bemask: YMM15 + Hash: XMM1 + Output: + Hash: XMM1 + Inputs YMM0, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13 and YMM15 stay + unmodified. + */ + asm volatile (GFMUL_AGGR16_ASM_VPCMUL_AVX2(le_to_le) + : + : [buf] "r" (buf), + [h_table] "r" (h_table), + [h1_h2] "m" (h1_h2_h15_h16[0]), + [h15_h16] "m" (h1_h2_h15_h16[4]) + : "memory" ); +} + +static ASM_FUNC_ATTR_INLINE +void gfmul_pclmul_avx2(void) +{ + /* Input: YMM0 and YMM1, Output: YMM1. Input YMM0 stays unmodified. + Input must be converted to little-endian. + */ + asm volatile (/* gfmul, ymm0 has operator a and ymm1 has operator b. 
*/ + "vpshufd $78, %%ymm0, %%ymm2\n\t" + "vpshufd $78, %%ymm1, %%ymm4\n\t" + "vpxor %%ymm0, %%ymm2, %%ymm2\n\t" /* ymm2 holds a0+a1 */ + "vpxor %%ymm1, %%ymm4, %%ymm4\n\t" /* ymm4 holds b0+b1 */ + + "vpclmulqdq $0, %%ymm1, %%ymm0, %%ymm3\n\t" /* ymm3 holds a0*b0 */ + "vpclmulqdq $17, %%ymm0, %%ymm1, %%ymm1\n\t" /* ymm6 holds a1*b1 */ + "vpclmulqdq $0, %%ymm2, %%ymm4, %%ymm4\n\t" /* ymm4 holds (a0+a1)*(b0+b1) */ + + "vpxor %%ymm1, %%ymm3, %%ymm5\n\t" /* ymm5 holds a0*b0+a1*b1 */ + "vpxor %%ymm5, %%ymm4, %%ymm4\n\t" /* ymm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ + "vpslldq $8, %%ymm4, %%ymm5\n\t" + "vpsrldq $8, %%ymm4, %%ymm4\n\t" + "vpxor %%ymm5, %%ymm3, %%ymm3\n\t" + "vpxor %%ymm4, %%ymm1, %%ymm1\n\t" /* <ymm1:ymm3> holds the result of the + carry-less multiplication of ymm0 + by ymm1 */ + + /* first phase of the reduction */ + "vpsllq $1, %%ymm3, %%ymm6\n\t" /* packed right shifting << 63 */ + "vpxor %%ymm3, %%ymm6, %%ymm6\n\t" + "vpsllq $57, %%ymm3, %%ymm5\n\t" /* packed right shifting << 57 */ + "vpsllq $62, %%ymm6, %%ymm6\n\t" /* packed right shifting << 62 */ + "vpxor %%ymm5, %%ymm6, %%ymm6\n\t" /* xor the shifted versions */ + "vpshufd $0x6a, %%ymm6, %%ymm5\n\t" + "vpshufd $0xae, %%ymm6, %%ymm6\n\t" + "vpxor %%ymm5, %%ymm3, %%ymm3\n\t" /* first phase of the reduction complete */ + + /* second phase of the reduction */ + "vpxor %%ymm3, %%ymm1, %%ymm1\n\t" /* xor the shifted versions */ + "vpsrlq $1, %%ymm3, %%ymm3\n\t" /* packed left shifting >> 1 */ + "vpxor %%ymm3, %%ymm6, %%ymm6\n\t" + "vpsrlq $1, %%ymm3, %%ymm3\n\t" /* packed left shifting >> 2 */ + "vpxor %%ymm3, %%ymm1, %%ymm1\n\t" + "vpsrlq $5, %%ymm3, %%ymm3\n\t" /* packed left shifting >> 7 */ + "vpxor %%ymm3, %%ymm6, %%ymm6\n\t" + "vpxor %%ymm6, %%ymm1, %%ymm1\n\t" /* the result is in ymm1 */ + ::: "memory" ); +} + +static ASM_FUNC_ATTR_INLINE void +gcm_lsh_avx2(void *h, unsigned int hoffs) +{ + static const u64 pconst[4] __attribute__ ((aligned (32))) = + { + U64_C(0x0000000000000001), U64_C(0xc200000000000000), + U64_C(0x0000000000000001), U64_C(0xc200000000000000) + }; + + asm volatile ("vmovdqu %[h], %%ymm2\n\t" + "vpshufd $0xff, %%ymm2, %%ymm3\n\t" + "vpsrad $31, %%ymm3, %%ymm3\n\t" + "vpslldq $8, %%ymm2, %%ymm4\n\t" + "vpand %[pconst], %%ymm3, %%ymm3\n\t" + "vpaddq %%ymm2, %%ymm2, %%ymm2\n\t" + "vpsrlq $63, %%ymm4, %%ymm4\n\t" + "vpxor %%ymm3, %%ymm2, %%ymm2\n\t" + "vpxor %%ymm4, %%ymm2, %%ymm2\n\t" + "vmovdqu %%ymm2, %[h]\n\t" + : [h] "+m" (*((byte *)h + hoffs)) + : [pconst] "m" (*pconst) + : "memory" ); +} + +static ASM_FUNC_ATTR_INLINE void +load_h1h2_to_ymm1(gcry_cipher_hd_t c) +{ + unsigned int key_pos = + offsetof(struct gcry_cipher_handle, u_mode.gcm.u_ghash_key.key); + unsigned int table_pos = + offsetof(struct gcry_cipher_handle, u_mode.gcm.gcm_table); + + if (key_pos + 16 == table_pos) + { + /* Optimization: Table follows immediately after key. 
*/ + asm volatile ("vmovdqu %[key], %%ymm1\n\t" + : + : [key] "m" (*c->u_mode.gcm.u_ghash_key.key) + : "memory"); + } + else + { + asm volatile ("vmovdqa %[key], %%xmm1\n\t" + "vinserti128 $1, 0*16(%[h_table]), %%ymm1, %%ymm1\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table), + [key] "m" (*c->u_mode.gcm.u_ghash_key.key) + : "memory"); + } +} + +static ASM_FUNC_ATTR void +ghash_setup_aggr8_avx2(gcry_cipher_hd_t c) +{ + c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR8_TABLE_INITIALIZED; + + asm volatile (/* load H⁴ */ + "vbroadcasti128 3*16(%[h_table]), %%ymm0\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + /* load H <<< 1, H² <<< 1 */ + load_h1h2_to_ymm1 (c); + + gfmul_pclmul_avx2 (); /* H<<<1•H⁴ => H⁵, H²<<<1•H⁴ => H⁶ */ + + asm volatile ("vmovdqu %%ymm1, 3*16(%[h_table])\n\t" + /* load H³ <<< 1, H⁴ <<< 1 */ + "vmovdqu 1*16(%[h_table]), %%ymm1\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul_avx2 (); /* H³<<<1•H⁴ => H⁷, H⁴<<<1•H⁴ => H⁸ */ + + asm volatile ("vmovdqu %%ymm1, 6*16(%[h_table])\n\t" /* store H⁸ for aggr16 setup */ + "vmovdqu %%ymm1, 5*16(%[h_table])\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 3 * 16); /* H⁵ <<< 1, H⁶ <<< 1 */ + gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 5 * 16); /* H⁷ <<< 1, H⁸ <<< 1 */ +} + +static ASM_FUNC_ATTR void +ghash_setup_aggr16_avx2(gcry_cipher_hd_t c) +{ + c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR16_TABLE_INITIALIZED; + + asm volatile (/* load H⁸ */ + "vbroadcasti128 7*16(%[h_table]), %%ymm0\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + /* load H <<< 1, H² <<< 1 */ + load_h1h2_to_ymm1 (c); + + gfmul_pclmul_avx2 (); /* H<<<1•H⁸ => H⁹, H²<<<1•H⁸ => H¹⁰ */ + + asm volatile ("vmovdqu %%ymm1, 7*16(%[h_table])\n\t" + /* load H³ <<< 1, H⁴ <<< 1 */ + "vmovdqu 1*16(%[h_table]), %%ymm1\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul_avx2 (); /* H³<<<1•H⁸ => H¹¹, H⁴<<<1•H⁸ => H¹² */ + + asm volatile ("vmovdqu %%ymm1, 9*16(%[h_table])\n\t" + /* load H⁵ <<< 1, H⁶ <<< 1 */ + "vmovdqu 3*16(%[h_table]), %%ymm1\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul_avx2 (); /* H⁵<<<1•H⁸ => H¹³, H⁶<<<1•H⁸ => H¹⁴ */ + + asm volatile ("vmovdqu %%ymm1, 11*16(%[h_table])\n\t" + /* load H⁷ <<< 1, H⁸ <<< 1 */ + "vmovdqu 5*16(%[h_table]), %%ymm1\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul_avx2 (); /* H⁷<<<1•H⁸ => H¹⁵, H⁸<<<1•H⁸ => H¹⁶ */ + + asm volatile ("vmovdqu %%ymm1, 13*16(%[h_table])\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 7 * 16); /* H⁹ <<< 1, H¹⁰ <<< 1 */ + gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 9 * 16); /* H¹¹ <<< 1, H¹² <<< 1 */ + gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 11 * 16); /* H¹³ <<< 1, H¹⁴ <<< 1 */ + gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 13 * 16); /* H¹⁵ <<< 1, H¹⁶ <<< 1 */ +} + +#endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */ +#endif /* __x86_64__ */ + +static unsigned int ASM_FUNC_ATTR +_gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, + size_t nblocks); + +static unsigned int ASM_FUNC_ATTR +_gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, + size_t nblocks); + +static ASM_FUNC_ATTR_INLINE void +gcm_lsh(void *h, unsigned int hoffs) { static const u64 pconst[2] __attribute__ ((aligned (16))) = { U64_C(0x0000000000000001), U64_C(0xc200000000000000) }; - asm volatile ("movdqu (%[h]), 
%%xmm2\n\t" + asm volatile ("movdqu %[h], %%xmm2\n\t" "pshufd $0xff, %%xmm2, %%xmm3\n\t" "movdqa %%xmm2, %%xmm4\n\t" "psrad $31, %%xmm3\n\t" @@ -449,15 +851,14 @@ static ASM_FUNC_ATTR_INLINE void gcm_lsh(void *h, unsigned int hoffs) "psrlq $63, %%xmm4\n\t" "pxor %%xmm3, %%xmm2\n\t" "pxor %%xmm4, %%xmm2\n\t" - "movdqu %%xmm2, (%[h])\n\t" - : - : [pconst] "m" (*pconst), - [h] "r" ((byte *)h + hoffs) + "movdqu %%xmm2, %[h]\n\t" + : [h] "+m" (*((byte *)h + hoffs)) + : [pconst] "m" (*pconst) : "memory" ); } void ASM_FUNC_ATTR -_gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) +_gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c, unsigned int hw_features) { static const unsigned char be_mask[16] __attribute__ ((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; @@ -480,6 +881,12 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) : "memory" ); #endif + (void)hw_features; + + c->u_mode.gcm.hw_impl_flags = 0; + c->u_mode.gcm.ghash_fn = _gcry_ghash_intel_pclmul; + c->u_mode.gcm.polyval_fn = _gcry_polyval_intel_pclmul; + /* Swap endianness of hsub. */ asm volatile ("movdqu (%[key]), %%xmm0\n\t" "pshufb %[be_mask], %%xmm0\n\t" @@ -489,7 +896,7 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) [be_mask] "m" (*be_mask) : "memory"); - gcm_lsh(c->u_mode.gcm.u_ghash_key.key, 0); /* H <<< 1 */ + gcm_lsh (c->u_mode.gcm.u_ghash_key.key, 0); /* H <<< 1 */ asm volatile ("movdqa %%xmm0, %%xmm1\n\t" "movdqu (%[key]), %%xmm0\n\t" /* load H <<< 1 */ @@ -500,80 +907,81 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) gfmul_pclmul (); /* H<<<1•H => H² */ asm volatile ("movdqu %%xmm1, 0*16(%[h_table])\n\t" - "movdqa %%xmm1, %%xmm7\n\t" : : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); - gcm_lsh(c->u_mode.gcm.gcm_table, 0 * 16); /* H² <<< 1 */ - gfmul_pclmul (); /* H<<<1•H² => H³ */ + gcm_lsh (c->u_mode.gcm.gcm_table, 0 * 16); /* H² <<< 1 */ - asm volatile ("movdqa %%xmm7, %%xmm0\n\t" - "movdqu %%xmm1, 1*16(%[h_table])\n\t" - "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H² <<< 1 */ - : - : [h_table] "r" (c->u_mode.gcm.gcm_table) - : "memory"); + if (0) + { } +#ifdef GCM_USE_INTEL_VPCLMUL_AVX2 + else if ((hw_features & HWF_INTEL_VAES_VPCLMUL) + && (hw_features & HWF_INTEL_AVX2)) + { + c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_USE_VPCLMUL_AVX2; - gfmul_pclmul (); /* H²<<<1•H² => H⁴ */ + asm volatile (/* H² */ + "vinserti128 $1, %%xmm1, %%ymm1, %%ymm1\n\t" + /* load H <<< 1, H² <<< 1 */ + "vinserti128 $1, 0*16(%[h_table]), %%ymm0, %%ymm0\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); - asm volatile ("movdqu %%xmm1, 2*16(%[h_table])\n\t" - "movdqa %%xmm1, %%xmm0\n\t" - "movdqu (%[key]), %%xmm1\n\t" /* load H <<< 1 */ - : - : [h_table] "r" (c->u_mode.gcm.gcm_table), - [key] "r" (c->u_mode.gcm.u_ghash_key.key) - : "memory"); + gfmul_pclmul_avx2 (); /* H<<<1•H² => H³, H²<<<1•H² => H⁴ */ - gcm_lsh(c->u_mode.gcm.gcm_table, 1 * 16); /* H³ <<< 1 */ - gcm_lsh(c->u_mode.gcm.gcm_table, 2 * 16); /* H⁴ <<< 1 */ + asm volatile ("vmovdqu %%ymm1, 2*16(%[h_table])\n\t" /* store H⁴ for aggr8 setup */ + "vmovdqu %%ymm1, 1*16(%[h_table])\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); -#ifdef __x86_64__ - gfmul_pclmul (); /* H<<<1•H⁴ => H⁵ */ + gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 1 * 16); /* H³ <<< 1, H⁴ <<< 1 */ - asm volatile ("movdqu %%xmm1, 3*16(%[h_table])\n\t" - "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H² <<< 1 */ - : - : [h_table] "r" (c->u_mode.gcm.gcm_table) - : "memory"); - - gfmul_pclmul (); /* H²<<<1•H⁴ => H⁶ */ - - asm volatile 
("movdqu %%xmm1, 4*16(%[h_table])\n\t" - "movdqu 1*16(%[h_table]), %%xmm1\n\t" /* load H³ <<< 1 */ - : - : [h_table] "r" (c->u_mode.gcm.gcm_table) - : "memory"); + asm volatile ("vzeroupper\n\t" + ::: "memory" ); + } +#endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */ + else + { + asm volatile ("movdqa %%xmm1, %%xmm7\n\t" + ::: "memory"); - gfmul_pclmul (); /* H³<<<1•H⁴ => H⁷ */ + gfmul_pclmul (); /* H<<<1•H² => H³ */ - asm volatile ("movdqu %%xmm1, 5*16(%[h_table])\n\t" - "movdqu 2*16(%[h_table]), %%xmm1\n\t" /* load H⁴ <<< 1 */ - : - : [h_table] "r" (c->u_mode.gcm.gcm_table) - : "memory"); + asm volatile ("movdqa %%xmm7, %%xmm0\n\t" + "movdqu %%xmm1, 1*16(%[h_table])\n\t" + "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H² <<< 1 */ + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); - gfmul_pclmul (); /* H³<<<1•H⁴ => H⁸ */ + gfmul_pclmul (); /* H²<<<1•H² => H⁴ */ - asm volatile ("movdqu %%xmm1, 6*16(%[h_table])\n\t" - : - : [h_table] "r" (c->u_mode.gcm.gcm_table) - : "memory"); + asm volatile ("movdqu %%xmm1, 3*16(%[h_table])\n\t" /* store H⁴ for aggr8 setup */ + "movdqu %%xmm1, 2*16(%[h_table])\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); - gcm_lsh(c->u_mode.gcm.gcm_table, 3 * 16); /* H⁵ <<< 1 */ - gcm_lsh(c->u_mode.gcm.gcm_table, 4 * 16); /* H⁶ <<< 1 */ - gcm_lsh(c->u_mode.gcm.gcm_table, 5 * 16); /* H⁷ <<< 1 */ - gcm_lsh(c->u_mode.gcm.gcm_table, 6 * 16); /* H⁸ <<< 1 */ + gcm_lsh (c->u_mode.gcm.gcm_table, 1 * 16); /* H³ <<< 1 */ + gcm_lsh (c->u_mode.gcm.gcm_table, 2 * 16); /* H⁴ <<< 1 */ + } -#ifdef __WIN64__ /* Clear/restore used registers. */ - asm volatile( "pxor %%xmm0, %%xmm0\n\t" - "pxor %%xmm1, %%xmm1\n\t" - "pxor %%xmm2, %%xmm2\n\t" - "pxor %%xmm3, %%xmm3\n\t" - "pxor %%xmm4, %%xmm4\n\t" - "pxor %%xmm5, %%xmm5\n\t" - "movdqu 0*16(%0), %%xmm6\n\t" + asm volatile ("pxor %%xmm0, %%xmm0\n\t" + "pxor %%xmm1, %%xmm1\n\t" + "pxor %%xmm2, %%xmm2\n\t" + "pxor %%xmm3, %%xmm3\n\t" + "pxor %%xmm4, %%xmm4\n\t" + "pxor %%xmm5, %%xmm5\n\t" + "pxor %%xmm6, %%xmm6\n\t" + "pxor %%xmm7, %%xmm7\n\t" + ::: "memory" ); +#ifdef __x86_64__ +#ifdef __WIN64__ + asm volatile ("movdqu 0*16(%0), %%xmm6\n\t" "movdqu 1*16(%0), %%xmm7\n\t" "movdqu 2*16(%0), %%xmm8\n\t" "movdqu 3*16(%0), %%xmm9\n\t" @@ -587,16 +995,7 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) : "r" (win64tmp) : "memory" ); #else - /* Clear used registers. 
*/ - asm volatile( "pxor %%xmm0, %%xmm0\n\t" - "pxor %%xmm1, %%xmm1\n\t" - "pxor %%xmm2, %%xmm2\n\t" - "pxor %%xmm3, %%xmm3\n\t" - "pxor %%xmm4, %%xmm4\n\t" - "pxor %%xmm5, %%xmm5\n\t" - "pxor %%xmm6, %%xmm6\n\t" - "pxor %%xmm7, %%xmm7\n\t" - "pxor %%xmm8, %%xmm8\n\t" + asm volatile ("pxor %%xmm8, %%xmm8\n\t" "pxor %%xmm9, %%xmm9\n\t" "pxor %%xmm10, %%xmm10\n\t" "pxor %%xmm11, %%xmm11\n\t" @@ -605,14 +1004,67 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) "pxor %%xmm14, %%xmm14\n\t" "pxor %%xmm15, %%xmm15\n\t" ::: "memory" ); -#endif -#endif +#endif /* __WIN64__ */ +#endif /* __x86_64__ */ } +#ifdef __x86_64__ +static ASM_FUNC_ATTR void +ghash_setup_aggr8(gcry_cipher_hd_t c) +{ + c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR8_TABLE_INITIALIZED; + + asm volatile ("movdqa 3*16(%[h_table]), %%xmm0\n\t" /* load H⁴ */ + "movdqu %[key], %%xmm1\n\t" /* load H <<< 1 */ + : + : [h_table] "r" (c->u_mode.gcm.gcm_table), + [key] "m" (*c->u_mode.gcm.u_ghash_key.key) + : "memory"); + + gfmul_pclmul (); /* H<<<1•H⁴ => H⁵ */ + + asm volatile ("movdqu %%xmm1, 3*16(%[h_table])\n\t" + "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H² <<< 1 */ + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul (); /* H²<<<1•H⁴ => H⁶ */ + + asm volatile ("movdqu %%xmm1, 4*16(%[h_table])\n\t" + "movdqu 1*16(%[h_table]), %%xmm1\n\t" /* load H³ <<< 1 */ + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul (); /* H³<<<1•H⁴ => H⁷ */ + + asm volatile ("movdqu %%xmm1, 5*16(%[h_table])\n\t" + "movdqu 2*16(%[h_table]), %%xmm1\n\t" /* load H⁴ <<< 1 */ + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul (); /* H⁴<<<1•H⁴ => H⁸ */ + + asm volatile ("movdqu %%xmm1, 6*16(%[h_table])\n\t" + "movdqu %%xmm1, 7*16(%[h_table])\n\t" /* store H⁸ for aggr16 setup */ + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gcm_lsh (c->u_mode.gcm.gcm_table, 3 * 16); /* H⁵ <<< 1 */ + gcm_lsh (c->u_mode.gcm.gcm_table, 4 * 16); /* H⁶ <<< 1 */ + gcm_lsh (c->u_mode.gcm.gcm_table, 5 * 16); /* H⁷ <<< 1 */ + gcm_lsh (c->u_mode.gcm.gcm_table, 6 * 16); /* H⁸ <<< 1 */ +} +#endif /* __x86_64__ */ + + unsigned int ASM_FUNC_ATTR _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, - size_t nblocks) + size_t nblocks) { static const unsigned char be_mask[16] __attribute__ ((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; @@ -650,12 +1102,93 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, [be_mask] "m" (*be_mask) : "memory" ); +#if defined(GCM_USE_INTEL_VPCLMUL_AVX2) + if (nblocks >= 16 + && (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX2)) + { + u64 h1_h2_h15_h16[4*2]; + + asm volatile ("vinserti128 $1, %%xmm7, %%ymm7, %%ymm15\n\t" + "vmovdqa %%xmm1, %%xmm8\n\t" + ::: "memory" ); + + if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED)) + { + ghash_setup_aggr8_avx2 (c); + } + if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR16_TABLE_INITIALIZED)) + { + ghash_setup_aggr16_avx2 (c); + } + + /* Preload H1, H2, H3, H4, H5, H6, H7, H8, H9, H10, H11, H12. 
*/ + asm volatile ("vmovdqa %%xmm8, %%xmm1\n\t" + "vmovdqu 0*16(%[h_table]), %%xmm7\n\t" + "vpxor %%xmm8, %%xmm8, %%xmm8\n\t" + "vperm2i128 $0x23, 13*16(%[h_table]), %%ymm8, %%ymm0\n\t" /* H15|H16 */ + "vperm2i128 $0x23, 11*16(%[h_table]), %%ymm8, %%ymm13\n\t" /* H13|H14 */ + "vperm2i128 $0x23, 9*16(%[h_table]), %%ymm8, %%ymm12\n\t" /* H11|H12 */ + "vperm2i128 $0x23, 7*16(%[h_table]), %%ymm8, %%ymm11\n\t" /* H9|H10 */ + "vperm2i128 $0x23, 5*16(%[h_table]), %%ymm8, %%ymm10\n\t" /* H7|H8 */ + "vperm2i128 $0x23, 3*16(%[h_table]), %%ymm8, %%ymm9\n\t" /* H5|H6 */ + "vperm2i128 $0x23, 1*16(%[h_table]), %%ymm8, %%ymm8\n\t" /* H3|H4 */ + "vinserti128 $1, %[h_1], %%ymm7, %%ymm7\n\t" /* H1|H2 */ + "vmovdqu %%ymm0, %[h15_h16]\n\t" + "vmovdqu %%ymm7, %[h1_h2]\n\t" + : [h1_h2] "=m" (h1_h2_h15_h16[0]), + [h15_h16] "=m" (h1_h2_h15_h16[4]) + : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key), + [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory" ); + + while (nblocks >= 16) + { + gfmul_vpclmul_avx2_aggr16 (buf, c->u_mode.gcm.gcm_table, + h1_h2_h15_h16); + + buf += 16 * blocksize; + nblocks -= 16; + } + + /* Clear used x86-64/XMM registers. */ + asm volatile("vmovdqu %%ymm15, %[h15_h16]\n\t" + "vmovdqu %%ymm15, %[h1_h2]\n\t" + "vzeroupper\n\t" +#ifndef __WIN64__ + "pxor %%xmm8, %%xmm8\n\t" + "pxor %%xmm9, %%xmm9\n\t" + "pxor %%xmm10, %%xmm10\n\t" + "pxor %%xmm11, %%xmm11\n\t" + "pxor %%xmm12, %%xmm12\n\t" + "pxor %%xmm13, %%xmm13\n\t" + "pxor %%xmm14, %%xmm14\n\t" + "pxor %%xmm15, %%xmm15\n\t" +#endif + "movdqa %[be_mask], %%xmm7\n\t" + : [h1_h2] "=m" (h1_h2_h15_h16[0]), + [h15_h16] "=m" (h1_h2_h15_h16[4]) + : [be_mask] "m" (*be_mask) + : "memory" ); + } +#endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */ + #ifdef __x86_64__ if (nblocks >= 8) { - /* Preload H1. */ asm volatile ("movdqa %%xmm7, %%xmm15\n\t" - "movdqa %[h_1], %%xmm0\n\t" + ::: "memory" ); + + if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED)) + { + asm volatile ("movdqa %%xmm1, %%xmm8\n\t" + ::: "memory" ); + ghash_setup_aggr8 (c); + asm volatile ("movdqa %%xmm8, %%xmm1\n\t" + ::: "memory" ); + } + + /* Preload H1. */ + asm volatile ("movdqa %[h_1], %%xmm0\n\t" : : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key) : "memory" ); @@ -667,6 +1200,7 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, buf += 8 * blocksize; nblocks -= 8; } + #ifndef __WIN64__ /* Clear used x86-64/XMM registers. 
*/ asm volatile( "pxor %%xmm8, %%xmm8\n\t" @@ -680,7 +1214,7 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, ::: "memory" ); #endif } -#endif +#endif /* __x86_64__ */ while (nblocks >= 4) { @@ -761,7 +1295,7 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, unsigned int ASM_FUNC_ATTR _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, - size_t nblocks) + size_t nblocks) { static const unsigned char be_mask[16] __attribute__ ((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; @@ -799,9 +1333,86 @@ _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, [be_mask] "m" (*be_mask) : "memory" ); +#if defined(GCM_USE_INTEL_VPCLMUL_AVX2) + if (nblocks >= 16 + && (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX2)) + { + u64 h1_h2_h15_h16[4*2]; + + asm volatile ("vmovdqa %%xmm1, %%xmm8\n\t" + ::: "memory" ); + + if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED)) + { + ghash_setup_aggr8_avx2 (c); + } + if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR16_TABLE_INITIALIZED)) + { + ghash_setup_aggr16_avx2 (c); + } + + /* Preload H1, H2, H3, H4, H5, H6, H7, H8, H9, H10, H11, H12. */ + asm volatile ("vmovdqa %%xmm8, %%xmm1\n\t" + "vpxor %%xmm8, %%xmm8, %%xmm8\n\t" + "vmovdqu 0*16(%[h_table]), %%xmm7\n\t" + "vperm2i128 $0x23, 13*16(%[h_table]), %%ymm8, %%ymm0\n\t" /* H15|H16 */ + "vperm2i128 $0x23, 11*16(%[h_table]), %%ymm8, %%ymm13\n\t" /* H13|H14 */ + "vperm2i128 $0x23, 9*16(%[h_table]), %%ymm8, %%ymm12\n\t" /* H11|H12 */ + "vperm2i128 $0x23, 7*16(%[h_table]), %%ymm8, %%ymm11\n\t" /* H9|H10 */ + "vperm2i128 $0x23, 5*16(%[h_table]), %%ymm8, %%ymm10\n\t" /* H7|H8 */ + "vperm2i128 $0x23, 3*16(%[h_table]), %%ymm8, %%ymm9\n\t" /* H5|H6 */ + "vperm2i128 $0x23, 1*16(%[h_table]), %%ymm8, %%ymm8\n\t" /* H3|H4 */ + "vinserti128 $1, %[h_1], %%ymm7, %%ymm7\n\t" /* H1|H2 */ + "vmovdqu %%ymm0, %[h15_h16]\n\t" + "vmovdqu %%ymm7, %[h1_h2]\n\t" + : [h1_h2] "=m" (h1_h2_h15_h16[0]), + [h15_h16] "=m" (h1_h2_h15_h16[4]) + : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key), + [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory" ); + + while (nblocks >= 16) + { + gfmul_vpclmul_avx2_aggr16_le (buf, c->u_mode.gcm.gcm_table, + h1_h2_h15_h16); + + buf += 16 * blocksize; + nblocks -= 16; + } + + /* Clear used x86-64/XMM registers. */ + asm volatile("vpxor %%xmm7, %%xmm7, %%xmm7\n\t" + "vmovdqu %%ymm7, %[h15_h16]\n\t" + "vmovdqu %%ymm7, %[h1_h2]\n\t" + "vzeroupper\n\t" +#ifndef __WIN64__ + "pxor %%xmm8, %%xmm8\n\t" + "pxor %%xmm9, %%xmm9\n\t" + "pxor %%xmm10, %%xmm10\n\t" + "pxor %%xmm11, %%xmm11\n\t" + "pxor %%xmm12, %%xmm12\n\t" + "pxor %%xmm13, %%xmm13\n\t" + "pxor %%xmm14, %%xmm14\n\t" +#endif + : [h1_h2] "=m" (h1_h2_h15_h16[0]), + [h15_h16] "=m" (h1_h2_h15_h16[4]) + : + : "memory" ); + } +#endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */ + #ifdef __x86_64__ if (nblocks >= 8) { + if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED)) + { + asm volatile ("movdqa %%xmm1, %%xmm8\n\t" + ::: "memory" ); + ghash_setup_aggr8 (c); + asm volatile ("movdqa %%xmm8, %%xmm1\n\t" + ::: "memory" ); + } + /* Preload H1. 
*/ asm volatile ("pxor %%xmm15, %%xmm15\n\t" "movdqa %[h_1], %%xmm0\n\t" diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index 69ff0de6..683f07b0 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -39,15 +39,8 @@ #ifdef GCM_USE_INTEL_PCLMUL -extern void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c); - -extern unsigned int _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, - const byte *buf, size_t nblocks); - -extern unsigned int _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, - byte *result, - const byte *buf, - size_t nblocks); +extern void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c, + unsigned int hw_features); #endif #ifdef GCM_USE_ARM_PMULL @@ -594,9 +587,7 @@ setupM (gcry_cipher_hd_t c) #ifdef GCM_USE_INTEL_PCLMUL else if (features & HWF_INTEL_PCLMUL) { - c->u_mode.gcm.ghash_fn = _gcry_ghash_intel_pclmul; - c->u_mode.gcm.polyval_fn = _gcry_polyval_intel_pclmul; - _gcry_ghash_setup_intel_pclmul (c); + _gcry_ghash_setup_intel_pclmul (c, features); } #endif #ifdef GCM_USE_ARM_PMULL diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index c8a1097a..e31ac860 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -72,6 +72,14 @@ # endif #endif /* GCM_USE_INTEL_PCLMUL */ +/* GCM_USE_INTEL_VPCLMUL_AVX2 indicates whether to compile GCM with Intel + VPCLMUL/AVX2 code. */ +#undef GCM_USE_INTEL_VPCLMUL_AVX2 +#if defined(__x86_64__) && defined(GCM_USE_INTEL_PCLMUL) && \ + defined(ENABLE_AVX2_SUPPORT) && defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL) +# define GCM_USE_INTEL_VPCLMUL_AVX2 1 +#endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */ + /* GCM_USE_ARM_PMULL indicates whether to compile GCM with ARMv8 PMULL code. */ #undef GCM_USE_ARM_PMULL #if defined(ENABLE_ARM_CRYPTO_SUPPORT) && defined(GCM_USE_TABLES) @@ -355,6 +363,9 @@ struct gcry_cipher_handle /* Key length used for GCM-SIV key generating key. */ unsigned int siv_keylen; + + /* Flags for accelerated implementations. */ + unsigned int hw_impl_flags; } gcm; /* Mode specific storage for OCB mode. */ |