From 926cc22058a39c7a931e14590eab6fd7a78ba455 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Wed, 22 Feb 2023 20:19:07 +0200 Subject: camellia-aesni-avx: add acceleration for ECB/XTS/CTR32LE modes * cipher/camellia-aesni-avx-amd64.S (_gcry_camellia_aesni_avx_ecb_enc) (_gcry_camellia_aesni_avx_ecb_dec): New. * cipher/camellia-glue.c (_gcry_camellia_aesni_avx_ecb_enc) (_gcry_camellia_aesni_avx_ecb_dec): New. (camellia_setkey): Always enable XTS/ECB/CTR32LE bulk functions. (camellia_encrypt_blk1_32, camellia_decrypt_blk1_32) [USE_AESNI_AVX]: Add AESNI/AVX code-path. -- Signed-off-by: Jussi Kivilinna --- cipher/camellia-aesni-avx-amd64.S | 92 +++++++++++++++++++++++++++++++++++++++ cipher/camellia-glue.c | 59 +++++++++++++++++-------- 2 files changed, 133 insertions(+), 18 deletions(-) (limited to 'cipher') diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S index 1f241e03..93c96791 100644 --- a/cipher/camellia-aesni-avx-amd64.S +++ b/cipher/camellia-aesni-avx-amd64.S @@ -1029,6 +1029,98 @@ _gcry_camellia_aesni_avx_ctr_enc: CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;) +.align 16 +.globl _gcry_camellia_aesni_avx_ecb_enc +ELF(.type _gcry_camellia_aesni_avx_ecb_enc,@function;) + +_gcry_camellia_aesni_avx_ecb_enc: + /* Runs __camellia_enc_blk16 on the 16 blocks at src and stores the result to dst. input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + */ + CFI_STARTPROC(); + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + vzeroupper; + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max: %r8d = 24 if key_bitlength(CTX) == 128, else 32 */ + + inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rdx, (key_table)(CTX)); /* key operand is the start of key_table */ + + subq $(16 * 16), %rsp; /* reserve 256 bytes below the frame */ + andq $~31, %rsp; /* round %rsp down to a 32-byte boundary */ + movq %rsp, %rax; /* hand the area's address over in %rax; presumably scratch space for __camellia_enc_blk16 -- confirm against its header */ + + call __camellia_enc_blk16; + + write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, + %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, 
%xmm10, %xmm9, + %xmm8, %rsi); + + vzeroall; /* clear all vector registers before returning */ + + leave; /* undo the pushq/movq frame setup, which also drops the aligned area */ + CFI_LEAVE(); + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_camellia_aesni_avx_ecb_enc,.-_gcry_camellia_aesni_avx_ecb_enc;) + +.align 16 +.globl _gcry_camellia_aesni_avx_ecb_dec +ELF(.type _gcry_camellia_aesni_avx_ecb_dec,@function;) + +_gcry_camellia_aesni_avx_ecb_dec: + /* Runs __camellia_dec_blk16 on the 16 blocks at src and stores the result to dst. input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + */ + CFI_STARTPROC(); + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + vzeroupper; + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max: %r8d = 24 if key_bitlength(CTX) == 128, else 32 */ + + inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rdx, (key_table)(CTX, %r8, 8)); /* unlike the encrypt entry point, the key operand is key_table + %r8 * 8 (the max index computed above) */ + + subq $(16 * 16), %rsp; /* reserve 256 bytes below the frame */ + andq $~31, %rsp; /* round %rsp down to a 32-byte boundary */ + movq %rsp, %rax; /* hand the area's address over in %rax; presumably scratch space for __camellia_dec_blk16 -- confirm against its header */ + + call __camellia_dec_blk16; + + write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, + %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, + %xmm8, %rsi); + + vzeroall; /* clear all vector registers before returning */ + + leave; /* undo the pushq/movq frame setup, which also drops the aligned area */ + CFI_LEAVE(); + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_camellia_aesni_avx_ecb_dec,.-_gcry_camellia_aesni_avx_ecb_dec;) + .align 16 .globl _gcry_camellia_aesni_avx_cbc_dec ELF(.type _gcry_camellia_aesni_avx_cbc_dec,@function;) diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index 2e00f563..8b4b4b3c 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -172,15 +172,25 @@ extern void _gcry_camellia_aesni_avx_ocb_dec(CAMELLIA_context *ctx, const u64 Ls[16]) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx_ocb_auth(CAMELLIA_context *ctx, - const unsigned char *abuf, - unsigned char *offset, - unsigned char *checksum, - const u64 Ls[16]) ASM_FUNC_ABI; + const unsigned char *abuf, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[16]) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx, const unsigned char *key, unsigned int 
keylen) ASM_FUNC_ABI; +extern void _gcry_camellia_aesni_avx_ecb_enc(const CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in) + ASM_FUNC_ABI; /* defined in camellia-aesni-avx-amd64.S; its header comment gives dst/src as 16 blocks */ + +extern void _gcry_camellia_aesni_avx_ecb_dec(const CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in) + ASM_FUNC_ABI; /* defined in camellia-aesni-avx-amd64.S; its header comment gives dst/src as 16 blocks */ + static const int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 16 + 2 * sizeof(void *) + ASM_EXTRA_STACK; @@ -473,18 +483,9 @@ camellia_setkey(void *c, const byte *key, unsigned keylen, bulk_ops->ctr_enc = _gcry_camellia_ctr_enc; bulk_ops->ocb_crypt = _gcry_camellia_ocb_crypt; bulk_ops->ocb_auth = _gcry_camellia_ocb_auth; -#ifdef USE_AESNI_AVX2 - if (ctx->use_aesni_avx2 || ctx->use_vaes_avx2 || ctx->use_gfni_avx2) - { - bulk_ops->xts_crypt = _gcry_camellia_xts_crypt; - bulk_ops->ecb_crypt = _gcry_camellia_ecb_crypt; - bulk_ops->ctr32le_enc = _gcry_camellia_ctr32le_enc; - } -#else - (void)_gcry_camellia_xts_crypt; - (void)_gcry_camellia_ecb_crypt; - (void)_gcry_camellia_ctr32le_enc; -#endif + bulk_ops->xts_crypt = _gcry_camellia_xts_crypt; /* these three handlers are no longer conditional on the AVX2-family flags */ + bulk_ops->ecb_crypt = _gcry_camellia_ecb_crypt; + bulk_ops->ctr32le_enc = _gcry_camellia_ctr32le_enc; if (0) { } @@ -651,10 +652,21 @@ camellia_encrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf, return avx2_burn_stack_depth; } #endif +#ifdef USE_AESNI_AVX + while (ctx->use_aesni_avx && num_blks >= 16) /* AESNI/AVX path: consume the input 16 blocks at a time */ + { + _gcry_camellia_aesni_avx_ecb_enc (ctx, outbuf, inbuf); + stack_burn_size = avx_burn_stack_depth; + outbuf += CAMELLIA_BLOCK_SIZE * 16; + inbuf += CAMELLIA_BLOCK_SIZE * 16; + num_blks -= 16; + } +#endif while (num_blks) { - stack_burn_size = camellia_encrypt((void *)ctx, outbuf, inbuf); + unsigned int nburn = camellia_encrypt((void *)ctx, outbuf, inbuf); /* remaining blocks go through the single-block routine */ + stack_burn_size = nburn > stack_burn_size ? 
nburn : stack_burn_size; /* keep the larger depth so a value set by the 16-block loop is not overwritten by a smaller one */ outbuf += CAMELLIA_BLOCK_SIZE; inbuf += CAMELLIA_BLOCK_SIZE; num_blks--; @@ -731,10 +743,21 @@ camellia_decrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf, return avx2_burn_stack_depth; } #endif +#ifdef USE_AESNI_AVX + while (ctx->use_aesni_avx && num_blks >= 16) /* AESNI/AVX path: consume the input 16 blocks at a time */ + { + _gcry_camellia_aesni_avx_ecb_dec (ctx, outbuf, inbuf); + stack_burn_size = avx_burn_stack_depth; + outbuf += CAMELLIA_BLOCK_SIZE * 16; + inbuf += CAMELLIA_BLOCK_SIZE * 16; + num_blks -= 16; + } +#endif while (num_blks) { - stack_burn_size = camellia_decrypt((void *)ctx, outbuf, inbuf); + unsigned int nburn = camellia_decrypt((void *)ctx, outbuf, inbuf); /* remaining blocks go through the single-block routine */ + stack_burn_size = nburn > stack_burn_size ? nburn : stack_burn_size; /* keep the larger depth so a value set by the 16-block loop is not overwritten by a smaller one */ outbuf += CAMELLIA_BLOCK_SIZE; inbuf += CAMELLIA_BLOCK_SIZE; num_blks--; -- cgit v1.2.1