diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2020-06-17 00:52:53 +0300 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2020-07-23 18:41:37 +0300 |
commit | 4c0e244fc53e0f7b927bfe4cf54695b5d282fd27 (patch) | |
tree | 314fc5a03d86f9a7b9f968e3d155157a572bcbc8 /cipher/camellia-aesni-avx-amd64.S | |
parent | e0dabf74bf276500257f15b85ded9cf24ccc8334 (diff) | |
download | libgcrypt-4c0e244fc53e0f7b927bfe4cf54695b5d282fd27.tar.gz |
Camellia AES-NI/AVX/AVX2 size optimization
* cipher/camellia-aesni-avx-amd64.S: Use loop for handling repeating
'(enc|dec)_rounds16/fls16' portions of encryption/decryption.
* cipher/camellia-aesni-avx2-amd64.S: Use loop for handling repeating
'(enc|dec)_rounds32/fls32' portions of encryption/decryption.
--
Use round+fls loop to reduce binary size of Camellia AES-NI/AVX/AVX2
implementations. This also gives small performance boost on AMD Zen2.
Before:
text data bss dec hex filename
63877 0 0 63877 f985 cipher/.libs/camellia-aesni-avx2-amd64.o
59623 0 0 59623 e8e7 cipher/.libs/camellia-aesni-avx-amd64.o
After:
text data bss dec hex filename
22999 0 0 22999 59d7 cipher/.libs/camellia-aesni-avx2-amd64.o
25047 0 0 25047 61d7 cipher/.libs/camellia-aesni-avx-amd64.o
Benchmark on AMD Ryzen 7 3700X:
Before:
Cipher:
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
CBC dec | 0.670 ns/B 1424 MiB/s 2.88 c/B 4300
CFB dec | 0.667 ns/B 1430 MiB/s 2.87 c/B 4300
CTR enc | 0.677 ns/B 1410 MiB/s 2.91 c/B 4300
CTR dec | 0.676 ns/B 1412 MiB/s 2.90 c/B 4300
OCB enc | 0.696 ns/B 1370 MiB/s 2.98 c/B 4275
OCB dec | 0.698 ns/B 1367 MiB/s 2.98 c/B 4275
OCB auth | 0.683 ns/B 1395 MiB/s 2.94 c/B 4300
After (~8% faster):
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
CBC dec | 0.611 ns/B 1561 MiB/s 2.64 c/B 4313
CFB dec | 0.616 ns/B 1549 MiB/s 2.65 c/B 4312
CTR enc | 0.625 ns/B 1525 MiB/s 2.69 c/B 4300
CTR dec | 0.625 ns/B 1526 MiB/s 2.69 c/B 4299
OCB enc | 0.639 ns/B 1493 MiB/s 2.75 c/B 4307
OCB dec | 0.642 ns/B 1485 MiB/s 2.76 c/B 4301
OCB auth | 0.631 ns/B 1512 MiB/s 2.71 c/B 4300
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/camellia-aesni-avx-amd64.S')
-rw-r--r-- | cipher/camellia-aesni-avx-amd64.S | 136 |
1 file changed, 53 insertions(+), 83 deletions(-)
diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S
index 4671bcfe..64cabaa5 100644
--- a/cipher/camellia-aesni-avx-amd64.S
+++ b/cipher/camellia-aesni-avx-amd64.S
@@ -1,6 +1,6 @@
 /* camellia-avx-aesni-amd64.S  -  AES-NI/AVX implementation of Camellia cipher
  *
- * Copyright (C) 2013-2015 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2013-2015,2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -35,7 +35,6 @@
 
 /* register macros */
 #define CTX %rdi
-#define RIO %r8
 
 /**********************************************************************
   helper macros
@@ -772,6 +771,7 @@ __camellia_enc_blk16:
 	/* input:
 	 *	%rdi: ctx, CTX
 	 *	%rax: temporary storage, 256 bytes
+	 *	%r8d: 24 for 16 byte key, 32 for larger
 	 *	%xmm0..%xmm15: 16 plaintext blocks
 	 * output:
 	 *	%xmm0..%xmm15: 16 encrypted blocks, order swapped:
@@ -781,42 +781,32 @@ __camellia_enc_blk16:
 
 	leaq 8 * 16(%rax), %rcx;
 
+	leaq (-8 * 8)(CTX, %r8, 8), %r8;
+
 	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 		      %xmm15, %rax, %rcx);
 
+.align 8
+.Lenc_loop:
 	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 		     %xmm15, %rax, %rcx, 0);
 
-	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
-	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
-	      %xmm15,
-	      ((key_table + (8) * 8) + 0)(CTX),
-	      ((key_table + (8) * 8) + 4)(CTX),
-	      ((key_table + (8) * 8) + 8)(CTX),
-	      ((key_table + (8) * 8) + 12)(CTX));
-
-	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
-		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
-		     %xmm15, %rax, %rcx, 8);
+	cmpq %r8, CTX;
+	je .Lenc_done;
+	leaq (8 * 8)(CTX), CTX;
 
 	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 	      %xmm15,
-	      ((key_table + (16) * 8) + 0)(CTX),
-	      ((key_table + (16) * 8) + 4)(CTX),
-	      ((key_table + (16) * 8) + 8)(CTX),
-	      ((key_table + (16) * 8) + 12)(CTX));
-
-	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
-		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
-		     %xmm15, %rax, %rcx, 16);
-
-	movl $24, %r8d;
-	cmpl $128, key_bitlength(CTX);
-	jne .Lenc_max32;
+	      ((key_table) + 0)(CTX),
+	      ((key_table) + 4)(CTX),
+	      ((key_table) + 8)(CTX),
+	      ((key_table) + 12)(CTX));
+	jmp .Lenc_loop;
 
+.align 8
 .Lenc_done:
 	/* load CD for output */
 	vmovdqu 0 * 16(%rcx), %xmm8;
@@ -830,27 +820,9 @@ __camellia_enc_blk16:
 
 	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
-		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
+		    %xmm15, ((key_table) + 8 * 8)(%r8), (%rax), 1 * 16(%rax));
 
 	ret;
-
-.align 8
-.Lenc_max32:
-	movl $32, %r8d;
-
-	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
-	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
-	      %xmm15,
-	      ((key_table + (24) * 8) + 0)(CTX),
-	      ((key_table + (24) * 8) + 4)(CTX),
-	      ((key_table + (24) * 8) + 8)(CTX),
-	      ((key_table + (24) * 8) + 12)(CTX));
-
-	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
-		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
-		     %xmm15, %rax, %rcx, 24);
-
-	jmp .Lenc_done;
 	CFI_ENDPROC();
 ELF(.size __camellia_enc_blk16,.-__camellia_enc_blk16;)
 
@@ -869,44 +841,38 @@ __camellia_dec_blk16:
 	 */
 	CFI_STARTPROC();
 
+	movq %r8, %rcx;
+	movq CTX, %r8
+	leaq (-8 * 8)(CTX, %rcx, 8), CTX;
+
 	leaq 8 * 16(%rax), %rcx;
 
 	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 		      %xmm15, %rax, %rcx);
 
-	cmpl $32, %r8d;
-	je .Ldec_max32;
-
-.Ldec_max24:
+.align 8
+.Ldec_loop:
 	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
-		     %xmm15, %rax, %rcx, 16);
-
-	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
-	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
-	      %xmm15,
-	      ((key_table + (16) * 8) + 8)(CTX),
-	      ((key_table + (16) * 8) + 12)(CTX),
-	      ((key_table + (16) * 8) + 0)(CTX),
-	      ((key_table + (16) * 8) + 4)(CTX));
+		     %xmm15, %rax, %rcx, 0);
 
-	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
-	             %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
-	             %xmm15, %rax, %rcx, 8);
+	cmpq %r8, CTX;
+	je .Ldec_done;
 
 	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 	      %xmm15,
-	      ((key_table + (8) * 8) + 8)(CTX),
-	      ((key_table + (8) * 8) + 12)(CTX),
-	      ((key_table + (8) * 8) + 0)(CTX),
-	      ((key_table + (8) * 8) + 4)(CTX));
+	      ((key_table) + 8)(CTX),
+	      ((key_table) + 12)(CTX),
+	      ((key_table) + 0)(CTX),
+	      ((key_table) + 4)(CTX));
 
-	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
-		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
-		     %xmm15, %rax, %rcx, 0);
+	leaq (-8 * 8)(CTX), CTX;
+	jmp .Ldec_loop;
 
+.align 8
+.Ldec_done:
 	/* load CD for output */
 	vmovdqu 0 * 16(%rcx), %xmm8;
 	vmovdqu 1 * 16(%rcx), %xmm9;
@@ -922,22 +888,6 @@ __camellia_dec_blk16:
 		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
 
 	ret;
-
-.align 8
-.Ldec_max32:
-	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
-		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
-		     %xmm15, %rax, %rcx, 24);
-
-	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
-	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
-	      %xmm15,
-	      ((key_table + (24) * 8) + 8)(CTX),
-	      ((key_table + (24) * 8) + 12)(CTX),
-	      ((key_table + (24) * 8) + 0)(CTX),
-	      ((key_table + (24) * 8) + 4)(CTX));
-
-	jmp .Ldec_max24;
 	CFI_ENDPROC();
 ELF(.size __camellia_dec_blk16,.-__camellia_dec_blk16;)
 
@@ -967,6 +917,11 @@ _gcry_camellia_aesni_avx_ctr_enc:
 
 	vzeroupper;
 
+	cmpl $128, key_bitlength(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d; /* max */
+
 	subq $(16 * 16), %rsp;
 	andq $~31, %rsp;
 	movq %rsp, %rax;
@@ -1163,6 +1118,11 @@ _gcry_camellia_aesni_avx_cfb_dec:
 
 	vzeroupper;
 
+	cmpl $128, key_bitlength(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d; /* max */
+
 	subq $(16 * 16), %rsp;
 	andq $~31, %rsp;
 	movq %rsp, %rax;
@@ -1307,6 +1267,11 @@ _gcry_camellia_aesni_avx_ocb_enc:
 	vmovdqu %xmm14, (%rcx);
 	vmovdqu %xmm15, (%r8);
 
+	cmpl $128, key_bitlength(CTX);
+	movl $32, %r8d;
+	movl $24, %r10d;
+	cmovel %r10d, %r8d; /* max */
+
 	/* inpack16_pre: */
 	vmovq (key_table)(CTX), %xmm15;
 	vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15;
@@ -1617,6 +1582,11 @@ _gcry_camellia_aesni_avx_ocb_auth:
 	OCB_INPUT(15, %r13, %xmm0);
 #undef OCB_INPUT
 
+	cmpl $128, key_bitlength(CTX);
+	movl $32, %r8d;
+	movl $24, %r10d;
+	cmovel %r10d, %r8d; /* max */
+
 	vmovdqu %xmm15, (%rdx);
 
 	movq %rcx, %r10;