diff options
author:    Jussi Kivilinna <jussi.kivilinna@iki.fi>  2020-06-17 00:52:53 +0300
committer: Jussi Kivilinna <jussi.kivilinna@iki.fi>  2020-07-23 18:41:37 +0300
commit:    4c0e244fc53e0f7b927bfe4cf54695b5d282fd27 (patch)
tree:      314fc5a03d86f9a7b9f968e3d155157a572bcbc8 /cipher/camellia-aesni-avx2-amd64.S
parent:    e0dabf74bf276500257f15b85ded9cf24ccc8334 (diff)
download:  libgcrypt-4c0e244fc53e0f7b927bfe4cf54695b5d282fd27.tar.gz
Camellia AES-NI/AVX/AVX2 size optimization
* cipher/camellia-aesni-avx-amd64.S: Use loop for handling repeating
'(enc|dec)_rounds16/fls16' portions of encryption/decryption.
* cipher/camellia-aesni-avx2-amd64.S: Use loop for handling repeating
'(enc|dec)_rounds32/fls32' portions of encryption/decryption.
--
Use round+fls loop to reduce binary size of Camellia AES-NI/AVX/AVX2
implementations. This also gives small performance boost on AMD Zen2.
Before:
text data bss dec hex filename
63877 0 0 63877 f985 cipher/.libs/camellia-aesni-avx2-amd64.o
59623 0 0 59623 e8e7 cipher/.libs/camellia-aesni-avx-amd64.o
After:
text data bss dec hex filename
22999 0 0 22999 59d7 cipher/.libs/camellia-aesni-avx2-amd64.o
25047 0 0 25047 61d7 cipher/.libs/camellia-aesni-avx-amd64.o
Benchmark on AMD Ryzen 7 3700X:
Before:
Cipher:
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
CBC dec | 0.670 ns/B 1424 MiB/s 2.88 c/B 4300
CFB dec | 0.667 ns/B 1430 MiB/s 2.87 c/B 4300
CTR enc | 0.677 ns/B 1410 MiB/s 2.91 c/B 4300
CTR dec | 0.676 ns/B 1412 MiB/s 2.90 c/B 4300
OCB enc | 0.696 ns/B 1370 MiB/s 2.98 c/B 4275
OCB dec | 0.698 ns/B 1367 MiB/s 2.98 c/B 4275
OCB auth | 0.683 ns/B 1395 MiB/s 2.94 c/B 4300
After (~8% faster):
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
CBC dec | 0.611 ns/B 1561 MiB/s 2.64 c/B 4313
CFB dec | 0.616 ns/B 1549 MiB/s 2.65 c/B 4312
CTR enc | 0.625 ns/B 1525 MiB/s 2.69 c/B 4300
CTR dec | 0.625 ns/B 1526 MiB/s 2.69 c/B 4299
OCB enc | 0.639 ns/B 1493 MiB/s 2.75 c/B 4307
OCB dec | 0.642 ns/B 1485 MiB/s 2.76 c/B 4301
OCB auth | 0.631 ns/B 1512 MiB/s 2.71 c/B 4300
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/camellia-aesni-avx2-amd64.S')
-rw-r--r-- | cipher/camellia-aesni-avx2-amd64.S | 135 |
1 file changed, 53 insertions(+), 82 deletions(-)
diff --git a/cipher/camellia-aesni-avx2-amd64.S b/cipher/camellia-aesni-avx2-amd64.S index 517e6880..f620f040 100644 --- a/cipher/camellia-aesni-avx2-amd64.S +++ b/cipher/camellia-aesni-avx2-amd64.S @@ -1,6 +1,6 @@ /* camellia-avx2-aesni-amd64.S - AES-NI/AVX2 implementation of Camellia cipher * - * Copyright (C) 2013-2015 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * Copyright (C) 2013-2015,2020 Jussi Kivilinna <jussi.kivilinna@iki.fi> * * This file is part of Libgcrypt. * @@ -751,6 +751,7 @@ __camellia_enc_blk32: /* input: * %rdi: ctx, CTX * %rax: temporary storage, 512 bytes + * %r8d: 24 for 16 byte key, 32 for larger * %ymm0..%ymm15: 32 plaintext blocks * output: * %ymm0..%ymm15: 32 encrypted blocks, order swapped: @@ -760,42 +761,32 @@ __camellia_enc_blk32: leaq 8 * 32(%rax), %rcx; + leaq (-8 * 8)(CTX, %r8, 8), %r8; + inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx); +.align 8 +.Lenc_loop: enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx, 0); - fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, - %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, - ((key_table + (8) * 8) + 0)(CTX), - ((key_table + (8) * 8) + 4)(CTX), - ((key_table + (8) * 8) + 8)(CTX), - ((key_table + (8) * 8) + 12)(CTX)); - - enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, - %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, %rax, %rcx, 8); + cmpq %r8, CTX; + je .Lenc_done; + leaq (8 * 8)(CTX), CTX; fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, - ((key_table + (16) * 8) + 0)(CTX), - ((key_table + (16) * 8) + 4)(CTX), - ((key_table + (16) * 8) + 8)(CTX), - ((key_table + (16) * 8) + 12)(CTX)); - - enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, 
%ymm6, %ymm7, - %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, %rax, %rcx, 16); - - movl $24, %r8d; - cmpl $128, key_bitlength(CTX); - jne .Lenc_max32; + ((key_table) + 0)(CTX), + ((key_table) + 4)(CTX), + ((key_table) + 8)(CTX), + ((key_table) + 12)(CTX)); + jmp .Lenc_loop; +.align 8 .Lenc_done: /* load CD for output */ vmovdqu 0 * 32(%rcx), %ymm8; @@ -809,27 +800,9 @@ __camellia_enc_blk32: outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax)); + %ymm15, ((key_table) + 8 * 8)(%r8), (%rax), 1 * 32(%rax)); ret; - -.align 8 -.Lenc_max32: - movl $32, %r8d; - - fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, - %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, - ((key_table + (24) * 8) + 0)(CTX), - ((key_table + (24) * 8) + 4)(CTX), - ((key_table + (24) * 8) + 8)(CTX), - ((key_table + (24) * 8) + 12)(CTX)); - - enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, - %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, %rax, %rcx, 24); - - jmp .Lenc_done; CFI_ENDPROC(); ELF(.size __camellia_enc_blk32,.-__camellia_enc_blk32;) @@ -848,44 +821,38 @@ __camellia_dec_blk32: */ CFI_STARTPROC(); + movq %r8, %rcx; + movq CTX, %r8 + leaq (-8 * 8)(CTX, %rcx, 8), CTX; + leaq 8 * 32(%rax), %rcx; inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx); - cmpl $32, %r8d; - je .Ldec_max32; - -.Ldec_max24: +.align 8 +.Ldec_loop: dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, %rax, %rcx, 16); - - fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, - %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, - ((key_table + (16) * 8) + 8)(CTX), - ((key_table + (16) * 8) + 12)(CTX), - 
((key_table + (16) * 8) + 0)(CTX), - ((key_table + (16) * 8) + 4)(CTX)); + %ymm15, %rax, %rcx, 0); - dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, - %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, %rax, %rcx, 8); + cmpq %r8, CTX; + je .Ldec_done; fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, - ((key_table + (8) * 8) + 8)(CTX), - ((key_table + (8) * 8) + 12)(CTX), - ((key_table + (8) * 8) + 0)(CTX), - ((key_table + (8) * 8) + 4)(CTX)); + ((key_table) + 8)(CTX), + ((key_table) + 12)(CTX), + ((key_table) + 0)(CTX), + ((key_table) + 4)(CTX)); - dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, - %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, %rax, %rcx, 0); + leaq (-8 * 8)(CTX), CTX; + jmp .Ldec_loop; +.align 8 +.Ldec_done: /* load CD for output */ vmovdqu 0 * 32(%rcx), %ymm8; vmovdqu 1 * 32(%rcx), %ymm9; @@ -901,22 +868,6 @@ __camellia_dec_blk32: %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax)); ret; - -.align 8 -.Ldec_max32: - dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, - %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, %rax, %rcx, 24); - - fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, - %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, - ((key_table + (24) * 8) + 8)(CTX), - ((key_table + (24) * 8) + 12)(CTX), - ((key_table + (24) * 8) + 0)(CTX), - ((key_table + (24) * 8) + 4)(CTX)); - - jmp .Ldec_max24; CFI_ENDPROC(); ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;) @@ -949,6 +900,11 @@ _gcry_camellia_aesni_avx2_ctr_enc: vzeroupper; + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + subq $(16 * 32), %rsp; andq $~63, %rsp; movq %rsp, %rax; @@ -1216,6 +1172,11 @@ _gcry_camellia_aesni_avx2_cfb_dec: vzeroupper; + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl 
$24, %eax; + cmovel %eax, %r8d; /* max */ + subq $(16 * 32), %rsp; andq $~63, %rsp; movq %rsp, %rax; @@ -1384,6 +1345,11 @@ _gcry_camellia_aesni_avx2_ocb_enc: vpxor %xmm13, %xmm15, %xmm15; vmovdqu %xmm15, (%r8); + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %r10d; + cmovel %r10d, %r8d; /* max */ + /* inpack16_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; @@ -1742,6 +1708,11 @@ _gcry_camellia_aesni_avx2_ocb_auth: vmovdqu %xmm14, (%rdx); + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %r10d; + cmovel %r10d, %r8d; /* max */ + movq %rcx, %r10; /* inpack16_pre: */ |