author      Jussi Kivilinna <jussi.kivilinna@iki.fi>    2021-08-13 16:20:23 +0300
committer   Jussi Kivilinna <jussi.kivilinna@iki.fi>    2021-08-26 20:30:31 +0300
commit      33aebb30d210768d510a2843d9cc0c0ecd4237d1 (patch)
tree        2bf9100543b3ce2ce2c978250a17771386b5ca2f /cipher/rijndael-vaes-avx2-amd64.S
parent      1b8994c4ecf2cb53fff46fa84a95a7c259e7cec7 (diff)
download    libgcrypt-33aebb30d210768d510a2843d9cc0c0ecd4237d1.tar.gz
Add x86 HW acceleration for GCM-SIV counter mode
* cipher/cipher-gcm-siv.c (do_ctr_le32): Use bulk function if available.
* cipher/cipher-internal.h (cipher_bulk_ops): Add 'ctr32le_enc'.
* cipher/rijndael-aesni.c (_gcry_aes_aesni_ctr32le_enc): New.
* cipher/rijndael-vaes-avx2-amd64.S (_gcry_vaes_avx2_ctr32le_enc_amd64,
.Lle_addd_*): New.
* cipher/rijndael-vaes.c (_gcry_vaes_avx2_ctr32le_enc_amd64)
(_gcry_aes_vaes_ctr32le_enc): New.
* cipher/rijndael.c (_gcry_aes_aesni_ctr32le_enc)
(_gcry_aes_vaes_ctr32le_enc): New prototypes.
(do_setkey): Add setup of 'bulk_ops->ctr32le_enc' for AES-NI and VAES.
* tests/basic.c (check_gcm_siv_cipher): Add large test-vector for bulk
ops testing.
--

In GCM-SIV, counter mode is little-endian and increments only the first
4 bytes of the counter block, unlike regular CTR mode, which treats the
full block as a big-endian counter.

Benchmark on AMD Ryzen 7 5800X:

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 GCM-SIV enc    |      1.00 ns/B     953.2 MiB/s      4.85 c/B      4850
 GCM-SIV dec    |      1.01 ns/B     940.1 MiB/s      4.92 c/B      4850
 GCM-SIV auth   |     0.118 ns/B      8051 MiB/s     0.575 c/B      4850

After (~6x faster):
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 GCM-SIV enc    |     0.150 ns/B      6367 MiB/s     0.727 c/B      4850
 GCM-SIV dec    |     0.161 ns/B      5909 MiB/s     0.783 c/B      4850
 GCM-SIV auth   |     0.118 ns/B      8051 MiB/s     0.574 c/B      4850

GnuPG-bug-id: T4485
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
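As a rough sketch of the counter semantics described above (an illustration only, not code from this patch; the helper name ctr32le_add is made up for the example), the little-endian 32-bit counter step looks like this in portable C:

    #include <stdint.h>

    /* Add 'add' to the 32-bit little-endian counter held in the first 4
     * bytes of the 16-byte counter block.  The remaining 12 bytes are
     * left untouched; regular CTR mode would instead treat the whole
     * block as one big-endian counter. */
    void
    ctr32le_add (unsigned char ctr[16], uint32_t add)
    {
      uint32_t lo = (uint32_t)ctr[0]
                  | ((uint32_t)ctr[1] << 8)
                  | ((uint32_t)ctr[2] << 16)
                  | ((uint32_t)ctr[3] << 24);

      lo += add;  /* wraps modulo 2^32 */

      ctr[0] = (unsigned char)(lo & 0xff);
      ctr[1] = (unsigned char)((lo >> 8) & 0xff);
      ctr[2] = (unsigned char)((lo >> 16) & 0xff);
      ctr[3] = (unsigned char)((lo >> 24) & 0xff);
    }

The assembly below performs the same step with vpaddd and the .Lle_addd_* constants, packing two counter blocks into each ymm register.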
Diffstat (limited to 'cipher/rijndael-vaes-avx2-amd64.S')
-rw-r--r--   cipher/rijndael-vaes-avx2-amd64.S   328
1 file changed, 328 insertions(+), 0 deletions(-)
diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S
index c4deea9b..d4ecf59f 100644
--- a/cipher/rijndael-vaes-avx2-amd64.S
+++ b/cipher/rijndael-vaes-avx2-amd64.S
@@ -1108,6 +1108,290 @@ _gcry_vaes_avx2_ctr_enc_amd64:
ELF(.size _gcry_vaes_avx2_ctr_enc_amd64,.-_gcry_vaes_avx2_ctr_enc_amd64)
/**********************************************************************
+ Little-endian 32-bit CTR-mode encryption (GCM-SIV)
+ **********************************************************************/
+ELF(.type _gcry_vaes_avx2_ctr32le_enc_amd64,@function)
+.globl _gcry_vaes_avx2_ctr32le_enc_amd64
+_gcry_vaes_avx2_ctr32le_enc_amd64:
+ /* input:
+ * %rdi: round keys
+ * %rsi: counter
+ * %rdx: dst
+ * %rcx: src
+ * %r8: nblocks
+ * %r9: nrounds
+ */
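+
+ /* Each 128-bit round key is broadcast to both lanes of a ymm register,
+  * so every ymm register processes two blocks at once.  The .Lle_addd_*
+  * constants add per-block increments to the low 32-bit word of each
+  * counter block (GCM-SIV's little-endian counter); %ymm15 carries the
+  * running counter across loop iterations. */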
+ CFI_STARTPROC();
+
+ vbroadcasti128 (%rsi), %ymm15; // CTR
+
+ /* Process 16 blocks per loop. */
+.align 8
+.Lctr32le_enc_blk16:
+ cmpq $16, %r8;
+ jb .Lctr32le_enc_blk8;
+
+ leaq -16(%r8), %r8;
+
+ vbroadcasti128 (0 * 16)(%rdi), %ymm8;
+
+ /* Increment counters. */
+ vpaddd .Lle_addd_0 rRIP, %ymm15, %ymm0;
+ vpaddd .Lle_addd_2 rRIP, %ymm15, %ymm1;
+ vpaddd .Lle_addd_4 rRIP, %ymm15, %ymm2;
+ vpaddd .Lle_addd_6 rRIP, %ymm15, %ymm3;
+ vpaddd .Lle_addd_8 rRIP, %ymm15, %ymm4;
+ vpaddd .Lle_addd_10 rRIP, %ymm15, %ymm5;
+ vpaddd .Lle_addd_12 rRIP, %ymm15, %ymm6;
+ vpaddd .Lle_addd_14 rRIP, %ymm15, %ymm7;
+
+ vpaddd .Lle_addd_16_2 rRIP, %ymm15, %ymm15;
+
+ /* AES rounds */
+ XOR8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (1 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (10 * 16)(%rdi), %ymm8;
+ cmpl $12, %r9d;
+ jb .Lctr32le_enc_blk16_last;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (12 * 16)(%rdi), %ymm8;
+ jz .Lctr32le_enc_blk16_last;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (14 * 16)(%rdi), %ymm8;
+
+ /* Last round and output handling. */
+ .Lctr32le_enc_blk16_last:
+ vpxor (0 * 16)(%rcx), %ymm8, %ymm9; /* Xor src to last round key. */
+ vpxor (2 * 16)(%rcx), %ymm8, %ymm10;
+ vpxor (4 * 16)(%rcx), %ymm8, %ymm11;
+ vpxor (6 * 16)(%rcx), %ymm8, %ymm12;
+ vaesenclast %ymm9, %ymm0, %ymm0;
+ vaesenclast %ymm10, %ymm1, %ymm1;
+ vaesenclast %ymm11, %ymm2, %ymm2;
+ vaesenclast %ymm12, %ymm3, %ymm3;
+ vpxor (8 * 16)(%rcx), %ymm8, %ymm9;
+ vpxor (10 * 16)(%rcx), %ymm8, %ymm10;
+ vpxor (12 * 16)(%rcx), %ymm8, %ymm11;
+ vpxor (14 * 16)(%rcx), %ymm8, %ymm8;
+ leaq (16 * 16)(%rcx), %rcx;
+ vaesenclast %ymm9, %ymm4, %ymm4;
+ vaesenclast %ymm10, %ymm5, %ymm5;
+ vaesenclast %ymm11, %ymm6, %ymm6;
+ vaesenclast %ymm8, %ymm7, %ymm7;
+ vmovdqu %ymm0, (0 * 16)(%rdx);
+ vmovdqu %ymm1, (2 * 16)(%rdx);
+ vmovdqu %ymm2, (4 * 16)(%rdx);
+ vmovdqu %ymm3, (6 * 16)(%rdx);
+ vmovdqu %ymm4, (8 * 16)(%rdx);
+ vmovdqu %ymm5, (10 * 16)(%rdx);
+ vmovdqu %ymm6, (12 * 16)(%rdx);
+ vmovdqu %ymm7, (14 * 16)(%rdx);
+ leaq (16 * 16)(%rdx), %rdx;
+
+ jmp .Lctr32le_enc_blk16;
+
+ /* Handle trailing eight blocks. */
+.align 8
+.Lctr32le_enc_blk8:
+ cmpq $8, %r8;
+ jb .Lctr32le_enc_blk4;
+
+ leaq -8(%r8), %r8;
+
+ vbroadcasti128 (0 * 16)(%rdi), %ymm4;
+
+ /* Increment counters. */
+ vpaddd .Lle_addd_0 rRIP, %ymm15, %ymm0;
+ vpaddd .Lle_addd_2 rRIP, %ymm15, %ymm1;
+ vpaddd .Lle_addd_4 rRIP, %ymm15, %ymm2;
+ vpaddd .Lle_addd_6 rRIP, %ymm15, %ymm3;
+
+ vpaddd .Lle_addd_8_2 rRIP, %ymm15, %ymm15;
+
+ /* AES rounds */
+ XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+ cmpl $12, %r9d;
+ jb .Lctr32le_enc_blk8_last;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+ jz .Lctr32le_enc_blk8_last;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+
+ /* Last round and output handling. */
+ .Lctr32le_enc_blk8_last:
+ vpxor (0 * 16)(%rcx), %ymm4, %ymm5; /* Xor src to last round key. */
+ vpxor (2 * 16)(%rcx), %ymm4, %ymm6;
+ vpxor (4 * 16)(%rcx), %ymm4, %ymm7;
+ vpxor (6 * 16)(%rcx), %ymm4, %ymm4;
+ leaq (8 * 16)(%rcx), %rcx;
+ vaesenclast %ymm5, %ymm0, %ymm0;
+ vaesenclast %ymm6, %ymm1, %ymm1;
+ vaesenclast %ymm7, %ymm2, %ymm2;
+ vaesenclast %ymm4, %ymm3, %ymm3;
+ vmovdqu %ymm0, (0 * 16)(%rdx);
+ vmovdqu %ymm1, (2 * 16)(%rdx);
+ vmovdqu %ymm2, (4 * 16)(%rdx);
+ vmovdqu %ymm3, (6 * 16)(%rdx);
+ leaq (8 * 16)(%rdx), %rdx;
+
+ /* Handle trailing four blocks. */
+.align 8
+.Lctr32le_enc_blk4:
+ cmpq $4, %r8;
+ jb .Lctr32le_enc_blk1;
+
+ leaq -4(%r8), %r8;
+
+ vbroadcasti128 (0 * 16)(%rdi), %ymm4;
+
+ /* Increment counters. */
+ vpaddd .Lle_addd_0 rRIP, %ymm15, %ymm0;
+ vpaddd .Lle_addd_2 rRIP, %ymm15, %ymm1;
+
+ vpaddd .Lle_addd_4_2 rRIP, %ymm15, %ymm15;
+
+ /* AES rounds */
+ XOR2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+ cmpl $12, %r9d;
+ jb .Lctr32le_enc_blk4_last;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+ jz .Lctr32le_enc_blk4_last;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+
+ /* Last round and output handling. */
+ .Lctr32le_enc_blk4_last:
+ vpxor (0 * 16)(%rcx), %ymm4, %ymm5; /* Xor src to last round key. */
+ vpxor (2 * 16)(%rcx), %ymm4, %ymm6;
+ leaq (4 * 16)(%rcx), %rcx;
+ vaesenclast %ymm5, %ymm0, %ymm0;
+ vaesenclast %ymm6, %ymm1, %ymm1;
+ vmovdqu %ymm0, (0 * 16)(%rdx);
+ vmovdqu %ymm1, (2 * 16)(%rdx);
+ leaq (4 * 16)(%rdx), %rdx;
+
+ /* Process trailing one to three blocks, one per loop. */
+.align 8
+.Lctr32le_enc_blk1:
+ cmpq $1, %r8;
+ jb .Ldone_ctr32le_enc;
+
+ leaq -1(%r8), %r8;
+
+ /* Load and increment counter. */
+ vmovdqu %xmm15, %xmm0;
+ vpaddd .Lle_addd_1 rRIP, %xmm15, %xmm15;
+
+ /* AES rounds. */
+ vpxor (0 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (3 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (4 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (5 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (6 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (7 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (8 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (9 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (10 * 16)(%rdi), %xmm1;
+ cmpl $12, %r9d;
+ jb .Lctr32le_enc_blk1_last;
+ vaesenc %xmm1, %xmm0, %xmm0;
+ vaesenc (11 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (12 * 16)(%rdi), %xmm1;
+ jz .Lctr32le_enc_blk1_last;
+ vaesenc %xmm1, %xmm0, %xmm0;
+ vaesenc (13 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (14 * 16)(%rdi), %xmm1;
+
+ /* Last round and output handling. */
+ .Lctr32le_enc_blk1_last:
+ vpxor (%rcx), %xmm1, %xmm1; /* Xor src to last round key. */
+ leaq 16(%rcx), %rcx;
+ vaesenclast %xmm1, %xmm0, %xmm0; /* Last round and xor with xmm1. */
+ vmovdqu %xmm0, (%rdx);
+ leaq 16(%rdx), %rdx;
+
+ jmp .Lctr32le_enc_blk1;
+
+.align 8
+.Ldone_ctr32le_enc:
+ vmovdqu %xmm15, (%rsi);
+ vzeroall;
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_vaes_avx2_ctr32le_enc_amd64,.-_gcry_vaes_avx2_ctr32le_enc_amd64)
+
+/**********************************************************************
OCB-mode encryption/decryption
**********************************************************************/
ELF(.type _gcry_vaes_avx2_ocb_checksum,@function)
@@ -2677,6 +2961,50 @@ _gcry_vaes_consts:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
.Lbige_addb_15:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+
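+ /* A 32-byte load starting at .Lle_addd_<n> (n even) picks up the pair
+  * {n, n+1}: the little-endian 32-bit increments for the two counter
+  * blocks packed into one ymm register.  The *_2 entries add the same
+  * value to both lanes and are used to advance the running counter. */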
+.Lle_addd_0:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_1:
+ .byte 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_2:
+ .byte 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_3:
+ .byte 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_4:
+ .byte 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_5:
+ .byte 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_6:
+ .byte 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_7:
+ .byte 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_8:
+ .byte 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_9:
+ .byte 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_10:
+ .byte 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_11:
+ .byte 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_12:
+ .byte 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_13:
+ .byte 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_14:
+ .byte 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_15:
+ .byte 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+
+.Lle_addd_4_2:
+ .byte 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_8_2:
+ .byte 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_16_2:
+ .byte 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+
.Lxts_gfmul_clmul:
.long 0x00, 0x87, 0x00, 0x00
.long 0x00, 0x87, 0x00, 0x00