Diffstat (limited to 'FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/poly1305_asm.S')
-rw-r--r--  FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/poly1305_asm.S  1105
1 files changed, 1105 insertions, 0 deletions
diff --git a/FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/poly1305_asm.S b/FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/poly1305_asm.S
new file mode 100644
index 000000000..95711075b
--- /dev/null
+++ b/FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/poly1305_asm.S
@@ -0,0 +1,1105 @@
+/* poly1305_asm
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifndef HAVE_INTEL_AVX1
+#define HAVE_INTEL_AVX1
+#endif /* HAVE_INTEL_AVX1 */
+#ifndef NO_AVX2_SUPPORT
+#define HAVE_INTEL_AVX2
+#endif /* NO_AVX2_SUPPORT */
+
+#ifdef HAVE_INTEL_AVX1
+#ifndef __APPLE__
+.text
+.globl poly1305_setkey_avx
+.type poly1305_setkey_avx,@function
+.align 4
+poly1305_setkey_avx:
+#else
+.section __TEXT,__text
+.globl _poly1305_setkey_avx
+.p2align 2
+_poly1305_setkey_avx:
+#endif /* __APPLE__ */
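+ # Load the 32-byte key: the low 16 bytes are clamped with the RFC 8439
+ # masks (0x0ffffffc0fffffff, 0x0ffffffc0ffffffc) to form r, h is
+ # zeroed, the high 16 bytes are kept as the final pad, and small
+ # multiples of r appear to be precomputed for the r*h[2] table lookups
+ # used by the block routines.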
+ movabsq $0xffffffc0fffffff, %r10
+ movabsq $0xffffffc0ffffffc, %r11
+ movq (%rsi), %rdx
+ movq 8(%rsi), %rax
+ movq 16(%rsi), %rcx
+ movq 24(%rsi), %r8
+ andq %r10, %rdx
+ andq %r11, %rax
+ movq %rdx, %r10
+ movq %rax, %r11
+ xorq %r9, %r9
+ movq %rdx, (%rdi)
+ movq %rax, 8(%rdi)
+ movq %r9, 24(%rdi)
+ movq %r9, 32(%rdi)
+ movq %r9, 40(%rdi)
+ movq %rcx, 48(%rdi)
+ movq %r8, 56(%rdi)
+ movq %r9, 352(%rdi)
+ movq %r9, 408(%rdi)
+ movq %rdx, 360(%rdi)
+ movq %rax, 416(%rdi)
+ addq %rdx, %r10
+ addq %rax, %r11
+ movq %r10, 368(%rdi)
+ movq %r11, 424(%rdi)
+ addq %rdx, %r10
+ addq %rax, %r11
+ movq %r10, 376(%rdi)
+ movq %r11, 432(%rdi)
+ addq %rdx, %r10
+ addq %rax, %r11
+ movq %r10, 384(%rdi)
+ movq %r11, 440(%rdi)
+ addq %rdx, %r10
+ addq %rax, %r11
+ movq %r10, 392(%rdi)
+ movq %r11, 448(%rdi)
+ addq %rdx, %r10
+ addq %rax, %r11
+ movq %r10, 400(%rdi)
+ movq %r11, 456(%rdi)
+ movq %r9, 608(%rdi)
+ movb $0x01, 616(%rdi)
+ repz retq
+#ifndef __APPLE__
+.size poly1305_setkey_avx,.-poly1305_setkey_avx
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl poly1305_block_avx
+.type poly1305_block_avx,@function
+.align 4
+poly1305_block_avx:
+#else
+.section __TEXT,__text
+.globl _poly1305_block_avx
+.p2align 2
+_poly1305_block_avx:
+#endif /* __APPLE__ */
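+ # Process a single 16-byte block: h += m + hibit*2^128, then
+ # h = (h * r) mod (2^130 - 5).  The byte at offset 616 appears to
+ # supply the hibit: 1 for full blocks, cleared for the final padded
+ # block.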
+ pushq %r15
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ movq (%rdi), %r15
+ movq 8(%rdi), %rbx
+ movq 24(%rdi), %r8
+ movq 32(%rdi), %r9
+ movq 40(%rdi), %r10
+ xorq %r14, %r14
+ movb 616(%rdi), %r14b
+ # h += m
+ movq (%rsi), %r11
+ movq 8(%rsi), %r12
+ addq %r11, %r8
+ adcq %r12, %r9
+ movq %rbx, %rax
+ adcq %r14, %r10
+ # r[1] * h[0] => rdx, rax ==> t2, t1
+ mulq %r8
+ movq %rax, %r12
+ movq %rdx, %r13
+ # r[0] * h[1] => rdx, rax ++> t2, t1
+ movq %r15, %rax
+ mulq %r9
+ addq %rax, %r12
+ movq %r15, %rax
+ adcq %rdx, %r13
+ # r[0] * h[0] => rdx, rax ==> t4, t0
+ mulq %r8
+ movq %rax, %r11
+ movq %rdx, %r8
+ # r[1] * h[1] => rdx, rax =+> t3, t2
+ movq %rbx, %rax
+ mulq %r9
+ # r[0] * h[2] +> t2
+ addq 352(%rdi,%r10,8), %r13
+ movq %rdx, %r14
+ addq %r8, %r12
+ adcq %rax, %r13
+ # r[1] * h[2] +> t3
+ adcq 408(%rdi,%r10,8), %r14
+ # r * h in r14, r13, r12, r11
+ # h = (r * h) mod 2^130 - 5
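+ # Since 2^130 = 5 (mod 2^130 - 5), the bits of the product above bit
+ # 130 are multiplied by 5 (as x + 4*x) and folded back into the low
+ # 130 bits, leaving h only partially reduced.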
+ movq %r13, %r10
+ andq $-4, %r13
+ andq $3, %r10
+ addq %r13, %r11
+ movq %r13, %r8
+ adcq %r14, %r12
+ adcq $0x00, %r10
+ shrdq $2, %r14, %r8
+ shrq $2, %r14
+ addq %r11, %r8
+ adcq %r14, %r12
+ movq %r12, %r9
+ adcq $0x00, %r10
+ # h in r10, r9, r8
+ # Store h to ctx
+ movq %r8, 24(%rdi)
+ movq %r9, 32(%rdi)
+ movq %r10, 40(%rdi)
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ popq %r15
+ repz retq
+#ifndef __APPLE__
+.size poly1305_block_avx,.-poly1305_block_avx
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl poly1305_blocks_avx
+.type poly1305_blocks_avx,@function
+.align 4
+poly1305_blocks_avx:
+#else
+.section __TEXT,__text
+.globl _poly1305_blocks_avx
+.p2align 2
+_poly1305_blocks_avx:
+#endif /* __APPLE__ */
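+ # Same per-block computation as poly1305_block_avx, looped over the
+ # message: rsi is the data pointer and rdx the byte count (expected to
+ # be a multiple of 16), copied into rcx as the loop counter.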
+ pushq %r15
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ movq %rdx, %rcx
+ movq (%rdi), %r15
+ movq 8(%rdi), %rbx
+ movq 24(%rdi), %r8
+ movq 32(%rdi), %r9
+ movq 40(%rdi), %r10
+L_poly1305_avx_blocks_start:
+ # h += m
+ movq (%rsi), %r11
+ movq 8(%rsi), %r12
+ addq %r11, %r8
+ adcq %r12, %r9
+ movq %rbx, %rax
+ adcq $0x00, %r10
+ # r[1] * h[0] => rdx, rax ==> t2, t1
+ mulq %r8
+ movq %rax, %r12
+ movq %rdx, %r13
+ # r[0] * h[1] => rdx, rax ++> t2, t1
+ movq %r15, %rax
+ mulq %r9
+ addq %rax, %r12
+ movq %r15, %rax
+ adcq %rdx, %r13
+ # r[0] * h[0] => rdx, rax ==> t4, t0
+ mulq %r8
+ movq %rax, %r11
+ movq %rdx, %r8
+ # r[1] * h[1] => rdx, rax =+> t3, t2
+ movq %rbx, %rax
+ mulq %r9
+ # r[0] * h[2] +> t2
+ addq 360(%rdi,%r10,8), %r13
+ movq %rdx, %r14
+ addq %r8, %r12
+ adcq %rax, %r13
+ # r[1] * h[2] +> t3
+ adcq 416(%rdi,%r10,8), %r14
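+ # The table base here is 360/416 rather than the 352/408 used in
+ # poly1305_block_avx; indexing one entry further appears to fold in
+ # the always-set 2^128 bit of a full block, i.e. r[0..1]*(h[2] + 1).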
+ # r * h in r14, r13, r12, r11
+ # h = (r * h) mod 2^130 - 5
+ movq %r13, %r10
+ andq $-4, %r13
+ andq $3, %r10
+ addq %r13, %r11
+ movq %r13, %r8
+ adcq %r14, %r12
+ adcq $0x00, %r10
+ shrdq $2, %r14, %r8
+ shrq $2, %r14
+ addq %r11, %r8
+ adcq %r14, %r12
+ movq %r12, %r9
+ adcq $0x00, %r10
+ # h in r10, r9, r8
+ # Next block from message
+ addq $16, %rsi
+ subq $16, %rcx
+ jg L_poly1305_avx_blocks_start
+ # Store h to ctx
+ movq %r8, 24(%rdi)
+ movq %r9, 32(%rdi)
+ movq %r10, 40(%rdi)
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ popq %r15
+ repz retq
+#ifndef __APPLE__
+.size poly1305_blocks_avx,.-poly1305_blocks_avx
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl poly1305_final_avx
+.type poly1305_final_avx,@function
+.align 4
+poly1305_final_avx:
+#else
+.section __TEXT,__text
+.globl _poly1305_final_avx
+.p2align 2
+_poly1305_final_avx:
+#endif /* __APPLE__ */
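+ # Finalize: pad and process any buffered partial block (without the
+ # 2^128 bit), fully reduce h mod 2^130 - 5, add the pad and write the
+ # 16-byte tag to the buffer in rsi, then clear r, h and the pad in the
+ # context.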
+ pushq %rbx
+ pushq %r12
+ movq %rsi, %rbx
+ movq 608(%rdi), %rax
+ testq %rax, %rax
+ je L_poly1305_avx_final_no_more
+ movb $0x01, 480(%rdi,%rax,1)
+ jmp L_poly1305_avx_final_cmp_rem
+L_poly1305_avx_final_zero_rem:
+ movb $0x00, 480(%rdi,%rax,1)
+L_poly1305_avx_final_cmp_rem:
+ incb %al
+ cmpq $16, %rax
+ jl L_poly1305_avx_final_zero_rem
+ movb $0x00, 616(%rdi)
+ leaq 480(%rdi), %rsi
+#ifndef __APPLE__
+ callq poly1305_block_avx@plt
+#else
+ callq _poly1305_block_avx
+#endif /* __APPLE__ */
+L_poly1305_avx_final_no_more:
+ movq 24(%rdi), %rax
+ movq 32(%rdi), %rdx
+ movq 40(%rdi), %rcx
+ movq 48(%rdi), %r11
+ movq 56(%rdi), %r12
+ # h %= p, where p = 2^130 - 5
+ # then tag = (h + pad) mod 2^128
+ movq %rcx, %r8
+ andq $3, %rcx
+ shrq $2, %r8
+ # Multiply by 5
+ leaq 0(%r8,%r8,4), %r8
+ addq %r8, %rax
+ adcq $0x00, %rdx
+ adcq $0x00, %rcx
+ # Fixup when h is between (1 << 130) - 5 and (1 << 130) - 1
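+ # Only one conditional subtraction of p is needed: if h >= p then
+ # h + 5 carries into bit 130 (top word == 4) and the cmovs below keep
+ # (h + 5) mod 2^128, which equals h - p; otherwise h is left as is.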
+ movq %rax, %r8
+ movq %rdx, %r9
+ movq %rcx, %r10
+ addq $5, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ cmpq $4, %r10
+ cmoveq %r8, %rax
+ cmoveq %r9, %rdx
+ # h += pad
+ addq %r11, %rax
+ adcq %r12, %rdx
+ movq %rax, (%rbx)
+ movq %rdx, 8(%rbx)
+ # Zero out r
+ movq $0x00, (%rdi)
+ movq $0x00, 8(%rdi)
+ # Zero out h
+ movq $0x00, 24(%rdi)
+ movq $0x00, 32(%rdi)
+ movq $0x00, 40(%rdi)
+ # Zero out pad
+ movq $0x00, 48(%rdi)
+ movq $0x00, 56(%rdi)
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size poly1305_final_avx,.-poly1305_final_avx
+#endif /* __APPLE__ */
+#endif /* HAVE_INTEL_AVX1 */
+#ifdef HAVE_INTEL_AVX2
+#ifndef __APPLE__
+.text
+.globl poly1305_calc_powers_avx2
+.type poly1305_calc_powers_avx2,@function
+.align 4
+poly1305_calc_powers_avx2:
+#else
+.section __TEXT,__text
+.globl _poly1305_calc_powers_avx2
+.p2align 2
+_poly1305_calc_powers_avx2:
+#endif /* __APPLE__ */
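+ # Compute r, r^2, r^3 and r^4 mod 2^130 - 5 and store each as five
+ # 26-bit limbs in 32-bit slots (offsets 224, 256, 288 and 320) for the
+ # 4-way AVX2 block code.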
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbx
+ pushq %rbp
+ movq (%rdi), %rcx
+ movq 8(%rdi), %r8
+ xorq %r9, %r9
+ # Convert to 26 bits in 32
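+ # Split the 130-bit value into five 26-bit limbs (bits 0-25, 26-51,
+ # 52-77, 78-103 and 104-129), each widened to a 32-bit slot.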
+ movq %rcx, %rax
+ movq %rcx, %rdx
+ movq %rcx, %rsi
+ movq %r8, %rbx
+ movq %r8, %rbp
+ shrq $26, %rdx
+ shrdq $52, %r8, %rsi
+ shrq $14, %rbx
+ shrdq $40, %r9, %rbp
+ andq $0x3ffffff, %rax
+ andq $0x3ffffff, %rdx
+ andq $0x3ffffff, %rsi
+ andq $0x3ffffff, %rbx
+ andq $0x3ffffff, %rbp
+ movl %eax, 224(%rdi)
+ movl %edx, 228(%rdi)
+ movl %esi, 232(%rdi)
+ movl %ebx, 236(%rdi)
+ movl %ebp, 240(%rdi)
+ movl $0x00, 244(%rdi)
+ # Square 128-bit
+ movq %r8, %rax
+ mulq %rcx
+ xorq %r13, %r13
+ movq %rax, %r11
+ movq %rdx, %r12
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ movq %rcx, %rax
+ mulq %rax
+ movq %rax, %r10
+ movq %rdx, %r15
+ movq %r8, %rax
+ mulq %rax
+ addq %r15, %r11
+ adcq %rax, %r12
+ adcq %rdx, %r13
+ # Reduce 256-bit to 130-bit
+ movq %r12, %rax
+ movq %r13, %rdx
+ andq $-4, %rax
+ andq $3, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ shrdq $2, %rdx, %rax
+ shrq $2, %rdx
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ movq %r12, %rax
+ shrq $2, %rax
+ leaq 0(%rax,%rax,4), %rax
+ andq $3, %r12
+ addq %rax, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Convert to 26 bits in 32
+ movq %r10, %rax
+ movq %r10, %rdx
+ movq %r10, %rsi
+ movq %r11, %rbx
+ movq %r11, %rbp
+ shrq $26, %rdx
+ shrdq $52, %r11, %rsi
+ shrq $14, %rbx
+ shrdq $40, %r12, %rbp
+ andq $0x3ffffff, %rax
+ andq $0x3ffffff, %rdx
+ andq $0x3ffffff, %rsi
+ andq $0x3ffffff, %rbx
+ andq $0x3ffffff, %rbp
+ movl %eax, 256(%rdi)
+ movl %edx, 260(%rdi)
+ movl %esi, 264(%rdi)
+ movl %ebx, 268(%rdi)
+ movl %ebp, 272(%rdi)
+ movl $0x00, 276(%rdi)
+ # Multiply 128-bit by 130-bit
+ # r1[0] * r2[0]
+ movq %rcx, %rax
+ mulq %r10
+ movq %rax, %r13
+ movq %rdx, %r14
+ # r1[0] * r2[1]
+ movq %rcx, %rax
+ mulq %r11
+ movq $0x00, %r15
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # r1[1] * r2[0]
+ movq %r8, %rax
+ mulq %r10
+ movq $0x00, %rsi
+ addq %rax, %r14
+ adcq %rdx, %r15
+ adcq $0x00, %rsi
+ # r1[0] * r2[2]
+ movq %rcx, %rax
+ mulq %r12
+ addq %rax, %r15
+ adcq %rdx, %rsi
+ # r1[1] * r2[1]
+ movq %r8, %rax
+ mulq %r11
+ movq $0x00, %rbx
+ addq %rax, %r15
+ adcq %rdx, %rsi
+ adcq $0x00, %rbx
+ # r1[1] * r2[2]
+ movq %r8, %rax
+ mulq %r12
+ addq %rax, %rsi
+ adcq %rdx, %rbx
+ # Reduce 260-bit to 130-bit
+ movq %r15, %rax
+ movq %rsi, %rdx
+ movq %rbx, %rbx
+ andq $-4, %rax
+ andq $3, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq %rbx, %r15
+ shrdq $2, %rdx, %rax
+ shrdq $2, %rbx, %rdx
+ shrq $2, %rbx
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq %rbx, %r15
+ movq %r15, %rax
+ andq $3, %r15
+ shrq $2, %rax
+ leaq 0(%rax,%rax,4), %rax
+ addq %rax, %r13
+ adcq $0x00, %r14
+ adcq $0x00, %r15
+ # Convert to 26 bits in 32
+ movq %r13, %rax
+ movq %r13, %rdx
+ movq %r13, %rsi
+ movq %r14, %rbx
+ movq %r14, %rbp
+ shrq $26, %rdx
+ shrdq $52, %r14, %rsi
+ shrq $14, %rbx
+ shrdq $40, %r15, %rbp
+ andq $0x3ffffff, %rax
+ andq $0x3ffffff, %rdx
+ andq $0x3ffffff, %rsi
+ andq $0x3ffffff, %rbx
+ andq $0x3ffffff, %rbp
+ movl %eax, 288(%rdi)
+ movl %edx, 292(%rdi)
+ movl %esi, 296(%rdi)
+ movl %ebx, 300(%rdi)
+ movl %ebp, 304(%rdi)
+ movl $0x00, 308(%rdi)
+ # Square 130-bit
+ movq %r11, %rax
+ mulq %r10
+ xorq %r13, %r13
+ movq %rax, %r8
+ movq %rdx, %r9
+ addq %rax, %r8
+ adcq %rdx, %r9
+ adcq $0x00, %r13
+ movq %r10, %rax
+ mulq %rax
+ movq %rax, %rcx
+ movq %rdx, %r15
+ movq %r11, %rax
+ mulq %rax
+ addq %r15, %r8
+ adcq %rax, %r9
+ adcq %rdx, %r13
+ movq %r12, %rax
+ mulq %rax
+ movq %rax, %r14
+ movq %r12, %rax
+ mulq %r10
+ addq %rax, %r9
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ addq %rax, %r9
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ movq %r12, %rax
+ mulq %r11
+ addq %rax, %r13
+ adcq %rdx, %r14
+ addq %rax, %r13
+ adcq %rdx, %r14
+ # Reduce 260-bit to 130-bit
+ movq %r9, %rax
+ movq %r13, %rdx
+ movq %r14, %r15
+ andq $-4, %rax
+ andq $3, %r9
+ addq %rax, %rcx
+ adcq %rdx, %r8
+ adcq %r15, %r9
+ shrdq $2, %rdx, %rax
+ shrdq $2, %r15, %rdx
+ shrq $2, %r15
+ addq %rax, %rcx
+ adcq %rdx, %r8
+ adcq %r15, %r9
+ movq %r9, %rax
+ andq $3, %r9
+ shrq $2, %rax
+ leaq 0(%rax,%rax,4), %rax
+ addq %rax, %rcx
+ adcq $0x00, %r8
+ adcq $0x00, %r9
+ # Convert to 26 bits in 32
+ movq %rcx, %rax
+ movq %rcx, %rdx
+ movq %rcx, %rsi
+ movq %r8, %rbx
+ movq %r8, %rbp
+ shrq $26, %rdx
+ shrdq $52, %r8, %rsi
+ shrq $14, %rbx
+ shrdq $40, %r9, %rbp
+ andq $0x3ffffff, %rax
+ andq $0x3ffffff, %rdx
+ andq $0x3ffffff, %rsi
+ andq $0x3ffffff, %rbx
+ andq $0x3ffffff, %rbp
+ movl %eax, 320(%rdi)
+ movl %edx, 324(%rdi)
+ movl %esi, 328(%rdi)
+ movl %ebx, 332(%rdi)
+ movl %ebp, 336(%rdi)
+ movl $0x00, 340(%rdi)
+ popq %rbp
+ popq %rbx
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size poly1305_calc_powers_avx2,.-poly1305_calc_powers_avx2
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl poly1305_setkey_avx2
+.type poly1305_setkey_avx2,@function
+.align 4
+poly1305_setkey_avx2:
+#else
+.section __TEXT,__text
+.globl _poly1305_setkey_avx2
+.p2align 2
+_poly1305_setkey_avx2:
+#endif /* __APPLE__ */
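+ # Reuse the AVX key setup, then clear the 4-way accumulator state and
+ # what appear to be the buffered-byte count and state flags at offsets
+ # 608 and 616.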
+#ifndef __APPLE__
+ callq poly1305_setkey_avx@plt
+#else
+ callq _poly1305_setkey_avx
+#endif /* __APPLE__ */
+ vpxor %ymm0, %ymm0, %ymm0
+ vmovdqu %ymm0, 64(%rdi)
+ vmovdqu %ymm0, 96(%rdi)
+ vmovdqu %ymm0, 128(%rdi)
+ vmovdqu %ymm0, 160(%rdi)
+ vmovdqu %ymm0, 192(%rdi)
+ movq $0x00, 608(%rdi)
+ movw $0x00, 616(%rdi)
+ repz retq
+#ifndef __APPLE__
+.size poly1305_setkey_avx2,.-poly1305_setkey_avx2
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_poly1305_avx2_blocks_mask:
+.quad 0x3ffffff, 0x3ffffff
+.quad 0x3ffffff, 0x3ffffff
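+# Mask for one 26-bit limb, replicated across all four 64-bit lanes.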
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_poly1305_avx2_blocks_hibit:
+.quad 0x1000000, 0x1000000
+.quad 0x1000000, 0x1000000
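+# 2^24 in limb 4 (bits 104-129) is the 2^128 bit that Poly1305 appends
+# to every full 16-byte block.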
+#ifndef __APPLE__
+.text
+.globl poly1305_blocks_avx2
+.type poly1305_blocks_avx2,@function
+.align 4
+poly1305_blocks_avx2:
+#else
+.section __TEXT,__text
+.globl _poly1305_blocks_avx2
+.p2align 2
+_poly1305_blocks_avx2:
+#endif /* __APPLE__ */
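+ # Process 64 bytes (four blocks) per iteration: the four running H
+ # values are held as five 26-bit limbs spread across the 64-bit lanes
+ # of ymm0-ymm4 and multiplied by r^4 each round; on the final pass the
+ # lanes are multiplied by r^4, r^3, r^2 and r and summed.  rcx and rbx
+ # appear to point into an aligned stack area holding those values and
+ # their 5x multiples.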
+ pushq %r12
+ pushq %rbx
+ subq $0x140, %rsp
+ movq %rsp, %rcx
+ andq $-32, %rcx
+ addq $32, %rcx
+ vpxor %ymm15, %ymm15, %ymm15
+ movq %rcx, %rbx
+ leaq 64(%rdi), %rax
+ addq $0xa0, %rbx
+ cmpw $0x00, 616(%rdi)
+ jne L_poly1305_avx2_blocks_begin_h
+ # Load the message data
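+ # Interleave the four 16-byte blocks so that limb i of every block
+ # lands in the same register, shift/mask into 26-bit limbs, and add
+ # the 2^128 block bit from L_poly1305_avx2_blocks_hibit.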
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %ymm1
+ vperm2i128 $32, %ymm1, %ymm0, %ymm2
+ vperm2i128 $49, %ymm1, %ymm0, %ymm0
+ vpunpckldq %ymm0, %ymm2, %ymm1
+ vpunpckhdq %ymm0, %ymm2, %ymm3
+ vpunpckldq %ymm15, %ymm1, %ymm0
+ vpunpckhdq %ymm15, %ymm1, %ymm1
+ vpunpckldq %ymm15, %ymm3, %ymm2
+ vpunpckhdq %ymm15, %ymm3, %ymm3
+ vmovdqu L_poly1305_avx2_blocks_hibit(%rip), %ymm4
+ vpsllq $6, %ymm1, %ymm1
+ vpsllq $12, %ymm2, %ymm2
+ vpsllq $18, %ymm3, %ymm3
+ vmovdqu L_poly1305_avx2_blocks_mask(%rip), %ymm14
+ # Reduce, in place, the message data
+ vpsrlq $26, %ymm0, %ymm10
+ vpsrlq $26, %ymm3, %ymm11
+ vpand %ymm14, %ymm0, %ymm0
+ vpand %ymm14, %ymm3, %ymm3
+ vpaddq %ymm1, %ymm10, %ymm1
+ vpaddq %ymm4, %ymm11, %ymm4
+ vpsrlq $26, %ymm1, %ymm10
+ vpsrlq $26, %ymm4, %ymm11
+ vpand %ymm14, %ymm1, %ymm1
+ vpand %ymm14, %ymm4, %ymm4
+ vpaddq %ymm2, %ymm10, %ymm2
+ vpslld $2, %ymm11, %ymm12
+ vpaddd %ymm12, %ymm11, %ymm12
+ vpsrlq $26, %ymm2, %ymm10
+ vpaddq %ymm0, %ymm12, %ymm0
+ vpsrlq $26, %ymm0, %ymm11
+ vpand %ymm14, %ymm2, %ymm2
+ vpand %ymm14, %ymm0, %ymm0
+ vpaddq %ymm3, %ymm10, %ymm3
+ vpaddq %ymm1, %ymm11, %ymm1
+ vpsrlq $26, %ymm3, %ymm10
+ vpand %ymm14, %ymm3, %ymm3
+ vpaddq %ymm4, %ymm10, %ymm4
+ addq $0x40, %rsi
+ subq $0x40, %rdx
+ jz L_poly1305_avx2_blocks_store
+ jmp L_poly1305_avx2_blocks_load_r4
+L_poly1305_avx2_blocks_begin_h:
+ # Load the H values.
+ vmovdqu (%rax), %ymm0
+ vmovdqu 32(%rax), %ymm1
+ vmovdqu 64(%rax), %ymm2
+ vmovdqu 96(%rax), %ymm3
+ vmovdqu 128(%rax), %ymm4
+ # Check if there is a power of r to load - otherwise use r^4.
+ cmpb $0x00, 616(%rdi)
+ je L_poly1305_avx2_blocks_load_r4
+ # Load the 4 powers of r - r^4, r^3, r^2, r^1.
+ vmovdqu 224(%rdi), %ymm8
+ vmovdqu 256(%rdi), %ymm7
+ vmovdqu 288(%rdi), %ymm6
+ vmovdqu 320(%rdi), %ymm5
+ vpermq $0xd8, %ymm5, %ymm5
+ vpermq $0xd8, %ymm6, %ymm6
+ vpermq $0xd8, %ymm7, %ymm7
+ vpermq $0xd8, %ymm8, %ymm8
+ vpunpcklqdq %ymm6, %ymm5, %ymm10
+ vpunpckhqdq %ymm6, %ymm5, %ymm11
+ vpunpcklqdq %ymm8, %ymm7, %ymm12
+ vpunpckhqdq %ymm8, %ymm7, %ymm13
+ vperm2i128 $32, %ymm12, %ymm10, %ymm5
+ vperm2i128 $49, %ymm12, %ymm10, %ymm7
+ vperm2i128 $32, %ymm13, %ymm11, %ymm9
+ vpsrlq $32, %ymm5, %ymm6
+ vpsrlq $32, %ymm7, %ymm8
+ jmp L_poly1305_avx2_blocks_mul_5
+L_poly1305_avx2_blocks_load_r4:
+ # Load r^4 into all four positions.
+ vmovdqu 320(%rdi), %ymm13
+ vpermq $0x00, %ymm13, %ymm5
+ vpsrlq $32, %ymm13, %ymm14
+ vpermq $0x55, %ymm13, %ymm7
+ vpermq $0xaa, %ymm13, %ymm9
+ vpermq $0x00, %ymm14, %ymm6
+ vpermq $0x55, %ymm14, %ymm8
+L_poly1305_avx2_blocks_mul_5:
+ # Multiply top 4 26-bit values of all four H by 5
+ vpslld $2, %ymm6, %ymm10
+ vpslld $2, %ymm7, %ymm11
+ vpslld $2, %ymm8, %ymm12
+ vpslld $2, %ymm9, %ymm13
+ vpaddq %ymm10, %ymm6, %ymm10
+ vpaddq %ymm11, %ymm7, %ymm11
+ vpaddq %ymm12, %ymm8, %ymm12
+ vpaddq %ymm13, %ymm9, %ymm13
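+ # (2^130 = 5 mod 2^130 - 5, so limb products that wrap past limb 4
+ # are folded back in as these 5*r terms.)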
+ # Store powers of r and multiple of 5 for use in multiply.
+ vmovdqa %ymm10, (%rbx)
+ vmovdqa %ymm11, 32(%rbx)
+ vmovdqa %ymm12, 64(%rbx)
+ vmovdqa %ymm13, 96(%rbx)
+ vmovdqa %ymm5, (%rcx)
+ vmovdqa %ymm6, 32(%rcx)
+ vmovdqa %ymm7, 64(%rcx)
+ vmovdqa %ymm8, 96(%rcx)
+ vmovdqa %ymm9, 128(%rcx)
+ vmovdqu L_poly1305_avx2_blocks_mask(%rip), %ymm14
+ # If not finished then loop over data
+ cmpb $0x01, 616(%rdi)
+ jne L_poly1305_avx2_blocks_start
+ # Do last multiply, reduce, add the four H together and move to
+ # 32-bit registers
+ vpmuludq (%rbx), %ymm4, %ymm5
+ vpmuludq 32(%rbx), %ymm3, %ymm10
+ vpmuludq 32(%rbx), %ymm4, %ymm6
+ vpmuludq 64(%rbx), %ymm3, %ymm11
+ vpmuludq 64(%rbx), %ymm4, %ymm7
+ vpaddq %ymm5, %ymm10, %ymm5
+ vpmuludq 64(%rbx), %ymm2, %ymm12
+ vpmuludq 96(%rbx), %ymm4, %ymm8
+ vpaddq %ymm6, %ymm11, %ymm6
+ vpmuludq 96(%rbx), %ymm1, %ymm13
+ vpmuludq 96(%rbx), %ymm2, %ymm10
+ vpaddq %ymm5, %ymm12, %ymm5
+ vpmuludq 96(%rbx), %ymm3, %ymm11
+ vpmuludq (%rcx), %ymm3, %ymm12
+ vpaddq %ymm5, %ymm13, %ymm5
+ vpmuludq (%rcx), %ymm4, %ymm9
+ vpaddq %ymm6, %ymm10, %ymm6
+ vpmuludq (%rcx), %ymm0, %ymm13
+ vpaddq %ymm7, %ymm11, %ymm7
+ vpmuludq (%rcx), %ymm1, %ymm10
+ vpaddq %ymm8, %ymm12, %ymm8
+ vpmuludq (%rcx), %ymm2, %ymm11
+ vpmuludq 32(%rcx), %ymm2, %ymm12
+ vpaddq %ymm5, %ymm13, %ymm5
+ vpmuludq 32(%rcx), %ymm3, %ymm13
+ vpaddq %ymm6, %ymm10, %ymm6
+ vpmuludq 32(%rcx), %ymm0, %ymm10
+ vpaddq %ymm7, %ymm11, %ymm7
+ vpmuludq 32(%rcx), %ymm1, %ymm11
+ vpaddq %ymm8, %ymm12, %ymm8
+ vpmuludq 64(%rcx), %ymm1, %ymm12
+ vpaddq %ymm9, %ymm13, %ymm9
+ vpmuludq 64(%rcx), %ymm2, %ymm13
+ vpaddq %ymm6, %ymm10, %ymm6
+ vpmuludq 64(%rcx), %ymm0, %ymm10
+ vpaddq %ymm7, %ymm11, %ymm7
+ vpmuludq 96(%rcx), %ymm0, %ymm11
+ vpaddq %ymm8, %ymm12, %ymm8
+ vpmuludq 96(%rcx), %ymm1, %ymm12
+ vpaddq %ymm9, %ymm13, %ymm9
+ vpaddq %ymm7, %ymm10, %ymm7
+ vpmuludq 128(%rcx), %ymm0, %ymm13
+ vpaddq %ymm8, %ymm11, %ymm8
+ vpaddq %ymm9, %ymm12, %ymm9
+ vpaddq %ymm9, %ymm13, %ymm9
+ vpsrlq $26, %ymm5, %ymm10
+ vpsrlq $26, %ymm8, %ymm11
+ vpand %ymm14, %ymm5, %ymm5
+ vpand %ymm14, %ymm8, %ymm8
+ vpaddq %ymm6, %ymm10, %ymm6
+ vpaddq %ymm9, %ymm11, %ymm9
+ vpsrlq $26, %ymm6, %ymm10
+ vpsrlq $26, %ymm9, %ymm11
+ vpand %ymm14, %ymm6, %ymm1
+ vpand %ymm14, %ymm9, %ymm4
+ vpaddq %ymm7, %ymm10, %ymm7
+ vpslld $2, %ymm11, %ymm12
+ vpaddd %ymm12, %ymm11, %ymm12
+ vpsrlq $26, %ymm7, %ymm10
+ vpaddq %ymm5, %ymm12, %ymm5
+ vpsrlq $26, %ymm5, %ymm11
+ vpand %ymm14, %ymm7, %ymm2
+ vpand %ymm14, %ymm5, %ymm0
+ vpaddq %ymm8, %ymm10, %ymm8
+ vpaddq %ymm1, %ymm11, %ymm1
+ vpsrlq $26, %ymm8, %ymm10
+ vpand %ymm14, %ymm8, %ymm3
+ vpaddq %ymm4, %ymm10, %ymm4
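+ # Sum the four lane accumulators: fold the upper 64 bits of each
+ # 128-bit lane, then fold the upper 128-bit lane into the lower one.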
+ vpsrldq $8, %ymm0, %ymm5
+ vpsrldq $8, %ymm1, %ymm6
+ vpsrldq $8, %ymm2, %ymm7
+ vpsrldq $8, %ymm3, %ymm8
+ vpsrldq $8, %ymm4, %ymm9
+ vpaddq %ymm0, %ymm5, %ymm0
+ vpaddq %ymm1, %ymm6, %ymm1
+ vpaddq %ymm2, %ymm7, %ymm2
+ vpaddq %ymm3, %ymm8, %ymm3
+ vpaddq %ymm4, %ymm9, %ymm4
+ vpermq $2, %ymm0, %ymm5
+ vpermq $2, %ymm1, %ymm6
+ vpermq $2, %ymm2, %ymm7
+ vpermq $2, %ymm3, %ymm8
+ vpermq $2, %ymm4, %ymm9
+ vpaddq %ymm0, %ymm5, %ymm0
+ vpaddq %ymm1, %ymm6, %ymm1
+ vpaddq %ymm2, %ymm7, %ymm2
+ vpaddq %ymm3, %ymm8, %ymm3
+ vpaddq %ymm4, %ymm9, %ymm4
+ vmovd %xmm0, %r8d
+ vmovd %xmm1, %r9d
+ vmovd %xmm2, %r10d
+ vmovd %xmm3, %r11d
+ vmovd %xmm4, %r12d
+ jmp L_poly1305_avx2_blocks_end_calc
+L_poly1305_avx2_blocks_start:
+ vmovdqu (%rsi), %ymm5
+ vmovdqu 32(%rsi), %ymm6
+ vperm2i128 $32, %ymm6, %ymm5, %ymm7
+ vperm2i128 $49, %ymm6, %ymm5, %ymm5
+ vpunpckldq %ymm5, %ymm7, %ymm6
+ vpunpckhdq %ymm5, %ymm7, %ymm8
+ vpunpckldq %ymm15, %ymm6, %ymm5
+ vpunpckhdq %ymm15, %ymm6, %ymm6
+ vpunpckldq %ymm15, %ymm8, %ymm7
+ vpunpckhdq %ymm15, %ymm8, %ymm8
+ vmovdqu L_poly1305_avx2_blocks_hibit(%rip), %ymm9
+ vpsllq $6, %ymm6, %ymm6
+ vpsllq $12, %ymm7, %ymm7
+ vpsllq $18, %ymm8, %ymm8
+ vpmuludq (%rbx), %ymm4, %ymm10
+ vpaddq %ymm5, %ymm10, %ymm5
+ vpmuludq 32(%rbx), %ymm3, %ymm10
+ vpmuludq 32(%rbx), %ymm4, %ymm11
+ vpaddq %ymm6, %ymm11, %ymm6
+ vpmuludq 64(%rbx), %ymm3, %ymm11
+ vpmuludq 64(%rbx), %ymm4, %ymm12
+ vpaddq %ymm7, %ymm12, %ymm7
+ vpaddq %ymm5, %ymm10, %ymm5
+ vpmuludq 64(%rbx), %ymm2, %ymm12
+ vpmuludq 96(%rbx), %ymm4, %ymm13
+ vpaddq %ymm8, %ymm13, %ymm8
+ vpaddq %ymm6, %ymm11, %ymm6
+ vpmuludq 96(%rbx), %ymm1, %ymm13
+ vpmuludq 96(%rbx), %ymm2, %ymm10
+ vpaddq %ymm5, %ymm12, %ymm5
+ vpmuludq 96(%rbx), %ymm3, %ymm11
+ vpmuludq (%rcx), %ymm3, %ymm12
+ vpaddq %ymm5, %ymm13, %ymm5
+ vpmuludq (%rcx), %ymm4, %ymm13
+ vpaddq %ymm9, %ymm13, %ymm9
+ vpaddq %ymm6, %ymm10, %ymm6
+ vpmuludq (%rcx), %ymm0, %ymm13
+ vpaddq %ymm7, %ymm11, %ymm7
+ vpmuludq (%rcx), %ymm1, %ymm10
+ vpaddq %ymm8, %ymm12, %ymm8
+ vpmuludq (%rcx), %ymm2, %ymm11
+ vpmuludq 32(%rcx), %ymm2, %ymm12
+ vpaddq %ymm5, %ymm13, %ymm5
+ vpmuludq 32(%rcx), %ymm3, %ymm13
+ vpaddq %ymm6, %ymm10, %ymm6
+ vpmuludq 32(%rcx), %ymm0, %ymm10
+ vpaddq %ymm7, %ymm11, %ymm7
+ vpmuludq 32(%rcx), %ymm1, %ymm11
+ vpaddq %ymm8, %ymm12, %ymm8
+ vpmuludq 64(%rcx), %ymm1, %ymm12
+ vpaddq %ymm9, %ymm13, %ymm9
+ vpmuludq 64(%rcx), %ymm2, %ymm13
+ vpaddq %ymm6, %ymm10, %ymm6
+ vpmuludq 64(%rcx), %ymm0, %ymm10
+ vpaddq %ymm7, %ymm11, %ymm7
+ vpmuludq 96(%rcx), %ymm0, %ymm11
+ vpaddq %ymm8, %ymm12, %ymm8
+ vpmuludq 96(%rcx), %ymm1, %ymm12
+ vpaddq %ymm9, %ymm13, %ymm9
+ vpaddq %ymm7, %ymm10, %ymm7
+ vpmuludq 128(%rcx), %ymm0, %ymm13
+ vpaddq %ymm8, %ymm11, %ymm8
+ vpaddq %ymm9, %ymm12, %ymm9
+ vpaddq %ymm9, %ymm13, %ymm9
+ vpsrlq $26, %ymm5, %ymm10
+ vpsrlq $26, %ymm8, %ymm11
+ vpand %ymm14, %ymm5, %ymm5
+ vpand %ymm14, %ymm8, %ymm8
+ vpaddq %ymm6, %ymm10, %ymm6
+ vpaddq %ymm9, %ymm11, %ymm9
+ vpsrlq $26, %ymm6, %ymm10
+ vpsrlq $26, %ymm9, %ymm11
+ vpand %ymm14, %ymm6, %ymm1
+ vpand %ymm14, %ymm9, %ymm4
+ vpaddq %ymm7, %ymm10, %ymm7
+ vpslld $2, %ymm11, %ymm12
+ vpaddd %ymm12, %ymm11, %ymm12
+ vpsrlq $26, %ymm7, %ymm10
+ vpaddq %ymm5, %ymm12, %ymm5
+ vpsrlq $26, %ymm5, %ymm11
+ vpand %ymm14, %ymm7, %ymm2
+ vpand %ymm14, %ymm5, %ymm0
+ vpaddq %ymm8, %ymm10, %ymm8
+ vpaddq %ymm1, %ymm11, %ymm1
+ vpsrlq $26, %ymm8, %ymm10
+ vpand %ymm14, %ymm8, %ymm3
+ vpaddq %ymm4, %ymm10, %ymm4
+ addq $0x40, %rsi
+ subq $0x40, %rdx
+ jnz L_poly1305_avx2_blocks_start
+L_poly1305_avx2_blocks_store:
+ # Store four H values - state
+ vmovdqu %ymm0, (%rax)
+ vmovdqu %ymm1, 32(%rax)
+ vmovdqu %ymm2, 64(%rax)
+ vmovdqu %ymm3, 96(%rax)
+ vmovdqu %ymm4, 128(%rax)
+L_poly1305_avx2_blocks_end_calc:
+ cmpb $0x00, 616(%rdi)
+ je L_poly1305_avx2_blocks_complete
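+ # Convert h back from five 26-bit limbs (r8d-r12d) to three 64-bit
+ # words and fold the bits above 2^130 back in as a multiple of 5.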
+ movq %r8, %rax
+ movq %r10, %rdx
+ movq %r12, %rcx
+ shrq $12, %rdx
+ shrq $24, %rcx
+ shlq $26, %r9
+ shlq $52, %r10
+ shlq $14, %r11
+ shlq $40, %r12
+ addq %r9, %rax
+ adcq %r10, %rax
+ adcq %r11, %rdx
+ adcq %r12, %rdx
+ adcq $0x00, %rcx
+ movq %rcx, %r8
+ andq $3, %rcx
+ shrq $2, %r8
+ leaq 0(%r8,%r8,4), %r8
+ addq %r8, %rax
+ adcq $0x00, %rdx
+ adcq $0x00, %rcx
+ movq %rax, 24(%rdi)
+ movq %rdx, 32(%rdi)
+ movq %rcx, 40(%rdi)
+L_poly1305_avx2_blocks_complete:
+ movb $0x01, 617(%rdi)
+ addq $0x140, %rsp
+ popq %rbx
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl poly1305_final_avx2
+.type poly1305_final_avx2,@function
+.align 4
+poly1305_final_avx2:
+#else
+.section __TEXT,__text
+.globl _poly1305_final_avx2
+.p2align 2
+_poly1305_final_avx2:
+#endif /* __APPLE__ */
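+ # Finish the AVX2 path: flush the buffered 4-way state through
+ # poly1305_blocks_avx2 (the byte at 616 marks the final pass), run any
+ # remaining full 16-byte blocks through the AVX code, move leftover
+ # bytes to the front of the buffer, then hand off to poly1305_final_avx
+ # and wipe the AVX2 state.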
+ movb $0x01, 616(%rdi)
+ movb 617(%rdi), %cl
+ cmpb $0x00, %cl
+ je L_poly1305_avx2_final_done_blocks_X4
+ pushq %rsi
+ movq $0x40, %rdx
+ xorq %rsi, %rsi
+#ifndef __APPLE__
+ callq poly1305_blocks_avx2@plt
+#else
+ callq _poly1305_blocks_avx2
+#endif /* __APPLE__ */
+ popq %rsi
+L_poly1305_avx2_final_done_blocks_X4:
+ movq 608(%rdi), %rax
+ movq %rax, %rcx
+ andq $-16, %rcx
+ cmpb $0x00, %cl
+ je L_poly1305_avx2_final_done_blocks
+ pushq %rcx
+ pushq %rax
+ pushq %rsi
+ movq %rcx, %rdx
+ leaq 480(%rdi), %rsi
+#ifndef __APPLE__
+ callq poly1305_blocks_avx@plt
+#else
+ callq _poly1305_blocks_avx
+#endif /* __APPLE__ */
+ popq %rsi
+ popq %rax
+ popq %rcx
+L_poly1305_avx2_final_done_blocks:
+ subq %rcx, 608(%rdi)
+ xorq %rdx, %rdx
+ jmp L_poly1305_avx2_final_cmp_copy
+L_poly1305_avx2_final_start_copy:
+ movb 480(%rdi,%rcx,1), %r8b
+ movb %r8b, 480(%rdi,%rdx,1)
+ incb %cl
+ incb %dl
+L_poly1305_avx2_final_cmp_copy:
+ cmp %rcx, %rax
+ jne L_poly1305_avx2_final_start_copy
+#ifndef __APPLE__
+ callq poly1305_final_avx@plt
+#else
+ callq _poly1305_final_avx
+#endif /* __APPLE__ */
+ vpxor %ymm0, %ymm0, %ymm0
+ vmovdqu %ymm0, 64(%rdi)
+ vmovdqu %ymm0, 96(%rdi)
+ vmovdqu %ymm0, 128(%rdi)
+ vmovdqu %ymm0, 160(%rdi)
+ vmovdqu %ymm0, 192(%rdi)
+ vmovdqu %ymm0, 224(%rdi)
+ vmovdqu %ymm0, 256(%rdi)
+ vmovdqu %ymm0, 288(%rdi)
+ vmovdqu %ymm0, 320(%rdi)
+ movq $0x00, 608(%rdi)
+ movw $0x00, 616(%rdi)
+ repz retq
+#ifndef __APPLE__
+.size poly1305_final_avx2,.-poly1305_final_avx2
+#endif /* __APPLE__ */
+#endif /* HAVE_INTEL_AVX2 */