diff options
Diffstat (limited to 'FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/poly1305_asm.S')
-rw-r--r-- | FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/poly1305_asm.S | 1105 |
1 files changed, 1105 insertions, 0 deletions
diff --git a/FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/poly1305_asm.S b/FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/poly1305_asm.S new file mode 100644 index 000000000..95711075b --- /dev/null +++ b/FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/poly1305_asm.S @@ -0,0 +1,1105 @@ +/* poly1305_asm + * + * Copyright (C) 2006-2020 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +#ifndef HAVE_INTEL_AVX1 +#define HAVE_INTEL_AVX1 +#endif /* HAVE_INTEL_AVX1 */ +#ifndef NO_AVX2_SUPPORT +#define HAVE_INTEL_AVX2 +#endif /* NO_AVX2_SUPPORT */ + +#ifdef HAVE_INTEL_AVX1 +#ifndef __APPLE__ +.text +.globl poly1305_setkey_avx +.type poly1305_setkey_avx,@function +.align 4 +poly1305_setkey_avx: +#else +.section __TEXT,__text +.globl _poly1305_setkey_avx +.p2align 2 +_poly1305_setkey_avx: +#endif /* __APPLE__ */ + /* poly1305_setkey_avx: %rdi = context, %rsi = key (32 bytes are read). + * Key bytes 0..15 are ANDed with the two masks below and stored as r at + * ctx+0/+8; h at ctx+24..47 is zeroed; key bytes 16..31 are copied to + * ctx+48/+56 (the pad); two tables holding 0..6 times r[0] and r[1] are + * built at ctx+352 and ctx+408; ctx+608 is cleared and ctx+616 set to 1. + * Only caller-saved registers are written and the stack is not used. + */ + movabsq $0xffffffc0fffffff, %r10 + movabsq $0xffffffc0ffffffc, %r11 + movq (%rsi), %rdx + movq 8(%rsi), %rax + movq 16(%rsi), %rcx + movq 24(%rsi), %r8 + andq %r10, %rdx + andq %r11, %rax + movq %rdx, %r10 + movq %rax, %r11 + xorq %r9, %r9 + movq %rdx, (%rdi) + movq %rax, 8(%rdi) + movq %r9, 24(%rdi) + movq %r9, 32(%rdi) + movq %r9, 40(%rdi) + movq %rcx, 48(%rdi) + movq %r8, 56(%rdi) + /* Table entry 0 is zero and entry 1 is r itself; entries 2..6 follow by repeated addition */ + movq %r9, 352(%rdi) + movq %r9, 408(%rdi) + movq %rdx, 360(%rdi) + movq 
%rax, 416(%rdi) + addq %rdx, %r10 + addq %rax, %r11 + movq %r10, 368(%rdi) + movq %r11, 424(%rdi) + addq %rdx, %r10 + addq %rax, %r11 + movq %r10, 376(%rdi) + movq %r11, 432(%rdi) + addq %rdx, %r10 + addq %rax, %r11 + movq %r10, 384(%rdi) + movq %r11, 440(%rdi) + addq %rdx, %r10 + addq %rax, %r11 + movq %r10, 392(%rdi) + movq %r11, 448(%rdi) + addq %rdx, %r10 + addq %rax, %r11 + movq %r10, 400(%rdi) + movq %r11, 456(%rdi) + movq %r9, 608(%rdi) + /* ctx+616 = 1: the byte poly1305_block_avx adds into h[2] with every block */ + movb $0x01, 616(%rdi) + repz retq +#ifndef __APPLE__ +.size poly1305_setkey_avx,.-poly1305_setkey_avx +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl poly1305_block_avx +.type poly1305_block_avx,@function +.align 4 +poly1305_block_avx: +#else +.section __TEXT,__text +.globl _poly1305_block_avx +.p2align 2 +_poly1305_block_avx: +#endif /* __APPLE__ */ + /* poly1305_block_avx: %rdi = context, %rsi = one 16-byte block. + * Computes h = (h + m + (byte at ctx+616) * 2^128) * r, partially reduced + * mod 2^130 - 5, and writes h back to ctx+24..47. + * %rdi and %rsi are left unchanged; %rbx and %r12-%r15 are saved/restored. + */ + pushq %r15 + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + movq (%rdi), %r15 + movq 8(%rdi), %rbx + movq 24(%rdi), %r8 + movq 32(%rdi), %r9 + movq 40(%rdi), %r10 + xorq %r14, %r14 + movb 616(%rdi), %r14b + # h += m + movq (%rsi), %r11 + movq 8(%rsi), %r12 + addq %r11, %r8 + adcq %r12, %r9 + movq %rbx, %rax + adcq %r14, %r10 + # r[1] * h[0] => rdx, rax ==> t2, t1 + mulq %r8 + movq %rax, %r12 + movq %rdx, %r13 + # r[0] * h[1] => rdx, rax ++> t2, t1 + movq %r15, %rax + mulq %r9 + addq %rax, %r12 + movq %r15, %rax + adcq %rdx, %r13 + # r[0] * h[0] => rdx, rax ==> t4, t0 + mulq %r8 + movq %rax, %r11 + movq %rdx, %r8 + # r[1] * h[1] => rdx, rax =+> t3, t2 + movq %rbx, %rax + mulq %r9 + # r[0] * h[2] +> t2 + /* Looked up, not multiplied: ctx+352+8*i holds i * r[0] and ctx+408+8*i holds i * r[1] for i = 0..6. + * NOTE(review): assumes h[2] never exceeds 6 here - confirm. */ + addq 352(%rdi,%r10,8), %r13 + movq %rdx, %r14 + addq %r8, %r12 + adcq %rax, %r13 + # r[1] * h[2] +> t3 + adcq 408(%rdi,%r10,8), %r14 + # r * h in r14, r13, r12, r11 + # h = (r * h) mod 2^130 - 5 + /* The part x at and above bit 130 is folded back in times 5: once still shifted + * left by two (4 * x, via the andq $-4 copy) and once shifted down by two (x). */ + movq %r13, %r10 + andq $-4, %r13 + andq $3, %r10 + addq %r13, %r11 + movq %r13, %r8 + adcq %r14, %r12 + adcq $0x00, %r10 + shrdq $2, %r14, %r8 + shrq $2, %r14 + addq %r11, %r8 + adcq %r14, %r12 + movq %r12, %r9 + adcq $0x00, %r10 + # h in r10, r9, r8 + # Store h to ctx 
+ movq %r8, 24(%rdi) + movq %r9, 32(%rdi) + movq %r10, 40(%rdi) + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %r15 + repz retq +#ifndef __APPLE__ +.size poly1305_block_avx,.-poly1305_block_avx +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl poly1305_blocks_avx +.type poly1305_blocks_avx,@function +.align 4 +poly1305_blocks_avx: +#else +.section __TEXT,__text +.globl _poly1305_blocks_avx +.p2align 2 +_poly1305_blocks_avx: +#endif /* __APPLE__ */ + /* poly1305_blocks_avx: %rdi = context, %rsi = message, %rdx = byte count. + * Runs h = (h + m) * r over consecutive 16-byte blocks, keeping h in + * registers and writing it to ctx+24..47 once at the end. + * The count is tested after each block, so one block is always processed, + * even for a count of zero. %rsi is advanced; %rdi is left unchanged. + */ + pushq %r15 + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + movq %rdx, %rcx + movq (%rdi), %r15 + movq 8(%rdi), %rbx + movq 24(%rdi), %r8 + movq 32(%rdi), %r9 + movq 40(%rdi), %r10 +L_poly1305_avx_blocks_start: + # h += m + movq (%rsi), %r11 + movq 8(%rsi), %r12 + addq %r11, %r8 + adcq %r12, %r9 + movq %rbx, %rax + adcq $0x00, %r10 + # r[1] * h[0] => rdx, rax ==> t2, t1 + mulq %r8 + movq %rax, %r12 + movq %rdx, %r13 + # r[0] * h[1] => rdx, rax ++> t2, t1 + movq %r15, %rax + mulq %r9 + addq %rax, %r12 + movq %r15, %rax + adcq %rdx, %r13 + # r[0] * h[0] => rdx, rax ==> t4, t0 + mulq %r8 + movq %rax, %r11 + movq %rdx, %r8 + # r[1] * h[1] => rdx, rax =+> t3, t2 + movq %rbx, %rax + mulq %r9 + # r[0] * h[2] +> t2 + /* 360 = 352 + 8 and 416 = 408 + 8, so these lookups return (h[2] + 1) * r[0] and (h[2] + 1) * r[1]. + * NOTE(review): presumably the +1 stands in for the 2^128 bit of a full block, which is not added to h[2] above - confirm. */ + addq 360(%rdi,%r10,8), %r13 + movq %rdx, %r14 + addq %r8, %r12 + adcq %rax, %r13 + # r[1] * h[2] +> t3 + adcq 416(%rdi,%r10,8), %r14 + # r * h in r14, r13, r12, r11 + # h = (r * h) mod 2^130 - 5 + movq %r13, %r10 + andq $-4, %r13 + andq $3, %r10 + addq %r13, %r11 + movq %r13, %r8 + adcq %r14, %r12 + adcq $0x00, %r10 + shrdq $2, %r14, %r8 + shrq $2, %r14 + addq %r11, %r8 + adcq %r14, %r12 + movq %r12, %r9 + adcq $0x00, %r10 + # h in r10, r9, r8 + # Next block from message + addq $16, %rsi + subq $16, %rcx + /* Signed test: keep looping while the remaining count is greater than zero */ + jg L_poly1305_avx_blocks_start + # Store h to ctx + movq %r8, 24(%rdi) + movq %r9, 32(%rdi) + movq %r10, 40(%rdi) + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %r15 + repz retq +#ifndef __APPLE__ +.size poly1305_blocks_avx,.-poly1305_blocks_avx +#endif /* __APPLE__ */ +#ifndef 
__APPLE__ +.text +.globl poly1305_final_avx +.type poly1305_final_avx,@function +.align 4 +poly1305_final_avx: +#else +.section __TEXT,__text +.globl _poly1305_final_avx +.p2align 2 +_poly1305_final_avx: +#endif /* __APPLE__ */ + /* poly1305_final_avx: %rdi = context, %rsi = output (16 bytes are written). + * If ctx+608 (count of buffered bytes) is non-zero, the buffer at ctx+480 + * gets a 0x01 byte appended, is zero-filled up to 16 bytes and is run + * through poly1305_block_avx with ctx+616 cleared. h is then fully reduced + * mod 2^130 - 5, the pad is added and the low 128 bits are written out. + * Finally r, h, the pad and the multiples-of-r tables are zeroed. + * %rdi is left unchanged. + */ + pushq %rbx + pushq %r12 + movq %rsi, %rbx + movq 608(%rdi), %rax + testq %rax, %rax + je L_poly1305_avx_final_no_more + movb $0x01, 480(%rdi,%rax,1) + jmp L_poly1305_avx_final_cmp_rem +L_poly1305_avx_final_zero_rem: + movb $0x00, 480(%rdi,%rax,1) +L_poly1305_avx_final_cmp_rem: + incb %al + cmpq $16, %rax + jl L_poly1305_avx_final_zero_rem + /* The 0x01 byte is already in the buffer, so nothing more is added to h[2] */ + movb $0x00, 616(%rdi) + leaq 480(%rdi), %rsi + /* %rdi is read again after the call: poly1305_block_avx does not modify it */ +#ifndef __APPLE__ + callq poly1305_block_avx@plt +#else + callq _poly1305_block_avx +#endif /* __APPLE__ */ +L_poly1305_avx_final_no_more: + movq 24(%rdi), %rax + movq 32(%rdi), %rdx + movq 40(%rdi), %rcx + movq 48(%rdi), %r11 + movq 56(%rdi), %r12 + # h %= p + # h = (h + pad) + # mod 2^130 - 5 + movq %rcx, %r8 + andq $3, %rcx + shrq $2, %r8 + # Multiply by 5 + leaq 0(%r8,%r8,4), %r8 + addq %r8, %rax + adcq $0x00, %rdx + adcq $0x00, %rcx + # Fixup when between (1 << 130) - 1 and (1 << 130) - 5 + /* h + 5 reaches bit 130 (top word == 4) exactly when h >= 2^130 - 5; take h + 5 in that case */ + movq %rax, %r8 + movq %rdx, %r9 + movq %rcx, %r10 + addq $5, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + cmpq $4, %r10 + cmoveq %r8, %rax + cmoveq %r9, %rdx + # h += pad + addq %r11, %rax + adcq %r12, %rdx + movq %rax, (%rbx) + movq %rdx, 8(%rbx) + # Zero out r + movq $0x00, (%rdi) + movq $0x00, 8(%rdi) + # Zero out h + movq $0x00, 24(%rdi) + movq $0x00, 32(%rdi) + movq $0x00, 40(%rdi) + # Zero out pad + movq $0x00, 48(%rdi) + movq $0x00, 56(%rdi) + /* Zero out the tables of multiples of r[0] (ctx+352..407) and r[1] + * (ctx+408..463) written by poly1305_setkey_avx: entry 1 of each table + * is a plain copy of r, so clearing r alone would leave the key behind. */ + movq $0x00, 352(%rdi) + movq $0x00, 360(%rdi) + movq $0x00, 368(%rdi) + movq $0x00, 376(%rdi) + movq $0x00, 384(%rdi) + movq $0x00, 392(%rdi) + movq $0x00, 400(%rdi) + movq $0x00, 408(%rdi) + movq $0x00, 416(%rdi) + movq $0x00, 424(%rdi) + movq $0x00, 432(%rdi) + movq $0x00, 440(%rdi) + movq $0x00, 448(%rdi) + movq $0x00, 456(%rdi) + popq %r12 + popq %rbx + repz retq +#ifndef __APPLE__ +.size poly1305_final_avx,.-poly1305_final_avx +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX1 */ +#ifdef HAVE_INTEL_AVX2 +#ifndef __APPLE__ +.text +.globl poly1305_calc_powers_avx2 +.type poly1305_calc_powers_avx2,@function +.align 4 +poly1305_calc_powers_avx2: +#else +.section __TEXT,__text +.globl _poly1305_calc_powers_avx2 +.p2align 2 +_poly1305_calc_powers_avx2: 
+#endif /* __APPLE__ */ + /* poly1305_calc_powers_avx2: %rdi = context. + * Reads r from ctx+0/+8, computes r^2, r^3 and r^4 mod 2^130 - 5 and + * stores r, r^2, r^3, r^4 at ctx+224, +256, +288 and +320, each as five + * 26-bit limbs in 32-bit words followed by one zero word. + * %rbx, %rbp and %r12-%r15 are saved/restored; %rdi is left unchanged. + */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + pushq %rbp + movq (%rdi), %rcx + movq 8(%rdi), %r8 + xorq %r9, %r9 + # Convert to 26 bits in 32 + /* Limb k is bits 26k..26k+25: shifts of 0, 26, 52 (across both words), 14 and 40 of the high word */ + movq %rcx, %rax + movq %rcx, %rdx + movq %rcx, %rsi + movq %r8, %rbx + movq %r8, %rbp + shrq $26, %rdx + shrdq $52, %r8, %rsi + shrq $14, %rbx + shrdq $40, %r9, %rbp + andq $0x3ffffff, %rax + andq $0x3ffffff, %rdx + andq $0x3ffffff, %rsi + andq $0x3ffffff, %rbx + andq $0x3ffffff, %rbp + /* r as limbs at ctx+224..243, zero word at ctx+244 */ + movl %eax, 224(%rdi) + movl %edx, 228(%rdi) + movl %esi, 232(%rdi) + movl %ebx, 236(%rdi) + movl %ebp, 240(%rdi) + movl $0x00, 244(%rdi) + # Square 128-bit + /* r^2 = r[0]^2 + 2 * r[0] * r[1] * 2^64 + r[1]^2 * 2^128; the cross product is doubled by adding it to itself */ + movq %r8, %rax + mulq %rcx + xorq %r13, %r13 + movq %rax, %r11 + movq %rdx, %r12 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + movq %rcx, %rax + mulq %rax + movq %rax, %r10 + movq %rdx, %r15 + movq %r8, %rax + mulq %rax + addq %r15, %r11 + adcq %rax, %r12 + adcq %rdx, %r13 + # Reduce 256-bit to 130-bit + movq %r12, %rax + movq %r13, %rdx + andq $-4, %rax + andq $3, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + shrdq $2, %rdx, %rax + shrq $2, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + movq %r12, %rax + shrq $2, %rax + leaq 0(%rax,%rax,4), %rax + andq $3, %r12 + addq %rax, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Convert to 26 bits in 32 + movq %r10, %rax + movq %r10, %rdx + movq %r10, %rsi + movq %r11, %rbx + movq %r11, %rbp + shrq $26, %rdx + shrdq $52, %r11, %rsi + shrq $14, %rbx + shrdq $40, %r12, %rbp + andq $0x3ffffff, %rax + andq $0x3ffffff, %rdx + andq $0x3ffffff, %rsi + andq $0x3ffffff, %rbx + andq $0x3ffffff, %rbp + /* r^2 (kept in %r12:%r11:%r10) as limbs at ctx+256..275, zero word at ctx+276 */ + movl %eax, 256(%rdi) + movl %edx, 260(%rdi) + movl %esi, 264(%rdi) + movl %ebx, 268(%rdi) + movl %ebp, 272(%rdi) + movl $0x00, 276(%rdi) + # Multiply 128-bit by 130-bit + /* r1 = r in %r8:%rcx, r2 = r^2 in %r12:%r11:%r10; the product is r^3 */ + # r1[0] * r2[0] + movq %rcx, %rax + mulq %r10 + movq %rax, %r13 + movq %rdx, %r14 + # r1[0] * r2[1] + movq %rcx, %rax + mulq %r11 + movq $0x00, %r15 + addq %rax, %r14 + adcq %rdx, %r15 + # r1[1] * r2[0] 
+ movq %r8, %rax + mulq %r10 + movq $0x00, %rsi + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rsi + # r1[0] * r2[2] + movq %rcx, %rax + mulq %r12 + addq %rax, %r15 + adcq %rdx, %rsi + # r1[1] * r2[1] + movq %r8, %rax + mulq %r11 + movq $0x00, %rbx + addq %rax, %r15 + adcq %rdx, %rsi + adcq $0x00, %rbx + # r1[1] * r2[2] + movq %r8, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbx + # Reduce 260-bit to 130-bit + movq %r15, %rax + movq %rsi, %rdx + /* The next instruction moves %rbx onto itself and changes nothing */ + movq %rbx, %rbx + andq $-4, %rax + andq $3, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq %rbx, %r15 + shrdq $2, %rdx, %rax + shrdq $2, %rbx, %rdx + shrq $2, %rbx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %rbx, %r15 + movq %r15, %rax + andq $3, %r15 + shrq $2, %rax + leaq 0(%rax,%rax,4), %rax + addq %rax, %r13 + adcq $0x00, %r14 + adcq $0x00, %r15 + # Convert to 26 bits in 32 + movq %r13, %rax + movq %r13, %rdx + movq %r13, %rsi + movq %r14, %rbx + movq %r14, %rbp + shrq $26, %rdx + shrdq $52, %r14, %rsi + shrq $14, %rbx + shrdq $40, %r15, %rbp + andq $0x3ffffff, %rax + andq $0x3ffffff, %rdx + andq $0x3ffffff, %rsi + andq $0x3ffffff, %rbx + andq $0x3ffffff, %rbp + /* Product of the 128-bit by 130-bit multiply as limbs at ctx+288..307, zero word at ctx+308 */ + movl %eax, 288(%rdi) + movl %edx, 292(%rdi) + movl %esi, 296(%rdi) + movl %ebx, 300(%rdi) + movl %ebp, 304(%rdi) + movl $0x00, 308(%rdi) + # Square 130-bit + /* Squares the three-word value in %r12:%r11:%r10; each cross product is added twice */ + movq %r11, %rax + mulq %r10 + xorq %r13, %r13 + movq %rax, %r8 + movq %rdx, %r9 + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %r13 + movq %r10, %rax + mulq %rax + movq %rax, %rcx + movq %rdx, %r15 + movq %r11, %rax + mulq %rax + addq %r15, %r8 + adcq %rax, %r9 + adcq %rdx, %r13 + movq %r12, %rax + mulq %rax + movq %rax, %r14 + movq %r12, %rax + mulq %r10 + addq %rax, %r9 + adcq %rdx, %r13 + adcq $0x00, %r14 + addq %rax, %r9 + adcq %rdx, %r13 + adcq $0x00, %r14 + movq %r12, %rax + mulq %r11 + addq %rax, %r13 + adcq %rdx, %r14 + addq %rax, %r13 + adcq %rdx, %r14 + # Reduce 260-bit to 130-bit + movq %r9, %rax + movq %r13, %rdx + movq %r14, %r15 + andq $-4, %rax + andq $3, %r9 + addq %rax, %rcx + 
adcq %rdx, %r8 + adcq %r15, %r9 + shrdq $2, %rdx, %rax + shrdq $2, %r15, %rdx + shrq $2, %r15 + addq %rax, %rcx + adcq %rdx, %r8 + adcq %r15, %r9 + movq %r9, %rax + andq $3, %r9 + shrq $2, %rax + leaq 0(%rax,%rax,4), %rax + addq %rax, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + # Convert to 26 bits in 32 + movq %rcx, %rax + movq %rcx, %rdx + movq %rcx, %rsi + movq %r8, %rbx + movq %r8, %rbp + shrq $26, %rdx + shrdq $52, %r8, %rsi + shrq $14, %rbx + shrdq $40, %r9, %rbp + andq $0x3ffffff, %rax + andq $0x3ffffff, %rdx + andq $0x3ffffff, %rsi + andq $0x3ffffff, %rbx + andq $0x3ffffff, %rbp + /* Result of the 130-bit squaring as limbs at ctx+320..339, zero word at ctx+340 */ + movl %eax, 320(%rdi) + movl %edx, 324(%rdi) + movl %esi, 328(%rdi) + movl %ebx, 332(%rdi) + movl %ebp, 336(%rdi) + movl $0x00, 340(%rdi) + popq %rbp + popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size poly1305_calc_powers_avx2,.-poly1305_calc_powers_avx2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl poly1305_setkey_avx2 +.type poly1305_setkey_avx2,@function +.align 4 +poly1305_setkey_avx2: +#else +.section __TEXT,__text +.globl _poly1305_setkey_avx2 +.p2align 2 +_poly1305_setkey_avx2: +#endif /* __APPLE__ */ + /* poly1305_setkey_avx2: %rdi = context, %rsi = key. + * Runs poly1305_setkey_avx, then zeroes the 160 bytes at ctx+64..223 + * (the vector H state used by poly1305_blocks_avx2), the count at ctx+608 + * and both flag bytes at ctx+616/617. + * %rdi is read after the call: poly1305_setkey_avx does not modify it. + */ +#ifndef __APPLE__ + callq poly1305_setkey_avx@plt +#else + callq _poly1305_setkey_avx +#endif /* __APPLE__ */ + vpxor %ymm0, %ymm0, %ymm0 + vmovdqu %ymm0, 64(%rdi) + vmovdqu %ymm0, 96(%rdi) + vmovdqu %ymm0, 128(%rdi) + vmovdqu %ymm0, 160(%rdi) + vmovdqu %ymm0, 192(%rdi) + movq $0x00, 608(%rdi) + movw $0x00, 616(%rdi) + /* Leave the upper YMM halves clean so SSE code in the caller pays no AVX/SSE transition penalty */ + vzeroupper + repz retq +#ifndef __APPLE__ +.size poly1305_setkey_avx2,.-poly1305_setkey_avx2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ +/* 26-bit limb mask, replicated into all four 64-bit lanes of a YMM register */ +L_poly1305_avx2_blocks_mask: +.quad 0x3ffffff, 0x3ffffff +.quad 0x3ffffff, 0x3ffffff +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ 
+L_poly1305_avx2_blocks_hibit: +.quad 0x1000000, 0x1000000 +.quad 0x1000000, 0x1000000 +/* 0x1000000 = 1 << 24 in each lane: the starting value of the fifth 26-bit limb + * (which begins at bit 104, so this is 2^128), to which the top message bits are added. + * NOTE(review): both constant tables sit in a writable data section although this file only reads them. */ +#ifndef __APPLE__ +.text +.globl poly1305_blocks_avx2 +.type poly1305_blocks_avx2,@function +.align 4 +poly1305_blocks_avx2: +#else +.section __TEXT,__text +.globl _poly1305_blocks_avx2 +.p2align 2 +_poly1305_blocks_avx2: +#endif /* __APPLE__ */ + /* poly1305_blocks_avx2: %rdi = context, %rsi = message, %rdx = byte count. + * Handles 64 bytes (four blocks side by side) per iteration; the vector H + * state is five YMM registers, one per 26-bit limb, kept at ctx+64..223. + * The loop exits only when the count reaches exactly zero. + * NOTE(review): so the count presumably has to be a non-zero multiple of 64 - confirm with callers. + * When the byte at ctx+616 is 1 no message is read: H is multiplied by + * the four stored powers of r, the lanes are summed and h is written to + * ctx+24..47. ctx+617 is set to 1 on every return. + * Scratch: 0x140 bytes of stack; %rcx is rounded up to 32-byte alignment + * inside it and %rbx = %rcx + 0xa0. %rax = ctx+64. %ymm15 stays zero. + */ + pushq %r12 + pushq %rbx + subq $0x140, %rsp + movq %rsp, %rcx + andq $-32, %rcx + addq $32, %rcx + vpxor %ymm15, %ymm15, %ymm15 + movq %rcx, %rbx + leaq 64(%rdi), %rax + addq $0xa0, %rbx + /* Both flag bytes (ctx+616, ctx+617) zero: first call, so the message itself becomes the initial H */ + cmpw $0x00, 616(%rdi) + jne L_poly1305_avx2_blocks_begin_h + # Load the message data + vmovdqu (%rsi), %ymm0 + vmovdqu 32(%rsi), %ymm1 + vperm2i128 $32, %ymm1, %ymm0, %ymm2 + vperm2i128 $49, %ymm1, %ymm0, %ymm0 + vpunpckldq %ymm0, %ymm2, %ymm1 + vpunpckhdq %ymm0, %ymm2, %ymm3 + vpunpckldq %ymm15, %ymm1, %ymm0 + vpunpckhdq %ymm15, %ymm1, %ymm1 + vpunpckldq %ymm15, %ymm3, %ymm2 + vpunpckhdq %ymm15, %ymm3, %ymm3 + vmovdqu L_poly1305_avx2_blocks_hibit(%rip), %ymm4 + /* 32-bit words 1..3 are pre-shifted by 6, 12 and 18 so the 26-bit carry chain below lines them up with the limbs */ + vpsllq $6, %ymm1, %ymm1 + vpsllq $12, %ymm2, %ymm2 + vpsllq $18, %ymm3, %ymm3 + vmovdqu L_poly1305_avx2_blocks_mask(%rip), %ymm14 + # Reduce, in place, the message data + vpsrlq $26, %ymm0, %ymm10 + vpsrlq $26, %ymm3, %ymm11 + vpand %ymm14, %ymm0, %ymm0 + vpand %ymm14, %ymm3, %ymm3 + vpaddq %ymm1, %ymm10, %ymm1 + vpaddq %ymm4, %ymm11, %ymm4 + vpsrlq $26, %ymm1, %ymm10 + vpsrlq $26, %ymm4, %ymm11 + vpand %ymm14, %ymm1, %ymm1 + vpand %ymm14, %ymm4, %ymm4 + vpaddq %ymm2, %ymm10, %ymm2 + vpslld $2, %ymm11, %ymm12 + vpaddd %ymm12, %ymm11, %ymm12 + vpsrlq $26, %ymm2, %ymm10 + vpaddq %ymm0, %ymm12, %ymm0 + vpsrlq $26, %ymm0, %ymm11 + vpand %ymm14, %ymm2, %ymm2 + vpand %ymm14, %ymm0, %ymm0 + vpaddq %ymm3, %ymm10, %ymm3 + vpaddq %ymm1, %ymm11, %ymm1 + vpsrlq $26, %ymm3, %ymm10 + vpand %ymm14, %ymm3, %ymm3 + vpaddq %ymm4, %ymm10, %ymm4 + addq $0x40, %rsi + subq $0x40, %rdx + jz L_poly1305_avx2_blocks_store + jmp L_poly1305_avx2_blocks_load_r4 +L_poly1305_avx2_blocks_begin_h: + # Load the H values. 
+ /* %rax = ctx+64 (set at function entry): five 32-byte vectors, one per limb */ + vmovdqu (%rax), %ymm0 + vmovdqu 32(%rax), %ymm1 + vmovdqu 64(%rax), %ymm2 + vmovdqu 96(%rax), %ymm3 + vmovdqu 128(%rax), %ymm4 + # Check if there is a power of r to load - otherwise use r^4. + /* ctx+616 is the flag poly1305_final_avx2 sets to 1 before its last call into this routine */ + cmpb $0x00, 616(%rdi) + je L_poly1305_avx2_blocks_load_r4 + # Load the 4 powers of r - r^4, r^3, r^2, r^1. + /* ctx+224 = r, +256 = r^2, +288 = r^3, +320 = r^4, as laid out by poly1305_calc_powers_avx2 */ + vmovdqu 224(%rdi), %ymm8 + vmovdqu 256(%rdi), %ymm7 + vmovdqu 288(%rdi), %ymm6 + vmovdqu 320(%rdi), %ymm5 + /* Transpose: from one power per register to one limb per register, with a different power in each lane */ + vpermq $0xd8, %ymm5, %ymm5 + vpermq $0xd8, %ymm6, %ymm6 + vpermq $0xd8, %ymm7, %ymm7 + vpermq $0xd8, %ymm8, %ymm8 + vpunpcklqdq %ymm6, %ymm5, %ymm10 + vpunpckhqdq %ymm6, %ymm5, %ymm11 + vpunpcklqdq %ymm8, %ymm7, %ymm12 + vpunpckhqdq %ymm8, %ymm7, %ymm13 + vperm2i128 $32, %ymm12, %ymm10, %ymm5 + vperm2i128 $49, %ymm12, %ymm10, %ymm7 + vperm2i128 $32, %ymm13, %ymm11, %ymm9 + vpsrlq $32, %ymm5, %ymm6 + vpsrlq $32, %ymm7, %ymm8 + jmp L_poly1305_avx2_blocks_mul_5 +L_poly1305_avx2_blocks_load_r4: + # Load r^4 into all four positions. + /* Each vpermq broadcasts one 64-bit element to all lanes; odd limbs come from the copy shifted right by 32 */ + vmovdqu 320(%rdi), %ymm13 + vpermq $0x00, %ymm13, %ymm5 + vpsrlq $32, %ymm13, %ymm14 + vpermq $0x55, %ymm13, %ymm7 + vpermq $0xaa, %ymm13, %ymm9 + vpermq $0x00, %ymm14, %ymm6 + vpermq $0x55, %ymm14, %ymm8 +L_poly1305_avx2_blocks_mul_5: + # Multiply limbs 1..4 (ymm6..ymm9) of the loaded power(s) of r by 5 + /* x * 5 is formed as (x << 2) + x */ + vpslld $2, %ymm6, %ymm10 + vpslld $2, %ymm7, %ymm11 + vpslld $2, %ymm8, %ymm12 + vpslld $2, %ymm9, %ymm13 + vpaddq %ymm10, %ymm6, %ymm10 + vpaddq %ymm11, %ymm7, %ymm11 + vpaddq %ymm12, %ymm8, %ymm12 + vpaddq %ymm13, %ymm9, %ymm13 + # Store powers of r and multiple of 5 for use in multiply. 
+ /* Scratch layout: (%rbx)+0/32/64/96 = 5 * limbs 1..4; (%rcx)+0/32/64/96/128 = limbs 0..4. + * Aligned stores are fine here: %rcx is 32-byte aligned and %rbx = %rcx + 0xa0. + * NOTE(review): these key-derived values stay on the stack after return; no wipe is visible in this routine. */ + vmovdqa %ymm10, (%rbx) + vmovdqa %ymm11, 32(%rbx) + vmovdqa %ymm12, 64(%rbx) + vmovdqa %ymm13, 96(%rbx) + vmovdqa %ymm5, (%rcx) + vmovdqa %ymm6, 32(%rcx) + vmovdqa %ymm7, 64(%rcx) + vmovdqa %ymm8, 96(%rcx) + vmovdqa %ymm9, 128(%rcx) + vmovdqu L_poly1305_avx2_blocks_mask(%rip), %ymm14 + # If not finished then loop over data + cmpb $0x01, 616(%rdi) + jne L_poly1305_avx2_blocks_start + # Do last multiply, reduce, add the four H together and move to + # 32-bit registers + /* Limb products: result limb k sums H[i] * R[k-i] for i <= k and H[i] * 5*R[k-i+5] for i > k. + * Accumulators: ymm5..ymm9 = result limbs 0..4; ymm10..ymm13 are temporaries. */ + vpmuludq (%rbx), %ymm4, %ymm5 + vpmuludq 32(%rbx), %ymm3, %ymm10 + vpmuludq 32(%rbx), %ymm4, %ymm6 + vpmuludq 64(%rbx), %ymm3, %ymm11 + vpmuludq 64(%rbx), %ymm4, %ymm7 + vpaddq %ymm5, %ymm10, %ymm5 + vpmuludq 64(%rbx), %ymm2, %ymm12 + vpmuludq 96(%rbx), %ymm4, %ymm8 + vpaddq %ymm6, %ymm11, %ymm6 + vpmuludq 96(%rbx), %ymm1, %ymm13 + vpmuludq 96(%rbx), %ymm2, %ymm10 + vpaddq %ymm5, %ymm12, %ymm5 + vpmuludq 96(%rbx), %ymm3, %ymm11 + vpmuludq (%rcx), %ymm3, %ymm12 + vpaddq %ymm5, %ymm13, %ymm5 + vpmuludq (%rcx), %ymm4, %ymm9 + vpaddq %ymm6, %ymm10, %ymm6 + vpmuludq (%rcx), %ymm0, %ymm13 + vpaddq %ymm7, %ymm11, %ymm7 + vpmuludq (%rcx), %ymm1, %ymm10 + vpaddq %ymm8, %ymm12, %ymm8 + vpmuludq (%rcx), %ymm2, %ymm11 + vpmuludq 32(%rcx), %ymm2, %ymm12 + vpaddq %ymm5, %ymm13, %ymm5 + vpmuludq 32(%rcx), %ymm3, %ymm13 + vpaddq %ymm6, %ymm10, %ymm6 + vpmuludq 32(%rcx), %ymm0, %ymm10 + vpaddq %ymm7, %ymm11, %ymm7 + vpmuludq 32(%rcx), %ymm1, %ymm11 + vpaddq %ymm8, %ymm12, %ymm8 + vpmuludq 64(%rcx), %ymm1, %ymm12 + vpaddq %ymm9, %ymm13, %ymm9 + vpmuludq 64(%rcx), %ymm2, %ymm13 + vpaddq %ymm6, %ymm10, %ymm6 + vpmuludq 64(%rcx), %ymm0, %ymm10 + vpaddq %ymm7, %ymm11, %ymm7 + vpmuludq 96(%rcx), %ymm0, %ymm11 + vpaddq %ymm8, %ymm12, %ymm8 + vpmuludq 96(%rcx), %ymm1, %ymm12 + vpaddq %ymm9, %ymm13, %ymm9 + vpaddq %ymm7, %ymm10, %ymm7 + vpmuludq 128(%rcx), %ymm0, %ymm13 + vpaddq %ymm8, %ymm11, %ymm8 + vpaddq %ymm9, %ymm12, %ymm9 + vpaddq %ymm9, %ymm13, %ymm9 + /* Carry propagation: everything above bit 26 of a limb moves up into the next limb */ + vpsrlq $26, %ymm5, %ymm10 + vpsrlq $26, %ymm8, %ymm11 + vpand 
%ymm14, %ymm5, %ymm5 + vpand %ymm14, %ymm8, %ymm8 + vpaddq %ymm6, %ymm10, %ymm6 + vpaddq %ymm9, %ymm11, %ymm9 + vpsrlq $26, %ymm6, %ymm10 + vpsrlq $26, %ymm9, %ymm11 + vpand %ymm14, %ymm6, %ymm1 + vpand %ymm14, %ymm9, %ymm4 + vpaddq %ymm7, %ymm10, %ymm7 + /* The carry out of limb 4 (bit 130 and up) re-enters limb 0 times 5: (c << 2) + c */ + vpslld $2, %ymm11, %ymm12 + vpaddd %ymm12, %ymm11, %ymm12 + vpsrlq $26, %ymm7, %ymm10 + vpaddq %ymm5, %ymm12, %ymm5 + vpsrlq $26, %ymm5, %ymm11 + vpand %ymm14, %ymm7, %ymm2 + vpand %ymm14, %ymm5, %ymm0 + vpaddq %ymm8, %ymm10, %ymm8 + vpaddq %ymm1, %ymm11, %ymm1 + vpsrlq $26, %ymm8, %ymm10 + vpand %ymm14, %ymm8, %ymm3 + vpaddq %ymm4, %ymm10, %ymm4 + /* Horizontal add, step 1: within each 128-bit half, add the upper 64-bit lane onto the lower one */ + vpsrldq $8, %ymm0, %ymm5 + vpsrldq $8, %ymm1, %ymm6 + vpsrldq $8, %ymm2, %ymm7 + vpsrldq $8, %ymm3, %ymm8 + vpsrldq $8, %ymm4, %ymm9 + vpaddq %ymm0, %ymm5, %ymm0 + vpaddq %ymm1, %ymm6, %ymm1 + vpaddq %ymm2, %ymm7, %ymm2 + vpaddq %ymm3, %ymm8, %ymm3 + vpaddq %ymm4, %ymm9, %ymm4 + /* Step 2: vpermq $2 brings 64-bit lane 2 down to lane 0, so lane 0 ends up with the sum of all four lanes */ + vpermq $2, %ymm0, %ymm5 + vpermq $2, %ymm1, %ymm6 + vpermq $2, %ymm2, %ymm7 + vpermq $2, %ymm3, %ymm8 + vpermq $2, %ymm4, %ymm9 + vpaddq %ymm0, %ymm5, %ymm0 + vpaddq %ymm1, %ymm6, %ymm1 + vpaddq %ymm2, %ymm7, %ymm2 + vpaddq %ymm3, %ymm8, %ymm3 + vpaddq %ymm4, %ymm9, %ymm4 + /* Low 32 bits of each summed limb go to %r8d..%r12d (limbs 0..4) */ + vmovd %xmm0, %r8d + vmovd %xmm1, %r9d + vmovd %xmm2, %r10d + vmovd %xmm3, %r11d + vmovd %xmm4, %r12d + jmp L_poly1305_avx2_blocks_end_calc +L_poly1305_avx2_blocks_start: + /* Main loop: load 64 message bytes into limbs ymm5..ymm8 (+ hibit in ymm9), then add H times the stored r power */ + vmovdqu (%rsi), %ymm5 + vmovdqu 32(%rsi), %ymm6 + vperm2i128 $32, %ymm6, %ymm5, %ymm7 + vperm2i128 $49, %ymm6, %ymm5, %ymm5 + vpunpckldq %ymm5, %ymm7, %ymm6 + vpunpckhdq %ymm5, %ymm7, %ymm8 + vpunpckldq %ymm15, %ymm6, %ymm5 + vpunpckhdq %ymm15, %ymm6, %ymm6 + vpunpckldq %ymm15, %ymm8, %ymm7 + vpunpckhdq %ymm15, %ymm8, %ymm8 + vmovdqu L_poly1305_avx2_blocks_hibit(%rip), %ymm9 + vpsllq $6, %ymm6, %ymm6 + vpsllq $12, %ymm7, %ymm7 + vpsllq $18, %ymm8, %ymm8 + /* Unlike the final multiply, the products here accumulate on top of the message limbs already in ymm5..ymm9 */ + vpmuludq (%rbx), %ymm4, %ymm10 + vpaddq %ymm5, %ymm10, %ymm5 + vpmuludq 32(%rbx), %ymm3, %ymm10 + vpmuludq 32(%rbx), %ymm4, %ymm11 + vpaddq %ymm6, %ymm11, %ymm6 + vpmuludq 64(%rbx), %ymm3, %ymm11 + 
vpmuludq 64(%rbx), %ymm4, %ymm12 + vpaddq %ymm7, %ymm12, %ymm7 + vpaddq %ymm5, %ymm10, %ymm5 + vpmuludq 64(%rbx), %ymm2, %ymm12 + vpmuludq 96(%rbx), %ymm4, %ymm13 + vpaddq %ymm8, %ymm13, %ymm8 + vpaddq %ymm6, %ymm11, %ymm6 + vpmuludq 96(%rbx), %ymm1, %ymm13 + vpmuludq 96(%rbx), %ymm2, %ymm10 + vpaddq %ymm5, %ymm12, %ymm5 + vpmuludq 96(%rbx), %ymm3, %ymm11 + vpmuludq (%rcx), %ymm3, %ymm12 + vpaddq %ymm5, %ymm13, %ymm5 + vpmuludq (%rcx), %ymm4, %ymm13 + vpaddq %ymm9, %ymm13, %ymm9 + vpaddq %ymm6, %ymm10, %ymm6 + vpmuludq (%rcx), %ymm0, %ymm13 + vpaddq %ymm7, %ymm11, %ymm7 + vpmuludq (%rcx), %ymm1, %ymm10 + vpaddq %ymm8, %ymm12, %ymm8 + vpmuludq (%rcx), %ymm2, %ymm11 + vpmuludq 32(%rcx), %ymm2, %ymm12 + vpaddq %ymm5, %ymm13, %ymm5 + vpmuludq 32(%rcx), %ymm3, %ymm13 + vpaddq %ymm6, %ymm10, %ymm6 + vpmuludq 32(%rcx), %ymm0, %ymm10 + vpaddq %ymm7, %ymm11, %ymm7 + vpmuludq 32(%rcx), %ymm1, %ymm11 + vpaddq %ymm8, %ymm12, %ymm8 + vpmuludq 64(%rcx), %ymm1, %ymm12 + vpaddq %ymm9, %ymm13, %ymm9 + vpmuludq 64(%rcx), %ymm2, %ymm13 + vpaddq %ymm6, %ymm10, %ymm6 + vpmuludq 64(%rcx), %ymm0, %ymm10 + vpaddq %ymm7, %ymm11, %ymm7 + vpmuludq 96(%rcx), %ymm0, %ymm11 + vpaddq %ymm8, %ymm12, %ymm8 + vpmuludq 96(%rcx), %ymm1, %ymm12 + vpaddq %ymm9, %ymm13, %ymm9 + vpaddq %ymm7, %ymm10, %ymm7 + vpmuludq 128(%rcx), %ymm0, %ymm13 + vpaddq %ymm8, %ymm11, %ymm8 + vpaddq %ymm9, %ymm12, %ymm9 + vpaddq %ymm9, %ymm13, %ymm9 + /* Carry propagation: ymm5..ymm9 hold the unreduced limbs 0..4; the masked results land back in ymm0..ymm4 */ + vpsrlq $26, %ymm5, %ymm10 + vpsrlq $26, %ymm8, %ymm11 + vpand %ymm14, %ymm5, %ymm5 + vpand %ymm14, %ymm8, %ymm8 + vpaddq %ymm6, %ymm10, %ymm6 + vpaddq %ymm9, %ymm11, %ymm9 + vpsrlq $26, %ymm6, %ymm10 + vpsrlq $26, %ymm9, %ymm11 + vpand %ymm14, %ymm6, %ymm1 + vpand %ymm14, %ymm9, %ymm4 + vpaddq %ymm7, %ymm10, %ymm7 + /* The carry out of limb 4 wraps around into limb 0 multiplied by 5: (c << 2) + c */ + vpslld $2, %ymm11, %ymm12 + vpaddd %ymm12, %ymm11, %ymm12 + vpsrlq $26, %ymm7, %ymm10 + vpaddq %ymm5, %ymm12, %ymm5 + vpsrlq $26, %ymm5, %ymm11 + vpand %ymm14, %ymm7, %ymm2 + vpand %ymm14, %ymm5, %ymm0 + vpaddq %ymm8, %ymm10, %ymm8 + vpaddq %ymm1, %ymm11, %ymm1 + 
vpsrlq $26, %ymm8, %ymm10 + vpand %ymm14, %ymm8, %ymm3 + vpaddq %ymm4, %ymm10, %ymm4 + addq $0x40, %rsi + subq $0x40, %rdx + /* Loops until the count is exactly zero */ + jnz L_poly1305_avx2_blocks_start +L_poly1305_avx2_blocks_store: + # Store four H values - state + vmovdqu %ymm0, (%rax) + vmovdqu %ymm1, 32(%rax) + vmovdqu %ymm2, 64(%rax) + vmovdqu %ymm3, 96(%rax) + vmovdqu %ymm4, 128(%rax) +L_poly1305_avx2_blocks_end_calc: + cmpb $0x00, 616(%rdi) + je L_poly1305_avx2_blocks_complete + /* Finished: repack the five limbs in %r8..%r12 (at bits 0, 26, 52, 78, 104) into + * %rax (bits 0..63), %rdx (bits 64..127) and %rcx (bits 128 and up). */ + movq %r8, %rax + movq %r10, %rdx + movq %r12, %rcx + shrq $12, %rdx + shrq $24, %rcx + shlq $26, %r9 + shlq $52, %r10 + shlq $14, %r11 + shlq $40, %r12 + addq %r9, %rax + adcq %r10, %rax + adcq %r11, %rdx + adcq %r12, %rdx + adcq $0x00, %rcx + /* Fold bits 130 and up back in times 5, leaving two bits in the top word */ + movq %rcx, %r8 + andq $3, %rcx + shrq $2, %r8 + leaq 0(%r8,%r8,4), %r8 + addq %r8, %rax + adcq $0x00, %rdx + adcq $0x00, %rcx + movq %rax, 24(%rdi) + movq %rdx, 32(%rdi) + movq %rcx, 40(%rdi) +L_poly1305_avx2_blocks_complete: + /* ctx+617 = 1 records that this routine has run; poly1305_final_avx2 tests it */ + movb $0x01, 617(%rdi) + /* Leave the upper YMM halves clean so SSE code in the caller pays no AVX/SSE transition penalty */ + vzeroupper + addq $0x140, %rsp + popq %rbx + popq %r12 + repz retq +#ifndef __APPLE__ +.size poly1305_blocks_avx2,.-poly1305_blocks_avx2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl poly1305_final_avx2 +.type poly1305_final_avx2,@function +.align 4 +poly1305_final_avx2: +#else +.section __TEXT,__text +.globl _poly1305_final_avx2 +.p2align 2 +_poly1305_final_avx2: +#endif /* __APPLE__ */ + /* poly1305_final_avx2: %rdi = context, %rsi = output pointer (handed on to poly1305_final_avx). + * 1. Sets ctx+616 to 1 and, if ctx+617 is set, calls poly1305_blocks_avx2 + * (null message, count 0x40) to collapse the vector state into h. + * 2. Feeds the whole 16-byte blocks buffered at ctx+480 (count at ctx+608) + * to poly1305_blocks_avx and moves the remaining tail to the buffer start. + * 3. Calls poly1305_final_avx, then zeroes ctx+64..351, ctx+608 and the flags. + * %rdi is read after every call: none of the callees modifies it. + */ + movb $0x01, 616(%rdi) + movb 617(%rdi), %cl + cmpb $0x00, %cl + je L_poly1305_avx2_final_done_blocks_X4 + pushq %rsi + movq $0x40, %rdx + xorq %rsi, %rsi +#ifndef __APPLE__ + callq poly1305_blocks_avx2@plt +#else + callq _poly1305_blocks_avx2 +#endif /* __APPLE__ */ + popq %rsi +L_poly1305_avx2_final_done_blocks_X4: + movq 608(%rdi), %rax + movq %rax, %rcx + /* %rcx = buffered count rounded down to whole 16-byte blocks; only its low byte is tested */ + andq $-16, %rcx + cmpb $0x00, %cl + je L_poly1305_avx2_final_done_blocks + pushq %rcx + pushq %rax + pushq %rsi + movq %rcx, %rdx + leaq 480(%rdi), %rsi +#ifndef __APPLE__ + callq poly1305_blocks_avx@plt +#else + callq _poly1305_blocks_avx +#endif /* __APPLE__ */ + popq %rsi + popq %rax + popq %rcx 
+L_poly1305_avx2_final_done_blocks: + /* %rcx = bytes already consumed from the buffer, %rax = bytes that were buffered */ + subq %rcx, 608(%rdi) + xorq %rdx, %rdx + jmp L_poly1305_avx2_final_cmp_copy +L_poly1305_avx2_final_start_copy: + /* Move the unprocessed tail bytes [%rcx, %rax) down to the start of the buffer at ctx+480 */ + movb 480(%rdi,%rcx,1), %r8b + movb %r8b, 480(%rdi,%rdx,1) + incb %cl + incb %dl +L_poly1305_avx2_final_cmp_copy: + cmp %rcx, %rax + jne L_poly1305_avx2_final_start_copy + /* %rdi and %rsi are still the context and output pointer at this point */ +#ifndef __APPLE__ + callq poly1305_final_avx@plt +#else + callq _poly1305_final_avx +#endif /* __APPLE__ */ + /* Wipe ctx+64..351 (vector H state and the stored powers of r), the buffered count and both flag bytes */ + vpxor %ymm0, %ymm0, %ymm0 + vmovdqu %ymm0, 64(%rdi) + vmovdqu %ymm0, 96(%rdi) + vmovdqu %ymm0, 128(%rdi) + vmovdqu %ymm0, 160(%rdi) + vmovdqu %ymm0, 192(%rdi) + vmovdqu %ymm0, 224(%rdi) + vmovdqu %ymm0, 256(%rdi) + vmovdqu %ymm0, 288(%rdi) + vmovdqu %ymm0, 320(%rdi) + movq $0x00, 608(%rdi) + movw $0x00, 616(%rdi) + /* Leave the upper YMM halves clean so SSE code in the caller pays no AVX/SSE transition penalty */ + vzeroupper + repz retq +#ifndef __APPLE__ +.size poly1305_final_avx2,.-poly1305_final_avx2 +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX2 */ + +/* Mark the object as not needing an executable stack; without this note GNU ld + * falls back to an executable stack for the whole link on ELF targets. */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif |