Diffstat (limited to 'FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/fe_x25519_asm.S')
-rw-r--r-- | FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/fe_x25519_asm.S | 16542
1 files changed, 16542 insertions, 0 deletions
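
The added file implements Curve25519/Ed25519 field arithmetic for x86-64. Its public entry points (fe_mul, fe_sq, fe_invert, curve25519, the fe_ge_* functions) are tail-jump trampolines through function pointers (fe_mul_p, fe_sq_p, ...) held in .data and initialised to the plain x86-64 routines; fe_init queries the CPU flags once and, when the two bits tested against 0x50 (AVX and AVX2 support) are both set, repoints those words at the _avx2 variants. A minimal C sketch of that dispatch pattern follows; every name suffixed _stub is hypothetical and only illustrates the idea, not wolfSSL's actual API.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t fe[4]; /* field element: four 64-bit limbs, as in the .S file */

static void fe_mul_x64_stub(fe r, const fe a, const fe b)
{ (void)r; (void)a; (void)b; puts("x64 path"); }
static void fe_mul_avx2_stub(fe r, const fe a, const fe b)
{ (void)r; (void)a; (void)b; puts("avx2 path"); }

/* mirrors the fe_mul_p word in .data, initialised to the x64 routine */
static void (*fe_mul_p_stub)(fe, const fe, const fe) = fe_mul_x64_stub;
static int cpuFlagsSet_stub = 0;

/* mirrors fe_init: switch only when both required CPU flags are present */
static void fe_init_stub(int have_avx1, int have_avx2)
{
    if (cpuFlagsSet_stub)
        return;
    if (have_avx1 && have_avx2)
        fe_mul_p_stub = fe_mul_avx2_stub;
    cpuFlagsSet_stub = 1;
}

/* mirrors fe_mul's "jmpq *fe_mul_p(%rip)" trampoline */
static void fe_mul_stub(fe r, const fe a, const fe b)
{
    fe_mul_p_stub(r, a, b);
}

int main(void)
{
    fe r = {0}, a = {1}, b = {2};
    fe_init_stub(1, 1);   /* pretend cpuid reported AVX + AVX2 */
    fe_mul_stub(r, a, b); /* dispatches to the AVX2 stub */
    return 0;
}
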
diff --git a/FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/fe_x25519_asm.S b/FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/fe_x25519_asm.S new file mode 100644 index 000000000..6d0f638b5 --- /dev/null +++ b/FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/fe_x25519_asm.S @@ -0,0 +1,16542 @@ +/* fe_x25519_asm + * + * Copyright (C) 2006-2020 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +#ifndef HAVE_INTEL_AVX1 +#define HAVE_INTEL_AVX1 +#endif /* HAVE_INTEL_AVX1 */ +#ifndef NO_AVX2_SUPPORT +#define HAVE_INTEL_AVX2 +#endif /* NO_AVX2_SUPPORT */ + +#ifndef __APPLE__ +.text +.globl fe_init +.type fe_init,@function +.align 4 +fe_init: +#else +.section __TEXT,__text +.globl _fe_init +.p2align 2 +_fe_init: +#endif /* __APPLE__ */ +#ifdef HAVE_INTEL_AVX2 +#ifndef __APPLE__ + movq cpuFlagsSet@GOTPCREL(%rip), %rax + movl (%rax), %eax +#else + movl _cpuFlagsSet(%rip), %eax +#endif /* __APPLE__ */ + testl %eax, %eax + je L_fe_init_get_flags + repz retq +L_fe_init_get_flags: +#ifndef __APPLE__ + callq cpuid_get_flags@plt +#else + callq _cpuid_get_flags +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq intelFlags@GOTPCREL(%rip), %rdx + movl %eax, (%rdx) +#else + movl %eax, _intelFlags(%rip) +#endif /* __APPLE__ */ + andl $0x50, %eax + cmpl $0x50, %eax + jne L_fe_init_flags_done +#ifndef __APPLE__ + movq fe_mul_avx2@GOTPCREL(%rip), %rax +#else + leaq _fe_mul_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_mul_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq %rax, _fe_mul_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_sq_avx2@GOTPCREL(%rip), %rax +#else + leaq _fe_sq_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_sq_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq %rax, _fe_sq_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_mul121666_avx2@GOTPCREL(%rip), %rax +#else + leaq _fe_mul121666_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_mul121666_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq %rax, _fe_mul121666_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_sq2_avx2@GOTPCREL(%rip), %rax +#else + leaq _fe_sq2_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_sq2_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq %rax, _fe_sq2_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_invert_avx2@GOTPCREL(%rip), %rax +#else + leaq _fe_invert_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_invert_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq %rax, _fe_invert_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq curve25519_avx2@GOTPCREL(%rip), %rax +#else + leaq _curve25519_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq curve25519_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq 
%rax, _curve25519_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_pow22523_avx2@GOTPCREL(%rip), %rax +#else + leaq _fe_pow22523_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_pow22523_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq %rax, _fe_pow22523_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_ge_to_p2_avx2@GOTPCREL(%rip), %rax +#else + leaq _fe_ge_to_p2_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_ge_to_p2_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq %rax, _fe_ge_to_p2_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_ge_to_p3_avx2@GOTPCREL(%rip), %rax +#else + leaq _fe_ge_to_p3_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_ge_to_p3_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq %rax, _fe_ge_to_p3_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_ge_dbl_avx2@GOTPCREL(%rip), %rax +#else + leaq _fe_ge_dbl_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_ge_dbl_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq %rax, _fe_ge_dbl_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_ge_madd_avx2@GOTPCREL(%rip), %rax +#else + leaq _fe_ge_madd_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_ge_madd_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq %rax, _fe_ge_madd_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_ge_msub_avx2@GOTPCREL(%rip), %rax +#else + leaq _fe_ge_msub_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_ge_msub_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq %rax, _fe_ge_msub_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_ge_add_avx2@GOTPCREL(%rip), %rax +#else + leaq _fe_ge_add_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_ge_add_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq %rax, _fe_ge_add_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_ge_sub_avx2@GOTPCREL(%rip), %rax +#else + leaq _fe_ge_sub_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_ge_sub_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq %rax, _fe_ge_sub_p(%rip) +#endif /* __APPLE__ */ +L_fe_init_flags_done: +#ifndef __APPLE__ + movq cpuFlagsSet@GOTPCREL(%rip), %rdx + movl $0x1, (%rdx) +#else + movl $0x1, _cpuFlagsSet(%rip) +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX2 */ + repz retq +#ifndef __APPLE__ +.size fe_init,.-fe_init +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_frombytes +.type fe_frombytes,@function +.align 4 +fe_frombytes: +#else +.section __TEXT,__text +.globl _fe_frombytes +.p2align 2 +_fe_frombytes: +#endif /* __APPLE__ */ + movq $0x7fffffffffffffff, %r9 + movq (%rsi), %rdx + movq 8(%rsi), %rax + movq 16(%rsi), %rcx + movq 24(%rsi), %r8 + andq %r9, %r8 + movq %rdx, (%rdi) + movq %rax, 8(%rdi) + movq %rcx, 16(%rdi) + movq %r8, 24(%rdi) + repz retq +#ifndef __APPLE__ +.size fe_frombytes,.-fe_frombytes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_tobytes +.type fe_tobytes,@function +.align 4 +fe_tobytes: +#else +.section __TEXT,__text +.globl _fe_tobytes +.p2align 2 +_fe_tobytes: +#endif /* __APPLE__ */ + movq $0x7fffffffffffffff, %r10 + movq (%rsi), %rdx + movq 8(%rsi), %rax + movq 16(%rsi), %rcx + movq 24(%rsi), %r8 + addq $19, %rdx + adcq $0x00, %rax + adcq $0x00, %rcx + adcq $0x00, %r8 + shrq $63, %r8 + imulq $19, %r8, %r9 + movq (%rsi), %rdx + movq 8(%rsi), %rax + movq 16(%rsi), %rcx + movq 24(%rsi), %r8 + addq %r9, 
%rdx + adcq $0x00, %rax + adcq $0x00, %rcx + adcq $0x00, %r8 + andq %r10, %r8 + movq %rdx, (%rdi) + movq %rax, 8(%rdi) + movq %rcx, 16(%rdi) + movq %r8, 24(%rdi) + repz retq +#ifndef __APPLE__ +.size fe_tobytes,.-fe_tobytes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_1 +.type fe_1,@function +.align 4 +fe_1: +#else +.section __TEXT,__text +.globl _fe_1 +.p2align 2 +_fe_1: +#endif /* __APPLE__ */ + # Set one + movq $0x01, (%rdi) + movq $0x00, 8(%rdi) + movq $0x00, 16(%rdi) + movq $0x00, 24(%rdi) + repz retq +#ifndef __APPLE__ +.size fe_1,.-fe_1 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_0 +.type fe_0,@function +.align 4 +fe_0: +#else +.section __TEXT,__text +.globl _fe_0 +.p2align 2 +_fe_0: +#endif /* __APPLE__ */ + # Set zero + movq $0x00, (%rdi) + movq $0x00, 8(%rdi) + movq $0x00, 16(%rdi) + movq $0x00, 24(%rdi) + repz retq +#ifndef __APPLE__ +.size fe_0,.-fe_0 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_copy +.type fe_copy,@function +.align 4 +fe_copy: +#else +.section __TEXT,__text +.globl _fe_copy +.p2align 2 +_fe_copy: +#endif /* __APPLE__ */ + # Copy + movq (%rsi), %rdx + movq 8(%rsi), %rax + movq 16(%rsi), %rcx + movq 24(%rsi), %r8 + movq %rdx, (%rdi) + movq %rax, 8(%rdi) + movq %rcx, 16(%rdi) + movq %r8, 24(%rdi) + repz retq +#ifndef __APPLE__ +.size fe_copy,.-fe_copy +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_sub +.type fe_sub,@function +.align 4 +fe_sub: +#else +.section __TEXT,__text +.globl _fe_sub +.p2align 2 +_fe_sub: +#endif /* __APPLE__ */ + pushq %r12 + # Sub + movq (%rsi), %rax + movq 8(%rsi), %rcx + movq 16(%rsi), %r8 + movq 24(%rsi), %r9 + subq (%rdx), %rax + movq $0x00, %r10 + sbbq 8(%rdx), %rcx + movq $-19, %r11 + sbbq 16(%rdx), %r8 + movq $0x7fffffffffffffff, %r12 + sbbq 24(%rdx), %r9 + sbbq $0x00, %r10 + # Mask the modulus + andq %r10, %r11 + andq %r10, %r12 + # Add modulus (if underflow) + addq %r11, %rax + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r12, %r9 + movq %rax, (%rdi) + movq %rcx, 8(%rdi) + movq %r8, 16(%rdi) + movq %r9, 24(%rdi) + popq %r12 + repz retq +#ifndef __APPLE__ +.size fe_sub,.-fe_sub +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_add +.type fe_add,@function +.align 4 +fe_add: +#else +.section __TEXT,__text +.globl _fe_add +.p2align 2 +_fe_add: +#endif /* __APPLE__ */ + pushq %r12 + # Add + movq (%rsi), %rax + movq 8(%rsi), %rcx + addq (%rdx), %rax + movq 16(%rsi), %r8 + adcq 8(%rdx), %rcx + movq 24(%rsi), %r10 + adcq 16(%rdx), %r8 + movq $-19, %r11 + adcq 24(%rdx), %r10 + movq $0x7fffffffffffffff, %r12 + movq %r10, %r9 + sarq $63, %r10 + # Mask the modulus + andq %r10, %r11 + andq %r10, %r12 + # Sub modulus (if overflow) + subq %r11, %rax + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r12, %r9 + movq %rax, (%rdi) + movq %rcx, 8(%rdi) + movq %r8, 16(%rdi) + movq %r9, 24(%rdi) + popq %r12 + repz retq +#ifndef __APPLE__ +.size fe_add,.-fe_add +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_neg +.type fe_neg,@function +.align 4 +fe_neg: +#else +.section __TEXT,__text +.globl _fe_neg +.p2align 2 +_fe_neg: +#endif /* __APPLE__ */ + movq $-19, %rdx + movq $-1, %rax + movq $-1, %rcx + movq $0x7fffffffffffffff, %r8 + subq (%rsi), %rdx + sbbq 8(%rsi), %rax + sbbq 16(%rsi), %rcx + sbbq 24(%rsi), %r8 + movq %rdx, (%rdi) + movq %rax, 8(%rdi) + movq %rcx, 16(%rdi) + movq %r8, 24(%rdi) + repz retq +#ifndef __APPLE__ +.size fe_neg,.-fe_neg +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_cmov +.type fe_cmov,@function +.align 4 +fe_cmov: +#else +.section 
__TEXT,__text +.globl _fe_cmov +.p2align 2 +_fe_cmov: +#endif /* __APPLE__ */ + cmpl $0x01, %edx + movq (%rdi), %rcx + movq 8(%rdi), %r8 + movq 16(%rdi), %r9 + movq 24(%rdi), %r10 + cmoveq (%rsi), %rcx + cmoveq 8(%rsi), %r8 + cmoveq 16(%rsi), %r9 + cmoveq 24(%rsi), %r10 + movq %rcx, (%rdi) + movq %r8, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + repz retq +#ifndef __APPLE__ +.size fe_cmov,.-fe_cmov +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_isnonzero +.type fe_isnonzero,@function +.align 4 +fe_isnonzero: +#else +.section __TEXT,__text +.globl _fe_isnonzero +.p2align 2 +_fe_isnonzero: +#endif /* __APPLE__ */ + movq $0x7fffffffffffffff, %r10 + movq (%rdi), %rax + movq 8(%rdi), %rdx + movq 16(%rdi), %rcx + movq 24(%rdi), %r8 + addq $19, %rax + adcq $0x00, %rdx + adcq $0x00, %rcx + adcq $0x00, %r8 + shrq $63, %r8 + imulq $19, %r8, %r9 + movq (%rdi), %rax + movq 8(%rdi), %rdx + movq 16(%rdi), %rcx + movq 24(%rdi), %r8 + addq %r9, %rax + adcq $0x00, %rdx + adcq $0x00, %rcx + adcq $0x00, %r8 + andq %r10, %r8 + orq %rdx, %rax + orq %rcx, %rax + orq %r8, %rax + repz retq +#ifndef __APPLE__ +.size fe_isnonzero,.-fe_isnonzero +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_isnegative +.type fe_isnegative,@function +.align 4 +fe_isnegative: +#else +.section __TEXT,__text +.globl _fe_isnegative +.p2align 2 +_fe_isnegative: +#endif /* __APPLE__ */ + movq $0x7fffffffffffffff, %r11 + movq (%rdi), %rdx + movq 8(%rdi), %rcx + movq 16(%rdi), %r8 + movq 24(%rdi), %r9 + movq %rdx, %rax + addq $19, %rdx + adcq $0x00, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + shrq $63, %r9 + imulq $19, %r9, %r10 + addq %r10, %rax + andq $0x01, %rax + repz retq +#ifndef __APPLE__ +.size fe_isnegative,.-fe_isnegative +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_cmov_table +.type fe_cmov_table,@function +.align 4 +fe_cmov_table: +#else +.section __TEXT,__text +.globl _fe_cmov_table +.p2align 2 +_fe_cmov_table: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx, %rcx + movsbq %cl, %rax + cdq + xorb %dl, %al + subb %dl, %al + movb %al, %r15b + movq $0x01, %rax + xorq %rdx, %rdx + xorq %r8, %r8 + xorq %r9, %r9 + movq $0x01, %r10 + xorq %r11, %r11 + xorq %r12, %r12 + xorq %r13, %r13 + cmpb $0x01, %r15b + movq (%rsi), %r14 + cmoveq %r14, %rax + movq 8(%rsi), %r14 + cmoveq %r14, %rdx + movq 16(%rsi), %r14 + cmoveq %r14, %r8 + movq 24(%rsi), %r14 + cmoveq %r14, %r9 + movq 32(%rsi), %r14 + cmoveq %r14, %r10 + movq 40(%rsi), %r14 + cmoveq %r14, %r11 + movq 48(%rsi), %r14 + cmoveq %r14, %r12 + movq 56(%rsi), %r14 + cmoveq %r14, %r13 + cmpb $2, %r15b + movq 96(%rsi), %r14 + cmoveq %r14, %rax + movq 104(%rsi), %r14 + cmoveq %r14, %rdx + movq 112(%rsi), %r14 + cmoveq %r14, %r8 + movq 120(%rsi), %r14 + cmoveq %r14, %r9 + movq 128(%rsi), %r14 + cmoveq %r14, %r10 + movq 136(%rsi), %r14 + cmoveq %r14, %r11 + movq 144(%rsi), %r14 + cmoveq %r14, %r12 + movq 152(%rsi), %r14 + cmoveq %r14, %r13 + cmpb $3, %r15b + movq 192(%rsi), %r14 + cmoveq %r14, %rax + movq 200(%rsi), %r14 + cmoveq %r14, %rdx + movq 208(%rsi), %r14 + cmoveq %r14, %r8 + movq 216(%rsi), %r14 + cmoveq %r14, %r9 + movq 224(%rsi), %r14 + cmoveq %r14, %r10 + movq 232(%rsi), %r14 + cmoveq %r14, %r11 + movq 240(%rsi), %r14 + cmoveq %r14, %r12 + movq 248(%rsi), %r14 + cmoveq %r14, %r13 + cmpb $4, %r15b + movq 288(%rsi), %r14 + cmoveq %r14, %rax + movq 296(%rsi), %r14 + cmoveq %r14, %rdx + movq 304(%rsi), %r14 + cmoveq %r14, %r8 + movq 312(%rsi), %r14 + cmoveq %r14, %r9 + movq 320(%rsi), %r14 + cmoveq 
%r14, %r10 + movq 328(%rsi), %r14 + cmoveq %r14, %r11 + movq 336(%rsi), %r14 + cmoveq %r14, %r12 + movq 344(%rsi), %r14 + cmoveq %r14, %r13 + cmpb $5, %r15b + movq 384(%rsi), %r14 + cmoveq %r14, %rax + movq 392(%rsi), %r14 + cmoveq %r14, %rdx + movq 400(%rsi), %r14 + cmoveq %r14, %r8 + movq 408(%rsi), %r14 + cmoveq %r14, %r9 + movq 416(%rsi), %r14 + cmoveq %r14, %r10 + movq 424(%rsi), %r14 + cmoveq %r14, %r11 + movq 432(%rsi), %r14 + cmoveq %r14, %r12 + movq 440(%rsi), %r14 + cmoveq %r14, %r13 + cmpb $6, %r15b + movq 480(%rsi), %r14 + cmoveq %r14, %rax + movq 488(%rsi), %r14 + cmoveq %r14, %rdx + movq 496(%rsi), %r14 + cmoveq %r14, %r8 + movq 504(%rsi), %r14 + cmoveq %r14, %r9 + movq 512(%rsi), %r14 + cmoveq %r14, %r10 + movq 520(%rsi), %r14 + cmoveq %r14, %r11 + movq 528(%rsi), %r14 + cmoveq %r14, %r12 + movq 536(%rsi), %r14 + cmoveq %r14, %r13 + cmpb $7, %r15b + movq 576(%rsi), %r14 + cmoveq %r14, %rax + movq 584(%rsi), %r14 + cmoveq %r14, %rdx + movq 592(%rsi), %r14 + cmoveq %r14, %r8 + movq 600(%rsi), %r14 + cmoveq %r14, %r9 + movq 608(%rsi), %r14 + cmoveq %r14, %r10 + movq 616(%rsi), %r14 + cmoveq %r14, %r11 + movq 624(%rsi), %r14 + cmoveq %r14, %r12 + movq 632(%rsi), %r14 + cmoveq %r14, %r13 + cmpb $8, %r15b + movq 672(%rsi), %r14 + cmoveq %r14, %rax + movq 680(%rsi), %r14 + cmoveq %r14, %rdx + movq 688(%rsi), %r14 + cmoveq %r14, %r8 + movq 696(%rsi), %r14 + cmoveq %r14, %r9 + movq 704(%rsi), %r14 + cmoveq %r14, %r10 + movq 712(%rsi), %r14 + cmoveq %r14, %r11 + movq 720(%rsi), %r14 + cmoveq %r14, %r12 + movq 728(%rsi), %r14 + cmoveq %r14, %r13 + cmpb $0x00, %cl + movq %rax, %r14 + cmovlq %r10, %rax + cmovlq %r14, %r10 + movq %rdx, %r14 + cmovlq %r11, %rdx + cmovlq %r14, %r11 + movq %r8, %r14 + cmovlq %r12, %r8 + cmovlq %r14, %r12 + movq %r9, %r14 + cmovlq %r13, %r9 + cmovlq %r14, %r13 + movq %rax, (%rdi) + movq %rdx, 8(%rdi) + movq %r8, 16(%rdi) + movq %r9, 24(%rdi) + movq %r10, 32(%rdi) + movq %r11, 40(%rdi) + movq %r12, 48(%rdi) + movq %r13, 56(%rdi) + xorq %rax, %rax + xorq %rdx, %rdx + xorq %r8, %r8 + xorq %r9, %r9 + cmpb $0x01, %r15b + movq 64(%rsi), %r14 + cmoveq %r14, %rax + movq 72(%rsi), %r14 + cmoveq %r14, %rdx + movq 80(%rsi), %r14 + cmoveq %r14, %r8 + movq 88(%rsi), %r14 + cmoveq %r14, %r9 + cmpb $2, %r15b + movq 160(%rsi), %r14 + cmoveq %r14, %rax + movq 168(%rsi), %r14 + cmoveq %r14, %rdx + movq 176(%rsi), %r14 + cmoveq %r14, %r8 + movq 184(%rsi), %r14 + cmoveq %r14, %r9 + cmpb $3, %r15b + movq 256(%rsi), %r14 + cmoveq %r14, %rax + movq 264(%rsi), %r14 + cmoveq %r14, %rdx + movq 272(%rsi), %r14 + cmoveq %r14, %r8 + movq 280(%rsi), %r14 + cmoveq %r14, %r9 + cmpb $4, %r15b + movq 352(%rsi), %r14 + cmoveq %r14, %rax + movq 360(%rsi), %r14 + cmoveq %r14, %rdx + movq 368(%rsi), %r14 + cmoveq %r14, %r8 + movq 376(%rsi), %r14 + cmoveq %r14, %r9 + cmpb $5, %r15b + movq 448(%rsi), %r14 + cmoveq %r14, %rax + movq 456(%rsi), %r14 + cmoveq %r14, %rdx + movq 464(%rsi), %r14 + cmoveq %r14, %r8 + movq 472(%rsi), %r14 + cmoveq %r14, %r9 + cmpb $6, %r15b + movq 544(%rsi), %r14 + cmoveq %r14, %rax + movq 552(%rsi), %r14 + cmoveq %r14, %rdx + movq 560(%rsi), %r14 + cmoveq %r14, %r8 + movq 568(%rsi), %r14 + cmoveq %r14, %r9 + cmpb $7, %r15b + movq 640(%rsi), %r14 + cmoveq %r14, %rax + movq 648(%rsi), %r14 + cmoveq %r14, %rdx + movq 656(%rsi), %r14 + cmoveq %r14, %r8 + movq 664(%rsi), %r14 + cmoveq %r14, %r9 + cmpb $8, %r15b + movq 736(%rsi), %r14 + cmoveq %r14, %rax + movq 744(%rsi), %r14 + cmoveq %r14, %rdx + movq 752(%rsi), %r14 + cmoveq %r14, %r8 + movq 760(%rsi), %r14 + cmoveq %r14, 
%r9 + movq $-19, %r10 + movq $-1, %r11 + movq $-1, %r12 + movq $0x7fffffffffffffff, %r13 + subq %rax, %r10 + sbbq %rdx, %r11 + sbbq %r8, %r12 + sbbq %r9, %r13 + cmpb $0x00, %cl + cmovlq %r10, %rax + cmovlq %r11, %rdx + cmovlq %r12, %r8 + cmovlq %r13, %r9 + movq %rax, 64(%rdi) + movq %rdx, 72(%rdi) + movq %r8, 80(%rdi) + movq %r9, 88(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size fe_cmov_table,.-fe_cmov_table +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_mul +.type fe_mul,@function +.align 4 +fe_mul: +#else +.section __TEXT,__text +.globl _fe_mul +.p2align 2 +_fe_mul: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_mul_p(%rip) +#else + jmpq *_fe_mul_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_mul,.-fe_mul +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_sq +.type fe_sq,@function +.align 4 +fe_sq: +#else +.section __TEXT,__text +.globl _fe_sq +.p2align 2 +_fe_sq: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_sq_p(%rip) +#else + jmpq *_fe_sq_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_sq,.-fe_sq +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_mul121666 +.type fe_mul121666,@function +.align 4 +fe_mul121666: +#else +.section __TEXT,__text +.globl _fe_mul121666 +.p2align 2 +_fe_mul121666: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_mul121666_p(%rip) +#else + jmpq *_fe_mul121666_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_mul121666,.-fe_mul121666 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_sq2 +.type fe_sq2,@function +.align 4 +fe_sq2: +#else +.section __TEXT,__text +.globl _fe_sq2 +.p2align 2 +_fe_sq2: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_sq2_p(%rip) +#else + jmpq *_fe_sq2_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_sq2,.-fe_sq2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_invert +.type fe_invert,@function +.align 4 +fe_invert: +#else +.section __TEXT,__text +.globl _fe_invert +.p2align 2 +_fe_invert: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_invert_p(%rip) +#else + jmpq *_fe_invert_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_invert,.-fe_invert +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl curve25519 +.type curve25519,@function +.align 4 +curve25519: +#else +.section __TEXT,__text +.globl _curve25519 +.p2align 2 +_curve25519: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *curve25519_p(%rip) +#else + jmpq *_curve25519_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size curve25519,.-curve25519 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_pow22523 +.type fe_pow22523,@function +.align 4 +fe_pow22523: +#else +.section __TEXT,__text +.globl _fe_pow22523 +.p2align 2 +_fe_pow22523: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_pow22523_p(%rip) +#else + jmpq *_fe_pow22523_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_pow22523,.-fe_pow22523 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_ge_to_p2 +.type fe_ge_to_p2,@function +.align 4 +fe_ge_to_p2: +#else +.section __TEXT,__text +.globl _fe_ge_to_p2 +.p2align 2 +_fe_ge_to_p2: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_ge_to_p2_p(%rip) +#else + jmpq *_fe_ge_to_p2_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_ge_to_p2,.-fe_ge_to_p2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_ge_to_p3 +.type fe_ge_to_p3,@function +.align 4 +fe_ge_to_p3: +#else +.section __TEXT,__text +.globl _fe_ge_to_p3 
+.p2align 2 +_fe_ge_to_p3: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_ge_to_p3_p(%rip) +#else + jmpq *_fe_ge_to_p3_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_ge_to_p3,.-fe_ge_to_p3 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_ge_dbl +.type fe_ge_dbl,@function +.align 4 +fe_ge_dbl: +#else +.section __TEXT,__text +.globl _fe_ge_dbl +.p2align 2 +_fe_ge_dbl: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_ge_dbl_p(%rip) +#else + jmpq *_fe_ge_dbl_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_ge_dbl,.-fe_ge_dbl +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_ge_madd +.type fe_ge_madd,@function +.align 4 +fe_ge_madd: +#else +.section __TEXT,__text +.globl _fe_ge_madd +.p2align 2 +_fe_ge_madd: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_ge_madd_p(%rip) +#else + jmpq *_fe_ge_madd_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_ge_madd,.-fe_ge_madd +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_ge_msub +.type fe_ge_msub,@function +.align 4 +fe_ge_msub: +#else +.section __TEXT,__text +.globl _fe_ge_msub +.p2align 2 +_fe_ge_msub: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_ge_msub_p(%rip) +#else + jmpq *_fe_ge_msub_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_ge_msub,.-fe_ge_msub +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_ge_add +.type fe_ge_add,@function +.align 4 +fe_ge_add: +#else +.section __TEXT,__text +.globl _fe_ge_add +.p2align 2 +_fe_ge_add: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_ge_add_p(%rip) +#else + jmpq *_fe_ge_add_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_ge_add,.-fe_ge_add +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_ge_sub +.type fe_ge_sub,@function +.align 4 +fe_ge_sub: +#else +.section __TEXT,__text +.globl _fe_ge_sub +.p2align 2 +_fe_ge_sub: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_ge_sub_p(%rip) +#else + jmpq *_fe_ge_sub_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_ge_sub,.-fe_ge_sub +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type cpuFlagsSet, @object +.size cpuFlagsSet,4 +cpuFlagsSet: + .long 0 +#else +.section __DATA,__data +.p2align 2 +_cpuFlagsSet: + .long 0 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type intelFlags, @object +.size intelFlags,4 +intelFlags: + .long 0 +#else +.section __DATA,__data +.p2align 2 +_intelFlags: + .long 0 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type fe_mul_p, @object +.size fe_mul_p,8 +fe_mul_p: + .quad fe_mul_x64 +#else +.section __DATA,__data +.p2align 2 +_fe_mul_p: + .quad _fe_mul_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type fe_sq_p, @object +.size fe_sq_p,8 +fe_sq_p: + .quad fe_sq_x64 +#else +.section __DATA,__data +.p2align 2 +_fe_sq_p: + .quad _fe_sq_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type fe_mul121666_p, @object +.size fe_mul121666_p,8 +fe_mul121666_p: + .quad fe_mul121666_x64 +#else +.section __DATA,__data +.p2align 2 +_fe_mul121666_p: + .quad _fe_mul121666_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type fe_sq2_p, @object +.size fe_sq2_p,8 +fe_sq2_p: + .quad fe_sq2_x64 +#else +.section __DATA,__data +.p2align 2 +_fe_sq2_p: + .quad _fe_sq2_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type fe_invert_p, @object +.size fe_invert_p,8 +fe_invert_p: + .quad fe_invert_x64 +#else +.section __DATA,__data +.p2align 2 +_fe_invert_p: + .quad _fe_invert_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type curve25519_p, 
@object +.size curve25519_p,8 +curve25519_p: + .quad curve25519_x64 +#else +.section __DATA,__data +.p2align 2 +_curve25519_p: + .quad _curve25519_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type fe_pow22523_p, @object +.size fe_pow22523_p,8 +fe_pow22523_p: + .quad fe_pow22523_x64 +#else +.section __DATA,__data +.p2align 2 +_fe_pow22523_p: + .quad _fe_pow22523_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type fe_ge_to_p2_p, @object +.size fe_ge_to_p2_p,8 +fe_ge_to_p2_p: + .quad fe_ge_to_p2_x64 +#else +.section __DATA,__data +.p2align 2 +_fe_ge_to_p2_p: + .quad _fe_ge_to_p2_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type fe_ge_to_p3_p, @object +.size fe_ge_to_p3_p,8 +fe_ge_to_p3_p: + .quad fe_ge_to_p3_x64 +#else +.section __DATA,__data +.p2align 2 +_fe_ge_to_p3_p: + .quad _fe_ge_to_p3_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type fe_ge_dbl_p, @object +.size fe_ge_dbl_p,8 +fe_ge_dbl_p: + .quad fe_ge_dbl_x64 +#else +.section __DATA,__data +.p2align 2 +_fe_ge_dbl_p: + .quad _fe_ge_dbl_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type fe_ge_madd_p, @object +.size fe_ge_madd_p,8 +fe_ge_madd_p: + .quad fe_ge_madd_x64 +#else +.section __DATA,__data +.p2align 2 +_fe_ge_madd_p: + .quad _fe_ge_madd_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type fe_ge_msub_p, @object +.size fe_ge_msub_p,8 +fe_ge_msub_p: + .quad fe_ge_msub_x64 +#else +.section __DATA,__data +.p2align 2 +_fe_ge_msub_p: + .quad _fe_ge_msub_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type fe_ge_add_p, @object +.size fe_ge_add_p,8 +fe_ge_add_p: + .quad fe_ge_add_x64 +#else +.section __DATA,__data +.p2align 2 +_fe_ge_add_p: + .quad _fe_ge_add_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type fe_ge_sub_p, @object +.size fe_ge_sub_p,8 +fe_ge_sub_p: + .quad fe_ge_sub_x64 +#else +.section __DATA,__data +.p2align 2 +_fe_ge_sub_p: + .quad _fe_ge_sub_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_mul_x64 +.type fe_mul_x64,@function +.align 4 +fe_mul_x64: +#else +.section __TEXT,__text +.globl _fe_mul_x64 +.p2align 2 +_fe_mul_x64: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + movq %rdx, %rcx + # Multiply + # A[0] * B[0] + movq (%rcx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rcx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rcx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rcx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rcx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rcx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rcx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rcx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, 
%r14 + # A[3] * B[1] + movq 8(%rcx), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rcx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rbx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rbx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rbx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rbx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size fe_mul_x64,.-fe_mul_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_sq_x64 +.type fe_sq_x64,@function +.align 4 +fe_sq_x64: +#else +.section __TEXT,__text +.globl _fe_sq_x64 +.p2align 2 +_fe_sq_x64: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + # Square + # A[0] * A[1] + movq (%rsi), %rax + mulq 8(%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * A[2] + movq (%rsi), %rax + mulq 16(%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[0] * A[3] + movq (%rsi), %rax + mulq 24(%rsi) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * A[2] + movq 8(%rsi), %rax + mulq 16(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * A[3] + movq 8(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + # A[2] * A[3] + movq 16(%rsi), %rax + mulq 24(%rsi) + xorq %r13, %r13 + addq %rax, %r12 + adcq %rdx, %r13 + # Double + xorq %r14, %r14 + addq %r8, %r8 + adcq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq $0x00, %r14 + # A[0] * A[0] + movq (%rsi), %rax + mulq %rax + movq %rax, %rcx + movq %rdx, %r15 + # A[1] * A[1] + movq 8(%rsi), %rax + mulq %rax + addq %r15, %r8 + adcq %rax, %r9 + adcq $0x00, %rdx + movq %rdx, %r15 + # A[2] * A[2] + movq 16(%rsi), %rax + mulq %rax + addq %r15, %r10 + adcq %rax, %r11 + adcq $0x00, %rdx + movq %rdx, %r15 + # A[3] * A[3] + movq 24(%rsi), %rax + mulq %rax + addq %rax, %r13 + adcq %rdx, %r14 + addq %r15, %r12 + adcq $0x00, %r13 + adcq $0x00, %r14 + # Reduce + movq $0x7fffffffffffffff, %r15 + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + shldq $0x01, %r10, %r11 + andq %r15, %r10 + # Multiply top half by 19 + movq $19, %rax + mulq %r11 + xorq %r11, %r11 + addq %rax, %rcx + movq $19, %rax + adcq %rdx, %r11 + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + 
movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + # Add remaining product results in + addq %r11, %r8 + adcq %r12, %r9 + adcq %r13, %r10 + adcq %rax, %r10 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r10, %rdx + imulq $19, %rdx, %rax + andq %r15, %r10 + addq %rax, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + # Reduce if top bit set + movq %r10, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %r15, %r10 + addq %rax, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + # Store + movq %rcx, (%rdi) + movq %r8, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size fe_sq_x64,.-fe_sq_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_sq_n_x64 +.type fe_sq_n_x64,@function +.align 4 +fe_sq_n_x64: +#else +.section __TEXT,__text +.globl _fe_sq_n_x64 +.p2align 2 +_fe_sq_n_x64: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + movq %rdx, %rcx +L_fe_sq_n_x64: + # Square + # A[0] * A[1] + movq (%rsi), %rax + mulq 8(%rsi) + movq %rax, %r9 + movq %rdx, %r10 + # A[0] * A[2] + movq (%rsi), %rax + mulq 16(%rsi) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[0] * A[3] + movq (%rsi), %rax + mulq 24(%rsi) + xorq %r12, %r12 + addq %rax, %r11 + adcq %rdx, %r12 + # A[1] * A[2] + movq 8(%rsi), %rax + mulq 16(%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * A[3] + movq 8(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + # A[2] * A[3] + movq 16(%rsi), %rax + mulq 24(%rsi) + xorq %r14, %r14 + addq %rax, %r13 + adcq %rdx, %r14 + # Double + xorq %r15, %r15 + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq $0x00, %r15 + # A[0] * A[0] + movq (%rsi), %rax + mulq %rax + movq %rax, %r8 + movq %rdx, %rbx + # A[1] * A[1] + movq 8(%rsi), %rax + mulq %rax + addq %rbx, %r9 + adcq %rax, %r10 + adcq $0x00, %rdx + movq %rdx, %rbx + # A[2] * A[2] + movq 16(%rsi), %rax + mulq %rax + addq %rbx, %r11 + adcq %rax, %r12 + adcq $0x00, %rdx + movq %rdx, %rbx + # A[3] * A[3] + movq 24(%rsi), %rax + mulq %rax + addq %rax, %r14 + adcq %rdx, %r15 + addq %rbx, %r13 + adcq $0x00, %r14 + adcq $0x00, %r15 + # Reduce + movq $0x7fffffffffffffff, %rbx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rbx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rbx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rbx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + decb %cl + jnz L_fe_sq_n_x64 + popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef 
__APPLE__ +.size fe_sq_n_x64,.-fe_sq_n_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_mul121666_x64 +.type fe_mul121666_x64,@function +.align 4 +fe_mul121666_x64: +#else +.section __TEXT,__text +.globl _fe_mul121666_x64 +.p2align 2 +_fe_mul121666_x64: +#endif /* __APPLE__ */ + pushq %r12 + # Multiply by 121666 + movq $0x1db42, %rax + mulq (%rsi) + xorq %r10, %r10 + movq %rax, %r8 + movq %rdx, %r9 + movq $0x1db42, %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + movq $0x1db42, %rax + mulq 16(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + movq $0x1db42, %rax + mulq 24(%rsi) + movq $0x7fffffffffffffff, %rcx + addq %rax, %r11 + adcq %rdx, %r12 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + movq $19, %rax + mulq %r12 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + popq %r12 + repz retq +#ifndef __APPLE__ +.size fe_mul121666_x64,.-fe_mul121666_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_sq2_x64 +.type fe_sq2_x64,@function +.align 4 +fe_sq2_x64: +#else +.section __TEXT,__text +.globl _fe_sq2_x64 +.p2align 2 +_fe_sq2_x64: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + # Square * 2 + # A[0] * A[1] + movq (%rsi), %rax + mulq 8(%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * A[2] + movq (%rsi), %rax + mulq 16(%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[0] * A[3] + movq (%rsi), %rax + mulq 24(%rsi) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * A[2] + movq 8(%rsi), %rax + mulq 16(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * A[3] + movq 8(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + # A[2] * A[3] + movq 16(%rsi), %rax + mulq 24(%rsi) + xorq %r13, %r13 + addq %rax, %r12 + adcq %rdx, %r13 + # Double + xorq %r14, %r14 + addq %r8, %r8 + adcq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq $0x00, %r14 + # A[0] * A[0] + movq (%rsi), %rax + mulq %rax + movq %rax, %rcx + movq %rdx, %r15 + # A[1] * A[1] + movq 8(%rsi), %rax + mulq %rax + addq %r15, %r8 + adcq %rax, %r9 + adcq $0x00, %rdx + movq %rdx, %r15 + # A[2] * A[2] + movq 16(%rsi), %rax + mulq %rax + addq %r15, %r10 + adcq %rax, %r11 + adcq $0x00, %rdx + movq %rdx, %r15 + # A[3] * A[3] + movq 24(%rsi), %rax + mulq %rax + addq %rax, %r13 + adcq %rdx, %r14 + addq %r15, %r12 + adcq $0x00, %r13 + adcq $0x00, %r14 + # Reduce + movq $0x7fffffffffffffff, %rbx + xorq %rax, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $3, %r14, %rax + shldq $2, %r13, %r14 + shldq $2, %r12, %r13 + shldq $2, %r11, %r12 + shldq $2, %r10, %r11 + shldq $0x01, %r9, %r10 + shldq $0x01, %r8, %r9 + shldq $0x01, %rcx, %r8 + shlq $0x01, %rcx + andq %rbx, %r10 + # Two out left, one in right + andq %rbx, %r14 + # Multiply top bits by 19*19 + imulq $0x169, %rax, %r15 + # Multiply top half by 19 + movq $19, %rax + mulq %r11 + xorq %r11, %r11 + addq %rax, %rcx + movq $19, %rax + adcq %rdx, %r11 + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + # Add remaining produce results in + addq %r15, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + adcq %r13, %r10 + adcq %rax, %r10 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r10, %rdx + imulq $19, %rdx, %rax + andq %rbx, %r10 + addq 
%rax, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + # Reduce if top bit set + movq %r10, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rbx, %r10 + addq %rax, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + # Store + movq %rcx, (%rdi) + movq %r8, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size fe_sq2_x64,.-fe_sq2_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_invert_x64 +.type fe_invert_x64,@function +.align 4 +fe_invert_x64: +#else +.section __TEXT,__text +.globl _fe_invert_x64 +.p2align 2 +_fe_invert_x64: +#endif /* __APPLE__ */ + subq $0x90, %rsp + # Invert + movq %rdi, 128(%rsp) + movq %rsi, 136(%rsp) + movq %rsp, %rdi + movq 136(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + movq 136(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + movq %rsp, %rdi + movq %rsp, %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + movq $4, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 64(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + movq $9, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $19, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + movq $9, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 64(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* 
__APPLE__ */ + leaq 64(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + movq $49, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $0x63, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + movq $49, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 64(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + movq $4, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + movq 128(%rsp), %rdi + leaq 32(%rsp), %rsi + movq %rsp, %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + addq $0x90, %rsp + repz retq +#ifndef __APPLE__ +.text +.globl curve25519_x64 +.type curve25519_x64,@function +.align 4 +curve25519_x64: +#else +.section __TEXT,__text +.globl _curve25519_x64 +.p2align 2 +_curve25519_x64: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + pushq %rbp + movq %rdx, %r8 + subq $0xb8, %rsp + xorq %rbx, %rbx + movq %rdi, 176(%rsp) + # Set one + movq $0x01, (%rdi) + movq $0x00, 8(%rdi) + movq $0x00, 16(%rdi) + movq $0x00, 24(%rdi) + # Set zero + movq $0x00, (%rsp) + movq $0x00, 8(%rsp) + movq $0x00, 16(%rsp) + movq $0x00, 24(%rsp) + # Set one + movq $0x01, 32(%rsp) + movq $0x00, 40(%rsp) + movq $0x00, 48(%rsp) + movq $0x00, 56(%rsp) + # Copy + movq (%r8), %rcx + movq 8(%r8), %r9 + movq 16(%r8), %r10 + movq 24(%r8), %r11 + movq %rcx, 64(%rsp) + movq %r9, 72(%rsp) + movq %r10, 80(%rsp) + movq %r11, 88(%rsp) + movb $62, 168(%rsp) + movq $3, 160(%rsp) +L_curve25519_x64_words: +L_curve25519_x64_bits: + movq 160(%rsp), %r9 + movb 168(%rsp), %cl + movq (%rsi,%r9,8), %rbp + shrq %cl, %rbp + andq $0x01, %rbp + xorq %rbp, %rbx + negq %rbx + # Conditional Swap + movq (%rdi), %rcx + movq 8(%rdi), %r9 + movq 16(%rdi), %r10 + movq 24(%rdi), %r11 + xorq 64(%rsp), %rcx + xorq 72(%rsp), %r9 + xorq 80(%rsp), %r10 + xorq 88(%rsp), %r11 + andq %rbx, %rcx + andq %rbx, %r9 + andq %rbx, %r10 + andq %rbx, %r11 + xorq %rcx, (%rdi) + xorq %r9, 8(%rdi) + xorq %r10, 16(%rdi) + xorq %r11, 24(%rdi) + xorq %rcx, 64(%rsp) + xorq %r9, 72(%rsp) + xorq %r10, 80(%rsp) + xorq %r11, 88(%rsp) + # Conditional Swap + movq (%rsp), %rcx + movq 8(%rsp), %r9 + movq 16(%rsp), %r10 + movq 24(%rsp), %r11 + xorq 32(%rsp), %rcx + xorq 40(%rsp), %r9 + 
xorq 48(%rsp), %r10 + xorq 56(%rsp), %r11 + andq %rbx, %rcx + andq %rbx, %r9 + andq %rbx, %r10 + andq %rbx, %r11 + xorq %rcx, (%rsp) + xorq %r9, 8(%rsp) + xorq %r10, 16(%rsp) + xorq %r11, 24(%rsp) + xorq %rcx, 32(%rsp) + xorq %r9, 40(%rsp) + xorq %r10, 48(%rsp) + xorq %r11, 56(%rsp) + movq %rbp, %rbx + # Add + movq (%rdi), %rcx + movq 8(%rdi), %r9 + movq 16(%rdi), %r10 + movq 24(%rdi), %rbp + movq %rcx, %r12 + addq (%rsp), %rcx + movq %r9, %r13 + adcq 8(%rsp), %r9 + movq %r10, %r14 + adcq 16(%rsp), %r10 + movq %rbp, %r15 + adcq 24(%rsp), %rbp + movq $-19, %rax + movq %rbp, %r11 + movq $0x7fffffffffffffff, %rdx + sarq $63, %rbp + # Mask the modulus + andq %rbp, %rax + andq %rbp, %rdx + # Sub modulus (if overflow) + subq %rax, %rcx + sbbq %rbp, %r9 + sbbq %rbp, %r10 + sbbq %rdx, %r11 + # Sub + subq (%rsp), %r12 + movq $0x00, %rbp + sbbq 8(%rsp), %r13 + movq $-19, %rax + sbbq 16(%rsp), %r14 + movq $0x7fffffffffffffff, %rdx + sbbq 24(%rsp), %r15 + sbbq $0x00, %rbp + # Mask the modulus + andq %rbp, %rax + andq %rbp, %rdx + # Add modulus (if underflow) + addq %rax, %r12 + adcq %rbp, %r13 + adcq %rbp, %r14 + adcq %rdx, %r15 + movq %rcx, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq %r12, 128(%rsp) + movq %r13, 136(%rsp) + movq %r14, 144(%rsp) + movq %r15, 152(%rsp) + # Add + movq 64(%rsp), %rcx + movq 72(%rsp), %r9 + movq 80(%rsp), %r10 + movq 88(%rsp), %rbp + movq %rcx, %r12 + addq 32(%rsp), %rcx + movq %r9, %r13 + adcq 40(%rsp), %r9 + movq %r10, %r14 + adcq 48(%rsp), %r10 + movq %rbp, %r15 + adcq 56(%rsp), %rbp + movq $-19, %rax + movq %rbp, %r11 + movq $0x7fffffffffffffff, %rdx + sarq $63, %rbp + # Mask the modulus + andq %rbp, %rax + andq %rbp, %rdx + # Sub modulus (if overflow) + subq %rax, %rcx + sbbq %rbp, %r9 + sbbq %rbp, %r10 + sbbq %rdx, %r11 + # Sub + subq 32(%rsp), %r12 + movq $0x00, %rbp + sbbq 40(%rsp), %r13 + movq $-19, %rax + sbbq 48(%rsp), %r14 + movq $0x7fffffffffffffff, %rdx + sbbq 56(%rsp), %r15 + sbbq $0x00, %rbp + # Mask the modulus + andq %rbp, %rax + andq %rbp, %rdx + # Add modulus (if underflow) + addq %rax, %r12 + adcq %rbp, %r13 + adcq %rbp, %r14 + adcq %rdx, %r15 + movq %rcx, (%rsp) + movq %r9, 8(%rsp) + movq %r10, 16(%rsp) + movq %r11, 24(%rsp) + movq %r12, 96(%rsp) + movq %r13, 104(%rsp) + movq %r14, 112(%rsp) + movq %r15, 120(%rsp) + # Multiply + # A[0] * B[0] + movq (%rdi), %rax + mulq 96(%rsp) + movq %rax, %rcx + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rdi), %rax + mulq 96(%rsp) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rdi), %rax + mulq 104(%rsp) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rdi), %rax + mulq 96(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rdi), %rax + mulq 104(%rsp) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rdi), %rax + mulq 112(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rdi), %rax + mulq 96(%rsp) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rdi), %rax + mulq 104(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rdi), %rax + mulq 112(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rdi), %rax + mulq 120(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rdi), %rax + mulq 104(%rsp) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq 
$0x00, %r14 + # A[2] * B[2] + movq 16(%rdi), %rax + mulq 112(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rdi), %rax + mulq 120(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rdi), %rax + mulq 112(%rsp) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rdi), %rax + mulq 120(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rdi), %rax + mulq 120(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rbp + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rbp, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %rcx + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rbp, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rbp, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %rcx, 32(%rsp) + movq %r9, 40(%rsp) + movq %r10, 48(%rsp) + movq %r11, 56(%rsp) + # Multiply + # A[0] * B[0] + movq 128(%rsp), %rax + mulq (%rsp) + movq %rax, %rcx + movq %rdx, %r9 + # A[0] * B[1] + movq 136(%rsp), %rax + mulq (%rsp) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq 128(%rsp), %rax + mulq 8(%rsp) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 144(%rsp), %rax + mulq (%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 136(%rsp), %rax + mulq 8(%rsp) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq 128(%rsp), %rax + mulq 16(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 152(%rsp), %rax + mulq (%rsp) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 144(%rsp), %rax + mulq 8(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 136(%rsp), %rax + mulq 16(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq 128(%rsp), %rax + mulq 24(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 152(%rsp), %rax + mulq 8(%rsp) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 144(%rsp), %rax + mulq 16(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 136(%rsp), %rax + mulq 24(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 152(%rsp), %rax + mulq 16(%rsp) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 144(%rsp), %rax + mulq 24(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 152(%rsp), %rax + mulq 24(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rbp + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, 
%r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rbp, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %rcx + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rbp, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rbp, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %rcx, (%rsp) + movq %r9, 8(%rsp) + movq %r10, 16(%rsp) + movq %r11, 24(%rsp) + # Square + # A[0] * A[1] + movq 128(%rsp), %rax + mulq 136(%rsp) + movq %rax, %r9 + movq %rdx, %r10 + # A[0] * A[2] + movq 128(%rsp), %rax + mulq 144(%rsp) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[0] * A[3] + movq 128(%rsp), %rax + mulq 152(%rsp) + xorq %r12, %r12 + addq %rax, %r11 + adcq %rdx, %r12 + # A[1] * A[2] + movq 136(%rsp), %rax + mulq 144(%rsp) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * A[3] + movq 136(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + # A[2] * A[3] + movq 144(%rsp), %rax + mulq 152(%rsp) + xorq %r14, %r14 + addq %rax, %r13 + adcq %rdx, %r14 + # Double + xorq %r15, %r15 + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq $0x00, %r15 + # A[0] * A[0] + movq 128(%rsp), %rax + mulq %rax + movq %rax, %rcx + movq %rdx, %rbp + # A[1] * A[1] + movq 136(%rsp), %rax + mulq %rax + addq %rbp, %r9 + adcq %rax, %r10 + adcq $0x00, %rdx + movq %rdx, %rbp + # A[2] * A[2] + movq 144(%rsp), %rax + mulq %rax + addq %rbp, %r11 + adcq %rax, %r12 + adcq $0x00, %rdx + movq %rdx, %rbp + # A[3] * A[3] + movq 152(%rsp), %rax + mulq %rax + addq %rax, %r14 + adcq %rdx, %r15 + addq %rbp, %r13 + adcq $0x00, %r14 + adcq $0x00, %r15 + # Reduce + movq $0x7fffffffffffffff, %rbp + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rbp, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %rcx + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rbp, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rbp, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %rcx, 96(%rsp) + movq %r9, 104(%rsp) + movq %r10, 112(%rsp) + movq %r11, 120(%rsp) + # Square + # A[0] * A[1] + movq (%rdi), %rax + mulq 8(%rdi) + movq %rax, %r9 + movq %rdx, %r10 + # A[0] * A[2] + movq (%rdi), %rax + mulq 16(%rdi) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[0] * A[3] + movq 
(%rdi), %rax + mulq 24(%rdi) + xorq %r12, %r12 + addq %rax, %r11 + adcq %rdx, %r12 + # A[1] * A[2] + movq 8(%rdi), %rax + mulq 16(%rdi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * A[3] + movq 8(%rdi), %rax + mulq 24(%rdi) + addq %rax, %r12 + adcq %rdx, %r13 + # A[2] * A[3] + movq 16(%rdi), %rax + mulq 24(%rdi) + xorq %r14, %r14 + addq %rax, %r13 + adcq %rdx, %r14 + # Double + xorq %r15, %r15 + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq $0x00, %r15 + # A[0] * A[0] + movq (%rdi), %rax + mulq %rax + movq %rax, %rcx + movq %rdx, %rbp + # A[1] * A[1] + movq 8(%rdi), %rax + mulq %rax + addq %rbp, %r9 + adcq %rax, %r10 + adcq $0x00, %rdx + movq %rdx, %rbp + # A[2] * A[2] + movq 16(%rdi), %rax + mulq %rax + addq %rbp, %r11 + adcq %rax, %r12 + adcq $0x00, %rdx + movq %rdx, %rbp + # A[3] * A[3] + movq 24(%rdi), %rax + mulq %rax + addq %rax, %r14 + adcq %rdx, %r15 + addq %rbp, %r13 + adcq $0x00, %r14 + adcq $0x00, %r15 + # Reduce + movq $0x7fffffffffffffff, %rbp + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rbp, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %rcx + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rbp, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rbp, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %rcx, 128(%rsp) + movq %r9, 136(%rsp) + movq %r10, 144(%rsp) + movq %r11, 152(%rsp) + # Add + movq 32(%rsp), %rcx + movq 40(%rsp), %r9 + movq 48(%rsp), %r10 + movq 56(%rsp), %rbp + movq %rcx, %r12 + addq (%rsp), %rcx + movq %r9, %r13 + adcq 8(%rsp), %r9 + movq %r10, %r14 + adcq 16(%rsp), %r10 + movq %rbp, %r15 + adcq 24(%rsp), %rbp + movq $-19, %rax + movq %rbp, %r11 + movq $0x7fffffffffffffff, %rdx + sarq $63, %rbp + # Mask the modulus + andq %rbp, %rax + andq %rbp, %rdx + # Sub modulus (if overflow) + subq %rax, %rcx + sbbq %rbp, %r9 + sbbq %rbp, %r10 + sbbq %rdx, %r11 + # Sub + subq (%rsp), %r12 + movq $0x00, %rbp + sbbq 8(%rsp), %r13 + movq $-19, %rax + sbbq 16(%rsp), %r14 + movq $0x7fffffffffffffff, %rdx + sbbq 24(%rsp), %r15 + sbbq $0x00, %rbp + # Mask the modulus + andq %rbp, %rax + andq %rbp, %rdx + # Add modulus (if underflow) + addq %rax, %r12 + adcq %rbp, %r13 + adcq %rbp, %r14 + adcq %rdx, %r15 + movq %rcx, 64(%rsp) + movq %r9, 72(%rsp) + movq %r10, 80(%rsp) + movq %r11, 88(%rsp) + movq %r12, (%rsp) + movq %r13, 8(%rsp) + movq %r14, 16(%rsp) + movq %r15, 24(%rsp) + # Multiply + # A[0] * B[0] + movq 96(%rsp), %rax + mulq 128(%rsp) + movq %rax, %rcx + movq %rdx, %r9 + # A[0] * B[1] + movq 104(%rsp), %rax + mulq 128(%rsp) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq 96(%rsp), %rax + mulq 136(%rsp) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 112(%rsp), %rax + mulq 128(%rsp) + addq %rax, %r10 + adcq %rdx, 
%r11 + # A[1] * B[1] + movq 104(%rsp), %rax + mulq 136(%rsp) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq 96(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 120(%rsp), %rax + mulq 128(%rsp) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 112(%rsp), %rax + mulq 136(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 104(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq 96(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 120(%rsp), %rax + mulq 136(%rsp) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 112(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 104(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 120(%rsp), %rax + mulq 144(%rsp) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 112(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 120(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rbp + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rbp, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %rcx + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rbp, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rbp, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %rcx, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + # Sub + movq 128(%rsp), %rcx + movq 136(%rsp), %r9 + movq 144(%rsp), %r10 + movq 152(%rsp), %r11 + subq 96(%rsp), %rcx + movq $0x00, %rbp + sbbq 104(%rsp), %r9 + movq $-19, %rax + sbbq 112(%rsp), %r10 + movq $0x7fffffffffffffff, %rdx + sbbq 120(%rsp), %r11 + sbbq $0x00, %rbp + # Mask the modulus + andq %rbp, %rax + andq %rbp, %rdx + # Add modulus (if underflow) + addq %rax, %rcx + adcq %rbp, %r9 + adcq %rbp, %r10 + adcq %rdx, %r11 + movq %rcx, 128(%rsp) + movq %r9, 136(%rsp) + movq %r10, 144(%rsp) + movq %r11, 152(%rsp) + # Square + # A[0] * A[1] + movq (%rsp), %rax + mulq 8(%rsp) + movq %rax, %r9 + movq %rdx, %r10 + # A[0] * A[2] + movq (%rsp), %rax + mulq 16(%rsp) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[0] * A[3] + movq (%rsp), %rax + mulq 24(%rsp) + xorq %r12, %r12 + addq %rax, %r11 + adcq %rdx, %r12 + # A[1] * A[2] + movq 8(%rsp), %rax + mulq 16(%rsp) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * A[3] + movq 8(%rsp), %rax + mulq 24(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + # A[2] * A[3] + movq 16(%rsp), 
%rax + mulq 24(%rsp) + xorq %r14, %r14 + addq %rax, %r13 + adcq %rdx, %r14 + # Double + xorq %r15, %r15 + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq $0x00, %r15 + # A[0] * A[0] + movq (%rsp), %rax + mulq %rax + movq %rax, %rcx + movq %rdx, %rbp + # A[1] * A[1] + movq 8(%rsp), %rax + mulq %rax + addq %rbp, %r9 + adcq %rax, %r10 + adcq $0x00, %rdx + movq %rdx, %rbp + # A[2] * A[2] + movq 16(%rsp), %rax + mulq %rax + addq %rbp, %r11 + adcq %rax, %r12 + adcq $0x00, %rdx + movq %rdx, %rbp + # A[3] * A[3] + movq 24(%rsp), %rax + mulq %rax + addq %rax, %r14 + adcq %rdx, %r15 + addq %rbp, %r13 + adcq $0x00, %r14 + adcq $0x00, %r15 + # Reduce + movq $0x7fffffffffffffff, %rbp + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rbp, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %rcx + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rbp, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rbp, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %rcx, (%rsp) + movq %r9, 8(%rsp) + movq %r10, 16(%rsp) + movq %r11, 24(%rsp) + # Multiply by 121666 + movq $0x1db42, %rax + mulq 128(%rsp) + xorq %r10, %r10 + movq %rax, %rcx + movq %rdx, %r9 + movq $0x1db42, %rax + mulq 136(%rsp) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + movq $0x1db42, %rax + mulq 144(%rsp) + xorq %r13, %r13 + addq %rax, %r10 + adcq %rdx, %r11 + movq $0x1db42, %rax + mulq 152(%rsp) + movq $0x7fffffffffffffff, %r12 + addq %rax, %r11 + adcq %rdx, %r13 + shldq $0x01, %r11, %r13 + andq %r12, %r11 + movq $19, %rax + mulq %r13 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + movq %rcx, 32(%rsp) + movq %r9, 40(%rsp) + movq %r10, 48(%rsp) + movq %r11, 56(%rsp) + # Square + # A[0] * A[1] + movq 64(%rsp), %rax + mulq 72(%rsp) + movq %rax, %r9 + movq %rdx, %r10 + # A[0] * A[2] + movq 64(%rsp), %rax + mulq 80(%rsp) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[0] * A[3] + movq 64(%rsp), %rax + mulq 88(%rsp) + xorq %r12, %r12 + addq %rax, %r11 + adcq %rdx, %r12 + # A[1] * A[2] + movq 72(%rsp), %rax + mulq 80(%rsp) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * A[3] + movq 72(%rsp), %rax + mulq 88(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + # A[2] * A[3] + movq 80(%rsp), %rax + mulq 88(%rsp) + xorq %r14, %r14 + addq %rax, %r13 + adcq %rdx, %r14 + # Double + xorq %r15, %r15 + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq $0x00, %r15 + # A[0] * A[0] + movq 64(%rsp), %rax + mulq %rax + movq %rax, %rcx + movq %rdx, %rbp + # A[1] * A[1] + movq 72(%rsp), %rax + mulq %rax + addq %rbp, %r9 + adcq %rax, %r10 + adcq $0x00, %rdx + movq %rdx, %rbp + # A[2] * A[2] + movq 80(%rsp), %rax + mulq %rax + addq %rbp, %r11 + adcq %rax, %r12 + adcq $0x00, %rdx + movq %rdx, %rbp + # 
A[3] * A[3] + movq 88(%rsp), %rax + mulq %rax + addq %rax, %r14 + adcq %rdx, %r15 + addq %rbp, %r13 + adcq $0x00, %r14 + adcq $0x00, %r15 + # Reduce + movq $0x7fffffffffffffff, %rbp + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rbp, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %rcx + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rbp, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rbp, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %rcx, 64(%rsp) + movq %r9, 72(%rsp) + movq %r10, 80(%rsp) + movq %r11, 88(%rsp) + # Add + movq 96(%rsp), %rcx + movq 104(%rsp), %r9 + addq 32(%rsp), %rcx + movq 112(%rsp), %r10 + adcq 40(%rsp), %r9 + movq 120(%rsp), %rbp + adcq 48(%rsp), %r10 + movq $-19, %rax + adcq 56(%rsp), %rbp + movq $0x7fffffffffffffff, %rdx + movq %rbp, %r11 + sarq $63, %rbp + # Mask the modulus + andq %rbp, %rax + andq %rbp, %rdx + # Sub modulus (if overflow) + subq %rax, %rcx + sbbq %rbp, %r9 + sbbq %rbp, %r10 + sbbq %rdx, %r11 + movq %rcx, 96(%rsp) + movq %r9, 104(%rsp) + movq %r10, 112(%rsp) + movq %r11, 120(%rsp) + # Multiply + # A[0] * B[0] + movq (%rsp), %rax + mulq (%r8) + movq %rax, %rcx + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rsp), %rax + mulq (%r8) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rsp), %rax + mulq 8(%r8) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rsp), %rax + mulq (%r8) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rsp), %rax + mulq 8(%r8) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rsp), %rax + mulq 16(%r8) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rsp), %rax + mulq (%r8) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rsp), %rax + mulq 8(%r8) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rsp), %rax + mulq 16(%r8) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rsp), %rax + mulq 24(%r8) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rsp), %rax + mulq 8(%r8) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rsp), %rax + mulq 16(%r8) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rsp), %rax + mulq 24(%r8) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rsp), %rax + mulq 16(%r8) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rsp), %rax + mulq 24(%r8) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rsp), %rax + mulq 24(%r8) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rbp + # Move top half into 
t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rbp, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %rcx + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rbp, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rbp, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %rcx, 32(%rsp) + movq %r9, 40(%rsp) + movq %r10, 48(%rsp) + movq %r11, 56(%rsp) + # Multiply + # A[0] * B[0] + movq 96(%rsp), %rax + mulq 128(%rsp) + movq %rax, %rcx + movq %rdx, %r9 + # A[0] * B[1] + movq 104(%rsp), %rax + mulq 128(%rsp) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq 96(%rsp), %rax + mulq 136(%rsp) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 112(%rsp), %rax + mulq 128(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 104(%rsp), %rax + mulq 136(%rsp) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq 96(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 120(%rsp), %rax + mulq 128(%rsp) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 112(%rsp), %rax + mulq 136(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 104(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq 96(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 120(%rsp), %rax + mulq 136(%rsp) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 112(%rsp), %rax + mulq 144(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 104(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 120(%rsp), %rax + mulq 144(%rsp) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 112(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 120(%rsp), %rax + mulq 152(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rbp + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rbp, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %rcx + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq 
%rbp, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rbp, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %rcx, (%rsp) + movq %r9, 8(%rsp) + movq %r10, 16(%rsp) + movq %r11, 24(%rsp) + decb 168(%rsp) + jge L_curve25519_x64_bits + movq $63, 168(%rsp) + decb 160(%rsp) + jge L_curve25519_x64_words + # Invert + leaq 32(%rsp), %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + movq %rsp, %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + leaq 96(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $4, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $9, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 128(%rsp), %rdi + leaq 96(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 128(%rsp), %rdi + leaq 128(%rsp), %rsi + movq $19, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 128(%rsp), %rsi + leaq 96(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $9, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $49, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq 
_fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 128(%rsp), %rdi + leaq 96(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 128(%rsp), %rdi + leaq 128(%rsp), %rsi + movq $0x63, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 128(%rsp), %rsi + leaq 96(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $49, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + movq $4, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + movq %rsp, %rdi + leaq 64(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + movq 176(%rsp), %rdi + # Multiply + # A[0] * B[0] + movq (%rsp), %rax + mulq (%rdi) + movq %rax, %rcx + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rsp), %rax + mulq (%rdi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rsp), %rax + mulq 8(%rdi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rsp), %rax + mulq (%rdi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rsp), %rax + mulq 8(%rdi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rsp), %rax + mulq 16(%rdi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rsp), %rax + mulq (%rdi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rsp), %rax + mulq 8(%rdi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rsp), %rax + mulq 16(%rdi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rsp), %rax + mulq 24(%rdi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rsp), %rax + mulq 8(%rdi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rsp), %rax + mulq 16(%rdi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rsp), %rax + mulq 24(%rdi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rsp), %rax + mulq 16(%rdi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rsp), %rax + mulq 24(%rdi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rsp), %rax + mulq 24(%rdi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rbp + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rbp, %r11 
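+ # Note: p = 2^255 - 19, so 2^255 = 19 mod p; the top half split out above folds back into the low four limbs with a single multiply by 19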
+ # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %rcx + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rbp, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rbp, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %rcx, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + xorq %rax, %rax + addq $0xb8, %rsp + popq %rbp + popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size curve25519_x64,.-curve25519_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_pow22523_x64 +.type fe_pow22523_x64,@function +.align 4 +fe_pow22523_x64: +#else +.section __TEXT,__text +.globl _fe_pow22523_x64 +.p2align 2 +_fe_pow22523_x64: +#endif /* __APPLE__ */ + subq $0x70, %rsp + # pow22523 + movq %rdi, 96(%rsp) + movq %rsi, 104(%rsp) + movq %rsp, %rdi + movq 104(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + movq 104(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + movq %rsp, %rdi + movq %rsp, %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + movq %rsp, %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + movq %rsp, %rdi + leaq 32(%rsp), %rsi + movq %rsp, %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + movq $4, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + movq %rsp, %rdi + leaq 32(%rsp), %rsi + movq %rsp, %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + movq $9, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + movq %rsp, %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + movq $19, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 64(%rsp), 
%rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + movq $9, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + movq %rsp, %rdi + leaq 32(%rsp), %rsi + movq %rsp, %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + movq $49, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + movq %rsp, %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + movq $0x63, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 64(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + movq $49, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + movq %rsp, %rdi + leaq 32(%rsp), %rsi + movq %rsp, %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + movq %rsp, %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + movq %rsp, %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + movq 96(%rsp), %rdi + movq %rsp, %rsi + movq 104(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + movq 104(%rsp), %rsi + movq 96(%rsp), %rdi + addq $0x70, %rsp + repz retq +#ifndef __APPLE__ +.text +.globl fe_ge_to_p2_x64 +.type fe_ge_to_p2_x64,@function +.align 4 +fe_ge_to_p2_x64: +#else +.section __TEXT,__text +.globl _fe_ge_to_p2_x64 +.p2align 2 +_fe_ge_to_p2_x64: +#endif /* __APPLE__ */ + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $40, %rsp + movq %rsi, (%rsp) + movq %rdx, 8(%rsp) + movq %rcx, 16(%rsp) + movq %r8, 24(%rsp) + movq %r9, 32(%rsp) + movq 16(%rsp), %rsi + movq 88(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + 
addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq (%rsp), %rdi + movq 24(%rsp), %rsi + movq 32(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + 
mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rdi + movq 32(%rsp), %rsi + movq 88(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq 
$19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + addq $40, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + repz retq +#ifndef __APPLE__ +.size fe_ge_to_p2_x64,.-fe_ge_to_p2_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_ge_to_p3_x64 +.type fe_ge_to_p3_x64,@function +.align 4 +fe_ge_to_p3_x64: +#else +.section __TEXT,__text +.globl _fe_ge_to_p3_x64 +.p2align 2 +_fe_ge_to_p3_x64: +#endif /* __APPLE__ */ + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $40, %rsp + movq %rsi, (%rsp) + movq %rdx, 8(%rsp) + movq %rcx, 16(%rsp) + movq %r8, 24(%rsp) + movq %r9, 32(%rsp) + movq 24(%rsp), %rsi + movq 96(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 
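+ # Note: mulq clobbers %rax and %rdx, so 19 is reloaded and the previous high word is saved (with carry) into a spare register before the next multiply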
+ mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq (%rsp), %rdi + movq 32(%rsp), %rsi + movq 88(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq 
%r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rdi + movq 88(%rsp), %rsi + movq 96(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 16(%rsp), %rdi + movq 24(%rsp), %rsi + movq 32(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq 
%rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + addq $40, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + repz retq +#ifndef __APPLE__ +.size fe_ge_to_p3_x64,.-fe_ge_to_p3_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_ge_dbl_x64 +.type fe_ge_dbl_x64,@function +.align 4 +fe_ge_dbl_x64: +#else +.section __TEXT,__text +.globl _fe_ge_dbl_x64 +.p2align 2 +_fe_ge_dbl_x64: +#endif /* __APPLE__ */ + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0x50, %rsp + movq %rdi, (%rsp) + movq %rsi, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rcx, 24(%rsp) + movq %r8, 32(%rsp) + movq %r9, 40(%rsp) + movq (%rsp), %rdi + movq 32(%rsp), %rsi + # Square + # A[0] * A[1] + movq (%rsi), %rax + mulq 8(%rsi) + movq %rax, %r9 + movq %rdx, %r10 + # A[0] * A[2] + movq (%rsi), %rax + mulq 16(%rsi) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[0] * A[3] + movq (%rsi), %rax + mulq 24(%rsi) + xorq %r12, %r12 + addq %rax, %r11 + adcq %rdx, %r12 + # A[1] * A[2] + movq 8(%rsi), %rax + mulq 16(%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * A[3] + movq 8(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + # A[2] * A[3] + movq 16(%rsi), %rax + mulq 24(%rsi) + xorq 
%r14, %r14 + addq %rax, %r13 + adcq %rdx, %r14 + # Double + xorq %r15, %r15 + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq $0x00, %r15 + # A[0] * A[0] + movq (%rsi), %rax + mulq %rax + movq %rax, %r8 + movq %rdx, %rcx + # A[1] * A[1] + movq 8(%rsi), %rax + mulq %rax + addq %rcx, %r9 + adcq %rax, %r10 + adcq $0x00, %rdx + movq %rdx, %rcx + # A[2] * A[2] + movq 16(%rsi), %rax + mulq %rax + addq %rcx, %r11 + adcq %rax, %r12 + adcq $0x00, %rdx + movq %rdx, %rcx + # A[3] * A[3] + movq 24(%rsi), %rax + mulq %rax + addq %rax, %r14 + adcq %rdx, %r15 + addq %rcx, %r13 + adcq $0x00, %r14 + adcq $0x00, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 16(%rsp), %rdi + movq 40(%rsp), %rsi + # Square + # A[0] * A[1] + movq (%rsi), %rax + mulq 8(%rsi) + movq %rax, %r9 + movq %rdx, %r10 + # A[0] * A[2] + movq (%rsi), %rax + mulq 16(%rsi) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[0] * A[3] + movq (%rsi), %rax + mulq 24(%rsi) + xorq %r12, %r12 + addq %rax, %r11 + adcq %rdx, %r12 + # A[1] * A[2] + movq 8(%rsi), %rax + mulq 16(%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * A[3] + movq 8(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + # A[2] * A[3] + movq 16(%rsi), %rax + mulq 24(%rsi) + xorq %r14, %r14 + addq %rax, %r13 + adcq %rdx, %r14 + # Double + xorq %r15, %r15 + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq $0x00, %r15 + # A[0] * A[0] + movq (%rsi), %rax + mulq %rax + movq %rax, %r8 + movq %rdx, %rcx + # A[1] * A[1] + movq 8(%rsi), %rax + mulq %rax + addq %rcx, %r9 + adcq %rax, %r10 + adcq $0x00, %rdx + movq %rdx, %rcx + # A[2] * A[2] + movq 16(%rsi), %rax + mulq %rax + addq %rcx, %r11 + adcq %rax, %r12 + adcq $0x00, %rdx + movq %rdx, %rcx + # A[3] * A[3] + movq 24(%rsi), %rax + mulq %rax + addq %rax, %r14 + adcq %rdx, %r15 + addq %rcx, %r13 + adcq $0x00, %r14 + adcq $0x00, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, 
%r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 24(%rsp), %rdi + movq 128(%rsp), %rsi + # Square * 2 + # A[0] * A[1] + movq (%rsi), %rax + mulq 8(%rsi) + movq %rax, %r9 + movq %rdx, %r10 + # A[0] * A[2] + movq (%rsi), %rax + mulq 16(%rsi) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[0] * A[3] + movq (%rsi), %rax + mulq 24(%rsi) + xorq %r12, %r12 + addq %rax, %r11 + adcq %rdx, %r12 + # A[1] * A[2] + movq 8(%rsi), %rax + mulq 16(%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * A[3] + movq 8(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + # A[2] * A[3] + movq 16(%rsi), %rax + mulq 24(%rsi) + xorq %r14, %r14 + addq %rax, %r13 + adcq %rdx, %r14 + # Double + xorq %r15, %r15 + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq $0x00, %r15 + # A[0] * A[0] + movq (%rsi), %rax + mulq %rax + movq %rax, %r8 + movq %rdx, %rcx + # A[1] * A[1] + movq 8(%rsi), %rax + mulq %rax + addq %rcx, %r9 + adcq %rax, %r10 + adcq $0x00, %rdx + movq %rdx, %rcx + # A[2] * A[2] + movq 16(%rsi), %rax + mulq %rax + addq %rcx, %r11 + adcq %rax, %r12 + adcq $0x00, %rdx + movq %rdx, %rcx + # A[3] * A[3] + movq 24(%rsi), %rax + mulq %rax + addq %rax, %r14 + adcq %rdx, %r15 + addq %rcx, %r13 + adcq $0x00, %r14 + adcq $0x00, %r15 + # Reduce + movq $0x7fffffffffffffff, %rbx + xorq %rax, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $3, %r15, %rax + shldq $2, %r14, %r15 + shldq $2, %r13, %r14 + shldq $2, %r12, %r13 + shldq $2, %r11, %r12 + shldq $0x01, %r10, %r11 + shldq $0x01, %r9, %r10 + shldq $0x01, %r8, %r9 + shlq $0x01, %r8 + andq %rbx, %r11 + # Two out left, one in right + andq %rbx, %r15 + # Multiply top bits by 19*19 + imulq $0x169, %rax, %rcx + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining produce results in + addq %rcx, %r8 + adcq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rbx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rbx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rdi + movq 32(%rsp), %rsi + movq 40(%rsp), %rbx + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + addq (%rbx), %r8 + movq 16(%rsi), %r10 + adcq 8(%rbx), %r9 + movq 24(%rsi), %rcx + adcq 16(%rbx), %r10 + movq $-19, %rax + adcq 24(%rbx), %rcx + movq $0x7fffffffffffffff, %rdx + movq %rcx, %r11 + sarq $63, %rcx + # Mask the modulus + andq %rcx, 
%rax + andq %rcx, %rdx + # Sub modulus (if overflow) + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + leaq 48(%rsp), %rdi + movq 8(%rsp), %rsi + # Square + # A[0] * A[1] + movq (%rsi), %rax + mulq 8(%rsi) + movq %rax, %r9 + movq %rdx, %r10 + # A[0] * A[2] + movq (%rsi), %rax + mulq 16(%rsi) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[0] * A[3] + movq (%rsi), %rax + mulq 24(%rsi) + xorq %r12, %r12 + addq %rax, %r11 + adcq %rdx, %r12 + # A[1] * A[2] + movq 8(%rsi), %rax + mulq 16(%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * A[3] + movq 8(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + # A[2] * A[3] + movq 16(%rsi), %rax + mulq 24(%rsi) + xorq %r14, %r14 + addq %rax, %r13 + adcq %rdx, %r14 + # Double + xorq %r15, %r15 + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq $0x00, %r15 + # A[0] * A[0] + movq (%rsi), %rax + mulq %rax + movq %rax, %r8 + movq %rdx, %rcx + # A[1] * A[1] + movq 8(%rsi), %rax + mulq %rax + addq %rcx, %r9 + adcq %rax, %r10 + adcq $0x00, %rdx + movq %rdx, %rcx + # A[2] * A[2] + movq 16(%rsi), %rax + mulq %rax + addq %rcx, %r11 + adcq %rax, %r12 + adcq $0x00, %rdx + movq %rdx, %rcx + # A[3] * A[3] + movq 24(%rsi), %rax + mulq %rax + addq %rax, %r14 + adcq %rdx, %r15 + addq %rcx, %r13 + adcq $0x00, %r14 + adcq $0x00, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rdi + movq 16(%rsp), %rsi + movq (%rsp), %rbx + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + addq (%rbx), %r8 + movq 16(%rsi), %r10 + adcq 8(%rbx), %r9 + movq 24(%rsi), %rcx + adcq 16(%rbx), %r10 + movq $-19, %rax + adcq 24(%rbx), %rcx + movq $0x7fffffffffffffff, %rdx + movq %rcx, %r11 + sarq $63, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Sub modulus (if overflow) + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 16(%rsp), %rdi + movq 16(%rsp), %rsi + movq (%rsp), %rbx + # Sub + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + subq (%rbx), %r8 + movq $0x00, %rcx + sbbq 8(%rbx), %r9 + movq $-19, %rax + sbbq 16(%rbx), %r10 + movq $0x7fffffffffffffff, %rdx + sbbq 24(%rbx), %r11 + sbbq $0x00, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Add modulus (if 
underflow) + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq (%rsp), %rdi + leaq 48(%rsp), %rsi + movq 8(%rsp), %rbx + # Sub + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + subq (%rbx), %r8 + movq $0x00, %rcx + sbbq 8(%rbx), %r9 + movq $-19, %rax + sbbq 16(%rbx), %r10 + movq $0x7fffffffffffffff, %rdx + sbbq 24(%rbx), %r11 + sbbq $0x00, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Add modulus (if underflow) + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 24(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rbx + # Sub + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + subq (%rbx), %r8 + movq $0x00, %rcx + sbbq 8(%rbx), %r9 + movq $-19, %rax + sbbq 16(%rbx), %r10 + movq $0x7fffffffffffffff, %rdx + sbbq 24(%rbx), %r11 + sbbq $0x00, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Add modulus (if underflow) + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + addq $0x50, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + repz retq +#ifndef __APPLE__ +.size fe_ge_dbl_x64,.-fe_ge_dbl_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_ge_madd_x64 +.type fe_ge_madd_x64,@function +.align 4 +fe_ge_madd_x64: +#else +.section __TEXT,__text +.globl _fe_ge_madd_x64 +.p2align 2 +_fe_ge_madd_x64: +#endif /* __APPLE__ */ + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0x50, %rsp + movq %rdi, (%rsp) + movq %rsi, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rcx, 24(%rsp) + movq %r8, 32(%rsp) + movq %r9, 40(%rsp) + movq (%rsp), %rdi + movq 40(%rsp), %rsi + movq 32(%rsp), %rbx + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + addq (%rbx), %r8 + movq 16(%rsi), %r10 + adcq 8(%rbx), %r9 + movq 24(%rsi), %rcx + adcq 16(%rbx), %r10 + movq $-19, %rax + adcq 24(%rbx), %rcx + movq $0x7fffffffffffffff, %rdx + movq %rcx, %r11 + sarq $63, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Sub modulus (if overflow) + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rdi + movq 40(%rsp), %rsi + movq 32(%rsp), %rbx + # Sub + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + subq (%rbx), %r8 + movq $0x00, %rcx + sbbq 8(%rbx), %r9 + movq $-19, %rax + sbbq 16(%rbx), %r10 + movq $0x7fffffffffffffff, %rdx + sbbq 24(%rbx), %r11 + sbbq $0x00, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Add modulus (if underflow) + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 16(%rsp), %rdi + movq (%rsp), %rsi + movq 152(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 
8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rdi + movq 8(%rsp), %rsi + movq 160(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] 
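The interleaved A[i] * B[j] comments in every "# Multiply" block describe a plain 4x4 schoolbook multiplication of two 256-bit values held as four little-endian 64-bit limbs: the assembly walks the partial products in column (product-scanning) order and folds them in with add/adc carry chains. A minimal C sketch of the same arithmetic, written in the simpler row (operand-scanning) order with a 128-bit temporary instead of explicit carry flags; the helper name is made up for the sketch and this is for orientation only, not the code this file ships:

#include <stdint.h>

/* 4x4 schoolbook multiply: t (8 limbs, 512 bits) = a * b. */
static void mul_4x4(uint64_t t[8], const uint64_t a[4], const uint64_t b[4])
{
    int i, j;

    for (i = 0; i < 8; i++)
        t[i] = 0;
    for (i = 0; i < 4; i++) {
        uint64_t carry = 0;
        for (j = 0; j < 4; j++) {
            /* a[i]*b[j] lands at limb position i+j, plus the pending carry */
            unsigned __int128 p = (unsigned __int128)a[i] * b[j]
                                + t[i + j] + carry;
            t[i + j] = (uint64_t)p;
            carry = (uint64_t)(p >> 64);
        }
        t[i + 4] = carry;
    }
}

The 512-bit result is then brought back to four limbs by the "# Reduce" block that closes every multiply, sketched further down.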
+ movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 24(%rsp), %rdi + movq 144(%rsp), %rsi + movq 136(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, 
%r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + leaq 48(%rsp), %rdi + movq 128(%rsp), %rsi + movq 128(%rsp), %rbx + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + addq (%rbx), %r8 + movq 16(%rsi), %r10 + adcq 8(%rbx), %r9 + movq 24(%rsi), %rcx + adcq 16(%rbx), %r10 + movq $-19, %rax + adcq 24(%rbx), %rcx + movq $0x7fffffffffffffff, %rdx + movq %rcx, %r11 + sarq $63, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Sub modulus (if overflow) + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq (%rsp), %rdi + movq 16(%rsp), %rsi + movq 8(%rsp), %rbx + # Sub + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + subq (%rbx), %r8 + movq $0x00, %rcx + sbbq 8(%rbx), %r9 + movq $-19, %rax + sbbq 16(%rbx), %r10 + movq $0x7fffffffffffffff, %rdx + sbbq 24(%rbx), %r11 + sbbq $0x00, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Add modulus (if underflow) + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rdi + movq 16(%rsp), %rsi + movq 8(%rsp), %rbx + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + addq (%rbx), %r8 + movq 16(%rsi), %r10 + adcq 8(%rbx), %r9 + movq 24(%rsi), %rcx + adcq 16(%rbx), %r10 + movq $-19, %rax + adcq 24(%rbx), %rcx + movq $0x7fffffffffffffff, %rdx + movq %rcx, %r11 + sarq $63, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Sub modulus (if overflow) + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 16(%rsp), %rdi + leaq 48(%rsp), %rsi + movq 24(%rsp), %rbx + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + addq (%rbx), %r8 + movq 16(%rsi), %r10 + adcq 8(%rbx), %r9 + movq 24(%rsi), %rcx + adcq 16(%rbx), %r10 + movq $-19, %rax + adcq 24(%rbx), %rcx + movq $0x7fffffffffffffff, %rdx + movq %rcx, %r11 + sarq $63, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Sub modulus (if overflow) + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 24(%rsp), %rdi + leaq 48(%rsp), %rsi + movq 24(%rsp), %rbx + # Sub + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + subq (%rbx), %r8 + movq $0x00, %rcx + sbbq 
8(%rbx), %r9 + movq $-19, %rax + sbbq 16(%rbx), %r10 + movq $0x7fffffffffffffff, %rdx + sbbq 24(%rbx), %r11 + sbbq $0x00, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Add modulus (if underflow) + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + addq $0x50, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + repz retq +#ifndef __APPLE__ +.size fe_ge_madd_x64,.-fe_ge_madd_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_ge_msub_x64 +.type fe_ge_msub_x64,@function +.align 4 +fe_ge_msub_x64: +#else +.section __TEXT,__text +.globl _fe_ge_msub_x64 +.p2align 2 +_fe_ge_msub_x64: +#endif /* __APPLE__ */ + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0x50, %rsp + movq %rdi, (%rsp) + movq %rsi, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rcx, 24(%rsp) + movq %r8, 32(%rsp) + movq %r9, 40(%rsp) + movq (%rsp), %rdi + movq 40(%rsp), %rsi + movq 32(%rsp), %rbx + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + addq (%rbx), %r8 + movq 16(%rsi), %r10 + adcq 8(%rbx), %r9 + movq 24(%rsi), %rcx + adcq 16(%rbx), %r10 + movq $-19, %rax + adcq 24(%rbx), %rcx + movq $0x7fffffffffffffff, %rdx + movq %rcx, %r11 + sarq $63, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Sub modulus (if overflow) + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rdi + movq 40(%rsp), %rsi + movq 32(%rsp), %rbx + # Sub + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + subq (%rbx), %r8 + movq $0x00, %rcx + sbbq 8(%rbx), %r9 + movq $-19, %rax + sbbq 16(%rbx), %r10 + movq $0x7fffffffffffffff, %rdx + sbbq 24(%rbx), %r11 + sbbq $0x00, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Add modulus (if underflow) + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 16(%rsp), %rdi + movq (%rsp), %rsi + movq 160(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + mulq 24(%rsi) + addq 
%rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rdi + movq 8(%rsp), %rsi + movq 152(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 
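Every "# Reduce" block in this file collapses the 512-bit intermediate back into four limbs using the identity 2^255 ≡ 19 (mod p) with p = 2^255 - 19: the shld chain moves everything at and above bit 255 into a separate 4-limb value, that value is multiplied by 19 and added to the low 255 bits, and the small residue that spills past bit 255 is folded back in the same way two more times (the "# Overflow" and "# Reduce if top bit set" steps). A C sketch of the whole reduction, assuming the operands were partially reduced so that bit 511 of the product is clear (the shld $0x01 chain silently relies on this); the helper names are made up for the sketch, orientation only, not the shipped code:

#include <stdint.h>

/* Fold the bits at and above position 255 back in once: 2^255 = 19 mod p.
 * extra holds any bits that have already overflowed past limb 3. */
static void fold_top(uint64_t r[4], uint64_t extra)
{
    uint64_t over = (extra << 1) | (r[3] >> 63);
    unsigned __int128 c = (unsigned __int128)over * 19;
    int i;

    r[3] &= 0x7fffffffffffffffULL;
    for (i = 0; i < 4; i++) {
        c += r[i];
        r[i] = (uint64_t)c;
        c >>= 64;
    }
}

/* Reduce an 8-limb, 512-bit product t modulo p = 2^255 - 19. */
static void fe_reduce_ref(uint64_t r[4], const uint64_t t[8])
{
    uint64_t hi[4];
    unsigned __int128 c = 0;
    int i;

    /* split at bit 255: r = t mod 2^255, hi = floor(t / 2^255) */
    r[0] = t[0]; r[1] = t[1]; r[2] = t[2];
    r[3] = t[3] & 0x7fffffffffffffffULL;
    for (i = 0; i < 4; i++)
        hi[i] = (t[i + 3] >> 63) | (t[i + 4] << 1);

    /* "Multiply top half by 19" and add it into the low half */
    for (i = 0; i < 4; i++) {
        c += (unsigned __int128)hi[i] * 19 + r[i];
        r[i] = (uint64_t)c;
        c >>= 64;
    }
    fold_top(r, (uint64_t)c);   /* "Overflow" */
    fold_top(r, 0);             /* "Reduce if top bit set" */
}

The "# Square * 2" variant near the start of this hunk fuses the doubling into the same shifts (counts of 2 and 3 instead of 1) and multiplies the stray bits that land two 2^255 steps higher by 0x169 = 19 * 19, as its "Multiply top bits by 19*19" comment notes.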
+ xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 24(%rsp), %rdi + movq 144(%rsp), %rsi + movq 136(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + 
adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + leaq 48(%rsp), %rdi + movq 128(%rsp), %rsi + movq 128(%rsp), %rbx + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + addq (%rbx), %r8 + movq 16(%rsi), %r10 + adcq 8(%rbx), %r9 + movq 24(%rsi), %rcx + adcq 16(%rbx), %r10 + movq $-19, %rax + adcq 24(%rbx), %rcx + movq $0x7fffffffffffffff, %rdx + movq %rcx, %r11 + sarq $63, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Sub modulus (if overflow) + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq (%rsp), %rdi + movq 16(%rsp), %rsi + movq 8(%rsp), %rbx + # Sub + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + subq (%rbx), %r8 + movq $0x00, %rcx + sbbq 8(%rbx), %r9 + movq $-19, %rax + sbbq 16(%rbx), %r10 + movq $0x7fffffffffffffff, %rdx + sbbq 24(%rbx), %r11 + sbbq $0x00, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Add modulus (if underflow) + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rdi + movq 16(%rsp), %rsi + movq 8(%rsp), %rbx + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + addq (%rbx), %r8 + movq 16(%rsi), %r10 + adcq 8(%rbx), %r9 + movq 24(%rsi), %rcx + adcq 16(%rbx), %r10 + movq $-19, %rax + adcq 24(%rbx), %rcx + movq $0x7fffffffffffffff, %rdx + movq %rcx, %r11 + sarq $63, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Sub modulus (if overflow) + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 16(%rsp), %rdi + leaq 48(%rsp), %rsi + movq 24(%rsp), %rbx + # Sub + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + subq (%rbx), %r8 + movq $0x00, %rcx + sbbq 8(%rbx), %r9 + movq $-19, %rax + sbbq 16(%rbx), %r10 + movq $0x7fffffffffffffff, %rdx + sbbq 24(%rbx), %r11 + sbbq $0x00, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Add modulus (if underflow) + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 24(%rsp), %rdi + leaq 48(%rsp), %rsi + movq 24(%rsp), %rbx + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + addq (%rbx), %r8 + movq 16(%rsi), %r10 + adcq 8(%rbx), %r9 + movq 24(%rsi), %rcx + adcq 16(%rbx), %r10 + movq $-19, %rax + adcq 24(%rbx), %rcx + movq $0x7fffffffffffffff, %rdx + movq %rcx, %r11 + sarq $63, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Sub modulus (if overflow) + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + addq $0x50, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + repz retq +#ifndef __APPLE__ +.size fe_ge_msub_x64,.-fe_ge_msub_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_ge_add_x64 +.type fe_ge_add_x64,@function +.align 4 +fe_ge_add_x64: +#else +.section __TEXT,__text +.globl _fe_ge_add_x64 +.p2align 2 +_fe_ge_add_x64: +#endif /* __APPLE__ */ + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0x50, %rsp + movq %rdi, (%rsp) + movq %rsi, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rcx, 24(%rsp) + movq %r8, 
32(%rsp) + movq %r9, 40(%rsp) + movq (%rsp), %rdi + movq 40(%rsp), %rsi + movq 32(%rsp), %rbx + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + addq (%rbx), %r8 + movq 16(%rsi), %r10 + adcq 8(%rbx), %r9 + movq 24(%rsi), %rcx + adcq 16(%rbx), %r10 + movq $-19, %rax + adcq 24(%rbx), %rcx + movq $0x7fffffffffffffff, %rdx + movq %rcx, %r11 + sarq $63, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Sub modulus (if overflow) + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rdi + movq 40(%rsp), %rsi + movq 32(%rsp), %rbx + # Sub + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + subq (%rbx), %r8 + movq $0x00, %rcx + sbbq 8(%rbx), %r9 + movq $-19, %rax + sbbq 16(%rbx), %r10 + movq $0x7fffffffffffffff, %rdx + sbbq 24(%rbx), %r11 + sbbq $0x00, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Add modulus (if underflow) + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 16(%rsp), %rdi + movq (%rsp), %rsi + movq 160(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + 
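The "# Add" and "# Sub" blocks used throughout these point routines keep field elements only partially reduced and avoid data-dependent branches: after the raw 256-bit addition, an arithmetic shift of the top limb turns bit 255 into an all-ones or all-zero mask ("Mask the modulus"), and the modulus ANDed with that mask is subtracted, so both outcomes execute the identical instruction sequence. Subtraction mirrors this by adding back the masked modulus when the raw subtraction borrows. A C sketch of the addition case, same limb layout as above; the helper name is made up for the sketch, orientation only, not the shipped code:

#include <stdint.h>

/* r = a + b, then subtract p = 2^255 - 19 when bit 255 of the raw sum is
 * set, selected by a mask rather than a branch.  Inputs are assumed
 * partially reduced so the raw sum fits in 256 bits. */
static void fe_add_ref(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
    static const uint64_t p[4] = {
        0xffffffffffffffedULL, 0xffffffffffffffffULL,
        0xffffffffffffffffULL, 0x7fffffffffffffffULL
    };
    unsigned __int128 c = 0;
    uint64_t mask, borrow = 0;
    int i;

    for (i = 0; i < 4; i++) {          /* raw 256-bit addition */
        c += (unsigned __int128)a[i] + b[i];
        r[i] = (uint64_t)c;
        c >>= 64;
    }
    /* all-ones when bit 255 is set ("Mask the modulus") */
    mask = (uint64_t)((int64_t)r[3] >> 63);

    for (i = 0; i < 4; i++) {          /* "Sub modulus (if overflow)" */
        unsigned __int128 d = (unsigned __int128)r[i] - (p[i] & mask) - borrow;
        r[i] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
    }
}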
addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rdi + movq 8(%rsp), %rsi + movq 168(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 24(%rsp), %rdi + movq 152(%rsp), %rsi + movq 136(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq 
%rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq (%rsp), %rdi + movq 128(%rsp), %rsi + movq 144(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq 
%rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + leaq 48(%rsp), %rdi + movq (%rsp), %rsi + movq (%rsp), %rbx + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + addq (%rbx), %r8 + movq 16(%rsi), %r10 + adcq 8(%rbx), %r9 + movq 24(%rsi), %rcx + adcq 16(%rbx), %r10 + movq $-19, %rax + adcq 24(%rbx), %rcx + movq $0x7fffffffffffffff, %rdx + movq %rcx, %r11 + sarq $63, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Sub modulus (if overflow) + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq (%rsp), %rdi + movq 16(%rsp), %rsi + movq 8(%rsp), %rbx + # Sub + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + subq (%rbx), %r8 + movq $0x00, %rcx + sbbq 8(%rbx), %r9 + movq $-19, %rax + sbbq 16(%rbx), %r10 + movq $0x7fffffffffffffff, %rdx + sbbq 24(%rbx), %r11 + sbbq $0x00, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Add modulus (if underflow) + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rdi + movq 16(%rsp), %rsi + movq 8(%rsp), %rbx + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + addq (%rbx), %r8 + movq 16(%rsi), %r10 + adcq 8(%rbx), %r9 + movq 24(%rsi), %rcx + adcq 16(%rbx), %r10 + movq $-19, %rax + adcq 24(%rbx), %rcx + movq $0x7fffffffffffffff, %rdx + movq %rcx, %r11 + 
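fe_ge_add_x64 strings these field primitives into one twisted-Edwards point addition in extended coordinates, the same formula ref10-style Ed25519 code uses. The first six field-element pointers arrive in registers and are spilled to 0(%rsp)..40(%rsp); because the prologue pushes five callee-saved registers and reserves 0x50 bytes, the remaining stack-passed pointers appear at 128(%rsp), 136(%rsp) and onward inside the body (8 bytes of return address + 40 bytes of pushes + 0x50 = 128). Read at the level of the reference helpers sketched above, the body performs the sequence below; the parameter names follow the usual ref10/wolfSSL C conventions and are an assumption here, not something this file states:

#include <stdint.h>

typedef uint64_t fe[4];

/* Reference helpers as sketched earlier in this hunk (declarations only;
 * assumed stand-ins, not symbols defined by this file). */
void fe_add_ref(uint64_t *r, const uint64_t *a, const uint64_t *b);
void fe_sub_ref(uint64_t *r, const uint64_t *a, const uint64_t *b);
void fe_mul_ref(uint64_t *r, const uint64_t *a, const uint64_t *b);

/* Step-for-step shape of fe_ge_add_x64 (argument names assumed). */
static void ge_add_sketch(fe rx, fe ry, fe rz, fe rt,
                          const fe px, const fe py, const fe pz, const fe pt,
                          const fe qz, const fe qt2d,
                          const fe qyplusx, const fe qyminusx)
{
    fe t;

    fe_add_ref(rx, py, px);        /* Y1 + X1                */
    fe_sub_ref(ry, py, px);        /* Y1 - X1                */
    fe_mul_ref(rz, rx, qyplusx);   /* A = (Y1+X1) * (Y2+X2)  */
    fe_mul_ref(ry, ry, qyminusx);  /* B = (Y1-X1) * (Y2-X2)  */
    fe_mul_ref(rt, qt2d, pt);      /* C = T1 * 2d*T2         */
    fe_mul_ref(rx, pz, qz);        /* Z1 * Z2                */
    fe_add_ref(t, rx, rx);         /* D = 2 * Z1*Z2          */
    fe_sub_ref(rx, rz, ry);        /* X3 = A - B             */
    fe_add_ref(ry, rz, ry);        /* Y3 = A + B             */
    fe_add_ref(rz, t, rt);         /* Z3 = D + C             */
    fe_sub_ref(rt, t, rt);         /* T3 = D - C             */
}

fe_ge_madd_x64 and fe_ge_msub_x64 above are the same pattern against a precomputed table entry (there is no qz multiply; Z1 is simply doubled), and the msub/sub variants swap the y+x and y-x inputs and the signs of C in the last two steps, exactly as in the reference C formulas.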
sarq $63, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Sub modulus (if overflow) + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 16(%rsp), %rdi + leaq 48(%rsp), %rsi + movq 24(%rsp), %rbx + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + addq (%rbx), %r8 + movq 16(%rsi), %r10 + adcq 8(%rbx), %r9 + movq 24(%rsi), %rcx + adcq 16(%rbx), %r10 + movq $-19, %rax + adcq 24(%rbx), %rcx + movq $0x7fffffffffffffff, %rdx + movq %rcx, %r11 + sarq $63, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Sub modulus (if overflow) + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 24(%rsp), %rdi + leaq 48(%rsp), %rsi + movq 24(%rsp), %rbx + # Sub + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + subq (%rbx), %r8 + movq $0x00, %rcx + sbbq 8(%rbx), %r9 + movq $-19, %rax + sbbq 16(%rbx), %r10 + movq $0x7fffffffffffffff, %rdx + sbbq 24(%rbx), %r11 + sbbq $0x00, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Add modulus (if underflow) + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + addq $0x50, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + repz retq +#ifndef __APPLE__ +.size fe_ge_add_x64,.-fe_ge_add_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_ge_sub_x64 +.type fe_ge_sub_x64,@function +.align 4 +fe_ge_sub_x64: +#else +.section __TEXT,__text +.globl _fe_ge_sub_x64 +.p2align 2 +_fe_ge_sub_x64: +#endif /* __APPLE__ */ + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0x50, %rsp + movq %rdi, (%rsp) + movq %rsi, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rcx, 24(%rsp) + movq %r8, 32(%rsp) + movq %r9, 40(%rsp) + movq (%rsp), %rdi + movq 40(%rsp), %rsi + movq 32(%rsp), %rbx + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + addq (%rbx), %r8 + movq 16(%rsi), %r10 + adcq 8(%rbx), %r9 + movq 24(%rsi), %rcx + adcq 16(%rbx), %r10 + movq $-19, %rax + adcq 24(%rbx), %rcx + movq $0x7fffffffffffffff, %rdx + movq %rcx, %r11 + sarq $63, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Sub modulus (if overflow) + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rdi + movq 40(%rsp), %rsi + movq 32(%rsp), %rbx + # Sub + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + subq (%rbx), %r8 + movq $0x00, %rcx + sbbq 8(%rbx), %r9 + movq $-19, %rax + sbbq 16(%rbx), %r10 + movq $0x7fffffffffffffff, %rdx + sbbq 24(%rbx), %r11 + sbbq $0x00, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Add modulus (if underflow) + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 16(%rsp), %rdi + movq (%rsp), %rsi + movq 168(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq 
(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rdi + movq 8(%rsp), %rsi + movq 160(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax 
+ mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 24(%rsp), %rdi + movq 152(%rsp), %rsi + movq 136(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 
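The "# Square" blocks earlier in this hunk (inside fe_ge_dbl_x64), and fe_sq_avx2 at the end of it, use the standard shortcut over a full multiply: each off-diagonal product A[i] * A[j] with i < j is computed once, the accumulated cross terms are doubled in a single carry chain (the "# Double" step), and only then are the diagonal squares A[i] * A[i] added in, so ten 64x64 multiplies do the work of sixteen. The AVX2 routines further down compute the same partial products with mulx/adcx/adox so that two independent carry chains can be interleaved. A C sketch of the scalar squaring, same limb layout as above; the helper name is made up for the sketch, orientation only, not the shipped code:

#include <stdint.h>

/* t (8 limbs) = a^2: cross products once, doubled, then the squares. */
static void sq_4x4(uint64_t t[8], const uint64_t a[4])
{
    unsigned __int128 p;
    uint64_t carry;
    int i, j;

    for (i = 0; i < 8; i++)
        t[i] = 0;

    /* off-diagonal products a[i]*a[j], i < j, each computed once */
    for (i = 0; i < 4; i++) {
        carry = 0;
        for (j = i + 1; j < 4; j++) {
            p = (unsigned __int128)a[i] * a[j] + t[i + j] + carry;
            t[i + j] = (uint64_t)p;
            carry = (uint64_t)(p >> 64);
        }
        t[i + 4] = carry;
    }

    /* "# Double": shift the cross-term sum left by one bit */
    carry = 0;
    for (i = 0; i < 8; i++) {
        uint64_t d = (t[i] << 1) | carry;
        carry = t[i] >> 63;
        t[i] = d;
    }

    /* diagonal terms a[i]*a[i] at limb position 2*i */
    carry = 0;
    for (i = 0; i < 4; i++) {
        p = (unsigned __int128)a[i] * a[i] + t[2 * i] + carry;
        t[2 * i] = (uint64_t)p;
        p = (unsigned __int128)t[2 * i + 1] + (uint64_t)(p >> 64);
        t[2 * i + 1] = (uint64_t)p;
        carry = (uint64_t)(p >> 64);
    }
}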
+ # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq (%rsp), %rdi + movq 128(%rsp), %rsi + movq 144(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbx), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbx), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbx), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbx), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbx), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbx), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 16(%rbx), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbx), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbx), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbx), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbx), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbx), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rax + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $19, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + movq $19, %rax + adcq %rdx, %r13 + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $19, %rax + adcq %rdx, %r14 + mulq %r15 + # Add remaining product results 
in + addq %r12, %r9 + adcq %r13, %r10 + adcq %r14, %r11 + adcq %rax, %r11 + adcq $0x00, %rdx + # Overflow + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + leaq 48(%rsp), %rdi + movq (%rsp), %rsi + movq (%rsp), %rbx + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + addq (%rbx), %r8 + movq 16(%rsi), %r10 + adcq 8(%rbx), %r9 + movq 24(%rsi), %rcx + adcq 16(%rbx), %r10 + movq $-19, %rax + adcq 24(%rbx), %rcx + movq $0x7fffffffffffffff, %rdx + movq %rcx, %r11 + sarq $63, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Sub modulus (if overflow) + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq (%rsp), %rdi + movq 16(%rsp), %rsi + movq 8(%rsp), %rbx + # Sub + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + subq (%rbx), %r8 + movq $0x00, %rcx + sbbq 8(%rbx), %r9 + movq $-19, %rax + sbbq 16(%rbx), %r10 + movq $0x7fffffffffffffff, %rdx + sbbq 24(%rbx), %r11 + sbbq $0x00, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Add modulus (if underflow) + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rdi + movq 16(%rsp), %rsi + movq 8(%rsp), %rbx + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + addq (%rbx), %r8 + movq 16(%rsi), %r10 + adcq 8(%rbx), %r9 + movq 24(%rsi), %rcx + adcq 16(%rbx), %r10 + movq $-19, %rax + adcq 24(%rbx), %rcx + movq $0x7fffffffffffffff, %rdx + movq %rcx, %r11 + sarq $63, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Sub modulus (if overflow) + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 16(%rsp), %rdi + leaq 48(%rsp), %rsi + movq 24(%rsp), %rbx + # Sub + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + subq (%rbx), %r8 + movq $0x00, %rcx + sbbq 8(%rbx), %r9 + movq $-19, %rax + sbbq 16(%rbx), %r10 + movq $0x7fffffffffffffff, %rdx + sbbq 24(%rbx), %r11 + sbbq $0x00, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Add modulus (if underflow) + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 24(%rsp), %rdi + leaq 48(%rsp), %rsi + movq 24(%rsp), %rbx + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + addq (%rbx), %r8 + movq 16(%rsi), %r10 + adcq 8(%rbx), %r9 + movq 24(%rsi), %rcx + adcq 16(%rbx), %r10 + movq $-19, %rax + adcq 24(%rbx), %rcx + movq $0x7fffffffffffffff, %rdx + movq %rcx, %r11 + sarq $63, %rcx + # Mask the modulus + andq %rcx, %rax + andq %rcx, %rdx + # Sub modulus (if overflow) + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + addq $0x50, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + repz retq +#ifndef __APPLE__ +.size fe_ge_sub_x64,.-fe_ge_sub_x64 +#endif /* __APPLE__ */ +#ifdef HAVE_INTEL_AVX2 +#ifndef 
__APPLE__ +.text +.globl fe_mul_avx2 +.type fe_mul_avx2,@function +.align 4 +fe_mul_avx2: +#else +.section __TEXT,__text +.globl _fe_mul_avx2 +.p2align 2 +_fe_mul_avx2: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + movq %rdx, %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rdx + mulxq (%rsi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rsi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rsi), %rax, %rcx + xorq %r15, %r15 + adcxq %rax, %r9 + # A[1] * B[3] + movq 24(%rbx), %rdx + mulxq 8(%rsi), %r12, %r13 + adcxq %rcx, %r10 + # A[0] * B[1] + movq 8(%rbx), %rdx + mulxq (%rsi), %rax, %rcx + adoxq %rax, %r9 + # A[2] * B[1] + mulxq 16(%rsi), %rax, %r14 + adoxq %rcx, %r10 + adcxq %rax, %r11 + # A[1] * B[2] + movq 16(%rbx), %rdx + mulxq 8(%rsi), %rax, %rcx + adcxq %r14, %r12 + adoxq %rax, %r11 + adcxq %r15, %r13 + adoxq %rcx, %r12 + # A[0] * B[2] + mulxq (%rsi), %rax, %rcx + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rax, %r10 + # A[1] * B[1] + movq 8(%rbx), %rdx + mulxq 8(%rsi), %rdx, %rax + adcxq %rcx, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rbx), %rdx + adoxq %rax, %r11 + mulxq 24(%rsi), %rax, %rcx + adcxq %rax, %r12 + # A[2] * B[2] + movq 16(%rbx), %rdx + mulxq 16(%rsi), %rdx, %rax + adcxq %rcx, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rbx), %rdx + adoxq %rax, %r13 + mulxq 24(%rsi), %rax, %rcx + adoxq %r15, %r14 + adcxq %rax, %r14 + # A[0] * B[3] + mulxq (%rsi), %rdx, %rax + adcxq %rcx, %r15 + xorq %rcx, %rcx + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rbx), %rdx + adcxq %rax, %r12 + mulxq 24(%rsi), %rdx, %rax + adoxq %rdx, %r11 + adoxq %rax, %r12 + # A[2] * B[3] + movq 24(%rbx), %rdx + mulxq 16(%rsi), %rdx, %rax + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rbx), %rdx + adcxq %rax, %r14 + mulxq 24(%rsi), %rax, %rdx + adcxq %rcx, %r15 + adoxq %rax, %r13 + adoxq %rdx, %r14 + adoxq %rcx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rcx, %rcx + mulxq %r12, %rax, %r12 + adcxq %rax, %r8 + adoxq %r12, %r9 + mulxq %r13, %rax, %r13 + adcxq %rax, %r9 + adoxq %r13, %r10 + mulxq %r14, %rax, %r14 + adcxq %rax, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rcx, %rdx + adcxq %rcx, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rcx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size fe_mul_avx2,.-fe_mul_avx2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_sq_avx2 +.type fe_sq_avx2,@function +.align 4 +fe_sq_avx2: +#else +.section __TEXT,__text +.globl _fe_sq_avx2 +.p2align 2 +_fe_sq_avx2: +#endif /* __APPLE__ */ + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + # Square + # A[0] * A[1] + movq (%rsi), %rdx + mulxq 8(%rsi), %r9, %r10 + # A[0] * A[3] + mulxq 24(%rsi), %r11, %r12 + # A[2] * A[1] + movq 16(%rsi), %rdx + mulxq 8(%rsi), %rcx, %rbx + xorq %r15, %r15 + adoxq %rcx, %r11 + # A[2] * A[3] + mulxq 24(%rsi), %r13, 
%r14 + adoxq %rbx, %r12 + # A[2] * A[0] + mulxq (%rsi), %rcx, %rbx + adoxq %r15, %r13 + adcxq %rcx, %r10 + adoxq %r15, %r14 + # A[1] * A[3] + movq 8(%rsi), %rdx + mulxq 24(%rsi), %rax, %r8 + adcxq %rbx, %r11 + adcxq %rax, %r12 + adcxq %r8, %r13 + adcxq %r15, %r14 + # Double with Carry Flag + xorq %r15, %r15 + # A[0] * A[0] + movq (%rsi), %rdx + mulxq %rdx, %r8, %rax + adcxq %r9, %r9 + # A[1] * A[1] + movq 8(%rsi), %rdx + mulxq %rdx, %rcx, %rbx + adcxq %r10, %r10 + adoxq %rax, %r9 + adcxq %r11, %r11 + adoxq %rcx, %r10 + # A[2] * A[2] + movq 16(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adcxq %r12, %r12 + adoxq %rbx, %r11 + adcxq %r13, %r13 + adoxq %rax, %r12 + # A[3] * A[3] + movq 24(%rsi), %rdx + mulxq %rdx, %rax, %rbx + adcxq %r14, %r14 + adoxq %rcx, %r13 + adcxq %r15, %r15 + adoxq %rax, %r14 + adoxq %rbx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rcx, %rcx + mulxq %r12, %rax, %r12 + adcxq %rax, %r8 + adoxq %r12, %r9 + mulxq %r13, %rax, %r13 + adcxq %rax, %r9 + adoxq %r13, %r10 + mulxq %r14, %rax, %r14 + adcxq %rax, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rcx, %rdx + adcxq %rcx, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rcx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + repz retq +#ifndef __APPLE__ +.size fe_sq_avx2,.-fe_sq_avx2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_sq_n_avx2 +.type fe_sq_n_avx2,@function +.align 4 +fe_sq_n_avx2: +#else +.section __TEXT,__text +.globl _fe_sq_n_avx2 +.p2align 2 +_fe_sq_n_avx2: +#endif /* __APPLE__ */ + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbp + movq %rdx, %rbp +L_fe_sq_n_avx2: + # Square + # A[0] * A[1] + movq (%rsi), %rdx + mulxq 8(%rsi), %r9, %r10 + # A[0] * A[3] + mulxq 24(%rsi), %r11, %r12 + # A[2] * A[1] + movq 16(%rsi), %rdx + mulxq 8(%rsi), %rcx, %rbx + xorq %r15, %r15 + adoxq %rcx, %r11 + # A[2] * A[3] + mulxq 24(%rsi), %r13, %r14 + adoxq %rbx, %r12 + # A[2] * A[0] + mulxq (%rsi), %rcx, %rbx + adoxq %r15, %r13 + adcxq %rcx, %r10 + adoxq %r15, %r14 + # A[1] * A[3] + movq 8(%rsi), %rdx + mulxq 24(%rsi), %rax, %r8 + adcxq %rbx, %r11 + adcxq %rax, %r12 + adcxq %r8, %r13 + adcxq %r15, %r14 + # Double with Carry Flag + xorq %r15, %r15 + # A[0] * A[0] + movq (%rsi), %rdx + mulxq %rdx, %r8, %rax + adcxq %r9, %r9 + # A[1] * A[1] + movq 8(%rsi), %rdx + mulxq %rdx, %rcx, %rbx + adcxq %r10, %r10 + adoxq %rax, %r9 + adcxq %r11, %r11 + adoxq %rcx, %r10 + # A[2] * A[2] + movq 16(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adcxq %r12, %r12 + adoxq %rbx, %r11 + adcxq %r13, %r13 + adoxq %rax, %r12 + # A[3] * A[3] + movq 24(%rsi), %rdx + mulxq %rdx, %rax, %rbx + adcxq %r14, %r14 + adoxq %rcx, %r13 + adcxq %r15, %r15 + adoxq %rax, %r14 + adoxq %rbx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 
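+ # (Editorial note, added for readability) fe_sq_n_avx2 repeats the
+ # squaring above for the count passed in %rdx (kept in %rbp and
+ # decremented as a byte at the decb %bpl / jnz below). Each iteration is
+ # the same square-and-reduce used by fe_sq_avx2: the cross products
+ # A[i]*A[j], i < j, are computed once and doubled on the carry-flag
+ # chain while the diagonal squares are accumulated on the overflow-flag
+ # chain, then the result is reduced mod 2^255 - 19 as in the other
+ # routines.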
+ shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rcx, %rcx + mulxq %r12, %rax, %r12 + adcxq %rax, %r8 + adoxq %r12, %r9 + mulxq %r13, %rax, %r13 + adcxq %rax, %r9 + adoxq %r13, %r10 + mulxq %r14, %rax, %r14 + adcxq %rax, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rcx, %rdx + adcxq %rcx, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rcx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + decb %bpl + jnz L_fe_sq_n_avx2 + popq %rbp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + repz retq +#ifndef __APPLE__ +.size fe_sq_n_avx2,.-fe_sq_n_avx2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_mul121666_avx2 +.type fe_mul121666_avx2,@function +.align 4 +fe_mul121666_avx2: +#else +.section __TEXT,__text +.globl _fe_mul121666_avx2 +.p2align 2 +_fe_mul121666_avx2: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + movq $0x1db42, %rdx + mulxq (%rsi), %rax, %r13 + mulxq 8(%rsi), %rcx, %r12 + mulxq 16(%rsi), %r8, %r11 + mulxq 24(%rsi), %r9, %r10 + addq %r13, %rcx + adcq %r12, %r8 + adcq %r11, %r9 + adcq $0x00, %r10 + movq $0x7fffffffffffffff, %r13 + shldq $0x01, %r9, %r10 + andq %r13, %r9 + imulq $19, %r10, %r10 + addq %r10, %rax + adcq $0x00, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + movq %rax, (%rdi) + movq %rcx, 8(%rdi) + movq %r8, 16(%rdi) + movq %r9, 24(%rdi) + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size fe_mul121666_avx2,.-fe_mul121666_avx2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_sq2_avx2 +.type fe_sq2_avx2,@function +.align 4 +fe_sq2_avx2: +#else +.section __TEXT,__text +.globl _fe_sq2_avx2 +.p2align 2 +_fe_sq2_avx2: +#endif /* __APPLE__ */ + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + # Square * 2 + # A[0] * A[1] + movq (%rsi), %rdx + mulxq 8(%rsi), %r9, %r10 + # A[0] * A[3] + mulxq 24(%rsi), %r11, %r12 + # A[2] * A[1] + movq 16(%rsi), %rdx + mulxq 8(%rsi), %rcx, %rbx + xorq %r15, %r15 + adoxq %rcx, %r11 + # A[2] * A[3] + mulxq 24(%rsi), %r13, %r14 + adoxq %rbx, %r12 + # A[2] * A[0] + mulxq (%rsi), %rcx, %rbx + adoxq %r15, %r13 + adcxq %rcx, %r10 + adoxq %r15, %r14 + # A[1] * A[3] + movq 8(%rsi), %rdx + mulxq 24(%rsi), %rax, %r8 + adcxq %rbx, %r11 + adcxq %rax, %r12 + adcxq %r8, %r13 + adcxq %r15, %r14 + # Double with Carry Flag + xorq %r15, %r15 + # A[0] * A[0] + movq (%rsi), %rdx + mulxq %rdx, %r8, %rax + adcxq %r9, %r9 + # A[1] * A[1] + movq 8(%rsi), %rdx + mulxq %rdx, %rcx, %rbx + adcxq %r10, %r10 + adoxq %rax, %r9 + adcxq %r11, %r11 + adoxq %rcx, %r10 + # A[2] * A[2] + movq 16(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adcxq %r12, %r12 + adoxq %rbx, %r11 + adcxq %r13, %r13 + adoxq %rax, %r12 + # A[3] * A[3] + movq 24(%rsi), %rdx + mulxq %rdx, %rax, %rbx + adcxq %r14, %r14 + adoxq %rcx, %r13 + adcxq %r15, %r15 + adoxq %rax, %r14 + adoxq %rbx, %r15 + # Reduce + movq $0x7fffffffffffffff, %rbx + xorq %rax, %rax + # Move top half into t4-t7 and remove top bit from t3 and double + shldq $3, %r15, %rax + shldq $2, %r14, %r15 + shldq $2, %r13, %r14 + shldq $2, %r12, %r13 + shldq $2, %r11, %r12 + shldq $0x01, %r10, %r11 + shldq $0x01, %r9, %r10 + shldq $0x01, %r8, %r9 + shlq $0x01, 
%r8 + andq %rbx, %r11 + # Two out left, one in right + andq %rbx, %r15 + # Multiply top bits by 19*19 + imulq $0x169, %rax, %rcx + xorq %rbx, %rbx + # Multiply top half by 19 + movq $19, %rdx + adoxq %rcx, %r8 + mulxq %r12, %rax, %r12 + adcxq %rax, %r8 + adoxq %r12, %r9 + mulxq %r13, %rax, %r13 + adcxq %rax, %r9 + adoxq %r13, %r10 + mulxq %r14, %rax, %r14 + adcxq %rax, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rbx, %rdx + adcxq %rbx, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rbx + imulq $19, %rdx, %rax + andq %rbx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rbx, %r11 + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + repz retq +#ifndef __APPLE__ +.size fe_sq2_avx2,.-fe_sq2_avx2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_invert_avx2 +.type fe_invert_avx2,@function +.align 4 +fe_invert_avx2: +#else +.section __TEXT,__text +.globl _fe_invert_avx2 +.p2align 2 +_fe_invert_avx2: +#endif /* __APPLE__ */ + subq $0x90, %rsp + # Invert + movq %rdi, 128(%rsp) + movq %rsi, 136(%rsp) + movq %rsp, %rdi + movq 136(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + movq 136(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + movq %rsp, %rdi + movq %rsp, %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + movq $4, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 64(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + movq $9, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $19, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq 
_fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + movq $9, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 64(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + movq $49, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $0x63, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + movq $49, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 64(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + movq $4, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + movq 128(%rsp), %rdi + leaq 32(%rsp), %rsi + movq %rsp, %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + addq $0x90, %rsp + repz retq +#ifndef __APPLE__ +.text +.globl curve25519_avx2 +.type curve25519_avx2,@function +.align 4 +curve25519_avx2: +#else +.section __TEXT,__text +.globl _curve25519_avx2 +.p2align 2 +_curve25519_avx2: +#endif /* __APPLE__ */ + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbp + movq %rdx, %r8 + subq $0xc0, %rsp + movq $0x00, 184(%rsp) + movq %rdi, 176(%rsp) + # Set one + movq $0x01, (%rdi) + movq $0x00, 8(%rdi) + movq $0x00, 16(%rdi) + movq $0x00, 24(%rdi) + # Set zero + movq $0x00, (%rsp) + movq $0x00, 8(%rsp) + movq $0x00, 16(%rsp) + movq $0x00, 24(%rsp) + # Set one + movq $0x01, 32(%rsp) + movq $0x00, 40(%rsp) + movq $0x00, 48(%rsp) + movq $0x00, 56(%rsp) + # Copy + movq (%r8), %r9 + movq 8(%r8), %r10 + movq 16(%r8), %r11 + movq 24(%r8), %r12 + movq %r9, 64(%rsp) + movq %r10, 72(%rsp) + movq %r11, 80(%rsp) + movq %r12, 88(%rsp) + movb $62, 168(%rsp) + movq $3, 160(%rsp) +L_curve25519_avx2_words: +L_curve25519_avx2_bits: + movq 184(%rsp), %rbx + movq 160(%rsp), 
%r9 + movb 168(%rsp), %cl + movq (%rsi,%r9,8), %rax + shrq %cl, %rax + andq $0x01, %rax + xorq %rax, %rbx + negq %rbx + # Conditional Swap + movq (%rdi), %r9 + movq 8(%rdi), %r10 + movq 16(%rdi), %r11 + movq 24(%rdi), %r12 + xorq 64(%rsp), %r9 + xorq 72(%rsp), %r10 + xorq 80(%rsp), %r11 + xorq 88(%rsp), %r12 + andq %rbx, %r9 + andq %rbx, %r10 + andq %rbx, %r11 + andq %rbx, %r12 + xorq %r9, (%rdi) + xorq %r10, 8(%rdi) + xorq %r11, 16(%rdi) + xorq %r12, 24(%rdi) + xorq %r9, 64(%rsp) + xorq %r10, 72(%rsp) + xorq %r11, 80(%rsp) + xorq %r12, 88(%rsp) + # Conditional Swap + movq (%rsp), %r9 + movq 8(%rsp), %r10 + movq 16(%rsp), %r11 + movq 24(%rsp), %r12 + xorq 32(%rsp), %r9 + xorq 40(%rsp), %r10 + xorq 48(%rsp), %r11 + xorq 56(%rsp), %r12 + andq %rbx, %r9 + andq %rbx, %r10 + andq %rbx, %r11 + andq %rbx, %r12 + xorq %r9, (%rsp) + xorq %r10, 8(%rsp) + xorq %r11, 16(%rsp) + xorq %r12, 24(%rsp) + xorq %r9, 32(%rsp) + xorq %r10, 40(%rsp) + xorq %r11, 48(%rsp) + xorq %r12, 56(%rsp) + movq %rax, 184(%rsp) + # Add + movq (%rdi), %r9 + movq 8(%rdi), %r10 + movq 16(%rdi), %r11 + movq 24(%rdi), %rax + movq %r9, %r13 + addq (%rsp), %r9 + movq %r10, %r14 + adcq 8(%rsp), %r10 + movq %r11, %r15 + adcq 16(%rsp), %r11 + movq %rax, %rbp + adcq 24(%rsp), %rax + movq $-19, %rcx + movq %rax, %r12 + movq $0x7fffffffffffffff, %rbx + sarq $63, %rax + # Mask the modulus + andq %rax, %rcx + andq %rax, %rbx + # Sub modulus (if overflow) + subq %rcx, %r9 + sbbq %rax, %r10 + sbbq %rax, %r11 + sbbq %rbx, %r12 + # Sub + subq (%rsp), %r13 + movq $0x00, %rax + sbbq 8(%rsp), %r14 + movq $-19, %rcx + sbbq 16(%rsp), %r15 + movq $0x7fffffffffffffff, %rbx + sbbq 24(%rsp), %rbp + sbbq $0x00, %rax + # Mask the modulus + andq %rax, %rcx + andq %rax, %rbx + # Add modulus (if underflow) + addq %rcx, %r13 + adcq %rax, %r14 + adcq %rax, %r15 + adcq %rbx, %rbp + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + movq %r13, 128(%rsp) + movq %r14, 136(%rsp) + movq %r15, 144(%rsp) + movq %rbp, 152(%rsp) + # Add + movq 64(%rsp), %r9 + movq 72(%rsp), %r10 + movq 80(%rsp), %r11 + movq 88(%rsp), %rax + movq %r9, %r13 + addq 32(%rsp), %r9 + movq %r10, %r14 + adcq 40(%rsp), %r10 + movq %r11, %r15 + adcq 48(%rsp), %r11 + movq %rax, %rbp + adcq 56(%rsp), %rax + movq $-19, %rcx + movq %rax, %r12 + movq $0x7fffffffffffffff, %rbx + sarq $63, %rax + # Mask the modulus + andq %rax, %rcx + andq %rax, %rbx + # Sub modulus (if overflow) + subq %rcx, %r9 + sbbq %rax, %r10 + sbbq %rax, %r11 + sbbq %rbx, %r12 + # Sub + subq 32(%rsp), %r13 + movq $0x00, %rax + sbbq 40(%rsp), %r14 + movq $-19, %rcx + sbbq 48(%rsp), %r15 + movq $0x7fffffffffffffff, %rbx + sbbq 56(%rsp), %rbp + sbbq $0x00, %rax + # Mask the modulus + andq %rax, %rcx + andq %rax, %rbx + # Add modulus (if underflow) + addq %rcx, %r13 + adcq %rax, %r14 + adcq %rax, %r15 + adcq %rbx, %rbp + movq %r9, (%rsp) + movq %r10, 8(%rsp) + movq %r11, 16(%rsp) + movq %r12, 24(%rsp) + movq %r13, 96(%rsp) + movq %r14, 104(%rsp) + movq %r15, 112(%rsp) + movq %rbp, 120(%rsp) + # Multiply + # A[0] * B[0] + movq (%rdi), %rdx + mulxq 96(%rsp), %r9, %r10 + # A[2] * B[0] + mulxq 112(%rsp), %r11, %r12 + # A[1] * B[0] + mulxq 104(%rsp), %rcx, %rbx + xorq %rbp, %rbp + adcxq %rcx, %r10 + # A[1] * B[3] + movq 24(%rdi), %rdx + mulxq 104(%rsp), %r13, %r14 + adcxq %rbx, %r11 + # A[0] * B[1] + movq 8(%rdi), %rdx + mulxq 96(%rsp), %rcx, %rbx + adoxq %rcx, %r10 + # A[2] * B[1] + mulxq 112(%rsp), %rcx, %r15 + adoxq %rbx, %r11 + adcxq %rcx, %r12 + # A[1] * B[2] + movq 16(%rdi), %rdx + mulxq 104(%rsp), 
%rcx, %rbx + adcxq %r15, %r13 + adoxq %rcx, %r12 + adcxq %rbp, %r14 + adoxq %rbx, %r13 + # A[0] * B[2] + mulxq 96(%rsp), %rcx, %rbx + adoxq %rbp, %r14 + xorq %r15, %r15 + adcxq %rcx, %r11 + # A[1] * B[1] + movq 8(%rdi), %rdx + mulxq 104(%rsp), %rdx, %rcx + adcxq %rbx, %r12 + adoxq %rdx, %r11 + # A[3] * B[1] + movq 8(%rdi), %rdx + adoxq %rcx, %r12 + mulxq 120(%rsp), %rcx, %rbx + adcxq %rcx, %r13 + # A[2] * B[2] + movq 16(%rdi), %rdx + mulxq 112(%rsp), %rdx, %rcx + adcxq %rbx, %r14 + adoxq %rdx, %r13 + # A[3] * B[3] + movq 24(%rdi), %rdx + adoxq %rcx, %r14 + mulxq 120(%rsp), %rcx, %rbx + adoxq %rbp, %r15 + adcxq %rcx, %r15 + # A[0] * B[3] + mulxq 96(%rsp), %rdx, %rcx + adcxq %rbx, %rbp + xorq %rbx, %rbx + adcxq %rdx, %r12 + # A[3] * B[0] + movq (%rdi), %rdx + adcxq %rcx, %r13 + mulxq 120(%rsp), %rdx, %rcx + adoxq %rdx, %r12 + adoxq %rcx, %r13 + # A[2] * B[3] + movq 24(%rdi), %rdx + mulxq 112(%rsp), %rdx, %rcx + adcxq %rdx, %r14 + # A[3] * B[2] + movq 16(%rdi), %rdx + adcxq %rcx, %r15 + mulxq 120(%rsp), %rcx, %rdx + adcxq %rbx, %rbp + adoxq %rcx, %r14 + adoxq %rdx, %r15 + adoxq %rbx, %rbp + # Reduce + movq $0x7fffffffffffffff, %rbx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r15, %rbp + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + andq %rbx, %r12 + # Multiply top half by 19 + movq $19, %rdx + xorq %rbx, %rbx + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %rcx, %r15 + adcxq %rcx, %r11 + adoxq %r15, %r12 + mulxq %rbp, %rbp, %rdx + adcxq %rbp, %r12 + adoxq %rbx, %rdx + adcxq %rbx, %rdx + # Overflow + shldq $0x01, %r12, %rdx + movq $0x7fffffffffffffff, %rbx + imulq $19, %rdx, %rcx + andq %rbx, %r12 + addq %rcx, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Reduce if top bit set + movq %r12, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rbx, %r12 + addq %rcx, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Store + movq %r9, 32(%rsp) + movq %r10, 40(%rsp) + movq %r11, 48(%rsp) + movq %r12, 56(%rsp) + # Multiply + # A[0] * B[0] + movq 128(%rsp), %rdx + mulxq (%rsp), %r9, %r10 + # A[2] * B[0] + mulxq 16(%rsp), %r11, %r12 + # A[1] * B[0] + mulxq 8(%rsp), %rcx, %rbx + xorq %rbp, %rbp + adcxq %rcx, %r10 + # A[1] * B[3] + movq 152(%rsp), %rdx + mulxq 8(%rsp), %r13, %r14 + adcxq %rbx, %r11 + # A[0] * B[1] + movq 136(%rsp), %rdx + mulxq (%rsp), %rcx, %rbx + adoxq %rcx, %r10 + # A[2] * B[1] + mulxq 16(%rsp), %rcx, %r15 + adoxq %rbx, %r11 + adcxq %rcx, %r12 + # A[1] * B[2] + movq 144(%rsp), %rdx + mulxq 8(%rsp), %rcx, %rbx + adcxq %r15, %r13 + adoxq %rcx, %r12 + adcxq %rbp, %r14 + adoxq %rbx, %r13 + # A[0] * B[2] + mulxq (%rsp), %rcx, %rbx + adoxq %rbp, %r14 + xorq %r15, %r15 + adcxq %rcx, %r11 + # A[1] * B[1] + movq 136(%rsp), %rdx + mulxq 8(%rsp), %rdx, %rcx + adcxq %rbx, %r12 + adoxq %rdx, %r11 + # A[3] * B[1] + movq 136(%rsp), %rdx + adoxq %rcx, %r12 + mulxq 24(%rsp), %rcx, %rbx + adcxq %rcx, %r13 + # A[2] * B[2] + movq 144(%rsp), %rdx + mulxq 16(%rsp), %rdx, %rcx + adcxq %rbx, %r14 + adoxq %rdx, %r13 + # A[3] * B[3] + movq 152(%rsp), %rdx + adoxq %rcx, %r14 + mulxq 24(%rsp), %rcx, %rbx + adoxq %rbp, %r15 + adcxq %rcx, %r15 + # A[0] * B[3] + mulxq (%rsp), %rdx, %rcx + adcxq %rbx, %rbp + xorq %rbx, %rbx + adcxq %rdx, %r12 + # A[3] * B[0] + movq 128(%rsp), %rdx + adcxq %rcx, %r13 + mulxq 24(%rsp), %rdx, %rcx + adoxq %rdx, %r12 + adoxq %rcx, %r13 + # A[2] * B[3] + movq 152(%rsp), %rdx + mulxq 16(%rsp), %rdx, %rcx 
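+ # (Editorial note, added for readability) These BMI2/ADX multiplies keep
+ # two independent carry chains live at once: mulxq produces a 128-bit
+ # product without touching the flags, adcxq accumulates using only CF
+ # and adoxq using only OF, so the partial products of each ladder-step
+ # multiply can be interleaved without the flag serialisation of plain
+ # adcq.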
+ adcxq %rdx, %r14 + # A[3] * B[2] + movq 144(%rsp), %rdx + adcxq %rcx, %r15 + mulxq 24(%rsp), %rcx, %rdx + adcxq %rbx, %rbp + adoxq %rcx, %r14 + adoxq %rdx, %r15 + adoxq %rbx, %rbp + # Reduce + movq $0x7fffffffffffffff, %rbx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r15, %rbp + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + andq %rbx, %r12 + # Multiply top half by 19 + movq $19, %rdx + xorq %rbx, %rbx + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %rcx, %r15 + adcxq %rcx, %r11 + adoxq %r15, %r12 + mulxq %rbp, %rbp, %rdx + adcxq %rbp, %r12 + adoxq %rbx, %rdx + adcxq %rbx, %rdx + # Overflow + shldq $0x01, %r12, %rdx + movq $0x7fffffffffffffff, %rbx + imulq $19, %rdx, %rcx + andq %rbx, %r12 + addq %rcx, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Reduce if top bit set + movq %r12, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rbx, %r12 + addq %rcx, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Store + movq %r9, (%rsp) + movq %r10, 8(%rsp) + movq %r11, 16(%rsp) + movq %r12, 24(%rsp) + # Square + # A[0] * A[1] + movq 128(%rsp), %rdx + mulxq 136(%rsp), %r10, %r11 + # A[0] * A[3] + mulxq 152(%rsp), %r12, %r13 + # A[2] * A[1] + movq 144(%rsp), %rdx + mulxq 136(%rsp), %rcx, %rbx + xorq %rbp, %rbp + adoxq %rcx, %r12 + # A[2] * A[3] + mulxq 152(%rsp), %r14, %r15 + adoxq %rbx, %r13 + # A[2] * A[0] + mulxq 128(%rsp), %rcx, %rbx + adoxq %rbp, %r14 + adcxq %rcx, %r11 + adoxq %rbp, %r15 + # A[1] * A[3] + movq 136(%rsp), %rdx + mulxq 152(%rsp), %rax, %r9 + adcxq %rbx, %r12 + adcxq %rax, %r13 + adcxq %r9, %r14 + adcxq %rbp, %r15 + # Double with Carry Flag + xorq %rbp, %rbp + # A[0] * A[0] + movq 128(%rsp), %rdx + mulxq %rdx, %r9, %rax + adcxq %r10, %r10 + # A[1] * A[1] + movq 136(%rsp), %rdx + mulxq %rdx, %rcx, %rbx + adcxq %r11, %r11 + adoxq %rax, %r10 + adcxq %r12, %r12 + adoxq %rcx, %r11 + # A[2] * A[2] + movq 144(%rsp), %rdx + mulxq %rdx, %rax, %rcx + adcxq %r13, %r13 + adoxq %rbx, %r12 + adcxq %r14, %r14 + adoxq %rax, %r13 + # A[3] * A[3] + movq 152(%rsp), %rdx + mulxq %rdx, %rax, %rbx + adcxq %r15, %r15 + adoxq %rcx, %r14 + adcxq %rbp, %rbp + adoxq %rax, %r15 + adoxq %rbx, %rbp + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r15, %rbp + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + andq %rcx, %r12 + # Multiply top half by 19 + movq $19, %rdx + xorq %rcx, %rcx + mulxq %r13, %rax, %r13 + adcxq %rax, %r9 + adoxq %r13, %r10 + mulxq %r14, %rax, %r14 + adcxq %rax, %r10 + adoxq %r14, %r11 + mulxq %r15, %rax, %r15 + adcxq %rax, %r11 + adoxq %r15, %r12 + mulxq %rbp, %rbp, %rdx + adcxq %rbp, %r12 + adoxq %rcx, %rdx + adcxq %rcx, %rdx + # Overflow + shldq $0x01, %r12, %rdx + movq $0x7fffffffffffffff, %rcx + imulq $19, %rdx, %rax + andq %rcx, %r12 + addq %rax, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Reduce if top bit set + movq %r12, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r12 + addq %rax, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Store + movq %r9, 96(%rsp) + movq %r10, 104(%rsp) + movq %r11, 112(%rsp) + movq %r12, 120(%rsp) + # Square + # A[0] * A[1] + movq (%rdi), %rdx + mulxq 8(%rdi), %r10, %r11 + # A[0] * A[3] + mulxq 24(%rdi), %r12, %r13 + # A[2] * A[1] + movq 16(%rdi), %rdx + mulxq 8(%rdi), %rcx, %rbx + xorq %rbp, %rbp + adoxq %rcx, %r12 + # 
A[2] * A[3] + mulxq 24(%rdi), %r14, %r15 + adoxq %rbx, %r13 + # A[2] * A[0] + mulxq (%rdi), %rcx, %rbx + adoxq %rbp, %r14 + adcxq %rcx, %r11 + adoxq %rbp, %r15 + # A[1] * A[3] + movq 8(%rdi), %rdx + mulxq 24(%rdi), %rax, %r9 + adcxq %rbx, %r12 + adcxq %rax, %r13 + adcxq %r9, %r14 + adcxq %rbp, %r15 + # Double with Carry Flag + xorq %rbp, %rbp + # A[0] * A[0] + movq (%rdi), %rdx + mulxq %rdx, %r9, %rax + adcxq %r10, %r10 + # A[1] * A[1] + movq 8(%rdi), %rdx + mulxq %rdx, %rcx, %rbx + adcxq %r11, %r11 + adoxq %rax, %r10 + adcxq %r12, %r12 + adoxq %rcx, %r11 + # A[2] * A[2] + movq 16(%rdi), %rdx + mulxq %rdx, %rax, %rcx + adcxq %r13, %r13 + adoxq %rbx, %r12 + adcxq %r14, %r14 + adoxq %rax, %r13 + # A[3] * A[3] + movq 24(%rdi), %rdx + mulxq %rdx, %rax, %rbx + adcxq %r15, %r15 + adoxq %rcx, %r14 + adcxq %rbp, %rbp + adoxq %rax, %r15 + adoxq %rbx, %rbp + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r15, %rbp + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + andq %rcx, %r12 + # Multiply top half by 19 + movq $19, %rdx + xorq %rcx, %rcx + mulxq %r13, %rax, %r13 + adcxq %rax, %r9 + adoxq %r13, %r10 + mulxq %r14, %rax, %r14 + adcxq %rax, %r10 + adoxq %r14, %r11 + mulxq %r15, %rax, %r15 + adcxq %rax, %r11 + adoxq %r15, %r12 + mulxq %rbp, %rbp, %rdx + adcxq %rbp, %r12 + adoxq %rcx, %rdx + adcxq %rcx, %rdx + # Overflow + shldq $0x01, %r12, %rdx + movq $0x7fffffffffffffff, %rcx + imulq $19, %rdx, %rax + andq %rcx, %r12 + addq %rax, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Reduce if top bit set + movq %r12, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r12 + addq %rax, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Store + movq %r9, 128(%rsp) + movq %r10, 136(%rsp) + movq %r11, 144(%rsp) + movq %r12, 152(%rsp) + # Add + movq 32(%rsp), %r9 + movq 40(%rsp), %r10 + movq 48(%rsp), %r11 + movq 56(%rsp), %rax + movq %r9, %r13 + addq (%rsp), %r9 + movq %r10, %r14 + adcq 8(%rsp), %r10 + movq %r11, %r15 + adcq 16(%rsp), %r11 + movq %rax, %rbp + adcq 24(%rsp), %rax + movq $-19, %rcx + movq %rax, %r12 + movq $0x7fffffffffffffff, %rbx + sarq $63, %rax + # Mask the modulus + andq %rax, %rcx + andq %rax, %rbx + # Sub modulus (if overflow) + subq %rcx, %r9 + sbbq %rax, %r10 + sbbq %rax, %r11 + sbbq %rbx, %r12 + # Sub + subq (%rsp), %r13 + movq $0x00, %rax + sbbq 8(%rsp), %r14 + movq $-19, %rcx + sbbq 16(%rsp), %r15 + movq $0x7fffffffffffffff, %rbx + sbbq 24(%rsp), %rbp + sbbq $0x00, %rax + # Mask the modulus + andq %rax, %rcx + andq %rax, %rbx + # Add modulus (if underflow) + addq %rcx, %r13 + adcq %rax, %r14 + adcq %rax, %r15 + adcq %rbx, %rbp + movq %r9, 64(%rsp) + movq %r10, 72(%rsp) + movq %r11, 80(%rsp) + movq %r12, 88(%rsp) + movq %r13, (%rsp) + movq %r14, 8(%rsp) + movq %r15, 16(%rsp) + movq %rbp, 24(%rsp) + # Multiply + # A[0] * B[0] + movq 96(%rsp), %rdx + mulxq 128(%rsp), %r9, %r10 + # A[2] * B[0] + mulxq 144(%rsp), %r11, %r12 + # A[1] * B[0] + mulxq 136(%rsp), %rcx, %rbx + xorq %rbp, %rbp + adcxq %rcx, %r10 + # A[1] * B[3] + movq 120(%rsp), %rdx + mulxq 136(%rsp), %r13, %r14 + adcxq %rbx, %r11 + # A[0] * B[1] + movq 104(%rsp), %rdx + mulxq 128(%rsp), %rcx, %rbx + adoxq %rcx, %r10 + # A[2] * B[1] + mulxq 144(%rsp), %rcx, %r15 + adoxq %rbx, %r11 + adcxq %rcx, %r12 + # A[1] * B[2] + movq 112(%rsp), %rdx + mulxq 136(%rsp), %rcx, %rbx + adcxq %r15, %r13 + adoxq %rcx, %r12 + adcxq %rbp, %r14 + adoxq %rbx, %r13 + # A[0] * B[2] + mulxq 128(%rsp), %rcx, %rbx + 
adoxq %rbp, %r14 + xorq %r15, %r15 + adcxq %rcx, %r11 + # A[1] * B[1] + movq 104(%rsp), %rdx + mulxq 136(%rsp), %rdx, %rcx + adcxq %rbx, %r12 + adoxq %rdx, %r11 + # A[3] * B[1] + movq 104(%rsp), %rdx + adoxq %rcx, %r12 + mulxq 152(%rsp), %rcx, %rbx + adcxq %rcx, %r13 + # A[2] * B[2] + movq 112(%rsp), %rdx + mulxq 144(%rsp), %rdx, %rcx + adcxq %rbx, %r14 + adoxq %rdx, %r13 + # A[3] * B[3] + movq 120(%rsp), %rdx + adoxq %rcx, %r14 + mulxq 152(%rsp), %rcx, %rbx + adoxq %rbp, %r15 + adcxq %rcx, %r15 + # A[0] * B[3] + mulxq 128(%rsp), %rdx, %rcx + adcxq %rbx, %rbp + xorq %rbx, %rbx + adcxq %rdx, %r12 + # A[3] * B[0] + movq 96(%rsp), %rdx + adcxq %rcx, %r13 + mulxq 152(%rsp), %rdx, %rcx + adoxq %rdx, %r12 + adoxq %rcx, %r13 + # A[2] * B[3] + movq 120(%rsp), %rdx + mulxq 144(%rsp), %rdx, %rcx + adcxq %rdx, %r14 + # A[3] * B[2] + movq 112(%rsp), %rdx + adcxq %rcx, %r15 + mulxq 152(%rsp), %rcx, %rdx + adcxq %rbx, %rbp + adoxq %rcx, %r14 + adoxq %rdx, %r15 + adoxq %rbx, %rbp + # Reduce + movq $0x7fffffffffffffff, %rbx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r15, %rbp + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + andq %rbx, %r12 + # Multiply top half by 19 + movq $19, %rdx + xorq %rbx, %rbx + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %rcx, %r15 + adcxq %rcx, %r11 + adoxq %r15, %r12 + mulxq %rbp, %rbp, %rdx + adcxq %rbp, %r12 + adoxq %rbx, %rdx + adcxq %rbx, %rdx + # Overflow + shldq $0x01, %r12, %rdx + movq $0x7fffffffffffffff, %rbx + imulq $19, %rdx, %rcx + andq %rbx, %r12 + addq %rcx, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Reduce if top bit set + movq %r12, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rbx, %r12 + addq %rcx, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Store + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + # Sub + movq 128(%rsp), %r9 + movq 136(%rsp), %r10 + movq 144(%rsp), %r11 + movq 152(%rsp), %r12 + subq 96(%rsp), %r9 + movq $0x00, %rax + sbbq 104(%rsp), %r10 + movq $-19, %rcx + sbbq 112(%rsp), %r11 + movq $0x7fffffffffffffff, %rbx + sbbq 120(%rsp), %r12 + sbbq $0x00, %rax + # Mask the modulus + andq %rax, %rcx + andq %rax, %rbx + # Add modulus (if underflow) + addq %rcx, %r9 + adcq %rax, %r10 + adcq %rax, %r11 + adcq %rbx, %r12 + movq %r9, 128(%rsp) + movq %r10, 136(%rsp) + movq %r11, 144(%rsp) + movq %r12, 152(%rsp) + # Square + # A[0] * A[1] + movq (%rsp), %rdx + mulxq 8(%rsp), %r10, %r11 + # A[0] * A[3] + mulxq 24(%rsp), %r12, %r13 + # A[2] * A[1] + movq 16(%rsp), %rdx + mulxq 8(%rsp), %rcx, %rbx + xorq %rbp, %rbp + adoxq %rcx, %r12 + # A[2] * A[3] + mulxq 24(%rsp), %r14, %r15 + adoxq %rbx, %r13 + # A[2] * A[0] + mulxq (%rsp), %rcx, %rbx + adoxq %rbp, %r14 + adcxq %rcx, %r11 + adoxq %rbp, %r15 + # A[1] * A[3] + movq 8(%rsp), %rdx + mulxq 24(%rsp), %rax, %r9 + adcxq %rbx, %r12 + adcxq %rax, %r13 + adcxq %r9, %r14 + adcxq %rbp, %r15 + # Double with Carry Flag + xorq %rbp, %rbp + # A[0] * A[0] + movq (%rsp), %rdx + mulxq %rdx, %r9, %rax + adcxq %r10, %r10 + # A[1] * A[1] + movq 8(%rsp), %rdx + mulxq %rdx, %rcx, %rbx + adcxq %r11, %r11 + adoxq %rax, %r10 + adcxq %r12, %r12 + adoxq %rcx, %r11 + # A[2] * A[2] + movq 16(%rsp), %rdx + mulxq %rdx, %rax, %rcx + adcxq %r13, %r13 + adoxq %rbx, %r12 + adcxq %r14, %r14 + adoxq %rax, %r13 + # A[3] * A[3] + movq 24(%rsp), %rdx + mulxq %rdx, %rax, %rbx + adcxq %r15, %r15 + adoxq %rcx, %r14 + 
adcxq %rbp, %rbp + adoxq %rax, %r15 + adoxq %rbx, %rbp + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r15, %rbp + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + andq %rcx, %r12 + # Multiply top half by 19 + movq $19, %rdx + xorq %rcx, %rcx + mulxq %r13, %rax, %r13 + adcxq %rax, %r9 + adoxq %r13, %r10 + mulxq %r14, %rax, %r14 + adcxq %rax, %r10 + adoxq %r14, %r11 + mulxq %r15, %rax, %r15 + adcxq %rax, %r11 + adoxq %r15, %r12 + mulxq %rbp, %rbp, %rdx + adcxq %rbp, %r12 + adoxq %rcx, %rdx + adcxq %rcx, %rdx + # Overflow + shldq $0x01, %r12, %rdx + movq $0x7fffffffffffffff, %rcx + imulq $19, %rdx, %rax + andq %rcx, %r12 + addq %rax, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Reduce if top bit set + movq %r12, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r12 + addq %rax, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Store + movq %r9, (%rsp) + movq %r10, 8(%rsp) + movq %r11, 16(%rsp) + movq %r12, 24(%rsp) + movq $0x1db42, %rdx + mulxq 128(%rsp), %r9, %rbp + mulxq 136(%rsp), %r10, %r15 + mulxq 144(%rsp), %r11, %r14 + mulxq 152(%rsp), %r12, %r13 + addq %rbp, %r10 + adcq %r15, %r11 + adcq %r14, %r12 + adcq $0x00, %r13 + movq $0x7fffffffffffffff, %rbp + shldq $0x01, %r12, %r13 + andq %rbp, %r12 + imulq $19, %r13, %r13 + addq %r13, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + movq %r9, 32(%rsp) + movq %r10, 40(%rsp) + movq %r11, 48(%rsp) + movq %r12, 56(%rsp) + # Square + # A[0] * A[1] + movq 64(%rsp), %rdx + mulxq 72(%rsp), %r10, %r11 + # A[0] * A[3] + mulxq 88(%rsp), %r12, %r13 + # A[2] * A[1] + movq 80(%rsp), %rdx + mulxq 72(%rsp), %rcx, %rbx + xorq %rbp, %rbp + adoxq %rcx, %r12 + # A[2] * A[3] + mulxq 88(%rsp), %r14, %r15 + adoxq %rbx, %r13 + # A[2] * A[0] + mulxq 64(%rsp), %rcx, %rbx + adoxq %rbp, %r14 + adcxq %rcx, %r11 + adoxq %rbp, %r15 + # A[1] * A[3] + movq 72(%rsp), %rdx + mulxq 88(%rsp), %rax, %r9 + adcxq %rbx, %r12 + adcxq %rax, %r13 + adcxq %r9, %r14 + adcxq %rbp, %r15 + # Double with Carry Flag + xorq %rbp, %rbp + # A[0] * A[0] + movq 64(%rsp), %rdx + mulxq %rdx, %r9, %rax + adcxq %r10, %r10 + # A[1] * A[1] + movq 72(%rsp), %rdx + mulxq %rdx, %rcx, %rbx + adcxq %r11, %r11 + adoxq %rax, %r10 + adcxq %r12, %r12 + adoxq %rcx, %r11 + # A[2] * A[2] + movq 80(%rsp), %rdx + mulxq %rdx, %rax, %rcx + adcxq %r13, %r13 + adoxq %rbx, %r12 + adcxq %r14, %r14 + adoxq %rax, %r13 + # A[3] * A[3] + movq 88(%rsp), %rdx + mulxq %rdx, %rax, %rbx + adcxq %r15, %r15 + adoxq %rcx, %r14 + adcxq %rbp, %rbp + adoxq %rax, %r15 + adoxq %rbx, %rbp + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r15, %rbp + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + andq %rcx, %r12 + # Multiply top half by 19 + movq $19, %rdx + xorq %rcx, %rcx + mulxq %r13, %rax, %r13 + adcxq %rax, %r9 + adoxq %r13, %r10 + mulxq %r14, %rax, %r14 + adcxq %rax, %r10 + adoxq %r14, %r11 + mulxq %r15, %rax, %r15 + adcxq %rax, %r11 + adoxq %r15, %r12 + mulxq %rbp, %rbp, %rdx + adcxq %rbp, %r12 + adoxq %rcx, %rdx + adcxq %rcx, %rdx + # Overflow + shldq $0x01, %r12, %rdx + movq $0x7fffffffffffffff, %rcx + imulq $19, %rdx, %rax + andq %rcx, %r12 + addq %rax, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Reduce if top bit set + movq %r12, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rax + andq %rcx, %r12 + addq %rax, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq 
$0x00, %r12 + # Store + movq %r9, 64(%rsp) + movq %r10, 72(%rsp) + movq %r11, 80(%rsp) + movq %r12, 88(%rsp) + # Add + movq 96(%rsp), %r9 + movq 104(%rsp), %r10 + addq 32(%rsp), %r9 + movq 112(%rsp), %r11 + adcq 40(%rsp), %r10 + movq 120(%rsp), %rax + adcq 48(%rsp), %r11 + movq $-19, %rcx + adcq 56(%rsp), %rax + movq $0x7fffffffffffffff, %rbx + movq %rax, %r12 + sarq $63, %rax + # Mask the modulus + andq %rax, %rcx + andq %rax, %rbx + # Sub modulus (if overflow) + subq %rcx, %r9 + sbbq %rax, %r10 + sbbq %rax, %r11 + sbbq %rbx, %r12 + movq %r9, 96(%rsp) + movq %r10, 104(%rsp) + movq %r11, 112(%rsp) + movq %r12, 120(%rsp) + # Multiply + # A[0] * B[0] + movq (%rsp), %rdx + mulxq (%r8), %r9, %r10 + # A[2] * B[0] + mulxq 16(%r8), %r11, %r12 + # A[1] * B[0] + mulxq 8(%r8), %rcx, %rbx + xorq %rbp, %rbp + adcxq %rcx, %r10 + # A[1] * B[3] + movq 24(%rsp), %rdx + mulxq 8(%r8), %r13, %r14 + adcxq %rbx, %r11 + # A[0] * B[1] + movq 8(%rsp), %rdx + mulxq (%r8), %rcx, %rbx + adoxq %rcx, %r10 + # A[2] * B[1] + mulxq 16(%r8), %rcx, %r15 + adoxq %rbx, %r11 + adcxq %rcx, %r12 + # A[1] * B[2] + movq 16(%rsp), %rdx + mulxq 8(%r8), %rcx, %rbx + adcxq %r15, %r13 + adoxq %rcx, %r12 + adcxq %rbp, %r14 + adoxq %rbx, %r13 + # A[0] * B[2] + mulxq (%r8), %rcx, %rbx + adoxq %rbp, %r14 + xorq %r15, %r15 + adcxq %rcx, %r11 + # A[1] * B[1] + movq 8(%rsp), %rdx + mulxq 8(%r8), %rdx, %rcx + adcxq %rbx, %r12 + adoxq %rdx, %r11 + # A[3] * B[1] + movq 8(%rsp), %rdx + adoxq %rcx, %r12 + mulxq 24(%r8), %rcx, %rbx + adcxq %rcx, %r13 + # A[2] * B[2] + movq 16(%rsp), %rdx + mulxq 16(%r8), %rdx, %rcx + adcxq %rbx, %r14 + adoxq %rdx, %r13 + # A[3] * B[3] + movq 24(%rsp), %rdx + adoxq %rcx, %r14 + mulxq 24(%r8), %rcx, %rbx + adoxq %rbp, %r15 + adcxq %rcx, %r15 + # A[0] * B[3] + mulxq (%r8), %rdx, %rcx + adcxq %rbx, %rbp + xorq %rbx, %rbx + adcxq %rdx, %r12 + # A[3] * B[0] + movq (%rsp), %rdx + adcxq %rcx, %r13 + mulxq 24(%r8), %rdx, %rcx + adoxq %rdx, %r12 + adoxq %rcx, %r13 + # A[2] * B[3] + movq 24(%rsp), %rdx + mulxq 16(%r8), %rdx, %rcx + adcxq %rdx, %r14 + # A[3] * B[2] + movq 16(%rsp), %rdx + adcxq %rcx, %r15 + mulxq 24(%r8), %rcx, %rdx + adcxq %rbx, %rbp + adoxq %rcx, %r14 + adoxq %rdx, %r15 + adoxq %rbx, %rbp + # Reduce + movq $0x7fffffffffffffff, %rbx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r15, %rbp + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + andq %rbx, %r12 + # Multiply top half by 19 + movq $19, %rdx + xorq %rbx, %rbx + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %rcx, %r15 + adcxq %rcx, %r11 + adoxq %r15, %r12 + mulxq %rbp, %rbp, %rdx + adcxq %rbp, %r12 + adoxq %rbx, %rdx + adcxq %rbx, %rdx + # Overflow + shldq $0x01, %r12, %rdx + movq $0x7fffffffffffffff, %rbx + imulq $19, %rdx, %rcx + andq %rbx, %r12 + addq %rcx, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Reduce if top bit set + movq %r12, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rbx, %r12 + addq %rcx, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Store + movq %r9, 32(%rsp) + movq %r10, 40(%rsp) + movq %r11, 48(%rsp) + movq %r12, 56(%rsp) + # Multiply + # A[0] * B[0] + movq 96(%rsp), %rdx + mulxq 128(%rsp), %r9, %r10 + # A[2] * B[0] + mulxq 144(%rsp), %r11, %r12 + # A[1] * B[0] + mulxq 136(%rsp), %rcx, %rbx + xorq %rbp, %rbp + adcxq %rcx, %r10 + # A[1] * B[3] + movq 120(%rsp), %rdx + mulxq 136(%rsp), %r13, %r14 + adcxq %rbx, %r11 + # A[0] * B[1] + movq 
104(%rsp), %rdx + mulxq 128(%rsp), %rcx, %rbx + adoxq %rcx, %r10 + # A[2] * B[1] + mulxq 144(%rsp), %rcx, %r15 + adoxq %rbx, %r11 + adcxq %rcx, %r12 + # A[1] * B[2] + movq 112(%rsp), %rdx + mulxq 136(%rsp), %rcx, %rbx + adcxq %r15, %r13 + adoxq %rcx, %r12 + adcxq %rbp, %r14 + adoxq %rbx, %r13 + # A[0] * B[2] + mulxq 128(%rsp), %rcx, %rbx + adoxq %rbp, %r14 + xorq %r15, %r15 + adcxq %rcx, %r11 + # A[1] * B[1] + movq 104(%rsp), %rdx + mulxq 136(%rsp), %rdx, %rcx + adcxq %rbx, %r12 + adoxq %rdx, %r11 + # A[3] * B[1] + movq 104(%rsp), %rdx + adoxq %rcx, %r12 + mulxq 152(%rsp), %rcx, %rbx + adcxq %rcx, %r13 + # A[2] * B[2] + movq 112(%rsp), %rdx + mulxq 144(%rsp), %rdx, %rcx + adcxq %rbx, %r14 + adoxq %rdx, %r13 + # A[3] * B[3] + movq 120(%rsp), %rdx + adoxq %rcx, %r14 + mulxq 152(%rsp), %rcx, %rbx + adoxq %rbp, %r15 + adcxq %rcx, %r15 + # A[0] * B[3] + mulxq 128(%rsp), %rdx, %rcx + adcxq %rbx, %rbp + xorq %rbx, %rbx + adcxq %rdx, %r12 + # A[3] * B[0] + movq 96(%rsp), %rdx + adcxq %rcx, %r13 + mulxq 152(%rsp), %rdx, %rcx + adoxq %rdx, %r12 + adoxq %rcx, %r13 + # A[2] * B[3] + movq 120(%rsp), %rdx + mulxq 144(%rsp), %rdx, %rcx + adcxq %rdx, %r14 + # A[3] * B[2] + movq 112(%rsp), %rdx + adcxq %rcx, %r15 + mulxq 152(%rsp), %rcx, %rdx + adcxq %rbx, %rbp + adoxq %rcx, %r14 + adoxq %rdx, %r15 + adoxq %rbx, %rbp + # Reduce + movq $0x7fffffffffffffff, %rbx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r15, %rbp + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + andq %rbx, %r12 + # Multiply top half by 19 + movq $19, %rdx + xorq %rbx, %rbx + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %rcx, %r15 + adcxq %rcx, %r11 + adoxq %r15, %r12 + mulxq %rbp, %rbp, %rdx + adcxq %rbp, %r12 + adoxq %rbx, %rdx + adcxq %rbx, %rdx + # Overflow + shldq $0x01, %r12, %rdx + movq $0x7fffffffffffffff, %rbx + imulq $19, %rdx, %rcx + andq %rbx, %r12 + addq %rcx, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Reduce if top bit set + movq %r12, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rbx, %r12 + addq %rcx, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Store + movq %r9, (%rsp) + movq %r10, 8(%rsp) + movq %r11, 16(%rsp) + movq %r12, 24(%rsp) + decb 168(%rsp) + jge L_curve25519_avx2_bits + movq $63, 168(%rsp) + decb 160(%rsp) + jge L_curve25519_avx2_words + # Invert + leaq 32(%rsp), %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + movq %rsp, %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + leaq 96(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + 
callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $4, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $9, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 128(%rsp), %rdi + leaq 96(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 128(%rsp), %rdi + leaq 128(%rsp), %rsi + movq $19, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 128(%rsp), %rsi + leaq 96(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $9, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $49, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 128(%rsp), %rdi + leaq 96(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 128(%rsp), %rdi + leaq 128(%rsp), %rsi + movq $0x63, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 128(%rsp), %rsi + leaq 96(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 96(%rsp), %rdi + leaq 96(%rsp), %rsi + movq $49, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 96(%rsp), %rsi + leaq 64(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + movq $4, %rdx +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + movq %rsp, %rdi + leaq 64(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq 
_fe_mul_avx2 +#endif /* __APPLE__ */ + movq 176(%rsp), %rdi + # Multiply + # A[0] * B[0] + movq (%rsp), %rdx + mulxq (%rdi), %r9, %r10 + # A[2] * B[0] + mulxq 16(%rdi), %r11, %r12 + # A[1] * B[0] + mulxq 8(%rdi), %rcx, %rbx + xorq %rbp, %rbp + adcxq %rcx, %r10 + # A[1] * B[3] + movq 24(%rsp), %rdx + mulxq 8(%rdi), %r13, %r14 + adcxq %rbx, %r11 + # A[0] * B[1] + movq 8(%rsp), %rdx + mulxq (%rdi), %rcx, %rbx + adoxq %rcx, %r10 + # A[2] * B[1] + mulxq 16(%rdi), %rcx, %r15 + adoxq %rbx, %r11 + adcxq %rcx, %r12 + # A[1] * B[2] + movq 16(%rsp), %rdx + mulxq 8(%rdi), %rcx, %rbx + adcxq %r15, %r13 + adoxq %rcx, %r12 + adcxq %rbp, %r14 + adoxq %rbx, %r13 + # A[0] * B[2] + mulxq (%rdi), %rcx, %rbx + adoxq %rbp, %r14 + xorq %r15, %r15 + adcxq %rcx, %r11 + # A[1] * B[1] + movq 8(%rsp), %rdx + mulxq 8(%rdi), %rdx, %rcx + adcxq %rbx, %r12 + adoxq %rdx, %r11 + # A[3] * B[1] + movq 8(%rsp), %rdx + adoxq %rcx, %r12 + mulxq 24(%rdi), %rcx, %rbx + adcxq %rcx, %r13 + # A[2] * B[2] + movq 16(%rsp), %rdx + mulxq 16(%rdi), %rdx, %rcx + adcxq %rbx, %r14 + adoxq %rdx, %r13 + # A[3] * B[3] + movq 24(%rsp), %rdx + adoxq %rcx, %r14 + mulxq 24(%rdi), %rcx, %rbx + adoxq %rbp, %r15 + adcxq %rcx, %r15 + # A[0] * B[3] + mulxq (%rdi), %rdx, %rcx + adcxq %rbx, %rbp + xorq %rbx, %rbx + adcxq %rdx, %r12 + # A[3] * B[0] + movq (%rsp), %rdx + adcxq %rcx, %r13 + mulxq 24(%rdi), %rdx, %rcx + adoxq %rdx, %r12 + adoxq %rcx, %r13 + # A[2] * B[3] + movq 24(%rsp), %rdx + mulxq 16(%rdi), %rdx, %rcx + adcxq %rdx, %r14 + # A[3] * B[2] + movq 16(%rsp), %rdx + adcxq %rcx, %r15 + mulxq 24(%rdi), %rcx, %rdx + adcxq %rbx, %rbp + adoxq %rcx, %r14 + adoxq %rdx, %r15 + adoxq %rbx, %rbp + # Reduce + movq $0x7fffffffffffffff, %rbx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r15, %rbp + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + andq %rbx, %r12 + # Multiply top half by 19 + movq $19, %rdx + xorq %rbx, %rbx + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %rcx, %r15 + adcxq %rcx, %r11 + adoxq %r15, %r12 + mulxq %rbp, %rbp, %rdx + adcxq %rbp, %r12 + adoxq %rbx, %rdx + adcxq %rbx, %rdx + # Overflow + shldq $0x01, %r12, %rdx + movq $0x7fffffffffffffff, %rbx + imulq $19, %rdx, %rcx + andq %rbx, %r12 + addq %rcx, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Reduce if top bit set + movq %r12, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rbx, %r12 + addq %rcx, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + # Store + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + xorq %rax, %rax + addq $0xc0, %rsp + popq %rbp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + repz retq +#ifndef __APPLE__ +.size curve25519_avx2,.-curve25519_avx2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_pow22523_avx2 +.type fe_pow22523_avx2,@function +.align 4 +fe_pow22523_avx2: +#else +.section __TEXT,__text +.globl _fe_pow22523_avx2 +.p2align 2 +_fe_pow22523_avx2: +#endif /* __APPLE__ */ + subq $0x70, %rsp + # pow22523 + movq %rdi, 96(%rsp) + movq %rsi, 104(%rsp) + movq %rsp, %rdi + movq 104(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq 
_fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + movq 104(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + movq %rsp, %rdi + movq %rsp, %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + movq %rsp, %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + movq %rsp, %rdi + leaq 32(%rsp), %rsi + movq %rsp, %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + movb $4, %dl +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + movq %rsp, %rdi + leaq 32(%rsp), %rsi + movq %rsp, %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + movb $9, %dl +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + movq %rsp, %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + movb $19, %dl +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 64(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + movb $9, %dl +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + movq %rsp, %rdi + leaq 32(%rsp), %rsi + movq %rsp, %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + movb $49, %dl +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + movq %rsp, %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 64(%rsp), %rdi + leaq 64(%rsp), %rsi + movb $0x63, %dl +#ifndef __APPLE__ + callq fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 64(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + movb $49, %dl +#ifndef __APPLE__ + callq 
fe_sq_n_avx2@plt +#else + callq _fe_sq_n_avx2 +#endif /* __APPLE__ */ + movq %rsp, %rdi + leaq 32(%rsp), %rsi + movq %rsp, %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + movq %rsp, %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + movq %rsp, %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_avx2@plt +#else + callq _fe_sq_avx2 +#endif /* __APPLE__ */ + movq 96(%rsp), %rdi + movq %rsp, %rsi + movq 104(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_avx2@plt +#else + callq _fe_mul_avx2 +#endif /* __APPLE__ */ + movq 104(%rsp), %rsi + movq 96(%rsp), %rdi + addq $0x70, %rsp + repz retq +#ifndef __APPLE__ +.text +.globl fe_ge_to_p2_avx2 +.type fe_ge_to_p2_avx2,@function +.align 4 +fe_ge_to_p2_avx2: +#else +.section __TEXT,__text +.globl _fe_ge_to_p2_avx2 +.p2align 2 +_fe_ge_to_p2_avx2: +#endif /* __APPLE__ */ + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $40, %rsp + movq %rsi, (%rsp) + movq %rdx, 8(%rsp) + movq %rcx, 16(%rsp) + movq %r8, 24(%rsp) + movq %r9, 32(%rsp) + movq 16(%rsp), %rsi + movq 88(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rdx + mulxq (%rsi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rsi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rsi), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rbx), %rdx + mulxq 8(%rsi), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rbx), %rdx + mulxq (%rsi), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rsi), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rbx), %rdx + mulxq 8(%rsi), %rcx, %rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq (%rsi), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rbx), %rdx + mulxq 8(%rsi), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rbx), %rdx + adoxq %rcx, %r11 + mulxq 24(%rsi), %rcx, %rax + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rbx), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rbx), %rdx + adoxq %rcx, %r13 + mulxq 24(%rsi), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rsi), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rbx), %rdx + adcxq %rcx, %r12 + mulxq 24(%rsi), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[2] * B[3] + movq 24(%rbx), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rbx), %rdx + adcxq %rcx, %r14 + mulxq 24(%rsi), %rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + 
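+ # (bit 255 of the result is extracted below, cleared from %r11, and
+ # added back in as 19, since 2^255 = 19 mod p = 2^255 - 19)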
shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq (%rsp), %rdi + movq 24(%rsp), %rsi + movq 32(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rdx + mulxq (%rsi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rsi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rsi), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rbx), %rdx + mulxq 8(%rsi), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rbx), %rdx + mulxq (%rsi), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rsi), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rbx), %rdx + mulxq 8(%rsi), %rcx, %rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq (%rsi), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rbx), %rdx + mulxq 8(%rsi), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rbx), %rdx + adoxq %rcx, %r11 + mulxq 24(%rsi), %rcx, %rax + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rbx), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rbx), %rdx + adoxq %rcx, %r13 + mulxq 24(%rsi), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rsi), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rbx), %rdx + adcxq %rcx, %r12 + mulxq 24(%rsi), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[2] * B[3] + movq 24(%rbx), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rbx), %rdx + adcxq %rcx, %r14 + mulxq 24(%rsi), %rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rdi + movq 88(%rsp), %rsi + # Multiply + # A[0] * B[0] + movq (%rsi), %rdx + mulxq (%rbx), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rbx), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rbx), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rsi), %rdx + mulxq 8(%rbx), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rsi), %rdx + mulxq (%rbx), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rbx), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rsi), %rdx + mulxq 8(%rbx), %rcx, %rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq 
(%rbx), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rsi), %rdx + mulxq 8(%rbx), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rsi), %rdx + adoxq %rcx, %r11 + mulxq 24(%rbx), %rcx, %rax + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rsi), %rdx + mulxq 16(%rbx), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rsi), %rdx + adoxq %rcx, %r13 + mulxq 24(%rbx), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rbx), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rsi), %rdx + adcxq %rcx, %r12 + mulxq 24(%rbx), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[2] * B[3] + movq 24(%rsi), %rdx + mulxq 16(%rbx), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rsi), %rdx + adcxq %rcx, %r14 + mulxq 24(%rbx), %rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + addq $40, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + repz retq +#ifndef __APPLE__ +.size fe_ge_to_p2_avx2,.-fe_ge_to_p2_avx2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_ge_to_p3_avx2 +.type fe_ge_to_p3_avx2,@function +.align 4 +fe_ge_to_p3_avx2: +#else +.section __TEXT,__text +.globl _fe_ge_to_p3_avx2 +.p2align 2 +_fe_ge_to_p3_avx2: +#endif /* __APPLE__ */ + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $40, %rsp + movq %rsi, (%rsp) + movq %rdx, 8(%rsp) + movq %rcx, 16(%rsp) + movq %r8, 24(%rsp) + movq %r9, 32(%rsp) + movq 24(%rsp), %rsi + movq 96(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rdx + mulxq (%rsi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rsi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rsi), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rbx), %rdx + mulxq 8(%rsi), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rbx), %rdx + mulxq (%rsi), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rsi), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rbx), %rdx + mulxq 8(%rsi), %rcx, %rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq (%rsi), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rbx), %rdx + mulxq 8(%rsi), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rbx), %rdx + adoxq %rcx, %r11 + mulxq 24(%rsi), %rcx, %rax + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rbx), 
%rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rbx), %rdx + adoxq %rcx, %r13 + mulxq 24(%rsi), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rsi), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rbx), %rdx + adcxq %rcx, %r12 + mulxq 24(%rsi), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[2] * B[3] + movq 24(%rbx), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rbx), %rdx + adcxq %rcx, %r14 + mulxq 24(%rsi), %rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq (%rsp), %rdi + movq 32(%rsp), %rsi + movq 88(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rdx + mulxq (%rsi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rsi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rsi), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rbx), %rdx + mulxq 8(%rsi), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rbx), %rdx + mulxq (%rsi), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rsi), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rbx), %rdx + mulxq 8(%rsi), %rcx, %rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq (%rsi), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rbx), %rdx + mulxq 8(%rsi), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rbx), %rdx + adoxq %rcx, %r11 + mulxq 24(%rsi), %rcx, %rax + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rbx), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rbx), %rdx + adoxq %rcx, %r13 + mulxq 24(%rsi), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rsi), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rbx), %rdx + adcxq %rcx, %r12 + mulxq 24(%rsi), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[2] * B[3] + movq 24(%rbx), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rbx), %rdx + adcxq %rcx, %r14 + mulxq 24(%rsi), %rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 
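+ # At this point %r12-%r15 hold floor(product / 2^255) and %r8-%r11 hold
+ # product mod 2^255; because 2^255 = 19 (mod p = 2^255 - 19), the high
+ # half is folded back in by multiplying it by 19 and accumulating.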
+ # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rdi + movq 96(%rsp), %rsi + # Multiply + # A[0] * B[0] + movq (%rsi), %rdx + mulxq (%rbx), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rbx), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rbx), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rsi), %rdx + mulxq 8(%rbx), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rsi), %rdx + mulxq (%rbx), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rbx), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rsi), %rdx + mulxq 8(%rbx), %rcx, %rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq (%rbx), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rsi), %rdx + mulxq 8(%rbx), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rsi), %rdx + adoxq %rcx, %r11 + mulxq 24(%rbx), %rcx, %rax + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rsi), %rdx + mulxq 16(%rbx), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rsi), %rdx + adoxq %rcx, %r13 + mulxq 24(%rbx), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rbx), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rsi), %rdx + adcxq %rcx, %r12 + mulxq 24(%rbx), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[2] * B[3] + movq 24(%rsi), %rdx + mulxq 16(%rbx), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rsi), %rdx + adcxq %rcx, %r14 + mulxq 24(%rbx), %rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 16(%rsp), %rdi + movq 24(%rsp), %rsi + movq 32(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rdx + 
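+ # (mulx takes one factor implicitly from %rdx and leaves the flags
+ # untouched, so the partial products can be accumulated on two
+ # independent carry chains: adcx uses only CF, adox uses only OF)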
mulxq (%rsi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rsi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rsi), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rbx), %rdx + mulxq 8(%rsi), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rbx), %rdx + mulxq (%rsi), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rsi), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rbx), %rdx + mulxq 8(%rsi), %rcx, %rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq (%rsi), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rbx), %rdx + mulxq 8(%rsi), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rbx), %rdx + adoxq %rcx, %r11 + mulxq 24(%rsi), %rcx, %rax + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rbx), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rbx), %rdx + adoxq %rcx, %r13 + mulxq 24(%rsi), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rsi), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rbx), %rdx + adcxq %rcx, %r12 + mulxq 24(%rsi), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[2] * B[3] + movq 24(%rbx), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rbx), %rdx + adcxq %rcx, %r14 + mulxq 24(%rsi), %rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + addq $40, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + repz retq +#ifndef __APPLE__ +.size fe_ge_to_p3_avx2,.-fe_ge_to_p3_avx2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_ge_dbl_avx2 +.type fe_ge_dbl_avx2,@function +.align 4 +fe_ge_dbl_avx2: +#else +.section __TEXT,__text +.globl _fe_ge_dbl_avx2 +.p2align 2 +_fe_ge_dbl_avx2: +#endif /* __APPLE__ */ + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $48, %rsp + movq %rdi, (%rsp) + movq %rsi, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rcx, 24(%rsp) + movq %r8, 32(%rsp) + movq %r9, 40(%rsp) + movq 32(%rsp), %rsi + # Square + # A[0] * A[1] + movq (%rsi), %rdx + mulxq 8(%rsi), %r9, %r10 + # A[0] * A[3] + mulxq 24(%rsi), %r11, %r12 + # A[2] * A[1] + movq 16(%rsi), %rdx + mulxq 8(%rsi), %rcx, %rax + xorq %r15, %r15 + adoxq %rcx, %r11 + # A[2] * A[3] + mulxq 24(%rsi), %r13, %r14 + adoxq %rax, %r12 + # A[2] * A[0] + mulxq (%rsi), %rcx, %rax + adoxq %r15, %r13 + adcxq %rcx, 
%r10 + adoxq %r15, %r14 + # A[1] * A[3] + movq 8(%rsi), %rdx + mulxq 24(%rsi), %rbp, %r8 + adcxq %rax, %r11 + adcxq %rbp, %r12 + adcxq %r8, %r13 + adcxq %r15, %r14 + # Double with Carry Flag + xorq %r15, %r15 + # A[0] * A[0] + movq (%rsi), %rdx + mulxq %rdx, %r8, %rbp + adcxq %r9, %r9 + # A[1] * A[1] + movq 8(%rsi), %rdx + mulxq %rdx, %rcx, %rax + adcxq %r10, %r10 + adoxq %rbp, %r9 + adcxq %r11, %r11 + adoxq %rcx, %r10 + # A[2] * A[2] + movq 16(%rsi), %rdx + mulxq %rdx, %rbp, %rcx + adcxq %r12, %r12 + adoxq %rax, %r11 + adcxq %r13, %r13 + adoxq %rbp, %r12 + # A[3] * A[3] + movq 24(%rsi), %rdx + mulxq %rdx, %rbp, %rax + adcxq %r14, %r14 + adoxq %rcx, %r13 + adcxq %r15, %r15 + adoxq %rbp, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rcx, %rcx + mulxq %r12, %rbp, %r12 + adcxq %rbp, %r8 + adoxq %r12, %r9 + mulxq %r13, %rbp, %r13 + adcxq %rbp, %r9 + adoxq %r13, %r10 + mulxq %r14, %rbp, %r14 + adcxq %rbp, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rcx, %rdx + adcxq %rcx, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rcx + imulq $19, %rdx, %rbp + andq %rcx, %r11 + addq %rbp, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rbp + andq %rcx, %r11 + addq %rbp, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 16(%rsp), %rdi + movq 40(%rsp), %rbx + # Square + # A[0] * A[1] + movq (%rbx), %rdx + mulxq 8(%rbx), %r9, %r10 + # A[0] * A[3] + mulxq 24(%rbx), %r11, %r12 + # A[2] * A[1] + movq 16(%rbx), %rdx + mulxq 8(%rbx), %rcx, %rax + xorq %r15, %r15 + adoxq %rcx, %r11 + # A[2] * A[3] + mulxq 24(%rbx), %r13, %r14 + adoxq %rax, %r12 + # A[2] * A[0] + mulxq (%rbx), %rcx, %rax + adoxq %r15, %r13 + adcxq %rcx, %r10 + adoxq %r15, %r14 + # A[1] * A[3] + movq 8(%rbx), %rdx + mulxq 24(%rbx), %rbp, %r8 + adcxq %rax, %r11 + adcxq %rbp, %r12 + adcxq %r8, %r13 + adcxq %r15, %r14 + # Double with Carry Flag + xorq %r15, %r15 + # A[0] * A[0] + movq (%rbx), %rdx + mulxq %rdx, %r8, %rbp + adcxq %r9, %r9 + # A[1] * A[1] + movq 8(%rbx), %rdx + mulxq %rdx, %rcx, %rax + adcxq %r10, %r10 + adoxq %rbp, %r9 + adcxq %r11, %r11 + adoxq %rcx, %r10 + # A[2] * A[2] + movq 16(%rbx), %rdx + mulxq %rdx, %rbp, %rcx + adcxq %r12, %r12 + adoxq %rax, %r11 + adcxq %r13, %r13 + adoxq %rbp, %r12 + # A[3] * A[3] + movq 24(%rbx), %rdx + mulxq %rdx, %rbp, %rax + adcxq %r14, %r14 + adoxq %rcx, %r13 + adcxq %r15, %r15 + adoxq %rbp, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rcx, %rcx + mulxq %r12, %rbp, %r12 + adcxq %rbp, %r8 + adoxq %r12, %r9 + mulxq %r13, %rbp, %r13 + adcxq %rbp, %r9 + adoxq %r13, %r10 + mulxq %r14, %rbp, %r14 + adcxq %rbp, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rcx, %rdx + adcxq %rcx, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rcx + imulq $19, %rdx, %rbp + andq %rcx, %r11 + addq %rbp, %r8 + adcq $0x00, 
%r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rbp + andq %rcx, %r11 + addq %rbp, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rdi + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + addq (%rbx), %r8 + movq 16(%rsi), %r10 + adcq 8(%rbx), %r9 + movq 24(%rsi), %rdx + adcq 16(%rbx), %r10 + movq $-19, %rcx + adcq 24(%rbx), %rdx + movq $0x7fffffffffffffff, %rax + movq %rdx, %r11 + sarq $63, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Sub modulus (if overflow) + subq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rdx, %r10 + sbbq %rax, %r11 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 24(%rsp), %rsi + # Square + # A[0] * A[1] + movq (%rdi), %rdx + mulxq 8(%rdi), %r9, %r10 + # A[0] * A[3] + mulxq 24(%rdi), %r11, %r12 + # A[2] * A[1] + movq 16(%rdi), %rdx + mulxq 8(%rdi), %rcx, %rax + xorq %r15, %r15 + adoxq %rcx, %r11 + # A[2] * A[3] + mulxq 24(%rdi), %r13, %r14 + adoxq %rax, %r12 + # A[2] * A[0] + mulxq (%rdi), %rcx, %rax + adoxq %r15, %r13 + adcxq %rcx, %r10 + adoxq %r15, %r14 + # A[1] * A[3] + movq 8(%rdi), %rdx + mulxq 24(%rdi), %rbp, %r8 + adcxq %rax, %r11 + adcxq %rbp, %r12 + adcxq %r8, %r13 + adcxq %r15, %r14 + # Double with Carry Flag + xorq %r15, %r15 + # A[0] * A[0] + movq (%rdi), %rdx + mulxq %rdx, %r8, %rbp + adcxq %r9, %r9 + # A[1] * A[1] + movq 8(%rdi), %rdx + mulxq %rdx, %rcx, %rax + adcxq %r10, %r10 + adoxq %rbp, %r9 + adcxq %r11, %r11 + adoxq %rcx, %r10 + # A[2] * A[2] + movq 16(%rdi), %rdx + mulxq %rdx, %rbp, %rcx + adcxq %r12, %r12 + adoxq %rax, %r11 + adcxq %r13, %r13 + adoxq %rbp, %r12 + # A[3] * A[3] + movq 24(%rdi), %rdx + mulxq %rdx, %rbp, %rax + adcxq %r14, %r14 + adoxq %rcx, %r13 + adcxq %r15, %r15 + adoxq %rbp, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rcx + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rcx, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rcx, %rcx + mulxq %r12, %rbp, %r12 + adcxq %rbp, %r8 + adoxq %r12, %r9 + mulxq %r13, %rbp, %r13 + adcxq %rbp, %r9 + adoxq %r13, %r10 + mulxq %r14, %rbp, %r14 + adcxq %rbp, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rcx, %rdx + adcxq %rcx, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rcx + imulq $19, %rdx, %rbp + andq %rcx, %r11 + addq %rbp, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rbp + andq %rcx, %r11 + addq %rbp, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rsi) + movq %r9, 8(%rsi) + movq %r10, 16(%rsi) + movq %r11, 24(%rsi) + movq 16(%rsp), %rsi + movq (%rsp), %rbx + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %rdx + movq %r8, %r12 + addq (%rbx), %r8 + movq %r9, %r13 + adcq 8(%rbx), %r9 + movq %r10, %r14 + adcq 16(%rbx), %r10 + movq %rdx, %r15 + adcq 24(%rbx), %rdx + movq $-19, %rcx + movq %rdx, %r11 + movq $0x7fffffffffffffff, %rax + sarq $63, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Sub modulus (if overflow) + subq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rdx, %r10 + sbbq %rax, %r11 + # Sub + subq (%rbx), %r12 + movq $0x00, %rdx + sbbq 8(%rbx), %r13 + movq $-19, 
%rcx + sbbq 16(%rbx), %r14 + movq $0x7fffffffffffffff, %rax + sbbq 24(%rbx), %r15 + sbbq $0x00, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Add modulus (if underflow) + addq %rcx, %r12 + adcq %rdx, %r13 + adcq %rdx, %r14 + adcq %rax, %r15 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq %r12, (%rsi) + movq %r13, 8(%rsi) + movq %r14, 16(%rsi) + movq %r15, 24(%rsi) + movq 24(%rsp), %rsi + # Sub + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + subq (%rdi), %r8 + movq $0x00, %rdx + sbbq 8(%rdi), %r9 + movq $-19, %rcx + sbbq 16(%rdi), %r10 + movq $0x7fffffffffffffff, %rax + sbbq 24(%rdi), %r11 + sbbq $0x00, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Add modulus (if underflow) + addq %rcx, %r8 + adcq %rdx, %r9 + adcq %rdx, %r10 + adcq %rax, %r11 + movq %r8, (%rbx) + movq %r9, 8(%rbx) + movq %r10, 16(%rbx) + movq %r11, 24(%rbx) + movq 104(%rsp), %rdi + # Square * 2 + # A[0] * A[1] + movq (%rdi), %rdx + mulxq 8(%rdi), %r9, %r10 + # A[0] * A[3] + mulxq 24(%rdi), %r11, %r12 + # A[2] * A[1] + movq 16(%rdi), %rdx + mulxq 8(%rdi), %rcx, %rax + xorq %r15, %r15 + adoxq %rcx, %r11 + # A[2] * A[3] + mulxq 24(%rdi), %r13, %r14 + adoxq %rax, %r12 + # A[2] * A[0] + mulxq (%rdi), %rcx, %rax + adoxq %r15, %r13 + adcxq %rcx, %r10 + adoxq %r15, %r14 + # A[1] * A[3] + movq 8(%rdi), %rdx + mulxq 24(%rdi), %rbp, %r8 + adcxq %rax, %r11 + adcxq %rbp, %r12 + adcxq %r8, %r13 + adcxq %r15, %r14 + # Double with Carry Flag + xorq %r15, %r15 + # A[0] * A[0] + movq (%rdi), %rdx + mulxq %rdx, %r8, %rbp + adcxq %r9, %r9 + # A[1] * A[1] + movq 8(%rdi), %rdx + mulxq %rdx, %rcx, %rax + adcxq %r10, %r10 + adoxq %rbp, %r9 + adcxq %r11, %r11 + adoxq %rcx, %r10 + # A[2] * A[2] + movq 16(%rdi), %rdx + mulxq %rdx, %rbp, %rcx + adcxq %r12, %r12 + adoxq %rax, %r11 + adcxq %r13, %r13 + adoxq %rbp, %r12 + # A[3] * A[3] + movq 24(%rdi), %rdx + mulxq %rdx, %rbp, %rax + adcxq %r14, %r14 + adoxq %rcx, %r13 + adcxq %r15, %r15 + adoxq %rbp, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + xorq %rbp, %rbp + # Move top half into t4-t7 and remove top bit from t3 and double + shldq $3, %r15, %rbp + shldq $2, %r14, %r15 + shldq $2, %r13, %r14 + shldq $2, %r12, %r13 + shldq $2, %r11, %r12 + shldq $0x01, %r10, %r11 + shldq $0x01, %r9, %r10 + shldq $0x01, %r8, %r9 + shlq $0x01, %r8 + andq %rax, %r11 + # Two out left, one in right + andq %rax, %r15 + # Multiply top bits by 19*19 + imulq $0x169, %rbp, %rcx + xorq %rax, %rax + # Multiply top half by 19 + movq $19, %rdx + adoxq %rcx, %r8 + mulxq %r12, %rbp, %r12 + adcxq %rbp, %r8 + adoxq %r12, %r9 + mulxq %r13, %rbp, %r13 + adcxq %rbp, %r9 + adoxq %r13, %r10 + mulxq %r14, %rbp, %r14 + adcxq %rbp, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rbp + andq %rax, %r11 + addq %rbp, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rbp + andq %rax, %r11 + addq %rbp, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rsi) + movq %r9, 8(%rsi) + movq %r10, 16(%rsi) + movq %r11, 24(%rsi) + movq 16(%rsp), %rdi + # Sub + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + subq (%rdi), %r8 + movq $0x00, %rdx + sbbq 8(%rdi), %r9 + movq $-19, %rcx + sbbq 16(%rdi), %r10 + movq 
$0x7fffffffffffffff, %rax + sbbq 24(%rdi), %r11 + sbbq $0x00, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Add modulus (if underflow) + addq %rcx, %r8 + adcq %rdx, %r9 + adcq %rdx, %r10 + adcq %rax, %r11 + movq %r8, (%rsi) + movq %r9, 8(%rsi) + movq %r10, 16(%rsi) + movq %r11, 24(%rsi) + addq $48, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + repz retq +#ifndef __APPLE__ +.size fe_ge_dbl_avx2,.-fe_ge_dbl_avx2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_ge_madd_avx2 +.type fe_ge_madd_avx2,@function +.align 4 +fe_ge_madd_avx2: +#else +.section __TEXT,__text +.globl _fe_ge_madd_avx2 +.p2align 2 +_fe_ge_madd_avx2: +#endif /* __APPLE__ */ + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $48, %rsp + movq %rdi, (%rsp) + movq %rsi, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rcx, 24(%rsp) + movq %r8, 32(%rsp) + movq %r9, 40(%rsp) + movq 8(%rsp), %rsi + movq 40(%rsp), %rbx + movq 32(%rsp), %rbp + # Add + movq (%rbx), %r8 + movq 8(%rbx), %r9 + movq 16(%rbx), %r10 + movq 24(%rbx), %rdx + movq %r8, %r12 + addq (%rbp), %r8 + movq %r9, %r13 + adcq 8(%rbp), %r9 + movq %r10, %r14 + adcq 16(%rbp), %r10 + movq %rdx, %r15 + adcq 24(%rbp), %rdx + movq $-19, %rcx + movq %rdx, %r11 + movq $0x7fffffffffffffff, %rax + sarq $63, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Sub modulus (if overflow) + subq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rdx, %r10 + sbbq %rax, %r11 + # Sub + subq (%rbp), %r12 + movq $0x00, %rdx + sbbq 8(%rbp), %r13 + movq $-19, %rcx + sbbq 16(%rbp), %r14 + movq $0x7fffffffffffffff, %rax + sbbq 24(%rbp), %r15 + sbbq $0x00, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Add modulus (if underflow) + addq %rcx, %r12 + adcq %rdx, %r13 + adcq %rdx, %r14 + adcq %rax, %r15 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq %r12, (%rsi) + movq %r13, 8(%rsi) + movq %r14, 16(%rsi) + movq %r15, 24(%rsi) + movq 16(%rsp), %rbx + movq 128(%rsp), %rbp + # Multiply + # A[0] * B[0] + movq (%rbp), %rdx + mulxq (%rdi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rdi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rdi), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rbp), %rdx + mulxq 8(%rdi), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rbp), %rdx + mulxq (%rdi), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rdi), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rbp), %rdx + mulxq 8(%rdi), %rcx, %rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq (%rdi), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rbp), %rdx + mulxq 8(%rdi), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rbp), %rdx + adoxq %rcx, %r11 + mulxq 24(%rdi), %rcx, %rax + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rbp), %rdx + mulxq 16(%rdi), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rbp), %rdx + adoxq %rcx, %r13 + mulxq 24(%rdi), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rdi), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rbp), %rdx + adcxq %rcx, %r12 + mulxq 24(%rdi), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[2] * B[3] + movq 24(%rbp), %rdx + mulxq 16(%rdi), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rbp), %rdx + adcxq %rcx, %r14 + mulxq 24(%rdi), 
%rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rbx) + movq %r9, 8(%rbx) + movq %r10, 16(%rbx) + movq %r11, 24(%rbx) + movq 136(%rsp), %rdi + # Multiply + # A[0] * B[0] + movq (%rdi), %rdx + mulxq (%rsi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rsi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rsi), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rdi), %rdx + mulxq 8(%rsi), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rdi), %rdx + mulxq (%rsi), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rsi), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rdi), %rdx + mulxq 8(%rsi), %rcx, %rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq (%rsi), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rdi), %rdx + mulxq 8(%rsi), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rdi), %rdx + adoxq %rcx, %r11 + mulxq 24(%rsi), %rcx, %rax + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rdi), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rdi), %rdx + adoxq %rcx, %r13 + mulxq 24(%rsi), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rsi), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rdi), %rdx + adcxq %rcx, %r12 + mulxq 24(%rsi), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[2] * B[3] + movq 24(%rdi), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rdi), %rdx + adcxq %rcx, %r14 + mulxq 24(%rsi), %rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, 
%r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rsi) + movq %r9, 8(%rsi) + movq %r10, 16(%rsi) + movq %r11, 24(%rsi) + movq 24(%rsp), %rdi + movq 120(%rsp), %rsi + movq 112(%rsp), %rbp + # Multiply + # A[0] * B[0] + movq (%rbp), %rdx + mulxq (%rsi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rsi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rsi), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rbp), %rdx + mulxq 8(%rsi), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rbp), %rdx + mulxq (%rsi), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rsi), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rbp), %rdx + mulxq 8(%rsi), %rcx, %rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq (%rsi), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rbp), %rdx + mulxq 8(%rsi), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rbp), %rdx + adoxq %rcx, %r11 + mulxq 24(%rsi), %rcx, %rax + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rbp), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rbp), %rdx + adoxq %rcx, %r13 + mulxq 24(%rsi), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rsi), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rbp), %rdx + adcxq %rcx, %r12 + mulxq 24(%rsi), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[2] * B[3] + movq 24(%rbp), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rbp), %rdx + adcxq %rcx, %r14 + mulxq 24(%rsi), %rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rdi + movq (%rsp), %rsi + # Add + movq (%rbx), %r8 + movq 8(%rbx), %r9 + movq 16(%rbx), %r10 + movq 24(%rbx), %rdx + movq %r8, %r12 + addq (%rdi), %r8 + movq %r9, %r13 + adcq 8(%rdi), %r9 + movq %r10, %r14 + adcq 16(%rdi), %r10 + movq %rdx, %r15 + adcq 24(%rdi), %rdx + movq $-19, %rcx + movq %rdx, %r11 + movq $0x7fffffffffffffff, %rax + sarq $63, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Sub modulus (if overflow) + subq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rdx, %r10 + sbbq %rax, %r11 + # Sub + subq (%rdi), %r12 + movq $0x00, %rdx + sbbq 8(%rdi), %r13 + movq $-19, %rcx + sbbq 16(%rdi), %r14 + movq $0x7fffffffffffffff, %rax + sbbq 24(%rdi), %r15 + sbbq $0x00, %rdx + 
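+ # (the final sbb leaves an all-ones or all-zero mask in %rdx; ANDed
+ # with -19 and 2^63-1 below, it selects p = 2^255 - 19 or 0 to add
+ # back, making the underflow correction branch-free)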
# Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Add modulus (if underflow) + addq %rcx, %r12 + adcq %rdx, %r13 + adcq %rdx, %r14 + adcq %rax, %r15 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq %r12, (%rsi) + movq %r13, 8(%rsi) + movq %r14, 16(%rsi) + movq %r15, 24(%rsi) + movq 104(%rsp), %rdi + # Double + movq (%rdi), %r8 + movq 8(%rdi), %r9 + addq %r8, %r8 + movq 16(%rdi), %r10 + adcq %r9, %r9 + movq 24(%rdi), %rdx + adcq %r10, %r10 + movq $-19, %rcx + adcq %rdx, %rdx + movq $0x7fffffffffffffff, %rax + movq %rdx, %r11 + sarq $63, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Sub modulus (if overflow) + subq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rdx, %r10 + sbbq %rax, %r11 + movq %r8, (%rbx) + movq %r9, 8(%rbx) + movq %r10, 16(%rbx) + movq %r11, 24(%rbx) + movq 24(%rsp), %rdi + # Add + movq (%rbx), %r8 + movq 8(%rbx), %r9 + movq 16(%rbx), %r10 + movq 24(%rbx), %rdx + movq %r8, %r12 + addq (%rdi), %r8 + movq %r9, %r13 + adcq 8(%rdi), %r9 + movq %r10, %r14 + adcq 16(%rdi), %r10 + movq %rdx, %r15 + adcq 24(%rdi), %rdx + movq $-19, %rcx + movq %rdx, %r11 + movq $0x7fffffffffffffff, %rax + sarq $63, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Sub modulus (if overflow) + subq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rdx, %r10 + sbbq %rax, %r11 + # Sub + subq (%rdi), %r12 + movq $0x00, %rdx + sbbq 8(%rdi), %r13 + movq $-19, %rcx + sbbq 16(%rdi), %r14 + movq $0x7fffffffffffffff, %rax + sbbq 24(%rdi), %r15 + sbbq $0x00, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Add modulus (if underflow) + addq %rcx, %r12 + adcq %rdx, %r13 + adcq %rdx, %r14 + adcq %rax, %r15 + movq %r8, (%rbx) + movq %r9, 8(%rbx) + movq %r10, 16(%rbx) + movq %r11, 24(%rbx) + movq %r12, (%rdi) + movq %r13, 8(%rdi) + movq %r14, 16(%rdi) + movq %r15, 24(%rdi) + addq $48, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + repz retq +#ifndef __APPLE__ +.size fe_ge_madd_avx2,.-fe_ge_madd_avx2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_ge_msub_avx2 +.type fe_ge_msub_avx2,@function +.align 4 +fe_ge_msub_avx2: +#else +.section __TEXT,__text +.globl _fe_ge_msub_avx2 +.p2align 2 +_fe_ge_msub_avx2: +#endif /* __APPLE__ */ + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $48, %rsp + movq %rdi, (%rsp) + movq %rsi, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rcx, 24(%rsp) + movq %r8, 32(%rsp) + movq %r9, 40(%rsp) + movq 8(%rsp), %rsi + movq 40(%rsp), %rbx + movq 32(%rsp), %rbp + # Add + movq (%rbx), %r8 + movq 8(%rbx), %r9 + movq 16(%rbx), %r10 + movq 24(%rbx), %rdx + movq %r8, %r12 + addq (%rbp), %r8 + movq %r9, %r13 + adcq 8(%rbp), %r9 + movq %r10, %r14 + adcq 16(%rbp), %r10 + movq %rdx, %r15 + adcq 24(%rbp), %rdx + movq $-19, %rcx + movq %rdx, %r11 + movq $0x7fffffffffffffff, %rax + sarq $63, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Sub modulus (if overflow) + subq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rdx, %r10 + sbbq %rax, %r11 + # Sub + subq (%rbp), %r12 + movq $0x00, %rdx + sbbq 8(%rbp), %r13 + movq $-19, %rcx + sbbq 16(%rbp), %r14 + movq $0x7fffffffffffffff, %rax + sbbq 24(%rbp), %r15 + sbbq $0x00, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Add modulus (if underflow) + addq %rcx, %r12 + adcq %rdx, %r13 + adcq %rdx, %r14 + adcq %rax, %r15 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq %r12, (%rsi) + movq %r13, 8(%rsi) + movq %r14, 16(%rsi) + movq %r15, 24(%rsi) + movq 
16(%rsp), %rbx + movq 136(%rsp), %rbp + # Multiply + # A[0] * B[0] + movq (%rbp), %rdx + mulxq (%rdi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rdi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rdi), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rbp), %rdx + mulxq 8(%rdi), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rbp), %rdx + mulxq (%rdi), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rdi), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rbp), %rdx + mulxq 8(%rdi), %rcx, %rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq (%rdi), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rbp), %rdx + mulxq 8(%rdi), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rbp), %rdx + adoxq %rcx, %r11 + mulxq 24(%rdi), %rcx, %rax + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rbp), %rdx + mulxq 16(%rdi), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rbp), %rdx + adoxq %rcx, %r13 + mulxq 24(%rdi), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rdi), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rbp), %rdx + adcxq %rcx, %r12 + mulxq 24(%rdi), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[2] * B[3] + movq 24(%rbp), %rdx + mulxq 16(%rdi), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rbp), %rdx + adcxq %rcx, %r14 + mulxq 24(%rdi), %rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rbx) + movq %r9, 8(%rbx) + movq %r10, 16(%rbx) + movq %r11, 24(%rbx) + movq 128(%rsp), %rdi + # Multiply + # A[0] * B[0] + movq (%rdi), %rdx + mulxq (%rsi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rsi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rsi), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rdi), %rdx + mulxq 8(%rsi), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rdi), %rdx + mulxq (%rsi), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rsi), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rdi), %rdx + mulxq 8(%rsi), %rcx, %rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq (%rsi), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rdi), %rdx + mulxq 8(%rsi), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rdi), %rdx + adoxq %rcx, %r11 + mulxq 24(%rsi), %rcx, %rax + adcxq 
%rcx, %r12 + # A[2] * B[2] + movq 16(%rdi), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rdi), %rdx + adoxq %rcx, %r13 + mulxq 24(%rsi), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rsi), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rdi), %rdx + adcxq %rcx, %r12 + mulxq 24(%rsi), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[2] * B[3] + movq 24(%rdi), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rdi), %rdx + adcxq %rcx, %r14 + mulxq 24(%rsi), %rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rsi) + movq %r9, 8(%rsi) + movq %r10, 16(%rsi) + movq %r11, 24(%rsi) + movq 24(%rsp), %rdi + movq 120(%rsp), %rsi + movq 112(%rsp), %rbp + # Multiply + # A[0] * B[0] + movq (%rbp), %rdx + mulxq (%rsi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rsi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rsi), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rbp), %rdx + mulxq 8(%rsi), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rbp), %rdx + mulxq (%rsi), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rsi), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rbp), %rdx + mulxq 8(%rsi), %rcx, %rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq (%rsi), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rbp), %rdx + mulxq 8(%rsi), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rbp), %rdx + adoxq %rcx, %r11 + mulxq 24(%rsi), %rcx, %rax + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rbp), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rbp), %rdx + adoxq %rcx, %r13 + mulxq 24(%rsi), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rsi), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rbp), %rdx + adcxq %rcx, %r12 + mulxq 24(%rsi), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[2] * B[3] + movq 24(%rbp), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rbp), %rdx + adcxq %rcx, %r14 + mulxq 24(%rsi), %rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, 
%r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 8(%rsp), %rsi + movq (%rsp), %rbp + # Add + movq (%rbx), %r8 + movq 8(%rbx), %r9 + movq 16(%rbx), %r10 + movq 24(%rbx), %rdx + movq %r8, %r12 + addq (%rsi), %r8 + movq %r9, %r13 + adcq 8(%rsi), %r9 + movq %r10, %r14 + adcq 16(%rsi), %r10 + movq %rdx, %r15 + adcq 24(%rsi), %rdx + movq $-19, %rcx + movq %rdx, %r11 + movq $0x7fffffffffffffff, %rax + sarq $63, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Sub modulus (if overflow) + subq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rdx, %r10 + sbbq %rax, %r11 + # Sub + subq (%rsi), %r12 + movq $0x00, %rdx + sbbq 8(%rsi), %r13 + movq $-19, %rcx + sbbq 16(%rsi), %r14 + movq $0x7fffffffffffffff, %rax + sbbq 24(%rsi), %r15 + sbbq $0x00, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Add modulus (if underflow) + addq %rcx, %r12 + adcq %rdx, %r13 + adcq %rdx, %r14 + adcq %rax, %r15 + movq %r8, (%rsi) + movq %r9, 8(%rsi) + movq %r10, 16(%rsi) + movq %r11, 24(%rsi) + movq %r12, (%rbp) + movq %r13, 8(%rbp) + movq %r14, 16(%rbp) + movq %r15, 24(%rbp) + movq 104(%rsp), %rsi + # Double + movq (%rsi), %r8 + movq 8(%rsi), %r9 + addq %r8, %r8 + movq 16(%rsi), %r10 + adcq %r9, %r9 + movq 24(%rsi), %rdx + adcq %r10, %r10 + movq $-19, %rcx + adcq %rdx, %rdx + movq $0x7fffffffffffffff, %rax + movq %rdx, %r11 + sarq $63, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Sub modulus (if overflow) + subq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rdx, %r10 + sbbq %rax, %r11 + movq %r8, (%rbx) + movq %r9, 8(%rbx) + movq %r10, 16(%rbx) + movq %r11, 24(%rbx) + # Add + movq (%rbx), %r8 + movq 8(%rbx), %r9 + movq 16(%rbx), %r10 + movq 24(%rbx), %rdx + movq %r8, %r12 + addq (%rdi), %r8 + movq %r9, %r13 + adcq 8(%rdi), %r9 + movq %r10, %r14 + adcq 16(%rdi), %r10 + movq %rdx, %r15 + adcq 24(%rdi), %rdx + movq $-19, %rcx + movq %rdx, %r11 + movq $0x7fffffffffffffff, %rax + sarq $63, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Sub modulus (if overflow) + subq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rdx, %r10 + sbbq %rax, %r11 + # Sub + subq (%rdi), %r12 + movq $0x00, %rdx + sbbq 8(%rdi), %r13 + movq $-19, %rcx + sbbq 16(%rdi), %r14 + movq $0x7fffffffffffffff, %rax + sbbq 24(%rdi), %r15 + sbbq $0x00, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Add modulus (if underflow) + addq %rcx, %r12 + adcq %rdx, %r13 + adcq %rdx, %r14 + adcq %rax, %r15 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq %r12, (%rbx) + movq %r13, 8(%rbx) + movq %r14, 16(%rbx) + movq %r15, 24(%rbx) + addq $48, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + repz retq +#ifndef __APPLE__ +.size fe_ge_msub_avx2,.-fe_ge_msub_avx2 +#endif /* 
__APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_ge_add_avx2 +.type fe_ge_add_avx2,@function +.align 4 +fe_ge_add_avx2: +#else +.section __TEXT,__text +.globl _fe_ge_add_avx2 +.p2align 2 +_fe_ge_add_avx2: +#endif /* __APPLE__ */ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0x50, %rsp + movq %rdi, (%rsp) + movq %rsi, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rcx, 24(%rsp) + movq %r8, 32(%rsp) + movq %r9, 40(%rsp) + movq 8(%rsp), %rsi + movq 40(%rsp), %rbx + movq 32(%rsp), %rbp + # Add + movq (%rbx), %r8 + movq 8(%rbx), %r9 + movq 16(%rbx), %r10 + movq 24(%rbx), %rdx + movq %r8, %r12 + addq (%rbp), %r8 + movq %r9, %r13 + adcq 8(%rbp), %r9 + movq %r10, %r14 + adcq 16(%rbp), %r10 + movq %rdx, %r15 + adcq 24(%rbp), %rdx + movq $-19, %rcx + movq %rdx, %r11 + movq $0x7fffffffffffffff, %rax + sarq $63, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Sub modulus (if overflow) + subq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rdx, %r10 + sbbq %rax, %r11 + # Sub + subq (%rbp), %r12 + movq $0x00, %rdx + sbbq 8(%rbp), %r13 + movq $-19, %rcx + sbbq 16(%rbp), %r14 + movq $0x7fffffffffffffff, %rax + sbbq 24(%rbp), %r15 + sbbq $0x00, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Add modulus (if underflow) + addq %rcx, %r12 + adcq %rdx, %r13 + adcq %rdx, %r14 + adcq %rax, %r15 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq %r12, (%rsi) + movq %r13, 8(%rsi) + movq %r14, 16(%rsi) + movq %r15, 24(%rsi) + movq 16(%rsp), %rbx + movq 168(%rsp), %rbp + # Multiply + # A[0] * B[0] + movq (%rbp), %rdx + mulxq (%rdi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rdi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rdi), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rbp), %rdx + mulxq 8(%rdi), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rbp), %rdx + mulxq (%rdi), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rdi), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rbp), %rdx + mulxq 8(%rdi), %rcx, %rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq (%rdi), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rbp), %rdx + mulxq 8(%rdi), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rbp), %rdx + adoxq %rcx, %r11 + mulxq 24(%rdi), %rcx, %rax + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rbp), %rdx + mulxq 16(%rdi), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rbp), %rdx + adoxq %rcx, %r13 + mulxq 24(%rdi), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rdi), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rbp), %rdx + adcxq %rcx, %r12 + mulxq 24(%rdi), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[2] * B[3] + movq 24(%rbp), %rdx + mulxq 16(%rdi), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rbp), %rdx + adcxq %rcx, %r14 + mulxq 24(%rdi), %rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq 
%r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rbx) + movq %r9, 8(%rbx) + movq %r10, 16(%rbx) + movq %r11, 24(%rbx) + movq 176(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rdx + mulxq (%rsi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rsi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rsi), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rbx), %rdx + mulxq 8(%rsi), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rbx), %rdx + mulxq (%rsi), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rsi), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rbx), %rdx + mulxq 8(%rsi), %rcx, %rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq (%rsi), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rbx), %rdx + mulxq 8(%rsi), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rbx), %rdx + adoxq %rcx, %r11 + mulxq 24(%rsi), %rcx, %rax + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rbx), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rbx), %rdx + adoxq %rcx, %r13 + mulxq 24(%rsi), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rsi), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rbx), %rdx + adcxq %rcx, %r12 + mulxq 24(%rsi), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[2] * B[3] + movq 24(%rbx), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rbx), %rdx + adcxq %rcx, %r14 + mulxq 24(%rsi), %rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rsi) + movq %r9, 8(%rsi) + movq %r10, 16(%rsi) + movq %r11, 24(%rsi) + movq 24(%rsp), %rsi + movq 160(%rsp), %rbx + movq 144(%rsp), %rbp + # Multiply + # A[0] * B[0] + movq (%rbp), %rdx + mulxq (%rbx), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rbx), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rbx), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rbp), %rdx + mulxq 
8(%rbx), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rbp), %rdx + mulxq (%rbx), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rbx), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rbp), %rdx + mulxq 8(%rbx), %rcx, %rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq (%rbx), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rbp), %rdx + mulxq 8(%rbx), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rbp), %rdx + adoxq %rcx, %r11 + mulxq 24(%rbx), %rcx, %rax + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rbp), %rdx + mulxq 16(%rbx), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rbp), %rdx + adoxq %rcx, %r13 + mulxq 24(%rbx), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rbx), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rbp), %rdx + adcxq %rcx, %r12 + mulxq 24(%rbx), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[2] * B[3] + movq 24(%rbp), %rdx + mulxq 16(%rbx), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rbp), %rdx + adcxq %rcx, %r14 + mulxq 24(%rbx), %rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rsi) + movq %r9, 8(%rsi) + movq %r10, 16(%rsi) + movq %r11, 24(%rsi) + movq 136(%rsp), %rsi + movq 152(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rdx + mulxq (%rsi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rsi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rsi), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rbx), %rdx + mulxq 8(%rsi), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rbx), %rdx + mulxq (%rsi), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rsi), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rbx), %rdx + mulxq 8(%rsi), %rcx, %rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq (%rsi), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rbx), %rdx + mulxq 8(%rsi), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rbx), %rdx + adoxq %rcx, %r11 + mulxq 24(%rsi), %rcx, %rax + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rbx), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rbx), %rdx + adoxq %rcx, %r13 + mulxq 24(%rsi), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq 
(%rsi), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rbx), %rdx + adcxq %rcx, %r12 + mulxq 24(%rsi), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[2] * B[3] + movq 24(%rbx), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rbx), %rdx + adcxq %rcx, %r14 + mulxq 24(%rsi), %rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + leaq 48(%rsp), %rsi + # Double + movq (%rdi), %r8 + movq 8(%rdi), %r9 + addq %r8, %r8 + movq 16(%rdi), %r10 + adcq %r9, %r9 + movq 24(%rdi), %rdx + adcq %r10, %r10 + movq $-19, %rcx + adcq %rdx, %rdx + movq $0x7fffffffffffffff, %rax + movq %rdx, %r11 + sarq $63, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Sub modulus (if overflow) + subq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rdx, %r10 + sbbq %rax, %r11 + movq %r8, (%rsi) + movq %r9, 8(%rsi) + movq %r10, 16(%rsi) + movq %r11, 24(%rsi) + movq 8(%rsp), %rbx + movq 16(%rsp), %rbp + # Add + movq (%rbp), %r8 + movq 8(%rbp), %r9 + movq 16(%rbp), %r10 + movq 24(%rbp), %rdx + movq %r8, %r12 + addq (%rbx), %r8 + movq %r9, %r13 + adcq 8(%rbx), %r9 + movq %r10, %r14 + adcq 16(%rbx), %r10 + movq %rdx, %r15 + adcq 24(%rbx), %rdx + movq $-19, %rcx + movq %rdx, %r11 + movq $0x7fffffffffffffff, %rax + sarq $63, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Sub modulus (if overflow) + subq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rdx, %r10 + sbbq %rax, %r11 + # Sub + subq (%rbx), %r12 + movq $0x00, %rdx + sbbq 8(%rbx), %r13 + movq $-19, %rcx + sbbq 16(%rbx), %r14 + movq $0x7fffffffffffffff, %rax + sbbq 24(%rbx), %r15 + sbbq $0x00, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Add modulus (if underflow) + addq %rcx, %r12 + adcq %rdx, %r13 + adcq %rdx, %r14 + adcq %rax, %r15 + movq %r8, (%rbx) + movq %r9, 8(%rbx) + movq %r10, 16(%rbx) + movq %r11, 24(%rbx) + movq %r12, (%rdi) + movq %r13, 8(%rdi) + movq %r14, 16(%rdi) + movq %r15, 24(%rdi) + movq 24(%rsp), %rdi + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %rdx + movq %r8, %r12 + addq (%rdi), %r8 + movq %r9, %r13 + adcq 8(%rdi), %r9 + movq %r10, %r14 + adcq 16(%rdi), %r10 + movq %rdx, %r15 + adcq 24(%rdi), %rdx + movq $-19, %rcx + movq %rdx, %r11 + movq $0x7fffffffffffffff, %rax + sarq $63, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Sub modulus (if overflow) + subq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rdx, %r10 + sbbq %rax, %r11 + # Sub + subq (%rdi), %r12 + movq $0x00, 
%rdx + sbbq 8(%rdi), %r13 + movq $-19, %rcx + sbbq 16(%rdi), %r14 + movq $0x7fffffffffffffff, %rax + sbbq 24(%rdi), %r15 + sbbq $0x00, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Add modulus (if underflow) + addq %rcx, %r12 + adcq %rdx, %r13 + adcq %rdx, %r14 + adcq %rax, %r15 + movq %r8, (%rbp) + movq %r9, 8(%rbp) + movq %r10, 16(%rbp) + movq %r11, 24(%rbp) + movq %r12, (%rdi) + movq %r13, 8(%rdi) + movq %r14, 16(%rdi) + movq %r15, 24(%rdi) + addq $0x50, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + repz retq +#ifndef __APPLE__ +.size fe_ge_add_avx2,.-fe_ge_add_avx2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_ge_sub_avx2 +.type fe_ge_sub_avx2,@function +.align 4 +fe_ge_sub_avx2: +#else +.section __TEXT,__text +.globl _fe_ge_sub_avx2 +.p2align 2 +_fe_ge_sub_avx2: +#endif /* __APPLE__ */ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0x50, %rsp + movq %rdi, (%rsp) + movq %rsi, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rcx, 24(%rsp) + movq %r8, 32(%rsp) + movq %r9, 40(%rsp) + movq 8(%rsp), %rsi + movq 40(%rsp), %rbx + movq 32(%rsp), %rbp + # Add + movq (%rbx), %r8 + movq 8(%rbx), %r9 + movq 16(%rbx), %r10 + movq 24(%rbx), %rdx + movq %r8, %r12 + addq (%rbp), %r8 + movq %r9, %r13 + adcq 8(%rbp), %r9 + movq %r10, %r14 + adcq 16(%rbp), %r10 + movq %rdx, %r15 + adcq 24(%rbp), %rdx + movq $-19, %rcx + movq %rdx, %r11 + movq $0x7fffffffffffffff, %rax + sarq $63, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Sub modulus (if overflow) + subq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rdx, %r10 + sbbq %rax, %r11 + # Sub + subq (%rbp), %r12 + movq $0x00, %rdx + sbbq 8(%rbp), %r13 + movq $-19, %rcx + sbbq 16(%rbp), %r14 + movq $0x7fffffffffffffff, %rax + sbbq 24(%rbp), %r15 + sbbq $0x00, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Add modulus (if underflow) + addq %rcx, %r12 + adcq %rdx, %r13 + adcq %rdx, %r14 + adcq %rax, %r15 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq %r12, (%rsi) + movq %r13, 8(%rsi) + movq %r14, 16(%rsi) + movq %r15, 24(%rsi) + movq 16(%rsp), %rbx + movq 176(%rsp), %rbp + # Multiply + # A[0] * B[0] + movq (%rbp), %rdx + mulxq (%rdi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rdi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rdi), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rbp), %rdx + mulxq 8(%rdi), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rbp), %rdx + mulxq (%rdi), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rdi), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rbp), %rdx + mulxq 8(%rdi), %rcx, %rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq (%rdi), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rbp), %rdx + mulxq 8(%rdi), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rbp), %rdx + adoxq %rcx, %r11 + mulxq 24(%rdi), %rcx, %rax + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rbp), %rdx + mulxq 16(%rdi), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rbp), %rdx + adoxq %rcx, %r13 + mulxq 24(%rdi), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rdi), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rbp), %rdx + adcxq %rcx, %r12 + mulxq 24(%rdi), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # 
A[2] * B[3] + movq 24(%rbp), %rdx + mulxq 16(%rdi), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rbp), %rdx + adcxq %rcx, %r14 + mulxq 24(%rdi), %rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rbx) + movq %r9, 8(%rbx) + movq %r10, 16(%rbx) + movq %r11, 24(%rbx) + movq 168(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rdx + mulxq (%rsi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rsi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rsi), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rbx), %rdx + mulxq 8(%rsi), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rbx), %rdx + mulxq (%rsi), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rsi), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rbx), %rdx + mulxq 8(%rsi), %rcx, %rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq (%rsi), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rbx), %rdx + mulxq 8(%rsi), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rbx), %rdx + adoxq %rcx, %r11 + mulxq 24(%rsi), %rcx, %rax + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rbx), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rbx), %rdx + adoxq %rcx, %r13 + mulxq 24(%rsi), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rsi), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rbx), %rdx + adcxq %rcx, %r12 + mulxq 24(%rsi), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[2] * B[3] + movq 24(%rbx), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rbx), %rdx + adcxq %rcx, %r14 + mulxq 24(%rsi), %rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 
+ adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rsi) + movq %r9, 8(%rsi) + movq %r10, 16(%rsi) + movq %r11, 24(%rsi) + movq 24(%rsp), %rsi + movq 160(%rsp), %rbx + movq 144(%rsp), %rbp + # Multiply + # A[0] * B[0] + movq (%rbp), %rdx + mulxq (%rbx), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rbx), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rbx), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rbp), %rdx + mulxq 8(%rbx), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rbp), %rdx + mulxq (%rbx), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rbx), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rbp), %rdx + mulxq 8(%rbx), %rcx, %rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq (%rbx), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rbp), %rdx + mulxq 8(%rbx), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rbp), %rdx + adoxq %rcx, %r11 + mulxq 24(%rbx), %rcx, %rax + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rbp), %rdx + mulxq 16(%rbx), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rbp), %rdx + adoxq %rcx, %r13 + mulxq 24(%rbx), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rbx), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rbp), %rdx + adcxq %rcx, %r12 + mulxq 24(%rbx), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[2] * B[3] + movq 24(%rbp), %rdx + mulxq 16(%rbx), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rbp), %rdx + adcxq %rcx, %r14 + mulxq 24(%rbx), %rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rsi) + movq %r9, 8(%rsi) + movq %r10, 16(%rsi) + movq %r11, 24(%rsi) + movq 136(%rsp), %rsi + movq 152(%rsp), %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rdx + mulxq (%rsi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rsi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rsi), %rcx, %rax + xorq %r15, %r15 + adcxq %rcx, %r9 + # A[1] * B[3] + movq 24(%rbx), %rdx + mulxq 8(%rsi), %r12, %r13 + adcxq %rax, %r10 + # A[0] * B[1] + movq 8(%rbx), %rdx + mulxq (%rsi), %rcx, %rax + adoxq %rcx, %r9 + # A[2] * B[1] + mulxq 16(%rsi), %rcx, %r14 + adoxq %rax, %r10 + adcxq %rcx, %r11 + # A[1] * B[2] + movq 16(%rbx), %rdx + mulxq 8(%rsi), %rcx, 
%rax + adcxq %r14, %r12 + adoxq %rcx, %r11 + adcxq %r15, %r13 + adoxq %rax, %r12 + # A[0] * B[2] + mulxq (%rsi), %rcx, %rax + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rcx, %r10 + # A[1] * B[1] + movq 8(%rbx), %rdx + mulxq 8(%rsi), %rdx, %rcx + adcxq %rax, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rbx), %rdx + adoxq %rcx, %r11 + mulxq 24(%rsi), %rcx, %rax + adcxq %rcx, %r12 + # A[2] * B[2] + movq 16(%rbx), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rax, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rbx), %rdx + adoxq %rcx, %r13 + mulxq 24(%rsi), %rcx, %rax + adoxq %r15, %r14 + adcxq %rcx, %r14 + # A[0] * B[3] + mulxq (%rsi), %rdx, %rcx + adcxq %rax, %r15 + xorq %rax, %rax + adcxq %rdx, %r11 + # A[3] * B[0] + movq (%rbx), %rdx + adcxq %rcx, %r12 + mulxq 24(%rsi), %rdx, %rcx + adoxq %rdx, %r11 + adoxq %rcx, %r12 + # A[2] * B[3] + movq 24(%rbx), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %rdx, %r13 + # A[3] * B[2] + movq 16(%rbx), %rdx + adcxq %rcx, %r14 + mulxq 24(%rsi), %rcx, %rdx + adcxq %rax, %r15 + adoxq %rcx, %r13 + adoxq %rdx, %r14 + adoxq %rax, %r15 + # Reduce + movq $0x7fffffffffffffff, %rax + # Move top half into t4-t7 and remove top bit from t3 + shldq $0x01, %r14, %r15 + shldq $0x01, %r13, %r14 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + andq %rax, %r11 + # Multiply top half by 19 + movq $19, %rdx + xorq %rax, %rax + mulxq %r12, %rcx, %r12 + adcxq %rcx, %r8 + adoxq %r12, %r9 + mulxq %r13, %rcx, %r13 + adcxq %rcx, %r9 + adoxq %r13, %r10 + mulxq %r14, %rcx, %r14 + adcxq %rcx, %r10 + adoxq %r14, %r11 + mulxq %r15, %r15, %rdx + adcxq %r15, %r11 + adoxq %rax, %rdx + adcxq %rax, %rdx + # Overflow + shldq $0x01, %r11, %rdx + movq $0x7fffffffffffffff, %rax + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Reduce if top bit set + movq %r11, %rdx + shrq $63, %rdx + imulq $19, %rdx, %rcx + andq %rax, %r11 + addq %rcx, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + leaq 48(%rsp), %rsi + # Double + movq (%rdi), %r8 + movq 8(%rdi), %r9 + addq %r8, %r8 + movq 16(%rdi), %r10 + adcq %r9, %r9 + movq 24(%rdi), %rdx + adcq %r10, %r10 + movq $-19, %rcx + adcq %rdx, %rdx + movq $0x7fffffffffffffff, %rax + movq %rdx, %r11 + sarq $63, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Sub modulus (if overflow) + subq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rdx, %r10 + sbbq %rax, %r11 + movq %r8, (%rsi) + movq %r9, 8(%rsi) + movq %r10, 16(%rsi) + movq %r11, 24(%rsi) + movq 8(%rsp), %rbx + movq 16(%rsp), %rbp + # Add + movq (%rbp), %r8 + movq 8(%rbp), %r9 + movq 16(%rbp), %r10 + movq 24(%rbp), %rdx + movq %r8, %r12 + addq (%rbx), %r8 + movq %r9, %r13 + adcq 8(%rbx), %r9 + movq %r10, %r14 + adcq 16(%rbx), %r10 + movq %rdx, %r15 + adcq 24(%rbx), %rdx + movq $-19, %rcx + movq %rdx, %r11 + movq $0x7fffffffffffffff, %rax + sarq $63, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Sub modulus (if overflow) + subq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rdx, %r10 + sbbq %rax, %r11 + # Sub + subq (%rbx), %r12 + movq $0x00, %rdx + sbbq 8(%rbx), %r13 + movq $-19, %rcx + sbbq 16(%rbx), %r14 + movq $0x7fffffffffffffff, %rax + sbbq 24(%rbx), %r15 + sbbq $0x00, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Add modulus (if underflow) + addq %rcx, %r12 + adcq %rdx, %r13 + adcq %rdx, %r14 + adcq %rax, %r15 + movq %r8, (%rbx) + movq %r9, 8(%rbx) + movq %r10, 16(%rbx) + movq %r11, 
24(%rbx) + movq %r12, (%rdi) + movq %r13, 8(%rdi) + movq %r14, 16(%rdi) + movq %r15, 24(%rdi) + movq 24(%rsp), %rdi + # Add + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %rdx + movq %r8, %r12 + addq (%rdi), %r8 + movq %r9, %r13 + adcq 8(%rdi), %r9 + movq %r10, %r14 + adcq 16(%rdi), %r10 + movq %rdx, %r15 + adcq 24(%rdi), %rdx + movq $-19, %rcx + movq %rdx, %r11 + movq $0x7fffffffffffffff, %rax + sarq $63, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Sub modulus (if overflow) + subq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rdx, %r10 + sbbq %rax, %r11 + # Sub + subq (%rdi), %r12 + movq $0x00, %rdx + sbbq 8(%rdi), %r13 + movq $-19, %rcx + sbbq 16(%rdi), %r14 + movq $0x7fffffffffffffff, %rax + sbbq 24(%rdi), %r15 + sbbq $0x00, %rdx + # Mask the modulus + andq %rdx, %rcx + andq %rdx, %rax + # Add modulus (if underflow) + addq %rcx, %r12 + adcq %rdx, %r13 + adcq %rdx, %r14 + adcq %rax, %r15 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq %r12, (%rbp) + movq %r13, 8(%rbp) + movq %r14, 16(%rbp) + movq %r15, 24(%rbp) + addq $0x50, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + repz retq +#ifndef __APPLE__ +.size fe_ge_sub_avx2,.-fe_ge_sub_avx2 +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX2 */ |
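
Note on the arithmetic above: each "# Multiply" block is a schoolbook 4x4 multiply of 64-bit limbs using mulx/adcx/adox, and each "# Reduce" block folds the high half of the 512-bit product back in using 2^255 ≡ 19 (mod 2^255 - 19). A minimal portable C sketch of that pattern is below; it is an illustration only, not part of wolfSSL's sources — the name fe_mul_c is made up, and a compiler with unsigned __int128 (e.g. GCC or Clang) is assumed.

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* r = (a * b) mod (2^255 - 19); operands are four little-endian 64-bit limbs,
     * matching the limb layout used by the assembly above. */
    static void fe_mul_c(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
    {
        uint64_t p[8] = { 0 };

        /* Schoolbook 4x4 multiply into an 8-limb (512-bit) product; the assembly
         * interleaves the same partial products across the adcx and adox chains. */
        for (int i = 0; i < 4; i++) {
            uint64_t carry = 0;
            for (int j = 0; j < 4; j++) {
                u128 t = (u128)a[i] * b[j] + p[i + j] + carry;
                p[i + j] = (uint64_t)t;
                carry    = (uint64_t)(t >> 64);
            }
            p[i + 4] = carry;
        }

        /* "Move top half ... remove top bit": hi = p >> 255, lo = p mod 2^255
         * (the shldq-by-1 chain plus the andq of the top limb). */
        uint64_t hi[4];
        hi[0] = (p[4] << 1) | (p[3] >> 63);
        hi[1] = (p[5] << 1) | (p[4] >> 63);
        hi[2] = (p[6] << 1) | (p[5] >> 63);
        hi[3] = (p[7] << 1) | (p[6] >> 63);
        p[3] &= 0x7fffffffffffffffULL;

        /* "Multiply top half by 19": since 2^255 == 19 (mod 2^255 - 19). */
        u128 t = 0;
        for (int i = 0; i < 4; i++) {
            t += (u128)p[i] + (u128)hi[i] * 19;
            r[i] = (uint64_t)t;
            t >>= 64;
        }

        /* "Overflow": fold the carry word and the new top bit back in, times 19. */
        uint64_t c = ((uint64_t)t << 1) | (r[3] >> 63);
        r[3] &= 0x7fffffffffffffffULL;
        t = (u128)r[0] + (u128)c * 19;
        r[0] = (uint64_t)t;
        for (int i = 1; i < 4; i++) {
            t = (t >> 64) + r[i];
            r[i] = (uint64_t)t;
        }

        /* "Reduce if top bit set": one more conditional fold of 19. */
        c = (r[3] >> 63) * 19;
        r[3] &= 0x7fffffffffffffffULL;
        t = (u128)r[0] + c;
        r[0] = (uint64_t)t;
        for (int i = 1; i < 4; i++) {
            t = (t >> 64) + r[i];
            r[i] = (uint64_t)t;
        }
    }

As in the assembly, the result is kept lazily reduced below 2^255 rather than fully reduced to the canonical range.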
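
The "# Add", "# Sub" and "# Double" blocks avoid branches by turning the carry or borrow out of bit 255 into a full-width mask (the sarq $63 / sbbq $0x00 followed by andq), then conditionally subtracting or adding 2^255 - 19. A corresponding C sketch of the add path, under the same caveats as above (illustrative only; fe_add_c is not a wolfSSL symbol):

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* r = a + b with one branch-free conditional subtraction of 2^255 - 19. */
    static void fe_add_c(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
    {
        uint64_t s[4];
        u128 t = 0;

        /* Plain 256-bit add (the addq/adcq chain). */
        for (int i = 0; i < 4; i++) {
            t += (u128)a[i] + b[i];
            s[i] = (uint64_t)t;
            t >>= 64;
        }

        /* "Mask the modulus": all-ones iff bit 255 of the raw sum is set. */
        uint64_t mask = (uint64_t)((int64_t)s[3] >> 63);
        uint64_t m[4] = { mask & 0xffffffffffffffedULL,   /* -19 masked      */
                          mask, mask,
                          mask & 0x7fffffffffffffffULL }; /* top limb masked */

        /* "Sub modulus (if overflow)": subtract 2^255 - 19 only when needed. */
        uint64_t borrow = 0;
        for (int i = 0; i < 4; i++) {
            u128 d = (u128)s[i] - m[i] - borrow;
            r[i]   = (uint64_t)d;
            borrow = (uint64_t)((d >> 64) & 1);
        }
    }

The "# Sub" sequences are the mirror image: the final sbbq leaves an all-zero or all-one borrow word, which masks the modulus that is then added back ("Add modulus (if underflow)"); "# Double" is the add path applied to a single operand.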