Diffstat (limited to 'FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/fe_x25519_asm.S')
-rw-r--r--  FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/fe_x25519_asm.S  16542
1 files changed, 16542 insertions, 0 deletions
diff --git a/FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/fe_x25519_asm.S b/FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/fe_x25519_asm.S
new file mode 100644
index 000000000..6d0f638b5
--- /dev/null
+++ b/FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/fe_x25519_asm.S
@@ -0,0 +1,16542 @@
+/* fe_x25519_asm
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifndef HAVE_INTEL_AVX1
+#define HAVE_INTEL_AVX1
+#endif /* HAVE_INTEL_AVX1 */
+#ifndef NO_AVX2_SUPPORT
+#define HAVE_INTEL_AVX2
+#endif /* NO_AVX2_SUPPORT */
+
+#ifndef __APPLE__
+.text
+.globl fe_init
+.type fe_init,@function
+.align 4
+fe_init:
+#else
+.section __TEXT,__text
+.globl _fe_init
+.p2align 2
+_fe_init:
+#endif /* __APPLE__ */
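+    # One-time runtime dispatch: query the CPUID flags and, when the required
+    # feature bits are set (the 0x50 mask below), repoint the fe_*_p and
+    # curve25519_p function pointers from their _x64 defaults to the _avx2
+    # variants.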
+#ifdef HAVE_INTEL_AVX2
+#ifndef __APPLE__
+ movq cpuFlagsSet@GOTPCREL(%rip), %rax
+ movl (%rax), %eax
+#else
+ movl _cpuFlagsSet(%rip), %eax
+#endif /* __APPLE__ */
+ testl %eax, %eax
+ je L_fe_init_get_flags
+ repz retq
+L_fe_init_get_flags:
+#ifndef __APPLE__
+ callq cpuid_get_flags@plt
+#else
+ callq _cpuid_get_flags
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq intelFlags@GOTPCREL(%rip), %rdx
+ movl %eax, (%rdx)
+#else
+ movl %eax, _intelFlags(%rip)
+#endif /* __APPLE__ */
+ andl $0x50, %eax
+ cmpl $0x50, %eax
+ jne L_fe_init_flags_done
+#ifndef __APPLE__
+ movq fe_mul_avx2@GOTPCREL(%rip), %rax
+#else
+ leaq _fe_mul_avx2(%rip), %rax
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_mul_p@GOTPCREL(%rip), %rdx
+ movq %rax, (%rdx)
+#else
+ movq %rax, _fe_mul_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_sq_avx2@GOTPCREL(%rip), %rax
+#else
+ leaq _fe_sq_avx2(%rip), %rax
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_sq_p@GOTPCREL(%rip), %rdx
+ movq %rax, (%rdx)
+#else
+ movq %rax, _fe_sq_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_mul121666_avx2@GOTPCREL(%rip), %rax
+#else
+ leaq _fe_mul121666_avx2(%rip), %rax
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_mul121666_p@GOTPCREL(%rip), %rdx
+ movq %rax, (%rdx)
+#else
+ movq %rax, _fe_mul121666_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_sq2_avx2@GOTPCREL(%rip), %rax
+#else
+ leaq _fe_sq2_avx2(%rip), %rax
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_sq2_p@GOTPCREL(%rip), %rdx
+ movq %rax, (%rdx)
+#else
+ movq %rax, _fe_sq2_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_invert_avx2@GOTPCREL(%rip), %rax
+#else
+ leaq _fe_invert_avx2(%rip), %rax
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_invert_p@GOTPCREL(%rip), %rdx
+ movq %rax, (%rdx)
+#else
+ movq %rax, _fe_invert_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq curve25519_avx2@GOTPCREL(%rip), %rax
+#else
+ leaq _curve25519_avx2(%rip), %rax
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq curve25519_p@GOTPCREL(%rip), %rdx
+ movq %rax, (%rdx)
+#else
+ movq %rax, _curve25519_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_pow22523_avx2@GOTPCREL(%rip), %rax
+#else
+ leaq _fe_pow22523_avx2(%rip), %rax
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_pow22523_p@GOTPCREL(%rip), %rdx
+ movq %rax, (%rdx)
+#else
+ movq %rax, _fe_pow22523_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_ge_to_p2_avx2@GOTPCREL(%rip), %rax
+#else
+ leaq _fe_ge_to_p2_avx2(%rip), %rax
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_ge_to_p2_p@GOTPCREL(%rip), %rdx
+ movq %rax, (%rdx)
+#else
+ movq %rax, _fe_ge_to_p2_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_ge_to_p3_avx2@GOTPCREL(%rip), %rax
+#else
+ leaq _fe_ge_to_p3_avx2(%rip), %rax
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_ge_to_p3_p@GOTPCREL(%rip), %rdx
+ movq %rax, (%rdx)
+#else
+ movq %rax, _fe_ge_to_p3_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_ge_dbl_avx2@GOTPCREL(%rip), %rax
+#else
+ leaq _fe_ge_dbl_avx2(%rip), %rax
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_ge_dbl_p@GOTPCREL(%rip), %rdx
+ movq %rax, (%rdx)
+#else
+ movq %rax, _fe_ge_dbl_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_ge_madd_avx2@GOTPCREL(%rip), %rax
+#else
+ leaq _fe_ge_madd_avx2(%rip), %rax
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_ge_madd_p@GOTPCREL(%rip), %rdx
+ movq %rax, (%rdx)
+#else
+ movq %rax, _fe_ge_madd_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_ge_msub_avx2@GOTPCREL(%rip), %rax
+#else
+ leaq _fe_ge_msub_avx2(%rip), %rax
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_ge_msub_p@GOTPCREL(%rip), %rdx
+ movq %rax, (%rdx)
+#else
+ movq %rax, _fe_ge_msub_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_ge_add_avx2@GOTPCREL(%rip), %rax
+#else
+ leaq _fe_ge_add_avx2(%rip), %rax
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_ge_add_p@GOTPCREL(%rip), %rdx
+ movq %rax, (%rdx)
+#else
+ movq %rax, _fe_ge_add_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_ge_sub_avx2@GOTPCREL(%rip), %rax
+#else
+ leaq _fe_ge_sub_avx2(%rip), %rax
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ movq fe_ge_sub_p@GOTPCREL(%rip), %rdx
+ movq %rax, (%rdx)
+#else
+ movq %rax, _fe_ge_sub_p(%rip)
+#endif /* __APPLE__ */
+L_fe_init_flags_done:
+#ifndef __APPLE__
+ movq cpuFlagsSet@GOTPCREL(%rip), %rdx
+ movl $0x1, (%rdx)
+#else
+ movl $0x1, _cpuFlagsSet(%rip)
+#endif /* __APPLE__ */
+#endif /* HAVE_INTEL_AVX2 */
+ repz retq
+#ifndef __APPLE__
+.size fe_init,.-fe_init
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_frombytes
+.type fe_frombytes,@function
+.align 4
+fe_frombytes:
+#else
+.section __TEXT,__text
+.globl _fe_frombytes
+.p2align 2
+_fe_frombytes:
+#endif /* __APPLE__ */
+ movq $0x7fffffffffffffff, %r9
+ movq (%rsi), %rdx
+ movq 8(%rsi), %rax
+ movq 16(%rsi), %rcx
+ movq 24(%rsi), %r8
+ andq %r9, %r8
+ movq %rdx, (%rdi)
+ movq %rax, 8(%rdi)
+ movq %rcx, 16(%rdi)
+ movq %r8, 24(%rdi)
+ repz retq
+#ifndef __APPLE__
+.size fe_frombytes,.-fe_frombytes
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_tobytes
+.type fe_tobytes,@function
+.align 4
+fe_tobytes:
+#else
+.section __TEXT,__text
+.globl _fe_tobytes
+.p2align 2
+_fe_tobytes:
+#endif /* __APPLE__ */
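+    # Canonical encoding: for an input below 2^255, adding 19 carries into
+    # bit 255 exactly when the value is >= p = 2^255-19.  That bit, times 19,
+    # is added back to the original and bit 255 is masked off, leaving the
+    # unique representative in [0, p).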
+ movq $0x7fffffffffffffff, %r10
+ movq (%rsi), %rdx
+ movq 8(%rsi), %rax
+ movq 16(%rsi), %rcx
+ movq 24(%rsi), %r8
+ addq $19, %rdx
+ adcq $0x00, %rax
+ adcq $0x00, %rcx
+ adcq $0x00, %r8
+ shrq $63, %r8
+ imulq $19, %r8, %r9
+ movq (%rsi), %rdx
+ movq 8(%rsi), %rax
+ movq 16(%rsi), %rcx
+ movq 24(%rsi), %r8
+ addq %r9, %rdx
+ adcq $0x00, %rax
+ adcq $0x00, %rcx
+ adcq $0x00, %r8
+ andq %r10, %r8
+ movq %rdx, (%rdi)
+ movq %rax, 8(%rdi)
+ movq %rcx, 16(%rdi)
+ movq %r8, 24(%rdi)
+ repz retq
+#ifndef __APPLE__
+.size fe_tobytes,.-fe_tobytes
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_1
+.type fe_1,@function
+.align 4
+fe_1:
+#else
+.section __TEXT,__text
+.globl _fe_1
+.p2align 2
+_fe_1:
+#endif /* __APPLE__ */
+ # Set one
+ movq $0x01, (%rdi)
+ movq $0x00, 8(%rdi)
+ movq $0x00, 16(%rdi)
+ movq $0x00, 24(%rdi)
+ repz retq
+#ifndef __APPLE__
+.size fe_1,.-fe_1
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_0
+.type fe_0,@function
+.align 4
+fe_0:
+#else
+.section __TEXT,__text
+.globl _fe_0
+.p2align 2
+_fe_0:
+#endif /* __APPLE__ */
+ # Set zero
+ movq $0x00, (%rdi)
+ movq $0x00, 8(%rdi)
+ movq $0x00, 16(%rdi)
+ movq $0x00, 24(%rdi)
+ repz retq
+#ifndef __APPLE__
+.size fe_0,.-fe_0
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_copy
+.type fe_copy,@function
+.align 4
+fe_copy:
+#else
+.section __TEXT,__text
+.globl _fe_copy
+.p2align 2
+_fe_copy:
+#endif /* __APPLE__ */
+ # Copy
+ movq (%rsi), %rdx
+ movq 8(%rsi), %rax
+ movq 16(%rsi), %rcx
+ movq 24(%rsi), %r8
+ movq %rdx, (%rdi)
+ movq %rax, 8(%rdi)
+ movq %rcx, 16(%rdi)
+ movq %r8, 24(%rdi)
+ repz retq
+#ifndef __APPLE__
+.size fe_copy,.-fe_copy
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_sub
+.type fe_sub,@function
+.align 4
+fe_sub:
+#else
+.section __TEXT,__text
+.globl _fe_sub
+.p2align 2
+_fe_sub:
+#endif /* __APPLE__ */
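+    # a - b with a single conditional add-back of p: the final borrow leaves
+    # 0 or -1 in %r10, which masks the limbs of p = 2^255-19 before they are
+    # added in.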
+ pushq %r12
+ # Sub
+ movq (%rsi), %rax
+ movq 8(%rsi), %rcx
+ movq 16(%rsi), %r8
+ movq 24(%rsi), %r9
+ subq (%rdx), %rax
+ movq $0x00, %r10
+ sbbq 8(%rdx), %rcx
+ movq $-19, %r11
+ sbbq 16(%rdx), %r8
+ movq $0x7fffffffffffffff, %r12
+ sbbq 24(%rdx), %r9
+ sbbq $0x00, %r10
+ # Mask the modulus
+ andq %r10, %r11
+ andq %r10, %r12
+ # Add modulus (if underflow)
+ addq %r11, %rax
+ adcq %r10, %rcx
+ adcq %r10, %r8
+ adcq %r12, %r9
+ movq %rax, (%rdi)
+ movq %rcx, 8(%rdi)
+ movq %r8, 16(%rdi)
+ movq %r9, 24(%rdi)
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size fe_sub,.-fe_sub
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_add
+.type fe_add,@function
+.align 4
+fe_add:
+#else
+.section __TEXT,__text
+.globl _fe_add
+.p2align 2
+_fe_add:
+#endif /* __APPLE__ */
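+    # a + b with a single conditional subtraction of p: the sign-extended top
+    # bit of the sum masks the limbs of p = 2^255-19 before they are
+    # subtracted.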
+ pushq %r12
+ # Add
+ movq (%rsi), %rax
+ movq 8(%rsi), %rcx
+ addq (%rdx), %rax
+ movq 16(%rsi), %r8
+ adcq 8(%rdx), %rcx
+ movq 24(%rsi), %r10
+ adcq 16(%rdx), %r8
+ movq $-19, %r11
+ adcq 24(%rdx), %r10
+ movq $0x7fffffffffffffff, %r12
+ movq %r10, %r9
+ sarq $63, %r10
+ # Mask the modulus
+ andq %r10, %r11
+ andq %r10, %r12
+ # Sub modulus (if overflow)
+ subq %r11, %rax
+ sbbq %r10, %rcx
+ sbbq %r10, %r8
+ sbbq %r12, %r9
+ movq %rax, (%rdi)
+ movq %rcx, 8(%rdi)
+ movq %r8, 16(%rdi)
+ movq %r9, 24(%rdi)
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size fe_add,.-fe_add
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_neg
+.type fe_neg,@function
+.align 4
+fe_neg:
+#else
+.section __TEXT,__text
+.globl _fe_neg
+.p2align 2
+_fe_neg:
+#endif /* __APPLE__ */
+ movq $-19, %rdx
+ movq $-1, %rax
+ movq $-1, %rcx
+ movq $0x7fffffffffffffff, %r8
+ subq (%rsi), %rdx
+ sbbq 8(%rsi), %rax
+ sbbq 16(%rsi), %rcx
+ sbbq 24(%rsi), %r8
+ movq %rdx, (%rdi)
+ movq %rax, 8(%rdi)
+ movq %rcx, 16(%rdi)
+ movq %r8, 24(%rdi)
+ repz retq
+#ifndef __APPLE__
+.size fe_neg,.-fe_neg
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_cmov
+.type fe_cmov,@function
+.align 4
+fe_cmov:
+#else
+.section __TEXT,__text
+.globl _fe_cmov
+.p2align 2
+_fe_cmov:
+#endif /* __APPLE__ */
+ cmpl $0x01, %edx
+ movq (%rdi), %rcx
+ movq 8(%rdi), %r8
+ movq 16(%rdi), %r9
+ movq 24(%rdi), %r10
+ cmoveq (%rsi), %rcx
+ cmoveq 8(%rsi), %r8
+ cmoveq 16(%rsi), %r9
+ cmoveq 24(%rsi), %r10
+ movq %rcx, (%rdi)
+ movq %r8, 8(%rdi)
+ movq %r9, 16(%rdi)
+ movq %r10, 24(%rdi)
+ repz retq
+#ifndef __APPLE__
+.size fe_cmov,.-fe_cmov
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_isnonzero
+.type fe_isnonzero,@function
+.align 4
+fe_isnonzero:
+#else
+.section __TEXT,__text
+.globl _fe_isnonzero
+.p2align 2
+_fe_isnonzero:
+#endif /* __APPLE__ */
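+    # Canonically reduce (same add-19 probe as fe_tobytes) and OR the limbs
+    # together, so %rax is zero exactly when the element is zero mod p.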
+ movq $0x7fffffffffffffff, %r10
+ movq (%rdi), %rax
+ movq 8(%rdi), %rdx
+ movq 16(%rdi), %rcx
+ movq 24(%rdi), %r8
+ addq $19, %rax
+ adcq $0x00, %rdx
+ adcq $0x00, %rcx
+ adcq $0x00, %r8
+ shrq $63, %r8
+ imulq $19, %r8, %r9
+ movq (%rdi), %rax
+ movq 8(%rdi), %rdx
+ movq 16(%rdi), %rcx
+ movq 24(%rdi), %r8
+ addq %r9, %rax
+ adcq $0x00, %rdx
+ adcq $0x00, %rcx
+ adcq $0x00, %r8
+ andq %r10, %r8
+ orq %rdx, %rax
+ orq %rcx, %rax
+ orq %r8, %rax
+ repz retq
+#ifndef __APPLE__
+.size fe_isnonzero,.-fe_isnonzero
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_isnegative
+.type fe_isnegative,@function
+.align 4
+fe_isnegative:
+#else
+.section __TEXT,__text
+.globl _fe_isnegative
+.p2align 2
+_fe_isnegative:
+#endif /* __APPLE__ */
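+    # Return bit 0 of the canonical value: the carry from the add-19 probe,
+    # times 19, is added to the low limb before taking the least significant
+    # bit.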
+ movq $0x7fffffffffffffff, %r11
+ movq (%rdi), %rdx
+ movq 8(%rdi), %rcx
+ movq 16(%rdi), %r8
+ movq 24(%rdi), %r9
+ movq %rdx, %rax
+ addq $19, %rdx
+ adcq $0x00, %rcx
+ adcq $0x00, %r8
+ adcq $0x00, %r9
+ shrq $63, %r9
+ imulq $19, %r9, %r10
+ addq %r10, %rax
+ andq $0x01, %rax
+ repz retq
+#ifndef __APPLE__
+.size fe_isnegative,.-fe_isnegative
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_cmov_table
+.type fe_cmov_table,@function
+.align 4
+fe_cmov_table:
+#else
+.section __TEXT,__text
+.globl _fe_cmov_table
+.p2align 2
+_fe_cmov_table:
+#endif /* __APPLE__ */
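+    # Branch-free lookup of precomputed point |b| (signed index, expected in
+    # -8..8) from a table of 96-byte entries (three field elements each):
+    # cmove chains select the entry, and for a negative index the first two
+    # field elements are swapped and the third is negated (subtracted from p).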
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rdx, %rcx
+ movsbq %cl, %rax
+ cdq
+ xorb %dl, %al
+ subb %dl, %al
+ movb %al, %r15b
+ movq $0x01, %rax
+ xorq %rdx, %rdx
+ xorq %r8, %r8
+ xorq %r9, %r9
+ movq $0x01, %r10
+ xorq %r11, %r11
+ xorq %r12, %r12
+ xorq %r13, %r13
+ cmpb $0x01, %r15b
+ movq (%rsi), %r14
+ cmoveq %r14, %rax
+ movq 8(%rsi), %r14
+ cmoveq %r14, %rdx
+ movq 16(%rsi), %r14
+ cmoveq %r14, %r8
+ movq 24(%rsi), %r14
+ cmoveq %r14, %r9
+ movq 32(%rsi), %r14
+ cmoveq %r14, %r10
+ movq 40(%rsi), %r14
+ cmoveq %r14, %r11
+ movq 48(%rsi), %r14
+ cmoveq %r14, %r12
+ movq 56(%rsi), %r14
+ cmoveq %r14, %r13
+ cmpb $2, %r15b
+ movq 96(%rsi), %r14
+ cmoveq %r14, %rax
+ movq 104(%rsi), %r14
+ cmoveq %r14, %rdx
+ movq 112(%rsi), %r14
+ cmoveq %r14, %r8
+ movq 120(%rsi), %r14
+ cmoveq %r14, %r9
+ movq 128(%rsi), %r14
+ cmoveq %r14, %r10
+ movq 136(%rsi), %r14
+ cmoveq %r14, %r11
+ movq 144(%rsi), %r14
+ cmoveq %r14, %r12
+ movq 152(%rsi), %r14
+ cmoveq %r14, %r13
+ cmpb $3, %r15b
+ movq 192(%rsi), %r14
+ cmoveq %r14, %rax
+ movq 200(%rsi), %r14
+ cmoveq %r14, %rdx
+ movq 208(%rsi), %r14
+ cmoveq %r14, %r8
+ movq 216(%rsi), %r14
+ cmoveq %r14, %r9
+ movq 224(%rsi), %r14
+ cmoveq %r14, %r10
+ movq 232(%rsi), %r14
+ cmoveq %r14, %r11
+ movq 240(%rsi), %r14
+ cmoveq %r14, %r12
+ movq 248(%rsi), %r14
+ cmoveq %r14, %r13
+ cmpb $4, %r15b
+ movq 288(%rsi), %r14
+ cmoveq %r14, %rax
+ movq 296(%rsi), %r14
+ cmoveq %r14, %rdx
+ movq 304(%rsi), %r14
+ cmoveq %r14, %r8
+ movq 312(%rsi), %r14
+ cmoveq %r14, %r9
+ movq 320(%rsi), %r14
+ cmoveq %r14, %r10
+ movq 328(%rsi), %r14
+ cmoveq %r14, %r11
+ movq 336(%rsi), %r14
+ cmoveq %r14, %r12
+ movq 344(%rsi), %r14
+ cmoveq %r14, %r13
+ cmpb $5, %r15b
+ movq 384(%rsi), %r14
+ cmoveq %r14, %rax
+ movq 392(%rsi), %r14
+ cmoveq %r14, %rdx
+ movq 400(%rsi), %r14
+ cmoveq %r14, %r8
+ movq 408(%rsi), %r14
+ cmoveq %r14, %r9
+ movq 416(%rsi), %r14
+ cmoveq %r14, %r10
+ movq 424(%rsi), %r14
+ cmoveq %r14, %r11
+ movq 432(%rsi), %r14
+ cmoveq %r14, %r12
+ movq 440(%rsi), %r14
+ cmoveq %r14, %r13
+ cmpb $6, %r15b
+ movq 480(%rsi), %r14
+ cmoveq %r14, %rax
+ movq 488(%rsi), %r14
+ cmoveq %r14, %rdx
+ movq 496(%rsi), %r14
+ cmoveq %r14, %r8
+ movq 504(%rsi), %r14
+ cmoveq %r14, %r9
+ movq 512(%rsi), %r14
+ cmoveq %r14, %r10
+ movq 520(%rsi), %r14
+ cmoveq %r14, %r11
+ movq 528(%rsi), %r14
+ cmoveq %r14, %r12
+ movq 536(%rsi), %r14
+ cmoveq %r14, %r13
+ cmpb $7, %r15b
+ movq 576(%rsi), %r14
+ cmoveq %r14, %rax
+ movq 584(%rsi), %r14
+ cmoveq %r14, %rdx
+ movq 592(%rsi), %r14
+ cmoveq %r14, %r8
+ movq 600(%rsi), %r14
+ cmoveq %r14, %r9
+ movq 608(%rsi), %r14
+ cmoveq %r14, %r10
+ movq 616(%rsi), %r14
+ cmoveq %r14, %r11
+ movq 624(%rsi), %r14
+ cmoveq %r14, %r12
+ movq 632(%rsi), %r14
+ cmoveq %r14, %r13
+ cmpb $8, %r15b
+ movq 672(%rsi), %r14
+ cmoveq %r14, %rax
+ movq 680(%rsi), %r14
+ cmoveq %r14, %rdx
+ movq 688(%rsi), %r14
+ cmoveq %r14, %r8
+ movq 696(%rsi), %r14
+ cmoveq %r14, %r9
+ movq 704(%rsi), %r14
+ cmoveq %r14, %r10
+ movq 712(%rsi), %r14
+ cmoveq %r14, %r11
+ movq 720(%rsi), %r14
+ cmoveq %r14, %r12
+ movq 728(%rsi), %r14
+ cmoveq %r14, %r13
+ cmpb $0x00, %cl
+ movq %rax, %r14
+ cmovlq %r10, %rax
+ cmovlq %r14, %r10
+ movq %rdx, %r14
+ cmovlq %r11, %rdx
+ cmovlq %r14, %r11
+ movq %r8, %r14
+ cmovlq %r12, %r8
+ cmovlq %r14, %r12
+ movq %r9, %r14
+ cmovlq %r13, %r9
+ cmovlq %r14, %r13
+ movq %rax, (%rdi)
+ movq %rdx, 8(%rdi)
+ movq %r8, 16(%rdi)
+ movq %r9, 24(%rdi)
+ movq %r10, 32(%rdi)
+ movq %r11, 40(%rdi)
+ movq %r12, 48(%rdi)
+ movq %r13, 56(%rdi)
+ xorq %rax, %rax
+ xorq %rdx, %rdx
+ xorq %r8, %r8
+ xorq %r9, %r9
+ cmpb $0x01, %r15b
+ movq 64(%rsi), %r14
+ cmoveq %r14, %rax
+ movq 72(%rsi), %r14
+ cmoveq %r14, %rdx
+ movq 80(%rsi), %r14
+ cmoveq %r14, %r8
+ movq 88(%rsi), %r14
+ cmoveq %r14, %r9
+ cmpb $2, %r15b
+ movq 160(%rsi), %r14
+ cmoveq %r14, %rax
+ movq 168(%rsi), %r14
+ cmoveq %r14, %rdx
+ movq 176(%rsi), %r14
+ cmoveq %r14, %r8
+ movq 184(%rsi), %r14
+ cmoveq %r14, %r9
+ cmpb $3, %r15b
+ movq 256(%rsi), %r14
+ cmoveq %r14, %rax
+ movq 264(%rsi), %r14
+ cmoveq %r14, %rdx
+ movq 272(%rsi), %r14
+ cmoveq %r14, %r8
+ movq 280(%rsi), %r14
+ cmoveq %r14, %r9
+ cmpb $4, %r15b
+ movq 352(%rsi), %r14
+ cmoveq %r14, %rax
+ movq 360(%rsi), %r14
+ cmoveq %r14, %rdx
+ movq 368(%rsi), %r14
+ cmoveq %r14, %r8
+ movq 376(%rsi), %r14
+ cmoveq %r14, %r9
+ cmpb $5, %r15b
+ movq 448(%rsi), %r14
+ cmoveq %r14, %rax
+ movq 456(%rsi), %r14
+ cmoveq %r14, %rdx
+ movq 464(%rsi), %r14
+ cmoveq %r14, %r8
+ movq 472(%rsi), %r14
+ cmoveq %r14, %r9
+ cmpb $6, %r15b
+ movq 544(%rsi), %r14
+ cmoveq %r14, %rax
+ movq 552(%rsi), %r14
+ cmoveq %r14, %rdx
+ movq 560(%rsi), %r14
+ cmoveq %r14, %r8
+ movq 568(%rsi), %r14
+ cmoveq %r14, %r9
+ cmpb $7, %r15b
+ movq 640(%rsi), %r14
+ cmoveq %r14, %rax
+ movq 648(%rsi), %r14
+ cmoveq %r14, %rdx
+ movq 656(%rsi), %r14
+ cmoveq %r14, %r8
+ movq 664(%rsi), %r14
+ cmoveq %r14, %r9
+ cmpb $8, %r15b
+ movq 736(%rsi), %r14
+ cmoveq %r14, %rax
+ movq 744(%rsi), %r14
+ cmoveq %r14, %rdx
+ movq 752(%rsi), %r14
+ cmoveq %r14, %r8
+ movq 760(%rsi), %r14
+ cmoveq %r14, %r9
+ movq $-19, %r10
+ movq $-1, %r11
+ movq $-1, %r12
+ movq $0x7fffffffffffffff, %r13
+ subq %rax, %r10
+ sbbq %rdx, %r11
+ sbbq %r8, %r12
+ sbbq %r9, %r13
+ cmpb $0x00, %cl
+ cmovlq %r10, %rax
+ cmovlq %r11, %rdx
+ cmovlq %r12, %r8
+ cmovlq %r13, %r9
+ movq %rax, 64(%rdi)
+ movq %rdx, 72(%rdi)
+ movq %r8, 80(%rdi)
+ movq %r9, 88(%rdi)
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size fe_cmov_table,.-fe_cmov_table
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_mul
+.type fe_mul,@function
+.align 4
+fe_mul:
+#else
+.section __TEXT,__text
+.globl _fe_mul
+.p2align 2
+_fe_mul:
+#endif /* __APPLE__ */
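+    # fe_mul and the wrappers below tail-jump through function pointers that
+    # fe_init may redirect to the _avx2 implementations; the .data section
+    # further down initializes them to the _x64 routines.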
+#ifndef __APPLE__
+ jmpq *fe_mul_p(%rip)
+#else
+ jmpq *_fe_mul_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.size fe_mul,.-fe_mul
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_sq
+.type fe_sq,@function
+.align 4
+fe_sq:
+#else
+.section __TEXT,__text
+.globl _fe_sq
+.p2align 2
+_fe_sq:
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ jmpq *fe_sq_p(%rip)
+#else
+ jmpq *_fe_sq_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.size fe_sq,.-fe_sq
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_mul121666
+.type fe_mul121666,@function
+.align 4
+fe_mul121666:
+#else
+.section __TEXT,__text
+.globl _fe_mul121666
+.p2align 2
+_fe_mul121666:
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ jmpq *fe_mul121666_p(%rip)
+#else
+ jmpq *_fe_mul121666_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.size fe_mul121666,.-fe_mul121666
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_sq2
+.type fe_sq2,@function
+.align 4
+fe_sq2:
+#else
+.section __TEXT,__text
+.globl _fe_sq2
+.p2align 2
+_fe_sq2:
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ jmpq *fe_sq2_p(%rip)
+#else
+ jmpq *_fe_sq2_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.size fe_sq2,.-fe_sq2
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_invert
+.type fe_invert,@function
+.align 4
+fe_invert:
+#else
+.section __TEXT,__text
+.globl _fe_invert
+.p2align 2
+_fe_invert:
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ jmpq *fe_invert_p(%rip)
+#else
+ jmpq *_fe_invert_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.size fe_invert,.-fe_invert
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl curve25519
+.type curve25519,@function
+.align 4
+curve25519:
+#else
+.section __TEXT,__text
+.globl _curve25519
+.p2align 2
+_curve25519:
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ jmpq *curve25519_p(%rip)
+#else
+ jmpq *_curve25519_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.size curve25519,.-curve25519
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_pow22523
+.type fe_pow22523,@function
+.align 4
+fe_pow22523:
+#else
+.section __TEXT,__text
+.globl _fe_pow22523
+.p2align 2
+_fe_pow22523:
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ jmpq *fe_pow22523_p(%rip)
+#else
+ jmpq *_fe_pow22523_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.size fe_pow22523,.-fe_pow22523
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_ge_to_p2
+.type fe_ge_to_p2,@function
+.align 4
+fe_ge_to_p2:
+#else
+.section __TEXT,__text
+.globl _fe_ge_to_p2
+.p2align 2
+_fe_ge_to_p2:
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ jmpq *fe_ge_to_p2_p(%rip)
+#else
+ jmpq *_fe_ge_to_p2_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.size fe_ge_to_p2,.-fe_ge_to_p2
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_ge_to_p3
+.type fe_ge_to_p3,@function
+.align 4
+fe_ge_to_p3:
+#else
+.section __TEXT,__text
+.globl _fe_ge_to_p3
+.p2align 2
+_fe_ge_to_p3:
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ jmpq *fe_ge_to_p3_p(%rip)
+#else
+ jmpq *_fe_ge_to_p3_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.size fe_ge_to_p3,.-fe_ge_to_p3
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_ge_dbl
+.type fe_ge_dbl,@function
+.align 4
+fe_ge_dbl:
+#else
+.section __TEXT,__text
+.globl _fe_ge_dbl
+.p2align 2
+_fe_ge_dbl:
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ jmpq *fe_ge_dbl_p(%rip)
+#else
+ jmpq *_fe_ge_dbl_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.size fe_ge_dbl,.-fe_ge_dbl
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_ge_madd
+.type fe_ge_madd,@function
+.align 4
+fe_ge_madd:
+#else
+.section __TEXT,__text
+.globl _fe_ge_madd
+.p2align 2
+_fe_ge_madd:
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ jmpq *fe_ge_madd_p(%rip)
+#else
+ jmpq *_fe_ge_madd_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.size fe_ge_madd,.-fe_ge_madd
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_ge_msub
+.type fe_ge_msub,@function
+.align 4
+fe_ge_msub:
+#else
+.section __TEXT,__text
+.globl _fe_ge_msub
+.p2align 2
+_fe_ge_msub:
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ jmpq *fe_ge_msub_p(%rip)
+#else
+ jmpq *_fe_ge_msub_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.size fe_ge_msub,.-fe_ge_msub
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_ge_add
+.type fe_ge_add,@function
+.align 4
+fe_ge_add:
+#else
+.section __TEXT,__text
+.globl _fe_ge_add
+.p2align 2
+_fe_ge_add:
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ jmpq *fe_ge_add_p(%rip)
+#else
+ jmpq *_fe_ge_add_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.size fe_ge_add,.-fe_ge_add
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_ge_sub
+.type fe_ge_sub,@function
+.align 4
+fe_ge_sub:
+#else
+.section __TEXT,__text
+.globl _fe_ge_sub
+.p2align 2
+_fe_ge_sub:
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ jmpq *fe_ge_sub_p(%rip)
+#else
+ jmpq *_fe_ge_sub_p(%rip)
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.size fe_ge_sub,.-fe_ge_sub
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+.type cpuFlagsSet, @object
+.size cpuFlagsSet,4
+cpuFlagsSet:
+ .long 0
+#else
+.section __DATA,__data
+.p2align 2
+_cpuFlagsSet:
+ .long 0
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+.type intelFlags, @object
+.size intelFlags,4
+intelFlags:
+ .long 0
+#else
+.section __DATA,__data
+.p2align 2
+_intelFlags:
+ .long 0
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+.type fe_mul_p, @object
+.size fe_mul_p,8
+fe_mul_p:
+ .quad fe_mul_x64
+#else
+.section __DATA,__data
+.p2align 2
+_fe_mul_p:
+ .quad _fe_mul_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+.type fe_sq_p, @object
+.size fe_sq_p,8
+fe_sq_p:
+ .quad fe_sq_x64
+#else
+.section __DATA,__data
+.p2align 2
+_fe_sq_p:
+ .quad _fe_sq_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+.type fe_mul121666_p, @object
+.size fe_mul121666_p,8
+fe_mul121666_p:
+ .quad fe_mul121666_x64
+#else
+.section __DATA,__data
+.p2align 2
+_fe_mul121666_p:
+ .quad _fe_mul121666_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+.type fe_sq2_p, @object
+.size fe_sq2_p,8
+fe_sq2_p:
+ .quad fe_sq2_x64
+#else
+.section __DATA,__data
+.p2align 2
+_fe_sq2_p:
+ .quad _fe_sq2_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+.type fe_invert_p, @object
+.size fe_invert_p,8
+fe_invert_p:
+ .quad fe_invert_x64
+#else
+.section __DATA,__data
+.p2align 2
+_fe_invert_p:
+ .quad _fe_invert_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+.type curve25519_p, @object
+.size curve25519_p,8
+curve25519_p:
+ .quad curve25519_x64
+#else
+.section __DATA,__data
+.p2align 2
+_curve25519_p:
+ .quad _curve25519_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+.type fe_pow22523_p, @object
+.size fe_pow22523_p,8
+fe_pow22523_p:
+ .quad fe_pow22523_x64
+#else
+.section __DATA,__data
+.p2align 2
+_fe_pow22523_p:
+ .quad _fe_pow22523_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+.type fe_ge_to_p2_p, @object
+.size fe_ge_to_p2_p,8
+fe_ge_to_p2_p:
+ .quad fe_ge_to_p2_x64
+#else
+.section __DATA,__data
+.p2align 2
+_fe_ge_to_p2_p:
+ .quad _fe_ge_to_p2_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+.type fe_ge_to_p3_p, @object
+.size fe_ge_to_p3_p,8
+fe_ge_to_p3_p:
+ .quad fe_ge_to_p3_x64
+#else
+.section __DATA,__data
+.p2align 2
+_fe_ge_to_p3_p:
+ .quad _fe_ge_to_p3_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+.type fe_ge_dbl_p, @object
+.size fe_ge_dbl_p,8
+fe_ge_dbl_p:
+ .quad fe_ge_dbl_x64
+#else
+.section __DATA,__data
+.p2align 2
+_fe_ge_dbl_p:
+ .quad _fe_ge_dbl_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+.type fe_ge_madd_p, @object
+.size fe_ge_madd_p,8
+fe_ge_madd_p:
+ .quad fe_ge_madd_x64
+#else
+.section __DATA,__data
+.p2align 2
+_fe_ge_madd_p:
+ .quad _fe_ge_madd_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+.type fe_ge_msub_p, @object
+.size fe_ge_msub_p,8
+fe_ge_msub_p:
+ .quad fe_ge_msub_x64
+#else
+.section __DATA,__data
+.p2align 2
+_fe_ge_msub_p:
+ .quad _fe_ge_msub_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+.type fe_ge_add_p, @object
+.size fe_ge_add_p,8
+fe_ge_add_p:
+ .quad fe_ge_add_x64
+#else
+.section __DATA,__data
+.p2align 2
+_fe_ge_add_p:
+ .quad _fe_ge_add_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+.type fe_ge_sub_p, @object
+.size fe_ge_sub_p,8
+fe_ge_sub_p:
+ .quad fe_ge_sub_x64
+#else
+.section __DATA,__data
+.p2align 2
+_fe_ge_sub_p:
+ .quad _fe_ge_sub_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_mul_x64
+.type fe_mul_x64,@function
+.align 4
+fe_mul_x64:
+#else
+.section __TEXT,__text
+.globl _fe_mul_x64
+.p2align 2
+_fe_mul_x64:
+#endif /* __APPLE__ */
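+    # Schoolbook 4x4 multiply of 64-bit limbs via mulq, accumulating a
+    # 512-bit product in %r8-%r15, then reduction using 2^255 = 19 (mod p):
+    # the bits above bit 255 are multiplied by 19 and folded into the low
+    # half, and the small carry that produces is folded the same way.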
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbx
+ movq %rdx, %rcx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rcx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rcx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rcx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rcx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rcx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rcx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rcx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rcx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rcx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rcx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rcx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rcx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rcx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rcx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rcx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rcx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rbx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rbx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ popq %rbx
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size fe_mul_x64,.-fe_mul_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_sq_x64
+.type fe_sq_x64,@function
+.align 4
+fe_sq_x64:
+#else
+.section __TEXT,__text
+.globl _fe_sq_x64
+.p2align 2
+_fe_sq_x64:
+#endif /* __APPLE__ */
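+    # Squaring: only the distinct cross products A[i]*A[j] (i < j) are
+    # computed, doubled in one carry chain, then the diagonal squares are
+    # added in; reduction is the same 19-fold as fe_mul_x64.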
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ # Square
+ # A[0] * A[1]
+ movq (%rsi), %rax
+ mulq 8(%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * A[2]
+ movq (%rsi), %rax
+ mulq 16(%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[0] * A[3]
+ movq (%rsi), %rax
+ mulq 24(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * A[2]
+ movq 8(%rsi), %rax
+ mulq 16(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[1] * A[3]
+ movq 8(%rsi), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ # A[2] * A[3]
+ movq 16(%rsi), %rax
+ mulq 24(%rsi)
+ xorq %r13, %r13
+ addq %rax, %r12
+ adcq %rdx, %r13
+ # Double
+ xorq %r14, %r14
+ addq %r8, %r8
+ adcq %r9, %r9
+ adcq %r10, %r10
+ adcq %r11, %r11
+ adcq %r12, %r12
+ adcq %r13, %r13
+ adcq $0x00, %r14
+ # A[0] * A[0]
+ movq (%rsi), %rax
+ mulq %rax
+ movq %rax, %rcx
+ movq %rdx, %r15
+ # A[1] * A[1]
+ movq 8(%rsi), %rax
+ mulq %rax
+ addq %r15, %r8
+ adcq %rax, %r9
+ adcq $0x00, %rdx
+ movq %rdx, %r15
+ # A[2] * A[2]
+ movq 16(%rsi), %rax
+ mulq %rax
+ addq %r15, %r10
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ movq %rdx, %r15
+ # A[3] * A[3]
+ movq 24(%rsi), %rax
+ mulq %rax
+ addq %rax, %r13
+ adcq %rdx, %r14
+ addq %r15, %r12
+ adcq $0x00, %r13
+ adcq $0x00, %r14
+ # Reduce
+ movq $0x7fffffffffffffff, %r15
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ shldq $0x01, %r10, %r11
+ andq %r15, %r10
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r11
+ xorq %r11, %r11
+ addq %rax, %rcx
+ movq $19, %rax
+ adcq %rdx, %r11
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ # Add remaining product results in
+ addq %r11, %r8
+ adcq %r12, %r9
+ adcq %r13, %r10
+ adcq %rax, %r10
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r10, %rdx
+ imulq $19, %rdx, %rax
+ andq %r15, %r10
+ addq %rax, %rcx
+ adcq $0x00, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ # Reduce if top bit set
+ movq %r10, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %r15, %r10
+ addq %rax, %rcx
+ adcq $0x00, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ # Store
+ movq %rcx, (%rdi)
+ movq %r8, 8(%rdi)
+ movq %r9, 16(%rdi)
+ movq %r10, 24(%rdi)
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size fe_sq_x64,.-fe_sq_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_sq_n_x64
+.type fe_sq_n_x64,@function
+.align 4
+fe_sq_n_x64:
+#else
+.section __TEXT,__text
+.globl _fe_sq_n_x64
+.p2align 2
+_fe_sq_n_x64:
+#endif /* __APPLE__ */
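+    # Repeated squaring: %rdx holds the iteration count (moved to %cl).  The
+    # loop reads from the source and writes to the destination on every pass,
+    # so it is intended for in-place use (r == a), as fe_invert_x64 calls it.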
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbx
+ movq %rdx, %rcx
+L_fe_sq_n_x64:
+ # Square
+ # A[0] * A[1]
+ movq (%rsi), %rax
+ mulq 8(%rsi)
+ movq %rax, %r9
+ movq %rdx, %r10
+ # A[0] * A[2]
+ movq (%rsi), %rax
+ mulq 16(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[0] * A[3]
+ movq (%rsi), %rax
+ mulq 24(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r11
+ adcq %rdx, %r12
+ # A[1] * A[2]
+ movq 8(%rsi), %rax
+ mulq 16(%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * A[3]
+ movq 8(%rsi), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ # A[2] * A[3]
+ movq 16(%rsi), %rax
+ mulq 24(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r13
+ adcq %rdx, %r14
+ # Double
+ xorq %r15, %r15
+ addq %r9, %r9
+ adcq %r10, %r10
+ adcq %r11, %r11
+ adcq %r12, %r12
+ adcq %r13, %r13
+ adcq %r14, %r14
+ adcq $0x00, %r15
+ # A[0] * A[0]
+ movq (%rsi), %rax
+ mulq %rax
+ movq %rax, %r8
+ movq %rdx, %rbx
+ # A[1] * A[1]
+ movq 8(%rsi), %rax
+ mulq %rax
+ addq %rbx, %r9
+ adcq %rax, %r10
+ adcq $0x00, %rdx
+ movq %rdx, %rbx
+ # A[2] * A[2]
+ movq 16(%rsi), %rax
+ mulq %rax
+ addq %rbx, %r11
+ adcq %rax, %r12
+ adcq $0x00, %rdx
+ movq %rdx, %rbx
+ # A[3] * A[3]
+ movq 24(%rsi), %rax
+ mulq %rax
+ addq %rax, %r14
+ adcq %rdx, %r15
+ addq %rbx, %r13
+ adcq $0x00, %r14
+ adcq $0x00, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rbx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rbx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ decb %cl
+ jnz L_fe_sq_n_x64
+ popq %rbx
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size fe_sq_n_x64,.-fe_sq_n_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_mul121666_x64
+.type fe_mul121666_x64,@function
+.align 4
+fe_mul121666_x64:
+#else
+.section __TEXT,__text
+.globl _fe_mul121666_x64
+.p2align 2
+_fe_mul121666_x64:
+#endif /* __APPLE__ */
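+    # Multiply by the X25519 ladder constant 121666 = 0x1db42 = (486662+2)/4;
+    # the part of the product above bit 255 is multiplied by 19 and added
+    # back.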
+ pushq %r12
+ # Multiply by 121666
+ movq $0x1db42, %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ movq %rax, %r8
+ movq %rdx, %r9
+ movq $0x1db42, %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ movq $0x1db42, %rax
+ mulq 16(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ movq $0x1db42, %rax
+ mulq 24(%rsi)
+ movq $0x7fffffffffffffff, %rcx
+ addq %rax, %r11
+ adcq %rdx, %r12
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ movq $19, %rax
+ mulq %r12
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size fe_mul121666_x64,.-fe_mul121666_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_sq2_x64
+.type fe_sq2_x64,@function
+.align 4
+fe_sq2_x64:
+#else
+.section __TEXT,__text
+.globl _fe_sq2_x64
+.p2align 2
+_fe_sq2_x64:
+#endif /* __APPLE__ */
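+    # Computes 2*a^2: same cross-product/diagonal structure as fe_sq_x64,
+    # with the doubling folded into the reduction shifts; the bits that fall
+    # out at the top are weighted by 19*19 (0x169).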
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbx
+ # Square * 2
+ # A[0] * A[1]
+ movq (%rsi), %rax
+ mulq 8(%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * A[2]
+ movq (%rsi), %rax
+ mulq 16(%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[0] * A[3]
+ movq (%rsi), %rax
+ mulq 24(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * A[2]
+ movq 8(%rsi), %rax
+ mulq 16(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[1] * A[3]
+ movq 8(%rsi), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ # A[2] * A[3]
+ movq 16(%rsi), %rax
+ mulq 24(%rsi)
+ xorq %r13, %r13
+ addq %rax, %r12
+ adcq %rdx, %r13
+ # Double
+ xorq %r14, %r14
+ addq %r8, %r8
+ adcq %r9, %r9
+ adcq %r10, %r10
+ adcq %r11, %r11
+ adcq %r12, %r12
+ adcq %r13, %r13
+ adcq $0x00, %r14
+ # A[0] * A[0]
+ movq (%rsi), %rax
+ mulq %rax
+ movq %rax, %rcx
+ movq %rdx, %r15
+ # A[1] * A[1]
+ movq 8(%rsi), %rax
+ mulq %rax
+ addq %r15, %r8
+ adcq %rax, %r9
+ adcq $0x00, %rdx
+ movq %rdx, %r15
+ # A[2] * A[2]
+ movq 16(%rsi), %rax
+ mulq %rax
+ addq %r15, %r10
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ movq %rdx, %r15
+ # A[3] * A[3]
+ movq 24(%rsi), %rax
+ mulq %rax
+ addq %rax, %r13
+ adcq %rdx, %r14
+ addq %r15, %r12
+ adcq $0x00, %r13
+ adcq $0x00, %r14
+ # Reduce
+ movq $0x7fffffffffffffff, %rbx
+ xorq %rax, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $3, %r14, %rax
+ shldq $2, %r13, %r14
+ shldq $2, %r12, %r13
+ shldq $2, %r11, %r12
+ shldq $2, %r10, %r11
+ shldq $0x01, %r9, %r10
+ shldq $0x01, %r8, %r9
+ shldq $0x01, %rcx, %r8
+ shlq $0x01, %rcx
+ andq %rbx, %r10
+ # Two out left, one in right
+ andq %rbx, %r14
+ # Multiply top bits by 19*19
+ imulq $0x169, %rax, %r15
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r11
+ xorq %r11, %r11
+ addq %rax, %rcx
+ movq $19, %rax
+ adcq %rdx, %r11
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+    # Add remaining product results in
+ addq %r15, %rcx
+ adcq %r11, %r8
+ adcq %r12, %r9
+ adcq %r13, %r10
+ adcq %rax, %r10
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r10, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbx, %r10
+ addq %rax, %rcx
+ adcq $0x00, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ # Reduce if top bit set
+ movq %r10, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbx, %r10
+ addq %rax, %rcx
+ adcq $0x00, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ # Store
+ movq %rcx, (%rdi)
+ movq %r8, 8(%rdi)
+ movq %r9, 16(%rdi)
+ movq %r10, 24(%rdi)
+ popq %rbx
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size fe_sq2_x64,.-fe_sq2_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_invert_x64
+.type fe_invert_x64,@function
+.align 4
+fe_invert_x64:
+#else
+.section __TEXT,__text
+.globl _fe_invert_x64
+.p2align 2
+_fe_invert_x64:
+#endif /* __APPLE__ */
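+    # Fermat inversion a^(p-2) for p = 2^255-19, using the standard
+    # square-and-multiply chain over four 32-byte stack temporaries; the
+    # original %rdi/%rsi are saved at 128/136(%rsp) across the calls.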
+ subq $0x90, %rsp
+ # Invert
+ movq %rdi, 128(%rsp)
+ movq %rsi, 136(%rsp)
+ movq %rsp, %rdi
+ movq 136(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ movq %rsp, %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ movq 136(%rsp), %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ movq %rsp, %rdi
+ movq %rsp, %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ movq %rsp, %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ leaq 64(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ movq $4, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ movq $9, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ movq $19, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ leaq 64(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ movq $9, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ movq $49, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ movq $0x63, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ leaq 64(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ movq $49, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ movq $4, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ movq 128(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ movq %rsp, %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ movq 136(%rsp), %rsi
+ movq 128(%rsp), %rdi
+ addq $0x90, %rsp
+ repz retq
+#ifndef __APPLE__
+.size fe_invert_x64,.-fe_invert_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl curve25519_x64
+.type curve25519_x64,@function
+.align 4
+curve25519_x64:
+#else
+.section __TEXT,__text
+.globl _curve25519_x64
+.p2align 2
+_curve25519_x64:
+#endif /* __APPLE__ */
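+    # X25519 scalar multiplication via the Montgomery ladder, stepping down
+    # from bit 254 of the scalar.  The ladder variables live in r and three
+    # 32-byte stack slots; each step does a conditional swap whose mask is
+    # the XOR of the current and previous scalar bits, then the inlined
+    # add/double field arithmetic.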
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbx
+ pushq %rbp
+ movq %rdx, %r8
+ subq $0xb8, %rsp
+ xorq %rbx, %rbx
+ movq %rdi, 176(%rsp)
+ # Set one
+ movq $0x01, (%rdi)
+ movq $0x00, 8(%rdi)
+ movq $0x00, 16(%rdi)
+ movq $0x00, 24(%rdi)
+ # Set zero
+ movq $0x00, (%rsp)
+ movq $0x00, 8(%rsp)
+ movq $0x00, 16(%rsp)
+ movq $0x00, 24(%rsp)
+ # Set one
+ movq $0x01, 32(%rsp)
+ movq $0x00, 40(%rsp)
+ movq $0x00, 48(%rsp)
+ movq $0x00, 56(%rsp)
+ # Copy
+ movq (%r8), %rcx
+ movq 8(%r8), %r9
+ movq 16(%r8), %r10
+ movq 24(%r8), %r11
+ movq %rcx, 64(%rsp)
+ movq %r9, 72(%rsp)
+ movq %r10, 80(%rsp)
+ movq %r11, 88(%rsp)
+ movb $62, 168(%rsp)
+ movq $3, 160(%rsp)
+L_curve25519_x64_words:
+L_curve25519_x64_bits:
+ movq 160(%rsp), %r9
+ movb 168(%rsp), %cl
+ movq (%rsi,%r9,8), %rbp
+ shrq %cl, %rbp
+ andq $0x01, %rbp
+ xorq %rbp, %rbx
+ negq %rbx
+ # Conditional Swap
+ movq (%rdi), %rcx
+ movq 8(%rdi), %r9
+ movq 16(%rdi), %r10
+ movq 24(%rdi), %r11
+ xorq 64(%rsp), %rcx
+ xorq 72(%rsp), %r9
+ xorq 80(%rsp), %r10
+ xorq 88(%rsp), %r11
+ andq %rbx, %rcx
+ andq %rbx, %r9
+ andq %rbx, %r10
+ andq %rbx, %r11
+ xorq %rcx, (%rdi)
+ xorq %r9, 8(%rdi)
+ xorq %r10, 16(%rdi)
+ xorq %r11, 24(%rdi)
+ xorq %rcx, 64(%rsp)
+ xorq %r9, 72(%rsp)
+ xorq %r10, 80(%rsp)
+ xorq %r11, 88(%rsp)
+ # Conditional Swap
+ movq (%rsp), %rcx
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ xorq 32(%rsp), %rcx
+ xorq 40(%rsp), %r9
+ xorq 48(%rsp), %r10
+ xorq 56(%rsp), %r11
+ andq %rbx, %rcx
+ andq %rbx, %r9
+ andq %rbx, %r10
+ andq %rbx, %r11
+ xorq %rcx, (%rsp)
+ xorq %r9, 8(%rsp)
+ xorq %r10, 16(%rsp)
+ xorq %r11, 24(%rsp)
+ xorq %rcx, 32(%rsp)
+ xorq %r9, 40(%rsp)
+ xorq %r10, 48(%rsp)
+ xorq %r11, 56(%rsp)
+ movq %rbp, %rbx
+ # Add
+ movq (%rdi), %rcx
+ movq 8(%rdi), %r9
+ movq 16(%rdi), %r10
+ movq 24(%rdi), %rbp
+ movq %rcx, %r12
+ addq (%rsp), %rcx
+ movq %r9, %r13
+ adcq 8(%rsp), %r9
+ movq %r10, %r14
+ adcq 16(%rsp), %r10
+ movq %rbp, %r15
+ adcq 24(%rsp), %rbp
+ movq $-19, %rax
+ movq %rbp, %r11
+ movq $0x7fffffffffffffff, %rdx
+ sarq $63, %rbp
+ # Mask the modulus
+ andq %rbp, %rax
+ andq %rbp, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %rcx
+ sbbq %rbp, %r9
+ sbbq %rbp, %r10
+ sbbq %rdx, %r11
+ # Sub
+ subq (%rsp), %r12
+ movq $0x00, %rbp
+ sbbq 8(%rsp), %r13
+ movq $-19, %rax
+ sbbq 16(%rsp), %r14
+ movq $0x7fffffffffffffff, %rdx
+ sbbq 24(%rsp), %r15
+ sbbq $0x00, %rbp
+ # Mask the modulus
+ andq %rbp, %rax
+ andq %rbp, %rdx
+ # Add modulus (if underflow)
+ addq %rax, %r12
+ adcq %rbp, %r13
+ adcq %rbp, %r14
+ adcq %rdx, %r15
+ movq %rcx, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq %r12, 128(%rsp)
+ movq %r13, 136(%rsp)
+ movq %r14, 144(%rsp)
+ movq %r15, 152(%rsp)
+ # Add
+ movq 64(%rsp), %rcx
+ movq 72(%rsp), %r9
+ movq 80(%rsp), %r10
+ movq 88(%rsp), %rbp
+ movq %rcx, %r12
+ addq 32(%rsp), %rcx
+ movq %r9, %r13
+ adcq 40(%rsp), %r9
+ movq %r10, %r14
+ adcq 48(%rsp), %r10
+ movq %rbp, %r15
+ adcq 56(%rsp), %rbp
+ movq $-19, %rax
+ movq %rbp, %r11
+ movq $0x7fffffffffffffff, %rdx
+ sarq $63, %rbp
+ # Mask the modulus
+ andq %rbp, %rax
+ andq %rbp, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %rcx
+ sbbq %rbp, %r9
+ sbbq %rbp, %r10
+ sbbq %rdx, %r11
+ # Sub
+ subq 32(%rsp), %r12
+ movq $0x00, %rbp
+ sbbq 40(%rsp), %r13
+ movq $-19, %rax
+ sbbq 48(%rsp), %r14
+ movq $0x7fffffffffffffff, %rdx
+ sbbq 56(%rsp), %r15
+ sbbq $0x00, %rbp
+ # Mask the modulus
+ andq %rbp, %rax
+ andq %rbp, %rdx
+ # Add modulus (if underflow)
+ addq %rax, %r12
+ adcq %rbp, %r13
+ adcq %rbp, %r14
+ adcq %rdx, %r15
+ movq %rcx, (%rsp)
+ movq %r9, 8(%rsp)
+ movq %r10, 16(%rsp)
+ movq %r11, 24(%rsp)
+ movq %r12, 96(%rsp)
+ movq %r13, 104(%rsp)
+ movq %r14, 112(%rsp)
+ movq %r15, 120(%rsp)
+ # Multiply
+ # A[0] * B[0]
+ movq (%rdi), %rax
+ mulq 96(%rsp)
+ movq %rax, %rcx
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rdi), %rax
+ mulq 96(%rsp)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rdi), %rax
+ mulq 104(%rsp)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rdi), %rax
+ mulq 96(%rsp)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rdi), %rax
+ mulq 104(%rsp)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rdi), %rax
+ mulq 112(%rsp)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rdi), %rax
+ mulq 96(%rsp)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rdi), %rax
+ mulq 104(%rsp)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rdi), %rax
+ mulq 112(%rsp)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rdi), %rax
+ mulq 120(%rsp)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rdi), %rax
+ mulq 104(%rsp)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rdi), %rax
+ mulq 112(%rsp)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rdi), %rax
+ mulq 120(%rsp)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rdi), %rax
+ mulq 112(%rsp)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rdi), %rax
+ mulq 120(%rsp)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rdi), %rax
+ mulq 120(%rsp)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rbp
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rbp, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %rcx
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbp, %r11
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbp, %r11
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %rcx, 32(%rsp)
+ movq %r9, 40(%rsp)
+ movq %r10, 48(%rsp)
+ movq %r11, 56(%rsp)
+ # Multiply
+ # A[0] * B[0]
+ movq 128(%rsp), %rax
+ mulq (%rsp)
+ movq %rax, %rcx
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 136(%rsp), %rax
+ mulq (%rsp)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq 128(%rsp), %rax
+ mulq 8(%rsp)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 144(%rsp), %rax
+ mulq (%rsp)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 136(%rsp), %rax
+ mulq 8(%rsp)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq 128(%rsp), %rax
+ mulq 16(%rsp)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 152(%rsp), %rax
+ mulq (%rsp)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 144(%rsp), %rax
+ mulq 8(%rsp)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 136(%rsp), %rax
+ mulq 16(%rsp)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq 128(%rsp), %rax
+ mulq 24(%rsp)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 152(%rsp), %rax
+ mulq 8(%rsp)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 144(%rsp), %rax
+ mulq 16(%rsp)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 136(%rsp), %rax
+ mulq 24(%rsp)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 152(%rsp), %rax
+ mulq 16(%rsp)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 144(%rsp), %rax
+ mulq 24(%rsp)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 152(%rsp), %rax
+ mulq 24(%rsp)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rbp
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rbp, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %rcx
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbp, %r11
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbp, %r11
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %rcx, (%rsp)
+ movq %r9, 8(%rsp)
+ movq %r10, 16(%rsp)
+ movq %r11, 24(%rsp)
+ # Square
+ # A[0] * A[1]
+ movq 128(%rsp), %rax
+ mulq 136(%rsp)
+ movq %rax, %r9
+ movq %rdx, %r10
+ # A[0] * A[2]
+ movq 128(%rsp), %rax
+ mulq 144(%rsp)
+ xorq %r11, %r11
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[0] * A[3]
+ movq 128(%rsp), %rax
+ mulq 152(%rsp)
+ xorq %r12, %r12
+ addq %rax, %r11
+ adcq %rdx, %r12
+ # A[1] * A[2]
+ movq 136(%rsp), %rax
+ mulq 144(%rsp)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * A[3]
+ movq 136(%rsp), %rax
+ mulq 152(%rsp)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ # A[2] * A[3]
+ movq 144(%rsp), %rax
+ mulq 152(%rsp)
+ xorq %r14, %r14
+ addq %rax, %r13
+ adcq %rdx, %r14
+ # Double
+ xorq %r15, %r15
+ addq %r9, %r9
+ adcq %r10, %r10
+ adcq %r11, %r11
+ adcq %r12, %r12
+ adcq %r13, %r13
+ adcq %r14, %r14
+ adcq $0x00, %r15
+ # A[0] * A[0]
+ movq 128(%rsp), %rax
+ mulq %rax
+ movq %rax, %rcx
+ movq %rdx, %rbp
+ # A[1] * A[1]
+ movq 136(%rsp), %rax
+ mulq %rax
+ addq %rbp, %r9
+ adcq %rax, %r10
+ adcq $0x00, %rdx
+ movq %rdx, %rbp
+ # A[2] * A[2]
+ movq 144(%rsp), %rax
+ mulq %rax
+ addq %rbp, %r11
+ adcq %rax, %r12
+ adcq $0x00, %rdx
+ movq %rdx, %rbp
+ # A[3] * A[3]
+ movq 152(%rsp), %rax
+ mulq %rax
+ addq %rax, %r14
+ adcq %rdx, %r15
+ addq %rbp, %r13
+ adcq $0x00, %r14
+ adcq $0x00, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rbp
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rbp, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %rcx
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbp, %r11
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbp, %r11
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %rcx, 96(%rsp)
+ movq %r9, 104(%rsp)
+ movq %r10, 112(%rsp)
+ movq %r11, 120(%rsp)
+ # Square
+ # A[0] * A[1]
+ movq (%rdi), %rax
+ mulq 8(%rdi)
+ movq %rax, %r9
+ movq %rdx, %r10
+ # A[0] * A[2]
+ movq (%rdi), %rax
+ mulq 16(%rdi)
+ xorq %r11, %r11
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[0] * A[3]
+ movq (%rdi), %rax
+ mulq 24(%rdi)
+ xorq %r12, %r12
+ addq %rax, %r11
+ adcq %rdx, %r12
+ # A[1] * A[2]
+ movq 8(%rdi), %rax
+ mulq 16(%rdi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * A[3]
+ movq 8(%rdi), %rax
+ mulq 24(%rdi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ # A[2] * A[3]
+ movq 16(%rdi), %rax
+ mulq 24(%rdi)
+ xorq %r14, %r14
+ addq %rax, %r13
+ adcq %rdx, %r14
+ # Double
+ xorq %r15, %r15
+ addq %r9, %r9
+ adcq %r10, %r10
+ adcq %r11, %r11
+ adcq %r12, %r12
+ adcq %r13, %r13
+ adcq %r14, %r14
+ adcq $0x00, %r15
+ # A[0] * A[0]
+ movq (%rdi), %rax
+ mulq %rax
+ movq %rax, %rcx
+ movq %rdx, %rbp
+ # A[1] * A[1]
+ movq 8(%rdi), %rax
+ mulq %rax
+ addq %rbp, %r9
+ adcq %rax, %r10
+ adcq $0x00, %rdx
+ movq %rdx, %rbp
+ # A[2] * A[2]
+ movq 16(%rdi), %rax
+ mulq %rax
+ addq %rbp, %r11
+ adcq %rax, %r12
+ adcq $0x00, %rdx
+ movq %rdx, %rbp
+ # A[3] * A[3]
+ movq 24(%rdi), %rax
+ mulq %rax
+ addq %rax, %r14
+ adcq %rdx, %r15
+ addq %rbp, %r13
+ adcq $0x00, %r14
+ adcq $0x00, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rbp
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rbp, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %rcx
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbp, %r11
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbp, %r11
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %rcx, 128(%rsp)
+ movq %r9, 136(%rsp)
+ movq %r10, 144(%rsp)
+ movq %r11, 152(%rsp)
+ # Add
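+    # Compute both the sum and the difference of the two field elements in one
+    # pass.  The final carry/borrow is turned into a mask so the modulus is
+    # conditionally subtracted or added back without branching.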
+ movq 32(%rsp), %rcx
+ movq 40(%rsp), %r9
+ movq 48(%rsp), %r10
+ movq 56(%rsp), %rbp
+ movq %rcx, %r12
+ addq (%rsp), %rcx
+ movq %r9, %r13
+ adcq 8(%rsp), %r9
+ movq %r10, %r14
+ adcq 16(%rsp), %r10
+ movq %rbp, %r15
+ adcq 24(%rsp), %rbp
+ movq $-19, %rax
+ movq %rbp, %r11
+ movq $0x7fffffffffffffff, %rdx
+ sarq $63, %rbp
+ # Mask the modulus
+ andq %rbp, %rax
+ andq %rbp, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %rcx
+ sbbq %rbp, %r9
+ sbbq %rbp, %r10
+ sbbq %rdx, %r11
+ # Sub
+ subq (%rsp), %r12
+ movq $0x00, %rbp
+ sbbq 8(%rsp), %r13
+ movq $-19, %rax
+ sbbq 16(%rsp), %r14
+ movq $0x7fffffffffffffff, %rdx
+ sbbq 24(%rsp), %r15
+ sbbq $0x00, %rbp
+ # Mask the modulus
+ andq %rbp, %rax
+ andq %rbp, %rdx
+ # Add modulus (if underflow)
+ addq %rax, %r12
+ adcq %rbp, %r13
+ adcq %rbp, %r14
+ adcq %rdx, %r15
+ movq %rcx, 64(%rsp)
+ movq %r9, 72(%rsp)
+ movq %r10, 80(%rsp)
+ movq %r11, 88(%rsp)
+ movq %r12, (%rsp)
+ movq %r13, 8(%rsp)
+ movq %r14, 16(%rsp)
+ movq %r15, 24(%rsp)
+ # Multiply
+ # A[0] * B[0]
+ movq 96(%rsp), %rax
+ mulq 128(%rsp)
+ movq %rax, %rcx
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 104(%rsp), %rax
+ mulq 128(%rsp)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq 96(%rsp), %rax
+ mulq 136(%rsp)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 112(%rsp), %rax
+ mulq 128(%rsp)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 104(%rsp), %rax
+ mulq 136(%rsp)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq 96(%rsp), %rax
+ mulq 144(%rsp)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 120(%rsp), %rax
+ mulq 128(%rsp)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 112(%rsp), %rax
+ mulq 136(%rsp)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 104(%rsp), %rax
+ mulq 144(%rsp)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq 96(%rsp), %rax
+ mulq 152(%rsp)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 120(%rsp), %rax
+ mulq 136(%rsp)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 112(%rsp), %rax
+ mulq 144(%rsp)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 104(%rsp), %rax
+ mulq 152(%rsp)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 120(%rsp), %rax
+ mulq 144(%rsp)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 112(%rsp), %rax
+ mulq 152(%rsp)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 120(%rsp), %rax
+ mulq 152(%rsp)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rbp
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rbp, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %rcx
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbp, %r11
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbp, %r11
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %rcx, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ # Sub
+ movq 128(%rsp), %rcx
+ movq 136(%rsp), %r9
+ movq 144(%rsp), %r10
+ movq 152(%rsp), %r11
+ subq 96(%rsp), %rcx
+ movq $0x00, %rbp
+ sbbq 104(%rsp), %r9
+ movq $-19, %rax
+ sbbq 112(%rsp), %r10
+ movq $0x7fffffffffffffff, %rdx
+ sbbq 120(%rsp), %r11
+ sbbq $0x00, %rbp
+ # Mask the modulus
+ andq %rbp, %rax
+ andq %rbp, %rdx
+ # Add modulus (if underflow)
+ addq %rax, %rcx
+ adcq %rbp, %r9
+ adcq %rbp, %r10
+ adcq %rdx, %r11
+ movq %rcx, 128(%rsp)
+ movq %r9, 136(%rsp)
+ movq %r10, 144(%rsp)
+ movq %r11, 152(%rsp)
+ # Square
+ # A[0] * A[1]
+ movq (%rsp), %rax
+ mulq 8(%rsp)
+ movq %rax, %r9
+ movq %rdx, %r10
+ # A[0] * A[2]
+ movq (%rsp), %rax
+ mulq 16(%rsp)
+ xorq %r11, %r11
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[0] * A[3]
+ movq (%rsp), %rax
+ mulq 24(%rsp)
+ xorq %r12, %r12
+ addq %rax, %r11
+ adcq %rdx, %r12
+ # A[1] * A[2]
+ movq 8(%rsp), %rax
+ mulq 16(%rsp)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * A[3]
+ movq 8(%rsp), %rax
+ mulq 24(%rsp)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ # A[2] * A[3]
+ movq 16(%rsp), %rax
+ mulq 24(%rsp)
+ xorq %r14, %r14
+ addq %rax, %r13
+ adcq %rdx, %r14
+ # Double
+ xorq %r15, %r15
+ addq %r9, %r9
+ adcq %r10, %r10
+ adcq %r11, %r11
+ adcq %r12, %r12
+ adcq %r13, %r13
+ adcq %r14, %r14
+ adcq $0x00, %r15
+ # A[0] * A[0]
+ movq (%rsp), %rax
+ mulq %rax
+ movq %rax, %rcx
+ movq %rdx, %rbp
+ # A[1] * A[1]
+ movq 8(%rsp), %rax
+ mulq %rax
+ addq %rbp, %r9
+ adcq %rax, %r10
+ adcq $0x00, %rdx
+ movq %rdx, %rbp
+ # A[2] * A[2]
+ movq 16(%rsp), %rax
+ mulq %rax
+ addq %rbp, %r11
+ adcq %rax, %r12
+ adcq $0x00, %rdx
+ movq %rdx, %rbp
+ # A[3] * A[3]
+ movq 24(%rsp), %rax
+ mulq %rax
+ addq %rax, %r14
+ adcq %rdx, %r15
+ addq %rbp, %r13
+ adcq $0x00, %r14
+ adcq $0x00, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rbp
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rbp, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %rcx
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbp, %r11
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbp, %r11
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %rcx, (%rsp)
+ movq %r9, 8(%rsp)
+ movq %r10, 16(%rsp)
+ movq %r11, 24(%rsp)
+ # Multiply by 121666
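+    # 0x1db42 = 121666 = (486662 + 2) / 4, the a24 constant of the x25519
+    # Montgomery ladder step.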
+ movq $0x1db42, %rax
+ mulq 128(%rsp)
+ xorq %r10, %r10
+ movq %rax, %rcx
+ movq %rdx, %r9
+ movq $0x1db42, %rax
+ mulq 136(%rsp)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ movq $0x1db42, %rax
+ mulq 144(%rsp)
+ xorq %r13, %r13
+ addq %rax, %r10
+ adcq %rdx, %r11
+ movq $0x1db42, %rax
+ mulq 152(%rsp)
+ movq $0x7fffffffffffffff, %r12
+ addq %rax, %r11
+ adcq %rdx, %r13
+ shldq $0x01, %r11, %r13
+ andq %r12, %r11
+ movq $19, %rax
+ mulq %r13
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ movq %rcx, 32(%rsp)
+ movq %r9, 40(%rsp)
+ movq %r10, 48(%rsp)
+ movq %r11, 56(%rsp)
+ # Square
+ # A[0] * A[1]
+ movq 64(%rsp), %rax
+ mulq 72(%rsp)
+ movq %rax, %r9
+ movq %rdx, %r10
+ # A[0] * A[2]
+ movq 64(%rsp), %rax
+ mulq 80(%rsp)
+ xorq %r11, %r11
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[0] * A[3]
+ movq 64(%rsp), %rax
+ mulq 88(%rsp)
+ xorq %r12, %r12
+ addq %rax, %r11
+ adcq %rdx, %r12
+ # A[1] * A[2]
+ movq 72(%rsp), %rax
+ mulq 80(%rsp)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * A[3]
+ movq 72(%rsp), %rax
+ mulq 88(%rsp)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ # A[2] * A[3]
+ movq 80(%rsp), %rax
+ mulq 88(%rsp)
+ xorq %r14, %r14
+ addq %rax, %r13
+ adcq %rdx, %r14
+ # Double
+ xorq %r15, %r15
+ addq %r9, %r9
+ adcq %r10, %r10
+ adcq %r11, %r11
+ adcq %r12, %r12
+ adcq %r13, %r13
+ adcq %r14, %r14
+ adcq $0x00, %r15
+ # A[0] * A[0]
+ movq 64(%rsp), %rax
+ mulq %rax
+ movq %rax, %rcx
+ movq %rdx, %rbp
+ # A[1] * A[1]
+ movq 72(%rsp), %rax
+ mulq %rax
+ addq %rbp, %r9
+ adcq %rax, %r10
+ adcq $0x00, %rdx
+ movq %rdx, %rbp
+ # A[2] * A[2]
+ movq 80(%rsp), %rax
+ mulq %rax
+ addq %rbp, %r11
+ adcq %rax, %r12
+ adcq $0x00, %rdx
+ movq %rdx, %rbp
+ # A[3] * A[3]
+ movq 88(%rsp), %rax
+ mulq %rax
+ addq %rax, %r14
+ adcq %rdx, %r15
+ addq %rbp, %r13
+ adcq $0x00, %r14
+ adcq $0x00, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rbp
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rbp, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %rcx
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbp, %r11
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbp, %r11
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %rcx, 64(%rsp)
+ movq %r9, 72(%rsp)
+ movq %r10, 80(%rsp)
+ movq %r11, 88(%rsp)
+ # Add
+ movq 96(%rsp), %rcx
+ movq 104(%rsp), %r9
+ addq 32(%rsp), %rcx
+ movq 112(%rsp), %r10
+ adcq 40(%rsp), %r9
+ movq 120(%rsp), %rbp
+ adcq 48(%rsp), %r10
+ movq $-19, %rax
+ adcq 56(%rsp), %rbp
+ movq $0x7fffffffffffffff, %rdx
+ movq %rbp, %r11
+ sarq $63, %rbp
+ # Mask the modulus
+ andq %rbp, %rax
+ andq %rbp, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %rcx
+ sbbq %rbp, %r9
+ sbbq %rbp, %r10
+ sbbq %rdx, %r11
+ movq %rcx, 96(%rsp)
+ movq %r9, 104(%rsp)
+ movq %r10, 112(%rsp)
+ movq %r11, 120(%rsp)
+ # Multiply
+ # A[0] * B[0]
+ movq (%rsp), %rax
+ mulq (%r8)
+ movq %rax, %rcx
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rsp), %rax
+ mulq (%r8)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rsp), %rax
+ mulq 8(%r8)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rsp), %rax
+ mulq (%r8)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rsp), %rax
+ mulq 8(%r8)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rsp), %rax
+ mulq 16(%r8)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rsp), %rax
+ mulq (%r8)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rsp), %rax
+ mulq 8(%r8)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rsp), %rax
+ mulq 16(%r8)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rsp), %rax
+ mulq 24(%r8)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rsp), %rax
+ mulq 8(%r8)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rsp), %rax
+ mulq 16(%r8)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rsp), %rax
+ mulq 24(%r8)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rsp), %rax
+ mulq 16(%r8)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rsp), %rax
+ mulq 24(%r8)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rsp), %rax
+ mulq 24(%r8)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rbp
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rbp, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %rcx
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbp, %r11
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbp, %r11
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %rcx, 32(%rsp)
+ movq %r9, 40(%rsp)
+ movq %r10, 48(%rsp)
+ movq %r11, 56(%rsp)
+ # Multiply
+ # A[0] * B[0]
+ movq 96(%rsp), %rax
+ mulq 128(%rsp)
+ movq %rax, %rcx
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 104(%rsp), %rax
+ mulq 128(%rsp)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq 96(%rsp), %rax
+ mulq 136(%rsp)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 112(%rsp), %rax
+ mulq 128(%rsp)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 104(%rsp), %rax
+ mulq 136(%rsp)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq 96(%rsp), %rax
+ mulq 144(%rsp)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 120(%rsp), %rax
+ mulq 128(%rsp)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 112(%rsp), %rax
+ mulq 136(%rsp)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 104(%rsp), %rax
+ mulq 144(%rsp)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq 96(%rsp), %rax
+ mulq 152(%rsp)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 120(%rsp), %rax
+ mulq 136(%rsp)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 112(%rsp), %rax
+ mulq 144(%rsp)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 104(%rsp), %rax
+ mulq 152(%rsp)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 120(%rsp), %rax
+ mulq 144(%rsp)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 112(%rsp), %rax
+ mulq 152(%rsp)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 120(%rsp), %rax
+ mulq 152(%rsp)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rbp
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rbp, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %rcx
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbp, %r11
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbp, %r11
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %rcx, (%rsp)
+ movq %r9, 8(%rsp)
+ movq %r10, 16(%rsp)
+ movq %r11, 24(%rsp)
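+    # End of one ladder step: 168(%rsp) counts down the bits of the current
+    # 64-bit scalar word (reset to 63 below), 160(%rsp) counts down the words.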
+ decb 168(%rsp)
+ jge L_curve25519_x64_bits
+ movq $63, 168(%rsp)
+ decb 160(%rsp)
+ jge L_curve25519_x64_words
+ # Invert
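+    # Invert the ladder's Z result at (%rsp): Z^-1 = Z^(p-2) = Z^(2^255 - 21)
+    # (Fermat), computed with a fixed chain of squarings and multiplications.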
+ leaq 32(%rsp), %rdi
+ movq %rsp, %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ movq %rsp, %rsi
+ leaq 64(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ leaq 64(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ leaq 96(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ movq $4, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ leaq 64(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ movq $9, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ leaq 64(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 128(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 128(%rsp), %rdi
+ leaq 128(%rsp), %rsi
+ movq $19, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 128(%rsp), %rsi
+ leaq 96(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ movq $9, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ leaq 64(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ movq $49, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ leaq 64(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 128(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 128(%rsp), %rdi
+ leaq 128(%rsp), %rsi
+ movq $0x63, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 128(%rsp), %rsi
+ leaq 96(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ movq $49, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ leaq 64(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ movq $4, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ movq %rsp, %rdi
+ leaq 64(%rsp), %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ movq 176(%rsp), %rdi
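+    # Reload the output pointer saved on the stack at entry; the ladder left
+    # its X result there, so this final multiply by the inverse at (%rsp)
+    # yields the affine x-coordinate in place.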
+ # Multiply
+ # A[0] * B[0]
+ movq (%rsp), %rax
+ mulq (%rdi)
+ movq %rax, %rcx
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rsp), %rax
+ mulq (%rdi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rsp), %rax
+ mulq 8(%rdi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rsp), %rax
+ mulq (%rdi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rsp), %rax
+ mulq 8(%rdi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rsp), %rax
+ mulq 16(%rdi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rsp), %rax
+ mulq (%rdi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rsp), %rax
+ mulq 8(%rdi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rsp), %rax
+ mulq 16(%rdi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rsp), %rax
+ mulq 24(%rdi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rsp), %rax
+ mulq 8(%rdi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rsp), %rax
+ mulq 16(%rdi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rsp), %rax
+ mulq 24(%rdi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rsp), %rax
+ mulq 16(%rdi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rsp), %rax
+ mulq 24(%rdi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rsp), %rax
+ mulq 24(%rdi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rbp
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rbp, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %rcx
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbp, %r11
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbp, %r11
+ addq %rax, %rcx
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %rcx, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ xorq %rax, %rax
+ addq $0xb8, %rsp
+ popq %rbp
+ popq %rbx
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size curve25519_x64,.-curve25519_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_pow22523_x64
+.type fe_pow22523_x64,@function
+.align 4
+fe_pow22523_x64:
+#else
+.section __TEXT,__text
+.globl _fe_pow22523_x64
+.p2align 2
+_fe_pow22523_x64:
+#endif /* __APPLE__ */
+ subq $0x70, %rsp
+ # pow22523
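+    # Compute z^((p-5)/8) = z^(2^252 - 3) with a fixed square-and-multiply
+    # chain; used for the square-root step of Ed25519 point decompression.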
+ movq %rdi, 96(%rsp)
+ movq %rsi, 104(%rsp)
+ movq %rsp, %rdi
+ movq 104(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ movq %rsp, %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ movq 104(%rsp), %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ movq %rsp, %rdi
+ movq %rsp, %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ movq %rsp, %rdi
+ movq %rsp, %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ movq %rsp, %rdi
+ leaq 32(%rsp), %rsi
+ movq %rsp, %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ movq %rsp, %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ movq $4, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ movq %rsp, %rdi
+ leaq 32(%rsp), %rsi
+ movq %rsp, %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ movq %rsp, %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ movq $9, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ movq %rsp, %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ movq $19, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ movq $9, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ movq %rsp, %rdi
+ leaq 32(%rsp), %rsi
+ movq %rsp, %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ movq %rsp, %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ movq $49, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ movq %rsp, %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ movq $0x63, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ movq $49, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_x64@plt
+#else
+ callq _fe_sq_n_x64
+#endif /* __APPLE__ */
+ movq %rsp, %rdi
+ leaq 32(%rsp), %rsi
+ movq %rsp, %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ movq %rsp, %rdi
+ movq %rsp, %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ movq %rsp, %rdi
+ movq %rsp, %rsi
+#ifndef __APPLE__
+ callq fe_sq_x64@plt
+#else
+ callq _fe_sq_x64
+#endif /* __APPLE__ */
+ movq 96(%rsp), %rdi
+ movq %rsp, %rsi
+ movq 104(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_x64@plt
+#else
+ callq _fe_mul_x64
+#endif /* __APPLE__ */
+ movq 104(%rsp), %rsi
+ movq 96(%rsp), %rdi
+ addq $0x70, %rsp
+ repz retq
+#ifndef __APPLE__
+.size fe_pow22523_x64,.-fe_pow22523_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_ge_to_p2_x64
+.type fe_ge_to_p2_x64,@function
+.align 4
+fe_ge_to_p2_x64:
+#else
+.section __TEXT,__text
+.globl _fe_ge_to_p2_x64
+.p2align 2
+_fe_ge_to_p2_x64:
+#endif /* __APPLE__ */
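+    # Convert a completed (P1xP1) point to projective (P2) coordinates with
+    # three field multiplications: X3 = X1*T1, Y3 = Y1*Z1, Z3 = Z1*T1.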
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $40, %rsp
+ movq %rsi, (%rsp)
+ movq %rdx, 8(%rsp)
+ movq %rcx, 16(%rsp)
+ movq %r8, 24(%rsp)
+ movq %r9, 32(%rsp)
+ movq 16(%rsp), %rsi
+ movq 88(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq (%rsp), %rdi
+ movq 24(%rsp), %rsi
+ movq 32(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rdi
+ movq 32(%rsp), %rsi
+ movq 88(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ addq $40, %rsp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size fe_ge_to_p2_x64,.-fe_ge_to_p2_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_ge_to_p3_x64
+.type fe_ge_to_p3_x64,@function
+.align 4
+fe_ge_to_p3_x64:
+#else
+.section __TEXT,__text
+.globl _fe_ge_to_p3_x64
+.p2align 2
+_fe_ge_to_p3_x64:
+#endif /* __APPLE__ */
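+    # Convert a completed (P1xP1) point to extended (P3) coordinates with four
+    # field multiplications: X3 = X1*T1, Y3 = Y1*Z1, Z3 = Z1*T1, T3 = X1*Y1.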
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $40, %rsp
+ movq %rsi, (%rsp)
+ movq %rdx, 8(%rsp)
+ movq %rcx, 16(%rsp)
+ movq %r8, 24(%rsp)
+ movq %r9, 32(%rsp)
+ movq 24(%rsp), %rsi
+ movq 96(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq (%rsp), %rdi
+ movq 32(%rsp), %rsi
+ movq 88(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rdi
+ movq 88(%rsp), %rsi
+ movq 96(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 16(%rsp), %rdi
+ movq 24(%rsp), %rsi
+ movq 32(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ addq $40, %rsp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size fe_ge_to_p3_x64,.-fe_ge_to_p3_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_ge_dbl_x64
+.type fe_ge_dbl_x64,@function
+.align 4
+fe_ge_dbl_x64:
+#else
+.section __TEXT,__text
+.globl _fe_ge_dbl_x64
+.p2align 2
+_fe_ge_dbl_x64:
+#endif /* __APPLE__ */
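+    # Point doubling: square X1 and Y1, compute 2*Z1^2 and (X1+Y1)^2, then
+    # combine them into the completed (P1xP1) result consumed by the p2/p3
+    # conversions above.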
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $0x50, %rsp
+ movq %rdi, (%rsp)
+ movq %rsi, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rcx, 24(%rsp)
+ movq %r8, 32(%rsp)
+ movq %r9, 40(%rsp)
+ movq (%rsp), %rdi
+ movq 32(%rsp), %rsi
+ # Square
+ # A[0] * A[1]
+ movq (%rsi), %rax
+ mulq 8(%rsi)
+ movq %rax, %r9
+ movq %rdx, %r10
+ # A[0] * A[2]
+ movq (%rsi), %rax
+ mulq 16(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[0] * A[3]
+ movq (%rsi), %rax
+ mulq 24(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r11
+ adcq %rdx, %r12
+ # A[1] * A[2]
+ movq 8(%rsi), %rax
+ mulq 16(%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * A[3]
+ movq 8(%rsi), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ # A[2] * A[3]
+ movq 16(%rsi), %rax
+ mulq 24(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r13
+ adcq %rdx, %r14
+ # Double
+ xorq %r15, %r15
+ addq %r9, %r9
+ adcq %r10, %r10
+ adcq %r11, %r11
+ adcq %r12, %r12
+ adcq %r13, %r13
+ adcq %r14, %r14
+ adcq $0x00, %r15
+ # A[0] * A[0]
+ movq (%rsi), %rax
+ mulq %rax
+ movq %rax, %r8
+ movq %rdx, %rcx
+ # A[1] * A[1]
+ movq 8(%rsi), %rax
+ mulq %rax
+ addq %rcx, %r9
+ adcq %rax, %r10
+ adcq $0x00, %rdx
+ movq %rdx, %rcx
+ # A[2] * A[2]
+ movq 16(%rsi), %rax
+ mulq %rax
+ addq %rcx, %r11
+ adcq %rax, %r12
+ adcq $0x00, %rdx
+ movq %rdx, %rcx
+ # A[3] * A[3]
+ movq 24(%rsi), %rax
+ mulq %rax
+ addq %rax, %r14
+ adcq %rdx, %r15
+ addq %rcx, %r13
+ adcq $0x00, %r14
+ adcq $0x00, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 16(%rsp), %rdi
+ movq 40(%rsp), %rsi
+ # Square
+ # A[0] * A[1]
+ movq (%rsi), %rax
+ mulq 8(%rsi)
+ movq %rax, %r9
+ movq %rdx, %r10
+ # A[0] * A[2]
+ movq (%rsi), %rax
+ mulq 16(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[0] * A[3]
+ movq (%rsi), %rax
+ mulq 24(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r11
+ adcq %rdx, %r12
+ # A[1] * A[2]
+ movq 8(%rsi), %rax
+ mulq 16(%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * A[3]
+ movq 8(%rsi), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ # A[2] * A[3]
+ movq 16(%rsi), %rax
+ mulq 24(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r13
+ adcq %rdx, %r14
+ # Double
+ xorq %r15, %r15
+ addq %r9, %r9
+ adcq %r10, %r10
+ adcq %r11, %r11
+ adcq %r12, %r12
+ adcq %r13, %r13
+ adcq %r14, %r14
+ adcq $0x00, %r15
+ # A[0] * A[0]
+ movq (%rsi), %rax
+ mulq %rax
+ movq %rax, %r8
+ movq %rdx, %rcx
+ # A[1] * A[1]
+ movq 8(%rsi), %rax
+ mulq %rax
+ addq %rcx, %r9
+ adcq %rax, %r10
+ adcq $0x00, %rdx
+ movq %rdx, %rcx
+ # A[2] * A[2]
+ movq 16(%rsi), %rax
+ mulq %rax
+ addq %rcx, %r11
+ adcq %rax, %r12
+ adcq $0x00, %rdx
+ movq %rdx, %rcx
+ # A[3] * A[3]
+ movq 24(%rsi), %rax
+ mulq %rax
+ addq %rax, %r14
+ adcq %rdx, %r15
+ addq %rcx, %r13
+ adcq $0x00, %r14
+ adcq $0x00, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 24(%rsp), %rdi
+ movq 128(%rsp), %rsi
+ # Square * 2
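+ # Computes 2*a^2 mod p.  The doubling is merged into the reduction: the low
+ # limbs are shifted left by 1 and the high limbs by 2, and the bits shifted
+ # out of the very top are worth (2^255)^2 == 19*19 = 0x169 (mod p).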
+ # A[0] * A[1]
+ movq (%rsi), %rax
+ mulq 8(%rsi)
+ movq %rax, %r9
+ movq %rdx, %r10
+ # A[0] * A[2]
+ movq (%rsi), %rax
+ mulq 16(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[0] * A[3]
+ movq (%rsi), %rax
+ mulq 24(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r11
+ adcq %rdx, %r12
+ # A[1] * A[2]
+ movq 8(%rsi), %rax
+ mulq 16(%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * A[3]
+ movq 8(%rsi), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ # A[2] * A[3]
+ movq 16(%rsi), %rax
+ mulq 24(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r13
+ adcq %rdx, %r14
+ # Double
+ xorq %r15, %r15
+ addq %r9, %r9
+ adcq %r10, %r10
+ adcq %r11, %r11
+ adcq %r12, %r12
+ adcq %r13, %r13
+ adcq %r14, %r14
+ adcq $0x00, %r15
+ # A[0] * A[0]
+ movq (%rsi), %rax
+ mulq %rax
+ movq %rax, %r8
+ movq %rdx, %rcx
+ # A[1] * A[1]
+ movq 8(%rsi), %rax
+ mulq %rax
+ addq %rcx, %r9
+ adcq %rax, %r10
+ adcq $0x00, %rdx
+ movq %rdx, %rcx
+ # A[2] * A[2]
+ movq 16(%rsi), %rax
+ mulq %rax
+ addq %rcx, %r11
+ adcq %rax, %r12
+ adcq $0x00, %rdx
+ movq %rdx, %rcx
+ # A[3] * A[3]
+ movq 24(%rsi), %rax
+ mulq %rax
+ addq %rax, %r14
+ adcq %rdx, %r15
+ addq %rcx, %r13
+ adcq $0x00, %r14
+ adcq $0x00, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rbx
+ xorq %rax, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $3, %r15, %rax
+ shldq $2, %r14, %r15
+ shldq $2, %r13, %r14
+ shldq $2, %r12, %r13
+ shldq $2, %r11, %r12
+ shldq $0x01, %r10, %r11
+ shldq $0x01, %r9, %r10
+ shldq $0x01, %r8, %r9
+ shlq $0x01, %r8
+ andq %rbx, %r11
+ # Two out left, one in right: rax captured the old top three bits of r15,
+ # and the lowest of them is still r15's top bit, so clear it here to avoid
+ # counting it twice
+ andq %rbx, %r15
+ # Multiply top bits by 19*19
+ imulq $0x169, %rax, %rcx
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %rcx, %r8
+ adcq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rdi
+ movq 32(%rsp), %rsi
+ movq 40(%rsp), %rbx
+ # Add
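+ # 255-bit addition: if the sum sets bit 255, rcx becomes an all-ones mask
+ # and p = 2^255-19 (masked limbs -19, -1, -1, 0x7fffffffffffffff) is
+ # subtracted once to bring the result back under 2^255.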
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ addq (%rbx), %r8
+ movq 16(%rsi), %r10
+ adcq 8(%rbx), %r9
+ movq 24(%rsi), %rcx
+ adcq 16(%rbx), %r10
+ movq $-19, %rax
+ adcq 24(%rbx), %rcx
+ movq $0x7fffffffffffffff, %rdx
+ movq %rcx, %r11
+ sarq $63, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %r8
+ sbbq %rcx, %r9
+ sbbq %rcx, %r10
+ sbbq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ leaq 48(%rsp), %rdi
+ movq 8(%rsp), %rsi
+ # Square
+ # A[0] * A[1]
+ movq (%rsi), %rax
+ mulq 8(%rsi)
+ movq %rax, %r9
+ movq %rdx, %r10
+ # A[0] * A[2]
+ movq (%rsi), %rax
+ mulq 16(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[0] * A[3]
+ movq (%rsi), %rax
+ mulq 24(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r11
+ adcq %rdx, %r12
+ # A[1] * A[2]
+ movq 8(%rsi), %rax
+ mulq 16(%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * A[3]
+ movq 8(%rsi), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ # A[2] * A[3]
+ movq 16(%rsi), %rax
+ mulq 24(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r13
+ adcq %rdx, %r14
+ # Double
+ xorq %r15, %r15
+ addq %r9, %r9
+ adcq %r10, %r10
+ adcq %r11, %r11
+ adcq %r12, %r12
+ adcq %r13, %r13
+ adcq %r14, %r14
+ adcq $0x00, %r15
+ # A[0] * A[0]
+ movq (%rsi), %rax
+ mulq %rax
+ movq %rax, %r8
+ movq %rdx, %rcx
+ # A[1] * A[1]
+ movq 8(%rsi), %rax
+ mulq %rax
+ addq %rcx, %r9
+ adcq %rax, %r10
+ adcq $0x00, %rdx
+ movq %rdx, %rcx
+ # A[2] * A[2]
+ movq 16(%rsi), %rax
+ mulq %rax
+ addq %rcx, %r11
+ adcq %rax, %r12
+ adcq $0x00, %rdx
+ movq %rdx, %rcx
+ # A[3] * A[3]
+ movq 24(%rsi), %rax
+ mulq %rax
+ addq %rax, %r14
+ adcq %rdx, %r15
+ addq %rcx, %r13
+ adcq $0x00, %r14
+ adcq $0x00, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rdi
+ movq 16(%rsp), %rsi
+ movq (%rsp), %rbx
+ # Add
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ addq (%rbx), %r8
+ movq 16(%rsi), %r10
+ adcq 8(%rbx), %r9
+ movq 24(%rsi), %rcx
+ adcq 16(%rbx), %r10
+ movq $-19, %rax
+ adcq 24(%rbx), %rcx
+ movq $0x7fffffffffffffff, %rdx
+ movq %rcx, %r11
+ sarq $63, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %r8
+ sbbq %rcx, %r9
+ sbbq %rcx, %r10
+ sbbq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 16(%rsp), %rdi
+ movq 16(%rsp), %rsi
+ movq (%rsp), %rbx
+ # Sub
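+ # 255-bit subtraction: a borrow out of the top limb leaves rcx as an
+ # all-ones mask, and p = 2^255-19 is added back once so the result is
+ # non-negative again.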
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %r11
+ subq (%rbx), %r8
+ movq $0x00, %rcx
+ sbbq 8(%rbx), %r9
+ movq $-19, %rax
+ sbbq 16(%rbx), %r10
+ movq $0x7fffffffffffffff, %rdx
+ sbbq 24(%rbx), %r11
+ sbbq $0x00, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Add modulus (if underflow)
+ addq %rax, %r8
+ adcq %rcx, %r9
+ adcq %rcx, %r10
+ adcq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq (%rsp), %rdi
+ leaq 48(%rsp), %rsi
+ movq 8(%rsp), %rbx
+ # Sub
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %r11
+ subq (%rbx), %r8
+ movq $0x00, %rcx
+ sbbq 8(%rbx), %r9
+ movq $-19, %rax
+ sbbq 16(%rbx), %r10
+ movq $0x7fffffffffffffff, %rdx
+ sbbq 24(%rbx), %r11
+ sbbq $0x00, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Add modulus (if underflow)
+ addq %rax, %r8
+ adcq %rcx, %r9
+ adcq %rcx, %r10
+ adcq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 24(%rsp), %rdi
+ movq 24(%rsp), %rsi
+ movq 16(%rsp), %rbx
+ # Sub
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %r11
+ subq (%rbx), %r8
+ movq $0x00, %rcx
+ sbbq 8(%rbx), %r9
+ movq $-19, %rax
+ sbbq 16(%rbx), %r10
+ movq $0x7fffffffffffffff, %rdx
+ sbbq 24(%rbx), %r11
+ sbbq $0x00, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Add modulus (if underflow)
+ addq %rax, %r8
+ adcq %rcx, %r9
+ adcq %rcx, %r10
+ adcq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ addq $0x50, %rsp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
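+ # Two-byte "rep ret", presumably to avoid the branch-predictor penalty some
+ # AMD CPUs take on a bare ret that is a jump target.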
+ repz retq
+#ifndef __APPLE__
+.size fe_ge_dbl_x64,.-fe_ge_dbl_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_ge_madd_x64
+.type fe_ge_madd_x64,@function
+.align 4
+fe_ge_madd_x64:
+#else
+.section __TEXT,__text
+.globl _fe_ge_madd_x64
+.p2align 2
+_fe_ge_madd_x64:
+#endif /* __APPLE__ */
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $0x50, %rsp
+ movq %rdi, (%rsp)
+ movq %rsi, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rcx, 24(%rsp)
+ movq %r8, 32(%rsp)
+ movq %r9, 40(%rsp)
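+ # The six register arguments are spilled to (%rsp)..40(%rsp) so rdi, rsi
+ # and rbx can be reused as operand pointers; with five pushes plus the
+ # 0x50-byte frame, the caller's stack arguments begin at 128(%rsp).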
+ movq (%rsp), %rdi
+ movq 40(%rsp), %rsi
+ movq 32(%rsp), %rbx
+ # Add
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ addq (%rbx), %r8
+ movq 16(%rsi), %r10
+ adcq 8(%rbx), %r9
+ movq 24(%rsi), %rcx
+ adcq 16(%rbx), %r10
+ movq $-19, %rax
+ adcq 24(%rbx), %rcx
+ movq $0x7fffffffffffffff, %rdx
+ movq %rcx, %r11
+ sarq $63, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %r8
+ sbbq %rcx, %r9
+ sbbq %rcx, %r10
+ sbbq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rdi
+ movq 40(%rsp), %rsi
+ movq 32(%rsp), %rbx
+ # Sub
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %r11
+ subq (%rbx), %r8
+ movq $0x00, %rcx
+ sbbq 8(%rbx), %r9
+ movq $-19, %rax
+ sbbq 16(%rbx), %r10
+ movq $0x7fffffffffffffff, %rdx
+ sbbq 24(%rbx), %r11
+ sbbq $0x00, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Add modulus (if underflow)
+ addq %rax, %r8
+ adcq %rcx, %r9
+ adcq %rcx, %r10
+ adcq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 16(%rsp), %rdi
+ movq (%rsp), %rsi
+ movq 152(%rsp), %rbx
+ # Multiply
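+ # 4x4-limb schoolbook multiply of the values at (%rsi) and (%rbx); the
+ # 512-bit product accumulates in r8-r15 and is reduced mod 2^255-19 exactly
+ # as after the squarings above.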
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rdi
+ movq 8(%rsp), %rsi
+ movq 160(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 24(%rsp), %rdi
+ movq 144(%rsp), %rsi
+ movq 136(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ leaq 48(%rsp), %rdi
+ movq 128(%rsp), %rsi
+ movq 128(%rsp), %rbx
+ # Add
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ addq (%rbx), %r8
+ movq 16(%rsi), %r10
+ adcq 8(%rbx), %r9
+ movq 24(%rsi), %rcx
+ adcq 16(%rbx), %r10
+ movq $-19, %rax
+ adcq 24(%rbx), %rcx
+ movq $0x7fffffffffffffff, %rdx
+ movq %rcx, %r11
+ sarq $63, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %r8
+ sbbq %rcx, %r9
+ sbbq %rcx, %r10
+ sbbq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq (%rsp), %rdi
+ movq 16(%rsp), %rsi
+ movq 8(%rsp), %rbx
+ # Sub
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %r11
+ subq (%rbx), %r8
+ movq $0x00, %rcx
+ sbbq 8(%rbx), %r9
+ movq $-19, %rax
+ sbbq 16(%rbx), %r10
+ movq $0x7fffffffffffffff, %rdx
+ sbbq 24(%rbx), %r11
+ sbbq $0x00, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Add modulus (if underflow)
+ addq %rax, %r8
+ adcq %rcx, %r9
+ adcq %rcx, %r10
+ adcq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rdi
+ movq 16(%rsp), %rsi
+ movq 8(%rsp), %rbx
+ # Add
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ addq (%rbx), %r8
+ movq 16(%rsi), %r10
+ adcq 8(%rbx), %r9
+ movq 24(%rsi), %rcx
+ adcq 16(%rbx), %r10
+ movq $-19, %rax
+ adcq 24(%rbx), %rcx
+ movq $0x7fffffffffffffff, %rdx
+ movq %rcx, %r11
+ sarq $63, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %r8
+ sbbq %rcx, %r9
+ sbbq %rcx, %r10
+ sbbq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 16(%rsp), %rdi
+ leaq 48(%rsp), %rsi
+ movq 24(%rsp), %rbx
+ # Add
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ addq (%rbx), %r8
+ movq 16(%rsi), %r10
+ adcq 8(%rbx), %r9
+ movq 24(%rsi), %rcx
+ adcq 16(%rbx), %r10
+ movq $-19, %rax
+ adcq 24(%rbx), %rcx
+ movq $0x7fffffffffffffff, %rdx
+ movq %rcx, %r11
+ sarq $63, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %r8
+ sbbq %rcx, %r9
+ sbbq %rcx, %r10
+ sbbq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 24(%rsp), %rdi
+ leaq 48(%rsp), %rsi
+ movq 24(%rsp), %rbx
+ # Sub
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %r11
+ subq (%rbx), %r8
+ movq $0x00, %rcx
+ sbbq 8(%rbx), %r9
+ movq $-19, %rax
+ sbbq 16(%rbx), %r10
+ movq $0x7fffffffffffffff, %rdx
+ sbbq 24(%rbx), %r11
+ sbbq $0x00, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Add modulus (if underflow)
+ addq %rax, %r8
+ adcq %rcx, %r9
+ adcq %rcx, %r10
+ adcq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ addq $0x50, %rsp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size fe_ge_madd_x64,.-fe_ge_madd_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_ge_msub_x64
+.type fe_ge_msub_x64,@function
+.align 4
+fe_ge_msub_x64:
+#else
+.section __TEXT,__text
+.globl _fe_ge_msub_x64
+.p2align 2
+_fe_ge_msub_x64:
+#endif /* __APPLE__ */
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $0x50, %rsp
+ movq %rdi, (%rsp)
+ movq %rsi, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rcx, 24(%rsp)
+ movq %r8, 32(%rsp)
+ movq %r9, 40(%rsp)
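+ # Same layout and sequence as fe_ge_madd_x64, except the multipliers taken
+ # from 152(%rsp) and 160(%rsp) swap roles and the final Add/Sub results
+ # written to 16(%rsp) and 24(%rsp) are exchanged.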
+ movq (%rsp), %rdi
+ movq 40(%rsp), %rsi
+ movq 32(%rsp), %rbx
+ # Add
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ addq (%rbx), %r8
+ movq 16(%rsi), %r10
+ adcq 8(%rbx), %r9
+ movq 24(%rsi), %rcx
+ adcq 16(%rbx), %r10
+ movq $-19, %rax
+ adcq 24(%rbx), %rcx
+ movq $0x7fffffffffffffff, %rdx
+ movq %rcx, %r11
+ sarq $63, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %r8
+ sbbq %rcx, %r9
+ sbbq %rcx, %r10
+ sbbq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rdi
+ movq 40(%rsp), %rsi
+ movq 32(%rsp), %rbx
+ # Sub
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %r11
+ subq (%rbx), %r8
+ movq $0x00, %rcx
+ sbbq 8(%rbx), %r9
+ movq $-19, %rax
+ sbbq 16(%rbx), %r10
+ movq $0x7fffffffffffffff, %rdx
+ sbbq 24(%rbx), %r11
+ sbbq $0x00, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Add modulus (if underflow)
+ addq %rax, %r8
+ adcq %rcx, %r9
+ adcq %rcx, %r10
+ adcq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 16(%rsp), %rdi
+ movq (%rsp), %rsi
+ movq 160(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rdi
+ movq 8(%rsp), %rsi
+ movq 152(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 24(%rsp), %rdi
+ movq 144(%rsp), %rsi
+ movq 136(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ leaq 48(%rsp), %rdi
+ movq 128(%rsp), %rsi
+ movq 128(%rsp), %rbx
+ # Add
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ addq (%rbx), %r8
+ movq 16(%rsi), %r10
+ adcq 8(%rbx), %r9
+ movq 24(%rsi), %rcx
+ adcq 16(%rbx), %r10
+ movq $-19, %rax
+ adcq 24(%rbx), %rcx
+ movq $0x7fffffffffffffff, %rdx
+ movq %rcx, %r11
+ sarq $63, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %r8
+ sbbq %rcx, %r9
+ sbbq %rcx, %r10
+ sbbq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq (%rsp), %rdi
+ movq 16(%rsp), %rsi
+ movq 8(%rsp), %rbx
+ # Sub
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %r11
+ subq (%rbx), %r8
+ movq $0x00, %rcx
+ sbbq 8(%rbx), %r9
+ movq $-19, %rax
+ sbbq 16(%rbx), %r10
+ movq $0x7fffffffffffffff, %rdx
+ sbbq 24(%rbx), %r11
+ sbbq $0x00, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Add modulus (if underflow)
+ addq %rax, %r8
+ adcq %rcx, %r9
+ adcq %rcx, %r10
+ adcq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rdi
+ movq 16(%rsp), %rsi
+ movq 8(%rsp), %rbx
+ # Add
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ addq (%rbx), %r8
+ movq 16(%rsi), %r10
+ adcq 8(%rbx), %r9
+ movq 24(%rsi), %rcx
+ adcq 16(%rbx), %r10
+ movq $-19, %rax
+ adcq 24(%rbx), %rcx
+ movq $0x7fffffffffffffff, %rdx
+ movq %rcx, %r11
+ sarq $63, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %r8
+ sbbq %rcx, %r9
+ sbbq %rcx, %r10
+ sbbq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 16(%rsp), %rdi
+ leaq 48(%rsp), %rsi
+ movq 24(%rsp), %rbx
+ # Sub
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %r11
+ subq (%rbx), %r8
+ movq $0x00, %rcx
+ sbbq 8(%rbx), %r9
+ movq $-19, %rax
+ sbbq 16(%rbx), %r10
+ movq $0x7fffffffffffffff, %rdx
+ sbbq 24(%rbx), %r11
+ sbbq $0x00, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Add modulus (if underflow)
+ addq %rax, %r8
+ adcq %rcx, %r9
+ adcq %rcx, %r10
+ adcq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 24(%rsp), %rdi
+ leaq 48(%rsp), %rsi
+ movq 24(%rsp), %rbx
+ # Add
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ addq (%rbx), %r8
+ movq 16(%rsi), %r10
+ adcq 8(%rbx), %r9
+ movq 24(%rsi), %rcx
+ adcq 16(%rbx), %r10
+ movq $-19, %rax
+ adcq 24(%rbx), %rcx
+ movq $0x7fffffffffffffff, %rdx
+ movq %rcx, %r11
+ sarq $63, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %r8
+ sbbq %rcx, %r9
+ sbbq %rcx, %r10
+ sbbq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ addq $0x50, %rsp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size fe_ge_msub_x64,.-fe_ge_msub_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_ge_add_x64
+.type fe_ge_add_x64,@function
+.align 4
+fe_ge_add_x64:
+#else
+.section __TEXT,__text
+.globl _fe_ge_add_x64
+.p2align 2
+_fe_ge_add_x64:
+#endif /* __APPLE__ */
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $0x50, %rsp
+ movq %rdi, (%rsp)
+ movq %rsi, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rcx, 24(%rsp)
+ movq %r8, 32(%rsp)
+ movq %r9, 40(%rsp)
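+ # Follows fe_ge_madd_x64 but uses one more stack operand (up to 168(%rsp))
+ # and replaces the plain doubling of 128(%rsp) with a Multiply of 128(%rsp)
+ # by 144(%rsp) whose result is then added to itself.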
+ movq (%rsp), %rdi
+ movq 40(%rsp), %rsi
+ movq 32(%rsp), %rbx
+ # Add
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ addq (%rbx), %r8
+ movq 16(%rsi), %r10
+ adcq 8(%rbx), %r9
+ movq 24(%rsi), %rcx
+ adcq 16(%rbx), %r10
+ movq $-19, %rax
+ adcq 24(%rbx), %rcx
+ movq $0x7fffffffffffffff, %rdx
+ movq %rcx, %r11
+ sarq $63, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %r8
+ sbbq %rcx, %r9
+ sbbq %rcx, %r10
+ sbbq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rdi
+ movq 40(%rsp), %rsi
+ movq 32(%rsp), %rbx
+ # Sub
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %r11
+ subq (%rbx), %r8
+ movq $0x00, %rcx
+ sbbq 8(%rbx), %r9
+ movq $-19, %rax
+ sbbq 16(%rbx), %r10
+ movq $0x7fffffffffffffff, %rdx
+ sbbq 24(%rbx), %r11
+ sbbq $0x00, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Add modulus (if underflow)
+ addq %rax, %r8
+ adcq %rcx, %r9
+ adcq %rcx, %r10
+ adcq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 16(%rsp), %rdi
+ movq (%rsp), %rsi
+ movq 160(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rdi
+ movq 8(%rsp), %rsi
+ movq 168(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 24(%rsp), %rdi
+ movq 152(%rsp), %rsi
+ movq 136(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq (%rsp), %rdi
+ movq 128(%rsp), %rsi
+ movq 144(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ leaq 48(%rsp), %rdi
+ movq (%rsp), %rsi
+ movq (%rsp), %rbx
+ # Add
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ addq (%rbx), %r8
+ movq 16(%rsi), %r10
+ adcq 8(%rbx), %r9
+ movq 24(%rsi), %rcx
+ adcq 16(%rbx), %r10
+ movq $-19, %rax
+ adcq 24(%rbx), %rcx
+ movq $0x7fffffffffffffff, %rdx
+ movq %rcx, %r11
+ sarq $63, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %r8
+ sbbq %rcx, %r9
+ sbbq %rcx, %r10
+ sbbq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq (%rsp), %rdi
+ movq 16(%rsp), %rsi
+ movq 8(%rsp), %rbx
+ # Sub
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %r11
+ subq (%rbx), %r8
+ movq $0x00, %rcx
+ sbbq 8(%rbx), %r9
+ movq $-19, %rax
+ sbbq 16(%rbx), %r10
+ movq $0x7fffffffffffffff, %rdx
+ sbbq 24(%rbx), %r11
+ sbbq $0x00, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Add modulus (if underflow)
+ addq %rax, %r8
+ adcq %rcx, %r9
+ adcq %rcx, %r10
+ adcq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rdi
+ movq 16(%rsp), %rsi
+ movq 8(%rsp), %rbx
+ # Add
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ addq (%rbx), %r8
+ movq 16(%rsi), %r10
+ adcq 8(%rbx), %r9
+ movq 24(%rsi), %rcx
+ adcq 16(%rbx), %r10
+ movq $-19, %rax
+ adcq 24(%rbx), %rcx
+ movq $0x7fffffffffffffff, %rdx
+ movq %rcx, %r11
+ sarq $63, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %r8
+ sbbq %rcx, %r9
+ sbbq %rcx, %r10
+ sbbq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 16(%rsp), %rdi
+ leaq 48(%rsp), %rsi
+ movq 24(%rsp), %rbx
+ # Add
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ addq (%rbx), %r8
+ movq 16(%rsi), %r10
+ adcq 8(%rbx), %r9
+ movq 24(%rsi), %rcx
+ adcq 16(%rbx), %r10
+ movq $-19, %rax
+ adcq 24(%rbx), %rcx
+ movq $0x7fffffffffffffff, %rdx
+ movq %rcx, %r11
+ sarq $63, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %r8
+ sbbq %rcx, %r9
+ sbbq %rcx, %r10
+ sbbq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 24(%rsp), %rdi
+ leaq 48(%rsp), %rsi
+ movq 24(%rsp), %rbx
+ # Sub
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %r11
+ subq (%rbx), %r8
+ movq $0x00, %rcx
+ sbbq 8(%rbx), %r9
+ movq $-19, %rax
+ sbbq 16(%rbx), %r10
+ movq $0x7fffffffffffffff, %rdx
+ sbbq 24(%rbx), %r11
+ sbbq $0x00, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Add modulus (if underflow)
+ addq %rax, %r8
+ adcq %rcx, %r9
+ adcq %rcx, %r10
+ adcq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ addq $0x50, %rsp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size fe_ge_add_x64,.-fe_ge_add_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_ge_sub_x64
+.type fe_ge_sub_x64,@function
+.align 4
+fe_ge_sub_x64:
+#else
+.section __TEXT,__text
+.globl _fe_ge_sub_x64
+.p2align 2
+_fe_ge_sub_x64:
+#endif /* __APPLE__ */
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $0x50, %rsp
+ movq %rdi, (%rsp)
+ movq %rsi, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rcx, 24(%rsp)
+ movq %r8, 32(%rsp)
+ movq %r9, 40(%rsp)
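+ # Mirrors fe_ge_add_x64 with the multipliers taken from 160(%rsp) and
+ # 168(%rsp) exchanged; the remaining steps follow the same pattern.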
+ movq (%rsp), %rdi
+ movq 40(%rsp), %rsi
+ movq 32(%rsp), %rbx
+ # Add
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ addq (%rbx), %r8
+ movq 16(%rsi), %r10
+ adcq 8(%rbx), %r9
+ movq 24(%rsi), %rcx
+ adcq 16(%rbx), %r10
+ movq $-19, %rax
+ adcq 24(%rbx), %rcx
+ movq $0x7fffffffffffffff, %rdx
+ movq %rcx, %r11
+ sarq $63, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %r8
+ sbbq %rcx, %r9
+ sbbq %rcx, %r10
+ sbbq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rdi
+ movq 40(%rsp), %rsi
+ movq 32(%rsp), %rbx
+ # Sub
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %r11
+ subq (%rbx), %r8
+ movq $0x00, %rcx
+ sbbq 8(%rbx), %r9
+ movq $-19, %rax
+ sbbq 16(%rbx), %r10
+ movq $0x7fffffffffffffff, %rdx
+ sbbq 24(%rbx), %r11
+ sbbq $0x00, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Add modulus (if underflow)
+ addq %rax, %r8
+ adcq %rcx, %r9
+ adcq %rcx, %r10
+ adcq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 16(%rsp), %rdi
+ movq (%rsp), %rsi
+ movq 168(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add remaining product results in
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rdi
+ movq 8(%rsp), %rsi
+ movq 160(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add the remaining partial products and carries into the result
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 24(%rsp), %rdi
+ movq 152(%rsp), %rsi
+ movq 136(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add the remaining partial products and carries into the result
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq (%rsp), %rdi
+ movq 128(%rsp), %rsi
+ movq 144(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rax
+ mulq (%rsi)
+ movq %rax, %r8
+ movq %rdx, %r9
+ # A[0] * B[1]
+ movq 8(%rbx), %rax
+ mulq (%rsi)
+ xorq %r10, %r10
+ addq %rax, %r9
+ adcq %rdx, %r10
+ # A[1] * B[0]
+ movq (%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r11, %r11
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq $0x00, %r11
+ # A[0] * B[2]
+ movq 16(%rbx), %rax
+ mulq (%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ # A[1] * B[1]
+ movq 8(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r12, %r12
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[2] * B[0]
+ movq (%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r10
+ adcq %rdx, %r11
+ adcq $0x00, %r12
+ # A[0] * B[3]
+ movq 24(%rbx), %rax
+ mulq (%rsi)
+ xorq %r13, %r13
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[2]
+ movq 16(%rbx), %rax
+ mulq 8(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[2] * B[1]
+ movq 8(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[3] * B[0]
+ movq (%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq $0x00, %r13
+ # A[1] * B[3]
+ movq 24(%rbx), %rax
+ mulq 8(%rsi)
+ xorq %r14, %r14
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[2]
+ movq 16(%rbx), %rax
+ mulq 16(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[3] * B[1]
+ movq 8(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r12
+ adcq %rdx, %r13
+ adcq $0x00, %r14
+ # A[2] * B[3]
+ movq 24(%rbx), %rax
+ mulq 16(%rsi)
+ xorq %r15, %r15
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[2]
+ movq 16(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq $0x00, %r15
+ # A[3] * B[3]
+ movq 24(%rbx), %rax
+ mulq 24(%rsi)
+ addq %rax, %r14
+ adcq %rdx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rax
+ mulq %r12
+ xorq %r12, %r12
+ addq %rax, %r8
+ movq $19, %rax
+ adcq %rdx, %r12
+ mulq %r13
+ xorq %r13, %r13
+ addq %rax, %r9
+ movq $19, %rax
+ adcq %rdx, %r13
+ mulq %r14
+ xorq %r14, %r14
+ addq %rax, %r10
+ movq $19, %rax
+ adcq %rdx, %r14
+ mulq %r15
+ # Add the remaining partial products and carries into the result
+ addq %r12, %r9
+ adcq %r13, %r10
+ adcq %r14, %r11
+ adcq %rax, %r11
+ adcq $0x00, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ leaq 48(%rsp), %rdi
+ movq (%rsp), %rsi
+ movq (%rsp), %rbx
+ # Add
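+ # Field addition: add the limbs, then subtract p = 2^255 - 19 once if
+ # the sum reached bit 255.  sarq $63 of the top limb gives a 0/all-ones
+ # mask; the masked values -19 and 2^63 - 1 together with the mask itself
+ # form the limbs of p for the subtraction.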
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ addq (%rbx), %r8
+ movq 16(%rsi), %r10
+ adcq 8(%rbx), %r9
+ movq 24(%rsi), %rcx
+ adcq 16(%rbx), %r10
+ movq $-19, %rax
+ adcq 24(%rbx), %rcx
+ movq $0x7fffffffffffffff, %rdx
+ movq %rcx, %r11
+ sarq $63, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %r8
+ sbbq %rcx, %r9
+ sbbq %rcx, %r10
+ sbbq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq (%rsp), %rdi
+ movq 16(%rsp), %rsi
+ movq 8(%rsp), %rbx
+ # Sub
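+ # Field subtraction: subtract the limbs; the final borrow leaves rcx at
+ # 0 or -1, which masks the limbs of p so the modulus is added back
+ # exactly when the difference went negative.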
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %r11
+ subq (%rbx), %r8
+ movq $0x00, %rcx
+ sbbq 8(%rbx), %r9
+ movq $-19, %rax
+ sbbq 16(%rbx), %r10
+ movq $0x7fffffffffffffff, %rdx
+ sbbq 24(%rbx), %r11
+ sbbq $0x00, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Add modulus (if underflow)
+ addq %rax, %r8
+ adcq %rcx, %r9
+ adcq %rcx, %r10
+ adcq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rdi
+ movq 16(%rsp), %rsi
+ movq 8(%rsp), %rbx
+ # Add
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ addq (%rbx), %r8
+ movq 16(%rsi), %r10
+ adcq 8(%rbx), %r9
+ movq 24(%rsi), %rcx
+ adcq 16(%rbx), %r10
+ movq $-19, %rax
+ adcq 24(%rbx), %rcx
+ movq $0x7fffffffffffffff, %rdx
+ movq %rcx, %r11
+ sarq $63, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %r8
+ sbbq %rcx, %r9
+ sbbq %rcx, %r10
+ sbbq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 16(%rsp), %rdi
+ leaq 48(%rsp), %rsi
+ movq 24(%rsp), %rbx
+ # Sub
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %r11
+ subq (%rbx), %r8
+ movq $0x00, %rcx
+ sbbq 8(%rbx), %r9
+ movq $-19, %rax
+ sbbq 16(%rbx), %r10
+ movq $0x7fffffffffffffff, %rdx
+ sbbq 24(%rbx), %r11
+ sbbq $0x00, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Add modulus (if underflow)
+ addq %rax, %r8
+ adcq %rcx, %r9
+ adcq %rcx, %r10
+ adcq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 24(%rsp), %rdi
+ leaq 48(%rsp), %rsi
+ movq 24(%rsp), %rbx
+ # Add
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ addq (%rbx), %r8
+ movq 16(%rsi), %r10
+ adcq 8(%rbx), %r9
+ movq 24(%rsi), %rcx
+ adcq 16(%rbx), %r10
+ movq $-19, %rax
+ adcq 24(%rbx), %rcx
+ movq $0x7fffffffffffffff, %rdx
+ movq %rcx, %r11
+ sarq $63, %rcx
+ # Mask the modulus
+ andq %rcx, %rax
+ andq %rcx, %rdx
+ # Sub modulus (if overflow)
+ subq %rax, %r8
+ sbbq %rcx, %r9
+ sbbq %rcx, %r10
+ sbbq %rdx, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ addq $0x50, %rsp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size fe_ge_sub_x64,.-fe_ge_sub_x64
+#endif /* __APPLE__ */
+#ifdef HAVE_INTEL_AVX2
+#ifndef __APPLE__
+.text
+.globl fe_mul_avx2
+.type fe_mul_avx2,@function
+.align 4
+fe_mul_avx2:
+#else
+.section __TEXT,__text
+.globl _fe_mul_avx2
+.p2align 2
+_fe_mul_avx2:
+#endif /* __APPLE__ */
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbx
+ movq %rdx, %rbx
+ # Multiply
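+ # BMI2/ADX variant: mulx leaves the flags untouched, so two independent
+ # carry chains (CF via adcx, OF via adox) accumulate the partial
+ # products in parallel.  No AVX2 vector instructions are used despite
+ # the name.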
+ # A[0] * B[0]
+ movq (%rbx), %rdx
+ mulxq (%rsi), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rsi), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rsi), %rax, %rcx
+ xorq %r15, %r15
+ adcxq %rax, %r9
+ # A[1] * B[3]
+ movq 24(%rbx), %rdx
+ mulxq 8(%rsi), %r12, %r13
+ adcxq %rcx, %r10
+ # A[0] * B[1]
+ movq 8(%rbx), %rdx
+ mulxq (%rsi), %rax, %rcx
+ adoxq %rax, %r9
+ # A[2] * B[1]
+ mulxq 16(%rsi), %rax, %r14
+ adoxq %rcx, %r10
+ adcxq %rax, %r11
+ # A[1] * B[2]
+ movq 16(%rbx), %rdx
+ mulxq 8(%rsi), %rax, %rcx
+ adcxq %r14, %r12
+ adoxq %rax, %r11
+ adcxq %r15, %r13
+ adoxq %rcx, %r12
+ # A[0] * B[2]
+ mulxq (%rsi), %rax, %rcx
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rax, %r10
+ # A[1] * B[1]
+ movq 8(%rbx), %rdx
+ mulxq 8(%rsi), %rdx, %rax
+ adcxq %rcx, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rbx), %rdx
+ adoxq %rax, %r11
+ mulxq 24(%rsi), %rax, %rcx
+ adcxq %rax, %r12
+ # A[2] * B[2]
+ movq 16(%rbx), %rdx
+ mulxq 16(%rsi), %rdx, %rax
+ adcxq %rcx, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rbx), %rdx
+ adoxq %rax, %r13
+ mulxq 24(%rsi), %rax, %rcx
+ adoxq %r15, %r14
+ adcxq %rax, %r14
+ # A[0] * B[3]
+ mulxq (%rsi), %rdx, %rax
+ adcxq %rcx, %r15
+ xorq %rcx, %rcx
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rbx), %rdx
+ adcxq %rax, %r12
+ mulxq 24(%rsi), %rdx, %rax
+ adoxq %rdx, %r11
+ adoxq %rax, %r12
+ # A[2] * B[3]
+ movq 24(%rbx), %rdx
+ mulxq 16(%rsi), %rdx, %rax
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rbx), %rdx
+ adcxq %rax, %r14
+ mulxq 24(%rsi), %rax, %rdx
+ adcxq %rcx, %r15
+ adoxq %rax, %r13
+ adoxq %rdx, %r14
+ adoxq %rcx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rcx, %rcx
+ mulxq %r12, %rax, %r12
+ adcxq %rax, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rax, %r13
+ adcxq %rax, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rax, %r14
+ adcxq %rax, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rcx, %rdx
+ adcxq %rcx, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rcx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ popq %rbx
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size fe_mul_avx2,.-fe_mul_avx2
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_sq_avx2
+.type fe_sq_avx2,@function
+.align 4
+fe_sq_avx2:
+#else
+.section __TEXT,__text
+.globl _fe_sq_avx2
+.p2align 2
+_fe_sq_avx2:
+#endif /* __APPLE__ */
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ # Square
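+ # Squaring: each cross product A[i]*A[j] (i < j) is computed once, the
+ # partial sum is doubled via the carry chain below, and the diagonal
+ # squares A[i]^2 are added in, using roughly half the multiplies of
+ # fe_mul_avx2.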
+ # A[0] * A[1]
+ movq (%rsi), %rdx
+ mulxq 8(%rsi), %r9, %r10
+ # A[0] * A[3]
+ mulxq 24(%rsi), %r11, %r12
+ # A[2] * A[1]
+ movq 16(%rsi), %rdx
+ mulxq 8(%rsi), %rcx, %rbx
+ xorq %r15, %r15
+ adoxq %rcx, %r11
+ # A[2] * A[3]
+ mulxq 24(%rsi), %r13, %r14
+ adoxq %rbx, %r12
+ # A[2] * A[0]
+ mulxq (%rsi), %rcx, %rbx
+ adoxq %r15, %r13
+ adcxq %rcx, %r10
+ adoxq %r15, %r14
+ # A[1] * A[3]
+ movq 8(%rsi), %rdx
+ mulxq 24(%rsi), %rax, %r8
+ adcxq %rbx, %r11
+ adcxq %rax, %r12
+ adcxq %r8, %r13
+ adcxq %r15, %r14
+ # Double with Carry Flag
+ xorq %r15, %r15
+ # A[0] * A[0]
+ movq (%rsi), %rdx
+ mulxq %rdx, %r8, %rax
+ adcxq %r9, %r9
+ # A[1] * A[1]
+ movq 8(%rsi), %rdx
+ mulxq %rdx, %rcx, %rbx
+ adcxq %r10, %r10
+ adoxq %rax, %r9
+ adcxq %r11, %r11
+ adoxq %rcx, %r10
+ # A[2] * A[2]
+ movq 16(%rsi), %rdx
+ mulxq %rdx, %rax, %rcx
+ adcxq %r12, %r12
+ adoxq %rbx, %r11
+ adcxq %r13, %r13
+ adoxq %rax, %r12
+ # A[3] * A[3]
+ movq 24(%rsi), %rdx
+ mulxq %rdx, %rax, %rbx
+ adcxq %r14, %r14
+ adoxq %rcx, %r13
+ adcxq %r15, %r15
+ adoxq %rax, %r14
+ adoxq %rbx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rcx, %rcx
+ mulxq %r12, %rax, %r12
+ adcxq %rax, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rax, %r13
+ adcxq %rax, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rax, %r14
+ adcxq %rax, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rcx, %rdx
+ adcxq %rcx, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rcx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size fe_sq_avx2,.-fe_sq_avx2
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_sq_n_avx2
+.type fe_sq_n_avx2,@function
+.align 4
+fe_sq_n_avx2:
+#else
+.section __TEXT,__text
+.globl _fe_sq_n_avx2
+.p2align 2
+_fe_sq_n_avx2:
+#endif /* __APPLE__ */
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbp
+ movq %rdx, %rbp
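+ # Square the input n times (n arrives in rdx, kept in bpl).  Every pass
+ # reads rsi and writes rdi, so callers in this file always pass the same
+ # buffer for both; the count is tested only at the bottom of the loop,
+ # so n must be at least 1.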
+L_fe_sq_n_avx2:
+ # Square
+ # A[0] * A[1]
+ movq (%rsi), %rdx
+ mulxq 8(%rsi), %r9, %r10
+ # A[0] * A[3]
+ mulxq 24(%rsi), %r11, %r12
+ # A[2] * A[1]
+ movq 16(%rsi), %rdx
+ mulxq 8(%rsi), %rcx, %rbx
+ xorq %r15, %r15
+ adoxq %rcx, %r11
+ # A[2] * A[3]
+ mulxq 24(%rsi), %r13, %r14
+ adoxq %rbx, %r12
+ # A[2] * A[0]
+ mulxq (%rsi), %rcx, %rbx
+ adoxq %r15, %r13
+ adcxq %rcx, %r10
+ adoxq %r15, %r14
+ # A[1] * A[3]
+ movq 8(%rsi), %rdx
+ mulxq 24(%rsi), %rax, %r8
+ adcxq %rbx, %r11
+ adcxq %rax, %r12
+ adcxq %r8, %r13
+ adcxq %r15, %r14
+ # Double with Carry Flag
+ xorq %r15, %r15
+ # A[0] * A[0]
+ movq (%rsi), %rdx
+ mulxq %rdx, %r8, %rax
+ adcxq %r9, %r9
+ # A[1] * A[1]
+ movq 8(%rsi), %rdx
+ mulxq %rdx, %rcx, %rbx
+ adcxq %r10, %r10
+ adoxq %rax, %r9
+ adcxq %r11, %r11
+ adoxq %rcx, %r10
+ # A[2] * A[2]
+ movq 16(%rsi), %rdx
+ mulxq %rdx, %rax, %rcx
+ adcxq %r12, %r12
+ adoxq %rbx, %r11
+ adcxq %r13, %r13
+ adoxq %rax, %r12
+ # A[3] * A[3]
+ movq 24(%rsi), %rdx
+ mulxq %rdx, %rax, %rbx
+ adcxq %r14, %r14
+ adoxq %rcx, %r13
+ adcxq %r15, %r15
+ adoxq %rax, %r14
+ adoxq %rbx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rcx, %rcx
+ mulxq %r12, %rax, %r12
+ adcxq %rax, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rax, %r13
+ adcxq %rax, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rax, %r14
+ adcxq %rax, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rcx, %rdx
+ adcxq %rcx, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rcx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ decb %bpl
+ jnz L_fe_sq_n_avx2
+ popq %rbp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size fe_sq_n_avx2,.-fe_sq_n_avx2
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_mul121666_avx2
+.type fe_mul121666_avx2,@function
+.align 4
+fe_mul121666_avx2:
+#else
+.section __TEXT,__text
+.globl _fe_mul121666_avx2
+.p2align 2
+_fe_mul121666_avx2:
+#endif /* __APPLE__ */
+ pushq %r12
+ pushq %r13
+ movq $0x1db42, %rdx
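+ # 0x1db42 = 121666 = (486662 + 2) / 4, the a24 constant of curve25519
+ # used in the Montgomery ladder step.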
+ mulxq (%rsi), %rax, %r13
+ mulxq 8(%rsi), %rcx, %r12
+ mulxq 16(%rsi), %r8, %r11
+ mulxq 24(%rsi), %r9, %r10
+ addq %r13, %rcx
+ adcq %r12, %r8
+ adcq %r11, %r9
+ adcq $0x00, %r10
+ movq $0x7fffffffffffffff, %r13
+ shldq $0x01, %r9, %r10
+ andq %r13, %r9
+ imulq $19, %r10, %r10
+ addq %r10, %rax
+ adcq $0x00, %rcx
+ adcq $0x00, %r8
+ adcq $0x00, %r9
+ movq %rax, (%rdi)
+ movq %rcx, 8(%rdi)
+ movq %r8, 16(%rdi)
+ movq %r9, 24(%rdi)
+ popq %r13
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size fe_mul121666_avx2,.-fe_mul121666_avx2
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_sq2_avx2
+.type fe_sq2_avx2,@function
+.align 4
+fe_sq2_avx2:
+#else
+.section __TEXT,__text
+.globl _fe_sq2_avx2
+.p2align 2
+_fe_sq2_avx2:
+#endif /* __APPLE__ */
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ # Square * 2
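+ # Computes 2*a^2 mod p.  The doubling is folded into the reduction
+ # shifts (2 instead of 1 for the top half); the few bits pushed past
+ # 2^510 are folded in with 19*19 = 0x169, since 2^510 == 19^2 (mod p).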
+ # A[0] * A[1]
+ movq (%rsi), %rdx
+ mulxq 8(%rsi), %r9, %r10
+ # A[0] * A[3]
+ mulxq 24(%rsi), %r11, %r12
+ # A[2] * A[1]
+ movq 16(%rsi), %rdx
+ mulxq 8(%rsi), %rcx, %rbx
+ xorq %r15, %r15
+ adoxq %rcx, %r11
+ # A[2] * A[3]
+ mulxq 24(%rsi), %r13, %r14
+ adoxq %rbx, %r12
+ # A[2] * A[0]
+ mulxq (%rsi), %rcx, %rbx
+ adoxq %r15, %r13
+ adcxq %rcx, %r10
+ adoxq %r15, %r14
+ # A[1] * A[3]
+ movq 8(%rsi), %rdx
+ mulxq 24(%rsi), %rax, %r8
+ adcxq %rbx, %r11
+ adcxq %rax, %r12
+ adcxq %r8, %r13
+ adcxq %r15, %r14
+ # Double with Carry Flag
+ xorq %r15, %r15
+ # A[0] * A[0]
+ movq (%rsi), %rdx
+ mulxq %rdx, %r8, %rax
+ adcxq %r9, %r9
+ # A[1] * A[1]
+ movq 8(%rsi), %rdx
+ mulxq %rdx, %rcx, %rbx
+ adcxq %r10, %r10
+ adoxq %rax, %r9
+ adcxq %r11, %r11
+ adoxq %rcx, %r10
+ # A[2] * A[2]
+ movq 16(%rsi), %rdx
+ mulxq %rdx, %rax, %rcx
+ adcxq %r12, %r12
+ adoxq %rbx, %r11
+ adcxq %r13, %r13
+ adoxq %rax, %r12
+ # A[3] * A[3]
+ movq 24(%rsi), %rdx
+ mulxq %rdx, %rax, %rbx
+ adcxq %r14, %r14
+ adoxq %rcx, %r13
+ adcxq %r15, %r15
+ adoxq %rax, %r14
+ adoxq %rbx, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rbx
+ xorq %rax, %rax
+ # Move top half into t4-t7, remove top bit from t3, and double
+ shldq $3, %r15, %rax
+ shldq $2, %r14, %r15
+ shldq $2, %r13, %r14
+ shldq $2, %r12, %r13
+ shldq $2, %r11, %r12
+ shldq $0x01, %r10, %r11
+ shldq $0x01, %r9, %r10
+ shldq $0x01, %r8, %r9
+ shlq $0x01, %r8
+ andq %rbx, %r11
+ # Two out left, one in right
+ andq %rbx, %r15
+ # Multiply top bits by 19*19
+ imulq $0x169, %rax, %rcx
+ xorq %rbx, %rbx
+ # Multiply top half by 19
+ movq $19, %rdx
+ adoxq %rcx, %r8
+ mulxq %r12, %rax, %r12
+ adcxq %rax, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rax, %r13
+ adcxq %rax, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rax, %r14
+ adcxq %rax, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rbx, %rdx
+ adcxq %rbx, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rbx
+ imulq $19, %rdx, %rax
+ andq %rbx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rbx, %r11
+ addq %rax, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size fe_sq2_avx2,.-fe_sq2_avx2
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_invert_avx2
+.type fe_invert_avx2,@function
+.align 4
+fe_invert_avx2:
+#else
+.section __TEXT,__text
+.globl _fe_invert_avx2
+.p2align 2
+_fe_invert_avx2:
+#endif /* __APPLE__ */
+ subq $0x90, %rsp
+ # Invert
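+ # Fermat inversion: out = in^(p-2) = in^(2^255 - 21), computed with a
+ # fixed chain of squarings and multiplications so the timing does not
+ # depend on the value.  Temporaries live at 0/32/64/96(%rsp); the out
+ # and in pointers are saved at 128/136(%rsp).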
+ movq %rdi, 128(%rsp)
+ movq %rsi, 136(%rsp)
+ movq %rsp, %rdi
+ movq 136(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ movq %rsp, %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ movq 136(%rsp), %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ movq %rsp, %rdi
+ movq %rsp, %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ movq %rsp, %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ leaq 64(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ movq $4, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ movq $9, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ movq $19, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ leaq 64(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ movq $9, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ movq $49, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ movq $0x63, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ leaq 64(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ movq $49, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ movq $4, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ movq 128(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ movq %rsp, %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ movq 136(%rsp), %rsi
+ movq 128(%rsp), %rdi
+ addq $0x90, %rsp
+ repz retq
+#ifndef __APPLE__
+.size fe_invert_avx2,.-fe_invert_avx2
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl curve25519_avx2
+.type curve25519_avx2,@function
+.align 4
+curve25519_avx2:
+#else
+.section __TEXT,__text
+.globl _curve25519_avx2
+.p2align 2
+_curve25519_avx2:
+#endif /* __APPLE__ */
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbp
+ movq %rdx, %r8
+ subq $0xc0, %rsp
+ movq $0x00, 184(%rsp)
+ movq %rdi, 176(%rsp)
+ # Set one
+ movq $0x01, (%rdi)
+ movq $0x00, 8(%rdi)
+ movq $0x00, 16(%rdi)
+ movq $0x00, 24(%rdi)
+ # Set zero
+ movq $0x00, (%rsp)
+ movq $0x00, 8(%rsp)
+ movq $0x00, 16(%rsp)
+ movq $0x00, 24(%rsp)
+ # Set one
+ movq $0x01, 32(%rsp)
+ movq $0x00, 40(%rsp)
+ movq $0x00, 48(%rsp)
+ movq $0x00, 56(%rsp)
+ # Copy
+ movq (%r8), %r9
+ movq 8(%r8), %r10
+ movq 16(%r8), %r11
+ movq 24(%r8), %r12
+ movq %r9, 64(%rsp)
+ movq %r10, 72(%rsp)
+ movq %r11, 80(%rsp)
+ movq %r12, 88(%rsp)
+ movb $62, 168(%rsp)
+ movq $3, 160(%rsp)
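+ # Montgomery ladder state: (x2:z2) = (1:0) in (%rdi)/(%rsp) and
+ # (x3:z3) = (u:1) at 64(%rsp)/32(%rsp).  Scalar bits are consumed from
+ # bit 254 down (word index 160(%rsp) = 3, bit index 168(%rsp) = 62);
+ # 184(%rsp) remembers the previous bit so a swap happens only when the
+ # bit value changes.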
+L_curve25519_avx2_words:
+L_curve25519_avx2_bits:
+ movq 184(%rsp), %rbx
+ movq 160(%rsp), %r9
+ movb 168(%rsp), %cl
+ movq (%rsi,%r9,8), %rax
+ shrq %cl, %rax
+ andq $0x01, %rax
+ xorq %rax, %rbx
+ negq %rbx
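+ # rax holds the current scalar bit; it is XORed with the previous bit
+ # so the swap below happens only on a bit change, then negated into a
+ # 0/all-ones mask for the branch-free conditional swaps.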
+ # Conditional Swap
+ movq (%rdi), %r9
+ movq 8(%rdi), %r10
+ movq 16(%rdi), %r11
+ movq 24(%rdi), %r12
+ xorq 64(%rsp), %r9
+ xorq 72(%rsp), %r10
+ xorq 80(%rsp), %r11
+ xorq 88(%rsp), %r12
+ andq %rbx, %r9
+ andq %rbx, %r10
+ andq %rbx, %r11
+ andq %rbx, %r12
+ xorq %r9, (%rdi)
+ xorq %r10, 8(%rdi)
+ xorq %r11, 16(%rdi)
+ xorq %r12, 24(%rdi)
+ xorq %r9, 64(%rsp)
+ xorq %r10, 72(%rsp)
+ xorq %r11, 80(%rsp)
+ xorq %r12, 88(%rsp)
+ # Conditional Swap
+ movq (%rsp), %r9
+ movq 8(%rsp), %r10
+ movq 16(%rsp), %r11
+ movq 24(%rsp), %r12
+ xorq 32(%rsp), %r9
+ xorq 40(%rsp), %r10
+ xorq 48(%rsp), %r11
+ xorq 56(%rsp), %r12
+ andq %rbx, %r9
+ andq %rbx, %r10
+ andq %rbx, %r11
+ andq %rbx, %r12
+ xorq %r9, (%rsp)
+ xorq %r10, 8(%rsp)
+ xorq %r11, 16(%rsp)
+ xorq %r12, 24(%rsp)
+ xorq %r9, 32(%rsp)
+ xorq %r10, 40(%rsp)
+ xorq %r11, 48(%rsp)
+ xorq %r12, 56(%rsp)
+ movq %rax, 184(%rsp)
+ # Add
+ movq (%rdi), %r9
+ movq 8(%rdi), %r10
+ movq 16(%rdi), %r11
+ movq 24(%rdi), %rax
+ movq %r9, %r13
+ addq (%rsp), %r9
+ movq %r10, %r14
+ adcq 8(%rsp), %r10
+ movq %r11, %r15
+ adcq 16(%rsp), %r11
+ movq %rax, %rbp
+ adcq 24(%rsp), %rax
+ movq $-19, %rcx
+ movq %rax, %r12
+ movq $0x7fffffffffffffff, %rbx
+ sarq $63, %rax
+ # Mask the modulus
+ andq %rax, %rcx
+ andq %rax, %rbx
+ # Sub modulus (if overflow)
+ subq %rcx, %r9
+ sbbq %rax, %r10
+ sbbq %rax, %r11
+ sbbq %rbx, %r12
+ # Sub
+ subq (%rsp), %r13
+ movq $0x00, %rax
+ sbbq 8(%rsp), %r14
+ movq $-19, %rcx
+ sbbq 16(%rsp), %r15
+ movq $0x7fffffffffffffff, %rbx
+ sbbq 24(%rsp), %rbp
+ sbbq $0x00, %rax
+ # Mask the modulus
+ andq %rax, %rcx
+ andq %rax, %rbx
+ # Add modulus (if underflow)
+ addq %rcx, %r13
+ adcq %rax, %r14
+ adcq %rax, %r15
+ adcq %rbx, %rbp
+ movq %r9, (%rdi)
+ movq %r10, 8(%rdi)
+ movq %r11, 16(%rdi)
+ movq %r12, 24(%rdi)
+ movq %r13, 128(%rsp)
+ movq %r14, 136(%rsp)
+ movq %r15, 144(%rsp)
+ movq %rbp, 152(%rsp)
+ # Add
+ movq 64(%rsp), %r9
+ movq 72(%rsp), %r10
+ movq 80(%rsp), %r11
+ movq 88(%rsp), %rax
+ movq %r9, %r13
+ addq 32(%rsp), %r9
+ movq %r10, %r14
+ adcq 40(%rsp), %r10
+ movq %r11, %r15
+ adcq 48(%rsp), %r11
+ movq %rax, %rbp
+ adcq 56(%rsp), %rax
+ movq $-19, %rcx
+ movq %rax, %r12
+ movq $0x7fffffffffffffff, %rbx
+ sarq $63, %rax
+ # Mask the modulus
+ andq %rax, %rcx
+ andq %rax, %rbx
+ # Sub modulus (if overflow)
+ subq %rcx, %r9
+ sbbq %rax, %r10
+ sbbq %rax, %r11
+ sbbq %rbx, %r12
+ # Sub
+ subq 32(%rsp), %r13
+ movq $0x00, %rax
+ sbbq 40(%rsp), %r14
+ movq $-19, %rcx
+ sbbq 48(%rsp), %r15
+ movq $0x7fffffffffffffff, %rbx
+ sbbq 56(%rsp), %rbp
+ sbbq $0x00, %rax
+ # Mask the modulus
+ andq %rax, %rcx
+ andq %rax, %rbx
+ # Add modulus (if underflow)
+ addq %rcx, %r13
+ adcq %rax, %r14
+ adcq %rax, %r15
+ adcq %rbx, %rbp
+ movq %r9, (%rsp)
+ movq %r10, 8(%rsp)
+ movq %r11, 16(%rsp)
+ movq %r12, 24(%rsp)
+ movq %r13, 96(%rsp)
+ movq %r14, 104(%rsp)
+ movq %r15, 112(%rsp)
+ movq %rbp, 120(%rsp)
+ # Multiply
+ # A[0] * B[0]
+ movq (%rdi), %rdx
+ mulxq 96(%rsp), %r9, %r10
+ # A[2] * B[0]
+ mulxq 112(%rsp), %r11, %r12
+ # A[1] * B[0]
+ mulxq 104(%rsp), %rcx, %rbx
+ xorq %rbp, %rbp
+ adcxq %rcx, %r10
+ # A[1] * B[3]
+ movq 24(%rdi), %rdx
+ mulxq 104(%rsp), %r13, %r14
+ adcxq %rbx, %r11
+ # A[0] * B[1]
+ movq 8(%rdi), %rdx
+ mulxq 96(%rsp), %rcx, %rbx
+ adoxq %rcx, %r10
+ # A[2] * B[1]
+ mulxq 112(%rsp), %rcx, %r15
+ adoxq %rbx, %r11
+ adcxq %rcx, %r12
+ # A[1] * B[2]
+ movq 16(%rdi), %rdx
+ mulxq 104(%rsp), %rcx, %rbx
+ adcxq %r15, %r13
+ adoxq %rcx, %r12
+ adcxq %rbp, %r14
+ adoxq %rbx, %r13
+ # A[0] * B[2]
+ mulxq 96(%rsp), %rcx, %rbx
+ adoxq %rbp, %r14
+ xorq %r15, %r15
+ adcxq %rcx, %r11
+ # A[1] * B[1]
+ movq 8(%rdi), %rdx
+ mulxq 104(%rsp), %rdx, %rcx
+ adcxq %rbx, %r12
+ adoxq %rdx, %r11
+ # A[3] * B[1]
+ movq 8(%rdi), %rdx
+ adoxq %rcx, %r12
+ mulxq 120(%rsp), %rcx, %rbx
+ adcxq %rcx, %r13
+ # A[2] * B[2]
+ movq 16(%rdi), %rdx
+ mulxq 112(%rsp), %rdx, %rcx
+ adcxq %rbx, %r14
+ adoxq %rdx, %r13
+ # A[3] * B[3]
+ movq 24(%rdi), %rdx
+ adoxq %rcx, %r14
+ mulxq 120(%rsp), %rcx, %rbx
+ adoxq %rbp, %r15
+ adcxq %rcx, %r15
+ # A[0] * B[3]
+ mulxq 96(%rsp), %rdx, %rcx
+ adcxq %rbx, %rbp
+ xorq %rbx, %rbx
+ adcxq %rdx, %r12
+ # A[3] * B[0]
+ movq (%rdi), %rdx
+ adcxq %rcx, %r13
+ mulxq 120(%rsp), %rdx, %rcx
+ adoxq %rdx, %r12
+ adoxq %rcx, %r13
+ # A[2] * B[3]
+ movq 24(%rdi), %rdx
+ mulxq 112(%rsp), %rdx, %rcx
+ adcxq %rdx, %r14
+ # A[3] * B[2]
+ movq 16(%rdi), %rdx
+ adcxq %rcx, %r15
+ mulxq 120(%rsp), %rcx, %rdx
+ adcxq %rbx, %rbp
+ adoxq %rcx, %r14
+ adoxq %rdx, %r15
+ adoxq %rbx, %rbp
+ # Reduce
+ movq $0x7fffffffffffffff, %rbx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r15, %rbp
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ andq %rbx, %r12
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rbx, %rbx
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %rcx, %r15
+ adcxq %rcx, %r11
+ adoxq %r15, %r12
+ mulxq %rbp, %rbp, %rdx
+ adcxq %rbp, %r12
+ adoxq %rbx, %rdx
+ adcxq %rbx, %rdx
+ # Overflow
+ shldq $0x01, %r12, %rdx
+ movq $0x7fffffffffffffff, %rbx
+ imulq $19, %rdx, %rcx
+ andq %rbx, %r12
+ addq %rcx, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Reduce if top bit set
+ movq %r12, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rbx, %r12
+ addq %rcx, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Store
+ movq %r9, 32(%rsp)
+ movq %r10, 40(%rsp)
+ movq %r11, 48(%rsp)
+ movq %r12, 56(%rsp)
+ # Multiply
+ # A[0] * B[0]
+ movq 128(%rsp), %rdx
+ mulxq (%rsp), %r9, %r10
+ # A[2] * B[0]
+ mulxq 16(%rsp), %r11, %r12
+ # A[1] * B[0]
+ mulxq 8(%rsp), %rcx, %rbx
+ xorq %rbp, %rbp
+ adcxq %rcx, %r10
+ # A[1] * B[3]
+ movq 152(%rsp), %rdx
+ mulxq 8(%rsp), %r13, %r14
+ adcxq %rbx, %r11
+ # A[0] * B[1]
+ movq 136(%rsp), %rdx
+ mulxq (%rsp), %rcx, %rbx
+ adoxq %rcx, %r10
+ # A[2] * B[1]
+ mulxq 16(%rsp), %rcx, %r15
+ adoxq %rbx, %r11
+ adcxq %rcx, %r12
+ # A[1] * B[2]
+ movq 144(%rsp), %rdx
+ mulxq 8(%rsp), %rcx, %rbx
+ adcxq %r15, %r13
+ adoxq %rcx, %r12
+ adcxq %rbp, %r14
+ adoxq %rbx, %r13
+ # A[0] * B[2]
+ mulxq (%rsp), %rcx, %rbx
+ adoxq %rbp, %r14
+ xorq %r15, %r15
+ adcxq %rcx, %r11
+ # A[1] * B[1]
+ movq 136(%rsp), %rdx
+ mulxq 8(%rsp), %rdx, %rcx
+ adcxq %rbx, %r12
+ adoxq %rdx, %r11
+ # A[3] * B[1]
+ movq 136(%rsp), %rdx
+ adoxq %rcx, %r12
+ mulxq 24(%rsp), %rcx, %rbx
+ adcxq %rcx, %r13
+ # A[2] * B[2]
+ movq 144(%rsp), %rdx
+ mulxq 16(%rsp), %rdx, %rcx
+ adcxq %rbx, %r14
+ adoxq %rdx, %r13
+ # A[3] * B[3]
+ movq 152(%rsp), %rdx
+ adoxq %rcx, %r14
+ mulxq 24(%rsp), %rcx, %rbx
+ adoxq %rbp, %r15
+ adcxq %rcx, %r15
+ # A[0] * B[3]
+ mulxq (%rsp), %rdx, %rcx
+ adcxq %rbx, %rbp
+ xorq %rbx, %rbx
+ adcxq %rdx, %r12
+ # A[3] * B[0]
+ movq 128(%rsp), %rdx
+ adcxq %rcx, %r13
+ mulxq 24(%rsp), %rdx, %rcx
+ adoxq %rdx, %r12
+ adoxq %rcx, %r13
+ # A[2] * B[3]
+ movq 152(%rsp), %rdx
+ mulxq 16(%rsp), %rdx, %rcx
+ adcxq %rdx, %r14
+ # A[3] * B[2]
+ movq 144(%rsp), %rdx
+ adcxq %rcx, %r15
+ mulxq 24(%rsp), %rcx, %rdx
+ adcxq %rbx, %rbp
+ adoxq %rcx, %r14
+ adoxq %rdx, %r15
+ adoxq %rbx, %rbp
+ # Reduce
+ movq $0x7fffffffffffffff, %rbx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r15, %rbp
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ andq %rbx, %r12
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rbx, %rbx
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %rcx, %r15
+ adcxq %rcx, %r11
+ adoxq %r15, %r12
+ mulxq %rbp, %rbp, %rdx
+ adcxq %rbp, %r12
+ adoxq %rbx, %rdx
+ adcxq %rbx, %rdx
+ # Overflow
+ shldq $0x01, %r12, %rdx
+ movq $0x7fffffffffffffff, %rbx
+ imulq $19, %rdx, %rcx
+ andq %rbx, %r12
+ addq %rcx, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Reduce if top bit set
+ movq %r12, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rbx, %r12
+ addq %rcx, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Store
+ movq %r9, (%rsp)
+ movq %r10, 8(%rsp)
+ movq %r11, 16(%rsp)
+ movq %r12, 24(%rsp)
+ # Square
+ # A[0] * A[1]
+ movq 128(%rsp), %rdx
+ mulxq 136(%rsp), %r10, %r11
+ # A[0] * A[3]
+ mulxq 152(%rsp), %r12, %r13
+ # A[2] * A[1]
+ movq 144(%rsp), %rdx
+ mulxq 136(%rsp), %rcx, %rbx
+ xorq %rbp, %rbp
+ adoxq %rcx, %r12
+ # A[2] * A[3]
+ mulxq 152(%rsp), %r14, %r15
+ adoxq %rbx, %r13
+ # A[2] * A[0]
+ mulxq 128(%rsp), %rcx, %rbx
+ adoxq %rbp, %r14
+ adcxq %rcx, %r11
+ adoxq %rbp, %r15
+ # A[1] * A[3]
+ movq 136(%rsp), %rdx
+ mulxq 152(%rsp), %rax, %r9
+ adcxq %rbx, %r12
+ adcxq %rax, %r13
+ adcxq %r9, %r14
+ adcxq %rbp, %r15
+ # Double with Carry Flag
+ xorq %rbp, %rbp
+ # A[0] * A[0]
+ movq 128(%rsp), %rdx
+ mulxq %rdx, %r9, %rax
+ adcxq %r10, %r10
+ # A[1] * A[1]
+ movq 136(%rsp), %rdx
+ mulxq %rdx, %rcx, %rbx
+ adcxq %r11, %r11
+ adoxq %rax, %r10
+ adcxq %r12, %r12
+ adoxq %rcx, %r11
+ # A[2] * A[2]
+ movq 144(%rsp), %rdx
+ mulxq %rdx, %rax, %rcx
+ adcxq %r13, %r13
+ adoxq %rbx, %r12
+ adcxq %r14, %r14
+ adoxq %rax, %r13
+ # A[3] * A[3]
+ movq 152(%rsp), %rdx
+ mulxq %rdx, %rax, %rbx
+ adcxq %r15, %r15
+ adoxq %rcx, %r14
+ adcxq %rbp, %rbp
+ adoxq %rax, %r15
+ adoxq %rbx, %rbp
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r15, %rbp
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ andq %rcx, %r12
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rcx, %rcx
+ mulxq %r13, %rax, %r13
+ adcxq %rax, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rax, %r14
+ adcxq %rax, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %rax, %r15
+ adcxq %rax, %r11
+ adoxq %r15, %r12
+ mulxq %rbp, %rbp, %rdx
+ adcxq %rbp, %r12
+ adoxq %rcx, %rdx
+ adcxq %rcx, %rdx
+ # Overflow
+ shldq $0x01, %r12, %rdx
+ movq $0x7fffffffffffffff, %rcx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r12
+ addq %rax, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Reduce if top bit set
+ movq %r12, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r12
+ addq %rax, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Store
+ movq %r9, 96(%rsp)
+ movq %r10, 104(%rsp)
+ movq %r11, 112(%rsp)
+ movq %r12, 120(%rsp)
+ # Square
+ # A[0] * A[1]
+ movq (%rdi), %rdx
+ mulxq 8(%rdi), %r10, %r11
+ # A[0] * A[3]
+ mulxq 24(%rdi), %r12, %r13
+ # A[2] * A[1]
+ movq 16(%rdi), %rdx
+ mulxq 8(%rdi), %rcx, %rbx
+ xorq %rbp, %rbp
+ adoxq %rcx, %r12
+ # A[2] * A[3]
+ mulxq 24(%rdi), %r14, %r15
+ adoxq %rbx, %r13
+ # A[2] * A[0]
+ mulxq (%rdi), %rcx, %rbx
+ adoxq %rbp, %r14
+ adcxq %rcx, %r11
+ adoxq %rbp, %r15
+ # A[1] * A[3]
+ movq 8(%rdi), %rdx
+ mulxq 24(%rdi), %rax, %r9
+ adcxq %rbx, %r12
+ adcxq %rax, %r13
+ adcxq %r9, %r14
+ adcxq %rbp, %r15
+ # Double with Carry Flag
+ xorq %rbp, %rbp
+ # A[0] * A[0]
+ movq (%rdi), %rdx
+ mulxq %rdx, %r9, %rax
+ adcxq %r10, %r10
+ # A[1] * A[1]
+ movq 8(%rdi), %rdx
+ mulxq %rdx, %rcx, %rbx
+ adcxq %r11, %r11
+ adoxq %rax, %r10
+ adcxq %r12, %r12
+ adoxq %rcx, %r11
+ # A[2] * A[2]
+ movq 16(%rdi), %rdx
+ mulxq %rdx, %rax, %rcx
+ adcxq %r13, %r13
+ adoxq %rbx, %r12
+ adcxq %r14, %r14
+ adoxq %rax, %r13
+ # A[3] * A[3]
+ movq 24(%rdi), %rdx
+ mulxq %rdx, %rax, %rbx
+ adcxq %r15, %r15
+ adoxq %rcx, %r14
+ adcxq %rbp, %rbp
+ adoxq %rax, %r15
+ adoxq %rbx, %rbp
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r15, %rbp
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ andq %rcx, %r12
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rcx, %rcx
+ mulxq %r13, %rax, %r13
+ adcxq %rax, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rax, %r14
+ adcxq %rax, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %rax, %r15
+ adcxq %rax, %r11
+ adoxq %r15, %r12
+ mulxq %rbp, %rbp, %rdx
+ adcxq %rbp, %r12
+ adoxq %rcx, %rdx
+ adcxq %rcx, %rdx
+ # Overflow
+ shldq $0x01, %r12, %rdx
+ movq $0x7fffffffffffffff, %rcx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r12
+ addq %rax, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Reduce if top bit set
+ movq %r12, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r12
+ addq %rax, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Store
+ movq %r9, 128(%rsp)
+ movq %r10, 136(%rsp)
+ movq %r11, 144(%rsp)
+ movq %r12, 152(%rsp)
+ # Add
+ movq 32(%rsp), %r9
+ movq 40(%rsp), %r10
+ movq 48(%rsp), %r11
+ movq 56(%rsp), %rax
+ movq %r9, %r13
+ addq (%rsp), %r9
+ movq %r10, %r14
+ adcq 8(%rsp), %r10
+ movq %r11, %r15
+ adcq 16(%rsp), %r11
+ movq %rax, %rbp
+ adcq 24(%rsp), %rax
+ movq $-19, %rcx
+ movq %rax, %r12
+ movq $0x7fffffffffffffff, %rbx
+ sarq $63, %rax
+ # Mask the modulus
+ andq %rax, %rcx
+ andq %rax, %rbx
+ # Sub modulus (if overflow)
+ subq %rcx, %r9
+ sbbq %rax, %r10
+ sbbq %rax, %r11
+ sbbq %rbx, %r12
+ # Sub
+ subq (%rsp), %r13
+ movq $0x00, %rax
+ sbbq 8(%rsp), %r14
+ movq $-19, %rcx
+ sbbq 16(%rsp), %r15
+ movq $0x7fffffffffffffff, %rbx
+ sbbq 24(%rsp), %rbp
+ sbbq $0x00, %rax
+ # Mask the modulus
+ andq %rax, %rcx
+ andq %rax, %rbx
+ # Add modulus (if underflow)
+ addq %rcx, %r13
+ adcq %rax, %r14
+ adcq %rax, %r15
+ adcq %rbx, %rbp
+ movq %r9, 64(%rsp)
+ movq %r10, 72(%rsp)
+ movq %r11, 80(%rsp)
+ movq %r12, 88(%rsp)
+ movq %r13, (%rsp)
+ movq %r14, 8(%rsp)
+ movq %r15, 16(%rsp)
+ movq %rbp, 24(%rsp)
+ # Multiply
+ # A[0] * B[0]
+ movq 96(%rsp), %rdx
+ mulxq 128(%rsp), %r9, %r10
+ # A[2] * B[0]
+ mulxq 144(%rsp), %r11, %r12
+ # A[1] * B[0]
+ mulxq 136(%rsp), %rcx, %rbx
+ xorq %rbp, %rbp
+ adcxq %rcx, %r10
+ # A[1] * B[3]
+ movq 120(%rsp), %rdx
+ mulxq 136(%rsp), %r13, %r14
+ adcxq %rbx, %r11
+ # A[0] * B[1]
+ movq 104(%rsp), %rdx
+ mulxq 128(%rsp), %rcx, %rbx
+ adoxq %rcx, %r10
+ # A[2] * B[1]
+ mulxq 144(%rsp), %rcx, %r15
+ adoxq %rbx, %r11
+ adcxq %rcx, %r12
+ # A[1] * B[2]
+ movq 112(%rsp), %rdx
+ mulxq 136(%rsp), %rcx, %rbx
+ adcxq %r15, %r13
+ adoxq %rcx, %r12
+ adcxq %rbp, %r14
+ adoxq %rbx, %r13
+ # A[0] * B[2]
+ mulxq 128(%rsp), %rcx, %rbx
+ adoxq %rbp, %r14
+ xorq %r15, %r15
+ adcxq %rcx, %r11
+ # A[1] * B[1]
+ movq 104(%rsp), %rdx
+ mulxq 136(%rsp), %rdx, %rcx
+ adcxq %rbx, %r12
+ adoxq %rdx, %r11
+ # A[3] * B[1]
+ movq 104(%rsp), %rdx
+ adoxq %rcx, %r12
+ mulxq 152(%rsp), %rcx, %rbx
+ adcxq %rcx, %r13
+ # A[2] * B[2]
+ movq 112(%rsp), %rdx
+ mulxq 144(%rsp), %rdx, %rcx
+ adcxq %rbx, %r14
+ adoxq %rdx, %r13
+ # A[3] * B[3]
+ movq 120(%rsp), %rdx
+ adoxq %rcx, %r14
+ mulxq 152(%rsp), %rcx, %rbx
+ adoxq %rbp, %r15
+ adcxq %rcx, %r15
+ # A[0] * B[3]
+ mulxq 128(%rsp), %rdx, %rcx
+ adcxq %rbx, %rbp
+ xorq %rbx, %rbx
+ adcxq %rdx, %r12
+ # A[3] * B[0]
+ movq 96(%rsp), %rdx
+ adcxq %rcx, %r13
+ mulxq 152(%rsp), %rdx, %rcx
+ adoxq %rdx, %r12
+ adoxq %rcx, %r13
+ # A[2] * B[3]
+ movq 120(%rsp), %rdx
+ mulxq 144(%rsp), %rdx, %rcx
+ adcxq %rdx, %r14
+ # A[3] * B[2]
+ movq 112(%rsp), %rdx
+ adcxq %rcx, %r15
+ mulxq 152(%rsp), %rcx, %rdx
+ adcxq %rbx, %rbp
+ adoxq %rcx, %r14
+ adoxq %rdx, %r15
+ adoxq %rbx, %rbp
+ # Reduce
+ movq $0x7fffffffffffffff, %rbx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r15, %rbp
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ andq %rbx, %r12
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rbx, %rbx
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %rcx, %r15
+ adcxq %rcx, %r11
+ adoxq %r15, %r12
+ mulxq %rbp, %rbp, %rdx
+ adcxq %rbp, %r12
+ adoxq %rbx, %rdx
+ adcxq %rbx, %rdx
+ # Overflow
+ shldq $0x01, %r12, %rdx
+ movq $0x7fffffffffffffff, %rbx
+ imulq $19, %rdx, %rcx
+ andq %rbx, %r12
+ addq %rcx, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Reduce if top bit set
+ movq %r12, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rbx, %r12
+ addq %rcx, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Store
+ movq %r9, (%rdi)
+ movq %r10, 8(%rdi)
+ movq %r11, 16(%rdi)
+ movq %r12, 24(%rdi)
+ # Sub
+ movq 128(%rsp), %r9
+ movq 136(%rsp), %r10
+ movq 144(%rsp), %r11
+ movq 152(%rsp), %r12
+ subq 96(%rsp), %r9
+ movq $0x00, %rax
+ sbbq 104(%rsp), %r10
+ movq $-19, %rcx
+ sbbq 112(%rsp), %r11
+ movq $0x7fffffffffffffff, %rbx
+ sbbq 120(%rsp), %r12
+ sbbq $0x00, %rax
+ # Mask the modulus
+ andq %rax, %rcx
+ andq %rax, %rbx
+ # Add modulus (if underflow)
+ addq %rcx, %r9
+ adcq %rax, %r10
+ adcq %rax, %r11
+ adcq %rbx, %r12
+ movq %r9, 128(%rsp)
+ movq %r10, 136(%rsp)
+ movq %r11, 144(%rsp)
+ movq %r12, 152(%rsp)
+ # Square
+ # A[0] * A[1]
+ movq (%rsp), %rdx
+ mulxq 8(%rsp), %r10, %r11
+ # A[0] * A[3]
+ mulxq 24(%rsp), %r12, %r13
+ # A[2] * A[1]
+ movq 16(%rsp), %rdx
+ mulxq 8(%rsp), %rcx, %rbx
+ xorq %rbp, %rbp
+ adoxq %rcx, %r12
+ # A[2] * A[3]
+ mulxq 24(%rsp), %r14, %r15
+ adoxq %rbx, %r13
+ # A[2] * A[0]
+ mulxq (%rsp), %rcx, %rbx
+ adoxq %rbp, %r14
+ adcxq %rcx, %r11
+ adoxq %rbp, %r15
+ # A[1] * A[3]
+ movq 8(%rsp), %rdx
+ mulxq 24(%rsp), %rax, %r9
+ adcxq %rbx, %r12
+ adcxq %rax, %r13
+ adcxq %r9, %r14
+ adcxq %rbp, %r15
+ # Double with Carry Flag
+ xorq %rbp, %rbp
+ # A[0] * A[0]
+ movq (%rsp), %rdx
+ mulxq %rdx, %r9, %rax
+ adcxq %r10, %r10
+ # A[1] * A[1]
+ movq 8(%rsp), %rdx
+ mulxq %rdx, %rcx, %rbx
+ adcxq %r11, %r11
+ adoxq %rax, %r10
+ adcxq %r12, %r12
+ adoxq %rcx, %r11
+ # A[2] * A[2]
+ movq 16(%rsp), %rdx
+ mulxq %rdx, %rax, %rcx
+ adcxq %r13, %r13
+ adoxq %rbx, %r12
+ adcxq %r14, %r14
+ adoxq %rax, %r13
+ # A[3] * A[3]
+ movq 24(%rsp), %rdx
+ mulxq %rdx, %rax, %rbx
+ adcxq %r15, %r15
+ adoxq %rcx, %r14
+ adcxq %rbp, %rbp
+ adoxq %rax, %r15
+ adoxq %rbx, %rbp
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r15, %rbp
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ andq %rcx, %r12
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rcx, %rcx
+ mulxq %r13, %rax, %r13
+ adcxq %rax, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rax, %r14
+ adcxq %rax, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %rax, %r15
+ adcxq %rax, %r11
+ adoxq %r15, %r12
+ mulxq %rbp, %rbp, %rdx
+ adcxq %rbp, %r12
+ adoxq %rcx, %rdx
+ adcxq %rcx, %rdx
+ # Overflow
+ shldq $0x01, %r12, %rdx
+ movq $0x7fffffffffffffff, %rcx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r12
+ addq %rax, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Reduce if top bit set
+ movq %r12, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r12
+ addq %rax, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Store
+ movq %r9, (%rsp)
+ movq %r10, 8(%rsp)
+ movq %r11, 16(%rsp)
+ movq %r12, 24(%rsp)
+ movq $0x1db42, %rdx
+ mulxq 128(%rsp), %r9, %rbp
+ mulxq 136(%rsp), %r10, %r15
+ mulxq 144(%rsp), %r11, %r14
+ mulxq 152(%rsp), %r12, %r13
+ addq %rbp, %r10
+ adcq %r15, %r11
+ adcq %r14, %r12
+ adcq $0x00, %r13
+ movq $0x7fffffffffffffff, %rbp
+ shldq $0x01, %r12, %r13
+ andq %rbp, %r12
+ imulq $19, %r13, %r13
+ addq %r13, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ movq %r9, 32(%rsp)
+ movq %r10, 40(%rsp)
+ movq %r11, 48(%rsp)
+ movq %r12, 56(%rsp)
+ # Square
+ # A[0] * A[1]
+ movq 64(%rsp), %rdx
+ mulxq 72(%rsp), %r10, %r11
+ # A[0] * A[3]
+ mulxq 88(%rsp), %r12, %r13
+ # A[2] * A[1]
+ movq 80(%rsp), %rdx
+ mulxq 72(%rsp), %rcx, %rbx
+ xorq %rbp, %rbp
+ adoxq %rcx, %r12
+ # A[2] * A[3]
+ mulxq 88(%rsp), %r14, %r15
+ adoxq %rbx, %r13
+ # A[2] * A[0]
+ mulxq 64(%rsp), %rcx, %rbx
+ adoxq %rbp, %r14
+ adcxq %rcx, %r11
+ adoxq %rbp, %r15
+ # A[1] * A[3]
+ movq 72(%rsp), %rdx
+ mulxq 88(%rsp), %rax, %r9
+ adcxq %rbx, %r12
+ adcxq %rax, %r13
+ adcxq %r9, %r14
+ adcxq %rbp, %r15
+ # Double with Carry Flag
+ xorq %rbp, %rbp
+ # A[0] * A[0]
+ movq 64(%rsp), %rdx
+ mulxq %rdx, %r9, %rax
+ adcxq %r10, %r10
+ # A[1] * A[1]
+ movq 72(%rsp), %rdx
+ mulxq %rdx, %rcx, %rbx
+ adcxq %r11, %r11
+ adoxq %rax, %r10
+ adcxq %r12, %r12
+ adoxq %rcx, %r11
+ # A[2] * A[2]
+ movq 80(%rsp), %rdx
+ mulxq %rdx, %rax, %rcx
+ adcxq %r13, %r13
+ adoxq %rbx, %r12
+ adcxq %r14, %r14
+ adoxq %rax, %r13
+ # A[3] * A[3]
+ movq 88(%rsp), %rdx
+ mulxq %rdx, %rax, %rbx
+ adcxq %r15, %r15
+ adoxq %rcx, %r14
+ adcxq %rbp, %rbp
+ adoxq %rax, %r15
+ adoxq %rbx, %rbp
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r15, %rbp
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ andq %rcx, %r12
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rcx, %rcx
+ mulxq %r13, %rax, %r13
+ adcxq %rax, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rax, %r14
+ adcxq %rax, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %rax, %r15
+ adcxq %rax, %r11
+ adoxq %r15, %r12
+ mulxq %rbp, %rbp, %rdx
+ adcxq %rbp, %r12
+ adoxq %rcx, %rdx
+ adcxq %rcx, %rdx
+ # Overflow
+ shldq $0x01, %r12, %rdx
+ movq $0x7fffffffffffffff, %rcx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r12
+ addq %rax, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Reduce if top bit set
+ movq %r12, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rax
+ andq %rcx, %r12
+ addq %rax, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Store
+ movq %r9, 64(%rsp)
+ movq %r10, 72(%rsp)
+ movq %r11, 80(%rsp)
+ movq %r12, 88(%rsp)
+ # Add
+ movq 96(%rsp), %r9
+ movq 104(%rsp), %r10
+ addq 32(%rsp), %r9
+ movq 112(%rsp), %r11
+ adcq 40(%rsp), %r10
+ movq 120(%rsp), %rax
+ adcq 48(%rsp), %r11
+ movq $-19, %rcx
+ adcq 56(%rsp), %rax
+ movq $0x7fffffffffffffff, %rbx
+ movq %rax, %r12
+ sarq $63, %rax
+ # Mask the modulus
+ andq %rax, %rcx
+ andq %rax, %rbx
+ # Sub modulus (if overflow)
+ subq %rcx, %r9
+ sbbq %rax, %r10
+ sbbq %rax, %r11
+ sbbq %rbx, %r12
+ movq %r9, 96(%rsp)
+ movq %r10, 104(%rsp)
+ movq %r11, 112(%rsp)
+ movq %r12, 120(%rsp)
+ # Multiply
+ # A[0] * B[0]
+ movq (%rsp), %rdx
+ mulxq (%r8), %r9, %r10
+ # A[2] * B[0]
+ mulxq 16(%r8), %r11, %r12
+ # A[1] * B[0]
+ mulxq 8(%r8), %rcx, %rbx
+ xorq %rbp, %rbp
+ adcxq %rcx, %r10
+ # A[1] * B[3]
+ movq 24(%rsp), %rdx
+ mulxq 8(%r8), %r13, %r14
+ adcxq %rbx, %r11
+ # A[0] * B[1]
+ movq 8(%rsp), %rdx
+ mulxq (%r8), %rcx, %rbx
+ adoxq %rcx, %r10
+ # A[2] * B[1]
+ mulxq 16(%r8), %rcx, %r15
+ adoxq %rbx, %r11
+ adcxq %rcx, %r12
+ # A[1] * B[2]
+ movq 16(%rsp), %rdx
+ mulxq 8(%r8), %rcx, %rbx
+ adcxq %r15, %r13
+ adoxq %rcx, %r12
+ adcxq %rbp, %r14
+ adoxq %rbx, %r13
+ # A[0] * B[2]
+ mulxq (%r8), %rcx, %rbx
+ adoxq %rbp, %r14
+ xorq %r15, %r15
+ adcxq %rcx, %r11
+ # A[1] * B[1]
+ movq 8(%rsp), %rdx
+ mulxq 8(%r8), %rdx, %rcx
+ adcxq %rbx, %r12
+ adoxq %rdx, %r11
+ # A[3] * B[1]
+ movq 8(%rsp), %rdx
+ adoxq %rcx, %r12
+ mulxq 24(%r8), %rcx, %rbx
+ adcxq %rcx, %r13
+ # A[2] * B[2]
+ movq 16(%rsp), %rdx
+ mulxq 16(%r8), %rdx, %rcx
+ adcxq %rbx, %r14
+ adoxq %rdx, %r13
+ # A[3] * B[3]
+ movq 24(%rsp), %rdx
+ adoxq %rcx, %r14
+ mulxq 24(%r8), %rcx, %rbx
+ adoxq %rbp, %r15
+ adcxq %rcx, %r15
+ # A[0] * B[3]
+ mulxq (%r8), %rdx, %rcx
+ adcxq %rbx, %rbp
+ xorq %rbx, %rbx
+ adcxq %rdx, %r12
+ # A[3] * B[0]
+ movq (%rsp), %rdx
+ adcxq %rcx, %r13
+ mulxq 24(%r8), %rdx, %rcx
+ adoxq %rdx, %r12
+ adoxq %rcx, %r13
+ # A[2] * B[3]
+ movq 24(%rsp), %rdx
+ mulxq 16(%r8), %rdx, %rcx
+ adcxq %rdx, %r14
+ # A[3] * B[2]
+ movq 16(%rsp), %rdx
+ adcxq %rcx, %r15
+ mulxq 24(%r8), %rcx, %rdx
+ adcxq %rbx, %rbp
+ adoxq %rcx, %r14
+ adoxq %rdx, %r15
+ adoxq %rbx, %rbp
+ # Reduce
+ movq $0x7fffffffffffffff, %rbx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r15, %rbp
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ andq %rbx, %r12
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rbx, %rbx
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %rcx, %r15
+ adcxq %rcx, %r11
+ adoxq %r15, %r12
+ mulxq %rbp, %rbp, %rdx
+ adcxq %rbp, %r12
+ adoxq %rbx, %rdx
+ adcxq %rbx, %rdx
+ # Overflow
+ shldq $0x01, %r12, %rdx
+ movq $0x7fffffffffffffff, %rbx
+ imulq $19, %rdx, %rcx
+ andq %rbx, %r12
+ addq %rcx, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Reduce if top bit set
+ movq %r12, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rbx, %r12
+ addq %rcx, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Store
+ movq %r9, 32(%rsp)
+ movq %r10, 40(%rsp)
+ movq %r11, 48(%rsp)
+ movq %r12, 56(%rsp)
+ # Multiply
+ # A[0] * B[0]
+ movq 96(%rsp), %rdx
+ mulxq 128(%rsp), %r9, %r10
+ # A[2] * B[0]
+ mulxq 144(%rsp), %r11, %r12
+ # A[1] * B[0]
+ mulxq 136(%rsp), %rcx, %rbx
+ xorq %rbp, %rbp
+ adcxq %rcx, %r10
+ # A[1] * B[3]
+ movq 120(%rsp), %rdx
+ mulxq 136(%rsp), %r13, %r14
+ adcxq %rbx, %r11
+ # A[0] * B[1]
+ movq 104(%rsp), %rdx
+ mulxq 128(%rsp), %rcx, %rbx
+ adoxq %rcx, %r10
+ # A[2] * B[1]
+ mulxq 144(%rsp), %rcx, %r15
+ adoxq %rbx, %r11
+ adcxq %rcx, %r12
+ # A[1] * B[2]
+ movq 112(%rsp), %rdx
+ mulxq 136(%rsp), %rcx, %rbx
+ adcxq %r15, %r13
+ adoxq %rcx, %r12
+ adcxq %rbp, %r14
+ adoxq %rbx, %r13
+ # A[0] * B[2]
+ mulxq 128(%rsp), %rcx, %rbx
+ adoxq %rbp, %r14
+ xorq %r15, %r15
+ adcxq %rcx, %r11
+ # A[1] * B[1]
+ movq 104(%rsp), %rdx
+ mulxq 136(%rsp), %rdx, %rcx
+ adcxq %rbx, %r12
+ adoxq %rdx, %r11
+ # A[3] * B[1]
+ movq 104(%rsp), %rdx
+ adoxq %rcx, %r12
+ mulxq 152(%rsp), %rcx, %rbx
+ adcxq %rcx, %r13
+ # A[2] * B[2]
+ movq 112(%rsp), %rdx
+ mulxq 144(%rsp), %rdx, %rcx
+ adcxq %rbx, %r14
+ adoxq %rdx, %r13
+ # A[3] * B[3]
+ movq 120(%rsp), %rdx
+ adoxq %rcx, %r14
+ mulxq 152(%rsp), %rcx, %rbx
+ adoxq %rbp, %r15
+ adcxq %rcx, %r15
+ # A[0] * B[3]
+ mulxq 128(%rsp), %rdx, %rcx
+ adcxq %rbx, %rbp
+ xorq %rbx, %rbx
+ adcxq %rdx, %r12
+ # A[3] * B[0]
+ movq 96(%rsp), %rdx
+ adcxq %rcx, %r13
+ mulxq 152(%rsp), %rdx, %rcx
+ adoxq %rdx, %r12
+ adoxq %rcx, %r13
+ # A[2] * B[3]
+ movq 120(%rsp), %rdx
+ mulxq 144(%rsp), %rdx, %rcx
+ adcxq %rdx, %r14
+ # A[3] * B[2]
+ movq 112(%rsp), %rdx
+ adcxq %rcx, %r15
+ mulxq 152(%rsp), %rcx, %rdx
+ adcxq %rbx, %rbp
+ adoxq %rcx, %r14
+ adoxq %rdx, %r15
+ adoxq %rbx, %rbp
+ # Reduce
+ movq $0x7fffffffffffffff, %rbx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r15, %rbp
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ andq %rbx, %r12
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rbx, %rbx
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %rcx, %r15
+ adcxq %rcx, %r11
+ adoxq %r15, %r12
+ mulxq %rbp, %rbp, %rdx
+ adcxq %rbp, %r12
+ adoxq %rbx, %rdx
+ adcxq %rbx, %rdx
+ # Overflow
+ shldq $0x01, %r12, %rdx
+ movq $0x7fffffffffffffff, %rbx
+ imulq $19, %rdx, %rcx
+ andq %rbx, %r12
+ addq %rcx, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Reduce if top bit set
+ movq %r12, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rbx, %r12
+ addq %rcx, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Store
+ movq %r9, (%rsp)
+ movq %r10, 8(%rsp)
+ movq %r11, 16(%rsp)
+ movq %r12, 24(%rsp)
+ decb 168(%rsp)
+ jge L_curve25519_avx2_bits
+ movq $63, 168(%rsp)
+ decb 160(%rsp)
+ jge L_curve25519_avx2_words
+ # Invert
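+ # Same Fermat-inversion chain as fe_invert_avx2, inlined here to invert
+ # z2 at (%rsp); temporaries live at 32/64/96/128(%rsp).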
+ leaq 32(%rsp), %rdi
+ movq %rsp, %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ movq %rsp, %rsi
+ leaq 64(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ leaq 64(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ leaq 96(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ movq $4, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ leaq 64(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ movq $9, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ leaq 64(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 128(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 128(%rsp), %rdi
+ leaq 128(%rsp), %rsi
+ movq $19, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 128(%rsp), %rsi
+ leaq 96(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ movq $9, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ leaq 64(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ movq $49, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ leaq 64(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 128(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 128(%rsp), %rdi
+ leaq 128(%rsp), %rsi
+ movq $0x63, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 128(%rsp), %rsi
+ leaq 96(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 96(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ movq $49, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 96(%rsp), %rsi
+ leaq 64(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ movq $4, %rdx
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ movq %rsp, %rdi
+ leaq 64(%rsp), %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
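+ # (%rsp) now holds z^-1; multiply the ladder's x result by it and store the
+ # affine x-coordinate to the output.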
+ movq 176(%rsp), %rdi
+ # Multiply
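+ # 4x4 schoolbook multiply with MULX and two independent carry chains
+ # (ADCX adds through CF, ADOX through OF), interleaved to avoid flag stalls.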
+ # A[0] * B[0]
+ movq (%rsp), %rdx
+ mulxq (%rdi), %r9, %r10
+ # A[2] * B[0]
+ mulxq 16(%rdi), %r11, %r12
+ # A[1] * B[0]
+ mulxq 8(%rdi), %rcx, %rbx
+ xorq %rbp, %rbp
+ adcxq %rcx, %r10
+ # A[1] * B[3]
+ movq 24(%rsp), %rdx
+ mulxq 8(%rdi), %r13, %r14
+ adcxq %rbx, %r11
+ # A[0] * B[1]
+ movq 8(%rsp), %rdx
+ mulxq (%rdi), %rcx, %rbx
+ adoxq %rcx, %r10
+ # A[2] * B[1]
+ mulxq 16(%rdi), %rcx, %r15
+ adoxq %rbx, %r11
+ adcxq %rcx, %r12
+ # A[1] * B[2]
+ movq 16(%rsp), %rdx
+ mulxq 8(%rdi), %rcx, %rbx
+ adcxq %r15, %r13
+ adoxq %rcx, %r12
+ adcxq %rbp, %r14
+ adoxq %rbx, %r13
+ # A[0] * B[2]
+ mulxq (%rdi), %rcx, %rbx
+ adoxq %rbp, %r14
+ xorq %r15, %r15
+ adcxq %rcx, %r11
+ # A[1] * B[1]
+ movq 8(%rsp), %rdx
+ mulxq 8(%rdi), %rdx, %rcx
+ adcxq %rbx, %r12
+ adoxq %rdx, %r11
+ # A[3] * B[1]
+ movq 8(%rsp), %rdx
+ adoxq %rcx, %r12
+ mulxq 24(%rdi), %rcx, %rbx
+ adcxq %rcx, %r13
+ # A[2] * B[2]
+ movq 16(%rsp), %rdx
+ mulxq 16(%rdi), %rdx, %rcx
+ adcxq %rbx, %r14
+ adoxq %rdx, %r13
+ # A[3] * B[3]
+ movq 24(%rsp), %rdx
+ adoxq %rcx, %r14
+ mulxq 24(%rdi), %rcx, %rbx
+ adoxq %rbp, %r15
+ adcxq %rcx, %r15
+ # A[0] * B[3]
+ mulxq (%rdi), %rdx, %rcx
+ adcxq %rbx, %rbp
+ xorq %rbx, %rbx
+ adcxq %rdx, %r12
+ # A[3] * B[0]
+ movq (%rsp), %rdx
+ adcxq %rcx, %r13
+ mulxq 24(%rdi), %rdx, %rcx
+ adoxq %rdx, %r12
+ adoxq %rcx, %r13
+ # A[2] * B[3]
+ movq 24(%rsp), %rdx
+ mulxq 16(%rdi), %rdx, %rcx
+ adcxq %rdx, %r14
+ # A[3] * B[2]
+ movq 16(%rsp), %rdx
+ adcxq %rcx, %r15
+ mulxq 24(%rdi), %rcx, %rdx
+ adcxq %rbx, %rbp
+ adoxq %rcx, %r14
+ adoxq %rdx, %r15
+ adoxq %rbx, %rbp
+ # Reduce
+ movq $0x7fffffffffffffff, %rbx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r15, %rbp
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ andq %rbx, %r12
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rbx, %rbx
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %rcx, %r15
+ adcxq %rcx, %r11
+ adoxq %r15, %r12
+ mulxq %rbp, %rbp, %rdx
+ adcxq %rbp, %r12
+ adoxq %rbx, %rdx
+ adcxq %rbx, %rdx
+ # Overflow
+ shldq $0x01, %r12, %rdx
+ movq $0x7fffffffffffffff, %rbx
+ imulq $19, %rdx, %rcx
+ andq %rbx, %r12
+ addq %rcx, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Reduce if top bit set
+ movq %r12, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rbx, %r12
+ addq %rcx, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ adcq $0x00, %r12
+ # Store
+ movq %r9, (%rdi)
+ movq %r10, 8(%rdi)
+ movq %r11, 16(%rdi)
+ movq %r12, 24(%rdi)
+ xorq %rax, %rax
+ addq $0xc0, %rsp
+ popq %rbp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size curve25519_avx2,.-curve25519_avx2
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_pow22523_avx2
+.type fe_pow22523_avx2,@function
+.align 4
+fe_pow22523_avx2:
+#else
+.section __TEXT,__text
+.globl _fe_pow22523_avx2
+.p2align 2
+_fe_pow22523_avx2:
+#endif /* __APPLE__ */
+ subq $0x70, %rsp
+ # pow22523
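+ # Compute z^(2^252 - 3) = z^((p-5)/8), used to recover square roots when
+ # decompressing Ed25519 points.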
+ movq %rdi, 96(%rsp)
+ movq %rsi, 104(%rsp)
+ movq %rsp, %rdi
+ movq 104(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ movq %rsp, %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ movq 104(%rsp), %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ movq %rsp, %rdi
+ movq %rsp, %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ movq %rsp, %rdi
+ movq %rsp, %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ movq %rsp, %rdi
+ leaq 32(%rsp), %rsi
+ movq %rsp, %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ movq %rsp, %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ movb $4, %dl
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ movq %rsp, %rdi
+ leaq 32(%rsp), %rsi
+ movq %rsp, %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ movq %rsp, %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ movb $9, %dl
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ movq %rsp, %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ movb $19, %dl
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ movb $9, %dl
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ movq %rsp, %rdi
+ leaq 32(%rsp), %rsi
+ movq %rsp, %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ movq %rsp, %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ movb $49, %dl
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ movq %rsp, %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 64(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ movb $0x63, %dl
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 64(%rsp), %rsi
+ leaq 32(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ leaq 32(%rsp), %rdi
+ leaq 32(%rsp), %rsi
+ movb $49, %dl
+#ifndef __APPLE__
+ callq fe_sq_n_avx2@plt
+#else
+ callq _fe_sq_n_avx2
+#endif /* __APPLE__ */
+ movq %rsp, %rdi
+ leaq 32(%rsp), %rsi
+ movq %rsp, %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ movq %rsp, %rdi
+ movq %rsp, %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ movq %rsp, %rdi
+ movq %rsp, %rsi
+#ifndef __APPLE__
+ callq fe_sq_avx2@plt
+#else
+ callq _fe_sq_avx2
+#endif /* __APPLE__ */
+ movq 96(%rsp), %rdi
+ movq %rsp, %rsi
+ movq 104(%rsp), %rdx
+#ifndef __APPLE__
+ callq fe_mul_avx2@plt
+#else
+ callq _fe_mul_avx2
+#endif /* __APPLE__ */
+ movq 104(%rsp), %rsi
+ movq 96(%rsp), %rdi
+ addq $0x70, %rsp
+ repz retq
+#ifndef __APPLE__
+.text
+.globl fe_ge_to_p2_avx2
+.type fe_ge_to_p2_avx2,@function
+.align 4
+fe_ge_to_p2_avx2:
+#else
+.section __TEXT,__text
+.globl _fe_ge_to_p2_avx2
+.p2align 2
+_fe_ge_to_p2_avx2:
+#endif /* __APPLE__ */
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $40, %rsp
+ movq %rsi, (%rsp)
+ movq %rdx, 8(%rsp)
+ movq %rcx, 16(%rsp)
+ movq %r8, 24(%rsp)
+ movq %r9, 32(%rsp)
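+ # Convert from completed (p1p1) to projective (p2) coordinates:
+ # rx = px*pt, ry = py*pz, rz = pz*pt (three field multiplications).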
+ movq 16(%rsp), %rsi
+ movq 88(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rdx
+ mulxq (%rsi), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rsi), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rsi), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rbx), %rdx
+ mulxq 8(%rsi), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rbx), %rdx
+ mulxq (%rsi), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rsi), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rbx), %rdx
+ mulxq 8(%rsi), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rsi), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rbx), %rdx
+ mulxq 8(%rsi), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rbx), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rsi), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rbx), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rbx), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rsi), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rsi), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rbx), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rsi), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rbx), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rbx), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rsi), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq (%rsp), %rdi
+ movq 24(%rsp), %rsi
+ movq 32(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rdx
+ mulxq (%rsi), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rsi), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rsi), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rbx), %rdx
+ mulxq 8(%rsi), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rbx), %rdx
+ mulxq (%rsi), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rsi), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rbx), %rdx
+ mulxq 8(%rsi), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rsi), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rbx), %rdx
+ mulxq 8(%rsi), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rbx), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rsi), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rbx), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rbx), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rsi), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rsi), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rbx), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rsi), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rbx), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rbx), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rsi), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rdi
+ movq 88(%rsp), %rsi
+ # Multiply
+ # A[0] * B[0]
+ movq (%rsi), %rdx
+ mulxq (%rbx), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rbx), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rbx), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rsi), %rdx
+ mulxq 8(%rbx), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rsi), %rdx
+ mulxq (%rbx), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rbx), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rsi), %rdx
+ mulxq 8(%rbx), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rbx), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rsi), %rdx
+ mulxq 8(%rbx), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rsi), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rbx), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rsi), %rdx
+ mulxq 16(%rbx), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rsi), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rbx), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rbx), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rsi), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rbx), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rsi), %rdx
+ mulxq 16(%rbx), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rsi), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rbx), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ addq $40, %rsp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size fe_ge_to_p2_avx2,.-fe_ge_to_p2_avx2
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_ge_to_p3_avx2
+.type fe_ge_to_p3_avx2,@function
+.align 4
+fe_ge_to_p3_avx2:
+#else
+.section __TEXT,__text
+.globl _fe_ge_to_p3_avx2
+.p2align 2
+_fe_ge_to_p3_avx2:
+#endif /* __APPLE__ */
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $40, %rsp
+ movq %rsi, (%rsp)
+ movq %rdx, 8(%rsp)
+ movq %rcx, 16(%rsp)
+ movq %r8, 24(%rsp)
+ movq %r9, 32(%rsp)
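+ # Convert from completed (p1p1) to extended (p3) coordinates:
+ # rx = px*pt, ry = py*pz, rz = pz*pt, rt = px*py.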
+ movq 24(%rsp), %rsi
+ movq 96(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rdx
+ mulxq (%rsi), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rsi), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rsi), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rbx), %rdx
+ mulxq 8(%rsi), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rbx), %rdx
+ mulxq (%rsi), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rsi), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rbx), %rdx
+ mulxq 8(%rsi), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rsi), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rbx), %rdx
+ mulxq 8(%rsi), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rbx), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rsi), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rbx), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rbx), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rsi), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rsi), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rbx), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rsi), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rbx), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rbx), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rsi), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq (%rsp), %rdi
+ movq 32(%rsp), %rsi
+ movq 88(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rdx
+ mulxq (%rsi), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rsi), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rsi), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rbx), %rdx
+ mulxq 8(%rsi), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rbx), %rdx
+ mulxq (%rsi), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rsi), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rbx), %rdx
+ mulxq 8(%rsi), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rsi), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rbx), %rdx
+ mulxq 8(%rsi), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rbx), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rsi), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rbx), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rbx), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rsi), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rsi), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rbx), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rsi), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rbx), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rbx), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rsi), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rdi
+ movq 96(%rsp), %rsi
+ # Multiply
+ # A[0] * B[0]
+ movq (%rsi), %rdx
+ mulxq (%rbx), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rbx), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rbx), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rsi), %rdx
+ mulxq 8(%rbx), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rsi), %rdx
+ mulxq (%rbx), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rbx), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rsi), %rdx
+ mulxq 8(%rbx), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rbx), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rsi), %rdx
+ mulxq 8(%rbx), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rsi), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rbx), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rsi), %rdx
+ mulxq 16(%rbx), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rsi), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rbx), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rbx), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rsi), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rbx), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rsi), %rdx
+ mulxq 16(%rbx), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rsi), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rbx), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 16(%rsp), %rdi
+ movq 24(%rsp), %rsi
+ movq 32(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rdx
+ mulxq (%rsi), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rsi), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rsi), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rbx), %rdx
+ mulxq 8(%rsi), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rbx), %rdx
+ mulxq (%rsi), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rsi), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rbx), %rdx
+ mulxq 8(%rsi), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rsi), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rbx), %rdx
+ mulxq 8(%rsi), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rbx), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rsi), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rbx), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rbx), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rsi), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rsi), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rbx), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rsi), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rbx), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rbx), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rsi), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ addq $40, %rsp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size fe_ge_to_p3_avx2,.-fe_ge_to_p3_avx2
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_ge_dbl_avx2
+.type fe_ge_dbl_avx2,@function
+.align 4
+fe_ge_dbl_avx2:
+#else
+.section __TEXT,__text
+.globl _fe_ge_dbl_avx2
+.p2align 2
+_fe_ge_dbl_avx2:
+#endif /* __APPLE__ */
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $48, %rsp
+ movq %rdi, (%rsp)
+ movq %rsi, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rcx, 24(%rsp)
+ movq %r8, 32(%rsp)
+ movq %r9, 40(%rsp)
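+ # Point doubling: square px, py and (px+py), compute 2*pz^2, then combine
+ # the results with field additions and subtractions below.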
+ movq 32(%rsp), %rsi
+ # Square
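+ # Schoolbook squaring: compute each off-diagonal product once, double the
+ # partial sums, then add in the diagonal terms A[i]*A[i].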
+ # A[0] * A[1]
+ movq (%rsi), %rdx
+ mulxq 8(%rsi), %r9, %r10
+ # A[0] * A[3]
+ mulxq 24(%rsi), %r11, %r12
+ # A[2] * A[1]
+ movq 16(%rsi), %rdx
+ mulxq 8(%rsi), %rcx, %rax
+ xorq %r15, %r15
+ adoxq %rcx, %r11
+ # A[2] * A[3]
+ mulxq 24(%rsi), %r13, %r14
+ adoxq %rax, %r12
+ # A[2] * A[0]
+ mulxq (%rsi), %rcx, %rax
+ adoxq %r15, %r13
+ adcxq %rcx, %r10
+ adoxq %r15, %r14
+ # A[1] * A[3]
+ movq 8(%rsi), %rdx
+ mulxq 24(%rsi), %rbp, %r8
+ adcxq %rax, %r11
+ adcxq %rbp, %r12
+ adcxq %r8, %r13
+ adcxq %r15, %r14
+ # Double with Carry Flag
+ xorq %r15, %r15
+ # A[0] * A[0]
+ movq (%rsi), %rdx
+ mulxq %rdx, %r8, %rbp
+ adcxq %r9, %r9
+ # A[1] * A[1]
+ movq 8(%rsi), %rdx
+ mulxq %rdx, %rcx, %rax
+ adcxq %r10, %r10
+ adoxq %rbp, %r9
+ adcxq %r11, %r11
+ adoxq %rcx, %r10
+ # A[2] * A[2]
+ movq 16(%rsi), %rdx
+ mulxq %rdx, %rbp, %rcx
+ adcxq %r12, %r12
+ adoxq %rax, %r11
+ adcxq %r13, %r13
+ adoxq %rbp, %r12
+ # A[3] * A[3]
+ movq 24(%rsi), %rdx
+ mulxq %rdx, %rbp, %rax
+ adcxq %r14, %r14
+ adoxq %rcx, %r13
+ adcxq %r15, %r15
+ adoxq %rbp, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rcx, %rcx
+ mulxq %r12, %rbp, %r12
+ adcxq %rbp, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rbp, %r13
+ adcxq %rbp, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rbp, %r14
+ adcxq %rbp, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rcx, %rdx
+ adcxq %rcx, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rcx
+ imulq $19, %rdx, %rbp
+ andq %rcx, %r11
+ addq %rbp, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rbp
+ andq %rcx, %r11
+ addq %rbp, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 16(%rsp), %rdi
+ movq 40(%rsp), %rbx
+ # Square
+ # A[0] * A[1]
+ movq (%rbx), %rdx
+ mulxq 8(%rbx), %r9, %r10
+ # A[0] * A[3]
+ mulxq 24(%rbx), %r11, %r12
+ # A[2] * A[1]
+ movq 16(%rbx), %rdx
+ mulxq 8(%rbx), %rcx, %rax
+ xorq %r15, %r15
+ adoxq %rcx, %r11
+ # A[2] * A[3]
+ mulxq 24(%rbx), %r13, %r14
+ adoxq %rax, %r12
+ # A[2] * A[0]
+ mulxq (%rbx), %rcx, %rax
+ adoxq %r15, %r13
+ adcxq %rcx, %r10
+ adoxq %r15, %r14
+ # A[1] * A[3]
+ movq 8(%rbx), %rdx
+ mulxq 24(%rbx), %rbp, %r8
+ adcxq %rax, %r11
+ adcxq %rbp, %r12
+ adcxq %r8, %r13
+ adcxq %r15, %r14
+ # Double with Carry Flag
+ xorq %r15, %r15
+ # A[0] * A[0]
+ movq (%rbx), %rdx
+ mulxq %rdx, %r8, %rbp
+ adcxq %r9, %r9
+ # A[1] * A[1]
+ movq 8(%rbx), %rdx
+ mulxq %rdx, %rcx, %rax
+ adcxq %r10, %r10
+ adoxq %rbp, %r9
+ adcxq %r11, %r11
+ adoxq %rcx, %r10
+ # A[2] * A[2]
+ movq 16(%rbx), %rdx
+ mulxq %rdx, %rbp, %rcx
+ adcxq %r12, %r12
+ adoxq %rax, %r11
+ adcxq %r13, %r13
+ adoxq %rbp, %r12
+ # A[3] * A[3]
+ movq 24(%rbx), %rdx
+ mulxq %rdx, %rbp, %rax
+ adcxq %r14, %r14
+ adoxq %rcx, %r13
+ adcxq %r15, %r15
+ adoxq %rbp, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rcx, %rcx
+ mulxq %r12, %rbp, %r12
+ adcxq %rbp, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rbp, %r13
+ adcxq %rbp, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rbp, %r14
+ adcxq %rbp, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rcx, %rdx
+ adcxq %rcx, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rcx
+ imulq $19, %rdx, %rbp
+ andq %rcx, %r11
+ addq %rbp, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rbp
+ andq %rcx, %r11
+ addq %rbp, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rdi
+ # Add
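+ # Field addition: 256-bit add, then subtract p = 2^255 - 19 when bit 255 of
+ # the sum is set (mask built by sign-extending the top limb).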
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ addq (%rbx), %r8
+ movq 16(%rsi), %r10
+ adcq 8(%rbx), %r9
+ movq 24(%rsi), %rdx
+ adcq 16(%rbx), %r10
+ movq $-19, %rcx
+ adcq 24(%rbx), %rdx
+ movq $0x7fffffffffffffff, %rax
+ movq %rdx, %r11
+ sarq $63, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Sub modulus (if overflow)
+ subq %rcx, %r8
+ sbbq %rdx, %r9
+ sbbq %rdx, %r10
+ sbbq %rax, %r11
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 24(%rsp), %rsi
+ # Square
+ # A[0] * A[1]
+ movq (%rdi), %rdx
+ mulxq 8(%rdi), %r9, %r10
+ # A[0] * A[3]
+ mulxq 24(%rdi), %r11, %r12
+ # A[2] * A[1]
+ movq 16(%rdi), %rdx
+ mulxq 8(%rdi), %rcx, %rax
+ xorq %r15, %r15
+ adoxq %rcx, %r11
+ # A[2] * A[3]
+ mulxq 24(%rdi), %r13, %r14
+ adoxq %rax, %r12
+ # A[2] * A[0]
+ mulxq (%rdi), %rcx, %rax
+ adoxq %r15, %r13
+ adcxq %rcx, %r10
+ adoxq %r15, %r14
+ # A[1] * A[3]
+ movq 8(%rdi), %rdx
+ mulxq 24(%rdi), %rbp, %r8
+ adcxq %rax, %r11
+ adcxq %rbp, %r12
+ adcxq %r8, %r13
+ adcxq %r15, %r14
+ # Double with Carry Flag
+ xorq %r15, %r15
+ # A[0] * A[0]
+ movq (%rdi), %rdx
+ mulxq %rdx, %r8, %rbp
+ adcxq %r9, %r9
+ # A[1] * A[1]
+ movq 8(%rdi), %rdx
+ mulxq %rdx, %rcx, %rax
+ adcxq %r10, %r10
+ adoxq %rbp, %r9
+ adcxq %r11, %r11
+ adoxq %rcx, %r10
+ # A[2] * A[2]
+ movq 16(%rdi), %rdx
+ mulxq %rdx, %rbp, %rcx
+ adcxq %r12, %r12
+ adoxq %rax, %r11
+ adcxq %r13, %r13
+ adoxq %rbp, %r12
+ # A[3] * A[3]
+ movq 24(%rdi), %rdx
+ mulxq %rdx, %rbp, %rax
+ adcxq %r14, %r14
+ adoxq %rcx, %r13
+ adcxq %r15, %r15
+ adoxq %rbp, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rcx
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rcx, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rcx, %rcx
+ mulxq %r12, %rbp, %r12
+ adcxq %rbp, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rbp, %r13
+ adcxq %rbp, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rbp, %r14
+ adcxq %rbp, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rcx, %rdx
+ adcxq %rcx, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rcx
+ imulq $19, %rdx, %rbp
+ andq %rcx, %r11
+ addq %rbp, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rbp
+ andq %rcx, %r11
+ addq %rbp, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rsi)
+ movq %r9, 8(%rsi)
+ movq %r10, 16(%rsi)
+ movq %r11, 24(%rsi)
+ movq 16(%rsp), %rsi
+ movq (%rsp), %rbx
+ # Add
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %rdx
+ movq %r8, %r12
+ addq (%rbx), %r8
+ movq %r9, %r13
+ adcq 8(%rbx), %r9
+ movq %r10, %r14
+ adcq 16(%rbx), %r10
+ movq %rdx, %r15
+ adcq 24(%rbx), %rdx
+ movq $-19, %rcx
+ movq %rdx, %r11
+ movq $0x7fffffffffffffff, %rax
+ sarq $63, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Sub modulus (if overflow)
+ subq %rcx, %r8
+ sbbq %rdx, %r9
+ sbbq %rdx, %r10
+ sbbq %rax, %r11
+ # Sub
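+ # Field subtraction: 256-bit subtract, then add p back when the subtraction
+ # borrows (mask built from the borrow).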
+ subq (%rbx), %r12
+ movq $0x00, %rdx
+ sbbq 8(%rbx), %r13
+ movq $-19, %rcx
+ sbbq 16(%rbx), %r14
+ movq $0x7fffffffffffffff, %rax
+ sbbq 24(%rbx), %r15
+ sbbq $0x00, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Add modulus (if underflow)
+ addq %rcx, %r12
+ adcq %rdx, %r13
+ adcq %rdx, %r14
+ adcq %rax, %r15
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq %r12, (%rsi)
+ movq %r13, 8(%rsi)
+ movq %r14, 16(%rsi)
+ movq %r15, 24(%rsi)
+ movq 24(%rsp), %rsi
+ # Sub
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %r11
+ subq (%rdi), %r8
+ movq $0x00, %rdx
+ sbbq 8(%rdi), %r9
+ movq $-19, %rcx
+ sbbq 16(%rdi), %r10
+ movq $0x7fffffffffffffff, %rax
+ sbbq 24(%rdi), %r11
+ sbbq $0x00, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Add modulus (if underflow)
+ addq %rcx, %r8
+ adcq %rdx, %r9
+ adcq %rdx, %r10
+ adcq %rax, %r11
+ movq %r8, (%rbx)
+ movq %r9, 8(%rbx)
+ movq %r10, 16(%rbx)
+ movq %r11, 24(%rbx)
+ movq 104(%rsp), %rdi
+ # Square * 2
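+ # Square and double: the factor of 2 is folded into the wider reduction
+ # shifts; the highest bits wrap around twice and are scaled by 19*19 (0x169).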
+ # A[0] * A[1]
+ movq (%rdi), %rdx
+ mulxq 8(%rdi), %r9, %r10
+ # A[0] * A[3]
+ mulxq 24(%rdi), %r11, %r12
+ # A[2] * A[1]
+ movq 16(%rdi), %rdx
+ mulxq 8(%rdi), %rcx, %rax
+ xorq %r15, %r15
+ adoxq %rcx, %r11
+ # A[2] * A[3]
+ mulxq 24(%rdi), %r13, %r14
+ adoxq %rax, %r12
+ # A[2] * A[0]
+ mulxq (%rdi), %rcx, %rax
+ adoxq %r15, %r13
+ adcxq %rcx, %r10
+ adoxq %r15, %r14
+ # A[1] * A[3]
+ movq 8(%rdi), %rdx
+ mulxq 24(%rdi), %rbp, %r8
+ adcxq %rax, %r11
+ adcxq %rbp, %r12
+ adcxq %r8, %r13
+ adcxq %r15, %r14
+ # Double with Carry Flag
+ xorq %r15, %r15
+ # A[0] * A[0]
+ movq (%rdi), %rdx
+ mulxq %rdx, %r8, %rbp
+ adcxq %r9, %r9
+ # A[1] * A[1]
+ movq 8(%rdi), %rdx
+ mulxq %rdx, %rcx, %rax
+ adcxq %r10, %r10
+ adoxq %rbp, %r9
+ adcxq %r11, %r11
+ adoxq %rcx, %r10
+ # A[2] * A[2]
+ movq 16(%rdi), %rdx
+ mulxq %rdx, %rbp, %rcx
+ adcxq %r12, %r12
+ adoxq %rax, %r11
+ adcxq %r13, %r13
+ adoxq %rbp, %r12
+ # A[3] * A[3]
+ movq 24(%rdi), %rdx
+ mulxq %rdx, %rbp, %rax
+ adcxq %r14, %r14
+ adoxq %rcx, %r13
+ adcxq %r15, %r15
+ adoxq %rbp, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ xorq %rbp, %rbp
+ # Move top half into t4-t7 and remove top bit from t3 and double
+ shldq $3, %r15, %rbp
+ shldq $2, %r14, %r15
+ shldq $2, %r13, %r14
+ shldq $2, %r12, %r13
+ shldq $2, %r11, %r12
+ shldq $0x01, %r10, %r11
+ shldq $0x01, %r9, %r10
+ shldq $0x01, %r8, %r9
+ shlq $0x01, %r8
+ andq %rax, %r11
+ # Two out left, one in right
+ andq %rax, %r15
+ # Multiply top bits by 19*19
+ imulq $0x169, %rbp, %rcx
+ xorq %rax, %rax
+ # Multiply top half by 19
+ movq $19, %rdx
+ adoxq %rcx, %r8
+ mulxq %r12, %rbp, %r12
+ adcxq %rbp, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rbp, %r13
+ adcxq %rbp, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rbp, %r14
+ adcxq %rbp, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rbp
+ andq %rax, %r11
+ addq %rbp, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rbp
+ andq %rax, %r11
+ addq %rbp, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rsi)
+ movq %r9, 8(%rsi)
+ movq %r10, 16(%rsi)
+ movq %r11, 24(%rsi)
+ movq 16(%rsp), %rdi
+ # Sub
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %r11
+ subq (%rdi), %r8
+ movq $0x00, %rdx
+ sbbq 8(%rdi), %r9
+ movq $-19, %rcx
+ sbbq 16(%rdi), %r10
+ movq $0x7fffffffffffffff, %rax
+ sbbq 24(%rdi), %r11
+ sbbq $0x00, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Add modulus (if underflow)
+ addq %rcx, %r8
+ adcq %rdx, %r9
+ adcq %rdx, %r10
+ adcq %rax, %r11
+ movq %r8, (%rsi)
+ movq %r9, 8(%rsi)
+ movq %r10, 16(%rsi)
+ movq %r11, 24(%rsi)
+ addq $48, %rsp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ popq %rbp
+ repz retq
+#ifndef __APPLE__
+.size fe_ge_dbl_avx2,.-fe_ge_dbl_avx2
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_ge_madd_avx2
+.type fe_ge_madd_avx2,@function
+.align 4
+fe_ge_madd_avx2:
+#else
+.section __TEXT,__text
+.globl _fe_ge_madd_avx2
+.p2align 2
+_fe_ge_madd_avx2:
+#endif /* __APPLE__ */
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $48, %rsp
+ movq %rdi, (%rsp)
+ movq %rsi, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rcx, 24(%rsp)
+ movq %r8, 32(%rsp)
+ movq %r9, 40(%rsp)
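+ # Mixed point addition with a precomputed point in (y+x, y-x, x*y*2*d) form:
+ # py+px and py-px are formed first, then multiplied by the precomputed values.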
+ movq 8(%rsp), %rsi
+ movq 40(%rsp), %rbx
+ movq 32(%rsp), %rbp
+ # Add
+ movq (%rbx), %r8
+ movq 8(%rbx), %r9
+ movq 16(%rbx), %r10
+ movq 24(%rbx), %rdx
+ movq %r8, %r12
+ addq (%rbp), %r8
+ movq %r9, %r13
+ adcq 8(%rbp), %r9
+ movq %r10, %r14
+ adcq 16(%rbp), %r10
+ movq %rdx, %r15
+ adcq 24(%rbp), %rdx
+ movq $-19, %rcx
+ movq %rdx, %r11
+ movq $0x7fffffffffffffff, %rax
+ sarq $63, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Sub modulus (if overflow)
+ subq %rcx, %r8
+ sbbq %rdx, %r9
+ sbbq %rdx, %r10
+ sbbq %rax, %r11
+ # Sub
+ subq (%rbp), %r12
+ movq $0x00, %rdx
+ sbbq 8(%rbp), %r13
+ movq $-19, %rcx
+ sbbq 16(%rbp), %r14
+ movq $0x7fffffffffffffff, %rax
+ sbbq 24(%rbp), %r15
+ sbbq $0x00, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Add modulus (if underflow)
+ addq %rcx, %r12
+ adcq %rdx, %r13
+ adcq %rdx, %r14
+ adcq %rax, %r15
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq %r12, (%rsi)
+ movq %r13, 8(%rsi)
+ movq %r14, 16(%rsi)
+ movq %r15, 24(%rsi)
+ movq 16(%rsp), %rbx
+ movq 128(%rsp), %rbp
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbp), %rdx
+ mulxq (%rdi), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rdi), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rdi), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rbp), %rdx
+ mulxq 8(%rdi), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rbp), %rdx
+ mulxq (%rdi), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rdi), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rbp), %rdx
+ mulxq 8(%rdi), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rdi), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rbp), %rdx
+ mulxq 8(%rdi), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rbp), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rdi), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rbp), %rdx
+ mulxq 16(%rdi), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rbp), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rdi), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rdi), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rbp), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rdi), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rbp), %rdx
+ mulxq 16(%rdi), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rbp), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rdi), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rbx)
+ movq %r9, 8(%rbx)
+ movq %r10, 16(%rbx)
+ movq %r11, 24(%rbx)
+ movq 136(%rsp), %rdi
+ # Multiply
+ # A[0] * B[0]
+ movq (%rdi), %rdx
+ mulxq (%rsi), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rsi), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rsi), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rdi), %rdx
+ mulxq 8(%rsi), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rdi), %rdx
+ mulxq (%rsi), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rsi), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rdi), %rdx
+ mulxq 8(%rsi), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rsi), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rdi), %rdx
+ mulxq 8(%rsi), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rdi), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rsi), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rdi), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rdi), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rsi), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rsi), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rdi), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rsi), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rdi), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rdi), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rsi), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rsi)
+ movq %r9, 8(%rsi)
+ movq %r10, 16(%rsi)
+ movq %r11, 24(%rsi)
+ movq 24(%rsp), %rdi
+ movq 120(%rsp), %rsi
+ movq 112(%rsp), %rbp
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbp), %rdx
+ mulxq (%rsi), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rsi), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rsi), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rbp), %rdx
+ mulxq 8(%rsi), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rbp), %rdx
+ mulxq (%rsi), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rsi), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rbp), %rdx
+ mulxq 8(%rsi), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rsi), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rbp), %rdx
+ mulxq 8(%rsi), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rbp), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rsi), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rbp), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rbp), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rsi), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rsi), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rbp), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rsi), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rbp), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rbp), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rsi), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rdi
+ movq (%rsp), %rsi
+ # Add
+ movq (%rbx), %r8
+ movq 8(%rbx), %r9
+ movq 16(%rbx), %r10
+ movq 24(%rbx), %rdx
+ movq %r8, %r12
+ addq (%rdi), %r8
+ movq %r9, %r13
+ adcq 8(%rdi), %r9
+ movq %r10, %r14
+ adcq 16(%rdi), %r10
+ movq %rdx, %r15
+ adcq 24(%rdi), %rdx
+ movq $-19, %rcx
+ movq %rdx, %r11
+ movq $0x7fffffffffffffff, %rax
+ sarq $63, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Sub modulus (if overflow)
+ subq %rcx, %r8
+ sbbq %rdx, %r9
+ sbbq %rdx, %r10
+ sbbq %rax, %r11
+ # Sub
+ subq (%rdi), %r12
+ movq $0x00, %rdx
+ sbbq 8(%rdi), %r13
+ movq $-19, %rcx
+ sbbq 16(%rdi), %r14
+ movq $0x7fffffffffffffff, %rax
+ sbbq 24(%rdi), %r15
+ sbbq $0x00, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Add modulus (if underflow)
+ addq %rcx, %r12
+ adcq %rdx, %r13
+ adcq %rdx, %r14
+ adcq %rax, %r15
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq %r12, (%rsi)
+ movq %r13, 8(%rsi)
+ movq %r14, 16(%rsi)
+ movq %r15, 24(%rsi)
+ movq 104(%rsp), %rdi
+ # Double
+ movq (%rdi), %r8
+ movq 8(%rdi), %r9
+ addq %r8, %r8
+ movq 16(%rdi), %r10
+ adcq %r9, %r9
+ movq 24(%rdi), %rdx
+ adcq %r10, %r10
+ movq $-19, %rcx
+ adcq %rdx, %rdx
+ movq $0x7fffffffffffffff, %rax
+ movq %rdx, %r11
+ sarq $63, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Sub modulus (if overflow)
+ subq %rcx, %r8
+ sbbq %rdx, %r9
+ sbbq %rdx, %r10
+ sbbq %rax, %r11
+ movq %r8, (%rbx)
+ movq %r9, 8(%rbx)
+ movq %r10, 16(%rbx)
+ movq %r11, 24(%rbx)
+ movq 24(%rsp), %rdi
+ # Add
+ movq (%rbx), %r8
+ movq 8(%rbx), %r9
+ movq 16(%rbx), %r10
+ movq 24(%rbx), %rdx
+ movq %r8, %r12
+ addq (%rdi), %r8
+ movq %r9, %r13
+ adcq 8(%rdi), %r9
+ movq %r10, %r14
+ adcq 16(%rdi), %r10
+ movq %rdx, %r15
+ adcq 24(%rdi), %rdx
+ movq $-19, %rcx
+ movq %rdx, %r11
+ movq $0x7fffffffffffffff, %rax
+ sarq $63, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Sub modulus (if overflow)
+ subq %rcx, %r8
+ sbbq %rdx, %r9
+ sbbq %rdx, %r10
+ sbbq %rax, %r11
+ # Sub
+ subq (%rdi), %r12
+ movq $0x00, %rdx
+ sbbq 8(%rdi), %r13
+ movq $-19, %rcx
+ sbbq 16(%rdi), %r14
+ movq $0x7fffffffffffffff, %rax
+ sbbq 24(%rdi), %r15
+ sbbq $0x00, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Add modulus (if underflow)
+ addq %rcx, %r12
+ adcq %rdx, %r13
+ adcq %rdx, %r14
+ adcq %rax, %r15
+ movq %r8, (%rbx)
+ movq %r9, 8(%rbx)
+ movq %r10, 16(%rbx)
+ movq %r11, 24(%rbx)
+ movq %r12, (%rdi)
+ movq %r13, 8(%rdi)
+ movq %r14, 16(%rdi)
+ movq %r15, 24(%rdi)
+ addq $48, %rsp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ popq %rbp
+ repz retq
+#ifndef __APPLE__
+.size fe_ge_madd_avx2,.-fe_ge_madd_avx2
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl fe_ge_msub_avx2
+.type fe_ge_msub_avx2,@function
+.align 4
+fe_ge_msub_avx2:
+#else
+.section __TEXT,__text
+.globl _fe_ge_msub_avx2
+.p2align 2
+_fe_ge_msub_avx2:
+#endif /* __APPLE__ */
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $48, %rsp
+ movq %rdi, (%rsp)
+ movq %rsi, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rcx, 24(%rsp)
+ movq %r8, 32(%rsp)
+ movq %r9, 40(%rsp)
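+ # Mixed point subtraction: as fe_ge_madd_avx2, but the precomputed (y+x) and
+ # (y-x) multiplicands are exchanged so the negated point is added.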
+ movq 8(%rsp), %rsi
+ movq 40(%rsp), %rbx
+ movq 32(%rsp), %rbp
+ # Add
+ movq (%rbx), %r8
+ movq 8(%rbx), %r9
+ movq 16(%rbx), %r10
+ movq 24(%rbx), %rdx
+ movq %r8, %r12
+ addq (%rbp), %r8
+ movq %r9, %r13
+ adcq 8(%rbp), %r9
+ movq %r10, %r14
+ adcq 16(%rbp), %r10
+ movq %rdx, %r15
+ adcq 24(%rbp), %rdx
+ movq $-19, %rcx
+ movq %rdx, %r11
+ movq $0x7fffffffffffffff, %rax
+ sarq $63, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Sub modulus (if overflow)
+ subq %rcx, %r8
+ sbbq %rdx, %r9
+ sbbq %rdx, %r10
+ sbbq %rax, %r11
+ # Sub
+ subq (%rbp), %r12
+ movq $0x00, %rdx
+ sbbq 8(%rbp), %r13
+ movq $-19, %rcx
+ sbbq 16(%rbp), %r14
+ movq $0x7fffffffffffffff, %rax
+ sbbq 24(%rbp), %r15
+ sbbq $0x00, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Add modulus (if underflow)
+ addq %rcx, %r12
+ adcq %rdx, %r13
+ adcq %rdx, %r14
+ adcq %rax, %r15
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq %r12, (%rsi)
+ movq %r13, 8(%rsi)
+ movq %r14, 16(%rsi)
+ movq %r15, 24(%rsi)
+ movq 16(%rsp), %rbx
+ movq 136(%rsp), %rbp
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbp), %rdx
+ mulxq (%rdi), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rdi), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rdi), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rbp), %rdx
+ mulxq 8(%rdi), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rbp), %rdx
+ mulxq (%rdi), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rdi), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rbp), %rdx
+ mulxq 8(%rdi), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rdi), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rbp), %rdx
+ mulxq 8(%rdi), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rbp), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rdi), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rbp), %rdx
+ mulxq 16(%rdi), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rbp), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rdi), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rdi), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rbp), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rdi), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rbp), %rdx
+ mulxq 16(%rdi), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rbp), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rdi), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rbx)
+ movq %r9, 8(%rbx)
+ movq %r10, 16(%rbx)
+ movq %r11, 24(%rbx)
+ movq 128(%rsp), %rdi
+ # Multiply
+ # A[0] * B[0]
+ movq (%rdi), %rdx
+ mulxq (%rsi), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rsi), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rsi), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rdi), %rdx
+ mulxq 8(%rsi), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rdi), %rdx
+ mulxq (%rsi), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rsi), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rdi), %rdx
+ mulxq 8(%rsi), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rsi), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rdi), %rdx
+ mulxq 8(%rsi), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rdi), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rsi), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rdi), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rdi), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rsi), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rsi), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rdi), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rsi), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rdi), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rdi), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rsi), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rsi)
+ movq %r9, 8(%rsi)
+ movq %r10, 16(%rsi)
+ movq %r11, 24(%rsi)
+ movq 24(%rsp), %rdi
+ movq 120(%rsp), %rsi
+ movq 112(%rsp), %rbp
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbp), %rdx
+ mulxq (%rsi), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rsi), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rsi), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rbp), %rdx
+ mulxq 8(%rsi), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rbp), %rdx
+ mulxq (%rsi), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rsi), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rbp), %rdx
+ mulxq 8(%rsi), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rsi), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rbp), %rdx
+ mulxq 8(%rsi), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rbp), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rsi), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rbp), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rbp), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rsi), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rsi), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rbp), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rsi), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rbp), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rbp), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rsi), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq 8(%rsp), %rsi
+ movq (%rsp), %rbp
+ # Add
+ movq (%rbx), %r8
+ movq 8(%rbx), %r9
+ movq 16(%rbx), %r10
+ movq 24(%rbx), %rdx
+ movq %r8, %r12
+ addq (%rsi), %r8
+ movq %r9, %r13
+ adcq 8(%rsi), %r9
+ movq %r10, %r14
+ adcq 16(%rsi), %r10
+ movq %rdx, %r15
+ adcq 24(%rsi), %rdx
+ movq $-19, %rcx
+ movq %rdx, %r11
+ movq $0x7fffffffffffffff, %rax
+ sarq $63, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Sub modulus (if overflow)
+ subq %rcx, %r8
+ sbbq %rdx, %r9
+ sbbq %rdx, %r10
+ sbbq %rax, %r11
+ # Sub
+ subq (%rsi), %r12
+ movq $0x00, %rdx
+ sbbq 8(%rsi), %r13
+ movq $-19, %rcx
+ sbbq 16(%rsi), %r14
+ movq $0x7fffffffffffffff, %rax
+ sbbq 24(%rsi), %r15
+ sbbq $0x00, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Add modulus (if underflow)
+ addq %rcx, %r12
+ adcq %rdx, %r13
+ adcq %rdx, %r14
+ adcq %rax, %r15
+ movq %r8, (%rsi)
+ movq %r9, 8(%rsi)
+ movq %r10, 16(%rsi)
+ movq %r11, 24(%rsi)
+ movq %r12, (%rbp)
+ movq %r13, 8(%rbp)
+ movq %r14, 16(%rbp)
+ movq %r15, 24(%rbp)
+ movq 104(%rsp), %rsi
+ # Double
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ addq %r8, %r8
+ movq 16(%rsi), %r10
+ adcq %r9, %r9
+ movq 24(%rsi), %rdx
+ adcq %r10, %r10
+ movq $-19, %rcx
+ adcq %rdx, %rdx
+ movq $0x7fffffffffffffff, %rax
+ movq %rdx, %r11
+ sarq $63, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Sub modulus (if overflow)
+ subq %rcx, %r8
+ sbbq %rdx, %r9
+ sbbq %rdx, %r10
+ sbbq %rax, %r11
+ movq %r8, (%rbx)
+ movq %r9, 8(%rbx)
+ movq %r10, 16(%rbx)
+ movq %r11, 24(%rbx)
+ # Add
+ movq (%rbx), %r8
+ movq 8(%rbx), %r9
+ movq 16(%rbx), %r10
+ movq 24(%rbx), %rdx
+ movq %r8, %r12
+ addq (%rdi), %r8
+ movq %r9, %r13
+ adcq 8(%rdi), %r9
+ movq %r10, %r14
+ adcq 16(%rdi), %r10
+ movq %rdx, %r15
+ adcq 24(%rdi), %rdx
+ movq $-19, %rcx
+ movq %rdx, %r11
+ movq $0x7fffffffffffffff, %rax
+ sarq $63, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Sub modulus (if overflow)
+ subq %rcx, %r8
+ sbbq %rdx, %r9
+ sbbq %rdx, %r10
+ sbbq %rax, %r11
+ # Sub
+ subq (%rdi), %r12
+ movq $0x00, %rdx
+ sbbq 8(%rdi), %r13
+ movq $-19, %rcx
+ sbbq 16(%rdi), %r14
+ movq $0x7fffffffffffffff, %rax
+ sbbq 24(%rdi), %r15
+ sbbq $0x00, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Add modulus (if underflow)
+ addq %rcx, %r12
+ adcq %rdx, %r13
+ adcq %rdx, %r14
+ adcq %rax, %r15
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq %r12, (%rbx)
+ movq %r13, 8(%rbx)
+ movq %r14, 16(%rbx)
+ movq %r15, 24(%rbx)
+ addq $48, %rsp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ popq %rbp
+ repz retq
+#ifndef __APPLE__
+.size fe_ge_msub_avx2,.-fe_ge_msub_avx2
+#endif /* __APPLE__ */
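+ # The "Multiply" blocks in these routines compute a full 256x256->512-bit
+ # schoolbook product using the BMI2/ADX instructions:
+ #   - mulxq produces a 64x64->128-bit product without touching the flags,
+ #   - adcxq propagates one carry chain through CF only and adoxq a second,
+ #     independent chain through OF only, so the two chains interleave.
+ # The 512-bit result is accumulated in r8..r15 (low half r8..r11, high
+ # half r12..r15) before the "Reduce" step folds it back modulo 2^255 - 19.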
+#ifndef __APPLE__
+.text
+.globl fe_ge_add_avx2
+.type fe_ge_add_avx2,@function
+.align 4
+fe_ge_add_avx2:
+#else
+.section __TEXT,__text
+.globl _fe_ge_add_avx2
+.p2align 2
+_fe_ge_add_avx2:
+#endif /* __APPLE__ */
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $0x50, %rsp
+ movq %rdi, (%rsp)
+ movq %rsi, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rcx, 24(%rsp)
+ movq %r8, 32(%rsp)
+ movq %r9, 40(%rsp)
+ movq 8(%rsp), %rsi
+ movq 40(%rsp), %rbx
+ movq 32(%rsp), %rbp
+ # Add
+ movq (%rbx), %r8
+ movq 8(%rbx), %r9
+ movq 16(%rbx), %r10
+ movq 24(%rbx), %rdx
+ movq %r8, %r12
+ addq (%rbp), %r8
+ movq %r9, %r13
+ adcq 8(%rbp), %r9
+ movq %r10, %r14
+ adcq 16(%rbp), %r10
+ movq %rdx, %r15
+ adcq 24(%rbp), %rdx
+ movq $-19, %rcx
+ movq %rdx, %r11
+ movq $0x7fffffffffffffff, %rax
+ sarq $63, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Sub modulus (if overflow)
+ subq %rcx, %r8
+ sbbq %rdx, %r9
+ sbbq %rdx, %r10
+ sbbq %rax, %r11
+ # Sub
+ subq (%rbp), %r12
+ movq $0x00, %rdx
+ sbbq 8(%rbp), %r13
+ movq $-19, %rcx
+ sbbq 16(%rbp), %r14
+ movq $0x7fffffffffffffff, %rax
+ sbbq 24(%rbp), %r15
+ sbbq $0x00, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Add modulus (if underflow)
+ addq %rcx, %r12
+ adcq %rdx, %r13
+ adcq %rdx, %r14
+ adcq %rax, %r15
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq %r12, (%rsi)
+ movq %r13, 8(%rsi)
+ movq %r14, 16(%rsi)
+ movq %r15, 24(%rsi)
+ movq 16(%rsp), %rbx
+ movq 168(%rsp), %rbp
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbp), %rdx
+ mulxq (%rdi), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rdi), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rdi), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rbp), %rdx
+ mulxq 8(%rdi), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rbp), %rdx
+ mulxq (%rdi), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rdi), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rbp), %rdx
+ mulxq 8(%rdi), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rdi), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rbp), %rdx
+ mulxq 8(%rdi), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rbp), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rdi), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rbp), %rdx
+ mulxq 16(%rdi), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rbp), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rdi), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rdi), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rbp), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rdi), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rbp), %rdx
+ mulxq 16(%rdi), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rbp), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rdi), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rbx)
+ movq %r9, 8(%rbx)
+ movq %r10, 16(%rbx)
+ movq %r11, 24(%rbx)
+ movq 176(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rdx
+ mulxq (%rsi), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rsi), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rsi), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rbx), %rdx
+ mulxq 8(%rsi), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rbx), %rdx
+ mulxq (%rsi), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rsi), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rbx), %rdx
+ mulxq 8(%rsi), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rsi), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rbx), %rdx
+ mulxq 8(%rsi), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rbx), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rsi), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rbx), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rbx), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rsi), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rsi), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rbx), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rsi), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rbx), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rbx), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rsi), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rsi)
+ movq %r9, 8(%rsi)
+ movq %r10, 16(%rsi)
+ movq %r11, 24(%rsi)
+ movq 24(%rsp), %rsi
+ movq 160(%rsp), %rbx
+ movq 144(%rsp), %rbp
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbp), %rdx
+ mulxq (%rbx), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rbx), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rbx), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rbp), %rdx
+ mulxq 8(%rbx), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rbp), %rdx
+ mulxq (%rbx), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rbx), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rbp), %rdx
+ mulxq 8(%rbx), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rbx), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rbp), %rdx
+ mulxq 8(%rbx), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rbp), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rbx), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rbp), %rdx
+ mulxq 16(%rbx), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rbp), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rbx), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rbx), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rbp), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rbx), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rbp), %rdx
+ mulxq 16(%rbx), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rbp), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rbx), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rsi)
+ movq %r9, 8(%rsi)
+ movq %r10, 16(%rsi)
+ movq %r11, 24(%rsi)
+ movq 136(%rsp), %rsi
+ movq 152(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rdx
+ mulxq (%rsi), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rsi), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rsi), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rbx), %rdx
+ mulxq 8(%rsi), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rbx), %rdx
+ mulxq (%rsi), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rsi), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rbx), %rdx
+ mulxq 8(%rsi), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rsi), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rbx), %rdx
+ mulxq 8(%rsi), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rbx), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rsi), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rbx), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rbx), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rsi), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rsi), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rbx), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rsi), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rbx), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rbx), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rsi), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ leaq 48(%rsp), %rsi
+ # Double
+ movq (%rdi), %r8
+ movq 8(%rdi), %r9
+ addq %r8, %r8
+ movq 16(%rdi), %r10
+ adcq %r9, %r9
+ movq 24(%rdi), %rdx
+ adcq %r10, %r10
+ movq $-19, %rcx
+ adcq %rdx, %rdx
+ movq $0x7fffffffffffffff, %rax
+ movq %rdx, %r11
+ sarq $63, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Sub modulus (if overflow)
+ subq %rcx, %r8
+ sbbq %rdx, %r9
+ sbbq %rdx, %r10
+ sbbq %rax, %r11
+ movq %r8, (%rsi)
+ movq %r9, 8(%rsi)
+ movq %r10, 16(%rsi)
+ movq %r11, 24(%rsi)
+ movq 8(%rsp), %rbx
+ movq 16(%rsp), %rbp
+ # Add
+ movq (%rbp), %r8
+ movq 8(%rbp), %r9
+ movq 16(%rbp), %r10
+ movq 24(%rbp), %rdx
+ movq %r8, %r12
+ addq (%rbx), %r8
+ movq %r9, %r13
+ adcq 8(%rbx), %r9
+ movq %r10, %r14
+ adcq 16(%rbx), %r10
+ movq %rdx, %r15
+ adcq 24(%rbx), %rdx
+ movq $-19, %rcx
+ movq %rdx, %r11
+ movq $0x7fffffffffffffff, %rax
+ sarq $63, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Sub modulus (if overflow)
+ subq %rcx, %r8
+ sbbq %rdx, %r9
+ sbbq %rdx, %r10
+ sbbq %rax, %r11
+ # Sub
+ subq (%rbx), %r12
+ movq $0x00, %rdx
+ sbbq 8(%rbx), %r13
+ movq $-19, %rcx
+ sbbq 16(%rbx), %r14
+ movq $0x7fffffffffffffff, %rax
+ sbbq 24(%rbx), %r15
+ sbbq $0x00, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Add modulus (if underflow)
+ addq %rcx, %r12
+ adcq %rdx, %r13
+ adcq %rdx, %r14
+ adcq %rax, %r15
+ movq %r8, (%rbx)
+ movq %r9, 8(%rbx)
+ movq %r10, 16(%rbx)
+ movq %r11, 24(%rbx)
+ movq %r12, (%rdi)
+ movq %r13, 8(%rdi)
+ movq %r14, 16(%rdi)
+ movq %r15, 24(%rdi)
+ movq 24(%rsp), %rdi
+ # Add
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %rdx
+ movq %r8, %r12
+ addq (%rdi), %r8
+ movq %r9, %r13
+ adcq 8(%rdi), %r9
+ movq %r10, %r14
+ adcq 16(%rdi), %r10
+ movq %rdx, %r15
+ adcq 24(%rdi), %rdx
+ movq $-19, %rcx
+ movq %rdx, %r11
+ movq $0x7fffffffffffffff, %rax
+ sarq $63, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Sub modulus (if overflow)
+ subq %rcx, %r8
+ sbbq %rdx, %r9
+ sbbq %rdx, %r10
+ sbbq %rax, %r11
+ # Sub
+ subq (%rdi), %r12
+ movq $0x00, %rdx
+ sbbq 8(%rdi), %r13
+ movq $-19, %rcx
+ sbbq 16(%rdi), %r14
+ movq $0x7fffffffffffffff, %rax
+ sbbq 24(%rdi), %r15
+ sbbq $0x00, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Add modulus (if underflow)
+ addq %rcx, %r12
+ adcq %rdx, %r13
+ adcq %rdx, %r14
+ adcq %rax, %r15
+ movq %r8, (%rbp)
+ movq %r9, 8(%rbp)
+ movq %r10, 16(%rbp)
+ movq %r11, 24(%rbp)
+ movq %r12, (%rdi)
+ movq %r13, 8(%rdi)
+ movq %r14, 16(%rdi)
+ movq %r15, 24(%rdi)
+ addq $0x50, %rsp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size fe_ge_add_avx2,.-fe_ge_add_avx2
+#endif /* __APPLE__ */
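+ # The "Reduce" blocks rely on 2^255 = 19 (mod p) to fold the 512-bit
+ # product back into four limbs:
+ #   - the shldq chain shifts bits 255 and up down into r12..r15 and masks
+ #     bit 255 out of r11, splitting the product as lo + 2^255*hi,
+ #   - hi is multiplied by 19 and accumulated onto lo,
+ #   - "Overflow" combines the carry word with bit 255 of the low part and
+ #     folds it in again (times 19),
+ #   - "Reduce if top bit set" repeats that fold once more, so the value
+ #     that is stored fits in four limbs with the top bit clear.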
+#ifndef __APPLE__
+.text
+.globl fe_ge_sub_avx2
+.type fe_ge_sub_avx2,@function
+.align 4
+fe_ge_sub_avx2:
+#else
+.section __TEXT,__text
+.globl _fe_ge_sub_avx2
+.p2align 2
+_fe_ge_sub_avx2:
+#endif /* __APPLE__ */
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $0x50, %rsp
+ movq %rdi, (%rsp)
+ movq %rsi, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rcx, 24(%rsp)
+ movq %r8, 32(%rsp)
+ movq %r9, 40(%rsp)
+ movq 8(%rsp), %rsi
+ movq 40(%rsp), %rbx
+ movq 32(%rsp), %rbp
+ # Add
+ movq (%rbx), %r8
+ movq 8(%rbx), %r9
+ movq 16(%rbx), %r10
+ movq 24(%rbx), %rdx
+ movq %r8, %r12
+ addq (%rbp), %r8
+ movq %r9, %r13
+ adcq 8(%rbp), %r9
+ movq %r10, %r14
+ adcq 16(%rbp), %r10
+ movq %rdx, %r15
+ adcq 24(%rbp), %rdx
+ movq $-19, %rcx
+ movq %rdx, %r11
+ movq $0x7fffffffffffffff, %rax
+ sarq $63, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Sub modulus (if overflow)
+ subq %rcx, %r8
+ sbbq %rdx, %r9
+ sbbq %rdx, %r10
+ sbbq %rax, %r11
+ # Sub
+ subq (%rbp), %r12
+ movq $0x00, %rdx
+ sbbq 8(%rbp), %r13
+ movq $-19, %rcx
+ sbbq 16(%rbp), %r14
+ movq $0x7fffffffffffffff, %rax
+ sbbq 24(%rbp), %r15
+ sbbq $0x00, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Add modulus (if underflow)
+ addq %rcx, %r12
+ adcq %rdx, %r13
+ adcq %rdx, %r14
+ adcq %rax, %r15
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq %r12, (%rsi)
+ movq %r13, 8(%rsi)
+ movq %r14, 16(%rsi)
+ movq %r15, 24(%rsi)
+ movq 16(%rsp), %rbx
+ movq 176(%rsp), %rbp
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbp), %rdx
+ mulxq (%rdi), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rdi), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rdi), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rbp), %rdx
+ mulxq 8(%rdi), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rbp), %rdx
+ mulxq (%rdi), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rdi), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rbp), %rdx
+ mulxq 8(%rdi), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rdi), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rbp), %rdx
+ mulxq 8(%rdi), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rbp), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rdi), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rbp), %rdx
+ mulxq 16(%rdi), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rbp), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rdi), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rdi), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rbp), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rdi), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rbp), %rdx
+ mulxq 16(%rdi), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rbp), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rdi), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rbx)
+ movq %r9, 8(%rbx)
+ movq %r10, 16(%rbx)
+ movq %r11, 24(%rbx)
+ movq 168(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rdx
+ mulxq (%rsi), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rsi), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rsi), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rbx), %rdx
+ mulxq 8(%rsi), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rbx), %rdx
+ mulxq (%rsi), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rsi), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rbx), %rdx
+ mulxq 8(%rsi), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rsi), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rbx), %rdx
+ mulxq 8(%rsi), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rbx), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rsi), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rbx), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rbx), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rsi), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rsi), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rbx), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rsi), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rbx), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rbx), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rsi), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rsi)
+ movq %r9, 8(%rsi)
+ movq %r10, 16(%rsi)
+ movq %r11, 24(%rsi)
+ movq 24(%rsp), %rsi
+ movq 160(%rsp), %rbx
+ movq 144(%rsp), %rbp
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbp), %rdx
+ mulxq (%rbx), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rbx), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rbx), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rbp), %rdx
+ mulxq 8(%rbx), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rbp), %rdx
+ mulxq (%rbx), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rbx), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rbp), %rdx
+ mulxq 8(%rbx), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rbx), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rbp), %rdx
+ mulxq 8(%rbx), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rbp), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rbx), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rbp), %rdx
+ mulxq 16(%rbx), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rbp), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rbx), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rbx), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rbp), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rbx), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rbp), %rdx
+ mulxq 16(%rbx), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rbp), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rbx), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rsi)
+ movq %r9, 8(%rsi)
+ movq %r10, 16(%rsi)
+ movq %r11, 24(%rsi)
+ movq 136(%rsp), %rsi
+ movq 152(%rsp), %rbx
+ # Multiply
+ # A[0] * B[0]
+ movq (%rbx), %rdx
+ mulxq (%rsi), %r8, %r9
+ # A[2] * B[0]
+ mulxq 16(%rsi), %r10, %r11
+ # A[1] * B[0]
+ mulxq 8(%rsi), %rcx, %rax
+ xorq %r15, %r15
+ adcxq %rcx, %r9
+ # A[1] * B[3]
+ movq 24(%rbx), %rdx
+ mulxq 8(%rsi), %r12, %r13
+ adcxq %rax, %r10
+ # A[0] * B[1]
+ movq 8(%rbx), %rdx
+ mulxq (%rsi), %rcx, %rax
+ adoxq %rcx, %r9
+ # A[2] * B[1]
+ mulxq 16(%rsi), %rcx, %r14
+ adoxq %rax, %r10
+ adcxq %rcx, %r11
+ # A[1] * B[2]
+ movq 16(%rbx), %rdx
+ mulxq 8(%rsi), %rcx, %rax
+ adcxq %r14, %r12
+ adoxq %rcx, %r11
+ adcxq %r15, %r13
+ adoxq %rax, %r12
+ # A[0] * B[2]
+ mulxq (%rsi), %rcx, %rax
+ adoxq %r15, %r13
+ xorq %r14, %r14
+ adcxq %rcx, %r10
+ # A[1] * B[1]
+ movq 8(%rbx), %rdx
+ mulxq 8(%rsi), %rdx, %rcx
+ adcxq %rax, %r11
+ adoxq %rdx, %r10
+ # A[3] * B[1]
+ movq 8(%rbx), %rdx
+ adoxq %rcx, %r11
+ mulxq 24(%rsi), %rcx, %rax
+ adcxq %rcx, %r12
+ # A[2] * B[2]
+ movq 16(%rbx), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rax, %r13
+ adoxq %rdx, %r12
+ # A[3] * B[3]
+ movq 24(%rbx), %rdx
+ adoxq %rcx, %r13
+ mulxq 24(%rsi), %rcx, %rax
+ adoxq %r15, %r14
+ adcxq %rcx, %r14
+ # A[0] * B[3]
+ mulxq (%rsi), %rdx, %rcx
+ adcxq %rax, %r15
+ xorq %rax, %rax
+ adcxq %rdx, %r11
+ # A[3] * B[0]
+ movq (%rbx), %rdx
+ adcxq %rcx, %r12
+ mulxq 24(%rsi), %rdx, %rcx
+ adoxq %rdx, %r11
+ adoxq %rcx, %r12
+ # A[2] * B[3]
+ movq 24(%rbx), %rdx
+ mulxq 16(%rsi), %rdx, %rcx
+ adcxq %rdx, %r13
+ # A[3] * B[2]
+ movq 16(%rbx), %rdx
+ adcxq %rcx, %r14
+ mulxq 24(%rsi), %rcx, %rdx
+ adcxq %rax, %r15
+ adoxq %rcx, %r13
+ adoxq %rdx, %r14
+ adoxq %rax, %r15
+ # Reduce
+ movq $0x7fffffffffffffff, %rax
+ # Move top half into t4-t7 and remove top bit from t3
+ shldq $0x01, %r14, %r15
+ shldq $0x01, %r13, %r14
+ shldq $0x01, %r12, %r13
+ shldq $0x01, %r11, %r12
+ andq %rax, %r11
+ # Multiply top half by 19
+ movq $19, %rdx
+ xorq %rax, %rax
+ mulxq %r12, %rcx, %r12
+ adcxq %rcx, %r8
+ adoxq %r12, %r9
+ mulxq %r13, %rcx, %r13
+ adcxq %rcx, %r9
+ adoxq %r13, %r10
+ mulxq %r14, %rcx, %r14
+ adcxq %rcx, %r10
+ adoxq %r14, %r11
+ mulxq %r15, %r15, %rdx
+ adcxq %r15, %r11
+ adoxq %rax, %rdx
+ adcxq %rax, %rdx
+ # Overflow
+ shldq $0x01, %r11, %rdx
+ movq $0x7fffffffffffffff, %rax
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Reduce if top bit set
+ movq %r11, %rdx
+ shrq $63, %rdx
+ imulq $19, %rdx, %rcx
+ andq %rax, %r11
+ addq %rcx, %r8
+ adcq $0x00, %r9
+ adcq $0x00, %r10
+ adcq $0x00, %r11
+ # Store
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ leaq 48(%rsp), %rsi
+ # Double
+ movq (%rdi), %r8
+ movq 8(%rdi), %r9
+ addq %r8, %r8
+ movq 16(%rdi), %r10
+ adcq %r9, %r9
+ movq 24(%rdi), %rdx
+ adcq %r10, %r10
+ movq $-19, %rcx
+ adcq %rdx, %rdx
+ movq $0x7fffffffffffffff, %rax
+ movq %rdx, %r11
+ sarq $63, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Sub modulus (if overflow)
+ subq %rcx, %r8
+ sbbq %rdx, %r9
+ sbbq %rdx, %r10
+ sbbq %rax, %r11
+ movq %r8, (%rsi)
+ movq %r9, 8(%rsi)
+ movq %r10, 16(%rsi)
+ movq %r11, 24(%rsi)
+ movq 8(%rsp), %rbx
+ movq 16(%rsp), %rbp
+ # Add
+ movq (%rbp), %r8
+ movq 8(%rbp), %r9
+ movq 16(%rbp), %r10
+ movq 24(%rbp), %rdx
+ movq %r8, %r12
+ addq (%rbx), %r8
+ movq %r9, %r13
+ adcq 8(%rbx), %r9
+ movq %r10, %r14
+ adcq 16(%rbx), %r10
+ movq %rdx, %r15
+ adcq 24(%rbx), %rdx
+ movq $-19, %rcx
+ movq %rdx, %r11
+ movq $0x7fffffffffffffff, %rax
+ sarq $63, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Sub modulus (if overflow)
+ subq %rcx, %r8
+ sbbq %rdx, %r9
+ sbbq %rdx, %r10
+ sbbq %rax, %r11
+ # Sub
+ subq (%rbx), %r12
+ movq $0x00, %rdx
+ sbbq 8(%rbx), %r13
+ movq $-19, %rcx
+ sbbq 16(%rbx), %r14
+ movq $0x7fffffffffffffff, %rax
+ sbbq 24(%rbx), %r15
+ sbbq $0x00, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Add modulus (if underflow)
+ addq %rcx, %r12
+ adcq %rdx, %r13
+ adcq %rdx, %r14
+ adcq %rax, %r15
+ movq %r8, (%rbx)
+ movq %r9, 8(%rbx)
+ movq %r10, 16(%rbx)
+ movq %r11, 24(%rbx)
+ movq %r12, (%rdi)
+ movq %r13, 8(%rdi)
+ movq %r14, 16(%rdi)
+ movq %r15, 24(%rdi)
+ movq 24(%rsp), %rdi
+ # Add
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %rdx
+ movq %r8, %r12
+ addq (%rdi), %r8
+ movq %r9, %r13
+ adcq 8(%rdi), %r9
+ movq %r10, %r14
+ adcq 16(%rdi), %r10
+ movq %rdx, %r15
+ adcq 24(%rdi), %rdx
+ movq $-19, %rcx
+ movq %rdx, %r11
+ movq $0x7fffffffffffffff, %rax
+ sarq $63, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Sub modulus (if overflow)
+ subq %rcx, %r8
+ sbbq %rdx, %r9
+ sbbq %rdx, %r10
+ sbbq %rax, %r11
+ # Sub
+ subq (%rdi), %r12
+ movq $0x00, %rdx
+ sbbq 8(%rdi), %r13
+ movq $-19, %rcx
+ sbbq 16(%rdi), %r14
+ movq $0x7fffffffffffffff, %rax
+ sbbq 24(%rdi), %r15
+ sbbq $0x00, %rdx
+ # Mask the modulus
+ andq %rdx, %rcx
+ andq %rdx, %rax
+ # Add modulus (if underflow)
+ addq %rcx, %r12
+ adcq %rdx, %r13
+ adcq %rdx, %r14
+ adcq %rax, %r15
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq %r12, (%rbp)
+ movq %r13, 8(%rbp)
+ movq %r14, 16(%rbp)
+ movq %r15, 24(%rbp)
+ addq $0x50, %rsp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size fe_ge_sub_avx2,.-fe_ge_sub_avx2
+#endif /* __APPLE__ */
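+ # Calling-convention note for the fe_ge_*_avx2 routines above: the first
+ # six field-element pointers arrive in rdi, rsi, rdx, rcx, r8, r9 (System V
+ # AMD64 ABI) and are spilled to the local frame at (%rsp)..40(%rsp).
+ # The remaining pointers are read from the caller's stack; after the six
+ # pushes, the return address and the local area, they start at 104(%rsp)
+ # for the 48-byte frames (madd/msub) and at 136(%rsp) for the 0x50-byte
+ # frames (add/sub).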
+#endif /* HAVE_INTEL_AVX2 */