path: root/FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/sha256_asm.S
Diffstat (limited to 'FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/sha256_asm.S')
-rw-r--r--  FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/sha256_asm.S  22653
1 file changed, 22653 insertions, 0 deletions
diff --git a/FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/sha256_asm.S b/FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/sha256_asm.S
new file mode 100644
index 000000000..c433d341c
--- /dev/null
+++ b/FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/sha256_asm.S
@@ -0,0 +1,22653 @@
+/* sha256_asm
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifndef HAVE_INTEL_AVX1
+#define HAVE_INTEL_AVX1
+#endif /* HAVE_INTEL_AVX1 */
+#ifndef NO_AVX2_SUPPORT
+#define HAVE_INTEL_AVX2
+#endif /* NO_AVX2_SUPPORT */
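+/* Force-enable the AVX1 code in this file; the AVX2 variants guarded by
+ * HAVE_INTEL_AVX2 are enabled unless NO_AVX2_SUPPORT is defined. */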
+
+#ifdef HAVE_INTEL_AVX1
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
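+/* SHA-256 round constants K[0..63]: the first 32 bits of the fractional parts
+ * of the cube roots of the first 64 primes (FIPS 180-4). */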
+L_avx1_sha256_k:
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0xfc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x6ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
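+/* Byte-shuffle masks used by the message schedule: shuf_00BA packs the two
+ * newly computed words into the low dword lanes (the 0xff bytes zero the
+ * rest) and shuf_DC00 packs the other pair into the high lanes, so the two
+ * halves can be added back together into a full W[t..t+3] vector. */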
+L_avx1_sha256_shuf_00BA:
+.quad 0xb0a090803020100, 0xffffffffffffffff
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx1_sha256_shuf_DC00:
+.quad 0xffffffffffffffff, 0xb0a090803020100
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
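+/* pshufb mask that byte-swaps each 32-bit lane, converting the big-endian
+ * message words of the input block to little-endian for the arithmetic. */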
+L_avx1_sha256_flip_mask:
+.quad 0x405060700010203, 0xc0d0e0f08090a0b
+#ifndef __APPLE__
+.text
+.globl Transform_Sha256_AVX1
+.type Transform_Sha256_AVX1,@function
+.align 4
+Transform_Sha256_AVX1:
+#else
+.section __TEXT,__text
+.globl _Transform_Sha256_AVX1
+.p2align 2
+_Transform_Sha256_AVX1:
+#endif /* __APPLE__ */
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $0x40, %rsp
+ leaq 32(%rdi), %rax
+ vmovdqa L_avx1_sha256_flip_mask(%rip), %xmm13
+ vmovdqa L_avx1_sha256_shuf_00BA(%rip), %xmm11
+ vmovdqa L_avx1_sha256_shuf_DC00(%rip), %xmm12
+ movl (%rdi), %r8d
+ movl 4(%rdi), %r9d
+ movl 8(%rdi), %r10d
+ movl 12(%rdi), %r11d
+ movl 16(%rdi), %r12d
+ movl 20(%rdi), %r13d
+ movl 24(%rdi), %r14d
+ movl 28(%rdi), %r15d
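+ # Working variables a..h are kept in r8d..r15d for the whole block.
+ # %rdi is the SHA-256 context: digest words at offset 0 and, per the
+ # leaq above, the 64-byte message block read from offset 32.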
+ # X0, X1, X2, X3 = W[0..15]
+ vmovdqu (%rax), %xmm0
+ vmovdqu 16(%rax), %xmm1
+ vpshufb %xmm13, %xmm0, %xmm0
+ vpshufb %xmm13, %xmm1, %xmm1
+ vmovdqu 32(%rax), %xmm2
+ vmovdqu 48(%rax), %xmm3
+ vpshufb %xmm13, %xmm2, %xmm2
+ vpshufb %xmm13, %xmm3, %xmm3
+ movl %r9d, %ebx
+ movl %r12d, %edx
+ xorl %r10d, %ebx
+ # set_w_k_xfer_4: 0
+ vpaddd 0+L_avx1_sha256_k(%rip), %xmm0, %xmm4
+ vpaddd 16+L_avx1_sha256_k(%rip), %xmm1, %xmm5
+ vmovdqu %xmm4, (%rsp)
+ vmovdqu %xmm5, 16(%rsp)
+ vpaddd 32+L_avx1_sha256_k(%rip), %xmm2, %xmm6
+ vpaddd 48+L_avx1_sha256_k(%rip), %xmm3, %xmm7
+ vmovdqu %xmm6, 32(%rsp)
+ vmovdqu %xmm7, 48(%rsp)
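+ # W[0..15] + K[0..15] have been computed in xmm4-xmm7 and spilled to the
+ # 64-byte stack scratch area, so each scalar round below can pick up its
+ # W+K term straight from (%rsp).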
+ # msg_sched: 0-3
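+ # Each msg_sched block computes four new schedule words
+ # W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16], interleaved
+ # with the scalar rounds: sigma0 = ROTR7 ^ ROTR18 ^ SHR3 via the
+ # vpsrld/vpslld/vpor pairs, sigma1 = ROTR17 ^ ROTR19 ^ SHR10 via qword
+ # shifts of the word pair spread into qword lanes by vpshufd.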
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm0, %xmm1, %xmm5
+ vpalignr $4, %xmm2, %xmm3, %xmm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl (%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 4(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm3, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm0, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 8(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 12(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %xmm4, %xmm9, %xmm0
+ # msg_sched done: 0-3
+ # msg_sched: 4-7
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm1, %xmm2, %xmm5
+ vpalignr $4, %xmm3, %xmm0, %xmm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 16(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 20(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm0, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm1, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 24(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 28(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %xmm4, %xmm9, %xmm1
+ # msg_sched done: 4-7
+ # msg_sched: 8-11
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm2, %xmm3, %xmm5
+ vpalignr $4, %xmm0, %xmm1, %xmm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl 32(%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 36(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm1, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm2, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 40(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 44(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %xmm4, %xmm9, %xmm2
+ # msg_sched done: 8-11
+ # msg_sched: 12-15
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm3, %xmm0, %xmm5
+ vpalignr $4, %xmm1, %xmm2, %xmm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 48(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 52(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm2, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm3, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 56(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 60(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %xmm4, %xmm9, %xmm3
+ # msg_sched done: 12-15
+ # set_w_k_xfer_4: 4
+ vpaddd 64+L_avx1_sha256_k(%rip), %xmm0, %xmm4
+ vpaddd 80+L_avx1_sha256_k(%rip), %xmm1, %xmm5
+ vmovdqu %xmm4, (%rsp)
+ vmovdqu %xmm5, 16(%rsp)
+ vpaddd 96+L_avx1_sha256_k(%rip), %xmm2, %xmm6
+ vpaddd 112+L_avx1_sha256_k(%rip), %xmm3, %xmm7
+ vmovdqu %xmm6, 32(%rsp)
+ vmovdqu %xmm7, 48(%rsp)
+ # msg_sched: 0-3
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm0, %xmm1, %xmm5
+ vpalignr $4, %xmm2, %xmm3, %xmm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl (%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 4(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm3, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm0, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 8(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 12(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %xmm4, %xmm9, %xmm0
+ # msg_sched done: 0-3
+ # msg_sched: 4-7
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm1, %xmm2, %xmm5
+ vpalignr $4, %xmm3, %xmm0, %xmm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 16(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 20(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm0, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm1, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 24(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 28(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %xmm4, %xmm9, %xmm1
+ # msg_sched done: 4-7
+ # msg_sched: 8-11
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm2, %xmm3, %xmm5
+ vpalignr $4, %xmm0, %xmm1, %xmm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl 32(%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 36(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm1, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm2, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 40(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 44(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %xmm4, %xmm9, %xmm2
+ # msg_sched done: 8-11
+ # msg_sched: 12-15
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm3, %xmm0, %xmm5
+ vpalignr $4, %xmm1, %xmm2, %xmm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 48(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 52(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm2, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm3, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 56(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 60(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %xmm4, %xmm9, %xmm3
+ # msg_sched done: 12-15
+ # set_w_k_xfer_4: 8
+ vpaddd 128+L_avx1_sha256_k(%rip), %xmm0, %xmm4
+ vpaddd 144+L_avx1_sha256_k(%rip), %xmm1, %xmm5
+ vmovdqu %xmm4, (%rsp)
+ vmovdqu %xmm5, 16(%rsp)
+ vpaddd 160+L_avx1_sha256_k(%rip), %xmm2, %xmm6
+ vpaddd 176+L_avx1_sha256_k(%rip), %xmm3, %xmm7
+ vmovdqu %xmm6, 32(%rsp)
+ vmovdqu %xmm7, 48(%rsp)
+ # msg_sched: 0-3
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm0, %xmm1, %xmm5
+ vpalignr $4, %xmm2, %xmm3, %xmm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl (%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 4(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm3, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm0, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 8(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 12(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %xmm4, %xmm9, %xmm0
+ # msg_sched done: 0-3
+ # msg_sched: 4-7
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm1, %xmm2, %xmm5
+ vpalignr $4, %xmm3, %xmm0, %xmm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 16(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 20(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm0, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm1, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 24(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 28(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %xmm4, %xmm9, %xmm1
+ # msg_sched done: 4-7
+ # msg_sched: 8-11
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm2, %xmm3, %xmm5
+ vpalignr $4, %xmm0, %xmm1, %xmm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl 32(%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 36(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm1, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm2, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 40(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 44(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %xmm4, %xmm9, %xmm2
+ # msg_sched done: 8-11
+ # msg_sched: 12-15
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm3, %xmm0, %xmm5
+ vpalignr $4, %xmm1, %xmm2, %xmm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 48(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 52(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm2, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm3, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 56(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 60(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %xmm4, %xmm9, %xmm3
+ # msg_sched done: 12-15
+ # set_w_k_xfer_4: 12
+ vpaddd 192+L_avx1_sha256_k(%rip), %xmm0, %xmm4
+ vpaddd 208+L_avx1_sha256_k(%rip), %xmm1, %xmm5
+ vmovdqu %xmm4, (%rsp)
+ vmovdqu %xmm5, 16(%rsp)
+ vpaddd 224+L_avx1_sha256_k(%rip), %xmm2, %xmm6
+ vpaddd 240+L_avx1_sha256_k(%rip), %xmm3, %xmm7
+ vmovdqu %xmm6, 32(%rsp)
+ vmovdqu %xmm7, 48(%rsp)
+ # rnd_all_4: 0-3
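+ # Last 16 rounds: no further message scheduling, just the compression
+ # round.  The rorl $14/$5/$6 chain on %edx computes Sigma1(e) =
+ # ROTR6 ^ ROTR11 ^ ROTR25, the rorl $9/$11/$2 chain on %ecx computes
+ # Sigma0(a) = ROTR2 ^ ROTR13 ^ ROTR22, Ch(e,f,g) is ((f ^ g) & e) ^ g, and
+ # Maj(a,b,c) is ((a ^ b) & (b ^ c)) ^ b with the a^b / b^c terms carried
+ # in %eax/%ebx between consecutive rounds.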
+ addl (%rsp), %r15d
+ movl %r13d, %ecx
+ movl %r9d, %eax
+ xorl %r14d, %ecx
+ rorl $14, %edx
+ andl %r12d, %ecx
+ xorl %r12d, %edx
+ xorl %r14d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r15d
+ xorl %r12d, %edx
+ xorl %r8d, %eax
+ rorl $6, %edx
+ movl %r8d, %ecx
+ addl %edx, %r15d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ addl 4(%rsp), %r14d
+ movl %r12d, %ecx
+ movl %r8d, %ebx
+ xorl %r13d, %ecx
+ rorl $14, %edx
+ andl %r11d, %ecx
+ xorl %r11d, %edx
+ xorl %r13d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r14d
+ xorl %r11d, %edx
+ xorl %r15d, %ebx
+ rorl $6, %edx
+ movl %r15d, %ecx
+ addl %edx, %r14d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ addl 8(%rsp), %r13d
+ movl %r11d, %ecx
+ movl %r15d, %eax
+ xorl %r12d, %ecx
+ rorl $14, %edx
+ andl %r10d, %ecx
+ xorl %r10d, %edx
+ xorl %r12d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r13d
+ xorl %r10d, %edx
+ xorl %r14d, %eax
+ rorl $6, %edx
+ movl %r14d, %ecx
+ addl %edx, %r13d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ addl 12(%rsp), %r12d
+ movl %r10d, %ecx
+ movl %r14d, %ebx
+ xorl %r11d, %ecx
+ rorl $14, %edx
+ andl %r9d, %ecx
+ xorl %r9d, %edx
+ xorl %r11d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r12d
+ xorl %r9d, %edx
+ xorl %r13d, %ebx
+ rorl $6, %edx
+ movl %r13d, %ecx
+ addl %edx, %r12d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ # rnd_all_4: 1-4
+ addl 16(%rsp), %r11d
+ movl %r9d, %ecx
+ movl %r13d, %eax
+ xorl %r10d, %ecx
+ rorl $14, %edx
+ andl %r8d, %ecx
+ xorl %r8d, %edx
+ xorl %r10d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r11d
+ xorl %r8d, %edx
+ xorl %r12d, %eax
+ rorl $6, %edx
+ movl %r12d, %ecx
+ addl %edx, %r11d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ addl 20(%rsp), %r10d
+ movl %r8d, %ecx
+ movl %r12d, %ebx
+ xorl %r9d, %ecx
+ rorl $14, %edx
+ andl %r15d, %ecx
+ xorl %r15d, %edx
+ xorl %r9d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r10d
+ xorl %r15d, %edx
+ xorl %r11d, %ebx
+ rorl $6, %edx
+ movl %r11d, %ecx
+ addl %edx, %r10d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ addl 24(%rsp), %r9d
+ movl %r15d, %ecx
+ movl %r11d, %eax
+ xorl %r8d, %ecx
+ rorl $14, %edx
+ andl %r14d, %ecx
+ xorl %r14d, %edx
+ xorl %r8d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r9d
+ xorl %r14d, %edx
+ xorl %r10d, %eax
+ rorl $6, %edx
+ movl %r10d, %ecx
+ addl %edx, %r9d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ addl 28(%rsp), %r8d
+ movl %r14d, %ecx
+ movl %r10d, %ebx
+ xorl %r15d, %ecx
+ rorl $14, %edx
+ andl %r13d, %ecx
+ xorl %r13d, %edx
+ xorl %r15d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r8d
+ xorl %r13d, %edx
+ xorl %r9d, %ebx
+ rorl $6, %edx
+ movl %r9d, %ecx
+ addl %edx, %r8d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ # rnd_all_4: 2-5
+ addl 32(%rsp), %r15d
+ movl %r13d, %ecx
+ movl %r9d, %eax
+ xorl %r14d, %ecx
+ rorl $14, %edx
+ andl %r12d, %ecx
+ xorl %r12d, %edx
+ xorl %r14d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r15d
+ xorl %r12d, %edx
+ xorl %r8d, %eax
+ rorl $6, %edx
+ movl %r8d, %ecx
+ addl %edx, %r15d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ addl 36(%rsp), %r14d
+ movl %r12d, %ecx
+ movl %r8d, %ebx
+ xorl %r13d, %ecx
+ rorl $14, %edx
+ andl %r11d, %ecx
+ xorl %r11d, %edx
+ xorl %r13d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r14d
+ xorl %r11d, %edx
+ xorl %r15d, %ebx
+ rorl $6, %edx
+ movl %r15d, %ecx
+ addl %edx, %r14d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ addl 40(%rsp), %r13d
+ movl %r11d, %ecx
+ movl %r15d, %eax
+ xorl %r12d, %ecx
+ rorl $14, %edx
+ andl %r10d, %ecx
+ xorl %r10d, %edx
+ xorl %r12d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r13d
+ xorl %r10d, %edx
+ xorl %r14d, %eax
+ rorl $6, %edx
+ movl %r14d, %ecx
+ addl %edx, %r13d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ addl 44(%rsp), %r12d
+ movl %r10d, %ecx
+ movl %r14d, %ebx
+ xorl %r11d, %ecx
+ rorl $14, %edx
+ andl %r9d, %ecx
+ xorl %r9d, %edx
+ xorl %r11d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r12d
+ xorl %r9d, %edx
+ xorl %r13d, %ebx
+ rorl $6, %edx
+ movl %r13d, %ecx
+ addl %edx, %r12d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ # rnd_all_4: 3-6
+ addl 48(%rsp), %r11d
+ movl %r9d, %ecx
+ movl %r13d, %eax
+ xorl %r10d, %ecx
+ rorl $14, %edx
+ andl %r8d, %ecx
+ xorl %r8d, %edx
+ xorl %r10d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r11d
+ xorl %r8d, %edx
+ xorl %r12d, %eax
+ rorl $6, %edx
+ movl %r12d, %ecx
+ addl %edx, %r11d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ addl 52(%rsp), %r10d
+ movl %r8d, %ecx
+ movl %r12d, %ebx
+ xorl %r9d, %ecx
+ rorl $14, %edx
+ andl %r15d, %ecx
+ xorl %r15d, %edx
+ xorl %r9d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r10d
+ xorl %r15d, %edx
+ xorl %r11d, %ebx
+ rorl $6, %edx
+ movl %r11d, %ecx
+ addl %edx, %r10d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ addl 56(%rsp), %r9d
+ movl %r15d, %ecx
+ movl %r11d, %eax
+ xorl %r8d, %ecx
+ rorl $14, %edx
+ andl %r14d, %ecx
+ xorl %r14d, %edx
+ xorl %r8d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r9d
+ xorl %r14d, %edx
+ xorl %r10d, %eax
+ rorl $6, %edx
+ movl %r10d, %ecx
+ addl %edx, %r9d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ addl 60(%rsp), %r8d
+ movl %r14d, %ecx
+ movl %r10d, %ebx
+ xorl %r15d, %ecx
+ rorl $14, %edx
+ andl %r13d, %ecx
+ xorl %r13d, %edx
+ xorl %r15d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r8d
+ xorl %r13d, %edx
+ xorl %r9d, %ebx
+ rorl $6, %edx
+ movl %r9d, %ecx
+ addl %edx, %r8d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
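+ # Feed the working variables back into the hash state (h[i] += a..h),
+ # then return 0 in %rax.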
+ addl %r8d, (%rdi)
+ addl %r9d, 4(%rdi)
+ addl %r10d, 8(%rdi)
+ addl %r11d, 12(%rdi)
+ addl %r12d, 16(%rdi)
+ addl %r13d, 20(%rdi)
+ addl %r14d, 24(%rdi)
+ addl %r15d, 28(%rdi)
+ xorq %rax, %rax
+ vzeroupper
+ addq $0x40, %rsp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size Transform_Sha256_AVX1,.-Transform_Sha256_AVX1
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl Transform_Sha256_AVX1_Len
+.type Transform_Sha256_AVX1_Len,@function
+.align 4
+Transform_Sha256_AVX1_Len:
+#else
+.section __TEXT,__text
+.globl _Transform_Sha256_AVX1_Len
+.p2align 2
+_Transform_Sha256_AVX1_Len:
+#endif /* __APPLE__ */
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbp
+ movq %rsi, %rbp
+ movq %rdx, %rsi
+ subq $0x40, %rsp
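+ # Multi-block variant: the data pointer (2nd argument, %rsi on entry) is
+ # kept in %rbp and the length (3rd argument, %rdx) in %rsi; the loop at
+ # L_sha256_len_avx1_start presumably consumes one 64-byte block per pass.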
+ vmovdqa L_avx1_sha256_flip_mask(%rip), %xmm13
+ vmovdqa L_avx1_sha256_shuf_00BA(%rip), %xmm11
+ vmovdqa L_avx1_sha256_shuf_DC00(%rip), %xmm12
+ movl (%rdi), %r8d
+ movl 4(%rdi), %r9d
+ movl 8(%rdi), %r10d
+ movl 12(%rdi), %r11d
+ movl 16(%rdi), %r12d
+ movl 20(%rdi), %r13d
+ movl 24(%rdi), %r14d
+ movl 28(%rdi), %r15d
+ # Start of loop processing a block
+L_sha256_len_avx1_start:
+ # X0, X1, X2, X3 = W[0..15]
+ vmovdqu (%rbp), %xmm0
+ vmovdqu 16(%rbp), %xmm1
+ vpshufb %xmm13, %xmm0, %xmm0
+ vpshufb %xmm13, %xmm1, %xmm1
+ vmovdqu 32(%rbp), %xmm2
+ vmovdqu 48(%rbp), %xmm3
+ vpshufb %xmm13, %xmm2, %xmm2
+ vpshufb %xmm13, %xmm3, %xmm3
+ movl %r9d, %ebx
+ movl %r12d, %edx
+ xorl %r10d, %ebx
+ # set_w_k_xfer_4: 0
+ vpaddd 0+L_avx1_sha256_k(%rip), %xmm0, %xmm4
+ vpaddd 16+L_avx1_sha256_k(%rip), %xmm1, %xmm5
+ vmovdqu %xmm4, (%rsp)
+ vmovdqu %xmm5, 16(%rsp)
+ vpaddd 32+L_avx1_sha256_k(%rip), %xmm2, %xmm6
+ vpaddd 48+L_avx1_sha256_k(%rip), %xmm3, %xmm7
+ vmovdqu %xmm6, 32(%rsp)
+ vmovdqu %xmm7, 48(%rsp)
+ # msg_sched: 0-3
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm0, %xmm1, %xmm5
+ vpalignr $4, %xmm2, %xmm3, %xmm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl (%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 4(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm3, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm0, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 8(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 12(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %xmm4, %xmm9, %xmm0
+ # msg_sched done: 0-3
+ # msg_sched: 4-7
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm1, %xmm2, %xmm5
+ vpalignr $4, %xmm3, %xmm0, %xmm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 16(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 20(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm0, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm1, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 24(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 28(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %xmm4, %xmm9, %xmm1
+ # msg_sched done: 4-7
+ # msg_sched: 8-11
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm2, %xmm3, %xmm5
+ vpalignr $4, %xmm0, %xmm1, %xmm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl 32(%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 36(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm1, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm2, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 40(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 44(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %xmm4, %xmm9, %xmm2
+ # msg_sched done: 8-11
+ # msg_sched: 12-15
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm3, %xmm0, %xmm5
+ vpalignr $4, %xmm1, %xmm2, %xmm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 48(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 52(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm2, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm3, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 56(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 60(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %xmm4, %xmm9, %xmm3
+ # msg_sched done: 12-15
+ # set_w_k_xfer_4: 4
+ vpaddd 64+L_avx1_sha256_k(%rip), %xmm0, %xmm4
+ vpaddd 80+L_avx1_sha256_k(%rip), %xmm1, %xmm5
+ vmovdqu %xmm4, (%rsp)
+ vmovdqu %xmm5, 16(%rsp)
+ vpaddd 96+L_avx1_sha256_k(%rip), %xmm2, %xmm6
+ vpaddd 112+L_avx1_sha256_k(%rip), %xmm3, %xmm7
+ vmovdqu %xmm6, 32(%rsp)
+ vmovdqu %xmm7, 48(%rsp)
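+ # set_w_k_xfer_4: add the next 16 round constants K[t] to the scheduled words
+ # W[t] and park the sums on the stack; each scalar round then fetches its
+ # K[t]+W[t] with a single addl from (%rsp), 4(%rsp), ...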
+ # msg_sched: 0-3
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm0, %xmm1, %xmm5
+ vpalignr $4, %xmm2, %xmm3, %xmm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl (%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 4(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm3, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm0, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 8(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 12(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %xmm4, %xmm9, %xmm0
+ # msg_sched done: 0-3
+ # msg_sched: 4-7
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm1, %xmm2, %xmm5
+ vpalignr $4, %xmm3, %xmm0, %xmm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 16(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 20(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm0, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm1, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 24(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 28(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %xmm4, %xmm9, %xmm1
+ # msg_sched done: 4-7
+ # msg_sched: 8-11
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm2, %xmm3, %xmm5
+ vpalignr $4, %xmm0, %xmm1, %xmm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl 32(%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 36(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm1, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm2, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 40(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 44(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %xmm4, %xmm9, %xmm2
+ # msg_sched done: 8-11
+ # msg_sched: 12-15
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm3, %xmm0, %xmm5
+ vpalignr $4, %xmm1, %xmm2, %xmm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 48(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 52(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm2, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm3, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 56(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 60(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %xmm4, %xmm9, %xmm3
+ # msg_sched done: 12-15
+ # set_w_k_xfer_4: 8
+ vpaddd 128+L_avx1_sha256_k(%rip), %xmm0, %xmm4
+ vpaddd 144+L_avx1_sha256_k(%rip), %xmm1, %xmm5
+ vmovdqu %xmm4, (%rsp)
+ vmovdqu %xmm5, 16(%rsp)
+ vpaddd 160+L_avx1_sha256_k(%rip), %xmm2, %xmm6
+ vpaddd 176+L_avx1_sha256_k(%rip), %xmm3, %xmm7
+ vmovdqu %xmm6, 32(%rsp)
+ vmovdqu %xmm7, 48(%rsp)
+ # msg_sched: 0-3
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm0, %xmm1, %xmm5
+ vpalignr $4, %xmm2, %xmm3, %xmm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl (%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 4(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm3, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm0, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 8(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 12(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %xmm4, %xmm9, %xmm0
+ # msg_sched done: 0-3
+ # msg_sched: 4-7
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm1, %xmm2, %xmm5
+ vpalignr $4, %xmm3, %xmm0, %xmm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 16(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 20(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm0, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm1, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 24(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 28(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %xmm4, %xmm9, %xmm1
+ # msg_sched done: 4-7
+ # msg_sched: 8-11
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm2, %xmm3, %xmm5
+ vpalignr $4, %xmm0, %xmm1, %xmm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl 32(%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 36(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm1, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm2, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 40(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 44(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %xmm4, %xmm9, %xmm2
+ # msg_sched done: 8-11
+ # msg_sched: 12-15
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %xmm3, %xmm0, %xmm5
+ vpalignr $4, %xmm1, %xmm2, %xmm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 48(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %xmm5, %xmm8
+ vpslld $14, %xmm5, %xmm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %xmm6, %xmm7, %xmm6
+ vpor %xmm8, %xmm9, %xmm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 52(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %xmm5, %xmm9
+ vpxor %xmm6, %xmm8, %xmm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %xmm6, %xmm9, %xmm5
+ vpshufd $0xfa, %xmm2, %xmm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm3, %xmm4, %xmm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 56(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %xmm6, %xmm7, %xmm6
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 60(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %xmm6, %xmm8
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %xmm6, %xmm9
+ vpxor %xmm8, %xmm7, %xmm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %xmm9, %xmm8, %xmm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %xmm4, %xmm9, %xmm3
+ # msg_sched done: 12-15
+ # set_w_k_xfer_4: 12
+ vpaddd 192+L_avx1_sha256_k(%rip), %xmm0, %xmm4
+ vpaddd 208+L_avx1_sha256_k(%rip), %xmm1, %xmm5
+ vmovdqu %xmm4, (%rsp)
+ vmovdqu %xmm5, 16(%rsp)
+ vpaddd 224+L_avx1_sha256_k(%rip), %xmm2, %xmm6
+ vpaddd 240+L_avx1_sha256_k(%rip), %xmm3, %xmm7
+ vmovdqu %xmm6, 32(%rsp)
+ vmovdqu %xmm7, 48(%rsp)
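+ # Last transfer: W[48..63]+K[48..63] are now on the stack, so the remaining
+ # 16 rounds below run without any further message scheduling.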
+ # rnd_all_4: 0-3
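+ # Plain compression rounds for the final 16 t values:
+ #   T1 = h + Sigma1(e) + Ch(e,f,g) + (K[t] + W[t]),  d += T1,
+ #   new a = T1 + Sigma0(a) + Maj(a,b,c)   (register roles rotate each round).
+ # Sigma1(e) = ROTR6^ROTR11^ROTR25 comes from the rorl $14/$5/$6 chain with
+ # interleaved xors; Sigma0(a) = ROTR2^ROTR13^ROTR22 from the rorl $9/$11/$2
+ # chain.  Ch(e,f,g) is computed as ((f^g) & e) ^ g and Maj(a,b,c) as
+ # ((a^b) & (b^c)) ^ b, with a^b carried between rounds in %eax/%ebx.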
+ addl (%rsp), %r15d
+ movl %r13d, %ecx
+ movl %r9d, %eax
+ xorl %r14d, %ecx
+ rorl $14, %edx
+ andl %r12d, %ecx
+ xorl %r12d, %edx
+ xorl %r14d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r15d
+ xorl %r12d, %edx
+ xorl %r8d, %eax
+ rorl $6, %edx
+ movl %r8d, %ecx
+ addl %edx, %r15d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ addl 4(%rsp), %r14d
+ movl %r12d, %ecx
+ movl %r8d, %ebx
+ xorl %r13d, %ecx
+ rorl $14, %edx
+ andl %r11d, %ecx
+ xorl %r11d, %edx
+ xorl %r13d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r14d
+ xorl %r11d, %edx
+ xorl %r15d, %ebx
+ rorl $6, %edx
+ movl %r15d, %ecx
+ addl %edx, %r14d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ addl 8(%rsp), %r13d
+ movl %r11d, %ecx
+ movl %r15d, %eax
+ xorl %r12d, %ecx
+ rorl $14, %edx
+ andl %r10d, %ecx
+ xorl %r10d, %edx
+ xorl %r12d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r13d
+ xorl %r10d, %edx
+ xorl %r14d, %eax
+ rorl $6, %edx
+ movl %r14d, %ecx
+ addl %edx, %r13d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ addl 12(%rsp), %r12d
+ movl %r10d, %ecx
+ movl %r14d, %ebx
+ xorl %r11d, %ecx
+ rorl $14, %edx
+ andl %r9d, %ecx
+ xorl %r9d, %edx
+ xorl %r11d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r12d
+ xorl %r9d, %edx
+ xorl %r13d, %ebx
+ rorl $6, %edx
+ movl %r13d, %ecx
+ addl %edx, %r12d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ # rnd_all_4: 1-4
+ addl 16(%rsp), %r11d
+ movl %r9d, %ecx
+ movl %r13d, %eax
+ xorl %r10d, %ecx
+ rorl $14, %edx
+ andl %r8d, %ecx
+ xorl %r8d, %edx
+ xorl %r10d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r11d
+ xorl %r8d, %edx
+ xorl %r12d, %eax
+ rorl $6, %edx
+ movl %r12d, %ecx
+ addl %edx, %r11d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ addl 20(%rsp), %r10d
+ movl %r8d, %ecx
+ movl %r12d, %ebx
+ xorl %r9d, %ecx
+ rorl $14, %edx
+ andl %r15d, %ecx
+ xorl %r15d, %edx
+ xorl %r9d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r10d
+ xorl %r15d, %edx
+ xorl %r11d, %ebx
+ rorl $6, %edx
+ movl %r11d, %ecx
+ addl %edx, %r10d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ addl 24(%rsp), %r9d
+ movl %r15d, %ecx
+ movl %r11d, %eax
+ xorl %r8d, %ecx
+ rorl $14, %edx
+ andl %r14d, %ecx
+ xorl %r14d, %edx
+ xorl %r8d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r9d
+ xorl %r14d, %edx
+ xorl %r10d, %eax
+ rorl $6, %edx
+ movl %r10d, %ecx
+ addl %edx, %r9d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ addl 28(%rsp), %r8d
+ movl %r14d, %ecx
+ movl %r10d, %ebx
+ xorl %r15d, %ecx
+ rorl $14, %edx
+ andl %r13d, %ecx
+ xorl %r13d, %edx
+ xorl %r15d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r8d
+ xorl %r13d, %edx
+ xorl %r9d, %ebx
+ rorl $6, %edx
+ movl %r9d, %ecx
+ addl %edx, %r8d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ # rnd_all_4: 2-5
+ addl 32(%rsp), %r15d
+ movl %r13d, %ecx
+ movl %r9d, %eax
+ xorl %r14d, %ecx
+ rorl $14, %edx
+ andl %r12d, %ecx
+ xorl %r12d, %edx
+ xorl %r14d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r15d
+ xorl %r12d, %edx
+ xorl %r8d, %eax
+ rorl $6, %edx
+ movl %r8d, %ecx
+ addl %edx, %r15d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ addl 36(%rsp), %r14d
+ movl %r12d, %ecx
+ movl %r8d, %ebx
+ xorl %r13d, %ecx
+ rorl $14, %edx
+ andl %r11d, %ecx
+ xorl %r11d, %edx
+ xorl %r13d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r14d
+ xorl %r11d, %edx
+ xorl %r15d, %ebx
+ rorl $6, %edx
+ movl %r15d, %ecx
+ addl %edx, %r14d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ addl 40(%rsp), %r13d
+ movl %r11d, %ecx
+ movl %r15d, %eax
+ xorl %r12d, %ecx
+ rorl $14, %edx
+ andl %r10d, %ecx
+ xorl %r10d, %edx
+ xorl %r12d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r13d
+ xorl %r10d, %edx
+ xorl %r14d, %eax
+ rorl $6, %edx
+ movl %r14d, %ecx
+ addl %edx, %r13d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ addl 44(%rsp), %r12d
+ movl %r10d, %ecx
+ movl %r14d, %ebx
+ xorl %r11d, %ecx
+ rorl $14, %edx
+ andl %r9d, %ecx
+ xorl %r9d, %edx
+ xorl %r11d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r12d
+ xorl %r9d, %edx
+ xorl %r13d, %ebx
+ rorl $6, %edx
+ movl %r13d, %ecx
+ addl %edx, %r12d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ # rnd_all_4: 3-6
+ addl 48(%rsp), %r11d
+ movl %r9d, %ecx
+ movl %r13d, %eax
+ xorl %r10d, %ecx
+ rorl $14, %edx
+ andl %r8d, %ecx
+ xorl %r8d, %edx
+ xorl %r10d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r11d
+ xorl %r8d, %edx
+ xorl %r12d, %eax
+ rorl $6, %edx
+ movl %r12d, %ecx
+ addl %edx, %r11d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ addl 52(%rsp), %r10d
+ movl %r8d, %ecx
+ movl %r12d, %ebx
+ xorl %r9d, %ecx
+ rorl $14, %edx
+ andl %r15d, %ecx
+ xorl %r15d, %edx
+ xorl %r9d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r10d
+ xorl %r15d, %edx
+ xorl %r11d, %ebx
+ rorl $6, %edx
+ movl %r11d, %ecx
+ addl %edx, %r10d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ addl 56(%rsp), %r9d
+ movl %r15d, %ecx
+ movl %r11d, %eax
+ xorl %r8d, %ecx
+ rorl $14, %edx
+ andl %r14d, %ecx
+ xorl %r14d, %edx
+ xorl %r8d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r9d
+ xorl %r14d, %edx
+ xorl %r10d, %eax
+ rorl $6, %edx
+ movl %r10d, %ecx
+ addl %edx, %r9d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ addl 60(%rsp), %r8d
+ movl %r14d, %ecx
+ movl %r10d, %ebx
+ xorl %r15d, %ecx
+ rorl $14, %edx
+ andl %r13d, %ecx
+ xorl %r13d, %edx
+ xorl %r15d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r8d
+ xorl %r13d, %edx
+ xorl %r9d, %ebx
+ rorl $6, %edx
+ movl %r9d, %ecx
+ addl %edx, %r8d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
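+ # All 64 rounds done: fold the working registers back into the hash state at
+ # (%rdi) (Davies-Meyer feed-forward), advance the data pointer in %rbp by one
+ # 64-byte block, decrement the remaining length in %esi, and loop while input
+ # remains.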
+ addl (%rdi), %r8d
+ addl 4(%rdi), %r9d
+ addl 8(%rdi), %r10d
+ addl 12(%rdi), %r11d
+ addl 16(%rdi), %r12d
+ addl 20(%rdi), %r13d
+ addl 24(%rdi), %r14d
+ addl 28(%rdi), %r15d
+ addq $0x40, %rbp
+ subl $0x40, %esi
+ movl %r8d, (%rdi)
+ movl %r9d, 4(%rdi)
+ movl %r10d, 8(%rdi)
+ movl %r11d, 12(%rdi)
+ movl %r12d, 16(%rdi)
+ movl %r13d, 20(%rdi)
+ movl %r14d, 24(%rdi)
+ movl %r15d, 28(%rdi)
+ jnz L_sha256_len_avx1_start
+ xorq %rax, %rax
+ vzeroupper
+ addq $0x40, %rsp
+ popq %rbp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size Transform_Sha256_AVX1_Len,.-Transform_Sha256_AVX1_Len
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
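+ # Constant data for the AVX1+RORX variant: the 64 SHA-256 round constants
+ # K[0..63], two shuffle masks that pack scheduled words into the low/high
+ # lanes, and a byte-swap mask for loading the big-endian message words.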
+L_avx1_rorx_sha256_k:
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0xfc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x6ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx1_rorx_sha256_shuf_00BA:
+.quad 0xb0a090803020100, 0xffffffffffffffff
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx1_rorx_sha256_shuf_DC00:
+.quad 0xffffffffffffffff, 0xb0a090803020100
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx1_rorx_sha256_flip_mask:
+.quad 0x405060700010203, 0xc0d0e0f08090a0b
+#ifndef __APPLE__
+.text
+.globl Transform_Sha256_AVX1_RORX
+.type Transform_Sha256_AVX1_RORX,@function
+.align 4
+Transform_Sha256_AVX1_RORX:
+#else
+.section __TEXT,__text
+.globl _Transform_Sha256_AVX1_RORX
+.p2align 2
+_Transform_Sha256_AVX1_RORX:
+#endif /* __APPLE__ */
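+ # Same AVX1 message scheduling as above, but the scalar rounds use the BMI2
+ # rorx instruction: it rotates into a separate destination register and does
+ # not touch the flags, so the rotates need fewer register copies and can be
+ # interleaved freely with the additions.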
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $0x40, %rsp
+ vmovdqa L_avx1_rorx_sha256_flip_mask(%rip), %xmm13
+ vmovdqa L_avx1_rorx_sha256_shuf_00BA(%rip), %xmm11
+ vmovdqa L_avx1_rorx_sha256_shuf_DC00(%rip), %xmm12
+ leaq 32(%rdi), %rax
+ # X0, X1, X2, X3 = W[0..15]
+ vmovdqu (%rax), %xmm0
+ vmovdqu 16(%rax), %xmm1
+ vpshufb %xmm13, %xmm0, %xmm0
+ vpshufb %xmm13, %xmm1, %xmm1
+ vmovdqu 32(%rax), %xmm2
+ vmovdqu 48(%rax), %xmm3
+ vpshufb %xmm13, %xmm2, %xmm2
+ vpshufb %xmm13, %xmm3, %xmm3
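+ # vpshufb with the flip mask byte-swaps each 32-bit word, converting the
+ # big-endian message block into host word order.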
+ movl (%rdi), %r8d
+ movl 4(%rdi), %r9d
+ movl 8(%rdi), %r10d
+ movl 12(%rdi), %r11d
+ movl 16(%rdi), %r12d
+ movl 20(%rdi), %r13d
+ movl 24(%rdi), %r14d
+ movl 28(%rdi), %r15d
+ # set_w_k_xfer_4: 0
+ vpaddd 0+L_avx1_rorx_sha256_k(%rip), %xmm0, %xmm4
+ vpaddd 16+L_avx1_rorx_sha256_k(%rip), %xmm1, %xmm5
+ vmovdqu %xmm4, (%rsp)
+ vmovdqu %xmm5, 16(%rsp)
+ vpaddd 32+L_avx1_rorx_sha256_k(%rip), %xmm2, %xmm6
+ vpaddd 48+L_avx1_rorx_sha256_k(%rip), %xmm3, %xmm7
+ vmovdqu %xmm6, 32(%rsp)
+ vmovdqu %xmm7, 48(%rsp)
+ movl %r9d, %ebx
+ rorxl $6, %r12d, %edx
+ xorl %r10d, %ebx
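+ # Pre-compute b^c in %ebx for the first Maj and ROTR6(e) in %edx for the
+ # first Sigma1; later rounds carry the corresponding values forward in
+ # %eax/%ebx and %edx.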
+ # msg_sched: 0-3
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl (%rsp), %r15d
+ vpalignr $4, %xmm2, %xmm3, %xmm4
+ vpalignr $4, %xmm0, %xmm1, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 4(%rsp), %r14d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpshufd $0xfa, %xmm3, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ addl %r14d, %r10d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 8(%rsp), %r13d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm0, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 12(%rsp), %r12d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ addl %r12d, %r8d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vpaddd %xmm4, %xmm9, %xmm0
+ # msg_sched done: 0-3
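+ # Here Sigma1(e) = ROTR6^ROTR11^ROTR25 and Sigma0(a) = ROTR2^ROTR13^ROTR22 are
+ # built directly by xor-ing three rorxl results; Ch and Maj use the same
+ # masked-xor formulas as the plain AVX1 rounds.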
+ # msg_sched: 4-7
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 16(%rsp), %r11d
+ vpalignr $4, %xmm3, %xmm0, %xmm4
+ vpalignr $4, %xmm1, %xmm2, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 20(%rsp), %r10d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpshufd $0xfa, %xmm0, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ addl %r10d, %r14d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 24(%rsp), %r9d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm1, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 28(%rsp), %r8d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ addl %r8d, %r12d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vpaddd %xmm4, %xmm9, %xmm1
+ # msg_sched done: 4-7
+ # msg_sched: 8-11
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl 32(%rsp), %r15d
+ vpalignr $4, %xmm0, %xmm1, %xmm4
+ vpalignr $4, %xmm2, %xmm3, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 36(%rsp), %r14d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpshufd $0xfa, %xmm1, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ addl %r14d, %r10d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 40(%rsp), %r13d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm2, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 44(%rsp), %r12d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ addl %r12d, %r8d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vpaddd %xmm4, %xmm9, %xmm2
+ # msg_sched done: 8-11
+ # msg_sched: 12-15
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 48(%rsp), %r11d
+ vpalignr $4, %xmm1, %xmm2, %xmm4
+ vpalignr $4, %xmm3, %xmm0, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 52(%rsp), %r10d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpshufd $0xfa, %xmm2, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ addl %r10d, %r14d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 56(%rsp), %r9d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm3, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 60(%rsp), %r8d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ addl %r8d, %r12d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vpaddd %xmm4, %xmm9, %xmm3
+ # msg_sched done: 12-15
+ # set_w_k_xfer_4: 4
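+ # Pre-add the round constants to the freshly scheduled words and spill
+ # W[t]+K[t] for the next 16 rounds: offsets 64-112 into
+ # L_avx1_rorx_sha256_k select K[16..31], stored at 0-48(%rsp).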
+ vpaddd 64+L_avx1_rorx_sha256_k(%rip), %xmm0, %xmm4
+ vpaddd 80+L_avx1_rorx_sha256_k(%rip), %xmm1, %xmm5
+ vmovdqu %xmm4, (%rsp)
+ vmovdqu %xmm5, 16(%rsp)
+ vpaddd 96+L_avx1_rorx_sha256_k(%rip), %xmm2, %xmm6
+ vpaddd 112+L_avx1_rorx_sha256_k(%rip), %xmm3, %xmm7
+ vmovdqu %xmm6, 32(%rsp)
+ vmovdqu %xmm7, 48(%rsp)
+ # msg_sched: 0-3
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl (%rsp), %r15d
+ vpalignr $4, %xmm2, %xmm3, %xmm4
+ vpalignr $4, %xmm0, %xmm1, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 4(%rsp), %r14d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpshufd $0xfa, %xmm3, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ addl %r14d, %r10d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 8(%rsp), %r13d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm0, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 12(%rsp), %r12d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ addl %r12d, %r8d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vpaddd %xmm4, %xmm9, %xmm0
+ # msg_sched done: 0-3
+ # msg_sched: 4-7
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 16(%rsp), %r11d
+ vpalignr $4, %xmm3, %xmm0, %xmm4
+ vpalignr $4, %xmm1, %xmm2, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 20(%rsp), %r10d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpshufd $0xfa, %xmm0, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ addl %r10d, %r14d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 24(%rsp), %r9d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm1, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 28(%rsp), %r8d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ addl %r8d, %r12d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vpaddd %xmm4, %xmm9, %xmm1
+ # msg_sched done: 4-7
+ # msg_sched: 8-11
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl 32(%rsp), %r15d
+ vpalignr $4, %xmm0, %xmm1, %xmm4
+ vpalignr $4, %xmm2, %xmm3, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 36(%rsp), %r14d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpshufd $0xfa, %xmm1, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ addl %r14d, %r10d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 40(%rsp), %r13d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm2, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 44(%rsp), %r12d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ addl %r12d, %r8d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vpaddd %xmm4, %xmm9, %xmm2
+ # msg_sched done: 8-11
+ # msg_sched: 12-15
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 48(%rsp), %r11d
+ vpalignr $4, %xmm1, %xmm2, %xmm4
+ vpalignr $4, %xmm3, %xmm0, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 52(%rsp), %r10d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpshufd $0xfa, %xmm2, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ addl %r10d, %r14d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 56(%rsp), %r9d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm3, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 60(%rsp), %r8d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ addl %r8d, %r12d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vpaddd %xmm4, %xmm9, %xmm3
+ # msg_sched done: 12-15
+ # set_w_k_xfer_4: 8
+ vpaddd 128+L_avx1_rorx_sha256_k(%rip), %xmm0, %xmm4
+ vpaddd 144+L_avx1_rorx_sha256_k(%rip), %xmm1, %xmm5
+ vmovdqu %xmm4, (%rsp)
+ vmovdqu %xmm5, 16(%rsp)
+ vpaddd 160+L_avx1_rorx_sha256_k(%rip), %xmm2, %xmm6
+ vpaddd 176+L_avx1_rorx_sha256_k(%rip), %xmm3, %xmm7
+ vmovdqu %xmm6, 32(%rsp)
+ vmovdqu %xmm7, 48(%rsp)
+ # msg_sched: 0-3
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl (%rsp), %r15d
+ vpalignr $4, %xmm2, %xmm3, %xmm4
+ vpalignr $4, %xmm0, %xmm1, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 4(%rsp), %r14d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpshufd $0xfa, %xmm3, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ addl %r14d, %r10d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 8(%rsp), %r13d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm0, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 12(%rsp), %r12d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ addl %r12d, %r8d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vpaddd %xmm4, %xmm9, %xmm0
+ # msg_sched done: 0-3
+ # msg_sched: 4-7
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 16(%rsp), %r11d
+ vpalignr $4, %xmm3, %xmm0, %xmm4
+ vpalignr $4, %xmm1, %xmm2, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 20(%rsp), %r10d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpshufd $0xfa, %xmm0, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ addl %r10d, %r14d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 24(%rsp), %r9d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm1, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 28(%rsp), %r8d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ addl %r8d, %r12d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vpaddd %xmm4, %xmm9, %xmm1
+ # msg_sched done: 4-7
+ # msg_sched: 8-11
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl 32(%rsp), %r15d
+ vpalignr $4, %xmm0, %xmm1, %xmm4
+ vpalignr $4, %xmm2, %xmm3, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 36(%rsp), %r14d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpshufd $0xfa, %xmm1, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ addl %r14d, %r10d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 40(%rsp), %r13d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm2, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 44(%rsp), %r12d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ addl %r12d, %r8d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vpaddd %xmm4, %xmm9, %xmm2
+ # msg_sched done: 8-11
+ # msg_sched: 12-15
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 48(%rsp), %r11d
+ vpalignr $4, %xmm1, %xmm2, %xmm4
+ vpalignr $4, %xmm3, %xmm0, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 52(%rsp), %r10d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpshufd $0xfa, %xmm2, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ addl %r10d, %r14d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 56(%rsp), %r9d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm3, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 60(%rsp), %r8d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ addl %r8d, %r12d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vpaddd %xmm4, %xmm9, %xmm3
+ # msg_sched done: 12-15
+ # set_w_k_xfer_4: 12
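+ # Final constant transfer: offsets 192-240 select K[48..63], so the
+ # stack now holds W+K for the last 16 rounds.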
+ vpaddd 192+L_avx1_rorx_sha256_k(%rip), %xmm0, %xmm4
+ vpaddd 208+L_avx1_rorx_sha256_k(%rip), %xmm1, %xmm5
+ vmovdqu %xmm4, (%rsp)
+ vmovdqu %xmm5, 16(%rsp)
+ vpaddd 224+L_avx1_rorx_sha256_k(%rip), %xmm2, %xmm6
+ vpaddd 240+L_avx1_rorx_sha256_k(%rip), %xmm3, %xmm7
+ vmovdqu %xmm6, 32(%rsp)
+ vmovdqu %xmm7, 48(%rsp)
+ xorl %eax, %eax
+ # rnd_all_4: 0-3
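+ # No further message expansion is needed for rounds 48-63, so each
+ # rnd_all_4 block is four purely scalar rounds; rorxl builds the
+ # Sigma1(e) (6,11,25) and Sigma0(a) (2,13,22) rotations without
+ # disturbing the flags.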
+ rorxl $6, %r12d, %edx
+ rorxl $11, %r12d, %ecx
+ addl %eax, %r8d
+ addl (%rsp), %r15d
+ movl %r13d, %eax
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ xorl %ecx, %edx
+ andl %r12d, %eax
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ rorxl $13, %r8d, %ecx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ andl %eax, %ebx
+ addl %edx, %r15d
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ rorxl $11, %r11d, %ecx
+ addl %ebx, %r15d
+ addl 4(%rsp), %r14d
+ movl %r12d, %ebx
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ xorl %ecx, %edx
+ andl %r11d, %ebx
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ rorxl $13, %r15d, %ecx
+ xorl %r13d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ addl %r14d, %r10d
+ xorl %r15d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r14d
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ rorxl $11, %r10d, %ecx
+ addl %eax, %r14d
+ addl 8(%rsp), %r13d
+ movl %r11d, %eax
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ xorl %ecx, %edx
+ andl %r10d, %eax
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ rorxl $13, %r14d, %ecx
+ xorl %r12d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ xorl %r14d, %eax
+ andl %eax, %ebx
+ addl %edx, %r13d
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ rorxl $11, %r9d, %ecx
+ addl %ebx, %r13d
+ addl 12(%rsp), %r12d
+ movl %r10d, %ebx
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ xorl %ecx, %edx
+ andl %r9d, %ebx
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ rorxl $13, %r13d, %ecx
+ xorl %r11d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ addl %r12d, %r8d
+ xorl %r13d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r12d
+ xorl %r14d, %eax
+ # rnd_all_4: 1-4
+ rorxl $6, %r8d, %edx
+ rorxl $11, %r8d, %ecx
+ addl %eax, %r12d
+ addl 16(%rsp), %r11d
+ movl %r9d, %eax
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ xorl %ecx, %edx
+ andl %r8d, %eax
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ rorxl $13, %r12d, %ecx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ andl %eax, %ebx
+ addl %edx, %r11d
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ rorxl $11, %r15d, %ecx
+ addl %ebx, %r11d
+ addl 20(%rsp), %r10d
+ movl %r8d, %ebx
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ xorl %ecx, %edx
+ andl %r15d, %ebx
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ rorxl $13, %r11d, %ecx
+ xorl %r9d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ addl %r10d, %r14d
+ xorl %r11d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r10d
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ rorxl $11, %r14d, %ecx
+ addl %eax, %r10d
+ addl 24(%rsp), %r9d
+ movl %r15d, %eax
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ xorl %ecx, %edx
+ andl %r14d, %eax
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ rorxl $13, %r10d, %ecx
+ xorl %r8d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ xorl %r10d, %eax
+ andl %eax, %ebx
+ addl %edx, %r9d
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ rorxl $11, %r13d, %ecx
+ addl %ebx, %r9d
+ addl 28(%rsp), %r8d
+ movl %r14d, %ebx
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ xorl %ecx, %edx
+ andl %r13d, %ebx
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ rorxl $13, %r9d, %ecx
+ xorl %r15d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ addl %r8d, %r12d
+ xorl %r9d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r8d
+ xorl %r10d, %eax
+ # rnd_all_4: 2-5
+ rorxl $6, %r12d, %edx
+ rorxl $11, %r12d, %ecx
+ addl %eax, %r8d
+ addl 32(%rsp), %r15d
+ movl %r13d, %eax
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ xorl %ecx, %edx
+ andl %r12d, %eax
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ rorxl $13, %r8d, %ecx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ andl %eax, %ebx
+ addl %edx, %r15d
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ rorxl $11, %r11d, %ecx
+ addl %ebx, %r15d
+ addl 36(%rsp), %r14d
+ movl %r12d, %ebx
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ xorl %ecx, %edx
+ andl %r11d, %ebx
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ rorxl $13, %r15d, %ecx
+ xorl %r13d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ addl %r14d, %r10d
+ xorl %r15d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r14d
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ rorxl $11, %r10d, %ecx
+ addl %eax, %r14d
+ addl 40(%rsp), %r13d
+ movl %r11d, %eax
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ xorl %ecx, %edx
+ andl %r10d, %eax
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ rorxl $13, %r14d, %ecx
+ xorl %r12d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ xorl %r14d, %eax
+ andl %eax, %ebx
+ addl %edx, %r13d
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ rorxl $11, %r9d, %ecx
+ addl %ebx, %r13d
+ addl 44(%rsp), %r12d
+ movl %r10d, %ebx
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ xorl %ecx, %edx
+ andl %r9d, %ebx
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ rorxl $13, %r13d, %ecx
+ xorl %r11d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ addl %r12d, %r8d
+ xorl %r13d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r12d
+ xorl %r14d, %eax
+ # rnd_all_4: 3-6
+ rorxl $6, %r8d, %edx
+ rorxl $11, %r8d, %ecx
+ addl %eax, %r12d
+ addl 48(%rsp), %r11d
+ movl %r9d, %eax
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ xorl %ecx, %edx
+ andl %r8d, %eax
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ rorxl $13, %r12d, %ecx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ andl %eax, %ebx
+ addl %edx, %r11d
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ rorxl $11, %r15d, %ecx
+ addl %ebx, %r11d
+ addl 52(%rsp), %r10d
+ movl %r8d, %ebx
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ xorl %ecx, %edx
+ andl %r15d, %ebx
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ rorxl $13, %r11d, %ecx
+ xorl %r9d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ addl %r10d, %r14d
+ xorl %r11d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r10d
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ rorxl $11, %r14d, %ecx
+ addl %eax, %r10d
+ addl 56(%rsp), %r9d
+ movl %r15d, %eax
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ xorl %ecx, %edx
+ andl %r14d, %eax
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ rorxl $13, %r10d, %ecx
+ xorl %r8d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ xorl %r10d, %eax
+ andl %eax, %ebx
+ addl %edx, %r9d
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ rorxl $11, %r13d, %ecx
+ addl %ebx, %r9d
+ addl 60(%rsp), %r8d
+ movl %r14d, %ebx
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ xorl %ecx, %edx
+ andl %r13d, %ebx
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ rorxl $13, %r9d, %ecx
+ xorl %r15d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ addl %r8d, %r12d
+ xorl %r9d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r8d
+ xorl %r10d, %eax
+ addl %eax, %r8d
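+ # 64 rounds done: fold the working registers back into the hash state
+ # h[0..7] stored at (%rdi).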
+ addl %r8d, (%rdi)
+ addl %r9d, 4(%rdi)
+ addl %r10d, 8(%rdi)
+ addl %r11d, 12(%rdi)
+ addl %r12d, 16(%rdi)
+ addl %r13d, 20(%rdi)
+ addl %r14d, 24(%rdi)
+ addl %r15d, 28(%rdi)
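+ # Return 0; vzeroupper clears the upper YMM state to avoid AVX/SSE
+ # transition penalties in the caller.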
+ xorq %rax, %rax
+ vzeroupper
+ addq $0x40, %rsp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size Transform_Sha256_AVX1_RORX,.-Transform_Sha256_AVX1_RORX
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl Transform_Sha256_AVX1_RORX_Len
+.type Transform_Sha256_AVX1_RORX_Len,@function
+.align 4
+Transform_Sha256_AVX1_RORX_Len:
+#else
+.section __TEXT,__text
+.globl _Transform_Sha256_AVX1_RORX_Len
+.p2align 2
+_Transform_Sha256_AVX1_RORX_Len:
+#endif /* __APPLE__ */
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbp
+ movq %rsi, %rbp
+ movq %rdx, %rsi
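+ # System V arguments: %rdi = SHA-256 state, %rsi = message data,
+ # %rdx = length; keep the data pointer in %rbp and the byte count in
+ # %rsi across the block loop.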
+ subq $0x40, %rsp
+ vmovdqa L_avx1_rorx_sha256_flip_mask(%rip), %xmm13
+ vmovdqa L_avx1_rorx_sha256_shuf_00BA(%rip), %xmm11
+ vmovdqa L_avx1_rorx_sha256_shuf_DC00(%rip), %xmm12
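+ # xmm13 = byte-swap mask for loading the big-endian message words;
+ # xmm11/xmm12 = shuffle masks that pack the sigma1 results for the
+ # low (00BA) and high (DC00) word pairs during message scheduling.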
+ movl (%rdi), %r8d
+ movl 4(%rdi), %r9d
+ movl 8(%rdi), %r10d
+ movl 12(%rdi), %r11d
+ movl 16(%rdi), %r12d
+ movl 20(%rdi), %r13d
+ movl 24(%rdi), %r14d
+ movl 28(%rdi), %r15d
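+ # Working variables a..h live in %r8d..%r15d for the whole transform.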
+ # Start of loop processing a block
+L_sha256_len_avx1_len_rorx_start:
+ # X0, X1, X2, X3 = W[0..15]
+ vmovdqu (%rbp), %xmm0
+ vmovdqu 16(%rbp), %xmm1
+ vpshufb %xmm13, %xmm0, %xmm0
+ vpshufb %xmm13, %xmm1, %xmm1
+ vmovdqu 32(%rbp), %xmm2
+ vmovdqu 48(%rbp), %xmm3
+ vpshufb %xmm13, %xmm2, %xmm2
+ vpshufb %xmm13, %xmm3, %xmm3
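+ # vpshufb with the flip mask converts the 64-byte block from big-endian
+ # to host word order, giving W[0..15] in xmm0-xmm3.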
+ # set_w_k_xfer_4: 0
+ vpaddd 0+L_avx1_rorx_sha256_k(%rip), %xmm0, %xmm4
+ vpaddd 16+L_avx1_rorx_sha256_k(%rip), %xmm1, %xmm5
+ vmovdqu %xmm4, (%rsp)
+ vmovdqu %xmm5, 16(%rsp)
+ vpaddd 32+L_avx1_rorx_sha256_k(%rip), %xmm2, %xmm6
+ vpaddd 48+L_avx1_rorx_sha256_k(%rip), %xmm3, %xmm7
+ vmovdqu %xmm6, 32(%rsp)
+ vmovdqu %xmm7, 48(%rsp)
+ movl %r9d, %ebx
+ rorxl $6, %r12d, %edx
+ xorl %r10d, %ebx
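+ # Prime the round pipeline: %ebx = b ^ c, and the first Sigma1 rotation
+ # of e is started before entering the interleaved rounds.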
+ # msg_sched: 0-3
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl (%rsp), %r15d
+ vpalignr $4, %xmm2, %xmm3, %xmm4
+ vpalignr $4, %xmm0, %xmm1, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 4(%rsp), %r14d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpshufd $0xfa, %xmm3, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ addl %r14d, %r10d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 8(%rsp), %r13d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm0, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 12(%rsp), %r12d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ addl %r12d, %r8d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vpaddd %xmm4, %xmm9, %xmm0
+ # msg_sched done: 0-3
+ # msg_sched: 4-7
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 16(%rsp), %r11d
+ vpalignr $4, %xmm3, %xmm0, %xmm4
+ vpalignr $4, %xmm1, %xmm2, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 20(%rsp), %r10d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpshufd $0xfa, %xmm0, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ addl %r10d, %r14d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 24(%rsp), %r9d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm1, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 28(%rsp), %r8d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ addl %r8d, %r12d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vpaddd %xmm4, %xmm9, %xmm1
+ # msg_sched done: 4-7
+ # msg_sched: 8-11
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl 32(%rsp), %r15d
+ vpalignr $4, %xmm0, %xmm1, %xmm4
+ vpalignr $4, %xmm2, %xmm3, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 36(%rsp), %r14d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpshufd $0xfa, %xmm1, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ addl %r14d, %r10d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 40(%rsp), %r13d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm2, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 44(%rsp), %r12d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ addl %r12d, %r8d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vpaddd %xmm4, %xmm9, %xmm2
+ # msg_sched done: 8-11
+ # msg_sched: 12-15
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 48(%rsp), %r11d
+ vpalignr $4, %xmm1, %xmm2, %xmm4
+ vpalignr $4, %xmm3, %xmm0, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 52(%rsp), %r10d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpshufd $0xfa, %xmm2, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ addl %r10d, %r14d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 56(%rsp), %r9d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm3, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 60(%rsp), %r8d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ addl %r8d, %r12d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vpaddd %xmm4, %xmm9, %xmm3
+ # msg_sched done: 12-15
+ # set_w_k_xfer_4: 4
+ vpaddd 64+L_avx1_rorx_sha256_k(%rip), %xmm0, %xmm4
+ vpaddd 80+L_avx1_rorx_sha256_k(%rip), %xmm1, %xmm5
+ vmovdqu %xmm4, (%rsp)
+ vmovdqu %xmm5, 16(%rsp)
+ vpaddd 96+L_avx1_rorx_sha256_k(%rip), %xmm2, %xmm6
+ vpaddd 112+L_avx1_rorx_sha256_k(%rip), %xmm3, %xmm7
+ vmovdqu %xmm6, 32(%rsp)
+ vmovdqu %xmm7, 48(%rsp)
+ # msg_sched: 0-3
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl (%rsp), %r15d
+ vpalignr $4, %xmm2, %xmm3, %xmm4
+ vpalignr $4, %xmm0, %xmm1, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 4(%rsp), %r14d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpshufd $0xfa, %xmm3, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ addl %r14d, %r10d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 8(%rsp), %r13d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm0, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 12(%rsp), %r12d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ addl %r12d, %r8d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vpaddd %xmm4, %xmm9, %xmm0
+ # msg_sched done: 0-3
+ # msg_sched: 4-7
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 16(%rsp), %r11d
+ vpalignr $4, %xmm3, %xmm0, %xmm4
+ vpalignr $4, %xmm1, %xmm2, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 20(%rsp), %r10d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpshufd $0xfa, %xmm0, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ addl %r10d, %r14d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 24(%rsp), %r9d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm1, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 28(%rsp), %r8d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ addl %r8d, %r12d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vpaddd %xmm4, %xmm9, %xmm1
+ # msg_sched done: 4-7
+ # msg_sched: 8-11
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl 32(%rsp), %r15d
+ vpalignr $4, %xmm0, %xmm1, %xmm4
+ vpalignr $4, %xmm2, %xmm3, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 36(%rsp), %r14d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpshufd $0xfa, %xmm1, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ addl %r14d, %r10d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 40(%rsp), %r13d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm2, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 44(%rsp), %r12d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ addl %r12d, %r8d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vpaddd %xmm4, %xmm9, %xmm2
+ # msg_sched done: 8-11
+ # msg_sched: 12-15
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 48(%rsp), %r11d
+ vpalignr $4, %xmm1, %xmm2, %xmm4
+ vpalignr $4, %xmm3, %xmm0, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 52(%rsp), %r10d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpshufd $0xfa, %xmm2, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ addl %r10d, %r14d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 56(%rsp), %r9d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm3, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 60(%rsp), %r8d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ addl %r8d, %r12d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vpaddd %xmm4, %xmm9, %xmm3
+ # msg_sched done: 12-15
+ # set_w_k_xfer_4: 8
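+ # Add the round constants to the four schedule vectors and spill W+K to
+ # the stack for the next sixteen rounds.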
+ vpaddd 128+L_avx1_rorx_sha256_k(%rip), %xmm0, %xmm4
+ vpaddd 144+L_avx1_rorx_sha256_k(%rip), %xmm1, %xmm5
+ vmovdqu %xmm4, (%rsp)
+ vmovdqu %xmm5, 16(%rsp)
+ vpaddd 160+L_avx1_rorx_sha256_k(%rip), %xmm2, %xmm6
+ vpaddd 176+L_avx1_rorx_sha256_k(%rip), %xmm3, %xmm7
+ vmovdqu %xmm6, 32(%rsp)
+ vmovdqu %xmm7, 48(%rsp)
+ # msg_sched: 0-3
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl (%rsp), %r15d
+ vpalignr $4, %xmm2, %xmm3, %xmm4
+ vpalignr $4, %xmm0, %xmm1, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 4(%rsp), %r14d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpshufd $0xfa, %xmm3, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ addl %r14d, %r10d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 8(%rsp), %r13d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm0, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 12(%rsp), %r12d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ addl %r12d, %r8d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vpaddd %xmm4, %xmm9, %xmm0
+ # msg_sched done: 0-3
+ # msg_sched: 4-7
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 16(%rsp), %r11d
+ vpalignr $4, %xmm3, %xmm0, %xmm4
+ vpalignr $4, %xmm1, %xmm2, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 20(%rsp), %r10d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpshufd $0xfa, %xmm0, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ addl %r10d, %r14d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 24(%rsp), %r9d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm1, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 28(%rsp), %r8d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ addl %r8d, %r12d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vpaddd %xmm4, %xmm9, %xmm1
+ # msg_sched done: 4-7
+ # msg_sched: 8-11
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl 32(%rsp), %r15d
+ vpalignr $4, %xmm0, %xmm1, %xmm4
+ vpalignr $4, %xmm2, %xmm3, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 36(%rsp), %r14d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpshufd $0xfa, %xmm1, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ addl %r14d, %r10d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 40(%rsp), %r13d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm2, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 44(%rsp), %r12d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ addl %r12d, %r8d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vpaddd %xmm4, %xmm9, %xmm2
+ # msg_sched done: 8-11
+ # msg_sched: 12-15
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 48(%rsp), %r11d
+ vpalignr $4, %xmm1, %xmm2, %xmm4
+ vpalignr $4, %xmm3, %xmm0, %xmm5
+ # rnd_0: 1 - 2
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %xmm5, %xmm6
+ vpslld $25, %xmm5, %xmm7
+ # rnd_0: 3 - 4
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $3, %xmm5, %xmm8
+ vpor %xmm6, %xmm7, %xmm7
+ # rnd_0: 5 - 7
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 52(%rsp), %r10d
+ vpsrld $18, %xmm5, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpslld $14, %xmm5, %xmm5
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpxor %xmm5, %xmm7, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %xmm6, %xmm7, %xmm7
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpshufd $0xfa, %xmm2, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ addl %r10d, %r14d
+ vpxor %xmm8, %xmm7, %xmm5
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrld $10, %xmm6, %xmm8
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 56(%rsp), %r9d
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpsrlq $0x11, %xmm6, %xmm6
+ vpaddd %xmm3, %xmm4, %xmm4
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %xmm5, %xmm4, %xmm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpxor %xmm6, %xmm8, %xmm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufb %xmm11, %xmm8, %xmm8
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpaddd %xmm8, %xmm4, %xmm4
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 60(%rsp), %r8d
+ vpshufd $0x50, %xmm4, %xmm6
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpsrld $10, %xmm6, %xmm9
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpsrlq $19, %xmm6, %xmm7
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpsrlq $0x11, %xmm6, %xmm6
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpxor %xmm7, %xmm6, %xmm6
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ addl %r8d, %r12d
+ vpxor %xmm6, %xmm9, %xmm9
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ vpshufb %xmm12, %xmm9, %xmm9
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vpaddd %xmm4, %xmm9, %xmm3
+ # msg_sched done: 12-15
+ # set_w_k_xfer_4: 12
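+ # Final constant transfer: W+K for the last sixteen rounds goes to the stack.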
+ vpaddd 192+L_avx1_rorx_sha256_k(%rip), %xmm0, %xmm4
+ vpaddd 208+L_avx1_rorx_sha256_k(%rip), %xmm1, %xmm5
+ vmovdqu %xmm4, (%rsp)
+ vmovdqu %xmm5, 16(%rsp)
+ vpaddd 224+L_avx1_rorx_sha256_k(%rip), %xmm2, %xmm6
+ vpaddd 240+L_avx1_rorx_sha256_k(%rip), %xmm3, %xmm7
+ vmovdqu %xmm6, 32(%rsp)
+ vmovdqu %xmm7, 48(%rsp)
+ xorl %eax, %eax
+ xorl %ecx, %ecx
+ # rnd_all_4: 0-3
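+ # The last sixteen rounds need no further message scheduling; each
+ # rnd_all_4 block runs four scalar rounds straight from the W+K values
+ # already on the stack.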
+ rorxl $6, %r12d, %edx
+ rorxl $11, %r12d, %ecx
+ addl %eax, %r8d
+ addl (%rsp), %r15d
+ movl %r13d, %eax
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ xorl %ecx, %edx
+ andl %r12d, %eax
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ rorxl $13, %r8d, %ecx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ andl %eax, %ebx
+ addl %edx, %r15d
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ rorxl $11, %r11d, %ecx
+ addl %ebx, %r15d
+ addl 4(%rsp), %r14d
+ movl %r12d, %ebx
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ xorl %ecx, %edx
+ andl %r11d, %ebx
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ rorxl $13, %r15d, %ecx
+ xorl %r13d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ addl %r14d, %r10d
+ xorl %r15d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r14d
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ rorxl $11, %r10d, %ecx
+ addl %eax, %r14d
+ addl 8(%rsp), %r13d
+ movl %r11d, %eax
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ xorl %ecx, %edx
+ andl %r10d, %eax
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ rorxl $13, %r14d, %ecx
+ xorl %r12d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ xorl %r14d, %eax
+ andl %eax, %ebx
+ addl %edx, %r13d
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ rorxl $11, %r9d, %ecx
+ addl %ebx, %r13d
+ addl 12(%rsp), %r12d
+ movl %r10d, %ebx
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ xorl %ecx, %edx
+ andl %r9d, %ebx
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ rorxl $13, %r13d, %ecx
+ xorl %r11d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ addl %r12d, %r8d
+ xorl %r13d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r12d
+ xorl %r14d, %eax
+ # rnd_all_4: 1-4
+ rorxl $6, %r8d, %edx
+ rorxl $11, %r8d, %ecx
+ addl %eax, %r12d
+ addl 16(%rsp), %r11d
+ movl %r9d, %eax
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ xorl %ecx, %edx
+ andl %r8d, %eax
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ rorxl $13, %r12d, %ecx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ andl %eax, %ebx
+ addl %edx, %r11d
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ rorxl $11, %r15d, %ecx
+ addl %ebx, %r11d
+ addl 20(%rsp), %r10d
+ movl %r8d, %ebx
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ xorl %ecx, %edx
+ andl %r15d, %ebx
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ rorxl $13, %r11d, %ecx
+ xorl %r9d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ addl %r10d, %r14d
+ xorl %r11d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r10d
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ rorxl $11, %r14d, %ecx
+ addl %eax, %r10d
+ addl 24(%rsp), %r9d
+ movl %r15d, %eax
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ xorl %ecx, %edx
+ andl %r14d, %eax
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ rorxl $13, %r10d, %ecx
+ xorl %r8d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ xorl %r10d, %eax
+ andl %eax, %ebx
+ addl %edx, %r9d
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ rorxl $11, %r13d, %ecx
+ addl %ebx, %r9d
+ addl 28(%rsp), %r8d
+ movl %r14d, %ebx
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ xorl %ecx, %edx
+ andl %r13d, %ebx
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ rorxl $13, %r9d, %ecx
+ xorl %r15d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ addl %r8d, %r12d
+ xorl %r9d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r8d
+ xorl %r10d, %eax
+ # rnd_all_4: 2-5
+ rorxl $6, %r12d, %edx
+ rorxl $11, %r12d, %ecx
+ addl %eax, %r8d
+ addl 32(%rsp), %r15d
+ movl %r13d, %eax
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ xorl %ecx, %edx
+ andl %r12d, %eax
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ rorxl $13, %r8d, %ecx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ andl %eax, %ebx
+ addl %edx, %r15d
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ rorxl $11, %r11d, %ecx
+ addl %ebx, %r15d
+ addl 36(%rsp), %r14d
+ movl %r12d, %ebx
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ xorl %ecx, %edx
+ andl %r11d, %ebx
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ rorxl $13, %r15d, %ecx
+ xorl %r13d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ addl %r14d, %r10d
+ xorl %r15d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r14d
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ rorxl $11, %r10d, %ecx
+ addl %eax, %r14d
+ addl 40(%rsp), %r13d
+ movl %r11d, %eax
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ xorl %ecx, %edx
+ andl %r10d, %eax
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ rorxl $13, %r14d, %ecx
+ xorl %r12d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ xorl %r14d, %eax
+ andl %eax, %ebx
+ addl %edx, %r13d
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ rorxl $11, %r9d, %ecx
+ addl %ebx, %r13d
+ addl 44(%rsp), %r12d
+ movl %r10d, %ebx
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ xorl %ecx, %edx
+ andl %r9d, %ebx
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ rorxl $13, %r13d, %ecx
+ xorl %r11d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ addl %r12d, %r8d
+ xorl %r13d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r12d
+ xorl %r14d, %eax
+ # rnd_all_4: 3-6
+ rorxl $6, %r8d, %edx
+ rorxl $11, %r8d, %ecx
+ addl %eax, %r12d
+ addl 48(%rsp), %r11d
+ movl %r9d, %eax
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ xorl %ecx, %edx
+ andl %r8d, %eax
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ rorxl $13, %r12d, %ecx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ andl %eax, %ebx
+ addl %edx, %r11d
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ rorxl $11, %r15d, %ecx
+ addl %ebx, %r11d
+ addl 52(%rsp), %r10d
+ movl %r8d, %ebx
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ xorl %ecx, %edx
+ andl %r15d, %ebx
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ rorxl $13, %r11d, %ecx
+ xorl %r9d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ addl %r10d, %r14d
+ xorl %r11d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r10d
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ rorxl $11, %r14d, %ecx
+ addl %eax, %r10d
+ addl 56(%rsp), %r9d
+ movl %r15d, %eax
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ xorl %ecx, %edx
+ andl %r14d, %eax
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ rorxl $13, %r10d, %ecx
+ xorl %r8d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ xorl %r10d, %eax
+ andl %eax, %ebx
+ addl %edx, %r9d
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ rorxl $11, %r13d, %ecx
+ addl %ebx, %r9d
+ addl 60(%rsp), %r8d
+ movl %r14d, %ebx
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ xorl %ecx, %edx
+ andl %r13d, %ebx
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ rorxl $13, %r9d, %ecx
+ xorl %r15d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ addl %r8d, %r12d
+ xorl %r9d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r8d
+ xorl %r10d, %eax
+ addl %eax, %r8d
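+ # Fold the working variables back into the hash state at (%rdi), advance
+ # the data pointer in %rbp by one 64-byte block and loop while %esi still
+ # has input left.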
+ addl (%rdi), %r8d
+ addl 4(%rdi), %r9d
+ addl 8(%rdi), %r10d
+ addl 12(%rdi), %r11d
+ addl 16(%rdi), %r12d
+ addl 20(%rdi), %r13d
+ addl 24(%rdi), %r14d
+ addl 28(%rdi), %r15d
+ addq $0x40, %rbp
+ subl $0x40, %esi
+ movl %r8d, (%rdi)
+ movl %r9d, 4(%rdi)
+ movl %r10d, 8(%rdi)
+ movl %r11d, 12(%rdi)
+ movl %r12d, 16(%rdi)
+ movl %r13d, 20(%rdi)
+ movl %r14d, 24(%rdi)
+ movl %r15d, 28(%rdi)
+ jnz L_sha256_len_avx1_len_rorx_start
+ xorq %rax, %rax
+ vzeroupper
+ addq $0x40, %rsp
+ popq %rbp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size Transform_Sha256_AVX1_RORX_Len,.-Transform_Sha256_AVX1_RORX_Len
+#endif /* __APPLE__ */
+#endif /* HAVE_INTEL_AVX1 */
+#ifdef HAVE_INTEL_AVX2
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
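+# Each group of four K constants appears twice so that a single 32-byte load
+# places the same constants in both 128-bit lanes of a ymm register.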
+L_avx2_sha256_k:
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0xfc19dc6,0x240ca1cc
+.long 0xe49b69c1,0xefbe4786,0xfc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x6ca6351,0x14292967
+.long 0xc6e00bf3,0xd5a79147,0x6ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_avx2_sha256_shuf_00BA:
+.quad 0xb0a090803020100, 0xffffffffffffffff
+.quad 0xb0a090803020100, 0xffffffffffffffff
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_avx2_sha256_shuf_DC00:
+.quad 0xffffffffffffffff, 0xb0a090803020100
+.quad 0xffffffffffffffff, 0xb0a090803020100
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_avx2_sha256_flip_mask:
+.quad 0x405060700010203, 0xc0d0e0f08090a0b
+.quad 0x405060700010203, 0xc0d0e0f08090a0b
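+# flip_mask is the vpshufb pattern that byte-swaps each 32-bit word of the
+# big-endian input block; shuf_00BA and shuf_DC00 compact the pairwise
+# sigma1 results into the low and high halves of the updated schedule vector.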
+#ifndef __APPLE__
+.text
+.globl Transform_Sha256_AVX2
+.type Transform_Sha256_AVX2,@function
+.align 4
+Transform_Sha256_AVX2:
+#else
+.section __TEXT,__text
+.globl _Transform_Sha256_AVX2
+.p2align 2
+_Transform_Sha256_AVX2:
+#endif /* __APPLE__ */
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $0x200, %rsp
+ leaq 32(%rdi), %rax
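+ # %rdi points at the SHA-256 context: the eight state words are loaded from
+ # (%rdi) below and %rax now addresses the 64-byte block buffer at 32(%rdi).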
+ vmovdqa L_avx2_sha256_flip_mask(%rip), %xmm13
+ vmovdqa L_avx2_sha256_shuf_00BA(%rip), %ymm11
+ vmovdqa L_avx2_sha256_shuf_DC00(%rip), %ymm12
+ movl (%rdi), %r8d
+ movl 4(%rdi), %r9d
+ movl 8(%rdi), %r10d
+ movl 12(%rdi), %r11d
+ movl 16(%rdi), %r12d
+ movl 20(%rdi), %r13d
+ movl 24(%rdi), %r14d
+ movl 28(%rdi), %r15d
+ # X0, X1, X2, X3 = W[0..15]
+ vmovdqu (%rax), %xmm0
+ vmovdqu 16(%rax), %xmm1
+ vpshufb %xmm13, %xmm0, %xmm0
+ vpshufb %xmm13, %xmm1, %xmm1
+ vmovdqu 32(%rax), %xmm2
+ vmovdqu 48(%rax), %xmm3
+ vpshufb %xmm13, %xmm2, %xmm2
+ vpshufb %xmm13, %xmm3, %xmm3
+ movl %r9d, %ebx
+ movl %r12d, %edx
+ xorl %r10d, %ebx
+ # set_w_k_xfer_4: 0
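+ # Store W[0..15] plus the first sixteen constants to the stack; the scalar
+ # rounds of this single-block transform only read the low 128-bit lane of
+ # each 32-byte store.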
+ vpaddd 0+L_avx2_sha256_k(%rip), %ymm0, %ymm4
+ vpaddd 32+L_avx2_sha256_k(%rip), %ymm1, %ymm5
+ vmovdqu %ymm4, (%rsp)
+ vmovdqu %ymm5, 32(%rsp)
+ vpaddd 64+L_avx2_sha256_k(%rip), %ymm2, %ymm4
+ vpaddd 96+L_avx2_sha256_k(%rip), %ymm3, %ymm5
+ vmovdqu %ymm4, 64(%rsp)
+ vmovdqu %ymm5, 96(%rsp)
+ # msg_sched: 0-3
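+ # As in the AVX1 code, each msg_sched block interleaves the vector message
+ # schedule with scalar rounds, here building Sigma1 and Sigma0 from chained
+ # rorl $14/$5/$6 and $9/$11/$2 instead of rorx.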
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm0, %ymm1, %ymm5
+ vpalignr $4, %ymm2, %ymm3, %ymm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl (%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 4(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm3, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm0, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 8(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 12(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm0
+ # msg_sched done: 0-3
+ # msg_sched: 8-11
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm1, %ymm2, %ymm5
+ vpalignr $4, %ymm3, %ymm0, %ymm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 32(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 36(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm0, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm1, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 40(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 44(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm1
+ # msg_sched done: 8-11
+ # msg_sched: 16-19
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm2, %ymm3, %ymm5
+ vpalignr $4, %ymm0, %ymm1, %ymm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl 64(%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 68(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm1, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm2, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 72(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 76(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm2
+ # msg_sched done: 16-19
+ # msg_sched: 24-27
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm3, %ymm0, %ymm5
+ vpalignr $4, %ymm1, %ymm2, %ymm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 96(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 100(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm2, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm3, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 104(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 108(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm3
+ # msg_sched done: 24-27
+ # set_w_k_xfer_4: 4
+ vpaddd 128+L_avx2_sha256_k(%rip), %ymm0, %ymm4
+ vpaddd 160+L_avx2_sha256_k(%rip), %ymm1, %ymm5
+ vmovdqu %ymm4, 128(%rsp)
+ vmovdqu %ymm5, 160(%rsp)
+ vpaddd 192+L_avx2_sha256_k(%rip), %ymm2, %ymm4
+ vpaddd 224+L_avx2_sha256_k(%rip), %ymm3, %ymm5
+ vmovdqu %ymm4, 192(%rsp)
+ vmovdqu %ymm5, 224(%rsp)
+ # msg_sched: 32-35
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm0, %ymm1, %ymm5
+ vpalignr $4, %ymm2, %ymm3, %ymm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl 128(%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 132(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm3, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm0, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 136(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 140(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm0
+ # msg_sched done: 32-35
+ # msg_sched: 40-43
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm1, %ymm2, %ymm5
+ vpalignr $4, %ymm3, %ymm0, %ymm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 160(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 164(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm0, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm1, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 168(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 172(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm1
+ # msg_sched done: 40-43
+ # msg_sched: 48-51
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm2, %ymm3, %ymm5
+ vpalignr $4, %ymm0, %ymm1, %ymm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl 192(%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 196(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm1, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm2, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 200(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 204(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm2
+ # msg_sched done: 48-51
+ # msg_sched: 56-59
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm3, %ymm0, %ymm5
+ vpalignr $4, %ymm1, %ymm2, %ymm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 224(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 228(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm2, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm3, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 232(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 236(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm3
+ # msg_sched done: 56-59
+ # set_w_k_xfer_4: 8
+ vpaddd 256+L_avx2_sha256_k(%rip), %ymm0, %ymm4
+ vpaddd 288+L_avx2_sha256_k(%rip), %ymm1, %ymm5
+ vmovdqu %ymm4, 256(%rsp)
+ vmovdqu %ymm5, 288(%rsp)
+ vpaddd 320+L_avx2_sha256_k(%rip), %ymm2, %ymm4
+ vpaddd 352+L_avx2_sha256_k(%rip), %ymm3, %ymm5
+ vmovdqu %ymm4, 320(%rsp)
+ vmovdqu %ymm5, 352(%rsp)
+ # msg_sched: 64-67
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm0, %ymm1, %ymm5
+ vpalignr $4, %ymm2, %ymm3, %ymm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl 256(%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 260(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm3, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm0, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 264(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 268(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm0
+ # msg_sched done: 64-67
+ # msg_sched: 72-75
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm1, %ymm2, %ymm5
+ vpalignr $4, %ymm3, %ymm0, %ymm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 288(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 292(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm0, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm1, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 296(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 300(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm1
+ # msg_sched done: 72-75
+ # msg_sched: 80-83
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm2, %ymm3, %ymm5
+ vpalignr $4, %ymm0, %ymm1, %ymm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl 320(%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 324(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm1, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm2, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 328(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 332(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm2
+ # msg_sched done: 80-83
+ # msg_sched: 88-91
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm3, %ymm0, %ymm5
+ vpalignr $4, %ymm1, %ymm2, %ymm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 352(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 356(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm2, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm3, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 360(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 364(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm3
+ # msg_sched done: 88-91
+ # set_w_k_xfer_4: 12
+ vpaddd 384+L_avx2_sha256_k(%rip), %ymm0, %ymm4
+ vpaddd 416+L_avx2_sha256_k(%rip), %ymm1, %ymm5
+ vmovdqu %ymm4, 384(%rsp)
+ vmovdqu %ymm5, 416(%rsp)
+ vpaddd 448+L_avx2_sha256_k(%rip), %ymm2, %ymm4
+ vpaddd 480+L_avx2_sha256_k(%rip), %ymm3, %ymm5
+ vmovdqu %ymm4, 448(%rsp)
+ vmovdqu %ymm5, 480(%rsp)
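+ # All schedule words are now computed; the remaining rounds below consume
+ # the W[t]+K[t] values spilled to the stack and need no further vector
+ # message scheduling.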
+ # rnd_all_4: 24-27
+ addl 384(%rsp), %r15d
+ movl %r13d, %ecx
+ movl %r9d, %eax
+ xorl %r14d, %ecx
+ rorl $14, %edx
+ andl %r12d, %ecx
+ xorl %r12d, %edx
+ xorl %r14d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r15d
+ xorl %r12d, %edx
+ xorl %r8d, %eax
+ rorl $6, %edx
+ movl %r8d, %ecx
+ addl %edx, %r15d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ addl 388(%rsp), %r14d
+ movl %r12d, %ecx
+ movl %r8d, %ebx
+ xorl %r13d, %ecx
+ rorl $14, %edx
+ andl %r11d, %ecx
+ xorl %r11d, %edx
+ xorl %r13d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r14d
+ xorl %r11d, %edx
+ xorl %r15d, %ebx
+ rorl $6, %edx
+ movl %r15d, %ecx
+ addl %edx, %r14d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ addl 392(%rsp), %r13d
+ movl %r11d, %ecx
+ movl %r15d, %eax
+ xorl %r12d, %ecx
+ rorl $14, %edx
+ andl %r10d, %ecx
+ xorl %r10d, %edx
+ xorl %r12d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r13d
+ xorl %r10d, %edx
+ xorl %r14d, %eax
+ rorl $6, %edx
+ movl %r14d, %ecx
+ addl %edx, %r13d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ addl 396(%rsp), %r12d
+ movl %r10d, %ecx
+ movl %r14d, %ebx
+ xorl %r11d, %ecx
+ rorl $14, %edx
+ andl %r9d, %ecx
+ xorl %r9d, %edx
+ xorl %r11d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r12d
+ xorl %r9d, %edx
+ xorl %r13d, %ebx
+ rorl $6, %edx
+ movl %r13d, %ecx
+ addl %edx, %r12d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ # rnd_all_4: 26-29
+ addl 416(%rsp), %r11d
+ movl %r9d, %ecx
+ movl %r13d, %eax
+ xorl %r10d, %ecx
+ rorl $14, %edx
+ andl %r8d, %ecx
+ xorl %r8d, %edx
+ xorl %r10d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r11d
+ xorl %r8d, %edx
+ xorl %r12d, %eax
+ rorl $6, %edx
+ movl %r12d, %ecx
+ addl %edx, %r11d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ addl 420(%rsp), %r10d
+ movl %r8d, %ecx
+ movl %r12d, %ebx
+ xorl %r9d, %ecx
+ rorl $14, %edx
+ andl %r15d, %ecx
+ xorl %r15d, %edx
+ xorl %r9d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r10d
+ xorl %r15d, %edx
+ xorl %r11d, %ebx
+ rorl $6, %edx
+ movl %r11d, %ecx
+ addl %edx, %r10d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ addl 424(%rsp), %r9d
+ movl %r15d, %ecx
+ movl %r11d, %eax
+ xorl %r8d, %ecx
+ rorl $14, %edx
+ andl %r14d, %ecx
+ xorl %r14d, %edx
+ xorl %r8d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r9d
+ xorl %r14d, %edx
+ xorl %r10d, %eax
+ rorl $6, %edx
+ movl %r10d, %ecx
+ addl %edx, %r9d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ addl 428(%rsp), %r8d
+ movl %r14d, %ecx
+ movl %r10d, %ebx
+ xorl %r15d, %ecx
+ rorl $14, %edx
+ andl %r13d, %ecx
+ xorl %r13d, %edx
+ xorl %r15d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r8d
+ xorl %r13d, %edx
+ xorl %r9d, %ebx
+ rorl $6, %edx
+ movl %r9d, %ecx
+ addl %edx, %r8d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ # rnd_all_4: 28-31
+ addl 448(%rsp), %r15d
+ movl %r13d, %ecx
+ movl %r9d, %eax
+ xorl %r14d, %ecx
+ rorl $14, %edx
+ andl %r12d, %ecx
+ xorl %r12d, %edx
+ xorl %r14d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r15d
+ xorl %r12d, %edx
+ xorl %r8d, %eax
+ rorl $6, %edx
+ movl %r8d, %ecx
+ addl %edx, %r15d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ addl 452(%rsp), %r14d
+ movl %r12d, %ecx
+ movl %r8d, %ebx
+ xorl %r13d, %ecx
+ rorl $14, %edx
+ andl %r11d, %ecx
+ xorl %r11d, %edx
+ xorl %r13d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r14d
+ xorl %r11d, %edx
+ xorl %r15d, %ebx
+ rorl $6, %edx
+ movl %r15d, %ecx
+ addl %edx, %r14d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ addl 456(%rsp), %r13d
+ movl %r11d, %ecx
+ movl %r15d, %eax
+ xorl %r12d, %ecx
+ rorl $14, %edx
+ andl %r10d, %ecx
+ xorl %r10d, %edx
+ xorl %r12d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r13d
+ xorl %r10d, %edx
+ xorl %r14d, %eax
+ rorl $6, %edx
+ movl %r14d, %ecx
+ addl %edx, %r13d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ addl 460(%rsp), %r12d
+ movl %r10d, %ecx
+ movl %r14d, %ebx
+ xorl %r11d, %ecx
+ rorl $14, %edx
+ andl %r9d, %ecx
+ xorl %r9d, %edx
+ xorl %r11d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r12d
+ xorl %r9d, %edx
+ xorl %r13d, %ebx
+ rorl $6, %edx
+ movl %r13d, %ecx
+ addl %edx, %r12d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ # rnd_all_4: 30-33
+ addl 480(%rsp), %r11d
+ movl %r9d, %ecx
+ movl %r13d, %eax
+ xorl %r10d, %ecx
+ rorl $14, %edx
+ andl %r8d, %ecx
+ xorl %r8d, %edx
+ xorl %r10d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r11d
+ xorl %r8d, %edx
+ xorl %r12d, %eax
+ rorl $6, %edx
+ movl %r12d, %ecx
+ addl %edx, %r11d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ addl 484(%rsp), %r10d
+ movl %r8d, %ecx
+ movl %r12d, %ebx
+ xorl %r9d, %ecx
+ rorl $14, %edx
+ andl %r15d, %ecx
+ xorl %r15d, %edx
+ xorl %r9d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r10d
+ xorl %r15d, %edx
+ xorl %r11d, %ebx
+ rorl $6, %edx
+ movl %r11d, %ecx
+ addl %edx, %r10d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ addl 488(%rsp), %r9d
+ movl %r15d, %ecx
+ movl %r11d, %eax
+ xorl %r8d, %ecx
+ rorl $14, %edx
+ andl %r14d, %ecx
+ xorl %r14d, %edx
+ xorl %r8d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r9d
+ xorl %r14d, %edx
+ xorl %r10d, %eax
+ rorl $6, %edx
+ movl %r10d, %ecx
+ addl %edx, %r9d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ addl 492(%rsp), %r8d
+ movl %r14d, %ecx
+ movl %r10d, %ebx
+ xorl %r15d, %ecx
+ rorl $14, %edx
+ andl %r13d, %ecx
+ xorl %r13d, %edx
+ xorl %r15d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r8d
+ xorl %r13d, %edx
+ xorl %r9d, %ebx
+ rorl $6, %edx
+ movl %r9d, %ecx
+ addl %edx, %r8d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
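+ # Feed-forward: %r8d..%r15d hold the eight working variables; add them
+ # back into the hash state at (%rdi). %rax is zeroed as the return value
+ # and vzeroupper avoids AVX-to-SSE transition penalties.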
+ addl %r8d, (%rdi)
+ addl %r9d, 4(%rdi)
+ addl %r10d, 8(%rdi)
+ addl %r11d, 12(%rdi)
+ addl %r12d, 16(%rdi)
+ addl %r13d, 20(%rdi)
+ addl %r14d, 24(%rdi)
+ addl %r15d, 28(%rdi)
+ xorq %rax, %rax
+ vzeroupper
+ addq $0x200, %rsp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size Transform_Sha256_AVX2,.-Transform_Sha256_AVX2
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl Transform_Sha256_AVX2_Len
+.type Transform_Sha256_AVX2_Len,@function
+.align 4
+Transform_Sha256_AVX2_Len:
+#else
+.section __TEXT,__text
+.globl _Transform_Sha256_AVX2_Len
+.p2align 2
+_Transform_Sha256_AVX2_Len:
+#endif /* __APPLE__ */
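+ # Arguments (System V AMD64 ABI): %rdi = SHA-256 context, %rsi = message
+ # data, %rdx = length in bytes (the code assumes whole 64-byte blocks).
+ # The context layout used below keeps the eight digest words at (%rdi)
+ # and the 64-byte block buffer at 32(%rdi).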
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbp
+ movq %rsi, %rbp
+ movq %rdx, %rsi
+ subq $0x200, %rsp
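+ # If the total length contains an odd 64-byte block (bit 0x40 set), hash
+ # one block through the single-block routine first so the main loop can
+ # always process two blocks per iteration.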
+ testb $0x40, %sil
+ je L_sha256_len_avx2_block
+ vmovdqu (%rbp), %ymm0
+ vmovdqu 32(%rbp), %ymm1
+ vmovups %ymm0, 32(%rdi)
+ vmovups %ymm1, 64(%rdi)
+#ifndef __APPLE__
+ call Transform_Sha256_AVX2@plt
+#else
+ call _Transform_Sha256_AVX2
+#endif /* __APPLE__ */
+ addq $0x40, %rbp
+ subl $0x40, %esi
+ jz L_sha256_len_avx2_done
+L_sha256_len_avx2_block:
+ vmovdqa L_avx2_sha256_flip_mask(%rip), %ymm13
+ vmovdqa L_avx2_sha256_shuf_00BA(%rip), %ymm11
+ vmovdqa L_avx2_sha256_shuf_DC00(%rip), %ymm12
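+ # %ymm13 is the byte-swap mask applied to the freshly loaded message
+ # words; %ymm11/%ymm12 are the shuffle masks used below to place the two
+ # halves of each sigma1 computation into the new schedule words.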
+ movl (%rdi), %r8d
+ movl 4(%rdi), %r9d
+ movl 8(%rdi), %r10d
+ movl 12(%rdi), %r11d
+ movl 16(%rdi), %r12d
+ movl 20(%rdi), %r13d
+ movl 24(%rdi), %r14d
+ movl 28(%rdi), %r15d
+ # Start of loop processing two blocks
+L_sha256_len_avx2_start:
+ # X0, X1, X2, X3 = W[0..15]
+ vmovdqu (%rbp), %xmm0
+ vmovdqu 16(%rbp), %xmm1
+ vmovdqu 64(%rbp), %xmm4
+ vmovdqu 80(%rbp), %xmm5
+ vinserti128 $0x01, %xmm4, %ymm0, %ymm0
+ vinserti128 $0x01, %xmm5, %ymm1, %ymm1
+ vpshufb %ymm13, %ymm0, %ymm0
+ vpshufb %ymm13, %ymm1, %ymm1
+ vmovdqu 32(%rbp), %xmm2
+ vmovdqu 48(%rbp), %xmm3
+ vmovdqu 96(%rbp), %xmm6
+ vmovdqu 112(%rbp), %xmm7
+ vinserti128 $0x01, %xmm6, %ymm2, %ymm2
+ vinserti128 $0x01, %xmm7, %ymm3, %ymm3
+ vpshufb %ymm13, %ymm2, %ymm2
+ vpshufb %ymm13, %ymm3, %ymm3
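+ # %ymm0-%ymm3 now hold W[0..15]: the low 128-bit lanes carry the first
+ # block's words and the high lanes the second block's, so the vector
+ # message schedule advances both blocks at once.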
+ movl %r9d, %ebx
+ movl %r12d, %edx
+ xorl %r10d, %ebx
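+ # %ebx starts as b^c and %edx as e; each round keeps a^b for the next
+ # round so that Maj(a,b,c) can be computed as ((a^b) & (b^c)) ^ b.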
+ # set_w_k_xfer_4: 0
+ vpaddd 0+L_avx2_sha256_k(%rip), %ymm0, %ymm4
+ vpaddd 32+L_avx2_sha256_k(%rip), %ymm1, %ymm5
+ vmovdqu %ymm4, (%rsp)
+ vmovdqu %ymm5, 32(%rsp)
+ vpaddd 64+L_avx2_sha256_k(%rip), %ymm2, %ymm4
+ vpaddd 96+L_avx2_sha256_k(%rip), %ymm3, %ymm5
+ vmovdqu %ymm4, 64(%rsp)
+ vmovdqu %ymm5, 96(%rsp)
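+ # Each set_w_k_xfer_4 step adds the round constants to four schedule
+ # vectors and spills W+K to the stack for the scalar rounds. Each
+ # msg_sched block below interleaves four compression rounds with the
+ # computation of the next four schedule words: the vpsrld/vpslld pairs
+ # on %ymm5 build sigma0 = ROTR7 ^ ROTR18 ^ SHR3, and the vpsrlq/vpsrld
+ # steps on the shuffled high words build sigma1 = ROTR17 ^ ROTR19 ^ SHR10.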
+ # msg_sched: 0-3
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm0, %ymm1, %ymm5
+ vpalignr $4, %ymm2, %ymm3, %ymm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl (%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 4(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm3, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm0, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 8(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 12(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm0
+ # msg_sched done: 0-3
+ # msg_sched: 8-11
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm1, %ymm2, %ymm5
+ vpalignr $4, %ymm3, %ymm0, %ymm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 32(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 36(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm0, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm1, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 40(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 44(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm1
+ # msg_sched done: 8-11
+ # msg_sched: 16-19
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm2, %ymm3, %ymm5
+ vpalignr $4, %ymm0, %ymm1, %ymm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl 64(%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 68(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm1, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm2, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 72(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 76(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm2
+ # msg_sched done: 16-19
+ # msg_sched: 24-27
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm3, %ymm0, %ymm5
+ vpalignr $4, %ymm1, %ymm2, %ymm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 96(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 100(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm2, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm3, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 104(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 108(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm3
+ # msg_sched done: 24-27
+ # set_w_k_xfer_4: 4
+ vpaddd 128+L_avx2_sha256_k(%rip), %ymm0, %ymm4
+ vpaddd 160+L_avx2_sha256_k(%rip), %ymm1, %ymm5
+ vmovdqu %ymm4, 128(%rsp)
+ vmovdqu %ymm5, 160(%rsp)
+ vpaddd 192+L_avx2_sha256_k(%rip), %ymm2, %ymm4
+ vpaddd 224+L_avx2_sha256_k(%rip), %ymm3, %ymm5
+ vmovdqu %ymm4, 192(%rsp)
+ vmovdqu %ymm5, 224(%rsp)
+ # msg_sched: 32-35
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm0, %ymm1, %ymm5
+ vpalignr $4, %ymm2, %ymm3, %ymm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl 128(%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 132(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm3, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm0, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 136(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 140(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm0
+ # msg_sched done: 32-35
+ # msg_sched: 40-43
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm1, %ymm2, %ymm5
+ vpalignr $4, %ymm3, %ymm0, %ymm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 160(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 164(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm0, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm1, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 168(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 172(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm1
+ # msg_sched done: 40-43
+ # msg_sched: 48-51
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm2, %ymm3, %ymm5
+ vpalignr $4, %ymm0, %ymm1, %ymm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl 192(%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 196(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm1, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm2, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 200(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 204(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm2
+ # msg_sched done: 48-51
+ # msg_sched: 56-59
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm3, %ymm0, %ymm5
+ vpalignr $4, %ymm1, %ymm2, %ymm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 224(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 228(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm2, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm3, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 232(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 236(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm3
+ # msg_sched done: 56-59
+ # set_w_k_xfer_4: 8
+ vpaddd 256+L_avx2_sha256_k(%rip), %ymm0, %ymm4
+ vpaddd 288+L_avx2_sha256_k(%rip), %ymm1, %ymm5
+ vmovdqu %ymm4, 256(%rsp)
+ vmovdqu %ymm5, 288(%rsp)
+ vpaddd 320+L_avx2_sha256_k(%rip), %ymm2, %ymm4
+ vpaddd 352+L_avx2_sha256_k(%rip), %ymm3, %ymm5
+ vmovdqu %ymm4, 320(%rsp)
+ vmovdqu %ymm5, 352(%rsp)
+ # msg_sched: 64-67
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm0, %ymm1, %ymm5
+ vpalignr $4, %ymm2, %ymm3, %ymm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl 256(%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 260(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm3, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm0, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 264(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 268(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm0
+ # msg_sched done: 64-67
+ # msg_sched: 72-75
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm1, %ymm2, %ymm5
+ vpalignr $4, %ymm3, %ymm0, %ymm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 288(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 292(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm0, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm1, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 296(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 300(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm1
+ # msg_sched done: 72-75
+ # msg_sched: 80-83
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm2, %ymm3, %ymm5
+ vpalignr $4, %ymm0, %ymm1, %ymm4
+ # rnd_0: 1 - 2
+ movl %r9d, %eax
+ movl %r13d, %ecx
+ addl 320(%rsp), %r15d
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ andl %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r14d, %ecx
+ xorl %r12d, %edx
+ addl %ecx, %r15d
+ rorl $6, %edx
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ movl %r8d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r8d, %ebx
+ movl %r12d, %ecx
+ addl 324(%rsp), %r14d
+ xorl %r13d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r11d, %edx
+ andl %r11d, %ecx
+ rorl $5, %edx
+ xorl %r13d, %ecx
+ xorl %r11d, %edx
+ addl %ecx, %r14d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm1, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ movl %r15d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm2, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r15d, %eax
+ movl %r11d, %ecx
+ addl 328(%rsp), %r13d
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ andl %r10d, %ecx
+ rorl $5, %edx
+ xorl %r12d, %ecx
+ xorl %r10d, %edx
+ addl %ecx, %r13d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ movl %r14d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r14d, %ebx
+ movl %r10d, %ecx
+ addl 332(%rsp), %r12d
+ xorl %r11d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r9d, %edx
+ andl %r9d, %ecx
+ rorl $5, %edx
+ xorl %r11d, %ecx
+ xorl %r9d, %edx
+ addl %ecx, %r12d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ movl %r13d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm2
+ # msg_sched done: 80-83
+ # msg_sched: 88-91
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpalignr $4, %ymm3, %ymm0, %ymm5
+ vpalignr $4, %ymm1, %ymm2, %ymm4
+ # rnd_0: 1 - 2
+ movl %r13d, %eax
+ movl %r9d, %ecx
+ addl 352(%rsp), %r11d
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ andl %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 3 - 4
+ rorl $5, %edx
+ xorl %r10d, %ecx
+ xorl %r8d, %edx
+ addl %ecx, %r11d
+ rorl $6, %edx
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ movl %r12d, %ecx
+ vpsrld $18, %ymm5, %ymm8
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 5 - 6
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ vpor %ymm6, %ymm7, %ymm6
+ vpor %ymm8, %ymm9, %ymm8
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ # rnd_1: 0 - 1
+ rorl $14, %edx
+ movl %r12d, %ebx
+ movl %r8d, %ecx
+ addl 356(%rsp), %r10d
+ xorl %r9d, %ecx
+ vpsrld $3, %ymm5, %ymm9
+ vpxor %ymm6, %ymm8, %ymm6
+ # rnd_1: 2 - 3
+ xorl %r15d, %edx
+ andl %r15d, %ecx
+ rorl $5, %edx
+ xorl %r9d, %ecx
+ xorl %r15d, %edx
+ addl %ecx, %r10d
+ vpxor %ymm6, %ymm9, %ymm5
+ vpshufd $0xfa, %ymm2, %ymm6
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ movl %r11d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ vpsrld $10, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 6 - 7
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ # rnd_0: 0 - 0
+ rorl $14, %edx
+ vpsrlq $0x11, %ymm6, %ymm6
+ vpaddd %ymm3, %ymm4, %ymm4
+ # rnd_0: 1 - 3
+ movl %r11d, %eax
+ movl %r15d, %ecx
+ addl 360(%rsp), %r9d
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ andl %r14d, %ecx
+ rorl $5, %edx
+ xorl %r8d, %ecx
+ xorl %r14d, %edx
+ addl %ecx, %r9d
+ vpxor %ymm6, %ymm7, %ymm6
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 4 - 4
+ rorl $6, %edx
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ movl %r10d, %ecx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 5 - 5
+ andl %eax, %ebx
+ rorl $9, %ecx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 6 - 6
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 7 - 7
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ # rnd_1: 0 - 0
+ rorl $14, %edx
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_1: 1 - 1
+ movl %r10d, %ebx
+ movl %r14d, %ecx
+ addl 364(%rsp), %r8d
+ xorl %r15d, %ecx
+ vpsrlq $0x11, %ymm6, %ymm8
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 2 - 3
+ xorl %r13d, %edx
+ andl %r13d, %ecx
+ rorl $5, %edx
+ xorl %r15d, %ecx
+ xorl %r13d, %edx
+ addl %ecx, %r8d
+ vpsrld $10, %ymm6, %ymm9
+ vpxor %ymm8, %ymm7, %ymm8
+ # rnd_1: 4 - 5
+ rorl $6, %edx
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ movl %r9d, %ecx
+ andl %ebx, %eax
+ rorl $9, %ecx
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ vpxor %ymm9, %ymm8, %ymm9
+ # rnd_1: 6 - 6
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 7 - 7
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm3
+ # msg_sched done: 88-91
+ # set_w_k_xfer_4: 12
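+ # The final schedule rows (W[48..63] in ymm0-ymm3) get their round constants
+ # added here and are spilled to the stack as W+K for the remaining rounds.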
+ vpaddd 384+L_avx2_sha256_k(%rip), %ymm0, %ymm4
+ vpaddd 416+L_avx2_sha256_k(%rip), %ymm1, %ymm5
+ vmovdqu %ymm4, 384(%rsp)
+ vmovdqu %ymm5, 416(%rsp)
+ vpaddd 448+L_avx2_sha256_k(%rip), %ymm2, %ymm4
+ vpaddd 480+L_avx2_sha256_k(%rip), %ymm3, %ymm5
+ vmovdqu %ymm4, 448(%rsp)
+ vmovdqu %ymm5, 480(%rsp)
+ # rnd_all_4: 24-27
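+ # rnd_all_4: four compression rounds fed directly from the precomputed W+K
+ # values on the stack; no message scheduling remains at this point.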
+ addl 384(%rsp), %r15d
+ movl %r13d, %ecx
+ movl %r9d, %eax
+ xorl %r14d, %ecx
+ rorl $14, %edx
+ andl %r12d, %ecx
+ xorl %r12d, %edx
+ xorl %r14d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r15d
+ xorl %r12d, %edx
+ xorl %r8d, %eax
+ rorl $6, %edx
+ movl %r8d, %ecx
+ addl %edx, %r15d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ addl 388(%rsp), %r14d
+ movl %r12d, %ecx
+ movl %r8d, %ebx
+ xorl %r13d, %ecx
+ rorl $14, %edx
+ andl %r11d, %ecx
+ xorl %r11d, %edx
+ xorl %r13d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r14d
+ xorl %r11d, %edx
+ xorl %r15d, %ebx
+ rorl $6, %edx
+ movl %r15d, %ecx
+ addl %edx, %r14d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ addl 392(%rsp), %r13d
+ movl %r11d, %ecx
+ movl %r15d, %eax
+ xorl %r12d, %ecx
+ rorl $14, %edx
+ andl %r10d, %ecx
+ xorl %r10d, %edx
+ xorl %r12d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r13d
+ xorl %r10d, %edx
+ xorl %r14d, %eax
+ rorl $6, %edx
+ movl %r14d, %ecx
+ addl %edx, %r13d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ addl 396(%rsp), %r12d
+ movl %r10d, %ecx
+ movl %r14d, %ebx
+ xorl %r11d, %ecx
+ rorl $14, %edx
+ andl %r9d, %ecx
+ xorl %r9d, %edx
+ xorl %r11d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r12d
+ xorl %r9d, %edx
+ xorl %r13d, %ebx
+ rorl $6, %edx
+ movl %r13d, %ecx
+ addl %edx, %r12d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ # rnd_all_4: 26-29
+ addl 416(%rsp), %r11d
+ movl %r9d, %ecx
+ movl %r13d, %eax
+ xorl %r10d, %ecx
+ rorl $14, %edx
+ andl %r8d, %ecx
+ xorl %r8d, %edx
+ xorl %r10d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r11d
+ xorl %r8d, %edx
+ xorl %r12d, %eax
+ rorl $6, %edx
+ movl %r12d, %ecx
+ addl %edx, %r11d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ addl 420(%rsp), %r10d
+ movl %r8d, %ecx
+ movl %r12d, %ebx
+ xorl %r9d, %ecx
+ rorl $14, %edx
+ andl %r15d, %ecx
+ xorl %r15d, %edx
+ xorl %r9d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r10d
+ xorl %r15d, %edx
+ xorl %r11d, %ebx
+ rorl $6, %edx
+ movl %r11d, %ecx
+ addl %edx, %r10d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ addl 424(%rsp), %r9d
+ movl %r15d, %ecx
+ movl %r11d, %eax
+ xorl %r8d, %ecx
+ rorl $14, %edx
+ andl %r14d, %ecx
+ xorl %r14d, %edx
+ xorl %r8d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r9d
+ xorl %r14d, %edx
+ xorl %r10d, %eax
+ rorl $6, %edx
+ movl %r10d, %ecx
+ addl %edx, %r9d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ addl 428(%rsp), %r8d
+ movl %r14d, %ecx
+ movl %r10d, %ebx
+ xorl %r15d, %ecx
+ rorl $14, %edx
+ andl %r13d, %ecx
+ xorl %r13d, %edx
+ xorl %r15d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r8d
+ xorl %r13d, %edx
+ xorl %r9d, %ebx
+ rorl $6, %edx
+ movl %r9d, %ecx
+ addl %edx, %r8d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ # rnd_all_4: 28-31
+ addl 448(%rsp), %r15d
+ movl %r13d, %ecx
+ movl %r9d, %eax
+ xorl %r14d, %ecx
+ rorl $14, %edx
+ andl %r12d, %ecx
+ xorl %r12d, %edx
+ xorl %r14d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r15d
+ xorl %r12d, %edx
+ xorl %r8d, %eax
+ rorl $6, %edx
+ movl %r8d, %ecx
+ addl %edx, %r15d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ addl 452(%rsp), %r14d
+ movl %r12d, %ecx
+ movl %r8d, %ebx
+ xorl %r13d, %ecx
+ rorl $14, %edx
+ andl %r11d, %ecx
+ xorl %r11d, %edx
+ xorl %r13d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r14d
+ xorl %r11d, %edx
+ xorl %r15d, %ebx
+ rorl $6, %edx
+ movl %r15d, %ecx
+ addl %edx, %r14d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ addl 456(%rsp), %r13d
+ movl %r11d, %ecx
+ movl %r15d, %eax
+ xorl %r12d, %ecx
+ rorl $14, %edx
+ andl %r10d, %ecx
+ xorl %r10d, %edx
+ xorl %r12d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r13d
+ xorl %r10d, %edx
+ xorl %r14d, %eax
+ rorl $6, %edx
+ movl %r14d, %ecx
+ addl %edx, %r13d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ addl 460(%rsp), %r12d
+ movl %r10d, %ecx
+ movl %r14d, %ebx
+ xorl %r11d, %ecx
+ rorl $14, %edx
+ andl %r9d, %ecx
+ xorl %r9d, %edx
+ xorl %r11d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r12d
+ xorl %r9d, %edx
+ xorl %r13d, %ebx
+ rorl $6, %edx
+ movl %r13d, %ecx
+ addl %edx, %r12d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ # rnd_all_4: 30-33
+ addl 480(%rsp), %r11d
+ movl %r9d, %ecx
+ movl %r13d, %eax
+ xorl %r10d, %ecx
+ rorl $14, %edx
+ andl %r8d, %ecx
+ xorl %r8d, %edx
+ xorl %r10d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r11d
+ xorl %r8d, %edx
+ xorl %r12d, %eax
+ rorl $6, %edx
+ movl %r12d, %ecx
+ addl %edx, %r11d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ addl 484(%rsp), %r10d
+ movl %r8d, %ecx
+ movl %r12d, %ebx
+ xorl %r9d, %ecx
+ rorl $14, %edx
+ andl %r15d, %ecx
+ xorl %r15d, %edx
+ xorl %r9d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r10d
+ xorl %r15d, %edx
+ xorl %r11d, %ebx
+ rorl $6, %edx
+ movl %r11d, %ecx
+ addl %edx, %r10d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ addl 488(%rsp), %r9d
+ movl %r15d, %ecx
+ movl %r11d, %eax
+ xorl %r8d, %ecx
+ rorl $14, %edx
+ andl %r14d, %ecx
+ xorl %r14d, %edx
+ xorl %r8d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r9d
+ xorl %r14d, %edx
+ xorl %r10d, %eax
+ rorl $6, %edx
+ movl %r10d, %ecx
+ addl %edx, %r9d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ addl 492(%rsp), %r8d
+ movl %r14d, %ecx
+ movl %r10d, %ebx
+ xorl %r15d, %ecx
+ rorl $14, %edx
+ andl %r13d, %ecx
+ xorl %r13d, %edx
+ xorl %r15d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r8d
+ xorl %r13d, %edx
+ xorl %r9d, %ebx
+ rorl $6, %edx
+ movl %r9d, %ecx
+ addl %edx, %r8d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
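+ # Feed-forward: add the working variables into the hash state at (%rdi) now
+ # that the 64 rounds for the lower-lane block are complete.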
+ addl (%rdi), %r8d
+ addl 4(%rdi), %r9d
+ addl 8(%rdi), %r10d
+ addl 12(%rdi), %r11d
+ addl 16(%rdi), %r12d
+ addl 20(%rdi), %r13d
+ addl 24(%rdi), %r14d
+ addl 28(%rdi), %r15d
+ movl %r8d, (%rdi)
+ movl %r9d, 4(%rdi)
+ movl %r10d, 8(%rdi)
+ movl %r11d, 12(%rdi)
+ movl %r12d, 16(%rdi)
+ movl %r13d, 20(%rdi)
+ movl %r14d, 24(%rdi)
+ movl %r15d, 28(%rdi)
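+ # Second pass: re-seed the b^c and e temporaries from the updated state, then
+ # run 64 rounds for the upper-lane block using the W+K values already spilled
+ # at offsets +16..+28 of each 32-byte stack row.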
+ movl %r9d, %ebx
+ movl %r12d, %edx
+ xorl %r10d, %ebx
+ # rnd_all_4: 1-4
+ addl 16(%rsp), %r15d
+ movl %r13d, %ecx
+ movl %r9d, %eax
+ xorl %r14d, %ecx
+ rorl $14, %edx
+ andl %r12d, %ecx
+ xorl %r12d, %edx
+ xorl %r14d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r15d
+ xorl %r12d, %edx
+ xorl %r8d, %eax
+ rorl $6, %edx
+ movl %r8d, %ecx
+ addl %edx, %r15d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ addl 20(%rsp), %r14d
+ movl %r12d, %ecx
+ movl %r8d, %ebx
+ xorl %r13d, %ecx
+ rorl $14, %edx
+ andl %r11d, %ecx
+ xorl %r11d, %edx
+ xorl %r13d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r14d
+ xorl %r11d, %edx
+ xorl %r15d, %ebx
+ rorl $6, %edx
+ movl %r15d, %ecx
+ addl %edx, %r14d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ addl 24(%rsp), %r13d
+ movl %r11d, %ecx
+ movl %r15d, %eax
+ xorl %r12d, %ecx
+ rorl $14, %edx
+ andl %r10d, %ecx
+ xorl %r10d, %edx
+ xorl %r12d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r13d
+ xorl %r10d, %edx
+ xorl %r14d, %eax
+ rorl $6, %edx
+ movl %r14d, %ecx
+ addl %edx, %r13d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ addl 28(%rsp), %r12d
+ movl %r10d, %ecx
+ movl %r14d, %ebx
+ xorl %r11d, %ecx
+ rorl $14, %edx
+ andl %r9d, %ecx
+ xorl %r9d, %edx
+ xorl %r11d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r12d
+ xorl %r9d, %edx
+ xorl %r13d, %ebx
+ rorl $6, %edx
+ movl %r13d, %ecx
+ addl %edx, %r12d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ # rnd_all_4: 3-6
+ addl 48(%rsp), %r11d
+ movl %r9d, %ecx
+ movl %r13d, %eax
+ xorl %r10d, %ecx
+ rorl $14, %edx
+ andl %r8d, %ecx
+ xorl %r8d, %edx
+ xorl %r10d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r11d
+ xorl %r8d, %edx
+ xorl %r12d, %eax
+ rorl $6, %edx
+ movl %r12d, %ecx
+ addl %edx, %r11d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ addl 52(%rsp), %r10d
+ movl %r8d, %ecx
+ movl %r12d, %ebx
+ xorl %r9d, %ecx
+ rorl $14, %edx
+ andl %r15d, %ecx
+ xorl %r15d, %edx
+ xorl %r9d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r10d
+ xorl %r15d, %edx
+ xorl %r11d, %ebx
+ rorl $6, %edx
+ movl %r11d, %ecx
+ addl %edx, %r10d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ addl 56(%rsp), %r9d
+ movl %r15d, %ecx
+ movl %r11d, %eax
+ xorl %r8d, %ecx
+ rorl $14, %edx
+ andl %r14d, %ecx
+ xorl %r14d, %edx
+ xorl %r8d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r9d
+ xorl %r14d, %edx
+ xorl %r10d, %eax
+ rorl $6, %edx
+ movl %r10d, %ecx
+ addl %edx, %r9d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ addl 60(%rsp), %r8d
+ movl %r14d, %ecx
+ movl %r10d, %ebx
+ xorl %r15d, %ecx
+ rorl $14, %edx
+ andl %r13d, %ecx
+ xorl %r13d, %edx
+ xorl %r15d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r8d
+ xorl %r13d, %edx
+ xorl %r9d, %ebx
+ rorl $6, %edx
+ movl %r9d, %ecx
+ addl %edx, %r8d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ # rnd_all_4: 5-8
+ addl 80(%rsp), %r15d
+ movl %r13d, %ecx
+ movl %r9d, %eax
+ xorl %r14d, %ecx
+ rorl $14, %edx
+ andl %r12d, %ecx
+ xorl %r12d, %edx
+ xorl %r14d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r15d
+ xorl %r12d, %edx
+ xorl %r8d, %eax
+ rorl $6, %edx
+ movl %r8d, %ecx
+ addl %edx, %r15d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ addl 84(%rsp), %r14d
+ movl %r12d, %ecx
+ movl %r8d, %ebx
+ xorl %r13d, %ecx
+ rorl $14, %edx
+ andl %r11d, %ecx
+ xorl %r11d, %edx
+ xorl %r13d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r14d
+ xorl %r11d, %edx
+ xorl %r15d, %ebx
+ rorl $6, %edx
+ movl %r15d, %ecx
+ addl %edx, %r14d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ addl 88(%rsp), %r13d
+ movl %r11d, %ecx
+ movl %r15d, %eax
+ xorl %r12d, %ecx
+ rorl $14, %edx
+ andl %r10d, %ecx
+ xorl %r10d, %edx
+ xorl %r12d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r13d
+ xorl %r10d, %edx
+ xorl %r14d, %eax
+ rorl $6, %edx
+ movl %r14d, %ecx
+ addl %edx, %r13d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ addl 92(%rsp), %r12d
+ movl %r10d, %ecx
+ movl %r14d, %ebx
+ xorl %r11d, %ecx
+ rorl $14, %edx
+ andl %r9d, %ecx
+ xorl %r9d, %edx
+ xorl %r11d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r12d
+ xorl %r9d, %edx
+ xorl %r13d, %ebx
+ rorl $6, %edx
+ movl %r13d, %ecx
+ addl %edx, %r12d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ # rnd_all_4: 7-10
+ addl 112(%rsp), %r11d
+ movl %r9d, %ecx
+ movl %r13d, %eax
+ xorl %r10d, %ecx
+ rorl $14, %edx
+ andl %r8d, %ecx
+ xorl %r8d, %edx
+ xorl %r10d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r11d
+ xorl %r8d, %edx
+ xorl %r12d, %eax
+ rorl $6, %edx
+ movl %r12d, %ecx
+ addl %edx, %r11d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ addl 116(%rsp), %r10d
+ movl %r8d, %ecx
+ movl %r12d, %ebx
+ xorl %r9d, %ecx
+ rorl $14, %edx
+ andl %r15d, %ecx
+ xorl %r15d, %edx
+ xorl %r9d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r10d
+ xorl %r15d, %edx
+ xorl %r11d, %ebx
+ rorl $6, %edx
+ movl %r11d, %ecx
+ addl %edx, %r10d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ addl 120(%rsp), %r9d
+ movl %r15d, %ecx
+ movl %r11d, %eax
+ xorl %r8d, %ecx
+ rorl $14, %edx
+ andl %r14d, %ecx
+ xorl %r14d, %edx
+ xorl %r8d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r9d
+ xorl %r14d, %edx
+ xorl %r10d, %eax
+ rorl $6, %edx
+ movl %r10d, %ecx
+ addl %edx, %r9d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ addl 124(%rsp), %r8d
+ movl %r14d, %ecx
+ movl %r10d, %ebx
+ xorl %r15d, %ecx
+ rorl $14, %edx
+ andl %r13d, %ecx
+ xorl %r13d, %edx
+ xorl %r15d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r8d
+ xorl %r13d, %edx
+ xorl %r9d, %ebx
+ rorl $6, %edx
+ movl %r9d, %ecx
+ addl %edx, %r8d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ # rnd_all_4: 9-12
+ addl 144(%rsp), %r15d
+ movl %r13d, %ecx
+ movl %r9d, %eax
+ xorl %r14d, %ecx
+ rorl $14, %edx
+ andl %r12d, %ecx
+ xorl %r12d, %edx
+ xorl %r14d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r15d
+ xorl %r12d, %edx
+ xorl %r8d, %eax
+ rorl $6, %edx
+ movl %r8d, %ecx
+ addl %edx, %r15d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ addl 148(%rsp), %r14d
+ movl %r12d, %ecx
+ movl %r8d, %ebx
+ xorl %r13d, %ecx
+ rorl $14, %edx
+ andl %r11d, %ecx
+ xorl %r11d, %edx
+ xorl %r13d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r14d
+ xorl %r11d, %edx
+ xorl %r15d, %ebx
+ rorl $6, %edx
+ movl %r15d, %ecx
+ addl %edx, %r14d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ addl 152(%rsp), %r13d
+ movl %r11d, %ecx
+ movl %r15d, %eax
+ xorl %r12d, %ecx
+ rorl $14, %edx
+ andl %r10d, %ecx
+ xorl %r10d, %edx
+ xorl %r12d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r13d
+ xorl %r10d, %edx
+ xorl %r14d, %eax
+ rorl $6, %edx
+ movl %r14d, %ecx
+ addl %edx, %r13d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ addl 156(%rsp), %r12d
+ movl %r10d, %ecx
+ movl %r14d, %ebx
+ xorl %r11d, %ecx
+ rorl $14, %edx
+ andl %r9d, %ecx
+ xorl %r9d, %edx
+ xorl %r11d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r12d
+ xorl %r9d, %edx
+ xorl %r13d, %ebx
+ rorl $6, %edx
+ movl %r13d, %ecx
+ addl %edx, %r12d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ # rnd_all_4: 11-14
+ addl 176(%rsp), %r11d
+ movl %r9d, %ecx
+ movl %r13d, %eax
+ xorl %r10d, %ecx
+ rorl $14, %edx
+ andl %r8d, %ecx
+ xorl %r8d, %edx
+ xorl %r10d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r11d
+ xorl %r8d, %edx
+ xorl %r12d, %eax
+ rorl $6, %edx
+ movl %r12d, %ecx
+ addl %edx, %r11d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ addl 180(%rsp), %r10d
+ movl %r8d, %ecx
+ movl %r12d, %ebx
+ xorl %r9d, %ecx
+ rorl $14, %edx
+ andl %r15d, %ecx
+ xorl %r15d, %edx
+ xorl %r9d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r10d
+ xorl %r15d, %edx
+ xorl %r11d, %ebx
+ rorl $6, %edx
+ movl %r11d, %ecx
+ addl %edx, %r10d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ addl 184(%rsp), %r9d
+ movl %r15d, %ecx
+ movl %r11d, %eax
+ xorl %r8d, %ecx
+ rorl $14, %edx
+ andl %r14d, %ecx
+ xorl %r14d, %edx
+ xorl %r8d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r9d
+ xorl %r14d, %edx
+ xorl %r10d, %eax
+ rorl $6, %edx
+ movl %r10d, %ecx
+ addl %edx, %r9d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ addl 188(%rsp), %r8d
+ movl %r14d, %ecx
+ movl %r10d, %ebx
+ xorl %r15d, %ecx
+ rorl $14, %edx
+ andl %r13d, %ecx
+ xorl %r13d, %edx
+ xorl %r15d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r8d
+ xorl %r13d, %edx
+ xorl %r9d, %ebx
+ rorl $6, %edx
+ movl %r9d, %ecx
+ addl %edx, %r8d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ # rnd_all_4: 13-16
+ addl 208(%rsp), %r15d
+ movl %r13d, %ecx
+ movl %r9d, %eax
+ xorl %r14d, %ecx
+ rorl $14, %edx
+ andl %r12d, %ecx
+ xorl %r12d, %edx
+ xorl %r14d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r15d
+ xorl %r12d, %edx
+ xorl %r8d, %eax
+ rorl $6, %edx
+ movl %r8d, %ecx
+ addl %edx, %r15d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ addl 212(%rsp), %r14d
+ movl %r12d, %ecx
+ movl %r8d, %ebx
+ xorl %r13d, %ecx
+ rorl $14, %edx
+ andl %r11d, %ecx
+ xorl %r11d, %edx
+ xorl %r13d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r14d
+ xorl %r11d, %edx
+ xorl %r15d, %ebx
+ rorl $6, %edx
+ movl %r15d, %ecx
+ addl %edx, %r14d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ addl 216(%rsp), %r13d
+ movl %r11d, %ecx
+ movl %r15d, %eax
+ xorl %r12d, %ecx
+ rorl $14, %edx
+ andl %r10d, %ecx
+ xorl %r10d, %edx
+ xorl %r12d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r13d
+ xorl %r10d, %edx
+ xorl %r14d, %eax
+ rorl $6, %edx
+ movl %r14d, %ecx
+ addl %edx, %r13d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ addl 220(%rsp), %r12d
+ movl %r10d, %ecx
+ movl %r14d, %ebx
+ xorl %r11d, %ecx
+ rorl $14, %edx
+ andl %r9d, %ecx
+ xorl %r9d, %edx
+ xorl %r11d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r12d
+ xorl %r9d, %edx
+ xorl %r13d, %ebx
+ rorl $6, %edx
+ movl %r13d, %ecx
+ addl %edx, %r12d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ # rnd_all_4: 15-18
+ addl 240(%rsp), %r11d
+ movl %r9d, %ecx
+ movl %r13d, %eax
+ xorl %r10d, %ecx
+ rorl $14, %edx
+ andl %r8d, %ecx
+ xorl %r8d, %edx
+ xorl %r10d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r11d
+ xorl %r8d, %edx
+ xorl %r12d, %eax
+ rorl $6, %edx
+ movl %r12d, %ecx
+ addl %edx, %r11d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ addl 244(%rsp), %r10d
+ movl %r8d, %ecx
+ movl %r12d, %ebx
+ xorl %r9d, %ecx
+ rorl $14, %edx
+ andl %r15d, %ecx
+ xorl %r15d, %edx
+ xorl %r9d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r10d
+ xorl %r15d, %edx
+ xorl %r11d, %ebx
+ rorl $6, %edx
+ movl %r11d, %ecx
+ addl %edx, %r10d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ addl 248(%rsp), %r9d
+ movl %r15d, %ecx
+ movl %r11d, %eax
+ xorl %r8d, %ecx
+ rorl $14, %edx
+ andl %r14d, %ecx
+ xorl %r14d, %edx
+ xorl %r8d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r9d
+ xorl %r14d, %edx
+ xorl %r10d, %eax
+ rorl $6, %edx
+ movl %r10d, %ecx
+ addl %edx, %r9d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ addl 252(%rsp), %r8d
+ movl %r14d, %ecx
+ movl %r10d, %ebx
+ xorl %r15d, %ecx
+ rorl $14, %edx
+ andl %r13d, %ecx
+ xorl %r13d, %edx
+ xorl %r15d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r8d
+ xorl %r13d, %edx
+ xorl %r9d, %ebx
+ rorl $6, %edx
+ movl %r9d, %ecx
+ addl %edx, %r8d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ # rnd_all_4: 17-20
+ addl 272(%rsp), %r15d
+ movl %r13d, %ecx
+ movl %r9d, %eax
+ xorl %r14d, %ecx
+ rorl $14, %edx
+ andl %r12d, %ecx
+ xorl %r12d, %edx
+ xorl %r14d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r15d
+ xorl %r12d, %edx
+ xorl %r8d, %eax
+ rorl $6, %edx
+ movl %r8d, %ecx
+ addl %edx, %r15d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ addl 276(%rsp), %r14d
+ movl %r12d, %ecx
+ movl %r8d, %ebx
+ xorl %r13d, %ecx
+ rorl $14, %edx
+ andl %r11d, %ecx
+ xorl %r11d, %edx
+ xorl %r13d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r14d
+ xorl %r11d, %edx
+ xorl %r15d, %ebx
+ rorl $6, %edx
+ movl %r15d, %ecx
+ addl %edx, %r14d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ addl 280(%rsp), %r13d
+ movl %r11d, %ecx
+ movl %r15d, %eax
+ xorl %r12d, %ecx
+ rorl $14, %edx
+ andl %r10d, %ecx
+ xorl %r10d, %edx
+ xorl %r12d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r13d
+ xorl %r10d, %edx
+ xorl %r14d, %eax
+ rorl $6, %edx
+ movl %r14d, %ecx
+ addl %edx, %r13d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ addl 284(%rsp), %r12d
+ movl %r10d, %ecx
+ movl %r14d, %ebx
+ xorl %r11d, %ecx
+ rorl $14, %edx
+ andl %r9d, %ecx
+ xorl %r9d, %edx
+ xorl %r11d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r12d
+ xorl %r9d, %edx
+ xorl %r13d, %ebx
+ rorl $6, %edx
+ movl %r13d, %ecx
+ addl %edx, %r12d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ # rnd_all_4: 19-22
+ addl 304(%rsp), %r11d
+ movl %r9d, %ecx
+ movl %r13d, %eax
+ xorl %r10d, %ecx
+ rorl $14, %edx
+ andl %r8d, %ecx
+ xorl %r8d, %edx
+ xorl %r10d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r11d
+ xorl %r8d, %edx
+ xorl %r12d, %eax
+ rorl $6, %edx
+ movl %r12d, %ecx
+ addl %edx, %r11d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ addl 308(%rsp), %r10d
+ movl %r8d, %ecx
+ movl %r12d, %ebx
+ xorl %r9d, %ecx
+ rorl $14, %edx
+ andl %r15d, %ecx
+ xorl %r15d, %edx
+ xorl %r9d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r10d
+ xorl %r15d, %edx
+ xorl %r11d, %ebx
+ rorl $6, %edx
+ movl %r11d, %ecx
+ addl %edx, %r10d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ addl 312(%rsp), %r9d
+ movl %r15d, %ecx
+ movl %r11d, %eax
+ xorl %r8d, %ecx
+ rorl $14, %edx
+ andl %r14d, %ecx
+ xorl %r14d, %edx
+ xorl %r8d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r9d
+ xorl %r14d, %edx
+ xorl %r10d, %eax
+ rorl $6, %edx
+ movl %r10d, %ecx
+ addl %edx, %r9d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ addl 316(%rsp), %r8d
+ movl %r14d, %ecx
+ movl %r10d, %ebx
+ xorl %r15d, %ecx
+ rorl $14, %edx
+ andl %r13d, %ecx
+ xorl %r13d, %edx
+ xorl %r15d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r8d
+ xorl %r13d, %edx
+ xorl %r9d, %ebx
+ rorl $6, %edx
+ movl %r9d, %ecx
+ addl %edx, %r8d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ # rnd_all_4: 21-24
+ addl 336(%rsp), %r15d
+ movl %r13d, %ecx
+ movl %r9d, %eax
+ xorl %r14d, %ecx
+ rorl $14, %edx
+ andl %r12d, %ecx
+ xorl %r12d, %edx
+ xorl %r14d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r15d
+ xorl %r12d, %edx
+ xorl %r8d, %eax
+ rorl $6, %edx
+ movl %r8d, %ecx
+ addl %edx, %r15d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ addl 340(%rsp), %r14d
+ movl %r12d, %ecx
+ movl %r8d, %ebx
+ xorl %r13d, %ecx
+ rorl $14, %edx
+ andl %r11d, %ecx
+ xorl %r11d, %edx
+ xorl %r13d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r14d
+ xorl %r11d, %edx
+ xorl %r15d, %ebx
+ rorl $6, %edx
+ movl %r15d, %ecx
+ addl %edx, %r14d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ addl 344(%rsp), %r13d
+ movl %r11d, %ecx
+ movl %r15d, %eax
+ xorl %r12d, %ecx
+ rorl $14, %edx
+ andl %r10d, %ecx
+ xorl %r10d, %edx
+ xorl %r12d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r13d
+ xorl %r10d, %edx
+ xorl %r14d, %eax
+ rorl $6, %edx
+ movl %r14d, %ecx
+ addl %edx, %r13d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ addl 348(%rsp), %r12d
+ movl %r10d, %ecx
+ movl %r14d, %ebx
+ xorl %r11d, %ecx
+ rorl $14, %edx
+ andl %r9d, %ecx
+ xorl %r9d, %edx
+ xorl %r11d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r12d
+ xorl %r9d, %edx
+ xorl %r13d, %ebx
+ rorl $6, %edx
+ movl %r13d, %ecx
+ addl %edx, %r12d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ # rnd_all_4: 23-26
+ addl 368(%rsp), %r11d
+ movl %r9d, %ecx
+ movl %r13d, %eax
+ xorl %r10d, %ecx
+ rorl $14, %edx
+ andl %r8d, %ecx
+ xorl %r8d, %edx
+ xorl %r10d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r11d
+ xorl %r8d, %edx
+ xorl %r12d, %eax
+ rorl $6, %edx
+ movl %r12d, %ecx
+ addl %edx, %r11d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ addl 372(%rsp), %r10d
+ movl %r8d, %ecx
+ movl %r12d, %ebx
+ xorl %r9d, %ecx
+ rorl $14, %edx
+ andl %r15d, %ecx
+ xorl %r15d, %edx
+ xorl %r9d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r10d
+ xorl %r15d, %edx
+ xorl %r11d, %ebx
+ rorl $6, %edx
+ movl %r11d, %ecx
+ addl %edx, %r10d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ addl 376(%rsp), %r9d
+ movl %r15d, %ecx
+ movl %r11d, %eax
+ xorl %r8d, %ecx
+ rorl $14, %edx
+ andl %r14d, %ecx
+ xorl %r14d, %edx
+ xorl %r8d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r9d
+ xorl %r14d, %edx
+ xorl %r10d, %eax
+ rorl $6, %edx
+ movl %r10d, %ecx
+ addl %edx, %r9d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ addl 380(%rsp), %r8d
+ movl %r14d, %ecx
+ movl %r10d, %ebx
+ xorl %r15d, %ecx
+ rorl $14, %edx
+ andl %r13d, %ecx
+ xorl %r13d, %edx
+ xorl %r15d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r8d
+ xorl %r13d, %edx
+ xorl %r9d, %ebx
+ rorl $6, %edx
+ movl %r9d, %ecx
+ addl %edx, %r8d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ # rnd_all_4: 25-28
+ addl 400(%rsp), %r15d
+ movl %r13d, %ecx
+ movl %r9d, %eax
+ xorl %r14d, %ecx
+ rorl $14, %edx
+ andl %r12d, %ecx
+ xorl %r12d, %edx
+ xorl %r14d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r15d
+ xorl %r12d, %edx
+ xorl %r8d, %eax
+ rorl $6, %edx
+ movl %r8d, %ecx
+ addl %edx, %r15d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ addl 404(%rsp), %r14d
+ movl %r12d, %ecx
+ movl %r8d, %ebx
+ xorl %r13d, %ecx
+ rorl $14, %edx
+ andl %r11d, %ecx
+ xorl %r11d, %edx
+ xorl %r13d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r14d
+ xorl %r11d, %edx
+ xorl %r15d, %ebx
+ rorl $6, %edx
+ movl %r15d, %ecx
+ addl %edx, %r14d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ addl 408(%rsp), %r13d
+ movl %r11d, %ecx
+ movl %r15d, %eax
+ xorl %r12d, %ecx
+ rorl $14, %edx
+ andl %r10d, %ecx
+ xorl %r10d, %edx
+ xorl %r12d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r13d
+ xorl %r10d, %edx
+ xorl %r14d, %eax
+ rorl $6, %edx
+ movl %r14d, %ecx
+ addl %edx, %r13d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ addl 412(%rsp), %r12d
+ movl %r10d, %ecx
+ movl %r14d, %ebx
+ xorl %r11d, %ecx
+ rorl $14, %edx
+ andl %r9d, %ecx
+ xorl %r9d, %edx
+ xorl %r11d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r12d
+ xorl %r9d, %edx
+ xorl %r13d, %ebx
+ rorl $6, %edx
+ movl %r13d, %ecx
+ addl %edx, %r12d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ # rnd_all_4: 27-30
+ addl 432(%rsp), %r11d
+ movl %r9d, %ecx
+ movl %r13d, %eax
+ xorl %r10d, %ecx
+ rorl $14, %edx
+ andl %r8d, %ecx
+ xorl %r8d, %edx
+ xorl %r10d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r11d
+ xorl %r8d, %edx
+ xorl %r12d, %eax
+ rorl $6, %edx
+ movl %r12d, %ecx
+ addl %edx, %r11d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ addl 436(%rsp), %r10d
+ movl %r8d, %ecx
+ movl %r12d, %ebx
+ xorl %r9d, %ecx
+ rorl $14, %edx
+ andl %r15d, %ecx
+ xorl %r15d, %edx
+ xorl %r9d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r10d
+ xorl %r15d, %edx
+ xorl %r11d, %ebx
+ rorl $6, %edx
+ movl %r11d, %ecx
+ addl %edx, %r10d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ addl 440(%rsp), %r9d
+ movl %r15d, %ecx
+ movl %r11d, %eax
+ xorl %r8d, %ecx
+ rorl $14, %edx
+ andl %r14d, %ecx
+ xorl %r14d, %edx
+ xorl %r8d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r9d
+ xorl %r14d, %edx
+ xorl %r10d, %eax
+ rorl $6, %edx
+ movl %r10d, %ecx
+ addl %edx, %r9d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ addl 444(%rsp), %r8d
+ movl %r14d, %ecx
+ movl %r10d, %ebx
+ xorl %r15d, %ecx
+ rorl $14, %edx
+ andl %r13d, %ecx
+ xorl %r13d, %edx
+ xorl %r15d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r8d
+ xorl %r13d, %edx
+ xorl %r9d, %ebx
+ rorl $6, %edx
+ movl %r9d, %ecx
+ addl %edx, %r8d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ # rnd_all_4: 29-32
+ addl 464(%rsp), %r15d
+ movl %r13d, %ecx
+ movl %r9d, %eax
+ xorl %r14d, %ecx
+ rorl $14, %edx
+ andl %r12d, %ecx
+ xorl %r12d, %edx
+ xorl %r14d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r15d
+ xorl %r12d, %edx
+ xorl %r8d, %eax
+ rorl $6, %edx
+ movl %r8d, %ecx
+ addl %edx, %r15d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r8d, %ecx
+ xorl %r9d, %ebx
+ rorl $11, %ecx
+ addl %r15d, %r11d
+ xorl %r8d, %ecx
+ addl %ebx, %r15d
+ rorl $2, %ecx
+ movl %r11d, %edx
+ addl %ecx, %r15d
+ addl 468(%rsp), %r14d
+ movl %r12d, %ecx
+ movl %r8d, %ebx
+ xorl %r13d, %ecx
+ rorl $14, %edx
+ andl %r11d, %ecx
+ xorl %r11d, %edx
+ xorl %r13d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r14d
+ xorl %r11d, %edx
+ xorl %r15d, %ebx
+ rorl $6, %edx
+ movl %r15d, %ecx
+ addl %edx, %r14d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r15d, %ecx
+ xorl %r8d, %eax
+ rorl $11, %ecx
+ addl %r14d, %r10d
+ xorl %r15d, %ecx
+ addl %eax, %r14d
+ rorl $2, %ecx
+ movl %r10d, %edx
+ addl %ecx, %r14d
+ addl 472(%rsp), %r13d
+ movl %r11d, %ecx
+ movl %r15d, %eax
+ xorl %r12d, %ecx
+ rorl $14, %edx
+ andl %r10d, %ecx
+ xorl %r10d, %edx
+ xorl %r12d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r13d
+ xorl %r10d, %edx
+ xorl %r14d, %eax
+ rorl $6, %edx
+ movl %r14d, %ecx
+ addl %edx, %r13d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r14d, %ecx
+ xorl %r15d, %ebx
+ rorl $11, %ecx
+ addl %r13d, %r9d
+ xorl %r14d, %ecx
+ addl %ebx, %r13d
+ rorl $2, %ecx
+ movl %r9d, %edx
+ addl %ecx, %r13d
+ addl 476(%rsp), %r12d
+ movl %r10d, %ecx
+ movl %r14d, %ebx
+ xorl %r11d, %ecx
+ rorl $14, %edx
+ andl %r9d, %ecx
+ xorl %r9d, %edx
+ xorl %r11d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r12d
+ xorl %r9d, %edx
+ xorl %r13d, %ebx
+ rorl $6, %edx
+ movl %r13d, %ecx
+ addl %edx, %r12d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r13d, %ecx
+ xorl %r14d, %eax
+ rorl $11, %ecx
+ addl %r12d, %r8d
+ xorl %r13d, %ecx
+ addl %eax, %r12d
+ rorl $2, %ecx
+ movl %r8d, %edx
+ addl %ecx, %r12d
+ # rnd_all_4: 31-34
+ addl 496(%rsp), %r11d
+ movl %r9d, %ecx
+ movl %r13d, %eax
+ xorl %r10d, %ecx
+ rorl $14, %edx
+ andl %r8d, %ecx
+ xorl %r8d, %edx
+ xorl %r10d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r11d
+ xorl %r8d, %edx
+ xorl %r12d, %eax
+ rorl $6, %edx
+ movl %r12d, %ecx
+ addl %edx, %r11d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r12d, %ecx
+ xorl %r13d, %ebx
+ rorl $11, %ecx
+ addl %r11d, %r15d
+ xorl %r12d, %ecx
+ addl %ebx, %r11d
+ rorl $2, %ecx
+ movl %r15d, %edx
+ addl %ecx, %r11d
+ addl 500(%rsp), %r10d
+ movl %r8d, %ecx
+ movl %r12d, %ebx
+ xorl %r9d, %ecx
+ rorl $14, %edx
+ andl %r15d, %ecx
+ xorl %r15d, %edx
+ xorl %r9d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r10d
+ xorl %r15d, %edx
+ xorl %r11d, %ebx
+ rorl $6, %edx
+ movl %r11d, %ecx
+ addl %edx, %r10d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r11d, %ecx
+ xorl %r12d, %eax
+ rorl $11, %ecx
+ addl %r10d, %r14d
+ xorl %r11d, %ecx
+ addl %eax, %r10d
+ rorl $2, %ecx
+ movl %r14d, %edx
+ addl %ecx, %r10d
+ addl 504(%rsp), %r9d
+ movl %r15d, %ecx
+ movl %r11d, %eax
+ xorl %r8d, %ecx
+ rorl $14, %edx
+ andl %r14d, %ecx
+ xorl %r14d, %edx
+ xorl %r8d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r9d
+ xorl %r14d, %edx
+ xorl %r10d, %eax
+ rorl $6, %edx
+ movl %r10d, %ecx
+ addl %edx, %r9d
+ rorl $9, %ecx
+ andl %eax, %ebx
+ xorl %r10d, %ecx
+ xorl %r11d, %ebx
+ rorl $11, %ecx
+ addl %r9d, %r13d
+ xorl %r10d, %ecx
+ addl %ebx, %r9d
+ rorl $2, %ecx
+ movl %r13d, %edx
+ addl %ecx, %r9d
+ addl 508(%rsp), %r8d
+ movl %r14d, %ecx
+ movl %r10d, %ebx
+ xorl %r15d, %ecx
+ rorl $14, %edx
+ andl %r13d, %ecx
+ xorl %r13d, %edx
+ xorl %r15d, %ecx
+ rorl $5, %edx
+ addl %ecx, %r8d
+ xorl %r13d, %edx
+ xorl %r9d, %ebx
+ rorl $6, %edx
+ movl %r9d, %ecx
+ addl %edx, %r8d
+ rorl $9, %ecx
+ andl %ebx, %eax
+ xorl %r9d, %ecx
+ xorl %r10d, %eax
+ rorl $11, %ecx
+ addl %r8d, %r12d
+ xorl %r9d, %ecx
+ addl %eax, %r8d
+ rorl $2, %ecx
+ movl %r12d, %edx
+ addl %ecx, %r8d
+ addl (%rdi), %r8d
+ addl 4(%rdi), %r9d
+ addl 8(%rdi), %r10d
+ addl 12(%rdi), %r11d
+ addl 16(%rdi), %r12d
+ addl 20(%rdi), %r13d
+ addl 24(%rdi), %r14d
+ addl 28(%rdi), %r15d
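+ # Two more blocks consumed: advance the data pointer (%rbp) by 128 bytes,
+ # subtract 128 from the remaining length, store the state and loop while
+ # input remains.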
+ addq $0x80, %rbp
+ subl $0x80, %esi
+ movl %r8d, (%rdi)
+ movl %r9d, 4(%rdi)
+ movl %r10d, 8(%rdi)
+ movl %r11d, 12(%rdi)
+ movl %r12d, 16(%rdi)
+ movl %r13d, 20(%rdi)
+ movl %r14d, 24(%rdi)
+ movl %r15d, 28(%rdi)
+ jnz L_sha256_len_avx2_start
+L_sha256_len_avx2_done:
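+ # All input processed: return 0, clear the upper ymm state with vzeroupper to
+ # avoid AVX/SSE transition penalties, then unwind the stack frame and restore
+ # the callee-saved registers.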
+ xorq %rax, %rax
+ vzeroupper
+ addq $0x200, %rsp
+ popq %rbp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size Transform_Sha256_AVX2_Len,.-Transform_Sha256_AVX2_Len
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
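+# Round constants for the RORX path: each group of four K words is stored
+# twice so both 128-bit lanes of a ymm register see the same constants.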
+L_avx2_rorx_sha256_k:
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0xfc19dc6,0x240ca1cc
+.long 0xe49b69c1,0xefbe4786,0xfc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x6ca6351,0x14292967
+.long 0xc6e00bf3,0xd5a79147,0x6ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
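+# Byte-swap mask: vpshufb with this pattern reverses the bytes of each 32-bit
+# word, turning little-endian loads into big-endian SHA-256 message words.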
+L_avx2_rorx_sha256_flip_mask:
+.quad 0x405060700010203, 0xc0d0e0f08090a0b
+.quad 0x405060700010203, 0xc0d0e0f08090a0b
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
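+# Packing masks: 00BA gathers the freshly computed schedule words of each lane
+# into the low dwords of the new W vector, DC00 into the high dwords.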
+L_avx2_rorx_sha256_shuf_00BA:
+.quad 0xb0a090803020100, 0xffffffffffffffff
+.quad 0xb0a090803020100, 0xffffffffffffffff
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_avx2_rorx_sha256_shuf_DC00:
+.quad 0xffffffffffffffff, 0xb0a090803020100
+.quad 0xffffffffffffffff, 0xb0a090803020100
+#ifndef __APPLE__
+.text
+.globl Transform_Sha256_AVX2_RORX
+.type Transform_Sha256_AVX2_RORX,@function
+.align 4
+Transform_Sha256_AVX2_RORX:
+#else
+.section __TEXT,__text
+.globl _Transform_Sha256_AVX2_RORX
+.p2align 2
+_Transform_Sha256_AVX2_RORX:
+#endif /* __APPLE__ */
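+ # RORX variant: the Sigma rotations use BMI2 rorx, which does not write
+ # EFLAGS, so the rotates can be scheduled freely around the adds and xors of
+ # each round.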
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $0x200, %rsp
+ leaq 32(%rdi), %rax
+ vmovdqa L_avx2_rorx_sha256_flip_mask(%rip), %xmm13
+ vmovdqa L_avx2_rorx_sha256_shuf_00BA(%rip), %ymm11
+ vmovdqa L_avx2_rorx_sha256_shuf_DC00(%rip), %ymm12
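+ # %rax points at the 64-byte message block stored right after the 32-byte
+ # digest in the context; the byte-swap and packing masks stay resident in
+ # xmm13, ymm11 and ymm12.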
+ # X0, X1, X2, X3 = W[0..15]
+ vmovdqu (%rax), %xmm0
+ vmovdqu 16(%rax), %xmm1
+ vpshufb %xmm13, %xmm0, %xmm0
+ vpshufb %xmm13, %xmm1, %xmm1
+ vpaddd 0+L_avx2_rorx_sha256_k(%rip), %ymm0, %ymm4
+ vpaddd 32+L_avx2_rorx_sha256_k(%rip), %ymm1, %ymm5
+ vmovdqu %ymm4, (%rsp)
+ vmovdqu %ymm5, 32(%rsp)
+ vmovdqu 32(%rax), %xmm2
+ vmovdqu 48(%rax), %xmm3
+ vpshufb %xmm13, %xmm2, %xmm2
+ vpshufb %xmm13, %xmm3, %xmm3
+ vpaddd 64+L_avx2_rorx_sha256_k(%rip), %ymm2, %ymm4
+ vpaddd 96+L_avx2_rorx_sha256_k(%rip), %ymm3, %ymm5
+ vmovdqu %ymm4, 64(%rsp)
+ vmovdqu %ymm5, 96(%rsp)
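+ # W[0..15] are byte-swapped and W+K for the first 16 rounds has been stored
+ # to the stack.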
+ movl (%rdi), %r8d
+ movl 4(%rdi), %r9d
+ movl 8(%rdi), %r10d
+ movl 12(%rdi), %r11d
+ movl 16(%rdi), %r12d
+ movl 20(%rdi), %r13d
+ movl 24(%rdi), %r14d
+ movl 28(%rdi), %r15d
+ movl %r9d, %ebx
+ rorxl $6, %r12d, %edx
+ xorl %r10d, %ebx
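+ # The rounds reuse the previous round's a^b as this round's b^c, so
+ # Maj(a,b,c) = ((a^b) & (b^c)) ^ b costs just one and plus one xor per round.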
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl (%rsp), %r15d
+ vpalignr $4, %ymm0, %ymm1, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ vpalignr $4, %ymm2, %ymm3, %ymm4
+ # rnd_0: 2 - 2
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 4(%rsp), %r14d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpshufd $0xfa, %ymm3, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r14d, %r10d
+ movl %r8d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ vpaddd %ymm0, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 8(%rsp), %r13d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 12(%rsp), %r12d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm0
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r12d, %r8d
+ movl %r14d, %ebx
+ vpaddd 128+L_avx2_rorx_sha256_k(%rip), %ymm0, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vmovdqu %ymm4, 128(%rsp)
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 32(%rsp), %r11d
+ vpalignr $4, %ymm1, %ymm2, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ vpalignr $4, %ymm3, %ymm0, %ymm4
+ # rnd_0: 2 - 2
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 36(%rsp), %r10d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpshufd $0xfa, %ymm0, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r10d, %r14d
+ movl %r12d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ vpaddd %ymm1, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 40(%rsp), %r9d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 44(%rsp), %r8d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm1
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r8d, %r12d
+ movl %r10d, %ebx
+ vpaddd 160+L_avx2_rorx_sha256_k(%rip), %ymm1, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vmovdqu %ymm4, 160(%rsp)
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl 64(%rsp), %r15d
+ vpalignr $4, %ymm2, %ymm3, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ vpalignr $4, %ymm0, %ymm1, %ymm4
+ # rnd_0: 2 - 2
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 68(%rsp), %r14d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpshufd $0xfa, %ymm1, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r14d, %r10d
+ movl %r8d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ vpaddd %ymm2, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 72(%rsp), %r13d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 76(%rsp), %r12d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm2
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r12d, %r8d
+ movl %r14d, %ebx
+ vpaddd 192+L_avx2_rorx_sha256_k(%rip), %ymm2, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vmovdqu %ymm4, 192(%rsp)
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 96(%rsp), %r11d
+ vpalignr $4, %ymm3, %ymm0, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ vpalignr $4, %ymm1, %ymm2, %ymm4
+ # rnd_0: 2 - 2
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 100(%rsp), %r10d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpshufd $0xfa, %ymm2, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r10d, %r14d
+ movl %r12d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ vpaddd %ymm3, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 104(%rsp), %r9d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 108(%rsp), %r8d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm3
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r8d, %r12d
+ movl %r10d, %ebx
+ vpaddd 224+L_avx2_rorx_sha256_k(%rip), %ymm3, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vmovdqu %ymm4, 224(%rsp)
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl 128(%rsp), %r15d
+ vpalignr $4, %ymm0, %ymm1, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ vpalignr $4, %ymm2, %ymm3, %ymm4
+ # rnd_0: 2 - 2
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 132(%rsp), %r14d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpshufd $0xfa, %ymm3, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r14d, %r10d
+ movl %r8d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ vpaddd %ymm0, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 136(%rsp), %r13d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 140(%rsp), %r12d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm0
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r12d, %r8d
+ movl %r14d, %ebx
+ vpaddd 256+L_avx2_rorx_sha256_k(%rip), %ymm0, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vmovdqu %ymm4, 256(%rsp)
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 160(%rsp), %r11d
+ vpalignr $4, %ymm1, %ymm2, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ vpalignr $4, %ymm3, %ymm0, %ymm4
+ # rnd_0: 2 - 2
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 164(%rsp), %r10d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpshufd $0xfa, %ymm0, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r10d, %r14d
+ movl %r12d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ vpaddd %ymm1, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 168(%rsp), %r9d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 172(%rsp), %r8d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm1
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r8d, %r12d
+ movl %r10d, %ebx
+ vpaddd 288+L_avx2_rorx_sha256_k(%rip), %ymm1, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vmovdqu %ymm4, 288(%rsp)
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl 192(%rsp), %r15d
+ vpalignr $4, %ymm2, %ymm3, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ vpalignr $4, %ymm0, %ymm1, %ymm4
+ # rnd_0: 2 - 2
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 196(%rsp), %r14d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpshufd $0xfa, %ymm1, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r14d, %r10d
+ movl %r8d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ vpaddd %ymm2, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 200(%rsp), %r13d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 204(%rsp), %r12d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm2
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r12d, %r8d
+ movl %r14d, %ebx
+ vpaddd 320+L_avx2_rorx_sha256_k(%rip), %ymm2, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vmovdqu %ymm4, 320(%rsp)
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 224(%rsp), %r11d
+ vpalignr $4, %ymm3, %ymm0, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ vpalignr $4, %ymm1, %ymm2, %ymm4
+ # rnd_0: 2 - 2
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 228(%rsp), %r10d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpshufd $0xfa, %ymm2, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r10d, %r14d
+ movl %r12d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ vpaddd %ymm3, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 232(%rsp), %r9d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 236(%rsp), %r8d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm3
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r8d, %r12d
+ movl %r10d, %ebx
+ vpaddd 352+L_avx2_rorx_sha256_k(%rip), %ymm3, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vmovdqu %ymm4, 352(%rsp)
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl 256(%rsp), %r15d
+ vpalignr $4, %ymm0, %ymm1, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ vpalignr $4, %ymm2, %ymm3, %ymm4
+ # rnd_0: 2 - 2
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 260(%rsp), %r14d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpshufd $0xfa, %ymm3, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r14d, %r10d
+ movl %r8d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ vpaddd %ymm0, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 264(%rsp), %r13d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 268(%rsp), %r12d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm0
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r12d, %r8d
+ movl %r14d, %ebx
+ vpaddd 384+L_avx2_rorx_sha256_k(%rip), %ymm0, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vmovdqu %ymm4, 384(%rsp)
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 288(%rsp), %r11d
+ vpalignr $4, %ymm1, %ymm2, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ vpalignr $4, %ymm3, %ymm0, %ymm4
+ # rnd_0: 2 - 2
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 292(%rsp), %r10d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpshufd $0xfa, %ymm0, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r10d, %r14d
+ movl %r12d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ vpaddd %ymm1, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 296(%rsp), %r9d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 300(%rsp), %r8d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm1
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r8d, %r12d
+ movl %r10d, %ebx
+ vpaddd 416+L_avx2_rorx_sha256_k(%rip), %ymm1, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vmovdqu %ymm4, 416(%rsp)
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl 320(%rsp), %r15d
+ vpalignr $4, %ymm2, %ymm3, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ vpalignr $4, %ymm0, %ymm1, %ymm4
+ # rnd_0: 2 - 2
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 324(%rsp), %r14d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpshufd $0xfa, %ymm1, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r14d, %r10d
+ movl %r8d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ vpaddd %ymm2, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 328(%rsp), %r13d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 332(%rsp), %r12d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm2
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r12d, %r8d
+ movl %r14d, %ebx
+ vpaddd 448+L_avx2_rorx_sha256_k(%rip), %ymm2, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vmovdqu %ymm4, 448(%rsp)
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 352(%rsp), %r11d
+ vpalignr $4, %ymm3, %ymm0, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ vpalignr $4, %ymm1, %ymm2, %ymm4
+ # rnd_0: 2 - 2
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 356(%rsp), %r10d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpshufd $0xfa, %ymm2, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r10d, %r14d
+ movl %r12d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ vpaddd %ymm3, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 360(%rsp), %r9d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 364(%rsp), %r8d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm3
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r8d, %r12d
+ movl %r10d, %ebx
+ vpaddd 480+L_avx2_rorx_sha256_k(%rip), %ymm3, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vmovdqu %ymm4, 480(%rsp)
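+ # Message expansion is finished; the remaining 16 rounds below are purely
+ # scalar. Each round builds Sigma1(e) from rorx 6/11/25, Sigma0(a) from
+ # rorx 2/13/22, Ch(e,f,g) as ((f ^ g) & e) ^ g, and Maj(a,b,c) as
+ # b ^ ((a ^ b) & (b ^ c)), carrying the previous round's a ^ b in eax/ebx.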
+ xorl %eax, %eax
+ xorl %ecx, %ecx
+ rorxl $6, %r12d, %edx
+ rorxl $11, %r12d, %ecx
+ leal (%r8,%rax,1), %r8d
+ addl 384(%rsp), %r15d
+ movl %r13d, %eax
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ xorl %ecx, %edx
+ andl %r12d, %eax
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ rorxl $13, %r8d, %ecx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ andl %eax, %ebx
+ addl %edx, %r15d
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ rorxl $11, %r11d, %ecx
+ addl %ebx, %r15d
+ addl 388(%rsp), %r14d
+ movl %r12d, %ebx
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ xorl %ecx, %edx
+ andl %r11d, %ebx
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ rorxl $13, %r15d, %ecx
+ xorl %r13d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ leal (%r10,%r14,1), %r10d
+ xorl %r15d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r14d
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ rorxl $11, %r10d, %ecx
+ leal (%r14,%rax,1), %r14d
+ addl 392(%rsp), %r13d
+ movl %r11d, %eax
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ xorl %ecx, %edx
+ andl %r10d, %eax
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ rorxl $13, %r14d, %ecx
+ xorl %r12d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ xorl %r14d, %eax
+ andl %eax, %ebx
+ addl %edx, %r13d
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ rorxl $11, %r9d, %ecx
+ addl %ebx, %r13d
+ addl 396(%rsp), %r12d
+ movl %r10d, %ebx
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ xorl %ecx, %edx
+ andl %r9d, %ebx
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ rorxl $13, %r13d, %ecx
+ xorl %r11d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ leal (%r8,%r12,1), %r8d
+ xorl %r13d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r12d
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ rorxl $11, %r8d, %ecx
+ leal (%r12,%rax,1), %r12d
+ addl 416(%rsp), %r11d
+ movl %r9d, %eax
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ xorl %ecx, %edx
+ andl %r8d, %eax
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ rorxl $13, %r12d, %ecx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ andl %eax, %ebx
+ addl %edx, %r11d
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ rorxl $11, %r15d, %ecx
+ addl %ebx, %r11d
+ addl 420(%rsp), %r10d
+ movl %r8d, %ebx
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ xorl %ecx, %edx
+ andl %r15d, %ebx
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ rorxl $13, %r11d, %ecx
+ xorl %r9d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ leal (%r14,%r10,1), %r14d
+ xorl %r11d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r10d
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ rorxl $11, %r14d, %ecx
+ leal (%r10,%rax,1), %r10d
+ addl 424(%rsp), %r9d
+ movl %r15d, %eax
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ xorl %ecx, %edx
+ andl %r14d, %eax
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ rorxl $13, %r10d, %ecx
+ xorl %r8d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ xorl %r10d, %eax
+ andl %eax, %ebx
+ addl %edx, %r9d
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ rorxl $11, %r13d, %ecx
+ addl %ebx, %r9d
+ addl 428(%rsp), %r8d
+ movl %r14d, %ebx
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ xorl %ecx, %edx
+ andl %r13d, %ebx
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ rorxl $13, %r9d, %ecx
+ xorl %r15d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ leal (%r12,%r8,1), %r12d
+ xorl %r9d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r8d
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ rorxl $11, %r12d, %ecx
+ leal (%r8,%rax,1), %r8d
+ addl 448(%rsp), %r15d
+ movl %r13d, %eax
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ xorl %ecx, %edx
+ andl %r12d, %eax
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ rorxl $13, %r8d, %ecx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ andl %eax, %ebx
+ addl %edx, %r15d
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ rorxl $11, %r11d, %ecx
+ addl %ebx, %r15d
+ addl 452(%rsp), %r14d
+ movl %r12d, %ebx
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ xorl %ecx, %edx
+ andl %r11d, %ebx
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ rorxl $13, %r15d, %ecx
+ xorl %r13d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ leal (%r10,%r14,1), %r10d
+ xorl %r15d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r14d
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ rorxl $11, %r10d, %ecx
+ leal (%r14,%rax,1), %r14d
+ addl 456(%rsp), %r13d
+ movl %r11d, %eax
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ xorl %ecx, %edx
+ andl %r10d, %eax
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ rorxl $13, %r14d, %ecx
+ xorl %r12d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ xorl %r14d, %eax
+ andl %eax, %ebx
+ addl %edx, %r13d
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ rorxl $11, %r9d, %ecx
+ addl %ebx, %r13d
+ addl 460(%rsp), %r12d
+ movl %r10d, %ebx
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ xorl %ecx, %edx
+ andl %r9d, %ebx
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ rorxl $13, %r13d, %ecx
+ xorl %r11d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ leal (%r8,%r12,1), %r8d
+ xorl %r13d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r12d
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ rorxl $11, %r8d, %ecx
+ leal (%r12,%rax,1), %r12d
+ addl 480(%rsp), %r11d
+ movl %r9d, %eax
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ xorl %ecx, %edx
+ andl %r8d, %eax
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ rorxl $13, %r12d, %ecx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ andl %eax, %ebx
+ addl %edx, %r11d
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ rorxl $11, %r15d, %ecx
+ addl %ebx, %r11d
+ addl 484(%rsp), %r10d
+ movl %r8d, %ebx
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ xorl %ecx, %edx
+ andl %r15d, %ebx
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ rorxl $13, %r11d, %ecx
+ xorl %r9d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ leal (%r14,%r10,1), %r14d
+ xorl %r11d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r10d
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ rorxl $11, %r14d, %ecx
+ leal (%r10,%rax,1), %r10d
+ addl 488(%rsp), %r9d
+ movl %r15d, %eax
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ xorl %ecx, %edx
+ andl %r14d, %eax
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ rorxl $13, %r10d, %ecx
+ xorl %r8d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ xorl %r10d, %eax
+ andl %eax, %ebx
+ addl %edx, %r9d
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ rorxl $11, %r13d, %ecx
+ addl %ebx, %r9d
+ addl 492(%rsp), %r8d
+ movl %r14d, %ebx
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ xorl %ecx, %edx
+ andl %r13d, %ebx
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ rorxl $13, %r9d, %ecx
+ xorl %r15d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ leal (%r12,%r8,1), %r12d
+ xorl %r9d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r8d
+ xorl %r10d, %eax
+ addl %eax, %r8d
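+ # Fold the working variables a..h (r8d..r15d) back into the hash state in
+ # the context pointed to by rdi.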
+ addl %r8d, (%rdi)
+ addl %r9d, 4(%rdi)
+ addl %r10d, 8(%rdi)
+ addl %r11d, 12(%rdi)
+ addl %r12d, 16(%rdi)
+ addl %r13d, 20(%rdi)
+ addl %r14d, 24(%rdi)
+ addl %r15d, 28(%rdi)
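+ # Return 0, clear the upper YMM state to avoid AVX/SSE transition penalties,
+ # release the 0x200-byte schedule area and restore the callee-saved registers.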
+ xorq %rax, %rax
+ vzeroupper
+ addq $0x200, %rsp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size Transform_Sha256_AVX2_RORX,.-Transform_Sha256_AVX2_RORX
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl Transform_Sha256_AVX2_RORX_Len
+.type Transform_Sha256_AVX2_RORX_Len,@function
+.align 4
+Transform_Sha256_AVX2_RORX_Len:
+#else
+.section __TEXT,__text
+.globl _Transform_Sha256_AVX2_RORX_Len
+.p2align 2
+_Transform_Sha256_AVX2_RORX_Len:
+#endif /* __APPLE__ */
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbp
+ movq %rsi, %rbp
+ movq %rdx, %rsi
+ subq $0x200, %rsp
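+ # On entry rdi = SHA-256 context, rsi = data, rdx = length; the data pointer
+ # is kept in rbp, the remaining length in esi, and 0x200 bytes of stack hold
+ # the precomputed W + K schedule.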
+ testb $0x40, %sil
+ je L_sha256_len_avx2_rorx_block
+ vmovdqu (%rbp), %ymm0
+ vmovdqu 32(%rbp), %ymm1
+ vmovups %ymm0, 32(%rdi)
+ vmovups %ymm1, 64(%rdi)
+#ifndef __APPLE__
+ call Transform_Sha256_AVX2_RORX@plt
+#else
+ call _Transform_Sha256_AVX2_RORX
+#endif /* __APPLE__ */
+ addq $0x40, %rbp
+ subl $0x40, %esi
+ jz L_sha256_len_avx2_rorx_done
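+ # If the length contains an odd 64-byte block (bit 0x40 set), that block is
+ # copied into the context at offset 32, presumably the block buffer read by
+ # Transform_Sha256_AVX2_RORX, and hashed on its own above, leaving a multiple
+ # of 128 bytes for the two-block loop below.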
+L_sha256_len_avx2_rorx_block:
+ vmovdqa L_avx2_rorx_sha256_flip_mask(%rip), %ymm13
+ vmovdqa L_avx2_rorx_sha256_shuf_00BA(%rip), %ymm11
+ vmovdqa L_avx2_rorx_sha256_shuf_DC00(%rip), %ymm12
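+ # ymm13 holds the big-endian byte-flip mask; ymm11/ymm12 hold the shuffle
+ # masks used to pack the sigma1 results into their schedule word positions.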
+ movl (%rdi), %r8d
+ movl 4(%rdi), %r9d
+ movl 8(%rdi), %r10d
+ movl 12(%rdi), %r11d
+ movl 16(%rdi), %r12d
+ movl 20(%rdi), %r13d
+ movl 24(%rdi), %r14d
+ movl 28(%rdi), %r15d
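+ # Load the current digest h[0..7] into r8d..r15d as the working variables
+ # a..h.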
+ # Start of loop processing two blocks
+L_sha256_len_avx2_rorx_start:
+ # X0, X1, X2, X3 = W[0..15]
+ vmovdqu (%rbp), %xmm0
+ vmovdqu 16(%rbp), %xmm1
+ vinserti128 $0x01, 64(%rbp), %ymm0, %ymm0
+ vinserti128 $0x01, 80(%rbp), %ymm1, %ymm1
+ vpshufb %ymm13, %ymm0, %ymm0
+ vpshufb %ymm13, %ymm1, %ymm1
+ vpaddd 0+L_avx2_rorx_sha256_k(%rip), %ymm0, %ymm4
+ vpaddd 32+L_avx2_rorx_sha256_k(%rip), %ymm1, %ymm5
+ vmovdqu %ymm4, (%rsp)
+ vmovdqu %ymm5, 32(%rsp)
+ vmovdqu 32(%rbp), %xmm2
+ vmovdqu 48(%rbp), %xmm3
+ vinserti128 $0x01, 96(%rbp), %ymm2, %ymm2
+ vinserti128 $0x01, 112(%rbp), %ymm3, %ymm3
+ vpshufb %ymm13, %ymm2, %ymm2
+ vpshufb %ymm13, %ymm3, %ymm3
+ vpaddd 64+L_avx2_rorx_sha256_k(%rip), %ymm2, %ymm4
+ vpaddd 96+L_avx2_rorx_sha256_k(%rip), %ymm3, %ymm5
+ vmovdqu %ymm4, 64(%rsp)
+ vmovdqu %ymm5, 96(%rsp)
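+ # W[0..15] of the first block sit in the low 128-bit lanes of ymm0..ymm3 and
+ # W[0..15] of the second block in the high lanes (via vinserti128 from
+ # rbp+64); both are byte-swapped, pre-added with the round constants, and
+ # spilled to the stack for the scalar rounds.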
+ movl %r9d, %ebx
+ rorxl $6, %r12d, %edx
+ xorl %r10d, %ebx
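+ # Prime the round recurrences: ebx = b ^ c for the incremental Maj
+ # computation and edx = e ror 6 for the first Sigma1.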
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl (%rsp), %r15d
+ vpalignr $4, %ymm0, %ymm1, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ vpalignr $4, %ymm2, %ymm3, %ymm4
+ # rnd_0: 2 - 2
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 4(%rsp), %r14d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpshufd $0xfa, %ymm3, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r14d, %r10d
+ movl %r8d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ vpaddd %ymm0, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 8(%rsp), %r13d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 12(%rsp), %r12d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm0
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r12d, %r8d
+ movl %r14d, %ebx
+ vpaddd 128+L_avx2_rorx_sha256_k(%rip), %ymm0, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vmovdqu %ymm4, 128(%rsp)
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 32(%rsp), %r11d
+ vpalignr $4, %ymm1, %ymm2, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ vpalignr $4, %ymm3, %ymm0, %ymm4
+ # rnd_0: 2 - 2
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 36(%rsp), %r10d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpshufd $0xfa, %ymm0, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r10d, %r14d
+ movl %r12d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ vpaddd %ymm1, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 40(%rsp), %r9d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 44(%rsp), %r8d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm1
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r8d, %r12d
+ movl %r10d, %ebx
+ vpaddd 160+L_avx2_rorx_sha256_k(%rip), %ymm1, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vmovdqu %ymm4, 160(%rsp)
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl 64(%rsp), %r15d
+ vpalignr $4, %ymm2, %ymm3, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ vpalignr $4, %ymm0, %ymm1, %ymm4
+ # rnd_0: 2 - 2
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 68(%rsp), %r14d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpshufd $0xfa, %ymm1, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r14d, %r10d
+ movl %r8d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ vpaddd %ymm2, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 72(%rsp), %r13d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 76(%rsp), %r12d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm2
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r12d, %r8d
+ movl %r14d, %ebx
+ vpaddd 192+L_avx2_rorx_sha256_k(%rip), %ymm2, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vmovdqu %ymm4, 192(%rsp)
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 96(%rsp), %r11d
+ vpalignr $4, %ymm3, %ymm0, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ vpalignr $4, %ymm1, %ymm2, %ymm4
+ # rnd_0: 2 - 2
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 100(%rsp), %r10d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpshufd $0xfa, %ymm2, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r10d, %r14d
+ movl %r12d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ vpaddd %ymm3, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 104(%rsp), %r9d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 108(%rsp), %r8d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm3
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r8d, %r12d
+ movl %r10d, %ebx
+ vpaddd 224+L_avx2_rorx_sha256_k(%rip), %ymm3, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vmovdqu %ymm4, 224(%rsp)
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl 128(%rsp), %r15d
+ vpalignr $4, %ymm0, %ymm1, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ vpalignr $4, %ymm2, %ymm3, %ymm4
+ # rnd_0: 2 - 2
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 132(%rsp), %r14d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpshufd $0xfa, %ymm3, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r14d, %r10d
+ movl %r8d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ vpaddd %ymm0, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 136(%rsp), %r13d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 140(%rsp), %r12d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm0
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r12d, %r8d
+ movl %r14d, %ebx
+ vpaddd 256+L_avx2_rorx_sha256_k(%rip), %ymm0, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vmovdqu %ymm4, 256(%rsp)
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 160(%rsp), %r11d
+ vpalignr $4, %ymm1, %ymm2, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ vpalignr $4, %ymm3, %ymm0, %ymm4
+ # rnd_0: 2 - 2
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 164(%rsp), %r10d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpshufd $0xfa, %ymm0, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r10d, %r14d
+ movl %r12d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ vpaddd %ymm1, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 168(%rsp), %r9d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 172(%rsp), %r8d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm1
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r8d, %r12d
+ movl %r10d, %ebx
+ vpaddd 288+L_avx2_rorx_sha256_k(%rip), %ymm1, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vmovdqu %ymm4, 288(%rsp)
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl 192(%rsp), %r15d
+ vpalignr $4, %ymm2, %ymm3, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ vpalignr $4, %ymm0, %ymm1, %ymm4
+ # rnd_0: 2 - 2
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 196(%rsp), %r14d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpshufd $0xfa, %ymm1, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r14d, %r10d
+ movl %r8d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ vpaddd %ymm2, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 200(%rsp), %r13d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 204(%rsp), %r12d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm2
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r12d, %r8d
+ movl %r14d, %ebx
+ vpaddd 320+L_avx2_rorx_sha256_k(%rip), %ymm2, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vmovdqu %ymm4, 320(%rsp)
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 224(%rsp), %r11d
+ vpalignr $4, %ymm3, %ymm0, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ vpalignr $4, %ymm1, %ymm2, %ymm4
+ # rnd_0: 2 - 2
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 228(%rsp), %r10d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpshufd $0xfa, %ymm2, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r10d, %r14d
+ movl %r12d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ vpaddd %ymm3, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 232(%rsp), %r9d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 236(%rsp), %r8d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm3
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r8d, %r12d
+ movl %r10d, %ebx
+ vpaddd 352+L_avx2_rorx_sha256_k(%rip), %ymm3, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vmovdqu %ymm4, 352(%rsp)
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl 256(%rsp), %r15d
+ vpalignr $4, %ymm0, %ymm1, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ vpalignr $4, %ymm2, %ymm3, %ymm4
+ # rnd_0: 2 - 2
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 260(%rsp), %r14d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpshufd $0xfa, %ymm3, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r14d, %r10d
+ movl %r8d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ vpaddd %ymm0, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 264(%rsp), %r13d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 268(%rsp), %r12d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm0
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r12d, %r8d
+ movl %r14d, %ebx
+ vpaddd 384+L_avx2_rorx_sha256_k(%rip), %ymm0, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vmovdqu %ymm4, 384(%rsp)
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 288(%rsp), %r11d
+ vpalignr $4, %ymm1, %ymm2, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ vpalignr $4, %ymm3, %ymm0, %ymm4
+ # rnd_0: 2 - 2
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 292(%rsp), %r10d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpshufd $0xfa, %ymm0, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r10d, %r14d
+ movl %r12d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ vpaddd %ymm1, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 296(%rsp), %r9d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 300(%rsp), %r8d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm1
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r8d, %r12d
+ movl %r10d, %ebx
+ vpaddd 416+L_avx2_rorx_sha256_k(%rip), %ymm1, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vmovdqu %ymm4, 416(%rsp)
+ # rnd_0: 0 - 0
+ movl %r13d, %eax
+ rorxl $11, %r12d, %ecx
+ addl 320(%rsp), %r15d
+ vpalignr $4, %ymm2, %ymm3, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ vpalignr $4, %ymm0, %ymm1, %ymm4
+ # rnd_0: 2 - 2
+ andl %r12d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r8d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ xorl %r14d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r8d, %eax
+ addl %edx, %r15d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ addl %ebx, %r15d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r12d, %ebx
+ rorxl $11, %r11d, %ecx
+ addl 324(%rsp), %r14d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r11d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r15d, %ecx
+ vpshufd $0xfa, %ymm1, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ xorl %r13d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r14d, %r10d
+ movl %r8d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r15d, %ebx
+ addl %edx, %r14d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ addl %eax, %r14d
+ vpaddd %ymm2, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r11d, %eax
+ rorxl $11, %r10d, %ecx
+ addl 328(%rsp), %r13d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r10d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r14d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ xorl %r12d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r14d, %eax
+ addl %edx, %r13d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ addl %ebx, %r13d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r10d, %ebx
+ rorxl $11, %r9d, %ecx
+ addl 332(%rsp), %r12d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r9d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r13d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ xorl %r11d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ vpaddd %ymm4, %ymm9, %ymm2
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r12d, %r8d
+ movl %r14d, %ebx
+ vpaddd 448+L_avx2_rorx_sha256_k(%rip), %ymm2, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r13d, %ebx
+ addl %edx, %r12d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ addl %eax, %r12d
+ vmovdqu %ymm4, 448(%rsp)
+ # rnd_0: 0 - 0
+ movl %r9d, %eax
+ rorxl $11, %r8d, %ecx
+ addl 352(%rsp), %r11d
+ vpalignr $4, %ymm3, %ymm0, %ymm5
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ vpalignr $4, %ymm1, %ymm2, %ymm4
+ # rnd_0: 2 - 2
+ andl %r8d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r12d, %ecx
+ vpsrld $7, %ymm5, %ymm6
+ # rnd_0: 3 - 3
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ xorl %r10d, %eax
+ vpslld $25, %ymm5, %ymm7
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ vpsrld $18, %ymm5, %ymm8
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ vpslld $14, %ymm5, %ymm9
+ # rnd_0: 6 - 6
+ xorl %r12d, %eax
+ addl %edx, %r11d
+ andl %eax, %ebx
+ vpor %ymm7, %ymm6, %ymm6
+ # rnd_0: 7 - 7
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ addl %ebx, %r11d
+ vpor %ymm9, %ymm8, %ymm8
+ # rnd_1: 0 - 0
+ movl %r8d, %ebx
+ rorxl $11, %r15d, %ecx
+ addl 356(%rsp), %r10d
+ vpsrld $3, %ymm5, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ vpxor %ymm8, %ymm6, %ymm6
+ # rnd_1: 2 - 2
+ andl %r15d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r11d, %ecx
+ vpshufd $0xfa, %ymm2, %ymm7
+ # rnd_1: 3 - 3
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ xorl %r9d, %ebx
+ vpxor %ymm6, %ymm9, %ymm5
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ vpsrld $10, %ymm7, %ymm8
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r10d, %r14d
+ movl %r12d, %ebx
+ vpsrlq $19, %ymm7, %ymm6
+ # rnd_1: 6 - 6
+ xorl %r11d, %ebx
+ addl %edx, %r10d
+ andl %ebx, %eax
+ vpsrlq $0x11, %ymm7, %ymm7
+ # rnd_1: 7 - 7
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ addl %eax, %r10d
+ vpaddd %ymm3, %ymm4, %ymm4
+ # rnd_0: 0 - 0
+ movl %r15d, %eax
+ rorxl $11, %r14d, %ecx
+ addl 360(%rsp), %r9d
+ vpxor %ymm7, %ymm6, %ymm6
+ # rnd_0: 1 - 1
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ vpxor %ymm6, %ymm8, %ymm8
+ # rnd_0: 2 - 2
+ andl %r14d, %eax
+ xorl %ecx, %edx
+ rorxl $13, %r10d, %ecx
+ vpaddd %ymm5, %ymm4, %ymm4
+ # rnd_0: 3 - 3
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ xorl %r8d, %eax
+ vpshufb %ymm11, %ymm8, %ymm8
+ # rnd_0: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ vpaddd %ymm8, %ymm4, %ymm4
+ # rnd_0: 5 - 5
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ vpshufd $0x50, %ymm4, %ymm6
+ # rnd_0: 6 - 6
+ xorl %r10d, %eax
+ addl %edx, %r9d
+ andl %eax, %ebx
+ vpsrlq $0x11, %ymm6, %ymm8
+ # rnd_0: 7 - 7
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ addl %ebx, %r9d
+ vpsrlq $19, %ymm6, %ymm7
+ # rnd_1: 0 - 0
+ movl %r14d, %ebx
+ rorxl $11, %r13d, %ecx
+ addl 364(%rsp), %r8d
+ vpsrld $10, %ymm6, %ymm9
+ # rnd_1: 1 - 1
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ vpxor %ymm7, %ymm8, %ymm8
+ # rnd_1: 2 - 2
+ andl %r13d, %ebx
+ xorl %ecx, %edx
+ rorxl $13, %r9d, %ecx
+ vpxor %ymm8, %ymm9, %ymm9
+ # rnd_1: 3 - 3
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ xorl %r15d, %ebx
+ vpshufb %ymm12, %ymm9, %ymm9
+ # rnd_1: 4 - 4
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ vpaddd %ymm4, %ymm9, %ymm3
+ # rnd_1: 5 - 5
+ xorl %ecx, %edx
+ addl %r8d, %r12d
+ movl %r10d, %ebx
+ vpaddd 480+L_avx2_rorx_sha256_k(%rip), %ymm3, %ymm4
+ # rnd_1: 6 - 6
+ xorl %r9d, %ebx
+ addl %edx, %r8d
+ andl %ebx, %eax
+ # rnd_1: 7 - 7
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ addl %eax, %r8d
+ vmovdqu %ymm4, 480(%rsp)
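+ # The full W[t]+K[t] schedule now sits on the stack, so the remaining
+ # rounds are scalar only: rorxl $6/$11/$25 of e combine into Sigma1(e),
+ # rorxl $2/$13/$22 of a into Sigma0(a), and the surrounding and/xor
+ # sequences form Ch(e,f,g) and Maj(a,b,c).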
+ xorl %eax, %eax
+ xorl %ecx, %ecx
+ rorxl $6, %r12d, %edx
+ rorxl $11, %r12d, %ecx
+ leal (%r8,%rax,1), %r8d
+ addl 384(%rsp), %r15d
+ movl %r13d, %eax
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ xorl %ecx, %edx
+ andl %r12d, %eax
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ rorxl $13, %r8d, %ecx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ andl %eax, %ebx
+ addl %edx, %r15d
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ rorxl $11, %r11d, %ecx
+ addl %ebx, %r15d
+ addl 388(%rsp), %r14d
+ movl %r12d, %ebx
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ xorl %ecx, %edx
+ andl %r11d, %ebx
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ rorxl $13, %r15d, %ecx
+ xorl %r13d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ leal (%r10,%r14,1), %r10d
+ xorl %r15d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r14d
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ rorxl $11, %r10d, %ecx
+ leal (%r14,%rax,1), %r14d
+ addl 392(%rsp), %r13d
+ movl %r11d, %eax
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ xorl %ecx, %edx
+ andl %r10d, %eax
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ rorxl $13, %r14d, %ecx
+ xorl %r12d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ xorl %r14d, %eax
+ andl %eax, %ebx
+ addl %edx, %r13d
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ rorxl $11, %r9d, %ecx
+ addl %ebx, %r13d
+ addl 396(%rsp), %r12d
+ movl %r10d, %ebx
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ xorl %ecx, %edx
+ andl %r9d, %ebx
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ rorxl $13, %r13d, %ecx
+ xorl %r11d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ leal (%r8,%r12,1), %r8d
+ xorl %r13d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r12d
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ rorxl $11, %r8d, %ecx
+ leal (%r12,%rax,1), %r12d
+ addl 416(%rsp), %r11d
+ movl %r9d, %eax
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ xorl %ecx, %edx
+ andl %r8d, %eax
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ rorxl $13, %r12d, %ecx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ andl %eax, %ebx
+ addl %edx, %r11d
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ rorxl $11, %r15d, %ecx
+ addl %ebx, %r11d
+ addl 420(%rsp), %r10d
+ movl %r8d, %ebx
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ xorl %ecx, %edx
+ andl %r15d, %ebx
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ rorxl $13, %r11d, %ecx
+ xorl %r9d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ leal (%r14,%r10,1), %r14d
+ xorl %r11d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r10d
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ rorxl $11, %r14d, %ecx
+ leal (%r10,%rax,1), %r10d
+ addl 424(%rsp), %r9d
+ movl %r15d, %eax
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ xorl %ecx, %edx
+ andl %r14d, %eax
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ rorxl $13, %r10d, %ecx
+ xorl %r8d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ xorl %r10d, %eax
+ andl %eax, %ebx
+ addl %edx, %r9d
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ rorxl $11, %r13d, %ecx
+ addl %ebx, %r9d
+ addl 428(%rsp), %r8d
+ movl %r14d, %ebx
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ xorl %ecx, %edx
+ andl %r13d, %ebx
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ rorxl $13, %r9d, %ecx
+ xorl %r15d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ leal (%r12,%r8,1), %r12d
+ xorl %r9d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r8d
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ rorxl $11, %r12d, %ecx
+ leal (%r8,%rax,1), %r8d
+ addl 448(%rsp), %r15d
+ movl %r13d, %eax
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ xorl %ecx, %edx
+ andl %r12d, %eax
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ rorxl $13, %r8d, %ecx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ andl %eax, %ebx
+ addl %edx, %r15d
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ rorxl $11, %r11d, %ecx
+ addl %ebx, %r15d
+ addl 452(%rsp), %r14d
+ movl %r12d, %ebx
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ xorl %ecx, %edx
+ andl %r11d, %ebx
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ rorxl $13, %r15d, %ecx
+ xorl %r13d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ leal (%r10,%r14,1), %r10d
+ xorl %r15d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r14d
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ rorxl $11, %r10d, %ecx
+ leal (%r14,%rax,1), %r14d
+ addl 456(%rsp), %r13d
+ movl %r11d, %eax
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ xorl %ecx, %edx
+ andl %r10d, %eax
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ rorxl $13, %r14d, %ecx
+ xorl %r12d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ xorl %r14d, %eax
+ andl %eax, %ebx
+ addl %edx, %r13d
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ rorxl $11, %r9d, %ecx
+ addl %ebx, %r13d
+ addl 460(%rsp), %r12d
+ movl %r10d, %ebx
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ xorl %ecx, %edx
+ andl %r9d, %ebx
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ rorxl $13, %r13d, %ecx
+ xorl %r11d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ leal (%r8,%r12,1), %r8d
+ xorl %r13d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r12d
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ rorxl $11, %r8d, %ecx
+ leal (%r12,%rax,1), %r12d
+ addl 480(%rsp), %r11d
+ movl %r9d, %eax
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ xorl %ecx, %edx
+ andl %r8d, %eax
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ rorxl $13, %r12d, %ecx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ andl %eax, %ebx
+ addl %edx, %r11d
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ rorxl $11, %r15d, %ecx
+ addl %ebx, %r11d
+ addl 484(%rsp), %r10d
+ movl %r8d, %ebx
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ xorl %ecx, %edx
+ andl %r15d, %ebx
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ rorxl $13, %r11d, %ecx
+ xorl %r9d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ leal (%r14,%r10,1), %r14d
+ xorl %r11d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r10d
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ rorxl $11, %r14d, %ecx
+ leal (%r10,%rax,1), %r10d
+ addl 488(%rsp), %r9d
+ movl %r15d, %eax
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ xorl %ecx, %edx
+ andl %r14d, %eax
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ rorxl $13, %r10d, %ecx
+ xorl %r8d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ xorl %r10d, %eax
+ andl %eax, %ebx
+ addl %edx, %r9d
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ rorxl $11, %r13d, %ecx
+ addl %ebx, %r9d
+ addl 492(%rsp), %r8d
+ movl %r14d, %ebx
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ xorl %ecx, %edx
+ andl %r13d, %ebx
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ rorxl $13, %r9d, %ecx
+ xorl %r15d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ leal (%r12,%r8,1), %r12d
+ xorl %r9d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r8d
+ xorl %r10d, %eax
+ addl %eax, %r8d
+ xorl %ecx, %ecx
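+ # Feed-forward: add the eight working registers into the SHA-256
+ # state at (%rdi) and write the updated state back out.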
+ addl (%rdi), %r8d
+ addl 4(%rdi), %r9d
+ addl 8(%rdi), %r10d
+ addl 12(%rdi), %r11d
+ addl 16(%rdi), %r12d
+ addl 20(%rdi), %r13d
+ addl 24(%rdi), %r14d
+ addl 28(%rdi), %r15d
+ movl %r8d, (%rdi)
+ movl %r9d, 4(%rdi)
+ movl %r10d, 8(%rdi)
+ movl %r11d, 12(%rdi)
+ movl %r12d, 16(%rdi)
+ movl %r13d, 20(%rdi)
+ movl %r14d, 24(%rdi)
+ movl %r15d, 28(%rdi)
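+ # The rounds below reread the stored schedule at offsets 16, 20, ...
+ # within each 32-byte row, i.e. the upper 128-bit lane of the ymm
+ # stores, holding the expanded words for the second block handled by
+ # this iteration.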
+ movl %r9d, %ebx
+ xorl %eax, %eax
+ xorl %r10d, %ebx
+ rorxl $6, %r12d, %edx
+ rorxl $11, %r12d, %ecx
+ leal (%r8,%rax,1), %r8d
+ addl 16(%rsp), %r15d
+ movl %r13d, %eax
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ xorl %ecx, %edx
+ andl %r12d, %eax
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ rorxl $13, %r8d, %ecx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ andl %eax, %ebx
+ addl %edx, %r15d
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ rorxl $11, %r11d, %ecx
+ addl %ebx, %r15d
+ addl 20(%rsp), %r14d
+ movl %r12d, %ebx
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ xorl %ecx, %edx
+ andl %r11d, %ebx
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ rorxl $13, %r15d, %ecx
+ xorl %r13d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ leal (%r10,%r14,1), %r10d
+ xorl %r15d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r14d
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ rorxl $11, %r10d, %ecx
+ leal (%r14,%rax,1), %r14d
+ addl 24(%rsp), %r13d
+ movl %r11d, %eax
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ xorl %ecx, %edx
+ andl %r10d, %eax
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ rorxl $13, %r14d, %ecx
+ xorl %r12d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ xorl %r14d, %eax
+ andl %eax, %ebx
+ addl %edx, %r13d
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ rorxl $11, %r9d, %ecx
+ addl %ebx, %r13d
+ addl 28(%rsp), %r12d
+ movl %r10d, %ebx
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ xorl %ecx, %edx
+ andl %r9d, %ebx
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ rorxl $13, %r13d, %ecx
+ xorl %r11d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ leal (%r8,%r12,1), %r8d
+ xorl %r13d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r12d
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ rorxl $11, %r8d, %ecx
+ leal (%r12,%rax,1), %r12d
+ addl 48(%rsp), %r11d
+ movl %r9d, %eax
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ xorl %ecx, %edx
+ andl %r8d, %eax
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ rorxl $13, %r12d, %ecx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ andl %eax, %ebx
+ addl %edx, %r11d
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ rorxl $11, %r15d, %ecx
+ addl %ebx, %r11d
+ addl 52(%rsp), %r10d
+ movl %r8d, %ebx
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ xorl %ecx, %edx
+ andl %r15d, %ebx
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ rorxl $13, %r11d, %ecx
+ xorl %r9d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ leal (%r14,%r10,1), %r14d
+ xorl %r11d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r10d
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ rorxl $11, %r14d, %ecx
+ leal (%r10,%rax,1), %r10d
+ addl 56(%rsp), %r9d
+ movl %r15d, %eax
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ xorl %ecx, %edx
+ andl %r14d, %eax
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ rorxl $13, %r10d, %ecx
+ xorl %r8d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ xorl %r10d, %eax
+ andl %eax, %ebx
+ addl %edx, %r9d
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ rorxl $11, %r13d, %ecx
+ addl %ebx, %r9d
+ addl 60(%rsp), %r8d
+ movl %r14d, %ebx
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ xorl %ecx, %edx
+ andl %r13d, %ebx
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ rorxl $13, %r9d, %ecx
+ xorl %r15d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ leal (%r12,%r8,1), %r12d
+ xorl %r9d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r8d
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ rorxl $11, %r12d, %ecx
+ leal (%r8,%rax,1), %r8d
+ addl 80(%rsp), %r15d
+ movl %r13d, %eax
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ xorl %ecx, %edx
+ andl %r12d, %eax
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ rorxl $13, %r8d, %ecx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ andl %eax, %ebx
+ addl %edx, %r15d
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ rorxl $11, %r11d, %ecx
+ addl %ebx, %r15d
+ addl 84(%rsp), %r14d
+ movl %r12d, %ebx
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ xorl %ecx, %edx
+ andl %r11d, %ebx
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ rorxl $13, %r15d, %ecx
+ xorl %r13d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ leal (%r10,%r14,1), %r10d
+ xorl %r15d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r14d
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ rorxl $11, %r10d, %ecx
+ leal (%r14,%rax,1), %r14d
+ addl 88(%rsp), %r13d
+ movl %r11d, %eax
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ xorl %ecx, %edx
+ andl %r10d, %eax
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ rorxl $13, %r14d, %ecx
+ xorl %r12d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ xorl %r14d, %eax
+ andl %eax, %ebx
+ addl %edx, %r13d
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ rorxl $11, %r9d, %ecx
+ addl %ebx, %r13d
+ addl 92(%rsp), %r12d
+ movl %r10d, %ebx
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ xorl %ecx, %edx
+ andl %r9d, %ebx
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ rorxl $13, %r13d, %ecx
+ xorl %r11d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ leal (%r8,%r12,1), %r8d
+ xorl %r13d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r12d
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ rorxl $11, %r8d, %ecx
+ leal (%r12,%rax,1), %r12d
+ addl 112(%rsp), %r11d
+ movl %r9d, %eax
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ xorl %ecx, %edx
+ andl %r8d, %eax
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ rorxl $13, %r12d, %ecx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ andl %eax, %ebx
+ addl %edx, %r11d
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ rorxl $11, %r15d, %ecx
+ addl %ebx, %r11d
+ addl 116(%rsp), %r10d
+ movl %r8d, %ebx
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ xorl %ecx, %edx
+ andl %r15d, %ebx
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ rorxl $13, %r11d, %ecx
+ xorl %r9d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ leal (%r14,%r10,1), %r14d
+ xorl %r11d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r10d
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ rorxl $11, %r14d, %ecx
+ leal (%r10,%rax,1), %r10d
+ addl 120(%rsp), %r9d
+ movl %r15d, %eax
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ xorl %ecx, %edx
+ andl %r14d, %eax
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ rorxl $13, %r10d, %ecx
+ xorl %r8d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ xorl %r10d, %eax
+ andl %eax, %ebx
+ addl %edx, %r9d
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ rorxl $11, %r13d, %ecx
+ addl %ebx, %r9d
+ addl 124(%rsp), %r8d
+ movl %r14d, %ebx
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ xorl %ecx, %edx
+ andl %r13d, %ebx
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ rorxl $13, %r9d, %ecx
+ xorl %r15d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ leal (%r12,%r8,1), %r12d
+ xorl %r9d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r8d
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ rorxl $11, %r12d, %ecx
+ leal (%r8,%rax,1), %r8d
+ addl 144(%rsp), %r15d
+ movl %r13d, %eax
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ xorl %ecx, %edx
+ andl %r12d, %eax
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ rorxl $13, %r8d, %ecx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ andl %eax, %ebx
+ addl %edx, %r15d
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ rorxl $11, %r11d, %ecx
+ addl %ebx, %r15d
+ addl 148(%rsp), %r14d
+ movl %r12d, %ebx
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ xorl %ecx, %edx
+ andl %r11d, %ebx
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ rorxl $13, %r15d, %ecx
+ xorl %r13d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ leal (%r10,%r14,1), %r10d
+ xorl %r15d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r14d
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ rorxl $11, %r10d, %ecx
+ leal (%r14,%rax,1), %r14d
+ addl 152(%rsp), %r13d
+ movl %r11d, %eax
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ xorl %ecx, %edx
+ andl %r10d, %eax
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ rorxl $13, %r14d, %ecx
+ xorl %r12d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ xorl %r14d, %eax
+ andl %eax, %ebx
+ addl %edx, %r13d
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ rorxl $11, %r9d, %ecx
+ addl %ebx, %r13d
+ addl 156(%rsp), %r12d
+ movl %r10d, %ebx
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ xorl %ecx, %edx
+ andl %r9d, %ebx
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ rorxl $13, %r13d, %ecx
+ xorl %r11d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ leal (%r8,%r12,1), %r8d
+ xorl %r13d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r12d
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ rorxl $11, %r8d, %ecx
+ leal (%r12,%rax,1), %r12d
+ addl 176(%rsp), %r11d
+ movl %r9d, %eax
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ xorl %ecx, %edx
+ andl %r8d, %eax
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ rorxl $13, %r12d, %ecx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ andl %eax, %ebx
+ addl %edx, %r11d
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ rorxl $11, %r15d, %ecx
+ addl %ebx, %r11d
+ addl 180(%rsp), %r10d
+ movl %r8d, %ebx
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ xorl %ecx, %edx
+ andl %r15d, %ebx
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ rorxl $13, %r11d, %ecx
+ xorl %r9d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ leal (%r14,%r10,1), %r14d
+ xorl %r11d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r10d
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ rorxl $11, %r14d, %ecx
+ leal (%r10,%rax,1), %r10d
+ addl 184(%rsp), %r9d
+ movl %r15d, %eax
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ xorl %ecx, %edx
+ andl %r14d, %eax
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ rorxl $13, %r10d, %ecx
+ xorl %r8d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ xorl %r10d, %eax
+ andl %eax, %ebx
+ addl %edx, %r9d
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ rorxl $11, %r13d, %ecx
+ addl %ebx, %r9d
+ addl 188(%rsp), %r8d
+ movl %r14d, %ebx
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ xorl %ecx, %edx
+ andl %r13d, %ebx
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ rorxl $13, %r9d, %ecx
+ xorl %r15d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ leal (%r12,%r8,1), %r12d
+ xorl %r9d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r8d
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ rorxl $11, %r12d, %ecx
+ leal (%r8,%rax,1), %r8d
+ addl 208(%rsp), %r15d
+ movl %r13d, %eax
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ xorl %ecx, %edx
+ andl %r12d, %eax
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ rorxl $13, %r8d, %ecx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ andl %eax, %ebx
+ addl %edx, %r15d
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ rorxl $11, %r11d, %ecx
+ addl %ebx, %r15d
+ addl 212(%rsp), %r14d
+ movl %r12d, %ebx
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ xorl %ecx, %edx
+ andl %r11d, %ebx
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ rorxl $13, %r15d, %ecx
+ xorl %r13d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ leal (%r10,%r14,1), %r10d
+ xorl %r15d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r14d
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ rorxl $11, %r10d, %ecx
+ leal (%r14,%rax,1), %r14d
+ addl 216(%rsp), %r13d
+ movl %r11d, %eax
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ xorl %ecx, %edx
+ andl %r10d, %eax
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ rorxl $13, %r14d, %ecx
+ xorl %r12d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ xorl %r14d, %eax
+ andl %eax, %ebx
+ addl %edx, %r13d
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ rorxl $11, %r9d, %ecx
+ addl %ebx, %r13d
+ addl 220(%rsp), %r12d
+ movl %r10d, %ebx
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ xorl %ecx, %edx
+ andl %r9d, %ebx
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ rorxl $13, %r13d, %ecx
+ xorl %r11d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ leal (%r8,%r12,1), %r8d
+ xorl %r13d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r12d
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ rorxl $11, %r8d, %ecx
+ leal (%r12,%rax,1), %r12d
+ addl 240(%rsp), %r11d
+ movl %r9d, %eax
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ xorl %ecx, %edx
+ andl %r8d, %eax
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ rorxl $13, %r12d, %ecx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ andl %eax, %ebx
+ addl %edx, %r11d
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ rorxl $11, %r15d, %ecx
+ addl %ebx, %r11d
+ addl 244(%rsp), %r10d
+ movl %r8d, %ebx
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ xorl %ecx, %edx
+ andl %r15d, %ebx
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ rorxl $13, %r11d, %ecx
+ xorl %r9d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ leal (%r14,%r10,1), %r14d
+ xorl %r11d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r10d
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ rorxl $11, %r14d, %ecx
+ leal (%r10,%rax,1), %r10d
+ addl 248(%rsp), %r9d
+ movl %r15d, %eax
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ xorl %ecx, %edx
+ andl %r14d, %eax
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ rorxl $13, %r10d, %ecx
+ xorl %r8d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ xorl %r10d, %eax
+ andl %eax, %ebx
+ addl %edx, %r9d
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ rorxl $11, %r13d, %ecx
+ addl %ebx, %r9d
+ addl 252(%rsp), %r8d
+ movl %r14d, %ebx
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ xorl %ecx, %edx
+ andl %r13d, %ebx
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ rorxl $13, %r9d, %ecx
+ xorl %r15d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ leal (%r12,%r8,1), %r12d
+ xorl %r9d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r8d
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ rorxl $11, %r12d, %ecx
+ leal (%r8,%rax,1), %r8d
+ addl 272(%rsp), %r15d
+ movl %r13d, %eax
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ xorl %ecx, %edx
+ andl %r12d, %eax
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ rorxl $13, %r8d, %ecx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ andl %eax, %ebx
+ addl %edx, %r15d
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ rorxl $11, %r11d, %ecx
+ addl %ebx, %r15d
+ addl 276(%rsp), %r14d
+ movl %r12d, %ebx
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ xorl %ecx, %edx
+ andl %r11d, %ebx
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ rorxl $13, %r15d, %ecx
+ xorl %r13d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ leal (%r10,%r14,1), %r10d
+ xorl %r15d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r14d
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ rorxl $11, %r10d, %ecx
+ leal (%r14,%rax,1), %r14d
+ addl 280(%rsp), %r13d
+ movl %r11d, %eax
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ xorl %ecx, %edx
+ andl %r10d, %eax
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ rorxl $13, %r14d, %ecx
+ xorl %r12d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ xorl %r14d, %eax
+ andl %eax, %ebx
+ addl %edx, %r13d
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ rorxl $11, %r9d, %ecx
+ addl %ebx, %r13d
+ addl 284(%rsp), %r12d
+ movl %r10d, %ebx
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ xorl %ecx, %edx
+ andl %r9d, %ebx
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ rorxl $13, %r13d, %ecx
+ xorl %r11d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ leal (%r8,%r12,1), %r8d
+ xorl %r13d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r12d
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ rorxl $11, %r8d, %ecx
+ leal (%r12,%rax,1), %r12d
+ addl 304(%rsp), %r11d
+ movl %r9d, %eax
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ xorl %ecx, %edx
+ andl %r8d, %eax
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ rorxl $13, %r12d, %ecx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ andl %eax, %ebx
+ addl %edx, %r11d
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ rorxl $11, %r15d, %ecx
+ addl %ebx, %r11d
+ addl 308(%rsp), %r10d
+ movl %r8d, %ebx
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ xorl %ecx, %edx
+ andl %r15d, %ebx
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ rorxl $13, %r11d, %ecx
+ xorl %r9d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ leal (%r14,%r10,1), %r14d
+ xorl %r11d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r10d
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ rorxl $11, %r14d, %ecx
+ leal (%r10,%rax,1), %r10d
+ addl 312(%rsp), %r9d
+ movl %r15d, %eax
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ xorl %ecx, %edx
+ andl %r14d, %eax
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ rorxl $13, %r10d, %ecx
+ xorl %r8d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ xorl %r10d, %eax
+ andl %eax, %ebx
+ addl %edx, %r9d
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ rorxl $11, %r13d, %ecx
+ addl %ebx, %r9d
+ addl 316(%rsp), %r8d
+ movl %r14d, %ebx
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ xorl %ecx, %edx
+ andl %r13d, %ebx
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ rorxl $13, %r9d, %ecx
+ xorl %r15d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ leal (%r12,%r8,1), %r12d
+ xorl %r9d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r8d
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ rorxl $11, %r12d, %ecx
+ leal (%r8,%rax,1), %r8d
+ addl 336(%rsp), %r15d
+ movl %r13d, %eax
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ xorl %ecx, %edx
+ andl %r12d, %eax
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ rorxl $13, %r8d, %ecx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ andl %eax, %ebx
+ addl %edx, %r15d
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ rorxl $11, %r11d, %ecx
+ addl %ebx, %r15d
+ addl 340(%rsp), %r14d
+ movl %r12d, %ebx
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ xorl %ecx, %edx
+ andl %r11d, %ebx
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ rorxl $13, %r15d, %ecx
+ xorl %r13d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ leal (%r10,%r14,1), %r10d
+ xorl %r15d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r14d
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ rorxl $11, %r10d, %ecx
+ leal (%r14,%rax,1), %r14d
+ addl 344(%rsp), %r13d
+ movl %r11d, %eax
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ xorl %ecx, %edx
+ andl %r10d, %eax
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ rorxl $13, %r14d, %ecx
+ xorl %r12d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ xorl %r14d, %eax
+ andl %eax, %ebx
+ addl %edx, %r13d
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ rorxl $11, %r9d, %ecx
+ addl %ebx, %r13d
+ addl 348(%rsp), %r12d
+ movl %r10d, %ebx
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ xorl %ecx, %edx
+ andl %r9d, %ebx
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ rorxl $13, %r13d, %ecx
+ xorl %r11d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ leal (%r8,%r12,1), %r8d
+ xorl %r13d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r12d
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ rorxl $11, %r8d, %ecx
+ leal (%r12,%rax,1), %r12d
+ addl 368(%rsp), %r11d
+ movl %r9d, %eax
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ xorl %ecx, %edx
+ andl %r8d, %eax
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ rorxl $13, %r12d, %ecx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ andl %eax, %ebx
+ addl %edx, %r11d
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ rorxl $11, %r15d, %ecx
+ addl %ebx, %r11d
+ addl 372(%rsp), %r10d
+ movl %r8d, %ebx
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ xorl %ecx, %edx
+ andl %r15d, %ebx
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ rorxl $13, %r11d, %ecx
+ xorl %r9d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ leal (%r14,%r10,1), %r14d
+ xorl %r11d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r10d
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ rorxl $11, %r14d, %ecx
+ leal (%r10,%rax,1), %r10d
+ addl 376(%rsp), %r9d
+ movl %r15d, %eax
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ xorl %ecx, %edx
+ andl %r14d, %eax
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ rorxl $13, %r10d, %ecx
+ xorl %r8d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ xorl %r10d, %eax
+ andl %eax, %ebx
+ addl %edx, %r9d
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ rorxl $11, %r13d, %ecx
+ addl %ebx, %r9d
+ addl 380(%rsp), %r8d
+ movl %r14d, %ebx
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ xorl %ecx, %edx
+ andl %r13d, %ebx
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ rorxl $13, %r9d, %ecx
+ xorl %r15d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ leal (%r12,%r8,1), %r12d
+ xorl %r9d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r8d
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ rorxl $11, %r12d, %ecx
+ leal (%r8,%rax,1), %r8d
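+ /* The same fully unrolled round pattern repeats for the remaining
+  * schedule words (stack offsets up through 508) until the block is done.
+  */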
+ addl 400(%rsp), %r15d
+ movl %r13d, %eax
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ xorl %ecx, %edx
+ andl %r12d, %eax
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ rorxl $13, %r8d, %ecx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ andl %eax, %ebx
+ addl %edx, %r15d
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ rorxl $11, %r11d, %ecx
+ addl %ebx, %r15d
+ addl 404(%rsp), %r14d
+ movl %r12d, %ebx
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ xorl %ecx, %edx
+ andl %r11d, %ebx
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ rorxl $13, %r15d, %ecx
+ xorl %r13d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ leal (%r10,%r14,1), %r10d
+ xorl %r15d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r14d
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ rorxl $11, %r10d, %ecx
+ leal (%r14,%rax,1), %r14d
+ addl 408(%rsp), %r13d
+ movl %r11d, %eax
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ xorl %ecx, %edx
+ andl %r10d, %eax
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ rorxl $13, %r14d, %ecx
+ xorl %r12d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ xorl %r14d, %eax
+ andl %eax, %ebx
+ addl %edx, %r13d
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ rorxl $11, %r9d, %ecx
+ addl %ebx, %r13d
+ addl 412(%rsp), %r12d
+ movl %r10d, %ebx
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ xorl %ecx, %edx
+ andl %r9d, %ebx
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ rorxl $13, %r13d, %ecx
+ xorl %r11d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ leal (%r8,%r12,1), %r8d
+ xorl %r13d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r12d
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ rorxl $11, %r8d, %ecx
+ leal (%r12,%rax,1), %r12d
+ addl 432(%rsp), %r11d
+ movl %r9d, %eax
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ xorl %ecx, %edx
+ andl %r8d, %eax
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ rorxl $13, %r12d, %ecx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ andl %eax, %ebx
+ addl %edx, %r11d
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ rorxl $11, %r15d, %ecx
+ addl %ebx, %r11d
+ addl 436(%rsp), %r10d
+ movl %r8d, %ebx
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ xorl %ecx, %edx
+ andl %r15d, %ebx
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ rorxl $13, %r11d, %ecx
+ xorl %r9d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ leal (%r14,%r10,1), %r14d
+ xorl %r11d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r10d
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ rorxl $11, %r14d, %ecx
+ leal (%r10,%rax,1), %r10d
+ addl 440(%rsp), %r9d
+ movl %r15d, %eax
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ xorl %ecx, %edx
+ andl %r14d, %eax
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ rorxl $13, %r10d, %ecx
+ xorl %r8d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ xorl %r10d, %eax
+ andl %eax, %ebx
+ addl %edx, %r9d
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ rorxl $11, %r13d, %ecx
+ addl %ebx, %r9d
+ addl 444(%rsp), %r8d
+ movl %r14d, %ebx
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ xorl %ecx, %edx
+ andl %r13d, %ebx
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ rorxl $13, %r9d, %ecx
+ xorl %r15d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ leal (%r12,%r8,1), %r12d
+ xorl %r9d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r8d
+ xorl %r10d, %eax
+ rorxl $6, %r12d, %edx
+ rorxl $11, %r12d, %ecx
+ leal (%r8,%rax,1), %r8d
+ addl 464(%rsp), %r15d
+ movl %r13d, %eax
+ xorl %edx, %ecx
+ xorl %r14d, %eax
+ rorxl $25, %r12d, %edx
+ xorl %ecx, %edx
+ andl %r12d, %eax
+ addl %edx, %r15d
+ rorxl $2, %r8d, %edx
+ rorxl $13, %r8d, %ecx
+ xorl %r14d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r8d, %edx
+ addl %eax, %r15d
+ xorl %ecx, %edx
+ movl %r9d, %eax
+ addl %r15d, %r11d
+ xorl %r8d, %eax
+ andl %eax, %ebx
+ addl %edx, %r15d
+ xorl %r9d, %ebx
+ rorxl $6, %r11d, %edx
+ rorxl $11, %r11d, %ecx
+ addl %ebx, %r15d
+ addl 468(%rsp), %r14d
+ movl %r12d, %ebx
+ xorl %edx, %ecx
+ xorl %r13d, %ebx
+ rorxl $25, %r11d, %edx
+ xorl %ecx, %edx
+ andl %r11d, %ebx
+ addl %edx, %r14d
+ rorxl $2, %r15d, %edx
+ rorxl $13, %r15d, %ecx
+ xorl %r13d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r15d, %edx
+ addl %ebx, %r14d
+ xorl %ecx, %edx
+ movl %r8d, %ebx
+ leal (%r10,%r14,1), %r10d
+ xorl %r15d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r14d
+ xorl %r8d, %eax
+ rorxl $6, %r10d, %edx
+ rorxl $11, %r10d, %ecx
+ leal (%r14,%rax,1), %r14d
+ addl 472(%rsp), %r13d
+ movl %r11d, %eax
+ xorl %edx, %ecx
+ xorl %r12d, %eax
+ rorxl $25, %r10d, %edx
+ xorl %ecx, %edx
+ andl %r10d, %eax
+ addl %edx, %r13d
+ rorxl $2, %r14d, %edx
+ rorxl $13, %r14d, %ecx
+ xorl %r12d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r14d, %edx
+ addl %eax, %r13d
+ xorl %ecx, %edx
+ movl %r15d, %eax
+ addl %r13d, %r9d
+ xorl %r14d, %eax
+ andl %eax, %ebx
+ addl %edx, %r13d
+ xorl %r15d, %ebx
+ rorxl $6, %r9d, %edx
+ rorxl $11, %r9d, %ecx
+ addl %ebx, %r13d
+ addl 476(%rsp), %r12d
+ movl %r10d, %ebx
+ xorl %edx, %ecx
+ xorl %r11d, %ebx
+ rorxl $25, %r9d, %edx
+ xorl %ecx, %edx
+ andl %r9d, %ebx
+ addl %edx, %r12d
+ rorxl $2, %r13d, %edx
+ rorxl $13, %r13d, %ecx
+ xorl %r11d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r13d, %edx
+ addl %ebx, %r12d
+ xorl %ecx, %edx
+ movl %r14d, %ebx
+ leal (%r8,%r12,1), %r8d
+ xorl %r13d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r12d
+ xorl %r14d, %eax
+ rorxl $6, %r8d, %edx
+ rorxl $11, %r8d, %ecx
+ leal (%r12,%rax,1), %r12d
+ addl 496(%rsp), %r11d
+ movl %r9d, %eax
+ xorl %edx, %ecx
+ xorl %r10d, %eax
+ rorxl $25, %r8d, %edx
+ xorl %ecx, %edx
+ andl %r8d, %eax
+ addl %edx, %r11d
+ rorxl $2, %r12d, %edx
+ rorxl $13, %r12d, %ecx
+ xorl %r10d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r12d, %edx
+ addl %eax, %r11d
+ xorl %ecx, %edx
+ movl %r13d, %eax
+ addl %r11d, %r15d
+ xorl %r12d, %eax
+ andl %eax, %ebx
+ addl %edx, %r11d
+ xorl %r13d, %ebx
+ rorxl $6, %r15d, %edx
+ rorxl $11, %r15d, %ecx
+ addl %ebx, %r11d
+ addl 500(%rsp), %r10d
+ movl %r8d, %ebx
+ xorl %edx, %ecx
+ xorl %r9d, %ebx
+ rorxl $25, %r15d, %edx
+ xorl %ecx, %edx
+ andl %r15d, %ebx
+ addl %edx, %r10d
+ rorxl $2, %r11d, %edx
+ rorxl $13, %r11d, %ecx
+ xorl %r9d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r11d, %edx
+ addl %ebx, %r10d
+ xorl %ecx, %edx
+ movl %r12d, %ebx
+ leal (%r14,%r10,1), %r14d
+ xorl %r11d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r10d
+ xorl %r12d, %eax
+ rorxl $6, %r14d, %edx
+ rorxl $11, %r14d, %ecx
+ leal (%r10,%rax,1), %r10d
+ addl 504(%rsp), %r9d
+ movl %r15d, %eax
+ xorl %edx, %ecx
+ xorl %r8d, %eax
+ rorxl $25, %r14d, %edx
+ xorl %ecx, %edx
+ andl %r14d, %eax
+ addl %edx, %r9d
+ rorxl $2, %r10d, %edx
+ rorxl $13, %r10d, %ecx
+ xorl %r8d, %eax
+ xorl %edx, %ecx
+ rorxl $22, %r10d, %edx
+ addl %eax, %r9d
+ xorl %ecx, %edx
+ movl %r11d, %eax
+ addl %r9d, %r13d
+ xorl %r10d, %eax
+ andl %eax, %ebx
+ addl %edx, %r9d
+ xorl %r11d, %ebx
+ rorxl $6, %r13d, %edx
+ rorxl $11, %r13d, %ecx
+ addl %ebx, %r9d
+ addl 508(%rsp), %r8d
+ movl %r14d, %ebx
+ xorl %edx, %ecx
+ xorl %r15d, %ebx
+ rorxl $25, %r13d, %edx
+ xorl %ecx, %edx
+ andl %r13d, %ebx
+ addl %edx, %r8d
+ rorxl $2, %r9d, %edx
+ rorxl $13, %r9d, %ecx
+ xorl %r15d, %ebx
+ xorl %edx, %ecx
+ rorxl $22, %r9d, %edx
+ addl %ebx, %r8d
+ xorl %ecx, %edx
+ movl %r10d, %ebx
+ leal (%r12,%r8,1), %r12d
+ xorl %r9d, %ebx
+ andl %ebx, %eax
+ addl %edx, %r8d
+ xorl %r10d, %eax
+ addl %eax, %r8d
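+ /* All rounds for this pass are complete: add the working variables
+  * (a..h in %r8d..%r15d) back into the hash state at (%rdi), advance the
+  * input pointer in %rbp by 128 bytes, subtract 128 from the remaining
+  * length in %esi, and loop back while input remains.
+  */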
+ addq $0x80, %rbp
+ addl (%rdi), %r8d
+ addl 4(%rdi), %r9d
+ addl 8(%rdi), %r10d
+ addl 12(%rdi), %r11d
+ addl 16(%rdi), %r12d
+ addl 20(%rdi), %r13d
+ addl 24(%rdi), %r14d
+ addl 28(%rdi), %r15d
+ subl $0x80, %esi
+ movl %r8d, (%rdi)
+ movl %r9d, 4(%rdi)
+ movl %r10d, 8(%rdi)
+ movl %r11d, 12(%rdi)
+ movl %r12d, 16(%rdi)
+ movl %r13d, 20(%rdi)
+ movl %r14d, 24(%rdi)
+ movl %r15d, 28(%rdi)
+ jnz L_sha256_len_avx2_rorx_start
+L_sha256_len_avx2_rorx_done:
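+ /* Epilogue: return 0 in %rax, clear the upper ymm state with vzeroupper
+  * to avoid AVX/SSE transition penalties in the caller, release the
+  * 0x200-byte schedule area and restore the callee-saved registers. The
+  * two-byte repz retq is the usual branch-predictor-friendly return
+  * encoding.
+  */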
+ xorq %rax, %rax
+ vzeroupper
+ addq $0x200, %rsp
+ popq %rbp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size Transform_Sha256_AVX2_RORX_Len,.-Transform_Sha256_AVX2_RORX_Len
+#endif /* __APPLE__ */
+#endif /* HAVE_INTEL_AVX2 */