From c0ff0d4528d718c20b9ca2290bd10d59e9f794a3 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Wed, 9 Feb 2022 00:33:39 +0100
Subject: libbb/sha256: code shrink in 32-bit x86

function                                             old     new   delta
sha256_process_block64_shaNI                         713     697     -16

Signed-off-by: Denys Vlasenko
---
 libbb/hash_md5_sha256_x86-32_shaNI.S | 130 ++++++++++++++++-------------------
 libbb/hash_md5_sha256_x86-64_shaNI.S | 107 ++++++++++++++--------------
 2 files changed, 114 insertions(+), 123 deletions(-)
(limited to 'libbb')

diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S
index 39e2baf41..a849dfcc2 100644
--- a/libbb/hash_md5_sha256_x86-32_shaNI.S
+++ b/libbb/hash_md5_sha256_x86-32_shaNI.S
@@ -31,35 +31,27 @@
 #define MSGTMP1	%xmm4
 #define MSGTMP2	%xmm5
 #define MSGTMP3	%xmm6
-#define XMMTMP4	%xmm7
-	.balign	8	# allow decoders to fetch at least 3 first insns
-sha256_process_block64_shaNI:
-	pushl	%ebp
-	movl	%esp, %ebp
-	subl	$32, %esp
-	andl	$~0xF, %esp	# paddd needs aligned memory operand
+#define XMMTMP	%xmm7
+	.balign	8	# allow decoders to fetch at least 2 first insns
+sha256_process_block64_shaNI:
 	movu128	76+0*16(%eax), STATE0
 	movu128	76+1*16(%eax), STATE1
 
-	shuf128_32	$0xB1, STATE0, STATE0	/* CDAB */
-	shuf128_32	$0x1B, STATE1, STATE1	/* EFGH */
-	mova128	STATE0, XMMTMP4
-	palignr	$8, STATE1, STATE0	/* ABEF */
-	pblendw	$0xF0, XMMTMP4, STATE1	/* CDGH */
+	shuf128_32	$0xB1, STATE0, STATE0	/* CDAB */
+	shuf128_32	$0x1B, STATE1, STATE1	/* EFGH */
+	mova128	STATE0, XMMTMP
+	palignr	$8, STATE1, STATE0	/* ABEF */
+	pblendw	$0xF0, XMMTMP, STATE1	/* CDGH */
 
-/* XMMTMP4 holds flip mask from here... */
-	mova128	PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP4
+/* XMMTMP holds flip mask from here... */
+	mova128	PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP
 	movl	$K256+8*16, SHA256CONSTANTS
 
-	/* Save hash values for addition after rounds */
-	mova128	STATE0, 0*16(%esp)
-	mova128	STATE1, 1*16(%esp)
-
 	/* Rounds 0-3 */
 	movu128	0*16(DATA_PTR), MSG
-	pshufb	XMMTMP4, MSG
+	pshufb	XMMTMP, MSG
 	mova128	MSG, MSGTMP0
 	paddd	0*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
@@ -68,7 +60,7 @@ sha256_process_block64_shaNI:
 
 	/* Rounds 4-7 */
 	movu128	1*16(DATA_PTR), MSG
-	pshufb	XMMTMP4, MSG
+	pshufb	XMMTMP, MSG
 	mova128	MSG, MSGTMP1
 	paddd	1*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
@@ -78,7 +70,7 @@ sha256_process_block64_shaNI:
 
 	/* Rounds 8-11 */
 	movu128	2*16(DATA_PTR), MSG
-	pshufb	XMMTMP4, MSG
+	pshufb	XMMTMP, MSG
 	mova128	MSG, MSGTMP2
 	paddd	2*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
@@ -88,14 +80,14 @@ sha256_process_block64_shaNI:
 
 	/* Rounds 12-15 */
 	movu128	3*16(DATA_PTR), MSG
-	pshufb	XMMTMP4, MSG
+	pshufb	XMMTMP, MSG
 /* ...to here */
 	mova128	MSG, MSGTMP3
 	paddd	3*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP3, XMMTMP4
-	palignr	$4, MSGTMP2, XMMTMP4
-	paddd	XMMTMP4, MSGTMP0
+	mova128	MSGTMP3, XMMTMP
+	palignr	$4, MSGTMP2, XMMTMP
+	paddd	XMMTMP, MSGTMP0
 	sha256msg2	MSGTMP3, MSGTMP0
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -105,9 +97,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP0, MSG
 	paddd	4*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP0, XMMTMP4
-	palignr	$4, MSGTMP3, XMMTMP4
-	paddd	XMMTMP4, MSGTMP1
+	mova128	MSGTMP0, XMMTMP
+	palignr	$4, MSGTMP3, XMMTMP
+	paddd	XMMTMP, MSGTMP1
 	sha256msg2	MSGTMP0, MSGTMP1
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -117,9 +109,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP1, MSG
 	paddd	5*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP1, XMMTMP4
-	palignr	$4, MSGTMP0, XMMTMP4
-	paddd	XMMTMP4, MSGTMP2
+	mova128	MSGTMP1, XMMTMP
+	palignr	$4, MSGTMP0, XMMTMP
+	paddd	XMMTMP, MSGTMP2
 	sha256msg2	MSGTMP1, MSGTMP2
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -129,9 +121,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP2, MSG
 	paddd	6*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP2, XMMTMP4
-	palignr	$4, MSGTMP1, XMMTMP4
-	paddd	XMMTMP4, MSGTMP3
+	mova128	MSGTMP2, XMMTMP
+	palignr	$4, MSGTMP1, XMMTMP
+	paddd	XMMTMP, MSGTMP3
 	sha256msg2	MSGTMP2, MSGTMP3
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -141,9 +133,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP3, MSG
 	paddd	7*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP3, XMMTMP4
-	palignr	$4, MSGTMP2, XMMTMP4
-	paddd	XMMTMP4, MSGTMP0
+	mova128	MSGTMP3, XMMTMP
+	palignr	$4, MSGTMP2, XMMTMP
+	paddd	XMMTMP, MSGTMP0
 	sha256msg2	MSGTMP3, MSGTMP0
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -153,9 +145,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP0, MSG
 	paddd	8*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP0, XMMTMP4
-	palignr	$4, MSGTMP3, XMMTMP4
-	paddd	XMMTMP4, MSGTMP1
+	mova128	MSGTMP0, XMMTMP
+	palignr	$4, MSGTMP3, XMMTMP
+	paddd	XMMTMP, MSGTMP1
 	sha256msg2	MSGTMP0, MSGTMP1
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -165,9 +157,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP1, MSG
 	paddd	9*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP1, XMMTMP4
-	palignr	$4, MSGTMP0, XMMTMP4
-	paddd	XMMTMP4, MSGTMP2
+	mova128	MSGTMP1, XMMTMP
+	palignr	$4, MSGTMP0, XMMTMP
+	paddd	XMMTMP, MSGTMP2
 	sha256msg2	MSGTMP1, MSGTMP2
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -177,9 +169,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP2, MSG
 	paddd	10*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP2, XMMTMP4
-	palignr	$4, MSGTMP1, XMMTMP4
-	paddd	XMMTMP4, MSGTMP3
+	mova128	MSGTMP2, XMMTMP
+	palignr	$4, MSGTMP1, XMMTMP
+	paddd	XMMTMP, MSGTMP3
 	sha256msg2	MSGTMP2, MSGTMP3
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -189,9 +181,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP3, MSG
 	paddd	11*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP3, XMMTMP4
-	palignr	$4, MSGTMP2, XMMTMP4
-	paddd	XMMTMP4, MSGTMP0
+	mova128	MSGTMP3, XMMTMP
+	palignr	$4, MSGTMP2, XMMTMP
+	paddd	XMMTMP, MSGTMP0
 	sha256msg2	MSGTMP3, MSGTMP0
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -201,9 +193,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP0, MSG
 	paddd	12*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP0, XMMTMP4
-	palignr	$4, MSGTMP3, XMMTMP4
-	paddd	XMMTMP4, MSGTMP1
+	mova128	MSGTMP0, XMMTMP
+	palignr	$4, MSGTMP3, XMMTMP
+	paddd	XMMTMP, MSGTMP1
 	sha256msg2	MSGTMP0, MSGTMP1
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -213,9 +205,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP1, MSG
 	paddd	13*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP1, XMMTMP4
-	palignr	$4, MSGTMP0, XMMTMP4
-	paddd	XMMTMP4, MSGTMP2
+	mova128	MSGTMP1, XMMTMP
+	palignr	$4, MSGTMP0, XMMTMP
+	paddd	XMMTMP, MSGTMP2
 	sha256msg2	MSGTMP1, MSGTMP2
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -224,9 +216,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP2, MSG
 	paddd	14*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP2, XMMTMP4
-	palignr	$4, MSGTMP1, XMMTMP4
-	paddd	XMMTMP4, MSGTMP3
+	mova128	MSGTMP2, XMMTMP
+	palignr	$4, MSGTMP1, XMMTMP
+	paddd	XMMTMP, MSGTMP3
 	sha256msg2	MSGTMP2, MSGTMP3
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -238,22 +230,20 @@ sha256_process_block64_shaNI:
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
 
-	/* Add current hash values with previously saved */
-	paddd	0*16(%esp), STATE0
-	paddd	1*16(%esp), STATE1
-
 	/* Write hash values back in the correct order */
-	shuf128_32	$0x1B, STATE0, STATE0	/* FEBA */
-	shuf128_32	$0xB1, STATE1, STATE1	/* DCHG */
-	mova128	STATE0, XMMTMP4
-	pblendw	$0xF0, STATE1, STATE0	/* DCBA */
-	palignr	$8, XMMTMP4, STATE1	/* HGFE */
-
+	shuf128_32	$0x1B, STATE0, STATE0	/* FEBA */
+	shuf128_32	$0xB1, STATE1, STATE1	/* DCHG */
+	mova128	STATE0, XMMTMP
+	pblendw	$0xF0, STATE1, STATE0	/* DCBA */
+	palignr	$8, XMMTMP, STATE1	/* HGFE */
+	/* add current hash values to previous ones */
+	movu128	76+0*16(%eax), XMMTMP
+	paddd	XMMTMP, STATE0
+	movu128	76+1*16(%eax), XMMTMP
 	movu128	STATE0, 76+0*16(%eax)
+	paddd	XMMTMP, STATE1
 	movu128	STATE1, 76+1*16(%eax)
 
-	movl	%ebp, %esp
-	popl	%ebp
 	ret
 	.size	sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S
index c6c931341..b5c950a9a 100644
--- a/libbb/hash_md5_sha256_x86-64_shaNI.S
+++ b/libbb/hash_md5_sha256_x86-64_shaNI.S
@@ -31,7 +31,8 @@
 #define MSGTMP1	%xmm4
 #define MSGTMP2	%xmm5
 #define MSGTMP3	%xmm6
-#define XMMTMP4	%xmm7
+
+#define XMMTMP	%xmm7
 
 #define ABEF_SAVE	%xmm9
 #define CDGH_SAVE	%xmm10
@@ -41,14 +42,14 @@ sha256_process_block64_shaNI:
 	movu128	80+0*16(%rdi), STATE0
 	movu128	80+1*16(%rdi), STATE1
 
-	shuf128_32	$0xB1, STATE0, STATE0	/* CDAB */
-	shuf128_32	$0x1B, STATE1, STATE1	/* EFGH */
-	mova128	STATE0, XMMTMP4
-	palignr	$8, STATE1, STATE0	/* ABEF */
-	pblendw	$0xF0, XMMTMP4, STATE1	/* CDGH */
+	shuf128_32	$0xB1, STATE0, STATE0	/* CDAB */
+	shuf128_32	$0x1B, STATE1, STATE1	/* EFGH */
+	mova128	STATE0, XMMTMP
+	palignr	$8, STATE1, STATE0	/* ABEF */
+	pblendw	$0xF0, XMMTMP, STATE1	/* CDGH */
 
-/* XMMTMP4 holds flip mask from here... */
-	mova128	PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP4
+/* XMMTMP holds flip mask from here... */
+	mova128	PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP
 	leaq	K256+8*16(%rip), SHA256CONSTANTS
 
 	/* Save hash values for addition after rounds */
@@ -57,7 +58,7 @@ sha256_process_block64_shaNI:
 
 	/* Rounds 0-3 */
 	movu128	0*16(DATA_PTR), MSG
-	pshufb	XMMTMP4, MSG
+	pshufb	XMMTMP, MSG
 	mova128	MSG, MSGTMP0
 	paddd	0*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
@@ -66,7 +67,7 @@ sha256_process_block64_shaNI:
 
 	/* Rounds 4-7 */
 	movu128	1*16(DATA_PTR), MSG
-	pshufb	XMMTMP4, MSG
+	pshufb	XMMTMP, MSG
 	mova128	MSG, MSGTMP1
 	paddd	1*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
@@ -76,7 +77,7 @@ sha256_process_block64_shaNI:
 
 	/* Rounds 8-11 */
 	movu128	2*16(DATA_PTR), MSG
-	pshufb	XMMTMP4, MSG
+	pshufb	XMMTMP, MSG
 	mova128	MSG, MSGTMP2
 	paddd	2*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
@@ -86,14 +87,14 @@ sha256_process_block64_shaNI:
 
 	/* Rounds 12-15 */
 	movu128	3*16(DATA_PTR), MSG
-	pshufb	XMMTMP4, MSG
+	pshufb	XMMTMP, MSG
 /* ...to here */
 	mova128	MSG, MSGTMP3
 	paddd	3*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP3, XMMTMP4
-	palignr	$4, MSGTMP2, XMMTMP4
-	paddd	XMMTMP4, MSGTMP0
+	mova128	MSGTMP3, XMMTMP
+	palignr	$4, MSGTMP2, XMMTMP
+	paddd	XMMTMP, MSGTMP0
 	sha256msg2	MSGTMP3, MSGTMP0
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -103,9 +104,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP0, MSG
 	paddd	4*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP0, XMMTMP4
-	palignr	$4, MSGTMP3, XMMTMP4
-	paddd	XMMTMP4, MSGTMP1
+	mova128	MSGTMP0, XMMTMP
+	palignr	$4, MSGTMP3, XMMTMP
+	paddd	XMMTMP, MSGTMP1
 	sha256msg2	MSGTMP0, MSGTMP1
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -115,9 +116,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP1, MSG
 	paddd	5*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP1, XMMTMP4
-	palignr	$4, MSGTMP0, XMMTMP4
-	paddd	XMMTMP4, MSGTMP2
+	mova128	MSGTMP1, XMMTMP
+	palignr	$4, MSGTMP0, XMMTMP
+	paddd	XMMTMP, MSGTMP2
 	sha256msg2	MSGTMP1, MSGTMP2
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -127,9 +128,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP2, MSG
 	paddd	6*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP2, XMMTMP4
-	palignr	$4, MSGTMP1, XMMTMP4
-	paddd	XMMTMP4, MSGTMP3
+	mova128	MSGTMP2, XMMTMP
+	palignr	$4, MSGTMP1, XMMTMP
+	paddd	XMMTMP, MSGTMP3
 	sha256msg2	MSGTMP2, MSGTMP3
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -139,9 +140,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP3, MSG
 	paddd	7*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP3, XMMTMP4
-	palignr	$4, MSGTMP2, XMMTMP4
-	paddd	XMMTMP4, MSGTMP0
+	mova128	MSGTMP3, XMMTMP
+	palignr	$4, MSGTMP2, XMMTMP
+	paddd	XMMTMP, MSGTMP0
 	sha256msg2	MSGTMP3, MSGTMP0
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -151,9 +152,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP0, MSG
 	paddd	8*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP0, XMMTMP4
-	palignr	$4, MSGTMP3, XMMTMP4
-	paddd	XMMTMP4, MSGTMP1
+	mova128	MSGTMP0, XMMTMP
+	palignr	$4, MSGTMP3, XMMTMP
+	paddd	XMMTMP, MSGTMP1
 	sha256msg2	MSGTMP0, MSGTMP1
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -163,9 +164,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP1, MSG
 	paddd	9*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP1, XMMTMP4
-	palignr	$4, MSGTMP0, XMMTMP4
-	paddd	XMMTMP4, MSGTMP2
+	mova128	MSGTMP1, XMMTMP
+	palignr	$4, MSGTMP0, XMMTMP
+	paddd	XMMTMP, MSGTMP2
 	sha256msg2	MSGTMP1, MSGTMP2
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -175,9 +176,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP2, MSG
 	paddd	10*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP2, XMMTMP4
-	palignr	$4, MSGTMP1, XMMTMP4
-	paddd	XMMTMP4, MSGTMP3
+	mova128	MSGTMP2, XMMTMP
+	palignr	$4, MSGTMP1, XMMTMP
+	paddd	XMMTMP, MSGTMP3
 	sha256msg2	MSGTMP2, MSGTMP3
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -187,9 +188,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP3, MSG
 	paddd	11*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP3, XMMTMP4
-	palignr	$4, MSGTMP2, XMMTMP4
-	paddd	XMMTMP4, MSGTMP0
+	mova128	MSGTMP3, XMMTMP
+	palignr	$4, MSGTMP2, XMMTMP
+	paddd	XMMTMP, MSGTMP0
 	sha256msg2	MSGTMP3, MSGTMP0
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -199,9 +200,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP0, MSG
 	paddd	12*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP0, XMMTMP4
-	palignr	$4, MSGTMP3, XMMTMP4
-	paddd	XMMTMP4, MSGTMP1
+	mova128	MSGTMP0, XMMTMP
+	palignr	$4, MSGTMP3, XMMTMP
+	paddd	XMMTMP, MSGTMP1
 	sha256msg2	MSGTMP0, MSGTMP1
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -211,9 +212,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP1, MSG
 	paddd	13*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP1, XMMTMP4
-	palignr	$4, MSGTMP0, XMMTMP4
-	paddd	XMMTMP4, MSGTMP2
+	mova128	MSGTMP1, XMMTMP
+	palignr	$4, MSGTMP0, XMMTMP
+	paddd	XMMTMP, MSGTMP2
 	sha256msg2	MSGTMP1, MSGTMP2
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -222,9 +223,9 @@ sha256_process_block64_shaNI:
 	mova128	MSGTMP2, MSG
 	paddd	14*16-8*16(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
-	mova128	MSGTMP2, XMMTMP4
-	palignr	$4, MSGTMP1, XMMTMP4
-	paddd	XMMTMP4, MSGTMP3
+	mova128	MSGTMP2, XMMTMP
+	palignr	$4, MSGTMP1, XMMTMP
+	paddd	XMMTMP, MSGTMP3
 	sha256msg2	MSGTMP2, MSGTMP3
 	shuf128_32	$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
@@ -241,11 +242,11 @@ sha256_process_block64_shaNI:
 	paddd	CDGH_SAVE, STATE1
 
 	/* Write hash values back in the correct order */
-	shuf128_32	$0x1B, STATE0, STATE0	/* FEBA */
-	shuf128_32	$0xB1, STATE1, STATE1	/* DCHG */
-	mova128	STATE0, XMMTMP4
-	pblendw	$0xF0, STATE1, STATE0	/* DCBA */
-	palignr	$8, XMMTMP4, STATE1	/* HGFE */
+	shuf128_32	$0x1B, STATE0, STATE0	/* FEBA */
+	shuf128_32	$0xB1, STATE1, STATE1	/* DCHG */
+	mova128	STATE0, XMMTMP
+	pblendw	$0xF0, STATE1, STATE0	/* DCBA */
+	palignr	$8, XMMTMP, STATE1	/* HGFE */
 
 	movu128	STATE0, 80+0*16(%rdi)
 	movu128	STATE1, 80+1*16(%rdi)
--
cgit v1.2.1
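
Note: as a rough before/after sketch of the 32-bit change, pieced together from the
hunks above (a summary, not applicable code), the 16-byte shrink comes from dropping
the stack frame that existed only to hold an aligned copy of the incoming hash state
for the final addition. The new code re-reads that state through %eax and adds it
with register operands, so no aligned spill slot or prologue/epilogue is needed; the
interleaved stores back to the context are omitted here for brevity:

	# old (sketch): spill state to an aligned stack slot, add it back after the rounds
	pushl	%ebp
	movl	%esp, %ebp
	subl	$32, %esp
	andl	$~0xF, %esp	# paddd with a memory operand needs 16-byte alignment
	mova128	STATE0, 0*16(%esp)
	mova128	STATE1, 1*16(%esp)
	...
	paddd	0*16(%esp), STATE0
	paddd	1*16(%esp), STATE1
	movl	%ebp, %esp
	popl	%ebp

	# new (sketch): reload the previous state from the context (%eax) into the
	# XMMTMP scratch register and add register-to-register
	movu128	76+0*16(%eax), XMMTMP
	paddd	XMMTMP, STATE0
	movu128	76+1*16(%eax), XMMTMP
	paddd	XMMTMP, STATE1

The 64-bit version keeps its ABEF_SAVE/CDGH_SAVE copies, since it has spare XMM
registers; there the change is essentially the rename of XMMTMP4 to XMMTMP.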