author     Jussi Kivilinna <jussi.kivilinna@iki.fi>   2013-10-22 17:07:53 +0300
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>   2013-10-22 19:53:29 +0300
commit     c7efaa5fe0ee92e321a7b49d56752cc12eb75fe0 (patch)
tree       c090454520de01a26a0357dc257cede6639674c3
parent     335d9bf7b035815750b63a3a8334d6ce44dc4449 (diff)
download   libgcrypt-c7efaa5fe0ee92e321a7b49d56752cc12eb75fe0.tar.gz
serpent-amd64: do not use GAS macros
* cipher/serpent-avx2-amd64.S: Remove use of GAS macros.
* cipher/serpent-sse2-amd64.S: Ditto.
* configure.ac [HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS]: Do not check
  for GAS macros.
--

This way we have better portability; for example, when compiling with
clang on x86-64, the assembly implementations are now enabled and
working.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
-rw-r--r--   cipher/serpent-avx2-amd64.S   519
-rw-r--r--   cipher/serpent-sse2-amd64.S   507
-rw-r--r--   configure.ac                    7
3 files changed, 440 insertions(+), 593 deletions(-)
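For reference, a minimal sketch of the two register-aliasing styles this patch
swaps (the RX*/RY* names are illustrative only, not taken from the patch):

	/* Old style: GAS symbol aliases, re-targetable mid-function.  This is
	 * what sbox_reg_rename relied on, and what clang's integrated
	 * assembler did not accept at the time of this commit. */
	.set RX0, %ymm0
	.set RX1, %ymm1
	vpxor RX1, RX0, RX0	/* assembles as: vpxor %ymm1, %ymm0, %ymm0 */
	.set RX0, %ymm1		/* alias silently re-pointed */

	/* New style: fixed C-preprocessor aliases (the .S files are run
	 * through cpp), so any register permutation has to be spelled out
	 * explicitly in the macro arguments instead. */
	#define RY0 %ymm0
	#define RY1 %ymm1
	vpxor RY1, RY0, RY0	/* vpxor %ymm1, %ymm0, %ymm0 */

Because the #define aliases can no longer be re-pointed, the per-S-box
permutation formerly done by sbox_reg_rename is carried instead by the extra
output-register arguments (na*/nb*) and the explicit round numbers passed to
ROUND, ROUND_LAST, ROUND_INVERSE and ROUND_FIRST_INVERSE in the diff below.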
diff --git a/cipher/serpent-avx2-amd64.S b/cipher/serpent-avx2-amd64.S
index c726e7ba..8a76ab1f 100644
--- a/cipher/serpent-avx2-amd64.S
+++ b/cipher/serpent-avx2-amd64.S
@@ -36,51 +36,36 @@
#define CTX %rdi
/* vector registers */
-.set RA0, %ymm0
-.set RA1, %ymm1
-.set RA2, %ymm2
-.set RA3, %ymm3
-.set RA4, %ymm4
-
-.set RB0, %ymm5
-.set RB1, %ymm6
-.set RB2, %ymm7
-.set RB3, %ymm8
-.set RB4, %ymm9
-
-.set RNOT, %ymm10
-.set RTMP0, %ymm11
-.set RTMP1, %ymm12
-.set RTMP2, %ymm13
-.set RTMP3, %ymm14
-.set RTMP4, %ymm15
-
-.set RNOTx, %xmm10
-.set RTMP0x, %xmm11
-.set RTMP1x, %xmm12
-.set RTMP2x, %xmm13
-.set RTMP3x, %xmm14
-.set RTMP4x, %xmm15
+#define RA0 %ymm0
+#define RA1 %ymm1
+#define RA2 %ymm2
+#define RA3 %ymm3
+#define RA4 %ymm4
+
+#define RB0 %ymm5
+#define RB1 %ymm6
+#define RB2 %ymm7
+#define RB3 %ymm8
+#define RB4 %ymm9
+
+#define RNOT %ymm10
+#define RTMP0 %ymm11
+#define RTMP1 %ymm12
+#define RTMP2 %ymm13
+#define RTMP3 %ymm14
+#define RTMP4 %ymm15
+
+#define RNOTx %xmm10
+#define RTMP0x %xmm11
+#define RTMP1x %xmm12
+#define RTMP2x %xmm13
+#define RTMP3x %xmm14
+#define RTMP4x %xmm15
/**********************************************************************
helper macros
**********************************************************************/
-/* preprocessor macro for renaming vector registers using GAS macros */
-#define sbox_reg_rename(r0, r1, r2, r3, r4, \
- new_r0, new_r1, new_r2, new_r3, new_r4) \
- .set rename_reg0, new_r0; \
- .set rename_reg1, new_r1; \
- .set rename_reg2, new_r2; \
- .set rename_reg3, new_r3; \
- .set rename_reg4, new_r4; \
- \
- .set r0, rename_reg0; \
- .set r1, rename_reg1; \
- .set r2, rename_reg2; \
- .set r3, rename_reg3; \
- .set r4, rename_reg4;
-
/* vector 32-bit rotation to left */
#define vec_rol(reg, nleft, tmp) \
vpslld $(nleft), reg, tmp; \
@@ -128,9 +113,7 @@
vpxor r4, r2, r2; vpxor RNOT, r4, r4; \
vpor r1, r4, r4; vpxor r3, r1, r1; \
vpxor r4, r1, r1; vpor r0, r3, r3; \
- vpxor r3, r1, r1; vpxor r3, r4, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r0,r3);
+ vpxor r3, r1, r1; vpxor r3, r4, r4;
#define SBOX0_INVERSE(r0, r1, r2, r3, r4) \
vpxor RNOT, r2, r2; vmovdqa r1, r4; \
@@ -143,9 +126,7 @@
vpxor r1, r2, r2; vpxor r0, r3, r3; \
vpxor r1, r3, r3; \
vpand r3, r2, r2; \
- vpxor r2, r4, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r0,r4,r1,r3,r2);
+ vpxor r2, r4, r4;
#define SBOX1(r0, r1, r2, r3, r4) \
vpxor RNOT, r0, r0; vpxor RNOT, r2, r2; \
@@ -157,9 +138,7 @@
vpand r4, r2, r2; vpxor r1, r0, r0; \
vpand r2, r1, r1; \
vpxor r0, r1, r1; vpand r2, r0, r0; \
- vpxor r4, r0, r0; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r2,r0,r3,r1,r4);
+ vpxor r4, r0, r0;
#define SBOX1_INVERSE(r0, r1, r2, r3, r4) \
vmovdqa r1, r4; vpxor r3, r1, r1; \
@@ -172,9 +151,7 @@
vpxor r1, r4, r4; vpor r0, r1, r1; \
vpxor r0, r1, r1; \
vpor r4, r1, r1; \
- vpxor r1, r3, r3; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r4,r0,r3,r2,r1);
+ vpxor r1, r3, r3;
#define SBOX2(r0, r1, r2, r3, r4) \
vmovdqa r0, r4; vpand r2, r0, r0; \
@@ -184,9 +161,7 @@
vmovdqa r3, r1; vpor r4, r3, r3; \
vpxor r0, r3, r3; vpand r1, r0, r0; \
vpxor r0, r4, r4; vpxor r3, r1, r1; \
- vpxor r4, r1, r1; vpxor RNOT, r4, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r2,r3,r1,r4,r0);
+ vpxor r4, r1, r1; vpxor RNOT, r4, r4;
#define SBOX2_INVERSE(r0, r1, r2, r3, r4) \
vpxor r3, r2, r2; vpxor r0, r3, r3; \
@@ -198,9 +173,7 @@
vpor r0, r2, r2; vpxor RNOT, r3, r3; \
vpxor r3, r2, r2; vpxor r3, r0, r0; \
vpand r1, r0, r0; vpxor r4, r3, r3; \
- vpxor r0, r3, r3; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r3,r0);
+ vpxor r0, r3, r3;
#define SBOX3(r0, r1, r2, r3, r4) \
vmovdqa r0, r4; vpor r3, r0, r0; \
@@ -212,9 +185,7 @@
vpxor r2, r4, r4; vpor r0, r1, r1; \
vpxor r2, r1, r1; vpxor r3, r0, r0; \
vmovdqa r1, r2; vpor r3, r1, r1; \
- vpxor r0, r1, r1; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r3,r4,r0);
+ vpxor r0, r1, r1;
#define SBOX3_INVERSE(r0, r1, r2, r3, r4) \
vmovdqa r2, r4; vpxor r1, r2, r2; \
@@ -226,9 +197,7 @@
vpxor r1, r3, r3; vpxor r0, r1, r1; \
vpor r2, r1, r1; vpxor r3, r0, r0; \
vpxor r4, r1, r1; \
- vpxor r1, r0, r0; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r2,r1,r3,r0,r4);
+ vpxor r1, r0, r0;
#define SBOX4(r0, r1, r2, r3, r4) \
vpxor r3, r1, r1; vpxor RNOT, r3, r3; \
@@ -240,9 +209,7 @@
vpxor r0, r3, r3; vpor r1, r4, r4; \
vpxor r0, r4, r4; vpor r3, r0, r0; \
vpxor r2, r0, r0; vpand r3, r2, r2; \
- vpxor RNOT, r0, r0; vpxor r2, r4, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r0,r3,r2);
+ vpxor RNOT, r0, r0; vpxor r2, r4, r4;
#define SBOX4_INVERSE(r0, r1, r2, r3, r4) \
vmovdqa r2, r4; vpand r3, r2, r2; \
@@ -255,9 +222,7 @@
vpand r0, r2, r2; vpxor r0, r3, r3; \
vpxor r4, r2, r2; \
vpor r3, r2, r2; vpxor r0, r3, r3; \
- vpxor r1, r2, r2; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r0,r3,r2,r4,r1);
+ vpxor r1, r2, r2;
#define SBOX5(r0, r1, r2, r3, r4) \
vpxor r1, r0, r0; vpxor r3, r1, r1; \
@@ -269,9 +234,7 @@
vpxor r2, r4, r4; vpxor r0, r2, r2; \
vpand r3, r0, r0; vpxor RNOT, r2, r2; \
vpxor r4, r0, r0; vpor r3, r4, r4; \
- vpxor r4, r2, r2; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r3,r0,r2,r4);
+ vpxor r4, r2, r2;
#define SBOX5_INVERSE(r0, r1, r2, r3, r4) \
vpxor RNOT, r1, r1; vmovdqa r3, r4; \
@@ -283,9 +246,7 @@
vpxor r3, r1, r1; vpxor r2, r4, r4; \
vpand r4, r3, r3; vpxor r1, r4, r4; \
vpxor r4, r3, r3; vpxor RNOT, r4, r4; \
- vpxor r0, r3, r3; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r3,r2,r0);
+ vpxor r0, r3, r3;
#define SBOX6(r0, r1, r2, r3, r4) \
vpxor RNOT, r2, r2; vmovdqa r3, r4; \
@@ -297,9 +258,7 @@
vpxor r2, r0, r0; vpxor r3, r4, r4; \
vpxor r0, r4, r4; vpxor RNOT, r3, r3; \
vpand r4, r2, r2; \
- vpxor r3, r2, r2; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r0,r1,r4,r2,r3);
+ vpxor r3, r2, r2;
#define SBOX6_INVERSE(r0, r1, r2, r3, r4) \
vpxor r2, r0, r0; vmovdqa r2, r4; \
@@ -310,9 +269,7 @@
vpxor r1, r4, r4; vpand r3, r1, r1; \
vpxor r0, r1, r1; vpxor r3, r0, r0; \
vpor r2, r0, r0; vpxor r1, r3, r3; \
- vpxor r0, r4, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r4,r3,r0);
+ vpxor r0, r4, r4;
#define SBOX7(r0, r1, r2, r3, r4) \
vmovdqa r1, r4; vpor r2, r1, r1; \
@@ -325,9 +282,7 @@
vpxor r1, r2, r2; vpand r0, r1, r1; \
vpxor r4, r1, r1; vpxor RNOT, r2, r2; \
vpor r0, r2, r2; \
- vpxor r2, r4, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r4,r3,r1,r0,r2);
+ vpxor r2, r4, r4;
#define SBOX7_INVERSE(r0, r1, r2, r3, r4) \
vmovdqa r2, r4; vpxor r0, r2, r2; \
@@ -339,9 +294,7 @@
vpor r2, r0, r0; vpxor r1, r4, r4; \
vpxor r3, r0, r0; vpxor r4, r3, r3; \
vpor r0, r4, r4; vpxor r2, r3, r3; \
- vpxor r2, r4, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r3,r0,r1,r4,r2);
+ vpxor r2, r4, r4;
/* Apply SBOX number WHICH to to the block. */
#define SBOX(which, r0, r1, r2, r3, r4) \
@@ -402,49 +355,51 @@
/* Apply a Serpent round to sixteen parallel blocks. This macro increments
`round'. */
-#define ROUND(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
- SBOX (which, a0, a1, a2, a3, a4); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- SBOX (which, b0, b1, b2, b3, b4); \
- LINEAR_TRANSFORMATION (a0, a1, a2, a3, a4); \
- LINEAR_TRANSFORMATION (b0, b1, b2, b3, b4); \
- .set round, (round + 1);
+#define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ SBOX (which, a0, a1, a2, a3, a4); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
+ SBOX (which, b0, b1, b2, b3, b4); \
+ LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4); \
+ LINEAR_TRANSFORMATION (nb0, nb1, nb2, nb3, nb4);
/* Apply the last Serpent round to sixteen parallel blocks. This macro
increments `round'. */
-#define ROUND_LAST(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
- SBOX (which, a0, a1, a2, a3, a4); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- SBOX (which, b0, b1, b2, b3, b4); \
- .set round, (round + 1); \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- .set round, (round + 1);
+#define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ SBOX (which, a0, a1, a2, a3, a4); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
+ SBOX (which, b0, b1, b2, b3, b4); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, ((round) + 1)); \
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, ((round) + 1));
/* Apply an inverse Serpent round to sixteen parallel blocks. This macro
increments `round'. */
-#define ROUND_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+#define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \
+ na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, \
+ nb0, nb1, nb2, nb3, nb4) \
LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4); \
LINEAR_TRANSFORMATION_INVERSE (b0, b1, b2, b3, b4); \
SBOX_INVERSE (which, a0, a1, a2, a3, a4); \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \
SBOX_INVERSE (which, b0, b1, b2, b3, b4); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- .set round, (round - 1);
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);
/* Apply the first inverse Serpent round to sixteen parallel blocks. This macro
increments `round'. */
-#define ROUND_FIRST_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- .set round, (round - 1); \
+#define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \
+ na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, \
+ nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, ((round) + 1)); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, ((round) + 1)); \
SBOX_INVERSE (which, a0, a1, a2, a3, a4); \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \
SBOX_INVERSE (which, b0, b1, b2, b3, b4); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- .set round, (round - 1);
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);
.text
@@ -456,72 +411,82 @@ __serpent_enc_blk16:
* RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
* plaintext blocks
* output:
- * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+ * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: sixteen parallel
* ciphertext blocks
*/
- /* record input vector names for __serpent_enc_blk16 */
- .set enc_in_a0, RA0
- .set enc_in_a1, RA1
- .set enc_in_a2, RA2
- .set enc_in_a3, RA3
- .set enc_in_b0, RB0
- .set enc_in_b1, RB1
- .set enc_in_b2, RB2
- .set enc_in_b3, RB3
-
vpcmpeqd RNOT, RNOT, RNOT;
transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
- .set round, 0
- ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
- ROUND_LAST (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
- transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
- transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
-
- /* record output vector names for __serpent_enc_blk16 */
- .set enc_out_a0, RA0
- .set enc_out_a1, RA1
- .set enc_out_a2, RA2
- .set enc_out_a3, RA3
- .set enc_out_b0, RB0
- .set enc_out_b1, RB1
- .set enc_out_b2, RB2
- .set enc_out_b3, RB3
+ ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+ RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+ ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+ RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+ ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+ RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+ ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+ RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+ ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+ RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+ ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+ RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+ ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+ ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+ RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+ ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0,
+ RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0);
+ ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0,
+ RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0);
+ ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2,
+ RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2);
+ ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4,
+ RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4);
+ ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0,
+ RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0);
+ ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0,
+ RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0);
+ ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3,
+ RB4, RB2, RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3);
+ ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0,
+ RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0);
+ ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4,
+ RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4);
+ ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4,
+ RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4);
+ ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2,
+ RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2);
+ ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3,
+ RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3);
+ ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4,
+ RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4);
+ ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4,
+ RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4);
+ ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0,
+ RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0);
+ ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, RA3, RA4,
+ RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4);
+ ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+ RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+ ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+ RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+ ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+ RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+ ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+ RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+ ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+ RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+ ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+ RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+ ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+ ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+ RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+
+ transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1);
ret;
.size __serpent_enc_blk16,.-__serpent_enc_blk16;
@@ -538,69 +503,81 @@ __serpent_dec_blk16:
* plaintext blocks
*/
- /* record input vector names for __serpent_dec_blk16 */
- .set dec_in_a0, RA0
- .set dec_in_a1, RA1
- .set dec_in_a2, RA2
- .set dec_in_a3, RA3
- .set dec_in_b0, RB0
- .set dec_in_b1, RB1
- .set dec_in_b2, RB2
- .set dec_in_b3, RB3
-
vpcmpeqd RNOT, RNOT, RNOT;
transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
- .set round, 32
- ROUND_FIRST_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
- ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+ ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4,
+ RA3, RA0, RA1, RA4, RA2,
+ RB0, RB1, RB2, RB3, RB4,
+ RB3, RB0, RB1, RB4, RB2);
+ ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3,
+ RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3);
+ ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0,
+ RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0);
+ ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3,
+ RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3);
+ ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3,
+ RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3);
+ ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4,
+ RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4);
+ ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3,
+ RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3);
+ ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1,
+ RB4, RB2, RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1);
+ ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2,
+ RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2);
+ ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0,
+ RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0);
+ ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4,
+ RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4);
+ ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0,
+ RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0);
+ ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0,
+ RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0);
+ ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1,
+ RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1);
+ ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0,
+ RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0);
+ ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3,
+ RB1, RB2, RB3, RB4, RB0, RB1, RB0, RB2, RB4, RB3);
+ ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2,
+ RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2);
+ ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4,
+ RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4);
+ ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1,
+ RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1);
+ ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4);
+ ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4,
+ RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4);
+ ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3,
+ RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3);
+ ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4,
+ RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4);
+ ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0,
+ RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0);
+ ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2,
+ RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2);
+ ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1,
+ RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1);
+ ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3,
+ RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3);
+ ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1,
+ RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1);
+ ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1,
+ RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1);
+ ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0,
+ RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0);
+ ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1,
+ RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1);
+ ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4,
+ RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4);
transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
- /* record output vector names for __serpent_dec_blk16 */
- .set dec_out_a0, RA0
- .set dec_out_a1, RA1
- .set dec_out_a2, RA2
- .set dec_out_a3, RA3
- .set dec_out_b0, RB0
- .set dec_out_b1, RB1
- .set dec_out_b2, RB2
- .set dec_out_b3, RB3
-
ret;
.size __serpent_dec_blk16,.-__serpent_dec_blk16;
@@ -623,15 +600,6 @@ _gcry_serpent_avx2_ctr_enc:
vzeroupper;
- .set RA0, enc_in_a0
- .set RA1, enc_in_a1
- .set RA2, enc_in_a2
- .set RA3, enc_in_a3
- .set RB0, enc_in_b0
- .set RB1, enc_in_b1
- .set RB2, enc_in_b2
- .set RB3, enc_in_b3
-
vbroadcasti128 .Lbswap128_mask RIP, RTMP3;
vpcmpeqd RNOT, RNOT, RNOT;
vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */
@@ -703,32 +671,23 @@ _gcry_serpent_avx2_ctr_enc:
call __serpent_enc_blk16;
- .set RA0, enc_out_a0
- .set RA1, enc_out_a1
- .set RA2, enc_out_a2
- .set RA3, enc_out_a3
- .set RB0, enc_out_b0
- .set RB1, enc_out_b1
- .set RB2, enc_out_b2
- .set RB3, enc_out_b3
-
- vpxor (0 * 32)(%rdx), RA0, RA0;
+ vpxor (0 * 32)(%rdx), RA4, RA4;
vpxor (1 * 32)(%rdx), RA1, RA1;
vpxor (2 * 32)(%rdx), RA2, RA2;
- vpxor (3 * 32)(%rdx), RA3, RA3;
- vpxor (4 * 32)(%rdx), RB0, RB0;
+ vpxor (3 * 32)(%rdx), RA0, RA0;
+ vpxor (4 * 32)(%rdx), RB4, RB4;
vpxor (5 * 32)(%rdx), RB1, RB1;
vpxor (6 * 32)(%rdx), RB2, RB2;
- vpxor (7 * 32)(%rdx), RB3, RB3;
+ vpxor (7 * 32)(%rdx), RB0, RB0;
- vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA4, (0 * 32)(%rsi);
vmovdqu RA1, (1 * 32)(%rsi);
vmovdqu RA2, (2 * 32)(%rsi);
- vmovdqu RA3, (3 * 32)(%rsi);
- vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RA0, (3 * 32)(%rsi);
+ vmovdqu RB4, (4 * 32)(%rsi);
vmovdqu RB1, (5 * 32)(%rsi);
vmovdqu RB2, (6 * 32)(%rsi);
- vmovdqu RB3, (7 * 32)(%rsi);
+ vmovdqu RB0, (7 * 32)(%rsi);
vzeroall;
@@ -748,15 +707,6 @@ _gcry_serpent_avx2_cbc_dec:
vzeroupper;
- .set RA0, dec_in_a0
- .set RA1, dec_in_a1
- .set RA2, dec_in_a2
- .set RA3, dec_in_a3
- .set RB0, dec_in_b0
- .set RB1, dec_in_b1
- .set RB2, dec_in_b2
- .set RB3, dec_in_b3
-
vmovdqu (0 * 32)(%rdx), RA0;
vmovdqu (1 * 32)(%rdx), RA1;
vmovdqu (2 * 32)(%rdx), RA2;
@@ -768,15 +718,6 @@ _gcry_serpent_avx2_cbc_dec:
call __serpent_dec_blk16;
- .set RA0, dec_out_a0
- .set RA1, dec_out_a1
- .set RA2, dec_out_a2
- .set RA3, dec_out_a3
- .set RB0, dec_out_b0
- .set RB1, dec_out_b1
- .set RB2, dec_out_b2
- .set RB3, dec_out_b3
-
vmovdqu (%rcx), RNOTx;
vinserti128 $1, (%rdx), RNOT, RNOT;
vpxor RNOT, RA0, RA0;
@@ -817,15 +758,6 @@ _gcry_serpent_avx2_cfb_dec:
vzeroupper;
- .set RA0, enc_in_a0
- .set RA1, enc_in_a1
- .set RA2, enc_in_a2
- .set RA3, enc_in_a3
- .set RB0, enc_in_b0
- .set RB1, enc_in_b1
- .set RB2, enc_in_b2
- .set RB3, enc_in_b3
-
/* Load input */
vmovdqu (%rcx), RNOTx;
vinserti128 $1, (%rdx), RNOT, RA0;
@@ -843,32 +775,23 @@ _gcry_serpent_avx2_cfb_dec:
call __serpent_enc_blk16;
- .set RA0, enc_out_a0
- .set RA1, enc_out_a1
- .set RA2, enc_out_a2
- .set RA3, enc_out_a3
- .set RB0, enc_out_b0
- .set RB1, enc_out_b1
- .set RB2, enc_out_b2
- .set RB3, enc_out_b3
-
- vpxor (0 * 32)(%rdx), RA0, RA0;
+ vpxor (0 * 32)(%rdx), RA4, RA4;
vpxor (1 * 32)(%rdx), RA1, RA1;
vpxor (2 * 32)(%rdx), RA2, RA2;
- vpxor (3 * 32)(%rdx), RA3, RA3;
- vpxor (4 * 32)(%rdx), RB0, RB0;
+ vpxor (3 * 32)(%rdx), RA0, RA0;
+ vpxor (4 * 32)(%rdx), RB4, RB4;
vpxor (5 * 32)(%rdx), RB1, RB1;
vpxor (6 * 32)(%rdx), RB2, RB2;
- vpxor (7 * 32)(%rdx), RB3, RB3;
+ vpxor (7 * 32)(%rdx), RB0, RB0;
- vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA4, (0 * 32)(%rsi);
vmovdqu RA1, (1 * 32)(%rsi);
vmovdqu RA2, (2 * 32)(%rsi);
- vmovdqu RA3, (3 * 32)(%rsi);
- vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RA0, (3 * 32)(%rsi);
+ vmovdqu RB4, (4 * 32)(%rsi);
vmovdqu RB1, (5 * 32)(%rsi);
vmovdqu RB2, (6 * 32)(%rsi);
- vmovdqu RB3, (7 * 32)(%rsi);
+ vmovdqu RB0, (7 * 32)(%rsi);
vzeroall;
diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S
index a5cf3539..516126b3 100644
--- a/cipher/serpent-sse2-amd64.S
+++ b/cipher/serpent-sse2-amd64.S
@@ -35,42 +35,27 @@
#define CTX %rdi
/* vector registers */
-.set RA0, %xmm0
-.set RA1, %xmm1
-.set RA2, %xmm2
-.set RA3, %xmm3
-.set RA4, %xmm4
-
-.set RB0, %xmm5
-.set RB1, %xmm6
-.set RB2, %xmm7
-.set RB3, %xmm8
-.set RB4, %xmm9
-
-.set RNOT, %xmm10
-.set RTMP0, %xmm11
-.set RTMP1, %xmm12
-.set RTMP2, %xmm13
+#define RA0 %xmm0
+#define RA1 %xmm1
+#define RA2 %xmm2
+#define RA3 %xmm3
+#define RA4 %xmm4
+
+#define RB0 %xmm5
+#define RB1 %xmm6
+#define RB2 %xmm7
+#define RB3 %xmm8
+#define RB4 %xmm9
+
+#define RNOT %xmm10
+#define RTMP0 %xmm11
+#define RTMP1 %xmm12
+#define RTMP2 %xmm13
/**********************************************************************
helper macros
**********************************************************************/
-/* preprocessor macro for renaming vector registers using GAS macros */
-#define sbox_reg_rename(r0, r1, r2, r3, r4, \
- new_r0, new_r1, new_r2, new_r3, new_r4) \
- .set rename_reg0, new_r0; \
- .set rename_reg1, new_r1; \
- .set rename_reg2, new_r2; \
- .set rename_reg3, new_r3; \
- .set rename_reg4, new_r4; \
- \
- .set r0, rename_reg0; \
- .set r1, rename_reg1; \
- .set r2, rename_reg2; \
- .set r3, rename_reg3; \
- .set r4, rename_reg4;
-
/* vector 32-bit rotation to left */
#define vec_rol(reg, nleft, tmp) \
movdqa reg, tmp; \
@@ -147,9 +132,7 @@
pxor r4, r2; pxor RNOT, r4; \
por r1, r4; pxor r3, r1; \
pxor r4, r1; por r0, r3; \
- pxor r3, r1; pxor r3, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r0,r3);
+ pxor r3, r1; pxor r3, r4;
#define SBOX0_INVERSE(r0, r1, r2, r3, r4) \
pxor RNOT, r2; movdqa r1, r4; \
@@ -162,9 +145,7 @@
pxor r1, r2; pxor r0, r3; \
pxor r1, r3; \
pand r3, r2; \
- pxor r2, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r0,r4,r1,r3,r2);
+ pxor r2, r4;
#define SBOX1(r0, r1, r2, r3, r4) \
pxor RNOT, r0; pxor RNOT, r2; \
@@ -176,9 +157,7 @@
pand r4, r2; pxor r1, r0; \
pand r2, r1; \
pxor r0, r1; pand r2, r0; \
- pxor r4, r0; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r2,r0,r3,r1,r4);
+ pxor r4, r0;
#define SBOX1_INVERSE(r0, r1, r2, r3, r4) \
movdqa r1, r4; pxor r3, r1; \
@@ -191,9 +170,7 @@
pxor r1, r4; por r0, r1; \
pxor r0, r1; \
por r4, r1; \
- pxor r1, r3; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r4,r0,r3,r2,r1);
+ pxor r1, r3;
#define SBOX2(r0, r1, r2, r3, r4) \
movdqa r0, r4; pand r2, r0; \
@@ -203,9 +180,7 @@
movdqa r3, r1; por r4, r3; \
pxor r0, r3; pand r1, r0; \
pxor r0, r4; pxor r3, r1; \
- pxor r4, r1; pxor RNOT, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r2,r3,r1,r4,r0);
+ pxor r4, r1; pxor RNOT, r4;
#define SBOX2_INVERSE(r0, r1, r2, r3, r4) \
pxor r3, r2; pxor r0, r3; \
@@ -217,9 +192,7 @@
por r0, r2; pxor RNOT, r3; \
pxor r3, r2; pxor r3, r0; \
pand r1, r0; pxor r4, r3; \
- pxor r0, r3; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r3,r0);
+ pxor r0, r3;
#define SBOX3(r0, r1, r2, r3, r4) \
movdqa r0, r4; por r3, r0; \
@@ -231,9 +204,7 @@
pxor r2, r4; por r0, r1; \
pxor r2, r1; pxor r3, r0; \
movdqa r1, r2; por r3, r1; \
- pxor r0, r1; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r3,r4,r0);
+ pxor r0, r1;
#define SBOX3_INVERSE(r0, r1, r2, r3, r4) \
movdqa r2, r4; pxor r1, r2; \
@@ -245,9 +216,7 @@
pxor r1, r3; pxor r0, r1; \
por r2, r1; pxor r3, r0; \
pxor r4, r1; \
- pxor r1, r0; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r2,r1,r3,r0,r4);
+ pxor r1, r0;
#define SBOX4(r0, r1, r2, r3, r4) \
pxor r3, r1; pxor RNOT, r3; \
@@ -259,9 +228,7 @@
pxor r0, r3; por r1, r4; \
pxor r0, r4; por r3, r0; \
pxor r2, r0; pand r3, r2; \
- pxor RNOT, r0; pxor r2, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r0,r3,r2);
+ pxor RNOT, r0; pxor r2, r4;
#define SBOX4_INVERSE(r0, r1, r2, r3, r4) \
movdqa r2, r4; pand r3, r2; \
@@ -274,9 +241,7 @@
pand r0, r2; pxor r0, r3; \
pxor r4, r2; \
por r3, r2; pxor r0, r3; \
- pxor r1, r2; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r0,r3,r2,r4,r1);
+ pxor r1, r2;
#define SBOX5(r0, r1, r2, r3, r4) \
pxor r1, r0; pxor r3, r1; \
@@ -288,9 +253,7 @@
pxor r2, r4; pxor r0, r2; \
pand r3, r0; pxor RNOT, r2; \
pxor r4, r0; por r3, r4; \
- pxor r4, r2; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r3,r0,r2,r4);
+ pxor r4, r2;
#define SBOX5_INVERSE(r0, r1, r2, r3, r4) \
pxor RNOT, r1; movdqa r3, r4; \
@@ -302,9 +265,7 @@
pxor r3, r1; pxor r2, r4; \
pand r4, r3; pxor r1, r4; \
pxor r4, r3; pxor RNOT, r4; \
- pxor r0, r3; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r3,r2,r0);
+ pxor r0, r3;
#define SBOX6(r0, r1, r2, r3, r4) \
pxor RNOT, r2; movdqa r3, r4; \
@@ -316,9 +277,7 @@
pxor r2, r0; pxor r3, r4; \
pxor r0, r4; pxor RNOT, r3; \
pand r4, r2; \
- pxor r3, r2; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r0,r1,r4,r2,r3);
+ pxor r3, r2;
#define SBOX6_INVERSE(r0, r1, r2, r3, r4) \
pxor r2, r0; movdqa r2, r4; \
@@ -329,9 +288,7 @@
pxor r1, r4; pand r3, r1; \
pxor r0, r1; pxor r3, r0; \
por r2, r0; pxor r1, r3; \
- pxor r0, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r4,r3,r0);
+ pxor r0, r4;
#define SBOX7(r0, r1, r2, r3, r4) \
movdqa r1, r4; por r2, r1; \
@@ -344,9 +301,7 @@
pxor r1, r2; pand r0, r1; \
pxor r4, r1; pxor RNOT, r2; \
por r0, r2; \
- pxor r2, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r4,r3,r1,r0,r2);
+ pxor r2, r4;
#define SBOX7_INVERSE(r0, r1, r2, r3, r4) \
movdqa r2, r4; pxor r0, r2; \
@@ -358,9 +313,7 @@
por r2, r0; pxor r1, r4; \
pxor r3, r0; pxor r4, r3; \
por r0, r4; pxor r2, r3; \
- pxor r2, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r3,r0,r1,r4,r2);
+ pxor r2, r4;
/* Apply SBOX number WHICH to to the block. */
#define SBOX(which, r0, r1, r2, r3, r4) \
@@ -425,49 +378,51 @@
/* Apply a Serpent round to eight parallel blocks. This macro increments
`round'. */
-#define ROUND(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
- SBOX (which, a0, a1, a2, a3, a4); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- SBOX (which, b0, b1, b2, b3, b4); \
- LINEAR_TRANSFORMATION (a0, a1, a2, a3, a4); \
- LINEAR_TRANSFORMATION (b0, b1, b2, b3, b4); \
- .set round, (round + 1);
+#define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ SBOX (which, a0, a1, a2, a3, a4); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
+ SBOX (which, b0, b1, b2, b3, b4); \
+ LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4); \
+ LINEAR_TRANSFORMATION (nb0, nb1, nb2, nb3, nb4);
/* Apply the last Serpent round to eight parallel blocks. This macro increments
`round'. */
-#define ROUND_LAST(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
- SBOX (which, a0, a1, a2, a3, a4); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- SBOX (which, b0, b1, b2, b3, b4); \
- .set round, (round + 1); \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- .set round, (round + 1);
+#define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ SBOX (which, a0, a1, a2, a3, a4); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
+ SBOX (which, b0, b1, b2, b3, b4); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, ((round) + 1)); \
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, ((round) + 1));
/* Apply an inverse Serpent round to eight parallel blocks. This macro
increments `round'. */
-#define ROUND_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+#define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \
+ na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, \
+ nb0, nb1, nb2, nb3, nb4) \
LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4); \
LINEAR_TRANSFORMATION_INVERSE (b0, b1, b2, b3, b4); \
SBOX_INVERSE (which, a0, a1, a2, a3, a4); \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \
SBOX_INVERSE (which, b0, b1, b2, b3, b4); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- .set round, (round - 1);
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);
/* Apply the first inverse Serpent round to eight parallel blocks. This macro
increments `round'. */
-#define ROUND_FIRST_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- .set round, (round - 1); \
+#define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \
+ na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, \
+ nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, ((round) + 1)); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, ((round) + 1)); \
SBOX_INVERSE (which, a0, a1, a2, a3, a4); \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \
SBOX_INVERSE (which, b0, b1, b2, b3, b4); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- .set round, (round - 1);
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);
.text
@@ -479,72 +434,82 @@ __serpent_enc_blk8:
* RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext
* blocks
* output:
- * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
+ * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: eight parallel
* ciphertext blocks
*/
- /* record input vector names for __serpent_enc_blk8 */
- .set enc_in_a0, RA0
- .set enc_in_a1, RA1
- .set enc_in_a2, RA2
- .set enc_in_a3, RA3
- .set enc_in_b0, RB0
- .set enc_in_b1, RB1
- .set enc_in_b2, RB2
- .set enc_in_b3, RB3
-
pcmpeqd RNOT, RNOT;
transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
- .set round, 0
- ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
- ROUND_LAST (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
- transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
- transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
-
- /* record output vector names for __serpent_enc_blk8 */
- .set enc_out_a0, RA0
- .set enc_out_a1, RA1
- .set enc_out_a2, RA2
- .set enc_out_a3, RA3
- .set enc_out_b0, RB0
- .set enc_out_b1, RB1
- .set enc_out_b2, RB2
- .set enc_out_b3, RB3
+ ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+ RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+ ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+ RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+ ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+ RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+ ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+ RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+ ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+ RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+ ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+ RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+ ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+ ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+ RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+ ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0,
+ RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0);
+ ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0,
+ RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0);
+ ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2,
+ RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2);
+ ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4,
+ RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4);
+ ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0,
+ RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0);
+ ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0,
+ RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0);
+ ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3,
+ RB4, RB2, RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3);
+ ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0,
+ RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0);
+ ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4,
+ RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4);
+ ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4,
+ RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4);
+ ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2,
+ RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2);
+ ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3,
+ RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3);
+ ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4,
+ RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4);
+ ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4,
+ RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4);
+ ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0,
+ RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0);
+ ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, RA3, RA4,
+ RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4);
+ ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+ RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+ ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+ RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+ ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+ RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+ ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+ RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+ ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+ RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+ ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+ RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+ ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+ ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+ RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+
+ transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1);
ret;
.size __serpent_enc_blk8,.-__serpent_enc_blk8;
@@ -561,69 +526,81 @@ __serpent_dec_blk8:
* blocks
*/
- /* record input vector names for __serpent_dec_blk8 */
- .set dec_in_a0, RA0
- .set dec_in_a1, RA1
- .set dec_in_a2, RA2
- .set dec_in_a3, RA3
- .set dec_in_b0, RB0
- .set dec_in_b1, RB1
- .set dec_in_b2, RB2
- .set dec_in_b3, RB3
-
pcmpeqd RNOT, RNOT;
transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
- .set round, 32
- ROUND_FIRST_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
- ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+ ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4,
+ RA3, RA0, RA1, RA4, RA2,
+ RB0, RB1, RB2, RB3, RB4,
+ RB3, RB0, RB1, RB4, RB2);
+ ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3,
+ RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3);
+ ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0,
+ RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0);
+ ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3,
+ RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3);
+ ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3,
+ RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3);
+ ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4,
+ RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4);
+ ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3,
+ RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3);
+ ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1,
+ RB4, RB2, RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1);
+ ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2,
+ RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2);
+ ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0,
+ RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0);
+ ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4,
+ RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4);
+ ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0,
+ RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0);
+ ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0,
+ RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0);
+ ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1,
+ RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1);
+ ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0,
+ RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0);
+ ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3,
+ RB1, RB2, RB3, RB4, RB0, RB1, RB0, RB2, RB4, RB3);
+ ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2,
+ RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2);
+ ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4,
+ RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4);
+ ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1,
+ RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1);
+ ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4);
+ ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4,
+ RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4);
+ ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3,
+ RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3);
+ ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4,
+ RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4);
+ ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0,
+ RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0);
+ ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2,
+ RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2);
+ ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1,
+ RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1);
+ ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3,
+ RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3);
+ ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1,
+ RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1);
+ ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1,
+ RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1);
+ ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0,
+ RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0);
+ ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1,
+ RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1);
+ ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4,
+ RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4);
transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
- /* record output vector names for __serpent_dec_blk8 */
- .set dec_out_a0, RA0
- .set dec_out_a1, RA1
- .set dec_out_a2, RA2
- .set dec_out_a3, RA3
- .set dec_out_b0, RB0
- .set dec_out_b1, RB1
- .set dec_out_b2, RB2
- .set dec_out_b3, RB3
-
ret;
.size __serpent_dec_blk8,.-__serpent_dec_blk8;
@@ -638,15 +615,6 @@ _gcry_serpent_sse2_ctr_enc:
* %rcx: iv (big endian, 128bit)
*/
- .set RA0, enc_in_a0
- .set RA1, enc_in_a1
- .set RA2, enc_in_a2
- .set RA3, enc_in_a3
- .set RB0, enc_in_b0
- .set RB1, enc_in_b1
- .set RB2, enc_in_b2
- .set RB3, enc_in_b3
-
/* load IV and byteswap */
movdqu (%rcx), RA0;
movdqa RA0, RTMP0;
@@ -729,42 +697,35 @@ _gcry_serpent_sse2_ctr_enc:
call __serpent_enc_blk8;
- .set RA0, enc_out_a0
- .set RA1, enc_out_a1
- .set RA2, enc_out_a2
- .set RA3, enc_out_a3
- .set RB0, enc_out_b0
- .set RB1, enc_out_b1
- .set RB2, enc_out_b2
- .set RB3, enc_out_b3
-
- pxor_u((0 * 16)(%rdx), RA0, RTMP0);
+ pxor_u((0 * 16)(%rdx), RA4, RTMP0);
pxor_u((1 * 16)(%rdx), RA1, RTMP0);
pxor_u((2 * 16)(%rdx), RA2, RTMP0);
- pxor_u((3 * 16)(%rdx), RA3, RTMP0);
- pxor_u((4 * 16)(%rdx), RB0, RTMP0);
+ pxor_u((3 * 16)(%rdx), RA0, RTMP0);
+ pxor_u((4 * 16)(%rdx), RB4, RTMP0);
pxor_u((5 * 16)(%rdx), RB1, RTMP0);
pxor_u((6 * 16)(%rdx), RB2, RTMP0);
- pxor_u((7 * 16)(%rdx), RB3, RTMP0);
+ pxor_u((7 * 16)(%rdx), RB0, RTMP0);
- movdqu RA0, (0 * 16)(%rsi);
+ movdqu RA4, (0 * 16)(%rsi);
movdqu RA1, (1 * 16)(%rsi);
movdqu RA2, (2 * 16)(%rsi);
- movdqu RA3, (3 * 16)(%rsi);
- movdqu RB0, (4 * 16)(%rsi);
+ movdqu RA0, (3 * 16)(%rsi);
+ movdqu RB4, (4 * 16)(%rsi);
movdqu RB1, (5 * 16)(%rsi);
movdqu RB2, (6 * 16)(%rsi);
- movdqu RB3, (7 * 16)(%rsi);
+ movdqu RB0, (7 * 16)(%rsi);
/* clear the used registers */
pxor RA0, RA0;
pxor RA1, RA1;
pxor RA2, RA2;
pxor RA3, RA3;
+ pxor RA4, RA4;
pxor RB0, RB0;
pxor RB1, RB1;
pxor RB2, RB2;
pxor RB3, RB3;
+ pxor RB4, RB4;
pxor RTMP0, RTMP0;
pxor RTMP1, RTMP1;
pxor RTMP2, RTMP2;
@@ -784,15 +745,6 @@ _gcry_serpent_sse2_cbc_dec:
* %rcx: iv
*/
- .set RA0, dec_in_a0
- .set RA1, dec_in_a1
- .set RA2, dec_in_a2
- .set RA3, dec_in_a3
- .set RB0, dec_in_b0
- .set RB1, dec_in_b1
- .set RB2, dec_in_b2
- .set RB3, dec_in_b3
-
movdqu (0 * 16)(%rdx), RA0;
movdqu (1 * 16)(%rdx), RA1;
movdqu (2 * 16)(%rdx), RA2;
@@ -804,15 +756,6 @@ _gcry_serpent_sse2_cbc_dec:
call __serpent_dec_blk8;
- .set RA0, dec_out_a0
- .set RA1, dec_out_a1
- .set RA2, dec_out_a2
- .set RA3, dec_out_a3
- .set RB0, dec_out_b0
- .set RB1, dec_out_b1
- .set RB2, dec_out_b2
- .set RB3, dec_out_b3
-
movdqu (7 * 16)(%rdx), RNOT;
pxor_u((%rcx), RA0, RTMP0);
pxor_u((0 * 16)(%rdx), RA1, RTMP0);
@@ -838,10 +781,12 @@ _gcry_serpent_sse2_cbc_dec:
pxor RA1, RA1;
pxor RA2, RA2;
pxor RA3, RA3;
+ pxor RA4, RA4;
pxor RB0, RB0;
pxor RB1, RB1;
pxor RB2, RB2;
pxor RB3, RB3;
+ pxor RB4, RB4;
pxor RTMP0, RTMP0;
pxor RTMP1, RTMP1;
pxor RTMP2, RTMP2;
@@ -861,15 +806,6 @@ _gcry_serpent_sse2_cfb_dec:
* %rcx: iv
*/
- .set RA0, enc_in_a0
- .set RA1, enc_in_a1
- .set RA2, enc_in_a2
- .set RA3, enc_in_a3
- .set RB0, enc_in_b0
- .set RB1, enc_in_b1
- .set RB2, enc_in_b2
- .set RB3, enc_in_b3
-
/* Load input */
movdqu (%rcx), RA0;
movdqu 0 * 16(%rdx), RA1;
@@ -886,42 +822,35 @@ _gcry_serpent_sse2_cfb_dec:
call __serpent_enc_blk8;
- .set RA0, enc_out_a0
- .set RA1, enc_out_a1
- .set RA2, enc_out_a2
- .set RA3, enc_out_a3
- .set RB0, enc_out_b0
- .set RB1, enc_out_b1
- .set RB2, enc_out_b2
- .set RB3, enc_out_b3
-
- pxor_u((0 * 16)(%rdx), RA0, RTMP0);
+ pxor_u((0 * 16)(%rdx), RA4, RTMP0);
pxor_u((1 * 16)(%rdx), RA1, RTMP0);
pxor_u((2 * 16)(%rdx), RA2, RTMP0);
- pxor_u((3 * 16)(%rdx), RA3, RTMP0);
- pxor_u((4 * 16)(%rdx), RB0, RTMP0);
+ pxor_u((3 * 16)(%rdx), RA0, RTMP0);
+ pxor_u((4 * 16)(%rdx), RB4, RTMP0);
pxor_u((5 * 16)(%rdx), RB1, RTMP0);
pxor_u((6 * 16)(%rdx), RB2, RTMP0);
- pxor_u((7 * 16)(%rdx), RB3, RTMP0);
+ pxor_u((7 * 16)(%rdx), RB0, RTMP0);
- movdqu RA0, (0 * 16)(%rsi);
+ movdqu RA4, (0 * 16)(%rsi);
movdqu RA1, (1 * 16)(%rsi);
movdqu RA2, (2 * 16)(%rsi);
- movdqu RA3, (3 * 16)(%rsi);
- movdqu RB0, (4 * 16)(%rsi);
+ movdqu RA0, (3 * 16)(%rsi);
+ movdqu RB4, (4 * 16)(%rsi);
movdqu RB1, (5 * 16)(%rsi);
movdqu RB2, (6 * 16)(%rsi);
- movdqu RB3, (7 * 16)(%rsi);
+ movdqu RB0, (7 * 16)(%rsi);
/* clear the used registers */
pxor RA0, RA0;
pxor RA1, RA1;
pxor RA2, RA2;
pxor RA3, RA3;
+ pxor RA4, RA4;
pxor RB0, RB0;
pxor RB1, RB1;
pxor RB2, RB2;
pxor RB3, RB3;
+ pxor RB4, RB4;
pxor RTMP0, RTMP0;
pxor RTMP1, RTMP1;
pxor RTMP2, RTMP2;
diff --git a/configure.ac b/configure.ac
index 1460dfdf..8fb14e27 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1034,17 +1034,12 @@ if test $amd64_as_feature_detection = yes; then
[gcry_cv_gcc_amd64_platform_as_ok=no
AC_COMPILE_IFELSE([AC_LANG_SOURCE(
[[__asm__(
- /* Test if '.set' is supported by underlying assembler. */
- ".set a0, %rax\n\t"
- ".set b0, %rdx\n\t"
- "asmfunc:\n\t"
- "movq a0, b0;\n\t" /* Fails here if .set ignored by as. */
-
/* Test if '.type' and '.size' are supported. */
/* These work only on ELF targets. */
/* TODO: add COFF (mingw64, cygwin64) support to assembly
* implementations. Mingw64/cygwin64 also require additional
* work because they use different calling convention. */
+ "asmfunc:\n\t"
".size asmfunc,.-asmfunc;\n\t"
".type asmfunc,@function;\n\t"
);]])],