author    Jussi Kivilinna <jussi.kivilinna@iki.fi>    2018-01-06 18:53:20 +0200
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>    2018-01-09 18:44:34 +0200
commit    c9e9cb2eb6a1c659d3825ca627228b732f2f2152 (patch)
tree      25b0ed20bfc6106781a3b18bc8b9236218010332 /cipher/rijndael-aesni.c
parent    b3ec0f752c925cde36f560f0f9309ab6450bbfd9 (diff)
AES-NI improvements for AMD64
* cipher/rijndael-aesni.c [__x86_64__] (aesni_prepare_7_15_variable)
(aesni_prepare_7_15, aesni_cleanup_7_15, do_aesni_enc_vec8)
(do_aesni_dec_vec8, do_aesni_ctr_8): New.
(_gcry_aes_aesni_ctr_enc, _gcry_aes_aesni_cfb_dec)
(_gcry_aes_aesni_cbc_dec, aesni_ocb_enc, aesni_ocb_dec)
(_gcry_aes_aesni_ocb_auth) [__x86_64__]: Add 8 parallel blocks processing.
--

Benchmarks on Intel Core i7-4790K, 4.0 GHz (no turbo, no HT):

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        CBC dec |     0.175 ns/B     5448.7 MiB/s     0.700 c/B
        CFB dec |     0.174 ns/B     5466.2 MiB/s     0.698 c/B
        CTR enc |     0.182 ns/B     5226.0 MiB/s     0.730 c/B
        OCB enc |     0.194 ns/B     4913.9 MiB/s     0.776 c/B
        OCB dec |     0.200 ns/B     4769.2 MiB/s     0.800 c/B
       OCB auth |     0.172 ns/B     5545.0 MiB/s     0.688 c/B

After (1.08x to 1.14x faster):
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        CBC dec |     0.157 ns/B     6075.6 MiB/s     0.628 c/B
        CFB dec |     0.158 ns/B     6034.1 MiB/s     0.632 c/B
        CTR enc |     0.159 ns/B     5979.4 MiB/s     0.638 c/B
        OCB enc |     0.175 ns/B     5447.1 MiB/s     0.700 c/B
        OCB dec |     0.183 ns/B     5203.9 MiB/s     0.733 c/B
       OCB auth |     0.156 ns/B     6101.3 MiB/s     0.625 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
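As a quick cross-check of the table: the cycles/byte column is the nanosecs/byte column scaled by the stated 4.0 GHz clock, and the quoted speedup is the ratio of the old to the new cycles/byte figure. A minimal stand-alone C sketch using the CTR enc row (small differences against the table come from the truncated ns/B values):

#include <stdio.h>

/* Cross-check of the CTR enc row: cycles/byte = ns/byte * GHz,
 * speedup = old cycles/byte divided by new cycles/byte. */
int
main (void)
{
  const double ghz = 4.0;                        /* i7-4790K, no turbo */
  const double ctr_old = 0.182, ctr_new = 0.159; /* ns/B, CTR enc row */

  printf ("old: %.3f c/B  new: %.3f c/B  speedup: %.2fx\n",
          ctr_old * ghz, ctr_new * ghz, ctr_old / ctr_new);
  /* Prints roughly: old: 0.728 c/B  new: 0.636 c/B  speedup: 1.14x */
  return 0;
}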
Diffstat (limited to 'cipher/rijndael-aesni.c')
-rw-r--r--   cipher/rijndael-aesni.c | 1248
1 file changed, 1221 insertions, 27 deletions
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 735e5cdd..3d323cf0 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -55,6 +55,7 @@ typedef struct u128_s
#ifdef __WIN64__
/* XMM6-XMM15 are callee-saved registers on WIN64. */
# define aesni_prepare_2_6_variable char win64tmp[16]
+# define aesni_prepare_7_15_variable char win64tmp7_15[16 * 9]
# define aesni_prepare() do { } while (0)
# define aesni_prepare_2_6() \
do { asm volatile ("movdqu %%xmm6, %0\n\t" \
@@ -62,6 +63,20 @@ typedef struct u128_s
: \
: "memory"); \
} while (0)
+# define aesni_prepare_7_15() \
+ do { asm volatile ("movdqu %%xmm7, 0*16(%0)\n\t" \
+ "movdqu %%xmm8, 1*16(%0)\n\t" \
+ "movdqu %%xmm9, 2*16(%0)\n\t" \
+ "movdqu %%xmm10, 3*16(%0)\n\t" \
+ "movdqu %%xmm11, 4*16(%0)\n\t" \
+ "movdqu %%xmm12, 5*16(%0)\n\t" \
+ "movdqu %%xmm13, 6*16(%0)\n\t" \
+ "movdqu %%xmm14, 7*16(%0)\n\t" \
+ "movdqu %%xmm15, 8*16(%0)\n\t" \
+ : \
+ : "r" (win64tmp7_15) \
+ : "memory"); \
+ } while (0)
# define aesni_cleanup() \
do { asm volatile ("pxor %%xmm0, %%xmm0\n\t" \
"pxor %%xmm1, %%xmm1\n" :: ); \
@@ -76,6 +91,20 @@ typedef struct u128_s
: "m" (*win64tmp) \
: "memory"); \
} while (0)
+# define aesni_cleanup_7_15() \
+ do { asm volatile ("movdqu 0*16(%0), %%xmm7\n\t" \
+ "movdqu 1*16(%0), %%xmm8\n\t" \
+ "movdqu 2*16(%0), %%xmm9\n\t" \
+ "movdqu 3*16(%0), %%xmm10\n\t" \
+ "movdqu 4*16(%0), %%xmm11\n\t" \
+ "movdqu 5*16(%0), %%xmm12\n\t" \
+ "movdqu 6*16(%0), %%xmm13\n\t" \
+ "movdqu 7*16(%0), %%xmm14\n\t" \
+ "movdqu 8*16(%0), %%xmm15\n\t" \
+ : \
+ : "r" (win64tmp7_15) \
+ : "memory"); \
+ } while (0)
#else
# define aesni_prepare_2_6_variable
# define aesni_prepare() do { } while (0)
@@ -91,6 +120,21 @@ typedef struct u128_s
"pxor %%xmm5, %%xmm5\n" \
"pxor %%xmm6, %%xmm6\n":: ); \
} while (0)
+# ifdef __x86_64__
+# define aesni_prepare_7_15_variable
+# define aesni_prepare_7_15() do { } while (0)
+# define aesni_cleanup_7_15() \
+ do { asm volatile ("pxor %%xmm7, %%xmm7\n\t" \
+ "pxor %%xmm8, %%xmm8\n" \
+ "pxor %%xmm9, %%xmm9\n" \
+ "pxor %%xmm10, %%xmm10\n" \
+ "pxor %%xmm11, %%xmm11\n" \
+ "pxor %%xmm12, %%xmm12\n" \
+ "pxor %%xmm13, %%xmm13\n" \
+ "pxor %%xmm14, %%xmm14\n" \
+ "pxor %%xmm15, %%xmm15\n":: ); \
+ } while (0)
+# endif
#endif
void
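The three new macros are always used together: aesni_prepare_7_15_variable reserves the WIN64 spill buffer in the enclosing scope, aesni_prepare_7_15() saves xmm7..xmm15 into it (a no-op on other ABIs), and aesni_cleanup_7_15() either restores the registers (WIN64) or clears them so no key material is left behind. A condensed usage sketch, mirroring the CTR code further down in this patch:

#ifdef __x86_64__
  if (nblocks >= 8)
    {
      aesni_prepare_7_15_variable;   /* WIN64: declares the spill buffer */

      aesni_prepare_7_15();          /* WIN64: save xmm7..xmm15 */

      /* ... 8-way processing that clobbers xmm7..xmm15 ... */

      aesni_cleanup_7_15();          /* restore (WIN64) or wipe xmm7..xmm15 */
    }
#endif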
@@ -704,6 +748,314 @@ do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
}
+#ifdef __x86_64__
+
+/* Encrypt eight blocks using the Intel AES-NI instructions. Blocks are input
+ * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11. */
+static inline void
+do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
+{
+ asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */
+ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */
+ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */
+ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */
+ "pxor %%xmm0, %%xmm8\n\t" /* xmm8 ^= key[0] */
+ "pxor %%xmm0, %%xmm9\n\t" /* xmm9 ^= key[0] */
+ "pxor %%xmm0, %%xmm10\n\t" /* xmm10 ^= key[0] */
+ "pxor %%xmm0, %%xmm11\n\t" /* xmm11 ^= key[0] */
+ "movdqa 0x10(%[key]), %%xmm0\n\t"
+ "cmpl $12, %[rounds]\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x20(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x30(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x40(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x50(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x60(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x70(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x80(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x90(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0xa0(%[key]), %%xmm0\n\t"
+ "jb .Ldeclast%=\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0xb0(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0xc0(%[key]), %%xmm0\n\t"
+ "je .Ldeclast%=\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0xd0(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0xe0(%[key]), %%xmm0\n"
+
+ ".Ldeclast%=:\n\t"
+ "aesenclast %%xmm0, %%xmm1\n\t"
+ "aesenclast %%xmm0, %%xmm2\n\t"
+ "aesenclast %%xmm0, %%xmm3\n\t"
+ "aesenclast %%xmm0, %%xmm4\n\t"
+ "aesenclast %%xmm0, %%xmm8\n\t"
+ "aesenclast %%xmm0, %%xmm9\n\t"
+ "aesenclast %%xmm0, %%xmm10\n\t"
+ "aesenclast %%xmm0, %%xmm11\n\t"
+ : /* no output */
+ : [key] "r" (ctx->keyschenc),
+ [rounds] "r" (ctx->rounds)
+ : "cc", "memory");
+}
+
+
+/* Decrypt eight blocks using the Intel AES-NI instructions. Blocks are input
+ * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11. */
+static inline void
+do_aesni_dec_vec8 (const RIJNDAEL_context *ctx)
+{
+ asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */
+ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */
+ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */
+ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */
+ "pxor %%xmm0, %%xmm8\n\t" /* xmm8 ^= key[0] */
+ "pxor %%xmm0, %%xmm9\n\t" /* xmm9 ^= key[0] */
+ "pxor %%xmm0, %%xmm10\n\t" /* xmm10 ^= key[0] */
+ "pxor %%xmm0, %%xmm11\n\t" /* xmm11 ^= key[0] */
+ "movdqa 0x10(%[key]), %%xmm0\n\t"
+ "cmpl $12, %[rounds]\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x20(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x30(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x40(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x50(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x60(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x70(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x80(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x90(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0xa0(%[key]), %%xmm0\n\t"
+ "jb .Ldeclast%=\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0xb0(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0xc0(%[key]), %%xmm0\n\t"
+ "je .Ldeclast%=\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0xd0(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0xe0(%[key]), %%xmm0\n"
+
+ ".Ldeclast%=:\n\t"
+ "aesdeclast %%xmm0, %%xmm1\n\t"
+ "aesdeclast %%xmm0, %%xmm2\n\t"
+ "aesdeclast %%xmm0, %%xmm3\n\t"
+ "aesdeclast %%xmm0, %%xmm4\n\t"
+ "aesdeclast %%xmm0, %%xmm8\n\t"
+ "aesdeclast %%xmm0, %%xmm9\n\t"
+ "aesdeclast %%xmm0, %%xmm10\n\t"
+ "aesdeclast %%xmm0, %%xmm11\n\t"
+ : /* no output */
+ : [key] "r" (ctx->keyschdec),
+ [rounds] "r" (ctx->rounds)
+ : "cc", "memory");
+}
+
+#endif /* __x86_64__ */
+
+
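The eight-block helpers above keep a single round key in xmm0 and apply it to all eight states before loading the next key, and the "cmpl $12" / "jb" / "je" sequence skips the extra rounds that only AES-192 (12 rounds) and AES-256 (14 rounds) need. A rough intrinsics equivalent of that round structure, as a sketch only (not the libgcrypt code; compile with -maes):

#include <wmmintrin.h>          /* AES-NI intrinsics */

/* Sketch: rk[] holds rounds+1 round keys (rounds = 10, 12 or 14).  Keeping
 * eight independent states in flight hides the latency of each aesenc. */
static void
enc8_sketch (__m128i blk[8], const __m128i *rk, int rounds)
{
  int r, i;

  for (i = 0; i < 8; i++)
    blk[i] = _mm_xor_si128 (blk[i], rk[0]);              /* whitening */
  for (r = 1; r < rounds; r++)
    for (i = 0; i < 8; i++)
      blk[i] = _mm_aesenc_si128 (blk[i], rk[r]);         /* middle rounds */
  for (i = 0; i < 8; i++)
    blk[i] = _mm_aesenclast_si128 (blk[i], rk[rounds]);  /* final round */
}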
/* Perform a CTR encryption round using the counter CTR and the input
block A. Write the result to the output block B and update CTR.
CTR needs to be a 16 byte aligned little-endian value. */
@@ -808,7 +1160,7 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
#define aesenclast_xmm1_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe1\n\t"
/* Register usage:
- esi keyschedule
+ [key] keyschedule
xmm0 CTR-0
xmm1 temp / round key
xmm2 CTR-1
@@ -1003,6 +1355,327 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
}
+#ifdef __x86_64__
+
+/* Eight blocks at a time variant of do_aesni_ctr. */
+static void
+do_aesni_ctr_8 (const RIJNDAEL_context *ctx,
+ unsigned char *ctr, unsigned char *b, const unsigned char *a)
+{
+ static const byte bige_addb_const[8][16] __attribute__ ((aligned (16))) =
+ {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 }
+ };
+ const void *bige_addb = bige_addb_const;
+
+ /* Register usage:
+ [key] keyschedule
+ xmm0 CTR-0
+ xmm1 temp / round key
+ xmm2 CTR-1
+ xmm3 CTR-2
+ xmm4 CTR-3
+ xmm5 copy of *ctr
+ xmm6 endian swapping mask
+ xmm8 CTR-4
+ xmm9 CTR-5
+ xmm10 CTR-6
+ xmm11 CTR-7
+ xmm12 temp
+ xmm13 temp
+ xmm14 temp
+ xmm15 temp
+ */
+
+ asm volatile (/* detect if 8-bit carry handling is needed */
+ "cmpb $0xf7, 15(%[ctr])\n\t"
+ "ja .Ladd32bit%=\n\t"
+
+ "movdqa %%xmm5, %%xmm0\n\t" /* xmm0 := CTR (xmm5) */
+ "movdqa 0*16(%[addb]), %%xmm2\n\t" /* xmm2 := be(1) */
+ "movdqa 1*16(%[addb]), %%xmm3\n\t" /* xmm3 := be(2) */
+ "movdqa 2*16(%[addb]), %%xmm4\n\t" /* xmm4 := be(3) */
+ "movdqa 3*16(%[addb]), %%xmm8\n\t" /* xmm8 := be(4) */
+ "movdqa 4*16(%[addb]), %%xmm9\n\t" /* xmm9 := be(5) */
+ "movdqa 5*16(%[addb]), %%xmm10\n\t" /* xmm10 := be(6) */
+ "movdqa 6*16(%[addb]), %%xmm11\n\t" /* xmm11 := be(7) */
+ "movdqa 7*16(%[addb]), %%xmm5\n\t" /* xmm5 := be(8) */
+ "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
+ "paddb %%xmm0, %%xmm2\n\t" /* xmm2 := be(1) + CTR (xmm0) */
+ "paddb %%xmm0, %%xmm3\n\t" /* xmm3 := be(2) + CTR (xmm0) */
+ "paddb %%xmm0, %%xmm4\n\t" /* xmm4 := be(3) + CTR (xmm0) */
+ "paddb %%xmm0, %%xmm8\n\t" /* xmm8 := be(4) + CTR (xmm0) */
+ "paddb %%xmm0, %%xmm9\n\t" /* xmm9 := be(5) + CTR (xmm0) */
+ "paddb %%xmm0, %%xmm10\n\t" /* xmm10 := be(6) + CTR (xmm0) */
+ "paddb %%xmm0, %%xmm11\n\t" /* xmm11 := be(7) + CTR (xmm0) */
+ "paddb %%xmm0, %%xmm5\n\t" /* xmm5 := be(8) + CTR (xmm0) */
+ "jmp .Lstore_ctr%=\n\t"
+
+ ".Ladd32bit%=:\n\t"
+ "movdqa %%xmm5, %%xmm0\n\t" /* xmm0, xmm2 := CTR (xmm5) */
+ "movdqa %%xmm0, %%xmm2\n\t"
+ "pcmpeqd %%xmm1, %%xmm1\n\t"
+ "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */
+
+ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := le(xmm2) */
+ "psubq %%xmm1, %%xmm2\n\t" /* xmm2++ */
+ "movdqa %%xmm2, %%xmm3\n\t" /* xmm3 := xmm2 */
+ "psubq %%xmm1, %%xmm3\n\t" /* xmm3++ */
+ "movdqa %%xmm3, %%xmm4\n\t" /* xmm4 := xmm3 */
+ "psubq %%xmm1, %%xmm4\n\t" /* xmm4++ */
+ "movdqa %%xmm4, %%xmm8\n\t" /* xmm8 := xmm4 */
+ "psubq %%xmm1, %%xmm8\n\t" /* xmm8++ */
+ "movdqa %%xmm8, %%xmm9\n\t" /* xmm9 := xmm8 */
+ "psubq %%xmm1, %%xmm9\n\t" /* xmm9++ */
+ "movdqa %%xmm9, %%xmm10\n\t" /* xmm10 := xmm9 */
+ "psubq %%xmm1, %%xmm10\n\t" /* xmm10++ */
+ "movdqa %%xmm10, %%xmm11\n\t" /* xmm11 := xmm10 */
+ "psubq %%xmm1, %%xmm11\n\t" /* xmm11++ */
+ "movdqa %%xmm11, %%xmm5\n\t" /* xmm5 := xmm11 */
+ "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ */
+
+ /* detect if 64-bit carry handling is needed */
+ "cmpl $0xffffffff, 8(%[ctr])\n\t"
+ "jne .Lno_carry%=\n\t"
+ "movl 12(%[ctr]), %%esi\n\t"
+ "bswapl %%esi\n\t"
+ "cmpl $0xfffffff8, %%esi\n\t"
+ "jb .Lno_carry%=\n\t" /* no carry */
+
+ "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */
+ "je .Lcarry_xmm5%=\n\t" /* esi == 0xfffffff8 */
+ "cmpl $0xfffffffa, %%esi\n\t"
+ "jb .Lcarry_xmm11%=\n\t" /* esi == 0xfffffff9 */
+ "je .Lcarry_xmm10%=\n\t" /* esi == 0xfffffffa */
+ "cmpl $0xfffffffc, %%esi\n\t"
+ "jb .Lcarry_xmm9%=\n\t" /* esi == 0xfffffffb */
+ "je .Lcarry_xmm8%=\n\t" /* esi == 0xfffffffc */
+ "cmpl $0xfffffffe, %%esi\n\t"
+ "jb .Lcarry_xmm4%=\n\t" /* esi == 0xfffffffd */
+ "je .Lcarry_xmm3%=\n\t" /* esi == 0xfffffffe */
+ /* esi == 0xffffffff */
+
+ "psubq %%xmm1, %%xmm2\n\t"
+ ".Lcarry_xmm3%=:\n\t"
+ "psubq %%xmm1, %%xmm3\n\t"
+ ".Lcarry_xmm4%=:\n\t"
+ "psubq %%xmm1, %%xmm4\n\t"
+ ".Lcarry_xmm8%=:\n\t"
+ "psubq %%xmm1, %%xmm8\n\t"
+ ".Lcarry_xmm9%=:\n\t"
+ "psubq %%xmm1, %%xmm9\n\t"
+ ".Lcarry_xmm10%=:\n\t"
+ "psubq %%xmm1, %%xmm10\n\t"
+ ".Lcarry_xmm11%=:\n\t"
+ "psubq %%xmm1, %%xmm11\n\t"
+ ".Lcarry_xmm5%=:\n\t"
+ "psubq %%xmm1, %%xmm5\n\t"
+
+ ".Lno_carry%=:\n\t"
+ "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
+
+ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := be(xmm2) */
+ "pshufb %%xmm6, %%xmm3\n\t" /* xmm3 := be(xmm3) */
+ "pshufb %%xmm6, %%xmm4\n\t" /* xmm4 := be(xmm4) */
+ "pshufb %%xmm6, %%xmm5\n\t" /* xmm5 := be(xmm5) */
+ "pshufb %%xmm6, %%xmm8\n\t" /* xmm8 := be(xmm8) */
+ "pshufb %%xmm6, %%xmm9\n\t" /* xmm9 := be(xmm9) */
+ "pshufb %%xmm6, %%xmm10\n\t" /* xmm10 := be(xmm10) */
+ "pshufb %%xmm6, %%xmm11\n\t" /* xmm11 := be(xmm11) */
+
+ ".Lstore_ctr%=:\n\t"
+ "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). */
+ :
+ : [ctr] "r" (ctr),
+ [key] "r" (ctx->keyschenc),
+ [addb] "r" (bige_addb)
+ : "%esi", "cc", "memory");
+
+ asm volatile ("pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
+ "pxor %%xmm1, %%xmm2\n\t" /* xmm2 ^= key[0] */
+ "pxor %%xmm1, %%xmm3\n\t" /* xmm3 ^= key[0] */
+ "pxor %%xmm1, %%xmm4\n\t" /* xmm4 ^= key[0] */
+ "pxor %%xmm1, %%xmm8\n\t" /* xmm8 ^= key[0] */
+ "pxor %%xmm1, %%xmm9\n\t" /* xmm9 ^= key[0] */
+ "pxor %%xmm1, %%xmm10\n\t" /* xmm10 ^= key[0] */
+ "pxor %%xmm1, %%xmm11\n\t" /* xmm11 ^= key[0] */
+ "movdqa 0x10(%[key]), %%xmm1\n\t"
+ "cmpl $12, %[rounds]\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0x20(%[key]), %%xmm1\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0x30(%[key]), %%xmm1\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0x40(%[key]), %%xmm1\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0x50(%[key]), %%xmm1\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0x60(%[key]), %%xmm1\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0x70(%[key]), %%xmm1\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0x80(%[key]), %%xmm1\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0x90(%[key]), %%xmm1\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0xa0(%[key]), %%xmm1\n\t"
+ "jb .Lenclast%=\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0xb0(%[key]), %%xmm1\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0xc0(%[key]), %%xmm1\n\t"
+ "je .Lenclast%=\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0xd0(%[key]), %%xmm1\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0xe0(%[key]), %%xmm1\n"
+
+ ".Lenclast%=:\n\t"
+ "aesenclast %%xmm1, %%xmm0\n\t"
+ "aesenclast %%xmm1, %%xmm2\n\t"
+ "aesenclast %%xmm1, %%xmm3\n\t"
+ "aesenclast %%xmm1, %%xmm4\n\t"
+ "aesenclast %%xmm1, %%xmm8\n\t"
+ "aesenclast %%xmm1, %%xmm9\n\t"
+ "aesenclast %%xmm1, %%xmm10\n\t"
+ "aesenclast %%xmm1, %%xmm11\n\t"
+ :
+ : [key] "r" (ctx->keyschenc),
+ [rounds] "r" (ctx->rounds)
+ : "cc", "memory");
+
+ asm volatile ("movdqu 0*16(%[src]), %%xmm12\n\t" /* Get block 1. */
+ "movdqu 1*16(%[src]), %%xmm13\n\t" /* Get block 2. */
+ "movdqu 2*16(%[src]), %%xmm14\n\t" /* Get block 3. */
+ "movdqu 3*16(%[src]), %%xmm15\n\t" /* Get block 4. */
+ "movdqu 4*16(%[src]), %%xmm1\n\t" /* Get block 5. */
+ "pxor %%xmm12, %%xmm0\n\t" /* EncCTR-1 ^= input */
+ "movdqu 5*16(%[src]), %%xmm12\n\t" /* Get block 6. */
+ "pxor %%xmm13, %%xmm2\n\t" /* EncCTR-2 ^= input */
+ "movdqu 6*16(%[src]), %%xmm13\n\t" /* Get block 7. */
+ "pxor %%xmm14, %%xmm3\n\t" /* EncCTR-3 ^= input */
+ "movdqu 7*16(%[src]), %%xmm14\n\t" /* Get block 8. */
+ "pxor %%xmm15, %%xmm4\n\t" /* EncCTR-4 ^= input */
+ "movdqu %%xmm0, 0*16(%[dst])\n\t" /* Store block 1 */
+ "pxor %%xmm1, %%xmm8\n\t" /* EncCTR-5 ^= input */
+ "movdqu %%xmm0, 0*16(%[dst])\n\t" /* Store block 1 */
+ "pxor %%xmm12, %%xmm9\n\t" /* EncCTR-6 ^= input */
+ "movdqu %%xmm2, 1*16(%[dst])\n\t" /* Store block 2. */
+ "pxor %%xmm13, %%xmm10\n\t" /* EncCTR-7 ^= input */
+ "movdqu %%xmm3, 2*16(%[dst])\n\t" /* Store block 3. */
+ "pxor %%xmm14, %%xmm11\n\t" /* EncCTR-8 ^= input */
+ "movdqu %%xmm4, 3*16(%[dst])\n\t" /* Store block 4. */
+ "movdqu %%xmm8, 4*16(%[dst])\n\t" /* Store block 8. */
+ "movdqu %%xmm9, 5*16(%[dst])\n\t" /* Store block 9. */
+ "movdqu %%xmm10, 6*16(%[dst])\n\t" /* Store block 10. */
+ "movdqu %%xmm11, 7*16(%[dst])\n\t" /* Store block 11. */
+ :
+ : [src] "r" (a),
+ [dst] "r" (b)
+ : "memory");
+}
+
+#endif /* __x86_64__ */
+
+
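The counter setup in do_aesni_ctr_8 simply computes CTR+1 .. CTR+8 as 16-byte big-endian values: the paddb fast path applies when the low byte cannot overflow (<= 0xf7), otherwise the code falls back to 64-bit adds plus the explicit carry checks. A portable C sketch of the same arithmetic (illustration only, not code used by the library):

#include <stdio.h>
#include <string.h>

/* Add a small value to a 16-byte big-endian counter, rippling the carry. */
static void
ctr_add_be128 (unsigned char ctr[16], unsigned int add)
{
  unsigned int carry = add;
  int i;

  for (i = 15; i >= 0 && carry; i--)
    {
      carry += ctr[i];
      ctr[i] = carry & 0xff;
      carry >>= 8;
    }
}

int
main (void)
{
  unsigned char ctr[16];

  memset (ctr, 0xff, 16);
  ctr[15] = 0xf8;              /* low byte above 0xf7: the asm takes the slow path */
  ctr_add_be128 (ctr, 8);      /* same step as one do_aesni_ctr_8 call */
  printf ("%02x %02x\n", ctr[0], ctr[15]);   /* prints "00 00": full wrap-around */
  return 0;
}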
unsigned int
_gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
const unsigned char *src)
@@ -1123,7 +1796,25 @@ _gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
[ctr] "m" (*ctr)
: "memory");
- for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+ if (nblocks >= 8)
+ {
+ aesni_prepare_7_15_variable;
+
+ aesni_prepare_7_15();
+
+ for ( ;nblocks >= 8 ; nblocks -= 8 )
+ {
+ do_aesni_ctr_8 (ctx, ctr, outbuf, inbuf);
+ outbuf += 8*BLOCKSIZE;
+ inbuf += 8*BLOCKSIZE;
+ }
+
+ aesni_cleanup_7_15();
+ }
+#endif
+
+ for ( ;nblocks >= 4 ; nblocks -= 4 )
{
do_aesni_ctr_4 (ctx, ctr, outbuf, inbuf);
outbuf += 4*BLOCKSIZE;
@@ -1175,6 +1866,76 @@ _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
: "memory" );
/* CFB decryption can be parallelized */
+
+#ifdef __x86_64__
+ if (nblocks >= 8)
+ {
+ aesni_prepare_7_15_variable;
+
+ aesni_prepare_7_15();
+
+ for ( ;nblocks >= 8; nblocks -= 8)
+ {
+ asm volatile
+ ("movdqu %%xmm6, %%xmm1\n\t" /* load input blocks */
+ "movdqu 0*16(%[inbuf]), %%xmm2\n\t"
+ "movdqu 1*16(%[inbuf]), %%xmm3\n\t"
+ "movdqu 2*16(%[inbuf]), %%xmm4\n\t"
+ "movdqu 3*16(%[inbuf]), %%xmm8\n\t"
+ "movdqu 4*16(%[inbuf]), %%xmm9\n\t"
+ "movdqu 5*16(%[inbuf]), %%xmm10\n\t"
+ "movdqu 6*16(%[inbuf]), %%xmm11\n\t"
+
+ "movdqu 7*16(%[inbuf]), %%xmm6\n\t" /* update IV */
+
+ "movdqa %%xmm2, %%xmm12\n\t"
+ "movdqa %%xmm3, %%xmm13\n\t"
+ "movdqa %%xmm4, %%xmm14\n\t"
+ "movdqa %%xmm8, %%xmm15\n\t"
+ : /* No output */
+ : [inbuf] "r" (inbuf)
+ : "memory");
+
+ do_aesni_enc_vec8 (ctx);
+
+ asm volatile
+ (
+ "pxor %%xmm12, %%xmm1\n\t"
+ "movdqu 4*16(%[inbuf]), %%xmm12\n\t"
+ "pxor %%xmm13, %%xmm2\n\t"
+ "movdqu 5*16(%[inbuf]), %%xmm13\n\t"
+ "pxor %%xmm14, %%xmm3\n\t"
+ "movdqu 6*16(%[inbuf]), %%xmm14\n\t"
+ "pxor %%xmm15, %%xmm4\n\t"
+ "movdqu 7*16(%[inbuf]), %%xmm15\n\t"
+
+ "pxor %%xmm12, %%xmm8\n\t"
+ "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+ "pxor %%xmm13, %%xmm9\n\t"
+ "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+ "pxor %%xmm14, %%xmm10\n\t"
+ "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+ "pxor %%xmm15, %%xmm11\n\t"
+ "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+
+ "movdqu %%xmm8, 4*16(%[outbuf])\n\t"
+ "movdqu %%xmm9, 5*16(%[outbuf])\n\t"
+ "movdqu %%xmm10, 6*16(%[outbuf])\n\t"
+ "movdqu %%xmm11, 7*16(%[outbuf])\n\t"
+
+ : /* No output */
+ : [inbuf] "r" (inbuf),
+ [outbuf] "r" (outbuf)
+ : "memory");
+
+ outbuf += 8*BLOCKSIZE;
+ inbuf += 8*BLOCKSIZE;
+ }
+
+ aesni_cleanup_7_15();
+ }
+#endif
+
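This block can process eight ciphertext blocks at once because of the CFB decryption equation itself, P_i = C_i xor E_K(C_{i-1}) with C_0 = IV: every cipher input is a ciphertext block (or the IV) that is already in memory, so the eight E_K() computations are independent. A plain C sketch of that equation, using a hypothetical single-block encrypt callback rather than any libgcrypt API:

#include <stddef.h>
#include <string.h>

/* Hypothetical single-block AES primitive standing in for the AES-NI code. */
typedef void (*block_enc_fn) (const void *key, unsigned char out[16],
                              const unsigned char in[16]);

/* CFB decryption: P_i = C_i ^ E_K(C_{i-1}), C_0 = IV.  All E_K() inputs are
 * known up front, which is what the 8-way variant above exploits. */
static void
cfb_dec_sketch (block_enc_fn enc, const void *key, unsigned char *out,
                const unsigned char *in, unsigned char iv[16], size_t nblocks)
{
  unsigned char keystream[16];
  size_t i, j;

  for (i = 0; i < nblocks; i++)
    {
      enc (key, keystream, iv);          /* E_K(C_{i-1}) */
      for (j = 0; j < 16; j++)
        out[j] = in[j] ^ keystream[j];   /* P_i = C_i ^ keystream */
      memcpy (iv, in, 16);               /* C_i becomes the next cipher input */
      out += 16;
      in += 16;
    }
}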
for ( ;nblocks >= 4; nblocks -= 4)
{
asm volatile
@@ -1260,7 +2021,76 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
: [iv] "m" (*iv)
: "memory");
- for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+ if (nblocks >= 8)
+ {
+ aesni_prepare_7_15_variable;
+
+ aesni_prepare_7_15();
+
+ for ( ;nblocks >= 8 ; nblocks -= 8 )
+ {
+ asm volatile
+ ("movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */
+ "movdqu 1*16(%[inbuf]), %%xmm2\n\t"
+ "movdqu 2*16(%[inbuf]), %%xmm3\n\t"
+ "movdqu 3*16(%[inbuf]), %%xmm4\n\t"
+ "movdqu 4*16(%[inbuf]), %%xmm8\n\t"
+ "movdqu 5*16(%[inbuf]), %%xmm9\n\t"
+ "movdqu 6*16(%[inbuf]), %%xmm10\n\t"
+ "movdqu 7*16(%[inbuf]), %%xmm11\n\t"
+
+ "movdqa %%xmm1, %%xmm12\n\t"
+ "movdqa %%xmm2, %%xmm13\n\t"
+ "movdqa %%xmm3, %%xmm14\n\t"
+ "movdqa %%xmm4, %%xmm15\n\t"
+
+ : /* No output */
+ : [inbuf] "r" (inbuf)
+ : "memory");
+
+ do_aesni_dec_vec8 (ctx);
+
+ asm volatile
+ ("pxor %%xmm5, %%xmm1\n\t" /* xor IV with output */
+
+ "pxor %%xmm12, %%xmm2\n\t" /* xor IV with output */
+ "movdqu 4*16(%[inbuf]), %%xmm12\n\t"
+
+ "pxor %%xmm13, %%xmm3\n\t" /* xor IV with output */
+ "movdqu 5*16(%[inbuf]), %%xmm13\n\t"
+
+ "pxor %%xmm14, %%xmm4\n\t" /* xor IV with output */
+ "movdqu 6*16(%[inbuf]), %%xmm14\n\t"
+
+ "pxor %%xmm15, %%xmm8\n\t" /* xor IV with output */
+ "movdqu 7*16(%[inbuf]), %%xmm5\n\t"
+ "pxor %%xmm12, %%xmm9\n\t" /* xor IV with output */
+ "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+ "pxor %%xmm13, %%xmm10\n\t" /* xor IV with output */
+ "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+ "pxor %%xmm14, %%xmm11\n\t" /* xor IV with output */
+ "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+ "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+ "movdqu %%xmm8, 4*16(%[outbuf])\n\t"
+ "movdqu %%xmm9, 5*16(%[outbuf])\n\t"
+ "movdqu %%xmm10, 6*16(%[outbuf])\n\t"
+ "movdqu %%xmm11, 7*16(%[outbuf])\n\t"
+
+ : /* No output */
+ : [inbuf] "r" (inbuf),
+ [outbuf] "r" (outbuf)
+ : "memory");
+
+ outbuf += 8*BLOCKSIZE;
+ inbuf += 8*BLOCKSIZE;
+ }
+
+ aesni_cleanup_7_15();
+ }
+#endif
+
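As with CFB, CBC decryption parallelizes because the block-cipher calls are independent: P_i = D_K(C_i) xor C_{i-1} with C_0 = IV, and the previous ciphertext block is only needed for the final XOR, which is why the code above keeps the IV in xmm5 and copies of the first four ciphertext blocks in xmm12..xmm15. A reference sketch with a hypothetical single-block decrypt callback (not a libgcrypt API):

#include <stddef.h>
#include <string.h>

/* Hypothetical single-block AES decrypt primitive. */
typedef void (*block_dec_fn) (const void *key, unsigned char out[16],
                              const unsigned char in[16]);

/* CBC decryption: P_i = D_K(C_i) ^ C_{i-1}, C_0 = IV. */
static void
cbc_dec_sketch (block_dec_fn dec, const void *key, unsigned char *out,
                const unsigned char *in, unsigned char iv[16], size_t nblocks)
{
  unsigned char next_iv[16];
  size_t i, j;

  for (i = 0; i < nblocks; i++)
    {
      memcpy (next_iv, in, 16);     /* keep C_i: it is the next block's IV */
      dec (key, out, in);           /* D_K(C_i), independent per block */
      for (j = 0; j < 16; j++)
        out[j] ^= iv[j];            /* ^= C_{i-1} (IV for the first block) */
      memcpy (iv, next_iv, 16);
      out += 16;
      in += 16;
    }
}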
+ for ( ;nblocks >= 4 ; nblocks -= 4 )
{
asm volatile
("movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */
@@ -1386,7 +2216,142 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
outbuf += BLOCKSIZE;
}
- for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+ if (nblocks >= 8)
+ {
+ aesni_prepare_7_15_variable;
+
+ aesni_prepare_7_15();
+
+ asm volatile ("movdqu %[l0], %%xmm7\n\t"
+ :
+ : [l0] "m" (*c->u_mode.ocb.L[0])
+ : "memory" );
+
+ for ( ;nblocks >= 8 ; nblocks -= 8 )
+ {
+ n += 4;
+ l = ocb_get_l(c, n);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
+ asm volatile ("movdqu %[l1], %%xmm10\n\t"
+ "movdqu %[inbuf0], %%xmm1\n\t"
+ "pxor %%xmm7, %%xmm5\n\t"
+ "pxor %%xmm1, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqa %%xmm5, %%xmm12\n\t"
+ :
+ : [l1] "m" (*c->u_mode.ocb.L[1]),
+ [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+ "pxor %%xmm10, %%xmm5\n\t"
+ "pxor %%xmm2, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "movdqa %%xmm5, %%xmm13\n\t"
+ :
+ : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+ "pxor %%xmm7, %%xmm5\n\t"
+ "pxor %%xmm3, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqa %%xmm5, %%xmm14\n\t"
+ :
+ : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l3], %%xmm15\n\t"
+ "movdqu %[inbuf3], %%xmm4\n\t"
+ "pxor %%xmm15, %%xmm5\n\t"
+ "pxor %%xmm4, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqa %%xmm5, %%xmm15\n\t"
+ :
+ : [l3] "m" (*l),
+ [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+ : "memory" );
+
+ n += 4;
+ l = ocb_get_l(c, n);
+
+ asm volatile ("movdqu %[inbuf4], %%xmm8\n\t"
+ "pxor %%xmm7, %%xmm5\n\t"
+ "pxor %%xmm8, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm8\n\t"
+ "movdqu %%xmm5, %[outbuf4]\n\t"
+ : [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE))
+ : [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[inbuf5], %%xmm9\n\t"
+ "pxor %%xmm10, %%xmm5\n\t"
+ "pxor %%xmm9, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm9\n\t"
+ "movdqu %%xmm5, %[outbuf5]\n\t"
+ : [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE))
+ : [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
+ "pxor %%xmm7, %%xmm5\n\t"
+ "pxor %%xmm10, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm10\n\t"
+ "movdqu %%xmm5, %[outbuf6]\n\t"
+ : [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE))
+ : [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l7], %%xmm11\n\t"
+ "pxor %%xmm11, %%xmm5\n\t"
+ "movdqu %[inbuf7], %%xmm11\n\t"
+ "pxor %%xmm11, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm11\n\t"
+ :
+ : [l7] "m" (*l),
+ [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE))
+ : "memory" );
+
+ do_aesni_enc_vec8 (ctx);
+
+ asm volatile ("pxor %%xmm12, %%xmm1\n\t"
+ "pxor %%xmm13, %%xmm2\n\t"
+ "movdqu %[outbuf4],%%xmm0\n\t"
+ "movdqu %[outbuf5],%%xmm12\n\t"
+ "movdqu %[outbuf6],%%xmm13\n\t"
+ "pxor %%xmm14, %%xmm3\n\t"
+ "pxor %%xmm15, %%xmm4\n\t"
+ "pxor %%xmm0, %%xmm8\n\t"
+ "pxor %%xmm12, %%xmm9\n\t"
+ "pxor %%xmm13, %%xmm10\n\t"
+ "pxor %%xmm5, %%xmm11\n\t"
+ "movdqu %%xmm1, %[outbuf0]\n\t"
+ "movdqu %%xmm2, %[outbuf1]\n\t"
+ "movdqu %%xmm3, %[outbuf2]\n\t"
+ "movdqu %%xmm4, %[outbuf3]\n\t"
+ "movdqu %%xmm8, %[outbuf4]\n\t"
+ "movdqu %%xmm9, %[outbuf5]\n\t"
+ "movdqu %%xmm10, %[outbuf6]\n\t"
+ "movdqu %%xmm11, %[outbuf7]\n\t"
+ : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
+ [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)),
+ [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
+ [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)),
+ [outbuf4] "+m" (*(outbuf + 4 * BLOCKSIZE)),
+ [outbuf5] "+m" (*(outbuf + 5 * BLOCKSIZE)),
+ [outbuf6] "+m" (*(outbuf + 6 * BLOCKSIZE)),
+ [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE))
+ :
+ : "memory" );
+
+ outbuf += 8*BLOCKSIZE;
+ inbuf += 8*BLOCKSIZE;
+ }
+
+ aesni_cleanup_7_15();
+ }
+#endif
+
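The OCB path follows the per-block recurrences quoted in the comments: Offset_i = Offset_{i-1} xor L_{ntz(i)}, Checksum_i = Checksum_{i-1} xor P_i, and C_i = Offset_i xor E_K(P_i xor Offset_i). Because block indices n+1..n+4 (with n a multiple of 4) have ntz values 0, 1, 0, ntz(n+4), the code can reuse L[0] and L[1] from registers and call ocb_get_l() only twice per eight blocks. A scalar sketch of one step, with hypothetical helpers (xor_block(), enc()) rather than libgcrypt internals:

/* XOR a 16-byte block into dst. */
static void
xor_block (unsigned char dst[16], const unsigned char src[16])
{
  int j;

  for (j = 0; j < 16; j++)
    dst[j] ^= src[j];
}

/* One OCB encryption step; enc() is a hypothetical in-place single-block AES
 * call and l_ntz_i is the precomputed L_{ntz(i)} value for this block index. */
static void
ocb_enc_step (void (*enc) (const void *key, unsigned char blk[16]),
              const void *key, unsigned char offset[16],
              unsigned char checksum[16], const unsigned char l_ntz_i[16],
              unsigned char c[16], const unsigned char p[16])
{
  unsigned char blk[16];
  int j;

  xor_block (offset, l_ntz_i);    /* Offset_i = Offset_{i-1} ^ L_{ntz(i)} */
  xor_block (checksum, p);        /* Checksum_i = Checksum_{i-1} ^ P_i */
  for (j = 0; j < 16; j++)
    blk[j] = p[j] ^ offset[j];    /* P_i ^ Offset_i */
  enc (key, blk);                 /* E_K(P_i ^ Offset_i) */
  for (j = 0; j < 16; j++)
    c[j] = blk[j] ^ offset[j];    /* C_i */
}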
+ for ( ;nblocks >= 4 ; nblocks -= 4 )
{
n += 4;
l = ocb_get_l(c, n);
@@ -1394,9 +2359,9 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* Checksum_i = Checksum_{i-1} xor P_i */
/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
- asm volatile ("movdqu %[l0], %%xmm0\n\t"
+ asm volatile ("movdqu %[l0], %%xmm4\n\t"
"movdqu %[inbuf0], %%xmm1\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm4, %%xmm5\n\t"
"pxor %%xmm1, %%xmm6\n\t"
"pxor %%xmm5, %%xmm1\n\t"
"movdqu %%xmm5, %[outbuf0]\n\t"
@@ -1414,19 +2379,17 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
: [l1] "m" (*c->u_mode.ocb.L[1]),
[inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
: "memory" );
- asm volatile ("movdqu %[l2], %%xmm0\n\t"
- "movdqu %[inbuf2], %%xmm3\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
+ asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm5\n\t"
"pxor %%xmm3, %%xmm6\n\t"
"pxor %%xmm5, %%xmm3\n\t"
"movdqu %%xmm5, %[outbuf2]\n\t"
: [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
- : [l2] "m" (*c->u_mode.ocb.L[0]),
- [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+ : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
: "memory" );
- asm volatile ("movdqu %[l3], %%xmm0\n\t"
+ asm volatile ("movdqu %[l3], %%xmm4\n\t"
+ "pxor %%xmm4, %%xmm5\n\t"
"movdqu %[inbuf3], %%xmm4\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
"pxor %%xmm4, %%xmm6\n\t"
"pxor %%xmm5, %%xmm4\n\t"
:
@@ -1551,7 +2514,142 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
outbuf += BLOCKSIZE;
}
- for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+ if (nblocks >= 8)
+ {
+ aesni_prepare_7_15_variable;
+
+ aesni_prepare_7_15();
+
+ asm volatile ("movdqu %[l0], %%xmm7\n\t"
+ :
+ : [l0] "m" (*c->u_mode.ocb.L[0])
+ : "memory" );
+
+ for ( ;nblocks >= 8 ; nblocks -= 8 )
+ {
+ n += 4;
+ l = ocb_get_l(c, n);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+
+ asm volatile ("movdqu %[l1], %%xmm10\n\t"
+ "movdqu %[inbuf0], %%xmm1\n\t"
+ "pxor %%xmm7, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqa %%xmm5, %%xmm12\n\t"
+ :
+ : [l1] "m" (*c->u_mode.ocb.L[1]),
+ [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+ "pxor %%xmm10, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "movdqa %%xmm5, %%xmm13\n\t"
+ :
+ : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+ "pxor %%xmm7, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqa %%xmm5, %%xmm14\n\t"
+ :
+ : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l3], %%xmm0\n\t"
+ "movdqu %[inbuf3], %%xmm4\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqa %%xmm5, %%xmm15\n\t"
+ :
+ : [l3] "m" (*l),
+ [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+ : "memory" );
+
+ n += 4;
+ l = ocb_get_l(c, n);
+
+ asm volatile ("movdqu %[inbuf4], %%xmm8\n\t"
+ "pxor %%xmm7, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm8\n\t"
+ "movdqu %%xmm5, %[outbuf4]\n\t"
+ : [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE))
+ : [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[inbuf5], %%xmm9\n\t"
+ "pxor %%xmm10, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm9\n\t"
+ "movdqu %%xmm5, %[outbuf5]\n\t"
+ : [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE))
+ : [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
+ "pxor %%xmm7, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm10\n\t"
+ "movdqu %%xmm5, %[outbuf6]\n\t"
+ : [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE))
+ : [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l7], %%xmm0\n\t"
+ "movdqu %[inbuf7], %%xmm11\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm11\n\t"
+ :
+ : [l7] "m" (*l),
+ [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE))
+ : "memory" );
+
+ do_aesni_dec_vec8 (ctx);
+
+ asm volatile ("pxor %%xmm12, %%xmm1\n\t"
+ "pxor %%xmm13, %%xmm2\n\t"
+ "movdqu %[outbuf4],%%xmm0\n\t"
+ "movdqu %[outbuf5],%%xmm12\n\t"
+ "movdqu %[outbuf6],%%xmm13\n\t"
+ "pxor %%xmm14, %%xmm3\n\t"
+ "pxor %%xmm15, %%xmm4\n\t"
+ "pxor %%xmm0, %%xmm8\n\t"
+ "pxor %%xmm12, %%xmm9\n\t"
+ "pxor %%xmm13, %%xmm10\n\t"
+ "pxor %%xmm5, %%xmm11\n\t"
+ "movdqu %%xmm1, %[outbuf0]\n\t"
+ "movdqu %%xmm2, %[outbuf1]\n\t"
+ "movdqu %%xmm3, %[outbuf2]\n\t"
+ "movdqu %%xmm4, %[outbuf3]\n\t"
+ "movdqu %%xmm8, %[outbuf4]\n\t"
+ "movdqu %%xmm9, %[outbuf5]\n\t"
+ "movdqu %%xmm10, %[outbuf6]\n\t"
+ "movdqu %%xmm11, %[outbuf7]\n\t"
+ "pxor %%xmm2, %%xmm1\n\t"
+ "pxor %%xmm4, %%xmm1\n\t"
+ "pxor %%xmm9, %%xmm1\n\t"
+ "pxor %%xmm11, %%xmm1\n\t"
+ "pxor %%xmm3, %%xmm6\n\t"
+ "pxor %%xmm8, %%xmm6\n\t"
+ "pxor %%xmm10, %%xmm6\n\t"
+ "pxor %%xmm1, %%xmm6\n\t"
+ : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
+ [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)),
+ [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
+ [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)),
+ [outbuf4] "+m" (*(outbuf + 4 * BLOCKSIZE)),
+ [outbuf5] "+m" (*(outbuf + 5 * BLOCKSIZE)),
+ [outbuf6] "+m" (*(outbuf + 6 * BLOCKSIZE)),
+ [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE))
+ :
+ : "memory" );
+
+ outbuf += 8*BLOCKSIZE;
+ inbuf += 8*BLOCKSIZE;
+ }
+
+ aesni_cleanup_7_15();
+ }
+#endif
+
+ for ( ;nblocks >= 4 ; nblocks -= 4 )
{
n += 4;
l = ocb_get_l(c, n);
@@ -1559,9 +2657,9 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
/* Checksum_i = Checksum_{i-1} xor P_i */
- asm volatile ("movdqu %[l0], %%xmm0\n\t"
+ asm volatile ("movdqu %[l0], %%xmm4\n\t"
"movdqu %[inbuf0], %%xmm1\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm4, %%xmm5\n\t"
"pxor %%xmm5, %%xmm1\n\t"
"movdqu %%xmm5, %[outbuf0]\n\t"
: [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
@@ -1577,14 +2675,12 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
: [l1] "m" (*c->u_mode.ocb.L[1]),
[inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
: "memory" );
- asm volatile ("movdqu %[l2], %%xmm0\n\t"
- "movdqu %[inbuf2], %%xmm3\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
+ asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm5\n\t"
"pxor %%xmm5, %%xmm3\n\t"
"movdqu %%xmm5, %[outbuf2]\n\t"
: [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
- : [l2] "m" (*c->u_mode.ocb.L[0]),
- [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+ : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
: "memory" );
asm volatile ("movdqu %[l3], %%xmm0\n\t"
"movdqu %[inbuf3], %%xmm4\n\t"
@@ -1722,16 +2818,115 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
abuf += BLOCKSIZE;
}
- for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+ if (nblocks >= 8)
+ {
+ aesni_prepare_7_15_variable;
+
+ aesni_prepare_7_15();
+
+ asm volatile ("movdqu %[l0], %%xmm7\n\t"
+ "movdqu %[l1], %%xmm12\n\t"
+ :
+ : [l0] "m" (*c->u_mode.ocb.L[0]),
+ [l1] "m" (*c->u_mode.ocb.L[1])
+ : "memory" );
+
+ for ( ;nblocks >= 8 ; nblocks -= 8 )
+ {
+ n += 4;
+ l = ocb_get_l(c, n);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ asm volatile ("movdqu %[abuf0], %%xmm1\n\t"
+ "pxor %%xmm7, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ :
+ : [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[abuf1], %%xmm2\n\t"
+ "pxor %%xmm12, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ :
+ : [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[abuf2], %%xmm3\n\t"
+ "pxor %%xmm7, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ :
+ : [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l3], %%xmm0\n\t"
+ "movdqu %[abuf3], %%xmm4\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ :
+ : [l3] "m" (*l),
+ [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
+ : "memory" );
+
+ n += 4;
+ l = ocb_get_l(c, n);
+
+ asm volatile ("movdqu %[abuf4], %%xmm8\n\t"
+ "pxor %%xmm7, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm8\n\t"
+ :
+ : [abuf4] "m" (*(abuf + 4 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[abuf5], %%xmm9\n\t"
+ "pxor %%xmm12, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm9\n\t"
+ :
+ : [abuf5] "m" (*(abuf + 5 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[abuf6], %%xmm10\n\t"
+ "pxor %%xmm7, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm10\n\t"
+ :
+ : [abuf6] "m" (*(abuf + 6 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l7], %%xmm0\n\t"
+ "movdqu %[abuf7], %%xmm11\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm11\n\t"
+ :
+ : [l7] "m" (*l),
+ [abuf7] "m" (*(abuf + 7 * BLOCKSIZE))
+ : "memory" );
+
+ do_aesni_enc_vec8 (ctx);
+
+ asm volatile ("pxor %%xmm2, %%xmm1\n\t"
+ "pxor %%xmm3, %%xmm1\n\t"
+ "pxor %%xmm4, %%xmm1\n\t"
+ "pxor %%xmm8, %%xmm1\n\t"
+ "pxor %%xmm9, %%xmm6\n\t"
+ "pxor %%xmm10, %%xmm6\n\t"
+ "pxor %%xmm11, %%xmm6\n\t"
+ "pxor %%xmm1, %%xmm6\n\t"
+ :
+ :
+ : "memory" );
+
+ abuf += 8*BLOCKSIZE;
+ }
+
+ aesni_cleanup_7_15();
+ }
+#endif
+
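Authentication uses the same offset chaining but only accumulates the tag, Sum_i = Sum_{i-1} xor E_K(A_i xor Offset_i), which is why no output blocks are stored and the eight encrypted blocks are simply folded into the running sum in xmm6 at the end. A one-step scalar sketch reusing the hypothetical enc() and xor_block() helpers from the OCB encryption sketch above:

/* One OCB auth step: Sum ^= E_K(A_i ^ Offset_i). */
static void
ocb_auth_step (void (*enc) (const void *key, unsigned char blk[16]),
               const void *key, unsigned char offset[16],
               unsigned char sum[16], const unsigned char l_ntz_i[16],
               const unsigned char a[16])
{
  unsigned char blk[16];
  int j;

  xor_block (offset, l_ntz_i);    /* Offset_i = Offset_{i-1} ^ L_{ntz(i)} */
  for (j = 0; j < 16; j++)
    blk[j] = a[j] ^ offset[j];    /* A_i ^ Offset_i */
  enc (key, blk);                 /* E_K(A_i ^ Offset_i) */
  xor_block (sum, blk);           /* Sum_i = Sum_{i-1} ^ ... */
}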
+ for ( ;nblocks >= 4 ; nblocks -= 4 )
{
n += 4;
l = ocb_get_l(c, n);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
- asm volatile ("movdqu %[l0], %%xmm0\n\t"
+ asm volatile ("movdqu %[l0], %%xmm4\n\t"
"movdqu %[abuf0], %%xmm1\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm4, %%xmm5\n\t"
"pxor %%xmm5, %%xmm1\n\t"
:
: [l0] "m" (*c->u_mode.ocb.L[0]),
@@ -1745,9 +2940,8 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
: [l1] "m" (*c->u_mode.ocb.L[1]),
[abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
: "memory" );
- asm volatile ("movdqu %[l2], %%xmm0\n\t"
- "movdqu %[abuf2], %%xmm3\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
+ asm volatile ("movdqu %[abuf2], %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm5\n\t"
"pxor %%xmm5, %%xmm3\n\t"
:
: [l2] "m" (*c->u_mode.ocb.L[0]),