author     Jussi Kivilinna <jussi.kivilinna@iki.fi>    2018-01-06 18:53:20 +0200
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>    2018-01-09 18:44:34 +0200
commit     c9e9cb2eb6a1c659d3825ca627228b732f2f2152 (patch)
tree       25b0ed20bfc6106781a3b18bc8b9236218010332 /cipher/rijndael-aesni.c
parent     b3ec0f752c925cde36f560f0f9309ab6450bbfd9 (diff)
download   libgcrypt-c9e9cb2eb6a1c659d3825ca627228b732f2f2152.tar.gz
AES-NI improvements for AMD64
* cipher/rijndael-aesni.c [__x86_64__] (aesni_prepare_7_15_variable)
(aesni_prepare_7_15, aesni_cleanup_7_15, do_aesni_enc_vec8)
(do_aesni_dec_vec8, do_aesni_ctr_8): New.
(_gcry_aes_aesni_ctr_enc, _gcry_aes_aesni_cfb_dec)
(_gcry_aes_aesni_cbc_dec, aesni_ocb_enc, aesni_ocb_dec)
(_gcry_aes_aesni_ocb_auth) [__x86_64__]: Add eight-block parallel
processing.
--
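In short, each affected mode function gains a fast path that handles eight
blocks per loop iteration before falling back to the existing four-block and
one-block paths.  A condensed excerpt of the new CTR dispatch (taken from the
patch below; the other modes follow the same pattern) shows the shape of the
change:

    if (nblocks >= 8)
      {
        aesni_prepare_7_15_variable;  /* WIN64: stack space for xmm7..xmm15 */

        aesni_prepare_7_15();         /* save callee-saved XMM regs (WIN64) */

        for ( ;nblocks >= 8 ; nblocks -= 8 )
          {
            do_aesni_ctr_8 (ctx, ctr, outbuf, inbuf);
            outbuf += 8*BLOCKSIZE;
            inbuf  += 8*BLOCKSIZE;
          }

        aesni_cleanup_7_15();         /* restore (WIN64) or clear xmm7..xmm15 */
      }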
Benchmarks on Intel Core i7-4790K, 4.0 GHz (no turbo, no HT):
Before:
AES | nanosecs/byte mebibytes/sec cycles/byte
CBC dec | 0.175 ns/B 5448.7 MiB/s 0.700 c/B
CFB dec | 0.174 ns/B 5466.2 MiB/s 0.698 c/B
CTR enc | 0.182 ns/B 5226.0 MiB/s 0.730 c/B
OCB enc | 0.194 ns/B 4913.9 MiB/s 0.776 c/B
OCB dec | 0.200 ns/B 4769.2 MiB/s 0.800 c/B
OCB auth | 0.172 ns/B 5545.0 MiB/s 0.688 c/B
After (1.09x to 1.14x faster):
AES | nanosecs/byte mebibytes/sec cycles/byte
CBC dec | 0.157 ns/B 6075.6 MiB/s 0.628 c/B
CFB dec | 0.158 ns/B 6034.1 MiB/s 0.632 c/B
CTR enc | 0.159 ns/B 5979.4 MiB/s 0.638 c/B
OCB enc | 0.175 ns/B 5447.1 MiB/s 0.700 c/B
OCB dec | 0.183 ns/B 5203.9 MiB/s 0.733 c/B
OCB auth | 0.156 ns/B 6101.3 MiB/s 0.625 c/B
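For reference, at a fixed 4.0 GHz clock the two rate columns are related by
cycles/byte = nanosecs/byte x 4.0 cycles/ns; e.g. 0.157 ns/B x 4.0 c/ns =
0.628 c/B for CBC decryption above, and the largest gain is CTR encryption at
0.730/0.638 ~ 1.14x.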
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/rijndael-aesni.c')
-rw-r--r--  cipher/rijndael-aesni.c | 1248
1 file changed, 1221 insertions(+), 27 deletions(-)
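One notable trick in the new do_aesni_ctr_8 (visible in the diff below) is the
counter increment: the CTR value is big-endian, so when its last byte is at
most 0xf7 adding 1..8 cannot carry out of that byte, and the eight successor
counters can be formed with byte-wise paddb additions against precomputed
constants; otherwise the code takes a slower path with explicit 32-/64-bit
carry handling.  A minimal C model of that idea (hypothetical helper names,
for illustration only, not code from the patch):

    #include <stdint.h>

    /* Fast path: valid when n <= 8 and ctr[15] <= 0xf7, i.e. the test
       "cmpb $0xf7, 15(%[ctr])" in the patch; no carry can leave byte 15. */
    static void
    ctr_add_small (uint8_t ctr[16], uint8_t n)
    {
      ctr[15] += n;
    }

    /* Slow path: big-endian add with full carry propagation, standing in
       for the patch's 32-/64-bit carry handling. */
    static void
    ctr_add_generic (uint8_t ctr[16], uint8_t n)
    {
      unsigned int carry = n;
      for (int i = 15; i >= 0 && carry; i--)
        {
          carry += ctr[i];
          ctr[i] = (uint8_t)carry;
          carry >>= 8;
        }
    }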
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 735e5cdd..3d323cf0 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -55,6 +55,7 @@ typedef struct u128_s
 #ifdef __WIN64__
 /* XMM6-XMM15 are callee-saved registers on WIN64. */
 # define aesni_prepare_2_6_variable char win64tmp[16]
+# define aesni_prepare_7_15_variable char win64tmp7_15[16 * 9]
 # define aesni_prepare() do { } while (0)
 # define aesni_prepare_2_6() \
     do { asm volatile ("movdqu %%xmm6, %0\n\t" \
@@ -62,6 +63,20 @@ typedef struct u128_s
                       : \
                       : "memory"); \
     } while (0)
+# define aesni_prepare_7_15() \
+    do { asm volatile ("movdqu %%xmm7, 0*16(%0)\n\t" \
+                       "movdqu %%xmm8, 1*16(%0)\n\t" \
+                       "movdqu %%xmm9, 2*16(%0)\n\t" \
+                       "movdqu %%xmm10, 3*16(%0)\n\t" \
+                       "movdqu %%xmm11, 4*16(%0)\n\t" \
+                       "movdqu %%xmm12, 5*16(%0)\n\t" \
+                       "movdqu %%xmm13, 6*16(%0)\n\t" \
+                       "movdqu %%xmm14, 7*16(%0)\n\t" \
+                       "movdqu %%xmm15, 8*16(%0)\n\t" \
+                       : \
+                       : "r" (win64tmp7_15) \
+                       : "memory"); \
+    } while (0)
 # define aesni_cleanup() \
     do { asm volatile ("pxor %%xmm0, %%xmm0\n\t" \
                        "pxor %%xmm1, %%xmm1\n" :: ); \
@@ -76,6 +91,20 @@ typedef struct u128_s
                       : "m" (*win64tmp) \
                       : "memory"); \
     } while (0)
+# define aesni_cleanup_7_15() \
+    do { asm volatile ("movdqu 0*16(%0), %%xmm7\n\t" \
+                       "movdqu 1*16(%0), %%xmm8\n\t" \
+                       "movdqu 2*16(%0), %%xmm9\n\t" \
+                       "movdqu 3*16(%0), %%xmm10\n\t" \
+                       "movdqu 4*16(%0), %%xmm11\n\t" \
+                       "movdqu 5*16(%0), %%xmm12\n\t" \
+                       "movdqu 6*16(%0), %%xmm13\n\t" \
+                       "movdqu 7*16(%0), %%xmm14\n\t" \
+                       "movdqu 8*16(%0), %%xmm15\n\t" \
+                       : \
+                       : "r" (win64tmp7_15) \
+                       : "memory"); \
+    } while (0)
 #else
 # define aesni_prepare_2_6_variable
 # define aesni_prepare() do { } while (0)
@@ -91,6 +120,21 @@ typedef struct u128_s
                       "pxor %%xmm5, %%xmm5\n" \
                       "pxor %%xmm6, %%xmm6\n":: ); \
     } while (0)
+# ifdef __x86_64__
+# define aesni_prepare_7_15_variable
+# define aesni_prepare_7_15() do { } while (0)
+# define aesni_cleanup_7_15() \
+    do { asm volatile ("pxor %%xmm7, %%xmm7\n\t" \
+                       "pxor %%xmm8, %%xmm8\n" \
+                       "pxor %%xmm9, %%xmm9\n" \
+                       "pxor %%xmm10, %%xmm10\n" \
+                       "pxor %%xmm11, %%xmm11\n" \
+                       "pxor %%xmm12, %%xmm12\n" \
+                       "pxor %%xmm13, %%xmm13\n" \
+                       "pxor %%xmm14, %%xmm14\n" \
+                       "pxor %%xmm15, %%xmm15\n":: ); \
+    } while (0)
+# endif
 #endif
 
 void
@@ -704,6 +748,314 @@ do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
 }
 
 
+#ifdef __x86_64__
+
+/* Encrypt eight blocks using the Intel AES-NI instructions.  Blocks are input
+ * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11. */
+static inline void
+do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
+{
+  asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+                "pxor %%xmm0, %%xmm1\n\t"   /* xmm1 ^= key[0] */
+                "pxor %%xmm0, %%xmm2\n\t"   /* xmm2 ^= key[0] */
+                "pxor %%xmm0, %%xmm3\n\t"   /* xmm3 ^= key[0] */
+                "pxor %%xmm0, %%xmm4\n\t"   /* xmm4 ^= key[0] */
+                "pxor %%xmm0, %%xmm8\n\t"   /* xmm8 ^= key[0] */
+                "pxor %%xmm0, %%xmm9\n\t"   /* xmm9 ^= key[0] */
+                "pxor %%xmm0, %%xmm10\n\t"  /* xmm10 ^= key[0] */
+                "pxor %%xmm0, %%xmm11\n\t"  /* xmm11 ^= key[0] */
+                "movdqa 0x10(%[key]), %%xmm0\n\t"
+                "cmpl $12, %[rounds]\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x20(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x30(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x40(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x50(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x60(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x70(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x80(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x90(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0xa0(%[key]), %%xmm0\n\t"
+                "jb .Lenclast%=\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0xb0(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
%%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0xc0(%[key]), %%xmm0\n\t" + "je .Ldeclast%=\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0xd0(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0xe0(%[key]), %%xmm0\n" + + ".Ldeclast%=:\n\t" + "aesenclast %%xmm0, %%xmm1\n\t" + "aesenclast %%xmm0, %%xmm2\n\t" + "aesenclast %%xmm0, %%xmm3\n\t" + "aesenclast %%xmm0, %%xmm4\n\t" + "aesenclast %%xmm0, %%xmm8\n\t" + "aesenclast %%xmm0, %%xmm9\n\t" + "aesenclast %%xmm0, %%xmm10\n\t" + "aesenclast %%xmm0, %%xmm11\n\t" + : /* no output */ + : [key] "r" (ctx->keyschenc), + [rounds] "r" (ctx->rounds) + : "cc", "memory"); +} + + +/* Decrypt eight blocks using the Intel AES-NI instructions. Blocks are input + * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11. */ +static inline void +do_aesni_dec_vec8 (const RIJNDAEL_context *ctx) +{ + asm volatile ("movdqa (%[key]), %%xmm0\n\t" + "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ + "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ + "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ + "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ + "pxor %%xmm0, %%xmm8\n\t" /* xmm8 ^= key[0] */ + "pxor %%xmm0, %%xmm9\n\t" /* xmm9 ^= key[0] */ + "pxor %%xmm0, %%xmm10\n\t" /* xmm10 ^= key[0] */ + "pxor %%xmm0, %%xmm11\n\t" /* xmm11 ^= key[0] */ + "movdqa 0x10(%[key]), %%xmm0\n\t" + "cmpl $12, %[rounds]\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x20(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x30(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x40(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x50(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x60(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x70(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + 
"aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x80(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x90(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0xa0(%[key]), %%xmm0\n\t" + "jb .Ldeclast%=\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0xb0(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0xc0(%[key]), %%xmm0\n\t" + "je .Ldeclast%=\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0xd0(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0xe0(%[key]), %%xmm0\n" + + ".Ldeclast%=:\n\t" + "aesdeclast %%xmm0, %%xmm1\n\t" + "aesdeclast %%xmm0, %%xmm2\n\t" + "aesdeclast %%xmm0, %%xmm3\n\t" + "aesdeclast %%xmm0, %%xmm4\n\t" + "aesdeclast %%xmm0, %%xmm8\n\t" + "aesdeclast %%xmm0, %%xmm9\n\t" + "aesdeclast %%xmm0, %%xmm10\n\t" + "aesdeclast %%xmm0, %%xmm11\n\t" + : /* no output */ + : [key] "r" (ctx->keyschdec), + [rounds] "r" (ctx->rounds) + : "cc", "memory"); +} + +#endif /* __x86_64__ */ + + /* Perform a CTR encryption round using the counter CTR and the input block A. Write the result to the output block B and update CTR. CTR needs to be a 16 byte aligned little-endian value. */ @@ -808,7 +1160,7 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx, #define aesenclast_xmm1_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe1\n\t" /* Register usage: - esi keyschedule + [key] keyschedule xmm0 CTR-0 xmm1 temp / round key xmm2 CTR-1 @@ -1003,6 +1355,327 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx, } +#ifdef __x86_64__ + +/* Eight blocks at a time variant of do_aesni_ctr. 
+ */
+static void
+do_aesni_ctr_8 (const RIJNDAEL_context *ctx,
+                unsigned char *ctr, unsigned char *b, const unsigned char *a)
+{
+  static const byte bige_addb_const[8][16] __attribute__ ((aligned (16))) =
+    {
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 }
+    };
+  const void *bige_addb = bige_addb_const;
+
+  /* Register usage:
+      [key] keyschedule
+      xmm0  CTR-0
+      xmm1  temp / round key
+      xmm2  CTR-1
+      xmm3  CTR-2
+      xmm4  CTR-3
+      xmm5  copy of *ctr
+      xmm6  endian swapping mask
+      xmm8  CTR-4
+      xmm9  CTR-5
+      xmm10 CTR-6
+      xmm11 CTR-7
+      xmm12 temp
+      xmm13 temp
+      xmm14 temp
+      xmm15 temp
+   */
+
+  asm volatile (/* detect if 8-bit carry handling is needed */
+                "cmpb $0xf7, 15(%[ctr])\n\t"
+                "ja .Ladd32bit%=\n\t"
+
+                "movdqa %%xmm5, %%xmm0\n\t"          /* xmm0 := CTR (xmm5) */
+                "movdqa 0*16(%[addb]), %%xmm2\n\t"   /* xmm2 := be(1) */
+                "movdqa 1*16(%[addb]), %%xmm3\n\t"   /* xmm3 := be(2) */
+                "movdqa 2*16(%[addb]), %%xmm4\n\t"   /* xmm4 := be(3) */
+                "movdqa 3*16(%[addb]), %%xmm8\n\t"   /* xmm8 := be(4) */
+                "movdqa 4*16(%[addb]), %%xmm9\n\t"   /* xmm9 := be(5) */
+                "movdqa 5*16(%[addb]), %%xmm10\n\t"  /* xmm10 := be(6) */
+                "movdqa 6*16(%[addb]), %%xmm11\n\t"  /* xmm11 := be(7) */
+                "movdqa 7*16(%[addb]), %%xmm5\n\t"   /* xmm5 := be(8) */
+                "movdqa (%[key]), %%xmm1\n\t"        /* xmm1 := key[0] */
+                "paddb %%xmm0, %%xmm2\n\t"   /* xmm2 := be(1) + CTR (xmm0) */
+                "paddb %%xmm0, %%xmm3\n\t"   /* xmm3 := be(2) + CTR (xmm0) */
+                "paddb %%xmm0, %%xmm4\n\t"   /* xmm4 := be(3) + CTR (xmm0) */
+                "paddb %%xmm0, %%xmm8\n\t"   /* xmm8 := be(4) + CTR (xmm0) */
+                "paddb %%xmm0, %%xmm9\n\t"   /* xmm9 := be(5) + CTR (xmm0) */
+                "paddb %%xmm0, %%xmm10\n\t"  /* xmm10 := be(6) + CTR (xmm0) */
+                "paddb %%xmm0, %%xmm11\n\t"  /* xmm11 := be(7) + CTR (xmm0) */
+                "paddb %%xmm0, %%xmm5\n\t"   /* xmm5 := be(8) + CTR (xmm0) */
+                "jmp .Lstore_ctr%=\n\t"
+
+                ".Ladd32bit%=:\n\t"
+                "movdqa %%xmm5, %%xmm0\n\t"  /* xmm0, xmm2 := CTR (xmm5) */
+                "movdqa %%xmm0, %%xmm2\n\t"
+                "pcmpeqd %%xmm1, %%xmm1\n\t"
+                "psrldq $8, %%xmm1\n\t"      /* xmm1 = -1 */
+
+                "pshufb %%xmm6, %%xmm2\n\t"  /* xmm2 := le(xmm2) */
+                "psubq %%xmm1, %%xmm2\n\t"   /* xmm2++ */
+                "movdqa %%xmm2, %%xmm3\n\t"  /* xmm3 := xmm2 */
+                "psubq %%xmm1, %%xmm3\n\t"   /* xmm3++ */
+                "movdqa %%xmm3, %%xmm4\n\t"  /* xmm4 := xmm3 */
+                "psubq %%xmm1, %%xmm4\n\t"   /* xmm4++ */
+                "movdqa %%xmm4, %%xmm8\n\t"  /* xmm8 := xmm4 */
+                "psubq %%xmm1, %%xmm8\n\t"   /* xmm8++ */
+                "movdqa %%xmm8, %%xmm9\n\t"  /* xmm9 := xmm8 */
+                "psubq %%xmm1, %%xmm9\n\t"   /* xmm9++ */
+                "movdqa %%xmm9, %%xmm10\n\t" /* xmm10 := xmm9 */
+                "psubq %%xmm1, %%xmm10\n\t"  /* xmm10++ */
+                "movdqa %%xmm10, %%xmm11\n\t" /* xmm11 := xmm10 */
+                "psubq %%xmm1, %%xmm11\n\t"  /* xmm11++ */
+                "movdqa %%xmm11, %%xmm5\n\t" /* xmm5 := xmm11 */
+                "psubq %%xmm1, %%xmm5\n\t"   /* xmm5++ */
+
+                /* detect if 64-bit carry handling is needed */
+                "cmpl $0xffffffff, 8(%[ctr])\n\t"
+                "jne .Lno_carry%=\n\t"
+                "movl 12(%[ctr]), %%esi\n\t"
+                "bswapl %%esi\n\t"
+                "cmpl $0xfffffff8, %%esi\n\t"
+                "jb .Lno_carry%=\n\t"        /* no carry */
+
+                "pslldq $8, %%xmm1\n\t"      /* move lower 64-bit to high */
+                "je .Lcarry_xmm5%=\n\t"      /* esi == 0xfffffff8 */
+                "cmpl $0xfffffffa, %%esi\n\t"
+                "jb .Lcarry_xmm11%=\n\t"     /* esi == 0xfffffff9 */
+                "je .Lcarry_xmm10%=\n\t"     /* esi == 0xfffffffa */
+                "cmpl $0xfffffffc, %%esi\n\t"
+                "jb .Lcarry_xmm9%=\n\t"      /* esi == 0xfffffffb */
+                "je .Lcarry_xmm8%=\n\t"      /* esi == 0xfffffffc */
+                "cmpl $0xfffffffe, %%esi\n\t"
+                "jb .Lcarry_xmm4%=\n\t"      /* esi == 0xfffffffd */
+                "je .Lcarry_xmm3%=\n\t"      /* esi == 0xfffffffe */
+                /* esi == 0xffffffff */
+
+                "psubq %%xmm1, %%xmm2\n\t"
+                ".Lcarry_xmm3%=:\n\t"
+                "psubq %%xmm1, %%xmm3\n\t"
+                ".Lcarry_xmm4%=:\n\t"
+                "psubq %%xmm1, %%xmm4\n\t"
+                ".Lcarry_xmm8%=:\n\t"
+                "psubq %%xmm1, %%xmm8\n\t"
+                ".Lcarry_xmm9%=:\n\t"
+                "psubq %%xmm1, %%xmm9\n\t"
+                ".Lcarry_xmm10%=:\n\t"
+                "psubq %%xmm1, %%xmm10\n\t"
+                ".Lcarry_xmm11%=:\n\t"
+                "psubq %%xmm1, %%xmm11\n\t"
+                ".Lcarry_xmm5%=:\n\t"
+                "psubq %%xmm1, %%xmm5\n\t"
+
+                ".Lno_carry%=:\n\t"
+                "movdqa (%[key]), %%xmm1\n\t"  /* xmm1 := key[0] */
+
+                "pshufb %%xmm6, %%xmm2\n\t"  /* xmm2 := be(xmm2) */
+                "pshufb %%xmm6, %%xmm3\n\t"  /* xmm3 := be(xmm3) */
+                "pshufb %%xmm6, %%xmm4\n\t"  /* xmm4 := be(xmm4) */
+                "pshufb %%xmm6, %%xmm5\n\t"  /* xmm5 := be(xmm5) */
+                "pshufb %%xmm6, %%xmm8\n\t"  /* xmm8 := be(xmm8) */
+                "pshufb %%xmm6, %%xmm9\n\t"  /* xmm9 := be(xmm9) */
+                "pshufb %%xmm6, %%xmm10\n\t" /* xmm10 := be(xmm10) */
+                "pshufb %%xmm6, %%xmm11\n\t" /* xmm11 := be(xmm11) */
+
+                ".Lstore_ctr%=:\n\t"
+                "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). */
+                :
+                : [ctr] "r" (ctr),
+                  [key] "r" (ctx->keyschenc),
+                  [addb] "r" (bige_addb)
+                : "%esi", "cc", "memory");
+
+  asm volatile ("pxor %%xmm1, %%xmm0\n\t"   /* xmm0 ^= key[0] */
+                "pxor %%xmm1, %%xmm2\n\t"   /* xmm2 ^= key[0] */
+                "pxor %%xmm1, %%xmm3\n\t"   /* xmm3 ^= key[0] */
+                "pxor %%xmm1, %%xmm4\n\t"   /* xmm4 ^= key[0] */
+                "pxor %%xmm1, %%xmm8\n\t"   /* xmm8 ^= key[0] */
+                "pxor %%xmm1, %%xmm9\n\t"   /* xmm9 ^= key[0] */
+                "pxor %%xmm1, %%xmm10\n\t"  /* xmm10 ^= key[0] */
+                "pxor %%xmm1, %%xmm11\n\t"  /* xmm11 ^= key[0] */
+                "movdqa 0x10(%[key]), %%xmm1\n\t"
+                "cmpl $12, %[rounds]\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x20(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x30(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x40(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x50(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x60(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
"movdqa 0x70(%[key]), %%xmm1\n\t" + "aesenc %%xmm1, %%xmm0\n\t" + "aesenc %%xmm1, %%xmm2\n\t" + "aesenc %%xmm1, %%xmm3\n\t" + "aesenc %%xmm1, %%xmm4\n\t" + "aesenc %%xmm1, %%xmm8\n\t" + "aesenc %%xmm1, %%xmm9\n\t" + "aesenc %%xmm1, %%xmm10\n\t" + "aesenc %%xmm1, %%xmm11\n\t" + "movdqa 0x80(%[key]), %%xmm1\n\t" + "aesenc %%xmm1, %%xmm0\n\t" + "aesenc %%xmm1, %%xmm2\n\t" + "aesenc %%xmm1, %%xmm3\n\t" + "aesenc %%xmm1, %%xmm4\n\t" + "aesenc %%xmm1, %%xmm8\n\t" + "aesenc %%xmm1, %%xmm9\n\t" + "aesenc %%xmm1, %%xmm10\n\t" + "aesenc %%xmm1, %%xmm11\n\t" + "movdqa 0x90(%[key]), %%xmm1\n\t" + "aesenc %%xmm1, %%xmm0\n\t" + "aesenc %%xmm1, %%xmm2\n\t" + "aesenc %%xmm1, %%xmm3\n\t" + "aesenc %%xmm1, %%xmm4\n\t" + "aesenc %%xmm1, %%xmm8\n\t" + "aesenc %%xmm1, %%xmm9\n\t" + "aesenc %%xmm1, %%xmm10\n\t" + "aesenc %%xmm1, %%xmm11\n\t" + "movdqa 0xa0(%[key]), %%xmm1\n\t" + "jb .Lenclast%=\n\t" + "aesenc %%xmm1, %%xmm0\n\t" + "aesenc %%xmm1, %%xmm2\n\t" + "aesenc %%xmm1, %%xmm3\n\t" + "aesenc %%xmm1, %%xmm4\n\t" + "aesenc %%xmm1, %%xmm8\n\t" + "aesenc %%xmm1, %%xmm9\n\t" + "aesenc %%xmm1, %%xmm10\n\t" + "aesenc %%xmm1, %%xmm11\n\t" + "movdqa 0xb0(%[key]), %%xmm1\n\t" + "aesenc %%xmm1, %%xmm0\n\t" + "aesenc %%xmm1, %%xmm2\n\t" + "aesenc %%xmm1, %%xmm3\n\t" + "aesenc %%xmm1, %%xmm4\n\t" + "aesenc %%xmm1, %%xmm8\n\t" + "aesenc %%xmm1, %%xmm9\n\t" + "aesenc %%xmm1, %%xmm10\n\t" + "aesenc %%xmm1, %%xmm11\n\t" + "movdqa 0xc0(%[key]), %%xmm1\n\t" + "je .Lenclast%=\n\t" + "aesenc %%xmm1, %%xmm0\n\t" + "aesenc %%xmm1, %%xmm2\n\t" + "aesenc %%xmm1, %%xmm3\n\t" + "aesenc %%xmm1, %%xmm4\n\t" + "aesenc %%xmm1, %%xmm8\n\t" + "aesenc %%xmm1, %%xmm9\n\t" + "aesenc %%xmm1, %%xmm10\n\t" + "aesenc %%xmm1, %%xmm11\n\t" + "movdqa 0xd0(%[key]), %%xmm1\n\t" + "aesenc %%xmm1, %%xmm0\n\t" + "aesenc %%xmm1, %%xmm2\n\t" + "aesenc %%xmm1, %%xmm3\n\t" + "aesenc %%xmm1, %%xmm4\n\t" + "aesenc %%xmm1, %%xmm8\n\t" + "aesenc %%xmm1, %%xmm9\n\t" + "aesenc %%xmm1, %%xmm10\n\t" + "aesenc %%xmm1, %%xmm11\n\t" + "movdqa 0xe0(%[key]), %%xmm1\n" + + ".Lenclast%=:\n\t" + "aesenclast %%xmm1, %%xmm0\n\t" + "aesenclast %%xmm1, %%xmm2\n\t" + "aesenclast %%xmm1, %%xmm3\n\t" + "aesenclast %%xmm1, %%xmm4\n\t" + "aesenclast %%xmm1, %%xmm8\n\t" + "aesenclast %%xmm1, %%xmm9\n\t" + "aesenclast %%xmm1, %%xmm10\n\t" + "aesenclast %%xmm1, %%xmm11\n\t" + : + : [key] "r" (ctx->keyschenc), + [rounds] "r" (ctx->rounds) + : "cc", "memory"); + + asm volatile ("movdqu 0*16(%[src]), %%xmm12\n\t" /* Get block 1. */ + "movdqu 1*16(%[src]), %%xmm13\n\t" /* Get block 2. */ + "movdqu 2*16(%[src]), %%xmm14\n\t" /* Get block 3. */ + "movdqu 3*16(%[src]), %%xmm15\n\t" /* Get block 4. */ + "movdqu 4*16(%[src]), %%xmm1\n\t" /* Get block 5. */ + "pxor %%xmm12, %%xmm0\n\t" /* EncCTR-1 ^= input */ + "movdqu 5*16(%[src]), %%xmm12\n\t" /* Get block 6. */ + "pxor %%xmm13, %%xmm2\n\t" /* EncCTR-2 ^= input */ + "movdqu 6*16(%[src]), %%xmm13\n\t" /* Get block 7. */ + "pxor %%xmm14, %%xmm3\n\t" /* EncCTR-3 ^= input */ + "movdqu 7*16(%[src]), %%xmm14\n\t" /* Get block 8. */ + "pxor %%xmm15, %%xmm4\n\t" /* EncCTR-4 ^= input */ + "movdqu %%xmm0, 0*16(%[dst])\n\t" /* Store block 1 */ + "pxor %%xmm1, %%xmm8\n\t" /* EncCTR-5 ^= input */ + "movdqu %%xmm0, 0*16(%[dst])\n\t" /* Store block 1 */ + "pxor %%xmm12, %%xmm9\n\t" /* EncCTR-6 ^= input */ + "movdqu %%xmm2, 1*16(%[dst])\n\t" /* Store block 2. */ + "pxor %%xmm13, %%xmm10\n\t" /* EncCTR-7 ^= input */ + "movdqu %%xmm3, 2*16(%[dst])\n\t" /* Store block 3. 
*/ + "pxor %%xmm14, %%xmm11\n\t" /* EncCTR-8 ^= input */ + "movdqu %%xmm4, 3*16(%[dst])\n\t" /* Store block 4. */ + "movdqu %%xmm8, 4*16(%[dst])\n\t" /* Store block 8. */ + "movdqu %%xmm9, 5*16(%[dst])\n\t" /* Store block 9. */ + "movdqu %%xmm10, 6*16(%[dst])\n\t" /* Store block 10. */ + "movdqu %%xmm11, 7*16(%[dst])\n\t" /* Store block 11. */ + : + : [src] "r" (a), + [dst] "r" (b) + : "memory"); +} + +#endif /* __x86_64__ */ + + unsigned int _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src) @@ -1123,7 +1796,25 @@ _gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, [ctr] "m" (*ctr) : "memory"); - for ( ;nblocks > 3 ; nblocks -= 4 ) +#ifdef __x86_64__ + if (nblocks >= 8) + { + aesni_prepare_7_15_variable; + + aesni_prepare_7_15(); + + for ( ;nblocks >= 8 ; nblocks -= 8 ) + { + do_aesni_ctr_8 (ctx, ctr, outbuf, inbuf); + outbuf += 8*BLOCKSIZE; + inbuf += 8*BLOCKSIZE; + } + + aesni_cleanup_7_15(); + } +#endif + + for ( ;nblocks >= 4 ; nblocks -= 4 ) { do_aesni_ctr_4 (ctx, ctr, outbuf, inbuf); outbuf += 4*BLOCKSIZE; @@ -1175,6 +1866,76 @@ _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf, : "memory" ); /* CFB decryption can be parallelized */ + +#ifdef __x86_64__ + if (nblocks >= 8) + { + aesni_prepare_7_15_variable; + + aesni_prepare_7_15(); + + for ( ;nblocks >= 8; nblocks -= 8) + { + asm volatile + ("movdqu %%xmm6, %%xmm1\n\t" /* load input blocks */ + "movdqu 0*16(%[inbuf]), %%xmm2\n\t" + "movdqu 1*16(%[inbuf]), %%xmm3\n\t" + "movdqu 2*16(%[inbuf]), %%xmm4\n\t" + "movdqu 3*16(%[inbuf]), %%xmm8\n\t" + "movdqu 4*16(%[inbuf]), %%xmm9\n\t" + "movdqu 5*16(%[inbuf]), %%xmm10\n\t" + "movdqu 6*16(%[inbuf]), %%xmm11\n\t" + + "movdqu 7*16(%[inbuf]), %%xmm6\n\t" /* update IV */ + + "movdqa %%xmm2, %%xmm12\n\t" + "movdqa %%xmm3, %%xmm13\n\t" + "movdqa %%xmm4, %%xmm14\n\t" + "movdqa %%xmm8, %%xmm15\n\t" + : /* No output */ + : [inbuf] "r" (inbuf) + : "memory"); + + do_aesni_enc_vec8 (ctx); + + asm volatile + ( + "pxor %%xmm12, %%xmm1\n\t" + "movdqu 4*16(%[inbuf]), %%xmm12\n\t" + "pxor %%xmm13, %%xmm2\n\t" + "movdqu 5*16(%[inbuf]), %%xmm13\n\t" + "pxor %%xmm14, %%xmm3\n\t" + "movdqu 6*16(%[inbuf]), %%xmm14\n\t" + "pxor %%xmm15, %%xmm4\n\t" + "movdqu 7*16(%[inbuf]), %%xmm15\n\t" + + "pxor %%xmm12, %%xmm8\n\t" + "movdqu %%xmm1, 0*16(%[outbuf])\n\t" + "pxor %%xmm13, %%xmm9\n\t" + "movdqu %%xmm2, 1*16(%[outbuf])\n\t" + "pxor %%xmm14, %%xmm10\n\t" + "movdqu %%xmm3, 2*16(%[outbuf])\n\t" + "pxor %%xmm15, %%xmm11\n\t" + "movdqu %%xmm4, 3*16(%[outbuf])\n\t" + + "movdqu %%xmm8, 4*16(%[outbuf])\n\t" + "movdqu %%xmm9, 5*16(%[outbuf])\n\t" + "movdqu %%xmm10, 6*16(%[outbuf])\n\t" + "movdqu %%xmm11, 7*16(%[outbuf])\n\t" + + : /* No output */ + : [inbuf] "r" (inbuf), + [outbuf] "r" (outbuf) + : "memory"); + + outbuf += 8*BLOCKSIZE; + inbuf += 8*BLOCKSIZE; + } + + aesni_cleanup_7_15(); + } +#endif + for ( ;nblocks >= 4; nblocks -= 4) { asm volatile @@ -1260,7 +2021,76 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf, : [iv] "m" (*iv) : "memory"); - for ( ;nblocks > 3 ; nblocks -= 4 ) +#ifdef __x86_64__ + if (nblocks >= 8) + { + aesni_prepare_7_15_variable; + + aesni_prepare_7_15(); + + for ( ;nblocks >= 8 ; nblocks -= 8 ) + { + asm volatile + ("movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */ + "movdqu 1*16(%[inbuf]), %%xmm2\n\t" + "movdqu 2*16(%[inbuf]), %%xmm3\n\t" + "movdqu 3*16(%[inbuf]), %%xmm4\n\t" + "movdqu 4*16(%[inbuf]), %%xmm8\n\t" + "movdqu 5*16(%[inbuf]), %%xmm9\n\t" + "movdqu 
+             "movdqu 6*16(%[inbuf]), %%xmm10\n\t"
+             "movdqu 7*16(%[inbuf]), %%xmm11\n\t"
+
+             "movdqa %%xmm1, %%xmm12\n\t"
+             "movdqa %%xmm2, %%xmm13\n\t"
+             "movdqa %%xmm3, %%xmm14\n\t"
+             "movdqa %%xmm4, %%xmm15\n\t"
+
+             : /* No output */
+             : [inbuf] "r" (inbuf)
+             : "memory");
+
+          do_aesni_dec_vec8 (ctx);
+
+          asm volatile
+            ("pxor %%xmm5, %%xmm1\n\t"  /* xor IV with output */
+
+             "pxor %%xmm12, %%xmm2\n\t" /* xor IV with output */
+             "movdqu 4*16(%[inbuf]), %%xmm12\n\t"
+
+             "pxor %%xmm13, %%xmm3\n\t" /* xor IV with output */
+             "movdqu 5*16(%[inbuf]), %%xmm13\n\t"
+
+             "pxor %%xmm14, %%xmm4\n\t" /* xor IV with output */
+             "movdqu 6*16(%[inbuf]), %%xmm14\n\t"
+
+             "pxor %%xmm15, %%xmm8\n\t" /* xor IV with output */
+             "movdqu 7*16(%[inbuf]), %%xmm5\n\t"
+             "pxor %%xmm12, %%xmm9\n\t" /* xor IV with output */
+             "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+             "pxor %%xmm13, %%xmm10\n\t" /* xor IV with output */
+             "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+             "pxor %%xmm14, %%xmm11\n\t" /* xor IV with output */
+             "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+             "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+             "movdqu %%xmm8, 4*16(%[outbuf])\n\t"
+             "movdqu %%xmm9, 5*16(%[outbuf])\n\t"
+             "movdqu %%xmm10, 6*16(%[outbuf])\n\t"
+             "movdqu %%xmm11, 7*16(%[outbuf])\n\t"
+
+             : /* No output */
+             : [inbuf] "r" (inbuf),
+               [outbuf] "r" (outbuf)
+             : "memory");
+
+          outbuf += 8*BLOCKSIZE;
+          inbuf  += 8*BLOCKSIZE;
+        }
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
+  for ( ;nblocks >= 4 ; nblocks -= 4 )
     {
       asm volatile
         ("movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */
@@ -1386,7 +2216,142 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
       outbuf += BLOCKSIZE;
     }
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      asm volatile ("movdqu %[l0], %%xmm7\n\t"
+                    :
+                    : [l0] "m" (*c->u_mode.ocb.L[0])
+                    : "memory" );
+
+      for ( ;nblocks >= 8 ; nblocks -= 8 )
+        {
+          n += 4;
+          l = ocb_get_l(c, n);
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+          /* Checksum_i = Checksum_{i-1} xor P_i  */
+          /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+
+          asm volatile ("movdqu %[l1], %%xmm10\n\t"
+                        "movdqu %[inbuf0], %%xmm1\n\t"
+                        "pxor %%xmm7, %%xmm5\n\t"
+                        "pxor %%xmm1, %%xmm6\n\t"
+                        "pxor %%xmm5, %%xmm1\n\t"
+                        "movdqa %%xmm5, %%xmm12\n\t"
+                        :
+                        : [l1] "m" (*c->u_mode.ocb.L[1]),
+                          [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+                        : "memory" );
+          asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+                        "pxor %%xmm10, %%xmm5\n\t"
+                        "pxor %%xmm2, %%xmm6\n\t"
+                        "pxor %%xmm5, %%xmm2\n\t"
+                        "movdqa %%xmm5, %%xmm13\n\t"
+                        :
+                        : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+                        : "memory" );
+          asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+                        "pxor %%xmm7, %%xmm5\n\t"
+                        "pxor %%xmm3, %%xmm6\n\t"
+                        "pxor %%xmm5, %%xmm3\n\t"
+                        "movdqa %%xmm5, %%xmm14\n\t"
+                        :
+                        : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+                        : "memory" );
+          asm volatile ("movdqu %[l3], %%xmm15\n\t"
+                        "movdqu %[inbuf3], %%xmm4\n\t"
+                        "pxor %%xmm15, %%xmm5\n\t"
+                        "pxor %%xmm4, %%xmm6\n\t"
+                        "pxor %%xmm5, %%xmm4\n\t"
+                        "movdqa %%xmm5, %%xmm15\n\t"
+                        :
+                        : [l3] "m" (*l),
+                          [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+                        : "memory" );
+
+          n += 4;
+          l = ocb_get_l(c, n);
+
+          asm volatile ("movdqu %[inbuf4], %%xmm8\n\t"
+                        "pxor %%xmm7, %%xmm5\n\t"
+                        "pxor %%xmm8, %%xmm6\n\t"
+                        "pxor %%xmm5, %%xmm8\n\t"
+                        "movdqu %%xmm5, %[outbuf4]\n\t"
+                        : [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE))
+                        : [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE))
+                        : "memory" );
+          asm volatile ("movdqu %[inbuf5], %%xmm9\n\t"
+                        "pxor %%xmm10, %%xmm5\n\t"
+                        "pxor %%xmm9, %%xmm6\n\t"
+                        "pxor %%xmm5, %%xmm9\n\t"
+                        "movdqu %%xmm5, %[outbuf5]\n\t"
+                        : [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE))
+                        : [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
+                        : "memory" );
+          asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
+                        "pxor %%xmm7, %%xmm5\n\t"
+                        "pxor %%xmm10, %%xmm6\n\t"
+                        "pxor %%xmm5, %%xmm10\n\t"
+                        "movdqu %%xmm5, %[outbuf6]\n\t"
+                        : [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE))
+                        : [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE))
+                        : "memory" );
+          asm volatile ("movdqu %[l7], %%xmm11\n\t"
+                        "pxor %%xmm11, %%xmm5\n\t"
+                        "movdqu %[inbuf7], %%xmm11\n\t"
+                        "pxor %%xmm11, %%xmm6\n\t"
+                        "pxor %%xmm5, %%xmm11\n\t"
+                        :
+                        : [l7] "m" (*l),
+                          [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE))
+                        : "memory" );
+
+          do_aesni_enc_vec8 (ctx);
+
+          asm volatile ("pxor %%xmm12, %%xmm1\n\t"
+                        "pxor %%xmm13, %%xmm2\n\t"
+                        "movdqu %[outbuf4],%%xmm0\n\t"
+                        "movdqu %[outbuf5],%%xmm12\n\t"
+                        "movdqu %[outbuf6],%%xmm13\n\t"
+                        "pxor %%xmm14, %%xmm3\n\t"
+                        "pxor %%xmm15, %%xmm4\n\t"
+                        "pxor %%xmm0, %%xmm8\n\t"
+                        "pxor %%xmm12, %%xmm9\n\t"
+                        "pxor %%xmm13, %%xmm10\n\t"
+                        "pxor %%xmm5, %%xmm11\n\t"
+                        "movdqu %%xmm1, %[outbuf0]\n\t"
+                        "movdqu %%xmm2, %[outbuf1]\n\t"
+                        "movdqu %%xmm3, %[outbuf2]\n\t"
+                        "movdqu %%xmm4, %[outbuf3]\n\t"
+                        "movdqu %%xmm8, %[outbuf4]\n\t"
+                        "movdqu %%xmm9, %[outbuf5]\n\t"
+                        "movdqu %%xmm10, %[outbuf6]\n\t"
+                        "movdqu %%xmm11, %[outbuf7]\n\t"
+                        : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
+                          [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)),
+                          [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
+                          [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)),
+                          [outbuf4] "+m" (*(outbuf + 4 * BLOCKSIZE)),
+                          [outbuf5] "+m" (*(outbuf + 5 * BLOCKSIZE)),
+                          [outbuf6] "+m" (*(outbuf + 6 * BLOCKSIZE)),
+                          [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE))
+                        :
+                        : "memory" );
+
+          outbuf += 8*BLOCKSIZE;
+          inbuf  += 8*BLOCKSIZE;
+        }
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
+  for ( ;nblocks >= 4 ; nblocks -= 4 )
     {
       n += 4;
       l = ocb_get_l(c, n);
@@ -1394,9 +2359,9 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* Checksum_i = Checksum_{i-1} xor P_i  */
       /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-      asm volatile ("movdqu %[l0], %%xmm0\n\t"
+      asm volatile ("movdqu %[l0], %%xmm4\n\t"
                     "movdqu %[inbuf0], %%xmm1\n\t"
-                    "pxor   %%xmm0, %%xmm5\n\t"
+                    "pxor   %%xmm4, %%xmm5\n\t"
                     "pxor   %%xmm1, %%xmm6\n\t"
                     "pxor   %%xmm5, %%xmm1\n\t"
                     "movdqu %%xmm5, %[outbuf0]\n\t"
@@ -1414,19 +2379,17 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
                     : [l1] "m" (*c->u_mode.ocb.L[1]),
                       [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
                     : "memory" );
-      asm volatile ("movdqu %[l2], %%xmm0\n\t"
-                    "movdqu %[inbuf2], %%xmm3\n\t"
-                    "pxor   %%xmm0, %%xmm5\n\t"
+      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+                    "pxor   %%xmm4, %%xmm5\n\t"
                     "pxor   %%xmm3, %%xmm6\n\t"
                     "pxor   %%xmm5, %%xmm3\n\t"
                     "movdqu %%xmm5, %[outbuf2]\n\t"
                     : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
-                    : [l2] "m" (*c->u_mode.ocb.L[0]),
-                      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+                    : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
                     : "memory" );
-      asm volatile ("movdqu %[l3], %%xmm0\n\t"
+      asm volatile ("movdqu %[l3], %%xmm4\n\t"
+                    "pxor   %%xmm4, %%xmm5\n\t"
                     "movdqu %[inbuf3], %%xmm4\n\t"
-                    "pxor   %%xmm0, %%xmm5\n\t"
                     "pxor   %%xmm4, %%xmm6\n\t"
                     "pxor   %%xmm5, %%xmm4\n\t"
                     :
@@ -1551,7 +2514,142 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
       outbuf += BLOCKSIZE;
     }
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      asm volatile ("movdqu %[l0], %%xmm7\n\t"
+                    :
+                    : [l0] "m" (*c->u_mode.ocb.L[0])
+                    : "memory" );
+
+      for ( ;nblocks >= 8 ; nblocks -= 8 )
+        {
+          n += 4;
+          l = ocb_get_l(c, n);
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+          /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+          /* Checksum_i = Checksum_{i-1} xor P_i  */
+
+          asm volatile ("movdqu %[l1], %%xmm10\n\t"
+                        "movdqu %[inbuf0], %%xmm1\n\t"
+                        "pxor %%xmm7, %%xmm5\n\t"
+                        "pxor %%xmm5, %%xmm1\n\t"
+                        "movdqa %%xmm5, %%xmm12\n\t"
+                        :
+                        : [l1] "m" (*c->u_mode.ocb.L[1]),
+                          [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+                        : "memory" );
+          asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+                        "pxor %%xmm10, %%xmm5\n\t"
+                        "pxor %%xmm5, %%xmm2\n\t"
+                        "movdqa %%xmm5, %%xmm13\n\t"
+                        :
+                        : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+                        : "memory" );
+          asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+                        "pxor %%xmm7, %%xmm5\n\t"
+                        "pxor %%xmm5, %%xmm3\n\t"
+                        "movdqa %%xmm5, %%xmm14\n\t"
+                        :
+                        : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+                        : "memory" );
+          asm volatile ("movdqu %[l3], %%xmm0\n\t"
+                        "movdqu %[inbuf3], %%xmm4\n\t"
+                        "pxor %%xmm0, %%xmm5\n\t"
+                        "pxor %%xmm5, %%xmm4\n\t"
+                        "movdqa %%xmm5, %%xmm15\n\t"
+                        :
+                        : [l3] "m" (*l),
+                          [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+                        : "memory" );
+
+          n += 4;
+          l = ocb_get_l(c, n);
+
+          asm volatile ("movdqu %[inbuf4], %%xmm8\n\t"
+                        "pxor %%xmm7, %%xmm5\n\t"
+                        "pxor %%xmm5, %%xmm8\n\t"
+                        "movdqu %%xmm5, %[outbuf4]\n\t"
+                        : [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE))
+                        : [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE))
+                        : "memory" );
+          asm volatile ("movdqu %[inbuf5], %%xmm9\n\t"
+                        "pxor %%xmm10, %%xmm5\n\t"
+                        "pxor %%xmm5, %%xmm9\n\t"
+                        "movdqu %%xmm5, %[outbuf5]\n\t"
+                        : [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE))
+                        : [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
+                        : "memory" );
+          asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
+                        "pxor %%xmm7, %%xmm5\n\t"
+                        "pxor %%xmm5, %%xmm10\n\t"
+                        "movdqu %%xmm5, %[outbuf6]\n\t"
+                        : [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE))
+                        : [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE))
+                        : "memory" );
+          asm volatile ("movdqu %[l7], %%xmm0\n\t"
+                        "movdqu %[inbuf7], %%xmm11\n\t"
+                        "pxor %%xmm0, %%xmm5\n\t"
+                        "pxor %%xmm5, %%xmm11\n\t"
+                        :
+                        : [l7] "m" (*l),
+                          [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE))
+                        : "memory" );
+
+          do_aesni_dec_vec8 (ctx);
+
+          asm volatile ("pxor %%xmm12, %%xmm1\n\t"
+                        "pxor %%xmm13, %%xmm2\n\t"
+                        "movdqu %[outbuf4],%%xmm0\n\t"
+                        "movdqu %[outbuf5],%%xmm12\n\t"
+                        "movdqu %[outbuf6],%%xmm13\n\t"
+                        "pxor %%xmm14, %%xmm3\n\t"
+                        "pxor %%xmm15, %%xmm4\n\t"
+                        "pxor %%xmm0, %%xmm8\n\t"
+                        "pxor %%xmm12, %%xmm9\n\t"
+                        "pxor %%xmm13, %%xmm10\n\t"
+                        "pxor %%xmm5, %%xmm11\n\t"
+                        "movdqu %%xmm1, %[outbuf0]\n\t"
+                        "movdqu %%xmm2, %[outbuf1]\n\t"
+                        "movdqu %%xmm3, %[outbuf2]\n\t"
+                        "movdqu %%xmm4, %[outbuf3]\n\t"
+                        "movdqu %%xmm8, %[outbuf4]\n\t"
+                        "movdqu %%xmm9, %[outbuf5]\n\t"
+                        "movdqu %%xmm10, %[outbuf6]\n\t"
+                        "movdqu %%xmm11, %[outbuf7]\n\t"
+                        "pxor %%xmm2, %%xmm1\n\t"
+                        "pxor %%xmm4, %%xmm1\n\t"
+                        "pxor %%xmm9, %%xmm1\n\t"
+                        "pxor %%xmm11, %%xmm1\n\t"
+                        "pxor %%xmm3, %%xmm6\n\t"
+                        "pxor %%xmm8, %%xmm6\n\t"
+                        "pxor %%xmm10, %%xmm6\n\t"
+                        "pxor %%xmm1, %%xmm6\n\t"
+                        : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
+                          [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)),
+                          [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
+                          [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)),
+                          [outbuf4] "+m" (*(outbuf + 4 * BLOCKSIZE)),
+                          [outbuf5] "+m" (*(outbuf + 5 * BLOCKSIZE)),
+                          [outbuf6] "+m" (*(outbuf + 6 * BLOCKSIZE)),
+                          [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE))
+                        :
+                        : "memory" );
+
+          outbuf += 8*BLOCKSIZE;
+          inbuf  += 8*BLOCKSIZE;
+        }
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
+  for ( ;nblocks >= 4 ; nblocks -= 4 )
     {
       n += 4;
      l = ocb_get_l(c, n);
@@ -1559,9 +2657,9 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
       /* Checksum_i = Checksum_{i-1} xor P_i  */
-      asm volatile ("movdqu %[l0], %%xmm0\n\t"
+      asm volatile ("movdqu %[l0], %%xmm4\n\t"
                     "movdqu %[inbuf0], %%xmm1\n\t"
-                    "pxor   %%xmm0, %%xmm5\n\t"
+                    "pxor   %%xmm4, %%xmm5\n\t"
                     "pxor   %%xmm5, %%xmm1\n\t"
                     "movdqu %%xmm5, %[outbuf0]\n\t"
                     : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
@@ -1577,14 +2675,12 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
                     : [l1] "m" (*c->u_mode.ocb.L[1]),
                       [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
                     : "memory" );
-      asm volatile ("movdqu %[l2], %%xmm0\n\t"
-                    "movdqu %[inbuf2], %%xmm3\n\t"
-                    "pxor   %%xmm0, %%xmm5\n\t"
+      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+                    "pxor   %%xmm4, %%xmm5\n\t"
                     "pxor   %%xmm5, %%xmm3\n\t"
                     "movdqu %%xmm5, %[outbuf2]\n\t"
                     : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
-                    : [l2] "m" (*c->u_mode.ocb.L[0]),
-                      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+                    : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
                     : "memory" );
       asm volatile ("movdqu %[l3], %%xmm0\n\t"
                     "movdqu %[inbuf3], %%xmm4\n\t"
@@ -1722,16 +2818,115 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
       abuf += BLOCKSIZE;
     }
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      asm volatile ("movdqu %[l0], %%xmm7\n\t"
+                    "movdqu %[l1], %%xmm12\n\t"
+                    :
+                    : [l0] "m" (*c->u_mode.ocb.L[0]),
+                      [l1] "m" (*c->u_mode.ocb.L[1])
+                    : "memory" );
+
+      for ( ;nblocks >= 8 ; nblocks -= 8 )
+        {
+          n += 4;
+          l = ocb_get_l(c, n);
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+          /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+          asm volatile ("movdqu %[abuf0], %%xmm1\n\t"
+                        "pxor %%xmm7, %%xmm5\n\t"
+                        "pxor %%xmm5, %%xmm1\n\t"
+                        :
+                        : [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
+                        : "memory" );
+          asm volatile ("movdqu %[abuf1], %%xmm2\n\t"
+                        "pxor %%xmm12, %%xmm5\n\t"
+                        "pxor %%xmm5, %%xmm2\n\t"
+                        :
+                        : [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
+                        : "memory" );
+          asm volatile ("movdqu %[abuf2], %%xmm3\n\t"
+                        "pxor %%xmm7, %%xmm5\n\t"
+                        "pxor %%xmm5, %%xmm3\n\t"
+                        :
+                        : [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
+                        : "memory" );
+          asm volatile ("movdqu %[l3], %%xmm0\n\t"
+                        "movdqu %[abuf3], %%xmm4\n\t"
+                        "pxor %%xmm0, %%xmm5\n\t"
+                        "pxor %%xmm5, %%xmm4\n\t"
+                        :
+                        : [l3] "m" (*l),
+                          [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
+                        : "memory" );
+
+          n += 4;
+          l = ocb_get_l(c, n);
+
+          asm volatile ("movdqu %[abuf4], %%xmm8\n\t"
+                        "pxor %%xmm7, %%xmm5\n\t"
+                        "pxor %%xmm5, %%xmm8\n\t"
+                        :
+                        : [abuf4] "m" (*(abuf + 4 * BLOCKSIZE))
+                        : "memory" );
+          asm volatile ("movdqu %[abuf5], %%xmm9\n\t"
+                        "pxor %%xmm12, %%xmm5\n\t"
+                        "pxor %%xmm5, %%xmm9\n\t"
+                        :
+                        : [abuf5] "m" (*(abuf + 5 * BLOCKSIZE))
+                        : "memory" );
+          asm volatile ("movdqu %[abuf6], %%xmm10\n\t"
+                        "pxor %%xmm7, %%xmm5\n\t"
+                        "pxor %%xmm5, %%xmm10\n\t"
+                        :
+                        : [abuf6] "m" (*(abuf + 6 * BLOCKSIZE))
+                        : "memory" );
+          asm volatile ("movdqu %[l7], %%xmm0\n\t"
+                        "movdqu %[abuf7], %%xmm11\n\t"
+                        "pxor %%xmm0, %%xmm5\n\t"
+                        "pxor %%xmm5, %%xmm11\n\t"
+                        :
+                        : [l7] "m" (*l),
+                          [abuf7] "m" (*(abuf + 7 * BLOCKSIZE))
+                        : "memory" );
+
+          do_aesni_enc_vec8 (ctx);
+
+          asm volatile ("pxor %%xmm2, %%xmm1\n\t"
+                        "pxor %%xmm3, %%xmm1\n\t"
+                        "pxor %%xmm4, %%xmm1\n\t"
+                        "pxor %%xmm8, %%xmm1\n\t"
+                        "pxor %%xmm9, %%xmm6\n\t"
+                        "pxor %%xmm10, %%xmm6\n\t"
+                        "pxor %%xmm11, %%xmm6\n\t"
+                        "pxor %%xmm1, %%xmm6\n\t"
+                        :
+                        :
+                        : "memory" );
+
+          abuf += 8*BLOCKSIZE;
+        }
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
+  for ( ;nblocks >= 4 ; nblocks -= 4 )
     {
       n += 4;
       l = ocb_get_l(c, n);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
-      asm volatile ("movdqu %[l0], %%xmm0\n\t"
+      asm volatile ("movdqu %[l0], %%xmm4\n\t"
                     "movdqu %[abuf0], %%xmm1\n\t"
-                    "pxor   %%xmm0, %%xmm5\n\t"
+                    "pxor   %%xmm4, %%xmm5\n\t"
                     "pxor   %%xmm5, %%xmm1\n\t"
                     :
                     : [l0] "m" (*c->u_mode.ocb.L[0]),
@@ -1745,9 +2940,8 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                     : [l1] "m" (*c->u_mode.ocb.L[1]),
                       [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
                     : "memory" );
-      asm volatile ("movdqu %[l2], %%xmm0\n\t"
-                    "movdqu %[abuf2], %%xmm3\n\t"
-                    "pxor   %%xmm0, %%xmm5\n\t"
+      asm volatile ("movdqu %[abuf2], %%xmm3\n\t"
+                    "pxor   %%xmm4, %%xmm5\n\t"
                     "pxor   %%xmm5, %%xmm3\n\t"
                     :
                     : [l2] "m" (*c->u_mode.ocb.L[0]),