author    Jussi Kivilinna <jussi.kivilinna@iki.fi>  2018-01-06 18:53:20 +0200
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>  2018-01-09 18:44:43 +0200
commit    a00c5b2988cea256c7823a76ce601febf02c790f (patch)
tree      035091bfa43e955e41015cf1a42caa4079cb1063 /cipher/rijndael-aesni.c
parent    c9e9cb2eb6a1c659d3825ca627228b732f2f2152 (diff)
Add AES-NI acceleration for AES-XTS
* cipher/cipher-internal.h (gcry_cipher_handle): Change bulk XTS function
to take cipher context.
* cipher/cipher-xts.c (_gcry_cipher_xts_crypt): Ditto.
* cipher/cipher.c (_gcry_cipher_open_internal): Setup AES-NI XTS bulk
function.
* cipher/rijndael-aesni.c (xts_gfmul_const, _gcry_aes_aesni_xts_enc)
(_gcry_aes_aesni_xts_dec, _gcry_aes_aesni_xts_crypt): New.
* cipher/rijndael.c (_gcry_aes_aesni_xts_crypt)
(_gcry_aes_xts_crypt): New.
* src/cipher.h (_gcry_aes_xts_crypt): New.
--

Benchmarks on Intel Core i7-4790K, 4.0 GHz (no turbo):

Before:
 XTS enc |  1.66 ns/B   575.7 MiB/s  6.63 c/B
 XTS dec |  1.66 ns/B   575.5 MiB/s  6.63 c/B

After (~6x faster):
 XTS enc | 0.270 ns/B  3528.5 MiB/s  1.08 c/B
 XTS dec | 0.272 ns/B  3511.5 MiB/s  1.09 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
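The core of the patch is the tweak update: between blocks, the 128-bit
tweak is multiplied by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1,
i.e. shifted left one bit with the constant 0x87 folded into the low
byte on carry-out. A minimal scalar sketch of that doubling step, for
illustration only (the helper name xts_mul2 and the view of the tweak
as two little-endian 64-bit words are assumptions, not part of the
patch):

#include <stdint.h>

/* Multiply the 128-bit tweak by x in GF(2^128), reducing with the XTS
   polynomial constant 0x87.  tweak[0] holds the low 64 bits, tweak[1]
   the high 64 bits, matching how the SSE code keeps it in %xmm5.  */
static void
xts_mul2 (uint64_t tweak[2])
{
  /* Branchless carry: an arithmetic shift smears bit 127 into an
     all-ones mask, the same trick the psrad/pand pair plays in the
     assembly below.  */
  uint64_t carry = (uint64_t)((int64_t)tweak[1] >> 63) & 0x87;

  tweak[1] = (tweak[1] << 1) | (tweak[0] >> 63);
  tweak[0] = (tweak[0] << 1) ^ carry;
}

The SSE sequence in the diff vectorizes exactly this: pshufd $0x13
gathers the two carry-producing dwords, psrad $31 expands their sign
bits into masks, pand against xts_gfmul_const ({0x87, 0x01}) turns them
into the reduction byte for the low quadword and the bit-63-to-bit-64
carry for the high one, and paddq/pxor perform the shift and fold the
carries in, so the update stays branchless and constant-time.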
Diffstat (limited to 'cipher/rijndael-aesni.c')
-rw-r--r--  cipher/rijndael-aesni.c  291
1 file changed, 291 insertions, 0 deletions
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 3d323cf0..50a0745b 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -3007,4 +3007,295 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
}
+static const u64 xts_gfmul_const[16] __attribute__ ((aligned (16))) =
+ { 0x87, 0x01 };
+
+
+static void
+_gcry_aes_aesni_xts_enc (RIJNDAEL_context *ctx, unsigned char *tweak,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ aesni_prepare_2_6_variable;
+
+ aesni_prepare ();
+ aesni_prepare_2_6 ();
+
+ /* Preload Tweak */
+ asm volatile ("movdqu %[tweak], %%xmm5\n\t"
+ "movdqa %[gfmul], %%xmm6\n\t"
+ :
+ : [tweak] "m" (*tweak),
+ [gfmul] "m" (*xts_gfmul_const)
+ : "memory" );
+
+ for ( ;nblocks >= 4; nblocks -= 4 )
+ {
+ asm volatile ("pshufd $0x13, %%xmm5, %%xmm4\n\t"
+ "movdqu %[inbuf0], %%xmm1\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu %%xmm5, %[outbuf0]\n\t"
+
+ "movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %%xmm4, %%xmm4\n\t"
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf0] "=m" (*(outbuf + 0 * 16))
+ : [inbuf0] "m" (*(inbuf + 0 * 16))
+ : "memory" );
+
+ asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "movdqu %%xmm5, %[outbuf1]\n\t"
+
+ "movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %%xmm4, %%xmm4\n\t"
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf1] "=m" (*(outbuf + 1 * 16))
+ : [inbuf1] "m" (*(inbuf + 1 * 16))
+ : "memory" );
+
+ asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqu %%xmm5, %[outbuf2]\n\t"
+
+ "movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %%xmm4, %%xmm4\n\t"
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf2] "=m" (*(outbuf + 2 * 16))
+ : [inbuf2] "m" (*(inbuf + 2 * 16))
+ : "memory" );
+
+ asm volatile ("movdqa %%xmm4, %%xmm0\n\t"
+ "movdqu %[inbuf3], %%xmm4\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqu %%xmm5, %[outbuf3]\n\t"
+
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf3] "=m" (*(outbuf + 3 * 16))
+ : [inbuf3] "m" (*(inbuf + 3 * 16))
+ : "memory" );
+
+ do_aesni_enc_vec4 (ctx);
+
+ asm volatile ("movdqu %[outbuf0], %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t"
+ "movdqu %[outbuf1], %%xmm0\n\t"
+ "movdqu %%xmm1, %[outbuf0]\n\t"
+ "movdqu %[outbuf2], %%xmm1\n\t"
+ "pxor %%xmm0, %%xmm2\n\t"
+ "movdqu %[outbuf3], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm3\n\t"
+ "pxor %%xmm0, %%xmm4\n\t"
+ "movdqu %%xmm2, %[outbuf1]\n\t"
+ "movdqu %%xmm3, %[outbuf2]\n\t"
+ "movdqu %%xmm4, %[outbuf3]\n\t"
+ : [outbuf0] "+m" (*(outbuf + 0 * 16)),
+ [outbuf1] "+m" (*(outbuf + 1 * 16)),
+ [outbuf2] "+m" (*(outbuf + 2 * 16)),
+ [outbuf3] "+m" (*(outbuf + 3 * 16))
+ :
+ : "memory" );
+
+ outbuf += BLOCKSIZE * 4;
+ inbuf += BLOCKSIZE * 4;
+ }
+
+ for ( ;nblocks; nblocks-- )
+ {
+ asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ "movdqa %%xmm5, %%xmm4\n\t"
+
+ "pshufd $0x13, %%xmm5, %%xmm1\n\t"
+ "psrad $31, %%xmm1\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm5\n\t"
+ :
+ : [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_aesni_enc (ctx);
+
+ asm volatile ("pxor %%xmm4, %%xmm0\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ asm volatile ("movdqu %%xmm5, %[tweak]\n\t"
+ : [tweak] "=m" (*tweak)
+ :
+ : "memory" );
+
+ aesni_cleanup ();
+ aesni_cleanup_2_6 ();
+}
+
+
+static void
+_gcry_aes_aesni_xts_dec (RIJNDAEL_context *ctx, unsigned char *tweak,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ aesni_prepare_2_6_variable;
+
+ aesni_prepare ();
+ aesni_prepare_2_6 ();
+
+ /* Preload Tweak */
+ asm volatile ("movdqu %[tweak], %%xmm5\n\t"
+ "movdqa %[gfmul], %%xmm6\n\t"
+ :
+ : [tweak] "m" (*tweak),
+ [gfmul] "m" (*xts_gfmul_const)
+ : "memory" );
+
+ for ( ;nblocks >= 4; nblocks -= 4 )
+ {
+ asm volatile ("pshufd $0x13, %%xmm5, %%xmm4\n\t"
+ "movdqu %[inbuf0], %%xmm1\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu %%xmm5, %[outbuf0]\n\t"
+
+ "movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %%xmm4, %%xmm4\n\t"
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf0] "=m" (*(outbuf + 0 * 16))
+ : [inbuf0] "m" (*(inbuf + 0 * 16))
+ : "memory" );
+
+ asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "movdqu %%xmm5, %[outbuf1]\n\t"
+
+ "movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %%xmm4, %%xmm4\n\t"
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf1] "=m" (*(outbuf + 1 * 16))
+ : [inbuf1] "m" (*(inbuf + 1 * 16))
+ : "memory" );
+
+ asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqu %%xmm5, %[outbuf2]\n\t"
+
+ "movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %%xmm4, %%xmm4\n\t"
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf2] "=m" (*(outbuf + 2 * 16))
+ : [inbuf2] "m" (*(inbuf + 2 * 16))
+ : "memory" );
+
+ asm volatile ("movdqa %%xmm4, %%xmm0\n\t"
+ "movdqu %[inbuf3], %%xmm4\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqu %%xmm5, %[outbuf3]\n\t"
+
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf3] "=m" (*(outbuf + 3 * 16))
+ : [inbuf3] "m" (*(inbuf + 3 * 16))
+ : "memory" );
+
+ do_aesni_dec_vec4 (ctx);
+
+ asm volatile ("movdqu %[outbuf0], %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t"
+ "movdqu %[outbuf1], %%xmm0\n\t"
+ "movdqu %%xmm1, %[outbuf0]\n\t"
+ "movdqu %[outbuf2], %%xmm1\n\t"
+ "pxor %%xmm0, %%xmm2\n\t"
+ "movdqu %[outbuf3], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm3\n\t"
+ "pxor %%xmm0, %%xmm4\n\t"
+ "movdqu %%xmm2, %[outbuf1]\n\t"
+ "movdqu %%xmm3, %[outbuf2]\n\t"
+ "movdqu %%xmm4, %[outbuf3]\n\t"
+ : [outbuf0] "+m" (*(outbuf + 0 * 16)),
+ [outbuf1] "+m" (*(outbuf + 1 * 16)),
+ [outbuf2] "+m" (*(outbuf + 2 * 16)),
+ [outbuf3] "+m" (*(outbuf + 3 * 16))
+ :
+ : "memory" );
+
+ outbuf += BLOCKSIZE * 4;
+ inbuf += BLOCKSIZE * 4;
+ }
+
+ for ( ;nblocks; nblocks-- )
+ {
+ asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ "movdqa %%xmm5, %%xmm4\n\t"
+
+ "pshufd $0x13, %%xmm5, %%xmm1\n\t"
+ "psrad $31, %%xmm1\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm5\n\t"
+ :
+ : [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_aesni_dec (ctx);
+
+ asm volatile ("pxor %%xmm4, %%xmm0\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ asm volatile ("movdqu %%xmm5, %[tweak]\n\t"
+ : [tweak] "=m" (*tweak)
+ :
+ : "memory" );
+
+ aesni_cleanup ();
+ aesni_cleanup_2_6 ();
+}
+
+
+void
+_gcry_aes_aesni_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks, int encrypt)
+{
+ if (encrypt)
+ _gcry_aes_aesni_xts_enc(ctx, tweak, outbuf, inbuf, nblocks);
+ else
+ _gcry_aes_aesni_xts_dec(ctx, tweak, outbuf, inbuf, nblocks);
+}
+
#endif /* USE_AESNI */
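Apart from the tweak arithmetic, the per-block flow both new functions
implement is plain XEX: XOR the block with the current tweak, run the
AES core, XOR with the same tweak again, then double the tweak. Below
is a hedged scalar model of the encryption loop, assuming the xts_mul2
helper sketched above, a hypothetical aes_encrypt_block standing in for
the AES-NI round sequence of do_aesni_enc (both names are illustrative,
not libgcrypt API), and a little-endian host as on x86:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical stand-in for the expanded-key AES block encryption
   performed by do_aesni_enc.  */
extern void aes_encrypt_block (const void *keysched, uint8_t block[16]);

static void
xts_enc_reference (const void *keysched, uint8_t *tweak,
                   uint8_t *out, const uint8_t *in, size_t nblocks)
{
  uint64_t t[2];
  const uint8_t *tb = (const uint8_t *)t;
  uint8_t block[16];
  size_t i;

  memcpy (t, tweak, 16);

  for (; nblocks; nblocks--)
    {
      for (i = 0; i < 16; i++)               /* PP = P ^ T */
        block[i] = in[i] ^ tb[i];

      aes_encrypt_block (keysched, block);   /* CC = E_K(PP) */

      for (i = 0; i < 16; i++)               /* C = CC ^ T */
        out[i] = block[i] ^ tb[i];

      xts_mul2 (t);                          /* T *= x in GF(2^128) */
      in += 16;
      out += 16;
    }

  memcpy (tweak, t, 16);   /* hand the updated tweak back, as the
                              assembly does with the final movdqu */
}

The decryption path is identical except that do_aesni_dec_vec4 and
do_aesni_dec replace the encryption cores; the tweak chain is the same
in both directions. One detail worth noting in the 4-way loop: with the
running tweak in %xmm5, the gfmul constant in %xmm6, and the four
blocks occupying %xmm1..%xmm4, there is no register left for the four
per-block tweaks, so they are stashed in the output buffer and reloaded
from there for the XOR-back pass after do_aesni_enc_vec4.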