 cipher/cipher-internal.h           |   2
 cipher/cipher.c                    |  41
 cipher/rijndael-aesni.c            | 160
 cipher/rijndael-armv8-aarch32-ce.S | 152
 cipher/rijndael-armv8-aarch64-ce.S | 125
 cipher/rijndael-armv8-ce.c         | 124
 cipher/rijndael-vaes-avx2-amd64.S  | 432
 cipher/rijndael-vaes.c             |  26
 cipher/rijndael.c                  |  12
 9 files changed, 997 insertions(+), 77 deletions(-)
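
This commit adds a bulk ecb_crypt entry to cipher_bulk_ops and wires it up for the AES-NI, VAES/AVX2 and ARMv8 Crypto Extension implementations, so ECB can hand a whole run of blocks to the backend in one call instead of invoking spec->encrypt/decrypt once per block. The accelerated path is reached through the ordinary public API; the following minimal sketch shows an AES-128-ECB call that would dispatch into the new do_ecb_crypt() bulk branch on capable hardware. The key and buffer contents are placeholders, not test vectors, and note that ECB rejects inputs that are not a whole number of 16-byte blocks (the GPG_ERR_INV_LENGTH check kept in the hunk below).

/* Minimal sketch: AES-128-ECB via libgcrypt's public API.
   Key and plaintext are placeholders, not test vectors. */
#include <gcrypt.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  gcry_cipher_hd_t hd;
  unsigned char key[16];
  unsigned char buf[4 * 16];       /* four whole AES blocks */
  gcry_error_t err;

  gcry_check_version (NULL);
  memset (key, 0x2a, sizeof key);  /* placeholder key */
  memset (buf, 0x00, sizeof buf);  /* placeholder plaintext */

  err = gcry_cipher_open (&hd, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_ECB, 0);
  if (!err)
    err = gcry_cipher_setkey (hd, key, sizeof key);
  if (!err)
    /* In-place encrypt; length must be a multiple of the block size. */
    err = gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0);
  if (err)
    fprintf (stderr, "ECB failed: %s\n", gcry_strerror (err));
  gcry_cipher_close (hd);
  return !!err;
}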
diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 66b75955..4e022f38 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -161,6 +161,8 @@ typedef struct cipher_mode_ops not NULL. */ typedef struct cipher_bulk_ops { + void (*ecb_crypt)(void *context, void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt); void (*cfb_enc)(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); void (*cfb_dec)(void *context, unsigned char *iv, void *outbuf_arg, diff --git a/cipher/cipher.c b/cipher/cipher.c index 6c335aec..026c1511 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -983,14 +983,11 @@ cipher_reset (gcry_cipher_hd_t c) static gcry_err_code_t -do_ecb_crypt (gcry_cipher_hd_t c, - unsigned char *outbuf, size_t outbuflen, - const unsigned char *inbuf, size_t inbuflen, - gcry_cipher_encrypt_t crypt_fn) +do_ecb_crypt (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, + const unsigned char *inbuf, size_t inbuflen, int encrypt) { unsigned int blocksize = c->spec->blocksize; size_t n, nblocks; - unsigned int burn, nburn; if (outbuflen < inbuflen) return GPG_ERR_BUFFER_TOO_SHORT; @@ -998,18 +995,32 @@ do_ecb_crypt (gcry_cipher_hd_t c, return GPG_ERR_INV_LENGTH; nblocks = inbuflen / blocksize; - burn = 0; - for (n=0; n < nblocks; n++ ) + if (nblocks == 0) + return 0; + + if (c->bulk.ecb_crypt) { - nburn = crypt_fn (&c->context.c, outbuf, inbuf); - burn = nburn > burn ? nburn : burn; - inbuf += blocksize; - outbuf += blocksize; + c->bulk.ecb_crypt (&c->context.c, outbuf, inbuf, nblocks, encrypt); } + else + { + gcry_cipher_encrypt_t crypt_fn = + encrypt ? c->spec->encrypt : c->spec->decrypt; + unsigned int burn = 0; + unsigned int nburn; - if (burn > 0) - _gcry_burn_stack (burn + 4 * sizeof(void *)); + for (n = 0; n < nblocks; n++) + { + nburn = crypt_fn (&c->context.c, outbuf, inbuf); + burn = nburn > burn ? 
nburn : burn; + inbuf += blocksize; + outbuf += blocksize; + } + + if (burn > 0) + _gcry_burn_stack (burn + 4 * sizeof(void *)); + } return 0; } @@ -1019,7 +1030,7 @@ do_ecb_encrypt (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen) { - return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, c->spec->encrypt); + return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, 1); } static gcry_err_code_t @@ -1027,7 +1038,7 @@ do_ecb_decrypt (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen) { - return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, c->spec->decrypt); + return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, 0); } diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c index 156af015..906737a6 100644 --- a/cipher/rijndael-aesni.c +++ b/cipher/rijndael-aesni.c @@ -870,7 +870,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx) "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xa0(%[key]), %%xmm0\n\t" - "jb .Ldeclast%=\n\t" + "jb .Lenclast%=\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" @@ -889,7 +889,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx) "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xc0(%[key]), %%xmm0\n\t" - "je .Ldeclast%=\n\t" + "je .Lenclast%=\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" @@ -909,7 +909,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx) "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xe0(%[key]), %%xmm0\n" - ".Ldeclast%=:\n\t" + ".Lenclast%=:\n\t" : /* no output */ : [key] "r" (ctx->keyschenc), [rounds] "r" (ctx->rounds) @@ -1718,6 +1718,160 @@ _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst, void ASM_FUNC_ATTR +_gcry_aes_aesni_ecb_crypt (RIJNDAEL_context *ctx, unsigned char *dst, + const unsigned char *src, size_t nblocks, + int encrypt) +{ + aesni_prepare_2_7_variable; + + aesni_prepare (); + aesni_prepare_2_7(); + + if (!encrypt && !ctx->decryption_prepared) + { + do_aesni_prepare_decryption ( ctx ); + ctx->decryption_prepared = 1; + } + +#ifdef __x86_64__ + if (nblocks >= 8) + { + const void *key = encrypt ? 
ctx->keyschenc : ctx->keyschdec; + aesni_prepare_8_15_variable; + + aesni_prepare_8_15(); + + for (; nblocks >= 8; nblocks -= 8) + { + asm volatile + ("movdqa (%[key]), %%xmm0\n\t" + "movdqu 0*16(%[src]), %%xmm1\n\t" + "movdqu 1*16(%[src]), %%xmm2\n\t" + "movdqu 2*16(%[src]), %%xmm3\n\t" + "movdqu 3*16(%[src]), %%xmm4\n\t" + "movdqu 4*16(%[src]), %%xmm8\n\t" + "movdqu 5*16(%[src]), %%xmm9\n\t" + "movdqu 6*16(%[src]), %%xmm10\n\t" + "movdqu 7*16(%[src]), %%xmm11\n\t" + "pxor %%xmm0, %%xmm1\n\t" + "pxor %%xmm0, %%xmm2\n\t" + "pxor %%xmm0, %%xmm3\n\t" + "pxor %%xmm0, %%xmm4\n\t" + "pxor %%xmm0, %%xmm8\n\t" + "pxor %%xmm0, %%xmm9\n\t" + "pxor %%xmm0, %%xmm10\n\t" + "pxor %%xmm0, %%xmm11\n\t" + : /* No output */ + : [src] "r" (src), + [key] "r" (key) + : "memory"); + + if (encrypt) + { + do_aesni_enc_vec8 (ctx); + asm volatile + ("aesenclast %%xmm0, %%xmm1\n\t" + "aesenclast %%xmm0, %%xmm2\n\t" + "aesenclast %%xmm0, %%xmm3\n\t" + "aesenclast %%xmm0, %%xmm4\n\t" + "aesenclast %%xmm0, %%xmm8\n\t" + "aesenclast %%xmm0, %%xmm9\n\t" + "aesenclast %%xmm0, %%xmm10\n\t" + "aesenclast %%xmm0, %%xmm11\n\t" + ::: "memory" ); + } + else + { + do_aesni_dec_vec8 (ctx); + asm volatile + ("aesdeclast %%xmm0, %%xmm1\n\t" + "aesdeclast %%xmm0, %%xmm2\n\t" + "aesdeclast %%xmm0, %%xmm3\n\t" + "aesdeclast %%xmm0, %%xmm4\n\t" + "aesdeclast %%xmm0, %%xmm8\n\t" + "aesdeclast %%xmm0, %%xmm9\n\t" + "aesdeclast %%xmm0, %%xmm10\n\t" + "aesdeclast %%xmm0, %%xmm11\n\t" + ::: "memory" ); + } + + asm volatile + ("movdqu %%xmm1, 0*16(%[dst])\n\t" + "movdqu %%xmm2, 1*16(%[dst])\n\t" + "movdqu %%xmm3, 2*16(%[dst])\n\t" + "movdqu %%xmm4, 3*16(%[dst])\n\t" + "movdqu %%xmm8, 4*16(%[dst])\n\t" + "movdqu %%xmm9, 5*16(%[dst])\n\t" + "movdqu %%xmm10, 6*16(%[dst])\n\t" + "movdqu %%xmm11, 7*16(%[dst])\n\t" + : /* No output */ + : [dst] "r" (dst) + : "memory"); + + dst += 8*BLOCKSIZE; + src += 8*BLOCKSIZE; + } + + aesni_cleanup_8_15(); + } +#endif + + for (; nblocks >= 4; nblocks -= 4) + { + asm volatile + ("movdqu 0*16(%[src]), %%xmm1\n\t" + "movdqu 1*16(%[src]), %%xmm2\n\t" + "movdqu 2*16(%[src]), %%xmm3\n\t" + "movdqu 3*16(%[src]), %%xmm4\n\t" + : /* No output */ + : [src] "r" (src) + : "memory"); + + if (encrypt) + do_aesni_enc_vec4 (ctx); + else + do_aesni_dec_vec4 (ctx); + + asm volatile + ("movdqu %%xmm1, 0*16(%[dst])\n\t" + "movdqu %%xmm2, 1*16(%[dst])\n\t" + "movdqu %%xmm3, 2*16(%[dst])\n\t" + "movdqu %%xmm4, 3*16(%[dst])\n\t" + : /* No output */ + : [dst] "r" (dst) + : "memory"); + + dst += 4*BLOCKSIZE; + src += 4*BLOCKSIZE; + } + + for (; nblocks; nblocks--) + { + asm volatile ("movdqu %[src], %%xmm0\n\t" + : + : [src] "m" (*src) + : "memory" ); + + if (encrypt) + do_aesni_enc (ctx); + else + do_aesni_dec (ctx); + + asm volatile ("movdqu %%xmm0, %[dst]\n\t" + : [dst] "=m" (*dst) + : + : "memory" ); + + dst += BLOCKSIZE; + src += BLOCKSIZE; + } + + aesni_cleanup (); + aesni_cleanup_2_7 (); +} + + +void ASM_FUNC_ATTR _gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *iv, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S index 1eafa93e..6208652b 100644 --- a/cipher/rijndael-armv8-aarch32-ce.S +++ b/cipher/rijndael-armv8-aarch32-ce.S @@ -654,6 +654,149 @@ _gcry_aes_cbc_dec_armv8_ce: /* + * void _gcry_aes_ecb_enc_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * size_t nblocks, + * unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_ecb_enc_armv8_ce +.type 
_gcry_aes_ecb_enc_armv8_ce,%function; +_gcry_aes_ecb_enc_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: nblocks + * %st+0: nrounds => r4 + */ + + push {r4-r6,lr} /* 4*4 = 16b */ + cmp r3, #0 + beq .Lecb_enc_skip + ldr r4, [sp, #(16+0)] + vpush {q4-q7} + + cmp r4, #12 + aes_preload_keys(r0, lr); + + beq .Lecb_entry_192e + bhi .Lecb_entry_256e + +#define ECB_CRYPT(bits, e_d, mc_imc, ...) \ + .Lecb_entry_##bits##e_d: \ + cmp r3, #4; \ + blo .Lecb_loop_##bits##e_d; \ + \ + .Lecb_loop4_##bits##e_d: \ + vld1.8 {q1-q2}, [r2]!; /* load plaintext */ \ + sub r3, r3, #4; \ + vld1.8 {q3-q4}, [r2]!; /* load plaintext */ \ + cmp r3, #4; \ + \ + do_aes_4_##bits(e_d, mc_imc, q1, q2, q3, q4, ##__VA_ARGS__); \ + \ + vst1.8 {q1-q2}, [r1]!; /* store ciphertext */ \ + vst1.8 {q3-q4}, [r1]!; /* store ciphertext */ \ + \ + bhs .Lecb_loop4_##bits##e_d; \ + cmp r3, #0; \ + beq .Lecb_done_##e_d; \ + \ + .Lecb_loop_##bits##e_d: \ + vld1.8 {q1}, [r2]!; /* load ciphertext */ \ + subs r3, r3, #1; \ + \ + do_aes_one##bits(e_d, mc_imc, q1, q1, ##__VA_ARGS__); \ + \ + vst1.8 {q1}, [r1]!; /* store plaintext */ \ + bne .Lecb_loop_##bits##e_d; \ + b .Lecb_done_##e_d; + + ECB_CRYPT(128, e, mc) + ECB_CRYPT(192, e, mc, r0, lr) + ECB_CRYPT(256, e, mc, r0, lr) + +.Lecb_done_e: + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + vpop {q4-q7} + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lecb_enc_skip: + pop {r4-r6,pc} +.size _gcry_aes_ecb_enc_armv8_ce,.-_gcry_aes_ecb_enc_armv8_ce; + + +/* + * void _gcry_aes_ecb_dec_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * size_t nblocks, + * unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_ecb_dec_armv8_ce +.type _gcry_aes_ecb_dec_armv8_ce,%function; +_gcry_aes_ecb_dec_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: nblocks + * %st+0: nrounds => r4 + */ + + push {r4-r6,lr} /* 4*4 = 16b */ + cmp r3, #0 + beq .Lecb_enc_skip + ldr r4, [sp, #(16+0)] + vpush {q4-q7} + + cmp r4, #12 + + aes_preload_keys(r0, lr); + + beq .Lecb_entry_192d + bhi .Lecb_entry_256d + + ECB_CRYPT(128, d, imc) + ECB_CRYPT(192, d, imc, r0, lr) + ECB_CRYPT(256, d, imc, r0, lr) + +#undef ECB_CRYPT + +.Lecb_done_d: + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + vpop {q4-q7} + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lecb_dec_skip: + pop {r4-r6,pc} +.size _gcry_aes_ecb_dec_armv8_ce,.-_gcry_aes_ecb_dec_armv8_ce; + + +/* * void _gcry_aes_cfb_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, @@ -1138,7 +1281,7 @@ _gcry_aes_ctr32le_enc_armv8_ce: /* - * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, @@ -1305,6 +1448,7 @@ _gcry_aes_ocb_enc_armv8_ce: CLEAR_REG(q13) CLEAR_REG(q14) + mov r0, #0 pop {r4-r12,lr} vpop {q4-q7} bx lr @@ -1312,7 +1456,7 @@ _gcry_aes_ocb_enc_armv8_ce: /* - * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, @@ -1479,6 +1623,7 @@ _gcry_aes_ocb_dec_armv8_ce: CLEAR_REG(q13) CLEAR_REG(q14) + mov r0, #0 pop {r4-r12,lr} vpop {q4-q7} bx lr @@ -1486,7 +1631,7 @@ _gcry_aes_ocb_dec_armv8_ce: /* - * 
void _gcry_aes_ocb_auth_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_auth_armv8_ce (const void *keysched, * const unsigned char *abuf, * unsigned char *offset, * unsigned char *checksum, @@ -1632,6 +1777,7 @@ _gcry_aes_ocb_auth_armv8_ce: CLEAR_REG(q13) CLEAR_REG(q14) + mov r0, #0 pop {r4-r12,lr} vpop {q4-q7} bx lr diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S index 4fef0345..97d3d7eb 100644 --- a/cipher/rijndael-armv8-aarch64-ce.S +++ b/cipher/rijndael-armv8-aarch64-ce.S @@ -386,6 +386,119 @@ ELF(.size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce;) /* + * void _gcry_aes_ecb_enc_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * size_t nblocks, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_ecb_enc_armv8_ce +ELF(.type _gcry_aes_ecb_enc_armv8_ce,%function;) +_gcry_aes_ecb_enc_armv8_ce: + /* input: + * x0: keysched + * x1: outbuf + * x2: inbuf + * x3: nblocks + * w4: nrounds + */ + CFI_STARTPROC(); + + cbz x3, .Lecb_enc_skip + + aes_preload_keys(x0, w4); + + b.eq .Lecb_entry_192e + b.hi .Lecb_entry_256e + +#define ECB_CRYPT(bits, e_d, mc_imc) \ + .Lecb_entry_##bits##e_d: \ + cmp x3, #4; \ + b.lo .Lecb_loop_##bits##e_d; \ + \ + .Lecb_loop4_##bits##e_d: \ + sub x3, x3, #4; \ + ld1 {v0.16b-v3.16b}, [x2], #64; /* load ciphertext */ \ + cmp x3, #4; \ + do_aes_4_##bits(e_d, mc_imc, v0, v1, v2, v3); \ + st1 {v0.16b-v3.16b}, [x1], #64; /* store plaintext */ \ + \ + b.hs .Lecb_loop4_##bits##e_d; \ + CLEAR_REG(v1); \ + CLEAR_REG(v2); \ + CLEAR_REG(v3); \ + cbz x3, .Lecb_done_##e_d; \ + \ + .Lecb_loop_##bits##e_d: \ + ld1 {v0.16b}, [x2], #16; /* load ciphertext */ \ + sub x3, x3, #1; \ + do_aes_one##bits(e_d, mc_imc, v0, v0, vk0); \ + st1 {v0.16b}, [x1], #16; /* store plaintext */ \ + \ + cbnz x3, .Lecb_loop_##bits##e_d; \ + b .Lecb_done_##e_d; + + ECB_CRYPT(128, e, mc) + ECB_CRYPT(192, e, mc) + ECB_CRYPT(256, e, mc) + +.Lecb_done_e: + aes_clear_keys(w4) + + CLEAR_REG(v0) + +.Lecb_enc_skip: + ret_spec_stop + CFI_ENDPROC(); +ELF(.size _gcry_aes_ecb_enc_armv8_ce,.-_gcry_aes_ecb_enc_armv8_ce;) + + +/* + * void _gcry_aes_ecb_dec_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * size_t nblocks, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_ecb_dec_armv8_ce +ELF(.type _gcry_aes_ecb_dec_armv8_ce,%function;) +_gcry_aes_ecb_dec_armv8_ce: + /* input: + * x0: keysched + * x1: outbuf + * x2: inbuf + * x3: nblocks + * w4: nrounds + */ + CFI_STARTPROC(); + + cbz x3, .Lecb_enc_skip + + aes_preload_keys(x0, w4); + + b.eq .Lecb_entry_192d + b.hi .Lecb_entry_256d + + ECB_CRYPT(128, d, imc) + ECB_CRYPT(192, d, imc) + ECB_CRYPT(256, d, imc) + +#undef ECB_CRYPT + +.Lecb_done_d: + aes_clear_keys(w4) + + CLEAR_REG(v0) + +.Lecb_dec_skip: + ret_spec_stop + CFI_ENDPROC(); +ELF(.size _gcry_aes_ecb_dec_armv8_ce,.-_gcry_aes_ecb_dec_armv8_ce;) + + +/* * void _gcry_aes_cbc_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, @@ -471,7 +584,8 @@ ELF(.size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce;) * void _gcry_aes_cbc_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, - * unsigned char *iv, unsigned int nrounds); + * unsigned char *iv, + * size_t nblocks, unsigned int nrounds); */ .align 3 @@ -1136,7 +1250,7 @@ ELF(.size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce;) /* - * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_enc_armv8_ce (const 
void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, @@ -1379,13 +1493,14 @@ _gcry_aes_ocb_enc_armv8_ce: add sp, sp, #128; CFI_ADJUST_CFA_OFFSET(-128); + mov x0, #0 ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce;) /* - * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, @@ -1458,13 +1573,14 @@ _gcry_aes_ocb_dec_armv8_ce: add sp, sp, #128; CFI_ADJUST_CFA_OFFSET(-128); + mov x0, #0 ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce;) /* - * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_auth_armv8_ce (const void *keysched, * const unsigned char *abuf, * unsigned char *offset, * unsigned char *checksum, @@ -1605,6 +1721,7 @@ _gcry_aes_ocb_auth_armv8_ce: CLEAR_REG(v2) CLEAR_REG(v16) + mov x0, #0 ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce;) diff --git a/cipher/rijndael-armv8-ce.c b/cipher/rijndael-armv8-ce.c index c9c37654..042b7d42 100644 --- a/cipher/rijndael-armv8-ce.c +++ b/cipher/rijndael-armv8-ce.c @@ -80,32 +80,32 @@ extern void _gcry_aes_ctr32le_enc_armv8_ce (const void *keysched, unsigned char *iv, size_t nblocks, unsigned int nrounds); -extern void _gcry_aes_ocb_enc_armv8_ce (const void *keysched, - unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *offset, - unsigned char *checksum, - unsigned char *L_table, - size_t nblocks, - unsigned int nrounds, - unsigned int blkn); -extern void _gcry_aes_ocb_dec_armv8_ce (const void *keysched, - unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *offset, - unsigned char *checksum, - unsigned char *L_table, - size_t nblocks, - unsigned int nrounds, - unsigned int blkn); -extern void _gcry_aes_ocb_auth_armv8_ce (const void *keysched, - const unsigned char *abuf, - unsigned char *offset, - unsigned char *checksum, - unsigned char *L_table, - size_t nblocks, - unsigned int nrounds, - unsigned int blkn); +extern size_t _gcry_aes_ocb_enc_armv8_ce (const void *keysched, + unsigned char *outbuf, + const unsigned char *inbuf, + unsigned char *offset, + unsigned char *checksum, + unsigned char *L_table, + size_t nblocks, + unsigned int nrounds, + unsigned int blkn); +extern size_t _gcry_aes_ocb_dec_armv8_ce (const void *keysched, + unsigned char *outbuf, + const unsigned char *inbuf, + unsigned char *offset, + unsigned char *checksum, + unsigned char *L_table, + size_t nblocks, + unsigned int nrounds, + unsigned int blkn); +extern size_t _gcry_aes_ocb_auth_armv8_ce (const void *keysched, + const unsigned char *abuf, + unsigned char *offset, + unsigned char *checksum, + unsigned char *L_table, + size_t nblocks, + unsigned int nrounds, + unsigned int blkn); extern void _gcry_aes_xts_enc_armv8_ce (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, @@ -116,17 +116,14 @@ extern void _gcry_aes_xts_dec_armv8_ce (const void *keysched, const unsigned char *inbuf, unsigned char *tweak, size_t nblocks, unsigned int nrounds); - -typedef void (*ocb_crypt_fn_t) (const void *keysched, unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *offset, unsigned char *checksum, - unsigned char *L_table, size_t nblocks, - unsigned int nrounds, unsigned int blkn); - -typedef void (*xts_crypt_fn_t) (const void *keysched, unsigned char *outbuf, - 
const unsigned char *inbuf, - unsigned char *tweak, size_t nblocks, - unsigned int nrounds); +extern void _gcry_aes_ecb_enc_armv8_ce (const void *keysched, + unsigned char *outbuf, + const unsigned char *inbuf, + size_t nblocks, unsigned int nrounds); +extern void _gcry_aes_ecb_dec_armv8_ce (const void *keysched, + unsigned char *outbuf, + const unsigned char *inbuf, + size_t nblocks, unsigned int nrounds); void @@ -312,8 +309,6 @@ _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, { RIJNDAEL_context *ctx = (void *)&c->context.c; const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32; - ocb_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_ocb_enc_armv8_ce - : _gcry_aes_ocb_dec_armv8_ce; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int nrounds = ctx->rounds; @@ -327,10 +322,16 @@ _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, c->u_mode.ocb.data_nblocks = blkn + nblocks; - crypt_fn(keysched, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, - c->u_mode.ocb.L[0], nblocks, nrounds, (unsigned int)blkn); - - return 0; + if (encrypt) + return _gcry_aes_ocb_enc_armv8_ce (keysched, outbuf, inbuf, + c->u_iv.iv, c->u_ctr.ctr, + c->u_mode.ocb.L[0], nblocks, nrounds, + (unsigned int)blkn); + else + return _gcry_aes_ocb_dec_armv8_ce (keysched, outbuf, inbuf, + c->u_iv.iv, c->u_ctr.ctr, + c->u_mode.ocb.L[0], nblocks, nrounds, + (unsigned int)blkn); } size_t @@ -345,11 +346,9 @@ _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg, c->u_mode.ocb.aad_nblocks = blkn + nblocks; - _gcry_aes_ocb_auth_armv8_ce(keysched, abuf, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum, c->u_mode.ocb.L[0], - nblocks, nrounds, (unsigned int)blkn); - - return 0; + return _gcry_aes_ocb_auth_armv8_ce (keysched, abuf, c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum, c->u_mode.ocb.L[0], + nblocks, nrounds, (unsigned int)blkn); } void @@ -358,8 +357,6 @@ _gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak, size_t nblocks, int encrypt) { const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32; - xts_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_xts_enc_armv8_ce - : _gcry_aes_xts_dec_armv8_ce; unsigned int nrounds = ctx->rounds; if ( !encrypt && !ctx->decryption_prepared ) @@ -368,7 +365,32 @@ _gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak, ctx->decryption_prepared = 1; } - crypt_fn(keysched, outbuf, inbuf, tweak, nblocks, nrounds); + if (encrypt) + _gcry_aes_xts_enc_armv8_ce (keysched, outbuf, inbuf, tweak, + nblocks, nrounds); + else + _gcry_aes_xts_dec_armv8_ce (keysched, outbuf, inbuf, tweak, + nblocks, nrounds); } +void +_gcry_aes_armv8_ce_ecb_crypt (void *context, void *outbuf, + const void *inbuf, size_t nblocks, + int encrypt) +{ + RIJNDAEL_context *ctx = context; + const void *keysched = encrypt ? 
ctx->keyschenc32 : ctx->keyschdec32; + unsigned int nrounds = ctx->rounds; + + if ( !encrypt && !ctx->decryption_prepared ) + { + _gcry_aes_armv8_ce_prepare_decryption ( ctx ); + ctx->decryption_prepared = 1; + } + + if (encrypt) + _gcry_aes_ecb_enc_armv8_ce (keysched, outbuf, inbuf, nblocks, nrounds); + else + _gcry_aes_ecb_dec_armv8_ce (keysched, outbuf, inbuf, nblocks, nrounds); +} #endif /* USE_ARM_CE */ diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S index e36e82a0..655fdf55 100644 --- a/cipher/rijndael-vaes-avx2-amd64.S +++ b/cipher/rijndael-vaes-avx2-amd64.S @@ -2357,7 +2357,7 @@ _gcry_vaes_avx2_ocb_crypt_amd64: ELF(.size _gcry_vaes_avx2_ocb_crypt_amd64,.-_gcry_vaes_avx2_ocb_crypt_amd64) /********************************************************************** - CTR-mode encryption + XTS-mode encryption **********************************************************************/ ELF(.type _gcry_vaes_avx2_xts_crypt_amd64,@function) .globl _gcry_vaes_avx2_xts_crypt_amd64 @@ -2874,6 +2874,436 @@ _gcry_vaes_avx2_xts_crypt_amd64: ELF(.size _gcry_vaes_avx2_xts_crypt_amd64,.-_gcry_vaes_avx2_xts_crypt_amd64) /********************************************************************** + ECB-mode encryption + **********************************************************************/ +ELF(.type _gcry_vaes_avx2_ecb_crypt_amd64,@function) +.globl _gcry_vaes_avx2_ecb_crypt_amd64 +_gcry_vaes_avx2_ecb_crypt_amd64: + /* input: + * %rdi: round keys + * %esi: encrypt + * %rdx: dst + * %rcx: src + * %r8: nblocks + * %r9: nrounds + */ + CFI_STARTPROC(); + + /* Process 16 blocks per loop. */ +.align 8 +.Lecb_blk16: + cmpq $16, %r8; + jb .Lecb_blk8; + + leaq -16(%r8), %r8; + + /* Load input and xor first key. */ + vbroadcasti128 (0 * 16)(%rdi), %ymm8; + vmovdqu (0 * 16)(%rcx), %ymm0; + vmovdqu (2 * 16)(%rcx), %ymm1; + vmovdqu (4 * 16)(%rcx), %ymm2; + vmovdqu (6 * 16)(%rcx), %ymm3; + vmovdqu (8 * 16)(%rcx), %ymm4; + vmovdqu (10 * 16)(%rcx), %ymm5; + vmovdqu (12 * 16)(%rcx), %ymm6; + vmovdqu (14 * 16)(%rcx), %ymm7; + vpxor %ymm8, %ymm0, %ymm0; + vpxor %ymm8, %ymm1, %ymm1; + vpxor %ymm8, %ymm2, %ymm2; + vpxor %ymm8, %ymm3, %ymm3; + vpxor %ymm8, %ymm4, %ymm4; + vpxor %ymm8, %ymm5, %ymm5; + vpxor %ymm8, %ymm6, %ymm6; + vpxor %ymm8, %ymm7, %ymm7; + vbroadcasti128 (1 * 16)(%rdi), %ymm8; + leaq (16 * 16)(%rcx), %rcx; + + testl %esi, %esi; + jz .Lecb_dec_blk16; + /* AES rounds */ + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (2 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (3 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (4 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (5 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (6 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (7 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (8 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (9 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (10 * 16)(%rdi), %ymm8; + cmpl $12, %r9d; + jb .Lecb_enc_blk16_last; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, 
%ymm5, %ymm6, %ymm7); + vbroadcasti128 (11 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (12 * 16)(%rdi), %ymm8; + jz .Lecb_enc_blk16_last; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (13 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (14 * 16)(%rdi), %ymm8; + .Lecb_enc_blk16_last: + vaesenclast %ymm8, %ymm0, %ymm0; + vaesenclast %ymm8, %ymm1, %ymm1; + vaesenclast %ymm8, %ymm2, %ymm2; + vaesenclast %ymm8, %ymm3, %ymm3; + vaesenclast %ymm8, %ymm4, %ymm4; + vaesenclast %ymm8, %ymm5, %ymm5; + vaesenclast %ymm8, %ymm6, %ymm6; + vaesenclast %ymm8, %ymm7, %ymm7; + jmp .Lecb_blk16_end; + + .align 8 + .Lecb_dec_blk16: + /* AES rounds */ + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (2 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (3 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (4 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (5 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (6 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (7 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (8 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (9 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (10 * 16)(%rdi), %ymm8; + cmpl $12, %r9d; + jb .Lecb_dec_blk16_last; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (11 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (12 * 16)(%rdi), %ymm8; + jz .Lecb_dec_blk16_last; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (13 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (14 * 16)(%rdi), %ymm8; + .Lecb_dec_blk16_last: + vaesdeclast %ymm8, %ymm0, %ymm0; + vaesdeclast %ymm8, %ymm1, %ymm1; + vaesdeclast %ymm8, %ymm2, %ymm2; + vaesdeclast %ymm8, %ymm3, %ymm3; + vaesdeclast %ymm8, %ymm4, %ymm4; + vaesdeclast %ymm8, %ymm5, %ymm5; + vaesdeclast %ymm8, %ymm6, %ymm6; + vaesdeclast %ymm8, %ymm7, %ymm7; + jmp .Lecb_blk16_end; + + .align 8 + .Lecb_blk16_end: + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + vmovdqu %ymm4, (8 * 16)(%rdx); + vmovdqu %ymm5, (10 * 16)(%rdx); + vmovdqu %ymm6, (12 * 16)(%rdx); + vmovdqu %ymm7, (14 * 16)(%rdx); + leaq (16 * 16)(%rdx), %rdx; + + jmp .Lecb_blk16; + + /* Handle trailing eight blocks. */ +.align 8 +.Lecb_blk8: + cmpq $8, %r8; + jb .Lecb_blk4; + + leaq -8(%r8), %r8; + + /* Load input and xor first key. 
*/ + vbroadcasti128 (0 * 16)(%rdi), %ymm4; + vmovdqu (0 * 16)(%rcx), %ymm0; + vmovdqu (2 * 16)(%rcx), %ymm1; + vmovdqu (4 * 16)(%rcx), %ymm2; + vmovdqu (6 * 16)(%rcx), %ymm3; + vpxor %ymm4, %ymm0, %ymm0; + vpxor %ymm4, %ymm1, %ymm1; + vpxor %ymm4, %ymm2, %ymm2; + vpxor %ymm4, %ymm3, %ymm3; + vbroadcasti128 (1 * 16)(%rdi), %ymm4; + leaq (8 * 16)(%rcx), %rcx; + + testl %esi, %esi; + jz .Lecb_dec_blk8; + /* AES rounds */ + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + cmpl $12, %r9d; + jb .Lecb_enc_blk8_last; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + jz .Lecb_enc_blk8_last; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (14 * 16)(%rdi), %ymm4; + .Lecb_enc_blk8_last: + vaesenclast %ymm4, %ymm0, %ymm0; + vaesenclast %ymm4, %ymm1, %ymm1; + vaesenclast %ymm4, %ymm2, %ymm2; + vaesenclast %ymm4, %ymm3, %ymm3; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + leaq (8 * 16)(%rdx), %rdx; + jmp .Lecb_blk4; + + .align 8 + .Lecb_dec_blk8: + /* AES rounds */ + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + cmpl $12, %r9d; + jb .Lecb_dec_blk8_last; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + jz .Lecb_dec_blk8_last; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (14 * 16)(%rdi), %ymm4; + .Lecb_dec_blk8_last: + vaesdeclast %ymm4, %ymm0, %ymm0; + vaesdeclast %ymm4, %ymm1, %ymm1; + vaesdeclast %ymm4, %ymm2, %ymm2; + vaesdeclast %ymm4, %ymm3, %ymm3; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + leaq (8 * 16)(%rdx), %rdx; + + 
/* Handle trailing four blocks. */ +.align 8 +.Lecb_blk4: + cmpq $4, %r8; + jb .Lecb_blk1; + + leaq -4(%r8), %r8; + + /* Load input and xor first key. */ + vbroadcasti128 (0 * 16)(%rdi), %ymm4; + vmovdqu (0 * 16)(%rcx), %ymm0; + vmovdqu (2 * 16)(%rcx), %ymm1; + vpxor %ymm4, %ymm0, %ymm0; + vpxor %ymm4, %ymm1, %ymm1; + vbroadcasti128 (1 * 16)(%rdi), %ymm4; + leaq (4 * 16)(%rcx), %rcx; + + testl %esi, %esi; + jz .Lecb_dec_blk4; + /* AES rounds */ + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + cmpl $12, %r9d; + jb .Lecb_enc_blk4_last; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + jz .Lecb_enc_blk4_last; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (14 * 16)(%rdi), %ymm4; + .Lecb_enc_blk4_last: + vaesenclast %ymm4, %ymm0, %ymm0; + vaesenclast %ymm4, %ymm1, %ymm1; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + leaq (4 * 16)(%rdx), %rdx; + jmp .Lecb_blk1; + + .align 8 + .Lecb_dec_blk4: + /* AES rounds */ + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + cmpl $12, %r9d; + jb .Lecb_dec_blk4_last; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + jz .Lecb_dec_blk4_last; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (14 * 16)(%rdi), %ymm4; + .Lecb_dec_blk4_last: + vaesdeclast %ymm4, %ymm0, %ymm0; + vaesdeclast %ymm4, %ymm1, %ymm1; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + leaq (4 * 16)(%rdx), %rdx; + + /* Process trailing one to three blocks, one per loop. */ +.align 8 +.Lecb_blk1: + cmpq $1, %r8; + jb .Ldone_ecb; + + leaq -1(%r8), %r8; + + /* Load input. */ + vmovdqu (%rcx), %xmm2; + leaq 16(%rcx), %rcx; + + /* Xor first key. */ + vpxor (0 * 16)(%rdi), %xmm2, %xmm0; + + testl %esi, %esi; + jz .Lecb_dec_blk1; + /* AES rounds. 
*/ + vaesenc (1 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (2 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (3 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (4 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (5 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (6 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (7 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (8 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (9 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (10 * 16)(%rdi), %xmm1; + cmpl $12, %r9d; + jb .Lecb_enc_blk1_last; + vaesenc %xmm1, %xmm0, %xmm0; + vaesenc (11 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (12 * 16)(%rdi), %xmm1; + jz .Lecb_enc_blk1_last; + vaesenc %xmm1, %xmm0, %xmm0; + vaesenc (13 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (14 * 16)(%rdi), %xmm1; + .Lecb_enc_blk1_last: + vaesenclast %xmm1, %xmm0, %xmm0; + jmp .Lecb_blk1_end; + + .align 8 + .Lecb_dec_blk1: + /* AES rounds. */ + vaesdec (1 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (2 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (3 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (4 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (5 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (6 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (7 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (8 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (9 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (10 * 16)(%rdi), %xmm1; + cmpl $12, %r9d; + jb .Lecb_dec_blk1_last; + vaesdec %xmm1, %xmm0, %xmm0; + vaesdec (11 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (12 * 16)(%rdi), %xmm1; + jz .Lecb_dec_blk1_last; + vaesdec %xmm1, %xmm0, %xmm0; + vaesdec (13 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (14 * 16)(%rdi), %xmm1; + .Lecb_dec_blk1_last: + vaesdeclast %xmm1, %xmm0, %xmm0; + jmp .Lecb_blk1_end; + + .align 8 + .Lecb_blk1_end: + vmovdqu %xmm0, (%rdx); + leaq 16(%rdx), %rdx; + + jmp .Lecb_blk1; + +.align 8 +.Ldone_ecb: + vzeroall; + ret_spec_stop + CFI_ENDPROC(); +ELF(.size _gcry_vaes_avx2_ecb_crypt_amd64,.-_gcry_vaes_avx2_ecb_crypt_amd64) + +/********************************************************************** constants **********************************************************************/ ELF(.type _gcry_vaes_consts,@object) diff --git a/cipher/rijndael-vaes.c b/cipher/rijndael-vaes.c index dbcf9afa..978c86da 100644 --- a/cipher/rijndael-vaes.c +++ b/cipher/rijndael-vaes.c @@ -91,6 +91,32 @@ extern void _gcry_vaes_avx2_xts_crypt_amd64 (const void *keysched, unsigned int nrounds, int encrypt) ASM_FUNC_ABI; +extern void _gcry_vaes_avx2_ecb_crypt_amd64 (const void *keysched, + int encrypt, + void *outbuf_arg, + const void *inbuf_arg, + size_t nblocks, + unsigned int nrounds) ASM_FUNC_ABI; + + +void +_gcry_aes_vaes_ecb_crypt (void *context, void *outbuf, + const void *inbuf, size_t nblocks, + int encrypt) +{ + RIJNDAEL_context *ctx = context; + const void *keysched = encrypt ? 
ctx->keyschenc32 : ctx->keyschdec32; + unsigned int nrounds = ctx->rounds; + + if (!encrypt && !ctx->decryption_prepared) + { + _gcry_aes_aesni_prepare_decryption (ctx); + ctx->decryption_prepared = 1; + } + + _gcry_vaes_avx2_ecb_crypt_amd64 (keysched, encrypt, outbuf, inbuf, + nblocks, nrounds); +} void _gcry_aes_vaes_cbc_dec (void *context, unsigned char *iv, diff --git a/cipher/rijndael.c b/cipher/rijndael.c index f3060ea5..84cb7109 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -102,6 +102,9 @@ extern size_t _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg extern void _gcry_aes_aesni_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); +extern void _gcry_aes_aesni_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); #endif #ifdef USE_VAES @@ -125,6 +128,9 @@ extern size_t _gcry_aes_vaes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, extern void _gcry_aes_vaes_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); +extern void _gcry_aes_vaes_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); #endif #ifdef USE_SSSE3 @@ -227,6 +233,9 @@ extern void _gcry_aes_armv8_ce_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); +extern void _gcry_aes_armv8_ce_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); #endif /*USE_ARM_ASM*/ #ifdef USE_PPC_CRYPTO @@ -524,6 +533,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, bulk_ops->ocb_crypt = _gcry_aes_aesni_ocb_crypt; bulk_ops->ocb_auth = _gcry_aes_aesni_ocb_auth; bulk_ops->xts_crypt = _gcry_aes_aesni_xts_crypt; + bulk_ops->ecb_crypt = _gcry_aes_aesni_ecb_crypt; #ifdef USE_VAES if ((hwfeatures & HWF_INTEL_VAES_VPCLMUL) && @@ -536,6 +546,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, bulk_ops->ctr32le_enc = _gcry_aes_vaes_ctr32le_enc; bulk_ops->ocb_crypt = _gcry_aes_vaes_ocb_crypt; bulk_ops->xts_crypt = _gcry_aes_vaes_xts_crypt; + bulk_ops->ecb_crypt = _gcry_aes_vaes_ecb_crypt; } #endif } @@ -591,6 +602,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, bulk_ops->ocb_crypt = _gcry_aes_armv8_ce_ocb_crypt; bulk_ops->ocb_auth = _gcry_aes_armv8_ce_ocb_auth; bulk_ops->xts_crypt = _gcry_aes_armv8_ce_xts_crypt; + bulk_ops->ecb_crypt = _gcry_aes_armv8_ce_ecb_crypt; } #endif #ifdef USE_PPC_CRYPTO_WITH_PPC9LE
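
Taken together, the cipher.c hunk at the top reduces ECB to a single dispatch decision: if do_setkey() registered a bulk handler, the entire run of blocks goes to it in one call (and the handler internally picks its widest 16/8/4/1-block path, as in _gcry_vaes_avx2_ecb_crypt_amd64 above); otherwise the generic per-block loop runs. The standalone sketch below models that dispatch; the toy_* names are hypothetical stand-ins for libgcrypt's internal cipher_bulk_ops machinery, and the real fallback additionally burns stack via _gcry_burn_stack().

/* Simplified model of the do_ecb_crypt() dispatch added above.
   All toy_* names are hypothetical illustrations. */
#include <stddef.h>

typedef unsigned int (*toy_crypt_fn_t) (void *ctx, unsigned char *dst,
                                        const unsigned char *src);
typedef void (*toy_bulk_ecb_fn_t) (void *ctx, void *dst, const void *src,
                                   size_t nblocks, int encrypt);

struct toy_cipher
{
  void *ctx;
  size_t blocksize;            /* 16 for AES */
  toy_crypt_fn_t encrypt_one;  /* always present */
  toy_bulk_ecb_fn_t bulk_ecb;  /* NULL unless HW acceleration was set up */
};

static void
toy_ecb_encrypt (struct toy_cipher *c, unsigned char *dst,
                 const unsigned char *src, size_t nblocks)
{
  if (c->bulk_ecb)
    {
      /* One call covers every block; the backend unrolls internally. */
      c->bulk_ecb (c->ctx, dst, src, nblocks, 1);
      return;
    }

  /* Generic fallback: one block per call, as in the old loop. */
  for (size_t n = 0; n < nblocks; n++)
    {
      c->encrypt_one (c->ctx, dst, src);
      dst += c->blocksize;
      src += c->blocksize;
    }
}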