-rw-r--r--  cipher/camellia-glue.c  | 254
-rw-r--r--  cipher/rijndael-aesni.c | 562
-rw-r--r--  cipher/serpent.c        | 370
-rw-r--r--  tests/basic.c           |  48
4 files changed, 586 insertions(+), 648 deletions(-)
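
The common pattern across the three cipher files below: each bulk OCB path now pre-rotates its per-chunk offset table Ls[] by n = CHUNK - (blkn % CHUNK), so the single slot that needs a per-chunk ocb_get_l() lookup always lands at a fixed pointer l, and the vectorized loop no longer requires the running block count to be chunk-aligned (stray leading blocks are handled first, then the aligned bulk). In rijndael-aesni.c the same idea removes the i & 1 / i & 2 fast paths from get_l(), since the 4-block loop reads L[0] and L[1] directly. A minimal standalone sketch of the 16-entry table setup follows; the helper name build_ls and the ntz_pattern array are illustrative only, not libgcrypt API.

/* Sketch of the rotated offset-table setup used by the 16-block OCB
   paths in this commit.  L[k] stands for the precomputed offsets
   L_0..L_4 (c->u_mode.ocb.L); blkn is the number of blocks already
   processed.  Illustrative names, not library API.  */
#include <stddef.h>

#define CHUNK 16

static const void **
build_ls (const void *L[5], const void *Ls[CHUNK], size_t blkn)
{
  /* ntz(b) for b = 1..15 (mod 16) never exceeds 3, so fifteen of the
     sixteen slots are constant; only the multiple-of-16 block needs a
     per-chunk lookup.  */
  static const int ntz_pattern[CHUNK - 1] =
    { 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
  size_t n = CHUNK - (blkn % CHUNK);  /* rotation realigning the pattern */
  int i;

  for (i = 0; i < CHUNK - 1; i++)
    Ls[(i + n) % CHUNK] = L[ntz_pattern[i]];

  /* The caller refreshes this one slot per chunk, e.g.
     *l = ocb_get_l (c, l_tmp, blkn_new - blkn_new % CHUNK);  */
  return &Ls[(CHUNK - 1 + n) % CHUNK];
}

When blkn % 16 == 0 this degenerates to the old static layout (Ls[0] = L_0, Ls[1] = L_1, ..., Ls[7] = L_3, variable slot at Ls[15]); for any other starting offset the rotation keeps each global block number paired with the correct L_{ntz}. The 32-entry AVX2 variant in camellia-glue.c is the same construction with L_4 at the extra midpoint slot.
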
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index 2d5dd209..dee01694 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -631,58 +631,47 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, { int did_use_aesni_avx2 = 0; const void *Ls[32]; + unsigned int n = 32 - (blkn % 32); + const void **l; int i; - if (blkn % 32 == 0) + if (nblocks >= 32) { for (i = 0; i < 32; i += 8) { - Ls[i + 0] = c->u_mode.ocb.L[0]; - Ls[i + 1] = c->u_mode.ocb.L[1]; - Ls[i + 2] = c->u_mode.ocb.L[0]; - Ls[i + 3] = c->u_mode.ocb.L[2]; - Ls[i + 4] = c->u_mode.ocb.L[0]; - Ls[i + 5] = c->u_mode.ocb.L[1]; - Ls[i + 6] = c->u_mode.ocb.L[0]; + Ls[(i + 0 + n) % 32] = c->u_mode.ocb.L[0]; + Ls[(i + 1 + n) % 32] = c->u_mode.ocb.L[1]; + Ls[(i + 2 + n) % 32] = c->u_mode.ocb.L[0]; + Ls[(i + 3 + n) % 32] = c->u_mode.ocb.L[2]; + Ls[(i + 4 + n) % 32] = c->u_mode.ocb.L[0]; + Ls[(i + 5 + n) % 32] = c->u_mode.ocb.L[1]; + Ls[(i + 6 + n) % 32] = c->u_mode.ocb.L[0]; } - Ls[7] = c->u_mode.ocb.L[3]; - Ls[15] = c->u_mode.ocb.L[4]; - Ls[23] = c->u_mode.ocb.L[3]; - } + Ls[(7 + n) % 32] = c->u_mode.ocb.L[3]; + Ls[(15 + n) % 32] = c->u_mode.ocb.L[4]; + Ls[(23 + n) % 32] = c->u_mode.ocb.L[3]; + l = &Ls[(31 + n) % 32]; - /* Process data in 32 block chunks. */ - while (nblocks >= 32) - { - /* l_tmp will be used only every 65536-th block. */ - if (blkn % 32 == 0) + /* Process data in 32 block chunks. */ + while (nblocks >= 32) { + /* l_tmp will be used only every 65536-th block. */ blkn += 32; - Ls[31] = ocb_get_l(c, l_tmp, blkn); + *l = ocb_get_l(c, l_tmp, blkn - blkn % 32); + + if (encrypt) + _gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, + c->u_ctr.ctr, Ls); + else + _gcry_camellia_aesni_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, + c->u_ctr.ctr, Ls); + + nblocks -= 32; + outbuf += 32 * CAMELLIA_BLOCK_SIZE; + inbuf += 32 * CAMELLIA_BLOCK_SIZE; + did_use_aesni_avx2 = 1; } - else - { - for (i = 0; i < 32; i += 4) - { - Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1); - Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2); - Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3); - Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4); - blkn += 4; - } - } - - if (encrypt) - _gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, - c->u_ctr.ctr, Ls); - else - _gcry_camellia_aesni_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, - c->u_ctr.ctr, Ls); - - nblocks -= 32; - outbuf += 32 * CAMELLIA_BLOCK_SIZE; - inbuf += 32 * CAMELLIA_BLOCK_SIZE; - did_use_aesni_avx2 = 1; } if (did_use_aesni_avx2) @@ -703,56 +692,45 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, { int did_use_aesni_avx = 0; const void *Ls[16]; + unsigned int n = 16 - (blkn % 16); + const void **l; int i; - if (blkn % 16 == 0) + if (nblocks >= 16) { for (i = 0; i < 16; i += 8) { - Ls[i + 0] = c->u_mode.ocb.L[0]; - Ls[i + 1] = c->u_mode.ocb.L[1]; - Ls[i + 2] = c->u_mode.ocb.L[0]; - Ls[i + 3] = c->u_mode.ocb.L[2]; - Ls[i + 4] = c->u_mode.ocb.L[0]; - Ls[i + 5] = c->u_mode.ocb.L[1]; - Ls[i + 6] = c->u_mode.ocb.L[0]; + Ls[(i + 0 + n) % 16] = c->u_mode.ocb.L[0]; + Ls[(i + 1 + n) % 16] = c->u_mode.ocb.L[1]; + Ls[(i + 2 + n) % 16] = c->u_mode.ocb.L[0]; + Ls[(i + 3 + n) % 16] = c->u_mode.ocb.L[2]; + Ls[(i + 4 + n) % 16] = c->u_mode.ocb.L[0]; + Ls[(i + 5 + n) % 16] = c->u_mode.ocb.L[1]; + Ls[(i + 6 + n) % 16] = c->u_mode.ocb.L[0]; } - Ls[7] = c->u_mode.ocb.L[3]; - } + Ls[(7 + n) % 16] = c->u_mode.ocb.L[3]; + l = &Ls[(15 + n) % 16]; - /* Process data in 16 block chunks. */ - while (nblocks >= 16) - { - /* l_tmp will be used only every 65536-th block. 
*/ - if (blkn % 16 == 0) + /* Process data in 16 block chunks. */ + while (nblocks >= 16) { + /* l_tmp will be used only every 65536-th block. */ blkn += 16; - Ls[15] = ocb_get_l(c, l_tmp, blkn); + *l = ocb_get_l(c, l_tmp, blkn - blkn % 16); + + if (encrypt) + _gcry_camellia_aesni_avx_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, + c->u_ctr.ctr, Ls); + else + _gcry_camellia_aesni_avx_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, + c->u_ctr.ctr, Ls); + + nblocks -= 16; + outbuf += 16 * CAMELLIA_BLOCK_SIZE; + inbuf += 16 * CAMELLIA_BLOCK_SIZE; + did_use_aesni_avx = 1; } - else - { - for (i = 0; i < 16; i += 4) - { - Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1); - Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2); - Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3); - Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4); - blkn += 4; - } - } - - if (encrypt) - _gcry_camellia_aesni_avx_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, - c->u_ctr.ctr, Ls); - else - _gcry_camellia_aesni_avx_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, - c->u_ctr.ctr, Ls); - - nblocks -= 16; - outbuf += 16 * CAMELLIA_BLOCK_SIZE; - inbuf += 16 * CAMELLIA_BLOCK_SIZE; - did_use_aesni_avx = 1; } if (did_use_aesni_avx) @@ -803,53 +781,43 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, { int did_use_aesni_avx2 = 0; const void *Ls[32]; + unsigned int n = 32 - (blkn % 32); + const void **l; int i; - if (blkn % 32 == 0) + if (nblocks >= 32) { for (i = 0; i < 32; i += 8) { - Ls[i + 0] = c->u_mode.ocb.L[0]; - Ls[i + 1] = c->u_mode.ocb.L[1]; - Ls[i + 2] = c->u_mode.ocb.L[0]; - Ls[i + 3] = c->u_mode.ocb.L[2]; - Ls[i + 4] = c->u_mode.ocb.L[0]; - Ls[i + 5] = c->u_mode.ocb.L[1]; - Ls[i + 6] = c->u_mode.ocb.L[0]; + Ls[(i + 0 + n) % 32] = c->u_mode.ocb.L[0]; + Ls[(i + 1 + n) % 32] = c->u_mode.ocb.L[1]; + Ls[(i + 2 + n) % 32] = c->u_mode.ocb.L[0]; + Ls[(i + 3 + n) % 32] = c->u_mode.ocb.L[2]; + Ls[(i + 4 + n) % 32] = c->u_mode.ocb.L[0]; + Ls[(i + 5 + n) % 32] = c->u_mode.ocb.L[1]; + Ls[(i + 6 + n) % 32] = c->u_mode.ocb.L[0]; } - Ls[7] = c->u_mode.ocb.L[3]; - Ls[15] = c->u_mode.ocb.L[4]; - Ls[23] = c->u_mode.ocb.L[3]; - } + Ls[(7 + n) % 32] = c->u_mode.ocb.L[3]; + Ls[(15 + n) % 32] = c->u_mode.ocb.L[4]; + Ls[(23 + n) % 32] = c->u_mode.ocb.L[3]; + l = &Ls[(31 + n) % 32]; - /* Process data in 32 block chunks. */ - while (nblocks >= 32) - { - /* l_tmp will be used only every 65536-th block. */ - if (blkn % 32 == 0) + /* Process data in 32 block chunks. */ + while (nblocks >= 32) { + /* l_tmp will be used only every 65536-th block. 
*/ blkn += 32; - Ls[31] = ocb_get_l(c, l_tmp, blkn); - } - else - { - for (i = 0; i < 32; i += 4) - { - Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1); - Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2); - Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3); - Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4); - blkn += 4; - } - } + *l = ocb_get_l(c, l_tmp, blkn - blkn % 32); - _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum, Ls); + _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf, + c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum, Ls); - nblocks -= 32; - abuf += 32 * CAMELLIA_BLOCK_SIZE; - did_use_aesni_avx2 = 1; + nblocks -= 32; + abuf += 32 * CAMELLIA_BLOCK_SIZE; + did_use_aesni_avx2 = 1; + } } if (did_use_aesni_avx2) @@ -870,51 +838,41 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, { int did_use_aesni_avx = 0; const void *Ls[16]; + unsigned int n = 16 - (blkn % 16); + const void **l; int i; - if (blkn % 16 == 0) + if (nblocks >= 16) { for (i = 0; i < 16; i += 8) { - Ls[i + 0] = c->u_mode.ocb.L[0]; - Ls[i + 1] = c->u_mode.ocb.L[1]; - Ls[i + 2] = c->u_mode.ocb.L[0]; - Ls[i + 3] = c->u_mode.ocb.L[2]; - Ls[i + 4] = c->u_mode.ocb.L[0]; - Ls[i + 5] = c->u_mode.ocb.L[1]; - Ls[i + 6] = c->u_mode.ocb.L[0]; + Ls[(i + 0 + n) % 16] = c->u_mode.ocb.L[0]; + Ls[(i + 1 + n) % 16] = c->u_mode.ocb.L[1]; + Ls[(i + 2 + n) % 16] = c->u_mode.ocb.L[0]; + Ls[(i + 3 + n) % 16] = c->u_mode.ocb.L[2]; + Ls[(i + 4 + n) % 16] = c->u_mode.ocb.L[0]; + Ls[(i + 5 + n) % 16] = c->u_mode.ocb.L[1]; + Ls[(i + 6 + n) % 16] = c->u_mode.ocb.L[0]; } - Ls[7] = c->u_mode.ocb.L[3]; - } + Ls[(7 + n) % 16] = c->u_mode.ocb.L[3]; + l = &Ls[(15 + n) % 16]; - /* Process data in 16 block chunks. */ - while (nblocks >= 16) - { - /* l_tmp will be used only every 65536-th block. */ - if (blkn % 16 == 0) + /* Process data in 16 block chunks. */ + while (nblocks >= 16) { + /* l_tmp will be used only every 65536-th block. 
*/ blkn += 16; - Ls[15] = ocb_get_l(c, l_tmp, blkn); - } - else - { - for (i = 0; i < 16; i += 4) - { - Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1); - Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2); - Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3); - Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4); - blkn += 4; - } - } + *l = ocb_get_l(c, l_tmp, blkn - blkn % 16); - _gcry_camellia_aesni_avx_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum, Ls); + _gcry_camellia_aesni_avx_ocb_auth(ctx, abuf, + c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum, Ls); - nblocks -= 16; - abuf += 16 * CAMELLIA_BLOCK_SIZE; - did_use_aesni_avx = 1; + nblocks -= 16; + abuf += 16 * CAMELLIA_BLOCK_SIZE; + did_use_aesni_avx = 1; + } } if (did_use_aesni_avx) diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c index 66787858..5c859031 100644 --- a/cipher/rijndael-aesni.c +++ b/cipher/rijndael-aesni.c @@ -1338,11 +1338,7 @@ get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i, unsigned char *iv, const unsigned char *l; unsigned int ntz; - if (i & 1) - return c->u_mode.ocb.L[0]; - else if (i & 2) - return c->u_mode.ocb.L[1]; - else if (i & 0xffffffffU) + if (i & 0xffffffffU) { asm ("rep;bsf %k[low], %k[ntz]\n\t" : [ntz] "=r" (ntz) @@ -1407,7 +1403,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; u64 n = c->u_mode.ocb.data_nblocks; - const unsigned char *l[4] = {}; + const unsigned char *l; aesni_prepare_2_6_variable; aesni_prepare (); @@ -1421,103 +1417,112 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, [ctr] "m" (*c->u_ctr.ctr) : "memory" ); - if (nblocks > 3) + + for ( ;nblocks && n % 4; nblocks-- ) + { + l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Checksum_i = Checksum_{i-1} xor P_i */ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + asm volatile ("movdqu %[l], %%xmm1\n\t" + "movdqu %[inbuf], %%xmm0\n\t" + "pxor %%xmm1, %%xmm5\n\t" + "pxor %%xmm0, %%xmm6\n\t" + "pxor %%xmm5, %%xmm0\n\t" + : + : [l] "m" (*l), + [inbuf] "m" (*inbuf) + : "memory" ); + + do_aesni_enc (ctx); + + asm volatile ("pxor %%xmm5, %%xmm0\n\t" + "movdqu %%xmm0, %[outbuf]\n\t" + : [outbuf] "=m" (*outbuf) + : + : "memory" ); + + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + for ( ;nblocks > 3 ; nblocks -= 4 ) { - if (n % 4 == 0) - { - l[0] = c->u_mode.ocb.L[0]; - l[1] = c->u_mode.ocb.L[1]; - l[2] = c->u_mode.ocb.L[0]; - } - - for ( ;nblocks > 3 ; nblocks -= 4 ) - { - /* l_tmp will be used only every 65536-th block. 
*/ - if (n % 4 == 0) - { - n += 4; - l[3] = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr); - } - else - { - l[0] = get_l(c, l_tmp.x1, n + 1, c->u_iv.iv, c->u_ctr.ctr); - l[1] = get_l(c, l_tmp.x1, n + 2, c->u_iv.iv, c->u_ctr.ctr); - l[2] = get_l(c, l_tmp.x1, n + 3, c->u_iv.iv, c->u_ctr.ctr); - l[3] = get_l(c, l_tmp.x1, n + 4, c->u_iv.iv, c->u_ctr.ctr); - n += 4; - } - - /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ - /* Checksum_i = Checksum_{i-1} xor P_i */ - /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ - asm volatile ("movdqu %[l0], %%xmm0\n\t" - "movdqu %[inbuf0], %%xmm1\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm1, %%xmm6\n\t" - "pxor %%xmm5, %%xmm1\n\t" - "movdqu %%xmm5, %[outbuf0]\n\t" - : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) - : [l0] "m" (*l[0]), - [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) - : "memory" ); - asm volatile ("movdqu %[l1], %%xmm0\n\t" - "movdqu %[inbuf1], %%xmm2\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm2, %%xmm6\n\t" - "pxor %%xmm5, %%xmm2\n\t" - "movdqu %%xmm5, %[outbuf1]\n\t" - : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)) - : [l1] "m" (*l[1]), - [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)) - : "memory" ); - asm volatile ("movdqu %[l2], %%xmm0\n\t" - "movdqu %[inbuf2], %%xmm3\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm3, %%xmm6\n\t" - "pxor %%xmm5, %%xmm3\n\t" - "movdqu %%xmm5, %[outbuf2]\n\t" - : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)) - : [l2] "m" (*l[2]), - [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) - : "memory" ); - asm volatile ("movdqu %[l3], %%xmm0\n\t" - "movdqu %[inbuf3], %%xmm4\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm4, %%xmm6\n\t" - "pxor %%xmm5, %%xmm4\n\t" - : - : [l3] "m" (*l[3]), - [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)) - : "memory" ); - - do_aesni_enc_vec4 (ctx); - - asm volatile ("movdqu %[outbuf0],%%xmm0\n\t" - "pxor %%xmm0, %%xmm1\n\t" - "movdqu %%xmm1, %[outbuf0]\n\t" - "movdqu %[outbuf1],%%xmm0\n\t" - "pxor %%xmm0, %%xmm2\n\t" - "movdqu %%xmm2, %[outbuf1]\n\t" - "movdqu %[outbuf2],%%xmm0\n\t" - "pxor %%xmm0, %%xmm3\n\t" - "movdqu %%xmm3, %[outbuf2]\n\t" - "pxor %%xmm5, %%xmm4\n\t" - "movdqu %%xmm4, %[outbuf3]\n\t" - : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)), - [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)), - [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)), - [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)) - : - : "memory" ); - - outbuf += 4*BLOCKSIZE; - inbuf += 4*BLOCKSIZE; - } + /* l_tmp will be used only every 65536-th block. 
*/ + n += 4; + l = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Checksum_i = Checksum_{i-1} xor P_i */ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + asm volatile ("movdqu %[l0], %%xmm0\n\t" + "movdqu %[inbuf0], %%xmm1\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm1, %%xmm6\n\t" + "pxor %%xmm5, %%xmm1\n\t" + "movdqu %%xmm5, %[outbuf0]\n\t" + : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) + : [l0] "m" (*c->u_mode.ocb.L[0]), + [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l1], %%xmm0\n\t" + "movdqu %[inbuf1], %%xmm2\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm2, %%xmm6\n\t" + "pxor %%xmm5, %%xmm2\n\t" + "movdqu %%xmm5, %[outbuf1]\n\t" + : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)) + : [l1] "m" (*c->u_mode.ocb.L[1]), + [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l2], %%xmm0\n\t" + "movdqu %[inbuf2], %%xmm3\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm3, %%xmm6\n\t" + "pxor %%xmm5, %%xmm3\n\t" + "movdqu %%xmm5, %[outbuf2]\n\t" + : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)) + : [l2] "m" (*c->u_mode.ocb.L[0]), + [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l3], %%xmm0\n\t" + "movdqu %[inbuf3], %%xmm4\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm4, %%xmm6\n\t" + "pxor %%xmm5, %%xmm4\n\t" + : + : [l3] "m" (*l), + [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)) + : "memory" ); + + do_aesni_enc_vec4 (ctx); + + asm volatile ("movdqu %[outbuf0],%%xmm0\n\t" + "pxor %%xmm0, %%xmm1\n\t" + "movdqu %%xmm1, %[outbuf0]\n\t" + "movdqu %[outbuf1],%%xmm0\n\t" + "pxor %%xmm0, %%xmm2\n\t" + "movdqu %%xmm2, %[outbuf1]\n\t" + "movdqu %[outbuf2],%%xmm0\n\t" + "pxor %%xmm0, %%xmm3\n\t" + "movdqu %%xmm3, %[outbuf2]\n\t" + "pxor %%xmm5, %%xmm4\n\t" + "movdqu %%xmm4, %[outbuf3]\n\t" + : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)), + [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)), + [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)), + [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)) + : + : "memory" ); + + outbuf += 4*BLOCKSIZE; + inbuf += 4*BLOCKSIZE; } for ( ;nblocks; nblocks-- ) { - l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr); + l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ @@ -1528,7 +1533,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, "pxor %%xmm0, %%xmm6\n\t" "pxor %%xmm5, %%xmm0\n\t" : - : [l] "m" (*l[0]), + : [l] "m" (*l), [inbuf] "m" (*inbuf) : "memory" ); @@ -1568,7 +1573,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; u64 n = c->u_mode.ocb.data_nblocks; - const unsigned char *l[4] = {}; + const unsigned char *l; aesni_prepare_2_6_variable; aesni_prepare (); @@ -1582,103 +1587,111 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, [ctr] "m" (*c->u_ctr.ctr) : "memory" ); - if (nblocks > 3) + for ( ;nblocks && n % 4; nblocks-- ) + { + l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ + /* Checksum_i = Checksum_{i-1} xor P_i */ + asm volatile ("movdqu %[l], %%xmm1\n\t" + "movdqu %[inbuf], %%xmm0\n\t" + "pxor %%xmm1, %%xmm5\n\t" + "pxor %%xmm5, %%xmm0\n\t" + : + : [l] "m" (*l), + [inbuf] "m" (*inbuf) + : "memory" ); + + do_aesni_dec (ctx); + + asm volatile ("pxor %%xmm5, %%xmm0\n\t" + "pxor %%xmm0, %%xmm6\n\t" + "movdqu %%xmm0, %[outbuf]\n\t" + 
: [outbuf] "=m" (*outbuf) + : + : "memory" ); + + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + for ( ;nblocks > 3 ; nblocks -= 4 ) { - if (n % 4 == 0) - { - l[0] = c->u_mode.ocb.L[0]; - l[1] = c->u_mode.ocb.L[1]; - l[2] = c->u_mode.ocb.L[0]; - } - - for ( ;nblocks > 3 ; nblocks -= 4 ) - { - /* l_tmp will be used only every 65536-th block. */ - if (n % 4 == 0) - { - n += 4; - l[3] = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr); - } - else - { - l[0] = get_l(c, l_tmp.x1, n + 1, c->u_iv.iv, c->u_ctr.ctr); - l[1] = get_l(c, l_tmp.x1, n + 2, c->u_iv.iv, c->u_ctr.ctr); - l[2] = get_l(c, l_tmp.x1, n + 3, c->u_iv.iv, c->u_ctr.ctr); - l[3] = get_l(c, l_tmp.x1, n + 4, c->u_iv.iv, c->u_ctr.ctr); - n += 4; - } - - /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ - /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ - /* Checksum_i = Checksum_{i-1} xor P_i */ - asm volatile ("movdqu %[l0], %%xmm0\n\t" - "movdqu %[inbuf0], %%xmm1\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm5, %%xmm1\n\t" - "movdqu %%xmm5, %[outbuf0]\n\t" - : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) - : [l0] "m" (*l[0]), - [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) - : "memory" ); - asm volatile ("movdqu %[l1], %%xmm0\n\t" - "movdqu %[inbuf1], %%xmm2\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm5, %%xmm2\n\t" - "movdqu %%xmm5, %[outbuf1]\n\t" - : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)) - : [l1] "m" (*l[1]), - [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)) - : "memory" ); - asm volatile ("movdqu %[l2], %%xmm0\n\t" - "movdqu %[inbuf2], %%xmm3\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm5, %%xmm3\n\t" - "movdqu %%xmm5, %[outbuf2]\n\t" - : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)) - : [l2] "m" (*l[2]), - [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) - : "memory" ); - asm volatile ("movdqu %[l3], %%xmm0\n\t" - "movdqu %[inbuf3], %%xmm4\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm5, %%xmm4\n\t" - : - : [l3] "m" (*l[3]), - [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)) - : "memory" ); - - do_aesni_dec_vec4 (ctx); - - asm volatile ("movdqu %[outbuf0],%%xmm0\n\t" - "pxor %%xmm0, %%xmm1\n\t" - "movdqu %%xmm1, %[outbuf0]\n\t" - "movdqu %[outbuf1],%%xmm0\n\t" - "pxor %%xmm0, %%xmm2\n\t" - "movdqu %%xmm2, %[outbuf1]\n\t" - "movdqu %[outbuf2],%%xmm0\n\t" - "pxor %%xmm0, %%xmm3\n\t" - "movdqu %%xmm3, %[outbuf2]\n\t" - "pxor %%xmm5, %%xmm4\n\t" - "movdqu %%xmm4, %[outbuf3]\n\t" - "pxor %%xmm1, %%xmm6\n\t" - "pxor %%xmm2, %%xmm6\n\t" - "pxor %%xmm3, %%xmm6\n\t" - "pxor %%xmm4, %%xmm6\n\t" - : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)), - [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)), - [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)), - [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)) - : - : "memory" ); - - outbuf += 4*BLOCKSIZE; - inbuf += 4*BLOCKSIZE; - } + /* l_tmp will be used only every 65536-th block. 
*/ + n += 4; + l = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ + /* Checksum_i = Checksum_{i-1} xor P_i */ + asm volatile ("movdqu %[l0], %%xmm0\n\t" + "movdqu %[inbuf0], %%xmm1\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm5, %%xmm1\n\t" + "movdqu %%xmm5, %[outbuf0]\n\t" + : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) + : [l0] "m" (*c->u_mode.ocb.L[0]), + [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l1], %%xmm0\n\t" + "movdqu %[inbuf1], %%xmm2\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm5, %%xmm2\n\t" + "movdqu %%xmm5, %[outbuf1]\n\t" + : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)) + : [l1] "m" (*c->u_mode.ocb.L[1]), + [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l2], %%xmm0\n\t" + "movdqu %[inbuf2], %%xmm3\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm5, %%xmm3\n\t" + "movdqu %%xmm5, %[outbuf2]\n\t" + : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)) + : [l2] "m" (*c->u_mode.ocb.L[0]), + [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l3], %%xmm0\n\t" + "movdqu %[inbuf3], %%xmm4\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm5, %%xmm4\n\t" + : + : [l3] "m" (*l), + [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)) + : "memory" ); + + do_aesni_dec_vec4 (ctx); + + asm volatile ("movdqu %[outbuf0],%%xmm0\n\t" + "pxor %%xmm0, %%xmm1\n\t" + "movdqu %%xmm1, %[outbuf0]\n\t" + "movdqu %[outbuf1],%%xmm0\n\t" + "pxor %%xmm0, %%xmm2\n\t" + "movdqu %%xmm2, %[outbuf1]\n\t" + "movdqu %[outbuf2],%%xmm0\n\t" + "pxor %%xmm0, %%xmm3\n\t" + "movdqu %%xmm3, %[outbuf2]\n\t" + "pxor %%xmm5, %%xmm4\n\t" + "movdqu %%xmm4, %[outbuf3]\n\t" + "pxor %%xmm1, %%xmm6\n\t" + "pxor %%xmm2, %%xmm6\n\t" + "pxor %%xmm3, %%xmm6\n\t" + "pxor %%xmm4, %%xmm6\n\t" + : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)), + [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)), + [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)), + [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)) + : + : "memory" ); + + outbuf += 4*BLOCKSIZE; + inbuf += 4*BLOCKSIZE; } for ( ;nblocks; nblocks-- ) { - l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr); + l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ @@ -1688,7 +1701,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm5, %%xmm0\n\t" : - : [l] "m" (*l[0]), + : [l] "m" (*l), [inbuf] "m" (*inbuf) : "memory" ); @@ -1739,7 +1752,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, RIJNDAEL_context *ctx = (void *)&c->context.c; const unsigned char *abuf = abuf_arg; u64 n = c->u_mode.ocb.aad_nblocks; - const unsigned char *l[4] = {}; + const unsigned char *l; aesni_prepare_2_6_variable; aesni_prepare (); @@ -1753,90 +1766,91 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, [ctr] "m" (*c->u_mode.ocb.aad_sum) : "memory" ); - if (nblocks > 3) + for ( ;nblocks && n % 4; nblocks-- ) + { + l = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ + asm volatile ("movdqu %[l], %%xmm1\n\t" + "movdqu %[abuf], %%xmm0\n\t" + "pxor %%xmm1, %%xmm5\n\t" + "pxor %%xmm5, %%xmm0\n\t" + : + : [l] "m" (*l), + [abuf] "m" (*abuf) + : "memory" ); + + do_aesni_enc (ctx); + + asm volatile ("pxor %%xmm0, %%xmm6\n\t" + : + : + : "memory" ); 
+ + abuf += BLOCKSIZE; + } + + for ( ;nblocks > 3 ; nblocks -= 4 ) { - if (n % 4 == 0) - { - l[0] = c->u_mode.ocb.L[0]; - l[1] = c->u_mode.ocb.L[1]; - l[2] = c->u_mode.ocb.L[0]; - } - - for ( ;nblocks > 3 ; nblocks -= 4 ) - { - /* l_tmp will be used only every 65536-th block. */ - if (n % 4 == 0) - { - n += 4; - l[3] = get_l(c, l_tmp.x1, n, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum); - } - else - { - l[0] = get_l(c, l_tmp.x1, n + 1, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum); - l[1] = get_l(c, l_tmp.x1, n + 2, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum); - l[2] = get_l(c, l_tmp.x1, n + 3, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum); - l[3] = get_l(c, l_tmp.x1, n + 4, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum); - n += 4; - } - - /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ - /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ - asm volatile ("movdqu %[l0], %%xmm0\n\t" - "movdqu %[abuf0], %%xmm1\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm5, %%xmm1\n\t" - : - : [l0] "m" (*l[0]), - [abuf0] "m" (*(abuf + 0 * BLOCKSIZE)) - : "memory" ); - asm volatile ("movdqu %[l1], %%xmm0\n\t" - "movdqu %[abuf1], %%xmm2\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm5, %%xmm2\n\t" - : - : [l1] "m" (*l[1]), - [abuf1] "m" (*(abuf + 1 * BLOCKSIZE)) - : "memory" ); - asm volatile ("movdqu %[l2], %%xmm0\n\t" - "movdqu %[abuf2], %%xmm3\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm5, %%xmm3\n\t" - : - : [l2] "m" (*l[2]), - [abuf2] "m" (*(abuf + 2 * BLOCKSIZE)) - : "memory" ); - asm volatile ("movdqu %[l3], %%xmm0\n\t" - "movdqu %[abuf3], %%xmm4\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm5, %%xmm4\n\t" - : - : [l3] "m" (*l[3]), - [abuf3] "m" (*(abuf + 3 * BLOCKSIZE)) - : "memory" ); - - do_aesni_enc_vec4 (ctx); - - asm volatile ("pxor %%xmm1, %%xmm6\n\t" - "pxor %%xmm2, %%xmm6\n\t" - "pxor %%xmm3, %%xmm6\n\t" - "pxor %%xmm4, %%xmm6\n\t" - : - : - : "memory" ); - - abuf += 4*BLOCKSIZE; - } + /* l_tmp will be used only every 65536-th block. 
*/ + n += 4; + l = get_l(c, l_tmp.x1, n, c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ + asm volatile ("movdqu %[l0], %%xmm0\n\t" + "movdqu %[abuf0], %%xmm1\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm5, %%xmm1\n\t" + : + : [l0] "m" (*c->u_mode.ocb.L[0]), + [abuf0] "m" (*(abuf + 0 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l1], %%xmm0\n\t" + "movdqu %[abuf1], %%xmm2\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm5, %%xmm2\n\t" + : + : [l1] "m" (*c->u_mode.ocb.L[1]), + [abuf1] "m" (*(abuf + 1 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l2], %%xmm0\n\t" + "movdqu %[abuf2], %%xmm3\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm5, %%xmm3\n\t" + : + : [l2] "m" (*c->u_mode.ocb.L[0]), + [abuf2] "m" (*(abuf + 2 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l3], %%xmm0\n\t" + "movdqu %[abuf3], %%xmm4\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm5, %%xmm4\n\t" + : + : [l3] "m" (*l), + [abuf3] "m" (*(abuf + 3 * BLOCKSIZE)) + : "memory" ); + + do_aesni_enc_vec4 (ctx); + + asm volatile ("pxor %%xmm1, %%xmm6\n\t" + "pxor %%xmm2, %%xmm6\n\t" + "pxor %%xmm3, %%xmm6\n\t" + "pxor %%xmm4, %%xmm6\n\t" + : + : + : "memory" ); + + abuf += 4*BLOCKSIZE; } for ( ;nblocks; nblocks-- ) { - l[0] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum); + l = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ @@ -1845,7 +1859,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm5, %%xmm0\n\t" : - : [l] "m" (*l[0]), + : [l] "m" (*l), [abuf] "m" (*abuf) : "memory" ); diff --git a/cipher/serpent.c b/cipher/serpent.c index a47a1b77..fc3afa6b 100644 --- a/cipher/serpent.c +++ b/cipher/serpent.c @@ -1250,56 +1250,45 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, { int did_use_avx2 = 0; const void *Ls[16]; + unsigned int n = 16 - (blkn % 16); + const void **l; int i; - if (blkn % 16 == 0) + if (nblocks >= 16) { for (i = 0; i < 16; i += 8) { - Ls[i + 0] = c->u_mode.ocb.L[0]; - Ls[i + 1] = c->u_mode.ocb.L[1]; - Ls[i + 2] = c->u_mode.ocb.L[0]; - Ls[i + 3] = c->u_mode.ocb.L[2]; - Ls[i + 4] = c->u_mode.ocb.L[0]; - Ls[i + 5] = c->u_mode.ocb.L[1]; - Ls[i + 6] = c->u_mode.ocb.L[0]; + Ls[(i + 0 + n) % 16] = c->u_mode.ocb.L[0]; + Ls[(i + 1 + n) % 16] = c->u_mode.ocb.L[1]; + Ls[(i + 2 + n) % 16] = c->u_mode.ocb.L[0]; + Ls[(i + 3 + n) % 16] = c->u_mode.ocb.L[2]; + Ls[(i + 4 + n) % 16] = c->u_mode.ocb.L[0]; + Ls[(i + 5 + n) % 16] = c->u_mode.ocb.L[1]; + Ls[(i + 6 + n) % 16] = c->u_mode.ocb.L[0]; } - Ls[7] = c->u_mode.ocb.L[3]; - } + Ls[(7 + n) % 16] = c->u_mode.ocb.L[3]; + l = &Ls[(15 + n) % 16]; - /* Process data in 16 block chunks. */ - while (nblocks >= 16) - { - /* l_tmp will be used only every 65536-th block. */ - if (blkn % 16 == 0) + /* Process data in 16 block chunks. */ + while (nblocks >= 16) { + /* l_tmp will be used only every 65536-th block. 
*/ blkn += 16; - Ls[15] = ocb_get_l(c, l_tmp, blkn); + *l = ocb_get_l(c, l_tmp, blkn - blkn % 16); + + if (encrypt) + _gcry_serpent_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, + c->u_ctr.ctr, Ls); + else + _gcry_serpent_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, + c->u_ctr.ctr, Ls); + + nblocks -= 16; + outbuf += 16 * sizeof(serpent_block_t); + inbuf += 16 * sizeof(serpent_block_t); + did_use_avx2 = 1; } - else - { - for (i = 0; i < 16; i += 4) - { - Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1); - Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2); - Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3); - Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4); - blkn += 4; - } - } - - if (encrypt) - _gcry_serpent_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, - c->u_ctr.ctr, Ls); - else - _gcry_serpent_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, - c->u_ctr.ctr, Ls); - - nblocks -= 16; - outbuf += 16 * sizeof(serpent_block_t); - inbuf += 16 * sizeof(serpent_block_t); - did_use_avx2 = 1; } if (did_use_avx2) @@ -1317,51 +1306,39 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, { int did_use_sse2 = 0; const void *Ls[8]; - int i; + unsigned int n = 8 - (blkn % 8); + const void **l; - if (blkn % 8 == 0) + if (nblocks >= 8) { - Ls[0] = c->u_mode.ocb.L[0]; - Ls[1] = c->u_mode.ocb.L[1]; - Ls[2] = c->u_mode.ocb.L[0]; - Ls[3] = c->u_mode.ocb.L[2]; - Ls[4] = c->u_mode.ocb.L[0]; - Ls[5] = c->u_mode.ocb.L[1]; - Ls[6] = c->u_mode.ocb.L[0]; - } - - /* Process data in 8 block chunks. */ - while (nblocks >= 8) - { - /* l_tmp will be used only every 65536-th block. */ - if (blkn % 8 == 0) + Ls[(0 + n) % 8] = c->u_mode.ocb.L[0]; + Ls[(1 + n) % 8] = c->u_mode.ocb.L[1]; + Ls[(2 + n) % 8] = c->u_mode.ocb.L[0]; + Ls[(3 + n) % 8] = c->u_mode.ocb.L[2]; + Ls[(4 + n) % 8] = c->u_mode.ocb.L[0]; + Ls[(5 + n) % 8] = c->u_mode.ocb.L[1]; + Ls[(6 + n) % 8] = c->u_mode.ocb.L[0]; + l = &Ls[(7 + n) % 8]; + + /* Process data in 8 block chunks. */ + while (nblocks >= 8) { + /* l_tmp will be used only every 65536-th block. */ blkn += 8; - Ls[7] = ocb_get_l(c, l_tmp, blkn); - } - else - { - for (i = 0; i < 8; i += 4) - { - Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1); - Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2); - Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3); - Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4); - blkn += 4; - } + *l = ocb_get_l(c, l_tmp, blkn - blkn % 8); + + if (encrypt) + _gcry_serpent_sse2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, + c->u_ctr.ctr, Ls); + else + _gcry_serpent_sse2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, + c->u_ctr.ctr, Ls); + + nblocks -= 8; + outbuf += 8 * sizeof(serpent_block_t); + inbuf += 8 * sizeof(serpent_block_t); + did_use_sse2 = 1; } - - if (encrypt) - _gcry_serpent_sse2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, - c->u_ctr.ctr, Ls); - else - _gcry_serpent_sse2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, - c->u_ctr.ctr, Ls); - - nblocks -= 8; - outbuf += 8 * sizeof(serpent_block_t); - inbuf += 8 * sizeof(serpent_block_t); - did_use_sse2 = 1; } if (did_use_sse2) @@ -1380,51 +1357,39 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, { int did_use_neon = 0; const void *Ls[8]; - int i; + unsigned int n = 8 - (blkn % 8); + const void **l; - if (blkn % 8 == 0) + if (nblocks >= 8) { - Ls[0] = c->u_mode.ocb.L[0]; - Ls[1] = c->u_mode.ocb.L[1]; - Ls[2] = c->u_mode.ocb.L[0]; - Ls[3] = c->u_mode.ocb.L[2]; - Ls[4] = c->u_mode.ocb.L[0]; - Ls[5] = c->u_mode.ocb.L[1]; - Ls[6] = c->u_mode.ocb.L[0]; - } - - /* Process data in 8 block chunks. */ - while (nblocks >= 8) - { - /* l_tmp will be used only every 65536-th block. 
*/ - if (blkn % 8 == 0) + Ls[(0 + n) % 8] = c->u_mode.ocb.L[0]; + Ls[(1 + n) % 8] = c->u_mode.ocb.L[1]; + Ls[(2 + n) % 8] = c->u_mode.ocb.L[0]; + Ls[(3 + n) % 8] = c->u_mode.ocb.L[2]; + Ls[(4 + n) % 8] = c->u_mode.ocb.L[0]; + Ls[(5 + n) % 8] = c->u_mode.ocb.L[1]; + Ls[(6 + n) % 8] = c->u_mode.ocb.L[0]; + l = &Ls[(7 + n) % 8]; + + /* Process data in 8 block chunks. */ + while (nblocks >= 8) { + /* l_tmp will be used only every 65536-th block. */ blkn += 8; - Ls[7] = ocb_get_l(c, l_tmp, blkn); - } - else - { - for (i = 0; i < 8; i += 4) - { - Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1); - Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2); - Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3); - Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4); - blkn += 4; - } + *l = ocb_get_l(c, l_tmp, blkn - blkn % 8); + + if (encrypt) + _gcry_serpent_neon_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, + c->u_ctr.ctr, Ls); + else + _gcry_serpent_neon_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, + c->u_ctr.ctr, Ls); + + nblocks -= 8; + outbuf += 8 * sizeof(serpent_block_t); + inbuf += 8 * sizeof(serpent_block_t); + did_use_neon = 1; } - - if (encrypt) - _gcry_serpent_neon_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, - c->u_ctr.ctr, Ls); - else - _gcry_serpent_neon_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, - c->u_ctr.ctr, Ls); - - nblocks -= 8; - outbuf += 8 * sizeof(serpent_block_t); - inbuf += 8 * sizeof(serpent_block_t); - did_use_neon = 1; } if (did_use_neon) @@ -1471,51 +1436,40 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, { int did_use_avx2 = 0; const void *Ls[16]; + unsigned int n = 16 - (blkn % 16); + const void **l; int i; - if (blkn % 16 == 0) + if (nblocks >= 16) { for (i = 0; i < 16; i += 8) { - Ls[i + 0] = c->u_mode.ocb.L[0]; - Ls[i + 1] = c->u_mode.ocb.L[1]; - Ls[i + 2] = c->u_mode.ocb.L[0]; - Ls[i + 3] = c->u_mode.ocb.L[2]; - Ls[i + 4] = c->u_mode.ocb.L[0]; - Ls[i + 5] = c->u_mode.ocb.L[1]; - Ls[i + 6] = c->u_mode.ocb.L[0]; + Ls[(i + 0 + n) % 16] = c->u_mode.ocb.L[0]; + Ls[(i + 1 + n) % 16] = c->u_mode.ocb.L[1]; + Ls[(i + 2 + n) % 16] = c->u_mode.ocb.L[0]; + Ls[(i + 3 + n) % 16] = c->u_mode.ocb.L[2]; + Ls[(i + 4 + n) % 16] = c->u_mode.ocb.L[0]; + Ls[(i + 5 + n) % 16] = c->u_mode.ocb.L[1]; + Ls[(i + 6 + n) % 16] = c->u_mode.ocb.L[0]; } - Ls[7] = c->u_mode.ocb.L[3]; - } + Ls[(7 + n) % 16] = c->u_mode.ocb.L[3]; + l = &Ls[(15 + n) % 16]; - /* Process data in 16 block chunks. */ - while (nblocks >= 16) - { - /* l_tmp will be used only every 65536-th block. */ - if (blkn % 16 == 0) + /* Process data in 16 block chunks. */ + while (nblocks >= 16) { + /* l_tmp will be used only every 65536-th block. 
*/ blkn += 16; - Ls[15] = ocb_get_l(c, l_tmp, blkn); - } - else - { - for (i = 0; i < 16; i += 4) - { - Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1); - Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2); - Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3); - Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4); - blkn += 4; - } - } + *l = ocb_get_l(c, l_tmp, blkn - blkn % 16); - _gcry_serpent_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum, Ls); + _gcry_serpent_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum, Ls); - nblocks -= 16; - abuf += 16 * sizeof(serpent_block_t); - did_use_avx2 = 1; + nblocks -= 16; + abuf += 16 * sizeof(serpent_block_t); + did_use_avx2 = 1; + } } if (did_use_avx2) @@ -1533,46 +1487,34 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, { int did_use_sse2 = 0; const void *Ls[8]; - int i; + unsigned int n = 8 - (blkn % 8); + const void **l; - if (blkn % 8 == 0) + if (nblocks >= 8) { - Ls[0] = c->u_mode.ocb.L[0]; - Ls[1] = c->u_mode.ocb.L[1]; - Ls[2] = c->u_mode.ocb.L[0]; - Ls[3] = c->u_mode.ocb.L[2]; - Ls[4] = c->u_mode.ocb.L[0]; - Ls[5] = c->u_mode.ocb.L[1]; - Ls[6] = c->u_mode.ocb.L[0]; - } - - /* Process data in 8 block chunks. */ - while (nblocks >= 8) - { - /* l_tmp will be used only every 65536-th block. */ - if (blkn % 8 == 0) + Ls[(0 + n) % 8] = c->u_mode.ocb.L[0]; + Ls[(1 + n) % 8] = c->u_mode.ocb.L[1]; + Ls[(2 + n) % 8] = c->u_mode.ocb.L[0]; + Ls[(3 + n) % 8] = c->u_mode.ocb.L[2]; + Ls[(4 + n) % 8] = c->u_mode.ocb.L[0]; + Ls[(5 + n) % 8] = c->u_mode.ocb.L[1]; + Ls[(6 + n) % 8] = c->u_mode.ocb.L[0]; + l = &Ls[(7 + n) % 8]; + + /* Process data in 8 block chunks. */ + while (nblocks >= 8) { + /* l_tmp will be used only every 65536-th block. */ blkn += 8; - Ls[7] = ocb_get_l(c, l_tmp, blkn); - } - else - { - for (i = 0; i < 8; i += 4) - { - Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1); - Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2); - Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3); - Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4); - blkn += 4; - } - } + *l = ocb_get_l(c, l_tmp, blkn - blkn % 8); - _gcry_serpent_sse2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum, Ls); + _gcry_serpent_sse2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum, Ls); - nblocks -= 8; - abuf += 8 * sizeof(serpent_block_t); - did_use_sse2 = 1; + nblocks -= 8; + abuf += 8 * sizeof(serpent_block_t); + did_use_sse2 = 1; + } } if (did_use_sse2) @@ -1591,46 +1533,34 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, { int did_use_neon = 0; const void *Ls[8]; - int i; - - if (blkn % 8 == 0) - { - Ls[0] = c->u_mode.ocb.L[0]; - Ls[1] = c->u_mode.ocb.L[1]; - Ls[2] = c->u_mode.ocb.L[0]; - Ls[3] = c->u_mode.ocb.L[2]; - Ls[4] = c->u_mode.ocb.L[0]; - Ls[5] = c->u_mode.ocb.L[1]; - Ls[6] = c->u_mode.ocb.L[0]; - } + unsigned int n = 8 - (blkn % 8); + const void **l; - /* Process data in 8 block chunks. */ - while (nblocks >= 8) + if (nblocks >= 8) { - /* l_tmp will be used only every 65536-th block. */ - if (blkn % 8 == 0) + Ls[(0 + n) % 8] = c->u_mode.ocb.L[0]; + Ls[(1 + n) % 8] = c->u_mode.ocb.L[1]; + Ls[(2 + n) % 8] = c->u_mode.ocb.L[0]; + Ls[(3 + n) % 8] = c->u_mode.ocb.L[2]; + Ls[(4 + n) % 8] = c->u_mode.ocb.L[0]; + Ls[(5 + n) % 8] = c->u_mode.ocb.L[1]; + Ls[(6 + n) % 8] = c->u_mode.ocb.L[0]; + l = &Ls[(7 + n) % 8]; + + /* Process data in 8 block chunks. */ + while (nblocks >= 8) { + /* l_tmp will be used only every 65536-th block. 
*/ blkn += 8; - Ls[7] = ocb_get_l(c, l_tmp, blkn); - } - else - { - for (i = 0; i < 8; i += 4) - { - Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1); - Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2); - Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3); - Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4); - blkn += 4; - } - } + *l = ocb_get_l(c, l_tmp, blkn - blkn % 8); - _gcry_serpent_neon_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum, Ls); + _gcry_serpent_neon_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum, Ls); - nblocks -= 8; - abuf += 8 * sizeof(serpent_block_t); - did_use_neon = 1; + nblocks -= 8; + abuf += 8 * sizeof(serpent_block_t); + did_use_neon = 1; + } } if (did_use_neon) diff --git a/tests/basic.c b/tests/basic.c index c1aa76a7..4ea91a93 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -3153,7 +3153,8 @@ do_check_ocb_cipher (int inplace) static void -check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect) +check_ocb_cipher_largebuf_split (int algo, int keylen, const char *tagexpect, + unsigned int splitpos) { static const unsigned char key[32] = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F" @@ -3219,7 +3220,14 @@ check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect) goto out_free; } - err = gcry_cipher_authenticate (hde, inbuf, buflen); + if (splitpos) + { + err = gcry_cipher_authenticate (hde, inbuf, splitpos); + } + if (!err) + { + err = gcry_cipher_authenticate (hde, inbuf + splitpos, buflen - splitpos); + } if (err) { fail ("cipher-ocb, gcry_cipher_authenticate failed (large, algo %d): %s\n", @@ -3229,10 +3237,18 @@ check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect) goto out_free; } - err = gcry_cipher_final (hde); + if (splitpos) + { + err = gcry_cipher_encrypt (hde, outbuf, splitpos, inbuf, splitpos); + } if (!err) { - err = gcry_cipher_encrypt (hde, outbuf, buflen, inbuf, buflen); + err = gcry_cipher_final (hde); + if (!err) + { + err = gcry_cipher_encrypt (hde, outbuf + splitpos, buflen - splitpos, + inbuf + splitpos, buflen - splitpos); + } } if (err) { @@ -3267,10 +3283,18 @@ check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect) } /* Now for the decryption. */ - err = gcry_cipher_final (hdd); + if (splitpos) + { + err = gcry_cipher_decrypt (hdd, outbuf, splitpos, NULL, 0); + } if (!err) { - err = gcry_cipher_decrypt (hdd, outbuf, buflen, NULL, 0); + err = gcry_cipher_final (hdd); + if (!err) + { + err = gcry_cipher_decrypt (hdd, outbuf + splitpos, buflen - splitpos, + NULL, 0); + } } if (err) { @@ -3319,6 +3343,18 @@ out_free: static void +check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect) +{ + unsigned int split; + + for (split = 0; split < 32 * 16; split = split * 2 + 16) + { + check_ocb_cipher_largebuf_split(algo, keylen, tagexpect, split); + } +} + + +static void check_ocb_cipher (void) { /* Check OCB cipher with separate destination and source buffers for |
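
On the tests side, check_ocb_cipher_largebuf() becomes a driver that re-runs the large-buffer OCB check with an unaligned head: splitpos walks 0, 16, 48, 112, 240 and 496 (the loop stops below 32 * 16 = 512), so gcry_cipher_authenticate, encrypt and decrypt are each exercised with stray leading blocks before the chunk-aligned remainder, and gcry_cipher_final moves between the two halves. A throwaway illustration of that schedule, assuming nothing beyond the loop in the diff:

/* Prints the head sizes the split driver feeds to the unaligned path
   before the aligned remainder.  Standalone illustration, not test
   code.  */
#include <stdio.h>

int
main (void)
{
  unsigned int split;

  for (split = 0; split < 32 * 16; split = split * 2 + 16)
    printf ("head: %u bytes, then aligned bulk remainder\n", split);

  return 0;
}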