author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2014-12-23 12:35:28 +0200
---|---|---
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2014-12-23 12:37:35 +0200
commit | 2374753938df64f6fd8015b44613806a326eff1a (patch) |
tree | 3e8f7c245ca838681415c13f48a9de5d98931559 /cipher/rijndael.c |
parent | ad50e360ef4851e66e51a03fc420175636336b58 (diff) |
download | libgcrypt-2374753938df64f6fd8015b44613806a326eff1a.tar.gz |
rijndael: use more compact look-up tables and add table prefetching
* cipher/rijndael-internal.h (rijndael_prefetchfn_t): New.
(RIJNDAEL_context): Add 'prefetch_enc_fn' and 'prefetch_dec_fn'.
* cipher/rijndael-tables.h (S, T1, T2, T3, T4, T5, T6, T7, T8, S5, U1)
(U2, U3, U4): Remove.
(encT, dec_tables, decT, inv_sbox): Add.
* cipher/rijndael.c (_gcry_aes_amd64_encrypt_block)
(_gcry_aes_amd64_decrypt_block, _gcry_aes_arm_encrypt_block)
(_gcry_aes_arm_decrypt_block): Add parameter for passing table pointer
to assembly implementation.
(prefetch_table, prefetch_enc, prefetch_dec): New.
(do_setkey): Setup context prefetch functions depending on selected
rijndael implementation; Use new tables for key setup.
(prepare_decryption): Use new tables for decryption key setup.
(do_encrypt_aligned): Rename to...
(do_encrypt_fn): ... this; change to use the new compact tables,
make it handle unaligned input and unroll the rounds loop by two.
(do_encrypt): Remove handling of unaligned input/output; pass table
pointer to assembly implementations.
(rijndael_encrypt, _gcry_aes_cfb_enc, _gcry_aes_cbc_enc)
(_gcry_aes_ctr_enc, _gcry_aes_cfb_dec): Prefetch encryption tables
before encryption.
(do_decrypt_aligned): Rename to...
(do_decrypt_fn): ... this; change to use the new compact tables,
make it handle unaligned input and unroll the rounds loop by two.
(do_decrypt): Remove handling of unaligned input/output; pass table
pointer to assembly implementations.
(rijndael_decrypt, _gcry_aes_cbc_dec): Prefetch decryption tables
before decryption.
* cipher/rijndael-amd64.S: Use 1+1.25 KiB tables for
encryption+decryption; remove tables from assembly file.
* cipher/rijndael-arm.S: Ditto.
--
This patch replaces the 4+4.25 KiB look-up tables of the generic
implementation, the 8+8 KiB tables of the AMD64 implementation and the
2+2 KiB tables of the ARM implementation with shared 1+1.25 KiB
(encryption+decryption) look-up tables, and adds prefetching of those
tables before use.
The AMD64 assembly is slower than before because of the additional
rotation instructions. The generic C implementation, however, is now
better optimized and actually faster than before.
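The prefetching half of the change, restated as a minimal sketch (the
patch's own prefetch_table() in the diff below is an unrolled variant
of the same idea): walk the table once, touching one byte per assumed
32-byte cache line, before the first key-dependent access, so that
later table loads hit the cache regardless of the data processed.

#include <stddef.h>

/* Warm TAB (LEN > 0 bytes) into the data cache.  One dead read per
   assumed 32-byte cache line; `volatile' keeps the compiler from
   optimizing the loads away. */
static void warm_table (const volatile unsigned char *tab, size_t len)
{
  size_t i;

  for (i = 0; i < len; i += 32)
    (void)tab[i];

  (void)tab[len - 1];  /* cover a partial trailing line */
}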
Benchmark results on Intel i5-4570 (turbo off) (64-bit, AMD64 assembly):
tests/bench-slope --disable-hwf intel-aesni --cpu-mhz 3200 cipher aes
Old:
AES | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 3.10 ns/B 307.5 MiB/s 9.92 c/B
ECB dec | 3.15 ns/B 302.5 MiB/s 10.09 c/B
CBC enc | 3.46 ns/B 275.5 MiB/s 11.08 c/B
CBC dec | 3.19 ns/B 299.2 MiB/s 10.20 c/B
CFB enc | 3.48 ns/B 274.4 MiB/s 11.12 c/B
CFB dec | 3.23 ns/B 294.8 MiB/s 10.35 c/B
OFB enc | 3.29 ns/B 290.2 MiB/s 10.52 c/B
OFB dec | 3.31 ns/B 288.3 MiB/s 10.58 c/B
CTR enc | 3.64 ns/B 261.7 MiB/s 11.66 c/B
CTR dec | 3.65 ns/B 261.6 MiB/s 11.67 c/B
New:
AES | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 4.21 ns/B 226.7 MiB/s 13.46 c/B
ECB dec | 4.27 ns/B 223.2 MiB/s 13.67 c/B
CBC enc | 4.15 ns/B 229.8 MiB/s 13.28 c/B
CBC dec | 3.85 ns/B 247.8 MiB/s 12.31 c/B
CFB enc | 4.16 ns/B 229.1 MiB/s 13.32 c/B
CFB dec | 3.88 ns/B 245.9 MiB/s 12.41 c/B
OFB enc | 4.38 ns/B 217.8 MiB/s 14.01 c/B
OFB dec | 4.36 ns/B 218.6 MiB/s 13.96 c/B
CTR enc | 4.30 ns/B 221.6 MiB/s 13.77 c/B
CTR dec | 4.30 ns/B 221.7 MiB/s 13.76 c/B
Benchmark on Intel i5-4570 (turbo off) (32-bit mingw, generic C):
tests/bench-slope.exe --disable-hwf intel-aesni --cpu-mhz 3200 cipher aes
Old:
AES | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 6.03 ns/B 158.2 MiB/s 19.29 c/B
ECB dec | 5.81 ns/B 164.1 MiB/s 18.60 c/B
CBC enc | 6.22 ns/B 153.4 MiB/s 19.90 c/B
CBC dec | 5.91 ns/B 161.3 MiB/s 18.92 c/B
CFB enc | 6.25 ns/B 152.7 MiB/s 19.99 c/B
CFB dec | 6.24 ns/B 152.8 MiB/s 19.97 c/B
OFB enc | 6.33 ns/B 150.6 MiB/s 20.27 c/B
OFB dec | 6.33 ns/B 150.7 MiB/s 20.25 c/B
CTR enc | 6.28 ns/B 152.0 MiB/s 20.08 c/B
CTR dec | 6.28 ns/B 151.7 MiB/s 20.11 c/B
New:
AES | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 5.02 ns/B 190.0 MiB/s 16.06 c/B
ECB dec | 5.33 ns/B 178.8 MiB/s 17.07 c/B
CBC enc | 4.64 ns/B 205.4 MiB/s 14.86 c/B
CBC dec | 4.95 ns/B 192.7 MiB/s 15.84 c/B
CFB enc | 4.75 ns/B 200.7 MiB/s 15.20 c/B
CFB dec | 4.74 ns/B 201.1 MiB/s 15.18 c/B
OFB enc | 5.29 ns/B 180.3 MiB/s 16.93 c/B
OFB dec | 5.29 ns/B 180.3 MiB/s 16.93 c/B
CTR enc | 4.77 ns/B 200.0 MiB/s 15.26 c/B
CTR dec | 4.77 ns/B 199.8 MiB/s 15.27 c/B
Benchmark on Cortex-A8 (ARM assembly):
tests/bench-slope --cpu-mhz 1008 cipher aes
Old:
AES | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 21.84 ns/B 43.66 MiB/s 22.02 c/B
ECB dec | 22.35 ns/B 42.67 MiB/s 22.53 c/B
CBC enc | 22.97 ns/B 41.53 MiB/s 23.15 c/B
CBC dec | 23.48 ns/B 40.61 MiB/s 23.67 c/B
CFB enc | 22.72 ns/B 41.97 MiB/s 22.90 c/B
CFB dec | 23.41 ns/B 40.74 MiB/s 23.59 c/B
OFB enc | 23.65 ns/B 40.32 MiB/s 23.84 c/B
OFB dec | 23.67 ns/B 40.29 MiB/s 23.86 c/B
CTR enc | 23.24 ns/B 41.03 MiB/s 23.43 c/B
CTR dec | 23.23 ns/B 41.05 MiB/s 23.42 c/B
New:
AES | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 26.03 ns/B 36.64 MiB/s 26.24 c/B
ECB dec | 26.97 ns/B 35.36 MiB/s 27.18 c/B
CBC enc | 23.21 ns/B 41.09 MiB/s 23.39 c/B
CBC dec | 23.36 ns/B 40.83 MiB/s 23.54 c/B
CFB enc | 23.02 ns/B 41.42 MiB/s 23.21 c/B
CFB dec | 23.67 ns/B 40.28 MiB/s 23.86 c/B
OFB enc | 27.86 ns/B 34.24 MiB/s 28.08 c/B
OFB dec | 27.87 ns/B 34.21 MiB/s 28.10 c/B
CTR enc | 23.47 ns/B 40.63 MiB/s 23.66 c/B
CTR dec | 23.49 ns/B 40.61 MiB/s 23.67 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/rijndael.c')
-rw-r--r-- | cipher/rijndael.c | 645
1 file changed, 385 insertions, 260 deletions
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index aa1681db..5b0fe1c8 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -63,12 +63,14 @@ typedef u32 u32_a_t;
 extern unsigned int _gcry_aes_amd64_encrypt_block(const void *keysched_enc,
                                                   unsigned char *out,
                                                   const unsigned char *in,
-                                                  int rounds);
+                                                  int rounds,
+                                                  const void *encT);
 
 extern unsigned int _gcry_aes_amd64_decrypt_block(const void *keysched_dec,
                                                   unsigned char *out,
                                                   const unsigned char *in,
-                                                  int rounds);
+                                                  int rounds,
+                                                  const void *decT);
 #endif /*USE_AMD64_ASM*/
 
 #ifdef USE_AESNI
@@ -119,12 +121,14 @@ extern unsigned int _gcry_aes_padlock_decrypt (const RIJNDAEL_context *ctx,
 extern unsigned int _gcry_aes_arm_encrypt_block(const void *keysched_enc,
                                                 unsigned char *out,
                                                 const unsigned char *in,
-                                                int rounds);
+                                                int rounds,
+                                                const void *encT);
 
 extern unsigned int _gcry_aes_arm_decrypt_block(const void *keysched_dec,
                                                 unsigned char *out,
                                                 const unsigned char *in,
-                                                int rounds);
+                                                int rounds,
+                                                const void *decT);
 #endif /*USE_ARM_ASM*/
 
 static unsigned int do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx,
@@ -145,6 +149,38 @@ static const char *selftest(void);
 
 
 
+/* Prefetching for encryption/decryption tables. */
+static void prefetch_table(const volatile byte *tab, size_t len)
+{
+  size_t i;
+
+  for (i = 0; i < len; i += 8 * 32)
+    {
+      (void)tab[i + 0 * 32];
+      (void)tab[i + 1 * 32];
+      (void)tab[i + 2 * 32];
+      (void)tab[i + 3 * 32];
+      (void)tab[i + 4 * 32];
+      (void)tab[i + 5 * 32];
+      (void)tab[i + 6 * 32];
+      (void)tab[i + 7 * 32];
+    }
+
+  (void)tab[len - 1];
+}
+
+static void prefetch_enc(void)
+{
+  prefetch_table((const void *)encT, sizeof(encT));
+}
+
+static void prefetch_dec(void)
+{
+  prefetch_table((const void *)&dec_tables, sizeof(dec_tables));
+}
+
+
+
 /* Perform the key setup.  */
 static gcry_err_code_t
 do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
@@ -216,6 +252,8 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
     {
       ctx->encrypt_fn = _gcry_aes_aesni_encrypt;
       ctx->decrypt_fn = _gcry_aes_aesni_decrypt;
+      ctx->prefetch_enc_fn = NULL;
+      ctx->prefetch_dec_fn = NULL;
       ctx->use_aesni = 1;
     }
 #endif
@@ -224,6 +262,8 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
     {
       ctx->encrypt_fn = _gcry_aes_padlock_encrypt;
       ctx->decrypt_fn = _gcry_aes_padlock_decrypt;
+      ctx->prefetch_enc_fn = NULL;
+      ctx->prefetch_dec_fn = NULL;
       ctx->use_padlock = 1;
       memcpy (ctx->padlockkey, key, keylen);
     }
@@ -232,6 +272,8 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
     {
       ctx->encrypt_fn = do_encrypt;
       ctx->decrypt_fn = do_decrypt;
+      ctx->prefetch_enc_fn = prefetch_enc;
+      ctx->prefetch_dec_fn = prefetch_dec;
     }
 
   /* NB: We don't yet support Padlock hardware key generation.  */
@@ -246,14 +288,18 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
 #endif
   else
     {
+      const byte *sbox = ((const byte *)encT) + 1;
       union
         {
          PROPERLY_ALIGNED_TYPE dummy;
          byte data[MAXKC][4];
-        } k, tk;
-#define k k.data
-#define tk tk.data
+        } tkk[2];
+#define k tkk[0].data
+#define tk tkk[1].data
 #define W (ctx->keyschenc)
+
+      prefetch_enc();
+
       for (i = 0; i < keylen; i++)
         {
           k[i >> 2][i & 3] = key[i];
@@ -270,7 +316,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
         {
           for (; (j < KC) && (t < 4); j++, t++)
             {
-              *((u32_a_t*)W[r][t]) = *((u32_a_t*)tk[j]);
+              *((u32_a_t*)W[r][t]) = le_bswap32(*((u32_a_t*)tk[j]));
             }
           if (t == 4)
             {
@@ -283,10 +329,10 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
             {
              /* While not enough round key material calculated
                 calculate new values.  */
-              tk[0][0] ^= S[tk[KC-1][1]];
-              tk[0][1] ^= S[tk[KC-1][2]];
-              tk[0][2] ^= S[tk[KC-1][3]];
-              tk[0][3] ^= S[tk[KC-1][0]];
+              tk[0][0] ^= sbox[tk[KC-1][1] * 4];
+              tk[0][1] ^= sbox[tk[KC-1][2] * 4];
+              tk[0][2] ^= sbox[tk[KC-1][3] * 4];
+              tk[0][3] ^= sbox[tk[KC-1][0] * 4];
               tk[0][0] ^= rcon[rconpointer++];
 
               if (KC != 8)
@@ -302,10 +348,10 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
                    {
                      *((u32_a_t*)tk[j]) ^= *((u32_a_t*)tk[j-1]);
                    }
-                  tk[KC/2][0] ^= S[tk[KC/2 - 1][0]];
-                  tk[KC/2][1] ^= S[tk[KC/2 - 1][1]];
-                  tk[KC/2][2] ^= S[tk[KC/2 - 1][2]];
-                  tk[KC/2][3] ^= S[tk[KC/2 - 1][3]];
+                  tk[KC/2][0] ^= sbox[tk[KC/2 - 1][0] * 4];
+                  tk[KC/2][1] ^= sbox[tk[KC/2 - 1][1] * 4];
+                  tk[KC/2][2] ^= sbox[tk[KC/2 - 1][2] * 4];
+                  tk[KC/2][3] ^= sbox[tk[KC/2 - 1][3] * 4];
                   for (j = KC/2 + 1; j < KC; j++)
                    {
                      *((u32_a_t*)tk[j]) ^= *((u32_a_t*)tk[j-1]);
                    }
@@ -317,7 +363,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
         {
          for (; (j < KC) && (t < 4); j++, t++)
            {
-              *((u32_a_t*)W[r][t]) = *((u32_a_t*)tk[j]);
+              *((u32_a_t*)W[r][t]) = le_bswap32(*((u32_a_t*)tk[j]));
            }
          if (t == 4)
            {
@@ -329,8 +375,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
 #undef W
 #undef tk
 #undef k
-      wipememory(&tk, sizeof(tk));
-      wipememory(&t, sizeof(t));
+      wipememory(&tkk, sizeof(tkk));
     }
 
   return 0;
@@ -367,136 +412,190 @@ prepare_decryption( RIJNDAEL_context *ctx )
 #endif /*USE_PADLOCK*/
   else
     {
-      union
-        {
-          PROPERLY_ALIGNED_TYPE dummy;
-          byte *w;
-        } w;
-#define w w.w
+      const byte *sbox = ((const byte *)encT) + 1;
 
-      for (r=0; r < MAXROUNDS+1; r++ )
-        {
-          *((u32_a_t*)ctx->keyschdec[r][0]) = *((u32_a_t*)ctx->keyschenc[r][0]);
-          *((u32_a_t*)ctx->keyschdec[r][1]) = *((u32_a_t*)ctx->keyschenc[r][1]);
-          *((u32_a_t*)ctx->keyschdec[r][2]) = *((u32_a_t*)ctx->keyschenc[r][2]);
-          *((u32_a_t*)ctx->keyschdec[r][3]) = *((u32_a_t*)ctx->keyschenc[r][3]);
-        }
-#define W (ctx->keyschdec)
-      for (r = 1; r < ctx->rounds; r++)
-        {
-          w = W[r][0];
-          *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]])
-                         ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]);
+      prefetch_enc();
+      prefetch_dec();
 
-          w = W[r][1];
-          *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]])
-                         ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]);
+      *((u32_a_t*)ctx->keyschdec[0][0]) = *((u32_a_t*)ctx->keyschenc[0][0]);
+      *((u32_a_t*)ctx->keyschdec[0][1]) = *((u32_a_t*)ctx->keyschenc[0][1]);
+      *((u32_a_t*)ctx->keyschdec[0][2]) = *((u32_a_t*)ctx->keyschenc[0][2]);
+      *((u32_a_t*)ctx->keyschdec[0][3]) = *((u32_a_t*)ctx->keyschenc[0][3]);
 
-          w = W[r][2];
-          *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]])
-                         ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]);
-
-          w = W[r][3];
-          *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]])
-                         ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]);
+      for (r = 1; r < ctx->rounds; r++)
+        {
+          u32_a_t *wi = (u32_a_t*)((ctx->keyschenc)[r]);
+          u32_a_t *wo = (u32_a_t*)((ctx->keyschdec)[r]);
+          u32 wt;
+
+          wt = wi[0];
+          wo[0] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0)
+                ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1)
+                ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2)
+                ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3);
+
+          wt = wi[1];
+          wo[1] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0)
+                ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1)
+                ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2)
+                ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3);
+
+          wt = wi[2];
+          wo[2] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0)
+                ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1)
+                ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2)
+                ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3);
+
+          wt = wi[3];
+          wo[3] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0)
+                ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1)
+                ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2)
+                ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3);
         }
-#undef W
-#undef w
-      wipememory(&w, sizeof(w));
+
+      *((u32_a_t*)ctx->keyschdec[r][0]) = *((u32_a_t*)ctx->keyschenc[r][0]);
+      *((u32_a_t*)ctx->keyschdec[r][1]) = *((u32_a_t*)ctx->keyschenc[r][1]);
+      *((u32_a_t*)ctx->keyschdec[r][2]) = *((u32_a_t*)ctx->keyschenc[r][2]);
+      *((u32_a_t*)ctx->keyschdec[r][3]) = *((u32_a_t*)ctx->keyschenc[r][3]);
     }
 }
 
 
-#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM)
-/* Encrypt one block.  A and B need to be aligned on a 4 byte
-   boundary.  A and B may be the same. */
-static void
-do_encrypt_aligned (const RIJNDAEL_context *ctx,
-                    unsigned char *b, const unsigned char *a)
+#if !defined(USE_ARM_ASM) && !defined(USE_AMD64_ASM)
+/* Encrypt one block.  A and B may be the same. */
+static unsigned int
+do_encrypt_fn (const RIJNDAEL_context *ctx, unsigned char *b,
+               const unsigned char *a)
 {
 #define rk (ctx->keyschenc)
+  const byte *sbox = ((const byte *)encT) + 1;
   int rounds = ctx->rounds;
   int r;
-  union
-  {
-    u32 tempu32[4];  /* Force correct alignment. */
-    byte temp[4][4];
-  } u;
-
-  *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(a   )) ^ *((u32_a_t*)rk[0][0]);
-  *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(a+ 4)) ^ *((u32_a_t*)rk[0][1]);
-  *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(a+ 8)) ^ *((u32_a_t*)rk[0][2]);
-  *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(a+12)) ^ *((u32_a_t*)rk[0][3]);
-  *((u32_a_t*)(b    )) = (*((u32_a_t*)T1[u.temp[0][0]])
-                          ^ *((u32_a_t*)T2[u.temp[1][1]])
-                          ^ *((u32_a_t*)T3[u.temp[2][2]])
-                          ^ *((u32_a_t*)T4[u.temp[3][3]]));
-  *((u32_a_t*)(b + 4)) = (*((u32_a_t*)T1[u.temp[1][0]])
-                          ^ *((u32_a_t*)T2[u.temp[2][1]])
-                          ^ *((u32_a_t*)T3[u.temp[3][2]])
-                          ^ *((u32_a_t*)T4[u.temp[0][3]]));
-  *((u32_a_t*)(b + 8)) = (*((u32_a_t*)T1[u.temp[2][0]])
-                          ^ *((u32_a_t*)T2[u.temp[3][1]])
-                          ^ *((u32_a_t*)T3[u.temp[0][2]])
-                          ^ *((u32_a_t*)T4[u.temp[1][3]]));
-  *((u32_a_t*)(b +12)) = (*((u32_a_t*)T1[u.temp[3][0]])
-                          ^ *((u32_a_t*)T2[u.temp[0][1]])
-                          ^ *((u32_a_t*)T3[u.temp[1][2]])
-                          ^ *((u32_a_t*)T4[u.temp[2][3]]));
-
-  for (r = 1; r < rounds-1; r++)
+  u32 sa[4];
+  u32 sb[4];
+
+  sb[0] = buf_get_le32(a + 0);
+  sb[1] = buf_get_le32(a + 4);
+  sb[2] = buf_get_le32(a + 8);
+  sb[3] = buf_get_le32(a + 12);
+
+  sa[0] = sb[0] ^ *((u32_a_t*)rk[0][0]);
+  sa[1] = sb[1] ^ *((u32_a_t*)rk[0][1]);
+  sa[2] = sb[2] ^ *((u32_a_t*)rk[0][2]);
+  sa[3] = sb[3] ^ *((u32_a_t*)rk[0][3]);
+
+  sb[0] = rol(encT[(byte)(sa[0] >> (0 * 8))], (0 * 8));
+  sb[3] = rol(encT[(byte)(sa[0] >> (1 * 8))], (1 * 8));
+  sb[2] = rol(encT[(byte)(sa[0] >> (2 * 8))], (2 * 8));
+  sb[1] = rol(encT[(byte)(sa[0] >> (3 * 8))], (3 * 8));
+  sa[0] = *((u32_a_t*)rk[1][0]) ^ sb[0];
+
+  sb[1] ^= rol(encT[(byte)(sa[1] >> (0 * 8))], (0 * 8));
+  sa[0] ^= rol(encT[(byte)(sa[1] >> (1 * 8))], (1 * 8));
+  sb[3] ^= rol(encT[(byte)(sa[1] >> (2 * 8))], (2 * 8));
+  sb[2] ^= rol(encT[(byte)(sa[1] >> (3 * 8))], (3 * 8));
+  sa[1] = *((u32_a_t*)rk[1][1]) ^ sb[1];
+
+  sb[2] ^= rol(encT[(byte)(sa[2] >> (0 * 8))], (0 * 8));
+  sa[1] ^= rol(encT[(byte)(sa[2] >> (1 * 8))], (1 * 8));
+  sa[0] ^= rol(encT[(byte)(sa[2] >> (2 * 8))], (2 * 8));
+  sb[3] ^= rol(encT[(byte)(sa[2] >> (3 * 8))], (3 * 8));
+  sa[2] = *((u32_a_t*)rk[1][2]) ^ sb[2];
+
+  sb[3] ^= rol(encT[(byte)(sa[3] >> (0 * 8))], (0 * 8));
+  sa[2] ^= rol(encT[(byte)(sa[3] >> (1 * 8))], (1 * 8));
+  sa[1] ^= rol(encT[(byte)(sa[3] >> (2 * 8))], (2 * 8));
+  sa[0] ^= rol(encT[(byte)(sa[3] >> (3 * 8))], (3 * 8));
+  sa[3] = *((u32_a_t*)rk[1][3]) ^ sb[3];
+
+  for (r = 2; r < rounds; r++)
     {
-      *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b   )) ^ *((u32_a_t*)rk[r][0]);
-      *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[r][1]);
-      *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[r][2]);
-      *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[r][3]);
-
-      *((u32_a_t*)(b    )) = (*((u32_a_t*)T1[u.temp[0][0]])
-                              ^ *((u32_a_t*)T2[u.temp[1][1]])
-                              ^ *((u32_a_t*)T3[u.temp[2][2]])
-                              ^ *((u32_a_t*)T4[u.temp[3][3]]));
-      *((u32_a_t*)(b + 4)) = (*((u32_a_t*)T1[u.temp[1][0]])
-                              ^ *((u32_a_t*)T2[u.temp[2][1]])
-                              ^ *((u32_a_t*)T3[u.temp[3][2]])
-                              ^ *((u32_a_t*)T4[u.temp[0][3]]));
-      *((u32_a_t*)(b + 8)) = (*((u32_a_t*)T1[u.temp[2][0]])
-                              ^ *((u32_a_t*)T2[u.temp[3][1]])
-                              ^ *((u32_a_t*)T3[u.temp[0][2]])
-                              ^ *((u32_a_t*)T4[u.temp[1][3]]));
-      *((u32_a_t*)(b +12)) = (*((u32_a_t*)T1[u.temp[3][0]])
-                              ^ *((u32_a_t*)T2[u.temp[0][1]])
-                              ^ *((u32_a_t*)T3[u.temp[1][2]])
-                              ^ *((u32_a_t*)T4[u.temp[2][3]]));
+      sb[0] = rol(encT[(byte)(sa[0] >> (0 * 8))], (0 * 8));
+      sb[3] = rol(encT[(byte)(sa[0] >> (1 * 8))], (1 * 8));
+      sb[2] = rol(encT[(byte)(sa[0] >> (2 * 8))], (2 * 8));
+      sb[1] = rol(encT[(byte)(sa[0] >> (3 * 8))], (3 * 8));
+      sa[0] = *((u32_a_t*)rk[r][0]) ^ sb[0];
+
+      sb[1] ^= rol(encT[(byte)(sa[1] >> (0 * 8))], (0 * 8));
+      sa[0] ^= rol(encT[(byte)(sa[1] >> (1 * 8))], (1 * 8));
+      sb[3] ^= rol(encT[(byte)(sa[1] >> (2 * 8))], (2 * 8));
+      sb[2] ^= rol(encT[(byte)(sa[1] >> (3 * 8))], (3 * 8));
+      sa[1] = *((u32_a_t*)rk[r][1]) ^ sb[1];
+
+      sb[2] ^= rol(encT[(byte)(sa[2] >> (0 * 8))], (0 * 8));
+      sa[1] ^= rol(encT[(byte)(sa[2] >> (1 * 8))], (1 * 8));
+      sa[0] ^= rol(encT[(byte)(sa[2] >> (2 * 8))], (2 * 8));
+      sb[3] ^= rol(encT[(byte)(sa[2] >> (3 * 8))], (3 * 8));
+      sa[2] = *((u32_a_t*)rk[r][2]) ^ sb[2];
+
+      sb[3] ^= rol(encT[(byte)(sa[3] >> (0 * 8))], (0 * 8));
+      sa[2] ^= rol(encT[(byte)(sa[3] >> (1 * 8))], (1 * 8));
+      sa[1] ^= rol(encT[(byte)(sa[3] >> (2 * 8))], (2 * 8));
+      sa[0] ^= rol(encT[(byte)(sa[3] >> (3 * 8))], (3 * 8));
+      sa[3] = *((u32_a_t*)rk[r][3]) ^ sb[3];
+
+      r++;
+
+      sb[0] = rol(encT[(byte)(sa[0] >> (0 * 8))], (0 * 8));
+      sb[3] = rol(encT[(byte)(sa[0] >> (1 * 8))], (1 * 8));
+      sb[2] = rol(encT[(byte)(sa[0] >> (2 * 8))], (2 * 8));
+      sb[1] = rol(encT[(byte)(sa[0] >> (3 * 8))], (3 * 8));
+      sa[0] = *((u32_a_t*)rk[r][0]) ^ sb[0];
+
+      sb[1] ^= rol(encT[(byte)(sa[1] >> (0 * 8))], (0 * 8));
+      sa[0] ^= rol(encT[(byte)(sa[1] >> (1 * 8))], (1 * 8));
+      sb[3] ^= rol(encT[(byte)(sa[1] >> (2 * 8))], (2 * 8));
+      sb[2] ^= rol(encT[(byte)(sa[1] >> (3 * 8))], (3 * 8));
+      sa[1] = *((u32_a_t*)rk[r][1]) ^ sb[1];
+
+      sb[2] ^= rol(encT[(byte)(sa[2] >> (0 * 8))], (0 * 8));
+      sa[1] ^= rol(encT[(byte)(sa[2] >> (1 * 8))], (1 * 8));
+      sa[0] ^= rol(encT[(byte)(sa[2] >> (2 * 8))], (2 * 8));
+      sb[3] ^= rol(encT[(byte)(sa[2] >> (3 * 8))], (3 * 8));
+      sa[2] = *((u32_a_t*)rk[r][2]) ^ sb[2];
+
+      sb[3] ^= rol(encT[(byte)(sa[3] >> (0 * 8))], (0 * 8));
+      sa[2] ^= rol(encT[(byte)(sa[3] >> (1 * 8))], (1 * 8));
+      sa[1] ^= rol(encT[(byte)(sa[3] >> (2 * 8))], (2 * 8));
+      sa[0] ^= rol(encT[(byte)(sa[3] >> (3 * 8))], (3 * 8));
+      sa[3] = *((u32_a_t*)rk[r][3]) ^ sb[3];
     }
 
   /* Last round is special. */
-  *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b   )) ^ *((u32_a_t*)rk[rounds-1][0]);
-  *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[rounds-1][1]);
-  *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[rounds-1][2]);
-  *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[rounds-1][3]);
-  b[ 0] = T1[u.temp[0][0]][1];
-  b[ 1] = T1[u.temp[1][1]][1];
-  b[ 2] = T1[u.temp[2][2]][1];
-  b[ 3] = T1[u.temp[3][3]][1];
-  b[ 4] = T1[u.temp[1][0]][1];
-  b[ 5] = T1[u.temp[2][1]][1];
-  b[ 6] = T1[u.temp[3][2]][1];
-  b[ 7] = T1[u.temp[0][3]][1];
-  b[ 8] = T1[u.temp[2][0]][1];
-  b[ 9] = T1[u.temp[3][1]][1];
-  b[10] = T1[u.temp[0][2]][1];
-  b[11] = T1[u.temp[1][3]][1];
-  b[12] = T1[u.temp[3][0]][1];
-  b[13] = T1[u.temp[0][1]][1];
-  b[14] = T1[u.temp[1][2]][1];
-  b[15] = T1[u.temp[2][3]][1];
-  *((u32_a_t*)(b   )) ^= *((u32_a_t*)rk[rounds][0]);
-  *((u32_a_t*)(b+ 4)) ^= *((u32_a_t*)rk[rounds][1]);
-  *((u32_a_t*)(b+ 8)) ^= *((u32_a_t*)rk[rounds][2]);
-  *((u32_a_t*)(b+12)) ^= *((u32_a_t*)rk[rounds][3]);
+
+  sb[0] = (sbox[(byte)(sa[0] >> (0 * 8)) * 4]) << (0 * 8);
+  sb[3] = (sbox[(byte)(sa[0] >> (1 * 8)) * 4]) << (1 * 8);
+  sb[2] = (sbox[(byte)(sa[0] >> (2 * 8)) * 4]) << (2 * 8);
+  sb[1] = (sbox[(byte)(sa[0] >> (3 * 8)) * 4]) << (3 * 8);
+  sa[0] = *((u32_a_t*)rk[r][0]) ^ sb[0];
+
+  sb[1] ^= (sbox[(byte)(sa[1] >> (0 * 8)) * 4]) << (0 * 8);
+  sa[0] ^= (sbox[(byte)(sa[1] >> (1 * 8)) * 4]) << (1 * 8);
+  sb[3] ^= (sbox[(byte)(sa[1] >> (2 * 8)) * 4]) << (2 * 8);
+  sb[2] ^= (sbox[(byte)(sa[1] >> (3 * 8)) * 4]) << (3 * 8);
+  sa[1] = *((u32_a_t*)rk[r][1]) ^ sb[1];
+
+  sb[2] ^= (sbox[(byte)(sa[2] >> (0 * 8)) * 4]) << (0 * 8);
+  sa[1] ^= (sbox[(byte)(sa[2] >> (1 * 8)) * 4]) << (1 * 8);
+  sa[0] ^= (sbox[(byte)(sa[2] >> (2 * 8)) * 4]) << (2 * 8);
+  sb[3] ^= (sbox[(byte)(sa[2] >> (3 * 8)) * 4]) << (3 * 8);
+  sa[2] = *((u32_a_t*)rk[r][2]) ^ sb[2];
+
+  sb[3] ^= (sbox[(byte)(sa[3] >> (0 * 8)) * 4]) << (0 * 8);
+  sa[2] ^= (sbox[(byte)(sa[3] >> (1 * 8)) * 4]) << (1 * 8);
+  sa[1] ^= (sbox[(byte)(sa[3] >> (2 * 8)) * 4]) << (2 * 8);
+  sa[0] ^= (sbox[(byte)(sa[3] >> (3 * 8)) * 4]) << (3 * 8);
+  sa[3] = *((u32_a_t*)rk[r][3]) ^ sb[3];
+
+  buf_put_le32(b + 0, sa[0]);
+  buf_put_le32(b + 4, sa[1]);
+  buf_put_le32(b + 8, sa[2]);
+  buf_put_le32(b + 12, sa[3]);
 #undef rk
+
+  return (56 + 2*sizeof(int));
 }
-#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
+#endif /*!USE_ARM_ASM && !USE_AMD64_ASM*/
 
 
 static unsigned int
@@ -504,31 +603,13 @@ do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx,
             const unsigned char *ax)
 {
 #ifdef USE_AMD64_ASM
-  return _gcry_aes_amd64_encrypt_block(ctx->keyschenc, bx, ax, ctx->rounds);
+  return _gcry_aes_amd64_encrypt_block(ctx->keyschenc, bx, ax, ctx->rounds,
+                                       encT);
 #elif defined(USE_ARM_ASM)
-  return _gcry_aes_arm_encrypt_block(ctx->keyschenc, bx, ax, ctx->rounds);
+  return _gcry_aes_arm_encrypt_block(ctx->keyschenc, bx, ax, ctx->rounds, encT);
 #else
-  /* BX and AX are not necessary correctly aligned.  Thus we might
-     need to copy them here.  We try to align to a 16 bytes. */
-  if (((size_t)ax & 0x0f) || ((size_t)bx & 0x0f))
-    {
-      union
-      {
-        u32 dummy[4];
-        byte a[16] ATTR_ALIGNED_16;
-      } a;
-
-      buf_cpy (a.a, ax, 16);
-      do_encrypt_aligned (ctx, a.a, a.a);
-      buf_cpy (bx, a.a, 16);
-    }
-  else
-    {
-      do_encrypt_aligned (ctx, bx, ax);
-    }
-
-  return (56 + 2*sizeof(int));
-#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
+  return do_encrypt_fn (ctx, bx, ax);
+#endif /* !USE_ARM_ASM && !USE_AMD64_ASM*/
 }
 
 
@@ -537,6 +618,9 @@ rijndael_encrypt (void *context, byte *b, const byte *a)
 {
   RIJNDAEL_context *ctx = context;
 
+  if (ctx->prefetch_enc_fn)
+    ctx->prefetch_enc_fn();
+
   return ctx->encrypt_fn (ctx, b, a);
 }
 
@@ -555,6 +639,9 @@ _gcry_aes_cfb_enc (void *context, unsigned char *iv,
   const unsigned char *inbuf = inbuf_arg;
   unsigned int burn_depth = 0;
 
+  if (ctx->prefetch_enc_fn)
+    ctx->prefetch_enc_fn();
+
   if (0)
     ;
 #ifdef USE_AESNI
@@ -599,6 +686,9 @@ _gcry_aes_cbc_enc (void *context, unsigned char *iv,
   unsigned char *last_iv;
   unsigned int burn_depth = 0;
 
+  if (ctx->prefetch_enc_fn)
+    ctx->prefetch_enc_fn();
+
   if (0)
     ;
 #ifdef USE_AESNI
@@ -651,6 +741,9 @@ _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
   unsigned int burn_depth = 0;
   int i;
 
+  if (ctx->prefetch_enc_fn)
+    ctx->prefetch_enc_fn();
+
   if (0)
     ;
 #ifdef USE_AESNI
@@ -691,98 +784,139 @@ _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
 
 
-#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM)
-/* Decrypt one block.  A and B need to be aligned on a 4 byte boundary
-   and the decryption must have been prepared.  A and B may be the
-   same. */
-static void
-do_decrypt_aligned (const RIJNDAEL_context *ctx,
-                    unsigned char *b, const unsigned char *a)
+#if !defined(USE_ARM_ASM) && !defined(USE_AMD64_ASM)
+/* Decrypt one block.  A and B may be the same. */
+static unsigned int
+do_decrypt_fn (const RIJNDAEL_context *ctx, unsigned char *b,
+               const unsigned char *a)
 {
 #define rk (ctx->keyschdec)
   int rounds = ctx->rounds;
   int r;
-  union
-  {
-    u32 tempu32[4];  /* Force correct alignment. */
-    byte temp[4][4];
-  } u;
-
-
-  *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(a   )) ^ *((u32_a_t*)rk[rounds][0]);
-  *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(a+ 4)) ^ *((u32_a_t*)rk[rounds][1]);
-  *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(a+ 8)) ^ *((u32_a_t*)rk[rounds][2]);
-  *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(a+12)) ^ *((u32_a_t*)rk[rounds][3]);
-
-  *((u32_a_t*)(b   )) = (*((u32_a_t*)T5[u.temp[0][0]])
-                         ^ *((u32_a_t*)T6[u.temp[3][1]])
-                         ^ *((u32_a_t*)T7[u.temp[2][2]])
-                         ^ *((u32_a_t*)T8[u.temp[1][3]]));
-  *((u32_a_t*)(b+ 4)) = (*((u32_a_t*)T5[u.temp[1][0]])
-                         ^ *((u32_a_t*)T6[u.temp[0][1]])
-                         ^ *((u32_a_t*)T7[u.temp[3][2]])
-                         ^ *((u32_a_t*)T8[u.temp[2][3]]));
-  *((u32_a_t*)(b+ 8)) = (*((u32_a_t*)T5[u.temp[2][0]])
-                         ^ *((u32_a_t*)T6[u.temp[1][1]])
-                         ^ *((u32_a_t*)T7[u.temp[0][2]])
-                         ^ *((u32_a_t*)T8[u.temp[3][3]]));
-  *((u32_a_t*)(b+12)) = (*((u32_a_t*)T5[u.temp[3][0]])
-                         ^ *((u32_a_t*)T6[u.temp[2][1]])
-                         ^ *((u32_a_t*)T7[u.temp[1][2]])
-                         ^ *((u32_a_t*)T8[u.temp[0][3]]));
-
-  for (r = rounds-1; r > 1; r--)
+  u32 sa[4];
+  u32 sb[4];
+
+  sb[0] = buf_get_le32(a + 0);
+  sb[1] = buf_get_le32(a + 4);
+  sb[2] = buf_get_le32(a + 8);
+  sb[3] = buf_get_le32(a + 12);
+
+  sa[0] = sb[0] ^ *((u32_a_t*)rk[rounds][0]);
+  sa[1] = sb[1] ^ *((u32_a_t*)rk[rounds][1]);
+  sa[2] = sb[2] ^ *((u32_a_t*)rk[rounds][2]);
+  sa[3] = sb[3] ^ *((u32_a_t*)rk[rounds][3]);
+
+  for (r = rounds - 1; r > 1; r--)
     {
-      *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b   )) ^ *((u32_a_t*)rk[r][0]);
-      *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[r][1]);
-      *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[r][2]);
-      *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[r][3]);
-      *((u32_a_t*)(b   )) = (*((u32_a_t*)T5[u.temp[0][0]])
-                             ^ *((u32_a_t*)T6[u.temp[3][1]])
-                             ^ *((u32_a_t*)T7[u.temp[2][2]])
-                             ^ *((u32_a_t*)T8[u.temp[1][3]]));
-      *((u32_a_t*)(b+ 4)) = (*((u32_a_t*)T5[u.temp[1][0]])
-                             ^ *((u32_a_t*)T6[u.temp[0][1]])
-                             ^ *((u32_a_t*)T7[u.temp[3][2]])
-                             ^ *((u32_a_t*)T8[u.temp[2][3]]));
-      *((u32_a_t*)(b+ 8)) = (*((u32_a_t*)T5[u.temp[2][0]])
-                             ^ *((u32_a_t*)T6[u.temp[1][1]])
-                             ^ *((u32_a_t*)T7[u.temp[0][2]])
-                             ^ *((u32_a_t*)T8[u.temp[3][3]]));
-      *((u32_a_t*)(b+12)) = (*((u32_a_t*)T5[u.temp[3][0]])
-                             ^ *((u32_a_t*)T6[u.temp[2][1]])
-                             ^ *((u32_a_t*)T7[u.temp[1][2]])
-                             ^ *((u32_a_t*)T8[u.temp[0][3]]));
+      sb[0] = rol(decT[(byte)(sa[0] >> (0 * 8))], (0 * 8));
+      sb[1] = rol(decT[(byte)(sa[0] >> (1 * 8))], (1 * 8));
+      sb[2] = rol(decT[(byte)(sa[0] >> (2 * 8))], (2 * 8));
+      sb[3] = rol(decT[(byte)(sa[0] >> (3 * 8))], (3 * 8));
+      sa[0] = *((u32_a_t*)rk[r][0]) ^ sb[0];
+
+      sb[1] ^= rol(decT[(byte)(sa[1] >> (0 * 8))], (0 * 8));
+      sb[2] ^= rol(decT[(byte)(sa[1] >> (1 * 8))], (1 * 8));
+      sb[3] ^= rol(decT[(byte)(sa[1] >> (2 * 8))], (2 * 8));
+      sa[0] ^= rol(decT[(byte)(sa[1] >> (3 * 8))], (3 * 8));
+      sa[1] = *((u32_a_t*)rk[r][1]) ^ sb[1];
+
+      sb[2] ^= rol(decT[(byte)(sa[2] >> (0 * 8))], (0 * 8));
+      sb[3] ^= rol(decT[(byte)(sa[2] >> (1 * 8))], (1 * 8));
+      sa[0] ^= rol(decT[(byte)(sa[2] >> (2 * 8))], (2 * 8));
+      sa[1] ^= rol(decT[(byte)(sa[2] >> (3 * 8))], (3 * 8));
+      sa[2] = *((u32_a_t*)rk[r][2]) ^ sb[2];
+
+      sb[3] ^= rol(decT[(byte)(sa[3] >> (0 * 8))], (0 * 8));
+      sa[0] ^= rol(decT[(byte)(sa[3] >> (1 * 8))], (1 * 8));
+      sa[1] ^= rol(decT[(byte)(sa[3] >> (2 * 8))], (2 * 8));
+      sa[2] ^= rol(decT[(byte)(sa[3] >> (3 * 8))], (3 * 8));
+      sa[3] = *((u32_a_t*)rk[r][3]) ^ sb[3];
+
+      r--;
+
+      sb[0] = rol(decT[(byte)(sa[0] >> (0 * 8))], (0 * 8));
+      sb[1] = rol(decT[(byte)(sa[0] >> (1 * 8))], (1 * 8));
+      sb[2] = rol(decT[(byte)(sa[0] >> (2 * 8))], (2 * 8));
+      sb[3] = rol(decT[(byte)(sa[0] >> (3 * 8))], (3 * 8));
+      sa[0] = *((u32_a_t*)rk[r][0]) ^ sb[0];
+
+      sb[1] ^= rol(decT[(byte)(sa[1] >> (0 * 8))], (0 * 8));
+      sb[2] ^= rol(decT[(byte)(sa[1] >> (1 * 8))], (1 * 8));
+      sb[3] ^= rol(decT[(byte)(sa[1] >> (2 * 8))], (2 * 8));
+      sa[0] ^= rol(decT[(byte)(sa[1] >> (3 * 8))], (3 * 8));
+      sa[1] = *((u32_a_t*)rk[r][1]) ^ sb[1];
+
+      sb[2] ^= rol(decT[(byte)(sa[2] >> (0 * 8))], (0 * 8));
+      sb[3] ^= rol(decT[(byte)(sa[2] >> (1 * 8))], (1 * 8));
+      sa[0] ^= rol(decT[(byte)(sa[2] >> (2 * 8))], (2 * 8));
+      sa[1] ^= rol(decT[(byte)(sa[2] >> (3 * 8))], (3 * 8));
+      sa[2] = *((u32_a_t*)rk[r][2]) ^ sb[2];
+
+      sb[3] ^= rol(decT[(byte)(sa[3] >> (0 * 8))], (0 * 8));
+      sa[0] ^= rol(decT[(byte)(sa[3] >> (1 * 8))], (1 * 8));
+      sa[1] ^= rol(decT[(byte)(sa[3] >> (2 * 8))], (2 * 8));
+      sa[2] ^= rol(decT[(byte)(sa[3] >> (3 * 8))], (3 * 8));
+      sa[3] = *((u32_a_t*)rk[r][3]) ^ sb[3];
     }
 
+  sb[0] = rol(decT[(byte)(sa[0] >> (0 * 8))], (0 * 8));
+  sb[1] = rol(decT[(byte)(sa[0] >> (1 * 8))], (1 * 8));
+  sb[2] = rol(decT[(byte)(sa[0] >> (2 * 8))], (2 * 8));
+  sb[3] = rol(decT[(byte)(sa[0] >> (3 * 8))], (3 * 8));
+  sa[0] = *((u32_a_t*)rk[1][0]) ^ sb[0];
+
+  sb[1] ^= rol(decT[(byte)(sa[1] >> (0 * 8))], (0 * 8));
+  sb[2] ^= rol(decT[(byte)(sa[1] >> (1 * 8))], (1 * 8));
+  sb[3] ^= rol(decT[(byte)(sa[1] >> (2 * 8))], (2 * 8));
+  sa[0] ^= rol(decT[(byte)(sa[1] >> (3 * 8))], (3 * 8));
+  sa[1] = *((u32_a_t*)rk[1][1]) ^ sb[1];
+
+  sb[2] ^= rol(decT[(byte)(sa[2] >> (0 * 8))], (0 * 8));
+  sb[3] ^= rol(decT[(byte)(sa[2] >> (1 * 8))], (1 * 8));
+  sa[0] ^= rol(decT[(byte)(sa[2] >> (2 * 8))], (2 * 8));
+  sa[1] ^= rol(decT[(byte)(sa[2] >> (3 * 8))], (3 * 8));
+  sa[2] = *((u32_a_t*)rk[1][2]) ^ sb[2];
+
+  sb[3] ^= rol(decT[(byte)(sa[3] >> (0 * 8))], (0 * 8));
+  sa[0] ^= rol(decT[(byte)(sa[3] >> (1 * 8))], (1 * 8));
+  sa[1] ^= rol(decT[(byte)(sa[3] >> (2 * 8))], (2 * 8));
+  sa[2] ^= rol(decT[(byte)(sa[3] >> (3 * 8))], (3 * 8));
+  sa[3] = *((u32_a_t*)rk[1][3]) ^ sb[3];
+
   /* Last round is special. */
-  *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b   )) ^ *((u32_a_t*)rk[1][0]);
-  *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[1][1]);
-  *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[1][2]);
-  *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[1][3]);
-  b[ 0] = S5[u.temp[0][0]];
-  b[ 1] = S5[u.temp[3][1]];
-  b[ 2] = S5[u.temp[2][2]];
-  b[ 3] = S5[u.temp[1][3]];
-  b[ 4] = S5[u.temp[1][0]];
-  b[ 5] = S5[u.temp[0][1]];
-  b[ 6] = S5[u.temp[3][2]];
-  b[ 7] = S5[u.temp[2][3]];
-  b[ 8] = S5[u.temp[2][0]];
-  b[ 9] = S5[u.temp[1][1]];
-  b[10] = S5[u.temp[0][2]];
-  b[11] = S5[u.temp[3][3]];
-  b[12] = S5[u.temp[3][0]];
-  b[13] = S5[u.temp[2][1]];
-  b[14] = S5[u.temp[1][2]];
-  b[15] = S5[u.temp[0][3]];
-  *((u32_a_t*)(b   )) ^= *((u32_a_t*)rk[0][0]);
-  *((u32_a_t*)(b+ 4)) ^= *((u32_a_t*)rk[0][1]);
-  *((u32_a_t*)(b+ 8)) ^= *((u32_a_t*)rk[0][2]);
-  *((u32_a_t*)(b+12)) ^= *((u32_a_t*)rk[0][3]);
+
+  sb[0] = inv_sbox[(byte)(sa[0] >> (0 * 8))] << (0 * 8);
+  sb[1] = inv_sbox[(byte)(sa[0] >> (1 * 8))] << (1 * 8);
+  sb[2] = inv_sbox[(byte)(sa[0] >> (2 * 8))] << (2 * 8);
+  sb[3] = inv_sbox[(byte)(sa[0] >> (3 * 8))] << (3 * 8);
+  sa[0] = sb[0] ^ *((u32_a_t*)rk[0][0]);
+
+  sb[1] ^= inv_sbox[(byte)(sa[1] >> (0 * 8))] << (0 * 8);
+  sb[2] ^= inv_sbox[(byte)(sa[1] >> (1 * 8))] << (1 * 8);
+  sb[3] ^= inv_sbox[(byte)(sa[1] >> (2 * 8))] << (2 * 8);
+  sa[0] ^= inv_sbox[(byte)(sa[1] >> (3 * 8))] << (3 * 8);
+  sa[1] = sb[1] ^ *((u32_a_t*)rk[0][1]);
+
+  sb[2] ^= inv_sbox[(byte)(sa[2] >> (0 * 8))] << (0 * 8);
+  sb[3] ^= inv_sbox[(byte)(sa[2] >> (1 * 8))] << (1 * 8);
+  sa[0] ^= inv_sbox[(byte)(sa[2] >> (2 * 8))] << (2 * 8);
+  sa[1] ^= inv_sbox[(byte)(sa[2] >> (3 * 8))] << (3 * 8);
+  sa[2] = sb[2] ^ *((u32_a_t*)rk[0][2]);
+
+  sb[3] ^= inv_sbox[(byte)(sa[3] >> (0 * 8))] << (0 * 8);
+  sa[0] ^= inv_sbox[(byte)(sa[3] >> (1 * 8))] << (1 * 8);
+  sa[1] ^= inv_sbox[(byte)(sa[3] >> (2 * 8))] << (2 * 8);
+  sa[2] ^= inv_sbox[(byte)(sa[3] >> (3 * 8))] << (3 * 8);
+  sa[3] = sb[3] ^ *((u32_a_t*)rk[0][3]);
+
+  buf_put_le32(b + 0, sa[0]);
+  buf_put_le32(b + 4, sa[1]);
+  buf_put_le32(b + 8, sa[2]);
+  buf_put_le32(b + 12, sa[3]);
 #undef rk
+
+  return (56+2*sizeof(int));
 }
-#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
+#endif /*!USE_ARM_ASM && !USE_AMD64_ASM*/
 
 
 /* Decrypt one block.  AX and BX may be the same. */
@@ -791,31 +925,14 @@ do_decrypt (const RIJNDAEL_context *ctx, unsigned char *bx,
             const unsigned char *ax)
 {
 #ifdef USE_AMD64_ASM
-  return _gcry_aes_amd64_decrypt_block(ctx->keyschdec, bx, ax, ctx->rounds);
+  return _gcry_aes_amd64_decrypt_block(ctx->keyschdec, bx, ax, ctx->rounds,
+                                       &dec_tables);
 #elif defined(USE_ARM_ASM)
-  return _gcry_aes_arm_decrypt_block(ctx->keyschdec, bx, ax, ctx->rounds);
+  return _gcry_aes_arm_decrypt_block(ctx->keyschdec, bx, ax, ctx->rounds,
+                                     &dec_tables);
 #else
-  /* BX and AX are not necessary correctly aligned.  Thus we might
-     need to copy them here.  We try to align to a 16 bytes. */
-  if (((size_t)ax & 0x0f) || ((size_t)bx & 0x0f))
-    {
-      union
-      {
-        u32 dummy[4];
-        byte a[16] ATTR_ALIGNED_16;
-      } a;
-
-      buf_cpy (a.a, ax, 16);
-      do_decrypt_aligned (ctx, a.a, a.a);
-      buf_cpy (bx, a.a, 16);
-    }
-  else
-    {
-      do_decrypt_aligned (ctx, bx, ax);
-    }
-
-  return (56+2*sizeof(int));
-#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
+  return do_decrypt_fn (ctx, bx, ax);
+#endif /*!USE_ARM_ASM && !USE_AMD64_ASM*/
 }
 
 
@@ -837,6 +954,9 @@ rijndael_decrypt (void *context, byte *b, const byte *a)
 
   check_decryption_preparation (ctx);
 
+  if (ctx->prefetch_dec_fn)
+    ctx->prefetch_dec_fn();
+
   return ctx->decrypt_fn (ctx, b, a);
 }
 
@@ -855,6 +975,9 @@ _gcry_aes_cfb_dec (void *context, unsigned char *iv,
   const unsigned char *inbuf = inbuf_arg;
   unsigned int burn_depth = 0;
 
+  if (ctx->prefetch_enc_fn)
+    ctx->prefetch_enc_fn();
+
   if (0)
     ;
 #ifdef USE_AESNI
@@ -898,6 +1021,9 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
 
   check_decryption_preparation (ctx);
 
+  if (ctx->prefetch_dec_fn)
+    ctx->prefetch_dec_fn();
+
   if (0)
     ;
 #ifdef USE_AESNI
@@ -932,7 +1058,6 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
 }
 
 
-
 /* Run the self-tests for AES 128.  Returns NULL on success. */
 static const char*