From 947b554d85012cf35185ded38ef3484de010d2cf Mon Sep 17 00:00:00 2001 From: Legrandin Date: Sun, 23 Mar 2014 18:46:55 +0100 Subject: Make GHASH more robust against timing attacks. In order to speed up as much as possible the GHASH, the current implementation expands the 16 byte hash key (H) into a table of 64 KBytes. However, that is sensitive to cache-based timing attacks. If we assume that access to data inside the same cache line is constant-time (likely), fitting a table item into a cache line may help against the attacks. This patch reduce the pre-computed table from 64K to 4K and aligns every item to a 32 byte boundary (since most modern CPUs have cache line of that size or larger). This patch will reduce the overall performance. This patch also reverts commit 965871a727 ("GCM mode: Optimize key setup for GCM mode") since I actually got conflicting benchmark results. --- configure.ac | 1 + lib/Crypto/Cipher/blockalgo.py | 13 +- src/config.h.in | 7 ++ src/galois.c | 279 ++++++++++++++--------------------------- 4 files changed, 105 insertions(+), 195 deletions(-) diff --git a/configure.ac b/configure.ac index 5f22d00..3bba18d 100644 --- a/configure.ac +++ b/configure.ac @@ -99,6 +99,7 @@ AC_TYPE_UINT16_T AC_TYPE_UINT32_T AC_TYPE_UINT64_T AC_TYPE_UINT8_T +AC_TYPE_UINTPTR_T # Checks for library functions. AC_FUNC_MALLOC diff --git a/lib/Crypto/Cipher/blockalgo.py b/lib/Crypto/Cipher/blockalgo.py index 84b9bc3..db45404 100644 --- a/lib/Crypto/Cipher/blockalgo.py +++ b/lib/Crypto/Cipher/blockalgo.py @@ -329,17 +329,14 @@ class _GHASH(_SmoothMAC): (x^128 + x^7 + x^2 + x + 1). """ - def __init__(self, hash_subkey, block_size, table_size='64K'): + def __init__(self, hash_subkey, block_size): _SmoothMAC.__init__(self, block_size, None, 0) - if table_size == '64K': - self._hash_subkey = galois._ghash_expand(hash_subkey) - else: - self._hash_subkey = hash_subkey + self._hash_subkey = galois._ghash_expand(hash_subkey) self._last_y = bchr(0) * 16 self._mac = galois._ghash def copy(self): - clone = _GHASH(self._hash_subkey, self._bs, table_size='0K') + clone = _GHASH(self._hash_subkey, self._bs) _SmoothMAC._deep_copy(self, clone) clone._last_y = self._last_y return clone @@ -436,7 +433,7 @@ class BlockAlgo: bchr(0) * fill + long_to_bytes(8 * len(self.nonce), 8)) - mac = _GHASH(hash_subkey, factory.block_size, '0K') + mac = _GHASH(hash_subkey, factory.block_size) mac.update(ghash_in) self._j0 = bytes_to_long(mac.digest()) @@ -446,7 +443,7 @@ class BlockAlgo: self._cipher = self._factory.new(key, MODE_CTR, counter=ctr) # Step 5 - Bootstrat GHASH - self._cipherMAC = _GHASH(hash_subkey, factory.block_size, '64K') + self._cipherMAC = _GHASH(hash_subkey, factory.block_size) # Step 6 - Prepare GCTR cipher for GMAC ctr = Counter.new(128, initial_value=self._j0, allow_wraparound=True) diff --git a/src/config.h.in b/src/config.h.in index 36278c1..f18fcb0 100644 --- a/src/config.h.in +++ b/src/config.h.in @@ -69,6 +69,9 @@ /* Define to 1 if you have the header file. */ #undef HAVE_SYS_TYPES_H +/* Define to 1 if the system has the type `uintptr_t'. */ +#undef HAVE_UINTPTR_T + /* Define to 1 if you have the header file. */ #undef HAVE_UNISTD_H @@ -160,3 +163,7 @@ /* Define to the type of an unsigned integer type of width exactly 8 bits if such a type exists and the standard includes do not define it. */ #undef uint8_t + +/* Define to the type of an unsigned integer type wide enough to hold a + pointer, if such a type exists, and if the system does not define it. */ +#undef uintptr_t diff --git a/src/galois.c b/src/galois.c index 2660044..0da79e6 100644 --- a/src/galois.c +++ b/src/galois.c @@ -25,10 +25,28 @@ #include #include -/** Type for tables containing the expanded hash key **/ -typedef uint64_t t_key_tables[16][256][2]; +#define ALIGNMENT 32 -typedef uint64_t t_v_tables[128][2]; +/** + * A V table is a 4096 bytes table that will contain the expanded + * GHASH key (H). It is used to speed up the GF(128) multiplication Z = X*H. + * + * The table contains 128 entries, one for each bit of X. + * Each entry takes 32 bytes and can fit into the cache line of a modern + * processor. If we assume that access to memory mapped to the same + * cache line is somewhat constant, we can make GHASH robust again + * cache timing attacks. + */ +typedef uint64_t t_v_tables[128][2][2]; + +/** + * To ensure that the V table is aligned to a 32-byte memory boundary, + * we allocate a larger piece of memory and carve the V table from there. + */ +typedef struct { + uint8_t buffer[sizeof(t_v_tables)+ALIGNMENT]; + t_v_tables *v_tables; +} t_exp_key; /** * Big Endian to word conversions @@ -56,230 +74,121 @@ static void word_to_be(uint8_t fb[8], uint64_t w) } /** - * Compute H*x^i for i=0..127. Store the results in a new - * vector indexed by i. + * Create a V table. V[i] is the value H*x^i (i=0..127). + * \param h The 16 byte GHASH key + * \param tables A pointer to an allocated V table */ -static const t_v_tables* make_v_tables(const uint8_t y[16]) +static void make_v_tables(const uint8_t h[16], t_v_tables *tables) { - t_v_tables *tables; - uint64_t *cur; + uint64_t (*cur)[2]; int i; - tables = (t_v_tables*) calloc(128*2, sizeof(uint64_t)); - if (!tables) { - return NULL; - } + memset(tables, 0, sizeof(t_v_tables)); - cur = &((*tables)[0][0]); + cur = &((*tables)[0][1]); - cur[0] = be_to_word(&y[0]); - cur[1] = be_to_word(&y[8]); + (*cur)[0] = be_to_word(&h[0]); + (*cur)[1] = be_to_word(&h[8]); for (i=1; i<128; i++) { uint64_t c; - uint64_t *next; + uint64_t (*next)[2]; - next = &((*tables)[i][0]); + next = &((*tables)[i][1]); /** v = (v&1)*0xE1000000000000000000000000000000L ^ (v>>1) **/ - c = cur[1]&1 ? 0xE100000000000000 : 0; - next[1] = cur[1]>>1 | cur[0]<<63; - next[0] = cur[0]>>1 ^ c; + c = (*cur)[1]&1 ? 0xE100000000000000 : 0; + (*next)[1] = (*cur)[1]>>1 | (*cur)[0]<<63; + (*next)[0] = (*cur)[0]>>1 ^ c; cur = next; } - - return (const t_v_tables*)tables; -} - -/** - * Multiply to elements of GF(2**128) using the reducing polynomial - * (x^128 + x^7 + x^2 + x + 1). - */ -static void gcm_mult(uint8_t out[16], const uint8_t x[16], const uint8_t y[16]) -{ - uint64_t z[2], v[2]; - int i; - - /** z, v = 0, y **/ - z[0] = z[1] = 0; - v[0] = be_to_word(&y[0]); - v[1] = be_to_word(&y[8]); - - for (i=0; i<16; i++) { - uint8_t j; - - for (j=0x80; j>0; j>>=1) { - uint64_t c; - - /** z ^= (x>>i&1)*v **/ - if (x[i] & j) { - - z[0] ^= v[0]; - z[1] ^= v[1]; - } - /** v = (v&1)*0xE1000000000000000000000000000000L ^ (v>>1) **/ - c = v[1]&1 ? 0xE100000000000000 : 0; - v[1] = v[1]>>1 | (v[0] << 63); - v[0] = v[0]>>1 ^ c; - } - } - word_to_be(out, z[0]); - word_to_be(out+8, z[1]); } /** * Multiply two elements of GF(2**128) using the reducing polynomial * (x^128 + x^7 + x^2 + x + 1). * - * The first element has been expanded into H tables. + * \param out The 16 byte buffer that will receive the result + * \param key_tables One factor, expanded into a V table + * \param x The other factor (16 bytes) */ -static void gcm_mult2(uint8_t out[16], const t_key_tables *key_tables, const uint8_t x[16]) +static void gcm_mult2(uint8_t out[16], const t_v_tables *key_tables, const uint8_t x[16]) { - int i; + int i, bit_scan_128; uint64_t z[2]; z[0] = z[1] = 0; + bit_scan_128 = 0; for (i=0; i<16; i++) { - z[0] ^= (*key_tables)[i][x[i]][0]; - z[1] ^= (*key_tables)[i][x[i]][1]; - } - word_to_be(out, z[0]); - word_to_be(out+8, z[1]); -} + uint8_t xi; + int j; -/** - * Multiply two elements of GF(2**128) using the reducing polynomial - * (x^128 + x^7 + x^2 + x + 1). - * - * In first element, only the byte at position 'pos' is non-zero at has - * value 'x'. - * - * The second element, is expanded in V tables (128 elements, one per - * each bit position). - */ -static void gcm_mult3(uint64_t out[2], uint8_t x, uint8_t pos, const t_v_tables *v_tables) -{ - uint64_t z[2]; - int j; - const uint64_t (*v)[2]; + xi = x[i]; + for (j=0; j<8; j++) { + int bit; - /** z, v = 0, y **/ - z[0] = z[1] = 0; + bit = xi>>7 & 1; /** Constant time */ + z[0] ^= (*key_tables)[bit_scan_128][bit][0]; + z[1] ^= (*key_tables)[bit_scan_128][bit][1]; - v = &((*v_tables)[pos*8]); - for (j=0x80; j!=0; j>>=1, v++) { - if (x & j) { - z[0] ^= (*v)[0]; - z[1] ^= (*v)[1]; + xi <<= 1; + bit_scan_128++; } } - out[0] = z[0]; - out[1] = z[1]; + + word_to_be(out, z[0]); + word_to_be(out+8, z[1]); } -/** - * Expand a hash key into a set of tables that will speed - * up GHASH. - * - * \param tables Pointer to allocated memory that will hold - * the tables. - * \param h The hash key. - */ -static int ghash_expand(t_key_tables *key_tables, const uint8_t h[16]) -{ - int i; - const t_v_tables *v_tables; - - v_tables = make_v_tables(h); - if (v_tables==NULL) { - return -1; - } - - for (i=0; i<16; i++) { - int j; - - for (j=0; j<256; j++) { - /** Z = H*j*P^{8i} **/ - gcm_mult3(&((*key_tables)[i][j][0]), j, i, v_tables); - } - } - - free(v_tables); - return 0; -} /** - * Compute the GHASH of a piece of an arbitrary data given an - * arbitrary Y_0, as specified in NIST SP 800 38D. + * Compute the GHASH of a piece of data given an arbitrary Y_0, + * as specified in NIST SP 800 38D. * - * \param y_out The resulting GHASH (16 bytes). - * \param block_data Pointer to the data to hash. - * \param len Length of the data to hash (multiple of 16). - * \param y_in The initial Y (Y_0, 16 bytes). - * \param key_tables The hash key, possibly expanded to 16*256*16 bytes. - * \param key_tables_len The length of the data pointed by key_table. + * \param y_out The resulting GHASH (16 bytes). + * \param block_data Pointer to the data to hash. + * \param len Length of the data to hash (multiple of 16). + * \param y_in The initial Y (Y_0, 16 bytes). + * \param key_tables The expanded hash key (16*256*16 bytes). */ static void ghash( uint8_t y_out[16], const uint8_t block_data[], int len, const uint8_t y_in[16], - const void *key_tables, - int key_tables_len + const t_v_tables *key_tables ) { - int i, j; - uint8_t x[16]; - const t_key_tables *key_tables_64 = NULL; - const uint8_t (*key)[16] = NULL; - - switch (key_tables_len) { - case sizeof(t_key_tables): - { - key_tables_64 = (const t_key_tables*) key_tables; - break; - } - case 16: - { - key = (const uint8_t (*)[16]) key_tables; - break; - } - default: - return; - } + int i; memcpy(y_out, y_in, 16); + for (i=0; iv_tables = (t_v_tables*) + (((uintptr_t)exp_key->buffer & ~(ALIGNMENT-1)) + ALIGNMENT); + Py_BEGIN_ALLOW_THREADS; + + make_v_tables((uint8_t*)PyBytes_AS_STRING(h), exp_key->v_tables); + Py_END_ALLOW_THREADS; - if (err!=0) { - Py_DECREF(retval); - retval = NULL; - } - out: return retval; } static char ghash__doc__[] = -"_ghash(data:str, y:str, exp_h:str) -> str\n" +"_ghash(data:str, y:str, exp_key:str) -> str\n" "\n" "Return a GHASH.\n"; static PyObject * ghash_function(PyObject *self, PyObject *args) { - PyObject *data, *y, *exp_h; + PyObject *data, *y, *exp_key; PyObject *retval = NULL; - Py_ssize_t len_data, len_y, len_exp_h; + Py_ssize_t len_data, len_y, len_exp_key; + const t_v_tables *v_tables; - if (!PyArg_ParseTuple(args, "SSS", &data, &y, &exp_h)) { + if (!PyArg_ParseTuple(args, "SSS", &data, &y, &exp_key)) { goto out; } len_data = PyBytes_GET_SIZE(data); len_y = PyBytes_GET_SIZE(y); - len_exp_h = PyBytes_GET_SIZE(exp_h); + len_exp_key = PyBytes_GET_SIZE(exp_key); if (len_data%16!=0) { PyErr_SetString(PyExc_ValueError, "Length of data must be a multiple of 16 bytes."); @@ -347,7 +252,7 @@ ghash_function(PyObject *self, PyObject *args) goto out; } - if (len_exp_h!=sizeof(t_key_tables) && len_exp_h!=16) { + if (len_exp_key!=sizeof(t_exp_key)) { PyErr_SetString(PyExc_ValueError, "Length of expanded key is incorrect."); goto out; } @@ -361,10 +266,10 @@ ghash_function(PyObject *self, PyObject *args) Py_BEGIN_ALLOW_THREADS; #define PyBytes_Buffer(a) (uint8_t*)PyBytes_AS_STRING(a) - + + v_tables = (const t_v_tables*)((t_exp_key *)PyBytes_AS_STRING(exp_key))->v_tables; ghash( PyBytes_Buffer(retval), PyBytes_Buffer(data), len_data, - PyBytes_Buffer(y), - PyBytes_Buffer(exp_h), len_exp_h ); + PyBytes_Buffer(y), v_tables); #undef PyBytes_Buffer -- cgit v1.2.1