From ed45eac3b721c1313902b977379fbd4886ccca7b Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna
Date: Wed, 2 Dec 2020 20:44:11 +0200
Subject: chacha20-ppc: fix 32-bit counter overflow handling

* cipher/chacha20-ppc.c (vec_add_ctr_u64, ADD_U64): New.
(_gcry_chacha20_ppc8_blocks1, _gcry_chacha20_ppc8_blocks4)
(_gcry_chacha20_poly1305_ppc8_blocks4): Use ADD_U64 when incrementing
counter.
--

This patch fixes a 32-bit counter overflow in the PowerPC ChaCha20
implementation. In the typical use case, the overflow happens after
256 GiB of output. Typical use case here means use of a 96-bit or
64-bit IV, which causes the lower 32 bits of the counter to start
from zero.

Signed-off-by: Jussi Kivilinna
---
 cipher/chacha20-ppc.c | 33 +++++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/cipher/chacha20-ppc.c b/cipher/chacha20-ppc.c
index 985f2fcd..4a21b837 100644
--- a/cipher/chacha20-ppc.c
+++ b/cipher/chacha20-ppc.c
@@ -88,6 +88,24 @@ vec_store_le(vector4x_u32 vec, unsigned long offset, unsigned char *ptr)
 }
 
 
+static ASM_FUNC_ATTR_INLINE vector4x_u32
+vec_add_ctr_u64(vector4x_u32 v, vector4x_u32 a)
+{
+#ifdef WORDS_BIGENDIAN
+  static const vector16x_u8 swap32 =
+    { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 };
+  vector2x_u64 vec, add, sum;
+
+  vec = (vector2x_u64)vec_perm((vector16x_u8)v, (vector16x_u8)v, swap32);
+  add = (vector2x_u64)vec_perm((vector16x_u8)a, (vector16x_u8)a, swap32);
+  sum = vec + add;
+  return (vector4x_u32)vec_perm((vector16x_u8)sum, (vector16x_u8)sum, swap32);
+#else
+  return (vector4x_u32)((vector2x_u64)(v) + (vector2x_u64)(a));
+#endif
+}
+
+
 /**********************************************************************
   2-way && 1-way chacha20
  **********************************************************************/
@@ -115,6 +133,9 @@ vec_store_le(vector4x_u32 vec, unsigned long offset, unsigned char *ptr)
 	ROTATE(x1, rotate_7);		\
 	WORD_ROL(x1, rol_x1);
 
+#define ADD_U64(v,a) \
+	(v = vec_add_ctr_u64(v, a))
+
 unsigned int ASM_FUNC_ATTR
 _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src,
 			    size_t nblks)
@@ -152,7 +173,7 @@ _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src,
       v5 = state1;
       v6 = state2;
       v7 = state3;
-      v7 += counter_1;
+      ADD_U64(v7, counter_1);
 
       for (i = 20; i > 0; i -= 2)
 	{
@@ -166,12 +187,12 @@ _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src,
       v1 += state1;
       v2 += state2;
       v3 += state3;
-      state3 += counter_1; /* update counter */
+      ADD_U64(state3, counter_1); /* update counter */
       v4 += state0;
       v5 += state1;
       v6 += state2;
       v7 += state3;
-      state3 += counter_1; /* update counter */
+      ADD_U64(state3, counter_1); /* update counter */
 
       v0 ^= vec_load_le(0 * 16, src);
       v1 ^= vec_load_le(1 * 16, src);
@@ -214,7 +235,7 @@ _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src,
       v1 += state1;
       v2 += state2;
       v3 += state3;
-      state3 += counter_1; /* update counter */
+      ADD_U64(state3, counter_1); /* update counter */
 
       v0 ^= vec_load_le(0 * 16, src);
       v1 ^= vec_load_le(1 * 16, src);
@@ -339,7 +360,7 @@ _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
       v13 += vec_splat(state3, 1) - vec_cmplt(tmp, counters_0123);
       v14 += vec_splat(state3, 2);
       v15 += vec_splat(state3, 3);
-      state3 += counter_4; /* update counter */
+      ADD_U64(state3, counter_4); /* update counter */
 
       transpose_4x4(v0, v1, v2, v3);
       transpose_4x4(v4, v5, v6, v7);
@@ -554,7 +575,7 @@ _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
       v13 += vec_splat(state3, 1) - vec_cmplt(tmp, counters_0123);
       v14 += vec_splat(state3, 2);
       v15 += vec_splat(state3, 3);
-      state3 += counter_4; /* update counter */
+      ADD_U64(state3, counter_4); /* update counter */
 
       transpose_4x4(v0, v1, v2, v3);
       transpose_4x4(v4, v5, v6, v7);
-- 
cgit v1.2.1
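
As a rough scalar sketch of the failure mode this patch addresses (not the
PowerPC vector code itself; the helper names below are invented for the
example, and it assumes, for illustration only, that the 64-bit block counter
lives in ChaCha20 state words 12 and 13): a lane-wise 32-bit addition drops
the carry out of the low counter word, while a true 64-bit addition, as done
by ADD_U64()/vec_add_ctr_u64() above, propagates it.

#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

/* Lane-wise 32-bit increment: the carry out of word 12 is dropped,
   mirroring the effect of the old 4x32-bit vector addition. */
static void ctr_add_32bit_lanes(uint32_t st[16], uint32_t inc)
{
  st[12] += inc;               /* wraps silently after 2^32 blocks */
}

/* 64-bit increment: the carry propagates into word 13, which is what a
   2x64-bit vector addition of the counter achieves. */
static void ctr_add_64bit(uint32_t st[16], uint32_t inc)
{
  uint64_t ctr = ((uint64_t)st[13] << 32) | st[12];

  ctr += inc;
  st[12] = (uint32_t)ctr;
  st[13] = (uint32_t)(ctr >> 32);
}

int main(void)
{
  uint32_t a[16] = { 0 };
  uint32_t b[16] = { 0 };

  /* Counter started at zero (typical with a 96-bit or 64-bit IV) and is
     one block short of the 2^32-block boundary, i.e. 256 GiB of output. */
  a[12] = b[12] = 0xffffffffu;

  ctr_add_32bit_lanes(a, 1);   /* low word wraps to 0, high word stays 0 */
  ctr_add_64bit(b, 1);         /* low word wraps to 0, high word becomes 1 */

  printf("32-bit lane add: high=%08" PRIx32 " low=%08" PRIx32 "\n", a[13], a[12]);
  printf("64-bit add:      high=%08" PRIx32 " low=%08" PRIx32 "\n", b[13], b[12]);
  return 0;
}

Past the 2^32-block boundary (2^32 blocks of 64 bytes = 256 GiB), the
lane-wise version wraps the low word back to zero without touching the high
word, so its counter no longer matches one that carries into the high word,
and the generated keystream diverges from that point on.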