author      Jussi Kivilinna <jussi.kivilinna@iki.fi>    2016-10-09 12:52:55 +0300
committer   Jussi Kivilinna <jussi.kivilinna@iki.fi>    2016-10-09 12:52:55 +0300
commit      27747921cb1dfced83c5666cd1c474764724c52b (patch)
tree        13b1e722681dc77c5459ef42e409a959804f5333 /cipher/cipher-gcm-armv8-aarch32-ce.S
parent      5418d9ca4c0e087fd6872ad350a996fe74880d86 (diff)
download    libgcrypt-27747921cb1dfced83c5666cd1c474764724c52b.tar.gz
GCM: Add bulk processing for ARMv8/AArch32 implementation
* cipher/cipher-gcm-armv8-aarch32-ce.S: Add 4-block bulk processing.
* tests/basic.c (check_digests): Print correct data length for "?"
tests.
(check_one_mac): Add large 1000000-byte tests when input is "!" or
"?".
(check_mac): Add "?" test vectors for HMAC, CMAC, GMAC and POLY1305.
--
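The bulk path relies on GHASH being linear over GF(2¹²⁸): as the .Loop_4
comments below spell out, four chained updates Y <- (Y ^ Ci) * H unroll into

  Y' = (Y ^ C0)*H⁴ ^ C1*H³ ^ C2*H² ^ C3*H

so the four polynomial multiplications become independent (the new
PMUL_128x128_2 macro computes them two at a time) and only one REDUCTION is
needed per four blocks. To support this, the setup function now precomputes
H² through H⁴ into gcm_table.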
Benchmark on Cortex-A53 (1152 MHz):

Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 GMAC_AES       |     0.924 ns/B     1032.2 MiB/s      1.06 c/B

After (1.21x faster):
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 GMAC_AES       |     0.764 ns/B     1248.2 MiB/s     0.880 c/B
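(As a sanity check on the table, cycles/byte is nanosecs/byte scaled by the
clock: 0.924 ns/B × 1.152 GHz ≈ 1.06 c/B and 0.764 × 1.152 ≈ 0.880 c/B; the
speedup is 0.924/0.764 ≈ 1.21x.)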
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/cipher-gcm-armv8-aarch32-ce.S')
 -rw-r--r--   cipher/cipher-gcm-armv8-aarch32-ce.S | 321
 1 file changed, 259 insertions(+), 62 deletions(-)
diff --git a/cipher/cipher-gcm-armv8-aarch32-ce.S b/cipher/cipher-gcm-armv8-aarch32-ce.S
index b879fb2e..b61a7871 100644
--- a/cipher/cipher-gcm-armv8-aarch32-ce.S
+++ b/cipher/cipher-gcm-armv8-aarch32-ce.S
@@ -57,69 +57,125 @@ gcry_gcm_reduction_constant:
 #define rhash_l d0
 #define rhash_h d1
 
-#define rbuf q1
-#define rbuf_l d2
-#define rbuf_h d3
+#define rh1 q1
+#define rh1_l d2
+#define rh1_h d3
 
-#define rh0 q2
-#define rh0_l d4
-#define rh0_h d5
+#define rbuf q2
+#define rbuf_l d4
+#define rbuf_h d5
 
-#define rt0 q3
-#define rt0_l d6
-#define rt0_h d7
+#define rbuf1 q3
+#define rbuf1_l d6
+#define rbuf1_h d7
 
-#define rr0 q8
-#define rr0_l d16
-#define rr0_h d17
+#define rbuf2 q4
+#define rbuf2_l d8
+#define rbuf2_h d9
 
-#define rr1 q9
-#define rr1_l d18
-#define rr1_h d19
+#define rbuf3 q5
+#define rbuf3_l d10
+#define rbuf3_h d11
+
+#define rh2 q6
+#define rh2_l d12
+#define rh2_h d13
+
+#define rh3 q7
+#define rh3_l d14
+#define rh3_h d15
+
+#define rh4 q8
+#define rh4_l d16
+#define rh4_h d17
+
+#define rr2 q9
+#define rr2_l d18
+#define rr2_h d19
+
+#define rr3 q10
+#define rr3_l d20
+#define rr3_h d21
+
+#define rr0 q11
+#define rr0_l d22
+#define rr0_h d23
+
+#define rr1 q12
+#define rr1_l d24
+#define rr1_h d25
+
+#define rt0 q13
+#define rt0_l d26
+#define rt0_h d27
+
+#define rt1 q14
+#define rt1_l d28
+#define rt1_h d29
 
 #define rrconst q15
 #define rrconst_l d30
 #define rrconst_h d31
 
-#define ia rbuf_h
-#define ib rbuf_l
-#define oa rh0_l
-#define ob rh0_h
-#define co rrconst_l
-#define ma rrconst_h
-
 /* GHASH macros */
 
 /* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in
  * Cryptology — CT-RSA 2015" for details.
  */
 
-/* Input: 'a' and 'b', Output: 'r0:r1' (low 128-bits in r0, high in r1) */
+/* Input: 'a' and 'b', Output: 'r0:r1' (low 128-bits in r0, high in r1)
+ * Note: 'r1' may be 'a' or 'b', 'r0' must not be either 'a' or 'b'.
+ */
 #define PMUL_128x128(r0, r1, a, b, t, interleave_op) \
         veor t##_h, b##_l, b##_h; \
         veor t##_l, a##_l, a##_h; \
         vmull.p64 r0, a##_l, b##_l; \
         vmull.p64 r1, a##_h, b##_h; \
         vmull.p64 t, t##_h, t##_l; \
-        interleave_op(); \
+        interleave_op; \
         veor t, r0; \
         veor t, r1; \
         veor r0##_h, t##_l; \
         veor r1##_l, t##_h;
 
+/* Input: 'aA' and 'bA', Output: 'r0A:r1A' (low 128-bits in r0A, high in r1A)
+ * Note: 'r1A' may be 'aA' or 'bA', 'r0A' must not be either 'aA' or 'bA'.
+ * Input: 'aB' and 'bB', Output: 'r0B:r1B' (low 128-bits in r0B, high in r1B)
+ * Note: 'r1B' may be 'aB' or 'bB', 'r0B' must not be either 'aB' or 'bB'.
+ */
+#define PMUL_128x128_2(r0A, r1A, aA, bA, r0B, r1B, aB, bB, tA, tB, interleave_op) \
+        veor tA##_h, bA##_l, bA##_h; \
+        veor tA##_l, aA##_l, aA##_h; \
+        veor tB##_h, bB##_l, bB##_h; \
+        veor tB##_l, aB##_l, aB##_h; \
+        vmull.p64 r0A, aA##_l, bA##_l; \
+        vmull.p64 r1A, aA##_h, bA##_h; \
+        vmull.p64 tA, tA##_h, tA##_l; \
+        vmull.p64 r0B, aB##_l, bB##_l; \
+        vmull.p64 r1B, aB##_h, bB##_h; \
+        vmull.p64 tB, tB##_h, tB##_l; \
+        interleave_op; \
+        veor tA, r0A; \
+        veor tA, r1A; \
+        veor tB, r0B; \
+        veor tB, r1B; \
+        veor r0A##_h, tA##_l; \
+        veor r1A##_l, tA##_h; \
+        veor r0B##_h, tB##_l; \
+        veor r1B##_l, tB##_h; \
+
 /* Input: 'r0:r1', Output: 'a' */
 #define REDUCTION(a, r0, r1, rconst, t, interleave_op) \
         vmull.p64 t, r0##_l, rconst; \
         veor r0##_h, t##_l; \
         veor r1##_l, t##_h; \
-        interleave_op(); \
+        interleave_op; \
         vmull.p64 t, r0##_h, rconst; \
         veor r1, t; \
         veor a, r0, r1;
 
-#define _(...) /*_*/
-#define vrev_rbuf() vrev64.8 rbuf, rbuf;
-#define vext_rbuf() vext.8 rbuf, rbuf, rbuf, #8;
+#define _(...) __VA_ARGS__
+#define __ _()
 
 /* Other functional macros */
@@ -142,22 +198,128 @@ _gcry_ghash_armv8_ce_pmull:
  *    r3: nblocks
  *    %st+0: gcm_table
  */
-  push {r4, lr}
+  push {r4-r6, lr}
 
   cmp r3, #0
   beq .Ldo_nothing
 
-  GET_DATA_POINTER(lr, .Lrconst64, r4)
+  GET_DATA_POINTER(r4, .Lrconst64, lr)
 
-  subs r3, r3, #1
   vld1.64 {rhash}, [r1]
-  vld1.64 {rh0}, [r0]
+  vld1.64 {rh1}, [r0]
 
   vrev64.8 rhash, rhash /* byte-swap */
-  vld1.64 {rrconst_h}, [lr]
+  vld1.64 {rrconst_h}, [r4]
   vext.8 rhash, rhash, rhash, #8
 
+  cmp r3, #4
+  blo .Less_than_4
+
+  /* Bulk processing of 4 blocks per loop iteration. */
+
+  ldr r5, [sp, #(4*4)];
+  add r6, r5, #32
+
+  vpush {q4-q7}
+
+  vld1.64 {rh2-rh3}, [r5]
+  vld1.64 {rh4}, [r6]
+
+  vld1.64 {rbuf-rbuf1}, [r2]!
+  sub r3, r3, #4
+  vld1.64 {rbuf2-rbuf3}, [r2]!
+
+  cmp r3, #4
+  vrev64.8 rbuf, rbuf /* byte-swap */
+  vrev64.8 rbuf1, rbuf1 /* byte-swap */
+  vrev64.8 rbuf2, rbuf2 /* byte-swap */
+  vrev64.8 rbuf3, rbuf3 /* byte-swap */
+
+  vext.8 rbuf, rbuf, rbuf, #8
+  vext.8 rbuf1, rbuf1, rbuf1, #8
+  vext.8 rbuf2, rbuf2, rbuf2, #8
+  vext.8 rbuf3, rbuf3, rbuf3, #8
+  veor rhash, rhash, rbuf /* in0 ^ hash */
+
+  blo .Lend_4
+
+.Loop_4:
+  /* (in0 ^ hash) * H⁴ => rr2:rr3 */
+  /* (in1) * H³ => rr0:rr1 */
+  PMUL_128x128_2(rr0, rr1, rbuf1, rh3, rr2, rr3, rhash, rh4, rt1, rt0, __)
+
+  vld1.64 {rbuf-rbuf1}, [r2]!
+  sub r3, r3, #4
+  veor rr0, rr0, rr2
+  veor rr1, rr1, rr3
+
+  /* (in2) * H² => rr2:rr3 */
+  /* (in3) * H¹ => rhash:rbuf3 */
+  PMUL_128x128_2(rr2, rr3, rbuf2, rh2, rhash, rbuf3, rbuf3, rh1, rt0, rt1,
+                 _(vrev64.8 rbuf, rbuf))
+
+  vld1.64 {rbuf2}, [r2]!
+
+  vrev64.8 rbuf1, rbuf1
+  veor rr0, rr0, rr2
+  veor rr1, rr1, rr3
+
+  cmp r3, #4
+  vext.8 rbuf, rbuf, rbuf, #8
+  vext.8 rbuf1, rbuf1, rbuf1, #8
+
+  veor rr0, rr0, rhash
+  veor rr1, rr1, rbuf3
+
+  vld1.64 {rbuf3}, [r2]!
+
+  REDUCTION(rhash, rr0, rr1, rrconst_h, rt1,
+            _(vrev64.8 rbuf2, rbuf2;
+              vrev64.8 rbuf3, rbuf3))
+
+  vext.8 rbuf2, rbuf2, rbuf2, #8
+  vext.8 rbuf3, rbuf3, rbuf3, #8
+  veor rhash, rhash, rbuf /* in0 ^ hash */
+
+  bhs .Loop_4
+
+.Lend_4:
+  /* (in0 ^ hash) * H⁴ => rr2:rr3 */
+  /* (in1) * H³ => rr0:rr1 */
+  PMUL_128x128_2(rr0, rr1, rbuf1, rh3, rr2, rr3, rhash, rh4, rt1, rt0, __)
+
+  /* (in2) * H² => rhash:rbuf */
+  /* (in3) * H¹ => rbuf1:rbuf2 */
+  PMUL_128x128_2(rhash, rbuf, rbuf2, rh2, rbuf1, rbuf2, rbuf3, rh1, rt0, rt1,
+                 _(veor rr0, rr0, rr2;
+                   veor rr1, rr1, rr3))
+
+  veor rr0, rr0, rhash
+  veor rr1, rr1, rbuf
+
+  veor rr0, rr0, rbuf1
+  veor rr1, rr1, rbuf2
+
+  REDUCTION(rhash, rr0, rr1, rrconst_h, rt1,
+            _(CLEAR_REG(rr2);
+              CLEAR_REG(rr3);
+              CLEAR_REG(rbuf1);
+              CLEAR_REG(rbuf2);
+              CLEAR_REG(rbuf3);
+              CLEAR_REG(rh2);
+              CLEAR_REG(rh3);
+              CLEAR_REG(rh4)))
+
+  vpop {q4-q7}
+
+  cmp r3, #0
+  beq .Ldone
+
+.Less_than_4:
+  /* Handle remaining blocks. */
+
+  vld1.64 {rbuf}, [r2]!
+  subs r3, r3, #1
 
   vrev64.8 rbuf, rbuf /* byte-swap */
   vext.8 rbuf, rbuf, rbuf, #8
@@ -169,30 +331,29 @@ _gcry_ghash_armv8_ce_pmull:
 .Loop:
   vld1.64 {rbuf}, [r2]!
   subs r3, r3, #1
-  PMUL_128x128(rr0, rr1, rh0, rhash, rt0, vrev_rbuf)
-  REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, vext_rbuf)
+  PMUL_128x128(rr0, rr1, rhash, rh1, rt0, _(vrev64.8 rbuf, rbuf))
+  REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, _(vext.8 rbuf, rbuf, rbuf, #8))
   veor rhash, rhash, rbuf
 
   bne .Loop
 
 .Lend:
-  PMUL_128x128(rr0, rr1, rh0, rhash, rt0, _)
-  REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, _)
+  PMUL_128x128(rr0, rr1, rhash, rh1, rt0, _(CLEAR_REG(rbuf)))
+  REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, _(CLEAR_REG(rh1)))
 
+.Ldone:
   CLEAR_REG(rr1)
-  CLEAR_REG(rr0)
   vrev64.8 rhash, rhash /* byte-swap */
-  CLEAR_REG(rbuf)
   CLEAR_REG(rt0)
+  CLEAR_REG(rr0)
   vext.8 rhash, rhash, rhash, #8
-  CLEAR_REG(rh0)
-
+  CLEAR_REG(rt1)
   vst1.64 {rhash}, [r1]
   CLEAR_REG(rhash)
 
 .Ldo_nothing:
   mov r0, #0
-  pop {r4, pc}
+  pop {r4-r6, pc}
 .size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull;
@@ -208,28 +369,64 @@ _gcry_ghash_setup_armv8_ce_pmull:
  *    r1: gcm_table
  */
-  push {r4, lr}
+  vpush {q4-q7}
 
-  GET_DATA_POINTER(r4, .Lrconst64, lr)
+  GET_DATA_POINTER(r2, .Lrconst64, r3)
+
+  vld1.64 {rrconst_h}, [r2]
+
+#define GCM_LSH_1(r_out, ia, ib, const_d, oa, ob, ma) \
+        /* H <<< 1 */ \
+        vshr.s64 ma, ib, #63; \
+        vshr.u64 oa, ib, #63; \
+        vshr.u64 ob, ia, #63; \
+        vand ma, const_d; \
+        vshl.u64 ib, ib, #1; \
+        vshl.u64 ia, ia, #1; \
+        vorr ob, ib; \
+        vorr oa, ia; \
+        veor ob, ma; \
+        vst1.64 {oa, ob}, [r_out]
+
+  vld1.64 {rhash}, [r0]
+  vrev64.8 rhash, rhash /* byte-swap */
+  vext.8 rhash, rhash, rhash, #8
+
+  vmov rbuf1, rhash
+  GCM_LSH_1(r0, rhash_l, rhash_h, rrconst_h, rh1_l, rh1_h, rt1_l) /* H<<<1 */
 
-  /* H <<< 1 */
-  vld1.64 {ib,ia}, [r0]
-  vld1.64 {co}, [r4]
-  vrev64.8 ib, ib;
-  vrev64.8 ia, ia;
-  vshr.s64 ma, ib, #63
-  vshr.u64 oa, ib, #63
-  vshr.u64 ob, ia, #63
-  vand ma, co
-  vshl.u64 ib, ib, #1
-  vshl.u64 ia, ia, #1
-  vorr ob, ib
-  vorr oa, ia
-  veor ob, ma
-
-  vst1.64 {oa, ob}, [r0]
-
-  pop {r4, pc}
+  /* H² */
+  PMUL_128x128(rr0, rr1, rbuf1, rh1, rt0, __)
+  REDUCTION(rh2, rr0, rr1, rrconst_h, rt0, __)
+  vmov rhash, rh2
+  GCM_LSH_1(r1, rh2_l, rh2_h, rrconst_h, rbuf1_l, rbuf1_h, rt1_l) /* H²<<<1 */
+  add r1, r1, #16
+
+  /* H³ */
+  PMUL_128x128(rr0, rr1, rhash, rh1, rt1, __)
+  REDUCTION(rh3, rr0, rr1, rrconst_h, rt1, __)
+
+  /* H⁴ */
+  PMUL_128x128(rr0, rr1, rhash, rbuf1, rt0, __)
+  REDUCTION(rh4, rr0, rr1, rrconst_h, rt0, __)
+
+  GCM_LSH_1(r1, rh3_l, rh3_h, rrconst_h, rt0_l, rt0_h, rt1_l) /* H³<<<1 */
+  add r1, r1, #16
+  GCM_LSH_1(r1, rh4_l, rh4_h, rrconst_h, rt0_l, rt0_h, rt1_l) /* H⁴<<<1 */
+
+  CLEAR_REG(rt0)
+  CLEAR_REG(rt1)
+  CLEAR_REG(rr1)
+  CLEAR_REG(rr0)
+  CLEAR_REG(rh1)
+  CLEAR_REG(rh2)
+  CLEAR_REG(rh3)
+  CLEAR_REG(rh4)
+  CLEAR_REG(rhash)
+  CLEAR_REG(rbuf1)
+  CLEAR_REG(rrconst)
+  vpop {q4-q7}
+  bx lr
 .size _gcry_ghash_setup_armv8_ce_pmull,.-_gcry_ghash_setup_armv8_ce_pmull;
 
 #endif
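To make the aggregation concrete, here is a minimal C model of what one
.Loop_4 iteration computes. It is an illustrative sketch, not libgcrypt code:
the names u128, gfmul and ghash_4blocks are hypothetical, gfmul is the
textbook shift-and-add GHASH multiply from NIST SP 800-38D rather than the
vmull.p64 Karatsuba split used by PMUL_128x128 above, and each gfmul reduces
immediately, whereas the assembly XORs the four unreduced products and runs a
single REDUCTION (the results agree because reduction is linear over XOR).
The real gcm_table also stores H¹..H⁴ pre-shifted by GCM_LSH_1 to suit the
bit ordering the NEON code works in.

#include <stdint.h>

/* 128-bit GF(2¹²⁸) element, split into big-endian halves:
 * 'hi' holds bytes 0..7 of the block, 'lo' holds bytes 8..15. */
typedef struct { uint64_t hi, lo; } u128;

/* Textbook GHASH multiplication (NIST SP 800-38D, shift-and-add). */
static u128 gfmul(u128 x, u128 y)
{
    u128 z = { 0, 0 };
    u128 v = y;
    int i;

    for (i = 0; i < 128; i++) {
        /* Walk the bits of 'x' from the most significant end. */
        uint64_t bit = (i < 64) ? (x.hi >> (63 - i)) & 1
                                : (x.lo >> (127 - i)) & 1;
        if (bit) {
            z.hi ^= v.hi;
            z.lo ^= v.lo;
        }
        /* v = v >> 1; on carry-out, reduce by R = 0xe1 || 0^120. */
        uint64_t carry = v.lo & 1;
        v.lo = (v.lo >> 1) | (v.hi << 63);
        v.hi >>= 1;
        if (carry)
            v.hi ^= 0xe100000000000000ULL;
    }
    return z;
}

/* One 4-block GHASH step in the aggregated form computed by .Loop_4:
 *   Y' = (Y ^ C0)*H⁴ ^ C1*H³ ^ C2*H² ^ C3*H
 * 'htbl' holds H¹..H⁴ as precomputed by the setup function. */
static u128 ghash_4blocks(u128 y, const u128 c[4], const u128 htbl[4])
{
    u128 in0 = { y.hi ^ c[0].hi, y.lo ^ c[0].lo };
    u128 t0 = gfmul(in0, htbl[3]);  /* (in0 ^ hash) * H⁴ */
    u128 t1 = gfmul(c[1], htbl[2]); /* in1 * H³ */
    u128 t2 = gfmul(c[2], htbl[1]); /* in2 * H² */
    u128 t3 = gfmul(c[3], htbl[0]); /* in3 * H¹ */
    u128 r = { t0.hi ^ t1.hi ^ t2.hi ^ t3.hi,
               t0.lo ^ t1.lo ^ t2.lo ^ t3.lo };
    return r;
}

Because the four products are independent, the NEON pipeline can overlap the
vmull.p64 dependency chains instead of serializing a multiply-reduce per
block, which is presumably where the 1.21x speedup above comes from.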