diff options
author | Niels Möller <nisse@lysator.liu.se> | 2013-03-14 14:43:27 +0100 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2013-03-14 14:43:27 +0100 |
commit | 51ff7924453f000041fabd47d45e0e407eece869 (patch) | |
tree | 9a88678f8bfa3d18a8164407ed1d159d4e5091e4 /armv7/sha512-compress.asm | |
parent | 084733ae73f728d4171c259310d7eebde77e45dc (diff) | |
download | nettle-51ff7924453f000041fabd47d45e0e407eece869.tar.gz |
Improved ARM sha512 assembly.
Diffstat (limited to 'armv7/sha512-compress.asm')
-rw-r--r-- | armv7/sha512-compress.asm | 370 |
1 files changed, 217 insertions, 153 deletions
diff --git a/armv7/sha512-compress.asm b/armv7/sha512-compress.asm index 089128d6..ac2b4382 100644 --- a/armv7/sha512-compress.asm +++ b/armv7/sha512-compress.asm @@ -23,56 +23,93 @@ C MA 02111-1301, USA. define(<STATE>, <r0>) define(<INPUT>, <r1>) define(<K>, <r2>) -define(<SA>, <d16>) -define(<SB>, <d17>) -define(<SC>, <d18>) -define(<SD>, <d19>) -define(<SE>, <d20>) -define(<SF>, <d21>) -define(<SG>, <d22>) -define(<SH>, <d23>) -define(<W>, <d24>) -define(<T0>, <d25>) - define(<COUNT>, <r3>) - -C Used for data load -define(<I0>, <r4>) -define(<I1>, <r5>) -define(<I2>, <r6>) -define(<I3>, <r7>) -define(<I4>, <r8>) -define(<DST>, <r10>) define(<SHIFT>, <r12>) -define(<IT>, <r14>) -C FIXME: More opportunities for parallelism, at least do s0 and s1 xors, -C or expand two words at a time. +define(<SA>, <d0>) +define(<SB>, <d1>) +define(<SC>, <d2>) +define(<SD>, <d3>) +define(<SE>, <d4>) +define(<SF>, <d5>) +define(<SG>, <d6>) +define(<SH>, <d7>) +define(<QSAB>, <q0>) +define(<QSCD>, <q1>) +define(<QSEF>, <q2>) +define(<QSGH>, <q3>) + +C d8-d15 are callee-save +define(<DT0>, <d8>) +define(<DT1>, <d9>) +define(<QT01>, <q4>) +define(<DT2>, <d10>) +define(<DT3>, <d11>) +define(<QT23>, <q5>) +define(<DT4>, <d12>) +define(<DT5>, <d13>) +define(<QT45>, <q6>) + +C Used only when reading the input, can overlap with state +define(<DT6>, <d0>) +define(<DT7>, <d1>) +define(<QT67>, <q0>) + +define(<DW0>, <d16>) +define(<DW1>, <d17>) +define(<DW2>, <d18>) +define(<DW3>, <d19>) +define(<DW4>, <d20>) +define(<DW5>, <d21>) +define(<DW6>, <d22>) +define(<DW7>, <d23>) +define(<DW8>, <d24>) +define(<DW9>, <d25>) +define(<DW10>, <d26>) +define(<DW11>, <d27>) +define(<DW12>, <d28>) +define(<DW13>, <d29>) +define(<DW14>, <d30>) +define(<DW15>, <d31>) +define(<QW0001>, <q8>) +define(<QW0203>, <q9>) +define(<QW0405>, <q10>) +define(<QW0607>, <q11>) +define(<QW0809>, <q12>) +define(<QW1011>, <q13>) +define(<QW1213>, <q14>) +define(<QW1415>, <q15>) + +define(<EXPAND_ME>, <$1>) +define(<W>, <EXPAND_ME(<DW>eval(($1) % 16))>) + +C If x = W(i+14), y = w(i+1), we xor in parallel +C +C x << 45 y << 63 +C x >> 19 y >> 1 +C x << 3 y << 56 +C x >> 61 y >> 8 +C xor x >> 6 y >> 7 +C ----------------------------- +C DT0 DT1 define(<EXPN>, < - vldr W, [sp, #+eval(8*$1)] - vldr T0, [sp, #+eval(8*(($1 + 14) % 16))] - vshl.i64 d0, T0, #45 - vshr.u64 d2, T0, #19 - vshl.i64 d1, T0, #3 - vshr.u64 d3, T0, #61 - vadd.i64 q0, q0, q1 - vshr.u64 T0, T0, #6 - veor T0, T0, d0 - veor T0, T0, d1 - vadd.i64 W, W, T0 - vldr T0, [sp, #+eval(8*(($1 + 9) % 16))] - vadd.i64 W, W, T0 - vldr T0, [sp, #+eval(8*(($1 + 1) % 16))] - vshl.i64 d0, T0, #63 - vshr.u64 d2, T0, #1 - vshl.i64 d1, T0, #56 - vshr.u64 d3, T0, #8 - vadd.i64 q0, q0, q1 - vshr.u64 T0, T0, #7 - veor T0, T0, d0 - veor T0, T0, d1 - vadd.i64 W, W, T0 - vstr W, [sp, #+eval(8*$1)] + vshl.i64 DT0, W($1+14), #45 + vshl.i64 DT1, W($1 + 1), #63 + vshr.u64 DT2, W($1+14), #19 + vshr.u64 DT3, W($1 + 1), #1 + vshl.i64 DT4, W($1+14), #3 + vshl.i64 DT5, W($1 + 1), #56 + veor.i64 QT01, QT01, QT23 + vshr.u64 DT2, W($1+14), #61 + vshr.u64 DT3, W($1 + 1), #8 + veor.i64 QT01, QT01, QT45 + vshr.u64 DT4, W($1+14), #6 + vshr.u64 DT5, W($1 + 1), #7 + veor.i64 QT01, QT01, QT23 + vadd.i64 W($1), W($1), W($1 + 9) + veor.i64 QT01, QT01, QT45 + vadd.i64 W($1), W($1), DT0 + vadd.i64 W($1), W($1), DT1 >) C ROUND(A,B,C,D,E,F,G,H,i) @@ -88,48 +125,48 @@ C S0(A) = A<<<36 ^ A<<<30 ^ A<<<25 C Choice (E, F, G) = G^(E&(F^G)) C Majority (A,B,C) = (A&B) + (C&(A^B)) -C FIXME: More opportunities for parallelism, at least do S0 and S1 xors. +C Do S1 and S0 in parallel +C +C e << 50 a << 36 +C e >> 14 a >> 28 +C e << 46 a << 30 +C e >> 18 a >> 34 +C e << 23 a << 25 +C xor e >> 41 a >> 39 +C ---------------------------- +C DT0 DT1 define(<ROUND>, < - vshl.i64 d0, $5, #50 - vshr.u64 d2, $5, #14 - vshl.i64 d1, $5, #46 - vshr.u64 d3, $5, #18 - vadd.i64 q0, q0, q1 - vshl.i64 d2, $5, #23 - vshr.u64 d3, $5, #41 - vadd.i64 d2, d2, d3 - veor d0, d0, d1 - veor d0, d0, d2 - vadd.i64 $8, $8, d0 - veor d0, $6, $7 - vand d0, d0, $5 - veor d0, d0, $7 - vadd.i64 $8,$8, d0 - vldr d0, [K,#eval(8*$9)] - vadd.i64 $8, $8, W - vadd.i64 $8, $8, d0 + vshl.i64 DT0, $5, #50 + vshl.i64 DT1, $1, #36 + vshr.u64 DT2, $5, #14 + vshr.u64 DT3, $1, #28 + vshl.i64 DT4, $5, #46 + vshl.i64 DT5, $1, #30 + veor QT01, QT01, QT23 + vshr.u64 DT2, $5, #18 + vshr.u64 DT3, $1, #34 + veor QT01, QT01, QT45 + vshl.i64 DT4, $5, #23 + vshl.i64 DT5, $1, #25 + veor QT01, QT01, QT23 + vshr.u64 DT2, $5, #41 + vshr.u64 DT3, $1, #39 + veor QT01, QT01, QT45 + veor DT4, $6, $7 + veor DT5, $1, $2 + vand DT4, DT4, $5 + vand DT5, DT5, $3 + veor DT4, DT4, $7 + veor QT01, QT01, QT23 + vand DT2, $1, $2 + vldr DT3, [K,#eval(8*$9)] + vadd.i64 $8, $8, W($9) + vadd.i64 QT01, QT01, QT45 + vadd.i64 $8, $8, DT3 + vadd.i64 $8, $8, DT0 + vadd.i64 DT1, DT1, DT2 vadd.i64 $4, $4, $8 - - vshl.i64 d0, $1, #36 - vshr.u64 d2, $1, #28 - vshl.i64 d1, $1, #30 - vshr.u64 d3, $1, #34 - vadd.i64 q0, q0, q1 - vshl.i64 d2, $1, #25 - vshr.u64 d3, $1, #39 - vadd.i64 d2, d2, d3 - veor d0, d0, d1 - veor d0, d0, d2 - vadd.i64 $8, $8, d0 - vand d0, $1, $2 - veor d1, $1, $2 - vadd.i64 $8, $8, d0 - vand d1, d1, $3 - vadd.i64 $8, $8, d1 ->) - -define(<NOEXPN>, < - vldr W, [INPUT, #eval(8*$1)] + vadd.i64 $8, $8, DT1 >) C void @@ -139,69 +176,100 @@ define(<NOEXPN>, < .align 2 PROLOGUE(_nettle_sha512_compress) - push {r4,r5,r6,r7,r8,r10,r14} - sub sp, sp, #128 + vpush {d8,d9,d10,d11,d12,d13} ands SHIFT, INPUT, #7 and INPUT, INPUT, #-8 - vld1.8 {d0}, [INPUT :64] + vld1.8 {DT5}, [INPUT :64] addne INPUT, INPUT, #8 addeq SHIFT, SHIFT, #8 lsl SHIFT, SHIFT, #3 - C Put right shift in d2 and d3, aka q1 + C Put right shift in DT0 and DT1, aka QT01 neg SHIFT, SHIFT - vmov.i32 d2, #0 - vmov.32 d2[0], SHIFT - vmov d3, d2 - C Put left shift in d4 and d5, aka q2 + vmov.i32 DT0, #0 + vmov.32 DT0[0], SHIFT + vmov DT1, DT0 + C Put left shift in DT2 and DT3, aka QT23 add SHIFT, SHIFT, #64 - vmov.i32 d4, #0 - vmov.32 d4[0], SHIFT - vmov d5, d4 - vshl.u64 d0, d0, d2 + vmov.i32 DT2, #0 + vmov.32 DT2[0], SHIFT + vmov DT3, DT2 + vshl.u64 DT5, DT5, DT0 - mov DST, sp - mov COUNT, #4 -.Lcopy: C Set w[i] <-- w[i-1] >> RSHIFT + w[i] << LSHIFT - vld1.8 {d16,d17,d18,d19}, [INPUT :64]! - vshl.u64 q3, q8, q1 C Right shift - vshl.u64 q8, q8, q2 C Left shift - veor d16, d16, d0 - veor d17, d17, d6 - vrev64.8 q8, q8 - vshl.u64 q0, q9, q1 C Right shift - vshl.u64 q9, q9, q2 C Left shift - veor d18, d18, d7 - veor d19, d19, d0 - vrev64.8 q9, q9 - subs COUNT, COUNT, #1 - vst1.64 {d16,d17,d18,d19}, [DST]! - vmov d0, d1 - bne .Lcopy + vld1.8 {W(0),W(1),W(2),W(3)}, [INPUT :64]! + vshl.u64 QT67, QW0001, QT01 C Right shift + vshl.u64 QW0001, QW0001, QT23 C Left shift + veor W(0), W(0), DT5 + veor W(1), W(1), DT6 + vrev64.8 QW0001, QW0001 + vshl.u64 QT45, QW0203, QT01 C Right shift + vshl.u64 QW0203, QW0203, QT23 C Left shift + veor W(2), W(2), DT7 + veor W(3), W(3), DT4 + vrev64.8 QW0203, QW0203 + + vld1.8 {W(4),W(5),W(6),W(7)}, [INPUT :64]! + vshl.u64 QT67, QW0405, QT01 C Right shift + vshl.u64 QW0405, QW0405, QT23 C Left shift + veor W(4), W(4), DT5 + veor W(5), W(5), DT6 + vrev64.8 QW0405, QW0405 + vshl.u64 QT45, QW0607, QT01 C Right shift + vshl.u64 QW0607, QW0607, QT23 C Left shift + veor W(6), W(6), DT7 + veor W(7), W(7), DT4 + vrev64.8 QW0607, QW0607 - mov COUNT,#2 - mov INPUT, sp + vld1.8 {W(8),W(9),W(10),W(11)}, [INPUT :64]! + vshl.u64 QT67, QW0809, QT01 C Right shift + vshl.u64 QW0809, QW0809, QT23 C Left shift + veor W(8), W(8), DT5 + veor W(9), W(9), DT6 + vrev64.8 QW0809, QW0809 + vshl.u64 QT45, QW1011, QT01 C Right shift + vshl.u64 QW1011, QW1011, QT23 C Left shift + veor W(10), W(10), DT7 + veor W(11), W(11), DT4 + vrev64.8 QW1011, QW1011 + + vld1.8 {W(12),W(13),W(14),W(15)}, [INPUT :64]! + vshl.u64 QT67, QW1213, QT01 C Right shift + vshl.u64 QW1213, QW1213, QT23 C Left shift + veor W(12), W(12), DT5 + veor W(13), W(13), DT6 + vrev64.8 QW1213, QW1213 + vshl.u64 QT45, QW1415, QT01 C Right shift + vshl.u64 QW1415, QW1415, QT23 C Left shift + veor W(14), W(14), DT7 + veor W(15), W(15), DT4 + vrev64.8 QW1415, QW1415 vldm STATE, {SA,SB,SC,SD,SE,SF,SG,SH} -.Loop1: - NOEXPN(0) ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 0) - NOEXPN(1) ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 1) - NOEXPN(2) ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 2) - NOEXPN(3) ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 3) - NOEXPN(4) ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 4) - NOEXPN(5) ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 5) - NOEXPN(6) ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 6) - NOEXPN(7) ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 7) - subs COUNT,#1 - add INPUT, INPUT, #64 - add K, K, #64 - bne .Loop1 + ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 0) + ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 1) + ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 2) + ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 3) + ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 4) + ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 5) + ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 6) + ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 7) + + ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 8) + ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 9) + ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 10) + ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 11) + ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 12) + ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 13) + ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 14) + ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 15) + + add K, K, #128 mov COUNT, #4 -.Loop2: +.Loop: EXPN( 0) ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 0) EXPN( 1) ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 1) @@ -221,33 +289,29 @@ PROLOGUE(_nettle_sha512_compress) subs COUNT, COUNT, #1 EXPN(15) ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 15) add K, K, #128 - bne .Loop2 + bne .Loop - vld1.64 {d24,d25,d26,d27}, [STATE] - vadd.i64 SA, SA, d24 - vadd.i64 SB, SB, d25 - vadd.i64 SC, SC, d26 - vadd.i64 SD, SD, d27 + vld1.64 {DW0, DW1, DW2, DW3}, [STATE] + vadd.i64 QSAB, QSAB, QW0001 + vadd.i64 QSCD, QSCD, QW0203 vst1.64 {SA,SB,SC,SD}, [STATE]! - vld1.64 {d24,d25,d26,d27}, [STATE] - vadd.i64 SE, SE, d24 - vadd.i64 SF, SF, d25 - vadd.i64 SG, SG, d26 - vadd.i64 SH, SH, d27 + vld1.64 {DW0, DW1, DW2, DW3}, [STATE] + vadd.i64 QSEF, QSEF, QW0001 + vadd.i64 QSGH, QSGH, QW0203 vst1.64 {SE,SF,SG,SH}, [STATE]! - add sp, sp, #128 - pop {r4,r5,r6,r7,r8,r10,pc} + vpop {d8,d9,d10,d11,d12,d13} + bx lr EPILOGUE(_nettle_sha512_compress) divert(-1) define shastate -p/x $d16.u64 -p/x $d17.u64 -p/x $d18.u64 -p/x $d19.u64 -p/x $d20.u64 -p/x $d21.u64 -p/x $d22.u64 -p/x $d23.u64 +p/x $d0.u64 +p/x $d1.u64 +p/x $d2.u64 +p/x $d3.u64 +p/x $d4.u64 +p/x $d5.u64 +p/x $d6.u64 +p/x $d7.u64 end |