author     Michael Weiser <michael.weiser@gmx.de>     2018-02-13 22:13:13 +0100
committer  Niels Möller <nisse@lysator.liu.se>        2018-03-25 11:27:37 +0200
commit     2644d1ed132f7dad05e165d6c96a68ee66547d32 (patch)
tree       c4164ff59e65d5bfb49e4047c8ee7a82444c688a /arm
parent     d5738a574daee265ebfcf28dd51dfdca56c1798b (diff)
download   nettle-2644d1ed132f7dad05e165d6c96a68ee66547d32.tar.gz
Support big-endian arm in assembly code
Adjust sha1-compress, sha256-compress, umac-nh, chacha-core-internal,
salsa20-core-internal and memxor for arm to work in big-endian mode.
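The shift-direction macros introduced below (S0ADJ/S1ADJ) exist because the
unaligned word loops reassemble each misaligned word from two aligned loads,
and which half sits in the high bits depends on byte order. A rough C model of
what the memxor word loop computes, not part of this commit and with
illustrative names only (memxor_model, src_aligned), assuming 0 < cnt < 32:

    #include <stdint.h>
    #include <stddef.h>

    /* dst is word-aligned; the source starts cnt/8 bytes past a word
       boundary, so each logical source word spans two aligned words. */
    static void
    memxor_model(uint32_t *dst, const uint32_t *src_aligned,
                 unsigned cnt, size_t nwords)
    {
      unsigned tnc = 32 - cnt;
      uint32_t prev = src_aligned[0];
      for (size_t i = 0; i < nwords; i++)
        {
          uint32_t next = src_aligned[i + 1];
    #if defined(__ARMEB__)
          /* big-endian: earlier bytes are the high bits, so shift left */
          dst[i] ^= (prev << cnt) ^ (next >> tnc);
    #else
          /* little-endian: earlier bytes are the low bits, so shift right */
          dst[i] ^= (prev >> cnt) ^ (next << tnc);
    #endif
          prev = next;
        }
    }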
Diffstat (limited to 'arm')
-rw-r--r--  arm/memxor.asm                      | 21
-rw-r--r--  arm/memxor3.asm                     | 49
-rw-r--r--  arm/neon/chacha-core-internal.asm   | 39
-rw-r--r--  arm/neon/salsa20-core-internal.asm  | 45
-rw-r--r--  arm/neon/umac-nh.asm                |  4
-rw-r--r--  arm/v6/sha1-compress.asm            |  8
-rw-r--r--  arm/v6/sha256-compress.asm          | 14
7 files changed, 138 insertions, 42 deletions
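The chacha and salsa20 changes below hinge on one observation, stated in their
comments: vldm loads consecutive doublewords, and on big-endian the two 32-bit
words inside each doubleword come out swapped, so rows read "1 0 3 2" instead
of "0 1 2 3". A hypothetical C model of that lane order (load_lanes and the
w/lane names are mine, not Nettle's):

    #include <stdint.h>
    #include <stdio.h>

    /* Lane i of the vector register ends up holding word i on LE,
       but word i^1 on BE, i.e. even and odd columns switch places. */
    static void
    load_lanes(uint32_t lane[4], const uint32_t w[4], int big_endian)
    {
      for (int i = 0; i < 4; i++)
        lane[i] = big_endian ? w[i ^ 1] : w[i];
    }

    int
    main(void)
    {
      uint32_t w[4] = { 0, 1, 2, 3 }, lane[4];
      load_lanes(lane, w, 1);
      /* prints "1 0 3 2", matching the big-endian row comments */
      printf("%u %u %u %u\n", lane[0], lane[1], lane[2], lane[3]);
      return 0;
    }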
diff --git a/arm/memxor.asm b/arm/memxor.asm
index a50e91bc..239a4034 100644
--- a/arm/memxor.asm
+++ b/arm/memxor.asm
@@ -44,6 +44,11 @@ define(<N>, <r2>)
 define(<CNT>, <r6>)
 define(<TNC>, <r12>)
 
+C little-endian and big-endian need to shift in different directions for
+C alignment correction
+define(<S0ADJ>, IF_LE(<lsr>, <lsl>))
+define(<S1ADJ>, IF_LE(<lsl>, <lsr>))
+
 	.syntax unified
 
 	.file "memxor.asm"
@@ -99,6 +104,8 @@ PROLOGUE(nettle_memxor)
 	C
 	C With little-endian, we need to do
 	C DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC)
+	C With big-endian, we need to do
+	C DST[i] ^= (SRC[i] << CNT) ^ (SRC[i+1] >> TNC)
 
 	push	{r4,r5,r6}
 
@@ -117,14 +124,14 @@ PROLOGUE(nettle_memxor)
 .Lmemxor_word_loop:
 	ldr	r5, [SRC], #+4
 	ldr	r3, [DST]
-	eor	r3, r3, r4, lsr	CNT
-	eor	r3, r3, r5, lsl	TNC
+	eor	r3, r3, r4, S0ADJ	CNT
+	eor	r3, r3, r5, S1ADJ	TNC
 	str	r3, [DST], #+4
 .Lmemxor_odd:
 	ldr	r4, [SRC], #+4
 	ldr	r3, [DST]
-	eor	r3, r3, r5, lsr	CNT
-	eor	r3, r3, r4, lsl	TNC
+	eor	r3, r3, r5, S0ADJ	CNT
+	eor	r3, r3, r4, S1ADJ	TNC
 	str	r3, [DST], #+4
 	subs	N, #8
 	bcs	.Lmemxor_word_loop
@@ -132,10 +139,14 @@ PROLOGUE(nettle_memxor)
 	beq	.Lmemxor_odd_done
 
 	C We have TNC/8 left-over bytes in r4, high end
-	lsr	r4, CNT
+	S0ADJ	r4, CNT
 	ldr	r3, [DST]
 	eor	r3, r4
 
+	C memxor_leftover does an LSB store
+	C so we need to reverse if actually BE
+IF_BE(<	rev	r3, r3>)
+
 	pop	{r4,r5,r6}
 
 	C Store bytes, one by one.
diff --git a/arm/memxor3.asm b/arm/memxor3.asm
index 139fd208..69598e1c 100644
--- a/arm/memxor3.asm
+++ b/arm/memxor3.asm
@@ -49,6 +49,11 @@ define(<ATNC>, <r10>)
 define(<BCNT>, <r11>)
 define(<BTNC>, <r12>)
 
+C little-endian and big-endian need to shift in different directions for
+C alignment correction
+define(<S0ADJ>, IF_LE(<lsr>, <lsl>))
+define(<S1ADJ>, IF_LE(<lsl>, <lsr>))
+
 	.syntax unified
 
 	.file "memxor3.asm"
@@ -124,6 +129,8 @@ PROLOGUE(nettle_memxor3)
 	C
 	C With little-endian, we need to do
 	C DST[i-i] ^= (SRC[i-i] >> CNT) ^ (SRC[i] << TNC)
+	C With big-endian, we need to do
+	C DST[i-i] ^= (SRC[i-i] << CNT) ^ (SRC[i] >> TNC)
 
 	rsb	ATNC, ACNT, #32
 	bic	BP, #3
@@ -138,14 +145,14 @@ PROLOGUE(nettle_memxor3)
 .Lmemxor3_au_loop:
 	ldr	r5, [BP, #-4]!
 	ldr	r6, [AP, #-4]!
-	eor	r6, r6, r4, lsl	ATNC
-	eor	r6, r6, r5, lsr	ACNT
+	eor	r6, r6, r4, S1ADJ	ATNC
+	eor	r6, r6, r5, S0ADJ	ACNT
 	str	r6, [DST, #-4]!
 .Lmemxor3_au_odd:
 	ldr	r4, [BP, #-4]!
 	ldr	r6, [AP, #-4]!
-	eor	r6, r6, r5, lsl	ATNC
-	eor	r6, r6, r4, lsr	ACNT
+	eor	r6, r6, r5, S1ADJ	ATNC
+	eor	r6, r6, r4, S0ADJ	ACNT
 	str	r6, [DST, #-4]!
 	subs	N, #8
 	bcs	.Lmemxor3_au_loop
@@ -154,7 +161,11 @@ PROLOGUE(nettle_memxor3)
 
 	C Leftover bytes in r4, low end
 	ldr	r5, [AP, #-4]
-	eor	r4, r5, r4, lsl	ATNC
+	eor	r4, r5, r4, S1ADJ	ATNC
+
+	C leftover does an LSB store
+	C so we need to reverse if actually BE
+IF_BE(<	rev	r4, r4>)
 
 .Lmemxor3_au_leftover:
 	C Store a byte at a time
@@ -247,21 +258,25 @@ PROLOGUE(nettle_memxor3)
 	ldr	r5, [AP, #-4]!
 	ldr	r6, [BP, #-4]!
 	eor	r5, r6
-	lsl	r4, ATNC
-	eor	r4, r4, r5, lsr	ACNT
+	S1ADJ	r4, ATNC
+	eor	r4, r4, r5, S0ADJ	ACNT
 	str	r4, [DST, #-4]!
 .Lmemxor3_uu_odd:
 	ldr	r4, [AP, #-4]!
 	ldr	r6, [BP, #-4]!
 	eor	r4, r6
-	lsl	r5, ATNC
-	eor	r5, r5, r4, lsr	ACNT
+	S1ADJ	r5, ATNC
+	eor	r5, r5, r4, S0ADJ	ACNT
 	str	r5, [DST, #-4]!
 	subs	N, #8
 	bcs	.Lmemxor3_uu_loop
 	adds	N, #8
 	beq	.Lmemxor3_done
 
+	C leftover does an LSB store
+	C so we need to reverse if actually BE
+IF_BE(<	rev	r4, r4>)
+
 	C Leftover bytes in a4, low end
 	ror	r4, ACNT
 .Lmemxor3_uu_leftover:
@@ -290,18 +305,18 @@ PROLOGUE(nettle_memxor3)
 .Lmemxor3_uud_loop:
 	ldr	r5, [AP, #-4]!
 	ldr	r7, [BP, #-4]!
-	lsl	r4, ATNC
-	eor	r4, r4, r6, lsl	BTNC
-	eor	r4, r4, r5, lsr	ACNT
-	eor	r4, r4, r7, lsr	BCNT
+	S1ADJ	r4, ATNC
+	eor	r4, r4, r6, S1ADJ	BTNC
+	eor	r4, r4, r5, S0ADJ	ACNT
+	eor	r4, r4, r7, S0ADJ	BCNT
 	str	r4, [DST, #-4]!
 .Lmemxor3_uud_odd:
 	ldr	r4, [AP, #-4]!
 	ldr	r6, [BP, #-4]!
-	lsl	r5, ATNC
-	eor	r5, r5, r7, lsl	BTNC
-	eor	r5, r5, r4, lsr	ACNT
-	eor	r5, r5, r6, lsr	BCNT
+	S1ADJ	r5, ATNC
+	eor	r5, r5, r7, S1ADJ	BTNC
+	eor	r5, r5, r4, S0ADJ	ACNT
+	eor	r5, r5, r6, S0ADJ	BCNT
 	str	r5, [DST, #-4]!
 	subs	N, #8
 	bcs	.Lmemxor3_uud_loop
diff --git a/arm/neon/chacha-core-internal.asm b/arm/neon/chacha-core-internal.asm
index 6f623106..22f843e8 100644
--- a/arm/neon/chacha-core-internal.asm
+++ b/arm/neon/chacha-core-internal.asm
@@ -90,31 +90,52 @@ PROLOGUE(_nettle_chacha_core)
 	vmov	S2, X2
 	vmov	S3, X3
 
-	C Input rows:
+	C Input rows little-endian:
 	C	 0  1  2  3	X0
 	C	 4  5  6  7	X1
 	C	 8  9 10 11	X2
 	C	12 13 14 15	X3
+	C Input rows big-endian:
+	C	 1  0  3  2	X0
+	C	 5  4  7  6	X1
+	C	 9  8 11 10	X2
+	C	13 12 15 14	X3
+	C even and odd columns switched because
+	C vldm loads consecutive doublewords and
+	C switches words inside them to make them BE
 
 .Loop:
 	QROUND(X0, X1, X2, X3)
-	C Rotate rows, to get
+	C In little-endian rotate rows, to get
 	C	 0  1  2  3
 	C	 5  6  7  4  >>> 3
 	C	10 11  8  9  >>> 2
 	C	15 12 13 14  >>> 1
-	vext.32	X1, X1, X1, #1
+
+	C In big-endian rotate rows, to get
+	C	 1  0  3  2
+	C	 6  5  4  7  >>> 1
+	C	11 10  9  8  >>> 2
+	C	12 15 14 13  >>> 3
+	C different number of elements needs to be
+	C extracted on BE because of different column order
+IF_LE(<	vext.32	X1, X1, X1, #1>)
+IF_BE(<	vext.32	X1, X1, X1, #3>)
 	vext.32	X2, X2, X2, #2
-	vext.32	X3, X3, X3, #3
+IF_LE(<	vext.32	X3, X3, X3, #3>)
+IF_BE(<	vext.32	X3, X3, X3, #1>)
 
 	QROUND(X0, X1, X2, X3)
 
 	subs	ROUNDS, ROUNDS, #2
 	C Inverse rotation
-	vext.32	X1, X1, X1, #3
+IF_LE(<	vext.32	X1, X1, X1, #3>)
+IF_BE(<	vext.32	X1, X1, X1, #1>)
 	vext.32	X2, X2, X2, #2
-	vext.32	X3, X3, X3, #1
+IF_LE(<	vext.32	X3, X3, X3, #1>)
+IF_BE(<	vext.32	X3, X3, X3, #3>)
 
 	bhi	.Loop
 
@@ -123,6 +144,12 @@ PROLOGUE(_nettle_chacha_core)
 	vadd.u32	X2, X2, S2
 	vadd.u32	X3, X3, S3
 
+	C caller expects result little-endian
+IF_BE(<	vrev32.u8	X0, X0
+	vrev32.u8	X1, X1
+	vrev32.u8	X2, X2
+	vrev32.u8	X3, X3>)
+
 	vstm	DST, {X0,X1,X2,X3}
 	bx	lr
 EPILOGUE(_nettle_chacha_core)
diff --git a/arm/neon/salsa20-core-internal.asm b/arm/neon/salsa20-core-internal.asm
index 34eb1fba..20710499 100644
--- a/arm/neon/salsa20-core-internal.asm
+++ b/arm/neon/salsa20-core-internal.asm
@@ -88,7 +88,7 @@ define(<QROUND>, <
 PROLOGUE(_nettle_salsa20_core)
 	vldm	SRC, {X0,X1,X2,X3}
 
-	C Input rows:
+	C Input rows little-endian:
 	C	 0  1  2  3	X0
 	C	 4  5  6  7	X1
 	C	 8  9 10 11	X2
@@ -99,6 +99,20 @@ PROLOGUE(_nettle_salsa20_core)
 	C	 8 13  2  7
 	C	12  1  6 11
 
+	C Input rows big-endian:
+	C	 1  0  3  2	X0
+	C	 5  4  7  6	X1
+	C	 9  8 11 10	X2
+	C	13 12 15 14	X3
+	C even and odd columns switched because
+	C vldm loads consecutive doublewords and
+	C switches words inside them to make them BE
+	C Permuted to:
+	C	 5  0 15 10
+	C	 9  4  3 14
+	C	13  8  7  2
+	C	 1 12 11  6
+
 	C FIXME: Construct in some other way?
 	adr	r12, .Lmasks
 	vldm	r12, {M0101, M0110, M0011}
@@ -112,6 +126,7 @@ PROLOGUE(_nettle_salsa20_core)
 	C	 4  1  6  3	T0 v
 	C	 8 13 10 15	T1 ^
 	C	12  9 14 11	X3 v
+	C same in big endian just with transposed rows
 	vmov	T0, X1
 	vmov	T1, X2
 	vbit	T0, X0, M0101
@@ -140,22 +155,34 @@ PROLOGUE(_nettle_salsa20_core)
 .Loop:
 	QROUND(X0, X1, X2, X3)
-	C Rotate rows, to get
+	C In little-endian rotate rows, to get
 	C	 0  5 10 15
 	C	 3  4  9 14  >>> 1
 	C	 2  7  8 13  >>> 2
 	C	 1  6 11 12  >>> 3
-	vext.32	X1, X1, X1, #3
+
+	C In big-endian rotate rows, to get
+	C	 5  0 15 10
+	C	 4  3 14  9  >>> 3
+	C	 7  2 13  8  >>> 2
+	C	 6  1 12 11  >>> 1
+	C different number of elements needs to be
+	C extracted on BE because of different column order
+IF_LE(<	vext.32	X1, X1, X1, #3>)
+IF_BE(<	vext.32	X1, X1, X1, #1>)
 	vext.32	X2, X2, X2, #2
-	vext.32	X3, X3, X3, #1
+IF_LE(<	vext.32	X3, X3, X3, #1>)
+IF_BE(<	vext.32	X3, X3, X3, #3>)
 
 	QROUND(X0, X3, X2, X1)
 
 	subs	ROUNDS, ROUNDS, #2
 	C Inverse rotation
-	vext.32	X1, X1, X1, #1
+IF_LE(<	vext.32	X1, X1, X1, #1>)
+IF_BE(<	vext.32	X1, X1, X1, #3>)
 	vext.32	X2, X2, X2, #2
-	vext.32	X3, X3, X3, #3
+IF_LE(<	vext.32	X3, X3, X3, #3>)
+IF_BE(<	vext.32	X3, X3, X3, #1>)
 
 	bhi	.Loop
 
@@ -181,6 +208,12 @@ PROLOGUE(_nettle_salsa20_core)
 	vadd.u32	X2, X2, S2
 	vadd.u32	X3, X3, S3
 
+	C caller expects result little-endian
+IF_BE(<	vrev32.u8	X0, X0
+	vrev32.u8	X1, X1
+	vrev32.u8	X2, X2
+	vrev32.u8	X3, X3>)
+
 	vstm	DST, {X0,X1,X2,X3}
 	bx	lr
 EPILOGUE(_nettle_salsa20_core)
diff --git a/arm/neon/umac-nh.asm b/arm/neon/umac-nh.asm
index 158a5686..38be654c 100644
--- a/arm/neon/umac-nh.asm
+++ b/arm/neon/umac-nh.asm
@@ -97,6 +97,8 @@ PROLOGUE(_nettle_umac_nh)
 	bhi	.Loop
 
 	vadd.i64	D0REG(QY), D0REG(QY), D1REG(QY)
-	vmov	r0, r1, D0REG(QY)
+	C return value needs to respect word order mandated by AAPCS
+IF_LE(<	vmov	r0, r1, D0REG(QY)>)
+IF_BE(<	vmov	r1, r0, D0REG(QY)>)
 	bx	lr
 EPILOGUE(_nettle_umac_nh)
diff --git a/arm/v6/sha1-compress.asm b/arm/v6/sha1-compress.asm
index 59d6297e..8cc22be7 100644
--- a/arm/v6/sha1-compress.asm
+++ b/arm/v6/sha1-compress.asm
@@ -52,7 +52,7 @@ define(<LOAD>, <
 	sel	W, WPREV, T0
 	ror	W, W, SHIFT
 	mov	WPREV, T0
-	rev	W, W
+IF_LE(<	rev	W, W>)
 	str	W, [SP,#eval(4*$1)]
 >)
 define(<EXPN>, <
@@ -127,8 +127,12 @@ PROLOGUE(_nettle_sha1_compress)
 	lsl	SHIFT, SHIFT, #3
 	mov	T0, #0
 	movne	T0, #-1
-	lsl	W, T0, SHIFT
+IF_LE(<	lsl	W, T0, SHIFT>)
+IF_BE(<	lsr	W, T0, SHIFT>)
 	uadd8	T0, T0, W	C Sets APSR.GE bits
+	C on BE rotate right by 32-SHIFT bits
+	C because there is no rotate left
+IF_BE(<	rsb	SHIFT, SHIFT, #32>)
 
 	ldr	K, .LK1
 	ldm	STATE, {SA,SB,SC,SD,SE}
diff --git a/arm/v6/sha256-compress.asm b/arm/v6/sha256-compress.asm
index e6f4e1e9..324730c7 100644
--- a/arm/v6/sha256-compress.asm
+++ b/arm/v6/sha256-compress.asm
@@ -137,8 +137,12 @@ PROLOGUE(_nettle_sha256_compress)
 	lsl	SHIFT, SHIFT, #3
 	mov	T0, #0
 	movne	T0, #-1
-	lsl	I1, T0, SHIFT
+IF_LE(<	lsl	I1, T0, SHIFT>)
+IF_BE(<	lsr	I1, T0, SHIFT>)
 	uadd8	T0, T0, I1	C Sets APSR.GE bits
+	C on BE rotate right by 32-SHIFT bits
+	C because there is no rotate left
+IF_BE(<	rsb	SHIFT, SHIFT, #32>)
 
 	mov	DST, sp
 	mov	ILEFT, #4
@@ -146,16 +150,16 @@ PROLOGUE(_nettle_sha256_compress)
 	ldm	INPUT!, {I1,I2,I3,I4}
 	sel	I0, I0, I1
 	ror	I0, I0, SHIFT
-	rev	I0, I0
+IF_LE(<	rev	I0, I0>)
 	sel	I1, I1, I2
 	ror	I1, I1, SHIFT
-	rev	I1, I1
+IF_LE(<	rev	I1, I1>)
 	sel	I2, I2, I3
 	ror	I2, I2, SHIFT
-	rev	I2, I2
+IF_LE(<	rev	I2, I2>)
 	sel	I3, I3, I4
 	ror	I3, I3, SHIFT
-	rev	I3, I3
+IF_LE(<	rev	I3, I3>)
 	subs	ILEFT, ILEFT, #1
 	stm	DST!, {I0,I1,I2,I3}
 	mov	I0, I4
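The rsb SHIFT, SHIFT, #32 added to both SHA files uses the usual identity for
architectures that only have a rotate-right: rotating left by s bits equals
rotating right by 32 - s. A small self-contained check of that identity, with
helper names of my own (rotr32/rotl32), not Nettle API:

    #include <stdint.h>
    #include <assert.h>

    static uint32_t
    rotr32(uint32_t x, unsigned s)      /* what ARM ror does */
    {
      s &= 31;
      return s ? (x >> s) | (x << (32 - s)) : x;
    }

    static uint32_t
    rotl32(uint32_t x, unsigned s)      /* no rol: ror by 32 - s instead */
    {
      return rotr32(x, (32 - s) & 31);
    }

    int
    main(void)
    {
      assert(rotl32(0x00000001u, 1) == 0x00000002u);
      assert(rotl32(0x80000001u, 1) == 0x00000003u);
      assert(rotl32(0xdeadbeefu, 13) == rotr32(0xdeadbeefu, 19));
      return 0;
    }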