diff options
author | Michael Weiser <michael.weiser@gmx.de> | 2018-02-13 22:13:13 +0100 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2018-03-25 11:27:37 +0200 |
commit | 2644d1ed132f7dad05e165d6c96a68ee66547d32 (patch) | |
tree | c4164ff59e65d5bfb49e4047c8ee7a82444c688a /arm/neon | |
parent | d5738a574daee265ebfcf28dd51dfdca56c1798b (diff) | |
download | nettle-2644d1ed132f7dad05e165d6c96a68ee66547d32.tar.gz |
Support big-endian arm in assembly code
Adjust sha1-compress, sha256-compress, umac-nh, chacha-core-internal,
salsa20-core-internal and memxor for arm to work in big-endian mode.
Diffstat (limited to 'arm/neon')
-rw-r--r-- | arm/neon/chacha-core-internal.asm | 39 | ||||
-rw-r--r-- | arm/neon/salsa20-core-internal.asm | 45 | ||||
-rw-r--r-- | arm/neon/umac-nh.asm | 4 |
3 files changed, 75 insertions, 13 deletions
diff --git a/arm/neon/chacha-core-internal.asm b/arm/neon/chacha-core-internal.asm
index 6f623106..22f843e8 100644
--- a/arm/neon/chacha-core-internal.asm
+++ b/arm/neon/chacha-core-internal.asm
@@ -90,31 +90,52 @@ PROLOGUE(_nettle_chacha_core)
 	vmov	S2, X2
 	vmov	S3, X3
 
-	C Input rows:
+	C Input rows little-endian:
 	C	 0  1  2  3	X0
 	C	 4  5  6  7	X1
 	C	 8  9 10 11	X2
 	C	12 13 14 15	X3
 
+	C Input rows big-endian:
+	C	 1  0  3  2	X0
+	C	 5  4  7  6	X1
+	C	 9  8 11 10	X2
+	C	13 12 15 14	X3
+	C even and odd columns switched because
+	C vldm loads consecutive doublewords and
+	C switches words inside them to make them BE
+
 .Loop:
 	QROUND(X0, X1, X2, X3)
 
-	C Rotate rows, to get
+	C In little-endian rotate rows, to get
 	C	 0  1  2  3
 	C	 5  6  7  4  >>> 3
 	C	10 11  8  9  >>> 2
 	C	15 12 13 14  >>> 1
-	vext.32	X1, X1, X1, #1
+
+	C In big-endian rotate rows, to get
+	C	 1  0  3  2
+	C	 6  5  4  7  >>> 1
+	C	11 10  9  8  >>> 2
+	C	12 15 14 13  >>> 3
+	C different number of elements needs to be
+	C extracted on BE because of different column order
+IF_LE(<	vext.32	X1, X1, X1, #1>)
+IF_BE(<	vext.32	X1, X1, X1, #3>)
 	vext.32	X2, X2, X2, #2
-	vext.32	X3, X3, X3, #3
+IF_LE(<	vext.32	X3, X3, X3, #3>)
+IF_BE(<	vext.32	X3, X3, X3, #1>)
 
 	QROUND(X0, X1, X2, X3)
 
 	subs	ROUNDS, ROUNDS, #2
 	C Inverse rotation
-	vext.32	X1, X1, X1, #3
+IF_LE(<	vext.32	X1, X1, X1, #3>)
+IF_BE(<	vext.32	X1, X1, X1, #1>)
 	vext.32	X2, X2, X2, #2
-	vext.32	X3, X3, X3, #1
+IF_LE(<	vext.32	X3, X3, X3, #1>)
+IF_BE(<	vext.32	X3, X3, X3, #3>)
 
 	bhi	.Loop
 
@@ -123,6 +144,12 @@ PROLOGUE(_nettle_chacha_core)
 	vadd.u32	X2, X2, S2
 	vadd.u32	X3, X3, S3
 
+	C caller expects result little-endian
+IF_BE(<	vrev32.u8	X0, X0
+	vrev32.u8	X1, X1
+	vrev32.u8	X2, X2
+	vrev32.u8	X3, X3>)
+
 	vstm	DST, {X0,X1,X2,X3}
 	bx	lr
 EPILOGUE(_nettle_chacha_core)
diff --git a/arm/neon/salsa20-core-internal.asm b/arm/neon/salsa20-core-internal.asm
index 34eb1fba..20710499 100644
--- a/arm/neon/salsa20-core-internal.asm
+++ b/arm/neon/salsa20-core-internal.asm
@@ -88,7 +88,7 @@ define(<QROUND>, <
 PROLOGUE(_nettle_salsa20_core)
 	vldm	SRC, {X0,X1,X2,X3}
 
-	C Input rows:
+	C Input rows little-endian:
 	C	 0  1  2  3	X0
 	C	 4  5  6  7	X1
 	C	 8  9 10 11	X2
@@ -99,6 +99,20 @@ PROLOGUE(_nettle_salsa20_core)
 	C	 8 13  2  7
 	C	12  1  6 11
 
+	C Input rows big-endian:
+	C	 1  0  3  2	X0
+	C	 5  4  7  6	X1
+	C	 9  8 11 10	X2
+	C	13 12 15 14	X3
+	C even and odd columns switched because
+	C vldm loads consecutive doublewords and
+	C switches words inside them to make them BE
+	C Permuted to:
+	C	 5  0 15 10
+	C	 9  4  3 14
+	C	13  8  7  2
+	C	 1 12 11  6
+
 	C FIXME: Construct in some other way?
 	adr	r12, .Lmasks
 	vldm	r12, {M0101, M0110, M0011}
@@ -112,6 +126,7 @@ PROLOGUE(_nettle_salsa20_core)
 	C	 4  1  6  3	T0 v
 	C	 8 13 10 15	T1  ^
 	C	12  9 14 11	X3 v
+	C same in big endian just with transposed rows
 	vmov	T0, X1
 	vmov	T1, X2
 	vbit	T0, X0, M0101
@@ -140,22 +155,34 @@ PROLOGUE(_nettle_salsa20_core)
 .Loop:
 	QROUND(X0, X1, X2, X3)
 
-	C Rotate rows, to get
+	C In little-endian rotate rows, to get
 	C	 0  5 10 15
 	C	 3  4  9 14  >>> 1
 	C	 2  7  8 13  >>> 2
 	C	 1  6 11 12  >>> 3
-	vext.32	X1, X1, X1, #3
+
+	C In big-endian rotate rows, to get
+	C	 5  0 15 10
+	C	 4  3 14  9  >>> 3
+	C	 7  2 13  8  >>> 2
+	C	 6  1 12 11  >>> 1
+	C different number of elements needs to be
+	C extracted on BE because of different column order
+IF_LE(<	vext.32	X1, X1, X1, #3>)
+IF_BE(<	vext.32	X1, X1, X1, #1>)
 	vext.32	X2, X2, X2, #2
-	vext.32	X3, X3, X3, #1
+IF_LE(<	vext.32	X3, X3, X3, #1>)
+IF_BE(<	vext.32	X3, X3, X3, #3>)
 
 	QROUND(X0, X3, X2, X1)
 
 	subs	ROUNDS, ROUNDS, #2
 	C Inverse rotation
-	vext.32	X1, X1, X1, #1
+IF_LE(<	vext.32	X1, X1, X1, #1>)
+IF_BE(<	vext.32	X1, X1, X1, #3>)
 	vext.32	X2, X2, X2, #2
-	vext.32	X3, X3, X3, #3
+IF_LE(<	vext.32	X3, X3, X3, #3>)
+IF_BE(<	vext.32	X3, X3, X3, #1>)
 
 	bhi	.Loop
 
@@ -181,6 +208,12 @@ PROLOGUE(_nettle_salsa20_core)
 	vadd.u32	X2, X2, S2
 	vadd.u32	X3, X3, S3
 
+	C caller expects result little-endian
+IF_BE(<	vrev32.u8	X0, X0
+	vrev32.u8	X1, X1
+	vrev32.u8	X2, X2
+	vrev32.u8	X3, X3>)
+
 	vstm	DST, {X0,X1,X2,X3}
 	bx	lr
 EPILOGUE(_nettle_salsa20_core)
diff --git a/arm/neon/umac-nh.asm b/arm/neon/umac-nh.asm
index 158a5686..38be654c 100644
--- a/arm/neon/umac-nh.asm
+++ b/arm/neon/umac-nh.asm
@@ -97,6 +97,8 @@ PROLOGUE(_nettle_umac_nh)
 	bhi	.Loop
 
 	vadd.i64	D0REG(QY), D0REG(QY), D1REG(QY)
-	vmov	r0, r1, D0REG(QY)
+	C return value needs to respect word order mandated by AAPCS
+IF_LE(<	vmov	r0, r1, D0REG(QY)>)
+IF_BE(<	vmov	r1, r0, D0REG(QY)>)
 	bx	lr
 EPILOGUE(_nettle_umac_nh)