author     Michael Weiser <michael.weiser@gmx.de>   2018-02-13 22:13:13 +0100
committer  Niels Möller <nisse@lysator.liu.se>      2018-03-25 11:27:37 +0200
commit     2644d1ed132f7dad05e165d6c96a68ee66547d32 (patch)
tree       c4164ff59e65d5bfb49e4047c8ee7a82444c688a /arm/neon
parent     d5738a574daee265ebfcf28dd51dfdca56c1798b (diff)
download   nettle-2644d1ed132f7dad05e165d6c96a68ee66547d32.tar.gz
Support big-endian arm in assembly code
Adjust sha1-compress, sha256-compress, umac-nh, chacha-core-internal, salsa20-core-internal and memxor for arm to work in big-endian mode.
Diffstat (limited to 'arm/neon')
-rw-r--r--  arm/neon/chacha-core-internal.asm    39
-rw-r--r--  arm/neon/salsa20-core-internal.asm   45
-rw-r--r--  arm/neon/umac-nh.asm                  4
3 files changed, 75 insertions, 13 deletions
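
The issue behind all three changes is how vldm/vstm see the 32-bit state words on a big-endian core. The following standalone C sketch is not part of the commit; it models the 64-bit load granularity in plain C, assuming that lane 0 is the numerically low half of each doubleword (the NEON .32 element view). It shows why a row stored in memory as words 0 1 2 3 appears in the lanes as 1 0 3 2 after a big-endian vldm, while each word's value stays intact.

    /* Standalone illustration (not part of the commit): why the 32-bit state
     * words come out in lane order 1 0 3 2 instead of 0 1 2 3 when vldm loads
     * them as 64-bit doublewords on a big-endian core. */
    #include <stdint.h>
    #include <stdio.h>

    /* Store one 32-bit word into memory in the given endianness. */
    static void store32(uint8_t *p, uint32_t w, int big)
    {
      for (int i = 0; i < 4; i++)
        p[i] = (uint8_t)(w >> (big ? 24 - 8*i : 8*i));
    }

    /* Load one 64-bit doubleword in the given endianness (models vldm). */
    static uint64_t load64(const uint8_t *p, int big)
    {
      uint64_t d = 0;
      for (int i = 0; i < 8; i++)
        d |= (uint64_t)p[i] << (big ? 56 - 8*i : 8*i);
      return d;
    }

    static void show(int big)
    {
      uint32_t row[4] = {0x0a0b0c0d, 0x1a1b1c1d, 0x2a2b2c2d, 0x3a3b3c3d};
      uint8_t mem[16];

      for (int i = 0; i < 4; i++)       /* state row as the C caller stores it */
        store32(mem + 4*i, row[i], big);

      printf("%s lanes:", big ? "BE" : "LE");
      for (int i = 0; i < 2; i++)
        {
          uint64_t d = load64(mem + 8*i, big);
          /* lane 0 is the low half of the doubleword, lane 1 the high half */
          printf(" %08x %08x", (uint32_t)d, (uint32_t)(d >> 32));
        }
      printf("\n");
    }

    int main(void)
    {
      show(0);  /* LE lanes: 0a0b0c0d 1a1b1c1d 2a2b2c2d 3a3b3c3d  (0 1 2 3) */
      show(1);  /* BE lanes: 1a1b1c1d 0a0b0c0d 3a3b3c3d 2a2b2c2d  (1 0 3 2) */
      return 0;
    }
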
diff --git a/arm/neon/chacha-core-internal.asm b/arm/neon/chacha-core-internal.asm
index 6f623106..22f843e8 100644
--- a/arm/neon/chacha-core-internal.asm
+++ b/arm/neon/chacha-core-internal.asm
@@ -90,31 +90,52 @@ PROLOGUE(_nettle_chacha_core)
vmov S2, X2
vmov S3, X3
- C Input rows:
+ C Input rows little-endian:
C 0 1 2 3 X0
C 4 5 6 7 X1
C 8 9 10 11 X2
C 12 13 14 15 X3
+ C Input rows big-endian:
+ C 1 0 3 2 X0
+ C 5 4 7 6 X1
+ C 9 8 11 10 X2
+ C 13 12 15 14 X3
+ C Even and odd columns are switched because
+ C vldm loads consecutive doublewords and, on BE,
+ C the two words within each doubleword end up swapped
+
.Loop:
QROUND(X0, X1, X2, X3)
- C Rotate rows, to get
+ C In little-endian rotate rows, to get
C 0 1 2 3
C 5 6 7 4 >>> 3
C 10 11 8 9 >>> 2
C 15 12 13 14 >>> 1
- vext.32 X1, X1, X1, #1
+
+ C In big-endian rotate rows, to get
+ C 1 0 3 2
+ C 6 5 4 7 >>> 1
+ C 11 10 9 8 >>> 2
+ C 12 15 14 13 >>> 3
+ C a different number of elements has to be
+ C extracted on BE because of the different column order
+IF_LE(< vext.32 X1, X1, X1, #1>)
+IF_BE(< vext.32 X1, X1, X1, #3>)
vext.32 X2, X2, X2, #2
- vext.32 X3, X3, X3, #3
+IF_LE(< vext.32 X3, X3, X3, #3>)
+IF_BE(< vext.32 X3, X3, X3, #1>)
QROUND(X0, X1, X2, X3)
subs ROUNDS, ROUNDS, #2
C Inverse rotation
- vext.32 X1, X1, X1, #3
+IF_LE(< vext.32 X1, X1, X1, #3>)
+IF_BE(< vext.32 X1, X1, X1, #1>)
vext.32 X2, X2, X2, #2
- vext.32 X3, X3, X3, #1
+IF_LE(< vext.32 X3, X3, X3, #1>)
+IF_BE(< vext.32 X3, X3, X3, #3>)
bhi .Loop
@@ -123,6 +144,12 @@ PROLOGUE(_nettle_chacha_core)
vadd.u32 X2, X2, S2
vadd.u32 X3, X3, S3
+ C caller expects result little-endian
+IF_BE(< vrev32.u8 X0, X0
+ vrev32.u8 X1, X1
+ vrev32.u8 X2, X2
+ vrev32.u8 X3, X3>)
+
vstm DST, {X0,X1,X2,X3}
bx lr
EPILOGUE(_nettle_chacha_core)
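
The vext element counts above differ between LE and BE because the lanes hold the columns in a different order. A small C check, not from the commit, confirms this; rot4() is a hypothetical stand-in for vext.32 Xn, Xn, Xn, #k, which makes lane k of the source become lane 0 of the result.

    /* Sketch (not from the commit): vext.32 ..., #1 on the LE lane order and
     * vext.32 ..., #3 on the BE lane order rotate ChaCha row X1 the same way. */
    #include <stdio.h>

    static void rot4(int out[4], const int in[4], int k)
    {
      for (int i = 0; i < 4; i++)
        out[i] = in[(i + k) % 4];     /* lane k becomes lane 0 */
    }

    int main(void)
    {
      /* Row X1 holds state words 4 5 6 7; the lane views differ: */
      int le[4] = {4, 5, 6, 7};       /* little-endian lane order */
      int be[4] = {5, 4, 7, 6};       /* big-endian lane order, pairs swapped */
      int le_r[4], be_r[4];

      rot4(le_r, le, 1);              /* IF_LE: vext.32 X1, X1, X1, #1 */
      rot4(be_r, be, 3);              /* IF_BE: vext.32 X1, X1, X1, #3 */

      printf("LE lanes after vext #1:");
      for (int i = 0; i < 4; i++) printf(" %d", le_r[i]);   /* 5 6 7 4 */
      printf("\nBE lanes after vext #3:");
      for (int i = 0; i < 4; i++) printf(" %d", be_r[i]);   /* 6 5 4 7 */
      printf("\n");

      /* Both describe the logical row 5 6 7 4: the BE view is the LE view
       * with each even/odd lane pair swapped (index i ^ 1). */
      for (int i = 0; i < 4; i++)
        if (le_r[i] != be_r[i ^ 1])
          return 1;
      return 0;
    }
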
diff --git a/arm/neon/salsa20-core-internal.asm b/arm/neon/salsa20-core-internal.asm
index 34eb1fba..20710499 100644
--- a/arm/neon/salsa20-core-internal.asm
+++ b/arm/neon/salsa20-core-internal.asm
@@ -88,7 +88,7 @@ define(<QROUND>, <
PROLOGUE(_nettle_salsa20_core)
vldm SRC, {X0,X1,X2,X3}
- C Input rows:
+ C Input rows little-endian:
C 0 1 2 3 X0
C 4 5 6 7 X1
C 8 9 10 11 X2
@@ -99,6 +99,20 @@ PROLOGUE(_nettle_salsa20_core)
C 8 13 2 7
C 12 1 6 11
+ C Input rows big-endian:
+ C 1 0 3 2 X0
+ C 5 4 7 6 X1
+ C 9 8 11 10 X2
+ C 13 12 15 14 X3
+ C Even and odd columns are switched because
+ C vldm loads consecutive doublewords and, on BE,
+ C the two words within each doubleword end up swapped
+ C Permuted to:
+ C 5 0 15 10
+ C 9 4 3 14
+ C 13 8 7 2
+ C 1 12 11 6
+
C FIXME: Construct in some other way?
adr r12, .Lmasks
vldm r12, {M0101, M0110, M0011}
@@ -112,6 +126,7 @@ PROLOGUE(_nettle_salsa20_core)
C 4 1 6 3 T0 v
C 8 13 10 15 T1 ^
C 12 9 14 11 X3 v
+ C the same in big endian, just with transposed rows
vmov T0, X1
vmov T1, X2
vbit T0, X0, M0101
@@ -140,22 +155,34 @@ PROLOGUE(_nettle_salsa20_core)
.Loop:
QROUND(X0, X1, X2, X3)
- C Rotate rows, to get
+ C In little-endian rotate rows, to get
C 0 5 10 15
C 3 4 9 14 >>> 1
C 2 7 8 13 >>> 2
C 1 6 11 12 >>> 3
- vext.32 X1, X1, X1, #3
+
+ C In big-endian rotate rows, to get
+ C 5 0 15 10
+ C 4 3 14 9 >>> 3
+ C 7 2 13 8 >>> 2
+ C 6 1 12 11 >>> 1
+ C a different number of elements has to be
+ C extracted on BE because of the different column order
+IF_LE(< vext.32 X1, X1, X1, #3>)
+IF_BE(< vext.32 X1, X1, X1, #1>)
vext.32 X2, X2, X2, #2
- vext.32 X3, X3, X3, #1
+IF_LE(< vext.32 X3, X3, X3, #1>)
+IF_BE(< vext.32 X3, X3, X3, #3>)
QROUND(X0, X3, X2, X1)
subs ROUNDS, ROUNDS, #2
C Inverse rotation
- vext.32 X1, X1, X1, #1
+IF_LE(< vext.32 X1, X1, X1, #1>)
+IF_BE(< vext.32 X1, X1, X1, #3>)
vext.32 X2, X2, X2, #2
- vext.32 X3, X3, X3, #3
+IF_LE(< vext.32 X3, X3, X3, #3>)
+IF_BE(< vext.32 X3, X3, X3, #1>)
bhi .Loop
@@ -181,6 +208,12 @@ PROLOGUE(_nettle_salsa20_core)
vadd.u32 X2, X2, S2
vadd.u32 X3, X3, S3
+ C caller expects result little-endian
+IF_BE(< vrev32.u8 X0, X0
+ vrev32.u8 X1, X1
+ vrev32.u8 X2, X2
+ vrev32.u8 X3, X3>)
+
vstm DST, {X0,X1,X2,X3}
bx lr
EPILOGUE(_nettle_salsa20_core)
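
Both core functions end with the same fix-up: on BE, vrev32.u8 byte-reverses every 32-bit element before vstm. The doubleword-wise store places the swapped lanes back at their little-endian addresses, and the vrev32.u8 fixes the byte order inside each word, so the caller sees exactly the byte sequence a little-endian build writes. A minimal C sketch of the per-word part only (hypothetical helper names, not nettle API):

    /* Sketch (not nettle code): the effect of vrev32.u8 on one 32-bit element.
     * Byte-reversing each word before the big-endian store yields the same
     * byte sequence in memory as a plain little-endian store of that word. */
    #include <stdint.h>
    #include <assert.h>

    static uint32_t rev32_u8(uint32_t w)   /* byte-reverse one 32-bit lane */
    {
      return (w >> 24) | ((w >> 8) & 0x0000ff00)
           | ((w << 8) & 0x00ff0000) | (w << 24);
    }

    /* One word as a big-endian core stores it with vstm. */
    static void store32_be(uint8_t *p, uint32_t w)
    {
      p[0] = w >> 24; p[1] = w >> 16; p[2] = w >> 8; p[3] = w;
    }

    /* One word as a little-endian build of the same code stores it. */
    static void store32_le(uint8_t *p, uint32_t w)
    {
      p[0] = w; p[1] = w >> 8; p[2] = w >> 16; p[3] = w >> 24;
    }

    int main(void)
    {
      uint32_t x = 0x61707865;             /* "expa", first ChaCha constant */
      uint8_t le[4], be[4];

      store32_le(le, x);                   /* LE build writes 65 78 70 61 */
      store32_be(be, rev32_u8(x));         /* BE build after vrev32.u8 */

      for (int i = 0; i < 4; i++)
        assert(le[i] == be[i]);            /* identical byte sequences */
      return 0;
    }
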
diff --git a/arm/neon/umac-nh.asm b/arm/neon/umac-nh.asm
index 158a5686..38be654c 100644
--- a/arm/neon/umac-nh.asm
+++ b/arm/neon/umac-nh.asm
@@ -97,6 +97,8 @@ PROLOGUE(_nettle_umac_nh)
bhi .Loop
vadd.i64 D0REG(QY), D0REG(QY), D1REG(QY)
- vmov r0, r1, D0REG(QY)
+ C return value needs to respect word order mandated by AAPCS
+IF_LE(< vmov r0, r1, D0REG(QY)>)
+IF_BE(< vmov r1, r0, D0REG(QY)>)
bx lr
EPILOGUE(_nettle_umac_nh)
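
The umac-nh change concerns the AAPCS rule that a 64-bit result is returned in the r0/r1 pair, with the most significant word in r0 on big-endian; that is why the destination registers of the final vmov are swapped for BE. A hedged C model of that mapping (struct and function names are illustrative only, not nettle API):

    /* Illustration (not nettle code): how a uint64_t return value maps onto
     * the r0/r1 register pair under AAPCS for each endianness. */
    #include <stdint.h>
    #include <stdio.h>

    struct reg_pair { uint32_t r0, r1; };

    static struct reg_pair
    map_return(uint64_t y, int big_endian)
    {
      struct reg_pair p;
      uint32_t lo = (uint32_t)y, hi = (uint32_t)(y >> 32);

      if (big_endian) {
        p.r0 = hi;    /* IF_BE: vmov r1, r0, D0REG(QY) puts the high word in r0 */
        p.r1 = lo;
      } else {
        p.r0 = lo;    /* IF_LE: vmov r0, r1, D0REG(QY) puts the low word in r0 */
        p.r1 = hi;
      }
      return p;
    }

    int main(void)
    {
      struct reg_pair le = map_return(0x1122334455667788ULL, 0);
      struct reg_pair be = map_return(0x1122334455667788ULL, 1);
      printf("LE: r0=%08x r1=%08x\n", le.r0, le.r1);  /* r0=55667788 r1=11223344 */
      printf("BE: r0=%08x r1=%08x\n", be.r0, be.r1);  /* r0=11223344 r1=55667788 */
      return 0;
    }
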