author     Michael Weiser <michael.weiser@gmx.de>    2018-02-13 22:13:13 +0100
committer  Niels Möller <nisse@lysator.liu.se>       2018-03-25 11:27:37 +0200
commit     2644d1ed132f7dad05e165d6c96a68ee66547d32 (patch)
tree       c4164ff59e65d5bfb49e4047c8ee7a82444c688a /arm
parent     d5738a574daee265ebfcf28dd51dfdca56c1798b (diff)
Support big-endian arm in assembly code
Adjust sha1-compress, sha256-compress, umac-nh, chacha-core-internal, salsa20-core-internal and memxor for arm to work in big-endian mode.
Diffstat (limited to 'arm')
-rw-r--r--  arm/memxor.asm                      | 21
-rw-r--r--  arm/memxor3.asm                     | 49
-rw-r--r--  arm/neon/chacha-core-internal.asm   | 39
-rw-r--r--  arm/neon/salsa20-core-internal.asm  | 45
-rw-r--r--  arm/neon/umac-nh.asm                |  4
-rw-r--r--  arm/v6/sha1-compress.asm            |  8
-rw-r--r--  arm/v6/sha256-compress.asm          | 14
7 files changed, 138 insertions(+), 42 deletions(-)
diff --git a/arm/memxor.asm b/arm/memxor.asm
index a50e91bc..239a4034 100644
--- a/arm/memxor.asm
+++ b/arm/memxor.asm
@@ -44,6 +44,11 @@ define(<N>, <r2>)
define(<CNT>, <r6>)
define(<TNC>, <r12>)
+C little-endian and big-endian need to shift in different directions for
+C alignment correction
+define(<S0ADJ>, IF_LE(<lsr>, <lsl>))
+define(<S1ADJ>, IF_LE(<lsl>, <lsr>))
+
.syntax unified
.file "memxor.asm"
@@ -99,6 +104,8 @@ PROLOGUE(nettle_memxor)
C
C With little-endian, we need to do
C DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC)
+ C With big-endian, we need to do
+ C DST[i] ^= (SRC[i] << CNT) ^ (SRC[i+1] >> TNC)
push {r4,r5,r6}
@@ -117,14 +124,14 @@ PROLOGUE(nettle_memxor)
.Lmemxor_word_loop:
ldr r5, [SRC], #+4
ldr r3, [DST]
- eor r3, r3, r4, lsr CNT
- eor r3, r3, r5, lsl TNC
+ eor r3, r3, r4, S0ADJ CNT
+ eor r3, r3, r5, S1ADJ TNC
str r3, [DST], #+4
.Lmemxor_odd:
ldr r4, [SRC], #+4
ldr r3, [DST]
- eor r3, r3, r5, lsr CNT
- eor r3, r3, r4, lsl TNC
+ eor r3, r3, r5, S0ADJ CNT
+ eor r3, r3, r4, S1ADJ TNC
str r3, [DST], #+4
subs N, #8
bcs .Lmemxor_word_loop
@@ -132,10 +139,14 @@ PROLOGUE(nettle_memxor)
beq .Lmemxor_odd_done
C We have TNC/8 left-over bytes in r4, high end
- lsr r4, CNT
+ S0ADJ r4, CNT
ldr r3, [DST]
eor r3, r4
+ C memxor_leftover does an LSB store
+ C so we need to reverse if actually BE
+IF_BE(< rev r3, r3>)
+
pop {r4,r5,r6}
C Store bytes, one by one.
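
The S0ADJ/S1ADJ swap in the memxor.asm hunks above comes down to where the bytes of an unaligned word end up after an aligned load. A minimal C sketch of the idea, not nettle code (the pointer arithmetic, the name memxor_unaligned_sketch and the one-word read-ahead are assumptions of the sketch):

#include <stddef.h>
#include <stdint.h>

/* XOR an unaligned source into a word-aligned destination, reading the
   source one aligned word at a time.  cnt is the byte misalignment of
   src (1..3).  On little-endian the wanted bytes of the earlier word sit
   at its high end (shift right), on big-endian at its low end (shift
   left); that is exactly the S0ADJ/S1ADJ swap. */
static void
memxor_unaligned_sketch (uint32_t *dst, const unsigned char *src,
                         size_t nwords, unsigned cnt)
{
  const uint32_t *w = (const uint32_t *) (src - cnt); /* aligned base */
  unsigned shift = 8 * cnt, tshift = 32 - shift;
  uint32_t prev = w[0];
  for (size_t i = 0; i < nwords; i++)
    {
      uint32_t next = w[i + 1];   /* reads one aligned word ahead */
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
      dst[i] ^= (prev << shift) | (next >> tshift); /* S0ADJ=lsl, S1ADJ=lsr */
#else
      dst[i] ^= (prev >> shift) | (next << tshift); /* S0ADJ=lsr, S1ADJ=lsl */
#endif
      prev = next;
    }
}
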
diff --git a/arm/memxor3.asm b/arm/memxor3.asm
index 139fd208..69598e1c 100644
--- a/arm/memxor3.asm
+++ b/arm/memxor3.asm
@@ -49,6 +49,11 @@ define(<ATNC>, <r10>)
define(<BCNT>, <r11>)
define(<BTNC>, <r12>)
+C little-endian and big-endian need to shift in different directions for
+C alignment correction
+define(<S0ADJ>, IF_LE(<lsr>, <lsl>))
+define(<S1ADJ>, IF_LE(<lsl>, <lsr>))
+
.syntax unified
.file "memxor3.asm"
@@ -124,6 +129,8 @@ PROLOGUE(nettle_memxor3)
C
C With little-endian, we need to do
C DST[i-i] ^= (SRC[i-i] >> CNT) ^ (SRC[i] << TNC)
+ C With big-endian, we need to do
+ C DST[i-i] ^= (SRC[i-i] << CNT) ^ (SRC[i] >> TNC)
rsb ATNC, ACNT, #32
bic BP, #3
@@ -138,14 +145,14 @@ PROLOGUE(nettle_memxor3)
.Lmemxor3_au_loop:
ldr r5, [BP, #-4]!
ldr r6, [AP, #-4]!
- eor r6, r6, r4, lsl ATNC
- eor r6, r6, r5, lsr ACNT
+ eor r6, r6, r4, S1ADJ ATNC
+ eor r6, r6, r5, S0ADJ ACNT
str r6, [DST, #-4]!
.Lmemxor3_au_odd:
ldr r4, [BP, #-4]!
ldr r6, [AP, #-4]!
- eor r6, r6, r5, lsl ATNC
- eor r6, r6, r4, lsr ACNT
+ eor r6, r6, r5, S1ADJ ATNC
+ eor r6, r6, r4, S0ADJ ACNT
str r6, [DST, #-4]!
subs N, #8
bcs .Lmemxor3_au_loop
@@ -154,7 +161,11 @@ PROLOGUE(nettle_memxor3)
C Leftover bytes in r4, low end
ldr r5, [AP, #-4]
- eor r4, r5, r4, lsl ATNC
+ eor r4, r5, r4, S1ADJ ATNC
+
+ C leftover does an LSB store
+ C so we need to reverse if actually BE
+IF_BE(< rev r4, r4>)
.Lmemxor3_au_leftover:
C Store a byte at a time
@@ -247,21 +258,25 @@ PROLOGUE(nettle_memxor3)
ldr r5, [AP, #-4]!
ldr r6, [BP, #-4]!
eor r5, r6
- lsl r4, ATNC
- eor r4, r4, r5, lsr ACNT
+ S1ADJ r4, ATNC
+ eor r4, r4, r5, S0ADJ ACNT
str r4, [DST, #-4]!
.Lmemxor3_uu_odd:
ldr r4, [AP, #-4]!
ldr r6, [BP, #-4]!
eor r4, r6
- lsl r5, ATNC
- eor r5, r5, r4, lsr ACNT
+ S1ADJ r5, ATNC
+ eor r5, r5, r4, S0ADJ ACNT
str r5, [DST, #-4]!
subs N, #8
bcs .Lmemxor3_uu_loop
adds N, #8
beq .Lmemxor3_done
+ C leftover does an LSB store
+ C so we need to reverse if actually BE
+IF_BE(< rev r4, r4>)
+
C Leftover bytes in a4, low end
ror r4, ACNT
.Lmemxor3_uu_leftover:
@@ -290,18 +305,18 @@ PROLOGUE(nettle_memxor3)
.Lmemxor3_uud_loop:
ldr r5, [AP, #-4]!
ldr r7, [BP, #-4]!
- lsl r4, ATNC
- eor r4, r4, r6, lsl BTNC
- eor r4, r4, r5, lsr ACNT
- eor r4, r4, r7, lsr BCNT
+ S1ADJ r4, ATNC
+ eor r4, r4, r6, S1ADJ BTNC
+ eor r4, r4, r5, S0ADJ ACNT
+ eor r4, r4, r7, S0ADJ BCNT
str r4, [DST, #-4]!
.Lmemxor3_uud_odd:
ldr r4, [AP, #-4]!
ldr r6, [BP, #-4]!
- lsl r5, ATNC
- eor r5, r5, r7, lsl BTNC
- eor r5, r5, r4, lsr ACNT
- eor r5, r5, r6, lsr BCNT
+ S1ADJ r5, ATNC
+ eor r5, r5, r7, S1ADJ BTNC
+ eor r5, r5, r4, S0ADJ ACNT
+ eor r5, r5, r6, S0ADJ BCNT
str r5, [DST, #-4]!
subs N, #8
bcs .Lmemxor3_uud_loop
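
memxor3 reuses the same S0ADJ/S1ADJ swap for its two source operands; the new element in both files is the IF_BE rev in front of the leftover paths, whose byte loops always emit the least significant register byte first. A hedged C analogue (store_low_bytes is a hypothetical helper, and __builtin_bswap32 assumes a GCC-compatible compiler):

#include <stdint.h>

/* Store the n remaining data bytes of w one at a time, LSB first, the
   way the leftover loops do.  On big-endian the byte destined for the
   lowest address sits in the register's high end, so the word is
   byte-reversed first -- the role of the IF_BE(<rev ...>) lines above. */
static void
store_low_bytes (unsigned char *dst, uint32_t w, unsigned n)
{
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  w = __builtin_bswap32 (w);    /* rev */
#endif
  for (unsigned i = 0; i < n; i++, w >>= 8)
    dst[i] = w & 0xff;
}
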
diff --git a/arm/neon/chacha-core-internal.asm b/arm/neon/chacha-core-internal.asm
index 6f623106..22f843e8 100644
--- a/arm/neon/chacha-core-internal.asm
+++ b/arm/neon/chacha-core-internal.asm
@@ -90,31 +90,52 @@ PROLOGUE(_nettle_chacha_core)
vmov S2, X2
vmov S3, X3
- C Input rows:
+ C Input rows little-endian:
C 0 1 2 3 X0
C 4 5 6 7 X1
C 8 9 10 11 X2
C 12 13 14 15 X3
+ C Input rows big-endian:
+ C 1 0 3 2 X0
+ C 5 4 7 6 X1
+ C 9 8 11 10 X2
+ C 13 12 15 14 X3
+ C even and odd columns switched because
+ C vldm loads consecutive doublewords and
+ C switches words inside them to make them BE
+
.Loop:
QROUND(X0, X1, X2, X3)
- C Rotate rows, to get
+ C In little-endian rotate rows, to get
C 0 1 2 3
C 5 6 7 4 >>> 3
C 10 11 8 9 >>> 2
C 15 12 13 14 >>> 1
- vext.32 X1, X1, X1, #1
+
+ C In big-endian rotate rows, to get
+ C 1 0 3 2
+ C 6 5 4 7 >>> 1
+ C 11 10 9 8 >>> 2
+ C 12 15 14 13 >>> 3
+ C different number of elements needs to be
+ C extracted on BE because of different column order
+IF_LE(< vext.32 X1, X1, X1, #1>)
+IF_BE(< vext.32 X1, X1, X1, #3>)
vext.32 X2, X2, X2, #2
- vext.32 X3, X3, X3, #3
+IF_LE(< vext.32 X3, X3, X3, #3>)
+IF_BE(< vext.32 X3, X3, X3, #1>)
QROUND(X0, X1, X2, X3)
subs ROUNDS, ROUNDS, #2
C Inverse rotation
- vext.32 X1, X1, X1, #3
+IF_LE(< vext.32 X1, X1, X1, #3>)
+IF_BE(< vext.32 X1, X1, X1, #1>)
vext.32 X2, X2, X2, #2
- vext.32 X3, X3, X3, #1
+IF_LE(< vext.32 X3, X3, X3, #1>)
+IF_BE(< vext.32 X3, X3, X3, #3>)
bhi .Loop
@@ -123,6 +144,12 @@ PROLOGUE(_nettle_chacha_core)
vadd.u32 X2, X2, S2
vadd.u32 X3, X3, S3
+ C caller expects result little-endian
+IF_BE(< vrev32.u8 X0, X0
+ vrev32.u8 X1, X1
+ vrev32.u8 X2, X2
+ vrev32.u8 X3, X3>)
+
vstm DST, {X0,X1,X2,X3}
bx lr
EPILOGUE(_nettle_chacha_core)
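
The changed vext.32 immediates follow from the lane order vldm produces on big-endian: the two words inside each doubleword come out swapped, so the columns appear as {1,0,3,2}. A small host-side model of vext.32 (illustrative C, not nettle code) shows that extracting 3 lanes in the swapped order performs the same logical row rotation as extracting 1 lane in little-endian order:

#include <stdint.h>
#include <stdio.h>

/* vext.32 d, a, a, #n: result lane i is lane (i+n) mod 4 of a. */
static void
vext32 (uint32_t r[4], const uint32_t a[4], unsigned n)
{
  for (unsigned i = 0; i < 4; i++)
    r[i] = a[(i + n) % 4];
}

int
main (void)
{
  uint32_t le[4] = { 4, 5, 6, 7 };   /* row X1 after LE vldm */
  uint32_t be[4] = { 5, 4, 7, 6 };   /* same row after BE vldm */
  uint32_t r1[4], r3[4];
  vext32 (r1, le, 1);                /* -> 5 6 7 4 */
  vext32 (r3, be, 3);                /* -> 6 5 4 7, the same rotation
                                           expressed in swapped order */
  printf ("%u %u %u %u / %u %u %u %u\n",
          r1[0], r1[1], r1[2], r1[3], r3[0], r3[1], r3[2], r3[3]);
  return 0;
}
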
diff --git a/arm/neon/salsa20-core-internal.asm b/arm/neon/salsa20-core-internal.asm
index 34eb1fba..20710499 100644
--- a/arm/neon/salsa20-core-internal.asm
+++ b/arm/neon/salsa20-core-internal.asm
@@ -88,7 +88,7 @@ define(<QROUND>, <
PROLOGUE(_nettle_salsa20_core)
vldm SRC, {X0,X1,X2,X3}
- C Input rows:
+ C Input rows little-endian:
C 0 1 2 3 X0
C 4 5 6 7 X1
C 8 9 10 11 X2
@@ -99,6 +99,20 @@ PROLOGUE(_nettle_salsa20_core)
C 8 13 2 7
C 12 1 6 11
+ C Input rows big-endian:
+ C 1 0 3 2 X0
+ C 5 4 7 6 X1
+ C 9 8 11 10 X2
+ C 13 12 15 14 X3
+ C even and odd columns switched because
+ C vldm loads consecutive doublewords and
+ C switches words inside them to make them BE
+ C Permuted to:
+ C 5 0 15 10
+ C 9 4 3 14
+ C 13 8 7 2
+ C 1 12 11 6
+
C FIXME: Construct in some other way?
adr r12, .Lmasks
vldm r12, {M0101, M0110, M0011}
@@ -112,6 +126,7 @@ PROLOGUE(_nettle_salsa20_core)
C 4 1 6 3 T0 v
C 8 13 10 15 T1 ^
C 12 9 14 11 X3 v
+ C same in big endian just with transposed rows
vmov T0, X1
vmov T1, X2
vbit T0, X0, M0101
@@ -140,22 +155,34 @@ PROLOGUE(_nettle_salsa20_core)
.Loop:
QROUND(X0, X1, X2, X3)
- C Rotate rows, to get
+ C In little-endian rotate rows, to get
C 0 5 10 15
C 3 4 9 14 >>> 1
C 2 7 8 13 >>> 2
C 1 6 11 12 >>> 3
- vext.32 X1, X1, X1, #3
+
+ C In big-endian rotate rows, to get
+ C 5 0 15 10
+ C 4 3 14 9 >>> 3
+ C 7 2 13 8 >>> 2
+ C 6 1 12 11 >>> 1
+ C different number of elements needs to be
+ C extracted on BE because of different column order
+IF_LE(< vext.32 X1, X1, X1, #3>)
+IF_BE(< vext.32 X1, X1, X1, #1>)
vext.32 X2, X2, X2, #2
- vext.32 X3, X3, X3, #1
+IF_LE(< vext.32 X3, X3, X3, #1>)
+IF_BE(< vext.32 X3, X3, X3, #3>)
QROUND(X0, X3, X2, X1)
subs ROUNDS, ROUNDS, #2
C Inverse rotation
- vext.32 X1, X1, X1, #1
+IF_LE(< vext.32 X1, X1, X1, #1>)
+IF_BE(< vext.32 X1, X1, X1, #3>)
vext.32 X2, X2, X2, #2
- vext.32 X3, X3, X3, #3
+IF_LE(< vext.32 X3, X3, X3, #3>)
+IF_BE(< vext.32 X3, X3, X3, #1>)
bhi .Loop
@@ -181,6 +208,12 @@ PROLOGUE(_nettle_salsa20_core)
vadd.u32 X2, X2, S2
vadd.u32 X3, X3, S3
+ C caller expects result little-endian
+IF_BE(< vrev32.u8 X0, X0
+ vrev32.u8 X1, X1
+ vrev32.u8 X2, X2
+ vrev32.u8 X3, X3>)
+
vstm DST, {X0,X1,X2,X3}
bx lr
EPILOGUE(_nettle_salsa20_core)
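
The IF_BE vrev32.u8 block added to both NEON cores exists because callers consume the output block as little-endian bytes. A minimal C analogue of "byte-swap each 32-bit word, then store natively" (store_le32 is a hypothetical helper standing in for vrev32.u8 followed by vstm):

#include <stdint.h>
#include <string.h>

/* Emit one 32-bit state word as little-endian bytes.  On a big-endian
   build the word is byte-reversed first, so the subsequent native store
   produces the layout the caller expects. */
static void
store_le32 (unsigned char *out, uint32_t w)
{
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  w = __builtin_bswap32 (w);    /* vrev32.u8 */
#endif
  memcpy (out, &w, sizeof w);   /* native store, like vstm */
}
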
diff --git a/arm/neon/umac-nh.asm b/arm/neon/umac-nh.asm
index 158a5686..38be654c 100644
--- a/arm/neon/umac-nh.asm
+++ b/arm/neon/umac-nh.asm
@@ -97,6 +97,8 @@ PROLOGUE(_nettle_umac_nh)
bhi .Loop
vadd.i64 D0REG(QY), D0REG(QY), D1REG(QY)
- vmov r0, r1, D0REG(QY)
+ C return value needs to respect word order mandated by AAPCS
+IF_LE(< vmov r0, r1, D0REG(QY)>)
+IF_BE(< vmov r1, r0, D0REG(QY)>)
bx lr
EPILOGUE(_nettle_umac_nh)
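
The swapped vmov operand order reflects how AAPCS returns a 64-bit integer in r0:r1: the register pair is laid out as if the value had been stored to memory and reloaded, so r0 carries the low word on little-endian but the high word on big-endian, while vmov rt, rt2, Dm always moves the low half of Dm into its first operand. A hedged C illustration (return_pair is hypothetical, not part of nettle):

#include <stdint.h>

/* On 32-bit ARM this uint64_t comes back in r0:r1.  A little-endian ABI
   places lo in r0 and hi in r1; a big-endian ABI does the opposite,
   which is why the assembly swaps the vmov destinations under IF_BE. */
uint64_t
return_pair (uint32_t lo, uint32_t hi)
{
  return ((uint64_t) hi << 32) | lo;
}
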
diff --git a/arm/v6/sha1-compress.asm b/arm/v6/sha1-compress.asm
index 59d6297e..8cc22be7 100644
--- a/arm/v6/sha1-compress.asm
+++ b/arm/v6/sha1-compress.asm
@@ -52,7 +52,7 @@ define(<LOAD>, <
sel W, WPREV, T0
ror W, W, SHIFT
mov WPREV, T0
- rev W, W
+IF_LE(< rev W, W>)
str W, [SP,#eval(4*$1)]
>)
define(<EXPN>, <
@@ -127,8 +127,12 @@ PROLOGUE(_nettle_sha1_compress)
lsl SHIFT, SHIFT, #3
mov T0, #0
movne T0, #-1
- lsl W, T0, SHIFT
+IF_LE(< lsl W, T0, SHIFT>)
+IF_BE(< lsr W, T0, SHIFT>)
uadd8 T0, T0, W C Sets APSR.GE bits
+ C on BE rotate right by 32-SHIFT bits
+ C because there is no rotate left
+IF_BE(< rsb SHIFT, SHIFT, #32>)
ldr K, .LK1
ldm STATE, {SA,SB,SC,SD,SE}
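
The rev that became IF_LE(<rev ...>) is there because SHA-1 (and SHA-256 below) interpret message words as big-endian. A minimal sketch of the aligned case, assuming a GCC-compatible compiler for __builtin_bswap32 and the __BYTE_ORDER__ macros:

#include <stdint.h>
#include <string.h>

/* Load one big-endian 32-bit message word from an aligned pointer.  On
   little-endian a byte swap (rev) is required; on big-endian the plain
   load already has the right byte order, so the swap is skipped. */
static uint32_t
load_be32 (const unsigned char *p)
{
  uint32_t w;
  memcpy (&w, p, sizeof w);
#if !defined(__BYTE_ORDER__) || __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  w = __builtin_bswap32 (w);    /* rev, little-endian only */
#endif
  return w;
}
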
diff --git a/arm/v6/sha256-compress.asm b/arm/v6/sha256-compress.asm
index e6f4e1e9..324730c7 100644
--- a/arm/v6/sha256-compress.asm
+++ b/arm/v6/sha256-compress.asm
@@ -137,8 +137,12 @@ PROLOGUE(_nettle_sha256_compress)
lsl SHIFT, SHIFT, #3
mov T0, #0
movne T0, #-1
- lsl I1, T0, SHIFT
+IF_LE(< lsl I1, T0, SHIFT>)
+IF_BE(< lsr I1, T0, SHIFT>)
uadd8 T0, T0, I1 C Sets APSR.GE bits
+ C on BE rotate right by 32-SHIFT bits
+ C because there is no rotate left
+IF_BE(< rsb SHIFT, SHIFT, #32>)
mov DST, sp
mov ILEFT, #4
@@ -146,16 +150,16 @@ PROLOGUE(_nettle_sha256_compress)
ldm INPUT!, {I1,I2,I3,I4}
sel I0, I0, I1
ror I0, I0, SHIFT
- rev I0, I0
+IF_LE(< rev I0, I0>)
sel I1, I1, I2
ror I1, I1, SHIFT
- rev I1, I1
+IF_LE(< rev I1, I1>)
sel I2, I2, I3
ror I2, I2, SHIFT
- rev I2, I2
+IF_LE(< rev I2, I2>)
sel I3, I3, I4
ror I3, I3, SHIFT
- rev I3, I3
+IF_LE(< rev I3, I3>)
subs ILEFT, ILEFT, #1
stm DST!, {I0,I1,I2,I3}
mov I0, I4
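
The mask and rotate changes in both compress routines handle unaligned input: a byte mask built from an all-ones word drives uadd8/sel to pick which bytes of two neighbouring aligned words belong to the current message word, then ror and rev put them in order. A hedged C model for a misalignment of k bytes (function and parameter names are illustrative; the real code keeps the mask in the GE flags and streams successive words through the sel/ror sequence):

#include <stdint.h>

/* Assemble one big-endian message word that starts k bytes (1..3) into
   a pair of aligned words.  The mask is shifted left on little-endian
   (lsl) and right on big-endian (lsr); the rotate is by 8*k on LE but by
   32 - 8*k on BE (the rsb SHIFT, SHIFT, #32 above); the final byte swap
   (rev) is needed only on LE. */
static uint32_t
load_be32_unaligned (const uint32_t *aligned, unsigned k)
{
  unsigned shift = 8 * k;
  uint32_t w0 = aligned[0], w1 = aligned[1];
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  uint32_t mask = 0xffffffffU >> shift;             /* IF_BE: lsr */
  uint32_t merged = (w0 & mask) | (w1 & ~mask);     /* uadd8 + sel */
  unsigned rot = 32 - shift;                        /* IF_BE: rsb */
  return (merged >> rot) | (merged << (32 - rot));  /* ror */
#else
  uint32_t mask = 0xffffffffU << shift;             /* IF_LE: lsl */
  uint32_t merged = (w0 & mask) | (w1 & ~mask);     /* uadd8 + sel */
  uint32_t rot = (merged >> shift) | (merged << (32 - shift)); /* ror */
  return __builtin_bswap32 (rot);                   /* rev, LE only */
#endif
}
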