author     Niels Möller <nisse@lysator.liu.se>   2013-04-18 14:07:20 +0200
committer  Niels Möller <nisse@lysator.liu.se>   2013-04-18 14:17:49 +0200
commit     ade7779c98a5426c7d86c8a01bbd7ad65980c9b9 (patch)
tree       0235c694ed12a49037d62e4b05ada53472c804ad /arm
parent     b7c953630bf9a05eca5b744c89eb643049eeb700 (diff)
download   nettle-ade7779c98a5426c7d86c8a01bbd7ad65980c9b9.tar.gz
Reorganization of ARM assembly.
Renamed directory armv7 to arm. New subdirectory arm/neon, for files using neon instructions. configure.ac hacked to make the use of neon configurable.
Diffstat (limited to 'arm')
-rw-r--r--  arm/README                          |  47
-rw-r--r--  arm/aes-decrypt-internal.asm        | 105
-rw-r--r--  arm/aes-encrypt-internal.asm        | 107
-rw-r--r--  arm/aes.m4                          | 164
-rw-r--r--  arm/ecc-192-modp.asm                |  93
-rw-r--r--  arm/ecc-224-modp.asm                | 111
-rw-r--r--  arm/ecc-256-redc.asm                | 160
-rw-r--r--  arm/ecc-384-modp.asm                | 257
-rw-r--r--  arm/ecc-521-modp.asm                | 114
-rw-r--r--  arm/machine.m4                      |  56
-rw-r--r--  arm/memxor.asm                      | 488
-rw-r--r--  arm/neon/salsa20-core-internal.asm  | 181
-rw-r--r--  arm/neon/sha3-permute.asm           | 266
-rw-r--r--  arm/neon/sha512-compress.asm        | 317
-rw-r--r--  arm/neon/umac-nh-n.asm              | 298
-rw-r--r--  arm/neon/umac-nh.asm                |  89
-rw-r--r--  arm/sha1-compress.asm               | 234
-rw-r--r--  arm/sha256-compress.asm             | 204
18 files changed, 3291 insertions(+), 0 deletions(-)
diff --git a/arm/README b/arm/README
new file mode 100644
index 00000000..9bacd97b
--- /dev/null
+++ b/arm/README
@@ -0,0 +1,47 @@
+Currently, code in this directory is written for arm cortex-a9.
+
+For efficient loads and stores, use ldmia, stmia and friends. Can do
+two loads or stores per cycle with 8-byte aligned addresses, or three
+loads or stores in two cycles, regardless of alignment.
+
+12 usable registers (if we exclude r9).
+
+ABI gnueabi(hf) (not depending on the floating point conventions)
+
+Registers May be Argument
+ clobbered number
+
+r0 Y 1
+r1 Y 2
+r2 Y 3
+r3 Y 4
+r4 N
+r5 N
+r6 N
+r7 N
+r8 N
+r9 (sl)
+r10 N
+r11 N
+r12 (ip) Y
+r13 (sp)
+r14 (lr) N
+r15 (pc)
+
+q0 (d0, d1) Y 1 (for "hf" abi)
+q1 (d2, d3) Y 2
+q2 (d4, d5) Y 3
+q3 (d6, d7) Y 4
+q4 (d8, d9) N
+q5 (d10, d11) N
+q6 (d12, d13) N
+q7 (d14, d15) N
+q8 (d16, d17) Y
+q9 (d18, d19) Y
+q10 (d20, d21) Y
+q11 (d22, d23) Y
+q12 (d24, d25) Y
+q13 (d26, d27) Y
+q14 (d28, d29) Y
+q15 (d30, d31) Y
+
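
Not part of the commit: a minimal C illustration of the calling convention summarized above. With this ABI the first four word-sized arguments arrive in r0-r3 and any further argument is passed on the stack, which is why the AES routines below fetch their fifth parameter with "ldr SRC, [sp]". The function name is invented.

  #include <stdint.h>

  /* a..d arrive in r0..r3; e is read from the stack ([sp] on entry,
     before the callee pushes anything).  Per the table above, r0-r3
     and r12 may be clobbered freely; r4-r8, r10 and r11 must be
     saved if used. */
  uint32_t
  example_args (uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e)
  {
    return a ^ b ^ c ^ d ^ e;
  }
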
diff --git a/arm/aes-decrypt-internal.asm b/arm/aes-decrypt-internal.asm
new file mode 100644
index 00000000..1cd92fb2
--- /dev/null
+++ b/arm/aes-decrypt-internal.asm
@@ -0,0 +1,105 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+include_src(<arm/aes.m4>)
+
+C Benchmarked at 785, 914, 1051 cycles/block on cortex A9,
+C for 128, 192 and 256 bit key sizes. Unclear why it is slower
+C than _aes_encrypt.
+
+define(<CTX>, <r0>)
+define(<TABLE>, <r1>)
+define(<LENGTH>, <r2>)
+define(<DST>, <r3>)
+define(<SRC>, <r12>)
+
+define(<W0>, <r4>)
+define(<W1>, <r5>)
+define(<W2>, <r6>)
+define(<W3>, <r7>)
+define(<T0>, <r8>)
+define(<KEY>, <r10>)
+define(<ROUND>, <r11>)
+
+define(<X0>, <r2>) C Overlaps LENGTH, SRC, DST
+define(<X1>, <r3>)
+define(<X2>, <r12>)
+define(<X3>, <r14>) C lr
+
+
+ .file "aes-decrypt-internal.asm"
+
+ C _aes_decrypt(struct aes_context *ctx,
+ C const struct aes_table *T,
+ C unsigned length, uint8_t *dst,
+ C uint8_t *src)
+ .text
+ .align 2
+PROLOGUE(_nettle_aes_decrypt)
+ teq LENGTH, #0
+ beq .Lend
+ ldr SRC, [sp]
+
+ push {r4,r5,r6,r7,r8,r10,r11,lr}
+.Lblock_loop:
+ mov KEY, CTX
+ AES_LOAD(SRC,KEY,W0)
+ AES_LOAD(SRC,KEY,W1)
+ AES_LOAD(SRC,KEY,W2)
+ AES_LOAD(SRC,KEY,W3)
+
+ push {LENGTH, DST, SRC}
+ ldr ROUND, [CTX, #+AES_NROUNDS]
+ add TABLE, TABLE, #AES_TABLE0
+
+ b .Lentry
+ .align 2
+.Lround_loop:
+ C Transform X -> W
+ AES_DECRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
+
+.Lentry:
+ subs ROUND, ROUND,#2
+ C Transform W -> X
+ AES_DECRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
+
+ bne .Lround_loop
+
+ sub TABLE, TABLE, #AES_TABLE0
+ C Final round
+ AES_FINAL_ROUND(X0, X3, X2, X1, KEY, W0)
+ AES_FINAL_ROUND(X1, X0, X3, X2, KEY, W1)
+ AES_FINAL_ROUND(X2, X1, X0, X3, KEY, W2)
+ AES_FINAL_ROUND(X3, X2, X1, X0, KEY, W3)
+
+ pop {LENGTH, DST, SRC}
+
+ AES_STORE(DST,W0)
+ AES_STORE(DST,W1)
+ AES_STORE(DST,W2)
+ AES_STORE(DST,W3)
+
+ subs LENGTH, LENGTH, #16
+ bhi .Lblock_loop
+
+ pop {r4,r5,r6,r7,r8,r10,r11,pc}
+
+.Lend:
+ bx lr
+EPILOGUE(_nettle_aes_decrypt)
diff --git a/arm/aes-encrypt-internal.asm b/arm/aes-encrypt-internal.asm
new file mode 100644
index 00000000..b3309351
--- /dev/null
+++ b/arm/aes-encrypt-internal.asm
@@ -0,0 +1,107 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+include_src(<arm/aes.m4>)
+
+C Benchmarked at 693, 824, 950 cycles/block on cortex A9,
+C for 128, 192 and 256 bit key sizes.
+
+C Possible improvements: More efficient load and store with
+C aligned accesses. Better scheduling.
+
+define(<CTX>, <r0>)
+define(<TABLE>, <r1>)
+define(<LENGTH>, <r2>)
+define(<DST>, <r3>)
+define(<SRC>, <r12>)
+
+define(<W0>, <r4>)
+define(<W1>, <r5>)
+define(<W2>, <r6>)
+define(<W3>, <r7>)
+define(<T0>, <r8>)
+define(<KEY>, <r10>)
+define(<ROUND>, <r11>)
+
+define(<X0>, <r2>) C Overlaps LENGTH, SRC, DST
+define(<X1>, <r3>)
+define(<X2>, <r12>)
+define(<X3>, <r14>) C lr
+
+
+ .file "aes-encrypt-internal.asm"
+
+ C _aes_encrypt(struct aes_context *ctx,
+ C const struct aes_table *T,
+ C unsigned length, uint8_t *dst,
+ C uint8_t *src)
+ .text
+ .align 2
+PROLOGUE(_nettle_aes_encrypt)
+ teq LENGTH, #0
+ beq .Lend
+ ldr SRC, [sp]
+
+ push {r4,r5,r6,r7,r8,r10,r11,lr}
+.Lblock_loop:
+ mov KEY, CTX
+ AES_LOAD(SRC,KEY,W0)
+ AES_LOAD(SRC,KEY,W1)
+ AES_LOAD(SRC,KEY,W2)
+ AES_LOAD(SRC,KEY,W3)
+
+ push {LENGTH, DST, SRC}
+ ldr ROUND, [CTX, #+AES_NROUNDS]
+ add TABLE, TABLE, #AES_TABLE0
+
+ b .Lentry
+ .align 2
+.Lround_loop:
+ C Transform X -> W
+ AES_ENCRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
+
+.Lentry:
+ subs ROUND, ROUND,#2
+ C Transform W -> X
+ AES_ENCRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
+
+ bne .Lround_loop
+
+ sub TABLE, TABLE, #AES_TABLE0
+ C Final round
+ AES_FINAL_ROUND(X0, X1, X2, X3, KEY, W0)
+ AES_FINAL_ROUND(X1, X2, X3, X0, KEY, W1)
+ AES_FINAL_ROUND(X2, X3, X0, X1, KEY, W2)
+ AES_FINAL_ROUND(X3, X0, X1, X2, KEY, W3)
+
+ pop {LENGTH, DST, SRC}
+
+ AES_STORE(DST,W0)
+ AES_STORE(DST,W1)
+ AES_STORE(DST,W2)
+ AES_STORE(DST,W3)
+
+ subs LENGTH, LENGTH, #16
+ bhi .Lblock_loop
+
+ pop {r4,r5,r6,r7,r8,r10,r11,pc}
+
+.Lend:
+ bx lr
+EPILOGUE(_nettle_aes_encrypt)
diff --git a/arm/aes.m4 b/arm/aes.m4
new file mode 100644
index 00000000..00d3c9a3
--- /dev/null
+++ b/arm/aes.m4
@@ -0,0 +1,164 @@
+C Loads one word, and adds it to the subkey. Uses T0
+C AES_LOAD(SRC, KEY, REG)
+define(<AES_LOAD>, <
+ ldrb $3, [$1], #+1
+ ldrb T0, [$1], #+1
+ orr $3, T0, lsl #8
+ ldrb T0, [$1], #+1
+ orr $3, T0, lsl #16
+ ldrb T0, [$1], #+1
+ orr $3, T0, lsl #24
+ ldr T0, [$2], #+4
+ eor $3, T0
+>)
+C Stores one word. Destroys input.
+C AES_STORE(DST, X)
+define(<AES_STORE>, <
+ strb $2, [$1], #+1
+ ror $2, $2, #8
+ strb $2, [$1], #+1
+ ror $2, $2, #8
+ strb $2, [$1], #+1
+ ror $2, $2, #8
+ strb $2, [$1], #+1
+>)
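
Not part of the commit: a C sketch of what the AES_LOAD and AES_STORE macros compute. The helper names are invented; the macros also post-increment the source, key and destination pointers, mirrored here with pointer-to-pointer arguments.

  #include <stdint.h>

  /* AES_LOAD: assemble a little-endian 32-bit word from four bytes
     and xor in the next subkey word, advancing both pointers. */
  static uint32_t
  aes_load_sketch (const uint8_t **src, const uint32_t **key)
  {
    const uint8_t *p = *src;
    uint32_t w = (uint32_t) p[0]
      | ((uint32_t) p[1] << 8)
      | ((uint32_t) p[2] << 16)
      | ((uint32_t) p[3] << 24);
    *src += 4;
    return w ^ *(*key)++;
  }

  /* AES_STORE: write a 32-bit word back as four bytes, least
     significant first (the assembly rotates its input in place
     while storing, hence "destroys input"). */
  static void
  aes_store_sketch (uint8_t **dst, uint32_t w)
  {
    uint8_t *p = *dst;
    p[0] = w & 0xff;
    p[1] = (w >> 8) & 0xff;
    p[2] = (w >> 16) & 0xff;
    p[3] = (w >> 24) & 0xff;
    *dst += 4;
  }
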
+
+C 53 instr.
+C It's tempting to use eor with rotation, but that's slower.
+C AES_ENCRYPT_ROUND(x0,x1,x2,x3,w0,w1,w2,w3,key)
+define(<AES_ENCRYPT_ROUND>, <
+ uxtb T0, $1
+ ldr $5, [TABLE, T0, lsl #2]
+ uxtb T0, $2
+ ldr $6, [TABLE, T0, lsl #2]
+ uxtb T0, $3
+ ldr $7, [TABLE, T0, lsl #2]
+ uxtb T0, $4
+ ldr $8, [TABLE, T0, lsl #2]
+
+ uxtb T0, $2, ror #8
+ add TABLE, TABLE, #1024
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $5, $5, T0
+ uxtb T0, $3, ror #8
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $6, $6, T0
+ uxtb T0, $4, ror #8
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $7, $7, T0
+ uxtb T0, $1, ror #8
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $8, $8, T0
+
+ uxtb T0, $3, ror #16
+ add TABLE, TABLE, #1024
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $5, $5, T0
+ uxtb T0, $4, ror #16
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $6, $6, T0
+ uxtb T0, $1, ror #16
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $7, $7, T0
+ uxtb T0, $2, ror #16
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $8, $8, T0
+
+ uxtb T0, $4, ror #24
+ add TABLE, TABLE, #1024
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $5, $5, T0
+ uxtb T0, $1, ror #24
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $6, $6, T0
+ uxtb T0, $2, ror #24
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $7, $7, T0
+ uxtb T0, $3, ror #24
+ ldr T0, [TABLE, T0, lsl #2]
+
+ ldm $9!, {$1,$2,$3,$4}
+ eor $8, $8, T0
+ sub TABLE, TABLE, #3072
+ eor $5, $5, $1
+ eor $6, $6, $2
+ eor $7, $7, $3
+ eor $8, $8, $4
+>)
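
Not part of the commit: the table-lookup round that AES_ENCRYPT_ROUND implements, as a C sketch. The assembly keeps four 256-entry 32-bit tables back to back, stepping TABLE by #1024 between byte positions; here they are written as a two-dimensional array. AES_DECRYPT_ROUND below is the same computation except that the rotated bytes are taken from the state words in the opposite order ((i+3), (i+2), (i+1) mod 4). Names are invented.

  #include <stdint.h>

  static void
  aes_encrypt_round_sketch (const uint32_t T[4][256], const uint32_t k[4],
                            const uint32_t x[4], uint32_t w[4])
  {
    /* w[i] = T0[byte 0 of x[i]] ^ T1[byte 1 of x[i+1]]
              ^ T2[byte 2 of x[i+2]] ^ T3[byte 3 of x[i+3]] ^ k[i] */
    for (int i = 0; i < 4; i++)
      w[i] = T[0][ x[i]               & 0xff]
        ^ T[1][(x[(i + 1) % 4] >>  8) & 0xff]
        ^ T[2][(x[(i + 2) % 4] >> 16) & 0xff]
        ^ T[3][(x[(i + 3) % 4] >> 24) & 0xff]
        ^ k[i];
  }
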
+
+define(<AES_DECRYPT_ROUND>, <
+ uxtb T0, $1
+ ldr $5, [TABLE, T0, lsl #2]
+ uxtb T0, $2
+ ldr $6, [TABLE, T0, lsl #2]
+ uxtb T0, $3
+ ldr $7, [TABLE, T0, lsl #2]
+ uxtb T0, $4
+ ldr $8, [TABLE, T0, lsl #2]
+
+ uxtb T0, $4, ror #8
+ add TABLE, TABLE, #1024
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $5, $5, T0
+ uxtb T0, $1, ror #8
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $6, $6, T0
+ uxtb T0, $2, ror #8
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $7, $7, T0
+ uxtb T0, $3, ror #8
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $8, $8, T0
+
+ uxtb T0, $3, ror #16
+ add TABLE, TABLE, #1024
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $5, $5, T0
+ uxtb T0, $4, ror #16
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $6, $6, T0
+ uxtb T0, $1, ror #16
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $7, $7, T0
+ uxtb T0, $2, ror #16
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $8, $8, T0
+
+ uxtb T0, $2, ror #24
+ add TABLE, TABLE, #1024
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $5, $5, T0
+ uxtb T0, $3, ror #24
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $6, $6, T0
+ uxtb T0, $4, ror #24
+ ldr T0, [TABLE, T0, lsl #2]
+ eor $7, $7, T0
+ uxtb T0, $1, ror #24
+ ldr T0, [TABLE, T0, lsl #2]
+
+ ldm $9!, {$1,$2,$3,$4}
+ eor $8, $8, T0
+ sub TABLE, TABLE, #3072
+ eor $5, $5, $1
+ eor $6, $6, $2
+ eor $7, $7, $3
+ eor $8, $8, $4
+>)
+
+C AES_FINAL_ROUND(a,b,c,d,key,res)
+define(<AES_FINAL_ROUND>, <
+ uxtb T0, $1
+ ldrb $6, [TABLE, T0]
+ uxtb T0, $2, ror #8
+ ldrb T0, [TABLE, T0]
+ eor $6, $6, T0, lsl #8
+ uxtb T0, $3, ror #16
+ ldrb T0, [TABLE, T0]
+ eor $6, $6, T0, lsl #16
+ uxtb T0, $4, ror #24
+ ldrb T0, [TABLE, T0]
+ eor $6, $6, T0, lsl #24
+ ldr T0, [$5], #+4
+ eor $6, T0
+>)
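
Not part of the commit: the final round in C. After TABLE is moved back by AES_TABLE0, the byte loads index a byte substitution table at the start of the table structure instead of the 32-bit tables, and the result is xored with the last subkey word. Names are invented.

  #include <stdint.h>

  static uint32_t
  aes_final_round_sketch (const uint8_t sbox[256], const uint32_t **key,
                          uint32_t a, uint32_t b, uint32_t c, uint32_t d)
  {
    uint32_t w = (uint32_t) sbox[a & 0xff]
      | ((uint32_t) sbox[(b >> 8) & 0xff] << 8)
      | ((uint32_t) sbox[(c >> 16) & 0xff] << 16)
      | ((uint32_t) sbox[(d >> 24) & 0xff] << 24);
    return w ^ *(*key)++;          /* xor in the last subkey word */
  }
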
diff --git a/arm/ecc-192-modp.asm b/arm/ecc-192-modp.asm
new file mode 100644
index 00000000..1b226e30
--- /dev/null
+++ b/arm/ecc-192-modp.asm
@@ -0,0 +1,93 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013, Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+ .file "ecc-192-modp.asm"
+ .arm
+
+define(<HP>, <r0>) C Overlaps unused ecc argument
+define(<RP>, <r1>)
+
+define(<T0>, <r2>)
+define(<T1>, <r3>)
+define(<T2>, <r4>)
+define(<T3>, <r5>)
+define(<T4>, <r6>)
+define(<T5>, <r7>)
+define(<T6>, <r8>)
+define(<T7>, <r10>)
+define(<H0>, <T0>) C Overlaps T0 and T1
+define(<H1>, <T1>)
+define(<C2>, <HP>)
+define(<C4>, <r12>)
+
+ C ecc_192_modp (const struct ecc_curve *ecc, mp_limb_t *rp)
+ .text
+ .align 2
+
+PROLOGUE(nettle_ecc_192_modp)
+ push {r4,r5,r6,r7,r8,r10}
+ C Reduce two words at a time
+ add HP, RP, #48
+ add RP, RP, #8
+ ldmdb HP!, {H0,H1}
+ ldm RP, {T2,T3,T4,T5,T6,T7}
+ mov C4, #0
+ adds T4, T4, H0
+ adcs T5, T5, H1
+ adcs T6, T6, H0
+ adcs T7, T7, H1
+ C Need to add carry to T2 and T4, do T4 later.
+ adc C4, C4, #0
+
+ ldmdb HP!, {H0,H1}
+ mov C2, #0
+ adcs T2, T2, H0
+ adcs T3, T3, H1
+ adcs T4, T4, H0
+ adcs T5, T5, H1
+ C Need to add carry to T0 and T2, do T2 later
+ adc C2, C2, #0
+
+ ldmdb RP!, {T0, T1}
+ adcs T0, T0, T6
+ adcs T1, T1, T7
+ adcs T2, T2, T6
+ adcs T3, T3, T7
+ adc C4, C4, #0
+
+ adds T2, T2, C2
+ adcs T3, T3, #0
+ adcs T4, T4, C4
+ adcs T5, T5, #0
+ mov C2, #0
+ adc C2, C2, #0
+
+ C Add in final carry
+ adcs T0, T0, #0
+ adcs T1, T1, #0
+ adcs T2, T2, C2
+ adcs T3, T3, #0
+ adcs T4, T4, #0
+ adc T5, T5, #0
+
+ stm RP, {T0,T1,T2,T3,T4,T5}
+
+ pop {r4,r5,r6,r7,r8,r10}
+ bx lr
+EPILOGUE(nettle_ecc_192_modp)
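
Not part of the commit: the folding trick above in C, using 64-bit limbs to keep the sketch short (the assembly does the same thing two 32-bit limbs at a time). For p = 2^192 - 2^64 - 1 and B = 2^64 we have B^3 = B + 1 (mod p), so the high half of a 384-bit product folds down onto the low half. Like the assembly, the sketch leaves a result that fits in three limbs but is not necessarily the smallest representative. The function name is invented; unsigned __int128 needs GCC or Clang.

  #include <stdint.h>

  typedef unsigned __int128 u128;

  static void
  ecc192_fold_sketch (uint64_t x[6])   /* in: 6 limbs, out: x[0..2] */
  {
    /* x = lo + B^3*hi, with B^3 = B + 1, B^4 = B^2 + B, B^5 = B^2 + B + 1 */
    u128 a0 = (u128) x[0] + x[3] + x[5];
    u128 a1 = (u128) x[1] + x[3] + x[4] + x[5];
    u128 a2 = (u128) x[2] + x[4] + x[5];

    /* Propagate carries; a carry c out of the top limb is a multiple
       of B^3 and folds again as c*(B + 1). */
    for (;;)
      {
        a1 += a0 >> 64;  a0 = (uint64_t) a0;
        a2 += a1 >> 64;  a1 = (uint64_t) a1;
        uint64_t c = (uint64_t) (a2 >> 64);  a2 = (uint64_t) a2;
        if (!c)
          break;
        a0 += c;
        a1 += c;
      }
    x[0] = (uint64_t) a0;  x[1] = (uint64_t) a1;  x[2] = (uint64_t) a2;
  }
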
diff --git a/arm/ecc-224-modp.asm b/arm/ecc-224-modp.asm
new file mode 100644
index 00000000..ef7a703a
--- /dev/null
+++ b/arm/ecc-224-modp.asm
@@ -0,0 +1,111 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013, Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+ .file "ecc-224-modp.asm"
+ .arm
+
+define(<RP>, <r1>)
+define(<H>, <r0>) C Overlaps unused ecc argument
+
+define(<T0>, <r2>)
+define(<T1>, <r3>)
+define(<T2>, <r4>)
+define(<T3>, <r5>)
+define(<T4>, <r6>)
+define(<T5>, <r7>)
+define(<T6>, <r8>)
+define(<N3>, <r10>)
+define(<L0>, <r11>)
+define(<L1>, <r12>)
+define(<L2>, <lr>)
+
+ C ecc_224_modp (const struct ecc_curve *ecc, mp_limb_t *rp)
+ .text
+ .align 2
+
+PROLOGUE(nettle_ecc_224_modp)
+ push {r4,r5,r6,r7,r8,r10,r11,lr}
+
+ add L2, RP, #28
+ ldm L2, {T0,T1,T2,T3,T4,T5,T6}
+ mov H, #0
+
+ adds T0, T0, T4
+ adcs T1, T1, T5
+ adcs T2, T2, T6
+ adc H, H, #0
+
+ C This switch from adcs to sbcs takes carry into account with
+ C correct sign, but it always subtracts 1 too much. We arrange
+ C to also add B^7 + 1 below, so the effect is adding p. This
+ C addition of p also ensures that the result never is
+ C negative.
+
+ sbcs N3, T3, T0
+ sbcs T4, T4, T1
+ sbcs T5, T5, T2
+ sbcs T6, T6, H
+ mov H, #1 C This is the B^7
+ sbc H, #0
+ subs T6, T6, T3
+ sbc H, #0
+
+ C Now subtract from low half
+ ldm RP!, {L0,L1,L2}
+
+ C Clear carry, with the sbcs, this is the 1.
+ adds RP, #0
+
+ sbcs T0, L0, T0
+ sbcs T1, L1, T1
+ sbcs T2, L2, T2
+ ldm RP!, {T3,L0,L1,L2}
+ sbcs T3, T3, N3
+ sbcs T4, L0, T4
+ sbcs T5, L1, T5
+ sbcs T6, L2, T6
+ rsc H, H, #0
+
+ C Now -2 <= H <= 0 is the borrow, so subtract (B^3 - 1) |H|
+ C Use (B^3 - 1) H = <H, H, H> if -1 <=H <= 0, and
+ C (B^3 - 1) H = <1,B-1, B-1, B-2> if H = -2
+ subs T0, T0, H
+ asr L1, H, #1
+ sbcs T1, T1, L1
+ eor H, H, L1
+ sbcs T2, T2, L1
+ sbcs T3, T3, H
+ sbcs T4, T4, #0
+ sbcs T5, T5, #0
+ sbcs T6, T6, #0
+ sbcs H, H, H
+
+ C Final borrow, subtract (B^3 - 1) |H|
+ subs T0, T0, H
+ sbcs T1, T1, H
+ sbcs T2, T2, H
+ sbcs T3, T3, #0
+ sbcs T4, T4, #0
+ sbcs T5, T5, #0
+ sbcs T6, T6, #0
+
+ stmdb RP, {T0,T1,T2,T3,T4,T5,T6}
+
+ pop {r4,r5,r6,r7,r8,r10,r11,pc}
+EPILOGUE(nettle_ecc_224_modp)
diff --git a/arm/ecc-256-redc.asm b/arm/ecc-256-redc.asm
new file mode 100644
index 00000000..cbf10a89
--- /dev/null
+++ b/arm/ecc-256-redc.asm
@@ -0,0 +1,160 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013, Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+ .file "ecc-256-redc.asm"
+ .arm
+
+define(<RP>, <r1>)
+
+define(<T0>, <r0>) C Overlaps unused ecc argument
+define(<T1>, <r2>)
+define(<T2>, <r3>)
+define(<T3>, <r4>)
+define(<T4>, <r5>)
+define(<T5>, <r6>)
+define(<T6>, <r7>)
+define(<T7>, <r8>)
+define(<F0>, <r10>)
+define(<F1>, <r11>)
+define(<F2>, <r12>)
+define(<F3>, <lr>)
+
+ C ecc_256_redc (const struct ecc_curve *ecc, mp_limb_t *rp)
+ .text
+ .align 2
+
+PROLOGUE(nettle_ecc_256_redc)
+ push {r4,r5,r6,r7,r8,r10,r11,lr}
+
+ ldm RP!, {T0,T1,T2,T3,T4,T5,T6,T7}
+
+ C Set <F3,F2,F1> to the high 4 limbs of (B^2-B+1)<T2,T1,T0>
+ C T2 T1
+ C T2 T1 T0
+ C - T2 T1 T0
+ C -------------
+ C F3 F2 F1 F0
+
+
+ adds F1, T0, T2
+ adcs F2, T1, #0
+ adc F3, T2, #0
+
+ subs F0, T1, T0
+ sbcs F1, F1, T1 C Could also be rsc ?
+ sbcs F2, F2, T2
+ sbc F3, F3, #0
+
+ C Add:
+ C T10 T9 T8 T7 T6 T5 T4 T3
+ C + F3 F2 F1 F0 T0 T2 T1 T0
+ C --------------------------
+ C T7 T6 T5 T4 T3 T2 T1 T0
+
+ adds T3, T3, T0
+ adcs T1, T4, T1
+ adcs T2, T5, T2
+ adcs T6, T6, T0
+ mov T0, T3 C FIXME: Be more clever?
+ mov T3, T6
+ adcs T4, T7, F0
+
+ ldm RP!, {T5,T6,T7}
+ adcs T5, T5, F1
+ adcs T6, T6, F2
+ adcs T7, T7, F3
+
+ C New F3, F2, F1, F0, also adding in carry
+ adcs F1, T0, T2
+ adcs F2, T1, #0
+ adc F3, T2, #0
+
+ subs F0, T1, T0
+ sbcs F1, F1, T1 C Could also be rsc ?
+ sbcs F2, F2, T2
+ sbc F3, F3, #0
+
+ C Start adding
+ adds T3, T3, T0
+ adcs T1, T4, T1
+ adcs T2, T5, T2
+ adcs T6, T6, T0
+ mov T0, T3 C FIXME: Be more clever?
+ mov T3, T6
+ adcs T4, T7, F0
+
+ ldm RP!, {T5,T6,T7}
+ adcs T5, T5, F1
+ adcs T6, T6, F2
+ adcs T7, T7, F3
+
+ C Final iteration, eliminate only T0, T1
+ C Set <F2, F1, F0> to the high 3 limbs of (B^2-B+1)<T1,T0>
+
+ C T1 T0 T1
+ C - T1 T0
+ C -------------
+ C F2 F1 F0
+
+ C First add in carry
+ adcs F1, T0, #0
+ adcs F2, T1, #0
+ subs F0, T1, T0
+ sbcs F1, F1, T1
+ sbc F2, F2, #0
+
+ C Add:
+ C T9 T8 T7 T6 T5 T4 T3 T2
+ C + F2 F1 F0 T0 0 T1 T0 0
+ C --------------------------
+ C F2 F1 T7 T6 T5 T4 T3 T2
+
+ adds T3, T3, T0
+ adcs T4, T4, T1
+ adcs T5, T5, #0
+ adcs T6, T6, T0
+ adcs T7, T7, F0
+ ldm RP!, {T0, T1}
+ mov F3, #0
+ adcs F1, F1, T0
+ adcs F2, F2, T1
+
+ C Sum is < B^8 + p, so it's enough to fold carry once,
+ C If carry, add in
+ C B^7 - B^6 - B^3 + 1 = <0, B-2, B-1, B-1, B-1, 0, 0, 1>
+
+ C Mask from carry flag, leaving carry intact
+ adc F3, F3, #0
+ rsb F3, F3, #0
+
+ adcs T0, T2, #0
+ adcs T1, T3, #0
+ adcs T2, T4, #0
+ adcs T3, T5, F3
+ adcs T4, T6, F3
+ adcs T5, T7, F3
+ and F3, F3, #-2
+ adcs T6, F1, F3
+ adcs T7, F2, #0
+
+ sub RP, RP, #64
+ stm RP, {T0,T1,T2,T3,T4,T5,T6,T7}
+
+ pop {r4,r5,r6,r7,r8,r10,r11,pc}
+EPILOGUE(nettle_ecc_256_redc)
diff --git a/arm/ecc-384-modp.asm b/arm/ecc-384-modp.asm
new file mode 100644
index 00000000..fb5a6e12
--- /dev/null
+++ b/arm/ecc-384-modp.asm
@@ -0,0 +1,257 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013, Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+ .file "ecc-384-modp.asm"
+ .arm
+
+define(<RP>, <r1>)
+define(<T0>, <r0>)
+define(<T1>, <r2>)
+define(<T2>, <r3>)
+define(<T3>, <r4>)
+define(<F0>, <r5>)
+define(<F1>, <r6>)
+define(<F2>, <r7>)
+define(<F3>, <r8>)
+define(<F4>, <r10>)
+define(<N>, <r12>)
+define(<H>, <lr>)
+
+ C ecc_384_modp (const struct ecc_curve *ecc, mp_limb_t *rp)
+ .text
+ .align 2
+
+PROLOGUE(nettle_ecc_384_modp)
+ push {r4,r5,r6,r7,r8,r10,lr}
+
+ add RP, RP, #80
+ ldm RP, {T0, T1, T2, T3} C 20-23
+
+ C First get top 4 limbs, which need folding twice, as
+ C
+ C T3 T2 T1 T0
+ C T3 T2 T1
+ C -T3
+ C ----------------
+ C F4 F3 F2 F1 F0
+ C
+ C Start with
+ C
+ C T3 T1 T0
+ C T1
+ C -T3
+ C -----------
+ C F2 F1 F0 Always fits
+
+ adds F0, T0, T1
+ adcs F1, T1, #0
+ adcs F2, T3, #0
+ subs F0, F0, T3
+ sbcs F1, F1, #0
+ sbcs F2, F2, #0
+
+ C T3 T2 T2 0
+ C F2 F1 F0
+ C ----------------
+ C F4 F3 F2 F1 F0
+
+ mov F4, #0
+ adds F1, F1, T2
+ adcs F2, F2, T2
+ adcs F3, T3, #0
+ adcs F4, F4, #0
+
+ C Add in to high part
+ sub RP, RP, #32
+ ldm RP, {T0, T1, T2, T3} C 12-15
+ mov H, #0
+ adds F0, T0, F0
+ adcs F1, T1, F1
+ adcs F2, T2, F2
+ adcs F3, T3, F3
+ adcs F4, F4, #0 C Do F4 later
+
+ C Add to low part, keeping carry (positive or negative) in H
+ sub RP, RP, #48
+ ldm RP, {T0, T1, T2, T3} C 0-3
+ mov H, #0
+ adds T0, T0, F0
+ adcs T1, T1, F1
+ adcs T2, T2, F2
+ adcs T3, T3, F3
+ adc H, H, #0
+ subs T1, T1, F0
+ sbcs T2, T2, F1
+ sbcs T3, T3, F2
+ sbc H, H, #0
+ adds T3, T3, F0
+ adc H, H, #0
+
+ stm RP!, {T0,T1,T2,T3} C 0-3
+ mov N, #2
+.Loop:
+ ldm RP, {T0,T1,T2,T3} C 4-7
+
+ C First, propagate carry
+ adds T0, T0, H
+ asr H, #31 C Sign extend
+ adcs T1, T1, H
+ adcs T2, T2, H
+ adcs T3, T3, H
+ adc H, H, #0
+
+ C +B^4 term
+ adds T0, T0, F0
+ adcs T1, T1, F1
+ adcs T2, T2, F2
+ adcs T3, T3, F3
+ adc H, H, #0
+
+ C +B^3 terms
+ ldr F0, [RP, #+48] C 16
+ adds T0, T0, F1
+ adcs T1, T1, F2
+ adcs T2, T2, F3
+ adcs T3, T3, F0
+ adc H, H, #0
+
+ C -B
+ ldr F1, [RP, #+52] C 17-18
+ ldr F2, [RP, #+56]
+ subs T0, T0, F3
+ sbcs T1, T1, F0
+ sbcs T2, T2, F1
+ sbcs T3, T3, F2
+ sbcs H, H, #0
+
+ C +1
+ ldr F3, [RP, #+60] C 19
+ adds T0, T0, F0
+ adcs T1, T1, F1
+ adcs T2, T2, F2
+ adcs T3, T3, F3
+ adc H, H, #0
+ subs N, N, #1
+ stm RP!, {T0,T1,T2,T3}
+ bne .Loop
+
+ C Fold high limbs, we need to add in
+ C
+ C F4 F4 0 -F4 F4 H H 0 -H H
+ C
+ C We always have F4 >= 0, but we can have H < 0.
+ C Sign extension gets tricky when F4 = 0 and H < 0.
+ sub RP, RP, #48
+
+ ldm RP, {T0,T1,T2,T3} C 0-3
+
+ C H H 0 -H H
+ C ----------------
+ C S H F3 F2 F1 F0
+ C
+ C Define S = H >> 31 (asr), we then have
+ C
+ C F0 = H
+ C F1 = S - H
+ C F2 = - [H > 0]
+ C F3 = H - [H > 0]
+ C H = H + S
+ C
+ C And we get underflow in S - H iff H > 0
+
+ C H = 0 H > 0 H = -1
+ mov F0, H C 0 H -1
+ asr H, #31
+ subs F1, H, F0 C 0,C=1 -H,C=0 0,C=1
+ sbc F2, F2, F2 C 0 -1 0
+ sbc F3, F0, #0 C 0 H-1 -1
+
+ adds T0, T0, F0
+ adcs T1, T1, F1
+ adcs T2, T2, F2
+ adcs T3, T3, F3
+ adc H, H, F0 C 0+cy H+cy -2+cy
+
+ stm RP!, {T0,T1,T2,T3} C 0-3
+ ldm RP, {T0,T1,T2,T3} C 4-7
+
+ C F4 0 -F4
+ C ---------
+ C F3 F2 F1
+
+ rsbs F1, F4, #0
+ sbc F2, F2, F2
+ sbc F3, F4, #0
+
+ C Sign extend H
+ adds F0, F4, H
+ asr H, H, #31
+ adcs F1, F1, H
+ adcs F2, F2, H
+ adcs F3, F3, H
+ adcs F4, F4, H
+ adc H, H, #0
+
+ adds T0, T0, F0
+ adcs T1, T1, F1
+ adcs T2, T2, F2
+ adcs T3, T3, F3
+
+ stm RP!, {T0,T1,T2,T3} C 4-7
+ ldm RP, {T0,T1,T2,T3} C 8-11
+
+ adcs T0, T0, F4
+ adcs T1, T1, H
+ adcs T2, T2, H
+ adcs T3, T3, H
+ adc H, H, #0
+
+ stm RP, {T0,T1,T2,T3} C 8-11
+
+ C Final (unlikely) carry
+ sub RP, RP, #32
+ ldm RP, {T0,T1,T2,T3} C 0-3
+ C Fold H into F0-F4
+ mov F0, H
+ asr H, #31
+ subs F1, H, F0
+ sbc F2, F2, F2
+ sbc F3, F0, #0
+ add F4, F0, H
+
+ adds T0, T0, F0
+ adcs T1, T1, F1
+ adcs T2, T2, F2
+ adcs T3, T3, F3
+
+ stm RP!, {T0,T1,T2,T3} C 0-3
+ ldm RP, {T0,T1,T2,T3} C 4-7
+ adcs T0, T0, F4
+ adcs T1, T1, H
+ adcs T2, T2, H
+ adcs T3, T3, H
+ stm RP!, {T0,T1,T2,T3} C 4-7
+ ldm RP, {T0,T1,T2,T3} C 8-11
+ adcs T0, T0, H
+ adcs T1, T1, H
+ adcs T2, T2, H
+ adcs T3, T3, H
+ stm RP!, {T0,T1,T2,T3} C 8-11
+ pop {r4,r5,r6,r7,r8,r10,pc}
+EPILOGUE(nettle_ecc_384_modp)
diff --git a/arm/ecc-521-modp.asm b/arm/ecc-521-modp.asm
new file mode 100644
index 00000000..fe305805
--- /dev/null
+++ b/arm/ecc-521-modp.asm
@@ -0,0 +1,114 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013, Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+ .file "ecc-521-modp.asm"
+ .arm
+
+define(<HP>, <r0>)
+define(<RP>, <r1>)
+define(<T0>, <r2>)
+define(<T1>, <r3>)
+define(<T2>, <r4>)
+define(<F0>, <r5>)
+define(<F1>, <r6>)
+define(<F2>, <r7>)
+define(<F3>, <r8>)
+define(<H>, <r12>)
+define(<N>, <lr>)
+
+ C ecc_521_modp (const struct ecc_curve *ecc, mp_limb_t *rp)
+ .text
+.Lc511:
+ .int 511
+
+ .align 2
+
+PROLOGUE(nettle_ecc_521_modp)
+ push {r4,r5,r6,r7,r8,lr}
+
+ C Use that B^17 = 2^23 (mod p)
+ ldr F3, [RP, #+68] C 17
+ add HP, RP, #72 C 18
+ ldr T0, [RP] C 0
+ adds T0, T0, F3, lsl #23
+ str T0, [RP], #+4
+ mov N, #5
+
+ C 5 iterations, reading limbs 18-20, 21-23, 24-26, 27-29, 30-32
+	C and adding to limbs 1-3, 4-6, 7-9, 10-12, 13-15
+.Loop:
+ ldm RP, {T0,T1,T2} C 1+3*k -- 3+3*k
+ lsr F0, F3, #9
+ ldm HP!, {F1,F2,F3} C 18+3*k -- 20+3*k
+ orr F0, F0, F1, lsl #23
+ lsr F1, F1, #9
+ orr F1, F1, F2, lsl #23
+ lsr F2, F2, #9
+ orr F2, F2, F3, lsl #23
+ adcs T0, T0, F0
+ adcs T1, T1, F1
+ adcs T2, T2, F2
+ sub N, N, #1
+ stm RP!,{T0,T1,T2}
+ teq N, #0
+ bne .Loop
+
+ ldr F0, [RP], #-64 C 16
+ ldr F1, [HP] C 33
+ ldr T0, .Lc511
+
+ C Handling of high limbs
+ C F0 = rp[16] + carry in + F3 >> 9
+ adcs F0, F0, F3, lsr #9
+ C Copy low 9 bits to H, then shift right including carry
+ and H, F0, T0
+ rrx F0, F0
+ lsr F0, F0, #8
+ C Add in F1 = rp[33], with weight 2^1056 = 2^14
+ adds F0, F0, F1, lsl #14
+ lsr F1, F1, #18
+ adc F1, F1, #0
+
+ ldm RP, {T0, T1} C 0-1
+ adds T0, T0, F0
+ adcs T1, T1, F1
+ stm RP!, {T0, T1}
+
+ ldm RP, {T0,T1,T2,F0,F1,F2,F3} C 2-8
+ adcs T0, T0, #0
+ adcs T1, T1, #0
+ adcs T2, T2, #0
+ adcs F0, F0, #0
+ adcs F1, F1, #0
+ adcs F2, F2, #0
+ adcs F3, F3, #0
+ stm RP!, {T0,T1,T2,F0,F1,F2,F3} C 2-8
+ ldm RP, {T0,T1,T2,F0,F1,F2,F3} C 9-15
+ adcs T0, T0, #0
+ adcs T1, T1, #0
+ adcs T2, T2, #0
+ adcs F0, F0, #0
+ adcs F1, F1, #0
+ adcs F2, F2, #0
+ adcs F3, F3, #0
+ adcs H, H, #0
+ stm RP, {T0,T1,T2,F0,F1,F2,F3,H} C 9-16
+
+ pop {r4,r5,r6,r7,r8,pc}
+EPILOGUE(nettle_ecc_521_modp)
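
Not part of the commit: the prime here is p = 2^521 - 1, and the identity used above is B^17 = 2^544 = 2^23 * 2^521 with B = 2^32, which is 2^23 (mod p) since 2^521 = 1 (mod p); everything above bit 521 is shifted down by 521 bits and added back in. A toy-sized C sketch of the same Mersenne fold, done for 2^61 - 1 so it fits in a machine word (function name invented):

  #include <stdint.h>

  static uint64_t
  mersenne61_fold (unsigned __int128 x)   /* x < 2^122 */
  {
    const uint64_t m = ((uint64_t) 1 << 61) - 1;
    /* x mod (2^n - 1) is congruent to (x & (2^n - 1)) + (x >> n) */
    uint64_t r = (uint64_t) (x & m) + (uint64_t) (x >> 61);
    r = (r & m) + (r >> 61);   /* fold the possible carry once more */
    return r;                  /* in [0, m]; m itself represents 0 */
  }
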
diff --git a/arm/machine.m4 b/arm/machine.m4
new file mode 100644
index 00000000..f982a66a
--- /dev/null
+++ b/arm/machine.m4
@@ -0,0 +1,56 @@
+define(<QREG>, <ifelse(
+ $1, d0, q0,
+ $1, d2, q1,
+ $1, d4, q2,
+ $1, d6, q3,
+ $1, d8, q4,
+ $1, d10, q5,
+ $1, d12, q6,
+ $1, d14, q7,
+ $1, d16, q8,
+ $1, d18, q9,
+ $1, d20, q10,
+ $1, d22, q11,
+ $1, d24, q12,
+ $1, d26, q13,
+ $1, d28, q14,
+ $1, d30, q15,
+ <NO REGISTER>)>)dnl
+
+define(<D0REG>, <ifelse(
+ $1, q0, d0,
+ $1, q1, d2,
+ $1, q2, d4,
+ $1, q3, d6,
+ $1, q4, d8,
+ $1, q5, d10,
+ $1, q6, d12,
+ $1, q7, d14,
+ $1, q8, d16,
+ $1, q9, d18,
+ $1, q10, d20,
+ $1, q11, d22,
+ $1, q12, d24,
+ $1, q13, d26,
+ $1, q14, d28,
+ $1, q15, d30,
+ <NO REGISTER>)>)dnl
+
+define(<D1REG>, <ifelse(
+ $1, q0, d1,
+ $1, q1, d3,
+ $1, q2, d5,
+ $1, q3, d7,
+ $1, q4, d9,
+ $1, q5, d11,
+ $1, q6, d13,
+ $1, q7, d15,
+ $1, q8, d17,
+ $1, q9, d19,
+ $1, q10, d21,
+ $1, q11, d23,
+ $1, q12, d25,
+ $1, q13, d27,
+ $1, q14, d29,
+ $1, q15, d31,
+ <NO REGISTER>)>)dnl
diff --git a/arm/memxor.asm b/arm/memxor.asm
new file mode 100644
index 00000000..33f672c6
--- /dev/null
+++ b/arm/memxor.asm
@@ -0,0 +1,488 @@
+C -*- mode: asm; asm-comment-char: ?C; -*-
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013, Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+C Possible speedups:
+C
+C The ldm instruction can load two registers per cycle,
+C if the address is two-word aligned. Or three registers in two
+C cycles, regardless of alignment.
+
+C Register usage:
+
+define(<DST>, <r0>)
+define(<SRC>, <r1>)
+define(<N>, <r2>)
+define(<CNT>, <r6>)
+define(<TNC>, <r12>)
+
+ .syntax unified
+
+ .file "memxor.asm"
+
+ .text
+ .arm
+
+ C memxor(uint8_t *dst, const uint8_t *src, size_t n)
+ .align 4
+PROLOGUE(memxor)
+ cmp N, #0
+ beq .Lmemxor_done
+
+ cmp N, #7
+ bcs .Lmemxor_large
+
+ C Simple byte loop
+.Lmemxor_bytes:
+ ldrb r3, [SRC], #+1
+ ldrb r12, [DST]
+ eor r3, r12
+ strb r3, [DST], #+1
+ subs N, #1
+ bne .Lmemxor_bytes
+
+.Lmemxor_done:
+ bx lr
+
+.Lmemxor_align_loop:
+ ldrb r3, [SRC], #+1
+ ldrb r12, [DST]
+ eor r3, r12
+ strb r3, [DST], #+1
+ sub N, #1
+
+.Lmemxor_large:
+ tst DST, #3
+ bne .Lmemxor_align_loop
+
+ C We have at least 4 bytes left to do here.
+ sub N, #4
+
+ ands r3, SRC, #3
+ beq .Lmemxor_same
+
+ C Different alignment case.
+ C v original SRC
+ C +-------+------+
+ C |SRC |SRC+4 |
+ C +---+---+------+
+ C |DST |
+ C +-------+
+ C
+ C With little-endian, we need to do
+ C DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC)
+
+ push {r4,r5,r6}
+
+ lsl CNT, r3, #3
+ bic SRC, #3
+ rsb TNC, CNT, #32
+
+ ldr r4, [SRC], #+4
+
+ tst N, #4
+ itet eq
+ moveq r5, r4
+ subne N, #4
+ beq .Lmemxor_odd
+
+.Lmemxor_word_loop:
+ ldr r5, [SRC], #+4
+ ldr r3, [DST]
+ eor r3, r3, r4, lsr CNT
+ eor r3, r3, r5, lsl TNC
+ str r3, [DST], #+4
+.Lmemxor_odd:
+ ldr r4, [SRC], #+4
+ ldr r3, [DST]
+ eor r3, r3, r5, lsr CNT
+ eor r3, r3, r4, lsl TNC
+ str r3, [DST], #+4
+ subs N, #8
+ bcs .Lmemxor_word_loop
+ adds N, #8
+ beq .Lmemxor_odd_done
+
+ C We have TNC/8 left-over bytes in r4, high end
+ lsr r4, CNT
+ ldr r3, [DST]
+ eor r3, r4
+
+ pop {r4,r5,r6}
+
+ C Store bytes, one by one.
+.Lmemxor_leftover:
+ strb r3, [DST], #+1
+ subs N, #1
+ beq .Lmemxor_done
+ subs TNC, #8
+ lsr r3, #8
+ bne .Lmemxor_leftover
+ b .Lmemxor_bytes
+.Lmemxor_odd_done:
+ pop {r4,r5,r6}
+ bx lr
+
+.Lmemxor_same:
+ push {r4,r5,r6,r7,r8,r10,r11,r14} C lr is the link register
+
+ subs N, #8
+ bcc .Lmemxor_same_end
+
+ ldmia SRC!, {r3, r4, r5}
+ C Keep address for loads in r14
+ mov r14, DST
+ ldmia r14!, {r6, r7, r8}
+ subs N, #12
+ eor r10, r3, r6
+ eor r11, r4, r7
+ eor r12, r5, r8
+ bcc .Lmemxor_same_final_store
+ subs N, #12
+ ldmia r14!, {r6, r7, r8}
+ bcc .Lmemxor_same_wind_down
+
+ C 6 cycles per iteration, 0.50 cycles/byte. For this speed,
+ C loop starts at offset 0x11c in the object file.
+
+.Lmemxor_same_loop:
+ C r10-r12 contains values to be stored at DST
+ C r6-r8 contains values read from r14, in advance
+ ldmia SRC!, {r3, r4, r5}
+ subs N, #12
+ stmia DST!, {r10, r11, r12}
+ eor r10, r3, r6
+ eor r11, r4, r7
+ eor r12, r5, r8
+ ldmia r14!, {r6, r7, r8}
+ bcs .Lmemxor_same_loop
+
+.Lmemxor_same_wind_down:
+ C Wind down code
+ ldmia SRC!, {r3, r4, r5}
+ stmia DST!, {r10, r11, r12}
+ eor r10, r3, r6
+ eor r11, r4, r7
+ eor r12, r5, r8
+.Lmemxor_same_final_store:
+ stmia DST!, {r10, r11, r12}
+
+.Lmemxor_same_end:
+ C We have 0-11 bytes left to do, and N holds number of bytes -12.
+ adds N, #4
+ bcc .Lmemxor_same_lt_8
+ C Do 8 bytes more, leftover is in N
+ ldmia SRC!, {r3, r4}
+ ldmia DST, {r6, r7}
+ eor r3, r6
+ eor r4, r7
+ stmia DST!, {r3, r4}
+ pop {r4,r5,r6,r7,r8,r10,r11,r14}
+ beq .Lmemxor_done
+ b .Lmemxor_bytes
+
+.Lmemxor_same_lt_8:
+ pop {r4,r5,r6,r7,r8,r10,r11,r14}
+ adds N, #4
+ bcc .Lmemxor_same_lt_4
+
+ ldr r3, [SRC], #+4
+ ldr r12, [DST]
+ eor r3, r12
+ str r3, [DST], #+4
+ beq .Lmemxor_done
+ b .Lmemxor_bytes
+
+.Lmemxor_same_lt_4:
+ adds N, #4
+ beq .Lmemxor_done
+ b .Lmemxor_bytes
+
+EPILOGUE(memxor)
+
+define(<DST>, <r0>)
+define(<AP>, <r1>)
+define(<BP>, <r2>)
+define(<N>, <r3>)
+undefine(<CNT>)
+undefine(<TNC>)
+
+C Temporaries r4-r7
+define(<ACNT>, <r8>)
+define(<ATNC>, <r10>)
+define(<BCNT>, <r11>)
+define(<BTNC>, <r12>)
+
+ C memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
+ .align 2
+PROLOGUE(memxor3)
+ cmp N, #0
+ beq .Lmemxor3_ret
+
+ push {r4,r5,r6,r7,r8,r10,r11}
+ cmp N, #7
+
+ add AP, N
+ add BP, N
+ add DST, N
+
+ bcs .Lmemxor3_large
+
+ C Simple byte loop
+.Lmemxor3_bytes:
+ ldrb r4, [AP, #-1]!
+ ldrb r5, [BP, #-1]!
+ eor r4, r5
+ strb r4, [DST, #-1]!
+ subs N, #1
+ bne .Lmemxor3_bytes
+
+.Lmemxor3_done:
+ pop {r4,r5,r6,r7,r8,r10,r11}
+.Lmemxor3_ret:
+ bx lr
+
+.Lmemxor3_align_loop:
+ ldrb r4, [AP, #-1]!
+ ldrb r5, [BP, #-1]!
+ eor r5, r4
+ strb r5, [DST, #-1]!
+ sub N, #1
+
+.Lmemxor3_large:
+ tst DST, #3
+ bne .Lmemxor3_align_loop
+
+ C We have at least 4 bytes left to do here.
+ sub N, #4
+ ands ACNT, AP, #3
+ lsl ACNT, #3
+ beq .Lmemxor3_a_aligned
+
+ ands BCNT, BP, #3
+ lsl BCNT, #3
+ bne .Lmemxor3_uu
+
+ C Swap
+ mov r4, AP
+ mov AP, BP
+ mov BP, r4
+
+.Lmemxor3_au:
+ C NOTE: We have the relevant shift count in ACNT, not BCNT
+
+ C AP is aligned, BP is not
+ C v original SRC
+ C +-------+------+
+ C |SRC-4 |SRC |
+ C +---+---+------+
+ C |DST-4 |
+ C +-------+
+ C
+ C With little-endian, we need to do
+	C DST[i-1] ^= (SRC[i-1] >> CNT) ^ (SRC[i] << TNC)
+ rsb ATNC, ACNT, #32
+ bic BP, #3
+
+ ldr r4, [BP]
+
+ tst N, #4
+ itet eq
+ moveq r5, r4
+ subne N, #4
+ beq .Lmemxor3_au_odd
+
+.Lmemxor3_au_loop:
+ ldr r5, [BP, #-4]!
+ ldr r6, [AP, #-4]!
+ eor r6, r6, r4, lsl ATNC
+ eor r6, r6, r5, lsr ACNT
+ str r6, [DST, #-4]!
+.Lmemxor3_au_odd:
+ ldr r4, [BP, #-4]!
+ ldr r6, [AP, #-4]!
+ eor r6, r6, r5, lsl ATNC
+ eor r6, r6, r4, lsr ACNT
+ str r6, [DST, #-4]!
+ subs N, #8
+ bcs .Lmemxor3_au_loop
+ adds N, #8
+ beq .Lmemxor3_done
+
+ C Leftover bytes in r4, low end
+ ldr r5, [AP, #-4]
+ eor r4, r5, r4, lsl ATNC
+
+.Lmemxor3_au_leftover:
+ C Store a byte at a time
+ ror r4, #24
+ strb r4, [DST, #-1]!
+ subs N, #1
+ beq .Lmemxor3_done
+ subs ACNT, #8
+ sub AP, #1
+ bne .Lmemxor3_au_leftover
+ b .Lmemxor3_bytes
+
+.Lmemxor3_a_aligned:
+ ands ACNT, BP, #3
+ lsl ACNT, #3
+ bne .Lmemxor3_au ;
+
+ C a, b and dst all have the same alignment.
+ subs N, #8
+ bcc .Lmemxor3_aligned_word_end
+
+ C This loop runs at 8 cycles per iteration. It has been
+ C observed running at only 7 cycles, for this speed, the loop
+ C started at offset 0x2ac in the object file.
+
+ C FIXME: consider software pipelining, similarly to the memxor
+ C loop.
+
+.Lmemxor3_aligned_word_loop:
+ ldmdb AP!, {r4,r5,r6}
+ ldmdb BP!, {r7,r8,r10}
+ subs N, #12
+ eor r4, r7
+ eor r5, r8
+ eor r6, r10
+ stmdb DST!, {r4, r5,r6}
+ bcs .Lmemxor3_aligned_word_loop
+
+.Lmemxor3_aligned_word_end:
+ C We have 0-11 bytes left to do, and N holds number of bytes -12.
+ adds N, #4
+ bcc .Lmemxor3_aligned_lt_8
+ C Do 8 bytes more, leftover is in N
+ ldmdb AP!, {r4, r5}
+ ldmdb BP!, {r6, r7}
+ eor r4, r6
+ eor r5, r7
+ stmdb DST!, {r4,r5}
+ beq .Lmemxor3_done
+ b .Lmemxor3_bytes
+
+.Lmemxor3_aligned_lt_8:
+ adds N, #4
+ bcc .Lmemxor3_aligned_lt_4
+
+ ldr r4, [AP,#-4]!
+ ldr r5, [BP,#-4]!
+ eor r4, r5
+ str r4, [DST,#-4]!
+ beq .Lmemxor3_done
+ b .Lmemxor3_bytes
+
+.Lmemxor3_aligned_lt_4:
+ adds N, #4
+ beq .Lmemxor3_done
+ b .Lmemxor3_bytes
+
+.Lmemxor3_uu:
+
+ cmp ACNT, BCNT
+ bic AP, #3
+ bic BP, #3
+ rsb ATNC, ACNT, #32
+
+ bne .Lmemxor3_uud
+
+ C AP and BP are unaligned in the same way
+
+ ldr r4, [AP]
+ ldr r6, [BP]
+ eor r4, r6
+
+ tst N, #4
+ itet eq
+ moveq r5, r4
+ subne N, #4
+ beq .Lmemxor3_uu_odd
+
+.Lmemxor3_uu_loop:
+ ldr r5, [AP, #-4]!
+ ldr r6, [BP, #-4]!
+ eor r5, r6
+ lsl r4, ATNC
+ eor r4, r4, r5, lsr ACNT
+ str r4, [DST, #-4]!
+.Lmemxor3_uu_odd:
+ ldr r4, [AP, #-4]!
+ ldr r6, [BP, #-4]!
+ eor r4, r6
+ lsl r5, ATNC
+ eor r5, r5, r4, lsr ACNT
+ str r5, [DST, #-4]!
+ subs N, #8
+ bcs .Lmemxor3_uu_loop
+ adds N, #8
+ beq .Lmemxor3_done
+
+	C Leftover bytes in r4, low end
+ ror r4, ACNT
+.Lmemxor3_uu_leftover:
+ ror r4, #24
+ strb r4, [DST, #-1]!
+ subs N, #1
+ beq .Lmemxor3_done
+ subs ACNT, #8
+ bne .Lmemxor3_uu_leftover
+ b .Lmemxor3_bytes
+
+.Lmemxor3_uud:
+ C Both AP and BP unaligned, and in different ways
+ rsb BTNC, BCNT, #32
+
+ ldr r4, [AP]
+ ldr r6, [BP]
+
+ tst N, #4
+ ittet eq
+ moveq r5, r4
+ moveq r7, r6
+ subne N, #4
+ beq .Lmemxor3_uud_odd
+
+.Lmemxor3_uud_loop:
+ ldr r5, [AP, #-4]!
+ ldr r7, [BP, #-4]!
+ lsl r4, ATNC
+ eor r4, r4, r6, lsl BTNC
+ eor r4, r4, r5, lsr ACNT
+ eor r4, r4, r7, lsr BCNT
+ str r4, [DST, #-4]!
+.Lmemxor3_uud_odd:
+ ldr r4, [AP, #-4]!
+ ldr r6, [BP, #-4]!
+ lsl r5, ATNC
+ eor r5, r5, r7, lsl BTNC
+ eor r5, r5, r4, lsr ACNT
+ eor r5, r5, r6, lsr BCNT
+ str r5, [DST, #-4]!
+ subs N, #8
+ bcs .Lmemxor3_uud_loop
+ adds N, #8
+ beq .Lmemxor3_done
+
+ C FIXME: More clever left-over handling? For now, just adjust pointers.
+ add AP, AP, ACNT, lsr #3
+ add BP, BP, BCNT, lsr #3
+ b .Lmemxor3_bytes
+EPILOGUE(memxor3)
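
Not part of the commit: byte-wise C reference versions of the two entry points in this file, using the pointer types from the prototype comments above. The assembly gets its speed from word-sized loads and stores and, for misaligned operands, from combining two shifted words per output word; memxor3 walks from the high addresses downwards, mirrored here.

  #include <stddef.h>
  #include <stdint.h>

  /* dst[i] ^= src[i] */
  static void
  memxor_ref (uint8_t *dst, const uint8_t *src, size_t n)
  {
    for (size_t i = 0; i < n; i++)
      dst[i] ^= src[i];
  }

  /* dst[i] = a[i] ^ b[i], processed from the high end */
  static void
  memxor3_ref (uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
  {
    while (n-- > 0)
      dst[n] = a[n] ^ b[n];
  }
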
diff --git a/arm/neon/salsa20-core-internal.asm b/arm/neon/salsa20-core-internal.asm
new file mode 100644
index 00000000..fe26e5c5
--- /dev/null
+++ b/arm/neon/salsa20-core-internal.asm
@@ -0,0 +1,181 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+ .file "salsa20-core-internal.asm"
+ .fpu neon
+
+define(<DST>, <r0>)
+define(<SRC>, <r1>)
+define(<ROUNDS>, <r2>)
+
+define(<X0>, <q0>)
+define(<X1>, <q1>)
+define(<X2>, <q2>)
+define(<X3>, <q3>)
+define(<T0>, <q8>)
+define(<T1>, <q9>)
+define(<M0101>, <q10>)
+define(<M0110>, <q11>)
+define(<M0011>, <q12>)
+define(<S1>, <q13>)
+define(<S2>, <q14>)
+define(<S3>, <q15>)
+
+define(<QROUND>, <
+ vadd.i32 T0, $1, $4
+ vshl.i32 T1, T0, #7
+ vshr.u32 T0, T0, #25
+ veor $2, $2, T0
+ veor $2, $2, T1
+
+ vadd.i32 T0, $1, $2
+ vshl.i32 T1, T0, #9
+ vshr.u32 T0, T0, #23
+ veor $3, $3, T0
+ veor $3, $3, T1
+
+ vadd.i32 T0, $2, $3
+ vshl.i32 T1, T0, #13
+ vshr.u32 T0, T0, #19
+ veor $4, $4, T0
+ veor $4, $4, T1
+
+ vadd.i32 T0, $3, $4
+ vshl.i32 T1, T0, #18
+ vshr.u32 T0, T0, #14
+ veor $1, $1, T0
+ veor $1, $1, T1
+>)
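
Not part of the commit: the QROUND macro above is the Salsa20 quarter-round; the NEON code applies it to four columns at once, one column per 32-bit lane, and builds each rotate from a left shift, a right shift and two xors because NEON has no 32-bit rotate instruction. A scalar C sketch of one quarter-round (names invented):

  #include <stdint.h>

  #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

  static void
  salsa20_qround (uint32_t *x0, uint32_t *x1, uint32_t *x2, uint32_t *x3)
  {
    *x1 ^= ROTL32 (*x0 + *x3, 7);
    *x2 ^= ROTL32 (*x1 + *x0, 9);
    *x3 ^= ROTL32 (*x2 + *x1, 13);
    *x0 ^= ROTL32 (*x3 + *x2, 18);
  }
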
+
+ .text
+ .align 4
+.Lmasks:
+ .int 0,-1, 0,-1
+ .int 0,-1,-1, 0
+ .int 0, 0,-1,-1
+
+ C _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+
+PROLOGUE(_nettle_salsa20_core)
+ vldm SRC, {X0,X1,X2,X3}
+
+ C Input rows:
+ C 0 1 2 3 X0
+ C 4 5 6 7 X1
+ C 8 9 10 11 X2
+ C 12 13 14 15 X3
+ C Permuted to:
+ C 0 5 10 15
+ C 4 9 14 3
+ C 8 13 2 7
+ C 12 1 6 11
+
+ C FIXME: Construct in some other way?
+ adr r12, .Lmasks
+ vldm r12, {M0101, M0110, M0011}
+
+ vmov S1, X1
+ vmov S2, X2
+ vmov S3, X3
+
+ C Swaps in columns 1, 3:
+ C 0 5 2 7 X0 ^
+ C 4 1 6 3 T0 v
+ C 8 13 10 15 T1 ^
+ C 12 9 14 11 X3 v
+ vmov T0, X1
+ vmov T1, X2
+ vbit T0, X0, M0101
+ vbit X0, X1, M0101
+ vbit T1, X3, M0101
+ vbit X3, X2, M0101
+
+ C Swaps in column 1, 2:
+ C 0 5 2 7 X0
+ C 4 9 14 3 X1 ^
+ C 8 13 10 15 T1 |
+ C 12 1 6 11 X3 v
+ vmov X1, T0
+ vbit X1, X3, M0110
+ vbit X3, T0, M0110
+
+	C Swaps in columns 2, 3:
+ C 0 5 10 15 X0 ^
+ C 4 9 14 3 X1 |
+ C 8 13 2 7 X2 v
+ C 12 1 6 11 X3
+ vmov X2, T1
+ vbit X2, X0, M0011
+ vbit X0, T1, M0011
+
+.Loop:
+ QROUND(X0, X1, X2, X3)
+
+ C Rotate rows, to get
+ C 0 5 10 15
+ C 3 4 9 14 >>> 1
+ C 2 7 8 13 >>> 2
+ C 1 6 11 12 >>> 3
+ vext.32 X1, X1, X1, #3
+ vext.32 X2, X2, X2, #2
+ vext.32 X3, X3, X3, #1
+
+ QROUND(X0, X3, X2, X1)
+
+ subs ROUNDS, ROUNDS, #2
+ C Inverse rotation
+ vext.32 X1, X1, X1, #1
+ vext.32 X2, X2, X2, #2
+ vext.32 X3, X3, X3, #3
+
+ bhi .Loop
+
+ C Inverse swaps
+ vmov T1, X2
+ vbit T1, X0, M0011
+ vbit X0, X2, M0011
+
+ vmov T0, X1
+ vbit T0, X3, M0110
+ vbit X3, X1, M0110
+
+ vmov X1, T0
+ vmov X2, T1
+ vbit X1, X0, M0101
+ vbit X0, T0, M0101
+ vbit X2, X3, M0101
+ vbit X3, T1, M0101
+
+ vld1.64 {T0}, [SRC]
+ vadd.u32 X0, X0, T0
+ vadd.u32 X1, X1, S1
+ vadd.u32 X2, X2, S2
+ vadd.u32 X3, X3, S3
+
+ vstm DST, {X0,X1,X2,X3}
+ bx lr
+EPILOGUE(_nettle_salsa20_core)
+
+divert(-1)
+define salsastate
+p/x $q0.u32
+p/x $q1.u32
+p/x $q2.u32
+p/x $q3.u32
+end
diff --git a/arm/neon/sha3-permute.asm b/arm/neon/sha3-permute.asm
new file mode 100644
index 00000000..beee09f7
--- /dev/null
+++ b/arm/neon/sha3-permute.asm
@@ -0,0 +1,266 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+ .file "sha3-permute.asm"
+ .fpu neon
+
+define(<CTX>, <r0>)
+define(<COUNT>, <r1>)
+define(<RC>, <r2>)
+C First column
+define(<A0>, <d0>)
+define(<A5>, <d2>)
+define(<A10>, <d3>)
+define(<A15>, <d4>)
+define(<A20>, <d5>)
+
+define(<A1>, <d6>)
+define(<A2>, <d7>)
+define(<A3>, <d8>)
+define(<A4>, <d9>)
+
+define(<A6>, <d16>)
+define(<A7>, <d17>)
+define(<A8>, <d18>)
+define(<A9>, <d19>)
+
+define(<A11>, <d20>)
+define(<A12>, <d21>)
+define(<A13>, <d22>)
+define(<A14>, <d23>)
+
+define(<A16>, <d24>)
+define(<A17>, <d25>)
+define(<A18>, <d26>)
+define(<A19>, <d27>)
+
+define(<A21>, <d28>)
+define(<A22>, <d29>)
+define(<A23>, <d30>)
+define(<A24>, <d31>)
+
+define(<T0>, <d10>)
+define(<T1>, <d11>)
+
+define(<C0>, <d1>)
+define(<C1>, <d12>)
+define(<C2>, <d13>)
+define(<C3>, <d14>)
+define(<C4>, <d15>)
+
+
+C ROL(DST, SRC, COUNT)
+C Must have SRC != DST
+define(<ROL>, <
+ vshr.u64 $1, $2, #eval(64-$3)
+ vsli.i64 $1, $2, #$3
+ >)
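
Not part of the commit: what the ROL macro computes, in C. The vshr puts the bits rotated off the top into the low end of the destination, and vsli then shifts the source left and inserts it above them, giving a 64-bit rotate left by a constant:

  #include <stdint.h>

  static inline uint64_t
  rotl64 (uint64_t x, unsigned n)   /* 0 < n < 64 */
  {
    return (x << n) | (x >> (64 - n));
  }
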
+C sha3_permute(struct sha3_ctx *ctx)
+
+ .text
+ .align 3
+.Lrc:
+ .quad 0x0000000000000001
+ .quad 0x0000000000008082
+ .quad 0x800000000000808A
+ .quad 0x8000000080008000
+ .quad 0x000000000000808B
+ .quad 0x0000000080000001
+ .quad 0x8000000080008081
+ .quad 0x8000000000008009
+ .quad 0x000000000000008A
+ .quad 0x0000000000000088
+ .quad 0x0000000080008009
+ .quad 0x000000008000000A
+ .quad 0x000000008000808B
+ .quad 0x800000000000008B
+ .quad 0x8000000000008089
+ .quad 0x8000000000008003
+ .quad 0x8000000000008002
+ .quad 0x8000000000000080
+ .quad 0x000000000000800A
+ .quad 0x800000008000000A
+ .quad 0x8000000080008081
+ .quad 0x8000000000008080
+ .quad 0x0000000080000001
+ .quad 0x8000000080008008
+
+PROLOGUE(nettle_sha3_permute)
+ vpush {d8-d15}
+
+ vld1.64 {A0}, [CTX]!
+ vldm CTX!, {A1,A2,A3,A4}
+ vld1.64 {A5}, [CTX]!
+ vldm CTX!, {A6,A7,A8,A9}
+ vld1.64 {A10}, [CTX]!
+ vldm CTX!, {A11,A12,A13,A14}
+ vld1.64 {A15}, [CTX]!
+ vldm CTX!, {A16,A17,A18,A19}
+ vld1.64 {A20}, [CTX]!
+ vldm CTX, {A21,A22,A23,A24}
+ sub CTX, CTX, #168
+
+ mov COUNT, #24
+ adr RC, .Lrc
+
+ .align 3
+.Loop:
+ veor QREG(T0), QREG(A5), QREG(A15)
+ veor C0, A0, T0
+ veor C0, C0, T1
+ veor QREG(C1), QREG(A1), QREG(A6)
+ veor QREG(C1), QREG(C1), QREG(A11)
+ veor QREG(C1), QREG(C1), QREG(A16)
+ veor QREG(C1), QREG(C1), QREG(A21)
+
+ veor QREG(C3), QREG(A3), QREG(A8)
+ veor QREG(C3), QREG(C3), QREG(A13)
+ veor QREG(C3), QREG(C3), QREG(A18)
+ veor QREG(C3), QREG(C3), QREG(A23)
+
+ C D0 = C4 ^ (C1 <<< 1)
+ C NOTE: Using ROL macro (and vsli) is slightly slower.
+ vshl.i64 T0, C1, #1
+ vshr.u64 T1, C1, #63
+ veor T0, T0, C4
+ veor T0, T0, T1
+ vmov T1, T0
+ veor A0, A0, T0
+ veor QREG(A5), QREG(A5), QREG(T0)
+ veor QREG(A15), QREG(A15), QREG(T0)
+
+ C D1 = C0 ^ (C2 <<< 1)
+ C D2 = C1 ^ (C3 <<< 1)
+ ROL(T0, C2, 1)
+ ROL(T1, C3, 1)
+ veor T0, T0, C0
+ veor T1, T1, C1
+ veor QREG(A1), QREG(A1), QREG(T0)
+ veor QREG(A6), QREG(A6), QREG(T0)
+ veor QREG(A11), QREG(A11), QREG(T0)
+ veor QREG(A16), QREG(A16), QREG(T0)
+ veor QREG(A21), QREG(A21), QREG(T0)
+
+ C D3 = C2 ^ (C4 <<< 1)
+ C D4 = C3 ^ (C0 <<< 1)
+ ROL(T0, C4, 1)
+ ROL(T1, C0, 1)
+ veor T0, T0, C2
+ veor T1, T1, C3
+ veor QREG(A3), QREG(A3), QREG(T0)
+ veor QREG(A8), QREG(A8), QREG(T0)
+ veor QREG(A13), QREG(A13), QREG(T0)
+ veor QREG(A18), QREG(A18), QREG(T0)
+ veor QREG(A23), QREG(A23), QREG(T0)
+
+ ROL( T0, A1, 1)
+ ROL( A1, A6, 44)
+ ROL( A6, A9, 20)
+ ROL( A9, A22, 61)
+ ROL(A22, A14, 39)
+ ROL(A14, A20, 18)
+ ROL(A20, A2, 62)
+ ROL( A2, A12, 43)
+ ROL(A12, A13, 25)
+ ROL(A13, A19, 8)
+ ROL(A19, A23, 56)
+ ROL(A23, A15, 41)
+ ROL(A15, A4, 27)
+ ROL( A4, A24, 14)
+ ROL(A24, A21, 2)
+ ROL(A21, A8, 55)
+ ROL( A8, A16, 45)
+ ROL(A16, A5, 36)
+ ROL( A5, A3, 28)
+ ROL( A3, A18, 21)
+ ROL(A18, A17, 15)
+ ROL(A17, A11, 10)
+ ROL(A11, A7, 6)
+ ROL( A7, A10, 3)
+ C New A10 value left in T0
+
+ vbic C0, A2, A1
+ vbic C1, A3, A2
+ vbic C2, A4, A3
+ vbic C3, A0, A4
+ vbic C4, A1, A0
+
+ veor A0, A0, C0
+ vld1.64 {C0}, [RC :64]!
+ veor QREG(A1), QREG(A1), QREG(C1)
+ veor QREG(A3), QREG(A3), QREG(C3)
+ veor A0, A0, C0
+
+ vbic C0, A7, A6
+ vbic C1, A8, A7
+ vbic C2, A9, A8
+ vbic C3, A5, A9
+ vbic C4, A6, A5
+
+ veor A5, A5, C0
+ veor QREG(A6), QREG(A6), QREG(C1)
+ veor QREG(A8), QREG(A8), QREG(C3)
+
+ vbic C0, A12, A11
+ vbic C1, A13, A12
+ vbic C2, A14, A13
+ vbic C3, T0, A14
+ vbic C4, A11, T0
+
+ veor A10, T0, C0
+ veor QREG(A11), QREG(A11), QREG(C1)
+ veor QREG(A13), QREG(A13), QREG(C3)
+
+ vbic C0, A17, A16
+ vbic C1, A18, A17
+ vbic C2, A19, A18
+ vbic C3, A15, A19
+ vbic C4, A16, A15
+
+ veor A15, A15, C0
+ veor QREG(A16), QREG(A16), QREG(C1)
+ veor QREG(A18), QREG(A18), QREG(C3)
+
+ vbic C0, A22, A21
+ vbic C1, A23, A22
+ vbic C2, A24, A23
+ vbic C3, A20, A24
+ vbic C4, A21, A20
+
+ subs COUNT, COUNT, #1
+ veor A20, A20, C0
+ veor QREG(A21), QREG(A21), QREG(C1)
+ veor QREG(A23), QREG(A23), QREG(C3)
+
+ bne .Loop
+
+ vst1.64 {A0}, [CTX]!
+ vstm CTX!, {A1,A2,A3,A4}
+ vst1.64 {A5}, [CTX]!
+ vstm CTX!, {A6,A7,A8,A9}
+ vst1.64 {A10}, [CTX]!
+ vstm CTX!, {A11,A12,A13,A14}
+ vst1.64 {A15}, [CTX]!
+ vstm CTX!, {A16,A17,A18,A19}
+ vst1.64 {A20}, [CTX]!
+ vstm CTX, {A21,A22,A23,A24}
+
+ vpop {d8-d15}
+ bx lr
+EPILOGUE(nettle_sha3_permute)
diff --git a/arm/neon/sha512-compress.asm b/arm/neon/sha512-compress.asm
new file mode 100644
index 00000000..ac2b4382
--- /dev/null
+++ b/arm/neon/sha512-compress.asm
@@ -0,0 +1,317 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+ .file "sha512-compress.asm"
+ .fpu neon
+
+define(<STATE>, <r0>)
+define(<INPUT>, <r1>)
+define(<K>, <r2>)
+define(<COUNT>, <r3>)
+define(<SHIFT>, <r12>)
+
+define(<SA>, <d0>)
+define(<SB>, <d1>)
+define(<SC>, <d2>)
+define(<SD>, <d3>)
+define(<SE>, <d4>)
+define(<SF>, <d5>)
+define(<SG>, <d6>)
+define(<SH>, <d7>)
+define(<QSAB>, <q0>)
+define(<QSCD>, <q1>)
+define(<QSEF>, <q2>)
+define(<QSGH>, <q3>)
+
+C d8-d15 are callee-save
+define(<DT0>, <d8>)
+define(<DT1>, <d9>)
+define(<QT01>, <q4>)
+define(<DT2>, <d10>)
+define(<DT3>, <d11>)
+define(<QT23>, <q5>)
+define(<DT4>, <d12>)
+define(<DT5>, <d13>)
+define(<QT45>, <q6>)
+
+C Used only when reading the input, can overlap with state
+define(<DT6>, <d0>)
+define(<DT7>, <d1>)
+define(<QT67>, <q0>)
+
+define(<DW0>, <d16>)
+define(<DW1>, <d17>)
+define(<DW2>, <d18>)
+define(<DW3>, <d19>)
+define(<DW4>, <d20>)
+define(<DW5>, <d21>)
+define(<DW6>, <d22>)
+define(<DW7>, <d23>)
+define(<DW8>, <d24>)
+define(<DW9>, <d25>)
+define(<DW10>, <d26>)
+define(<DW11>, <d27>)
+define(<DW12>, <d28>)
+define(<DW13>, <d29>)
+define(<DW14>, <d30>)
+define(<DW15>, <d31>)
+define(<QW0001>, <q8>)
+define(<QW0203>, <q9>)
+define(<QW0405>, <q10>)
+define(<QW0607>, <q11>)
+define(<QW0809>, <q12>)
+define(<QW1011>, <q13>)
+define(<QW1213>, <q14>)
+define(<QW1415>, <q15>)
+
+define(<EXPAND_ME>, <$1>)
+define(<W>, <EXPAND_ME(<DW>eval(($1) % 16))>)
+
+C If x = W(i+14), y = W(i+1), we xor in parallel
+C
+C x << 45 y << 63
+C x >> 19 y >> 1
+C x << 3 y << 56
+C x >> 61 y >> 8
+C xor x >> 6 y >> 7
+C -----------------------------
+C DT0 DT1
+define(<EXPN>, <
+ vshl.i64 DT0, W($1+14), #45
+ vshl.i64 DT1, W($1 + 1), #63
+ vshr.u64 DT2, W($1+14), #19
+ vshr.u64 DT3, W($1 + 1), #1
+ vshl.i64 DT4, W($1+14), #3
+ vshl.i64 DT5, W($1 + 1), #56
+ veor.i64 QT01, QT01, QT23
+ vshr.u64 DT2, W($1+14), #61
+ vshr.u64 DT3, W($1 + 1), #8
+ veor.i64 QT01, QT01, QT45
+ vshr.u64 DT4, W($1+14), #6
+ vshr.u64 DT5, W($1 + 1), #7
+ veor.i64 QT01, QT01, QT23
+ vadd.i64 W($1), W($1), W($1 + 9)
+ veor.i64 QT01, QT01, QT45
+ vadd.i64 W($1), W($1), DT0
+ vadd.i64 W($1), W($1), DT1
+>)
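
Not part of the commit: the message-expansion step that EXPN implements, as a C sketch. The 16-entry window w[] is indexed modulo 16, so W($1+14) is W[t-2], W($1+1) is W[t-15] and W($1+9) is W[t-7]; the shift pairs in the comment above are the rotates of the SHA-512 sigma functions. Names are invented.

  #include <stdint.h>

  #define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))

  static void
  sha512_expand_sketch (uint64_t w[16], unsigned t)
  {
    uint64_t x = w[(t + 14) % 16];                              /* W[t-2]  */
    uint64_t y = w[(t + 1) % 16];                               /* W[t-15] */
    uint64_t s1 = ROTR64 (x, 19) ^ ROTR64 (x, 61) ^ (x >> 6);
    uint64_t s0 = ROTR64 (y, 1) ^ ROTR64 (y, 8) ^ (y >> 7);
    w[t % 16] += w[(t + 9) % 16] + s0 + s1;                     /* += W[t-7] */
  }
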
+
+C ROUND(A,B,C,D,E,F,G,H,i)
+C
+C H += S1(E) + Choice(E,F,G) + K + W
+C D += H
+C H += S0(A) + Majority(A,B,C)
+C
+C Where
+C
+C S1(E) = E<<<50 ^ E<<<46 ^ E<<<23
+C S0(A) = A<<<36 ^ A<<<30 ^ A<<<25
+C Choice (E, F, G) = G^(E&(F^G))
+C Majority (A,B,C) = (A&B) + (C&(A^B))
+
+C Do S1 and S0 in parallel
+C
+C e << 50 a << 36
+C e >> 14 a >> 28
+C e << 46 a << 30
+C e >> 18 a >> 34
+C e << 23 a << 25
+C xor e >> 41 a >> 39
+C ----------------------------
+C DT0 DT1
+define(<ROUND>, <
+ vshl.i64 DT0, $5, #50
+ vshl.i64 DT1, $1, #36
+ vshr.u64 DT2, $5, #14
+ vshr.u64 DT3, $1, #28
+ vshl.i64 DT4, $5, #46
+ vshl.i64 DT5, $1, #30
+ veor QT01, QT01, QT23
+ vshr.u64 DT2, $5, #18
+ vshr.u64 DT3, $1, #34
+ veor QT01, QT01, QT45
+ vshl.i64 DT4, $5, #23
+ vshl.i64 DT5, $1, #25
+ veor QT01, QT01, QT23
+ vshr.u64 DT2, $5, #41
+ vshr.u64 DT3, $1, #39
+ veor QT01, QT01, QT45
+ veor DT4, $6, $7
+ veor DT5, $1, $2
+ vand DT4, DT4, $5
+ vand DT5, DT5, $3
+ veor DT4, DT4, $7
+ veor QT01, QT01, QT23
+ vand DT2, $1, $2
+ vldr DT3, [K,#eval(8*$9)]
+ vadd.i64 $8, $8, W($9)
+ vadd.i64 QT01, QT01, QT45
+ vadd.i64 $8, $8, DT3
+ vadd.i64 $8, $8, DT0
+ vadd.i64 DT1, DT1, DT2
+ vadd.i64 $4, $4, $8
+ vadd.i64 $8, $8, DT1
+>)
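
Not part of the commit: the same round written out in C. The rotate amounts in the comment above are left rotates; expressed as right rotates they are the usual SHA-512 constants (14, 18, 41 for E and 28, 34, 39 for A). Majority uses + rather than xor because the two masked terms never have a bit set in the same position. Names are invented.

  #include <stdint.h>

  #define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))

  static void
  sha512_round_sketch (uint64_t a, uint64_t b, uint64_t c, uint64_t *d,
                       uint64_t e, uint64_t f, uint64_t g, uint64_t *h,
                       uint64_t k, uint64_t w)
  {
    uint64_t s1 = ROTR64 (e, 14) ^ ROTR64 (e, 18) ^ ROTR64 (e, 41);
    uint64_t ch = g ^ (e & (f ^ g));
    uint64_t s0 = ROTR64 (a, 28) ^ ROTR64 (a, 34) ^ ROTR64 (a, 39);
    uint64_t maj = (a & b) + (c & (a ^ b));

    *h += s1 + ch + k + w;
    *d += *h;
    *h += s0 + maj;
  }
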
+
+ C void
+ C _nettle_sha512_compress(uint64_t *state, const uint8_t *input, const uint64_t *k)
+
+ .text
+ .align 2
+
+PROLOGUE(_nettle_sha512_compress)
+ vpush {d8,d9,d10,d11,d12,d13}
+
+ ands SHIFT, INPUT, #7
+ and INPUT, INPUT, #-8
+ vld1.8 {DT5}, [INPUT :64]
+ addne INPUT, INPUT, #8
+ addeq SHIFT, SHIFT, #8
+ lsl SHIFT, SHIFT, #3
+
+ C Put right shift in DT0 and DT1, aka QT01
+ neg SHIFT, SHIFT
+ vmov.i32 DT0, #0
+ vmov.32 DT0[0], SHIFT
+ vmov DT1, DT0
+ C Put left shift in DT2 and DT3, aka QT23
+ add SHIFT, SHIFT, #64
+ vmov.i32 DT2, #0
+ vmov.32 DT2[0], SHIFT
+ vmov DT3, DT2
+ vshl.u64 DT5, DT5, DT0
+
+ C Set w[i] <-- w[i-1] >> RSHIFT + w[i] << LSHIFT
+ vld1.8 {W(0),W(1),W(2),W(3)}, [INPUT :64]!
+ vshl.u64 QT67, QW0001, QT01 C Right shift
+ vshl.u64 QW0001, QW0001, QT23 C Left shift
+ veor W(0), W(0), DT5
+ veor W(1), W(1), DT6
+ vrev64.8 QW0001, QW0001
+ vshl.u64 QT45, QW0203, QT01 C Right shift
+ vshl.u64 QW0203, QW0203, QT23 C Left shift
+ veor W(2), W(2), DT7
+ veor W(3), W(3), DT4
+ vrev64.8 QW0203, QW0203
+
+ vld1.8 {W(4),W(5),W(6),W(7)}, [INPUT :64]!
+ vshl.u64 QT67, QW0405, QT01 C Right shift
+ vshl.u64 QW0405, QW0405, QT23 C Left shift
+ veor W(4), W(4), DT5
+ veor W(5), W(5), DT6
+ vrev64.8 QW0405, QW0405
+ vshl.u64 QT45, QW0607, QT01 C Right shift
+ vshl.u64 QW0607, QW0607, QT23 C Left shift
+ veor W(6), W(6), DT7
+ veor W(7), W(7), DT4
+ vrev64.8 QW0607, QW0607
+
+ vld1.8 {W(8),W(9),W(10),W(11)}, [INPUT :64]!
+ vshl.u64 QT67, QW0809, QT01 C Right shift
+ vshl.u64 QW0809, QW0809, QT23 C Left shift
+ veor W(8), W(8), DT5
+ veor W(9), W(9), DT6
+ vrev64.8 QW0809, QW0809
+ vshl.u64 QT45, QW1011, QT01 C Right shift
+ vshl.u64 QW1011, QW1011, QT23 C Left shift
+ veor W(10), W(10), DT7
+ veor W(11), W(11), DT4
+ vrev64.8 QW1011, QW1011
+
+ vld1.8 {W(12),W(13),W(14),W(15)}, [INPUT :64]!
+ vshl.u64 QT67, QW1213, QT01 C Right shift
+ vshl.u64 QW1213, QW1213, QT23 C Left shift
+ veor W(12), W(12), DT5
+ veor W(13), W(13), DT6
+ vrev64.8 QW1213, QW1213
+ vshl.u64 QT45, QW1415, QT01 C Right shift
+ vshl.u64 QW1415, QW1415, QT23 C Left shift
+ veor W(14), W(14), DT7
+ veor W(15), W(15), DT4
+ vrev64.8 QW1415, QW1415
+
+ vldm STATE, {SA,SB,SC,SD,SE,SF,SG,SH}
+
+ ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 0)
+ ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 1)
+ ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 2)
+ ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 3)
+ ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 4)
+ ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 5)
+ ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 6)
+ ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 7)
+
+ ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 8)
+ ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 9)
+ ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 10)
+ ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 11)
+ ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 12)
+ ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 13)
+ ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 14)
+ ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 15)
+
+ add K, K, #128
+
+ mov COUNT, #4
+.Loop:
+
+ EXPN( 0) ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 0)
+ EXPN( 1) ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 1)
+ EXPN( 2) ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 2)
+ EXPN( 3) ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 3)
+ EXPN( 4) ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 4)
+ EXPN( 5) ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 5)
+ EXPN( 6) ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 6)
+ EXPN( 7) ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 7)
+ EXPN( 8) ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 8)
+ EXPN( 9) ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 9)
+ EXPN(10) ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 10)
+ EXPN(11) ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 11)
+ EXPN(12) ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 12)
+ EXPN(13) ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 13)
+ EXPN(14) ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 14)
+ subs COUNT, COUNT, #1
+ EXPN(15) ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 15)
+ add K, K, #128
+ bne .Loop
+
+ vld1.64 {DW0, DW1, DW2, DW3}, [STATE]
+ vadd.i64 QSAB, QSAB, QW0001
+ vadd.i64 QSCD, QSCD, QW0203
+ vst1.64 {SA,SB,SC,SD}, [STATE]!
+ vld1.64 {DW0, DW1, DW2, DW3}, [STATE]
+ vadd.i64 QSEF, QSEF, QW0001
+ vadd.i64 QSGH, QSGH, QW0203
+ vst1.64 {SE,SF,SG,SH}, [STATE]!
+
+ vpop {d8,d9,d10,d11,d12,d13}
+ bx lr
+EPILOGUE(_nettle_sha512_compress)
+
+divert(-1)
+define shastate
+p/x $d0.u64
+p/x $d1.u64
+p/x $d2.u64
+p/x $d3.u64
+p/x $d4.u64
+p/x $d5.u64
+p/x $d6.u64
+p/x $d7.u64
+end
diff --git a/arm/neon/umac-nh-n.asm b/arm/neon/umac-nh-n.asm
new file mode 100644
index 00000000..4ae876b5
--- /dev/null
+++ b/arm/neon/umac-nh-n.asm
@@ -0,0 +1,298 @@
+C nettle, low-level cryptographic library
+C
+C Copyright (C) 2013 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+	.file "umac-nh-n.asm"
+ .fpu neon
+
+define(<OUT>, <r0>)
+define(<ITERS>, <r1>)
+define(<KEY>, <r2>)
+define(<LENGTH>, <r3>)
+define(<MSG>, <r12>)
+define(<SHIFT>, <r14>)
+
+define(<QA>, <q0>)
+define(<QB>, <q1>)
+define(<QY0>, <q3>)	C Accumulates Y for the first two iterations.
+define(<DM>, <d4>)
+define(<QY1>, <q4>)	C Used in the 3- and 4-iteration cases.
+define(<QC>, <q5>)
+define(<QD>, <q6>)
+define(<QLEFT>, <q8>)
+define(<QRIGHT>, <q9>)
+define(<QT0>, <q10>)
+define(<QT1>, <q11>)
+define(<QT2>, <q12>)
+define(<QK0>, <q13>)
+define(<QK1>, <q14>)
+define(<QK2>, <q15>)
+
+C FIXME: Try permuting subkeys using vld4, vzip or similar.
+
+ .text
+ .align 3
+
+PROLOGUE(_nettle_umac_nh_n)
+ ldr MSG, [sp]
+ str lr, [sp, #-4]!
+
+ C Setup for 64-bit aligned reads
+ ands SHIFT, MSG, #7
+ and MSG, MSG, #-8
+ vld1.8 {DM}, [MSG :64]
+ addne MSG, MSG, #8
+ addeq SHIFT, SHIFT, #8
+
+ C FIXME: Combine as rsb ?
+ lsl SHIFT, SHIFT, #3
+ neg SHIFT, SHIFT
+
+ C Right shift in QRIGHT (both halves)
+ vmov.i32 D0REG(QRIGHT)[0], SHIFT
+ vmov.32 D1REG(QRIGHT), D0REG(QRIGHT)
+ add SHIFT, SHIFT, #64
+
+ vmov.i32 D0REG(QLEFT)[0], SHIFT
+ vmov.32 D1REG(QLEFT), D0REG(QLEFT)
+	cmp	ITERS, #3
+ vmov.i64 QY0, #0
+
+ vshl.u64 DM, DM, D0REG(QRIGHT)
+ bcc .Lnh2
+ beq .Lnh3
+
+.Lnh4:
+ C Permute key words, so we in each iteration have them in order
+ C
+	C P0: [ 0, 4, 1, 5] P1: [ 2, 6, 3, 7] P2: [ 4, 8, 5, 9] P3: [ 6,10, 7,11]
+	C P4: [ 8,12, 9,13] P5: [10,14,11,15] P6: [12,16,13,17] P7: [14,18,15,19]
+ C
+ C Also arrange the message words, so we get them as
+	C M0: [ 0, 0, 1, 1] M1: [ 2, 2, 3, 3] M2: [ 4, 4, 5, 5] M3: [ 6, 6, 7, 7]
+	C M4: [ 8, 8, 9, 9] M5: [10,10,11,11] M6: [12,12,13,13] M7: [14,14,15,15]
+ C
+ C Then, accumulate Y0 (first two "iters") using
+ C
+ C Y0 += (M0+P0) * (M2+P2) + (M1+P1) * (M3+P3)
+ C Y1 += (M0+P4) * (M2+P6) + (M1+P5) * (M3+P7)
+ C
+ C Next iteration is then
+ C
+ C Y0 += (M4+P4) * (M6+P6) + (M5+P5) * (M7 + P7)
+	C Y1 += (M4+P8) * (M6+P10) + (M5+P9) * (M7+P11)
+ C
+ C So we can reuse P4, P5, P6, P7 from the previous iteration.
+
+	C How do we fit this in registers? We need 4 Q regs for P0-P3, and one
+	C more for the last read key. We need at least two registers
+ C for the message (QA and QB, more if we want to expand only
+ C once). For the Y0 update, we can let the factors overwrite
+ C P0-P3, and for the Y1 update, we can overwrite M0-M3.
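+	C
+	C For reference, the whole computation corresponds roughly to the
+	C following C (a sketch assuming the conventional NH definition,
+	C not code taken from nettle's C implementation; m[] is the
+	C message read as 32-bit words, key[] the subkey array):
+	C
+	C   for (i = 0; i < iters; i++) {
+	C     uint64_t y = 0;
+	C     for (j = 0; j < length/4; j += 8)
+	C       for (t = 0; t < 4; t++)
+	C         y += (uint64_t)(uint32_t)(m[j+t] + key[4*i+j+t])
+	C              * (uint32_t)(m[j+4+t] + key[4*i+j+4+t]);
+	C     out[i] = y;
+	C   }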
+
+ vpush {q4,q5,q6}
+ vld1.32 {QK0,QK1}, [KEY]!
+ vld1.32 {QK2}, [KEY]!
+ vmov QT0, QK1
+ vmov QT1, QK2
+
+	C Permute keys. QK2 is untouched, permuted subkeys put in QK0,QK1,QT0,QT1
+ vtrn.32 QK0, QK1 C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
+ vswp D1REG(QK0), D0REG(QK1) C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
+ vtrn.32 QT0, QT1 C Gives us [4,8,6,10] and [5 ,9,7,11]
+ vswp D1REG(QT0), D0REG(QT1) C Gives us [4,8,5, 9] and [6,10,7,11]
+
+ vmov.i64 QY1, #0
+.Loop4:
+ C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
+ vld1.8 {QA, QB}, [MSG :64]!
+ vshl.u64 QC, QA, QRIGHT
+ vshl.u64 QD, QB, QRIGHT
+ vshl.u64 QA, QA, QLEFT
+ vshl.u64 QB, QB, QLEFT
+ veor D0REG(QA), D0REG(QA), DM
+ veor D1REG(QA), D1REG(QA), D0REG(QC)
+ veor D0REG(QB), D0REG(QB), D1REG(QC)
+ veor D1REG(QB), D1REG(QB), D0REG(QD)
+ vmov DM, D1REG(QD)
+
+ C Explode message (too bad there's no vadd with scalar)
+ vdup.32 D1REG(QD), D1REG(QB)[1]
+ vdup.32 D0REG(QD), D1REG(QB)[0]
+ vdup.32 D1REG(QC), D0REG(QB)[1]
+ vdup.32 D0REG(QC), D0REG(QB)[0]
+ vdup.32 D1REG(QB), D1REG(QA)[1]
+ vdup.32 D0REG(QB), D1REG(QA)[0]
+ vdup.32 D1REG(QA), D0REG(QA)[1]
+ vdup.32 D0REG(QA), D0REG(QA)[0]
+
+ vadd.i32 QK0, QK0, QA
+ vadd.i32 QK1, QK1, QB
+ vadd.i32 QT0, QT0, QC
+ vadd.i32 QT1, QT1, QD
+
+ vmlal.u32 QY0, D0REG(QK0), D0REG(QT0)
+ vmlal.u32 QY0, D1REG(QK0), D1REG(QT0)
+ vmlal.u32 QY0, D0REG(QK1), D0REG(QT1)
+ vmlal.u32 QY0, D1REG(QK1), D1REG(QT1)
+
+ C Next 4 subkeys
+ vld1.32 {QT0,QT1}, [KEY]!
+ vmov QK0, QK2
+ vmov QK1, QT0
+ vmov QK2, QT1 C Save
+ vtrn.32 QK0, QK1 C Gives us [8,12,10,14] and [9,13,11,15]
+ vswp D1REG(QK0), D0REG(QK1) C Gives us [8,12,9,13] and [10,14,11,15]
+ vtrn.32 QT0, QT1 C Gives us [12,16,14,18] and [13,17,15,19]
+ vswp D1REG(QT0), D0REG(QT1) C Gives us [12,16,13,17] and [14,18,15,19]
+
+ vadd.i32 QA, QA, QK0
+ vadd.i32 QB, QB, QK1
+ vadd.i32 QC, QC, QT0
+ vadd.i32 QD, QD, QT1
+
+ subs LENGTH, LENGTH, #32
+
+ vmlal.u32 QY1, D0REG(QA), D0REG(QC)
+ vmlal.u32 QY1, D1REG(QA), D1REG(QC)
+ vmlal.u32 QY1, D0REG(QB), D0REG(QD)
+ vmlal.u32 QY1, D1REG(QB), D1REG(QD)
+
+ bhi .Loop4
+
+ vst1.64 {QY0, QY1}, [OUT]
+
+ vpop {q4,q5,q6}
+
+ ldr pc, [sp], #+4
+
+.Lnh3:
+ vpush {q4}
+ vld1.32 {QK0,QK1}, [KEY]!
+ vmov.i64 QY1, #0
+.Loop3:
+ C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
+ vld1.8 {QA, QB}, [MSG :64]!
+ vshl.u64 QT0, QA, QRIGHT
+ vshl.u64 QT1, QB, QRIGHT
+ vshl.u64 QA, QA, QLEFT
+ vshl.u64 QB, QB, QLEFT
+ veor D0REG(QA), D0REG(QA), DM
+ veor D1REG(QA), D1REG(QA), D0REG(QT0)
+ veor D0REG(QB), D0REG(QB), D1REG(QT0)
+ veor D1REG(QB), D1REG(QB), D0REG(QT1)
+ vmov DM, D1REG(QT1)
+
+ vld1.32 {QK2}, [KEY]!
+ C Construct factors, with low half corresponding to first iteration,
+ C and high half corresponding to the second iteration.
+ vmov QT0, QK1
+ vtrn.32 QK0, QT0 C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
+ vswp D1REG(QK0), D0REG(QT0) C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
+ vdup.32 D0REG(QT1), D0REG(QA)[0]
+ vdup.32 D1REG(QT1), D0REG(QA)[1]
+ vadd.i32 QT1, QT1, QK0
+
+ vmov QK0, QK2 C Save for next iteration
+	vtrn.32	QK1, QK2	C Gives us [4, 8, 6, 10] and [5, 9, 7, 11]
+	vswp	D1REG(QK1), D0REG(QK2)	C Gives us [4, 8, 5, 9] and [6, 10, 7, 11]
+
+ vdup.32 D0REG(QT2), D0REG(QB)[0]
+ vdup.32 D1REG(QT2), D0REG(QB)[1]
+ vadd.i32 QK1, QK1, QT2
+ vmlal.u32 QY0, D0REG(QT1), D0REG(QK1)
+ vmlal.u32 QY0, D1REG(QT1), D1REG(QK1)
+
+ vdup.32 D0REG(QT1), D1REG(QA)[0]
+ vdup.32 D1REG(QT1), D1REG(QA)[1]
+ vadd.i32 QT0, QT0, QT1
+ vdup.32 D0REG(QT1), D1REG(QB)[0]
+ vdup.32 D1REG(QT1), D1REG(QB)[1]
+ vadd.i32 QK2, QK2, QT1
+
+ vmlal.u32 QY0, D0REG(QT0), D0REG(QK2)
+ vmlal.u32 QY0, D1REG(QT0), D1REG(QK2)
+
+ vld1.32 {QK1}, [KEY]!
+ vadd.i32 QA, QA, QK0
+ vadd.i32 QB, QB, QK1
+ subs LENGTH, LENGTH, #32
+ vmlal.u32 QY1, D0REG(QA), D0REG(QB)
+ vmlal.u32 QY1, D1REG(QA), D1REG(QB)
+ bhi .Loop3
+
+ vadd.i64 D0REG(QY1), D0REG(QY1), D1REG(QY1)
+ vst1.64 {D0REG(QY0), D1REG(QY0), D0REG(QY1)}, [OUT]
+
+ vpop {q4}
+
+ ldr pc, [sp], #+4
+
+.Lnh2:
+ vld1.32 {QK0}, [KEY]!
+.Loop2:
+ C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
+ vld1.8 {QA, QB}, [MSG :64]!
+ vshl.u64 QT0, QA, QRIGHT
+ vshl.u64 QT1, QB, QRIGHT
+ vshl.u64 QA, QA, QLEFT
+ vshl.u64 QB, QB, QLEFT
+ veor D0REG(QA), D0REG(QA), DM
+ veor D1REG(QA), D1REG(QA), D0REG(QT0)
+ veor D0REG(QB), D0REG(QB), D1REG(QT0)
+ veor D1REG(QB), D1REG(QB), D0REG(QT1)
+ vmov DM, D1REG(QT1)
+
+ vld1.32 {QK1,QK2}, [KEY]!
+ C Construct factors, with low half corresponding to first iteration,
+ C and high half corresponding to the second iteration.
+ vmov QT0, QK1
+ vtrn.32 QK0, QT0 C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
+ vswp D1REG(QK0), D0REG(QT0) C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
+ vdup.32 D0REG(QT1), D0REG(QA)[0]
+ vdup.32 D1REG(QT1), D0REG(QA)[1]
+ vadd.i32 QT1, QT1, QK0
+
+ vmov QK0, QK2 C Save for next iteration
+ vtrn.32 QK1, QK2 C Gives us [4, 8, 6, 10] and [5, 9, 7, 11]
+ vswp D1REG(QK1), D0REG(QK2) C Gives us [4, 8, 5, 9] and [6, 10, 7, 11]
+
+ vdup.32 D0REG(QT2), D0REG(QB)[0]
+ vdup.32 D1REG(QT2), D0REG(QB)[1]
+ vadd.i32 QK1, QK1, QT2
+ vmlal.u32 QY0, D0REG(QT1), D0REG(QK1)
+ vmlal.u32 QY0, D1REG(QT1), D1REG(QK1)
+
+ vdup.32 D0REG(QT1), D1REG(QA)[0]
+ vdup.32 D1REG(QT1), D1REG(QA)[1]
+ vadd.i32 QT0, QT0, QT1
+ vdup.32 D0REG(QT1), D1REG(QB)[0]
+ vdup.32 D1REG(QT1), D1REG(QB)[1]
+ vadd.i32 QK2, QK2, QT1
+
+ subs LENGTH, LENGTH, #32
+
+ vmlal.u32 QY0, D0REG(QT0), D0REG(QK2)
+ vmlal.u32 QY0, D1REG(QT0), D1REG(QK2)
+
+ bhi .Loop2
+ vst1.64 {QY0}, [OUT]
+
+.Lend:
+ ldr pc, [sp], #+4
+EPILOGUE(_nettle_umac_nh_n)
diff --git a/arm/neon/umac-nh.asm b/arm/neon/umac-nh.asm
new file mode 100644
index 00000000..87cb86d0
--- /dev/null
+++ b/arm/neon/umac-nh.asm
@@ -0,0 +1,89 @@
+C nettle, low-level cryptographic library
+C
+C Copyright (C) 2013 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+ .file "umac-nh.asm"
+ .fpu neon
+
+define(<KEY>, <r0>)
+define(<LENGTH>, <r1>)
+define(<MSG>, <r2>)
+define(<SHIFT>, <r3>)
+
+define(<QA>, <q0>)
+define(<QB>, <q1>)
+define(<DM>, <d16>)
+define(<QLEFT>, <q9>)
+define(<QRIGHT>, <q10>)
+define(<QY>, <q11>)
+define(<QT0>, <q12>)
+define(<QT1>, <q13>)
+define(<QK0>, <q14>)
+define(<QK1>, <q15>)
+
+ .text
+ .align 3
+
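+C For reference, the single NH pass computed here corresponds roughly to
+C the following C (a sketch, not code taken from nettle's C
+C implementation; m[] is the message read as 32-bit words):
+C
+C   uint64_t y = 0;
+C   for (j = 0; j < length/4; j += 8)
+C     for (t = 0; t < 4; t++)
+C       y += (uint64_t)(uint32_t)(m[j+t] + key[j+t])
+C            * (uint32_t)(m[j+4+t] + key[j+4+t]);
+C   return y;
+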
+PROLOGUE(_nettle_umac_nh)
+ C Setup for 64-bit aligned reads
+ ands SHIFT, MSG, #7
+ and MSG, MSG, #-8
+ vld1.8 {DM}, [MSG :64]
+ addne MSG, MSG, #8
+ addeq SHIFT, SHIFT, #8
+
+ C FIXME: Combine as rsb ?
+ lsl SHIFT, SHIFT, #3
+ neg SHIFT, SHIFT
+
+ C Right shift in QRIGHT (both halves)
+ vmov.i32 D0REG(QRIGHT)[0], SHIFT
+ vmov.32 D1REG(QRIGHT), D0REG(QRIGHT)
+ add SHIFT, SHIFT, #64
+
+ vmov.i32 D0REG(QLEFT)[0], SHIFT
+ vmov.32 D1REG(QLEFT), D0REG(QLEFT)
+
+ vmov.i64 QY, #0
+
+ vshl.u64 DM, DM, D0REG(QRIGHT)
+.Loop:
+ C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
+ vld1.8 {QA, QB}, [MSG :64]!
+ vshl.u64 QT0, QA, QRIGHT
+ vshl.u64 QT1, QB, QRIGHT
+ vshl.u64 QA, QA, QLEFT
+ vshl.u64 QB, QB, QLEFT
+ veor D0REG(QA), D0REG(QA), DM
+ veor D1REG(QA), D1REG(QA), D0REG(QT0)
+ veor D0REG(QB), D0REG(QB), D1REG(QT0)
+ veor D1REG(QB), D1REG(QB), D0REG(QT1)
+ vmov DM, D1REG(QT1)
+
+ vld1.i32 {QK0, QK1}, [KEY]!
+ vadd.i32 QA, QA, QK0
+ vadd.i32 QB, QB, QK1
+ subs LENGTH, LENGTH, #32
+ vmlal.u32 QY, D0REG(QA), D0REG(QB)
+ vmlal.u32 QY, D1REG(QA), D1REG(QB)
+ bhi .Loop
+
+ vadd.i64 D0REG(QY), D0REG(QY), D1REG(QY)
+ vmov r0, r1, D0REG(QY)
+ bx lr
+EPILOGUE(_nettle_umac_nh)
diff --git a/arm/sha1-compress.asm b/arm/sha1-compress.asm
new file mode 100644
index 00000000..69c30e42
--- /dev/null
+++ b/arm/sha1-compress.asm
@@ -0,0 +1,234 @@
+C nettle, low-level cryptographic library
+C
+C Copyright (C) 2013 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+ .file "sha1-compress.asm"
+
+define(<STATE>, <r0>)
+define(<INPUT>, <r1>)
+define(<SA>, <r2>)
+define(<SB>, <r3>)
+define(<SC>, <r4>)
+define(<SD>, <r5>)
+define(<SE>, <r6>)
+define(<T0>, <r7>)
+define(<SHIFT>, <r8>)
+define(<WPREV>, <r10>)
+define(<W>, <r12>)
+define(<K>, <lr>)
+
+C FIXME: Could avoid a mov with even and odd variants.
+define(<LOAD>, <
+ ldr T0, [INPUT], #+4
+ sel W, WPREV, T0
+ ror W, W, SHIFT
+ mov WPREV, T0
+ rev W, W
+ str W, [SP,#eval(4*$1)]
+>)
+define(<EXPN>, <
+ ldr W, [sp, #+eval(4*$1)]
+ ldr T0, [sp, #+eval(4*(($1 + 2) % 16))]
+ eor W, W, T0
+ ldr T0, [sp, #+eval(4*(($1 + 8) % 16))]
+ eor W, W, T0
+ ldr T0, [sp, #+eval(4*(($1 + 13) % 16))]
+ eor W, W, T0
+ ror W, W, #31
+ str W, [sp, #+eval(4*$1)]
+>)
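+
+C The EXPN macro implements the usual in-place SHA-1 message schedule,
+C roughly this C (a sketch; ROTL32 is an assumed rotate-left helper):
+C
+C   w[i] ^= w[(i+2) % 16] ^ w[(i+8) % 16] ^ w[(i+13) % 16];
+C   w[i] = ROTL32(w[i], 1);   /* ror #31 above == rotate left by 1 */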
+
+C F1(B,C,D) = D^(B&(C^D))
+C ROUND1(A,B,C,D,E)
+define(<ROUND1>, <
+ eor T0, $3, $4
+ add $5, $5, K
+ and T0, T0, $2
+ add $5, $5, $1, ror #27
+ eor T0, T0, $4
+ add $5, $5, W
+ ror $2, $2, #2
+ add $5, $5, T0
+>)
+C F2(B,C,D) = B^C^D
+define(<ROUND2>, <
+ eor T0, $2, $4
+ add $5, $5, K
+ eor T0, T0, $3
+ add $5, $5, $1, ror #27
+ add $5, $5, W
+ ror $2, $2, #2
+ add $5, $5, T0
+>)
+C F3(B,C,D) = (B&C) | (D & (B|C)) = (B & (C ^ D)) + (C & D)
+define(<ROUND3>, <
+ eor T0, $3, $4
+ add $5, $5, K
+ and T0, T0, $2
+ add $5, $5, $1, ror #27
+ add $5, $5, T0
+ add $5, $5, W
+ and T0, $3, $4
+ ror $2, $2, #2
+ add $5, $5, T0
+>)
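+
+C With B rotated in place as above, each round corresponds roughly to
+C this C (a sketch; F is one of F1, F2, F3, and ROTL32 an assumed helper):
+C
+C   e += ROTL32(a, 5) + F(b, c, d) + k + w;
+C   b = ROTL32(b, 30);
+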
+ C void _nettle_sha1_compress(uint32_t *state, const uint8_t *input)
+
+ .text
+ .align 2
+.LK1:
+ .int 0x5A827999
+.LK2:
+ .int 0x6ED9EBA1
+.LK3:
+ .int 0x8F1BBCDC
+
+PROLOGUE(_nettle_sha1_compress)
+ push {r4,r5,r6,r7,r8,r10,lr}
+ sub sp, sp, #64
+
+ C Sets SHIFT to 8*low bits of input pointer. Sets up GE flags
+ C as follows, corresponding to bytes to be used from WPREV
+ C SHIFT 0 8 16 24
+ C CPSR.GE 0000 1110 1100 1000
+ ands SHIFT, INPUT, #3
+	and	INPUT, INPUT, #-4
+ ldr WPREV, [INPUT]
+ addne INPUT, INPUT, #4 C Unaligned input
+ lsl SHIFT, SHIFT, #3
+ mov T0, #0
+ movne T0, #-1
+ lsl W, T0, SHIFT
+ uadd8 T0, T0, W C Sets APSR.GE bits
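+	C The LOAD macro then reconstructs each big-endian input word
+	C roughly as
+	C   w = rev(ror(sel(wprev, wnew), 8*(input & 3)))
+	C where sel keeps the WPREV bytes flagged by the GE bits set up
+	C above (a sketch of the trick, not separate C code in nettle).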
+
+ ldr K, .LK1
+ ldm STATE, {SA,SB,SC,SD,SE}
+
+ LOAD( 0) ROUND1(SA, SB, SC, SD, SE)
+ LOAD( 1) ROUND1(SE, SA, SB, SC, SD)
+ LOAD( 2) ROUND1(SD, SE, SA, SB, SC)
+ LOAD( 3) ROUND1(SC, SD, SE, SA, SB)
+ LOAD( 4) ROUND1(SB, SC, SD, SE, SA)
+
+ LOAD( 5) ROUND1(SA, SB, SC, SD, SE)
+ LOAD( 6) ROUND1(SE, SA, SB, SC, SD)
+ LOAD( 7) ROUND1(SD, SE, SA, SB, SC)
+ LOAD( 8) ROUND1(SC, SD, SE, SA, SB)
+ LOAD( 9) ROUND1(SB, SC, SD, SE, SA)
+
+ LOAD(10) ROUND1(SA, SB, SC, SD, SE)
+ LOAD(11) ROUND1(SE, SA, SB, SC, SD)
+ LOAD(12) ROUND1(SD, SE, SA, SB, SC)
+ LOAD(13) ROUND1(SC, SD, SE, SA, SB)
+ LOAD(14) ROUND1(SB, SC, SD, SE, SA)
+
+ LOAD(15) ROUND1(SA, SB, SC, SD, SE)
+ EXPN( 0) ROUND1(SE, SA, SB, SC, SD)
+ EXPN( 1) ROUND1(SD, SE, SA, SB, SC)
+ EXPN( 2) ROUND1(SC, SD, SE, SA, SB)
+ EXPN( 3) ROUND1(SB, SC, SD, SE, SA)
+
+ ldr K, .LK2
+ EXPN( 4) ROUND2(SA, SB, SC, SD, SE)
+ EXPN( 5) ROUND2(SE, SA, SB, SC, SD)
+ EXPN( 6) ROUND2(SD, SE, SA, SB, SC)
+ EXPN( 7) ROUND2(SC, SD, SE, SA, SB)
+ EXPN( 8) ROUND2(SB, SC, SD, SE, SA)
+
+ EXPN( 9) ROUND2(SA, SB, SC, SD, SE)
+ EXPN(10) ROUND2(SE, SA, SB, SC, SD)
+ EXPN(11) ROUND2(SD, SE, SA, SB, SC)
+ EXPN(12) ROUND2(SC, SD, SE, SA, SB)
+ EXPN(13) ROUND2(SB, SC, SD, SE, SA)
+
+ EXPN(14) ROUND2(SA, SB, SC, SD, SE)
+ EXPN(15) ROUND2(SE, SA, SB, SC, SD)
+ EXPN( 0) ROUND2(SD, SE, SA, SB, SC)
+ EXPN( 1) ROUND2(SC, SD, SE, SA, SB)
+ EXPN( 2) ROUND2(SB, SC, SD, SE, SA)
+
+ EXPN( 3) ROUND2(SA, SB, SC, SD, SE)
+ EXPN( 4) ROUND2(SE, SA, SB, SC, SD)
+ EXPN( 5) ROUND2(SD, SE, SA, SB, SC)
+ EXPN( 6) ROUND2(SC, SD, SE, SA, SB)
+ EXPN( 7) ROUND2(SB, SC, SD, SE, SA)
+
+ ldr K, .LK3
+ EXPN( 8) ROUND3(SA, SB, SC, SD, SE)
+ EXPN( 9) ROUND3(SE, SA, SB, SC, SD)
+ EXPN(10) ROUND3(SD, SE, SA, SB, SC)
+ EXPN(11) ROUND3(SC, SD, SE, SA, SB)
+ EXPN(12) ROUND3(SB, SC, SD, SE, SA)
+
+ EXPN(13) ROUND3(SA, SB, SC, SD, SE)
+ EXPN(14) ROUND3(SE, SA, SB, SC, SD)
+ EXPN(15) ROUND3(SD, SE, SA, SB, SC)
+ EXPN( 0) ROUND3(SC, SD, SE, SA, SB)
+ EXPN( 1) ROUND3(SB, SC, SD, SE, SA)
+
+ EXPN( 2) ROUND3(SA, SB, SC, SD, SE)
+ EXPN( 3) ROUND3(SE, SA, SB, SC, SD)
+ EXPN( 4) ROUND3(SD, SE, SA, SB, SC)
+ EXPN( 5) ROUND3(SC, SD, SE, SA, SB)
+ EXPN( 6) ROUND3(SB, SC, SD, SE, SA)
+
+ EXPN( 7) ROUND3(SA, SB, SC, SD, SE)
+ EXPN( 8) ROUND3(SE, SA, SB, SC, SD)
+ EXPN( 9) ROUND3(SD, SE, SA, SB, SC)
+ EXPN(10) ROUND3(SC, SD, SE, SA, SB)
+ EXPN(11) ROUND3(SB, SC, SD, SE, SA)
+
+ ldr K, .LK4
+ EXPN(12) ROUND2(SA, SB, SC, SD, SE)
+ EXPN(13) ROUND2(SE, SA, SB, SC, SD)
+ EXPN(14) ROUND2(SD, SE, SA, SB, SC)
+ EXPN(15) ROUND2(SC, SD, SE, SA, SB)
+ EXPN( 0) ROUND2(SB, SC, SD, SE, SA)
+
+ EXPN( 1) ROUND2(SA, SB, SC, SD, SE)
+ EXPN( 2) ROUND2(SE, SA, SB, SC, SD)
+ EXPN( 3) ROUND2(SD, SE, SA, SB, SC)
+ EXPN( 4) ROUND2(SC, SD, SE, SA, SB)
+ EXPN( 5) ROUND2(SB, SC, SD, SE, SA)
+
+ EXPN( 6) ROUND2(SA, SB, SC, SD, SE)
+ EXPN( 7) ROUND2(SE, SA, SB, SC, SD)
+ EXPN( 8) ROUND2(SD, SE, SA, SB, SC)
+ EXPN( 9) ROUND2(SC, SD, SE, SA, SB)
+ EXPN(10) ROUND2(SB, SC, SD, SE, SA)
+
+ EXPN(11) ROUND2(SA, SB, SC, SD, SE)
+ EXPN(12) ROUND2(SE, SA, SB, SC, SD)
+ EXPN(13) ROUND2(SD, SE, SA, SB, SC)
+ EXPN(14) ROUND2(SC, SD, SE, SA, SB)
+ EXPN(15) ROUND2(SB, SC, SD, SE, SA)
+
+ C Use registers we no longer need.
+ ldm STATE, {INPUT,T0,SHIFT,W,K}
+ add SA, SA, INPUT
+ add SB, SB, T0
+ add SC, SC, SHIFT
+ add SD, SD, W
+ add SE, SE, K
+ add sp, sp, #64
+ stm STATE, {SA,SB,SC,SD,SE}
+ pop {r4,r5,r6,r7,r8,r10,pc}
+EPILOGUE(_nettle_sha1_compress)
+
+.LK4:
+ .int 0xCA62C1D6
diff --git a/arm/sha256-compress.asm b/arm/sha256-compress.asm
new file mode 100644
index 00000000..c2aaabd2
--- /dev/null
+++ b/arm/sha256-compress.asm
@@ -0,0 +1,204 @@
+C nettle, low-level cryptographic library
+C
+C Copyright (C) 2013 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+ .file "sha256-compress.asm"
+
+define(<STATE>, <r0>)
+define(<INPUT>, <r1>)
+define(<K>, <r2>)
+define(<SA>, <r3>)
+define(<SB>, <r4>)
+define(<SC>, <r5>)
+define(<SD>, <r6>)
+define(<SE>, <r7>)
+define(<SF>, <r8>)
+define(<SG>, <r10>)
+define(<SH>, <r11>)
+define(<T0>, <r12>)
+define(<T1>, <r1>) C Overlap INPUT
+define(<COUNT>, <r0>) C Overlap STATE
+define(<W>, <r14>)
+
+C Used for data load
+define(<I0>, <r3>)
+define(<I1>, <r4>)
+define(<I2>, <r5>)
+define(<I3>, <r6>)
+define(<I4>, <r7>)
+define(<DST>, <r8>)
+define(<SHIFT>, <r10>)
+define(<ILEFT>, <r11>)
+
+define(<EXPN>, <
+ ldr W, [sp, #+eval(4*$1)]
+ ldr T0, [sp, #+eval(4*(($1 + 14) % 16))]
+ ror T1, T0, #17
+ eor T1, T1, T0, ror #19
+ eor T1, T1, T0, lsr #10
+ add W, W, T1
+ ldr T0, [sp, #+eval(4*(($1 + 9) % 16))]
+ add W, W, T0
+ ldr T0, [sp, #+eval(4*(($1 + 1) % 16))]
+ ror T1, T0, #7
+ eor T1, T1, T0, ror #18
+ eor T1, T1, T0, lsr #3
+ add W, W, T1
+ str W, [sp, #+eval(4*$1)]
+>)
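+
+C This is the usual in-place SHA-256 message schedule, roughly this C
+C (a sketch; ROTR32 is an assumed rotate-right helper):
+C
+C   x = w[(i+1) % 16];  y = w[(i+14) % 16];
+C   w[i] += (ROTR32(y,17) ^ ROTR32(y,19) ^ (y >> 10))
+C           + w[(i+9) % 16]
+C           + (ROTR32(x,7) ^ ROTR32(x,18) ^ (x >> 3));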
+
+C ROUND(A,B,C,D,E,F,G,H)
+C
+C H += S1(E) + Choice(E,F,G) + K + W
+C D += H
+C H += S0(A) + Majority(A,B,C)
+C
+C Where
+C
+C S1(E) = E<<<26 ^ E<<<21 ^ E<<<7
+C S0(A) = A<<<30 ^ A<<<19 ^ A<<<10
+C Choice (E, F, G) = G^(E&(F^G))
+C Majority (A,B,C) = (A&B) + (C&(A^B))
+
+define(<ROUND>, <
+ ror T0, $5, #6
+ eor T0, T0, $5, ror #11
+ eor T0, T0, $5, ror #25
+ add $8, $8, T0
+ eor T0, $6, $7
+ and T0, T0, $5
+ eor T0, T0, $7
+ add $8,$8, T0
+ ldr T0, [K], #+4
+ add $8, $8, W
+ add $8, $8, T0
+ add $4, $4, $8
+ ror T0, $1, #2
+ eor T0, T0, $1, ror #13
+ eor T0, T0, $1, ror #22
+ add $8, $8, T0
+ and T0, $1, $2
+ add $8, $8, T0
+ eor T0, $1, $2
+ and T0, T0, $3
+ add $8, $8, T0
+>)
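+
+C Roughly, in C (a sketch; ROTR32 is an assumed rotate-right helper):
+C
+C   h += (ROTR32(e,6) ^ ROTR32(e,11) ^ ROTR32(e,25))
+C        + (g ^ (e & (f ^ g))) + *k++ + w;
+C   d += h;
+C   h += (ROTR32(a,2) ^ ROTR32(a,13) ^ ROTR32(a,22))
+C        + (a & b) + (c & (a ^ b));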
+
+define(<NOEXPN>, <
+ ldr W, [sp, + $1]
+ add $1, $1, #4
+>)
+ C void
+ C _nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k)
+
+ .text
+ .align 2
+
+PROLOGUE(_nettle_sha256_compress)
+ push {r4,r5,r6,r7,r8,r10,r11,r14}
+ sub sp, sp, #68
+ str STATE, [sp, +#64]
+
+ C Load data up front, since we don't have enough registers
+ C to load and shift on-the-fly
+ ands SHIFT, INPUT, #3
+	and	INPUT, INPUT, #-4
+ ldr I0, [INPUT]
+ addne INPUT, INPUT, #4
+ lsl SHIFT, SHIFT, #3
+ mov T0, #0
+ movne T0, #-1
+ lsl I1, T0, SHIFT
+ uadd8 T0, T0, I1 C Sets APSR.GE bits
+
+ mov DST, sp
+ mov ILEFT, #4
+.Lcopy:
+ ldm INPUT!, {I1,I2,I3,I4}
+ sel I0, I0, I1
+ ror I0, I0, SHIFT
+ rev I0, I0
+ sel I1, I1, I2
+ ror I1, I1, SHIFT
+ rev I1, I1
+ sel I2, I2, I3
+ ror I2, I2, SHIFT
+ rev I2, I2
+ sel I3, I3, I4
+ ror I3, I3, SHIFT
+ rev I3, I3
+ subs ILEFT, ILEFT, #1
+ stm DST!, {I0,I1,I2,I3}
+ mov I0, I4
+ bne .Lcopy
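+
+	C The stack now holds the 16 message words converted to host
+	C order, roughly w[i] = 32-bit big-endian word i of the input;
+	C the sel/ror/rev sequence is the same unaligned-read trick as
+	C in arm/sha1-compress.asm.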
+
+ ldm STATE, {SA,SB,SC,SD,SE,SF,SG,SH}
+
+ mov COUNT,#0
+
+.Loop1:
+ NOEXPN(COUNT) ROUND(SA,SB,SC,SD,SE,SF,SG,SH)
+ NOEXPN(COUNT) ROUND(SH,SA,SB,SC,SD,SE,SF,SG)
+ NOEXPN(COUNT) ROUND(SG,SH,SA,SB,SC,SD,SE,SF)
+ NOEXPN(COUNT) ROUND(SF,SG,SH,SA,SB,SC,SD,SE)
+ NOEXPN(COUNT) ROUND(SE,SF,SG,SH,SA,SB,SC,SD)
+ NOEXPN(COUNT) ROUND(SD,SE,SF,SG,SH,SA,SB,SC)
+ NOEXPN(COUNT) ROUND(SC,SD,SE,SF,SG,SH,SA,SB)
+ NOEXPN(COUNT) ROUND(SB,SC,SD,SE,SF,SG,SH,SA)
+ cmp COUNT,#64
+ bne .Loop1
+
+ mov COUNT, #3
+.Loop2:
+
+ EXPN( 0) ROUND(SA,SB,SC,SD,SE,SF,SG,SH)
+ EXPN( 1) ROUND(SH,SA,SB,SC,SD,SE,SF,SG)
+ EXPN( 2) ROUND(SG,SH,SA,SB,SC,SD,SE,SF)
+ EXPN( 3) ROUND(SF,SG,SH,SA,SB,SC,SD,SE)
+ EXPN( 4) ROUND(SE,SF,SG,SH,SA,SB,SC,SD)
+ EXPN( 5) ROUND(SD,SE,SF,SG,SH,SA,SB,SC)
+ EXPN( 6) ROUND(SC,SD,SE,SF,SG,SH,SA,SB)
+ EXPN( 7) ROUND(SB,SC,SD,SE,SF,SG,SH,SA)
+ EXPN( 8) ROUND(SA,SB,SC,SD,SE,SF,SG,SH)
+ EXPN( 9) ROUND(SH,SA,SB,SC,SD,SE,SF,SG)
+ EXPN(10) ROUND(SG,SH,SA,SB,SC,SD,SE,SF)
+ EXPN(11) ROUND(SF,SG,SH,SA,SB,SC,SD,SE)
+ EXPN(12) ROUND(SE,SF,SG,SH,SA,SB,SC,SD)
+ EXPN(13) ROUND(SD,SE,SF,SG,SH,SA,SB,SC)
+ EXPN(14) ROUND(SC,SD,SE,SF,SG,SH,SA,SB)
+ subs COUNT, COUNT, #1
+ EXPN(15) ROUND(SB,SC,SD,SE,SF,SG,SH,SA)
+ bne .Loop2
+
+ ldr STATE, [sp, +#64]
+	C Use registers we no longer need.
+ ldm STATE, {r1,r2,r12,r14}
+ add SA, SA, r1
+ add SB, SB, r2
+ add SC, SC, r12
+ add SD, SD, r14
+ stm STATE!, {SA,SB,SC,SD}
+ ldm STATE, {r1,r2,r12,r14}
+ add SE, SE, r1
+ add SF, SF, r2
+ add SG, SG, r12
+ add SH, SH, r14
+ stm STATE!, {SE,SF,SG,SH}
+ add sp, sp, #68
+ pop {r4,r5,r6,r7,r8,r10,r11,pc}
+EPILOGUE(_nettle_sha256_compress)