author      Niels Möller <nisse@lysator.liu.se>    2013-04-18 14:07:20 +0200
committer   Niels Möller <nisse@lysator.liu.se>    2013-04-18 14:17:49 +0200
commit      ade7779c98a5426c7d86c8a01bbd7ad65980c9b9 (patch)
tree        0235c694ed12a49037d62e4b05ada53472c804ad /arm
parent      b7c953630bf9a05eca5b744c89eb643049eeb700 (diff)
download    nettle-ade7779c98a5426c7d86c8a01bbd7ad65980c9b9.tar.gz
Reorganization of ARM assembly.
Renamed directory armv7 to arm. New subdirectory arm/neon, for files
using neon instructions. configure.ac hacked to make the use of neon
configurable.
Diffstat (limited to 'arm')
-rw-r--r--   arm/README                            47
-rw-r--r--   arm/aes-decrypt-internal.asm         105
-rw-r--r--   arm/aes-encrypt-internal.asm         107
-rw-r--r--   arm/aes.m4                           164
-rw-r--r--   arm/ecc-192-modp.asm                  93
-rw-r--r--   arm/ecc-224-modp.asm                 111
-rw-r--r--   arm/ecc-256-redc.asm                 160
-rw-r--r--   arm/ecc-384-modp.asm                 257
-rw-r--r--   arm/ecc-521-modp.asm                 114
-rw-r--r--   arm/machine.m4                        56
-rw-r--r--   arm/memxor.asm                       488
-rw-r--r--   arm/neon/salsa20-core-internal.asm   181
-rw-r--r--   arm/neon/sha3-permute.asm            266
-rw-r--r--   arm/neon/sha512-compress.asm         317
-rw-r--r--   arm/neon/umac-nh-n.asm               298
-rw-r--r--   arm/neon/umac-nh.asm                  89
-rw-r--r--   arm/sha1-compress.asm                234
-rw-r--r--   arm/sha256-compress.asm              204
18 files changed, 3291 insertions, 0 deletions
diff --git a/arm/README b/arm/README new file mode 100644 index 00000000..9bacd97b --- /dev/null +++ b/arm/README @@ -0,0 +1,47 @@ +Currently, code in this directory is written for arm cortex-a9. + +For efficient loads and stores, use ldmia, stmia and friends. Can do +two loads or stores per cycle with 8-byte aligned addresses, or three +loads or stores in two cycles, regardless of alignment. + +12 usable registers (if we exclude r9). + +ABI gnueabi(hf) (not depending on the floating point conventions) + +Registers May be Argument + clobbered number + +r0 Y 1 +r1 Y 2 +r2 Y 3 +r3 Y 4 +r4 N +r5 N +r6 N +r7 N +r8 N +r9 (sl) +r10 N +r11 N +r12 (ip) Y +r13 (sp) +r14 (lr) N +r15 (pc) + +q0 (d0, d1) Y 1 (for "hf" abi) +q1 (d2, d3) Y 2 +q2 (d4, d5) Y 3 +q3 (d6, d7) Y 4 +q4 (d8, d9) N +q5 (d10, d11) N +q6 (d12, d13) N +q7 (d14, d15) N +q8 (d16, d17) Y +q9 (d18, d19) Y +q10 (d20, d21) Y +q11 (d22, d23) Y +q12 (d24, d25) Y +q13 (d26, d27) Y +q14 (d28, d29) Y +q15 (d30, d31) Y + diff --git a/arm/aes-decrypt-internal.asm b/arm/aes-decrypt-internal.asm new file mode 100644 index 00000000..1cd92fb2 --- /dev/null +++ b/arm/aes-decrypt-internal.asm @@ -0,0 +1,105 @@ +C nettle, low-level cryptographics library +C +C Copyright (C) 2013 Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. +C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. +C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +C MA 02111-1301, USA. + +include_src(<arm/aes.m4>) + +C Benchmarked at at 785, 914, 1051 cycles/block on cortex A9, +C for 128, 192 and 256 bit key sizes. Unclear why it is slower +C than _aes_encrypt. 
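As orientation before the register assignments, here is a rough C outline of the per-block driver that both _aes_encrypt and _aes_decrypt implement (an editorial sketch only, not nettle source: the function name, the parameter list and the use of function pointers for the rounds are inventions of this sketch, and length is assumed to be a multiple of 16):

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    /* Placeholder type for the T-table round and the final S-box round;
       these are parameters of the sketch, not real nettle functions. */
    typedef void aes_round_fn(uint32_t out[4], const uint32_t in[4],
                              const uint32_t *subkey);

    static void
    aes_blocks(const uint32_t *subkeys, unsigned nrounds,
               aes_round_fn *round, aes_round_fn *final_round,
               size_t length, uint8_t *dst, const uint8_t *src)
    {
      for (; length >= 16; length -= 16, src += 16, dst += 16)
        {
          uint32_t w[4], x[4];
          const uint32_t *k = subkeys;

          /* AES_LOAD: little-endian load of each word, xored with round key 0 */
          for (unsigned i = 0; i < 4; i++, k++)
            w[i] = ((uint32_t) src[4*i]
                    | (uint32_t) src[4*i+1] << 8
                    | (uint32_t) src[4*i+2] << 16
                    | (uint32_t) src[4*i+3] << 24) ^ *k;

          /* nrounds - 1 table-lookup rounds, then the final S-box round */
          for (unsigned r = 1; r < nrounds; r++, k += 4)
            {
              round(x, w, k);
              memcpy(w, x, sizeof(w));
            }
          final_round(x, w, k);

          /* AES_STORE: little-endian store */
          for (unsigned i = 0; i < 4; i++)
            {
              dst[4*i]   = x[i] & 0xff;
              dst[4*i+1] = (x[i] >> 8) & 0xff;
              dst[4*i+2] = (x[i] >> 16) & 0xff;
              dst[4*i+3] = (x[i] >> 24) & 0xff;
            }
        }
    }

The assembly keeps the w/x words in registers and alternates between the two sets instead of copying, and it spills LENGTH, DST and SRC around the round loop because the round macros need those registers.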
+ +define(<CTX>, <r0>) +define(<TABLE>, <r1>) +define(<LENGTH>, <r2>) +define(<DST>, <r3>) +define(<SRC>, <r12>) + +define(<W0>, <r4>) +define(<W1>, <r5>) +define(<W2>, <r6>) +define(<W3>, <r7>) +define(<T0>, <r8>) +define(<KEY>, <r10>) +define(<ROUND>, <r11>) + +define(<X0>, <r2>) C Overlaps LENGTH, SRC, DST +define(<X1>, <r3>) +define(<X2>, <r12>) +define(<X3>, <r14>) C lr + + + .file "aes-decrypt-internal.asm" + + C _aes_decrypt(struct aes_context *ctx, + C const struct aes_table *T, + C unsigned length, uint8_t *dst, + C uint8_t *src) + .text + .align 2 +PROLOGUE(_nettle_aes_decrypt) + teq LENGTH, #0 + beq .Lend + ldr SRC, [sp] + + push {r4,r5,r6,r7,r8,r10,r11,lr} +.Lblock_loop: + mov KEY, CTX + AES_LOAD(SRC,KEY,W0) + AES_LOAD(SRC,KEY,W1) + AES_LOAD(SRC,KEY,W2) + AES_LOAD(SRC,KEY,W3) + + push {LENGTH, DST, SRC} + ldr ROUND, [CTX, #+AES_NROUNDS] + add TABLE, TABLE, #AES_TABLE0 + + b .Lentry + .align 2 +.Lround_loop: + C Transform X -> W + AES_DECRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY) + +.Lentry: + subs ROUND, ROUND,#2 + C Transform W -> X + AES_DECRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY) + + bne .Lround_loop + + sub TABLE, TABLE, #AES_TABLE0 + C Final round + AES_FINAL_ROUND(X0, X3, X2, X1, KEY, W0) + AES_FINAL_ROUND(X1, X0, X3, X2, KEY, W1) + AES_FINAL_ROUND(X2, X1, X0, X3, KEY, W2) + AES_FINAL_ROUND(X3, X2, X1, X0, KEY, W3) + + pop {LENGTH, DST, SRC} + + AES_STORE(DST,W0) + AES_STORE(DST,W1) + AES_STORE(DST,W2) + AES_STORE(DST,W3) + + subs LENGTH, LENGTH, #16 + bhi .Lblock_loop + + pop {r4,r5,r6,r7,r8,r10,r11,pc} + +.Lend: + bx lr +EPILOGUE(_nettle_aes_decrypt) diff --git a/arm/aes-encrypt-internal.asm b/arm/aes-encrypt-internal.asm new file mode 100644 index 00000000..b3309351 --- /dev/null +++ b/arm/aes-encrypt-internal.asm @@ -0,0 +1,107 @@ +C nettle, low-level cryptographics library +C +C Copyright (C) 2013 Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. +C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. +C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +C MA 02111-1301, USA. + +include_src(<arm/aes.m4>) + +C Benchmarked at at 693, 824, 950 cycles/block on cortex A9, +C for 128, 192 and 256 bit key sizes. + +C Possible improvements: More efficient load and store with +C aligned accesses. Better scheduling. 
+ +define(<CTX>, <r0>) +define(<TABLE>, <r1>) +define(<LENGTH>, <r2>) +define(<DST>, <r3>) +define(<SRC>, <r12>) + +define(<W0>, <r4>) +define(<W1>, <r5>) +define(<W2>, <r6>) +define(<W3>, <r7>) +define(<T0>, <r8>) +define(<KEY>, <r10>) +define(<ROUND>, <r11>) + +define(<X0>, <r2>) C Overlaps LENGTH, SRC, DST +define(<X1>, <r3>) +define(<X2>, <r12>) +define(<X3>, <r14>) C lr + + + .file "aes-encrypt-internal.asm" + + C _aes_encrypt(struct aes_context *ctx, + C const struct aes_table *T, + C unsigned length, uint8_t *dst, + C uint8_t *src) + .text + .align 2 +PROLOGUE(_nettle_aes_encrypt) + teq LENGTH, #0 + beq .Lend + ldr SRC, [sp] + + push {r4,r5,r6,r7,r8,r10,r11,lr} +.Lblock_loop: + mov KEY, CTX + AES_LOAD(SRC,KEY,W0) + AES_LOAD(SRC,KEY,W1) + AES_LOAD(SRC,KEY,W2) + AES_LOAD(SRC,KEY,W3) + + push {LENGTH, DST, SRC} + ldr ROUND, [CTX, #+AES_NROUNDS] + add TABLE, TABLE, #AES_TABLE0 + + b .Lentry + .align 2 +.Lround_loop: + C Transform X -> W + AES_ENCRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY) + +.Lentry: + subs ROUND, ROUND,#2 + C Transform W -> X + AES_ENCRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY) + + bne .Lround_loop + + sub TABLE, TABLE, #AES_TABLE0 + C Final round + AES_FINAL_ROUND(X0, X1, X2, X3, KEY, W0) + AES_FINAL_ROUND(X1, X2, X3, X0, KEY, W1) + AES_FINAL_ROUND(X2, X3, X0, X1, KEY, W2) + AES_FINAL_ROUND(X3, X0, X1, X2, KEY, W3) + + pop {LENGTH, DST, SRC} + + AES_STORE(DST,W0) + AES_STORE(DST,W1) + AES_STORE(DST,W2) + AES_STORE(DST,W3) + + subs LENGTH, LENGTH, #16 + bhi .Lblock_loop + + pop {r4,r5,r6,r7,r8,r10,r11,pc} + +.Lend: + bx lr +EPILOGUE(_nettle_aes_encrypt) diff --git a/arm/aes.m4 b/arm/aes.m4 new file mode 100644 index 00000000..00d3c9a3 --- /dev/null +++ b/arm/aes.m4 @@ -0,0 +1,164 @@ +C Loads one word, and adds it to the subkey. Uses T0 +C AES_LOAD(SRC, KEY, REG) +define(<AES_LOAD>, < + ldrb $3, [$1], #+1 + ldrb T0, [$1], #+1 + orr $3, T0, lsl #8 + ldrb T0, [$1], #+1 + orr $3, T0, lsl #16 + ldrb T0, [$1], #+1 + orr $3, T0, lsl #24 + ldr T0, [$2], #+4 + eor $3, T0 +>) +C Stores one word. Destroys input. +C AES_STORE(DST, X) +define(<AES_STORE>, < + strb $2, [$1], #+1 + ror $2, $2, #8 + strb $2, [$1], #+1 + ror $2, $2, #8 + strb $2, [$1], #+1 + ror $2, $2, #8 + strb $2, [$1], #+1 +>) + +C 53 instr. +C It's tempting to use eor with rotation, but that's slower. 
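For reference, the table-lookup round implemented by the AES_ENCRYPT_ROUND macro below corresponds to the following C (an illustration, not nettle code; the two-dimensional table parameter is an assumption of the sketch, whereas in the assembly the four 1 KB tables sit back to back at TABLE, TABLE+1024, TABLE+2048 and TABLE+3072, which is why the macro keeps adding and then subtracting #1024):

    #include <stdint.h>

    /* One inner round in the little-endian T-table formulation: each output
       word combines one byte from each input word, with the byte position
       rotating between columns, then xors in a subkey word. */
    static void
    ttable_encrypt_round(uint32_t w[4], const uint32_t x[4],
                         const uint32_t T[4][256], const uint32_t k[4])
    {
      for (int i = 0; i < 4; i++)
        w[i] = T[0][ x[i]               & 0xff]
             ^ T[1][(x[(i+1) % 4] >>  8) & 0xff]
             ^ T[2][(x[(i+2) % 4] >> 16) & 0xff]
             ^ T[3][(x[(i+3) % 4] >> 24) & 0xff]
             ^ k[i];
    }

AES_DECRYPT_ROUND has the same shape but walks the columns the other way: the byte-1, byte-2 and byte-3 lookups come from x[(i+3)%4], x[(i+2)%4] and x[(i+1)%4] respectively.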
+C AES_ENCRYPT_ROUND(x0,x1,x2,x3,w0,w1,w2,w3,key) +define(<AES_ENCRYPT_ROUND>, < + uxtb T0, $1 + ldr $5, [TABLE, T0, lsl #2] + uxtb T0, $2 + ldr $6, [TABLE, T0, lsl #2] + uxtb T0, $3 + ldr $7, [TABLE, T0, lsl #2] + uxtb T0, $4 + ldr $8, [TABLE, T0, lsl #2] + + uxtb T0, $2, ror #8 + add TABLE, TABLE, #1024 + ldr T0, [TABLE, T0, lsl #2] + eor $5, $5, T0 + uxtb T0, $3, ror #8 + ldr T0, [TABLE, T0, lsl #2] + eor $6, $6, T0 + uxtb T0, $4, ror #8 + ldr T0, [TABLE, T0, lsl #2] + eor $7, $7, T0 + uxtb T0, $1, ror #8 + ldr T0, [TABLE, T0, lsl #2] + eor $8, $8, T0 + + uxtb T0, $3, ror #16 + add TABLE, TABLE, #1024 + ldr T0, [TABLE, T0, lsl #2] + eor $5, $5, T0 + uxtb T0, $4, ror #16 + ldr T0, [TABLE, T0, lsl #2] + eor $6, $6, T0 + uxtb T0, $1, ror #16 + ldr T0, [TABLE, T0, lsl #2] + eor $7, $7, T0 + uxtb T0, $2, ror #16 + ldr T0, [TABLE, T0, lsl #2] + eor $8, $8, T0 + + uxtb T0, $4, ror #24 + add TABLE, TABLE, #1024 + ldr T0, [TABLE, T0, lsl #2] + eor $5, $5, T0 + uxtb T0, $1, ror #24 + ldr T0, [TABLE, T0, lsl #2] + eor $6, $6, T0 + uxtb T0, $2, ror #24 + ldr T0, [TABLE, T0, lsl #2] + eor $7, $7, T0 + uxtb T0, $3, ror #24 + ldr T0, [TABLE, T0, lsl #2] + + ldm $9!, {$1,$2,$3,$4} + eor $8, $8, T0 + sub TABLE, TABLE, #3072 + eor $5, $5, $1 + eor $6, $6, $2 + eor $7, $7, $3 + eor $8, $8, $4 +>) + +define(<AES_DECRYPT_ROUND>, < + uxtb T0, $1 + ldr $5, [TABLE, T0, lsl #2] + uxtb T0, $2 + ldr $6, [TABLE, T0, lsl #2] + uxtb T0, $3 + ldr $7, [TABLE, T0, lsl #2] + uxtb T0, $4 + ldr $8, [TABLE, T0, lsl #2] + + uxtb T0, $4, ror #8 + add TABLE, TABLE, #1024 + ldr T0, [TABLE, T0, lsl #2] + eor $5, $5, T0 + uxtb T0, $1, ror #8 + ldr T0, [TABLE, T0, lsl #2] + eor $6, $6, T0 + uxtb T0, $2, ror #8 + ldr T0, [TABLE, T0, lsl #2] + eor $7, $7, T0 + uxtb T0, $3, ror #8 + ldr T0, [TABLE, T0, lsl #2] + eor $8, $8, T0 + + uxtb T0, $3, ror #16 + add TABLE, TABLE, #1024 + ldr T0, [TABLE, T0, lsl #2] + eor $5, $5, T0 + uxtb T0, $4, ror #16 + ldr T0, [TABLE, T0, lsl #2] + eor $6, $6, T0 + uxtb T0, $1, ror #16 + ldr T0, [TABLE, T0, lsl #2] + eor $7, $7, T0 + uxtb T0, $2, ror #16 + ldr T0, [TABLE, T0, lsl #2] + eor $8, $8, T0 + + uxtb T0, $2, ror #24 + add TABLE, TABLE, #1024 + ldr T0, [TABLE, T0, lsl #2] + eor $5, $5, T0 + uxtb T0, $3, ror #24 + ldr T0, [TABLE, T0, lsl #2] + eor $6, $6, T0 + uxtb T0, $4, ror #24 + ldr T0, [TABLE, T0, lsl #2] + eor $7, $7, T0 + uxtb T0, $1, ror #24 + ldr T0, [TABLE, T0, lsl #2] + + ldm $9!, {$1,$2,$3,$4} + eor $8, $8, T0 + sub TABLE, TABLE, #3072 + eor $5, $5, $1 + eor $6, $6, $2 + eor $7, $7, $3 + eor $8, $8, $4 +>) + +C AES_FINAL_ROUND(a,b,c,d,key,res) +define(<AES_FINAL_ROUND>, < + uxtb T0, $1 + ldrb $6, [TABLE, T0] + uxtb T0, $2, ror #8 + ldrb T0, [TABLE, T0] + eor $6, $6, T0, lsl #8 + uxtb T0, $3, ror #16 + ldrb T0, [TABLE, T0] + eor $6, $6, T0, lsl #16 + uxtb T0, $4, ror #24 + ldrb T0, [TABLE, T0] + eor $6, $6, T0, lsl #24 + ldr T0, [$5], #+4 + eor $6, T0 +>) diff --git a/arm/ecc-192-modp.asm b/arm/ecc-192-modp.asm new file mode 100644 index 00000000..1b226e30 --- /dev/null +++ b/arm/ecc-192-modp.asm @@ -0,0 +1,93 @@ +C nettle, low-level cryptographics library +C +C Copyright (C) 2013, Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. 
+C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. +C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +C MA 02111-1301, USA. + + .file "ecc-192-modp.asm" + .arm + +define(<HP>, <r0>) C Overlaps unused ecc argument +define(<RP>, <r1>) + +define(<T0>, <r2>) +define(<T1>, <r3>) +define(<T2>, <r4>) +define(<T3>, <r5>) +define(<T4>, <r6>) +define(<T5>, <r7>) +define(<T6>, <r8>) +define(<T7>, <r10>) +define(<H0>, <T0>) C Overlaps T0 and T1 +define(<H1>, <T1>) +define(<C2>, <HP>) +define(<C4>, <r12>) + + C ecc_192_modp (const struct ecc_curve *ecc, mp_limb_t *rp) + .text + .align 2 + +PROLOGUE(nettle_ecc_192_modp) + push {r4,r5,r6,r7,r8,r10} + C Reduce two words at a time + add HP, RP, #48 + add RP, RP, #8 + ldmdb HP!, {H0,H1} + ldm RP, {T2,T3,T4,T5,T6,T7} + mov C4, #0 + adds T4, T4, H0 + adcs T5, T5, H1 + adcs T6, T6, H0 + adcs T7, T7, H1 + C Need to add carry to T2 and T4, do T4 later. + adc C4, C4, #0 + + ldmdb HP!, {H0,H1} + mov C2, #0 + adcs T2, T2, H0 + adcs T3, T3, H1 + adcs T4, T4, H0 + adcs T5, T5, H1 + C Need to add carry to T0 and T2, do T2 later + adc C2, C2, #0 + + ldmdb RP!, {T0, T1} + adcs T0, T0, T6 + adcs T1, T1, T7 + adcs T2, T2, T6 + adcs T3, T3, T7 + adc C4, C4, #0 + + adds T2, T2, C2 + adcs T3, T3, #0 + adcs T4, T4, C4 + adcs T5, T5, #0 + mov C2, #0 + adc C2, C2, #0 + + C Add in final carry + adcs T0, T0, #0 + adcs T1, T1, #0 + adcs T2, T2, C2 + adcs T3, T3, #0 + adcs T4, T4, #0 + adc T5, T5, #0 + + stm RP, {T0,T1,T2,T3,T4,T5} + + pop {r4,r5,r6,r7,r8,r10} + bx lr +EPILOGUE(nettle_ecc_192_modp) diff --git a/arm/ecc-224-modp.asm b/arm/ecc-224-modp.asm new file mode 100644 index 00000000..ef7a703a --- /dev/null +++ b/arm/ecc-224-modp.asm @@ -0,0 +1,111 @@ +C nettle, low-level cryptographics library +C +C Copyright (C) 2013, Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. +C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. +C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +C MA 02111-1301, USA. 
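A note on the arithmetic in this file (editorial, not part of the original source): with B = 2^32 the P-224 prime is p = B^7 - B^3 + 1, so B^7 = B^3 - 1 (mod p). Writing a 14-limb product as x = xl + B^7*xh, reduction is the fold x = xl + B^3*xh - xh (mod p), repeated until the value fits in 7 limbs. The adcs/sbcs chains below implement exactly that, and the comment about adding B^7 + 1 explains how the deliberately off-by-one borrow chain still adds an exact multiple of p overall.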
+ + .file "ecc-224-modp.asm" + .arm + +define(<RP>, <r1>) +define(<H>, <r0>) C Overlaps unused ecc argument + +define(<T0>, <r2>) +define(<T1>, <r3>) +define(<T2>, <r4>) +define(<T3>, <r5>) +define(<T4>, <r6>) +define(<T5>, <r7>) +define(<T6>, <r8>) +define(<N3>, <r10>) +define(<L0>, <r11>) +define(<L1>, <r12>) +define(<L2>, <lr>) + + C ecc_224_modp (const struct ecc_curve *ecc, mp_limb_t *rp) + .text + .align 2 + +PROLOGUE(nettle_ecc_224_modp) + push {r4,r5,r6,r7,r8,r10,r11,lr} + + add L2, RP, #28 + ldm L2, {T0,T1,T2,T3,T4,T5,T6} + mov H, #0 + + adds T0, T0, T4 + adcs T1, T1, T5 + adcs T2, T2, T6 + adc H, H, #0 + + C This switch from adcs to sbcs takes carry into account with + C correct sign, but it always subtracts 1 too much. We arrange + C to also add B^7 + 1 below, so the effect is adding p. This + C addition of p also ensures that the result never is + C negative. + + sbcs N3, T3, T0 + sbcs T4, T4, T1 + sbcs T5, T5, T2 + sbcs T6, T6, H + mov H, #1 C This is the B^7 + sbc H, #0 + subs T6, T6, T3 + sbc H, #0 + + C Now subtract from low half + ldm RP!, {L0,L1,L2} + + C Clear carry, with the sbcs, this is the 1. + adds RP, #0 + + sbcs T0, L0, T0 + sbcs T1, L1, T1 + sbcs T2, L2, T2 + ldm RP!, {T3,L0,L1,L2} + sbcs T3, T3, N3 + sbcs T4, L0, T4 + sbcs T5, L1, T5 + sbcs T6, L2, T6 + rsc H, H, #0 + + C Now -2 <= H <= 0 is the borrow, so subtract (B^3 - 1) |H| + C Use (B^3 - 1) H = <H, H, H> if -1 <=H <= 0, and + C (B^3 - 1) H = <1,B-1, B-1, B-2> if H = -2 + subs T0, T0, H + asr L1, H, #1 + sbcs T1, T1, L1 + eor H, H, L1 + sbcs T2, T2, L1 + sbcs T3, T3, H + sbcs T4, T4, #0 + sbcs T5, T5, #0 + sbcs T6, T6, #0 + sbcs H, H, H + + C Final borrow, subtract (B^3 - 1) |H| + subs T0, T0, H + sbcs T1, T1, H + sbcs T2, T2, H + sbcs T3, T3, #0 + sbcs T4, T4, #0 + sbcs T5, T5, #0 + sbcs T6, T6, #0 + + stmdb RP, {T0,T1,T2,T3,T4,T5,T6} + + pop {r4,r5,r6,r7,r8,r10,r11,pc} +EPILOGUE(nettle_ecc_224_modp) diff --git a/arm/ecc-256-redc.asm b/arm/ecc-256-redc.asm new file mode 100644 index 00000000..cbf10a89 --- /dev/null +++ b/arm/ecc-256-redc.asm @@ -0,0 +1,160 @@ +C nettle, low-level cryptographics library +C +C Copyright (C) 2013, Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. +C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. +C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +C MA 02111-1301, USA. 
+ + .file "ecc-256-redc.asm" + .arm + +define(<RP>, <r1>) + +define(<T0>, <r0>) C Overlaps unused ecc argument +define(<T1>, <r2>) +define(<T2>, <r3>) +define(<T3>, <r4>) +define(<T4>, <r5>) +define(<T5>, <r6>) +define(<T6>, <r7>) +define(<T7>, <r8>) +define(<F0>, <r10>) +define(<F1>, <r11>) +define(<F2>, <r12>) +define(<F3>, <lr>) + + C ecc_256_redc (const struct ecc_curve *ecc, mp_limb_t *rp) + .text + .align 2 + +PROLOGUE(nettle_ecc_256_redc) + push {r4,r5,r6,r7,r8,r10,r11,lr} + + ldm RP!, {T0,T1,T2,T3,T4,T5,T6,T7} + + C Set <F3,F2,F1> to the high 4 limbs of (B^2-B+1)<T2,T1,T0> + C T2 T1 + C T2 T1 T0 + C - T2 T1 T0 + C ------------- + C F3 F2 F1 F0 + + + adds F1, T0, T2 + adcs F2, T1, #0 + adc F3, T2, #0 + + subs F0, T1, T0 + sbcs F1, F1, T1 C Could also be rsc ? + sbcs F2, F2, T2 + sbc F3, F3, #0 + + C Add: + C T10 T9 T8 T7 T6 T5 T4 T3 + C + F3 F2 F1 F0 T0 T2 T1 T0 + C -------------------------- + C T7 T6 T5 T4 T3 T2 T1 T0 + + adds T3, T3, T0 + adcs T1, T4, T1 + adcs T2, T5, T2 + adcs T6, T6, T0 + mov T0, T3 C FIXME: Be more clever? + mov T3, T6 + adcs T4, T7, F0 + + ldm RP!, {T5,T6,T7} + adcs T5, T5, F1 + adcs T6, T6, F2 + adcs T7, T7, F3 + + C New F3, F2, F1, F0, also adding in carry + adcs F1, T0, T2 + adcs F2, T1, #0 + adc F3, T2, #0 + + subs F0, T1, T0 + sbcs F1, F1, T1 C Could also be rsc ? + sbcs F2, F2, T2 + sbc F3, F3, #0 + + C Start adding + adds T3, T3, T0 + adcs T1, T4, T1 + adcs T2, T5, T2 + adcs T6, T6, T0 + mov T0, T3 C FIXME: Be more clever? + mov T3, T6 + adcs T4, T7, F0 + + ldm RP!, {T5,T6,T7} + adcs T5, T5, F1 + adcs T6, T6, F2 + adcs T7, T7, F3 + + C Final iteration, eliminate only T0, T1 + C Set <F2, F1, F0> to the high 3 limbs of (B^2-B+1)<T1,T0> + + C T1 T0 T1 + C - T1 T0 + C ------------- + C F2 F1 F0 + + C First add in carry + adcs F1, T0, #0 + adcs F2, T1, #0 + subs F0, T1, T0 + sbcs F1, F1, T1 + sbc F2, F2, #0 + + C Add: + C T9 T8 T7 T6 T5 T4 T3 T2 + C + F2 F1 F0 T0 0 T1 T0 0 + C -------------------------- + C F2 F1 T7 T6 T5 T4 T3 T2 + + adds T3, T3, T0 + adcs T4, T4, T1 + adcs T5, T5, #0 + adcs T6, T6, T0 + adcs T7, T7, F0 + ldm RP!, {T0, T1} + mov F3, #0 + adcs F1, F1, T0 + adcs F2, F2, T1 + + C Sum is < B^8 + p, so it's enough to fold carry once, + C If carry, add in + C B^7 - B^6 - B^3 + 1 = <0, B-2, B-1, B-1, B-1, 0, 0, 1> + + C Mask from carry flag, leaving carry intact + adc F3, F3, #0 + rsb F3, F3, #0 + + adcs T0, T2, #0 + adcs T1, T3, #0 + adcs T2, T4, #0 + adcs T3, T5, F3 + adcs T4, T6, F3 + adcs T5, T7, F3 + and F3, F3, #-2 + adcs T6, F1, F3 + adcs T7, F2, #0 + + sub RP, RP, #64 + stm RP, {T0,T1,T2,T3,T4,T5,T6,T7} + + pop {r4,r5,r6,r7,r8,r10,r11,pc} +EPILOGUE(nettle_ecc_256_redc) diff --git a/arm/ecc-384-modp.asm b/arm/ecc-384-modp.asm new file mode 100644 index 00000000..fb5a6e12 --- /dev/null +++ b/arm/ecc-384-modp.asm @@ -0,0 +1,257 @@ +C nettle, low-level cryptographics library +C +C Copyright (C) 2013, Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. +C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. 
+C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +C MA 02111-1301, USA. + + .file "ecc-384-modp.asm" + .arm + +define(<RP>, <r1>) +define(<T0>, <r0>) +define(<T1>, <r2>) +define(<T2>, <r3>) +define(<T3>, <r4>) +define(<F0>, <r5>) +define(<F1>, <r6>) +define(<F2>, <r7>) +define(<F3>, <r8>) +define(<F4>, <r10>) +define(<N>, <r12>) +define(<H>, <lr>) + + C ecc_384_modp (const struct ecc_curve *ecc, mp_limb_t *rp) + .text + .align 2 + +PROLOGUE(nettle_ecc_384_modp) + push {r4,r5,r6,r7,r8,r10,lr} + + add RP, RP, #80 + ldm RP, {T0, T1, T2, T3} C 20-23 + + C First get top 4 limbs, which need folding twice, as + C + C T3 T2 T1 T0 + C T3 T2 T1 + C -T3 + C ---------------- + C F4 F3 F2 F1 F0 + C + C Start with + C + C T3 T1 T0 + C T1 + C -T3 + C ----------- + C F2 F1 F0 Always fits + + adds F0, T0, T1 + adcs F1, T1, #0 + adcs F2, T3, #0 + subs F0, F0, T3 + sbcs F1, F1, #0 + sbcs F2, F2, #0 + + C T3 T2 T2 0 + C F2 F1 F0 + C ---------------- + C F4 F3 F2 F1 F0 + + mov F4, #0 + adds F1, F1, T2 + adcs F2, F2, T2 + adcs F3, T3, #0 + adcs F4, F4, #0 + + C Add in to high part + sub RP, RP, #32 + ldm RP, {T0, T1, T2, T3} C 12-15 + mov H, #0 + adds F0, T0, F0 + adcs F1, T1, F1 + adcs F2, T2, F2 + adcs F3, T3, F3 + adcs F4, F4, #0 C Do F4 later + + C Add to low part, keeping carry (positive or negative) in H + sub RP, RP, #48 + ldm RP, {T0, T1, T2, T3} C 0-3 + mov H, #0 + adds T0, T0, F0 + adcs T1, T1, F1 + adcs T2, T2, F2 + adcs T3, T3, F3 + adc H, H, #0 + subs T1, T1, F0 + sbcs T2, T2, F1 + sbcs T3, T3, F2 + sbc H, H, #0 + adds T3, T3, F0 + adc H, H, #0 + + stm RP!, {T0,T1,T2,T3} C 0-3 + mov N, #2 +.Loop: + ldm RP, {T0,T1,T2,T3} C 4-7 + + C First, propagate carry + adds T0, T0, H + asr H, #31 C Sign extend + adcs T1, T1, H + adcs T2, T2, H + adcs T3, T3, H + adc H, H, #0 + + C +B^4 term + adds T0, T0, F0 + adcs T1, T1, F1 + adcs T2, T2, F2 + adcs T3, T3, F3 + adc H, H, #0 + + C +B^3 terms + ldr F0, [RP, #+48] C 16 + adds T0, T0, F1 + adcs T1, T1, F2 + adcs T2, T2, F3 + adcs T3, T3, F0 + adc H, H, #0 + + C -B + ldr F1, [RP, #+52] C 17-18 + ldr F2, [RP, #+56] + subs T0, T0, F3 + sbcs T1, T1, F0 + sbcs T2, T2, F1 + sbcs T3, T3, F2 + sbcs H, H, #0 + + C +1 + ldr F3, [RP, #+60] C 19 + adds T0, T0, F0 + adcs T1, T1, F1 + adcs T2, T2, F2 + adcs T3, T3, F3 + adc H, H, #0 + subs N, N, #1 + stm RP!, {T0,T1,T2,T3} + bne .Loop + + C Fold high limbs, we need to add in + C + C F4 F4 0 -F4 F4 H H 0 -H H + C + C We always have F4 >= 0, but we can have H < 0. + C Sign extension gets tricky when F4 = 0 and H < 0. 
+ sub RP, RP, #48 + + ldm RP, {T0,T1,T2,T3} C 0-3 + + C H H 0 -H H + C ---------------- + C S H F3 F2 F1 F0 + C + C Define S = H >> 31 (asr), we then have + C + C F0 = H + C F1 = S - H + C F2 = - [H > 0] + C F3 = H - [H > 0] + C H = H + S + C + C And we get underflow in S - H iff H > 0 + + C H = 0 H > 0 H = -1 + mov F0, H C 0 H -1 + asr H, #31 + subs F1, H, F0 C 0,C=1 -H,C=0 0,C=1 + sbc F2, F2, F2 C 0 -1 0 + sbc F3, F0, #0 C 0 H-1 -1 + + adds T0, T0, F0 + adcs T1, T1, F1 + adcs T2, T2, F2 + adcs T3, T3, F3 + adc H, H, F0 C 0+cy H+cy -2+cy + + stm RP!, {T0,T1,T2,T3} C 0-3 + ldm RP, {T0,T1,T2,T3} C 4-7 + + C F4 0 -F4 + C --------- + C F3 F2 F1 + + rsbs F1, F4, #0 + sbc F2, F2, F2 + sbc F3, F4, #0 + + C Sign extend H + adds F0, F4, H + asr H, H, #31 + adcs F1, F1, H + adcs F2, F2, H + adcs F3, F3, H + adcs F4, F4, H + adc H, H, #0 + + adds T0, T0, F0 + adcs T1, T1, F1 + adcs T2, T2, F2 + adcs T3, T3, F3 + + stm RP!, {T0,T1,T2,T3} C 4-7 + ldm RP, {T0,T1,T2,T3} C 8-11 + + adcs T0, T0, F4 + adcs T1, T1, H + adcs T2, T2, H + adcs T3, T3, H + adc H, H, #0 + + stm RP, {T0,T1,T2,T3} C 8-11 + + C Final (unlikely) carry + sub RP, RP, #32 + ldm RP, {T0,T1,T2,T3} C 0-3 + C Fold H into F0-F4 + mov F0, H + asr H, #31 + subs F1, H, F0 + sbc F2, F2, F2 + sbc F3, F0, #0 + add F4, F0, H + + adds T0, T0, F0 + adcs T1, T1, F1 + adcs T2, T2, F2 + adcs T3, T3, F3 + + stm RP!, {T0,T1,T2,T3} C 0-3 + ldm RP, {T0,T1,T2,T3} C 4-7 + adcs T0, T0, F4 + adcs T1, T1, H + adcs T2, T2, H + adcs T3, T3, H + stm RP!, {T0,T1,T2,T3} C 4-7 + ldm RP, {T0,T1,T2,T3} C 8-11 + adcs T0, T0, H + adcs T1, T1, H + adcs T2, T2, H + adcs T3, T3, H + stm RP!, {T0,T1,T2,T3} C 8-11 + pop {r4,r5,r6,r7,r8,r10,pc} +EPILOGUE(nettle_ecc_384_modp) diff --git a/arm/ecc-521-modp.asm b/arm/ecc-521-modp.asm new file mode 100644 index 00000000..fe305805 --- /dev/null +++ b/arm/ecc-521-modp.asm @@ -0,0 +1,114 @@ +C nettle, low-level cryptographics library +C +C Copyright (C) 2013, Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. +C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. +C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +C MA 02111-1301, USA. 
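The reduction in this file uses that p = 2^521 - 1 is a Mersenne prime: with B = 2^32, B^17 = 2^544 = 2^23 * 2^521 = 2^23 (mod p), so limbs 17 and up, shifted left by 23 bits, are simply added back in at the bottom, and the loop below does the bulk of that three limbs at a time. A minimal toy version of the same wrap-around reduction, using the small Mersenne prime 2^31 - 1 so that it fits in ordinary C types (an illustration only, not nettle code):

    #include <stdint.h>

    /* Reduce x modulo the Mersenne prime 2^31 - 1: the bits above bit 30
       wrap around to the bottom, just as bits above bit 520 do for P-521. */
    static uint32_t
    mersenne31_reduce(uint64_t x)
    {
      const uint32_t p = 0x7fffffff;          /* 2^31 - 1 */
      uint64_t r = (x & p) + (x >> 31);       /* fold the high part down */
      r = (r & p) + (r >> 31);                /* fold the possible carry */
      if (r >= p)
        r -= p;
      return (uint32_t) r;
    }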
+ + .file "ecc-521-modp.asm" + .arm + +define(<HP>, <r0>) +define(<RP>, <r1>) +define(<T0>, <r2>) +define(<T1>, <r3>) +define(<T2>, <r4>) +define(<F0>, <r5>) +define(<F1>, <r6>) +define(<F2>, <r7>) +define(<F3>, <r8>) +define(<H>, <r12>) +define(<N>, <lr>) + + C ecc_521_modp (const struct ecc_curve *ecc, mp_limb_t *rp) + .text +.Lc511: + .int 511 + + .align 2 + +PROLOGUE(nettle_ecc_521_modp) + push {r4,r5,r6,r7,r8,lr} + + C Use that B^17 = 2^23 (mod p) + ldr F3, [RP, #+68] C 17 + add HP, RP, #72 C 18 + ldr T0, [RP] C 0 + adds T0, T0, F3, lsl #23 + str T0, [RP], #+4 + mov N, #5 + + C 5 iterations, reading limbs 18-20, 21-23, 24-26, 27-29, 30-32 + C and adding to limbs 1-3, 4-6, 7-9, 19-12, 13-15 +.Loop: + ldm RP, {T0,T1,T2} C 1+3*k -- 3+3*k + lsr F0, F3, #9 + ldm HP!, {F1,F2,F3} C 18+3*k -- 20+3*k + orr F0, F0, F1, lsl #23 + lsr F1, F1, #9 + orr F1, F1, F2, lsl #23 + lsr F2, F2, #9 + orr F2, F2, F3, lsl #23 + adcs T0, T0, F0 + adcs T1, T1, F1 + adcs T2, T2, F2 + sub N, N, #1 + stm RP!,{T0,T1,T2} + teq N, #0 + bne .Loop + + ldr F0, [RP], #-64 C 16 + ldr F1, [HP] C 33 + ldr T0, .Lc511 + + C Handling of high limbs + C F0 = rp[16] + carry in + F3 >> 9 + adcs F0, F0, F3, lsr #9 + C Copy low 9 bits to H, then shift right including carry + and H, F0, T0 + rrx F0, F0 + lsr F0, F0, #8 + C Add in F1 = rp[33], with weight 2^1056 = 2^14 + adds F0, F0, F1, lsl #14 + lsr F1, F1, #18 + adc F1, F1, #0 + + ldm RP, {T0, T1} C 0-1 + adds T0, T0, F0 + adcs T1, T1, F1 + stm RP!, {T0, T1} + + ldm RP, {T0,T1,T2,F0,F1,F2,F3} C 2-8 + adcs T0, T0, #0 + adcs T1, T1, #0 + adcs T2, T2, #0 + adcs F0, F0, #0 + adcs F1, F1, #0 + adcs F2, F2, #0 + adcs F3, F3, #0 + stm RP!, {T0,T1,T2,F0,F1,F2,F3} C 2-8 + ldm RP, {T0,T1,T2,F0,F1,F2,F3} C 9-15 + adcs T0, T0, #0 + adcs T1, T1, #0 + adcs T2, T2, #0 + adcs F0, F0, #0 + adcs F1, F1, #0 + adcs F2, F2, #0 + adcs F3, F3, #0 + adcs H, H, #0 + stm RP, {T0,T1,T2,F0,F1,F2,F3,H} C 9-16 + + pop {r4,r5,r6,r7,r8,pc} +EPILOGUE(nettle_ecc_521_modp) diff --git a/arm/machine.m4 b/arm/machine.m4 new file mode 100644 index 00000000..f982a66a --- /dev/null +++ b/arm/machine.m4 @@ -0,0 +1,56 @@ +define(<QREG>, <ifelse( + $1, d0, q0, + $1, d2, q1, + $1, d4, q2, + $1, d6, q3, + $1, d8, q4, + $1, d10, q5, + $1, d12, q6, + $1, d14, q7, + $1, d16, q8, + $1, d18, q9, + $1, d20, q10, + $1, d22, q11, + $1, d24, q12, + $1, d26, q13, + $1, d28, q14, + $1, d30, q15, + <NO REGISTER>)>)dnl + +define(<D0REG>, <ifelse( + $1, q0, d0, + $1, q1, d2, + $1, q2, d4, + $1, q3, d6, + $1, q4, d8, + $1, q5, d10, + $1, q6, d12, + $1, q7, d14, + $1, q8, d16, + $1, q9, d18, + $1, q10, d20, + $1, q11, d22, + $1, q12, d24, + $1, q13, d26, + $1, q14, d28, + $1, q15, d30, + <NO REGISTER>)>)dnl + +define(<D1REG>, <ifelse( + $1, q0, d1, + $1, q1, d3, + $1, q2, d5, + $1, q3, d7, + $1, q4, d9, + $1, q5, d11, + $1, q6, d13, + $1, q7, d15, + $1, q8, d17, + $1, q9, d19, + $1, q10, d21, + $1, q11, d23, + $1, q12, d25, + $1, q13, d27, + $1, q14, d29, + $1, q15, d31, + <NO REGISTER>)>)dnl diff --git a/arm/memxor.asm b/arm/memxor.asm new file mode 100644 index 00000000..33f672c6 --- /dev/null +++ b/arm/memxor.asm @@ -0,0 +1,488 @@ +C -*- mode: asm; asm-comment-char: ?C; -*- +C nettle, low-level cryptographics library +C +C Copyright (C) 2013, Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. 
+C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. +C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +C MA 02111-1301, USA. + +C Possible speedups: +C +C The ldm instruction can do load two registers per cycle, +C if the address is two-word aligned. Or three registers in two +C cycles, regardless of alignment. + +C Register usage: + +define(<DST>, <r0>) +define(<SRC>, <r1>) +define(<N>, <r2>) +define(<CNT>, <r6>) +define(<TNC>, <r12>) + + .syntax unified + + .file "memxor.asm" + + .text + .arm + + C memxor(uint8_t *dst, const uint8_t *src, size_t n) + .align 4 +PROLOGUE(memxor) + cmp N, #0 + beq .Lmemxor_done + + cmp N, #7 + bcs .Lmemxor_large + + C Simple byte loop +.Lmemxor_bytes: + ldrb r3, [SRC], #+1 + ldrb r12, [DST] + eor r3, r12 + strb r3, [DST], #+1 + subs N, #1 + bne .Lmemxor_bytes + +.Lmemxor_done: + bx lr + +.Lmemxor_align_loop: + ldrb r3, [SRC], #+1 + ldrb r12, [DST] + eor r3, r12 + strb r3, [DST], #+1 + sub N, #1 + +.Lmemxor_large: + tst DST, #3 + bne .Lmemxor_align_loop + + C We have at least 4 bytes left to do here. + sub N, #4 + + ands r3, SRC, #3 + beq .Lmemxor_same + + C Different alignment case. + C v original SRC + C +-------+------+ + C |SRC |SRC+4 | + C +---+---+------+ + C |DST | + C +-------+ + C + C With little-endian, we need to do + C DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC) + + push {r4,r5,r6} + + lsl CNT, r3, #3 + bic SRC, #3 + rsb TNC, CNT, #32 + + ldr r4, [SRC], #+4 + + tst N, #4 + itet eq + moveq r5, r4 + subne N, #4 + beq .Lmemxor_odd + +.Lmemxor_word_loop: + ldr r5, [SRC], #+4 + ldr r3, [DST] + eor r3, r3, r4, lsr CNT + eor r3, r3, r5, lsl TNC + str r3, [DST], #+4 +.Lmemxor_odd: + ldr r4, [SRC], #+4 + ldr r3, [DST] + eor r3, r3, r5, lsr CNT + eor r3, r3, r4, lsl TNC + str r3, [DST], #+4 + subs N, #8 + bcs .Lmemxor_word_loop + adds N, #8 + beq .Lmemxor_odd_done + + C We have TNC/8 left-over bytes in r4, high end + lsr r4, CNT + ldr r3, [DST] + eor r3, r4 + + pop {r4,r5,r6} + + C Store bytes, one by one. +.Lmemxor_leftover: + strb r3, [DST], #+1 + subs N, #1 + beq .Lmemxor_done + subs TNC, #8 + lsr r3, #8 + bne .Lmemxor_leftover + b .Lmemxor_bytes +.Lmemxor_odd_done: + pop {r4,r5,r6} + bx lr + +.Lmemxor_same: + push {r4,r5,r6,r7,r8,r10,r11,r14} C lr is the link register + + subs N, #8 + bcc .Lmemxor_same_end + + ldmia SRC!, {r3, r4, r5} + C Keep address for loads in r14 + mov r14, DST + ldmia r14!, {r6, r7, r8} + subs N, #12 + eor r10, r3, r6 + eor r11, r4, r7 + eor r12, r5, r8 + bcc .Lmemxor_same_final_store + subs N, #12 + ldmia r14!, {r6, r7, r8} + bcc .Lmemxor_same_wind_down + + C 6 cycles per iteration, 0.50 cycles/byte. For this speed, + C loop starts at offset 0x11c in the object file. 
+ +.Lmemxor_same_loop: + C r10-r12 contains values to be stored at DST + C r6-r8 contains values read from r14, in advance + ldmia SRC!, {r3, r4, r5} + subs N, #12 + stmia DST!, {r10, r11, r12} + eor r10, r3, r6 + eor r11, r4, r7 + eor r12, r5, r8 + ldmia r14!, {r6, r7, r8} + bcs .Lmemxor_same_loop + +.Lmemxor_same_wind_down: + C Wind down code + ldmia SRC!, {r3, r4, r5} + stmia DST!, {r10, r11, r12} + eor r10, r3, r6 + eor r11, r4, r7 + eor r12, r5, r8 +.Lmemxor_same_final_store: + stmia DST!, {r10, r11, r12} + +.Lmemxor_same_end: + C We have 0-11 bytes left to do, and N holds number of bytes -12. + adds N, #4 + bcc .Lmemxor_same_lt_8 + C Do 8 bytes more, leftover is in N + ldmia SRC!, {r3, r4} + ldmia DST, {r6, r7} + eor r3, r6 + eor r4, r7 + stmia DST!, {r3, r4} + pop {r4,r5,r6,r7,r8,r10,r11,r14} + beq .Lmemxor_done + b .Lmemxor_bytes + +.Lmemxor_same_lt_8: + pop {r4,r5,r6,r7,r8,r10,r11,r14} + adds N, #4 + bcc .Lmemxor_same_lt_4 + + ldr r3, [SRC], #+4 + ldr r12, [DST] + eor r3, r12 + str r3, [DST], #+4 + beq .Lmemxor_done + b .Lmemxor_bytes + +.Lmemxor_same_lt_4: + adds N, #4 + beq .Lmemxor_done + b .Lmemxor_bytes + +EPILOGUE(memxor) + +define(<DST>, <r0>) +define(<AP>, <r1>) +define(<BP>, <r2>) +define(<N>, <r3>) +undefine(<CNT>) +undefine(<TNC>) + +C Temporaries r4-r7 +define(<ACNT>, <r8>) +define(<ATNC>, <r10>) +define(<BCNT>, <r11>) +define(<BTNC>, <r12>) + + C memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n) + .align 2 +PROLOGUE(memxor3) + cmp N, #0 + beq .Lmemxor3_ret + + push {r4,r5,r6,r7,r8,r10,r11} + cmp N, #7 + + add AP, N + add BP, N + add DST, N + + bcs .Lmemxor3_large + + C Simple byte loop +.Lmemxor3_bytes: + ldrb r4, [AP, #-1]! + ldrb r5, [BP, #-1]! + eor r4, r5 + strb r4, [DST, #-1]! + subs N, #1 + bne .Lmemxor3_bytes + +.Lmemxor3_done: + pop {r4,r5,r6,r7,r8,r10,r11} +.Lmemxor3_ret: + bx lr + +.Lmemxor3_align_loop: + ldrb r4, [AP, #-1]! + ldrb r5, [BP, #-1]! + eor r5, r4 + strb r5, [DST, #-1]! + sub N, #1 + +.Lmemxor3_large: + tst DST, #3 + bne .Lmemxor3_align_loop + + C We have at least 4 bytes left to do here. + sub N, #4 + ands ACNT, AP, #3 + lsl ACNT, #3 + beq .Lmemxor3_a_aligned + + ands BCNT, BP, #3 + lsl BCNT, #3 + bne .Lmemxor3_uu + + C Swap + mov r4, AP + mov AP, BP + mov BP, r4 + +.Lmemxor3_au: + C NOTE: We have the relevant shift count in ACNT, not BCNT + + C AP is aligned, BP is not + C v original SRC + C +-------+------+ + C |SRC-4 |SRC | + C +---+---+------+ + C |DST-4 | + C +-------+ + C + C With little-endian, we need to do + C DST[i-i] ^= (SRC[i-i] >> CNT) ^ (SRC[i] << TNC) + rsb ATNC, ACNT, #32 + bic BP, #3 + + ldr r4, [BP] + + tst N, #4 + itet eq + moveq r5, r4 + subne N, #4 + beq .Lmemxor3_au_odd + +.Lmemxor3_au_loop: + ldr r5, [BP, #-4]! + ldr r6, [AP, #-4]! + eor r6, r6, r4, lsl ATNC + eor r6, r6, r5, lsr ACNT + str r6, [DST, #-4]! +.Lmemxor3_au_odd: + ldr r4, [BP, #-4]! + ldr r6, [AP, #-4]! + eor r6, r6, r5, lsl ATNC + eor r6, r6, r4, lsr ACNT + str r6, [DST, #-4]! + subs N, #8 + bcs .Lmemxor3_au_loop + adds N, #8 + beq .Lmemxor3_done + + C Leftover bytes in r4, low end + ldr r5, [AP, #-4] + eor r4, r5, r4, lsl ATNC + +.Lmemxor3_au_leftover: + C Store a byte at a time + ror r4, #24 + strb r4, [DST, #-1]! + subs N, #1 + beq .Lmemxor3_done + subs ACNT, #8 + sub AP, #1 + bne .Lmemxor3_au_leftover + b .Lmemxor3_bytes + +.Lmemxor3_a_aligned: + ands ACNT, BP, #3 + lsl ACNT, #3 + bne .Lmemxor3_au ; + + C a, b and dst all have the same alignment. 
+ subs N, #8 + bcc .Lmemxor3_aligned_word_end + + C This loop runs at 8 cycles per iteration. It has been + C observed running at only 7 cycles, for this speed, the loop + C started at offset 0x2ac in the object file. + + C FIXME: consider software pipelining, similarly to the memxor + C loop. + +.Lmemxor3_aligned_word_loop: + ldmdb AP!, {r4,r5,r6} + ldmdb BP!, {r7,r8,r10} + subs N, #12 + eor r4, r7 + eor r5, r8 + eor r6, r10 + stmdb DST!, {r4, r5,r6} + bcs .Lmemxor3_aligned_word_loop + +.Lmemxor3_aligned_word_end: + C We have 0-11 bytes left to do, and N holds number of bytes -12. + adds N, #4 + bcc .Lmemxor3_aligned_lt_8 + C Do 8 bytes more, leftover is in N + ldmdb AP!, {r4, r5} + ldmdb BP!, {r6, r7} + eor r4, r6 + eor r5, r7 + stmdb DST!, {r4,r5} + beq .Lmemxor3_done + b .Lmemxor3_bytes + +.Lmemxor3_aligned_lt_8: + adds N, #4 + bcc .Lmemxor3_aligned_lt_4 + + ldr r4, [AP,#-4]! + ldr r5, [BP,#-4]! + eor r4, r5 + str r4, [DST,#-4]! + beq .Lmemxor3_done + b .Lmemxor3_bytes + +.Lmemxor3_aligned_lt_4: + adds N, #4 + beq .Lmemxor3_done + b .Lmemxor3_bytes + +.Lmemxor3_uu: + + cmp ACNT, BCNT + bic AP, #3 + bic BP, #3 + rsb ATNC, ACNT, #32 + + bne .Lmemxor3_uud + + C AP and BP are unaligned in the same way + + ldr r4, [AP] + ldr r6, [BP] + eor r4, r6 + + tst N, #4 + itet eq + moveq r5, r4 + subne N, #4 + beq .Lmemxor3_uu_odd + +.Lmemxor3_uu_loop: + ldr r5, [AP, #-4]! + ldr r6, [BP, #-4]! + eor r5, r6 + lsl r4, ATNC + eor r4, r4, r5, lsr ACNT + str r4, [DST, #-4]! +.Lmemxor3_uu_odd: + ldr r4, [AP, #-4]! + ldr r6, [BP, #-4]! + eor r4, r6 + lsl r5, ATNC + eor r5, r5, r4, lsr ACNT + str r5, [DST, #-4]! + subs N, #8 + bcs .Lmemxor3_uu_loop + adds N, #8 + beq .Lmemxor3_done + + C Leftover bytes in a4, low end + ror r4, ACNT +.Lmemxor3_uu_leftover: + ror r4, #24 + strb r4, [DST, #-1]! + subs N, #1 + beq .Lmemxor3_done + subs ACNT, #8 + bne .Lmemxor3_uu_leftover + b .Lmemxor3_bytes + +.Lmemxor3_uud: + C Both AP and BP unaligned, and in different ways + rsb BTNC, BCNT, #32 + + ldr r4, [AP] + ldr r6, [BP] + + tst N, #4 + ittet eq + moveq r5, r4 + moveq r7, r6 + subne N, #4 + beq .Lmemxor3_uud_odd + +.Lmemxor3_uud_loop: + ldr r5, [AP, #-4]! + ldr r7, [BP, #-4]! + lsl r4, ATNC + eor r4, r4, r6, lsl BTNC + eor r4, r4, r5, lsr ACNT + eor r4, r4, r7, lsr BCNT + str r4, [DST, #-4]! +.Lmemxor3_uud_odd: + ldr r4, [AP, #-4]! + ldr r6, [BP, #-4]! + lsl r5, ATNC + eor r5, r5, r7, lsl BTNC + eor r5, r5, r4, lsr ACNT + eor r5, r5, r6, lsr BCNT + str r5, [DST, #-4]! + subs N, #8 + bcs .Lmemxor3_uud_loop + adds N, #8 + beq .Lmemxor3_done + + C FIXME: More clever left-over handling? For now, just adjust pointers. + add AP, AP, ACNT, lsr #3 + add BP, BP, BCNT, lsr #3 + b .Lmemxor3_bytes +EPILOGUE(memxor3) diff --git a/arm/neon/salsa20-core-internal.asm b/arm/neon/salsa20-core-internal.asm new file mode 100644 index 00000000..fe26e5c5 --- /dev/null +++ b/arm/neon/salsa20-core-internal.asm @@ -0,0 +1,181 @@ +C nettle, low-level cryptographics library +C +C Copyright (C) 2013 Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. +C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. 
+C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +C MA 02111-1301, USA. + + .file "salsa20-core-internal.asm" + .fpu neon + +define(<DST>, <r0>) +define(<SRC>, <r1>) +define(<ROUNDS>, <r2>) + +define(<X0>, <q0>) +define(<X1>, <q1>) +define(<X2>, <q2>) +define(<X3>, <q3>) +define(<T0>, <q8>) +define(<T1>, <q9>) +define(<M0101>, <q10>) +define(<M0110>, <q11>) +define(<M0011>, <q12>) +define(<S1>, <q13>) +define(<S2>, <q14>) +define(<S3>, <q15>) + +define(<QROUND>, < + vadd.i32 T0, $1, $4 + vshl.i32 T1, T0, #7 + vshr.u32 T0, T0, #25 + veor $2, $2, T0 + veor $2, $2, T1 + + vadd.i32 T0, $1, $2 + vshl.i32 T1, T0, #9 + vshr.u32 T0, T0, #23 + veor $3, $3, T0 + veor $3, $3, T1 + + vadd.i32 T0, $2, $3 + vshl.i32 T1, T0, #13 + vshr.u32 T0, T0, #19 + veor $4, $4, T0 + veor $4, $4, T1 + + vadd.i32 T0, $3, $4 + vshl.i32 T1, T0, #18 + vshr.u32 T0, T0, #14 + veor $1, $1, T0 + veor $1, $1, T1 +>) + + .text + .align 4 +.Lmasks: + .int 0,-1, 0,-1 + .int 0,-1,-1, 0 + .int 0, 0,-1,-1 + + C _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds) + +PROLOGUE(_nettle_salsa20_core) + vldm SRC, {X0,X1,X2,X3} + + C Input rows: + C 0 1 2 3 X0 + C 4 5 6 7 X1 + C 8 9 10 11 X2 + C 12 13 14 15 X3 + C Permuted to: + C 0 5 10 15 + C 4 9 14 3 + C 8 13 2 7 + C 12 1 6 11 + + C FIXME: Construct in some other way? + adr r12, .Lmasks + vldm r12, {M0101, M0110, M0011} + + vmov S1, X1 + vmov S2, X2 + vmov S3, X3 + + C Swaps in columns 1, 3: + C 0 5 2 7 X0 ^ + C 4 1 6 3 T0 v + C 8 13 10 15 T1 ^ + C 12 9 14 11 X3 v + vmov T0, X1 + vmov T1, X2 + vbit T0, X0, M0101 + vbit X0, X1, M0101 + vbit T1, X3, M0101 + vbit X3, X2, M0101 + + C Swaps in column 1, 2: + C 0 5 2 7 X0 + C 4 9 14 3 X1 ^ + C 8 13 10 15 T1 | + C 12 1 6 11 X3 v + vmov X1, T0 + vbit X1, X3, M0110 + vbit X3, T0, M0110 + + C Swaps in columm 2,3: + C 0 5 10 15 X0 ^ + C 4 9 14 3 X1 | + C 8 13 2 7 X2 v + C 12 1 6 11 X3 + vmov X2, T1 + vbit X2, X0, M0011 + vbit X0, T1, M0011 + +.Loop: + QROUND(X0, X1, X2, X3) + + C Rotate rows, to get + C 0 5 10 15 + C 3 4 9 14 >>> 1 + C 2 7 8 13 >>> 2 + C 1 6 11 12 >>> 3 + vext.32 X1, X1, X1, #3 + vext.32 X2, X2, X2, #2 + vext.32 X3, X3, X3, #1 + + QROUND(X0, X3, X2, X1) + + subs ROUNDS, ROUNDS, #2 + C Inverse rotation + vext.32 X1, X1, X1, #1 + vext.32 X2, X2, X2, #2 + vext.32 X3, X3, X3, #3 + + bhi .Loop + + C Inverse swaps + vmov T1, X2 + vbit T1, X0, M0011 + vbit X0, X2, M0011 + + vmov T0, X1 + vbit T0, X3, M0110 + vbit X3, X1, M0110 + + vmov X1, T0 + vmov X2, T1 + vbit X1, X0, M0101 + vbit X0, T0, M0101 + vbit X2, X3, M0101 + vbit X3, T1, M0101 + + vld1.64 {T0}, [SRC] + vadd.u32 X0, X0, T0 + vadd.u32 X1, X1, S1 + vadd.u32 X2, X2, S2 + vadd.u32 X3, X3, S3 + + vstm DST, {X0,X1,X2,X3} + bx lr +EPILOGUE(_nettle_salsa20_core) + +divert(-1) +define salsastate +p/x $q0.u32 +p/x $q1.u32 +p/x $q2.u32 +p/x $q3.u32 +end diff --git a/arm/neon/sha3-permute.asm b/arm/neon/sha3-permute.asm new file mode 100644 index 00000000..beee09f7 --- /dev/null +++ b/arm/neon/sha3-permute.asm @@ -0,0 +1,266 @@ +C nettle, low-level cryptographics library +C +C Copyright (C) 2013 Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. 
+C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. +C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +C MA 02111-1301, USA. + + .file "sha3-permute.asm" + .fpu neon + +define(<CTX>, <r0>) +define(<COUNT>, <r1>) +define(<RC>, <r2>) +C First column +define(<A0>, <d0>) +define(<A5>, <d2>) +define(<A10>, <d3>) +define(<A15>, <d4>) +define(<A20>, <d5>) + +define(<A1>, <d6>) +define(<A2>, <d7>) +define(<A3>, <d8>) +define(<A4>, <d9>) + +define(<A6>, <d16>) +define(<A7>, <d17>) +define(<A8>, <d18>) +define(<A9>, <d19>) + +define(<A11>, <d20>) +define(<A12>, <d21>) +define(<A13>, <d22>) +define(<A14>, <d23>) + +define(<A16>, <d24>) +define(<A17>, <d25>) +define(<A18>, <d26>) +define(<A19>, <d27>) + +define(<A21>, <d28>) +define(<A22>, <d29>) +define(<A23>, <d30>) +define(<A24>, <d31>) + +define(<T0>, <d10>) +define(<T1>, <d11>) + +define(<C0>, <d1>) +define(<C1>, <d12>) +define(<C2>, <d13>) +define(<C3>, <d14>) +define(<C4>, <d15>) + + +C ROL(DST, SRC, COUNT) +C Must have SRC != DST +define(<ROL>, < + vshr.u64 $1, $2, #eval(64-$3) + vsli.i64 $1, $2, #$3 + >) +C sha3_permute(struct sha3_ctx *ctx) + + .text + .align 3 +.Lrc: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808A + .quad 0x8000000080008000 + .quad 0x000000000000808B + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008A + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000A + .quad 0x000000008000808B + .quad 0x800000000000008B + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800A + .quad 0x800000008000000A + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +PROLOGUE(nettle_sha3_permute) + vpush {d8-d15} + + vld1.64 {A0}, [CTX]! + vldm CTX!, {A1,A2,A3,A4} + vld1.64 {A5}, [CTX]! + vldm CTX!, {A6,A7,A8,A9} + vld1.64 {A10}, [CTX]! + vldm CTX!, {A11,A12,A13,A14} + vld1.64 {A15}, [CTX]! + vldm CTX!, {A16,A17,A18,A19} + vld1.64 {A20}, [CTX]! + vldm CTX, {A21,A22,A23,A24} + sub CTX, CTX, #168 + + mov COUNT, #24 + adr RC, .Lrc + + .align 3 +.Loop: + veor QREG(T0), QREG(A5), QREG(A15) + veor C0, A0, T0 + veor C0, C0, T1 + veor QREG(C1), QREG(A1), QREG(A6) + veor QREG(C1), QREG(C1), QREG(A11) + veor QREG(C1), QREG(C1), QREG(A16) + veor QREG(C1), QREG(C1), QREG(A21) + + veor QREG(C3), QREG(A3), QREG(A8) + veor QREG(C3), QREG(C3), QREG(A13) + veor QREG(C3), QREG(C3), QREG(A18) + veor QREG(C3), QREG(C3), QREG(A23) + + C D0 = C4 ^ (C1 <<< 1) + C NOTE: Using ROL macro (and vsli) is slightly slower. 
+ vshl.i64 T0, C1, #1 + vshr.u64 T1, C1, #63 + veor T0, T0, C4 + veor T0, T0, T1 + vmov T1, T0 + veor A0, A0, T0 + veor QREG(A5), QREG(A5), QREG(T0) + veor QREG(A15), QREG(A15), QREG(T0) + + C D1 = C0 ^ (C2 <<< 1) + C D2 = C1 ^ (C3 <<< 1) + ROL(T0, C2, 1) + ROL(T1, C3, 1) + veor T0, T0, C0 + veor T1, T1, C1 + veor QREG(A1), QREG(A1), QREG(T0) + veor QREG(A6), QREG(A6), QREG(T0) + veor QREG(A11), QREG(A11), QREG(T0) + veor QREG(A16), QREG(A16), QREG(T0) + veor QREG(A21), QREG(A21), QREG(T0) + + C D3 = C2 ^ (C4 <<< 1) + C D4 = C3 ^ (C0 <<< 1) + ROL(T0, C4, 1) + ROL(T1, C0, 1) + veor T0, T0, C2 + veor T1, T1, C3 + veor QREG(A3), QREG(A3), QREG(T0) + veor QREG(A8), QREG(A8), QREG(T0) + veor QREG(A13), QREG(A13), QREG(T0) + veor QREG(A18), QREG(A18), QREG(T0) + veor QREG(A23), QREG(A23), QREG(T0) + + ROL( T0, A1, 1) + ROL( A1, A6, 44) + ROL( A6, A9, 20) + ROL( A9, A22, 61) + ROL(A22, A14, 39) + ROL(A14, A20, 18) + ROL(A20, A2, 62) + ROL( A2, A12, 43) + ROL(A12, A13, 25) + ROL(A13, A19, 8) + ROL(A19, A23, 56) + ROL(A23, A15, 41) + ROL(A15, A4, 27) + ROL( A4, A24, 14) + ROL(A24, A21, 2) + ROL(A21, A8, 55) + ROL( A8, A16, 45) + ROL(A16, A5, 36) + ROL( A5, A3, 28) + ROL( A3, A18, 21) + ROL(A18, A17, 15) + ROL(A17, A11, 10) + ROL(A11, A7, 6) + ROL( A7, A10, 3) + C New A10 value left in T0 + + vbic C0, A2, A1 + vbic C1, A3, A2 + vbic C2, A4, A3 + vbic C3, A0, A4 + vbic C4, A1, A0 + + veor A0, A0, C0 + vld1.64 {C0}, [RC :64]! + veor QREG(A1), QREG(A1), QREG(C1) + veor QREG(A3), QREG(A3), QREG(C3) + veor A0, A0, C0 + + vbic C0, A7, A6 + vbic C1, A8, A7 + vbic C2, A9, A8 + vbic C3, A5, A9 + vbic C4, A6, A5 + + veor A5, A5, C0 + veor QREG(A6), QREG(A6), QREG(C1) + veor QREG(A8), QREG(A8), QREG(C3) + + vbic C0, A12, A11 + vbic C1, A13, A12 + vbic C2, A14, A13 + vbic C3, T0, A14 + vbic C4, A11, T0 + + veor A10, T0, C0 + veor QREG(A11), QREG(A11), QREG(C1) + veor QREG(A13), QREG(A13), QREG(C3) + + vbic C0, A17, A16 + vbic C1, A18, A17 + vbic C2, A19, A18 + vbic C3, A15, A19 + vbic C4, A16, A15 + + veor A15, A15, C0 + veor QREG(A16), QREG(A16), QREG(C1) + veor QREG(A18), QREG(A18), QREG(C3) + + vbic C0, A22, A21 + vbic C1, A23, A22 + vbic C2, A24, A23 + vbic C3, A20, A24 + vbic C4, A21, A20 + + subs COUNT, COUNT, #1 + veor A20, A20, C0 + veor QREG(A21), QREG(A21), QREG(C1) + veor QREG(A23), QREG(A23), QREG(C3) + + bne .Loop + + vst1.64 {A0}, [CTX]! + vstm CTX!, {A1,A2,A3,A4} + vst1.64 {A5}, [CTX]! + vstm CTX!, {A6,A7,A8,A9} + vst1.64 {A10}, [CTX]! + vstm CTX!, {A11,A12,A13,A14} + vst1.64 {A15}, [CTX]! + vstm CTX!, {A16,A17,A18,A19} + vst1.64 {A20}, [CTX]! + vstm CTX, {A21,A22,A23,A24} + + vpop {d8-d15} + bx lr +EPILOGUE(nettle_sha3_permute) diff --git a/arm/neon/sha512-compress.asm b/arm/neon/sha512-compress.asm new file mode 100644 index 00000000..ac2b4382 --- /dev/null +++ b/arm/neon/sha512-compress.asm @@ -0,0 +1,317 @@ +C nettle, low-level cryptographics library +C +C Copyright (C) 2013 Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. +C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. 
+C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +C MA 02111-1301, USA. + + .file "sha512-compress.asm" + .fpu neon + +define(<STATE>, <r0>) +define(<INPUT>, <r1>) +define(<K>, <r2>) +define(<COUNT>, <r3>) +define(<SHIFT>, <r12>) + +define(<SA>, <d0>) +define(<SB>, <d1>) +define(<SC>, <d2>) +define(<SD>, <d3>) +define(<SE>, <d4>) +define(<SF>, <d5>) +define(<SG>, <d6>) +define(<SH>, <d7>) +define(<QSAB>, <q0>) +define(<QSCD>, <q1>) +define(<QSEF>, <q2>) +define(<QSGH>, <q3>) + +C d8-d15 are callee-save +define(<DT0>, <d8>) +define(<DT1>, <d9>) +define(<QT01>, <q4>) +define(<DT2>, <d10>) +define(<DT3>, <d11>) +define(<QT23>, <q5>) +define(<DT4>, <d12>) +define(<DT5>, <d13>) +define(<QT45>, <q6>) + +C Used only when reading the input, can overlap with state +define(<DT6>, <d0>) +define(<DT7>, <d1>) +define(<QT67>, <q0>) + +define(<DW0>, <d16>) +define(<DW1>, <d17>) +define(<DW2>, <d18>) +define(<DW3>, <d19>) +define(<DW4>, <d20>) +define(<DW5>, <d21>) +define(<DW6>, <d22>) +define(<DW7>, <d23>) +define(<DW8>, <d24>) +define(<DW9>, <d25>) +define(<DW10>, <d26>) +define(<DW11>, <d27>) +define(<DW12>, <d28>) +define(<DW13>, <d29>) +define(<DW14>, <d30>) +define(<DW15>, <d31>) +define(<QW0001>, <q8>) +define(<QW0203>, <q9>) +define(<QW0405>, <q10>) +define(<QW0607>, <q11>) +define(<QW0809>, <q12>) +define(<QW1011>, <q13>) +define(<QW1213>, <q14>) +define(<QW1415>, <q15>) + +define(<EXPAND_ME>, <$1>) +define(<W>, <EXPAND_ME(<DW>eval(($1) % 16))>) + +C If x = W(i+14), y = w(i+1), we xor in parallel +C +C x << 45 y << 63 +C x >> 19 y >> 1 +C x << 3 y << 56 +C x >> 61 y >> 8 +C xor x >> 6 y >> 7 +C ----------------------------- +C DT0 DT1 +define(<EXPN>, < + vshl.i64 DT0, W($1+14), #45 + vshl.i64 DT1, W($1 + 1), #63 + vshr.u64 DT2, W($1+14), #19 + vshr.u64 DT3, W($1 + 1), #1 + vshl.i64 DT4, W($1+14), #3 + vshl.i64 DT5, W($1 + 1), #56 + veor.i64 QT01, QT01, QT23 + vshr.u64 DT2, W($1+14), #61 + vshr.u64 DT3, W($1 + 1), #8 + veor.i64 QT01, QT01, QT45 + vshr.u64 DT4, W($1+14), #6 + vshr.u64 DT5, W($1 + 1), #7 + veor.i64 QT01, QT01, QT23 + vadd.i64 W($1), W($1), W($1 + 9) + veor.i64 QT01, QT01, QT45 + vadd.i64 W($1), W($1), DT0 + vadd.i64 W($1), W($1), DT1 +>) + +C ROUND(A,B,C,D,E,F,G,H,i) +C +C H += S1(E) + Choice(E,F,G) + K + W +C D += H +C H += S0(A) + Majority(A,B,C) +C +C Where +C +C S1(E) = E<<<50 ^ E<<<46 ^ E<<<23 +C S0(A) = A<<<36 ^ A<<<30 ^ A<<<25 +C Choice (E, F, G) = G^(E&(F^G)) +C Majority (A,B,C) = (A&B) + (C&(A^B)) + +C Do S1 and S0 in parallel +C +C e << 50 a << 36 +C e >> 14 a >> 28 +C e << 46 a << 30 +C e >> 18 a >> 34 +C e << 23 a << 25 +C xor e >> 41 a >> 39 +C ---------------------------- +C DT0 DT1 +define(<ROUND>, < + vshl.i64 DT0, $5, #50 + vshl.i64 DT1, $1, #36 + vshr.u64 DT2, $5, #14 + vshr.u64 DT3, $1, #28 + vshl.i64 DT4, $5, #46 + vshl.i64 DT5, $1, #30 + veor QT01, QT01, QT23 + vshr.u64 DT2, $5, #18 + vshr.u64 DT3, $1, #34 + veor QT01, QT01, QT45 + vshl.i64 DT4, $5, #23 + vshl.i64 DT5, $1, #25 + veor QT01, QT01, QT23 + vshr.u64 DT2, $5, #41 + vshr.u64 DT3, $1, #39 + veor QT01, QT01, QT45 + veor DT4, $6, $7 + veor DT5, $1, $2 + vand DT4, DT4, $5 + vand DT5, DT5, $3 + veor DT4, DT4, $7 + veor QT01, QT01, QT23 + vand DT2, $1, $2 + vldr DT3, [K,#eval(8*$9)] + vadd.i64 $8, $8, W($9) + vadd.i64 QT01, QT01, QT45 + vadd.i64 $8, $8, DT3 + vadd.i64 $8, $8, DT0 + vadd.i64 DT1, DT1, DT2 + vadd.i64 $4, $4, 
$8 + vadd.i64 $8, $8, DT1 +>) + + C void + C _nettle_sha512_compress(uint64_t *state, const uint8_t *input, const uint64_t *k) + + .text + .align 2 + +PROLOGUE(_nettle_sha512_compress) + vpush {d8,d9,d10,d11,d12,d13} + + ands SHIFT, INPUT, #7 + and INPUT, INPUT, #-8 + vld1.8 {DT5}, [INPUT :64] + addne INPUT, INPUT, #8 + addeq SHIFT, SHIFT, #8 + lsl SHIFT, SHIFT, #3 + + C Put right shift in DT0 and DT1, aka QT01 + neg SHIFT, SHIFT + vmov.i32 DT0, #0 + vmov.32 DT0[0], SHIFT + vmov DT1, DT0 + C Put left shift in DT2 and DT3, aka QT23 + add SHIFT, SHIFT, #64 + vmov.i32 DT2, #0 + vmov.32 DT2[0], SHIFT + vmov DT3, DT2 + vshl.u64 DT5, DT5, DT0 + + C Set w[i] <-- w[i-1] >> RSHIFT + w[i] << LSHIFT + vld1.8 {W(0),W(1),W(2),W(3)}, [INPUT :64]! + vshl.u64 QT67, QW0001, QT01 C Right shift + vshl.u64 QW0001, QW0001, QT23 C Left shift + veor W(0), W(0), DT5 + veor W(1), W(1), DT6 + vrev64.8 QW0001, QW0001 + vshl.u64 QT45, QW0203, QT01 C Right shift + vshl.u64 QW0203, QW0203, QT23 C Left shift + veor W(2), W(2), DT7 + veor W(3), W(3), DT4 + vrev64.8 QW0203, QW0203 + + vld1.8 {W(4),W(5),W(6),W(7)}, [INPUT :64]! + vshl.u64 QT67, QW0405, QT01 C Right shift + vshl.u64 QW0405, QW0405, QT23 C Left shift + veor W(4), W(4), DT5 + veor W(5), W(5), DT6 + vrev64.8 QW0405, QW0405 + vshl.u64 QT45, QW0607, QT01 C Right shift + vshl.u64 QW0607, QW0607, QT23 C Left shift + veor W(6), W(6), DT7 + veor W(7), W(7), DT4 + vrev64.8 QW0607, QW0607 + + vld1.8 {W(8),W(9),W(10),W(11)}, [INPUT :64]! + vshl.u64 QT67, QW0809, QT01 C Right shift + vshl.u64 QW0809, QW0809, QT23 C Left shift + veor W(8), W(8), DT5 + veor W(9), W(9), DT6 + vrev64.8 QW0809, QW0809 + vshl.u64 QT45, QW1011, QT01 C Right shift + vshl.u64 QW1011, QW1011, QT23 C Left shift + veor W(10), W(10), DT7 + veor W(11), W(11), DT4 + vrev64.8 QW1011, QW1011 + + vld1.8 {W(12),W(13),W(14),W(15)}, [INPUT :64]! 
+ vshl.u64 QT67, QW1213, QT01 C Right shift + vshl.u64 QW1213, QW1213, QT23 C Left shift + veor W(12), W(12), DT5 + veor W(13), W(13), DT6 + vrev64.8 QW1213, QW1213 + vshl.u64 QT45, QW1415, QT01 C Right shift + vshl.u64 QW1415, QW1415, QT23 C Left shift + veor W(14), W(14), DT7 + veor W(15), W(15), DT4 + vrev64.8 QW1415, QW1415 + + vldm STATE, {SA,SB,SC,SD,SE,SF,SG,SH} + + ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 0) + ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 1) + ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 2) + ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 3) + ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 4) + ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 5) + ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 6) + ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 7) + + ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 8) + ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 9) + ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 10) + ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 11) + ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 12) + ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 13) + ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 14) + ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 15) + + add K, K, #128 + + mov COUNT, #4 +.Loop: + + EXPN( 0) ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 0) + EXPN( 1) ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 1) + EXPN( 2) ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 2) + EXPN( 3) ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 3) + EXPN( 4) ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 4) + EXPN( 5) ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 5) + EXPN( 6) ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 6) + EXPN( 7) ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 7) + EXPN( 8) ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 8) + EXPN( 9) ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 9) + EXPN(10) ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 10) + EXPN(11) ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 11) + EXPN(12) ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 12) + EXPN(13) ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 13) + EXPN(14) ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 14) + subs COUNT, COUNT, #1 + EXPN(15) ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 15) + add K, K, #128 + bne .Loop + + vld1.64 {DW0, DW1, DW2, DW3}, [STATE] + vadd.i64 QSAB, QSAB, QW0001 + vadd.i64 QSCD, QSCD, QW0203 + vst1.64 {SA,SB,SC,SD}, [STATE]! + vld1.64 {DW0, DW1, DW2, DW3}, [STATE] + vadd.i64 QSEF, QSEF, QW0001 + vadd.i64 QSGH, QSGH, QW0203 + vst1.64 {SE,SF,SG,SH}, [STATE]! + + vpop {d8,d9,d10,d11,d12,d13} + bx lr +EPILOGUE(_nettle_sha512_compress) + +divert(-1) +define shastate +p/x $d0.u64 +p/x $d1.u64 +p/x $d2.u64 +p/x $d3.u64 +p/x $d4.u64 +p/x $d5.u64 +p/x $d6.u64 +p/x $d7.u64 +end diff --git a/arm/neon/umac-nh-n.asm b/arm/neon/umac-nh-n.asm new file mode 100644 index 00000000..4ae876b5 --- /dev/null +++ b/arm/neon/umac-nh-n.asm @@ -0,0 +1,298 @@ +C nettle, low-level cryptographics library +C +C Copyright (C) 2013 Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. +C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. +C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +C MA 02111-1301, USA. 
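As a plain-C reference for what this file computes (an editorial sketch, not nettle's implementation; the helper name and the requirement that the length be a multiple of 32 bytes are assumptions of the sketch): one NH pass in the UMAC sense (cf. RFC 4418) adds key words to message words modulo 2^32, pairs each word with the word four positions later, and accumulates the 64-bit products. _umac_nh_n evaluates 2, 3 or 4 such passes in one sweep over the message, with the key offset by four 32-bit words for each successive pass; the subkey permutation and lane duplication below arrange the NEON registers so that two passes are accumulated in parallel.

    #include <stdint.h>
    #include <stddef.h>

    /* One NH pass over `length' bytes (assumed to be a multiple of 32).
       msg and key are 32-bit words; the additions wrap modulo 2^32. */
    static uint64_t
    nh_pass(const uint32_t *key, size_t length, const uint32_t *msg)
    {
      uint64_t y = 0;
      for (size_t i = 0; i < length / 4; i += 8)      /* 32 bytes per step */
        for (size_t j = 0; j < 4; j++)
          y += (uint64_t) (uint32_t) (msg[i + j]     + key[i + j])
             * (uint32_t) (msg[i + j + 4] + key[i + j + 4]);
      return y;
    }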
+ + .file "umac-nh.asm" + .fpu neon + +define(<OUT>, <r0>) +define(<ITERS>, <r1>) +define(<KEY>, <r2>) +define(<LENGTH>, <r3>) +define(<MSG>, <r12>) +define(<SHIFT>, <r14>) + +define(<QA>, <q0>) +define(<QB>, <q1>) +define(<QY0>, <q3>) C Accumulates for the first two operations. +define(<DM>, <d4>) +define(<QY1>, <q4>) C Used for 3 and 4 iterations. +define(<QC>, <q5>) +define(<QD>, <q6>) +define(<QLEFT>, <q8>) +define(<QRIGHT>, <q9>) +define(<QT0>, <q10>) +define(<QT1>, <q11>) +define(<QT2>, <q12>) +define(<QK0>, <q13>) +define(<QK1>, <q14>) +define(<QK2>, <q15>) + +C FIXME: Try permuting subkeys using vld4, vzip or similar. + + .text + .align 3 + +PROLOGUE(_nettle_umac_nh_n) + ldr MSG, [sp] + str lr, [sp, #-4]! + + C Setup for 64-bit aligned reads + ands SHIFT, MSG, #7 + and MSG, MSG, #-8 + vld1.8 {DM}, [MSG :64] + addne MSG, MSG, #8 + addeq SHIFT, SHIFT, #8 + + C FIXME: Combine as rsb ? + lsl SHIFT, SHIFT, #3 + neg SHIFT, SHIFT + + C Right shift in QRIGHT (both halves) + vmov.i32 D0REG(QRIGHT)[0], SHIFT + vmov.32 D1REG(QRIGHT), D0REG(QRIGHT) + add SHIFT, SHIFT, #64 + + vmov.i32 D0REG(QLEFT)[0], SHIFT + vmov.32 D1REG(QLEFT), D0REG(QLEFT) + cmp r1, #3 + vmov.i64 QY0, #0 + + vshl.u64 DM, DM, D0REG(QRIGHT) + bcc .Lnh2 + beq .Lnh3 + +.Lnh4: + C Permute key words, so we in each iteration have them in order + C + C P0: [0, 4,1, 5] P1: [ 2, 6, 3, 7] P2: [ 4, 8, 5, 9] P3: [ 6,10, 7,11] + C P4: [8,12,9,13] P5: [10,14,11,15] P6: [12,16,13,17] P7: [14,18,15,19] + C + C Also arrange the message words, so we get them as + C M0: [0,0,1,1] M1: [ 2, 2, 3, 3] M2: [ 4, 4, 5, 5] M3: [ 6, 6, 7, 7] + C M4: [8,8,9,9] M5: [10,10,11,11] M6: [12,12,13,13] M7: [14,14,15,15] + C + C Then, accumulate Y0 (first two "iters") using + C + C Y0 += (M0+P0) * (M2+P2) + (M1+P1) * (M3+P3) + C Y1 += (M0+P4) * (M2+P6) + (M1+P5) * (M3+P7) + C + C Next iteration is then + C + C Y0 += (M4+P4) * (M6+P6) + (M5+P5) * (M7 + P7) + C Y1 += (M4+P6) * (M6+P8) + (M5+P7) * (M7 + P11) + C + C So we can reuse P4, P5, P6, P7 from the previous iteration. + + C How to for in registers? We need 4 Q regs for P0-P3, and one + C more for the last read key. We need at least two regiters + C for the message (QA and QB, more if we want to expand only + C once). For the Y0 update, we can let the factors overwrite + C P0-P3, and for the Y1 update, we can overwrite M0-M3. + + vpush {q4,q5,q6} + vld1.32 {QK0,QK1}, [KEY]! + vld1.32 {QK2}, [KEY]! + vmov QT0, QK1 + vmov QT1, QK2 + + C Permute keys. QK2 us untouched, permuted subkeys put in QK0,QK1,QT0,QT1 + vtrn.32 QK0, QK1 C Gives us [0, 4, 2, 6] and [1, 5, 3, 7] + vswp D1REG(QK0), D0REG(QK1) C Gives us [0, 4, 1, 5] and [2, 6, 3, 7] + vtrn.32 QT0, QT1 C Gives us [4,8,6,10] and [5 ,9,7,11] + vswp D1REG(QT0), D0REG(QT1) C Gives us [4,8,5, 9] and [6,10,7,11] + + vmov.i64 QY1, #0 +.Loop4: + C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT + vld1.8 {QA, QB}, [MSG :64]! 
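+	C The message need not be 64-bit aligned: combine the left-shifted
+	C words with the spill-over kept in DM, and save the new spill-over
+	C for the next 32-byte block.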
+ vshl.u64 QC, QA, QRIGHT + vshl.u64 QD, QB, QRIGHT + vshl.u64 QA, QA, QLEFT + vshl.u64 QB, QB, QLEFT + veor D0REG(QA), D0REG(QA), DM + veor D1REG(QA), D1REG(QA), D0REG(QC) + veor D0REG(QB), D0REG(QB), D1REG(QC) + veor D1REG(QB), D1REG(QB), D0REG(QD) + vmov DM, D1REG(QD) + + C Explode message (too bad there's no vadd with scalar) + vdup.32 D1REG(QD), D1REG(QB)[1] + vdup.32 D0REG(QD), D1REG(QB)[0] + vdup.32 D1REG(QC), D0REG(QB)[1] + vdup.32 D0REG(QC), D0REG(QB)[0] + vdup.32 D1REG(QB), D1REG(QA)[1] + vdup.32 D0REG(QB), D1REG(QA)[0] + vdup.32 D1REG(QA), D0REG(QA)[1] + vdup.32 D0REG(QA), D0REG(QA)[0] + + vadd.i32 QK0, QK0, QA + vadd.i32 QK1, QK1, QB + vadd.i32 QT0, QT0, QC + vadd.i32 QT1, QT1, QD + + vmlal.u32 QY0, D0REG(QK0), D0REG(QT0) + vmlal.u32 QY0, D1REG(QK0), D1REG(QT0) + vmlal.u32 QY0, D0REG(QK1), D0REG(QT1) + vmlal.u32 QY0, D1REG(QK1), D1REG(QT1) + + C Next 4 subkeys + vld1.32 {QT0,QT1}, [KEY]! + vmov QK0, QK2 + vmov QK1, QT0 + vmov QK2, QT1 C Save + vtrn.32 QK0, QK1 C Gives us [8,12,10,14] and [9,13,11,15] + vswp D1REG(QK0), D0REG(QK1) C Gives us [8,12,9,13] and [10,14,11,15] + vtrn.32 QT0, QT1 C Gives us [12,16,14,18] and [13,17,15,19] + vswp D1REG(QT0), D0REG(QT1) C Gives us [12,16,13,17] and [14,18,15,19] + + vadd.i32 QA, QA, QK0 + vadd.i32 QB, QB, QK1 + vadd.i32 QC, QC, QT0 + vadd.i32 QD, QD, QT1 + + subs LENGTH, LENGTH, #32 + + vmlal.u32 QY1, D0REG(QA), D0REG(QC) + vmlal.u32 QY1, D1REG(QA), D1REG(QC) + vmlal.u32 QY1, D0REG(QB), D0REG(QD) + vmlal.u32 QY1, D1REG(QB), D1REG(QD) + + bhi .Loop4 + + vst1.64 {QY0, QY1}, [OUT] + + vpop {q4,q5,q6} + + ldr pc, [sp], #+4 + +.Lnh3: + vpush {q4} + vld1.32 {QK0,QK1}, [KEY]! + vmov.i64 QY1, #0 +.Loop3: + C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT + vld1.8 {QA, QB}, [MSG :64]! + vshl.u64 QT0, QA, QRIGHT + vshl.u64 QT1, QB, QRIGHT + vshl.u64 QA, QA, QLEFT + vshl.u64 QB, QB, QLEFT + veor D0REG(QA), D0REG(QA), DM + veor D1REG(QA), D1REG(QA), D0REG(QT0) + veor D0REG(QB), D0REG(QB), D1REG(QT0) + veor D1REG(QB), D1REG(QB), D0REG(QT1) + vmov DM, D1REG(QT1) + + vld1.32 {QK2}, [KEY]! + C Construct factors, with low half corresponding to first iteration, + C and high half corresponding to the second iteration. + vmov QT0, QK1 + vtrn.32 QK0, QT0 C Gives us [0, 4, 2, 6] and [1, 5, 3, 7] + vswp D1REG(QK0), D0REG(QT0) C Gives us [0, 4, 1, 5] and [2, 6, 3, 7] + vdup.32 D0REG(QT1), D0REG(QA)[0] + vdup.32 D1REG(QT1), D0REG(QA)[1] + vadd.i32 QT1, QT1, QK0 + + vmov QK0, QK2 C Save for next iteration + vtrn.32 QK1, QK2 C Gives us [4, 8, 2, 1] and [1, 5, 3, 7] + vswp D1REG(QK1), D0REG(QK2) C Gives us [4, 8, 1, 5] and [2, 1, 3, 7] + + vdup.32 D0REG(QT2), D0REG(QB)[0] + vdup.32 D1REG(QT2), D0REG(QB)[1] + vadd.i32 QK1, QK1, QT2 + vmlal.u32 QY0, D0REG(QT1), D0REG(QK1) + vmlal.u32 QY0, D1REG(QT1), D1REG(QK1) + + vdup.32 D0REG(QT1), D1REG(QA)[0] + vdup.32 D1REG(QT1), D1REG(QA)[1] + vadd.i32 QT0, QT0, QT1 + vdup.32 D0REG(QT1), D1REG(QB)[0] + vdup.32 D1REG(QT1), D1REG(QB)[1] + vadd.i32 QK2, QK2, QT1 + + vmlal.u32 QY0, D0REG(QT0), D0REG(QK2) + vmlal.u32 QY0, D1REG(QT0), D1REG(QK2) + + vld1.32 {QK1}, [KEY]! + vadd.i32 QA, QA, QK0 + vadd.i32 QB, QB, QK1 + subs LENGTH, LENGTH, #32 + vmlal.u32 QY1, D0REG(QA), D0REG(QB) + vmlal.u32 QY1, D1REG(QA), D1REG(QB) + bhi .Loop3 + + vadd.i64 D0REG(QY1), D0REG(QY1), D1REG(QY1) + vst1.64 {D0REG(QY0), D1REG(QY0), D0REG(QY1)}, [OUT] + + vpop {q4} + + ldr pc, [sp], #+4 + +.Lnh2: + vld1.32 {QK0}, [KEY]! +.Loop2: + C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT + vld1.8 {QA, QB}, [MSG :64]! 
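+	C Unaligned-read fixup, as in .Loop4 and .Loop3 above.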
+ vshl.u64 QT0, QA, QRIGHT + vshl.u64 QT1, QB, QRIGHT + vshl.u64 QA, QA, QLEFT + vshl.u64 QB, QB, QLEFT + veor D0REG(QA), D0REG(QA), DM + veor D1REG(QA), D1REG(QA), D0REG(QT0) + veor D0REG(QB), D0REG(QB), D1REG(QT0) + veor D1REG(QB), D1REG(QB), D0REG(QT1) + vmov DM, D1REG(QT1) + + vld1.32 {QK1,QK2}, [KEY]! + C Construct factors, with low half corresponding to first iteration, + C and high half corresponding to the second iteration. + vmov QT0, QK1 + vtrn.32 QK0, QT0 C Gives us [0, 4, 2, 6] and [1, 5, 3, 7] + vswp D1REG(QK0), D0REG(QT0) C Gives us [0, 4, 1, 5] and [2, 6, 3, 7] + vdup.32 D0REG(QT1), D0REG(QA)[0] + vdup.32 D1REG(QT1), D0REG(QA)[1] + vadd.i32 QT1, QT1, QK0 + + vmov QK0, QK2 C Save for next iteration + vtrn.32 QK1, QK2 C Gives us [4, 8, 6, 10] and [5, 9, 7, 11] + vswp D1REG(QK1), D0REG(QK2) C Gives us [4, 8, 5, 9] and [6, 10, 7, 11] + + vdup.32 D0REG(QT2), D0REG(QB)[0] + vdup.32 D1REG(QT2), D0REG(QB)[1] + vadd.i32 QK1, QK1, QT2 + vmlal.u32 QY0, D0REG(QT1), D0REG(QK1) + vmlal.u32 QY0, D1REG(QT1), D1REG(QK1) + + vdup.32 D0REG(QT1), D1REG(QA)[0] + vdup.32 D1REG(QT1), D1REG(QA)[1] + vadd.i32 QT0, QT0, QT1 + vdup.32 D0REG(QT1), D1REG(QB)[0] + vdup.32 D1REG(QT1), D1REG(QB)[1] + vadd.i32 QK2, QK2, QT1 + + subs LENGTH, LENGTH, #32 + + vmlal.u32 QY0, D0REG(QT0), D0REG(QK2) + vmlal.u32 QY0, D1REG(QT0), D1REG(QK2) + + bhi .Loop2 + vst1.64 {QY0}, [OUT] + +.Lend: + ldr pc, [sp], #+4 +EPILOGUE(_nettle_umac_nh_n) diff --git a/arm/neon/umac-nh.asm b/arm/neon/umac-nh.asm new file mode 100644 index 00000000..87cb86d0 --- /dev/null +++ b/arm/neon/umac-nh.asm @@ -0,0 +1,89 @@ +C nettle, low-level cryptographics library +C +C Copyright (C) 2013 Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. +C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. +C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +C MA 02111-1301, USA. + + .file "umac-nh.asm" + .fpu neon + +define(<KEY>, <r0>) +define(<LENGTH>, <r1>) +define(<MSG>, <r2>) +define(<SHIFT>, <r3>) + +define(<QA>, <q0>) +define(<QB>, <q1>) +define(<DM>, <d16>) +define(<QLEFT>, <q9>) +define(<QRIGHT>, <q10>) +define(<QY>, <q11>) +define(<QT0>, <q12>) +define(<QT1>, <q13>) +define(<QK0>, <q14>) +define(<QK1>, <q15>) + + .text + .align 3 + +PROLOGUE(_nettle_umac_nh) + C Setup for 64-bit aligned reads + ands SHIFT, MSG, #7 + and MSG, MSG, #-8 + vld1.8 {DM}, [MSG :64] + addne MSG, MSG, #8 + addeq SHIFT, SHIFT, #8 + + C FIXME: Combine as rsb ? + lsl SHIFT, SHIFT, #3 + neg SHIFT, SHIFT + + C Right shift in QRIGHT (both halves) + vmov.i32 D0REG(QRIGHT)[0], SHIFT + vmov.32 D1REG(QRIGHT), D0REG(QRIGHT) + add SHIFT, SHIFT, #64 + + vmov.i32 D0REG(QLEFT)[0], SHIFT + vmov.32 D1REG(QLEFT), D0REG(QLEFT) + + vmov.i64 QY, #0 + + vshl.u64 DM, DM, D0REG(QRIGHT) +.Loop: + C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT + vld1.8 {QA, QB}, [MSG :64]! 
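+	C Unaligned-read fixup, using the same technique as umac-nh-n.asm.
+	C Each 32-byte block then contributes the products
+	C (m[i]+k[i]) * (m[i+4]+k[i+4]), i = 0..3, accumulated in the two
+	C 64-bit halves of QY.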
+ vshl.u64 QT0, QA, QRIGHT + vshl.u64 QT1, QB, QRIGHT + vshl.u64 QA, QA, QLEFT + vshl.u64 QB, QB, QLEFT + veor D0REG(QA), D0REG(QA), DM + veor D1REG(QA), D1REG(QA), D0REG(QT0) + veor D0REG(QB), D0REG(QB), D1REG(QT0) + veor D1REG(QB), D1REG(QB), D0REG(QT1) + vmov DM, D1REG(QT1) + + vld1.i32 {QK0, QK1}, [KEY]! + vadd.i32 QA, QA, QK0 + vadd.i32 QB, QB, QK1 + subs LENGTH, LENGTH, #32 + vmlal.u32 QY, D0REG(QA), D0REG(QB) + vmlal.u32 QY, D1REG(QA), D1REG(QB) + bhi .Loop + + vadd.i64 D0REG(QY), D0REG(QY), D1REG(QY) + vmov r0, r1, D0REG(QY) + bx lr +EPILOGUE(_nettle_umac_nh) diff --git a/arm/sha1-compress.asm b/arm/sha1-compress.asm new file mode 100644 index 00000000..69c30e42 --- /dev/null +++ b/arm/sha1-compress.asm @@ -0,0 +1,234 @@ +C nettle, low-level cryptographics library +C +C Copyright (C) 2013 Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. +C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. +C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +C MA 02111-1301, USA. + + .file "sha1-compress.asm" + +define(<STATE>, <r0>) +define(<INPUT>, <r1>) +define(<SA>, <r2>) +define(<SB>, <r3>) +define(<SC>, <r4>) +define(<SD>, <r5>) +define(<SE>, <r6>) +define(<T0>, <r7>) +define(<SHIFT>, <r8>) +define(<WPREV>, <r10>) +define(<W>, <r12>) +define(<K>, <lr>) + +C FIXME: Could avoid a mov with even and odd variants. +define(<LOAD>, < + ldr T0, [INPUT], #+4 + sel W, WPREV, T0 + ror W, W, SHIFT + mov WPREV, T0 + rev W, W + str W, [SP,#eval(4*$1)] +>) +define(<EXPN>, < + ldr W, [sp, #+eval(4*$1)] + ldr T0, [sp, #+eval(4*(($1 + 2) % 16))] + eor W, W, T0 + ldr T0, [sp, #+eval(4*(($1 + 8) % 16))] + eor W, W, T0 + ldr T0, [sp, #+eval(4*(($1 + 13) % 16))] + eor W, W, T0 + ror W, W, #31 + str W, [sp, #+eval(4*$1)] +>) + +C F1(B,C,D) = D^(B&(C^D)) +C ROUND1(A,B,C,D,E) +define(<ROUND1>, < + eor T0, $3, $4 + add $5, $5, K + and T0, T0, $2 + add $5, $5, $1, ror #27 + eor T0, T0, $4 + add $5, $5, W + ror $2, $2, #2 + add $5, $5, T0 +>) +C F2(B,C,D) = B^C^D +define(<ROUND2>, < + eor T0, $2, $4 + add $5, $5, K + eor T0, T0, $3 + add $5, $5, $1, ror #27 + add $5, $5, W + ror $2, $2, #2 + add $5, $5, T0 +>) +C F3(B,C,D) = (B&C) | (D & (B|C)) = (B & (C ^ D)) + (C & D) +define(<ROUND3>, < + eor T0, $3, $4 + add $5, $5, K + and T0, T0, $2 + add $5, $5, $1, ror #27 + add $5, $5, T0 + add $5, $5, W + and T0, $3, $4 + ror $2, $2, #2 + add $5, $5, T0 +>) + C void _nettle_sha1_compress(uint32_t *state, const uint8_t *input) + + .text + .align 2 +.LK1: + .int 0x5A827999 +.LK2: + .int 0x6ED9EBA1 +.LK3: + .int 0x8F1BBCDC + +PROLOGUE(_nettle_sha1_compress) + push {r4,r5,r6,r7,r8,r10,lr} + sub sp, sp, #64 + + C Sets SHIFT to 8*low bits of input pointer. 
Sets up GE flags + C as follows, corresponding to bytes to be used from WPREV + C SHIFT 0 8 16 24 + C CPSR.GE 0000 1110 1100 1000 + ands SHIFT, INPUT, #3 + and INPUT, INPUT, $-4 + ldr WPREV, [INPUT] + addne INPUT, INPUT, #4 C Unaligned input + lsl SHIFT, SHIFT, #3 + mov T0, #0 + movne T0, #-1 + lsl W, T0, SHIFT + uadd8 T0, T0, W C Sets APSR.GE bits + + ldr K, .LK1 + ldm STATE, {SA,SB,SC,SD,SE} + + LOAD( 0) ROUND1(SA, SB, SC, SD, SE) + LOAD( 1) ROUND1(SE, SA, SB, SC, SD) + LOAD( 2) ROUND1(SD, SE, SA, SB, SC) + LOAD( 3) ROUND1(SC, SD, SE, SA, SB) + LOAD( 4) ROUND1(SB, SC, SD, SE, SA) + + LOAD( 5) ROUND1(SA, SB, SC, SD, SE) + LOAD( 6) ROUND1(SE, SA, SB, SC, SD) + LOAD( 7) ROUND1(SD, SE, SA, SB, SC) + LOAD( 8) ROUND1(SC, SD, SE, SA, SB) + LOAD( 9) ROUND1(SB, SC, SD, SE, SA) + + LOAD(10) ROUND1(SA, SB, SC, SD, SE) + LOAD(11) ROUND1(SE, SA, SB, SC, SD) + LOAD(12) ROUND1(SD, SE, SA, SB, SC) + LOAD(13) ROUND1(SC, SD, SE, SA, SB) + LOAD(14) ROUND1(SB, SC, SD, SE, SA) + + LOAD(15) ROUND1(SA, SB, SC, SD, SE) + EXPN( 0) ROUND1(SE, SA, SB, SC, SD) + EXPN( 1) ROUND1(SD, SE, SA, SB, SC) + EXPN( 2) ROUND1(SC, SD, SE, SA, SB) + EXPN( 3) ROUND1(SB, SC, SD, SE, SA) + + ldr K, .LK2 + EXPN( 4) ROUND2(SA, SB, SC, SD, SE) + EXPN( 5) ROUND2(SE, SA, SB, SC, SD) + EXPN( 6) ROUND2(SD, SE, SA, SB, SC) + EXPN( 7) ROUND2(SC, SD, SE, SA, SB) + EXPN( 8) ROUND2(SB, SC, SD, SE, SA) + + EXPN( 9) ROUND2(SA, SB, SC, SD, SE) + EXPN(10) ROUND2(SE, SA, SB, SC, SD) + EXPN(11) ROUND2(SD, SE, SA, SB, SC) + EXPN(12) ROUND2(SC, SD, SE, SA, SB) + EXPN(13) ROUND2(SB, SC, SD, SE, SA) + + EXPN(14) ROUND2(SA, SB, SC, SD, SE) + EXPN(15) ROUND2(SE, SA, SB, SC, SD) + EXPN( 0) ROUND2(SD, SE, SA, SB, SC) + EXPN( 1) ROUND2(SC, SD, SE, SA, SB) + EXPN( 2) ROUND2(SB, SC, SD, SE, SA) + + EXPN( 3) ROUND2(SA, SB, SC, SD, SE) + EXPN( 4) ROUND2(SE, SA, SB, SC, SD) + EXPN( 5) ROUND2(SD, SE, SA, SB, SC) + EXPN( 6) ROUND2(SC, SD, SE, SA, SB) + EXPN( 7) ROUND2(SB, SC, SD, SE, SA) + + ldr K, .LK3 + EXPN( 8) ROUND3(SA, SB, SC, SD, SE) + EXPN( 9) ROUND3(SE, SA, SB, SC, SD) + EXPN(10) ROUND3(SD, SE, SA, SB, SC) + EXPN(11) ROUND3(SC, SD, SE, SA, SB) + EXPN(12) ROUND3(SB, SC, SD, SE, SA) + + EXPN(13) ROUND3(SA, SB, SC, SD, SE) + EXPN(14) ROUND3(SE, SA, SB, SC, SD) + EXPN(15) ROUND3(SD, SE, SA, SB, SC) + EXPN( 0) ROUND3(SC, SD, SE, SA, SB) + EXPN( 1) ROUND3(SB, SC, SD, SE, SA) + + EXPN( 2) ROUND3(SA, SB, SC, SD, SE) + EXPN( 3) ROUND3(SE, SA, SB, SC, SD) + EXPN( 4) ROUND3(SD, SE, SA, SB, SC) + EXPN( 5) ROUND3(SC, SD, SE, SA, SB) + EXPN( 6) ROUND3(SB, SC, SD, SE, SA) + + EXPN( 7) ROUND3(SA, SB, SC, SD, SE) + EXPN( 8) ROUND3(SE, SA, SB, SC, SD) + EXPN( 9) ROUND3(SD, SE, SA, SB, SC) + EXPN(10) ROUND3(SC, SD, SE, SA, SB) + EXPN(11) ROUND3(SB, SC, SD, SE, SA) + + ldr K, .LK4 + EXPN(12) ROUND2(SA, SB, SC, SD, SE) + EXPN(13) ROUND2(SE, SA, SB, SC, SD) + EXPN(14) ROUND2(SD, SE, SA, SB, SC) + EXPN(15) ROUND2(SC, SD, SE, SA, SB) + EXPN( 0) ROUND2(SB, SC, SD, SE, SA) + + EXPN( 1) ROUND2(SA, SB, SC, SD, SE) + EXPN( 2) ROUND2(SE, SA, SB, SC, SD) + EXPN( 3) ROUND2(SD, SE, SA, SB, SC) + EXPN( 4) ROUND2(SC, SD, SE, SA, SB) + EXPN( 5) ROUND2(SB, SC, SD, SE, SA) + + EXPN( 6) ROUND2(SA, SB, SC, SD, SE) + EXPN( 7) ROUND2(SE, SA, SB, SC, SD) + EXPN( 8) ROUND2(SD, SE, SA, SB, SC) + EXPN( 9) ROUND2(SC, SD, SE, SA, SB) + EXPN(10) ROUND2(SB, SC, SD, SE, SA) + + EXPN(11) ROUND2(SA, SB, SC, SD, SE) + EXPN(12) ROUND2(SE, SA, SB, SC, SD) + EXPN(13) ROUND2(SD, SE, SA, SB, SC) + EXPN(14) ROUND2(SC, SD, SE, SA, SB) + EXPN(15) ROUND2(SB, SC, SD, SE, SA) + + C Use registers we no longer need. 
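+	C The five old state words land in INPUT, T0, SHIFT, W and K and are
+	C added into SA-SE before the final store.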
+ ldm STATE, {INPUT,T0,SHIFT,W,K} + add SA, SA, INPUT + add SB, SB, T0 + add SC, SC, SHIFT + add SD, SD, W + add SE, SE, K + add sp, sp, #64 + stm STATE, {SA,SB,SC,SD,SE} + pop {r4,r5,r6,r7,r8,r10,pc} +EPILOGUE(_nettle_sha1_compress) + +.LK4: + .int 0xCA62C1D6 diff --git a/arm/sha256-compress.asm b/arm/sha256-compress.asm new file mode 100644 index 00000000..c2aaabd2 --- /dev/null +++ b/arm/sha256-compress.asm @@ -0,0 +1,204 @@ +C nettle, low-level cryptographics library +C +C Copyright (C) 2013 Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. +C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. +C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +C MA 02111-1301, USA. + + .file "sha256-compress.asm" + +define(<STATE>, <r0>) +define(<INPUT>, <r1>) +define(<K>, <r2>) +define(<SA>, <r3>) +define(<SB>, <r4>) +define(<SC>, <r5>) +define(<SD>, <r6>) +define(<SE>, <r7>) +define(<SF>, <r8>) +define(<SG>, <r10>) +define(<SH>, <r11>) +define(<T0>, <r12>) +define(<T1>, <r1>) C Overlap INPUT +define(<COUNT>, <r0>) C Overlap STATE +define(<W>, <r14>) + +C Used for data load +define(<I0>, <r3>) +define(<I1>, <r4>) +define(<I2>, <r5>) +define(<I3>, <r6>) +define(<I4>, <r7>) +define(<DST>, <r8>) +define(<SHIFT>, <r10>) +define(<ILEFT>, <r11>) + +define(<EXPN>, < + ldr W, [sp, #+eval(4*$1)] + ldr T0, [sp, #+eval(4*(($1 + 14) % 16))] + ror T1, T0, #17 + eor T1, T1, T0, ror #19 + eor T1, T1, T0, lsr #10 + add W, W, T1 + ldr T0, [sp, #+eval(4*(($1 + 9) % 16))] + add W, W, T0 + ldr T0, [sp, #+eval(4*(($1 + 1) % 16))] + ror T1, T0, #7 + eor T1, T1, T0, ror #18 + eor T1, T1, T0, lsr #3 + add W, W, T1 + str W, [sp, #+eval(4*$1)] +>) + +C ROUND(A,B,C,D,E,F,G,H) +C +C H += S1(E) + Choice(E,F,G) + K + W +C D += H +C H += S0(A) + Majority(A,B,C) +C +C Where +C +C S1(E) = E<<<26 ^ E<<<21 ^ E<<<7 +C S0(A) = A<<<30 ^ A<<<19 ^ A<<<10 +C Choice (E, F, G) = G^(E&(F^G)) +C Majority (A,B,C) = (A&B) + (C&(A^B)) + +define(<ROUND>, < + ror T0, $5, #6 + eor T0, T0, $5, ror #11 + eor T0, T0, $5, ror #25 + add $8, $8, T0 + eor T0, $6, $7 + and T0, T0, $5 + eor T0, T0, $7 + add $8,$8, T0 + ldr T0, [K], #+4 + add $8, $8, W + add $8, $8, T0 + add $4, $4, $8 + ror T0, $1, #2 + eor T0, T0, $1, ror #13 + eor T0, T0, $1, ror #22 + add $8, $8, T0 + and T0, $1, $2 + add $8, $8, T0 + eor T0, $1, $2 + and T0, T0, $3 + add $8, $8, T0 +>) + +define(<NOEXPN>, < + ldr W, [sp, + $1] + add $1, $1, #4 +>) + C void + C _nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k) + + .text + .align 2 + +PROLOGUE(_nettle_sha256_compress) + push {r4,r5,r6,r7,r8,r10,r11,r14} + sub sp, sp, #68 + str STATE, [sp, +#64] + + C Load data up front, since we don't have enough registers + C to load and shift on-the-fly + ands SHIFT, INPUT, #3 + and INPUT, INPUT, $-4 + ldr I0, [INPUT] + addne INPUT, INPUT, #4 + lsl SHIFT, SHIFT, #3 + mov T0, #0 + movne T0, #-1 + lsl I1, T0, SHIFT + uadd8 T0, T0, I1 C Sets APSR.GE bits + + mov 
DST, sp + mov ILEFT, #4 +.Lcopy: + ldm INPUT!, {I1,I2,I3,I4} + sel I0, I0, I1 + ror I0, I0, SHIFT + rev I0, I0 + sel I1, I1, I2 + ror I1, I1, SHIFT + rev I1, I1 + sel I2, I2, I3 + ror I2, I2, SHIFT + rev I2, I2 + sel I3, I3, I4 + ror I3, I3, SHIFT + rev I3, I3 + subs ILEFT, ILEFT, #1 + stm DST!, {I0,I1,I2,I3} + mov I0, I4 + bne .Lcopy + + ldm STATE, {SA,SB,SC,SD,SE,SF,SG,SH} + + mov COUNT,#0 + +.Loop1: + NOEXPN(COUNT) ROUND(SA,SB,SC,SD,SE,SF,SG,SH) + NOEXPN(COUNT) ROUND(SH,SA,SB,SC,SD,SE,SF,SG) + NOEXPN(COUNT) ROUND(SG,SH,SA,SB,SC,SD,SE,SF) + NOEXPN(COUNT) ROUND(SF,SG,SH,SA,SB,SC,SD,SE) + NOEXPN(COUNT) ROUND(SE,SF,SG,SH,SA,SB,SC,SD) + NOEXPN(COUNT) ROUND(SD,SE,SF,SG,SH,SA,SB,SC) + NOEXPN(COUNT) ROUND(SC,SD,SE,SF,SG,SH,SA,SB) + NOEXPN(COUNT) ROUND(SB,SC,SD,SE,SF,SG,SH,SA) + cmp COUNT,#64 + bne .Loop1 + + mov COUNT, #3 +.Loop2: + + EXPN( 0) ROUND(SA,SB,SC,SD,SE,SF,SG,SH) + EXPN( 1) ROUND(SH,SA,SB,SC,SD,SE,SF,SG) + EXPN( 2) ROUND(SG,SH,SA,SB,SC,SD,SE,SF) + EXPN( 3) ROUND(SF,SG,SH,SA,SB,SC,SD,SE) + EXPN( 4) ROUND(SE,SF,SG,SH,SA,SB,SC,SD) + EXPN( 5) ROUND(SD,SE,SF,SG,SH,SA,SB,SC) + EXPN( 6) ROUND(SC,SD,SE,SF,SG,SH,SA,SB) + EXPN( 7) ROUND(SB,SC,SD,SE,SF,SG,SH,SA) + EXPN( 8) ROUND(SA,SB,SC,SD,SE,SF,SG,SH) + EXPN( 9) ROUND(SH,SA,SB,SC,SD,SE,SF,SG) + EXPN(10) ROUND(SG,SH,SA,SB,SC,SD,SE,SF) + EXPN(11) ROUND(SF,SG,SH,SA,SB,SC,SD,SE) + EXPN(12) ROUND(SE,SF,SG,SH,SA,SB,SC,SD) + EXPN(13) ROUND(SD,SE,SF,SG,SH,SA,SB,SC) + EXPN(14) ROUND(SC,SD,SE,SF,SG,SH,SA,SB) + subs COUNT, COUNT, #1 + EXPN(15) ROUND(SB,SC,SD,SE,SF,SG,SH,SA) + bne .Loop2 + + ldr STATE, [sp, +#64] + C No longer needed registers + ldm STATE, {r1,r2,r12,r14} + add SA, SA, r1 + add SB, SB, r2 + add SC, SC, r12 + add SD, SD, r14 + stm STATE!, {SA,SB,SC,SD} + ldm STATE, {r1,r2,r12,r14} + add SE, SE, r1 + add SF, SF, r2 + add SG, SG, r12 + add SH, SH, r14 + stm STATE!, {SE,SF,SG,SH} + add sp, sp, #68 + pop {r4,r5,r6,r7,r8,r10,r11,pc} +EPILOGUE(_nettle_sha256_compress) |
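For reference, below is a minimal C sketch of the per-round computation implemented by the ROUND macro in sha256-compress.asm above. The names ror32 and sha256_round are illustrative only and are not nettle identifiers; nettle's portable C implementation lives elsewhere in the library.

    #include <stdint.h>

    /* Illustrative sketch; ror32 and sha256_round are not nettle names. */
    static inline uint32_t ror32(uint32_t x, unsigned n)
    {
      return (x >> n) | (x << (32 - n));
    }

    /* One round, as in ROUND(A,B,C,D,E,F,G,H):
       H += S1(E) + Choice(E,F,G) + K + W;  D += H;
       H += S0(A) + Majority(A,B,C) */
    void
    sha256_round(uint32_t *d, uint32_t *h,
                 uint32_t a, uint32_t b, uint32_t c,
                 uint32_t e, uint32_t f, uint32_t g,
                 uint32_t k, uint32_t w)
    {
      uint32_t s1 = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
      uint32_t choice = g ^ (e & (f ^ g));
      *h += s1 + choice + k + w;
      *d += *h;
      uint32_t s0 = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
      /* Disjoint terms, so + works in place of |, as noted in the macro comment */
      uint32_t majority = (a & b) + (c & (a ^ b));
      *h += s0 + majority;
    }

In the assembly, k is loaded from K with a post-increment, w comes from the 16-word schedule kept on the stack (NOEXPN/EXPN), and the rotating argument list of ROUND supplies a-h from SA-SH.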