author    Jussi Kivilinna <jussi.kivilinna@iki.fi>  2018-01-09 18:40:25 +0200
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>  2018-01-09 18:40:25 +0200
commit    172ad09cbedc893f147180875335f4c525393c0b (patch)
tree      02f489abcd22683b0c39d86a962c0af0c81c18f5 /cipher/chacha20-armv7-neon.S
parent    b9a471ccf5f02f89e25c7ccc29898d0e4e486099 (diff)
download  libgcrypt-172ad09cbedc893f147180875335f4c525393c0b.tar.gz
New ChaCha implementations
* cipher/Makefile.am: Remove 'chacha20-sse2-amd64.S',
'chacha20-ssse3-amd64.S', 'chacha20-avx2-amd64.S'; Add
'chacha20-amd64-ssse3.S', 'chacha20-amd64-avx2.S'.
* cipher/chacha20-amd64-avx2.S: New.
* cipher/chacha20-amd64-ssse3.S: New.
* cipher/chacha20-armv7-neon.S: Rewrite.
* cipher/chacha20-avx2-amd64.S: Remove.
* cipher/chacha20-sse2-amd64.S: Remove.
* cipher/chacha20-ssse3-amd64.S: Remove.
* cipher/chacha20.c (CHACHA20_INPUT_LENGTH, USE_SSE2, USE_NEON)
(ASM_EXTRA_STACK, chacha20_blocks_t, _gcry_chacha20_amd64_sse2_blocks)
(_gcry_chacha20_amd64_ssse3_blocks, _gcry_chacha20_amd64_avx2_blocks)
(_gcry_chacha20_armv7_neon_blocks, QROUND, QOUT, chacha20_core)
(chacha20_do_encrypt_stream): Remove.
(_gcry_chacha20_amd64_ssse3_blocks4, _gcry_chacha20_amd64_avx2_blocks8)
(_gcry_chacha20_armv7_neon_blocks4, ROTATE, XOR, PLUS, PLUSONE)
(QUARTERROUND, BUF_XOR_LE32): New.
(CHACHA20_context_s, chacha20_blocks, chacha20_keysetup)
(chacha20_encrypt_stream): Rewrite.
(chacha20_do_setkey): Adjust for new CHACHA20_context_s.
* configure.ac: Remove 'chacha20-sse2-amd64.lo', 'chacha20-ssse3-amd64.lo',
'chacha20-avx2-amd64.lo'; Add 'chacha20-amd64-ssse3.lo',
'chacha20-amd64-avx2.lo'.
--

Intel Core i7-4790K CPU @ 4.00GHz (x86_64/AVX2):
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |     0.319 ns/B     2988.5 MiB/s      1.28 c/B
     STREAM dec |     0.318 ns/B     2995.4 MiB/s      1.27 c/B

Intel Core i7-4790K CPU @ 4.00GHz (x86_64/SSSE3):
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |     0.633 ns/B     1507.4 MiB/s      2.53 c/B
     STREAM dec |     0.633 ns/B     1506.6 MiB/s      2.53 c/B

Intel Core i7-4790K CPU @ 4.00GHz (i386):
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      2.05 ns/B      465.2 MiB/s      8.20 c/B
     STREAM dec |      2.04 ns/B      467.5 MiB/s      8.16 c/B

Cortex-A53 @ 1152Mhz (armv7/neon):
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      5.29 ns/B      180.3 MiB/s      6.09 c/B
     STREAM dec |      5.29 ns/B      180.1 MiB/s      6.10 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
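For context on the macros listed above: the rewritten cipher/chacha20.c builds its generic fallback around ROTATE/XOR/PLUS/PLUSONE and QUARTERROUND, while the new 4-way/8-way assembly entry points (_gcry_chacha20_armv7_neon_blocks4, _gcry_chacha20_amd64_avx2_blocks8, ...) handle the bulk of the data. A minimal C sketch of such a quarter-round based ChaCha20 core is shown below; it assumes the standard ChaCha20 quarter round, and the ROL32 helper and chacha20_permute function are illustrative names, not code taken from this patch.

#include <stdint.h>

/* Rotate a 32-bit word left by c bits (illustrative helper). */
#define ROL32(v, c)  (((v) << (c)) | ((v) >> (32 - (c))))

#define ROTATE(v, c) ROL32(v, c)
#define XOR(v, w)    ((v) ^ (w))
#define PLUS(v, w)   ((uint32_t)((v) + (w)))
#define PLUSONE(v)   PLUS((v), 1)   /* e.g. block counter increment */

/* Standard ChaCha20 quarter round on four 32-bit state words. */
#define QUARTERROUND(a, b, c, d)                \
  a = PLUS(a, b); d = ROTATE(XOR(d, a), 16);    \
  c = PLUS(c, d); b = ROTATE(XOR(b, c), 12);    \
  a = PLUS(a, b); d = ROTATE(XOR(d, a),  8);    \
  c = PLUS(c, d); b = ROTATE(XOR(b, c),  7);

/* 20-round ChaCha permutation over the 16-word state: each iteration
 * does the four column rounds followed by the four diagonal rounds.
 * 'chacha20_permute' is an illustrative name, not from the patch. */
static void chacha20_permute(uint32_t x[16])
{
  int i;
  for (i = 0; i < 20; i += 2)
    {
      QUARTERROUND(x[0], x[4], x[ 8], x[12]);
      QUARTERROUND(x[1], x[5], x[ 9], x[13]);
      QUARTERROUND(x[2], x[6], x[10], x[14]);
      QUARTERROUND(x[3], x[7], x[11], x[15]);
      QUARTERROUND(x[0], x[5], x[10], x[15]);
      QUARTERROUND(x[1], x[6], x[11], x[12]);
      QUARTERROUND(x[2], x[7], x[ 8], x[13]);
      QUARTERROUND(x[3], x[4], x[ 9], x[14]);
    }
}

The ARMv7/NEON code in the diff below computes the same quarter rounds, but on four blocks at once: each NEON q register holds the same state word from four consecutive blocks, so one QUARTERROUND2 invocation advances two state words across all four blocks.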
Diffstat (limited to 'cipher/chacha20-armv7-neon.S')
-rw-r--r--   cipher/chacha20-armv7-neon.S   1071
1 files changed, 357 insertions, 714 deletions
diff --git a/cipher/chacha20-armv7-neon.S b/cipher/chacha20-armv7-neon.S
index c1971fc7..33a43df1 100644
--- a/cipher/chacha20-armv7-neon.S
+++ b/cipher/chacha20-armv7-neon.S
@@ -1,6 +1,6 @@
-/* chacha20-armv7-neon.S - ARM/NEON accelerated chacha20 blocks function
+/* chacha20-armv7-neon.S - ARMv7 NEON implementation of ChaCha20 cipher
*
- * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2017,2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -19,732 +19,375 @@
*/
/*
- * Based on public domain implementation by Andrew Moon at
- * https://github.com/floodyberry/chacha-opt
+ * Based on D. J. Bernstein reference implementation at
+ * http://cr.yp.to/chacha.html:
+ *
+ * chacha-regs.c version 20080118
+ * D. J. Bernstein
+ * Public domain.
*/
#include <config.h>
#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
- defined(HAVE_GCC_INLINE_ASM_NEON) && defined(USE_CHACHA20)
+ defined(HAVE_GCC_INLINE_ASM_NEON)
.syntax unified
.fpu neon
.arm
-#define UNALIGNED_STMIA8(ptr, l0, l1, l2, l3, l4, l5, l6, l7) \
- tst ptr, #3; \
- beq 1f; \
- vpush {d0-d3}; \
- vmov s0, l0; \
- vmov s1, l1; \
- vmov s2, l2; \
- vmov s3, l3; \
- vmov s4, l4; \
- vmov s5, l5; \
- vmov s6, l6; \
- vmov s7, l7; \
- vst1.32 {d0-d3}, [ptr]; \
- add ptr, #32; \
- vpop {d0-d3}; \
- b 2f; \
- 1: stmia ptr!, {l0-l7}; \
- 2: ;
-
-#define UNALIGNED_LDMIA4(ptr, l0, l1, l2, l3) \
- tst ptr, #3; \
- beq 1f; \
- vpush {d0-d1}; \
- vld1.32 {d0-d1}, [ptr]; \
- add ptr, #16; \
- vmov l0, s0; \
- vmov l1, s1; \
- vmov l2, s2; \
- vmov l3, s3; \
- vpop {d0-d1}; \
- b 2f; \
- 1: ldmia ptr!, {l0-l3}; \
- 2: ;
-
.text
-.globl _gcry_chacha20_armv7_neon_blocks
-.type _gcry_chacha20_armv7_neon_blocks,%function;
-_gcry_chacha20_armv7_neon_blocks:
-.Lchacha_blocks_neon_local:
- tst r3, r3
- beq .Lchacha_blocks_neon_nobytes
- vstmdb sp!, {q4,q5,q6,q7}
- stmfd sp!, {r4-r12, r14}
- mov r8, sp
- sub sp, sp, #196
- and sp, sp, #0xffffffe0
- str r0, [sp, #60]
- str r1, [sp, #48]
- str r2, [sp, #40]
- str r3, [sp, #52]
- str r8, [sp, #192]
- add r1, sp, #64
- ldmia r0!, {r4-r11}
- stmia r1!, {r4-r11}
- ldmia r0!, {r4-r11}
- stmia r1!, {r4-r11}
- mov r4, #20
- str r4, [sp, #44]
- cmp r3, #256
- blo .Lchacha_blocks_neon_mainloop2
-.Lchacha_blocks_neon_mainloop1:
- ldr r0, [sp, #44]
- str r0, [sp, #0]
- add r1, sp, #(64)
- mov r2, #1
- veor q12, q12
- vld1.32 {q0,q1}, [r1,:128]!
- vld1.32 {q2,q3}, [r1,:128]
- vmov.32 d24[0], r2
- vadd.u64 q3, q3, q12
- vmov q4, q0
- vmov q5, q1
- vmov q6, q2
- vadd.u64 q7, q3, q12
- vmov q8, q0
- vmov q9, q1
- vmov q10, q2
- vadd.u64 q11, q7, q12
- add r0, sp, #64
- ldm r0, {r0-r12}
- ldr r14, [sp, #(64 +60)]
- str r6, [sp, #8]
- str r11, [sp, #12]
- str r14, [sp, #28]
- ldr r11, [sp, #(64 +52)]
- ldr r14, [sp, #(64 +56)]
-.Lchacha_blocks_neon_rounds1:
- ldr r6, [sp, #0]
- vadd.i32 q0, q0, q1
- add r0, r0, r4
- vadd.i32 q4, q4, q5
- add r1, r1, r5
- vadd.i32 q8, q8, q9
- eor r12, r12, r0
- veor q12, q3, q0
- eor r11, r11, r1
- veor q13, q7, q4
- ror r12, r12, #16
- veor q14, q11, q8
- ror r11, r11, #16
- vrev32.16 q3, q12
- subs r6, r6, #2
- vrev32.16 q7, q13
- add r8, r8, r12
- vrev32.16 q11, q14
- add r9, r9, r11
- vadd.i32 q2, q2, q3
- eor r4, r4, r8
- vadd.i32 q6, q6, q7
- eor r5, r5, r9
- vadd.i32 q10, q10, q11
- str r6, [sp, #0]
- veor q12, q1, q2
- ror r4, r4, #20
- veor q13, q5, q6
- ror r5, r5, #20
- veor q14, q9, q10
- add r0, r0, r4
- vshl.i32 q1, q12, #12
- add r1, r1, r5
- vshl.i32 q5, q13, #12
- ldr r6, [sp, #8]
- vshl.i32 q9, q14, #12
- eor r12, r12, r0
- vsri.u32 q1, q12, #20
- eor r11, r11, r1
- vsri.u32 q5, q13, #20
- ror r12, r12, #24
- vsri.u32 q9, q14, #20
- ror r11, r11, #24
- vadd.i32 q0, q0, q1
- add r8, r8, r12
- vadd.i32 q4, q4, q5
- add r9, r9, r11
- vadd.i32 q8, q8, q9
- eor r4, r4, r8
- veor q12, q3, q0
- eor r5, r5, r9
- veor q13, q7, q4
- str r11, [sp, #20]
- veor q14, q11, q8
- ror r4, r4, #25
- vshl.i32 q3, q12, #8
- ror r5, r5, #25
- vshl.i32 q7, q13, #8
- str r4, [sp, #4]
- vshl.i32 q11, q14, #8
- ldr r4, [sp, #28]
- vsri.u32 q3, q12, #24
- add r2, r2, r6
- vsri.u32 q7, q13, #24
- add r3, r3, r7
- vsri.u32 q11, q14, #24
- ldr r11, [sp, #12]
- vadd.i32 q2, q2, q3
- eor r14, r14, r2
- vadd.i32 q6, q6, q7
- eor r4, r4, r3
- vadd.i32 q10, q10, q11
- ror r14, r14, #16
- veor q12, q1, q2
- ror r4, r4, #16
- veor q13, q5, q6
- add r10, r10, r14
- veor q14, q9, q10
- add r11, r11, r4
- vshl.i32 q1, q12, #7
- eor r6, r6, r10
- vshl.i32 q5, q13, #7
- eor r7, r7, r11
- vshl.i32 q9, q14, #7
- ror r6, r6, #20
- vsri.u32 q1, q12, #25
- ror r7, r7, #20
- vsri.u32 q5, q13, #25
- add r2, r2, r6
- vsri.u32 q9, q14, #25
- add r3, r3, r7
- vext.32 q3, q3, q3, #3
- eor r14, r14, r2
- vext.32 q7, q7, q7, #3
- eor r4, r4, r3
- vext.32 q11, q11, q11, #3
- ror r14, r14, #24
- vext.32 q1, q1, q1, #1
- ror r4, r4, #24
- vext.32 q5, q5, q5, #1
- add r10, r10, r14
- vext.32 q9, q9, q9, #1
- add r11, r11, r4
- vext.32 q2, q2, q2, #2
- eor r6, r6, r10
- vext.32 q6, q6, q6, #2
- eor r7, r7, r11
- vext.32 q10, q10, q10, #2
- ror r6, r6, #25
- vadd.i32 q0, q0, q1
- ror r7, r7, #25
- vadd.i32 q4, q4, q5
- add r0, r0, r5
- vadd.i32 q8, q8, q9
- add r1, r1, r6
- veor q12, q3, q0
- eor r4, r4, r0
- veor q13, q7, q4
- eor r12, r12, r1
- veor q14, q11, q8
- ror r4, r4, #16
- vrev32.16 q3, q12
- ror r12, r12, #16
- vrev32.16 q7, q13
- add r10, r10, r4
- vrev32.16 q11, q14
- add r11, r11, r12
- vadd.i32 q2, q2, q3
- eor r5, r5, r10
- vadd.i32 q6, q6, q7
- eor r6, r6, r11
- vadd.i32 q10, q10, q11
- ror r5, r5, #20
- veor q12, q1, q2
- ror r6, r6, #20
- veor q13, q5, q6
- add r0, r0, r5
- veor q14, q9, q10
- add r1, r1, r6
- vshl.i32 q1, q12, #12
- eor r4, r4, r0
- vshl.i32 q5, q13, #12
- eor r12, r12, r1
- vshl.i32 q9, q14, #12
- ror r4, r4, #24
- vsri.u32 q1, q12, #20
- ror r12, r12, #24
- vsri.u32 q5, q13, #20
- add r10, r10, r4
- vsri.u32 q9, q14, #20
- add r11, r11, r12
- vadd.i32 q0, q0, q1
- eor r5, r5, r10
- vadd.i32 q4, q4, q5
- eor r6, r6, r11
- vadd.i32 q8, q8, q9
- str r11, [sp, #12]
- veor q12, q3, q0
- ror r5, r5, #25
- veor q13, q7, q4
- ror r6, r6, #25
- veor q14, q11, q8
- str r4, [sp, #28]
- vshl.i32 q3, q12, #8
- ldr r4, [sp, #4]
- vshl.i32 q7, q13, #8
- add r2, r2, r7
- vshl.i32 q11, q14, #8
- add r3, r3, r4
- vsri.u32 q3, q12, #24
- ldr r11, [sp, #20]
- vsri.u32 q7, q13, #24
- eor r11, r11, r2
- vsri.u32 q11, q14, #24
- eor r14, r14, r3
- vadd.i32 q2, q2, q3
- ror r11, r11, #16
- vadd.i32 q6, q6, q7
- ror r14, r14, #16
- vadd.i32 q10, q10, q11
- add r8, r8, r11
- veor q12, q1, q2
- add r9, r9, r14
- veor q13, q5, q6
- eor r7, r7, r8
- veor q14, q9, q10
- eor r4, r4, r9
- vshl.i32 q1, q12, #7
- ror r7, r7, #20
- vshl.i32 q5, q13, #7
- ror r4, r4, #20
- vshl.i32 q9, q14, #7
- str r6, [sp, #8]
- vsri.u32 q1, q12, #25
- add r2, r2, r7
- vsri.u32 q5, q13, #25
- add r3, r3, r4
- vsri.u32 q9, q14, #25
- eor r11, r11, r2
- vext.32 q3, q3, q3, #1
- eor r14, r14, r3
- vext.32 q7, q7, q7, #1
- ror r11, r11, #24
- vext.32 q11, q11, q11, #1
- ror r14, r14, #24
- vext.32 q1, q1, q1, #3
- add r8, r8, r11
- vext.32 q5, q5, q5, #3
- add r9, r9, r14
- vext.32 q9, q9, q9, #3
- eor r7, r7, r8
- vext.32 q2, q2, q2, #2
- eor r4, r4, r9
- vext.32 q6, q6, q6, #2
- ror r7, r7, #25
- vext.32 q10, q10, q10, #2
- ror r4, r4, #25
- bne .Lchacha_blocks_neon_rounds1
- str r8, [sp, #0]
- str r9, [sp, #4]
- str r10, [sp, #8]
- str r12, [sp, #16]
- str r11, [sp, #20]
- str r14, [sp, #24]
- add r9, sp, #64
- vld1.32 {q12,q13}, [r9,:128]!
- ldr r12, [sp, #48]
- vld1.32 {q14,q15}, [r9,:128]
- ldr r14, [sp, #40]
- vadd.i32 q0, q0, q12
- ldr r8, [sp, #(64 +0)]
- vadd.i32 q4, q4, q12
- ldr r9, [sp, #(64 +4)]
- vadd.i32 q8, q8, q12
- ldr r10, [sp, #(64 +8)]
- vadd.i32 q1, q1, q13
- ldr r11, [sp, #(64 +12)]
- vadd.i32 q5, q5, q13
- add r0, r0, r8
- vadd.i32 q9, q9, q13
- add r1, r1, r9
- vadd.i32 q2, q2, q14
- add r2, r2, r10
- vadd.i32 q6, q6, q14
- ldr r8, [sp, #(64 +16)]
- vadd.i32 q10, q10, q14
- add r3, r3, r11
- veor q14, q14, q14
- ldr r9, [sp, #(64 +20)]
- mov r11, #1
- add r4, r4, r8
- vmov.32 d28[0], r11
- ldr r10, [sp, #(64 +24)]
- vadd.u64 q12, q14, q15
- add r5, r5, r9
- vadd.u64 q13, q14, q12
- ldr r11, [sp, #(64 +28)]
- vadd.u64 q14, q14, q13
- add r6, r6, r10
- vadd.i32 q3, q3, q12
- tst r12, r12
- vadd.i32 q7, q7, q13
- add r7, r7, r11
- vadd.i32 q11, q11, q14
- beq .Lchacha_blocks_neon_nomessage11
- UNALIGNED_LDMIA4(r12, r8, r9, r10, r11)
- tst r12, r12
- eor r0, r0, r8
- eor r1, r1, r9
- eor r2, r2, r10
- ldr r8, [r12, #0]
- eor r3, r3, r11
- ldr r9, [r12, #4]
- eor r4, r4, r8
- ldr r10, [r12, #8]
- eor r5, r5, r9
- ldr r11, [r12, #12]
- eor r6, r6, r10
- add r12, r12, #16
- eor r7, r7, r11
-.Lchacha_blocks_neon_nomessage11:
- UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7)
- tst r12, r12
- ldm sp, {r0-r7}
- ldr r8, [sp, #(64 +32)]
- ldr r9, [sp, #(64 +36)]
- ldr r10, [sp, #(64 +40)]
- ldr r11, [sp, #(64 +44)]
- add r0, r0, r8
- add r1, r1, r9
- add r2, r2, r10
- ldr r8, [sp, #(64 +48)]
- add r3, r3, r11
- ldr r9, [sp, #(64 +52)]
- add r4, r4, r8
- ldr r10, [sp, #(64 +56)]
- add r5, r5, r9
- ldr r11, [sp, #(64 +60)]
- add r6, r6, r10
- adds r8, r8, #4
- add r7, r7, r11
- adc r9, r9, #0
- str r8, [sp, #(64 +48)]
- tst r12, r12
- str r9, [sp, #(64 +52)]
- beq .Lchacha_blocks_neon_nomessage12
- UNALIGNED_LDMIA4(r12, r8, r9, r10, r11)
- tst r12, r12
- eor r0, r0, r8
- eor r1, r1, r9
- eor r2, r2, r10
- ldr r8, [r12, #0]
- eor r3, r3, r11
- ldr r9, [r12, #4]
- eor r4, r4, r8
- ldr r10, [r12, #8]
- eor r5, r5, r9
- ldr r11, [r12, #12]
- eor r6, r6, r10
- add r12, r12, #16
- eor r7, r7, r11
-.Lchacha_blocks_neon_nomessage12:
- UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7)
- tst r12, r12
- beq .Lchacha_blocks_neon_nomessage13
- vld1.32 {q12,q13}, [r12]!
- vld1.32 {q14,q15}, [r12]!
- veor q0, q0, q12
- veor q1, q1, q13
- veor q2, q2, q14
- veor q3, q3, q15
-.Lchacha_blocks_neon_nomessage13:
- vst1.32 {q0,q1}, [r14]!
- vst1.32 {q2,q3}, [r14]!
- beq .Lchacha_blocks_neon_nomessage14
- vld1.32 {q12,q13}, [r12]!
- vld1.32 {q14,q15}, [r12]!
- veor q4, q4, q12
- veor q5, q5, q13
- veor q6, q6, q14
- veor q7, q7, q15
-.Lchacha_blocks_neon_nomessage14:
- vst1.32 {q4,q5}, [r14]!
- vst1.32 {q6,q7}, [r14]!
- beq .Lchacha_blocks_neon_nomessage15
- vld1.32 {q12,q13}, [r12]!
- vld1.32 {q14,q15}, [r12]!
- veor q8, q8, q12
- veor q9, q9, q13
- veor q10, q10, q14
- veor q11, q11, q15
-.Lchacha_blocks_neon_nomessage15:
- vst1.32 {q8,q9}, [r14]!
- vst1.32 {q10,q11}, [r14]!
- str r12, [sp, #48]
- str r14, [sp, #40]
- ldr r3, [sp, #52]
- sub r3, r3, #256
- cmp r3, #256
- str r3, [sp, #52]
- bhs .Lchacha_blocks_neon_mainloop1
- tst r3, r3
- beq .Lchacha_blocks_neon_done
-.Lchacha_blocks_neon_mainloop2:
- ldr r3, [sp, #52]
- ldr r1, [sp, #48]
- cmp r3, #64
- bhs .Lchacha_blocks_neon_noswap1
- add r4, sp, #128
- mov r5, r4
- tst r1, r1
- beq .Lchacha_blocks_neon_nocopy1
-.Lchacha_blocks_neon_copyinput1:
- subs r3, r3, #1
- ldrb r0, [r1], #1
- strb r0, [r4], #1
- bne .Lchacha_blocks_neon_copyinput1
- str r5, [sp, #48]
-.Lchacha_blocks_neon_nocopy1:
- ldr r4, [sp, #40]
- str r5, [sp, #40]
- str r4, [sp, #56]
-.Lchacha_blocks_neon_noswap1:
- ldr r0, [sp, #44]
- str r0, [sp, #0]
- add r0, sp, #64
- ldm r0, {r0-r12}
- ldr r14, [sp, #(64 +60)]
- str r6, [sp, #8]
- str r11, [sp, #12]
- str r14, [sp, #28]
- ldr r11, [sp, #(64 +52)]
- ldr r14, [sp, #(64 +56)]
-.Lchacha_blocks_neon_rounds2:
- ldr r6, [sp, #0]
- add r0, r0, r4
- add r1, r1, r5
- eor r12, r12, r0
- eor r11, r11, r1
- ror r12, r12, #16
- ror r11, r11, #16
- subs r6, r6, #2
- add r8, r8, r12
- add r9, r9, r11
- eor r4, r4, r8
- eor r5, r5, r9
- str r6, [sp, #0]
- ror r4, r4, #20
- ror r5, r5, #20
- add r0, r0, r4
- add r1, r1, r5
- ldr r6, [sp, #8]
- eor r12, r12, r0
- eor r11, r11, r1
- ror r12, r12, #24
- ror r11, r11, #24
- add r8, r8, r12
- add r9, r9, r11
- eor r4, r4, r8
- eor r5, r5, r9
- str r11, [sp, #20]
- ror r4, r4, #25
- ror r5, r5, #25
- str r4, [sp, #4]
- ldr r4, [sp, #28]
- add r2, r2, r6
- add r3, r3, r7
- ldr r11, [sp, #12]
- eor r14, r14, r2
- eor r4, r4, r3
- ror r14, r14, #16
- ror r4, r4, #16
- add r10, r10, r14
- add r11, r11, r4
- eor r6, r6, r10
- eor r7, r7, r11
- ror r6, r6, #20
- ror r7, r7, #20
- add r2, r2, r6
- add r3, r3, r7
- eor r14, r14, r2
- eor r4, r4, r3
- ror r14, r14, #24
- ror r4, r4, #24
- add r10, r10, r14
- add r11, r11, r4
- eor r6, r6, r10
- eor r7, r7, r11
- ror r6, r6, #25
- ror r7, r7, #25
- add r0, r0, r5
- add r1, r1, r6
- eor r4, r4, r0
- eor r12, r12, r1
- ror r4, r4, #16
- ror r12, r12, #16
- add r10, r10, r4
- add r11, r11, r12
- eor r5, r5, r10
- eor r6, r6, r11
- ror r5, r5, #20
- ror r6, r6, #20
- add r0, r0, r5
- add r1, r1, r6
- eor r4, r4, r0
- eor r12, r12, r1
- ror r4, r4, #24
- ror r12, r12, #24
- add r10, r10, r4
- add r11, r11, r12
- eor r5, r5, r10
- eor r6, r6, r11
- str r11, [sp, #12]
- ror r5, r5, #25
- ror r6, r6, #25
- str r4, [sp, #28]
- ldr r4, [sp, #4]
- add r2, r2, r7
- add r3, r3, r4
- ldr r11, [sp, #20]
- eor r11, r11, r2
- eor r14, r14, r3
- ror r11, r11, #16
- ror r14, r14, #16
- add r8, r8, r11
- add r9, r9, r14
- eor r7, r7, r8
- eor r4, r4, r9
- ror r7, r7, #20
- ror r4, r4, #20
- str r6, [sp, #8]
- add r2, r2, r7
- add r3, r3, r4
- eor r11, r11, r2
- eor r14, r14, r3
- ror r11, r11, #24
- ror r14, r14, #24
- add r8, r8, r11
- add r9, r9, r14
- eor r7, r7, r8
- eor r4, r4, r9
- ror r7, r7, #25
- ror r4, r4, #25
- bne .Lchacha_blocks_neon_rounds2
- str r8, [sp, #0]
- str r9, [sp, #4]
- str r10, [sp, #8]
- str r12, [sp, #16]
- str r11, [sp, #20]
- str r14, [sp, #24]
- ldr r12, [sp, #48]
- ldr r14, [sp, #40]
- ldr r8, [sp, #(64 +0)]
- ldr r9, [sp, #(64 +4)]
- ldr r10, [sp, #(64 +8)]
- ldr r11, [sp, #(64 +12)]
- add r0, r0, r8
- add r1, r1, r9
- add r2, r2, r10
- ldr r8, [sp, #(64 +16)]
- add r3, r3, r11
- ldr r9, [sp, #(64 +20)]
- add r4, r4, r8
- ldr r10, [sp, #(64 +24)]
- add r5, r5, r9
- ldr r11, [sp, #(64 +28)]
- add r6, r6, r10
- tst r12, r12
- add r7, r7, r11
- beq .Lchacha_blocks_neon_nomessage21
- UNALIGNED_LDMIA4(r12, r8, r9, r10, r11)
- tst r12, r12
- eor r0, r0, r8
- eor r1, r1, r9
- eor r2, r2, r10
- ldr r8, [r12, #0]
- eor r3, r3, r11
- ldr r9, [r12, #4]
- eor r4, r4, r8
- ldr r10, [r12, #8]
- eor r5, r5, r9
- ldr r11, [r12, #12]
- eor r6, r6, r10
- add r12, r12, #16
- eor r7, r7, r11
-.Lchacha_blocks_neon_nomessage21:
- UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7)
- ldm sp, {r0-r7}
- ldr r8, [sp, #(64 +32)]
- ldr r9, [sp, #(64 +36)]
- ldr r10, [sp, #(64 +40)]
- ldr r11, [sp, #(64 +44)]
- add r0, r0, r8
- add r1, r1, r9
- add r2, r2, r10
- ldr r8, [sp, #(64 +48)]
- add r3, r3, r11
- ldr r9, [sp, #(64 +52)]
- add r4, r4, r8
- ldr r10, [sp, #(64 +56)]
- add r5, r5, r9
- ldr r11, [sp, #(64 +60)]
- add r6, r6, r10
- adds r8, r8, #1
- add r7, r7, r11
- adc r9, r9, #0
- str r8, [sp, #(64 +48)]
- tst r12, r12
- str r9, [sp, #(64 +52)]
- beq .Lchacha_blocks_neon_nomessage22
- UNALIGNED_LDMIA4(r12, r8, r9, r10, r11)
- tst r12, r12
- eor r0, r0, r8
- eor r1, r1, r9
- eor r2, r2, r10
- ldr r8, [r12, #0]
- eor r3, r3, r11
- ldr r9, [r12, #4]
- eor r4, r4, r8
- ldr r10, [r12, #8]
- eor r5, r5, r9
- ldr r11, [r12, #12]
- eor r6, r6, r10
- add r12, r12, #16
- eor r7, r7, r11
-.Lchacha_blocks_neon_nomessage22:
- UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7)
- str r12, [sp, #48]
- str r14, [sp, #40]
- ldr r3, [sp, #52]
- cmp r3, #64
- sub r4, r3, #64
- str r4, [sp, #52]
- bhi .Lchacha_blocks_neon_mainloop2
- cmp r3, #64
- beq .Lchacha_blocks_neon_nocopy2
- ldr r1, [sp, #56]
- sub r14, r14, #64
-.Lchacha_blocks_neon_copyinput2:
- subs r3, r3, #1
- ldrb r0, [r14], #1
- strb r0, [r1], #1
- bne .Lchacha_blocks_neon_copyinput2
-.Lchacha_blocks_neon_nocopy2:
-.Lchacha_blocks_neon_done:
- ldr r7, [sp, #60]
- ldr r8, [sp, #(64 +48)]
- ldr r9, [sp, #(64 +52)]
- str r8, [r7, #(48 + 0)]
- str r9, [r7, #(48 + 4)]
+#ifdef __PIC__
+# define GET_DATA_POINTER(reg, name, rtmp) \
+ ldr reg, 1f; \
+ ldr rtmp, 2f; \
+ b 3f; \
+ 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+ 2: .word name(GOT); \
+ 3: add reg, pc, reg; \
+ ldr reg, [reg, rtmp];
+#else
+# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+/* register macros */
+#define INPUT r0
+#define DST r1
+#define SRC r2
+#define NBLKS r3
+#define ROUND r4
+
+/* stack structure */
+#define STACK_VEC_X12 (16)
+#define STACK_VEC_X13 (STACK_VEC_X12 + 16)
+#define STACK_TMP (STACK_VEC_X13 + 16)
+#define STACK_TMP1 (16 + STACK_TMP)
+#define STACK_TMP2 (16 + STACK_TMP1)
+
+#define STACK_MAX (16 + STACK_TMP2)
+
+/* vector registers */
+#define X0 q0
+#define X1 q1
+#define X2 q2
+#define X3 q3
+#define X4 q4
+#define X5 q5
+#define X6 q6
+#define X7 q7
+#define X8 q8
+#define X9 q9
+#define X10 q10
+#define X11 q11
+#define X12 q12
+#define X13 q13
+#define X14 q14
+#define X15 q15
+
+#define X0l d0
+#define X1l d2
+#define X2l d4
+#define X3l d6
+#define X4l d8
+#define X5l d10
+#define X6l d12
+#define X7l d14
+#define X8l d16
+#define X9l d18
+#define X10l d20
+#define X11l d22
+#define X12l d24
+#define X13l d26
+#define X14l d28
+#define X15l d30
+
+#define X0h d1
+#define X1h d3
+#define X2h d5
+#define X3h d7
+#define X4h d9
+#define X5h d11
+#define X6h d13
+#define X7h d15
+#define X8h d17
+#define X9h d19
+#define X10h d21
+#define X11h d23
+#define X12h d25
+#define X13h d27
+#define X14h d29
+#define X15h d31
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+
+/* 4x4 32-bit integer matrix transpose */
+#define transpose_4x4_part1(_q0, _q1, _q2, _q3) \
+ vtrn.32 _q0, _q1; \
+ vtrn.32 _q2, _q3;
+#define transpose_4x4_part2(_q0, _q1, _q2, _q3) \
+ vswp _q0##h, _q2##l; \
+ vswp _q1##h, _q3##l;
+
+#define clear(x) veor x,x,x;
+
+/**********************************************************************
+ 4-way chacha20
+ **********************************************************************/
+
+#define ROTATE2(dst1,dst2,c,src1,src2) \
+ vshl.u32 dst1, src1, #(c); \
+ vshl.u32 dst2, src2, #(c); \
+ vsri.u32 dst1, src1, #(32 - (c)); \
+ vsri.u32 dst2, src2, #(32 - (c));
+
+#define ROTATE2_16(dst1,dst2,src1,src2) \
+ vrev32.16 dst1, src1; \
+ vrev32.16 dst2, src2;
+
+#define XOR(d,s1,s2) \
+ veor d, s2, s1;
+
+#define PLUS(ds,s) \
+ vadd.u32 ds, ds, s;
+
+#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2) \
+ PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \
+ ROTATE2_16(d1, d2, tmp1, tmp2); \
+ PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \
+ ROTATE2(b1, b2, 12, tmp1, tmp2); \
+ PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \
+ ROTATE2(d1, d2, 8, tmp1, tmp2); \
+ PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \
+ ROTATE2(b1, b2, 7, tmp1, tmp2);
+
+chacha20_data:
+.align 4
+.Linc_counter:
+ .long 0,1,2,3
+
+.align 3
+.globl _gcry_chacha20_armv7_neon_blocks4
+.type _gcry_chacha20_armv7_neon_blocks4,%function;
+
+_gcry_chacha20_armv7_neon_blocks4:
+ /* input:
+ * r0: input
+ * r1: dst
+ * r2: src
+ * r3: nblks (multiple of 4)
+ */
+
+ vpush {q4-q7};
+ push {r4-r12,lr};
+
mov r12, sp
- stmia r12!, {r0-r7}
- add r12, r12, #48
- stmia r12!, {r0-r7}
- sub r0, sp, #8
- ldr sp, [sp, #192]
- ldmfd sp!, {r4-r12, r14}
- vldm sp!, {q4-q7}
- sub r0, sp, r0
- bx lr
-.Lchacha_blocks_neon_nobytes:
- mov r0, #0;
+
+ mov r6, sp;
+ sub r6, r6, #(STACK_MAX);
+ and r6, r6, #(~15);
+ mov sp, r6;
+ GET_DATA_POINTER(r9, .Linc_counter, lr);
+ add lr, INPUT, #(12*4);
+ add r8, sp, #STACK_VEC_X12;
+
+.Loop4:
+ mov ROUND, #20;
+
+ /* Construct counter vectors X12 and X13 */
+
+ vld1.8 {X15}, [lr];
+ mov lr, INPUT;
+ vld1.8 {X8}, [r9];
+ vdup.32 X12, X15l[0];
+ vdup.32 X13, X15l[1];
+ vld1.8 {X3}, [lr]!;
+ vadd.u32 X12, X12, X8;
+ vdup.32 X0, X3l[0];
+ vdup.32 X1, X3l[1];
+ vdup.32 X2, X3h[0];
+ vcgt.u32 X8, X8, X12;
+ vdup.32 X3, X3h[1];
+ vdup.32 X14, X15h[0];
+ vdup.32 X15, X15h[1];
+ vsub.u32 X13, X13, X8;
+ vld1.8 {X7}, [lr]!;
+ vld1.8 {X11}, [lr];
+ vst1.8 {X12, X13}, [r8];
+ vdup.32 X4, X7l[0];
+ vdup.32 X5, X7l[1];
+ vdup.32 X6, X7h[0];
+ vdup.32 X7, X7h[1];
+ vdup.32 X8, X11l[0];
+ vdup.32 X9, X11l[1];
+ vdup.32 X10, X11h[0];
+ vdup.32 X11, X11h[1];
+
+ add r7, sp, #STACK_TMP2;
+ add r6, sp, #STACK_TMP1;
+ add r5, sp, #STACK_TMP;
+ vst1.8 {X15}, [r6];
+ vst1.8 {X11}, [r5];
+
+ mov lr, INPUT;
+.Lround2:
+ subs ROUND, ROUND, #2
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15)
+ vld1.8 {X11}, [r5];
+ vld1.8 {X15}, [r6];
+ vst1.8 {X8}, [r5];
+ vst1.8 {X9}, [r6];
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9)
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9)
+ vld1.8 {X8}, [r5];
+ vld1.8 {X9}, [r6];
+ vst1.8 {X11}, [r5];
+ vst1.8 {X15}, [r6];
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15)
+ bne .Lround2;
+
+ vld1.8 {X11}, [lr]!;
+ vst1.8 {X14}, [r7];
+
+ vdup.32 X14, X11l[0]; /* INPUT + 0 * 4 */
+ vdup.32 X15, X11l[1]; /* INPUT + 1 * 4 */
+ PLUS(X0, X14);
+ PLUS(X1, X15);
+ vdup.32 X14, X11h[0]; /* INPUT + 2 * 4 */
+ vdup.32 X15, X11h[1]; /* INPUT + 3 * 4 */
+ PLUS(X2, X14);
+ PLUS(X3, X15);
+
+ vld1.8 {X11}, [r5];
+ vld1.8 {X15}, [r6];
+ vst1.8 {X0}, [r5];
+ vld1.8 {X0}, [lr]!;
+ vst1.8 {X1}, [r6];
+
+ vdup.32 X14, X0l[0]; /* INPUT + 4 * 4 */
+ vdup.32 X1, X0l[1]; /* INPUT + 5 * 4 */
+ PLUS(X4, X14);
+ PLUS(X5, X1);
+ vdup.32 X14, X0h[0]; /* INPUT + 6 * 4 */
+ vdup.32 X1, X0h[1]; /* INPUT + 7 * 4 */
+ PLUS(X6, X14);
+ PLUS(X7, X1);
+
+ vld1.8 {X0}, [lr]!;
+
+ vdup.32 X14, X0l[0]; /* INPUT + 8 * 4 */
+ vdup.32 X1, X0l[1]; /* INPUT + 9 * 4 */
+ PLUS(X8, X14);
+ PLUS(X9, X1);
+ vdup.32 X14, X0h[0]; /* INPUT + 10 * 4 */
+ vdup.32 X1, X0h[1]; /* INPUT + 11 * 4 */
+ PLUS(X10, X14);
+ PLUS(X11, X1);
+
+ vld1.8 {X0}, [lr];
+ add lr, INPUT, #(12*4)
+ vld1.8 {X14}, [r7];
+
+ vdup.32 X1, X0h[0]; /* INPUT + 10 * 4 */
+ ldm lr, {r10, r11}; /* Update counter */
+ vdup.32 X0, X0h[1]; /* INPUT + 11 * 4 */
+ PLUS(X14, X1);
+ PLUS(X15, X0);
+ adds r10, r10, #4; /* Update counter */
+ vld1.8 {X0, X1}, [r8];
+
+ PLUS(X12, X0);
+ vld1.8 {X0}, [r5];
+ PLUS(X13, X1);
+ adc r11, r11, #0; /* Update counter */
+
+ vld1.8 {X1}, [r6];
+ stm lr, {r10, r11}; /* Update counter */
+ transpose_4x4_part1(X0, X1, X2, X3);
+ transpose_4x4_part1(X4, X5, X6, X7);
+ transpose_4x4_part1(X8, X9, X10, X11);
+ transpose_4x4_part1(X12, X13, X14, X15);
+ transpose_4x4_part2(X0, X1, X2, X3);
+ transpose_4x4_part2(X4, X5, X6, X7);
+ transpose_4x4_part2(X8, X9, X10, X11);
+ transpose_4x4_part2(X12, X13, X14, X15);
+
+ subs NBLKS, NBLKS, #4;
+
+ vst1.8 {X10}, [r5];
+ add lr, INPUT, #(12*4)
+ vst1.8 {X11}, [r6];
+ vld1.8 {X10, X11}, [SRC]!;
+ veor X10, X0, X10;
+ vld1.8 {X0}, [SRC]!;
+ veor X11, X4, X11;
+ vld1.8 {X4}, [SRC]!;
+ vst1.8 {X10, X11}, [DST]!;
+ vld1.8 {X10, X11}, [SRC]!;
+ veor X0, X8, X0;
+ veor X4, X12, X4;
+ veor X10, X1, X10;
+ veor X11, X5, X11;
+ vst1.8 {X0}, [DST]!;
+ vld1.8 {X0, X1}, [SRC]!;
+ vst1.8 {X4}, [DST]!;
+ vld1.8 {X4, X5}, [SRC]!;
+ vst1.8 {X10, X11}, [DST]!;
+ vld1.8 {X10}, [r5];
+ vld1.8 {X11}, [r6];
+ veor X0, X9, X0;
+ vld1.8 {X8, X9}, [SRC]!;
+ veor X1, X13, X1;
+ vld1.8 {X12, X13}, [SRC]!;
+ veor X4, X2, X4;
+ veor X5, X6, X5;
+ vst1.8 {X0, X1}, [DST]!;
+ vld1.8 {X0, X1}, [SRC]!;
+ vst1.8 {X4, X5}, [DST]!;
+ veor X8, X10, X8;
+ veor X9, X14, X9;
+ veor X12, X3, X12;
+ veor X13, X7, X13;
+ veor X0, X11, X0;
+ veor X1, X15, X1;
+ vst1.8 {X8, X9}, [DST]!;
+ vst1.8 {X12, X13}, [DST]!;
+ vst1.8 {X0, X1}, [DST]!;
+
+ bne .Loop4;
+
+ /* clear the used vector registers and stack */
+ clear(X0);
+ vst1.8 {X0}, [r5];
+ vst1.8 {X0}, [r6];
+ vst1.8 {X0}, [r7];
+ vst1.8 {X0}, [r8]!;
+ vst1.8 {X0}, [r8];
+
+ mov sp, r12
+ clear(X1);
+ clear(X2);
+ clear(X3);
+ clear(X4);
+ clear(X5);
+ clear(X6);
+ clear(X7);
+ clear(X8);
+ clear(X9);
+ clear(X10);
+ clear(X11);
+ clear(X12);
+ clear(X13);
+ clear(X14);
+ clear(X15);
+
+ pop {r4-r12,lr}
+ vpop {q4-q7}
+ eor r0, r0, r0
bx lr
-.ltorg
-.size _gcry_chacha20_armv7_neon_blocks,.-_gcry_chacha20_armv7_neon_blocks;
+.size _gcry_chacha20_armv7_neon_blocks4, .-_gcry_chacha20_armv7_neon_blocks4;
#endif