diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2018-01-09 18:40:25 +0200 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2018-01-09 18:40:25 +0200 |
commit | 172ad09cbedc893f147180875335f4c525393c0b (patch) | |
tree | 02f489abcd22683b0c39d86a962c0af0c81c18f5 /cipher/chacha20-armv7-neon.S | |
parent | b9a471ccf5f02f89e25c7ccc29898d0e4e486099 (diff) | |
download | libgcrypt-172ad09cbedc893f147180875335f4c525393c0b.tar.gz |
New ChaCha implementations
* cipher/Makefile.am: Remove 'chacha20-sse2-amd64.S',
'chacha20-ssse3-amd64.S', 'chacha20-avx2-amd64.S'; Add
'chacha20-amd64-ssse3.S', 'chacha20-amd64-avx2.S'.
* cipher/chacha20-amd64-avx2.S: New.
* cipher/chacha20-amd64-ssse3.S: New.
* cipher/chacha20-armv7-neon.S: Rewrite.
* cipher/chacha20-avx2-amd64.S: Remove.
* cipher/chacha20-sse2-amd64.S: Remove.
* cipher/chacha20-ssse3-amd64.S: Remove.
* cipher/chacha20.c (CHACHA20_INPUT_LENGTH, USE_SSE2, USE_NEON)
(ASM_EXTRA_STACK, chacha20_blocks_t, _gcry_chacha20_amd64_sse2_blocks)
(_gcry_chacha20_amd64_ssse3_blocks, _gcry_chacha20_amd64_avx2_blocks)
(_gcry_chacha20_armv7_neon_blocks, QROUND, QOUT, chacha20_core)
(chacha20_do_encrypt_stream): Remove.
(_gcry_chacha20_amd64_ssse3_blocks4, _gcry_chacha20_amd64_avx2_blocks8)
(_gcry_chacha20_armv7_neon_blocks4, ROTATE, XOR, PLUS, PLUSONE)
(QUARTERROUND, BUF_XOR_LE32): New.
(CHACHA20_context_s, chacha20_blocks, chacha20_keysetup)
(chacha20_encrypt_stream): Rewrite.
(chacha20_do_setkey): Adjust for new CHACHA20_context_s.
* configure.ac: Remove 'chacha20-sse2-amd64.lo',
'chacha20-ssse3-amd64.lo', 'chacha20-avx2-amd64.lo'; Add
'chacha20-amd64-ssse3.lo', 'chacha20-amd64-avx2.lo'.
--
Intel Core i7-4790K CPU @ 4.00GHz (x86_64/AVX2):
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 0.319 ns/B 2988.5 MiB/s 1.28 c/B
STREAM dec | 0.318 ns/B 2995.4 MiB/s 1.27 c/B
Intel Core i7-4790K CPU @ 4.00GHz (x86_64/SSSE3):
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 0.633 ns/B 1507.4 MiB/s 2.53 c/B
STREAM dec | 0.633 ns/B 1506.6 MiB/s 2.53 c/B
Intel Core i7-4790K CPU @ 4.00GHz (i386):
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 2.05 ns/B 465.2 MiB/s 8.20 c/B
STREAM dec | 2.04 ns/B 467.5 MiB/s 8.16 c/B
Cortex-A53 @ 1152MHz (armv7/neon):
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 5.29 ns/B 180.3 MiB/s 6.09 c/B
STREAM dec | 5.29 ns/B 180.1 MiB/s 6.10 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/chacha20-armv7-neon.S')
-rw-r--r-- | cipher/chacha20-armv7-neon.S | 1071 |
1 files changed, 357 insertions, 714 deletions
diff --git a/cipher/chacha20-armv7-neon.S b/cipher/chacha20-armv7-neon.S index c1971fc7..33a43df1 100644 --- a/cipher/chacha20-armv7-neon.S +++ b/cipher/chacha20-armv7-neon.S @@ -1,6 +1,6 @@ -/* chacha20-armv7-neon.S - ARM/NEON accelerated chacha20 blocks function +/* chacha20-armv7-neon.S - ARMv7 NEON implementation of ChaCha20 cipher * - * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * Copyright (C) 2017,2018 Jussi Kivilinna <jussi.kivilinna@iki.fi> * * This file is part of Libgcrypt. * @@ -19,732 +19,375 @@ */ /* - * Based on public domain implementation by Andrew Moon at - * https://github.com/floodyberry/chacha-opt + * Based on D. J. Bernstein reference implementation at + * http://cr.yp.to/chacha.html: + * + * chacha-regs.c version 20080118 + * D. J. Bernstein + * Public domain. */ #include <config.h> #if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ - defined(HAVE_GCC_INLINE_ASM_NEON) && defined(USE_CHACHA20) + defined(HAVE_GCC_INLINE_ASM_NEON) .syntax unified .fpu neon .arm -#define UNALIGNED_STMIA8(ptr, l0, l1, l2, l3, l4, l5, l6, l7) \ - tst ptr, #3; \ - beq 1f; \ - vpush {d0-d3}; \ - vmov s0, l0; \ - vmov s1, l1; \ - vmov s2, l2; \ - vmov s3, l3; \ - vmov s4, l4; \ - vmov s5, l5; \ - vmov s6, l6; \ - vmov s7, l7; \ - vst1.32 {d0-d3}, [ptr]; \ - add ptr, #32; \ - vpop {d0-d3}; \ - b 2f; \ - 1: stmia ptr!, {l0-l7}; \ - 2: ; - -#define UNALIGNED_LDMIA4(ptr, l0, l1, l2, l3) \ - tst ptr, #3; \ - beq 1f; \ - vpush {d0-d1}; \ - vld1.32 {d0-d1}, [ptr]; \ - add ptr, #16; \ - vmov l0, s0; \ - vmov l1, s1; \ - vmov l2, s2; \ - vmov l3, s3; \ - vpop {d0-d1}; \ - b 2f; \ - 1: ldmia ptr!, {l0-l3}; \ - 2: ; - .text -.globl _gcry_chacha20_armv7_neon_blocks -.type _gcry_chacha20_armv7_neon_blocks,%function; -_gcry_chacha20_armv7_neon_blocks: -.Lchacha_blocks_neon_local: - tst r3, r3 - beq .Lchacha_blocks_neon_nobytes - vstmdb sp!, {q4,q5,q6,q7} - stmfd sp!, {r4-r12, r14} - mov r8, sp - sub sp, sp, 
#196 - and sp, sp, #0xffffffe0 - str r0, [sp, #60] - str r1, [sp, #48] - str r2, [sp, #40] - str r3, [sp, #52] - str r8, [sp, #192] - add r1, sp, #64 - ldmia r0!, {r4-r11} - stmia r1!, {r4-r11} - ldmia r0!, {r4-r11} - stmia r1!, {r4-r11} - mov r4, #20 - str r4, [sp, #44] - cmp r3, #256 - blo .Lchacha_blocks_neon_mainloop2 -.Lchacha_blocks_neon_mainloop1: - ldr r0, [sp, #44] - str r0, [sp, #0] - add r1, sp, #(64) - mov r2, #1 - veor q12, q12 - vld1.32 {q0,q1}, [r1,:128]! - vld1.32 {q2,q3}, [r1,:128] - vmov.32 d24[0], r2 - vadd.u64 q3, q3, q12 - vmov q4, q0 - vmov q5, q1 - vmov q6, q2 - vadd.u64 q7, q3, q12 - vmov q8, q0 - vmov q9, q1 - vmov q10, q2 - vadd.u64 q11, q7, q12 - add r0, sp, #64 - ldm r0, {r0-r12} - ldr r14, [sp, #(64 +60)] - str r6, [sp, #8] - str r11, [sp, #12] - str r14, [sp, #28] - ldr r11, [sp, #(64 +52)] - ldr r14, [sp, #(64 +56)] -.Lchacha_blocks_neon_rounds1: - ldr r6, [sp, #0] - vadd.i32 q0, q0, q1 - add r0, r0, r4 - vadd.i32 q4, q4, q5 - add r1, r1, r5 - vadd.i32 q8, q8, q9 - eor r12, r12, r0 - veor q12, q3, q0 - eor r11, r11, r1 - veor q13, q7, q4 - ror r12, r12, #16 - veor q14, q11, q8 - ror r11, r11, #16 - vrev32.16 q3, q12 - subs r6, r6, #2 - vrev32.16 q7, q13 - add r8, r8, r12 - vrev32.16 q11, q14 - add r9, r9, r11 - vadd.i32 q2, q2, q3 - eor r4, r4, r8 - vadd.i32 q6, q6, q7 - eor r5, r5, r9 - vadd.i32 q10, q10, q11 - str r6, [sp, #0] - veor q12, q1, q2 - ror r4, r4, #20 - veor q13, q5, q6 - ror r5, r5, #20 - veor q14, q9, q10 - add r0, r0, r4 - vshl.i32 q1, q12, #12 - add r1, r1, r5 - vshl.i32 q5, q13, #12 - ldr r6, [sp, #8] - vshl.i32 q9, q14, #12 - eor r12, r12, r0 - vsri.u32 q1, q12, #20 - eor r11, r11, r1 - vsri.u32 q5, q13, #20 - ror r12, r12, #24 - vsri.u32 q9, q14, #20 - ror r11, r11, #24 - vadd.i32 q0, q0, q1 - add r8, r8, r12 - vadd.i32 q4, q4, q5 - add r9, r9, r11 - vadd.i32 q8, q8, q9 - eor r4, r4, r8 - veor q12, q3, q0 - eor r5, r5, r9 - veor q13, q7, q4 - str r11, [sp, #20] - veor q14, q11, q8 - ror r4, r4, #25 - vshl.i32 q3, 
q12, #8 - ror r5, r5, #25 - vshl.i32 q7, q13, #8 - str r4, [sp, #4] - vshl.i32 q11, q14, #8 - ldr r4, [sp, #28] - vsri.u32 q3, q12, #24 - add r2, r2, r6 - vsri.u32 q7, q13, #24 - add r3, r3, r7 - vsri.u32 q11, q14, #24 - ldr r11, [sp, #12] - vadd.i32 q2, q2, q3 - eor r14, r14, r2 - vadd.i32 q6, q6, q7 - eor r4, r4, r3 - vadd.i32 q10, q10, q11 - ror r14, r14, #16 - veor q12, q1, q2 - ror r4, r4, #16 - veor q13, q5, q6 - add r10, r10, r14 - veor q14, q9, q10 - add r11, r11, r4 - vshl.i32 q1, q12, #7 - eor r6, r6, r10 - vshl.i32 q5, q13, #7 - eor r7, r7, r11 - vshl.i32 q9, q14, #7 - ror r6, r6, #20 - vsri.u32 q1, q12, #25 - ror r7, r7, #20 - vsri.u32 q5, q13, #25 - add r2, r2, r6 - vsri.u32 q9, q14, #25 - add r3, r3, r7 - vext.32 q3, q3, q3, #3 - eor r14, r14, r2 - vext.32 q7, q7, q7, #3 - eor r4, r4, r3 - vext.32 q11, q11, q11, #3 - ror r14, r14, #24 - vext.32 q1, q1, q1, #1 - ror r4, r4, #24 - vext.32 q5, q5, q5, #1 - add r10, r10, r14 - vext.32 q9, q9, q9, #1 - add r11, r11, r4 - vext.32 q2, q2, q2, #2 - eor r6, r6, r10 - vext.32 q6, q6, q6, #2 - eor r7, r7, r11 - vext.32 q10, q10, q10, #2 - ror r6, r6, #25 - vadd.i32 q0, q0, q1 - ror r7, r7, #25 - vadd.i32 q4, q4, q5 - add r0, r0, r5 - vadd.i32 q8, q8, q9 - add r1, r1, r6 - veor q12, q3, q0 - eor r4, r4, r0 - veor q13, q7, q4 - eor r12, r12, r1 - veor q14, q11, q8 - ror r4, r4, #16 - vrev32.16 q3, q12 - ror r12, r12, #16 - vrev32.16 q7, q13 - add r10, r10, r4 - vrev32.16 q11, q14 - add r11, r11, r12 - vadd.i32 q2, q2, q3 - eor r5, r5, r10 - vadd.i32 q6, q6, q7 - eor r6, r6, r11 - vadd.i32 q10, q10, q11 - ror r5, r5, #20 - veor q12, q1, q2 - ror r6, r6, #20 - veor q13, q5, q6 - add r0, r0, r5 - veor q14, q9, q10 - add r1, r1, r6 - vshl.i32 q1, q12, #12 - eor r4, r4, r0 - vshl.i32 q5, q13, #12 - eor r12, r12, r1 - vshl.i32 q9, q14, #12 - ror r4, r4, #24 - vsri.u32 q1, q12, #20 - ror r12, r12, #24 - vsri.u32 q5, q13, #20 - add r10, r10, r4 - vsri.u32 q9, q14, #20 - add r11, r11, r12 - vadd.i32 q0, q0, q1 - eor r5, 
r5, r10 - vadd.i32 q4, q4, q5 - eor r6, r6, r11 - vadd.i32 q8, q8, q9 - str r11, [sp, #12] - veor q12, q3, q0 - ror r5, r5, #25 - veor q13, q7, q4 - ror r6, r6, #25 - veor q14, q11, q8 - str r4, [sp, #28] - vshl.i32 q3, q12, #8 - ldr r4, [sp, #4] - vshl.i32 q7, q13, #8 - add r2, r2, r7 - vshl.i32 q11, q14, #8 - add r3, r3, r4 - vsri.u32 q3, q12, #24 - ldr r11, [sp, #20] - vsri.u32 q7, q13, #24 - eor r11, r11, r2 - vsri.u32 q11, q14, #24 - eor r14, r14, r3 - vadd.i32 q2, q2, q3 - ror r11, r11, #16 - vadd.i32 q6, q6, q7 - ror r14, r14, #16 - vadd.i32 q10, q10, q11 - add r8, r8, r11 - veor q12, q1, q2 - add r9, r9, r14 - veor q13, q5, q6 - eor r7, r7, r8 - veor q14, q9, q10 - eor r4, r4, r9 - vshl.i32 q1, q12, #7 - ror r7, r7, #20 - vshl.i32 q5, q13, #7 - ror r4, r4, #20 - vshl.i32 q9, q14, #7 - str r6, [sp, #8] - vsri.u32 q1, q12, #25 - add r2, r2, r7 - vsri.u32 q5, q13, #25 - add r3, r3, r4 - vsri.u32 q9, q14, #25 - eor r11, r11, r2 - vext.32 q3, q3, q3, #1 - eor r14, r14, r3 - vext.32 q7, q7, q7, #1 - ror r11, r11, #24 - vext.32 q11, q11, q11, #1 - ror r14, r14, #24 - vext.32 q1, q1, q1, #3 - add r8, r8, r11 - vext.32 q5, q5, q5, #3 - add r9, r9, r14 - vext.32 q9, q9, q9, #3 - eor r7, r7, r8 - vext.32 q2, q2, q2, #2 - eor r4, r4, r9 - vext.32 q6, q6, q6, #2 - ror r7, r7, #25 - vext.32 q10, q10, q10, #2 - ror r4, r4, #25 - bne .Lchacha_blocks_neon_rounds1 - str r8, [sp, #0] - str r9, [sp, #4] - str r10, [sp, #8] - str r12, [sp, #16] - str r11, [sp, #20] - str r14, [sp, #24] - add r9, sp, #64 - vld1.32 {q12,q13}, [r9,:128]! 
- ldr r12, [sp, #48] - vld1.32 {q14,q15}, [r9,:128] - ldr r14, [sp, #40] - vadd.i32 q0, q0, q12 - ldr r8, [sp, #(64 +0)] - vadd.i32 q4, q4, q12 - ldr r9, [sp, #(64 +4)] - vadd.i32 q8, q8, q12 - ldr r10, [sp, #(64 +8)] - vadd.i32 q1, q1, q13 - ldr r11, [sp, #(64 +12)] - vadd.i32 q5, q5, q13 - add r0, r0, r8 - vadd.i32 q9, q9, q13 - add r1, r1, r9 - vadd.i32 q2, q2, q14 - add r2, r2, r10 - vadd.i32 q6, q6, q14 - ldr r8, [sp, #(64 +16)] - vadd.i32 q10, q10, q14 - add r3, r3, r11 - veor q14, q14, q14 - ldr r9, [sp, #(64 +20)] - mov r11, #1 - add r4, r4, r8 - vmov.32 d28[0], r11 - ldr r10, [sp, #(64 +24)] - vadd.u64 q12, q14, q15 - add r5, r5, r9 - vadd.u64 q13, q14, q12 - ldr r11, [sp, #(64 +28)] - vadd.u64 q14, q14, q13 - add r6, r6, r10 - vadd.i32 q3, q3, q12 - tst r12, r12 - vadd.i32 q7, q7, q13 - add r7, r7, r11 - vadd.i32 q11, q11, q14 - beq .Lchacha_blocks_neon_nomessage11 - UNALIGNED_LDMIA4(r12, r8, r9, r10, r11) - tst r12, r12 - eor r0, r0, r8 - eor r1, r1, r9 - eor r2, r2, r10 - ldr r8, [r12, #0] - eor r3, r3, r11 - ldr r9, [r12, #4] - eor r4, r4, r8 - ldr r10, [r12, #8] - eor r5, r5, r9 - ldr r11, [r12, #12] - eor r6, r6, r10 - add r12, r12, #16 - eor r7, r7, r11 -.Lchacha_blocks_neon_nomessage11: - UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7) - tst r12, r12 - ldm sp, {r0-r7} - ldr r8, [sp, #(64 +32)] - ldr r9, [sp, #(64 +36)] - ldr r10, [sp, #(64 +40)] - ldr r11, [sp, #(64 +44)] - add r0, r0, r8 - add r1, r1, r9 - add r2, r2, r10 - ldr r8, [sp, #(64 +48)] - add r3, r3, r11 - ldr r9, [sp, #(64 +52)] - add r4, r4, r8 - ldr r10, [sp, #(64 +56)] - add r5, r5, r9 - ldr r11, [sp, #(64 +60)] - add r6, r6, r10 - adds r8, r8, #4 - add r7, r7, r11 - adc r9, r9, #0 - str r8, [sp, #(64 +48)] - tst r12, r12 - str r9, [sp, #(64 +52)] - beq .Lchacha_blocks_neon_nomessage12 - UNALIGNED_LDMIA4(r12, r8, r9, r10, r11) - tst r12, r12 - eor r0, r0, r8 - eor r1, r1, r9 - eor r2, r2, r10 - ldr r8, [r12, #0] - eor r3, r3, r11 - ldr r9, [r12, #4] - eor r4, r4, r8 - ldr r10, 
[r12, #8] - eor r5, r5, r9 - ldr r11, [r12, #12] - eor r6, r6, r10 - add r12, r12, #16 - eor r7, r7, r11 -.Lchacha_blocks_neon_nomessage12: - UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7) - tst r12, r12 - beq .Lchacha_blocks_neon_nomessage13 - vld1.32 {q12,q13}, [r12]! - vld1.32 {q14,q15}, [r12]! - veor q0, q0, q12 - veor q1, q1, q13 - veor q2, q2, q14 - veor q3, q3, q15 -.Lchacha_blocks_neon_nomessage13: - vst1.32 {q0,q1}, [r14]! - vst1.32 {q2,q3}, [r14]! - beq .Lchacha_blocks_neon_nomessage14 - vld1.32 {q12,q13}, [r12]! - vld1.32 {q14,q15}, [r12]! - veor q4, q4, q12 - veor q5, q5, q13 - veor q6, q6, q14 - veor q7, q7, q15 -.Lchacha_blocks_neon_nomessage14: - vst1.32 {q4,q5}, [r14]! - vst1.32 {q6,q7}, [r14]! - beq .Lchacha_blocks_neon_nomessage15 - vld1.32 {q12,q13}, [r12]! - vld1.32 {q14,q15}, [r12]! - veor q8, q8, q12 - veor q9, q9, q13 - veor q10, q10, q14 - veor q11, q11, q15 -.Lchacha_blocks_neon_nomessage15: - vst1.32 {q8,q9}, [r14]! - vst1.32 {q10,q11}, [r14]! - str r12, [sp, #48] - str r14, [sp, #40] - ldr r3, [sp, #52] - sub r3, r3, #256 - cmp r3, #256 - str r3, [sp, #52] - bhs .Lchacha_blocks_neon_mainloop1 - tst r3, r3 - beq .Lchacha_blocks_neon_done -.Lchacha_blocks_neon_mainloop2: - ldr r3, [sp, #52] - ldr r1, [sp, #48] - cmp r3, #64 - bhs .Lchacha_blocks_neon_noswap1 - add r4, sp, #128 - mov r5, r4 - tst r1, r1 - beq .Lchacha_blocks_neon_nocopy1 -.Lchacha_blocks_neon_copyinput1: - subs r3, r3, #1 - ldrb r0, [r1], #1 - strb r0, [r4], #1 - bne .Lchacha_blocks_neon_copyinput1 - str r5, [sp, #48] -.Lchacha_blocks_neon_nocopy1: - ldr r4, [sp, #40] - str r5, [sp, #40] - str r4, [sp, #56] -.Lchacha_blocks_neon_noswap1: - ldr r0, [sp, #44] - str r0, [sp, #0] - add r0, sp, #64 - ldm r0, {r0-r12} - ldr r14, [sp, #(64 +60)] - str r6, [sp, #8] - str r11, [sp, #12] - str r14, [sp, #28] - ldr r11, [sp, #(64 +52)] - ldr r14, [sp, #(64 +56)] -.Lchacha_blocks_neon_rounds2: - ldr r6, [sp, #0] - add r0, r0, r4 - add r1, r1, r5 - eor r12, r12, r0 - eor r11, r11, 
r1 - ror r12, r12, #16 - ror r11, r11, #16 - subs r6, r6, #2 - add r8, r8, r12 - add r9, r9, r11 - eor r4, r4, r8 - eor r5, r5, r9 - str r6, [sp, #0] - ror r4, r4, #20 - ror r5, r5, #20 - add r0, r0, r4 - add r1, r1, r5 - ldr r6, [sp, #8] - eor r12, r12, r0 - eor r11, r11, r1 - ror r12, r12, #24 - ror r11, r11, #24 - add r8, r8, r12 - add r9, r9, r11 - eor r4, r4, r8 - eor r5, r5, r9 - str r11, [sp, #20] - ror r4, r4, #25 - ror r5, r5, #25 - str r4, [sp, #4] - ldr r4, [sp, #28] - add r2, r2, r6 - add r3, r3, r7 - ldr r11, [sp, #12] - eor r14, r14, r2 - eor r4, r4, r3 - ror r14, r14, #16 - ror r4, r4, #16 - add r10, r10, r14 - add r11, r11, r4 - eor r6, r6, r10 - eor r7, r7, r11 - ror r6, r6, #20 - ror r7, r7, #20 - add r2, r2, r6 - add r3, r3, r7 - eor r14, r14, r2 - eor r4, r4, r3 - ror r14, r14, #24 - ror r4, r4, #24 - add r10, r10, r14 - add r11, r11, r4 - eor r6, r6, r10 - eor r7, r7, r11 - ror r6, r6, #25 - ror r7, r7, #25 - add r0, r0, r5 - add r1, r1, r6 - eor r4, r4, r0 - eor r12, r12, r1 - ror r4, r4, #16 - ror r12, r12, #16 - add r10, r10, r4 - add r11, r11, r12 - eor r5, r5, r10 - eor r6, r6, r11 - ror r5, r5, #20 - ror r6, r6, #20 - add r0, r0, r5 - add r1, r1, r6 - eor r4, r4, r0 - eor r12, r12, r1 - ror r4, r4, #24 - ror r12, r12, #24 - add r10, r10, r4 - add r11, r11, r12 - eor r5, r5, r10 - eor r6, r6, r11 - str r11, [sp, #12] - ror r5, r5, #25 - ror r6, r6, #25 - str r4, [sp, #28] - ldr r4, [sp, #4] - add r2, r2, r7 - add r3, r3, r4 - ldr r11, [sp, #20] - eor r11, r11, r2 - eor r14, r14, r3 - ror r11, r11, #16 - ror r14, r14, #16 - add r8, r8, r11 - add r9, r9, r14 - eor r7, r7, r8 - eor r4, r4, r9 - ror r7, r7, #20 - ror r4, r4, #20 - str r6, [sp, #8] - add r2, r2, r7 - add r3, r3, r4 - eor r11, r11, r2 - eor r14, r14, r3 - ror r11, r11, #24 - ror r14, r14, #24 - add r8, r8, r11 - add r9, r9, r14 - eor r7, r7, r8 - eor r4, r4, r9 - ror r7, r7, #25 - ror r4, r4, #25 - bne .Lchacha_blocks_neon_rounds2 - str r8, [sp, #0] - str r9, [sp, #4] - str r10, 
[sp, #8] - str r12, [sp, #16] - str r11, [sp, #20] - str r14, [sp, #24] - ldr r12, [sp, #48] - ldr r14, [sp, #40] - ldr r8, [sp, #(64 +0)] - ldr r9, [sp, #(64 +4)] - ldr r10, [sp, #(64 +8)] - ldr r11, [sp, #(64 +12)] - add r0, r0, r8 - add r1, r1, r9 - add r2, r2, r10 - ldr r8, [sp, #(64 +16)] - add r3, r3, r11 - ldr r9, [sp, #(64 +20)] - add r4, r4, r8 - ldr r10, [sp, #(64 +24)] - add r5, r5, r9 - ldr r11, [sp, #(64 +28)] - add r6, r6, r10 - tst r12, r12 - add r7, r7, r11 - beq .Lchacha_blocks_neon_nomessage21 - UNALIGNED_LDMIA4(r12, r8, r9, r10, r11) - tst r12, r12 - eor r0, r0, r8 - eor r1, r1, r9 - eor r2, r2, r10 - ldr r8, [r12, #0] - eor r3, r3, r11 - ldr r9, [r12, #4] - eor r4, r4, r8 - ldr r10, [r12, #8] - eor r5, r5, r9 - ldr r11, [r12, #12] - eor r6, r6, r10 - add r12, r12, #16 - eor r7, r7, r11 -.Lchacha_blocks_neon_nomessage21: - UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7) - ldm sp, {r0-r7} - ldr r8, [sp, #(64 +32)] - ldr r9, [sp, #(64 +36)] - ldr r10, [sp, #(64 +40)] - ldr r11, [sp, #(64 +44)] - add r0, r0, r8 - add r1, r1, r9 - add r2, r2, r10 - ldr r8, [sp, #(64 +48)] - add r3, r3, r11 - ldr r9, [sp, #(64 +52)] - add r4, r4, r8 - ldr r10, [sp, #(64 +56)] - add r5, r5, r9 - ldr r11, [sp, #(64 +60)] - add r6, r6, r10 - adds r8, r8, #1 - add r7, r7, r11 - adc r9, r9, #0 - str r8, [sp, #(64 +48)] - tst r12, r12 - str r9, [sp, #(64 +52)] - beq .Lchacha_blocks_neon_nomessage22 - UNALIGNED_LDMIA4(r12, r8, r9, r10, r11) - tst r12, r12 - eor r0, r0, r8 - eor r1, r1, r9 - eor r2, r2, r10 - ldr r8, [r12, #0] - eor r3, r3, r11 - ldr r9, [r12, #4] - eor r4, r4, r8 - ldr r10, [r12, #8] - eor r5, r5, r9 - ldr r11, [r12, #12] - eor r6, r6, r10 - add r12, r12, #16 - eor r7, r7, r11 -.Lchacha_blocks_neon_nomessage22: - UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7) - str r12, [sp, #48] - str r14, [sp, #40] - ldr r3, [sp, #52] - cmp r3, #64 - sub r4, r3, #64 - str r4, [sp, #52] - bhi .Lchacha_blocks_neon_mainloop2 - cmp r3, #64 - beq 
.Lchacha_blocks_neon_nocopy2 - ldr r1, [sp, #56] - sub r14, r14, #64 -.Lchacha_blocks_neon_copyinput2: - subs r3, r3, #1 - ldrb r0, [r14], #1 - strb r0, [r1], #1 - bne .Lchacha_blocks_neon_copyinput2 -.Lchacha_blocks_neon_nocopy2: -.Lchacha_blocks_neon_done: - ldr r7, [sp, #60] - ldr r8, [sp, #(64 +48)] - ldr r9, [sp, #(64 +52)] - str r8, [r7, #(48 + 0)] - str r9, [r7, #(48 + 4)] +#ifdef __PIC__ +# define GET_DATA_POINTER(reg, name, rtmp) \ + ldr reg, 1f; \ + ldr rtmp, 2f; \ + b 3f; \ + 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \ + 2: .word name(GOT); \ + 3: add reg, pc, reg; \ + ldr reg, [reg, rtmp]; +#else +# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name +#endif + +/* register macros */ +#define INPUT r0 +#define DST r1 +#define SRC r2 +#define NBLKS r3 +#define ROUND r4 + +/* stack structure */ +#define STACK_VEC_X12 (16) +#define STACK_VEC_X13 (STACK_VEC_X12 + 16) +#define STACK_TMP (STACK_VEC_X13 + 16) +#define STACK_TMP1 (16 + STACK_TMP) +#define STACK_TMP2 (16 + STACK_TMP1) + +#define STACK_MAX (16 + STACK_TMP2) + +/* vector registers */ +#define X0 q0 +#define X1 q1 +#define X2 q2 +#define X3 q3 +#define X4 q4 +#define X5 q5 +#define X6 q6 +#define X7 q7 +#define X8 q8 +#define X9 q9 +#define X10 q10 +#define X11 q11 +#define X12 q12 +#define X13 q13 +#define X14 q14 +#define X15 q15 + +#define X0l d0 +#define X1l d2 +#define X2l d4 +#define X3l d6 +#define X4l d8 +#define X5l d10 +#define X6l d12 +#define X7l d14 +#define X8l d16 +#define X9l d18 +#define X10l d20 +#define X11l d22 +#define X12l d24 +#define X13l d26 +#define X14l d28 +#define X15l d30 + +#define X0h d1 +#define X1h d3 +#define X2h d5 +#define X3h d7 +#define X4h d9 +#define X5h d11 +#define X6h d13 +#define X7h d15 +#define X8h d17 +#define X9h d19 +#define X10h d21 +#define X11h d23 +#define X12h d25 +#define X13h d27 +#define X14h d29 +#define X15h d31 + +/********************************************************************** + helper macros + 
**********************************************************************/ + +/* 4x4 32-bit integer matrix transpose */ +#define transpose_4x4_part1(_q0, _q1, _q2, _q3) \ + vtrn.32 _q0, _q1; \ + vtrn.32 _q2, _q3; +#define transpose_4x4_part2(_q0, _q1, _q2, _q3) \ + vswp _q0##h, _q2##l; \ + vswp _q1##h, _q3##l; + +#define clear(x) veor x,x,x; + +/********************************************************************** + 4-way chacha20 + **********************************************************************/ + +#define ROTATE2(dst1,dst2,c,src1,src2) \ + vshl.u32 dst1, src1, #(c); \ + vshl.u32 dst2, src2, #(c); \ + vsri.u32 dst1, src1, #(32 - (c)); \ + vsri.u32 dst2, src2, #(32 - (c)); + +#define ROTATE2_16(dst1,dst2,src1,src2) \ + vrev32.16 dst1, src1; \ + vrev32.16 dst2, src2; + +#define XOR(d,s1,s2) \ + veor d, s2, s1; + +#define PLUS(ds,s) \ + vadd.u32 ds, ds, s; + +#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2) \ + PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \ + ROTATE2_16(d1, d2, tmp1, tmp2); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \ + ROTATE2(b1, b2, 12, tmp1, tmp2); \ + PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \ + ROTATE2(d1, d2, 8, tmp1, tmp2); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \ + ROTATE2(b1, b2, 7, tmp1, tmp2); + +chacha20_data: +.align 4 +.Linc_counter: + .long 0,1,2,3 + +.align 3 +.globl _gcry_chacha20_armv7_neon_blocks4 +.type _gcry_chacha20_armv7_neon_blocks4,%function; + +_gcry_chacha20_armv7_neon_blocks4: + /* input: + * r0: input + * r1: dst + * r2: src + * r3: nblks (multiple of 4) + */ + + vpush {q4-q7}; + push {r4-r12,lr}; + mov r12, sp - stmia r12!, {r0-r7} - add r12, r12, #48 - stmia r12!, {r0-r7} - sub r0, sp, #8 - ldr sp, [sp, #192] - ldmfd sp!, {r4-r12, r14} - vldm sp!, {q4-q7} - sub r0, sp, r0 - bx lr -.Lchacha_blocks_neon_nobytes: - mov r0, #0; + + mov r6, sp; + sub r6, r6, #(STACK_MAX); + and r6, r6, #(~15); + mov sp, r6; + 
GET_DATA_POINTER(r9, .Linc_counter, lr); + add lr, INPUT, #(12*4); + add r8, sp, #STACK_VEC_X12; + +.Loop4: + mov ROUND, #20; + + /* Construct counter vectors X12 and X13 */ + + vld1.8 {X15}, [lr]; + mov lr, INPUT; + vld1.8 {X8}, [r9]; + vdup.32 X12, X15l[0]; + vdup.32 X13, X15l[1]; + vld1.8 {X3}, [lr]!; + vadd.u32 X12, X12, X8; + vdup.32 X0, X3l[0]; + vdup.32 X1, X3l[1]; + vdup.32 X2, X3h[0]; + vcgt.u32 X8, X8, X12; + vdup.32 X3, X3h[1]; + vdup.32 X14, X15h[0]; + vdup.32 X15, X15h[1]; + vsub.u32 X13, X13, X8; + vld1.8 {X7}, [lr]!; + vld1.8 {X11}, [lr]; + vst1.8 {X12, X13}, [r8]; + vdup.32 X4, X7l[0]; + vdup.32 X5, X7l[1]; + vdup.32 X6, X7h[0]; + vdup.32 X7, X7h[1]; + vdup.32 X8, X11l[0]; + vdup.32 X9, X11l[1]; + vdup.32 X10, X11h[0]; + vdup.32 X11, X11h[1]; + + add r7, sp, #STACK_TMP2; + add r6, sp, #STACK_TMP1; + add r5, sp, #STACK_TMP; + vst1.8 {X15}, [r6]; + vst1.8 {X11}, [r5]; + + mov lr, INPUT; +.Lround2: + subs ROUND, ROUND, #2 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15) + vld1.8 {X11}, [r5]; + vld1.8 {X15}, [r6]; + vst1.8 {X8}, [r5]; + vst1.8 {X9}, [r6]; + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9) + vld1.8 {X8}, [r5]; + vld1.8 {X9}, [r6]; + vst1.8 {X11}, [r5]; + vst1.8 {X15}, [r6]; + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15) + bne .Lround2; + + vld1.8 {X11}, [lr]!; + vst1.8 {X14}, [r7]; + + vdup.32 X14, X11l[0]; /* INPUT + 0 * 4 */ + vdup.32 X15, X11l[1]; /* INPUT + 1 * 4 */ + PLUS(X0, X14); + PLUS(X1, X15); + vdup.32 X14, X11h[0]; /* INPUT + 2 * 4 */ + vdup.32 X15, X11h[1]; /* INPUT + 3 * 4 */ + PLUS(X2, X14); + PLUS(X3, X15); + + vld1.8 {X11}, [r5]; + vld1.8 {X15}, [r6]; + vst1.8 {X0}, [r5]; + vld1.8 {X0}, [lr]!; + vst1.8 {X1}, [r6]; + + vdup.32 X14, X0l[0]; /* INPUT + 4 * 4 */ + vdup.32 X1, X0l[1]; /* INPUT + 5 * 4 */ + PLUS(X4, X14); + PLUS(X5, X1); + vdup.32 X14, X0h[0]; /* INPUT + 6 * 4 */ + vdup.32 X1, X0h[1]; /* INPUT + 7 * 
4 */ + PLUS(X6, X14); + PLUS(X7, X1); + + vld1.8 {X0}, [lr]!; + + vdup.32 X14, X0l[0]; /* INPUT + 8 * 4 */ + vdup.32 X1, X0l[1]; /* INPUT + 9 * 4 */ + PLUS(X8, X14); + PLUS(X9, X1); + vdup.32 X14, X0h[0]; /* INPUT + 10 * 4 */ + vdup.32 X1, X0h[1]; /* INPUT + 11 * 4 */ + PLUS(X10, X14); + PLUS(X11, X1); + + vld1.8 {X0}, [lr]; + add lr, INPUT, #(12*4) + vld1.8 {X14}, [r7]; + + vdup.32 X1, X0h[0]; /* INPUT + 10 * 4 */ + ldm lr, {r10, r11}; /* Update counter */ + vdup.32 X0, X0h[1]; /* INPUT + 11 * 4 */ + PLUS(X14, X1); + PLUS(X15, X0); + adds r10, r10, #4; /* Update counter */ + vld1.8 {X0, X1}, [r8]; + + PLUS(X12, X0); + vld1.8 {X0}, [r5]; + PLUS(X13, X1); + adc r11, r11, #0; /* Update counter */ + + vld1.8 {X1}, [r6]; + stm lr, {r10, r11}; /* Update counter */ + transpose_4x4_part1(X0, X1, X2, X3); + transpose_4x4_part1(X4, X5, X6, X7); + transpose_4x4_part1(X8, X9, X10, X11); + transpose_4x4_part1(X12, X13, X14, X15); + transpose_4x4_part2(X0, X1, X2, X3); + transpose_4x4_part2(X4, X5, X6, X7); + transpose_4x4_part2(X8, X9, X10, X11); + transpose_4x4_part2(X12, X13, X14, X15); + + subs NBLKS, NBLKS, #4; + + vst1.8 {X10}, [r5]; + add lr, INPUT, #(12*4) + vst1.8 {X11}, [r6]; + vld1.8 {X10, X11}, [SRC]!; + veor X10, X0, X10; + vld1.8 {X0}, [SRC]!; + veor X11, X4, X11; + vld1.8 {X4}, [SRC]!; + vst1.8 {X10, X11}, [DST]!; + vld1.8 {X10, X11}, [SRC]!; + veor X0, X8, X0; + veor X4, X12, X4; + veor X10, X1, X10; + veor X11, X5, X11; + vst1.8 {X0}, [DST]!; + vld1.8 {X0, X1}, [SRC]!; + vst1.8 {X4}, [DST]!; + vld1.8 {X4, X5}, [SRC]!; + vst1.8 {X10, X11}, [DST]!; + vld1.8 {X10}, [r5]; + vld1.8 {X11}, [r6]; + veor X0, X9, X0; + vld1.8 {X8, X9}, [SRC]!; + veor X1, X13, X1; + vld1.8 {X12, X13}, [SRC]!; + veor X4, X2, X4; + veor X5, X6, X5; + vst1.8 {X0, X1}, [DST]!; + vld1.8 {X0, X1}, [SRC]!; + vst1.8 {X4, X5}, [DST]!; + veor X8, X10, X8; + veor X9, X14, X9; + veor X12, X3, X12; + veor X13, X7, X13; + veor X0, X11, X0; + veor X1, X15, X1; + vst1.8 {X8, X9}, [DST]!; + vst1.8 {X12, 
X13}, [DST]!; + vst1.8 {X0, X1}, [DST]!; + + bne .Loop4; + + /* clear the used vector registers and stack */ + clear(X0); + vst1.8 {X0}, [r5]; + vst1.8 {X0}, [r6]; + vst1.8 {X0}, [r7]; + vst1.8 {X0}, [r8]!; + vst1.8 {X0}, [r8]; + + mov sp, r12 + clear(X1); + clear(X2); + clear(X3); + clear(X4); + clear(X5); + clear(X6); + clear(X7); + clear(X8); + clear(X9); + clear(X10); + clear(X11); + clear(X12); + clear(X13); + clear(X14); + clear(X15); + + pop {r4-r12,lr} + vpop {q4-q7} + eor r0, r0, r0 bx lr -.ltorg -.size _gcry_chacha20_armv7_neon_blocks,.-_gcry_chacha20_armv7_neon_blocks; +.size _gcry_chacha20_armv7_neon_blocks4, .-_gcry_chacha20_armv7_neon_blocks4; #endif |