/* sha512-armv7-neon.S - ARM/NEON assembly implementation of SHA-512 transform
 *
 * Copyright © 2013 Jussi Kivilinna
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <config.h>

#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_NEON)

.text

.syntax unified
.fpu neon
.arm

/* structure of SHA512_CONTEXT */
#define hd_a 0
#define hd_b ((hd_a) + 8)
#define hd_c ((hd_b) + 8)
#define hd_d ((hd_c) + 8)
#define hd_e ((hd_d) + 8)
#define hd_f ((hd_e) + 8)
#define hd_g ((hd_f) + 8)

/* register macros */
#define RK %r2

#define RA d0
#define RB d1
#define RC d2
#define RD d3
#define RE d4
#define RF d5
#define RG d6
#define RH d7

#define RT0 d8
#define RT1 d9
#define RT2 d10
#define RT3 d11
#define RT4 d12
#define RT5 d13
#define RT6 d14
#define RT7 d15

#define RW0 d16
#define RW1 d17
#define RW2 d18
#define RW3 d19
#define RW4 d20
#define RW5 d21
#define RW6 d22
#define RW7 d23
#define RW8 d24
#define RW9 d25
#define RW10 d26
#define RW11 d27
#define RW12 d28
#define RW13 d29
#define RW14 d30
#define RW15 d31

#define RW01q q8
#define RW23q q9
#define RW45q q10
#define RW67q q11
#define RW89q q12
#define RW1011q q13
#define RW1213q q14
#define RW1415q q15

/***********************************************************************
 * ARM assembly implementation of sha512 transform
 ***********************************************************************/

#define round_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw14, rw9, rw1) \
	/* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
	vshr.u64 RT1, re, #14; \
	vshl.u64 RT3, re, #64 - 14; \
	vshr.u64 RT4, re, #18; \
	vshl.u64 RT5, re, #64 - 18; \
	veor.64 RT1, RT1, RT3; \
	vld1.64 {RT0}, [RK]!; \
	veor.64 RT1, RT1, RT4; \
	vshr.u64 RT3, re, #41; \
	vshl.u64 RT4, re, #64 - 41; \
	veor.64 RT1, RT1, RT5; \
	vadd.u64 RT0, RT0, rw0; \
	veor.64 RT1, RT1, RT3; \
	vand.64 RT2, re, rf; \
	veor.64 RT1, RT1, RT4; \
	vbic.64 RT6, rg, re; \
	\
	vadd.u64 RT1, RT1, rh; \
	veor.64 RT2, RT2, RT6; \
	vshr.u64 rh, ra, #28; \
	vshl.u64 RT3, ra, #64 - 28; \
	vadd.u64 RT1, RT1, RT0; \
	vshr.u64 RT4, ra, #34; \
	veor.64 rh, rh, RT3; \
	vshl.u64 RT5, ra, #64 - 34; \
	vadd.u64 RT1, RT1, RT2; \
	\
	/* h = Sum0 (a) + Maj (a, b, c); */ \
	veor.64 rh, rh, RT4; \
	vshr.u64 RT3, ra, #39; \
	vshl.u64 RT4, ra, #64 - 39; \
	vorr.64 RT6, ra, rb; \
	vand.64 RT0, ra, rb; \
	veor.64 rh, rh, RT5; \
	vand.64 RT6, RT6, rc; \
	veor.64 rh, rh, RT3; \
	vorr.64 RT0, RT0, RT6; \
	veor.64 rh, rh, RT4; \
	vshr.u64 RT4, rw14, #19; \
	vadd.u64 rh, rh, RT0; \
	vshl.u64 RT2, rw14, #64 - 19; \
	\
	/* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \
	vshr.u64 RT3, rw14, #61; \
	vshl.u64 RT6, rw14, #64 - 61; \
	veor.64 RT0, RT4, RT2; \
	vshr.u64 RT2, rw14, #6; \
	veor.64 RT0, RT0, RT3; \
	vshr.u64 RT7, rw1, #1; \
	veor.64 RT0, RT0, RT6; \
	vshl.u64 RT4, rw1, #64 - 1; \
	veor.64 RT0, RT0, RT2; \
	vshr.u64 RT5, rw1, #8; \
	vadd.u64 rw0, rw0, RT0; \
	vshl.u64 RT6, rw1, #64 - 8; \
	veor.64 RT7, RT7, RT4; \
	vshr.u64 RT4, rw1, #7; \
	veor.64 RT7, RT7, RT5; \
	vadd.u64 rw0, rw0, rw9; /* w[0]+=w[9]; */ \
	veor.64 RT7, RT7, RT6; \
	vadd.u64 rd, rd, RT1; /* d+=t1; */ \
	veor.64 RT7, RT7, RT4; \
	vadd.u64 rh, rh, RT1; /* h+=t1; */ \
	vadd.u64 rw0, rw0, RT7;
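/* For reference, one round t of rounds 0..63 in C (a sketch following the
 * FIPS 180-4 definitions; ROTR64 and the Sum/S/Ch/Maj helper names are
 * illustrative, not part of this file).  round_0_63 above interleaves this
 * round computation with the in-place message-schedule update; round_64_79
 * below performs the same round without the schedule update:
 *
 *   t1 = h + Sum1(e) + Ch(e, f, g) + k[t] + w[t % 16];
 *   d += t1;
 *   h  = Sum0(a) + Maj(a, b, c) + t1;
 *   w[t % 16] += S1(w[(t + 14) % 16]) + w[(t + 9) % 16] + S0(w[(t + 1) % 16]);
 *
 * where, matching the shift constants used above:
 *
 *   Sum1(e) = ROTR64(e, 14) ^ ROTR64(e, 18) ^ ROTR64(e, 41)
 *   Sum0(a) = ROTR64(a, 28) ^ ROTR64(a, 34) ^ ROTR64(a, 39)
 *   Ch(e, f, g)  = (e & f) ^ (~e & g)        // vand + vbic
 *   Maj(a, b, c) = (a & b) | ((a | b) & c)   // vand/vorr form used here
 *   S1(x) = ROTR64(x, 19) ^ ROTR64(x, 61) ^ (x >> 6)
 *   S0(x) = ROTR64(x, 1)  ^ ROTR64(x, 8)  ^ (x >> 7)
 *
 * NEON has no 64-bit rotate, so each ROTR64 is implemented as a
 * vshr.u64/vshl.u64 pair combined with veor.
 */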
#define round_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0) \
	/* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
	vld1.64 {RT0}, [RK]!; \
	vshr.u64 RT1, re, #14; \
	vshl.u64 RT3, re, #64 - 14; \
	vshr.u64 RT4, re, #18; \
	vshl.u64 RT5, re, #64 - 18; \
	veor.64 RT1, RT1, RT3; \
	vshr.u64 RT7, ra, #28; \
	veor.64 RT1, RT1, RT4; \
	vshr.u64 RT3, re, #41; \
	vshl.u64 RT4, re, #64 - 41; \
	veor.64 RT1, RT1, RT5; \
	vadd.u64 RT0, RT0, rw0; \
	veor.64 RT1, RT1, RT3; \
	vand.64 RT2, re, rf; \
	veor.64 RT1, RT1, RT4; \
	vbic.64 RT6, rg, re; \
	\
	vadd.u64 RT1, RT1, rh; \
	veor.64 RT2, RT2, RT6; \
	vadd.u64 RT1, RT1, RT0; \
	vshr.u64 RT4, ra, #34; \
	vshl.u64 RT5, ra, #64 - 34; \
	\
	/* t7 = Sum0 (a) + Maj (a, b, c); */ \
	vshl.u64 RT6, ra, #64 - 28; \
	veor.64 RT7, RT7, RT4; \
	vshr.u64 RT3, ra, #39; \
	veor.64 RT7, RT7, RT6; \
	vshl.u64 RT4, ra, #64 - 39; \
	vorr.64 RT6, ra, rb; \
	vand.64 RT0, ra, rb; \
	veor.64 RT7, RT7, RT5; \
	vand.64 RT6, RT6, rc; \
	veor.64 RT7, RT7, RT3; \
	vorr.64 RT0, RT0, RT6; \
	veor.64 RT7, RT7, RT4; \
	vadd.u64 RT1, RT1, RT2; \
	vadd.u64 RT7, RT7, RT0; \
	vadd.u64 rd, rd, RT1; /* d+=t1; */ \
	vadd.u64 rh, RT7, RT1; /* h=t7+t1; */

.align 3
.globl _gcry_sha512_transform_armv7_neon
.type _gcry_sha512_transform_armv7_neon,%function;

_gcry_sha512_transform_armv7_neon:
	/* Input:
	 *	%r0: SHA512_CONTEXT
	 *	%r1: data
	 *	%r2: u64 k[] constants
	 */
	mov %r3, #0;

	/* Load context to d0-d7 */
	vld1.64 {RA-RD}, [%r0]!;
	vld1.64 {RE-RH}, [%r0];
	sub %r0, #(4*8);

	/* Load input to w[16], d16-d31 */
	/* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. */
	vld1.64 {RW0-RW3}, [%r1]!;
	vld1.64 {RW4-RW7}, [%r1]!;
	vld1.64 {RW8-RW11}, [%r1]!;
	vld1.64 {RW12-RW15}, [%r1];
#ifdef __ARMEL__
	/* byteswap */
	vrev64.8 RW01q, RW01q;
	vrev64.8 RW23q, RW23q;
	vrev64.8 RW45q, RW45q;
	vrev64.8 RW67q, RW67q;
	vrev64.8 RW89q, RW89q;
	vrev64.8 RW1011q, RW1011q;
	vrev64.8 RW1213q, RW1213q;
	vrev64.8 RW1415q, RW1415q;
#endif
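	/* The 80 rounds are driven as follows: %r3 advances in steps of 16,
	 * so the .Loop body below runs four times (rounds 0..63, with message
	 * expansion), then execution falls through into sixteen round_64_79
	 * invocations (rounds 64..79, where no further expansion is needed).
	 * A C sketch of the same structure, using the 16-word rolling window
	 * w[] loaded above and the constants k[] passed in %r2 (macro names
	 * illustrative):
	 *
	 *   for (t = 0; t < 64; t++)
	 *     ROUND_0_63(t);    // uses and updates w[t % 16]
	 *   for (; t < 80; t++)
	 *     ROUND_64_79(t);   // uses w[t % 16] as-is
	 */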
	/* EABI says that d8-d15 must be preserved by callee. */
	vpush {RT0-RT7};

.Loop:
	add %r3, #16;
	round_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW14, RW9, RW1);
	cmp %r3, #64;
	round_0_63(RH, RA, RB, RC, RD, RE, RF, RG, RW1, RW15, RW10, RW2);
	round_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW0, RW11, RW3);
	round_0_63(RF, RG, RH, RA, RB, RC, RD, RE, RW3, RW1, RW12, RW4);
	round_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW2, RW13, RW5);
	round_0_63(RD, RE, RF, RG, RH, RA, RB, RC, RW5, RW3, RW14, RW6);
	round_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW4, RW15, RW7);
	round_0_63(RB, RC, RD, RE, RF, RG, RH, RA, RW7, RW5, RW0, RW8);
	round_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW6, RW1, RW9);
	round_0_63(RH, RA, RB, RC, RD, RE, RF, RG, RW9, RW7, RW2, RW10);
	round_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW8, RW3, RW11);
	round_0_63(RF, RG, RH, RA, RB, RC, RD, RE, RW11, RW9, RW4, RW12);
	round_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW10, RW5, RW13);
	round_0_63(RD, RE, RF, RG, RH, RA, RB, RC, RW13, RW11, RW6, RW14);
	round_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW12, RW7, RW15);
	round_0_63(RB, RC, RD, RE, RF, RG, RH, RA, RW15, RW13, RW8, RW0);
	bne .Loop;

	round_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0);
	round_64_79(RH, RA, RB, RC, RD, RE, RF, RG, RW1);
	round_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2);
	round_64_79(RF, RG, RH, RA, RB, RC, RD, RE, RW3);
	round_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4);
	round_64_79(RD, RE, RF, RG, RH, RA, RB, RC, RW5);
	round_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6);
	round_64_79(RB, RC, RD, RE, RF, RG, RH, RA, RW7);
	round_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8);
	round_64_79(RH, RA, RB, RC, RD, RE, RF, RG, RW9);
	round_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10);
	round_64_79(RF, RG, RH, RA, RB, RC, RD, RE, RW11);
	round_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12);
	round_64_79(RD, RE, RF, RG, RH, RA, RB, RC, RW13);
	round_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14);
	round_64_79(RB, RC, RD, RE, RF, RG, RH, RA, RW15);

	/* Load context to d16-d23 */
	vld1.64 {RW0-RW3}, [%r0]!;
	vld1.64 {RW4-RW7}, [%r0];
	sub %r0, #(4*8);

	vadd.u64 RA, RW0;
	vadd.u64 RB, RW1;
	vadd.u64 RC, RW2;
	vadd.u64 RD, RW3;
	vadd.u64 RE, RW4;
	vadd.u64 RF, RW5;
	vadd.u64 RG, RW6;
	vadd.u64 RH, RW7;

	/* Store the first half of context */
	vst1.64 {RA-RD}, [%r0]!;

	/* Clear used registers */
	/* d16-d31 */
	veor.u64 RW01q, RW01q;
	veor.u64 RW23q, RW23q;
	veor.u64 RW45q, RW45q;
	veor.u64 RW67q, RW67q;
	vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
	veor.u64 RW89q, RW89q;
	veor.u64 RW1011q, RW1011q;
	veor.u64 RW1213q, RW1213q;
	veor.u64 RW1415q, RW1415q;
	/* d8-d15 */
	vpop {RT0-RT7};
	/* d0-d7 (q0-q3) */
	veor.u64 %q0, %q0;
	veor.u64 %q1, %q1;
	veor.u64 %q2, %q2;
	veor.u64 %q3, %q3;

	bx %lr;
.size _gcry_sha512_transform_armv7_neon,.-_gcry_sha512_transform_armv7_neon;

#endif
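/* For illustration, a C-side declaration consistent with the register
 * interface documented above (%r0 = context, %r1 = one 128-byte input
 * block, %r2 = round constants).  The authoritative prototype lives in the
 * C part of the SHA-512 module; the parameter names and the use of void *
 * here are assumptions:
 *
 *   void _gcry_sha512_transform_armv7_neon (void *hd,
 *                                           const unsigned char *data,
 *                                           const u64 k[80]);
 *
 * The context must begin with the eight 64-bit state words a..h laid out
 * contiguously (see the hd_a..hd_g offsets above), and k[] must hold the 80
 * SHA-512 round constants in order, since RK (%r2) is simply post-
 * incremented once per round.
 */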