summaryrefslogtreecommitdiff
path: root/cipher/keccak_permute_64.h
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@iki.fi>2015-10-23 22:30:48 +0300
committerJussi Kivilinna <jussi.kivilinna@iki.fi>2015-10-28 20:08:56 +0200
commit74184c28fbe7ff58cf57f0094ef957d94045da7d (patch)
tree3d1c33a07e749439aa010abbdfbdd6fff0364356 /cipher/keccak_permute_64.h
parent909644ef5883927262366c356eed530e55aba478 (diff)
downloadlibgcrypt-74184c28fbe7ff58cf57f0094ef957d94045da7d.tar.gz
keccak: rewrite for improved performance
* cipher/Makefile.am: Add 'keccak_permute_32.h' and 'keccak_permute_64.h'. * cipher/hash-common.h [USE_SHA3] (MD_BLOCK_MAX_BLOCKSIZE): Remove. * cipher/keccak.c (USE_64BIT, USE_32BIT, USE_64BIT_BMI2) (USE_64BIT_SHLD, USE_32BIT_BMI2, NEED_COMMON64, NEED_COMMON32BI) (keccak_ops_t): New. (KECCAK_STATE): Add 'state64' and 'state32bi' members. (KECCAK_CONTEXT): Remove 'bctx'; add 'blocksize', 'count' and 'ops'. (rol64, keccak_f1600_state_permute): Remove. [NEED_COMMON64] (round_consts_64bit, keccak_extract_inplace64): New. [NEED_COMMON32BI] (round_consts_32bit, keccak_extract_inplace32bi) (keccak_absorb_lane32bi): New. [USE_64BIT] (ANDN64, ROL64, keccak_f1600_state_permute64) (keccak_absorb_lanes64, keccak_generic64_ops): New. [USE_64BIT_SHLD] (ANDN64, ROL64, keccak_f1600_state_permute64_shld) (keccak_absorb_lanes64_shld, keccak_shld_64_ops): New. [USE_64BIT_BMI2] (ANDN64, ROL64, keccak_f1600_state_permute64_bmi2) (keccak_absorb_lanes64_bmi2, keccak_bmi2_64_ops): New. [USE_32BIT] (ANDN64, ROL64, keccak_f1600_state_permute32bi) (keccak_absorb_lanes32bi, keccak_generic32bi_ops): New. [USE_32BIT_BMI2] (ANDN64, ROL64, keccak_f1600_state_permute32bi_bmi2) (pext, pdep, keccak_absorb_lane32bi_bmi2, keccak_absorb_lanes32bi_bmi2) (keccak_extract_inplace32bi_bmi2, keccak_bmi2_32bi_ops): New. (keccak_write): New. (keccak_init): Adjust to KECCAK_CONTEXT changes; add implementation selection based on HWF features. (keccak_final): Adjust to KECCAK_CONTEXT changes; use selected 'ops' for state manipulation. (keccak_read): Adjust to KECCAK_CONTEXT changes. (_gcry_digest_spec_sha3_224, _gcry_digest_spec_sha3_256) (_gcry_digest_spec_sha3_348, _gcry_digest_spec_sha3_512): Use 'keccak_write' instead of '_gcry_md_block_write'. * cipher/keccak_permute_32.h: New. * cipher/keccak_permute_64.h: New. -- Patch adds new generic 64-bit and 32-bit implementations and optimized implementations for SHA3: - Generic 64-bit implementation based on 'simple' implementation from SUPERCOP package. - Generic 32-bit bit-inteleaved implementataion based on 'simple32bi' implementation from SUPERCOP package. - Intel BMI2 optimized variants of 64-bit and 32-bit BI implementations. - Intel SHLD optimized variant of 64-bit implementation. Patch also makes proper use of sponge construction to avoid use of addition input buffer. Below are bench-slope benchmarks for new 64-bit implementations made on Intel Core i5-4570 (no turbo, 3.2 Ghz, gcc-4.9.2). Before (amd64): SHA3-224 | 3.92 ns/B 243.2 MiB/s 12.55 c/B SHA3-256 | 4.15 ns/B 230.0 MiB/s 13.27 c/B SHA3-384 | 5.40 ns/B 176.6 MiB/s 17.29 c/B SHA3-512 | 7.77 ns/B 122.7 MiB/s 24.87 c/B After (generic 64-bit, amd64), 1.10x faster): SHA3-224 | 3.57 ns/B 267.4 MiB/s 11.42 c/B SHA3-256 | 3.77 ns/B 252.8 MiB/s 12.07 c/B SHA3-384 | 4.91 ns/B 194.1 MiB/s 15.72 c/B SHA3-512 | 7.06 ns/B 135.0 MiB/s 22.61 c/B After (Intel SHLD 64-bit, amd64, 1.13x faster): SHA3-224 | 3.48 ns/B 273.7 MiB/s 11.15 c/B SHA3-256 | 3.68 ns/B 258.9 MiB/s 11.79 c/B SHA3-384 | 4.80 ns/B 198.7 MiB/s 15.36 c/B SHA3-512 | 6.89 ns/B 138.4 MiB/s 22.05 c/B After (Intel BMI2 64-bit, amd64, 1.45x faster): SHA3-224 | 2.71 ns/B 352.1 MiB/s 8.67 c/B SHA3-256 | 2.86 ns/B 333.2 MiB/s 9.16 c/B SHA3-384 | 3.72 ns/B 256.2 MiB/s 11.91 c/B SHA3-512 | 5.34 ns/B 178.5 MiB/s 17.10 c/B Benchmarks of new 32-bit implementations on Intel Core i5-4570 (no turbo, 3.2 Ghz, gcc-4.9.2): Before (win32): SHA3-224 | 12.05 ns/B 79.16 MiB/s 38.56 c/B SHA3-256 | 12.75 ns/B 74.78 MiB/s 40.82 c/B SHA3-384 | 16.63 ns/B 57.36 MiB/s 53.22 c/B SHA3-512 | 23.97 ns/B 39.79 MiB/s 76.72 c/B After (generic 32-bit BI, win32, 1.23x to 1.29x faster): SHA3-224 | 9.76 ns/B 97.69 MiB/s 31.25 c/B SHA3-256 | 10.27 ns/B 92.82 MiB/s 32.89 c/B SHA3-384 | 13.22 ns/B 72.16 MiB/s 42.31 c/B SHA3-512 | 18.65 ns/B 51.13 MiB/s 59.70 c/B After (Intel BMI2 32-bit BI, win32, 1.66x to 1.70x faster): SHA3-224 | 7.26 ns/B 131.4 MiB/s 23.23 c/B SHA3-256 | 7.65 ns/B 124.7 MiB/s 24.47 c/B SHA3-384 | 9.87 ns/B 96.67 MiB/s 31.58 c/B SHA3-512 | 14.05 ns/B 67.85 MiB/s 44.99 c/B Benchmarks of new 32-bit implementation on ARM Cortex-A8 (1008 Mhz, gcc-4.9.1): Before: SHA3-224 | 148.6 ns/B 6.42 MiB/s 149.8 c/B SHA3-256 | 157.2 ns/B 6.07 MiB/s 158.4 c/B SHA3-384 | 205.3 ns/B 4.65 MiB/s 206.9 c/B SHA3-512 | 296.3 ns/B 3.22 MiB/s 298.6 c/B After (1.56x faster): SHA3-224 | 96.12 ns/B 9.92 MiB/s 96.89 c/B SHA3-256 | 101.5 ns/B 9.40 MiB/s 102.3 c/B SHA3-384 | 131.4 ns/B 7.26 MiB/s 132.5 c/B SHA3-512 | 188.2 ns/B 5.07 MiB/s 189.7 c/B Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/keccak_permute_64.h')
-rw-r--r--cipher/keccak_permute_64.h290
1 files changed, 290 insertions, 0 deletions
diff --git a/cipher/keccak_permute_64.h b/cipher/keccak_permute_64.h
new file mode 100644
index 00000000..1264f195
--- /dev/null
+++ b/cipher/keccak_permute_64.h
@@ -0,0 +1,290 @@
+/* keccak_permute_64.h - Keccak permute function (simple 64bit)
+ * Copyright (C) 2015 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser general Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* The code is based on public-domain/CC0 "keccakc1024/simple/Keccak-simple.c"
+ * implementation by Ronny Van Keer from SUPERCOP toolkit package.
+ */
+
+/* Function that computes the Keccak-f[1600] permutation on the given state. */
+static unsigned int
+KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
+{
+ const u64 *round_consts = round_consts_64bit;
+ u64 Aba, Abe, Abi, Abo, Abu;
+ u64 Aga, Age, Agi, Ago, Agu;
+ u64 Aka, Ake, Aki, Ako, Aku;
+ u64 Ama, Ame, Ami, Amo, Amu;
+ u64 Asa, Ase, Asi, Aso, Asu;
+ u64 BCa, BCe, BCi, BCo, BCu;
+ u64 Da, De, Di, Do, Du;
+ u64 Eba, Ebe, Ebi, Ebo, Ebu;
+ u64 Ega, Ege, Egi, Ego, Egu;
+ u64 Eka, Eke, Eki, Eko, Eku;
+ u64 Ema, Eme, Emi, Emo, Emu;
+ u64 Esa, Ese, Esi, Eso, Esu;
+ u64 *state = hd->u.state64;
+ unsigned int round;
+
+ Aba = state[0];
+ Abe = state[1];
+ Abi = state[2];
+ Abo = state[3];
+ Abu = state[4];
+ Aga = state[5];
+ Age = state[6];
+ Agi = state[7];
+ Ago = state[8];
+ Agu = state[9];
+ Aka = state[10];
+ Ake = state[11];
+ Aki = state[12];
+ Ako = state[13];
+ Aku = state[14];
+ Ama = state[15];
+ Ame = state[16];
+ Ami = state[17];
+ Amo = state[18];
+ Amu = state[19];
+ Asa = state[20];
+ Ase = state[21];
+ Asi = state[22];
+ Aso = state[23];
+ Asu = state[24];
+
+ for (round = 0; round < 24; round += 2)
+ {
+ /* prepareTheta */
+ BCa = Aba ^ Aga ^ Aka ^ Ama ^ Asa;
+ BCe = Abe ^ Age ^ Ake ^ Ame ^ Ase;
+ BCi = Abi ^ Agi ^ Aki ^ Ami ^ Asi;
+ BCo = Abo ^ Ago ^ Ako ^ Amo ^ Aso;
+ BCu = Abu ^ Agu ^ Aku ^ Amu ^ Asu;
+
+ /* thetaRhoPiChiIotaPrepareTheta(round , A, E) */
+ Da = BCu ^ ROL64(BCe, 1);
+ De = BCa ^ ROL64(BCi, 1);
+ Di = BCe ^ ROL64(BCo, 1);
+ Do = BCi ^ ROL64(BCu, 1);
+ Du = BCo ^ ROL64(BCa, 1);
+
+ Aba ^= Da;
+ BCa = Aba;
+ Age ^= De;
+ BCe = ROL64(Age, 44);
+ Aki ^= Di;
+ BCi = ROL64(Aki, 43);
+ Amo ^= Do;
+ BCo = ROL64(Amo, 21);
+ Asu ^= Du;
+ BCu = ROL64(Asu, 14);
+ Eba = BCa ^ ANDN64(BCe, BCi);
+ Eba ^= (u64)round_consts[round];
+ Ebe = BCe ^ ANDN64(BCi, BCo);
+ Ebi = BCi ^ ANDN64(BCo, BCu);
+ Ebo = BCo ^ ANDN64(BCu, BCa);
+ Ebu = BCu ^ ANDN64(BCa, BCe);
+
+ Abo ^= Do;
+ BCa = ROL64(Abo, 28);
+ Agu ^= Du;
+ BCe = ROL64(Agu, 20);
+ Aka ^= Da;
+ BCi = ROL64(Aka, 3);
+ Ame ^= De;
+ BCo = ROL64(Ame, 45);
+ Asi ^= Di;
+ BCu = ROL64(Asi, 61);
+ Ega = BCa ^ ANDN64(BCe, BCi);
+ Ege = BCe ^ ANDN64(BCi, BCo);
+ Egi = BCi ^ ANDN64(BCo, BCu);
+ Ego = BCo ^ ANDN64(BCu, BCa);
+ Egu = BCu ^ ANDN64(BCa, BCe);
+
+ Abe ^= De;
+ BCa = ROL64(Abe, 1);
+ Agi ^= Di;
+ BCe = ROL64(Agi, 6);
+ Ako ^= Do;
+ BCi = ROL64(Ako, 25);
+ Amu ^= Du;
+ BCo = ROL64(Amu, 8);
+ Asa ^= Da;
+ BCu = ROL64(Asa, 18);
+ Eka = BCa ^ ANDN64(BCe, BCi);
+ Eke = BCe ^ ANDN64(BCi, BCo);
+ Eki = BCi ^ ANDN64(BCo, BCu);
+ Eko = BCo ^ ANDN64(BCu, BCa);
+ Eku = BCu ^ ANDN64(BCa, BCe);
+
+ Abu ^= Du;
+ BCa = ROL64(Abu, 27);
+ Aga ^= Da;
+ BCe = ROL64(Aga, 36);
+ Ake ^= De;
+ BCi = ROL64(Ake, 10);
+ Ami ^= Di;
+ BCo = ROL64(Ami, 15);
+ Aso ^= Do;
+ BCu = ROL64(Aso, 56);
+ Ema = BCa ^ ANDN64(BCe, BCi);
+ Eme = BCe ^ ANDN64(BCi, BCo);
+ Emi = BCi ^ ANDN64(BCo, BCu);
+ Emo = BCo ^ ANDN64(BCu, BCa);
+ Emu = BCu ^ ANDN64(BCa, BCe);
+
+ Abi ^= Di;
+ BCa = ROL64(Abi, 62);
+ Ago ^= Do;
+ BCe = ROL64(Ago, 55);
+ Aku ^= Du;
+ BCi = ROL64(Aku, 39);
+ Ama ^= Da;
+ BCo = ROL64(Ama, 41);
+ Ase ^= De;
+ BCu = ROL64(Ase, 2);
+ Esa = BCa ^ ANDN64(BCe, BCi);
+ Ese = BCe ^ ANDN64(BCi, BCo);
+ Esi = BCi ^ ANDN64(BCo, BCu);
+ Eso = BCo ^ ANDN64(BCu, BCa);
+ Esu = BCu ^ ANDN64(BCa, BCe);
+
+ /* prepareTheta */
+ BCa = Eba ^ Ega ^ Eka ^ Ema ^ Esa;
+ BCe = Ebe ^ Ege ^ Eke ^ Eme ^ Ese;
+ BCi = Ebi ^ Egi ^ Eki ^ Emi ^ Esi;
+ BCo = Ebo ^ Ego ^ Eko ^ Emo ^ Eso;
+ BCu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu;
+
+ /* thetaRhoPiChiIotaPrepareTheta(round+1, E, A) */
+ Da = BCu ^ ROL64(BCe, 1);
+ De = BCa ^ ROL64(BCi, 1);
+ Di = BCe ^ ROL64(BCo, 1);
+ Do = BCi ^ ROL64(BCu, 1);
+ Du = BCo ^ ROL64(BCa, 1);
+
+ Eba ^= Da;
+ BCa = Eba;
+ Ege ^= De;
+ BCe = ROL64(Ege, 44);
+ Eki ^= Di;
+ BCi = ROL64(Eki, 43);
+ Emo ^= Do;
+ BCo = ROL64(Emo, 21);
+ Esu ^= Du;
+ BCu = ROL64(Esu, 14);
+ Aba = BCa ^ ANDN64(BCe, BCi);
+ Aba ^= (u64)round_consts[round + 1];
+ Abe = BCe ^ ANDN64(BCi, BCo);
+ Abi = BCi ^ ANDN64(BCo, BCu);
+ Abo = BCo ^ ANDN64(BCu, BCa);
+ Abu = BCu ^ ANDN64(BCa, BCe);
+
+ Ebo ^= Do;
+ BCa = ROL64(Ebo, 28);
+ Egu ^= Du;
+ BCe = ROL64(Egu, 20);
+ Eka ^= Da;
+ BCi = ROL64(Eka, 3);
+ Eme ^= De;
+ BCo = ROL64(Eme, 45);
+ Esi ^= Di;
+ BCu = ROL64(Esi, 61);
+ Aga = BCa ^ ANDN64(BCe, BCi);
+ Age = BCe ^ ANDN64(BCi, BCo);
+ Agi = BCi ^ ANDN64(BCo, BCu);
+ Ago = BCo ^ ANDN64(BCu, BCa);
+ Agu = BCu ^ ANDN64(BCa, BCe);
+
+ Ebe ^= De;
+ BCa = ROL64(Ebe, 1);
+ Egi ^= Di;
+ BCe = ROL64(Egi, 6);
+ Eko ^= Do;
+ BCi = ROL64(Eko, 25);
+ Emu ^= Du;
+ BCo = ROL64(Emu, 8);
+ Esa ^= Da;
+ BCu = ROL64(Esa, 18);
+ Aka = BCa ^ ANDN64(BCe, BCi);
+ Ake = BCe ^ ANDN64(BCi, BCo);
+ Aki = BCi ^ ANDN64(BCo, BCu);
+ Ako = BCo ^ ANDN64(BCu, BCa);
+ Aku = BCu ^ ANDN64(BCa, BCe);
+
+ Ebu ^= Du;
+ BCa = ROL64(Ebu, 27);
+ Ega ^= Da;
+ BCe = ROL64(Ega, 36);
+ Eke ^= De;
+ BCi = ROL64(Eke, 10);
+ Emi ^= Di;
+ BCo = ROL64(Emi, 15);
+ Eso ^= Do;
+ BCu = ROL64(Eso, 56);
+ Ama = BCa ^ ANDN64(BCe, BCi);
+ Ame = BCe ^ ANDN64(BCi, BCo);
+ Ami = BCi ^ ANDN64(BCo, BCu);
+ Amo = BCo ^ ANDN64(BCu, BCa);
+ Amu = BCu ^ ANDN64(BCa, BCe);
+
+ Ebi ^= Di;
+ BCa = ROL64(Ebi, 62);
+ Ego ^= Do;
+ BCe = ROL64(Ego, 55);
+ Eku ^= Du;
+ BCi = ROL64(Eku, 39);
+ Ema ^= Da;
+ BCo = ROL64(Ema, 41);
+ Ese ^= De;
+ BCu = ROL64(Ese, 2);
+ Asa = BCa ^ ANDN64(BCe, BCi);
+ Ase = BCe ^ ANDN64(BCi, BCo);
+ Asi = BCi ^ ANDN64(BCo, BCu);
+ Aso = BCo ^ ANDN64(BCu, BCa);
+ Asu = BCu ^ ANDN64(BCa, BCe);
+ }
+
+ state[0] = Aba;
+ state[1] = Abe;
+ state[2] = Abi;
+ state[3] = Abo;
+ state[4] = Abu;
+ state[5] = Aga;
+ state[6] = Age;
+ state[7] = Agi;
+ state[8] = Ago;
+ state[9] = Agu;
+ state[10] = Aka;
+ state[11] = Ake;
+ state[12] = Aki;
+ state[13] = Ako;
+ state[14] = Aku;
+ state[15] = Ama;
+ state[16] = Ame;
+ state[17] = Ami;
+ state[18] = Amo;
+ state[19] = Amu;
+ state[20] = Asa;
+ state[21] = Ase;
+ state[22] = Asi;
+ state[23] = Aso;
+ state[24] = Asu;
+
+ return sizeof(void *) * 4 + sizeof(u64) * 12 * 5;
+}