summaryrefslogtreecommitdiff
path: root/Modules/_sha3/kcp
diff options
context:
space:
mode:
Diffstat (limited to 'Modules/_sha3/kcp')
-rw-r--r--Modules/_sha3/kcp/KeccakHash.c82
-rw-r--r--Modules/_sha3/kcp/KeccakHash.h114
-rw-r--r--Modules/_sha3/kcp/KeccakP-1600-64.macros2208
-rw-r--r--Modules/_sha3/kcp/KeccakP-1600-SnP-opt32.h37
-rw-r--r--Modules/_sha3/kcp/KeccakP-1600-SnP-opt64.h49
-rw-r--r--Modules/_sha3/kcp/KeccakP-1600-SnP.h7
-rw-r--r--Modules/_sha3/kcp/KeccakP-1600-inplace32BI.c1162
-rw-r--r--Modules/_sha3/kcp/KeccakP-1600-opt64-config.h3
-rw-r--r--Modules/_sha3/kcp/KeccakP-1600-opt64.c474
-rw-r--r--Modules/_sha3/kcp/KeccakP-1600-unrolling.macros185
-rw-r--r--Modules/_sha3/kcp/KeccakSponge.c92
-rw-r--r--Modules/_sha3/kcp/KeccakSponge.h172
-rw-r--r--Modules/_sha3/kcp/KeccakSponge.inc332
-rw-r--r--Modules/_sha3/kcp/PlSnP-Fallback.inc257
-rw-r--r--Modules/_sha3/kcp/SnP-Relaned.h134
-rw-r--r--Modules/_sha3/kcp/align.h35
16 files changed, 5343 insertions, 0 deletions
diff --git a/Modules/_sha3/kcp/KeccakHash.c b/Modules/_sha3/kcp/KeccakHash.c
new file mode 100644
index 0000000000..e09fb43cac
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakHash.c
@@ -0,0 +1,82 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <string.h>
+#include "KeccakHash.h"
+
+/* ---------------------------------------------------------------- */
+/* Initialize the sponge with (rate, capacity) and record hashbitlen and delimitedSuffix in the instance; a zero suffix yields FAIL. */
+HashReturn Keccak_HashInitialize(Keccak_HashInstance *instance, unsigned int rate, unsigned int capacity, unsigned int hashbitlen, unsigned char delimitedSuffix)
+{
+ HashReturn result;
+ /* The suffix is checked before the sponge is touched. */
+ if (delimitedSuffix == 0)
+ return FAIL;
+ result = (HashReturn)KeccakWidth1600_SpongeInitialize(&instance->sponge, rate, capacity); /* sponge status is cast directly to HashReturn */
+ if (result != SUCCESS)
+ return result;
+ instance->fixedOutputLength = hashbitlen; /* in bits; Keccak_HashFinal divides it by 8 */
+ instance->delimitedSuffix = delimitedSuffix;
+ return SUCCESS;
+}
+
+/* ---------------------------------------------------------------- */
+/* Absorb databitlen bits of data: the databitlen/8 whole bytes go to the sponge, and a trailing partial byte (if any) is combined with instance->delimitedSuffix. */
+HashReturn Keccak_HashUpdate(Keccak_HashInstance *instance, const BitSequence *data, DataLength databitlen)
+{
+ if ((databitlen % 8) == 0)
+ return (HashReturn)KeccakWidth1600_SpongeAbsorb(&instance->sponge, data, databitlen/8);
+ else {
+ HashReturn ret = (HashReturn)KeccakWidth1600_SpongeAbsorb(&instance->sponge, data, databitlen/8);
+ if (ret == SUCCESS) {
+ /* The last partial byte is assumed to be aligned on the least significant bits */
+ /* NOTE(review): bits of lastByte above position (databitlen % 8) are not masked off; presumably callers zero them - confirm. */
+ unsigned char lastByte = data[databitlen/8];
+ /* Concatenate the last few bits provided here with those of the suffix */
+ /* The suffix is shifted left past the partial bits; the 16-bit result is then either kept whole as the new suffix or split into an absorbed low byte and a new suffix. */
+ unsigned short delimitedLastBytes = (unsigned short)((unsigned short)lastByte | ((unsigned short)instance->delimitedSuffix << (databitlen % 8)));
+ if ((delimitedLastBytes & 0xFF00) == 0x0000) { /* everything still fits in one byte */
+ instance->delimitedSuffix = delimitedLastBytes & 0xFF;
+ }
+ else { /* overflowed into the high byte: absorb the low byte, keep the high byte as suffix */
+ unsigned char oneByte[1];
+ oneByte[0] = delimitedLastBytes & 0xFF;
+ ret = (HashReturn)KeccakWidth1600_SpongeAbsorb(&instance->sponge, oneByte, 1);
+ instance->delimitedSuffix = (delimitedLastBytes >> 8) & 0xFF; /* assigned regardless of the status returned just above */
+ }
+ }
+ return ret;
+ }
+}
+
+/* ---------------------------------------------------------------- */
+/* Absorb instance->delimitedSuffix as the last few bits, then squeeze fixedOutputLength/8 bytes into hashval; a failing absorb status is returned as is. */
+HashReturn Keccak_HashFinal(Keccak_HashInstance *instance, BitSequence *hashval)
+{
+ HashReturn ret = (HashReturn)KeccakWidth1600_SpongeAbsorbLastFewBits(&instance->sponge, instance->delimitedSuffix);
+ if (ret == SUCCESS)
+ return (HashReturn)KeccakWidth1600_SpongeSqueeze(&instance->sponge, hashval, instance->fixedOutputLength/8); /* requests 0 bytes when fixedOutputLength is 0 */
+ else
+ return ret;
+}
+
+/* ---------------------------------------------------------------- */
+/* Squeeze databitlen/8 bytes of output into data; returns FAIL without touching the sponge when databitlen is not a multiple of 8. */
+HashReturn Keccak_HashSqueeze(Keccak_HashInstance *instance, BitSequence *data, DataLength databitlen)
+{
+ if ((databitlen % 8) != 0)
+ return FAIL;
+ return (HashReturn)KeccakWidth1600_SpongeSqueeze(&instance->sponge, data, databitlen/8); /* sponge status is cast directly to HashReturn */
+}
diff --git a/Modules/_sha3/kcp/KeccakHash.h b/Modules/_sha3/kcp/KeccakHash.h
new file mode 100644
index 0000000000..bbd3dc64a2
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakHash.h
@@ -0,0 +1,114 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _KeccakHashInterface_h_
+#define _KeccakHashInterface_h_
+
+#ifndef KeccakP1600_excluded
+
+#include "KeccakSponge.h"
+#include <string.h>
+
+typedef unsigned char BitSequence; /* element type of the input and output buffers of the Keccak_Hash* functions */
+typedef size_t DataLength; /* type of the databitlen arguments (lengths given in bits) */
+typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2 } HashReturn; /* status code returned by every Keccak_Hash* function */
+/* Hashing context: the underlying sponge plus the two values recorded by Keccak_HashInitialize(). */
+typedef struct {
+ KeccakWidth1600_SpongeInstance sponge; /* sponge instance passed to the KeccakWidth1600_Sponge* calls */
+ unsigned int fixedOutputLength; /* hashbitlen given at initialization, in bits */
+ unsigned char delimitedSuffix; /* suffix byte absorbed by Keccak_HashFinal(); Keccak_HashUpdate() may rewrite it for a trailing partial byte */
+} Keccak_HashInstance;
+
+/**
+ * Function to initialize the Keccak[r, c] sponge function instance used in sequential hashing mode.
+ * @param hashInstance Pointer to the hash instance to be initialized.
+ * @param rate The value of the rate r.
+ * @param capacity The value of the capacity c.
+ * @param hashbitlen The desired number of output bits,
+ * or 0 for an arbitrarily-long output.
+ * @param delimitedSuffix Bits that will be automatically appended to the end
+ * of the input message, as in domain separation.
+ * This is a byte containing from 0 to 7 bits
+ * formatted like the @a delimitedData parameter of
+ * the Keccak_SpongeAbsorbLastFewBits() function.
+ * @pre One must have r+c=1600 and the rate a multiple of 8 bits in this implementation.
+ * @return SUCCESS if successful, FAIL otherwise (in particular when @a delimitedSuffix is 0).
+ */
+HashReturn Keccak_HashInitialize(Keccak_HashInstance *hashInstance, unsigned int rate, unsigned int capacity, unsigned int hashbitlen, unsigned char delimitedSuffix);
+
+/** Macro to initialize a SHAKE128 instance as specified in the FIPS 202 standard.
+ * Expands to Keccak_HashInitialize() with rate 1344, capacity 256, hashbitlen 0 and suffix 0x1F. */
+#define Keccak_HashInitialize_SHAKE128(hashInstance) Keccak_HashInitialize(hashInstance, 1344, 256, 0, 0x1F)
+
+/** Macro to initialize a SHAKE256 instance as specified in the FIPS 202 standard.
+ * Expands to Keccak_HashInitialize() with rate 1088, capacity 512, hashbitlen 0 and suffix 0x1F. */
+#define Keccak_HashInitialize_SHAKE256(hashInstance) Keccak_HashInitialize(hashInstance, 1088, 512, 0, 0x1F)
+
+/** Macro to initialize a SHA3-224 instance as specified in the FIPS 202 standard.
+ * Expands to Keccak_HashInitialize() with rate 1152, capacity 448, hashbitlen 224 and suffix 0x06. */
+#define Keccak_HashInitialize_SHA3_224(hashInstance) Keccak_HashInitialize(hashInstance, 1152, 448, 224, 0x06)
+
+/** Macro to initialize a SHA3-256 instance as specified in the FIPS 202 standard.
+ * Expands to Keccak_HashInitialize() with rate 1088, capacity 512, hashbitlen 256 and suffix 0x06. */
+#define Keccak_HashInitialize_SHA3_256(hashInstance) Keccak_HashInitialize(hashInstance, 1088, 512, 256, 0x06)
+
+/** Macro to initialize a SHA3-384 instance as specified in the FIPS 202 standard.
+ * Expands to Keccak_HashInitialize() with rate 832, capacity 768, hashbitlen 384 and suffix 0x06. */
+#define Keccak_HashInitialize_SHA3_384(hashInstance) Keccak_HashInitialize(hashInstance, 832, 768, 384, 0x06)
+
+/** Macro to initialize a SHA3-512 instance as specified in the FIPS 202 standard.
+ * Expands to Keccak_HashInitialize() with rate 576, capacity 1024, hashbitlen 512 and suffix 0x06. */
+#define Keccak_HashInitialize_SHA3_512(hashInstance) Keccak_HashInitialize(hashInstance, 576, 1024, 512, 0x06)
+
+/**
+ * Function to give input data to be absorbed.
+ * @param hashInstance Pointer to the hash instance initialized by Keccak_HashInitialize().
+ * @param data Pointer to the input data.
+ * When @a databitlen is not a multiple of 8, the last bits of data must be
+ * in the least significant bits of the last byte (little-endian convention).
+ * @param databitlen The number of input bits provided in the input data.
+ * @pre In the previous call to Keccak_HashUpdate(), databitlen was a multiple of 8.
+ * @return SUCCESS if successful, FAIL otherwise.
+ */
+HashReturn Keccak_HashUpdate(Keccak_HashInstance *hashInstance, const BitSequence *data, DataLength databitlen);
+
+/**
+ * Function to call after all input blocks have been input and to get
+ * output bits if the length was specified when calling Keccak_HashInitialize().
+ * @param hashInstance Pointer to the hash instance initialized by Keccak_HashInitialize().
+ * If @a hashbitlen was not 0 in the call to Keccak_HashInitialize(), the number of
+ * output bits is equal to @a hashbitlen.
+ * If @a hashbitlen was 0 in the call to Keccak_HashInitialize(), the output bits
+ * must be extracted using the Keccak_HashSqueeze() function.
+ * @note The sponge state used is the one stored in @a hashInstance; this function takes no separate state argument.
+ * @param hashval Pointer to the buffer where to store the output data.
+ * @return SUCCESS if successful, FAIL otherwise.
+ */
+HashReturn Keccak_HashFinal(Keccak_HashInstance *hashInstance, BitSequence *hashval);
+
+/**
+ * Function to squeeze output data.
+ * @param hashInstance Pointer to the hash instance initialized by Keccak_HashInitialize().
+ * @param data Pointer to the buffer where to store the output data.
+ * @param databitlen The number of output bits desired (must be a multiple of 8).
+ * @pre Keccak_HashFinal() must have been already called.
+ * @pre @a databitlen is a multiple of 8.
+ * @return SUCCESS if successful, FAIL otherwise (FAIL is returned when @a databitlen is not a multiple of 8).
+ */
+HashReturn Keccak_HashSqueeze(Keccak_HashInstance *hashInstance, BitSequence *data, DataLength databitlen);
+
+#endif
+
+#endif
diff --git a/Modules/_sha3/kcp/KeccakP-1600-64.macros b/Modules/_sha3/kcp/KeccakP-1600-64.macros
new file mode 100644
index 0000000000..1f11fe3e79
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakP-1600-64.macros
@@ -0,0 +1,2208 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#define declareABCDE /* declares the UINT64 locals that the macros in this file refer to by name */ \
+ UINT64 Aba, Abe, Abi, Abo, Abu; /* A*: 25 lanes; read directly by prepareTheta */ \
+ UINT64 Aga, Age, Agi, Ago, Agu; \
+ UINT64 Aka, Ake, Aki, Ako, Aku; \
+ UINT64 Ama, Ame, Ami, Amo, Amu; \
+ UINT64 Asa, Ase, Asi, Aso, Asu; \
+ UINT64 Bba, Bbe, Bbi, Bbo, Bbu; /* B*: hold the ROL64 results inside the round macros */ \
+ UINT64 Bga, Bge, Bgi, Bgo, Bgu; \
+ UINT64 Bka, Bke, Bki, Bko, Bku; \
+ UINT64 Bma, Bme, Bmi, Bmo, Bmu; \
+ UINT64 Bsa, Bse, Bsi, Bso, Bsu; \
+ UINT64 Ca, Ce, Ci, Co, Cu; /* C*: five XOR accumulators (see prepareTheta) */ \
+ UINT64 Da, De, Di, Do, Du; /* D*: derived from C* at the start of each round macro */ \
+ UINT64 Eba, Ebe, Ebi, Ebo, Ebu; /* E*: second set of 25 lanes; presumably passed as the E argument of the round macros - confirm at call sites */ \
+ UINT64 Ega, Ege, Egi, Ego, Egu; \
+ UINT64 Eka, Eke, Eki, Eko, Eku; \
+ UINT64 Ema, Eme, Emi, Emo, Emu; \
+ UINT64 Esa, Ese, Esi, Eso, Esu; \
+
+#define prepareTheta /* sets each C<v> to the XOR of the five A lanes whose name ends in <v> */ \
+ Ca = Aba^Aga^Aka^Ama^Asa; \
+ Ce = Abe^Age^Ake^Ame^Ase; \
+ Ci = Abi^Agi^Aki^Ami^Asi; \
+ Co = Abo^Ago^Ako^Amo^Aso; \
+ Cu = Abu^Agu^Aku^Amu^Asu; \
+
+#ifdef UseBebigokimisa
+/* --- Code for round, with prepare-theta (lane complementing pattern 'bebigokimisa') */
+/* One round: XORs the D* values into lanes A##xx (modifying them), rotates them into B*, and writes lanes E##xx; i indexes KeccakF1600RoundConstants. */
+/* --- 64-bit lanes mapped to 64-bit words */
+/* This variant also rebuilds Ca..Cu from the freshly written E lanes; the combining step mixes |, & and selective ~ instead of the uniform (~x)&y of the #else variant. */
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+ Da = Cu^ROL64(Ce, 1); /* ROL64 is defined elsewhere; presumably a 64-bit rotate-left - confirm */ \
+ De = Ca^ROL64(Ci, 1); \
+ Di = Ce^ROL64(Co, 1); \
+ Do = Ci^ROL64(Cu, 1); \
+ Du = Co^ROL64(Ca, 1); \
+/* group 1: writes E##ba..E##bu (the round constant goes into E##ba) and restarts Ca..Cu from them */ \
+ A##ba ^= Da; \
+ Bba = A##ba; \
+ A##ge ^= De; \
+ Bbe = ROL64(A##ge, 44); \
+ A##ki ^= Di; \
+ Bbi = ROL64(A##ki, 43); \
+ A##mo ^= Do; \
+ Bbo = ROL64(A##mo, 21); \
+ A##su ^= Du; \
+ Bbu = ROL64(A##su, 14); \
+ E##ba = Bba ^( Bbe | Bbi ); \
+ E##ba ^= KeccakF1600RoundConstants[i]; \
+ Ca = E##ba; \
+ E##be = Bbe ^((~Bbi)| Bbo ); \
+ Ce = E##be; \
+ E##bi = Bbi ^( Bbo & Bbu ); \
+ Ci = E##bi; \
+ E##bo = Bbo ^( Bbu | Bba ); \
+ Co = E##bo; \
+ E##bu = Bbu ^( Bba & Bbe ); \
+ Cu = E##bu; \
+/* group 2: writes E##ga..E##gu and XORs them into Ca..Cu */ \
+ A##bo ^= Do; \
+ Bga = ROL64(A##bo, 28); \
+ A##gu ^= Du; \
+ Bge = ROL64(A##gu, 20); \
+ A##ka ^= Da; \
+ Bgi = ROL64(A##ka, 3); \
+ A##me ^= De; \
+ Bgo = ROL64(A##me, 45); \
+ A##si ^= Di; \
+ Bgu = ROL64(A##si, 61); \
+ E##ga = Bga ^( Bge | Bgi ); \
+ Ca ^= E##ga; \
+ E##ge = Bge ^( Bgi & Bgo ); \
+ Ce ^= E##ge; \
+ E##gi = Bgi ^( Bgo |(~Bgu)); \
+ Ci ^= E##gi; \
+ E##go = Bgo ^( Bgu | Bga ); \
+ Co ^= E##go; \
+ E##gu = Bgu ^( Bga & Bge ); \
+ Cu ^= E##gu; \
+/* group 3: writes E##ka..E##ku and XORs them into Ca..Cu */ \
+ A##be ^= De; \
+ Bka = ROL64(A##be, 1); \
+ A##gi ^= Di; \
+ Bke = ROL64(A##gi, 6); \
+ A##ko ^= Do; \
+ Bki = ROL64(A##ko, 25); \
+ A##mu ^= Du; \
+ Bko = ROL64(A##mu, 8); \
+ A##sa ^= Da; \
+ Bku = ROL64(A##sa, 18); \
+ E##ka = Bka ^( Bke | Bki ); \
+ Ca ^= E##ka; \
+ E##ke = Bke ^( Bki & Bko ); \
+ Ce ^= E##ke; \
+ E##ki = Bki ^((~Bko)& Bku ); \
+ Ci ^= E##ki; \
+ E##ko = (~Bko)^( Bku | Bka ); \
+ Co ^= E##ko; \
+ E##ku = Bku ^( Bka & Bke ); \
+ Cu ^= E##ku; \
+/* group 4: writes E##ma..E##mu and XORs them into Ca..Cu */ \
+ A##bu ^= Du; \
+ Bma = ROL64(A##bu, 27); \
+ A##ga ^= Da; \
+ Bme = ROL64(A##ga, 36); \
+ A##ke ^= De; \
+ Bmi = ROL64(A##ke, 10); \
+ A##mi ^= Di; \
+ Bmo = ROL64(A##mi, 15); \
+ A##so ^= Do; \
+ Bmu = ROL64(A##so, 56); \
+ E##ma = Bma ^( Bme & Bmi ); \
+ Ca ^= E##ma; \
+ E##me = Bme ^( Bmi | Bmo ); \
+ Ce ^= E##me; \
+ E##mi = Bmi ^((~Bmo)| Bmu ); \
+ Ci ^= E##mi; \
+ E##mo = (~Bmo)^( Bmu & Bma ); \
+ Co ^= E##mo; \
+ E##mu = Bmu ^( Bma | Bme ); \
+ Cu ^= E##mu; \
+/* group 5: writes E##sa..E##su and XORs them into Ca..Cu */ \
+ A##bi ^= Di; \
+ Bsa = ROL64(A##bi, 62); \
+ A##go ^= Do; \
+ Bse = ROL64(A##go, 55); \
+ A##ku ^= Du; \
+ Bsi = ROL64(A##ku, 39); \
+ A##ma ^= Da; \
+ Bso = ROL64(A##ma, 41); \
+ A##se ^= De; \
+ Bsu = ROL64(A##se, 2); \
+ E##sa = Bsa ^((~Bse)& Bsi ); \
+ Ca ^= E##sa; \
+ E##se = (~Bse)^( Bsi | Bso ); \
+ Ce ^= E##se; \
+ E##si = Bsi ^( Bso & Bsu ); \
+ Ci ^= E##si; \
+ E##so = Bso ^( Bsu | Bsa ); \
+ Co ^= E##so; \
+ E##su = Bsu ^( Bsa & Bse ); \
+ Cu ^= E##su; \
+\
+
+/* --- Code for round (lane complementing pattern 'bebigokimisa') */
+/* One round: XORs the D* values into lanes A##xx (modifying them), rotates them into B*, and writes lanes E##xx; i indexes KeccakF1600RoundConstants. */
+/* --- 64-bit lanes mapped to 64-bit words */
+/* Same E computations as thetaRhoPiChiIotaPrepareTheta for this pattern, but Ca..Cu are only read here, never rewritten. */
+#define thetaRhoPiChiIota(i, A, E) \
+ Da = Cu^ROL64(Ce, 1); /* ROL64 is defined elsewhere; presumably a 64-bit rotate-left - confirm */ \
+ De = Ca^ROL64(Ci, 1); \
+ Di = Ce^ROL64(Co, 1); \
+ Do = Ci^ROL64(Cu, 1); \
+ Du = Co^ROL64(Ca, 1); \
+/* group 1: writes E##ba..E##bu (the round constant goes into E##ba) */ \
+ A##ba ^= Da; \
+ Bba = A##ba; \
+ A##ge ^= De; \
+ Bbe = ROL64(A##ge, 44); \
+ A##ki ^= Di; \
+ Bbi = ROL64(A##ki, 43); \
+ A##mo ^= Do; \
+ Bbo = ROL64(A##mo, 21); \
+ A##su ^= Du; \
+ Bbu = ROL64(A##su, 14); \
+ E##ba = Bba ^( Bbe | Bbi ); \
+ E##ba ^= KeccakF1600RoundConstants[i]; \
+ E##be = Bbe ^((~Bbi)| Bbo ); \
+ E##bi = Bbi ^( Bbo & Bbu ); \
+ E##bo = Bbo ^( Bbu | Bba ); \
+ E##bu = Bbu ^( Bba & Bbe ); \
+/* group 2: writes E##ga..E##gu */ \
+ A##bo ^= Do; \
+ Bga = ROL64(A##bo, 28); \
+ A##gu ^= Du; \
+ Bge = ROL64(A##gu, 20); \
+ A##ka ^= Da; \
+ Bgi = ROL64(A##ka, 3); \
+ A##me ^= De; \
+ Bgo = ROL64(A##me, 45); \
+ A##si ^= Di; \
+ Bgu = ROL64(A##si, 61); \
+ E##ga = Bga ^( Bge | Bgi ); \
+ E##ge = Bge ^( Bgi & Bgo ); \
+ E##gi = Bgi ^( Bgo |(~Bgu)); \
+ E##go = Bgo ^( Bgu | Bga ); \
+ E##gu = Bgu ^( Bga & Bge ); \
+/* group 3: writes E##ka..E##ku */ \
+ A##be ^= De; \
+ Bka = ROL64(A##be, 1); \
+ A##gi ^= Di; \
+ Bke = ROL64(A##gi, 6); \
+ A##ko ^= Do; \
+ Bki = ROL64(A##ko, 25); \
+ A##mu ^= Du; \
+ Bko = ROL64(A##mu, 8); \
+ A##sa ^= Da; \
+ Bku = ROL64(A##sa, 18); \
+ E##ka = Bka ^( Bke | Bki ); \
+ E##ke = Bke ^( Bki & Bko ); \
+ E##ki = Bki ^((~Bko)& Bku ); \
+ E##ko = (~Bko)^( Bku | Bka ); \
+ E##ku = Bku ^( Bka & Bke ); \
+/* group 4: writes E##ma..E##mu */ \
+ A##bu ^= Du; \
+ Bma = ROL64(A##bu, 27); \
+ A##ga ^= Da; \
+ Bme = ROL64(A##ga, 36); \
+ A##ke ^= De; \
+ Bmi = ROL64(A##ke, 10); \
+ A##mi ^= Di; \
+ Bmo = ROL64(A##mi, 15); \
+ A##so ^= Do; \
+ Bmu = ROL64(A##so, 56); \
+ E##ma = Bma ^( Bme & Bmi ); \
+ E##me = Bme ^( Bmi | Bmo ); \
+ E##mi = Bmi ^((~Bmo)| Bmu ); \
+ E##mo = (~Bmo)^( Bmu & Bma ); \
+ E##mu = Bmu ^( Bma | Bme ); \
+/* group 5: writes E##sa..E##su */ \
+ A##bi ^= Di; \
+ Bsa = ROL64(A##bi, 62); \
+ A##go ^= Do; \
+ Bse = ROL64(A##go, 55); \
+ A##ku ^= Du; \
+ Bsi = ROL64(A##ku, 39); \
+ A##ma ^= Da; \
+ Bso = ROL64(A##ma, 41); \
+ A##se ^= De; \
+ Bsu = ROL64(A##se, 2); \
+ E##sa = Bsa ^((~Bse)& Bsi ); \
+ E##se = (~Bse)^( Bsi | Bso ); \
+ E##si = Bsi ^( Bso & Bsu ); \
+ E##so = Bso ^( Bsu | Bsa ); \
+ E##su = Bsu ^( Bsa & Bse ); \
+\
+
+#else /* UseBebigokimisa */
+
+/* --- Code for round, with prepare-theta */
+/* One round: XORs the D* values into lanes A##xx (modifying them), rotates them into B*, and writes lanes E##xx; i indexes KeccakF1600RoundConstants. */
+/* --- 64-bit lanes mapped to 64-bit words */
+/* Every E lane has the form B<x> ^ ((~B<y>) & B<z>); this variant also rebuilds Ca..Cu from the freshly written E lanes. */
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+ Da = Cu^ROL64(Ce, 1); /* ROL64 is defined elsewhere; presumably a 64-bit rotate-left - confirm */ \
+ De = Ca^ROL64(Ci, 1); \
+ Di = Ce^ROL64(Co, 1); \
+ Do = Ci^ROL64(Cu, 1); \
+ Du = Co^ROL64(Ca, 1); \
+/* group 1: writes E##ba..E##bu (the round constant goes into E##ba) and restarts Ca..Cu from them */ \
+ A##ba ^= Da; \
+ Bba = A##ba; \
+ A##ge ^= De; \
+ Bbe = ROL64(A##ge, 44); \
+ A##ki ^= Di; \
+ Bbi = ROL64(A##ki, 43); \
+ A##mo ^= Do; \
+ Bbo = ROL64(A##mo, 21); \
+ A##su ^= Du; \
+ Bbu = ROL64(A##su, 14); \
+ E##ba = Bba ^((~Bbe)& Bbi ); \
+ E##ba ^= KeccakF1600RoundConstants[i]; \
+ Ca = E##ba; \
+ E##be = Bbe ^((~Bbi)& Bbo ); \
+ Ce = E##be; \
+ E##bi = Bbi ^((~Bbo)& Bbu ); \
+ Ci = E##bi; \
+ E##bo = Bbo ^((~Bbu)& Bba ); \
+ Co = E##bo; \
+ E##bu = Bbu ^((~Bba)& Bbe ); \
+ Cu = E##bu; \
+/* group 2: writes E##ga..E##gu and XORs them into Ca..Cu */ \
+ A##bo ^= Do; \
+ Bga = ROL64(A##bo, 28); \
+ A##gu ^= Du; \
+ Bge = ROL64(A##gu, 20); \
+ A##ka ^= Da; \
+ Bgi = ROL64(A##ka, 3); \
+ A##me ^= De; \
+ Bgo = ROL64(A##me, 45); \
+ A##si ^= Di; \
+ Bgu = ROL64(A##si, 61); \
+ E##ga = Bga ^((~Bge)& Bgi ); \
+ Ca ^= E##ga; \
+ E##ge = Bge ^((~Bgi)& Bgo ); \
+ Ce ^= E##ge; \
+ E##gi = Bgi ^((~Bgo)& Bgu ); \
+ Ci ^= E##gi; \
+ E##go = Bgo ^((~Bgu)& Bga ); \
+ Co ^= E##go; \
+ E##gu = Bgu ^((~Bga)& Bge ); \
+ Cu ^= E##gu; \
+/* group 3: writes E##ka..E##ku and XORs them into Ca..Cu */ \
+ A##be ^= De; \
+ Bka = ROL64(A##be, 1); \
+ A##gi ^= Di; \
+ Bke = ROL64(A##gi, 6); \
+ A##ko ^= Do; \
+ Bki = ROL64(A##ko, 25); \
+ A##mu ^= Du; \
+ Bko = ROL64(A##mu, 8); \
+ A##sa ^= Da; \
+ Bku = ROL64(A##sa, 18); \
+ E##ka = Bka ^((~Bke)& Bki ); \
+ Ca ^= E##ka; \
+ E##ke = Bke ^((~Bki)& Bko ); \
+ Ce ^= E##ke; \
+ E##ki = Bki ^((~Bko)& Bku ); \
+ Ci ^= E##ki; \
+ E##ko = Bko ^((~Bku)& Bka ); \
+ Co ^= E##ko; \
+ E##ku = Bku ^((~Bka)& Bke ); \
+ Cu ^= E##ku; \
+/* group 4: writes E##ma..E##mu and XORs them into Ca..Cu */ \
+ A##bu ^= Du; \
+ Bma = ROL64(A##bu, 27); \
+ A##ga ^= Da; \
+ Bme = ROL64(A##ga, 36); \
+ A##ke ^= De; \
+ Bmi = ROL64(A##ke, 10); \
+ A##mi ^= Di; \
+ Bmo = ROL64(A##mi, 15); \
+ A##so ^= Do; \
+ Bmu = ROL64(A##so, 56); \
+ E##ma = Bma ^((~Bme)& Bmi ); \
+ Ca ^= E##ma; \
+ E##me = Bme ^((~Bmi)& Bmo ); \
+ Ce ^= E##me; \
+ E##mi = Bmi ^((~Bmo)& Bmu ); \
+ Ci ^= E##mi; \
+ E##mo = Bmo ^((~Bmu)& Bma ); \
+ Co ^= E##mo; \
+ E##mu = Bmu ^((~Bma)& Bme ); \
+ Cu ^= E##mu; \
+/* group 5: writes E##sa..E##su and XORs them into Ca..Cu */ \
+ A##bi ^= Di; \
+ Bsa = ROL64(A##bi, 62); \
+ A##go ^= Do; \
+ Bse = ROL64(A##go, 55); \
+ A##ku ^= Du; \
+ Bsi = ROL64(A##ku, 39); \
+ A##ma ^= Da; \
+ Bso = ROL64(A##ma, 41); \
+ A##se ^= De; \
+ Bsu = ROL64(A##se, 2); \
+ E##sa = Bsa ^((~Bse)& Bsi ); \
+ Ca ^= E##sa; \
+ E##se = Bse ^((~Bsi)& Bso ); \
+ Ce ^= E##se; \
+ E##si = Bsi ^((~Bso)& Bsu ); \
+ Ci ^= E##si; \
+ E##so = Bso ^((~Bsu)& Bsa ); \
+ Co ^= E##so; \
+ E##su = Bsu ^((~Bsa)& Bse ); \
+ Cu ^= E##su; \
+\
+
+/* --- Code for round */
+/* One round: XORs the D* values into lanes A##xx (modifying them), rotates them into B*, and writes lanes E##xx; i indexes KeccakF1600RoundConstants. */
+/* --- 64-bit lanes mapped to 64-bit words */
+/* Every E lane has the form B<x> ^ ((~B<y>) & B<z>); Ca..Cu are only read here, never rewritten. */
+#define thetaRhoPiChiIota(i, A, E) \
+ Da = Cu^ROL64(Ce, 1); /* ROL64 is defined elsewhere; presumably a 64-bit rotate-left - confirm */ \
+ De = Ca^ROL64(Ci, 1); \
+ Di = Ce^ROL64(Co, 1); \
+ Do = Ci^ROL64(Cu, 1); \
+ Du = Co^ROL64(Ca, 1); \
+/* group 1: writes E##ba..E##bu (the round constant goes into E##ba) */ \
+ A##ba ^= Da; \
+ Bba = A##ba; \
+ A##ge ^= De; \
+ Bbe = ROL64(A##ge, 44); \
+ A##ki ^= Di; \
+ Bbi = ROL64(A##ki, 43); \
+ A##mo ^= Do; \
+ Bbo = ROL64(A##mo, 21); \
+ A##su ^= Du; \
+ Bbu = ROL64(A##su, 14); \
+ E##ba = Bba ^((~Bbe)& Bbi ); \
+ E##ba ^= KeccakF1600RoundConstants[i]; \
+ E##be = Bbe ^((~Bbi)& Bbo ); \
+ E##bi = Bbi ^((~Bbo)& Bbu ); \
+ E##bo = Bbo ^((~Bbu)& Bba ); \
+ E##bu = Bbu ^((~Bba)& Bbe ); \
+/* group 2: writes E##ga..E##gu */ \
+ A##bo ^= Do; \
+ Bga = ROL64(A##bo, 28); \
+ A##gu ^= Du; \
+ Bge = ROL64(A##gu, 20); \
+ A##ka ^= Da; \
+ Bgi = ROL64(A##ka, 3); \
+ A##me ^= De; \
+ Bgo = ROL64(A##me, 45); \
+ A##si ^= Di; \
+ Bgu = ROL64(A##si, 61); \
+ E##ga = Bga ^((~Bge)& Bgi ); \
+ E##ge = Bge ^((~Bgi)& Bgo ); \
+ E##gi = Bgi ^((~Bgo)& Bgu ); \
+ E##go = Bgo ^((~Bgu)& Bga ); \
+ E##gu = Bgu ^((~Bga)& Bge ); \
+/* group 3: writes E##ka..E##ku */ \
+ A##be ^= De; \
+ Bka = ROL64(A##be, 1); \
+ A##gi ^= Di; \
+ Bke = ROL64(A##gi, 6); \
+ A##ko ^= Do; \
+ Bki = ROL64(A##ko, 25); \
+ A##mu ^= Du; \
+ Bko = ROL64(A##mu, 8); \
+ A##sa ^= Da; \
+ Bku = ROL64(A##sa, 18); \
+ E##ka = Bka ^((~Bke)& Bki ); \
+ E##ke = Bke ^((~Bki)& Bko ); \
+ E##ki = Bki ^((~Bko)& Bku ); \
+ E##ko = Bko ^((~Bku)& Bka ); \
+ E##ku = Bku ^((~Bka)& Bke ); \
+/* group 4: writes E##ma..E##mu */ \
+ A##bu ^= Du; \
+ Bma = ROL64(A##bu, 27); \
+ A##ga ^= Da; \
+ Bme = ROL64(A##ga, 36); \
+ A##ke ^= De; \
+ Bmi = ROL64(A##ke, 10); \
+ A##mi ^= Di; \
+ Bmo = ROL64(A##mi, 15); \
+ A##so ^= Do; \
+ Bmu = ROL64(A##so, 56); \
+ E##ma = Bma ^((~Bme)& Bmi ); \
+ E##me = Bme ^((~Bmi)& Bmo ); \
+ E##mi = Bmi ^((~Bmo)& Bmu ); \
+ E##mo = Bmo ^((~Bmu)& Bma ); \
+ E##mu = Bmu ^((~Bma)& Bme ); \
+/* group 5: writes E##sa..E##su */ \
+ A##bi ^= Di; \
+ Bsa = ROL64(A##bi, 62); \
+ A##go ^= Do; \
+ Bse = ROL64(A##go, 55); \
+ A##ku ^= Du; \
+ Bsi = ROL64(A##ku, 39); \
+ A##ma ^= Da; \
+ Bso = ROL64(A##ma, 41); \
+ A##se ^= De; \
+ Bsu = ROL64(A##se, 2); \
+ E##sa = Bsa ^((~Bse)& Bsi ); \
+ E##se = Bse ^((~Bsi)& Bso ); \
+ E##si = Bsi ^((~Bso)& Bsu ); \
+ E##so = Bso ^((~Bsu)& Bsa ); \
+ E##su = Bsu ^((~Bsa)& Bse ); \
+\
+
+#endif /* UseBebigokimisa */
+
+
+#define copyFromState(X, state) /* loads state[0..24] into the 25 lane variables X##ba .. X##su, in that order */ \
+ X##ba = state[ 0]; \
+ X##be = state[ 1]; \
+ X##bi = state[ 2]; \
+ X##bo = state[ 3]; \
+ X##bu = state[ 4]; \
+ X##ga = state[ 5]; \
+ X##ge = state[ 6]; \
+ X##gi = state[ 7]; \
+ X##go = state[ 8]; \
+ X##gu = state[ 9]; \
+ X##ka = state[10]; \
+ X##ke = state[11]; \
+ X##ki = state[12]; \
+ X##ko = state[13]; \
+ X##ku = state[14]; \
+ X##ma = state[15]; \
+ X##me = state[16]; \
+ X##mi = state[17]; \
+ X##mo = state[18]; \
+ X##mu = state[19]; \
+ X##sa = state[20]; \
+ X##se = state[21]; \
+ X##si = state[22]; \
+ X##so = state[23]; \
+ X##su = state[24]; \
+
+#define copyToState(state, X) /* stores the 25 lane variables X##ba .. X##su into state[0..24], in that order */ \
+ state[ 0] = X##ba; \
+ state[ 1] = X##be; \
+ state[ 2] = X##bi; \
+ state[ 3] = X##bo; \
+ state[ 4] = X##bu; \
+ state[ 5] = X##ga; \
+ state[ 6] = X##ge; \
+ state[ 7] = X##gi; \
+ state[ 8] = X##go; \
+ state[ 9] = X##gu; \
+ state[10] = X##ka; \
+ state[11] = X##ke; \
+ state[12] = X##ki; \
+ state[13] = X##ko; \
+ state[14] = X##ku; \
+ state[15] = X##ma; \
+ state[16] = X##me; \
+ state[17] = X##mi; \
+ state[18] = X##mo; \
+ state[19] = X##mu; \
+ state[20] = X##sa; \
+ state[21] = X##se; \
+ state[22] = X##si; \
+ state[23] = X##so; \
+ state[24] = X##su; \
+
+#define copyStateVariables(X, Y) /* assigns each of the 25 lane variables Y##xx to the X##xx variable with the same suffix */ \
+ X##ba = Y##ba; \
+ X##be = Y##be; \
+ X##bi = Y##bi; \
+ X##bo = Y##bo; \
+ X##bu = Y##bu; \
+ X##ga = Y##ga; \
+ X##ge = Y##ge; \
+ X##gi = Y##gi; \
+ X##go = Y##go; \
+ X##gu = Y##gu; \
+ X##ka = Y##ka; \
+ X##ke = Y##ke; \
+ X##ki = Y##ki; \
+ X##ko = Y##ko; \
+ X##ku = Y##ku; \
+ X##ma = Y##ma; \
+ X##me = Y##me; \
+ X##mi = Y##mi; \
+ X##mo = Y##mo; \
+ X##mu = Y##mu; \
+ X##sa = Y##sa; \
+ X##se = Y##se; \
+ X##si = Y##si; \
+ X##so = Y##so; \
+ X##su = Y##su; \
+
+#define copyFromStateAndAdd(X, state, input, laneCount) /* lane k of X = state[k]^input[k] for k < laneCount, else state[k]; NOTE(review): expands to a bare if/else, not wrapped in do { } while (0) */ \
+ if (laneCount < 16) { /* lanes 15..24 (ma..su) come from state only */ \
+ if (laneCount < 8) { /* lanes 7..14 (gi..ku) come from state only */ \
+ if (laneCount < 4) { /* lanes 3..6 (bo..ge) come from state only */ \
+ if (laneCount < 2) { \
+ if (laneCount < 1) { \
+ X##ba = state[ 0]; \
+ } \
+ else { \
+ X##ba = state[ 0]^input[ 0]; \
+ } \
+ X##be = state[ 1]; \
+ X##bi = state[ 2]; \
+ } \
+ else { \
+ X##ba = state[ 0]^input[ 0]; \
+ X##be = state[ 1]^input[ 1]; \
+ if (laneCount < 3) { \
+ X##bi = state[ 2]; \
+ } \
+ else { \
+ X##bi = state[ 2]^input[ 2]; \
+ } \
+ } \
+ X##bo = state[ 3]; \
+ X##bu = state[ 4]; \
+ X##ga = state[ 5]; \
+ X##ge = state[ 6]; \
+ } \
+ else { /* laneCount is 4..7 */ \
+ X##ba = state[ 0]^input[ 0]; \
+ X##be = state[ 1]^input[ 1]; \
+ X##bi = state[ 2]^input[ 2]; \
+ X##bo = state[ 3]^input[ 3]; \
+ if (laneCount < 6) { \
+ if (laneCount < 5) { \
+ X##bu = state[ 4]; \
+ } \
+ else { \
+ X##bu = state[ 4]^input[ 4]; \
+ } \
+ X##ga = state[ 5]; \
+ X##ge = state[ 6]; \
+ } \
+ else { \
+ X##bu = state[ 4]^input[ 4]; \
+ X##ga = state[ 5]^input[ 5]; \
+ if (laneCount < 7) { \
+ X##ge = state[ 6]; \
+ } \
+ else { \
+ X##ge = state[ 6]^input[ 6]; \
+ } \
+ } \
+ } \
+ X##gi = state[ 7]; \
+ X##go = state[ 8]; \
+ X##gu = state[ 9]; \
+ X##ka = state[10]; \
+ X##ke = state[11]; \
+ X##ki = state[12]; \
+ X##ko = state[13]; \
+ X##ku = state[14]; \
+ } \
+ else { /* laneCount is 8..15 */ \
+ X##ba = state[ 0]^input[ 0]; \
+ X##be = state[ 1]^input[ 1]; \
+ X##bi = state[ 2]^input[ 2]; \
+ X##bo = state[ 3]^input[ 3]; \
+ X##bu = state[ 4]^input[ 4]; \
+ X##ga = state[ 5]^input[ 5]; \
+ X##ge = state[ 6]^input[ 6]; \
+ X##gi = state[ 7]^input[ 7]; \
+ if (laneCount < 12) { \
+ if (laneCount < 10) { \
+ if (laneCount < 9) { \
+ X##go = state[ 8]; \
+ } \
+ else { \
+ X##go = state[ 8]^input[ 8]; \
+ } \
+ X##gu = state[ 9]; \
+ X##ka = state[10]; \
+ } \
+ else { \
+ X##go = state[ 8]^input[ 8]; \
+ X##gu = state[ 9]^input[ 9]; \
+ if (laneCount < 11) { \
+ X##ka = state[10]; \
+ } \
+ else { \
+ X##ka = state[10]^input[10]; \
+ } \
+ } \
+ X##ke = state[11]; \
+ X##ki = state[12]; \
+ X##ko = state[13]; \
+ X##ku = state[14]; \
+ } \
+ else { \
+ X##go = state[ 8]^input[ 8]; \
+ X##gu = state[ 9]^input[ 9]; \
+ X##ka = state[10]^input[10]; \
+ X##ke = state[11]^input[11]; \
+ if (laneCount < 14) { \
+ if (laneCount < 13) { \
+ X##ki = state[12]; \
+ } \
+ else { \
+ X##ki = state[12]^input[12]; \
+ } \
+ X##ko = state[13]; \
+ X##ku = state[14]; \
+ } \
+ else { \
+ X##ki = state[12]^input[12]; \
+ X##ko = state[13]^input[13]; \
+ if (laneCount < 15) { \
+ X##ku = state[14]; \
+ } \
+ else { \
+ X##ku = state[14]^input[14]; \
+ } \
+ } \
+ } \
+ } \
+ X##ma = state[15]; \
+ X##me = state[16]; \
+ X##mi = state[17]; \
+ X##mo = state[18]; \
+ X##mu = state[19]; \
+ X##sa = state[20]; \
+ X##se = state[21]; \
+ X##si = state[22]; \
+ X##so = state[23]; \
+ X##su = state[24]; \
+ } \
+ else { /* laneCount >= 16: lanes 0..15 always take input */ \
+ X##ba = state[ 0]^input[ 0]; \
+ X##be = state[ 1]^input[ 1]; \
+ X##bi = state[ 2]^input[ 2]; \
+ X##bo = state[ 3]^input[ 3]; \
+ X##bu = state[ 4]^input[ 4]; \
+ X##ga = state[ 5]^input[ 5]; \
+ X##ge = state[ 6]^input[ 6]; \
+ X##gi = state[ 7]^input[ 7]; \
+ X##go = state[ 8]^input[ 8]; \
+ X##gu = state[ 9]^input[ 9]; \
+ X##ka = state[10]^input[10]; \
+ X##ke = state[11]^input[11]; \
+ X##ki = state[12]^input[12]; \
+ X##ko = state[13]^input[13]; \
+ X##ku = state[14]^input[14]; \
+ X##ma = state[15]^input[15]; \
+ if (laneCount < 24) { /* lanes 23..24 (so, su) come from state only */ \
+ if (laneCount < 20) { /* lanes 19..22 (mu..si) come from state only */ \
+ if (laneCount < 18) { \
+ if (laneCount < 17) { \
+ X##me = state[16]; \
+ } \
+ else { \
+ X##me = state[16]^input[16]; \
+ } \
+ X##mi = state[17]; \
+ X##mo = state[18]; \
+ } \
+ else { \
+ X##me = state[16]^input[16]; \
+ X##mi = state[17]^input[17]; \
+ if (laneCount < 19) { \
+ X##mo = state[18]; \
+ } \
+ else { \
+ X##mo = state[18]^input[18]; \
+ } \
+ } \
+ X##mu = state[19]; \
+ X##sa = state[20]; \
+ X##se = state[21]; \
+ X##si = state[22]; \
+ } \
+ else { /* laneCount is 20..23 */ \
+ X##me = state[16]^input[16]; \
+ X##mi = state[17]^input[17]; \
+ X##mo = state[18]^input[18]; \
+ X##mu = state[19]^input[19]; \
+ if (laneCount < 22) { \
+ if (laneCount < 21) { \
+ X##sa = state[20]; \
+ } \
+ else { \
+ X##sa = state[20]^input[20]; \
+ } \
+ X##se = state[21]; \
+ X##si = state[22]; \
+ } \
+ else { \
+ X##sa = state[20]^input[20]; \
+ X##se = state[21]^input[21]; \
+ if (laneCount < 23) { \
+ X##si = state[22]; \
+ } \
+ else { \
+ X##si = state[22]^input[22]; \
+ } \
+ } \
+ } \
+ X##so = state[23]; \
+ X##su = state[24]; \
+ } \
+ else { /* laneCount >= 24: only lane 24 (su) is still conditional */ \
+ X##me = state[16]^input[16]; \
+ X##mi = state[17]^input[17]; \
+ X##mo = state[18]^input[18]; \
+ X##mu = state[19]^input[19]; \
+ X##sa = state[20]^input[20]; \
+ X##se = state[21]^input[21]; \
+ X##si = state[22]^input[22]; \
+ X##so = state[23]^input[23]; \
+ if (laneCount < 25) { \
+ X##su = state[24]; \
+ } \
+ else { \
+ X##su = state[24]^input[24]; \
+ } \
+ } \
+ }
+
+#define addInput(X, input, laneCount) /* XORs input[k] into lane k of X for every k < laneCount; NOTE(review): expands to a bare if/else chain, not wrapped in do { } while (0) */ \
+ if (laneCount == 21) { /* dedicated straight-line path for exactly 21 lanes */ \
+ X##ba ^= input[ 0]; \
+ X##be ^= input[ 1]; \
+ X##bi ^= input[ 2]; \
+ X##bo ^= input[ 3]; \
+ X##bu ^= input[ 4]; \
+ X##ga ^= input[ 5]; \
+ X##ge ^= input[ 6]; \
+ X##gi ^= input[ 7]; \
+ X##go ^= input[ 8]; \
+ X##gu ^= input[ 9]; \
+ X##ka ^= input[10]; \
+ X##ke ^= input[11]; \
+ X##ki ^= input[12]; \
+ X##ko ^= input[13]; \
+ X##ku ^= input[14]; \
+ X##ma ^= input[15]; \
+ X##me ^= input[16]; \
+ X##mi ^= input[17]; \
+ X##mo ^= input[18]; \
+ X##mu ^= input[19]; \
+ X##sa ^= input[20]; \
+ } \
+ else if (laneCount < 16) { /* nested comparisons on laneCount; empty branches leave the lane untouched */ \
+ if (laneCount < 8) { \
+ if (laneCount < 4) { \
+ if (laneCount < 2) { \
+ if (laneCount < 1) { \
+ } \
+ else { \
+ X##ba ^= input[ 0]; \
+ } \
+ } \
+ else { \
+ X##ba ^= input[ 0]; \
+ X##be ^= input[ 1]; \
+ if (laneCount < 3) { \
+ } \
+ else { \
+ X##bi ^= input[ 2]; \
+ } \
+ } \
+ } \
+ else { /* laneCount is 4..7 */ \
+ X##ba ^= input[ 0]; \
+ X##be ^= input[ 1]; \
+ X##bi ^= input[ 2]; \
+ X##bo ^= input[ 3]; \
+ if (laneCount < 6) { \
+ if (laneCount < 5) { \
+ } \
+ else { \
+ X##bu ^= input[ 4]; \
+ } \
+ } \
+ else { \
+ X##bu ^= input[ 4]; \
+ X##ga ^= input[ 5]; \
+ if (laneCount < 7) { \
+ } \
+ else { \
+ X##ge ^= input[ 6]; \
+ } \
+ } \
+ } \
+ } \
+ else { /* laneCount is 8..15 */ \
+ X##ba ^= input[ 0]; \
+ X##be ^= input[ 1]; \
+ X##bi ^= input[ 2]; \
+ X##bo ^= input[ 3]; \
+ X##bu ^= input[ 4]; \
+ X##ga ^= input[ 5]; \
+ X##ge ^= input[ 6]; \
+ X##gi ^= input[ 7]; \
+ if (laneCount < 12) { \
+ if (laneCount < 10) { \
+ if (laneCount < 9) { \
+ } \
+ else { \
+ X##go ^= input[ 8]; \
+ } \
+ } \
+ else { \
+ X##go ^= input[ 8]; \
+ X##gu ^= input[ 9]; \
+ if (laneCount < 11) { \
+ } \
+ else { \
+ X##ka ^= input[10]; \
+ } \
+ } \
+ } \
+ else { \
+ X##go ^= input[ 8]; \
+ X##gu ^= input[ 9]; \
+ X##ka ^= input[10]; \
+ X##ke ^= input[11]; \
+ if (laneCount < 14) { \
+ if (laneCount < 13) { \
+ } \
+ else { \
+ X##ki ^= input[12]; \
+ } \
+ } \
+ else { \
+ X##ki ^= input[12]; \
+ X##ko ^= input[13]; \
+ if (laneCount < 15) { \
+ } \
+ else { \
+ X##ku ^= input[14]; \
+ } \
+ } \
+ } \
+ } \
+ } \
+ else { /* laneCount >= 16 (and not 21): lanes 0..15 always take input */ \
+ X##ba ^= input[ 0]; \
+ X##be ^= input[ 1]; \
+ X##bi ^= input[ 2]; \
+ X##bo ^= input[ 3]; \
+ X##bu ^= input[ 4]; \
+ X##ga ^= input[ 5]; \
+ X##ge ^= input[ 6]; \
+ X##gi ^= input[ 7]; \
+ X##go ^= input[ 8]; \
+ X##gu ^= input[ 9]; \
+ X##ka ^= input[10]; \
+ X##ke ^= input[11]; \
+ X##ki ^= input[12]; \
+ X##ko ^= input[13]; \
+ X##ku ^= input[14]; \
+ X##ma ^= input[15]; \
+ if (laneCount < 24) { \
+ if (laneCount < 20) { \
+ if (laneCount < 18) { \
+ if (laneCount < 17) { \
+ } \
+ else { \
+ X##me ^= input[16]; \
+ } \
+ } \
+ else { \
+ X##me ^= input[16]; \
+ X##mi ^= input[17]; \
+ if (laneCount < 19) { \
+ } \
+ else { \
+ X##mo ^= input[18]; \
+ } \
+ } \
+ } \
+ else { /* laneCount is 20..23 */ \
+ X##me ^= input[16]; \
+ X##mi ^= input[17]; \
+ X##mo ^= input[18]; \
+ X##mu ^= input[19]; \
+ if (laneCount < 22) { \
+ if (laneCount < 21) { \
+ } \
+ else { \
+ X##sa ^= input[20]; \
+ } \
+ } \
+ else { \
+ X##sa ^= input[20]; \
+ X##se ^= input[21]; \
+ if (laneCount < 23) { \
+ } \
+ else { \
+ X##si ^= input[22]; \
+ } \
+ } \
+ } \
+ } \
+ else { /* laneCount >= 24: only lane 24 (su) is still conditional */ \
+ X##me ^= input[16]; \
+ X##mi ^= input[17]; \
+ X##mo ^= input[18]; \
+ X##mu ^= input[19]; \
+ X##sa ^= input[20]; \
+ X##se ^= input[21]; \
+ X##si ^= input[22]; \
+ X##so ^= input[23]; \
+ if (laneCount < 25) { \
+ } \
+ else { \
+ X##su ^= input[24]; \
+ } \
+ } \
+ }
+
+#ifdef UseBebigokimisa
+
+/* Write every working lane X##ba .. X##su back to state[0..24] and, for each
+   lane whose index is below laneCount, also emit it to output[].  In this
+   UseBebigokimisa variant the lanes be, bi, go, ki, mi and sa are emitted
+   bit-complemented, while state always receives the working value as is.
+   Lanes are processed in ascending index order, state before output.
+   laneCount is evaluated once per lane, so it must be free of side effects;
+   values of 25 or more emit all 25 lanes. */
+#define copyToStateAndOutput(X, state, output, laneCount) \
+    { \
+        state[ 0] = X##ba; if (laneCount >=  1) { output[ 0] =  X##ba; } \
+        state[ 1] = X##be; if (laneCount >=  2) { output[ 1] = ~X##be; } \
+        state[ 2] = X##bi; if (laneCount >=  3) { output[ 2] = ~X##bi; } \
+        state[ 3] = X##bo; if (laneCount >=  4) { output[ 3] =  X##bo; } \
+        state[ 4] = X##bu; if (laneCount >=  5) { output[ 4] =  X##bu; } \
+        state[ 5] = X##ga; if (laneCount >=  6) { output[ 5] =  X##ga; } \
+        state[ 6] = X##ge; if (laneCount >=  7) { output[ 6] =  X##ge; } \
+        state[ 7] = X##gi; if (laneCount >=  8) { output[ 7] =  X##gi; } \
+        state[ 8] = X##go; if (laneCount >=  9) { output[ 8] = ~X##go; } \
+        state[ 9] = X##gu; if (laneCount >= 10) { output[ 9] =  X##gu; } \
+        state[10] = X##ka; if (laneCount >= 11) { output[10] =  X##ka; } \
+        state[11] = X##ke; if (laneCount >= 12) { output[11] =  X##ke; } \
+        state[12] = X##ki; if (laneCount >= 13) { output[12] = ~X##ki; } \
+        state[13] = X##ko; if (laneCount >= 14) { output[13] =  X##ko; } \
+        state[14] = X##ku; if (laneCount >= 15) { output[14] =  X##ku; } \
+        state[15] = X##ma; if (laneCount >= 16) { output[15] =  X##ma; } \
+        state[16] = X##me; if (laneCount >= 17) { output[16] =  X##me; } \
+        state[17] = X##mi; if (laneCount >= 18) { output[17] = ~X##mi; } \
+        state[18] = X##mo; if (laneCount >= 19) { output[18] =  X##mo; } \
+        state[19] = X##mu; if (laneCount >= 20) { output[19] =  X##mu; } \
+        state[20] = X##sa; if (laneCount >= 21) { output[20] = ~X##sa; } \
+        state[21] = X##se; if (laneCount >= 22) { output[21] =  X##se; } \
+        state[22] = X##si; if (laneCount >= 23) { output[22] =  X##si; } \
+        state[23] = X##so; if (laneCount >= 24) { output[23] =  X##so; } \
+        state[24] = X##su; if (laneCount >= 25) { output[24] =  X##su; } \
+    }
+
+/* Emit the first laneCount working lanes (X##ba .. X##su, ascending index) to
+   output[].  In this UseBebigokimisa variant the lanes be, bi, go, ki, mi and
+   sa are emitted bit-complemented.  Nothing is written when laneCount is
+   below 1; values of 25 or more emit all 25 lanes.  laneCount is evaluated
+   once per lane, so it must be free of side effects. */
+#define output(X, output, laneCount) \
+    { \
+        if (laneCount >=  1) { output[ 0] =  X##ba; } \
+        if (laneCount >=  2) { output[ 1] = ~X##be; } \
+        if (laneCount >=  3) { output[ 2] = ~X##bi; } \
+        if (laneCount >=  4) { output[ 3] =  X##bo; } \
+        if (laneCount >=  5) { output[ 4] =  X##bu; } \
+        if (laneCount >=  6) { output[ 5] =  X##ga; } \
+        if (laneCount >=  7) { output[ 6] =  X##ge; } \
+        if (laneCount >=  8) { output[ 7] =  X##gi; } \
+        if (laneCount >=  9) { output[ 8] = ~X##go; } \
+        if (laneCount >= 10) { output[ 9] =  X##gu; } \
+        if (laneCount >= 11) { output[10] =  X##ka; } \
+        if (laneCount >= 12) { output[11] =  X##ke; } \
+        if (laneCount >= 13) { output[12] = ~X##ki; } \
+        if (laneCount >= 14) { output[13] =  X##ko; } \
+        if (laneCount >= 15) { output[14] =  X##ku; } \
+        if (laneCount >= 16) { output[15] =  X##ma; } \
+        if (laneCount >= 17) { output[16] =  X##me; } \
+        if (laneCount >= 18) { output[17] = ~X##mi; } \
+        if (laneCount >= 19) { output[18] =  X##mo; } \
+        if (laneCount >= 20) { output[19] =  X##mu; } \
+        if (laneCount >= 21) { output[20] = ~X##sa; } \
+        if (laneCount >= 22) { output[21] =  X##se; } \
+        if (laneCount >= 23) { output[22] =  X##si; } \
+        if (laneCount >= 24) { output[23] =  X##so; } \
+        if (laneCount >= 25) { output[24] =  X##su; } \
+    }
+
+/* Per-lane duplex helpers for the lane-complementing (UseBebigokimisa) build.
+   wrapOne/unwrapOne handle a lane whose working variable X##name is exposed
+   unchanged; the *Invert variants handle a lane whose exposed value is the
+   bitwise complement of the working variable.  Each helper expands to two
+   complete statements without a do/while wrapper, so it is only safe where a
+   sequence of statements is allowed (e.g. inside braces). */
+
+/* XOR input[index] into the lane, then expose the updated lane. */
+#define wrapOne(X, input, output, index, name) \
+    X##name ^= input[index]; \
+    output[index] = X##name;
+
+/* Same as wrapOne, but the exposed value is the complement of the lane. */
+#define wrapOneInvert(X, input, output, index, name) \
+    X##name ^= input[index]; \
+    output[index] = ~X##name;
+
+/* Expose input[index] XOR lane, then fold that output back into the lane. */
+#define unwrapOne(X, input, output, index, name) \
+    output[index] = input[index] ^ X##name; \
+    X##name ^= output[index];
+
+/* Same as unwrapOne, but the exposed value is complemented.  The last line
+   must not end in a backslash: a continuation there would splice whatever
+   source line follows into this macro. */
+#define unwrapOneInvert(X, input, output, index, name) \
+    output[index] = ~(input[index] ^ X##name); \
+    X##name ^= output[index];
+
+#else /* UseBebigokimisa */
+
+
+/* Write every working lane X##ba .. X##su back to state[0..24] and, for each
+   lane whose index is below laneCount, also copy it to output[].  This is the
+   variant without lane complementing: state and output receive the same
+   value.  Lanes are processed in ascending index order, state before output.
+   laneCount is evaluated once per lane, so it must be free of side effects;
+   values of 25 or more copy all 25 lanes. */
+#define copyToStateAndOutput(X, state, output, laneCount) \
+    { \
+        state[ 0] = X##ba; if (laneCount >=  1) { output[ 0] = X##ba; } \
+        state[ 1] = X##be; if (laneCount >=  2) { output[ 1] = X##be; } \
+        state[ 2] = X##bi; if (laneCount >=  3) { output[ 2] = X##bi; } \
+        state[ 3] = X##bo; if (laneCount >=  4) { output[ 3] = X##bo; } \
+        state[ 4] = X##bu; if (laneCount >=  5) { output[ 4] = X##bu; } \
+        state[ 5] = X##ga; if (laneCount >=  6) { output[ 5] = X##ga; } \
+        state[ 6] = X##ge; if (laneCount >=  7) { output[ 6] = X##ge; } \
+        state[ 7] = X##gi; if (laneCount >=  8) { output[ 7] = X##gi; } \
+        state[ 8] = X##go; if (laneCount >=  9) { output[ 8] = X##go; } \
+        state[ 9] = X##gu; if (laneCount >= 10) { output[ 9] = X##gu; } \
+        state[10] = X##ka; if (laneCount >= 11) { output[10] = X##ka; } \
+        state[11] = X##ke; if (laneCount >= 12) { output[11] = X##ke; } \
+        state[12] = X##ki; if (laneCount >= 13) { output[12] = X##ki; } \
+        state[13] = X##ko; if (laneCount >= 14) { output[13] = X##ko; } \
+        state[14] = X##ku; if (laneCount >= 15) { output[14] = X##ku; } \
+        state[15] = X##ma; if (laneCount >= 16) { output[15] = X##ma; } \
+        state[16] = X##me; if (laneCount >= 17) { output[16] = X##me; } \
+        state[17] = X##mi; if (laneCount >= 18) { output[17] = X##mi; } \
+        state[18] = X##mo; if (laneCount >= 19) { output[18] = X##mo; } \
+        state[19] = X##mu; if (laneCount >= 20) { output[19] = X##mu; } \
+        state[20] = X##sa; if (laneCount >= 21) { output[20] = X##sa; } \
+        state[21] = X##se; if (laneCount >= 22) { output[21] = X##se; } \
+        state[22] = X##si; if (laneCount >= 23) { output[22] = X##si; } \
+        state[23] = X##so; if (laneCount >= 24) { output[23] = X##so; } \
+        state[24] = X##su; if (laneCount >= 25) { output[24] = X##su; } \
+    }
+
+/* Copy the first laneCount working lanes (X##ba .. X##su, ascending index) to
+   output[].  This is the variant without lane complementing, so every lane
+   is copied unchanged.  Nothing is written when laneCount is below 1; values
+   of 25 or more copy all 25 lanes.  laneCount is evaluated once per lane, so
+   it must be free of side effects. */
+#define output(X, output, laneCount) \
+    { \
+        if (laneCount >=  1) { output[ 0] = X##ba; } \
+        if (laneCount >=  2) { output[ 1] = X##be; } \
+        if (laneCount >=  3) { output[ 2] = X##bi; } \
+        if (laneCount >=  4) { output[ 3] = X##bo; } \
+        if (laneCount >=  5) { output[ 4] = X##bu; } \
+        if (laneCount >=  6) { output[ 5] = X##ga; } \
+        if (laneCount >=  7) { output[ 6] = X##ge; } \
+        if (laneCount >=  8) { output[ 7] = X##gi; } \
+        if (laneCount >=  9) { output[ 8] = X##go; } \
+        if (laneCount >= 10) { output[ 9] = X##gu; } \
+        if (laneCount >= 11) { output[10] = X##ka; } \
+        if (laneCount >= 12) { output[11] = X##ke; } \
+        if (laneCount >= 13) { output[12] = X##ki; } \
+        if (laneCount >= 14) { output[13] = X##ko; } \
+        if (laneCount >= 15) { output[14] = X##ku; } \
+        if (laneCount >= 16) { output[15] = X##ma; } \
+        if (laneCount >= 17) { output[16] = X##me; } \
+        if (laneCount >= 18) { output[17] = X##mi; } \
+        if (laneCount >= 19) { output[18] = X##mo; } \
+        if (laneCount >= 20) { output[19] = X##mu; } \
+        if (laneCount >= 21) { output[20] = X##sa; } \
+        if (laneCount >= 22) { output[21] = X##se; } \
+        if (laneCount >= 23) { output[22] = X##si; } \
+        if (laneCount >= 24) { output[23] = X##so; } \
+        if (laneCount >= 25) { output[24] = X##su; } \
+    }
+
+/* Per-lane duplex helpers for the build without lane complementing.  The
+   *Invert variants are defined with the same effect as the plain ones here,
+   so wrap() and unwrap() can name them regardless of the build variant. */
+
+/* XOR input[index] into the lane and expose the updated lane. */
+#define wrapOne(X, input, output, index, name) \
+    output[index] = (X##name ^= input[index]);
+
+#define wrapOneInvert(X, input, output, index, name) \
+    output[index] = (X##name ^= input[index]);
+
+/* Expose input[index] XOR lane, then fold that output back into the lane. */
+#define unwrapOne(X, input, output, index, name) \
+    output[index] = X##name ^ input[index]; \
+    X##name ^= output[index];
+
+#define unwrapOneInvert(X, input, output, index, name) \
+    output[index] = X##name ^ input[index]; \
+    X##name ^= output[index];
+
+#endif
+
+/* Apply wrapOne / wrapOneInvert to the first laneCount lanes (ascending
+   index; the Invert helper is used for lanes be, bi, go, ki, mi and sa), then
+   XOR trailingBits into the working lane whose index equals laneCount.  When
+   laneCount is below 1 only X##ba ^= trailingBits runs; when it is 25 or more
+   all 25 lanes are wrapped and trailingBits is not used.  laneCount is
+   evaluated repeatedly, so it must be free of side effects; trailingBits is
+   evaluated at most once. */
+#define wrap(X, input, output, laneCount, trailingBits) \
+    { \
+        if (laneCount >= 1) { wrapOne(X, input, output, 0, ba) } \
+        else { X##ba ^= trailingBits; } \
+        if (laneCount >= 2) { wrapOneInvert(X, input, output, 1, be) } \
+        else if (laneCount == 1) { X##be ^= trailingBits; } \
+        if (laneCount >= 3) { wrapOneInvert(X, input, output, 2, bi) } \
+        else if (laneCount == 2) { X##bi ^= trailingBits; } \
+        if (laneCount >= 4) { wrapOne(X, input, output, 3, bo) } \
+        else if (laneCount == 3) { X##bo ^= trailingBits; } \
+        if (laneCount >= 5) { wrapOne(X, input, output, 4, bu) } \
+        else if (laneCount == 4) { X##bu ^= trailingBits; } \
+        if (laneCount >= 6) { wrapOne(X, input, output, 5, ga) } \
+        else if (laneCount == 5) { X##ga ^= trailingBits; } \
+        if (laneCount >= 7) { wrapOne(X, input, output, 6, ge) } \
+        else if (laneCount == 6) { X##ge ^= trailingBits; } \
+        if (laneCount >= 8) { wrapOne(X, input, output, 7, gi) } \
+        else if (laneCount == 7) { X##gi ^= trailingBits; } \
+        if (laneCount >= 9) { wrapOneInvert(X, input, output, 8, go) } \
+        else if (laneCount == 8) { X##go ^= trailingBits; } \
+        if (laneCount >= 10) { wrapOne(X, input, output, 9, gu) } \
+        else if (laneCount == 9) { X##gu ^= trailingBits; } \
+        if (laneCount >= 11) { wrapOne(X, input, output, 10, ka) } \
+        else if (laneCount == 10) { X##ka ^= trailingBits; } \
+        if (laneCount >= 12) { wrapOne(X, input, output, 11, ke) } \
+        else if (laneCount == 11) { X##ke ^= trailingBits; } \
+        if (laneCount >= 13) { wrapOneInvert(X, input, output, 12, ki) } \
+        else if (laneCount == 12) { X##ki ^= trailingBits; } \
+        if (laneCount >= 14) { wrapOne(X, input, output, 13, ko) } \
+        else if (laneCount == 13) { X##ko ^= trailingBits; } \
+        if (laneCount >= 15) { wrapOne(X, input, output, 14, ku) } \
+        else if (laneCount == 14) { X##ku ^= trailingBits; } \
+        if (laneCount >= 16) { wrapOne(X, input, output, 15, ma) } \
+        else if (laneCount == 15) { X##ma ^= trailingBits; } \
+        if (laneCount >= 17) { wrapOne(X, input, output, 16, me) } \
+        else if (laneCount == 16) { X##me ^= trailingBits; } \
+        if (laneCount >= 18) { wrapOneInvert(X, input, output, 17, mi) } \
+        else if (laneCount == 17) { X##mi ^= trailingBits; } \
+        if (laneCount >= 19) { wrapOne(X, input, output, 18, mo) } \
+        else if (laneCount == 18) { X##mo ^= trailingBits; } \
+        if (laneCount >= 20) { wrapOne(X, input, output, 19, mu) } \
+        else if (laneCount == 19) { X##mu ^= trailingBits; } \
+        if (laneCount >= 21) { wrapOneInvert(X, input, output, 20, sa) } \
+        else if (laneCount == 20) { X##sa ^= trailingBits; } \
+        if (laneCount >= 22) { wrapOne(X, input, output, 21, se) } \
+        else if (laneCount == 21) { X##se ^= trailingBits; } \
+        if (laneCount >= 23) { wrapOne(X, input, output, 22, si) } \
+        else if (laneCount == 22) { X##si ^= trailingBits; } \
+        if (laneCount >= 24) { wrapOne(X, input, output, 23, so) } \
+        else if (laneCount == 23) { X##so ^= trailingBits; } \
+        if (laneCount >= 25) { wrapOne(X, input, output, 24, su) } \
+        else if (laneCount == 24) { X##su ^= trailingBits; } \
+    }
+
+/* Apply unwrapOne / unwrapOneInvert to the first laneCount lanes (ascending
+   index; the Invert helper is used for lanes be, bi, go, ki, mi and sa), then
+   XOR trailingBits into the working lane whose index equals laneCount.  When
+   laneCount is below 1 only X##ba ^= trailingBits runs; when it is 25 or more
+   all 25 lanes are unwrapped and trailingBits is not used.  laneCount is
+   evaluated repeatedly, so it must be free of side effects; trailingBits is
+   evaluated at most once. */
+#define unwrap(X, input, output, laneCount, trailingBits) \
+    { \
+        if (laneCount >= 1) { unwrapOne(X, input, output, 0, ba) } \
+        else { X##ba ^= trailingBits; } \
+        if (laneCount >= 2) { unwrapOneInvert(X, input, output, 1, be) } \
+        else if (laneCount == 1) { X##be ^= trailingBits; } \
+        if (laneCount >= 3) { unwrapOneInvert(X, input, output, 2, bi) } \
+        else if (laneCount == 2) { X##bi ^= trailingBits; } \
+        if (laneCount >= 4) { unwrapOne(X, input, output, 3, bo) } \
+        else if (laneCount == 3) { X##bo ^= trailingBits; } \
+        if (laneCount >= 5) { unwrapOne(X, input, output, 4, bu) } \
+        else if (laneCount == 4) { X##bu ^= trailingBits; } \
+        if (laneCount >= 6) { unwrapOne(X, input, output, 5, ga) } \
+        else if (laneCount == 5) { X##ga ^= trailingBits; } \
+        if (laneCount >= 7) { unwrapOne(X, input, output, 6, ge) } \
+        else if (laneCount == 6) { X##ge ^= trailingBits; } \
+        if (laneCount >= 8) { unwrapOne(X, input, output, 7, gi) } \
+        else if (laneCount == 7) { X##gi ^= trailingBits; } \
+        if (laneCount >= 9) { unwrapOneInvert(X, input, output, 8, go) } \
+        else if (laneCount == 8) { X##go ^= trailingBits; } \
+        if (laneCount >= 10) { unwrapOne(X, input, output, 9, gu) } \
+        else if (laneCount == 9) { X##gu ^= trailingBits; } \
+        if (laneCount >= 11) { unwrapOne(X, input, output, 10, ka) } \
+        else if (laneCount == 10) { X##ka ^= trailingBits; } \
+        if (laneCount >= 12) { unwrapOne(X, input, output, 11, ke) } \
+        else if (laneCount == 11) { X##ke ^= trailingBits; } \
+        if (laneCount >= 13) { unwrapOneInvert(X, input, output, 12, ki) } \
+        else if (laneCount == 12) { X##ki ^= trailingBits; } \
+        if (laneCount >= 14) { unwrapOne(X, input, output, 13, ko) } \
+        else if (laneCount == 13) { X##ko ^= trailingBits; } \
+        if (laneCount >= 15) { unwrapOne(X, input, output, 14, ku) } \
+        else if (laneCount == 14) { X##ku ^= trailingBits; } \
+        if (laneCount >= 16) { unwrapOne(X, input, output, 15, ma) } \
+        else if (laneCount == 15) { X##ma ^= trailingBits; } \
+        if (laneCount >= 17) { unwrapOne(X, input, output, 16, me) } \
+        else if (laneCount == 16) { X##me ^= trailingBits; } \
+        if (laneCount >= 18) { unwrapOneInvert(X, input, output, 17, mi) } \
+        else if (laneCount == 17) { X##mi ^= trailingBits; } \
+        if (laneCount >= 19) { unwrapOne(X, input, output, 18, mo) } \
+        else if (laneCount == 18) { X##mo ^= trailingBits; } \
+        if (laneCount >= 20) { unwrapOne(X, input, output, 19, mu) } \
+        else if (laneCount == 19) { X##mu ^= trailingBits; } \
+        if (laneCount >= 21) { unwrapOneInvert(X, input, output, 20, sa) } \
+        else if (laneCount == 20) { X##sa ^= trailingBits; } \
+        if (laneCount >= 22) { unwrapOne(X, input, output, 21, se) } \
+        else if (laneCount == 21) { X##se ^= trailingBits; } \
+        if (laneCount >= 23) { unwrapOne(X, input, output, 22, si) } \
+        else if (laneCount == 22) { X##si ^= trailingBits; } \
+        if (laneCount >= 24) { unwrapOne(X, input, output, 23, so) } \
+        else if (laneCount == 23) { X##so ^= trailingBits; } \
+        if (laneCount >= 25) { unwrapOne(X, input, output, 24, su) } \
+        else if (laneCount == 24) { X##su ^= trailingBits; } \
+    }
diff --git a/Modules/_sha3/kcp/KeccakP-1600-SnP-opt32.h b/Modules/_sha3/kcp/KeccakP-1600-SnP-opt32.h
new file mode 100644
index 0000000000..6cf765e6ce
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakP-1600-SnP-opt32.h
@@ -0,0 +1,37 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _KeccakP_1600_SnP_h_ /* NOTE: KeccakP-1600-SnP-opt64.h uses this same guard name, so whichever of the two headers is included first wins */
+#define _KeccakP_1600_SnP_h_
+
+/** For the documentation, see SnP-documentation.h.
+ *  This header only declares the interface of the 32-bit in-place variant;
+ *  every function receives the state through an untyped pointer.
+ */
+
+#define KeccakP1600_implementation "in-place 32-bit optimized implementation"
+#define KeccakP1600_stateSizeInBytes 200 /* 200 bytes = 1600 bits */
+#define KeccakP1600_stateAlignment 8 /* presumably the required alignment of the state buffer in bytes -- confirm in SnP-documentation.h */
+
+#define KeccakP1600_StaticInitialize() /* expands to nothing in this variant */
+void KeccakP1600_Initialize(void *state);
+void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset);
+void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
+void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
+void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount);
+void KeccakP1600_Permute_12rounds(void *state);
+void KeccakP1600_Permute_24rounds(void *state);
+void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); /* state is const here: declared read-only */
+void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length); /* state and input are const; output is the only writable buffer */
+
+#endif /* _KeccakP_1600_SnP_h_ */
diff --git a/Modules/_sha3/kcp/KeccakP-1600-SnP-opt64.h b/Modules/_sha3/kcp/KeccakP-1600-SnP-opt64.h
new file mode 100644
index 0000000000..889a31a794
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakP-1600-SnP-opt64.h
@@ -0,0 +1,49 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _KeccakP_1600_SnP_h_
+#define _KeccakP_1600_SnP_h_
+
+/** For the documentation, see SnP-documentation.h.
+ *  This header declares the interface of the generic 64-bit variant; every
+ *  function receives the state through an untyped pointer.
+ */
+
+/* #include "brg_endian.h" */
+#include "KeccakP-1600-opt64-config.h"
+
+#define KeccakP1600_implementation "generic 64-bit optimized implementation (" KeccakP1600_implementation_config ")"
+#define KeccakP1600_stateSizeInBytes 200 /* 200 bytes = 1600 bits */
+#define KeccakP1600_stateAlignment 8
+#define KeccakF1600_FastLoop_supported
+
+#include <stddef.h>
+
+#define KeccakP1600_StaticInitialize() /* expands to nothing in this variant */
+void KeccakP1600_Initialize(void *state);
+/* NOTE(review): the brg_endian.h include above is commented out, so
+   PLATFORM_BYTE_ORDER and IS_LITTLE_ENDIAN have to be defined by whoever
+   includes this header.  If both are left undefined the preprocessor treats
+   them as 0 and this test silently selects the little-endian macro. */
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+/* XORs one byte directly into the state buffer at the given byte offset.
+   The whole expansion is parenthesized so the macro behaves as a single
+   expression wherever it is used. */
+#define KeccakP1600_AddByte(state, byte, offset) \
+    (((unsigned char*)(state))[(offset)] ^= (byte))
+#else
+void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset);
+#endif
+void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
+void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
+void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount);
+void KeccakP1600_Permute_12rounds(void *state);
+void KeccakP1600_Permute_24rounds(void *state);
+void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length);
+void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length);
+size_t KeccakF1600_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen);
+
+#endif /* _KeccakP_1600_SnP_h_ */
diff --git a/Modules/_sha3/kcp/KeccakP-1600-SnP.h b/Modules/_sha3/kcp/KeccakP-1600-SnP.h
new file mode 100644
index 0000000000..0b23f09a6a
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakP-1600-SnP.h
@@ -0,0 +1,7 @@
+/* Pull in the SnP header that matches the KeccakOpt build setting (32 or 64);
+   any other value, including an undefined KeccakOpt, stops the build. */
+#if KeccakOpt == 32
+    #include "KeccakP-1600-SnP-opt32.h"
+#elif KeccakOpt == 64
+    #include "KeccakP-1600-SnP-opt64.h"
+#else
+    #error "No KeccakOpt"
+#endif
diff --git a/Modules/_sha3/kcp/KeccakP-1600-inplace32BI.c b/Modules/_sha3/kcp/KeccakP-1600-inplace32BI.c
new file mode 100644
index 0000000000..a2f9ffea93
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakP-1600-inplace32BI.c
@@ -0,0 +1,1162 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <string.h>
+/* #include "brg_endian.h" */
+#include "KeccakP-1600-SnP.h"
+#include "SnP-Relaned.h"
+
+/* Integer aliases used throughout this file for bytes and 32-bit half-lanes. */
+typedef unsigned char UINT8;
+typedef unsigned int UINT32;
+/* WARNING: on 8-bit and 16-bit platforms, where unsigned int is narrower than
+   32 bits, the UINT32 typedef above should be replaced by: */
+
+/*typedef unsigned long UINT32; */
+
+
+/* Rotates the 32-bit value `a` left by `offset` bits. `offset` must be in
+   1..31: a rotation by 0 would shift right by 32 bits, which is undefined.
+   Both arguments are parenthesized so that arbitrary expressions can be
+   passed; each is expanded twice, so they must be free of side effects. */
+#define ROL32(a, offset) ((((UINT32)(a)) << (offset)) ^ (((UINT32)(a)) >> (32-(offset))))
+
+/* Credit to Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
+
+/*
+ * Bit-permutes the two 32-bit words `low` and `high` independently, leaving
+ * the results in temp0 and temp1; `temp` is scratch. Each of the four steps
+ * exchanges groups of bits (1, 2, 4 and then 8 bits apart) selected by a mask.
+ * NOTE(review): this appears to gather the even-indexed bits of each word in
+ * its low 16 bits and the odd-indexed bits in its high 16 bits -- confirm
+ * against the cited reference.
+ * The expansion is a bare sequence of statements that already ends in ';',
+ * so it must not be used as the body of an unbraced if/else/loop.
+ */
+#define prepareToBitInterleaving(low, high, temp, temp0, temp1) \
+ temp0 = (low); \
+ temp = (temp0 ^ (temp0 >> 1)) & 0x22222222UL; temp0 = temp0 ^ temp ^ (temp << 1); \
+ temp = (temp0 ^ (temp0 >> 2)) & 0x0C0C0C0CUL; temp0 = temp0 ^ temp ^ (temp << 2); \
+ temp = (temp0 ^ (temp0 >> 4)) & 0x00F000F0UL; temp0 = temp0 ^ temp ^ (temp << 4); \
+ temp = (temp0 ^ (temp0 >> 8)) & 0x0000FF00UL; temp0 = temp0 ^ temp ^ (temp << 8); \
+ temp1 = (high); \
+ temp = (temp1 ^ (temp1 >> 1)) & 0x22222222UL; temp1 = temp1 ^ temp ^ (temp << 1); \
+ temp = (temp1 ^ (temp1 >> 2)) & 0x0C0C0C0CUL; temp1 = temp1 ^ temp ^ (temp << 2); \
+ temp = (temp1 ^ (temp1 >> 4)) & 0x00F000F0UL; temp1 = temp1 ^ temp ^ (temp << 4); \
+ temp = (temp1 ^ (temp1 >> 8)) & 0x0000FF00UL; temp1 = temp1 ^ temp ^ (temp << 8);
+
+/*
+ * Permutes low/high as above, then XORs into `even` the word made of the low
+ * 16 bits of temp0 (bits 0-15) and of temp1 (bits 16-31), and into `odd` the
+ * word made of their high 16 bits. `low`, `high`, `even` and `odd` are each
+ * expanded exactly once, in that order.
+ */
+#define toBitInterleavingAndXOR(low, high, even, odd, temp, temp0, temp1) \
+ prepareToBitInterleaving(low, high, temp, temp0, temp1) \
+ even ^= (temp0 & 0x0000FFFF) | (temp1 << 16); \
+ odd ^= (temp0 >> 16) | (temp1 & 0xFFFF0000);
+
+/* Same as toBitInterleavingAndXOR, but ANDs the two words into even/odd. */
+#define toBitInterleavingAndAND(low, high, even, odd, temp, temp0, temp1) \
+ prepareToBitInterleaving(low, high, temp, temp0, temp1) \
+ even &= (temp0 & 0x0000FFFF) | (temp1 << 16); \
+ odd &= (temp0 >> 16) | (temp1 & 0xFFFF0000);
+
+/* Same as toBitInterleavingAndXOR, but assigns the two words to even/odd. */
+#define toBitInterleavingAndSet(low, high, even, odd, temp, temp0, temp1) \
+ prepareToBitInterleaving(low, high, temp, temp0, temp1) \
+ even = (temp0 & 0x0000FFFF) | (temp1 << 16); \
+ odd = (temp0 >> 16) | (temp1 & 0xFFFF0000);
+
+/* Credit to Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
+
+/*
+ * Reverse of the to-bit-interleaving step: the 16-bit halves of `even` and
+ * `odd` are first regrouped (low halves into temp0, high halves into temp1),
+ * then the same four bit-exchange steps are applied in the opposite order
+ * (8, 4, 2, 1). The results are left in temp0 and temp1; `temp` is scratch.
+ * `even` and `odd` are each expanded exactly once. The expansion is a bare
+ * sequence of statements that already ends in ';'.
+ */
+#define prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \
+ temp0 = (even); \
+ temp1 = (odd); \
+ temp = (temp0 & 0x0000FFFF) | (temp1 << 16); \
+ temp1 = (temp0 >> 16) | (temp1 & 0xFFFF0000); \
+ temp0 = temp; \
+ temp = (temp0 ^ (temp0 >> 8)) & 0x0000FF00UL; temp0 = temp0 ^ temp ^ (temp << 8); \
+ temp = (temp0 ^ (temp0 >> 4)) & 0x00F000F0UL; temp0 = temp0 ^ temp ^ (temp << 4); \
+ temp = (temp0 ^ (temp0 >> 2)) & 0x0C0C0C0CUL; temp0 = temp0 ^ temp ^ (temp << 2); \
+ temp = (temp0 ^ (temp0 >> 1)) & 0x22222222UL; temp0 = temp0 ^ temp ^ (temp << 1); \
+ temp = (temp1 ^ (temp1 >> 8)) & 0x0000FF00UL; temp1 = temp1 ^ temp ^ (temp << 8); \
+ temp = (temp1 ^ (temp1 >> 4)) & 0x00F000F0UL; temp1 = temp1 ^ temp ^ (temp << 4); \
+ temp = (temp1 ^ (temp1 >> 2)) & 0x0C0C0C0CUL; temp1 = temp1 ^ temp ^ (temp << 2); \
+ temp = (temp1 ^ (temp1 >> 1)) & 0x22222222UL; temp1 = temp1 ^ temp ^ (temp << 1);
+
+/* De-interleaves even/odd and stores the two resulting words in low/high. */
+#define fromBitInterleaving(even, odd, low, high, temp, temp0, temp1) \
+ prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \
+ low = temp0; \
+ high = temp1;
+
+/* De-interleaves even/odd and stores the result XORed with lowIn/highIn in
+   lowOut/highOut. Each argument is expanded exactly once. */
+#define fromBitInterleavingAndXOR(even, odd, lowIn, highIn, lowOut, highOut, temp, temp0, temp1) \
+ prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \
+ lowOut = lowIn ^ temp0; \
+ highOut = highIn ^ temp1;
+
+/*
+ * Clears `length` bytes starting at byte `offset` inside lane `lanePosition`
+ * of the bit-interleaved state; the other bytes of the lane are preserved.
+ *
+ * A byte mask (0x00 over the bytes to clear, 0xFF elsewhere) is built, read
+ * as two little-endian 32-bit words, bit-interleaved and ANDed into the two
+ * half-lanes of the lane. The caller must ensure offset+length <= 8.
+ */
+void KeccakP1600_SetBytesInLaneToZero(void *state, unsigned int lanePosition, unsigned int offset, unsigned int length)
+{
+    UINT8 laneAsBytes[8];
+    UINT32 low, high;
+    UINT32 temp, temp0, temp1;
+    UINT32 *stateAsHalfLanes = (UINT32*)state;
+
+    memset(laneAsBytes, 0xFF, offset);
+    memset(laneAsBytes+offset, 0x00, length);
+    memset(laneAsBytes+offset+length, 0xFF, 8-offset-length);
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    /* memcpy rather than a UINT32* cast: the byte array has no guaranteed
+       32-bit alignment and must not be read through a UINT32 lvalue. */
+    memcpy(&low, laneAsBytes+0, 4);
+    memcpy(&high, laneAsBytes+4, 4);
+#else
+    low = laneAsBytes[0]
+        | ((UINT32)(laneAsBytes[1]) << 8)
+        | ((UINT32)(laneAsBytes[2]) << 16)
+        | ((UINT32)(laneAsBytes[3]) << 24);
+    high = laneAsBytes[4]
+        | ((UINT32)(laneAsBytes[5]) << 8)
+        | ((UINT32)(laneAsBytes[6]) << 16)
+        | ((UINT32)(laneAsBytes[7]) << 24);
+#endif
+    toBitInterleavingAndAND(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
+}
+
+/* ---------------------------------------------------------------- */
+
+/* Sets the whole 200-byte state to zero. */
+void KeccakP1600_Initialize(void *state)
+{
+ memset(state, 0, 200);
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * XORs one byte into the bit-interleaved state at byte position `offset`.
+ * The byte is placed in the low or high 32-bit word of its lane, the pair is
+ * bit-interleaved and XORed into the lane's two half-lanes.
+ */
+void KeccakP1600_AddByte(void *state, unsigned char byte, unsigned int offset)
+{
+    UINT32 *halfLanes = (UINT32*)state;
+    const unsigned int lane = offset/8;
+    const unsigned int posInLane = offset%8;
+    /* Bytes 0-3 of the lane live in the low word, bytes 4-7 in the high word. */
+    const UINT32 shifted = (UINT32)byte << ((posInLane%4)*8);
+    UINT32 low = (posInLane < 4) ? shifted : 0;
+    UINT32 high = (posInLane < 4) ? 0 : shifted;
+    UINT32 scratch, word0, word1;
+
+    toBitInterleavingAndXOR(low, high, halfLanes[lane*2+0], halfLanes[lane*2+1], scratch, word0, word1);
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * XORs `length` bytes from `data` into lane `lanePosition` of the
+ * bit-interleaved state, starting at byte `offset` within that lane.
+ *
+ * The bytes are copied into a zero-filled 8-byte buffer, read as two
+ * little-endian 32-bit words, bit-interleaved and XORed into the lane's two
+ * half-lanes. The caller must ensure offset+length <= 8.
+ */
+void KeccakP1600_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+    UINT8 laneAsBytes[8];
+    UINT32 low, high;
+    UINT32 temp, temp0, temp1;
+    UINT32 *stateAsHalfLanes = (UINT32*)state;
+
+    memset(laneAsBytes, 0, 8);
+    memcpy(laneAsBytes+offset, data, length);
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    /* memcpy rather than a UINT32* cast: the byte array has no guaranteed
+       32-bit alignment and must not be read through a UINT32 lvalue. */
+    memcpy(&low, laneAsBytes+0, 4);
+    memcpy(&high, laneAsBytes+4, 4);
+#else
+    low = laneAsBytes[0]
+        | ((UINT32)(laneAsBytes[1]) << 8)
+        | ((UINT32)(laneAsBytes[2]) << 16)
+        | ((UINT32)(laneAsBytes[3]) << 24);
+    high = laneAsBytes[4]
+        | ((UINT32)(laneAsBytes[5]) << 8)
+        | ((UINT32)(laneAsBytes[6]) << 16)
+        | ((UINT32)(laneAsBytes[7]) << 24);
+#endif
+    toBitInterleavingAndXOR(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * XORs laneCount whole lanes (8 bytes each) from `data` into the first
+ * laneCount lanes of the bit-interleaved state.
+ */
+void KeccakP1600_AddLanes(void *state, const unsigned char *data, unsigned int laneCount)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+ const UINT32 * pI = (const UINT32 *)data;
+ UINT32 * pS = (UINT32*)state;
+ UINT32 t, x0, x1;
+ int i;
+ for (i = laneCount-1; i >= 0; --i) {
+#ifdef NO_MISALIGNED_ACCESSES
+ /* data may be unaligned: fetch each 32-bit word through memcpy. */
+ UINT32 low;
+ UINT32 high;
+ memcpy(&low, pI++, 4);
+ memcpy(&high, pI++, 4);
+ toBitInterleavingAndXOR(low, high, *(pS++), *(pS++), t, x0, x1);
+#else
+ /* The macro expands each argument exactly once, in order, and already
+    ends in ';', hence the post-increments and the missing semicolon. */
+ toBitInterleavingAndXOR(*(pI++), *(pI++), *(pS++), *(pS++), t, x0, x1)
+#endif
+ }
+#else
+ unsigned int lanePosition;
+ for(lanePosition=0; lanePosition<laneCount; lanePosition++) {
+ UINT8 laneAsBytes[8];
+ UINT32 low, high, temp, temp0, temp1;
+ UINT32 *stateAsHalfLanes;
+ memcpy(laneAsBytes, data+lanePosition*8, 8);
+ /* Assemble the two 32-bit words from bytes in little-endian order. */
+ low = laneAsBytes[0]
+ | ((UINT32)(laneAsBytes[1]) << 8)
+ | ((UINT32)(laneAsBytes[2]) << 16)
+ | ((UINT32)(laneAsBytes[3]) << 24);
+ high = laneAsBytes[4]
+ | ((UINT32)(laneAsBytes[5]) << 8)
+ | ((UINT32)(laneAsBytes[6]) << 16)
+ | ((UINT32)(laneAsBytes[7]) << 24);
+ stateAsHalfLanes = (UINT32*)state;
+ toBitInterleavingAndXOR(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
+ }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * XORs `length` bytes from `data` into the state starting at byte `offset`.
+ * The work is delegated to SnP_AddBytes, which is handed the whole-lane and
+ * partial-lane routines of this file.
+ * NOTE(review): SnP_AddBytes is not defined in this file (presumably in
+ * SnP-Relaned.h) and the trailing 8 is presumably the lane size in bytes --
+ * confirm there.
+ */
+void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+ SnP_AddBytes(state, data, offset, length, KeccakP1600_AddLanes, KeccakP1600_AddBytesInLane, 8);
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Replaces `length` bytes of lane `lanePosition`, starting at byte `offset`
+ * within the lane, with the bytes from `data`. The target bytes are first
+ * cleared and the new bytes are then XORed in.
+ */
+void KeccakP1600_OverwriteBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+ KeccakP1600_SetBytesInLaneToZero(state, lanePosition, offset, length);
+ KeccakP1600_AddBytesInLane(state, lanePosition, data, offset, length);
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Overwrites the first laneCount lanes of the bit-interleaved state with
+ * laneCount whole lanes (8 bytes each) taken from `data`.
+ */
+void KeccakP1600_OverwriteLanes(void *state, const unsigned char *data, unsigned int laneCount)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+ const UINT32 * pI = (const UINT32 *)data;
+ UINT32 * pS = (UINT32 *)state;
+ UINT32 t, x0, x1;
+ int i;
+ for (i = laneCount-1; i >= 0; --i) {
+#ifdef NO_MISALIGNED_ACCESSES
+ /* data may be unaligned: fetch each 32-bit word through memcpy. */
+ UINT32 low;
+ UINT32 high;
+ memcpy(&low, pI++, 4);
+ memcpy(&high, pI++, 4);
+ toBitInterleavingAndSet(low, high, *(pS++), *(pS++), t, x0, x1);
+#else
+ /* The macro expands each argument exactly once, in order, and already
+    ends in ';', hence the post-increments and the missing semicolon. */
+ toBitInterleavingAndSet(*(pI++), *(pI++), *(pS++), *(pS++), t, x0, x1)
+#endif
+ }
+#else
+ unsigned int lanePosition;
+ for(lanePosition=0; lanePosition<laneCount; lanePosition++) {
+ UINT8 laneAsBytes[8];
+ UINT32 low, high, temp, temp0, temp1;
+ UINT32 *stateAsHalfLanes;
+ memcpy(laneAsBytes, data+lanePosition*8, 8);
+ /* Assemble the two 32-bit words from bytes in little-endian order. */
+ low = laneAsBytes[0]
+ | ((UINT32)(laneAsBytes[1]) << 8)
+ | ((UINT32)(laneAsBytes[2]) << 16)
+ | ((UINT32)(laneAsBytes[3]) << 24);
+ high = laneAsBytes[4]
+ | ((UINT32)(laneAsBytes[5]) << 8)
+ | ((UINT32)(laneAsBytes[6]) << 16)
+ | ((UINT32)(laneAsBytes[7]) << 24);
+ stateAsHalfLanes = (UINT32*)state;
+ toBitInterleavingAndSet(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
+ }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Overwrites `length` bytes of the state, starting at byte `offset`, with the
+ * bytes from `data`. The work is delegated to SnP_OverwriteBytes, which is
+ * handed the whole-lane and partial-lane routines of this file.
+ * NOTE(review): SnP_OverwriteBytes is not defined in this file (presumably in
+ * SnP-Relaned.h) and the trailing 8 is presumably the lane size in bytes --
+ * confirm there.
+ */
+void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+ SnP_OverwriteBytes(state, data, offset, length, KeccakP1600_OverwriteLanes, KeccakP1600_OverwriteBytesInLane, 8);
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Zeroes the first `byteCount` bytes of the state: every complete lane is
+ * cleared word by word, and a trailing partial lane (if any) has only its
+ * leading bytes cleared.
+ */
+void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount)
+{
+    UINT32 *halfLanes = (UINT32*)state;
+    const unsigned int fullLanes = byteCount/8;
+    const unsigned int tailBytes = byteCount%8;
+    unsigned int lane;
+
+    for(lane=0; lane<fullLanes; lane++) {
+        UINT32 *pair = halfLanes + lane*2;
+        pair[0] = 0;
+        pair[1] = 0;
+    }
+    if (tailBytes != 0)
+        KeccakP1600_SetBytesInLaneToZero(state, fullLanes, 0, tailBytes);
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Copies `length` bytes of lane `lanePosition`, starting at byte `offset`
+ * within the lane, into `data`. The state is not modified.
+ *
+ * The lane's two half-lanes are de-interleaved into two 32-bit words, which
+ * are laid out as 8 little-endian bytes before the requested range is copied.
+ * The caller must ensure offset+length <= 8.
+ */
+void KeccakP1600_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length)
+{
+    const UINT32 *stateAsHalfLanes = (const UINT32*)state;
+    UINT32 low, high, temp, temp0, temp1;
+    UINT8 laneAsBytes[8];
+
+    fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1);
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    /* memcpy rather than a UINT32* cast: the byte array has no guaranteed
+       32-bit alignment and must not be written through a UINT32 lvalue. */
+    memcpy(laneAsBytes+0, &low, 4);
+    memcpy(laneAsBytes+4, &high, 4);
+#else
+    laneAsBytes[0] = low & 0xFF;
+    laneAsBytes[1] = (low >> 8) & 0xFF;
+    laneAsBytes[2] = (low >> 16) & 0xFF;
+    laneAsBytes[3] = (low >> 24) & 0xFF;
+    laneAsBytes[4] = high & 0xFF;
+    laneAsBytes[5] = (high >> 8) & 0xFF;
+    laneAsBytes[6] = (high >> 16) & 0xFF;
+    laneAsBytes[7] = (high >> 24) & 0xFF;
+#endif
+    memcpy(data, laneAsBytes+offset, length);
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Writes the first laneCount lanes of the bit-interleaved state to `data`
+ * as laneCount*8 bytes. The state is not modified.
+ */
+void KeccakP1600_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+ UINT32 * pI = (UINT32 *)data;
+ const UINT32 * pS = ( const UINT32 *)state;
+ UINT32 t, x0, x1;
+ int i;
+ for (i = laneCount-1; i >= 0; --i) {
+#ifdef NO_MISALIGNED_ACCESSES
+ /* data may be unaligned: store each 32-bit word through memcpy. */
+ UINT32 low;
+ UINT32 high;
+ fromBitInterleaving(*(pS++), *(pS++), low, high, t, x0, x1);
+ memcpy(pI++, &low, 4);
+ memcpy(pI++, &high, 4);
+#else
+ /* The macro expands each argument exactly once, in order, and already
+    ends in ';', hence the post-increments and the missing semicolon. */
+ fromBitInterleaving(*(pS++), *(pS++), *(pI++), *(pI++), t, x0, x1)
+#endif
+ }
+#else
+ unsigned int lanePosition;
+ for(lanePosition=0; lanePosition<laneCount; lanePosition++) {
+ UINT32 *stateAsHalfLanes = (UINT32*)state;
+ UINT32 low, high, temp, temp0, temp1;
+ UINT8 laneAsBytes[8];
+ fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1);
+ /* Lay the two 32-bit words out as bytes in little-endian order. */
+ laneAsBytes[0] = low & 0xFF;
+ laneAsBytes[1] = (low >> 8) & 0xFF;
+ laneAsBytes[2] = (low >> 16) & 0xFF;
+ laneAsBytes[3] = (low >> 24) & 0xFF;
+ laneAsBytes[4] = high & 0xFF;
+ laneAsBytes[5] = (high >> 8) & 0xFF;
+ laneAsBytes[6] = (high >> 16) & 0xFF;
+ laneAsBytes[7] = (high >> 24) & 0xFF;
+ memcpy(data+lanePosition*8, laneAsBytes, 8);
+ }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Copies `length` bytes of the state, starting at byte `offset`, into `data`.
+ * The work is delegated to SnP_ExtractBytes, which is handed the whole-lane
+ * and partial-lane routines of this file.
+ * NOTE(review): SnP_ExtractBytes is not defined in this file (presumably in
+ * SnP-Relaned.h) and the trailing 8 is presumably the lane size in bytes --
+ * confirm there.
+ */
+void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length)
+{
+ SnP_ExtractBytes(state, data, offset, length, KeccakP1600_ExtractLanes, KeccakP1600_ExtractBytesInLane, 8);
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * For `length` bytes of lane `lanePosition`, starting at byte `offset`
+ * within the lane, writes (input byte XOR state byte) to `output`.
+ * The state is not modified.
+ *
+ * The lane's two half-lanes are de-interleaved into two 32-bit words, which
+ * are laid out as 8 little-endian bytes before being combined with `input`.
+ * The caller must ensure offset+length <= 8.
+ */
+void KeccakP1600_ExtractAndAddBytesInLane(const void *state, unsigned int lanePosition, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
+{
+    const UINT32 *stateAsHalfLanes = (const UINT32*)state;
+    UINT32 low, high, temp, temp0, temp1;
+    UINT8 laneAsBytes[8];
+    unsigned int i;
+
+    fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1);
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    /* memcpy rather than a UINT32* cast: the byte array has no guaranteed
+       32-bit alignment and must not be written through a UINT32 lvalue. */
+    memcpy(laneAsBytes+0, &low, 4);
+    memcpy(laneAsBytes+4, &high, 4);
+#else
+    laneAsBytes[0] = low & 0xFF;
+    laneAsBytes[1] = (low >> 8) & 0xFF;
+    laneAsBytes[2] = (low >> 16) & 0xFF;
+    laneAsBytes[3] = (low >> 24) & 0xFF;
+    laneAsBytes[4] = high & 0xFF;
+    laneAsBytes[5] = (high >> 8) & 0xFF;
+    laneAsBytes[6] = (high >> 16) & 0xFF;
+    laneAsBytes[7] = (high >> 24) & 0xFF;
+#endif
+    for(i=0; i<length; i++)
+        output[i] = input[i] ^ laneAsBytes[offset+i];
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * For the first laneCount lanes of the state, writes (input XOR state) to
+ * `output`, 8 bytes per lane. The state is not modified.
+ *
+ * Fixes relative to the previous version:
+ *  - on non-little-endian targets the second word of each output lane was
+ *    computed from the first input word; the XOR is now done byte by byte,
+ *    which also avoids unaligned 32-bit accesses to input and output;
+ *  - with NO_MISALIGNED_ACCESSES defined, input and output are now accessed
+ *    through memcpy, as the other lane routines of this file do.
+ */
+void KeccakP1600_ExtractAndAddLanes(const void *state, const unsigned char *input, unsigned char *output, unsigned int laneCount)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    const UINT32 * pI = (const UINT32 *)input;
+    UINT32 * pO = (UINT32 *)output;
+    const UINT32 * pS = (const UINT32 *)state;
+    UINT32 t, x0, x1;
+    int i;
+    for (i = laneCount-1; i >= 0; --i) {
+#ifdef NO_MISALIGNED_ACCESSES
+        UINT32 low;
+        UINT32 high;
+        UINT32 inLow;
+        UINT32 inHigh;
+        fromBitInterleaving(*(pS++), *(pS++), low, high, t, x0, x1);
+        /* Both input words are read before anything is written. */
+        memcpy(&inLow, pI++, 4);
+        memcpy(&inHigh, pI++, 4);
+        low ^= inLow;
+        high ^= inHigh;
+        memcpy(pO++, &low, 4);
+        memcpy(pO++, &high, 4);
+#else
+        /* The macro expands each argument exactly once, in order, and already
+           ends in ';', hence the post-increments and the missing semicolon. */
+        fromBitInterleavingAndXOR(*(pS++), *(pS++), *(pI++), *(pI++), *(pO++), *(pO++), t, x0, x1)
+#endif
+    }
+#else
+    unsigned int lanePosition;
+    for(lanePosition=0; lanePosition<laneCount; lanePosition++) {
+        const UINT32 *stateAsHalfLanes = (const UINT32*)state;
+        UINT32 low, high, temp, temp0, temp1;
+        UINT8 laneAsBytes[8];
+        unsigned int j;
+        fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1);
+        /* Lay the two 32-bit words out as bytes in little-endian order. */
+        laneAsBytes[0] = low & 0xFF;
+        laneAsBytes[1] = (low >> 8) & 0xFF;
+        laneAsBytes[2] = (low >> 16) & 0xFF;
+        laneAsBytes[3] = (low >> 24) & 0xFF;
+        laneAsBytes[4] = high & 0xFF;
+        laneAsBytes[5] = (high >> 8) & 0xFF;
+        laneAsBytes[6] = (high >> 16) & 0xFF;
+        laneAsBytes[7] = (high >> 24) & 0xFF;
+        for(j=0; j<8; j++)
+            output[lanePosition*8+j] = input[lanePosition*8+j] ^ laneAsBytes[j];
+    }
+#endif
+}
+/* ---------------------------------------------------------------- */
+
+/*
+ * Combines `length` bytes of the state, starting at byte `offset`, with
+ * `input` and writes the result to `output`. The work is delegated to
+ * SnP_ExtractAndAddBytes, which is handed the whole-lane and partial-lane
+ * routines of this file.
+ * NOTE(review): SnP_ExtractAndAddBytes is not defined in this file (presumably
+ * in SnP-Relaned.h) and the trailing 8 is presumably the lane size in bytes --
+ * confirm there.
+ */
+void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
+{
+ SnP_ExtractAndAddBytes(state, input, output, offset, length, KeccakP1600_ExtractAndAddLanes, KeccakP1600_ExtractAndAddBytesInLane, 8);
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Round constants for 24 rounds, two 32-bit words per round: the first word
+ * of a pair is XORed into state word 0 and the second into state word 1 by
+ * KeccakP1600_Permute_Nrounds. The extra final entry, 0xFF, is the terminator
+ * that ends the round loop there.
+ * NOTE(review): the pairs are presumably the bit-interleaved (even bits, odd
+ * bits) form of the 64-bit round constants -- confirm against the
+ * specification.
+ */
+static const UINT32 KeccakF1600RoundConstants_int2[2*24+1] =
+{
+ 0x00000001UL, 0x00000000UL,
+ 0x00000000UL, 0x00000089UL,
+ 0x00000000UL, 0x8000008bUL,
+ 0x00000000UL, 0x80008080UL,
+ 0x00000001UL, 0x0000008bUL,
+ 0x00000001UL, 0x00008000UL,
+ 0x00000001UL, 0x80008088UL,
+ 0x00000001UL, 0x80000082UL,
+ 0x00000000UL, 0x0000000bUL,
+ 0x00000000UL, 0x0000000aUL,
+ 0x00000001UL, 0x00008082UL,
+ 0x00000000UL, 0x00008003UL,
+ 0x00000001UL, 0x0000808bUL,
+ 0x00000001UL, 0x8000000bUL,
+ 0x00000001UL, 0x8000008aUL,
+ 0x00000001UL, 0x80000081UL,
+ 0x00000000UL, 0x80000081UL,
+ 0x00000000UL, 0x80000008UL,
+ 0x00000000UL, 0x00000083UL,
+ 0x00000000UL, 0x80008003UL,
+ 0x00000001UL, 0x80008088UL,
+ 0x00000000UL, 0x80000088UL,
+ 0x00000001UL, 0x00008000UL,
+ 0x00000000UL, 0x80008082UL,
+ 0x000000FFUL
+};
+
+/*
+ * KeccakAtoD_round0() .. KeccakAtoD_round3(): for one of the four unrolled
+ * rounds each, XOR together groups of five state half-words into the
+ * temporaries Cx, Cy, Cz, Cw and derive from them the ten values Da0/Da1,
+ * De0/De1, Di0/Di1, Do0/Do1, Du0/Du1 that the round body then XORs into the
+ * state words. Du0 and Du1 are used as intermediates first and receive their
+ * final values in the last two statements.
+ * The four macros differ only in which state word is read for each term,
+ * following where the preceding round stored its results.
+ * They rely on the A..., C... and D... names set up inside
+ * KeccakP1600_Permute_Nrounds and can only be expanded there.
+ * NOTE(review): this looks like the theta step of the permutation on
+ * bit-interleaved half-lanes -- confirm against the specification.
+ */
+#define KeccakAtoD_round0() \
+ Cx = Abu0^Agu0^Aku0^Amu0^Asu0; \
+ Du1 = Abe1^Age1^Ake1^Ame1^Ase1; \
+ Da0 = Cx^ROL32(Du1, 1); \
+ Cz = Abu1^Agu1^Aku1^Amu1^Asu1; \
+ Du0 = Abe0^Age0^Ake0^Ame0^Ase0; \
+ Da1 = Cz^Du0; \
+\
+ Cw = Abi0^Agi0^Aki0^Ami0^Asi0; \
+ Do0 = Cw^ROL32(Cz, 1); \
+ Cy = Abi1^Agi1^Aki1^Ami1^Asi1; \
+ Do1 = Cy^Cx; \
+\
+ Cx = Aba0^Aga0^Aka0^Ama0^Asa0; \
+ De0 = Cx^ROL32(Cy, 1); \
+ Cz = Aba1^Aga1^Aka1^Ama1^Asa1; \
+ De1 = Cz^Cw; \
+\
+ Cy = Abo1^Ago1^Ako1^Amo1^Aso1; \
+ Di0 = Du0^ROL32(Cy, 1); \
+ Cw = Abo0^Ago0^Ako0^Amo0^Aso0; \
+ Di1 = Du1^Cw; \
+\
+ Du0 = Cw^ROL32(Cz, 1); \
+ Du1 = Cy^Cx; \
+
+#define KeccakAtoD_round1() \
+ Cx = Asu0^Agu0^Amu0^Abu1^Aku1; \
+ Du1 = Age1^Ame0^Abe0^Ake1^Ase1; \
+ Da0 = Cx^ROL32(Du1, 1); \
+ Cz = Asu1^Agu1^Amu1^Abu0^Aku0; \
+ Du0 = Age0^Ame1^Abe1^Ake0^Ase0; \
+ Da1 = Cz^Du0; \
+\
+ Cw = Aki1^Asi1^Agi0^Ami1^Abi0; \
+ Do0 = Cw^ROL32(Cz, 1); \
+ Cy = Aki0^Asi0^Agi1^Ami0^Abi1; \
+ Do1 = Cy^Cx; \
+\
+ Cx = Aba0^Aka1^Asa0^Aga0^Ama1; \
+ De0 = Cx^ROL32(Cy, 1); \
+ Cz = Aba1^Aka0^Asa1^Aga1^Ama0; \
+ De1 = Cz^Cw; \
+\
+ Cy = Amo0^Abo1^Ako0^Aso1^Ago0; \
+ Di0 = Du0^ROL32(Cy, 1); \
+ Cw = Amo1^Abo0^Ako1^Aso0^Ago1; \
+ Di1 = Du1^Cw; \
+\
+ Du0 = Cw^ROL32(Cz, 1); \
+ Du1 = Cy^Cx; \
+
+#define KeccakAtoD_round2() \
+ Cx = Aku1^Agu0^Abu1^Asu1^Amu1; \
+ Du1 = Ame0^Ake0^Age0^Abe0^Ase1; \
+ Da0 = Cx^ROL32(Du1, 1); \
+ Cz = Aku0^Agu1^Abu0^Asu0^Amu0; \
+ Du0 = Ame1^Ake1^Age1^Abe1^Ase0; \
+ Da1 = Cz^Du0; \
+\
+ Cw = Agi1^Abi1^Asi1^Ami0^Aki1; \
+ Do0 = Cw^ROL32(Cz, 1); \
+ Cy = Agi0^Abi0^Asi0^Ami1^Aki0; \
+ Do1 = Cy^Cx; \
+\
+ Cx = Aba0^Asa1^Ama1^Aka1^Aga1; \
+ De0 = Cx^ROL32(Cy, 1); \
+ Cz = Aba1^Asa0^Ama0^Aka0^Aga0; \
+ De1 = Cz^Cw; \
+\
+ Cy = Aso0^Amo0^Ako1^Ago0^Abo0; \
+ Di0 = Du0^ROL32(Cy, 1); \
+ Cw = Aso1^Amo1^Ako0^Ago1^Abo1; \
+ Di1 = Du1^Cw; \
+\
+ Du0 = Cw^ROL32(Cz, 1); \
+ Du1 = Cy^Cx; \
+
+#define KeccakAtoD_round3() \
+ Cx = Amu1^Agu0^Asu1^Aku0^Abu0; \
+ Du1 = Ake0^Abe1^Ame1^Age0^Ase1; \
+ Da0 = Cx^ROL32(Du1, 1); \
+ Cz = Amu0^Agu1^Asu0^Aku1^Abu1; \
+ Du0 = Ake1^Abe0^Ame0^Age1^Ase0; \
+ Da1 = Cz^Du0; \
+\
+ Cw = Asi0^Aki0^Abi1^Ami1^Agi1; \
+ Do0 = Cw^ROL32(Cz, 1); \
+ Cy = Asi1^Aki1^Abi0^Ami0^Agi0; \
+ Do1 = Cy^Cx; \
+\
+ Cx = Aba0^Ama0^Aga1^Asa1^Aka0; \
+ De0 = Cx^ROL32(Cy, 1); \
+ Cz = Aba1^Ama1^Aga0^Asa0^Aka1; \
+ De1 = Cz^Cw; \
+\
+ Cy = Ago1^Aso0^Ako0^Abo0^Amo1; \
+ Di0 = Du0^ROL32(Cy, 1); \
+ Cw = Ago0^Aso1^Ako1^Abo1^Amo0; \
+ Di1 = Du1^Cw; \
+\
+ Du0 = Cw^ROL32(Cz, 1); \
+ Du1 = Cy^Cx; \
+
+/*
+ * Applies nRounds rounds of the permutation to the bit-interleaved state,
+ * in place.
+ *
+ * The loop body holds four unrolled rounds and consumes eight entries of
+ * KeccakF1600RoundConstants_int2 per pass (two 32-bit words per round, XORed
+ * into Aba0 and Aba1). Iteration starts (24-nRounds)*2 entries into the table
+ * and stops only when the pointer sits on the 0xFF terminator after a pass,
+ * so nRounds has to be a non-zero multiple of 4 not larger than 24; any other
+ * value would read past the end of the table.
+ *
+ * Within a pass each round stores its results in different state words than
+ * it read them from (round 0, for instance, writes its first group to Aba0,
+ * Age0, Aki1, Amo1 and Asu0), and the fourth round writes back in natural
+ * order. This is why the four KeccakAtoD_roundN() macros and the four round
+ * bodies each address the same 50 words differently.
+ */
+void KeccakP1600_Permute_Nrounds(void *state, unsigned int nRounds)
+{
+ {
+ UINT32 Da0, De0, Di0, Do0, Du0;
+ UINT32 Da1, De1, Di1, Do1, Du1;
+ UINT32 Ca0, Ce0, Ci0, Co0, Cu0;
+ UINT32 Cx, Cy, Cz, Cw;
+ /* Ba..Bu reuse the storage of Ca0..Cu0. */
+ #define Ba Ca0
+ #define Be Ce0
+ #define Bi Ci0
+ #define Bo Co0
+ #define Bu Cu0
+ const UINT32 *pRoundConstants = KeccakF1600RoundConstants_int2+(24-nRounds)*2;
+ UINT32 *stateAsHalfLanes = (UINT32*)state;
+ /* Names for the 50 state words: lane name followed by 0 or 1 for the
+    first or second half-lane of that lane. */
+ #define Aba0 stateAsHalfLanes[ 0]
+ #define Aba1 stateAsHalfLanes[ 1]
+ #define Abe0 stateAsHalfLanes[ 2]
+ #define Abe1 stateAsHalfLanes[ 3]
+ #define Abi0 stateAsHalfLanes[ 4]
+ #define Abi1 stateAsHalfLanes[ 5]
+ #define Abo0 stateAsHalfLanes[ 6]
+ #define Abo1 stateAsHalfLanes[ 7]
+ #define Abu0 stateAsHalfLanes[ 8]
+ #define Abu1 stateAsHalfLanes[ 9]
+ #define Aga0 stateAsHalfLanes[10]
+ #define Aga1 stateAsHalfLanes[11]
+ #define Age0 stateAsHalfLanes[12]
+ #define Age1 stateAsHalfLanes[13]
+ #define Agi0 stateAsHalfLanes[14]
+ #define Agi1 stateAsHalfLanes[15]
+ #define Ago0 stateAsHalfLanes[16]
+ #define Ago1 stateAsHalfLanes[17]
+ #define Agu0 stateAsHalfLanes[18]
+ #define Agu1 stateAsHalfLanes[19]
+ #define Aka0 stateAsHalfLanes[20]
+ #define Aka1 stateAsHalfLanes[21]
+ #define Ake0 stateAsHalfLanes[22]
+ #define Ake1 stateAsHalfLanes[23]
+ #define Aki0 stateAsHalfLanes[24]
+ #define Aki1 stateAsHalfLanes[25]
+ #define Ako0 stateAsHalfLanes[26]
+ #define Ako1 stateAsHalfLanes[27]
+ #define Aku0 stateAsHalfLanes[28]
+ #define Aku1 stateAsHalfLanes[29]
+ #define Ama0 stateAsHalfLanes[30]
+ #define Ama1 stateAsHalfLanes[31]
+ #define Ame0 stateAsHalfLanes[32]
+ #define Ame1 stateAsHalfLanes[33]
+ #define Ami0 stateAsHalfLanes[34]
+ #define Ami1 stateAsHalfLanes[35]
+ #define Amo0 stateAsHalfLanes[36]
+ #define Amo1 stateAsHalfLanes[37]
+ #define Amu0 stateAsHalfLanes[38]
+ #define Amu1 stateAsHalfLanes[39]
+ #define Asa0 stateAsHalfLanes[40]
+ #define Asa1 stateAsHalfLanes[41]
+ #define Ase0 stateAsHalfLanes[42]
+ #define Ase1 stateAsHalfLanes[43]
+ #define Asi0 stateAsHalfLanes[44]
+ #define Asi1 stateAsHalfLanes[45]
+ #define Aso0 stateAsHalfLanes[46]
+ #define Aso1 stateAsHalfLanes[47]
+ #define Asu0 stateAsHalfLanes[48]
+ #define Asu1 stateAsHalfLanes[49]
+
+ do
+ {
+ /* --- Code for 4 rounds */
+
+ /* --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words */
+
+ /* First of the four unrolled rounds. */
+ KeccakAtoD_round0();
+
+ Ba = (Aba0^Da0);
+ Be = ROL32((Age0^De0), 22);
+ Bi = ROL32((Aki1^Di1), 22);
+ Bo = ROL32((Amo1^Do1), 11);
+ Bu = ROL32((Asu0^Du0), 7);
+ Aba0 = Ba ^((~Be)& Bi );
+ Aba0 ^= *(pRoundConstants++);
+ Age0 = Be ^((~Bi)& Bo );
+ Aki1 = Bi ^((~Bo)& Bu );
+ Amo1 = Bo ^((~Bu)& Ba );
+ Asu0 = Bu ^((~Ba)& Be );
+
+ Ba = (Aba1^Da1);
+ Be = ROL32((Age1^De1), 22);
+ Bi = ROL32((Aki0^Di0), 21);
+ Bo = ROL32((Amo0^Do0), 10);
+ Bu = ROL32((Asu1^Du1), 7);
+ Aba1 = Ba ^((~Be)& Bi );
+ Aba1 ^= *(pRoundConstants++);
+ Age1 = Be ^((~Bi)& Bo );
+ Aki0 = Bi ^((~Bo)& Bu );
+ Amo0 = Bo ^((~Bu)& Ba );
+ Asu1 = Bu ^((~Ba)& Be );
+
+ Bi = ROL32((Aka1^Da1), 2);
+ Bo = ROL32((Ame1^De1), 23);
+ Bu = ROL32((Asi1^Di1), 31);
+ Ba = ROL32((Abo0^Do0), 14);
+ Be = ROL32((Agu0^Du0), 10);
+ Aka1 = Ba ^((~Be)& Bi );
+ Ame1 = Be ^((~Bi)& Bo );
+ Asi1 = Bi ^((~Bo)& Bu );
+ Abo0 = Bo ^((~Bu)& Ba );
+ Agu0 = Bu ^((~Ba)& Be );
+
+ Bi = ROL32((Aka0^Da0), 1);
+ Bo = ROL32((Ame0^De0), 22);
+ Bu = ROL32((Asi0^Di0), 30);
+ Ba = ROL32((Abo1^Do1), 14);
+ Be = ROL32((Agu1^Du1), 10);
+ Aka0 = Ba ^((~Be)& Bi );
+ Ame0 = Be ^((~Bi)& Bo );
+ Asi0 = Bi ^((~Bo)& Bu );
+ Abo1 = Bo ^((~Bu)& Ba );
+ Agu1 = Bu ^((~Ba)& Be );
+
+ Bu = ROL32((Asa0^Da0), 9);
+ Ba = ROL32((Abe1^De1), 1);
+ Be = ROL32((Agi0^Di0), 3);
+ Bi = ROL32((Ako1^Do1), 13);
+ Bo = ROL32((Amu0^Du0), 4);
+ Asa0 = Ba ^((~Be)& Bi );
+ Abe1 = Be ^((~Bi)& Bo );
+ Agi0 = Bi ^((~Bo)& Bu );
+ Ako1 = Bo ^((~Bu)& Ba );
+ Amu0 = Bu ^((~Ba)& Be );
+
+ Bu = ROL32((Asa1^Da1), 9);
+ Ba = (Abe0^De0);
+ Be = ROL32((Agi1^Di1), 3);
+ Bi = ROL32((Ako0^Do0), 12);
+ Bo = ROL32((Amu1^Du1), 4);
+ Asa1 = Ba ^((~Be)& Bi );
+ Abe0 = Be ^((~Bi)& Bo );
+ Agi1 = Bi ^((~Bo)& Bu );
+ Ako0 = Bo ^((~Bu)& Ba );
+ Amu1 = Bu ^((~Ba)& Be );
+
+ Be = ROL32((Aga0^Da0), 18);
+ Bi = ROL32((Ake0^De0), 5);
+ Bo = ROL32((Ami1^Di1), 8);
+ Bu = ROL32((Aso0^Do0), 28);
+ Ba = ROL32((Abu1^Du1), 14);
+ Aga0 = Ba ^((~Be)& Bi );
+ Ake0 = Be ^((~Bi)& Bo );
+ Ami1 = Bi ^((~Bo)& Bu );
+ Aso0 = Bo ^((~Bu)& Ba );
+ Abu1 = Bu ^((~Ba)& Be );
+
+ Be = ROL32((Aga1^Da1), 18);
+ Bi = ROL32((Ake1^De1), 5);
+ Bo = ROL32((Ami0^Di0), 7);
+ Bu = ROL32((Aso1^Do1), 28);
+ Ba = ROL32((Abu0^Du0), 13);
+ Aga1 = Ba ^((~Be)& Bi );
+ Ake1 = Be ^((~Bi)& Bo );
+ Ami0 = Bi ^((~Bo)& Bu );
+ Aso1 = Bo ^((~Bu)& Ba );
+ Abu0 = Bu ^((~Ba)& Be );
+
+ Bo = ROL32((Ama1^Da1), 21);
+ Bu = ROL32((Ase0^De0), 1);
+ Ba = ROL32((Abi0^Di0), 31);
+ Be = ROL32((Ago1^Do1), 28);
+ Bi = ROL32((Aku1^Du1), 20);
+ Ama1 = Ba ^((~Be)& Bi );
+ Ase0 = Be ^((~Bi)& Bo );
+ Abi0 = Bi ^((~Bo)& Bu );
+ Ago1 = Bo ^((~Bu)& Ba );
+ Aku1 = Bu ^((~Ba)& Be );
+
+ Bo = ROL32((Ama0^Da0), 20);
+ Bu = ROL32((Ase1^De1), 1);
+ Ba = ROL32((Abi1^Di1), 31);
+ Be = ROL32((Ago0^Do0), 27);
+ Bi = ROL32((Aku0^Du0), 19);
+ Ama0 = Ba ^((~Be)& Bi );
+ Ase1 = Be ^((~Bi)& Bo );
+ Abi1 = Bi ^((~Bo)& Bu );
+ Ago0 = Bo ^((~Bu)& Ba );
+ Aku0 = Bu ^((~Ba)& Be );
+
+ /* Second of the four unrolled rounds. */
+ KeccakAtoD_round1();
+
+ Ba = (Aba0^Da0);
+ Be = ROL32((Ame1^De0), 22);
+ Bi = ROL32((Agi1^Di1), 22);
+ Bo = ROL32((Aso1^Do1), 11);
+ Bu = ROL32((Aku1^Du0), 7);
+ Aba0 = Ba ^((~Be)& Bi );
+ Aba0 ^= *(pRoundConstants++);
+ Ame1 = Be ^((~Bi)& Bo );
+ Agi1 = Bi ^((~Bo)& Bu );
+ Aso1 = Bo ^((~Bu)& Ba );
+ Aku1 = Bu ^((~Ba)& Be );
+
+ Ba = (Aba1^Da1);
+ Be = ROL32((Ame0^De1), 22);
+ Bi = ROL32((Agi0^Di0), 21);
+ Bo = ROL32((Aso0^Do0), 10);
+ Bu = ROL32((Aku0^Du1), 7);
+ Aba1 = Ba ^((~Be)& Bi );
+ Aba1 ^= *(pRoundConstants++);
+ Ame0 = Be ^((~Bi)& Bo );
+ Agi0 = Bi ^((~Bo)& Bu );
+ Aso0 = Bo ^((~Bu)& Ba );
+ Aku0 = Bu ^((~Ba)& Be );
+
+ Bi = ROL32((Asa1^Da1), 2);
+ Bo = ROL32((Ake1^De1), 23);
+ Bu = ROL32((Abi1^Di1), 31);
+ Ba = ROL32((Amo1^Do0), 14);
+ Be = ROL32((Agu0^Du0), 10);
+ Asa1 = Ba ^((~Be)& Bi );
+ Ake1 = Be ^((~Bi)& Bo );
+ Abi1 = Bi ^((~Bo)& Bu );
+ Amo1 = Bo ^((~Bu)& Ba );
+ Agu0 = Bu ^((~Ba)& Be );
+
+ Bi = ROL32((Asa0^Da0), 1);
+ Bo = ROL32((Ake0^De0), 22);
+ Bu = ROL32((Abi0^Di0), 30);
+ Ba = ROL32((Amo0^Do1), 14);
+ Be = ROL32((Agu1^Du1), 10);
+ Asa0 = Ba ^((~Be)& Bi );
+ Ake0 = Be ^((~Bi)& Bo );
+ Abi0 = Bi ^((~Bo)& Bu );
+ Amo0 = Bo ^((~Bu)& Ba );
+ Agu1 = Bu ^((~Ba)& Be );
+
+ Bu = ROL32((Ama1^Da0), 9);
+ Ba = ROL32((Age1^De1), 1);
+ Be = ROL32((Asi1^Di0), 3);
+ Bi = ROL32((Ako0^Do1), 13);
+ Bo = ROL32((Abu1^Du0), 4);
+ Ama1 = Ba ^((~Be)& Bi );
+ Age1 = Be ^((~Bi)& Bo );
+ Asi1 = Bi ^((~Bo)& Bu );
+ Ako0 = Bo ^((~Bu)& Ba );
+ Abu1 = Bu ^((~Ba)& Be );
+
+ Bu = ROL32((Ama0^Da1), 9);
+ Ba = (Age0^De0);
+ Be = ROL32((Asi0^Di1), 3);
+ Bi = ROL32((Ako1^Do0), 12);
+ Bo = ROL32((Abu0^Du1), 4);
+ Ama0 = Ba ^((~Be)& Bi );
+ Age0 = Be ^((~Bi)& Bo );
+ Asi0 = Bi ^((~Bo)& Bu );
+ Ako1 = Bo ^((~Bu)& Ba );
+ Abu0 = Bu ^((~Ba)& Be );
+
+ Be = ROL32((Aka1^Da0), 18);
+ Bi = ROL32((Abe1^De0), 5);
+ Bo = ROL32((Ami0^Di1), 8);
+ Bu = ROL32((Ago1^Do0), 28);
+ Ba = ROL32((Asu1^Du1), 14);
+ Aka1 = Ba ^((~Be)& Bi );
+ Abe1 = Be ^((~Bi)& Bo );
+ Ami0 = Bi ^((~Bo)& Bu );
+ Ago1 = Bo ^((~Bu)& Ba );
+ Asu1 = Bu ^((~Ba)& Be );
+
+ Be = ROL32((Aka0^Da1), 18);
+ Bi = ROL32((Abe0^De1), 5);
+ Bo = ROL32((Ami1^Di0), 7);
+ Bu = ROL32((Ago0^Do1), 28);
+ Ba = ROL32((Asu0^Du0), 13);
+ Aka0 = Ba ^((~Be)& Bi );
+ Abe0 = Be ^((~Bi)& Bo );
+ Ami1 = Bi ^((~Bo)& Bu );
+ Ago0 = Bo ^((~Bu)& Ba );
+ Asu0 = Bu ^((~Ba)& Be );
+
+ Bo = ROL32((Aga1^Da1), 21);
+ Bu = ROL32((Ase0^De0), 1);
+ Ba = ROL32((Aki1^Di0), 31);
+ Be = ROL32((Abo1^Do1), 28);
+ Bi = ROL32((Amu1^Du1), 20);
+ Aga1 = Ba ^((~Be)& Bi );
+ Ase0 = Be ^((~Bi)& Bo );
+ Aki1 = Bi ^((~Bo)& Bu );
+ Abo1 = Bo ^((~Bu)& Ba );
+ Amu1 = Bu ^((~Ba)& Be );
+
+ Bo = ROL32((Aga0^Da0), 20);
+ Bu = ROL32((Ase1^De1), 1);
+ Ba = ROL32((Aki0^Di1), 31);
+ Be = ROL32((Abo0^Do0), 27);
+ Bi = ROL32((Amu0^Du0), 19);
+ Aga0 = Ba ^((~Be)& Bi );
+ Ase1 = Be ^((~Bi)& Bo );
+ Aki0 = Bi ^((~Bo)& Bu );
+ Abo0 = Bo ^((~Bu)& Ba );
+ Amu0 = Bu ^((~Ba)& Be );
+
+ /* Third of the four unrolled rounds. */
+ KeccakAtoD_round2();
+
+ Ba = (Aba0^Da0);
+ Be = ROL32((Ake1^De0), 22);
+ Bi = ROL32((Asi0^Di1), 22);
+ Bo = ROL32((Ago0^Do1), 11);
+ Bu = ROL32((Amu1^Du0), 7);
+ Aba0 = Ba ^((~Be)& Bi );
+ Aba0 ^= *(pRoundConstants++);
+ Ake1 = Be ^((~Bi)& Bo );
+ Asi0 = Bi ^((~Bo)& Bu );
+ Ago0 = Bo ^((~Bu)& Ba );
+ Amu1 = Bu ^((~Ba)& Be );
+
+ Ba = (Aba1^Da1);
+ Be = ROL32((Ake0^De1), 22);
+ Bi = ROL32((Asi1^Di0), 21);
+ Bo = ROL32((Ago1^Do0), 10);
+ Bu = ROL32((Amu0^Du1), 7);
+ Aba1 = Ba ^((~Be)& Bi );
+ Aba1 ^= *(pRoundConstants++);
+ Ake0 = Be ^((~Bi)& Bo );
+ Asi1 = Bi ^((~Bo)& Bu );
+ Ago1 = Bo ^((~Bu)& Ba );
+ Amu0 = Bu ^((~Ba)& Be );
+
+ Bi = ROL32((Ama0^Da1), 2);
+ Bo = ROL32((Abe0^De1), 23);
+ Bu = ROL32((Aki0^Di1), 31);
+ Ba = ROL32((Aso1^Do0), 14);
+ Be = ROL32((Agu0^Du0), 10);
+ Ama0 = Ba ^((~Be)& Bi );
+ Abe0 = Be ^((~Bi)& Bo );
+ Aki0 = Bi ^((~Bo)& Bu );
+ Aso1 = Bo ^((~Bu)& Ba );
+ Agu0 = Bu ^((~Ba)& Be );
+
+ Bi = ROL32((Ama1^Da0), 1);
+ Bo = ROL32((Abe1^De0), 22);
+ Bu = ROL32((Aki1^Di0), 30);
+ Ba = ROL32((Aso0^Do1), 14);
+ Be = ROL32((Agu1^Du1), 10);
+ Ama1 = Ba ^((~Be)& Bi );
+ Abe1 = Be ^((~Bi)& Bo );
+ Aki1 = Bi ^((~Bo)& Bu );
+ Aso0 = Bo ^((~Bu)& Ba );
+ Agu1 = Bu ^((~Ba)& Be );
+
+ Bu = ROL32((Aga1^Da0), 9);
+ Ba = ROL32((Ame0^De1), 1);
+ Be = ROL32((Abi1^Di0), 3);
+ Bi = ROL32((Ako1^Do1), 13);
+ Bo = ROL32((Asu1^Du0), 4);
+ Aga1 = Ba ^((~Be)& Bi );
+ Ame0 = Be ^((~Bi)& Bo );
+ Abi1 = Bi ^((~Bo)& Bu );
+ Ako1 = Bo ^((~Bu)& Ba );
+ Asu1 = Bu ^((~Ba)& Be );
+
+ Bu = ROL32((Aga0^Da1), 9);
+ Ba = (Ame1^De0);
+ Be = ROL32((Abi0^Di1), 3);
+ Bi = ROL32((Ako0^Do0), 12);
+ Bo = ROL32((Asu0^Du1), 4);
+ Aga0 = Ba ^((~Be)& Bi );
+ Ame1 = Be ^((~Bi)& Bo );
+ Abi0 = Bi ^((~Bo)& Bu );
+ Ako0 = Bo ^((~Bu)& Ba );
+ Asu0 = Bu ^((~Ba)& Be );
+
+ Be = ROL32((Asa1^Da0), 18);
+ Bi = ROL32((Age1^De0), 5);
+ Bo = ROL32((Ami1^Di1), 8);
+ Bu = ROL32((Abo1^Do0), 28);
+ Ba = ROL32((Aku0^Du1), 14);
+ Asa1 = Ba ^((~Be)& Bi );
+ Age1 = Be ^((~Bi)& Bo );
+ Ami1 = Bi ^((~Bo)& Bu );
+ Abo1 = Bo ^((~Bu)& Ba );
+ Aku0 = Bu ^((~Ba)& Be );
+
+ Be = ROL32((Asa0^Da1), 18);
+ Bi = ROL32((Age0^De1), 5);
+ Bo = ROL32((Ami0^Di0), 7);
+ Bu = ROL32((Abo0^Do1), 28);
+ Ba = ROL32((Aku1^Du0), 13);
+ Asa0 = Ba ^((~Be)& Bi );
+ Age0 = Be ^((~Bi)& Bo );
+ Ami0 = Bi ^((~Bo)& Bu );
+ Abo0 = Bo ^((~Bu)& Ba );
+ Aku1 = Bu ^((~Ba)& Be );
+
+ Bo = ROL32((Aka0^Da1), 21);
+ Bu = ROL32((Ase0^De0), 1);
+ Ba = ROL32((Agi1^Di0), 31);
+ Be = ROL32((Amo0^Do1), 28);
+ Bi = ROL32((Abu0^Du1), 20);
+ Aka0 = Ba ^((~Be)& Bi );
+ Ase0 = Be ^((~Bi)& Bo );
+ Agi1 = Bi ^((~Bo)& Bu );
+ Amo0 = Bo ^((~Bu)& Ba );
+ Abu0 = Bu ^((~Ba)& Be );
+
+ Bo = ROL32((Aka1^Da0), 20);
+ Bu = ROL32((Ase1^De1), 1);
+ Ba = ROL32((Agi0^Di1), 31);
+ Be = ROL32((Amo1^Do0), 27);
+ Bi = ROL32((Abu1^Du0), 19);
+ Aka1 = Ba ^((~Be)& Bi );
+ Ase1 = Be ^((~Bi)& Bo );
+ Agi0 = Bi ^((~Bo)& Bu );
+ Amo1 = Bo ^((~Bu)& Ba );
+ Abu1 = Bu ^((~Ba)& Be );
+
+ /* Fourth of the four unrolled rounds; results land in natural order. */
+ KeccakAtoD_round3();
+
+ Ba = (Aba0^Da0);
+ Be = ROL32((Abe0^De0), 22);
+ Bi = ROL32((Abi0^Di1), 22);
+ Bo = ROL32((Abo0^Do1), 11);
+ Bu = ROL32((Abu0^Du0), 7);
+ Aba0 = Ba ^((~Be)& Bi );
+ Aba0 ^= *(pRoundConstants++);
+ Abe0 = Be ^((~Bi)& Bo );
+ Abi0 = Bi ^((~Bo)& Bu );
+ Abo0 = Bo ^((~Bu)& Ba );
+ Abu0 = Bu ^((~Ba)& Be );
+
+ Ba = (Aba1^Da1);
+ Be = ROL32((Abe1^De1), 22);
+ Bi = ROL32((Abi1^Di0), 21);
+ Bo = ROL32((Abo1^Do0), 10);
+ Bu = ROL32((Abu1^Du1), 7);
+ Aba1 = Ba ^((~Be)& Bi );
+ Aba1 ^= *(pRoundConstants++);
+ Abe1 = Be ^((~Bi)& Bo );
+ Abi1 = Bi ^((~Bo)& Bu );
+ Abo1 = Bo ^((~Bu)& Ba );
+ Abu1 = Bu ^((~Ba)& Be );
+
+ Bi = ROL32((Aga0^Da1), 2);
+ Bo = ROL32((Age0^De1), 23);
+ Bu = ROL32((Agi0^Di1), 31);
+ Ba = ROL32((Ago0^Do0), 14);
+ Be = ROL32((Agu0^Du0), 10);
+ Aga0 = Ba ^((~Be)& Bi );
+ Age0 = Be ^((~Bi)& Bo );
+ Agi0 = Bi ^((~Bo)& Bu );
+ Ago0 = Bo ^((~Bu)& Ba );
+ Agu0 = Bu ^((~Ba)& Be );
+
+ Bi = ROL32((Aga1^Da0), 1);
+ Bo = ROL32((Age1^De0), 22);
+ Bu = ROL32((Agi1^Di0), 30);
+ Ba = ROL32((Ago1^Do1), 14);
+ Be = ROL32((Agu1^Du1), 10);
+ Aga1 = Ba ^((~Be)& Bi );
+ Age1 = Be ^((~Bi)& Bo );
+ Agi1 = Bi ^((~Bo)& Bu );
+ Ago1 = Bo ^((~Bu)& Ba );
+ Agu1 = Bu ^((~Ba)& Be );
+
+ Bu = ROL32((Aka0^Da0), 9);
+ Ba = ROL32((Ake0^De1), 1);
+ Be = ROL32((Aki0^Di0), 3);
+ Bi = ROL32((Ako0^Do1), 13);
+ Bo = ROL32((Aku0^Du0), 4);
+ Aka0 = Ba ^((~Be)& Bi );
+ Ake0 = Be ^((~Bi)& Bo );
+ Aki0 = Bi ^((~Bo)& Bu );
+ Ako0 = Bo ^((~Bu)& Ba );
+ Aku0 = Bu ^((~Ba)& Be );
+
+ Bu = ROL32((Aka1^Da1), 9);
+ Ba = (Ake1^De0);
+ Be = ROL32((Aki1^Di1), 3);
+ Bi = ROL32((Ako1^Do0), 12);
+ Bo = ROL32((Aku1^Du1), 4);
+ Aka1 = Ba ^((~Be)& Bi );
+ Ake1 = Be ^((~Bi)& Bo );
+ Aki1 = Bi ^((~Bo)& Bu );
+ Ako1 = Bo ^((~Bu)& Ba );
+ Aku1 = Bu ^((~Ba)& Be );
+
+ Be = ROL32((Ama0^Da0), 18);
+ Bi = ROL32((Ame0^De0), 5);
+ Bo = ROL32((Ami0^Di1), 8);
+ Bu = ROL32((Amo0^Do0), 28);
+ Ba = ROL32((Amu0^Du1), 14);
+ Ama0 = Ba ^((~Be)& Bi );
+ Ame0 = Be ^((~Bi)& Bo );
+ Ami0 = Bi ^((~Bo)& Bu );
+ Amo0 = Bo ^((~Bu)& Ba );
+ Amu0 = Bu ^((~Ba)& Be );
+
+ Be = ROL32((Ama1^Da1), 18);
+ Bi = ROL32((Ame1^De1), 5);
+ Bo = ROL32((Ami1^Di0), 7);
+ Bu = ROL32((Amo1^Do1), 28);
+ Ba = ROL32((Amu1^Du0), 13);
+ Ama1 = Ba ^((~Be)& Bi );
+ Ame1 = Be ^((~Bi)& Bo );
+ Ami1 = Bi ^((~Bo)& Bu );
+ Amo1 = Bo ^((~Bu)& Ba );
+ Amu1 = Bu ^((~Ba)& Be );
+
+ Bo = ROL32((Asa0^Da1), 21);
+ Bu = ROL32((Ase0^De0), 1);
+ Ba = ROL32((Asi0^Di0), 31);
+ Be = ROL32((Aso0^Do1), 28);
+ Bi = ROL32((Asu0^Du1), 20);
+ Asa0 = Ba ^((~Be)& Bi );
+ Ase0 = Be ^((~Bi)& Bo );
+ Asi0 = Bi ^((~Bo)& Bu );
+ Aso0 = Bo ^((~Bu)& Ba );
+ Asu0 = Bu ^((~Ba)& Be );
+
+ Bo = ROL32((Asa1^Da0), 20);
+ Bu = ROL32((Ase1^De1), 1);
+ Ba = ROL32((Asi1^Di1), 31);
+ Be = ROL32((Aso1^Do0), 27);
+ Bi = ROL32((Asu1^Du0), 19);
+ Asa1 = Ba ^((~Be)& Bi );
+ Ase1 = Be ^((~Bi)& Bo );
+ Asi1 = Bi ^((~Bo)& Bu );
+ Aso1 = Bo ^((~Bu)& Ba );
+ Asu1 = Bu ^((~Ba)& Be );
+ }
+ /* Stop once the pointer has reached the table's 0xFF terminator. */
+ while ( *pRoundConstants != 0xFF );
+
+ #undef Aba0
+ #undef Aba1
+ #undef Abe0
+ #undef Abe1
+ #undef Abi0
+ #undef Abi1
+ #undef Abo0
+ #undef Abo1
+ #undef Abu0
+ #undef Abu1
+ #undef Aga0
+ #undef Aga1
+ #undef Age0
+ #undef Age1
+ #undef Agi0
+ #undef Agi1
+ #undef Ago0
+ #undef Ago1
+ #undef Agu0
+ #undef Agu1
+ #undef Aka0
+ #undef Aka1
+ #undef Ake0
+ #undef Ake1
+ #undef Aki0
+ #undef Aki1
+ #undef Ako0
+ #undef Ako1
+ #undef Aku0
+ #undef Aku1
+ #undef Ama0
+ #undef Ama1
+ #undef Ame0
+ #undef Ame1
+ #undef Ami0
+ #undef Ami1
+ #undef Amo0
+ #undef Amo1
+ #undef Amu0
+ #undef Amu1
+ #undef Asa0
+ #undef Asa1
+ #undef Ase0
+ #undef Ase1
+ #undef Asi0
+ #undef Asi1
+ #undef Aso0
+ #undef Aso1
+ #undef Asu0
+ #undef Asu1
+ }
+}
+
+/* ---------------------------------------------------------------- */
+
+/* Applies 12 rounds of the permutation to the state, in place. */
+void KeccakP1600_Permute_12rounds(void *state)
+{
+ KeccakP1600_Permute_Nrounds(state, 12);
+}
+
+/* ---------------------------------------------------------------- */
+
+/* Applies 24 rounds of the permutation to the state, in place. */
+void KeccakP1600_Permute_24rounds(void *state)
+{
+ KeccakP1600_Permute_Nrounds(state, 24);
+}
diff --git a/Modules/_sha3/kcp/KeccakP-1600-opt64-config.h b/Modules/_sha3/kcp/KeccakP-1600-opt64-config.h
new file mode 100644
index 0000000000..9501c64b18
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakP-1600-opt64-config.h
@@ -0,0 +1,3 @@
+#define KeccakP1600_implementation_config "lane complementing, all rounds unrolled" /* human-readable summary of the two options defined below */
+#define KeccakP1600_fullUnrolling /* makes KeccakP-1600-opt64.c define FullUnrolling before it includes the unrolling macros */
+#define KeccakP1600_useLaneComplementing /* makes KeccakP-1600-opt64.c define UseBebigokimisa and store lanes 1, 2, 8, 12, 17 and 20 complemented */
diff --git a/Modules/_sha3/kcp/KeccakP-1600-opt64.c b/Modules/_sha3/kcp/KeccakP-1600-opt64.c
new file mode 100644
index 0000000000..c90010dd92
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakP-1600-opt64.c
@@ -0,0 +1,474 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <string.h>
+#include <stdlib.h>
+/* #include "brg_endian.h" */
+#include "KeccakP-1600-opt64-config.h"
+/*
+ * UINT8 is only declared here when NOT_PYTHON evaluates to non-zero.
+ * Otherwise it (like UINT64, whose typedef is commented out below) is
+ * presumably supplied by the including build -- TODO confirm.
+ */
+#if NOT_PYTHON
+typedef unsigned char UINT8;
+/* typedef unsigned long long int UINT64; */
+#endif
+/*
+ * Translate the lane-complementing option of the config header into
+ * UseBebigokimisa, presumably the name tested by KeccakP-1600-64.macros
+ * (included further down, not visible here) -- TODO confirm.
+ */
+#if defined(KeccakP1600_useLaneComplementing)
+#define UseBebigokimisa
+#endif
+
+/*
+ * ROL64(a, offset): rotate the 64-bit value a left by offset bits.
+ *
+ * Three variants are selected at compile time:
+ *  - _MSC_VER: the _rotl64() intrinsic;
+ *  - KeccakP1600_useSHLD: an inline-assembly "shld" instruction wrapped in a
+ *    GNU statement expression (the rotation count must be an immediate);
+ *  - otherwise: a portable shift/xor expression.
+ *
+ * In the portable form offset must be in the range 1..63, because shifting a
+ * 64-bit value by 64 is undefined in C. The macro parameters are
+ * parenthesised so that an expression argument is cast and shifted as a
+ * whole; a and offset are evaluated more than once, so they must not have
+ * side effects.
+ */
+#if defined(_MSC_VER)
+#define ROL64(a, offset) _rotl64(a, offset)
+#elif defined(KeccakP1600_useSHLD)
+    #define ROL64(x,N) ({ \
+    register UINT64 __out; \
+    register UINT64 __in = (x); \
+    __asm__ ("shld %2,%0,%0" : "=r"(__out) : "0"(__in), "i"(N)); \
+    __out; \
+    })
+#else
+#define ROL64(a, offset) ((((UINT64)(a)) << (offset)) ^ (((UINT64)(a)) >> (64-(offset))))
+#endif
+/*
+ * Include the 64-bit macro file, then select the unrolling strategy that
+ * KeccakP-1600-unrolling.macros uses to define rounds24 and rounds12:
+ * FullUnrolling when the config header requests it, otherwise Unrolling set
+ * to KeccakP1600_unrolling.
+ */
+#include "KeccakP-1600-64.macros"
+#ifdef KeccakP1600_fullUnrolling
+#define FullUnrolling
+#else
+#define Unrolling KeccakP1600_unrolling
+#endif
+#include "KeccakP-1600-unrolling.macros"
+#include "SnP-Relaned.h"
+/*
+ * Table of 24 64-bit constants. It is not referenced by name in the
+ * functions visible here; presumably the round macros from
+ * KeccakP-1600-64.macros index it by round number -- TODO confirm.
+ */
+static const UINT64 KeccakF1600RoundConstants[24] = {
+ 0x0000000000000001ULL,
+ 0x0000000000008082ULL,
+ 0x800000000000808aULL,
+ 0x8000000080008000ULL,
+ 0x000000000000808bULL,
+ 0x0000000080000001ULL,
+ 0x8000000080008081ULL,
+ 0x8000000000008009ULL,
+ 0x000000000000008aULL,
+ 0x0000000000000088ULL,
+ 0x0000000080008009ULL,
+ 0x000000008000000aULL,
+ 0x000000008000808bULL,
+ 0x800000000000008bULL,
+ 0x8000000000008089ULL,
+ 0x8000000000008003ULL,
+ 0x8000000000008002ULL,
+ 0x8000000000000080ULL,
+ 0x000000000000800aULL,
+ 0x800000008000000aULL,
+ 0x8000000080008081ULL,
+ 0x8000000000008080ULL,
+ 0x0000000080000001ULL,
+ 0x8000000080008008ULL };
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Reset the 200-byte state to its initial value.
+ *
+ * All bytes are cleared. When KeccakP1600_useLaneComplementing is defined,
+ * lanes 1, 2, 8, 12, 17 and 20 are then set to all ones, i.e. the
+ * complemented form of zero used for those lanes by the other functions in
+ * this file.
+ */
+void KeccakP1600_Initialize(void *state)
+{
+    UINT64 *lanes = (UINT64*)state;
+
+    memset(lanes, 0, 200);
+#ifdef KeccakP1600_useLaneComplementing
+    {
+        /* Indices of the lanes that are stored complemented. */
+        static const unsigned char complementedLanes[6] = { 1, 2, 8, 12, 17, 20 };
+        unsigned int k;
+
+        for(k=0; k<6; k++)
+            lanes[complementedLanes[k]] = ~(UINT64)0;
+    }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * XOR a few input bytes into a single 64-bit lane of the state.
+ *
+ * state:        the state, accessed as an array of UINT64 lanes.
+ * lanePosition: index of the lane to modify.
+ * data:         the input bytes.
+ * offset:       byte position inside the lane that data[0] is XORed into.
+ * length:       number of bytes read from data.
+ *
+ * NOTE(review): offset+length is presumably at most 8 -- confirm with the
+ * callers.
+ */
+void KeccakP1600_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+ UINT64 lane;
+ if (length == 0)
+ return;
+ if (length == 1)
+ lane = data[0];
+ else {
+ lane = 0;
+ memcpy(&lane, data, length); /* on a little-endian host this fills the low-order bytes of lane */
+ }
+ lane <<= offset*8; /* move the bytes to their position inside the lane */
+#else
+ UINT64 lane = 0;
+ unsigned int i;
+ for(i=0; i<length; i++)
+ lane |= ((UINT64)data[i]) << ((i+offset)*8); /* built with shifts, so independent of the host byte order */
+#endif
+ ((UINT64*)state)[lanePosition] ^= lane;
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * XOR laneCount whole lanes (8 bytes each) from data into the first
+ * laneCount lanes of the state.
+ *
+ * state:     the state, accessed as an array of UINT64 lanes.
+ * data:      laneCount*8 input bytes; it is only read, so every access goes
+ *            through a pointer to const.
+ * laneCount: number of lanes to process.
+ *
+ * Little-endian hosts XOR the input as UINT64 words, in groups of 8, 4, 2
+ * and finally 1 lane. When NO_MISALIGNED_ACCESSES is defined and either
+ * pointer is not 8-byte aligned, a byte-wise XOR is used instead. Other
+ * hosts assemble each lane from its bytes, least significant byte first.
+ */
+void KeccakP1600_AddLanes(void *state, const unsigned char *data, unsigned int laneCount)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    unsigned int i = 0;
+#ifdef NO_MISALIGNED_ACCESSES
+    /* If either pointer is misaligned, fall back to byte-wise xor. */
+
+    if (((((uintptr_t)state) & 7) != 0) || ((((uintptr_t)data) & 7) != 0)) {
+        for (i = 0; i < laneCount * 8; i++) {
+            ((unsigned char*)state)[i] ^= data[i];
+        }
+    }
+    else
+#endif
+    {
+        /* Otherwise... */
+
+        for( ; (i+8)<=laneCount; i+=8) {
+            ((UINT64*)state)[i+0] ^= ((const UINT64*)data)[i+0];
+            ((UINT64*)state)[i+1] ^= ((const UINT64*)data)[i+1];
+            ((UINT64*)state)[i+2] ^= ((const UINT64*)data)[i+2];
+            ((UINT64*)state)[i+3] ^= ((const UINT64*)data)[i+3];
+            ((UINT64*)state)[i+4] ^= ((const UINT64*)data)[i+4];
+            ((UINT64*)state)[i+5] ^= ((const UINT64*)data)[i+5];
+            ((UINT64*)state)[i+6] ^= ((const UINT64*)data)[i+6];
+            ((UINT64*)state)[i+7] ^= ((const UINT64*)data)[i+7];
+        }
+        for( ; (i+4)<=laneCount; i+=4) {
+            ((UINT64*)state)[i+0] ^= ((const UINT64*)data)[i+0];
+            ((UINT64*)state)[i+1] ^= ((const UINT64*)data)[i+1];
+            ((UINT64*)state)[i+2] ^= ((const UINT64*)data)[i+2];
+            ((UINT64*)state)[i+3] ^= ((const UINT64*)data)[i+3];
+        }
+        for( ; (i+2)<=laneCount; i+=2) {
+            ((UINT64*)state)[i+0] ^= ((const UINT64*)data)[i+0];
+            ((UINT64*)state)[i+1] ^= ((const UINT64*)data)[i+1];
+        }
+        if (i<laneCount) {
+            ((UINT64*)state)[i+0] ^= ((const UINT64*)data)[i+0];
+        }
+    }
+#else
+    unsigned int i;
+    /* data is const: keep the qualifier instead of silently dropping it. */
+    const UINT8 *curData = data;
+    for(i=0; i<laneCount; i++, curData+=8) {
+        UINT64 lane = (UINT64)curData[0]
+            | ((UINT64)curData[1] << 8)
+            | ((UINT64)curData[2] << 16)
+            | ((UINT64)curData[3] << 24)
+            | ((UINT64)curData[4] <<32)
+            | ((UINT64)curData[5] << 40)
+            | ((UINT64)curData[6] << 48)
+            | ((UINT64)curData[7] << 56);
+        ((UINT64*)state)[i] ^= lane;
+    }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
+/*
+ * XOR one byte into the state. offset is a byte position within the state:
+ * offset/8 selects the lane and offset%8 the byte inside that lane, counted
+ * from the least significant byte. Only compiled for hosts that are not
+ * little-endian.
+ */
+void KeccakP1600_AddByte(void *state, unsigned char byte, unsigned int offset)
+{
+    UINT64 *lanes = (UINT64*)state;
+    unsigned int laneIndex = offset/8;
+    unsigned int shift = (offset%8)*8;
+
+    lanes[laneIndex] ^= ((UINT64)byte) << shift;
+}
+#endif
+
+/* ---------------------------------------------------------------- */
+/*
+ * XOR length bytes from data into the state, offset being the byte position
+ * handed through unchanged. The work is delegated to SnP_AddBytes (defined
+ * outside this file, presumably in SnP-Relaned.h), which receives
+ * KeccakP1600_AddLanes for whole lanes, KeccakP1600_AddBytesInLane for
+ * partial lanes and the lane size of 8 bytes.
+ */
+void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+ SnP_AddBytes(state, data, offset, length, KeccakP1600_AddLanes, KeccakP1600_AddBytesInLane, 8);
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Overwrite length bytes of the state, starting offset bytes into lane
+ * lanePosition, with the bytes from data.
+ *
+ * With lane complementing, lanes 1, 2, 8, 12, 17 and 20 receive the bitwise
+ * complement of each input byte; every other lane receives a plain copy.
+ * Only little-endian hosts are supported; any other byte order stops the
+ * build with #error.
+ */
+void KeccakP1600_OverwriteBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#ifdef KeccakP1600_useLaneComplementing
+ if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20)) {
+ unsigned int i;
+ for(i=0; i<length; i++)
+ ((unsigned char*)state)[lanePosition*8+offset+i] = ~data[i]; /* complemented lane: store the inverted byte */
+ }
+ else
+#endif
+ {
+ memcpy((unsigned char*)state+lanePosition*8+offset, data, length);
+ }
+#else
+#error "Not yet implemented"
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Overwrite the first laneCount lanes of the state with laneCount*8 bytes
+ * from data.
+ *
+ * With lane complementing, lanes 1, 2, 8, 12, 17 and 20 are stored as the
+ * complement of the input lane; without it the bytes are copied with a
+ * single memcpy. Only little-endian hosts are supported; any other byte
+ * order stops the build with #error.
+ * NOTE(review): the lane-complementing path reads data through a UINT64
+ * pointer, so data presumably has to be suitably aligned -- confirm.
+ */
+void KeccakP1600_OverwriteLanes(void *state, const unsigned char *data, unsigned int laneCount)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#ifdef KeccakP1600_useLaneComplementing
+ unsigned int lanePosition;
+
+ for(lanePosition=0; lanePosition<laneCount; lanePosition++)
+ if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20))
+ ((UINT64*)state)[lanePosition] = ~((const UINT64*)data)[lanePosition];
+ else
+ ((UINT64*)state)[lanePosition] = ((const UINT64*)data)[lanePosition];
+#else
+ memcpy(state, data, laneCount*8);
+#endif
+#else
+#error "Not yet implemented"
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Overwrite length bytes of the state with the bytes from data, offset
+ * being the byte position handed through unchanged. The work is delegated
+ * to SnP_OverwriteBytes (defined outside this file, presumably in
+ * SnP-Relaned.h), which receives KeccakP1600_OverwriteLanes for whole
+ * lanes, KeccakP1600_OverwriteBytesInLane for partial lanes and the lane
+ * size of 8 bytes.
+ */
+void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+ SnP_OverwriteBytes(state, data, offset, length, KeccakP1600_OverwriteLanes, KeccakP1600_OverwriteBytesInLane, 8);
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Set the first byteCount bytes of the state to the stored form of zero.
+ *
+ * Without lane complementing this is a plain memset to 0. With lane
+ * complementing, lanes 1, 2, 8, 12, 17 and 20 are filled with all-ones
+ * bits instead: whole lanes first, then the byteCount%8 leading bytes of
+ * the following lane. Only little-endian hosts are supported; any other
+ * byte order stops the build with #error.
+ */
+void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#ifdef KeccakP1600_useLaneComplementing
+ unsigned int lanePosition;
+
+ for(lanePosition=0; lanePosition<byteCount/8; lanePosition++)
+ if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20))
+ ((UINT64*)state)[lanePosition] = ~0; /* int -1; yields all ones once converted, assuming UINT64 is an unsigned type */
+ else
+ ((UINT64*)state)[lanePosition] = 0;
+ if (byteCount%8 != 0) { /* a partial lane remains after the whole lanes */
+ lanePosition = byteCount/8;
+ if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20))
+ memset((unsigned char*)state+lanePosition*8, 0xFF, byteCount%8);
+ else
+ memset((unsigned char*)state+lanePosition*8, 0, byteCount%8);
+ }
+#else
+ memset(state, 0, byteCount);
+#endif
+#else
+#error "Not yet implemented"
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Run the rounds24 macro sequence on the state: the lanes are loaded with
+ * copyFromState, processed by rounds24 (see
+ * KeccakP-1600-unrolling.macros) and written back with copyToState.
+ * declareABCDE, copyFromState and copyToState are macros defined outside
+ * this file, presumably in KeccakP-1600-64.macros.
+ */
+void KeccakP1600_Permute_24rounds(void *state)
+{
+ declareABCDE
+ #ifndef KeccakP1600_fullUnrolling
+ unsigned int i; /* loop counter named by the loop-based rounds24 expansions */
+ #endif
+ UINT64 *stateAsLanes = (UINT64*)state;
+
+ copyFromState(A, stateAsLanes)
+ rounds24
+ copyToState(stateAsLanes, A)
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Run the rounds12 macro sequence on the state: the lanes are loaded with
+ * copyFromState, processed by rounds12 (round indices 12 to 23, see
+ * KeccakP-1600-unrolling.macros) and written back with copyToState.
+ * declareABCDE, copyFromState and copyToState are macros defined outside
+ * this file, presumably in KeccakP-1600-64.macros.
+ */
+void KeccakP1600_Permute_12rounds(void *state)
+{
+ declareABCDE
+ #ifndef KeccakP1600_fullUnrolling
+ unsigned int i; /* loop counter named by the loop-based rounds12 expansions */
+ #endif
+ UINT64 *stateAsLanes = (UINT64*)state;
+
+ copyFromState(A, stateAsLanes)
+ rounds12
+ copyToState(stateAsLanes, A)
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Copy length bytes out of one lane of the state into data, starting at
+ * byte offset inside lane lanePosition.
+ *
+ * With lane complementing, lanes 1, 2, 8, 12, 17 and 20 are inverted first,
+ * so the caller receives the uncomplemented value. The state itself is not
+ * modified.
+ * NOTE(review): offset+length is presumably at most 8 -- confirm with the
+ * callers.
+ */
+void KeccakP1600_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length)
+{
+ UINT64 lane = ((UINT64*)state)[lanePosition];
+#ifdef KeccakP1600_useLaneComplementing
+ if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20))
+ lane = ~lane;
+#endif
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+ {
+ UINT64 lane1[1];
+ lane1[0] = lane;
+ memcpy(data, (UINT8*)lane1+offset, length); /* copy straight from the lane's in-memory bytes */
+ }
+#else
+ unsigned int i;
+ lane >>= offset*8; /* drop the bytes below offset */
+ for(i=0; i<length; i++) {
+ data[i] = lane & 0xFF;
+ lane >>= 8;
+ }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
+/*
+ * Store the 64-bit word into bytes[0..7], least significant byte first.
+ * Only compiled for hosts that are not little-endian.
+ */
+void fromWordToBytes(UINT8 *bytes, const UINT64 word)
+{
+    UINT64 remaining = word;
+    UINT8 *out = bytes;
+    UINT8 *end = bytes + (64/8);
+
+    while (out != end) {
+        *out++ = remaining & 0xFF;
+        remaining >>= 8;
+    }
+}
+#endif
+/*
+ * Copy the first laneCount lanes of the state into data (laneCount*8
+ * bytes).
+ *
+ * Little-endian hosts copy the bytes with memcpy; other hosts serialize
+ * each lane with fromWordToBytes(). With lane complementing, the copies of
+ * lanes 1, 2, 8, 12, 17 and 20 in data are then inverted so the caller
+ * receives uncomplemented values. The checks are nested because the lane
+ * indices are ascending: a lane can only be present if the previous one is.
+ * NOTE(review): the fix-up accesses data through a UINT64 pointer, so data
+ * presumably has to be suitably aligned -- confirm.
+ */
+void KeccakP1600_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+ memcpy(data, state, laneCount*8);
+#else
+ unsigned int i;
+
+ for(i=0; i<laneCount; i++)
+ fromWordToBytes(data+(i*8), ((const UINT64*)state)[i]);
+#endif
+#ifdef KeccakP1600_useLaneComplementing
+ if (laneCount > 1) {
+ ((UINT64*)data)[ 1] = ~((UINT64*)data)[ 1];
+ if (laneCount > 2) {
+ ((UINT64*)data)[ 2] = ~((UINT64*)data)[ 2];
+ if (laneCount > 8) {
+ ((UINT64*)data)[ 8] = ~((UINT64*)data)[ 8];
+ if (laneCount > 12) {
+ ((UINT64*)data)[12] = ~((UINT64*)data)[12];
+ if (laneCount > 17) {
+ ((UINT64*)data)[17] = ~((UINT64*)data)[17];
+ if (laneCount > 20) {
+ ((UINT64*)data)[20] = ~((UINT64*)data)[20];
+ }
+ }
+ }
+ }
+ }
+ }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Copy length bytes of the state into data, offset being the byte position
+ * handed through unchanged. The work is delegated to SnP_ExtractBytes
+ * (defined outside this file, presumably in SnP-Relaned.h), which receives
+ * KeccakP1600_ExtractLanes for whole lanes, KeccakP1600_ExtractBytesInLane
+ * for partial lanes and the lane size of 8 bytes.
+ */
+void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length)
+{
+ SnP_ExtractBytes(state, data, offset, length, KeccakP1600_ExtractLanes, KeccakP1600_ExtractBytesInLane, 8);
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Write to output the XOR of length input bytes with the bytes of lane
+ * lanePosition of the state, starting at byte offset inside that lane:
+ * output[i] = input[i] ^ (byte offset+i of the lane).
+ *
+ * With lane complementing, lanes 1, 2, 8, 12, 17 and 20 are inverted first,
+ * so the XOR uses the uncomplemented value. The state is not modified.
+ * NOTE(review): offset+length is presumably at most 8 -- confirm with the
+ * callers.
+ */
+void KeccakP1600_ExtractAndAddBytesInLane(const void *state, unsigned int lanePosition, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
+{
+ UINT64 lane = ((UINT64*)state)[lanePosition];
+#ifdef KeccakP1600_useLaneComplementing
+ if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20))
+ lane = ~lane;
+#endif
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+ {
+ unsigned int i;
+ UINT64 lane1[1];
+ lane1[0] = lane;
+ for(i=0; i<length; i++)
+ output[i] = input[i] ^ ((UINT8*)lane1)[offset+i]; /* read the lane's in-memory bytes directly */
+ }
+#else
+ unsigned int i;
+ lane >>= offset*8; /* drop the bytes below offset */
+ for(i=0; i<length; i++) {
+ output[i] = input[i] ^ (lane & 0xFF);
+ lane >>= 8;
+ }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * For each of the first laneCount lanes, write to output the XOR of the
+ * corresponding 8 input bytes with the lane of the state.
+ *
+ * Little-endian hosts XOR whole UINT64 words; other hosts serialize each
+ * lane with fromWordToBytes() and XOR byte by byte. With lane
+ * complementing, output lanes 1, 2, 8, 12, 17 and 20 are inverted
+ * afterwards; since ~(input ^ lane) equals input ^ ~lane, this yields the
+ * XOR with the uncomplemented lane. The checks are nested because the lane
+ * indices are ascending.
+ * NOTE(review): input and output are accessed through UINT64 pointers, so
+ * they presumably have to be suitably aligned -- confirm.
+ */
+void KeccakP1600_ExtractAndAddLanes(const void *state, const unsigned char *input, unsigned char *output, unsigned int laneCount)
+{
+ unsigned int i;
+#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
+ unsigned char temp[8];
+ unsigned int j;
+#endif
+
+ for(i=0; i<laneCount; i++) {
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+ ((UINT64*)output)[i] = ((UINT64*)input)[i] ^ ((const UINT64*)state)[i];
+#else
+ fromWordToBytes(temp, ((const UINT64*)state)[i]);
+ for(j=0; j<8; j++)
+ output[i*8+j] = input[i*8+j] ^ temp[j];
+#endif
+ }
+#ifdef KeccakP1600_useLaneComplementing
+ if (laneCount > 1) {
+ ((UINT64*)output)[ 1] = ~((UINT64*)output)[ 1];
+ if (laneCount > 2) {
+ ((UINT64*)output)[ 2] = ~((UINT64*)output)[ 2];
+ if (laneCount > 8) {
+ ((UINT64*)output)[ 8] = ~((UINT64*)output)[ 8];
+ if (laneCount > 12) {
+ ((UINT64*)output)[12] = ~((UINT64*)output)[12];
+ if (laneCount > 17) {
+ ((UINT64*)output)[17] = ~((UINT64*)output)[17];
+ if (laneCount > 20) {
+ ((UINT64*)output)[20] = ~((UINT64*)output)[20];
+ }
+ }
+ }
+ }
+ }
+ }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Write to output the XOR of length input bytes with bytes of the state,
+ * offset being the byte position handed through unchanged. The work is
+ * delegated to SnP_ExtractAndAddBytes (defined outside this file,
+ * presumably in SnP-Relaned.h), which receives
+ * KeccakP1600_ExtractAndAddLanes for whole lanes,
+ * KeccakP1600_ExtractAndAddBytesInLane for partial lanes and the lane size
+ * of 8 bytes.
+ */
+void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
+{
+ SnP_ExtractAndAddBytes(state, input, output, offset, length, KeccakP1600_ExtractAndAddLanes, KeccakP1600_ExtractAndAddBytesInLane, 8);
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Absorb as many whole blocks of laneCount lanes as data contains.
+ *
+ * The state is loaded once with copyFromState; then, while at least
+ * laneCount*8 bytes remain, one block is combined into it with addInput and
+ * rounds24 is applied. The state is written back once at the end.
+ *
+ * Returns the number of bytes consumed, a multiple of laneCount*8; the
+ * remaining dataByteLen minus that many bytes are left for the caller.
+ * NOTE(review): data is read through a UINT64 pointer, so it presumably has
+ * to be suitably aligned, and laneCount presumably must be non-zero
+ * (otherwise the loop condition stays true) -- confirm with the callers.
+ */
+size_t KeccakF1600_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen)
+{
+ size_t originalDataByteLen = dataByteLen;
+ declareABCDE
+ #ifndef KeccakP1600_fullUnrolling
+ unsigned int i; /* loop counter named by the loop-based rounds24 expansions */
+ #endif
+ UINT64 *stateAsLanes = (UINT64*)state;
+ UINT64 *inDataAsLanes = (UINT64*)data;
+
+ copyFromState(A, stateAsLanes)
+ while(dataByteLen >= laneCount*8) {
+ addInput(A, inDataAsLanes, laneCount)
+ rounds24
+ inDataAsLanes += laneCount; /* advance to the next block */
+ dataByteLen -= laneCount*8;
+ }
+ copyToState(stateAsLanes, A)
+ return originalDataByteLen - dataByteLen;
+}
diff --git a/Modules/_sha3/kcp/KeccakP-1600-unrolling.macros b/Modules/_sha3/kcp/KeccakP-1600-unrolling.macros
new file mode 100644
index 0000000000..405ce29724
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakP-1600-unrolling.macros
@@ -0,0 +1,185 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+/*
+ * Define the macros rounds24 and rounds12 as sequences of round macros.
+ *
+ * With FullUnrolling every round is written out. Otherwise Unrolling must
+ * be 12, 6, 4, 3, 2 or 1 and gives the number of rounds per loop iteration;
+ * the loop-based expansions use a variable named i that the expansion site
+ * has to declare. Any other value stops the build with #error.
+ *
+ * rounds24 covers round indices 0 to 23 and rounds12 covers 12 to 23. The
+ * round macros alternate between the A and E state variables; where an
+ * iteration holds an odd number of rounds (Unrolling 3 or 1),
+ * copyStateVariables(A, E) is invoked at the end of the iteration,
+ * presumably to bring the result back into A -- TODO confirm.
+ */
+#if (defined(FullUnrolling))
+#define rounds24 \
+ prepareTheta \
+ thetaRhoPiChiIotaPrepareTheta( 0, A, E) \
+ thetaRhoPiChiIotaPrepareTheta( 1, E, A) \
+ thetaRhoPiChiIotaPrepareTheta( 2, A, E) \
+ thetaRhoPiChiIotaPrepareTheta( 3, E, A) \
+ thetaRhoPiChiIotaPrepareTheta( 4, A, E) \
+ thetaRhoPiChiIotaPrepareTheta( 5, E, A) \
+ thetaRhoPiChiIotaPrepareTheta( 6, A, E) \
+ thetaRhoPiChiIotaPrepareTheta( 7, E, A) \
+ thetaRhoPiChiIotaPrepareTheta( 8, A, E) \
+ thetaRhoPiChiIotaPrepareTheta( 9, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(10, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(11, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(12, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(13, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(14, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(15, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(16, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(17, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(18, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(19, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+ thetaRhoPiChiIota(23, E, A) \
+
+#define rounds12 \
+ prepareTheta \
+ thetaRhoPiChiIotaPrepareTheta(12, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(13, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(14, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(15, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(16, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(17, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(18, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(19, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+ thetaRhoPiChiIota(23, E, A) \
+
+#elif (Unrolling == 12)
+#define rounds24 \
+ prepareTheta \
+ for(i=0; i<24; i+=12) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+ 1, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+ 2, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+ 3, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+ 4, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+ 5, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+ 6, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+ 7, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+ 8, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+ 9, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+10, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+11, E, A) \
+ } \
+
+#define rounds12 \
+ prepareTheta \
+ thetaRhoPiChiIotaPrepareTheta(12, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(13, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(14, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(15, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(16, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(17, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(18, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(19, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+ thetaRhoPiChiIota(23, E, A) \
+
+#elif (Unrolling == 6)
+#define rounds24 \
+ prepareTheta \
+ for(i=0; i<24; i+=6) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
+ } \
+
+#define rounds12 \
+ prepareTheta \
+ for(i=12; i<24; i+=6) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
+ } \
+
+#elif (Unrolling == 4)
+#define rounds24 \
+ prepareTheta \
+ for(i=0; i<24; i+=4) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+ } \
+
+#define rounds12 \
+ prepareTheta \
+ for(i=12; i<24; i+=4) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+ } \
+
+#elif (Unrolling == 3)
+#define rounds24 \
+ prepareTheta \
+ for(i=0; i<24; i+=3) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+ copyStateVariables(A, E) \
+ } \
+
+#define rounds12 \
+ prepareTheta \
+ for(i=12; i<24; i+=3) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+ thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+ copyStateVariables(A, E) \
+ } \
+
+#elif (Unrolling == 2)
+#define rounds24 \
+ prepareTheta \
+ for(i=0; i<24; i+=2) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+ } \
+
+#define rounds12 \
+ prepareTheta \
+ for(i=12; i<24; i+=2) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+ } \
+
+#elif (Unrolling == 1)
+#define rounds24 \
+ prepareTheta \
+ for(i=0; i<24; i++) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ copyStateVariables(A, E) \
+ } \
+
+#define rounds12 \
+ prepareTheta \
+ for(i=12; i<24; i++) { \
+ thetaRhoPiChiIotaPrepareTheta(i , A, E) \
+ copyStateVariables(A, E) \
+ } \
+
+#else
+#error "Unrolling is not correctly specified!"
+#endif
diff --git a/Modules/_sha3/kcp/KeccakSponge.c b/Modules/_sha3/kcp/KeccakSponge.c
new file mode 100644
index 0000000000..afdb73172f
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakSponge.c
@@ -0,0 +1,92 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include "KeccakSponge.h"
+/*
+ * Instantiate the sponge code once per permutation width that has not been
+ * excluded with the matching KeccakP<width>_excluded macro.
+ *
+ * Each section includes the SnP header of its permutation and defines
+ * prefix, SnP, SnP_width and SnP_Permute (plus SnP_FastLoop_Absorb when the
+ * matching Keccak*_FastLoop_supported macro is defined). It then includes
+ * KeccakSponge.inc and undefines the parameters again so the next section
+ * can redefine them.
+ * displayIntermediateValues.h is only pulled in for KeccakReference builds.
+ */
+#ifdef KeccakReference
+ #include "displayIntermediateValues.h"
+#endif
+
+#ifndef KeccakP200_excluded
+ #include "KeccakP-200-SnP.h"
+
+ #define prefix KeccakWidth200
+ #define SnP KeccakP200
+ #define SnP_width 200
+ #define SnP_Permute KeccakP200_Permute_18rounds
+ #if defined(KeccakF200_FastLoop_supported)
+ #define SnP_FastLoop_Absorb KeccakF200_FastLoop_Absorb
+ #endif
+ #include "KeccakSponge.inc"
+ #undef prefix
+ #undef SnP
+ #undef SnP_width
+ #undef SnP_Permute
+ #undef SnP_FastLoop_Absorb
+#endif
+
+#ifndef KeccakP400_excluded
+ #include "KeccakP-400-SnP.h"
+
+ #define prefix KeccakWidth400
+ #define SnP KeccakP400
+ #define SnP_width 400
+ #define SnP_Permute KeccakP400_Permute_20rounds
+ #if defined(KeccakF400_FastLoop_supported)
+ #define SnP_FastLoop_Absorb KeccakF400_FastLoop_Absorb
+ #endif
+ #include "KeccakSponge.inc"
+ #undef prefix
+ #undef SnP
+ #undef SnP_width
+ #undef SnP_Permute
+ #undef SnP_FastLoop_Absorb
+#endif
+
+#ifndef KeccakP800_excluded
+ #include "KeccakP-800-SnP.h"
+
+ #define prefix KeccakWidth800
+ #define SnP KeccakP800
+ #define SnP_width 800
+ #define SnP_Permute KeccakP800_Permute_22rounds
+ #if defined(KeccakF800_FastLoop_supported)
+ #define SnP_FastLoop_Absorb KeccakF800_FastLoop_Absorb
+ #endif
+ #include "KeccakSponge.inc"
+ #undef prefix
+ #undef SnP
+ #undef SnP_width
+ #undef SnP_Permute
+ #undef SnP_FastLoop_Absorb
+#endif
+
+#ifndef KeccakP1600_excluded
+ #include "KeccakP-1600-SnP.h"
+
+ #define prefix KeccakWidth1600
+ #define SnP KeccakP1600
+ #define SnP_width 1600
+ #define SnP_Permute KeccakP1600_Permute_24rounds
+ #if defined(KeccakF1600_FastLoop_supported)
+ #define SnP_FastLoop_Absorb KeccakF1600_FastLoop_Absorb
+ #endif
+ #include "KeccakSponge.inc"
+ #undef prefix
+ #undef SnP
+ #undef SnP_width
+ #undef SnP_Permute
+ #undef SnP_FastLoop_Absorb
+#endif
diff --git a/Modules/_sha3/kcp/KeccakSponge.h b/Modules/_sha3/kcp/KeccakSponge.h
new file mode 100644
index 0000000000..0f4badcac0
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakSponge.h
@@ -0,0 +1,172 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _KeccakSponge_h_
+#define _KeccakSponge_h_
+
+/** General information
+ *
+ * The following type and functions are not actually implemented. Their
+ * documentation is generic, with the prefix Prefix replaced by
+ * - KeccakWidth200 for a sponge function based on Keccak-f[200]
+ * - KeccakWidth400 for a sponge function based on Keccak-f[400]
+ * - KeccakWidth800 for a sponge function based on Keccak-f[800]
+ * - KeccakWidth1600 for a sponge function based on Keccak-f[1600]
+ *
+ * In all these functions, the rate and capacity must sum to the width of the
+ * chosen permutation. For instance, to use the sponge function
+ * Keccak[r=1344, c=256], one must use KeccakWidth1600_Sponge() or a combination
+ * of KeccakWidth1600_SpongeInitialize(), KeccakWidth1600_SpongeAbsorb(),
+ * KeccakWidth1600_SpongeAbsorbLastFewBits() and
+ * KeccakWidth1600_SpongeSqueeze().
+ *
+ * The Prefix_SpongeInstance contains the sponge instance attributes for use
+ * with the Prefix_Sponge* functions.
+ * It gathers the state processed by the permutation as well as the rate,
+ * the position of input/output bytes in the state and the phase
+ * (absorbing or squeezing).
+ */
+
+#ifdef DontReallyInclude_DocumentationOnly
+/** Function to evaluate the sponge function Keccak[r, c] in a single call.
+ * @param rate The value of the rate r.
+ * @param capacity The value of the capacity c.
+ * @param input Pointer to the input message (before the suffix).
+ * @param inputByteLen The length of the input message in bytes.
+ * @param suffix Byte containing from 0 to 7 suffix bits
+ * that must be absorbed after @a input.
+ * These <i>n</i> bits must be in the least significant bit positions.
+ * These bits must be delimited with a bit 1 at position <i>n</i>
+ * (counting from 0=LSB to 7=MSB) and followed by bits 0
+ * from position <i>n</i>+1 to position 7.
+ * Some examples:
+ * - If no bits are to be absorbed, then @a suffix must be 0x01.
+ * - If the 2-bit sequence 0,0 is to be absorbed, @a suffix must be 0x04.
+ * - If the 5-bit sequence 0,1,0,0,1 is to be absorbed, @a suffix must be 0x32.
+ * - If the 7-bit sequence 1,1,0,1,0,0,0 is to be absorbed, @a suffix must be 0x8B.
+ * .
+ * @param output Pointer to the output buffer.
+ * @param outputByteLen The desired number of output bytes.
+ * @pre One must have r+c equal to the supported width of this implementation
+ * and the rate a multiple of 8 bits (one byte) in this implementation.
+ * @pre @a suffix ≠ 0x00
+ * @return Zero if successful, 1 otherwise.
+ */
+int Prefix_Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input, size_t inputByteLen, unsigned char suffix, unsigned char *output, size_t outputByteLen);
+
+/**
+ * Function to initialize the state of the Keccak[r, c] sponge function.
+ * The phase of the sponge function is set to absorbing.
+ * @param spongeInstance Pointer to the sponge instance to be initialized.
+ * @param rate The value of the rate r.
+ * @param capacity The value of the capacity c.
+ * @pre One must have r+c equal to the supported width of this implementation
+ * and the rate a multiple of 8 bits (one byte) in this implementation.
+ * @return Zero if successful, 1 otherwise.
+ */
+int Prefix_SpongeInitialize(Prefix_SpongeInstance *spongeInstance, unsigned int rate, unsigned int capacity);
+
+/**
+ * Function to give input data bytes for the sponge function to absorb.
+ * @param spongeInstance Pointer to the sponge instance initialized by Prefix_SpongeInitialize().
+ * @param data Pointer to the input data.
+ * @param dataByteLen The number of input bytes provided in the input data.
+ * @pre The sponge function must be in the absorbing phase,
+ * i.e., Prefix_SpongeSqueeze() or Prefix_SpongeAbsorbLastFewBits()
+ * must not have been called before.
+ * @return Zero if successful, 1 otherwise.
+ */
+int Prefix_SpongeAbsorb(Prefix_SpongeInstance *spongeInstance, const unsigned char *data, size_t dataByteLen);
+
+/**
+ * Function to give input data bits for the sponge function to absorb
+ * and then to switch to the squeezing phase.
+ * @param spongeInstance Pointer to the sponge instance initialized by Prefix_SpongeInitialize().
+ * @param delimitedData Byte containing from 0 to 7 trailing bits
+ * that must be absorbed.
+ * These <i>n</i> bits must be in the least significant bit positions.
+ * These bits must be delimited with a bit 1 at position <i>n</i>
+ * (counting from 0=LSB to 7=MSB) and followed by bits 0
+ * from position <i>n</i>+1 to position 7.
+ * Some examples:
+ * - If no bits are to be absorbed, then @a delimitedData must be 0x01.
+ * - If the 2-bit sequence 0,0 is to be absorbed, @a delimitedData must be 0x04.
+ * - If the 5-bit sequence 0,1,0,0,1 is to be absorbed, @a delimitedData must be 0x32.
+ * - If the 7-bit sequence 1,1,0,1,0,0,0 is to be absorbed, @a delimitedData must be 0x8B.
+ * .
+ * @pre The sponge function must be in the absorbing phase,
+ * i.e., Prefix_SpongeSqueeze() or Prefix_SpongeAbsorbLastFewBits()
+ * must not have been called before.
+ * @pre @a delimitedData ≠ 0x00
+ * @return Zero if successful, 1 otherwise.
+ */
+int Prefix_SpongeAbsorbLastFewBits(Prefix_SpongeInstance *spongeInstance, unsigned char delimitedData);
+
+/**
+ * Function to squeeze output data from the sponge function.
+ * If the sponge function was in the absorbing phase, this function
+ * switches it to the squeezing phase
+ * as if Prefix_SpongeAbsorbLastFewBits(spongeInstance, 0x01) was called.
+ * @param spongeInstance Pointer to the sponge instance initialized by Prefix_SpongeInitialize().
+ * @param data Pointer to the buffer in which to store the output data.
+ * @param dataByteLen The number of output bytes desired.
+ * @return Zero if successful, 1 otherwise.
+ */
+int Prefix_SpongeSqueeze(Prefix_SpongeInstance *spongeInstance, unsigned char *data, size_t dataByteLen);
+#endif
+
+#include <string.h>
+#include "align.h"
+/*
+ * Declare the type prefix##_SpongeInstance: a struct, aligned with
+ * ALIGN(alignment), holding a state buffer of size bytes, the rate, the
+ * byteIOIndex position and the squeezing flag (see the general information
+ * at the top of this header for their roles).
+ */
+#define KCP_DeclareSpongeStructure(prefix, size, alignment) \
+ ALIGN(alignment) typedef struct prefix##_SpongeInstanceStruct { \
+ unsigned char state[size]; \
+ unsigned int rate; \
+ unsigned int byteIOIndex; \
+ int squeezing; \
+ } prefix##_SpongeInstance;
+/*
+ * Declare the prototypes of the five sponge functions for one prefix:
+ * prefix##_Sponge, prefix##_SpongeInitialize, prefix##_SpongeAbsorb,
+ * prefix##_SpongeAbsorbLastFewBits and prefix##_SpongeSqueeze. Their
+ * contracts are the ones documented for the generic Prefix_* names above.
+ */
+#define KCP_DeclareSpongeFunctions(prefix) \
+ int prefix##_Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input, size_t inputByteLen, unsigned char suffix, unsigned char *output, size_t outputByteLen); \
+ int prefix##_SpongeInitialize(prefix##_SpongeInstance *spongeInstance, unsigned int rate, unsigned int capacity); \
+ int prefix##_SpongeAbsorb(prefix##_SpongeInstance *spongeInstance, const unsigned char *data, size_t dataByteLen); \
+ int prefix##_SpongeAbsorbLastFewBits(prefix##_SpongeInstance *spongeInstance, unsigned char delimitedData); \
+ int prefix##_SpongeSqueeze(prefix##_SpongeInstance *spongeInstance, unsigned char *data, size_t dataByteLen);
+/*
+ * Declare the sponge instance type and functions for every permutation
+ * width that has not been excluded with KeccakP<width>_excluded. The state
+ * size and alignment come from the SnP header of the permutation.
+ */
+#ifndef KeccakP200_excluded
+ #include "KeccakP-200-SnP.h"
+ KCP_DeclareSpongeStructure(KeccakWidth200, KeccakP200_stateSizeInBytes, KeccakP200_stateAlignment)
+ KCP_DeclareSpongeFunctions(KeccakWidth200)
+#endif
+
+#ifndef KeccakP400_excluded
+ #include "KeccakP-400-SnP.h"
+ KCP_DeclareSpongeStructure(KeccakWidth400, KeccakP400_stateSizeInBytes, KeccakP400_stateAlignment)
+ KCP_DeclareSpongeFunctions(KeccakWidth400)
+#endif
+
+#ifndef KeccakP800_excluded
+ #include "KeccakP-800-SnP.h"
+ KCP_DeclareSpongeStructure(KeccakWidth800, KeccakP800_stateSizeInBytes, KeccakP800_stateAlignment)
+ KCP_DeclareSpongeFunctions(KeccakWidth800)
+#endif
+
+#ifndef KeccakP1600_excluded
+ #include "KeccakP-1600-SnP.h"
+ KCP_DeclareSpongeStructure(KeccakWidth1600, KeccakP1600_stateSizeInBytes, KeccakP1600_stateAlignment)
+ KCP_DeclareSpongeFunctions(KeccakWidth1600)
+#endif
+
+#endif
diff --git a/Modules/_sha3/kcp/KeccakSponge.inc b/Modules/_sha3/kcp/KeccakSponge.inc
new file mode 100644
index 0000000000..e10739deaf
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakSponge.inc
@@ -0,0 +1,332 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+/*
+ * Name-mangling helpers. JOIN expands its arguments before pasting them, so
+ * the generic names below resolve to <prefix>_Sponge..., <SnP>_AddBytes, etc.
+ * NOTE(review): `prefix`, `SnP`, `SnP_width`, `SnP_Permute` (and optionally
+ * `SnP_FastLoop_Absorb`) are not defined in this file; presumably the
+ * including file defines them before each inclusion - confirm there.
+ */
+#define JOIN0(a, b) a ## b
+#define JOIN(a, b) JOIN0(a, b)
+
+/* Public sponge entry points for the selected prefix */
+#define Sponge JOIN(prefix, _Sponge)
+#define SpongeInstance JOIN(prefix, _SpongeInstance)
+#define SpongeInitialize JOIN(prefix, _SpongeInitialize)
+#define SpongeAbsorb JOIN(prefix, _SpongeAbsorb)
+#define SpongeAbsorbLastFewBits JOIN(prefix, _SpongeAbsorbLastFewBits)
+#define SpongeSqueeze JOIN(prefix, _SpongeSqueeze)
+
+/* State-and-permutation primitives of the selected SnP implementation */
+#define SnP_stateSizeInBytes JOIN(SnP, _stateSizeInBytes)
+#define SnP_stateAlignment JOIN(SnP, _stateAlignment)
+#define SnP_StaticInitialize JOIN(SnP, _StaticInitialize)
+#define SnP_Initialize JOIN(SnP, _Initialize)
+#define SnP_AddByte JOIN(SnP, _AddByte)
+#define SnP_AddBytes JOIN(SnP, _AddBytes)
+#define SnP_ExtractBytes JOIN(SnP, _ExtractBytes)
+
+/*
+ * One-shot sponge: absorbs the whole input, pads, and squeezes the output
+ * using a state local to this call.
+ *   rate, capacity - in bits; must sum to SnP_width, and rate must be a
+ *                    non-zero multiple of 8 not exceeding SnP_width
+ *   input          - inputByteLen bytes to absorb
+ *   suffix         - non-zero byte added right after the input; it carries
+ *                    the last few bits and the first bit of padding
+ *   output         - receives outputByteLen squeezed bytes
+ * Returns 0 on success, 1 when a parameter check fails.
+ */
+int Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input, size_t inputByteLen, unsigned char suffix, unsigned char *output, size_t outputByteLen)
+{
+ ALIGN(SnP_stateAlignment) unsigned char state[SnP_stateSizeInBytes];
+ unsigned int partialBlock;
+ const unsigned char *curInput = input;
+ unsigned char *curOutput = output;
+ unsigned int rateInBytes = rate/8;
+
+ if (rate+capacity != SnP_width)
+ return 1;
+ if ((rate <= 0) || (rate > SnP_width) || ((rate % 8) != 0))
+ return 1;
+ if (suffix == 0)
+ return 1;
+
+ /* Initialize the state */
+
+ SnP_StaticInitialize();
+ SnP_Initialize(state);
+
+ /* First, absorb whole blocks */
+
+#ifdef SnP_FastLoop_Absorb
+ /* SnP_width/200 is presumably the lane size in bytes (width / 25 lanes / 8 bits) */
+ if (((rateInBytes % (SnP_width/200)) == 0) && (inputByteLen >= rateInBytes)) {
+ /* fast lane: whole lane rate */
+
+ size_t j;
+ j = SnP_FastLoop_Absorb(state, rateInBytes/(SnP_width/200), curInput, inputByteLen);
+ curInput += j;
+ inputByteLen -= j;
+ }
+#endif
+ while(inputByteLen >= (size_t)rateInBytes) {
+ #ifdef KeccakReference
+ displayBytes(1, "Block to be absorbed", curInput, rateInBytes);
+ #endif
+ SnP_AddBytes(state, curInput, 0, rateInBytes);
+ SnP_Permute(state);
+ curInput += rateInBytes;
+ inputByteLen -= rateInBytes;
+ }
+
+ /* Then, absorb what remains */
+
+ /* The loop above leaves inputByteLen < rateInBytes, so this cast cannot truncate */
+ partialBlock = (unsigned int)inputByteLen;
+ #ifdef KeccakReference
+ displayBytes(1, "Block to be absorbed (part)", curInput, partialBlock);
+ #endif
+ SnP_AddBytes(state, curInput, 0, partialBlock);
+
+ /* Finally, absorb the suffix */
+
+ #ifdef KeccakReference
+ {
+ unsigned char delimitedData1[1];
+ delimitedData1[0] = suffix;
+ displayBytes(1, "Block to be absorbed (last few bits + first bit of padding)", delimitedData1, 1);
+ }
+ #endif
+ /* Last few bits, whose delimiter coincides with first bit of padding */
+
+ SnP_AddByte(state, suffix, partialBlock);
+ /* If the first bit of padding is at position rate-1, we need a whole new block for the second bit of padding */
+
+ if ((suffix >= 0x80) && (partialBlock == (rateInBytes-1)))
+ SnP_Permute(state);
+ /* Second bit of padding */
+
+ SnP_AddByte(state, 0x80, rateInBytes-1);
+ #ifdef KeccakReference
+ {
+ unsigned char block[SnP_width/8];
+ memset(block, 0, SnP_width/8);
+ block[rateInBytes-1] = 0x80;
+ displayBytes(1, "Second bit of padding", block, rateInBytes);
+ }
+ #endif
+ SnP_Permute(state);
+ #ifdef KeccakReference
+ displayText(1, "--- Switching to squeezing phase ---");
+ #endif
+
+ /* First, output whole blocks */
+
+ /* Strictly greater: the final block, even a full one, is extracted below without a trailing permutation */
+ while(outputByteLen > (size_t)rateInBytes) {
+ SnP_ExtractBytes(state, curOutput, 0, rateInBytes);
+ SnP_Permute(state);
+ #ifdef KeccakReference
+ displayBytes(1, "Squeezed block", curOutput, rateInBytes);
+ #endif
+ curOutput += rateInBytes;
+ outputByteLen -= rateInBytes;
+ }
+
+ /* Finally, output what remains */
+
+ partialBlock = (unsigned int)outputByteLen;
+ SnP_ExtractBytes(state, curOutput, 0, partialBlock);
+ #ifdef KeccakReference
+ displayBytes(1, "Squeezed block (part)", curOutput, partialBlock);
+ #endif
+
+ return 0;
+}
+
+/* ---------------------------------------------------------------- */
+/* ---------------------------------------------------------------- */
+/* ---------------------------------------------------------------- */
+
+/*
+ * Prepares a sponge instance for absorbing.
+ *   rate, capacity - in bits; must add up to SnP_width, and rate must be a
+ *                    non-zero multiple of 8 that does not exceed SnP_width
+ * Returns 0 on success, 1 when the parameters are rejected (the instance is
+ * left untouched in that case).
+ */
+int SpongeInitialize(SpongeInstance *instance, unsigned int rate, unsigned int capacity)
+{
+    int parametersValid = (rate+capacity == SnP_width)
+                          && (rate != 0)
+                          && (rate <= SnP_width)
+                          && ((rate % 8) == 0);
+
+    if (!parametersValid)
+        return 1;
+
+    /* Reset the permutation state first, then the bookkeeping fields */
+    SnP_StaticInitialize();
+    SnP_Initialize(instance->state);
+
+    instance->squeezing = 0;
+    instance->byteIOIndex = 0;
+    instance->rate = rate;
+
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Absorbs dataByteLen bytes from data into the sponge instance.
+ * May be called repeatedly; bytes that do not fill a whole block stay queued
+ * in the state at instance->byteIOIndex until the block is complete.
+ * Returns 0 on success, 1 if the instance has already switched to squeezing.
+ *
+ * All length comparisons are done on size_t values *before* narrowing to
+ * unsigned int. The previous form cast (dataByteLen - i) to unsigned int
+ * first and then added byteIOIndex, which could truncate or wrap for
+ * remaining lengths near or above 2^32 and hand SnP_AddBytes a length far
+ * beyond the block (or loop forever with a zero-length step).
+ */
+int SpongeAbsorb(SpongeInstance *instance, const unsigned char *data, size_t dataByteLen)
+{
+    size_t i, j;
+    unsigned int partialBlock;
+    const unsigned char *curData;
+    unsigned int rateInBytes = instance->rate/8;
+
+    if (instance->squeezing)
+        return 1; /* Too late for additional input */
+
+    i = 0;
+    curData = data;
+    while(i < dataByteLen) {
+        /* Compare the remaining length rather than i + rateInBytes, which could wrap around */
+        if ((instance->byteIOIndex == 0) && (dataByteLen-i >= rateInBytes)) {
+#ifdef SnP_FastLoop_Absorb
+            /* processing full blocks first */
+
+            if ((rateInBytes % (SnP_width/200)) == 0) {
+                /* fast lane: whole lane rate */
+
+                j = SnP_FastLoop_Absorb(instance->state, rateInBytes/(SnP_width/200), curData, dataByteLen - i);
+                i += j;
+                curData += j;
+            }
+            else {
+#endif
+                for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, rateInBytes);
+                    #endif
+                    SnP_AddBytes(instance->state, curData, 0, rateInBytes);
+                    SnP_Permute(instance->state);
+                    curData+=rateInBytes;
+                }
+                i = dataByteLen - j;
+#ifdef SnP_FastLoop_Absorb
+            }
+#endif
+        }
+        else {
+            /* normal lane: using the message queue */
+
+            /* byteIOIndex < rateInBytes here, so the subtraction cannot underflow;
+               the cast below is only reached when the value fits in the block */
+            if (dataByteLen-i > rateInBytes-instance->byteIOIndex)
+                partialBlock = rateInBytes-instance->byteIOIndex;
+            else
+                partialBlock = (unsigned int)(dataByteLen - i);
+            #ifdef KeccakReference
+            displayBytes(1, "Block to be absorbed (part)", curData, partialBlock);
+            #endif
+            i += partialBlock;
+
+            SnP_AddBytes(instance->state, curData, instance->byteIOIndex, partialBlock);
+            curData += partialBlock;
+            instance->byteIOIndex += partialBlock;
+            if (instance->byteIOIndex == rateInBytes) {
+                SnP_Permute(instance->state);
+                instance->byteIOIndex = 0;
+            }
+        }
+    }
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Finalises absorbing: adds delimitedData at the current byte position, adds
+ * the closing 0x80 padding byte at the last byte of the rate, permutes, and
+ * switches the instance to squeezing (byteIOIndex reset to 0).
+ *   delimitedData - non-zero byte holding the last few bits followed by the
+ *                   delimiter bit, which doubles as the first padding bit
+ * Returns 0 on success, 1 if delimitedData is 0 or the instance is already
+ * squeezing.
+ */
+int SpongeAbsorbLastFewBits(SpongeInstance *instance, unsigned char delimitedData)
+{
+ unsigned int rateInBytes = instance->rate/8;
+
+ if (delimitedData == 0)
+ return 1;
+ if (instance->squeezing)
+ return 1; /* Too late for additional input */
+
+
+ #ifdef KeccakReference
+ {
+ unsigned char delimitedData1[1];
+ delimitedData1[0] = delimitedData;
+ displayBytes(1, "Block to be absorbed (last few bits + first bit of padding)", delimitedData1, 1);
+ }
+ #endif
+ /* Last few bits, whose delimiter coincides with first bit of padding */
+
+ SnP_AddByte(instance->state, delimitedData, instance->byteIOIndex);
+ /* If the first bit of padding is at position rate-1, we need a whole new block for the second bit of padding */
+
+ if ((delimitedData >= 0x80) && (instance->byteIOIndex == (rateInBytes-1)))
+ SnP_Permute(instance->state);
+ /* Second bit of padding */
+
+ SnP_AddByte(instance->state, 0x80, rateInBytes-1);
+ #ifdef KeccakReference
+ {
+ unsigned char block[SnP_width/8];
+ memset(block, 0, SnP_width/8);
+ block[rateInBytes-1] = 0x80;
+ displayBytes(1, "Second bit of padding", block, rateInBytes);
+ }
+ #endif
+ SnP_Permute(instance->state);
+ /* From here on byteIOIndex counts bytes already squeezed from the current block */
+ instance->byteIOIndex = 0;
+ instance->squeezing = 1;
+ #ifdef KeccakReference
+ displayText(1, "--- Switching to squeezing phase ---");
+ #endif
+ return 0;
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Squeezes dataByteLen output bytes from the sponge instance into data.
+ * If the instance is still absorbing, absorbing is first finalised with the
+ * delimiter byte 0x01. May be called repeatedly to obtain further output.
+ * Always returns 0.
+ *
+ * All length comparisons are done on size_t values *before* narrowing to
+ * unsigned int. The previous form cast (dataByteLen - i) to unsigned int
+ * first and then added byteIOIndex, which could truncate or wrap for
+ * remaining lengths near or above 2^32 and hand SnP_ExtractBytes a length
+ * far beyond the block (or loop forever with a zero-length step).
+ */
+int SpongeSqueeze(SpongeInstance *instance, unsigned char *data, size_t dataByteLen)
+{
+    size_t i, j;
+    unsigned int partialBlock;
+    unsigned int rateInBytes = instance->rate/8;
+    unsigned char *curData;
+
+    if (!instance->squeezing)
+        SpongeAbsorbLastFewBits(instance, 0x01);
+
+    i = 0;
+    curData = data;
+    while(i < dataByteLen) {
+        /* Compare the remaining length rather than i + rateInBytes, which could wrap around */
+        if ((instance->byteIOIndex == rateInBytes) && (dataByteLen-i >= rateInBytes)) {
+            /* The current block is used up: produce whole blocks directly into the output */
+            for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) {
+                SnP_Permute(instance->state);
+                SnP_ExtractBytes(instance->state, curData, 0, rateInBytes);
+                #ifdef KeccakReference
+                displayBytes(1, "Squeezed block", curData, rateInBytes);
+                #endif
+                curData+=rateInBytes;
+            }
+            i = dataByteLen - j;
+        }
+        else {
+            /* normal lane: using the message queue */
+
+            if (instance->byteIOIndex == rateInBytes) {
+                SnP_Permute(instance->state);
+                instance->byteIOIndex = 0;
+            }
+            /* byteIOIndex < rateInBytes here, so the subtraction cannot underflow;
+               the cast below is only reached when the value fits in the block */
+            if (dataByteLen-i > rateInBytes-instance->byteIOIndex)
+                partialBlock = rateInBytes-instance->byteIOIndex;
+            else
+                partialBlock = (unsigned int)(dataByteLen - i);
+            i += partialBlock;
+
+            SnP_ExtractBytes(instance->state, curData, instance->byteIOIndex, partialBlock);
+            #ifdef KeccakReference
+            displayBytes(1, "Squeezed block (part)", curData, partialBlock);
+            #endif
+            curData += partialBlock;
+            instance->byteIOIndex += partialBlock;
+        }
+    }
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Drop the alias macros defined at the top of this file, presumably so the
+ * file can be included again with a different prefix/SnP pair.
+ * JOIN0 and JOIN are left defined.
+ */
+#undef Sponge
+#undef SpongeInstance
+#undef SpongeInitialize
+#undef SpongeAbsorb
+#undef SpongeAbsorbLastFewBits
+#undef SpongeSqueeze
+#undef SnP_stateSizeInBytes
+#undef SnP_stateAlignment
+#undef SnP_StaticInitialize
+#undef SnP_Initialize
+#undef SnP_AddByte
+#undef SnP_AddBytes
+#undef SnP_ExtractBytes
diff --git a/Modules/_sha3/kcp/PlSnP-Fallback.inc b/Modules/_sha3/kcp/PlSnP-Fallback.inc
new file mode 100644
index 0000000000..3a9119ab4b
--- /dev/null
+++ b/Modules/_sha3/kcp/PlSnP-Fallback.inc
@@ -0,0 +1,257 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+/* expect PlSnP_baseParallelism, PlSnP_targetParallelism */
+
+/* expect SnP_stateSizeInBytes, SnP_stateAlignment */
+
+/* expect prefix */
+
+/* expect SnP_* */
+
+
+/* JOIN expands its arguments before pasting them, so `prefix` and `SnP` are replaced by their definitions */
+#define JOIN0(a, b) a ## b
+#define JOIN(a, b) JOIN0(a, b)
+
+/* Names of the parallel (PlSnP) functions generated by this file */
+#define PlSnP_StaticInitialize JOIN(prefix, _StaticInitialize)
+#define PlSnP_InitializeAll JOIN(prefix, _InitializeAll)
+#define PlSnP_AddByte JOIN(prefix, _AddByte)
+#define PlSnP_AddBytes JOIN(prefix, _AddBytes)
+#define PlSnP_AddLanesAll JOIN(prefix, _AddLanesAll)
+#define PlSnP_OverwriteBytes JOIN(prefix, _OverwriteBytes)
+#define PlSnP_OverwriteLanesAll JOIN(prefix, _OverwriteLanesAll)
+#define PlSnP_OverwriteWithZeroes JOIN(prefix, _OverwriteWithZeroes)
+#define PlSnP_ExtractBytes JOIN(prefix, _ExtractBytes)
+#define PlSnP_ExtractLanesAll JOIN(prefix, _ExtractLanesAll)
+#define PlSnP_ExtractAndAddBytes JOIN(prefix, _ExtractAndAddBytes)
+#define PlSnP_ExtractAndAddLanesAll JOIN(prefix, _ExtractAndAddLanesAll)
+
+/* A base implementation with parallelism 1 exposes single-state constants; otherwise the plural "states" names are used */
+#if (PlSnP_baseParallelism == 1)
+ #define SnP_stateSizeInBytes JOIN(SnP, _stateSizeInBytes)
+ #define SnP_stateAlignment JOIN(SnP, _stateAlignment)
+#else
+ #define SnP_stateSizeInBytes JOIN(SnP, _statesSizeInBytes)
+ #define SnP_stateAlignment JOIN(SnP, _statesAlignment)
+#endif
+/* Number of base states needed to reach the target parallelism */
+#define PlSnP_factor ((PlSnP_targetParallelism)/(PlSnP_baseParallelism))
+/* Distance in bytes between consecutive base states: the state size rounded up to a multiple of the alignment */
+#define SnP_stateOffset (((SnP_stateSizeInBytes+(SnP_stateAlignment-1))/SnP_stateAlignment)*SnP_stateAlignment)
+/* Address of the i-th base state; relies on a variable named `states` being in scope */
+#define stateWithIndex(i) ((unsigned char *)states+((i)*SnP_stateOffset))
+
+/* Names of the underlying base (SnP) primitives */
+#define SnP_StaticInitialize JOIN(SnP, _StaticInitialize)
+#define SnP_Initialize JOIN(SnP, _Initialize)
+#define SnP_InitializeAll JOIN(SnP, _InitializeAll)
+#define SnP_AddByte JOIN(SnP, _AddByte)
+#define SnP_AddBytes JOIN(SnP, _AddBytes)
+#define SnP_AddLanesAll JOIN(SnP, _AddLanesAll)
+#define SnP_OverwriteBytes JOIN(SnP, _OverwriteBytes)
+#define SnP_OverwriteLanesAll JOIN(SnP, _OverwriteLanesAll)
+#define SnP_OverwriteWithZeroes JOIN(SnP, _OverwriteWithZeroes)
+#define SnP_ExtractBytes JOIN(SnP, _ExtractBytes)
+#define SnP_ExtractLanesAll JOIN(SnP, _ExtractLanesAll)
+#define SnP_ExtractAndAddBytes JOIN(SnP, _ExtractAndAddBytes)
+#define SnP_ExtractAndAddLanesAll JOIN(SnP, _ExtractAndAddLanesAll)
+
+/* Static setup of the parallel implementation: simply forwards to the base implementation. */
+void PlSnP_StaticInitialize(void)
+{
+    SnP_StaticInitialize();
+}
+
+/* Initializes every base state stored in `states`, one base state per iteration. */
+void PlSnP_InitializeAll(void *states)
+{
+    unsigned int stateIndex = 0;
+
+    while (stateIndex < PlSnP_factor) {
+#if (PlSnP_baseParallelism != 1)
+        SnP_InitializeAll(stateWithIndex(stateIndex));
+#else
+        SnP_Initialize(stateWithIndex(stateIndex));
+#endif
+        stateIndex++;
+    }
+}
+
+/*
+ * Adds one byte at `offset` into the instance numbered instanceIndex.
+ * With a parallel base, the base state is instanceIndex / PlSnP_baseParallelism
+ * and the instance inside it is instanceIndex % PlSnP_baseParallelism.
+ */
+void PlSnP_AddByte(void *states, unsigned int instanceIndex, unsigned char byte, unsigned int offset)
+{
+#if (PlSnP_baseParallelism != 1)
+    SnP_AddByte(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, byte, offset);
+#else
+    SnP_AddByte(stateWithIndex(instanceIndex), byte, offset);
+#endif
+}
+
+/*
+ * Adds `length` bytes from `data` at `offset` into the instance numbered
+ * instanceIndex, routing to the base state that holds that instance.
+ */
+void PlSnP_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+#if (PlSnP_baseParallelism != 1)
+    SnP_AddBytes(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, data, offset, length);
+#else
+    SnP_AddBytes(stateWithIndex(instanceIndex), data, offset, length);
+#endif
+}
+
+/*
+ * Adds laneCount lanes into every base state. After each base state the
+ * input pointer advances by PlSnP_baseParallelism * laneOffset lanes.
+ */
+void PlSnP_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
+{
+    unsigned int stateIndex = 0;
+
+    while (stateIndex < PlSnP_factor) {
+#if (PlSnP_baseParallelism != 1)
+        SnP_AddLanesAll(stateWithIndex(stateIndex), data, laneCount, laneOffset);
+#else
+        SnP_AddBytes(stateWithIndex(stateIndex), data, 0, laneCount*SnP_laneLengthInBytes);
+#endif
+        data += PlSnP_baseParallelism*laneOffset*SnP_laneLengthInBytes;
+        stateIndex++;
+    }
+}
+
+/*
+ * Overwrites `length` bytes at `offset` in the instance numbered
+ * instanceIndex with the bytes from `data`.
+ */
+void PlSnP_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+#if (PlSnP_baseParallelism != 1)
+    SnP_OverwriteBytes(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, data, offset, length);
+#else
+    SnP_OverwriteBytes(stateWithIndex(instanceIndex), data, offset, length);
+#endif
+}
+
+/*
+ * Overwrites laneCount lanes in every base state. After each base state the
+ * input pointer advances by PlSnP_baseParallelism * laneOffset lanes.
+ */
+void PlSnP_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
+{
+    unsigned int stateIndex = 0;
+
+    while (stateIndex < PlSnP_factor) {
+#if (PlSnP_baseParallelism != 1)
+        SnP_OverwriteLanesAll(stateWithIndex(stateIndex), data, laneCount, laneOffset);
+#else
+        SnP_OverwriteBytes(stateWithIndex(stateIndex), data, 0, laneCount*SnP_laneLengthInBytes);
+#endif
+        data += PlSnP_baseParallelism*laneOffset*SnP_laneLengthInBytes;
+        stateIndex++;
+    }
+}
+
+/* Forwards a zero-overwrite of byteCount bytes to the base state holding the instance numbered instanceIndex. */
+void PlSnP_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
+{
+#if (PlSnP_baseParallelism != 1)
+    SnP_OverwriteWithZeroes(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, byteCount);
+#else
+    SnP_OverwriteWithZeroes(stateWithIndex(instanceIndex), byteCount);
+#endif
+}
+
+/* Applies the permutation to every base state, one after the other. */
+void PlSnP_PermuteAll(void *states)
+{
+    unsigned int stateIndex = 0;
+
+    while (stateIndex < PlSnP_factor) {
+#if (PlSnP_baseParallelism != 1)
+        SnP_PermuteAll(stateWithIndex(stateIndex));
+#else
+        SnP_Permute(stateWithIndex(stateIndex));
+#endif
+        stateIndex++;
+    }
+}
+
+#if (defined(SnP_Permute_12rounds) || defined(SnP_PermuteAll_12rounds))
+/* Applies the 12-round permutation to every base state; only built when the base provides a 12-round variant. */
+void PlSnP_PermuteAll_12rounds(void *states)
+{
+    unsigned int stateIndex = 0;
+
+    while (stateIndex < PlSnP_factor) {
+#if (PlSnP_baseParallelism != 1)
+        SnP_PermuteAll_12rounds(stateWithIndex(stateIndex));
+#else
+        SnP_Permute_12rounds(stateWithIndex(stateIndex));
+#endif
+        stateIndex++;
+    }
+}
+#endif
+
+/*
+ * Extracts `length` bytes starting at `offset` from the instance numbered
+ * instanceIndex into `data`.
+ */
+void PlSnP_ExtractBytes(void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
+{
+#if (PlSnP_baseParallelism != 1)
+    SnP_ExtractBytes(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, data, offset, length);
+#else
+    SnP_ExtractBytes(stateWithIndex(instanceIndex), data, offset, length);
+#endif
+}
+
+/*
+ * Extracts laneCount lanes from every base state. After each base state the
+ * output pointer advances by laneOffset * PlSnP_baseParallelism lanes.
+ */
+void PlSnP_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
+{
+    unsigned int stateIndex = 0;
+
+    while (stateIndex < PlSnP_factor) {
+#if (PlSnP_baseParallelism != 1)
+        SnP_ExtractLanesAll(stateWithIndex(stateIndex), data, laneCount, laneOffset);
+#else
+        SnP_ExtractBytes(stateWithIndex(stateIndex), data, 0, laneCount*SnP_laneLengthInBytes);
+#endif
+        data += laneOffset*SnP_laneLengthInBytes*PlSnP_baseParallelism;
+        stateIndex++;
+    }
+}
+
+/*
+ * Forwards an extract-and-add of `length` bytes at `offset` (reading `input`,
+ * writing `output`) to the base state holding the instance numbered
+ * instanceIndex.
+ */
+void PlSnP_ExtractAndAddBytes(void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
+{
+#if (PlSnP_baseParallelism != 1)
+    SnP_ExtractAndAddBytes(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, input, output, offset, length);
+#else
+    SnP_ExtractAndAddBytes(stateWithIndex(instanceIndex), input, output, offset, length);
+#endif
+}
+
+/*
+ * Runs the lane-wise extract-and-add on every base state. After each base
+ * state both `input` and `output` advance by
+ * laneOffset * PlSnP_baseParallelism lanes.
+ */
+void PlSnP_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
+{
+    unsigned int stateIndex = 0;
+
+    while (stateIndex < PlSnP_factor) {
+#if (PlSnP_baseParallelism != 1)
+        SnP_ExtractAndAddLanesAll(stateWithIndex(stateIndex), input, output, laneCount, laneOffset);
+#else
+        SnP_ExtractAndAddBytes(stateWithIndex(stateIndex), input, output, 0, laneCount*SnP_laneLengthInBytes);
+#endif
+        input += laneOffset*SnP_laneLengthInBytes*PlSnP_baseParallelism;
+        output += laneOffset*SnP_laneLengthInBytes*PlSnP_baseParallelism;
+        stateIndex++;
+    }
+}
+
+/*
+ * Drop the helper and alias macros used by this file, presumably so it can be
+ * included again with other parameters. PlSnP_factor, SnP_stateOffset and
+ * stateWithIndex are undefined twice in this list; the repetition is harmless.
+ */
+#undef PlSnP_factor
+#undef SnP_stateOffset
+#undef stateWithIndex
+#undef JOIN0
+#undef JOIN
+#undef PlSnP_StaticInitialize
+#undef PlSnP_InitializeAll
+#undef PlSnP_AddByte
+#undef PlSnP_AddBytes
+#undef PlSnP_AddLanesAll
+#undef PlSnP_OverwriteBytes
+#undef PlSnP_OverwriteLanesAll
+#undef PlSnP_OverwriteWithZeroes
+#undef PlSnP_PermuteAll
+#undef PlSnP_ExtractBytes
+#undef PlSnP_ExtractLanesAll
+#undef PlSnP_ExtractAndAddBytes
+#undef PlSnP_ExtractAndAddLanesAll
+#undef SnP_stateAlignment
+#undef SnP_stateSizeInBytes
+#undef PlSnP_factor
+#undef SnP_stateOffset
+#undef stateWithIndex
+#undef SnP_StaticInitialize
+#undef SnP_Initialize
+#undef SnP_InitializeAll
+#undef SnP_AddByte
+#undef SnP_AddBytes
+#undef SnP_AddLanesAll
+#undef SnP_OverwriteBytes
+#undef SnP_OverwriteWithZeroes
+#undef SnP_OverwriteLanesAll
+#undef SnP_ExtractBytes
+#undef SnP_ExtractLanesAll
+#undef SnP_ExtractAndAddBytes
+#undef SnP_ExtractAndAddLanesAll
diff --git a/Modules/_sha3/kcp/SnP-Relaned.h b/Modules/_sha3/kcp/SnP-Relaned.h
new file mode 100644
index 0000000000..086e635ff8
--- /dev/null
+++ b/Modules/_sha3/kcp/SnP-Relaned.h
@@ -0,0 +1,134 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _SnP_Relaned_h_
+#define _SnP_Relaned_h_
+
+/*
+ * Adds `length` bytes from `data` into `state` starting at byte `offset`,
+ * built from two lane-level primitives passed by name:
+ *   SnP_AddLanes(state, data, laneCount)                        - whole lanes from lane 0
+ *   SnP_AddBytesInLane(state, lanePos, data, offsetInLane, len) - bytes within one lane
+ *   SnP_laneLengthInBytes                                       - lane size in bytes
+ * When offset is 0, whole lanes are handled in one call followed by the
+ * trailing partial lane; otherwise the range is walked lane by lane.
+ * Expands to a brace-enclosed block, and the arguments are evaluated more
+ * than once, so they must be free of side effects.
+ */
+#define SnP_AddBytes(state, data, offset, length, SnP_AddLanes, SnP_AddBytesInLane, SnP_laneLengthInBytes) \
+ { \
+ if ((offset) == 0) { \
+ SnP_AddLanes(state, data, (length)/SnP_laneLengthInBytes); \
+ SnP_AddBytesInLane(state, \
+ (length)/SnP_laneLengthInBytes, \
+ (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
+ 0, \
+ (length)%SnP_laneLengthInBytes); \
+ } \
+ else { \
+ unsigned int _sizeLeft = (length); \
+ unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
+ unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
+ const unsigned char *_curData = (data); \
+ while(_sizeLeft > 0) { \
+ unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
+ if (_bytesInLane > _sizeLeft) \
+ _bytesInLane = _sizeLeft; \
+ SnP_AddBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \
+ _sizeLeft -= _bytesInLane; \
+ _lanePosition++; \
+ _offsetInLane = 0; \
+ _curData += _bytesInLane; \
+ } \
+ } \
+ }
+
+/*
+ * Overwrites `length` bytes of `state` starting at byte `offset` with `data`,
+ * built from two lane-level primitives passed by name:
+ *   SnP_OverwriteLanes(state, data, laneCount)                        - whole lanes from lane 0
+ *   SnP_OverwriteBytesInLane(state, lanePos, data, offsetInLane, len) - bytes within one lane
+ *   SnP_laneLengthInBytes                                             - lane size in bytes
+ * When offset is 0, whole lanes are handled in one call followed by the
+ * trailing partial lane; otherwise the range is walked lane by lane.
+ * Expands to a brace-enclosed block, and the arguments are evaluated more
+ * than once, so they must be free of side effects.
+ */
+#define SnP_OverwriteBytes(state, data, offset, length, SnP_OverwriteLanes, SnP_OverwriteBytesInLane, SnP_laneLengthInBytes) \
+ { \
+ if ((offset) == 0) { \
+ SnP_OverwriteLanes(state, data, (length)/SnP_laneLengthInBytes); \
+ SnP_OverwriteBytesInLane(state, \
+ (length)/SnP_laneLengthInBytes, \
+ (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
+ 0, \
+ (length)%SnP_laneLengthInBytes); \
+ } \
+ else { \
+ unsigned int _sizeLeft = (length); \
+ unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
+ unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
+ const unsigned char *_curData = (data); \
+ while(_sizeLeft > 0) { \
+ unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
+ if (_bytesInLane > _sizeLeft) \
+ _bytesInLane = _sizeLeft; \
+ SnP_OverwriteBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \
+ _sizeLeft -= _bytesInLane; \
+ _lanePosition++; \
+ _offsetInLane = 0; \
+ _curData += _bytesInLane; \
+ } \
+ } \
+ }
+
+/*
+ * Extracts `length` bytes of `state` starting at byte `offset` into `data`,
+ * built from two lane-level primitives passed by name:
+ *   SnP_ExtractLanes(state, data, laneCount)                        - whole lanes from lane 0
+ *   SnP_ExtractBytesInLane(state, lanePos, data, offsetInLane, len) - bytes within one lane
+ *   SnP_laneLengthInBytes                                           - lane size in bytes
+ * When offset is 0, whole lanes are handled in one call followed by the
+ * trailing partial lane; otherwise the range is walked lane by lane.
+ * Expands to a brace-enclosed block, and the arguments are evaluated more
+ * than once, so they must be free of side effects.
+ */
+#define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \
+ { \
+ if ((offset) == 0) { \
+ SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \
+ SnP_ExtractBytesInLane(state, \
+ (length)/SnP_laneLengthInBytes, \
+ (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
+ 0, \
+ (length)%SnP_laneLengthInBytes); \
+ } \
+ else { \
+ unsigned int _sizeLeft = (length); \
+ unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
+ unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
+ unsigned char *_curData = (data); \
+ while(_sizeLeft > 0) { \
+ unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
+ if (_bytesInLane > _sizeLeft) \
+ _bytesInLane = _sizeLeft; \
+ SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \
+ _sizeLeft -= _bytesInLane; \
+ _lanePosition++; \
+ _offsetInLane = 0; \
+ _curData += _bytesInLane; \
+ } \
+ } \
+ }
+
+/*
+ * Applies the extract-and-add primitive to `length` bytes of `state` starting
+ * at byte `offset`, reading from `input` and writing to `output`. It is built
+ * from two lane-level primitives passed by name:
+ *   SnP_ExtractAndAddLanes(state, input, output, laneCount)                        - whole lanes from lane 0
+ *   SnP_ExtractAndAddBytesInLane(state, lanePos, input, output, offsetInLane, len) - bytes within one lane
+ *   SnP_laneLengthInBytes                                                          - lane size in bytes
+ * When offset is 0, whole lanes are handled in one call followed by the
+ * trailing partial lane; otherwise the range is walked lane by lane, with
+ * input and output advancing together.
+ * Expands to a brace-enclosed block, and the arguments are evaluated more
+ * than once, so they must be free of side effects.
+ */
+#define SnP_ExtractAndAddBytes(state, input, output, offset, length, SnP_ExtractAndAddLanes, SnP_ExtractAndAddBytesInLane, SnP_laneLengthInBytes) \
+ { \
+ if ((offset) == 0) { \
+ SnP_ExtractAndAddLanes(state, input, output, (length)/SnP_laneLengthInBytes); \
+ SnP_ExtractAndAddBytesInLane(state, \
+ (length)/SnP_laneLengthInBytes, \
+ (input)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
+ (output)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
+ 0, \
+ (length)%SnP_laneLengthInBytes); \
+ } \
+ else { \
+ unsigned int _sizeLeft = (length); \
+ unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
+ unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
+ const unsigned char *_curInput = (input); \
+ unsigned char *_curOutput = (output); \
+ while(_sizeLeft > 0) { \
+ unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
+ if (_bytesInLane > _sizeLeft) \
+ _bytesInLane = _sizeLeft; \
+ SnP_ExtractAndAddBytesInLane(state, _lanePosition, _curInput, _curOutput, _offsetInLane, _bytesInLane); \
+ _sizeLeft -= _bytesInLane; \
+ _lanePosition++; \
+ _offsetInLane = 0; \
+ _curInput += _bytesInLane; \
+ _curOutput += _bytesInLane; \
+ } \
+ } \
+ }
+
+#endif
diff --git a/Modules/_sha3/kcp/align.h b/Modules/_sha3/kcp/align.h
new file mode 100644
index 0000000000..6650fe8c3c
--- /dev/null
+++ b/Modules/_sha3/kcp/align.h
@@ -0,0 +1,35 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _align_h_
+#define _align_h_
+
+/* on Mac OS-X and possibly others, ALIGN(x) is defined in param.h, and -Werror chokes on the redef. */
+
+/* Discard any ALIGN already defined so the definition below is the only one in effect */
+#ifdef ALIGN
+#undef ALIGN
+#endif
+
+/*
+ * ALIGN(x): compiler-specific alignment specifier, chosen by compiler macro.
+ * On compilers not recognised here it expands to nothing, so no alignment is
+ * requested.
+ */
+#if defined(__GNUC__)
+#define ALIGN(x) __attribute__ ((aligned(x)))
+#elif defined(_MSC_VER)
+#define ALIGN(x) __declspec(align(x))
+#elif defined(__ARMCC_VERSION)
+#define ALIGN(x) __align(x)
+#else
+#define ALIGN(x)
+#endif
+
+#endif