author		Jussi Kivilinna <jussi.kivilinna@iki.fi>	2013-10-22 17:07:53 +0300
committer	Jussi Kivilinna <jussi.kivilinna@iki.fi>	2013-10-22 19:57:27 +0300
commit		98674fdaa30ab22a3ac86ca05d688b5b6112895d (patch)
tree		937374c6701fa80161a727b200aaddf0933d37c5 /cipher
parent		e67c67321ce240c93dd0fa2b21c649c0a8e233f7 (diff)
download	libgcrypt-98674fdaa30ab22a3ac86ca05d688b5b6112895d.tar.gz
twofish: add ARMv6 assembly implementation
* cipher/Makefile.am: Add 'twofish-armv6.S'.
* cipher/twofish-armv6.S: New.
* cipher/twofish.c (USE_ARMV6_ASM): New macro.
[USE_ARMV6_ASM] (_gcry_twofish_armv6_encrypt_block)
(_gcry_twofish_armv6_decrypt_block): New prototypes.
[USE_ARMV6_ASM] (twofish_encrypt, twofish_decrypt): Add.
[USE_AMD64_ASM] (do_twofish_encrypt, do_twofish_decrypt): Remove.
(_gcry_twofish_ctr_enc, _gcry_twofish_cfb_dec): Use 'twofish_encrypt'
instead of 'do_twofish_encrypt'.
(_gcry_twofish_cbc_dec): Use 'twofish_decrypt' instead of
'do_twofish_decrypt'.
* configure.ac [arm]: Add 'twofish-armv6.lo'.
--
Add optimized ARMv6 assembly implementation for Twofish. The implementation
is tuned for Cortex-A8. Unaligned access handling is done in the assembly
part. For now, this is only enabled on little-endian systems, as big-endian
correctness has not been tested yet.
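
As an illustration only (not part of the patch): the byte-wise fallback
implemented by the ldr_unaligned_le/str_unaligned_le macros in the diff
below corresponds roughly to the following C; the helper names are
invented for this sketch.

    #include <stdint.h>

    /* Load a 32-bit little-endian word from a possibly unaligned
     * address using four byte loads, as the assembly fallback does. */
    static uint32_t load_le32(const uint8_t *src)
    {
      return (uint32_t)src[0] | ((uint32_t)src[1] << 8)
             | ((uint32_t)src[2] << 16) | ((uint32_t)src[3] << 24);
    }

    /* Store a 32-bit word as little-endian bytes, again byte by byte. */
    static void store_le32(uint8_t *dst, uint32_t x)
    {
      dst[0] = x & 0xff;
      dst[1] = (x >> 8) & 0xff;
      dst[2] = (x >> 16) & 0xff;
      dst[3] = (x >> 24) & 0xff;
    }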
Old (gcc-4.8) vs new (twofish-asm), Cortex-A8 (on armhf):

         ECB/Stream      CBC             CFB             OFB             CTR             CCM
         --------------- --------------- --------------- --------------- --------------- ---------------
TWOFISH  1.23x 1.25x     1.16x 1.26x     1.16x 1.30x     1.18x 1.17x     1.23x 1.23x     1.22x 1.22x
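
The two figures under each mode appear to be the speed-ups for encryption
and decryption, respectively, and the layout matches the output of
libgcrypt's bundled tests/benchmark tool; a run along the following lines
should produce comparable numbers (the repetition count here is a guess):

    $ tests/benchmark --cipher-repetitions 1000 cipher twofish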
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher')
-rw-r--r--	cipher/Makefile.am	|   2
-rw-r--r--	cipher/twofish-armv6.S	| 365
-rw-r--r--	cipher/twofish.c	|  88
3 files changed, 428 insertions, 27 deletions
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index b0efd89d..3d8149a5 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -80,7 +80,7 @@ sha512.c sha512-armv7-neon.S \
 stribog.c \
 tiger.c \
 whirlpool.c \
-twofish.c twofish-amd64.S \
+twofish.c twofish-amd64.S twofish-armv6.S \
 rfc2268.c \
 camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
 camellia-aesni-avx2-amd64.S camellia-armv6.S
diff --git a/cipher/twofish-armv6.S b/cipher/twofish-armv6.S
new file mode 100644
index 00000000..b76ab37c
--- /dev/null
+++ b/cipher/twofish-armv6.S
@@ -0,0 +1,365 @@
+/* twofish-armv6.S - ARM assembly implementation of Twofish cipher
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+/* structure of TWOFISH_context: */
+#define s0 0
+#define s1 ((s0) + 4 * 256)
+#define s2 ((s1) + 4 * 256)
+#define s3 ((s2) + 4 * 256)
+#define w  ((s3) + 4 * 256)
+#define k  ((w) + 4 * 8)
+
+/* register macros */
+#define CTX %r0
+#define CTXs0 %r0
+#define CTXs1 %r1
+#define CTXs3 %r7
+
+#define RA %r3
+#define RB %r4
+#define RC %r5
+#define RD %r6
+
+#define RX %r2
+#define RY %ip
+
+#define RMASK %lr
+
+#define RT0 %r8
+#define RT1 %r9
+#define RT2 %r10
+#define RT3 %r11
+
+/* helper macros */
+#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
+        ldrb rout, [rsrc, #((offs) + 0)]; \
+        ldrb rtmp, [rsrc, #((offs) + 1)]; \
+        orr rout, rout, rtmp, lsl #8; \
+        ldrb rtmp, [rsrc, #((offs) + 2)]; \
+        orr rout, rout, rtmp, lsl #16; \
+        ldrb rtmp, [rsrc, #((offs) + 3)]; \
+        orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
+        mov rtmp0, rin, lsr #8; \
+        strb rin, [rdst, #((offs) + 0)]; \
+        mov rtmp1, rin, lsr #16; \
+        strb rtmp0, [rdst, #((offs) + 1)]; \
+        mov rtmp0, rin, lsr #24; \
+        strb rtmp1, [rdst, #((offs) + 2)]; \
+        strb rtmp0, [rdst, #((offs) + 3)];
+
+#ifndef __ARMEL__
+        /* bswap on big-endian */
+        #define host_to_le(reg) \
+                rev reg, reg;
+        #define le_to_host(reg) \
+                rev reg, reg;
+#else
+        /* nop on little-endian */
+        #define host_to_le(reg) /*_*/
+        #define le_to_host(reg) /*_*/
+#endif
+
+#define ldr_input_aligned_le(rin, a, b, c, d) \
+        ldr a, [rin, #0]; \
+        ldr b, [rin, #4]; \
+        le_to_host(a); \
+        ldr c, [rin, #8]; \
+        le_to_host(b); \
+        ldr d, [rin, #12]; \
+        le_to_host(c); \
+        le_to_host(d);
+
+#define str_output_aligned_le(rout, a, b, c, d) \
+        le_to_host(a); \
+        le_to_host(b); \
+        str a, [rout, #0]; \
+        le_to_host(c); \
+        str b, [rout, #4]; \
+        le_to_host(d); \
+        str c, [rout, #8]; \
+        str d, [rout, #12];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+        /* unaligned word reads/writes allowed */
+        #define ldr_input_le(rin, ra, rb, rc, rd, rtmp) \
+                ldr_input_aligned_le(rin, ra, rb, rc, rd)
+
+        #define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
+                str_output_aligned_le(rout, ra, rb, rc, rd)
+#else
+        /* need to handle unaligned reads/writes by byte reads */
+        #define ldr_input_le(rin, ra, rb, rc, rd, rtmp0) \
+                tst rin, #3; \
+                beq 1f; \
+                ldr_unaligned_le(ra, rin, 0, rtmp0); \
+                ldr_unaligned_le(rb, rin, 4, rtmp0); \
+                ldr_unaligned_le(rc, rin, 8, rtmp0); \
+                ldr_unaligned_le(rd, rin, 12, rtmp0); \
+                b 2f; \
+        1:;\
+                ldr_input_aligned_le(rin, ra, rb, rc, rd); \
+        2:;
+
+        #define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
+                tst rout, #3; \
+                beq 1f; \
+                str_unaligned_le(ra, rout, 0, rtmp0, rtmp1); \
+                str_unaligned_le(rb, rout, 4, rtmp0, rtmp1); \
+                str_unaligned_le(rc, rout, 8, rtmp0, rtmp1); \
+                str_unaligned_le(rd, rout, 12, rtmp0, rtmp1); \
+                b 2f; \
+        1:;\
+                str_output_aligned_le(rout, ra, rb, rc, rd); \
+        2:;
+#endif
+
+/**********************************************************************
+  1-way twofish
+ **********************************************************************/
+#define encrypt_round(a, b, rc, rd, n, ror_a, adj_a) \
+        and RT0, RMASK, b, lsr#(8 - 2); \
+        and RY, RMASK, b, lsr#(16 - 2); \
+        add RT0, RT0, #(s2 - s1); \
+        and RT1, RMASK, b, lsr#(24 - 2); \
+        ldr RY, [CTXs3, RY]; \
+        and RT2, RMASK, b, lsl#(2); \
+        ldr RT0, [CTXs1, RT0]; \
+        and RT3, RMASK, a, lsr#(16 - 2 + (adj_a)); \
+        ldr RT1, [CTXs0, RT1]; \
+        and RX, RMASK, a, lsr#(8 - 2 + (adj_a)); \
+        ldr RT2, [CTXs1, RT2]; \
+        add RT3, RT3, #(s2 - s1); \
+        ldr RX, [CTXs1, RX]; \
+        ror_a(a); \
+        \
+        eor RY, RY, RT0; \
+        ldr RT3, [CTXs1, RT3]; \
+        and RT0, RMASK, a, lsl#(2); \
+        eor RY, RY, RT1; \
+        and RT1, RMASK, a, lsr#(24 - 2); \
+        eor RY, RY, RT2; \
+        ldr RT0, [CTXs0, RT0]; \
+        eor RX, RX, RT3; \
+        ldr RT1, [CTXs3, RT1]; \
+        eor RX, RX, RT0; \
+        \
+        ldr RT3, [CTXs3, #(k - s3 + 8 * (n) + 4)]; \
+        eor RX, RX, RT1; \
+        ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; \
+        \
+        add RT0, RX, RY, lsl #1; \
+        add RX, RX, RY; \
+        add RT0, RT0, RT3; \
+        add RX, RX, RT2; \
+        eor rd, RT0, rd, ror #31; \
+        eor rc, rc, RX;
+
+#define dummy(x) /*_*/
+
+#define ror1(r) \
+        ror r, r, #1;
+
+#define decrypt_round(a, b, rc, rd, n, ror_b, adj_b) \
+        and RT3, RMASK, b, lsl#(2 - (adj_b)); \
+        and RT1, RMASK, b, lsr#(8 - 2 + (adj_b)); \
+        ror_b(b); \
+        and RT2, RMASK, a, lsl#(2); \
+        and RT0, RMASK, a, lsr#(8 - 2); \
+        \
+        ldr RY, [CTXs1, RT3]; \
+        add RT1, RT1, #(s2 - s1); \
+        ldr RX, [CTXs0, RT2]; \
+        and RT3, RMASK, b, lsr#(16 - 2); \
+        ldr RT1, [CTXs1, RT1]; \
+        and RT2, RMASK, a, lsr#(16 - 2); \
+        ldr RT0, [CTXs1, RT0]; \
+        \
+        add RT2, RT2, #(s2 - s1); \
+        ldr RT3, [CTXs3, RT3]; \
+        eor RY, RY, RT1; \
+        \
+        and RT1, RMASK, b, lsr#(24 - 2); \
+        eor RX, RX, RT0; \
+        ldr RT2, [CTXs1, RT2]; \
+        and RT0, RMASK, a, lsr#(24 - 2); \
+        \
+        ldr RT1, [CTXs0, RT1]; \
+        \
+        eor RY, RY, RT3; \
+        ldr RT0, [CTXs3, RT0]; \
+        eor RX, RX, RT2; \
+        eor RY, RY, RT1; \
+        \
+        ldr RT1, [CTXs3, #(k - s3 + 8 * (n) + 4)]; \
+        eor RX, RX, RT0; \
+        ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; \
+        \
+        add RT0, RX, RY, lsl #1; \
+        add RX, RX, RY; \
+        add RT0, RT0, RT1; \
+        add RX, RX, RT2; \
+        eor rd, rd, RT0; \
+        eor rc, RX, rc, ror #31;
+
+#define first_encrypt_cycle(nc) \
+        encrypt_round(RA, RB, RC, RD, (nc) * 2, dummy, 0); \
+        encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);
+
+#define encrypt_cycle(nc) \
+        encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+        encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);
+
+#define last_encrypt_cycle(nc) \
+        encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+        encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+        ror1(RA);
+
+#define first_decrypt_cycle(nc) \
+        decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, dummy, 0); \
+        decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);
+
+#define decrypt_cycle(nc) \
+        decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+        decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);
+
+#define last_decrypt_cycle(nc) \
+        decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+        decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+        ror1(RD);
+
+.align 3
+.global _gcry_twofish_armv6_encrypt_block
+.type   _gcry_twofish_armv6_encrypt_block,%function;
+
+_gcry_twofish_armv6_encrypt_block:
+        /* input:
+         *      %r0: ctx
+         *      %r1: dst
+         *      %r2: src
+         */
+        push {%r1, %r4-%r11, %ip, %lr};
+
+        add RY, CTXs0, #w;
+
+        ldr_input_le(%r2, RA, RB, RC, RD, RT0);
+
+        /* Input whitening */
+        ldm RY, {RT0, RT1, RT2, RT3};
+        add CTXs3, CTXs0, #(s3 - s0);
+        add CTXs1, CTXs0, #(s1 - s0);
+        mov RMASK, #(0xff << 2);
+        eor RA, RA, RT0;
+        eor RB, RB, RT1;
+        eor RC, RC, RT2;
+        eor RD, RD, RT3;
+
+        first_encrypt_cycle(0);
+        encrypt_cycle(1);
+        encrypt_cycle(2);
+        encrypt_cycle(3);
+        encrypt_cycle(4);
+        encrypt_cycle(5);
+        encrypt_cycle(6);
+        last_encrypt_cycle(7);
+
+        add RY, CTXs3, #(w + 4*4 - s3);
+        pop {%r1}; /* dst */
+
+        /* Output whitening */
+        ldm RY, {RT0, RT1, RT2, RT3};
+        eor RC, RC, RT0;
+        eor RD, RD, RT1;
+        eor RA, RA, RT2;
+        eor RB, RB, RT3;
+
+        str_output_le(%r1, RC, RD, RA, RB, RT0, RT1);
+
+        pop {%r4-%r11, %ip, %lr};
+        bx %lr;
+.ltorg
+.size _gcry_twofish_armv6_encrypt_block,.-_gcry_twofish_armv6_encrypt_block;
+
+.align 3
+.global _gcry_twofish_armv6_decrypt_block
+.type   _gcry_twofish_armv6_decrypt_block,%function;
+
+_gcry_twofish_armv6_decrypt_block:
+        /* input:
+         *      %r0: ctx
+         *      %r1: dst
+         *      %r2: src
+         */
+        push {%r1, %r4-%r11, %ip, %lr};
+
+        add CTXs3, CTXs0, #(s3 - s0);
+
+        ldr_input_le(%r2, RC, RD, RA, RB, RT0);
+
+        add RY, CTXs3, #(w + 4*4 - s3);
+        add CTXs3, CTXs0, #(s3 - s0);
+
+        /* Input whitening */
+        ldm RY, {RT0, RT1, RT2, RT3};
+        add CTXs1, CTXs0, #(s1 - s0);
+        mov RMASK, #(0xff << 2);
+        eor RC, RC, RT0;
+        eor RD, RD, RT1;
+        eor RA, RA, RT2;
+        eor RB, RB, RT3;
+
+        first_decrypt_cycle(7);
+        decrypt_cycle(6);
+        decrypt_cycle(5);
+        decrypt_cycle(4);
+        decrypt_cycle(3);
+        decrypt_cycle(2);
+        decrypt_cycle(1);
+        last_decrypt_cycle(0);
+
+        add RY, CTXs0, #w;
+        pop {%r1}; /* dst */
+
+        /* Output whitening */
+        ldm RY, {RT0, RT1, RT2, RT3};
+        eor RA, RA, RT0;
+        eor RB, RB, RT1;
+        eor RC, RC, RT2;
+        eor RD, RD, RT3;
+
+        str_output_le(%r1, RA, RB, RC, RD, RT0, RT1);
+
+        pop {%r4-%r11, %ip, %lr};
+        bx %lr;
+.size _gcry_twofish_armv6_decrypt_block,.-_gcry_twofish_armv6_decrypt_block;
+
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*HAVE_ARM_ARCH_V6 && __ARMEL__*/
diff --git a/cipher/twofish.c b/cipher/twofish.c
index 993ad0f4..d2cabbe8 100644
--- a/cipher/twofish.c
+++ b/cipher/twofish.c
@@ -57,6 +57,14 @@
 # define USE_AMD64_ASM 1
 #endif
 
+/* USE_ARMV6_ASM indicates whether to use ARMv6 assembly code. */
+#undef USE_ARMV6_ASM
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
+# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS)
+#  define USE_ARMV6_ASM 1
+# endif
+#endif
+
 
 /* Prototype for the self-test function. */
 static const char *selftest(void);
@@ -746,7 +754,16 @@ extern void _gcry_twofish_amd64_cbc_dec(const TWOFISH_context *c, byte *out,
 extern void _gcry_twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out,
                                         const byte *in, byte *iv);
 
-#else /*!USE_AMD64_ASM*/
+#elif defined(USE_ARMV6_ASM)
+
+/* Assembly implementations of Twofish. */
+extern void _gcry_twofish_armv6_encrypt_block(const TWOFISH_context *c,
+                                              byte *out, const byte *in);
+
+extern void _gcry_twofish_armv6_decrypt_block(const TWOFISH_context *c,
+                                              byte *out, const byte *in);
+
+#else /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
 
 /* Macros to compute the g() function in the encryption and decryption
  * rounds.  G1 is the straight g() function; G2 includes the 8-bit
@@ -812,21 +829,25 @@
 
 #ifdef USE_AMD64_ASM
 
-static void
-do_twofish_encrypt (const TWOFISH_context *ctx, byte *out, const byte *in)
+static unsigned int
+twofish_encrypt (void *context, byte *out, const byte *in)
 {
+  TWOFISH_context *ctx = context;
   _gcry_twofish_amd64_encrypt_block(ctx, out, in);
+  return /*burn_stack*/ (4*sizeof (void*));
 }
 
+#elif defined(USE_ARMV6_ASM)
+
 static unsigned int
 twofish_encrypt (void *context, byte *out, const byte *in)
 {
   TWOFISH_context *ctx = context;
-  _gcry_twofish_amd64_encrypt_block(ctx, out, in);
+  _gcry_twofish_armv6_encrypt_block(ctx, out, in);
   return /*burn_stack*/ (4*sizeof (void*));
 }
 
-#else /*!USE_AMD64_ASM*/
+#else /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
 
 static void
 do_twofish_encrypt (const TWOFISH_context *ctx, byte *out, const byte *in)
@@ -868,28 +889,32 @@ twofish_encrypt (void *context, byte *out, const byte *in)
   return /*burn_stack*/ (24+3*sizeof (void*));
 }
 
-#endif /*!USE_AMD64_ASM*/
+#endif /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
 
 
 /* Decrypt one block.  in and out may be the same. */
 
 #ifdef USE_AMD64_ASM
 
-static void
-do_twofish_decrypt (const TWOFISH_context *ctx, byte *out, const byte *in)
+static unsigned int
+twofish_decrypt (void *context, byte *out, const byte *in)
 {
+  TWOFISH_context *ctx = context;
   _gcry_twofish_amd64_decrypt_block(ctx, out, in);
+  return /*burn_stack*/ (4*sizeof (void*));
 }
 
+#elif defined(USE_ARMV6_ASM)
+
 static unsigned int
 twofish_decrypt (void *context, byte *out, const byte *in)
 {
   TWOFISH_context *ctx = context;
-  _gcry_twofish_amd64_decrypt_block(ctx, out, in);
+  _gcry_twofish_armv6_decrypt_block(ctx, out, in);
   return /*burn_stack*/ (4*sizeof (void*));
 }
 
-#else /*!USE_AMD64_ASM*/
+#else /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
 
 static void
 do_twofish_decrypt (const TWOFISH_context *ctx, byte *out, const byte *in)
@@ -932,7 +957,7 @@ twofish_decrypt (void *context, byte *out, const byte *in)
   return /*burn_stack*/ (24+3*sizeof (void*));
 }
 
-#endif /*!USE_AMD64_ASM*/
+#endif /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
 
 
 
@@ -947,14 +972,11 @@ _gcry_twofish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   unsigned char tmpbuf[TWOFISH_BLOCKSIZE];
-  int burn_stack_depth = 24 + 3 * sizeof (void*);
+  unsigned int burn, burn_stack_depth = 0;
   int i;
 
 #ifdef USE_AMD64_ASM
   {
-    if (nblocks >= 3 && burn_stack_depth < 8 * sizeof(void*))
-      burn_stack_depth = 8 * sizeof(void*);
-
     /* Process data in 3 block chunks. */
     while (nblocks >= 3)
       {
@@ -963,6 +985,10 @@ _gcry_twofish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
         nblocks -= 3;
         outbuf += 3 * TWOFISH_BLOCKSIZE;
         inbuf += 3 * TWOFISH_BLOCKSIZE;
+
+        burn = 8 * sizeof(void*);
+        if (burn > burn_stack_depth)
+          burn_stack_depth = burn;
       }
 
     /* Use generic code to handle smaller chunks... */
@@ -973,7 +999,10 @@ _gcry_twofish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
   for ( ;nblocks; nblocks-- )
     {
       /* Encrypt the counter. */
-      do_twofish_encrypt(ctx, tmpbuf, ctr);
+      burn = twofish_encrypt(ctx, tmpbuf, ctr);
+      if (burn > burn_stack_depth)
+        burn_stack_depth = burn;
+
       /* XOR the input with the encrypted counter and store in output.  */
       buf_xor(outbuf, tmpbuf, inbuf, TWOFISH_BLOCKSIZE);
       outbuf += TWOFISH_BLOCKSIZE;
@@ -1002,13 +1031,10 @@ _gcry_twofish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   unsigned char savebuf[TWOFISH_BLOCKSIZE];
-  int burn_stack_depth = 24 + 3 * sizeof (void*);
+  unsigned int burn, burn_stack_depth = 0;
 
 #ifdef USE_AMD64_ASM
   {
-    if (nblocks >= 3 && burn_stack_depth < 9 * sizeof(void*))
-      burn_stack_depth = 9 * sizeof(void*);
-
     /* Process data in 3 block chunks. */
     while (nblocks >= 3)
      {
@@ -1017,6 +1043,10 @@ _gcry_twofish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
         nblocks -= 3;
         outbuf += 3 * TWOFISH_BLOCKSIZE;
         inbuf += 3 * TWOFISH_BLOCKSIZE;
+
+        burn = 9 * sizeof(void*);
+        if (burn > burn_stack_depth)
+          burn_stack_depth = burn;
       }
 
     /* Use generic code to handle smaller chunks... */
@@ -1029,7 +1059,9 @@ _gcry_twofish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
          OUTBUF.  */
       memcpy(savebuf, inbuf, TWOFISH_BLOCKSIZE);
 
-      do_twofish_decrypt (ctx, outbuf, inbuf);
+      burn = twofish_decrypt (ctx, outbuf, inbuf);
+      if (burn > burn_stack_depth)
+        burn_stack_depth = burn;
 
       buf_xor(outbuf, outbuf, iv, TWOFISH_BLOCKSIZE);
       memcpy(iv, savebuf, TWOFISH_BLOCKSIZE);
@@ -1051,13 +1083,10 @@ _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
   TWOFISH_context *ctx = context;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
-  int burn_stack_depth = 24 + 3 * sizeof (void*);
+  unsigned int burn, burn_stack_depth = 0;
 
 #ifdef USE_AMD64_ASM
   {
-    if (nblocks >= 3 && burn_stack_depth < 8 * sizeof(void*))
-      burn_stack_depth = 8 * sizeof(void*);
-
     /* Process data in 3 block chunks. */
     while (nblocks >= 3)
       {
@@ -1066,6 +1095,10 @@ _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
         nblocks -= 3;
         outbuf += 3 * TWOFISH_BLOCKSIZE;
         inbuf += 3 * TWOFISH_BLOCKSIZE;
+
+        burn = 8 * sizeof(void*);
+        if (burn > burn_stack_depth)
+          burn_stack_depth = burn;
       }
 
     /* Use generic code to handle smaller chunks... */
@@ -1074,7 +1107,10 @@ _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
 
   for ( ;nblocks; nblocks-- )
     {
-      do_twofish_encrypt(ctx, iv, iv);
+      burn = twofish_encrypt(ctx, iv, iv);
+      if (burn > burn_stack_depth)
+        burn_stack_depth = burn;
+
       buf_xor_n_copy(outbuf, iv, inbuf, TWOFISH_BLOCKSIZE);
       outbuf += TWOFISH_BLOCKSIZE;
       inbuf += TWOFISH_BLOCKSIZE;
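
Beyond the new ARMv6 entry points, the twofish.c hunks above also rework
how stack burning is accounted: the fixed burn_stack_depth estimate is
replaced by tracking the depth that each block-function call reports. A
minimal sketch of the resulting pattern, assuming the bulk helper ends
with the usual _gcry_burn_stack call (that call sits in context lines not
quoted above):

    /* Inside a bulk helper such as _gcry_twofish_ctr_enc: */
    unsigned int burn, burn_stack_depth = 0;

    while (nblocks--)
      {
        /* twofish_encrypt returns how many bytes of stack it may have
         * dirtied; remember the maximum across all calls. */
        burn = twofish_encrypt (ctx, tmpbuf, ctr);
        if (burn > burn_stack_depth)
          burn_stack_depth = burn;
        /* ... XOR tmpbuf into the data and advance the buffers ... */
      }

    if (burn_stack_depth)
      _gcry_burn_stack (burn_stack_depth);  /* burn once, at the end */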