diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2013-05-24 12:43:29 +0300 |
---|---|---|
committer | Werner Koch <wk@gnupg.org> | 2013-05-24 12:51:16 +0200 |
commit | 0bdf26eea8cdbffefe7e37578f8f896c4f5f5275 (patch) | |
tree | 4e3f21742db99202068b3797dd1d04412d541037 | |
parent | ab8fc70b5f0c396a5bc941267f59166e860b8c5d (diff) | |
download | libgcrypt-0bdf26eea8cdbffefe7e37578f8f896c4f5f5275.tar.gz |
cast5: add amd64 assembly implementation
* cipher/Makefile.am: Add 'cast5-amd64.S'.
* cipher/cast5-amd64.S: New file.
* cipher/cast5.c (USE_AMD64_ASM): New macro.
(_gcry_cast5_s1tos4): Merge arrays s1, s2, s3, s4 to single array to
simplify access from assembly implementation.
(s1, s2, s3, s4): New macros pointing to subarrays in
_gcry_cast5_s1tos4.
[USE_AMD64_ASM] (_gcry_cast5_amd64_encrypt_block)
(_gcry_cast5_amd64_decrypt_block, _gcry_cast5_amd64_ctr_enc)
(_gcry_cast5_amd64_cbc_dec, _gcry_cast5_amd64_cfb_dec): New prototypes.
[USE_AMD64_ASM] (do_encrypt_block, do_decrypt_block, encrypt_block)
(decrypt_block): New functions.
(_gcry_cast5_ctr_enc, _gcry_cast5_cbc_dec, _gcry_cast5_cfb_dec)
(selftest_ctr, selftest_cbc, selftest_cfb): New functions.
(selftest): Call new bulk selftests.
* cipher/cipher.c (gcry_cipher_open) [USE_CAST5]: Register CAST5 bulk
functions for ctr-enc, cbc-dec and cfb-dec.
* configure.ac (cast5) [x86_64]: Add 'cast5-amd64.lo'.
* src/cipher.h (_gcry_cast5_ctr_enc, _gcry_cast5_cbc_dec)
(gcry_cast5_cfb_dec): New prototypes.
--
Provides non-parallel implementations for small speed-up and 4-way parallel
implementations that gets accelerated on `out-of-order' CPUs.
Speed old vs. new on AMD Phenom II X6 1055T:
ECB/Stream CBC CFB OFB CTR
--------------- --------------- --------------- --------------- ---------------
CAST5 1.23x 1.22x 1.21x 2.86x 1.21x 2.83x 1.22x 1.17x 2.73x 2.73x
Speed old vs. new on Intel Core i5-2450M (Sandy-Bridge):
ECB/Stream CBC CFB OFB CTR
--------------- --------------- --------------- --------------- ---------------
CAST5 1.00x 1.04x 1.06x 2.56x 1.06x 2.37x 1.03x 1.01x 2.43x 2.41x
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
-rw-r--r-- | cipher/Makefile.am | 2 | ||||
-rw-r--r-- | cipher/cast5-amd64.S | 587 | ||||
-rw-r--r-- | cipher/cast5.c | 278 | ||||
-rw-r--r-- | cipher/cipher.c | 7 | ||||
-rw-r--r-- | configure.ac | 7 | ||||
-rw-r--r-- | src/cipher.h | 13 |
6 files changed, 885 insertions, 9 deletions
diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 69f1e6d1..1e2696fb 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -54,7 +54,7 @@ rmd.h EXTRA_libcipher_la_SOURCES = \ arcfour.c \ blowfish.c \ -cast5.c \ +cast5.c cast5-amd64.S \ crc.c \ des.c \ dsa.c \ diff --git a/cipher/cast5-amd64.S b/cipher/cast5-amd64.S new file mode 100644 index 00000000..c3007d32 --- /dev/null +++ b/cipher/cast5-amd64.S @@ -0,0 +1,587 @@ +/* cast5-amd64.S - AMD64 assembly implementation of CAST5 cipher + * + * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifdef __x86_64 +#include <config.h> +#if defined(USE_CAST5) + +#ifdef __PIC__ +# define RIP %rip +# define GET_EXTERN_POINTER(name, reg) movq name@GOTPCREL(%rip), reg +#else +# define RIP +# define GET_EXTERN_POINTER(name, reg) leaq name, reg +#endif + +.text + +.extern _gcry_cast5_s1to4; + +#define s1 0 +#define s2 (s1 + (4 * 256)) +#define s3 (s2 + (4 * 256)) +#define s4 (s3 + (4 * 256)) + +/* structure of CAST5_context: */ +#define Km 0 +#define Kr (Km + (16 * 4)) + +/* register macros */ +#define CTX %rdi +#define RIO %rsi +#define RTAB %r8 + +#define RLR0 %r9 +#define RLR1 %r10 +#define RLR2 %r11 +#define RLR3 %r12 + +#define RLR0d %r9d +#define RLR1d %r10d +#define RLR2d %r11d +#define RLR3d %r12d + +#define RX0 %rax +#define RX1 %rbx +#define RX2 %rdx + +#define RX0d %eax +#define RX1d %ebx +#define RX2d %edx + +#define RX0bl %al +#define RX1bl %bl +#define RX2bl %dl + +#define RX0bh %ah +#define RX1bh %bh +#define RX2bh %dh + +#define RKR %rcx +#define RKRd %ecx +#define RKRbl %cl + +#define RT0 %rbp +#define RT1 %rsi + +#define RT0d %ebp +#define RT1d %esi + +#define RKM0d %r13d +#define RKM1d %r14d + +/*********************************************************************** + * 1-way cast5 + ***********************************************************************/ +#define dummy(x) + +#define shr_kr(none) \ + shrq $8, RKR; + +#define F(km, load_next_kr, op0, op1, op2, op3) \ + op0 ## l RLR0d, km ## d; \ + roll RKRbl, km ## d; \ + rorq $32, RLR0; \ + movzbl km ## bh, RT0d; \ + movzbl km ## bl, RT1d; \ + roll $16, km ## d; \ + movl s1(RTAB,RT0,4), RT0d; \ + op1 ## l s2(RTAB,RT1,4), RT0d; \ + load_next_kr(kr_next); \ + movzbl km ## bh, RT1d; \ + movzbl km ## bl, km ## d; \ + op2 ## l s3(RTAB,RT1,4), RT0d; \ + op3 ## l s4(RTAB,km,4), RT0d; \ + xorq RT0, RLR0; + +#define F1(km, load_next_kr) \ + F(##km, load_next_kr, add, xor, sub, add) +#define F2(km, load_next_kr) \ + F(##km, load_next_kr, xor, sub, add, xor) +#define F3(km, load_next_kr) \ + F(##km, load_next_kr, sub, add, xor, sub) + +#define get_round_km(n, km) \ + movl Km+4*(n)(CTX), km; + +#define get_round_kr_enc(n) \ + movq $0x1010101010101010, RKR; \ + \ + /* merge rorl rk and rorl $16 */ \ + xorq Kr+(n)(CTX), RKR; + +#define get_round_kr_dec(n) \ + movq $0x1010101010101010, RKR; \ + \ + /* merge rorl rk and rorl $16 */ \ + xorq Kr+(n - 7)(CTX), RKR; \ + bswapq RKR; + +#define round_enc(n, FA, FB, fn1, fn2) \ + get_round_km(n + 1, RX2d); \ + FA(RX0, fn1); \ + get_round_km(n + 2, RX0d); \ + FB(RX2, fn2); + +#define round_enc_last(n, FXA, FXB) \ + get_round_km(n + 1, RX2d); \ + \ + FXA(RX0, shr_kr); \ + FXB(RX2, dummy); + +#define round_enc_1(n, FA, FB) \ + round_enc(n, FA, FB, shr_kr, shr_kr) + +#define round_enc_2(n, FA, FB) \ + round_enc(n, FA, FB, shr_kr, dummy) + +#define round_dec(n, FA, FB, fn1, fn2) \ + get_round_km(n - 1, RX2d); \ + FA(RX0, fn1); \ + get_round_km(n - 2, RX0d); \ + FB(RX2, fn2); + +#define round_dec_last(n, FXA, FXB) \ + get_round_km(n - 1, RX2d); \ + FXA(RX0, shr_kr); \ + FXB(RX2, dummy); + +#define round_dec_1(n, FA, FB) \ + round_dec(n, FA, FB, shr_kr, shr_kr) + +#define round_dec_2(n, FA, FB) \ + round_dec(n, FA, FB, shr_kr, dummy) + +#define read_block() \ + movq (RIO), RLR0; \ + bswapq RLR0; + +#define write_block() \ + bswapq RLR0; \ + rorq $32, RLR0; \ + movq RLR0, (RIO); + +.align 8 +.global _gcry_cast5_amd64_encrypt_block +.type _gcry_cast5_amd64_encrypt_block,@function; + +_gcry_cast5_amd64_encrypt_block: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + pushq %rbp; + pushq %rbx; + + movq %rsi, %r10; + + GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); + + movq %rdx, RIO; + read_block(); + + get_round_km(0, RX0d); + get_round_kr_enc(0); + round_enc_1(0, F1, F2); + round_enc_1(2, F3, F1); + round_enc_1(4, F2, F3); + round_enc_2(6, F1, F2); + get_round_kr_enc(8); + round_enc_1(8, F3, F1); + round_enc_1(10, F2, F3); + round_enc_1(12, F1, F2); + round_enc_last(14, F3, F1); + + movq %r10, RIO; + write_block(); + + popq %rbx; + popq %rbp; + ret; +.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block; + +.align 8 +.global _gcry_cast5_amd64_decrypt_block +.type _gcry_cast5_amd64_decrypt_block,@function; + +_gcry_cast5_amd64_decrypt_block: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + pushq %rbp; + pushq %rbx; + + movq %rsi, %r10; + + GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); + + movq %rdx, RIO; + read_block(); + + get_round_km(15, RX0d); + get_round_kr_dec(15); + round_dec_1(15, F1, F3); + round_dec_1(13, F2, F1); + round_dec_1(11, F3, F2); + round_dec_2(9, F1, F3); + get_round_kr_dec(7); + round_dec_1(7, F2, F1); + round_dec_1(5, F3, F2); + round_dec_1(3, F1, F3); + round_dec_last(1, F2, F1); + + movq %r10, RIO; + write_block(); + + popq %rbx; + popq %rbp; + ret; +.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block; + +/********************************************************************** + 4-way cast5, four blocks parallel + **********************************************************************/ +#define F_tail(rlr, rx, op1, op2, op3) \ + movzbl rx ## bh, RT0d; \ + movzbl rx ## bl, RT1d; \ + roll $16, rx ## d; \ + movl s1(RTAB,RT0,4), RT0d; \ + op1 ## l s2(RTAB,RT1,4), RT0d; \ + movzbl rx ## bh, RT1d; \ + movzbl rx ## bl, rx ## d; \ + op2 ## l s3(RTAB,RT1,4), RT0d; \ + op3 ## l s4(RTAB,rx,4), RT0d; \ + xorq RT0, rlr; + +#define F4(km, load_next_kr, op0, op1, op2, op3) \ + movl km, RX0d; \ + op0 ## l RLR0d, RX0d; \ + roll RKRbl, RX0d; \ + rorq $32, RLR0; \ + \ + movl km, RX1d; \ + op0 ## l RLR1d, RX1d; \ + roll RKRbl, RX1d; \ + rorq $32, RLR1; \ + \ + movl km, RX2d; \ + op0 ## l RLR2d, RX2d; \ + roll RKRbl, RX2d; \ + rorq $32, RLR2; \ + \ + F_tail(RLR0, RX0, op1, op2, op3); \ + F_tail(RLR1, RX1, op1, op2, op3); \ + F_tail(RLR2, RX2, op1, op2, op3); \ + \ + movl km, RX0d; \ + op0 ## l RLR3d, RX0d; \ + roll RKRbl, RX0d; \ + load_next_kr(); \ + rorq $32, RLR3; \ + \ + F_tail(RLR3, RX0, op1, op2, op3); + +#define F4_1(km, load_next_kr) \ + F4(km, load_next_kr, add, xor, sub, add) +#define F4_2(km, load_next_kr) \ + F4(km, load_next_kr, xor, sub, add, xor) +#define F4_3(km, load_next_kr) \ + F4(km, load_next_kr, sub, add, xor, sub) + +#define round_enc4(n, FA, FB, fn1, fn2) \ + get_round_km(n + 1, RKM1d); \ + FA(RKM0d, fn1); \ + get_round_km(n + 2, RKM0d); \ + FB(RKM1d, fn2); + +#define round_enc_last4(n, FXA, FXB) \ + get_round_km(n + 1, RKM1d); \ + FXA(RKM0d, shr_kr); \ + FXB(RKM1d, dummy); + +#define round_enc4_1(n, FA, FB) \ + round_enc4(n, FA, FB, shr_kr, shr_kr); + +#define round_enc4_2(n, FA, FB) \ + round_enc4(n, FA, FB, shr_kr, dummy); + +#define round_dec4(n, FA, FB, fn1, fn2) \ + get_round_km(n - 1, RKM1d); \ + FA(RKM0d, fn1); \ + get_round_km(n - 2, RKM0d); \ + FB(RKM1d, fn2); + +#define round_dec_last4(n, FXA, FXB) \ + get_round_km(n - 1, RKM1d); \ + FXA(RKM0d, shr_kr); \ + FXB(RKM1d, dummy); + +#define round_dec4_1(n, FA, FB) \ + round_dec4(n, FA, FB, shr_kr, shr_kr); + +#define round_dec4_2(n, FA, FB) \ + round_dec4(n, FA, FB, shr_kr, dummy); + +#define inbswap_block4(a, b, c, d) \ + bswapq a; \ + bswapq b; \ + bswapq c; \ + bswapq d; + +#define outbswap_block4(a, b, c, d) \ + bswapq a; \ + bswapq b; \ + bswapq c; \ + bswapq d; \ + rorq $32, a; \ + rorq $32, b; \ + rorq $32, c; \ + rorq $32, d; + +.align 8 +.type __cast5_enc_blk4,@function; + +__cast5_enc_blk4: + /* input: + * %rdi: ctx, CTX + * RLR0,RLR1,RLR2,RLR3: four input plaintext blocks + * output: + * RLR0,RLR1,RLR2,RLR3: four output ciphertext blocks + */ + GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); + + get_round_km(0, RKM0d); + get_round_kr_enc(0); + round_enc4_1(0, F4_1, F4_2); + round_enc4_1(2, F4_3, F4_1); + round_enc4_1(4, F4_2, F4_3); + round_enc4_2(6, F4_1, F4_2); + get_round_kr_enc(8); + round_enc4_1(8, F4_3, F4_1); + round_enc4_1(10, F4_2, F4_3); + round_enc4_1(12, F4_1, F4_2); + round_enc_last4(14, F4_3, F4_1); + + outbswap_block4(RLR0, RLR1, RLR2, RLR3); + ret; +.size __cast5_enc_blk4,.-__cast5_enc_blk4; + +.align 8 +.type __cast5_dec_blk4,@function; + +__cast5_dec_blk4: + /* input: + * %rdi: ctx, CTX + * RLR0,RLR1,RLR2,RLR3: four input ciphertext blocks + * output: + * RLR0,RLR1,RLR2,RLR3: four output plaintext blocks + */ + GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); + + inbswap_block4(RLR0, RLR1, RLR2, RLR3); + + get_round_km(15, RKM0d); + get_round_kr_dec(15); + round_dec4_1(15, F4_1, F4_3); + round_dec4_1(13, F4_2, F4_1); + round_dec4_1(11, F4_3, F4_2); + round_dec4_2(9, F4_1, F4_3); + get_round_kr_dec(7); + round_dec4_1(7, F4_2, F4_1); + round_dec4_1(5, F4_3, F4_2); + round_dec4_1(3, F4_1, F4_3); + round_dec_last4(1, F4_2, F4_1); + + outbswap_block4(RLR0, RLR1, RLR2, RLR3); + ret; +.size __cast5_dec_blk4,.-__cast5_dec_blk4; + +.align 8 +.global _gcry_cast5_amd64_ctr_enc +.type _gcry_cast5_amd64_ctr_enc,@function; +_gcry_cast5_amd64_ctr_enc: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: iv (big endian, 64bit) + */ + + pushq %rbp; + pushq %rbx; + pushq %r12; + pushq %r13; + pushq %r14; + + pushq %rsi; + pushq %rdx; + + /* load IV and byteswap */ + movq (%rcx), RX0; + bswapq RX0; + movq RX0, RLR0; + + /* construct IVs */ + leaq 1(RX0), RLR1; + leaq 2(RX0), RLR2; + leaq 3(RX0), RLR3; + leaq 4(RX0), RX0; + bswapq RX0; + + /* store new IV */ + movq RX0, (%rcx); + + call __cast5_enc_blk4; + + popq %r14; /*src*/ + popq %r13; /*dst*/ + + /* XOR key-stream with plaintext */ + xorq 0 * 8(%r14), RLR0; + xorq 1 * 8(%r14), RLR1; + xorq 2 * 8(%r14), RLR2; + xorq 3 * 8(%r14), RLR3; + movq RLR0, 0 * 8(%r13); + movq RLR1, 1 * 8(%r13); + movq RLR2, 2 * 8(%r13); + movq RLR3, 3 * 8(%r13); + + popq %r14; + popq %r13; + popq %r12; + popq %rbx; + popq %rbp; + ret +.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc; + +.align 8 +.global _gcry_cast5_amd64_cbc_dec +.type _gcry_cast5_amd64_cbc_dec,@function; +_gcry_cast5_amd64_cbc_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: iv (64bit) + */ + + pushq %rbp; + pushq %rbx; + pushq %r12; + pushq %r13; + pushq %r14; + + pushq %rcx; + pushq %rsi; + pushq %rdx; + + /* load input */ + movq 0 * 8(%rdx), RLR0; + movq 1 * 8(%rdx), RLR1; + movq 2 * 8(%rdx), RLR2; + movq 3 * 8(%rdx), RLR3; + + call __cast5_dec_blk4; + + popq RX0; /*src*/ + popq RX1; /*dst*/ + popq RX2; /*iv*/ + + movq 3 * 8(RX0), %r14; + xorq (RX2), RLR0; + xorq 0 * 8(RX0), RLR1; + xorq 1 * 8(RX0), RLR2; + xorq 2 * 8(RX0), RLR3; + movq %r14, (RX2); /* store new IV */ + + movq RLR0, 0 * 8(RX1); + movq RLR1, 1 * 8(RX1); + movq RLR2, 2 * 8(RX1); + movq RLR3, 3 * 8(RX1); + + popq %r14; + popq %r13; + popq %r12; + popq %rbx; + popq %rbp; + ret; + +.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec; + +.align 8 +.global _gcry_cast5_amd64_cfb_dec +.type _gcry_cast5_amd64_cfb_dec,@function; +_gcry_cast5_amd64_cfb_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: iv (64bit) + */ + + pushq %rbp; + pushq %rbx; + pushq %r12; + pushq %r13; + pushq %r14; + + pushq %rsi; + pushq %rdx; + + /* Load input */ + movq (%rcx), RLR0; + movq 0 * 8(%rdx), RLR1; + movq 1 * 8(%rdx), RLR2; + movq 2 * 8(%rdx), RLR3; + + inbswap_block4(RLR0, RLR1, RLR2, RLR3); + + /* Update IV */ + movq 3 * 8(%rdx), %rdx; + movq %rdx, (%rcx); + + call __cast5_enc_blk4; + + popq %rdx; /*src*/ + popq %rcx; /*dst*/ + + xorq 0 * 8(%rdx), RLR0; + xorq 1 * 8(%rdx), RLR1; + xorq 2 * 8(%rdx), RLR2; + xorq 3 * 8(%rdx), RLR3; + movq RLR0, 0 * 8(%rcx); + movq RLR1, 1 * 8(%rcx); + movq RLR2, 2 * 8(%rcx); + movq RLR3, 3 * 8(%rcx); + + popq %r14; + popq %r13; + popq %r12; + popq %rbx; + popq %rbp; + ret; + +.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec; + +#endif /*defined(USE_CAST5)*/ +#endif /*__x86_64*/ diff --git a/cipher/cast5.c b/cipher/cast5.c index 9905f5cb..8c5664dd 100644 --- a/cipher/cast5.c +++ b/cipher/cast5.c @@ -42,6 +42,14 @@ #include "g10lib.h" #include "types.h" #include "cipher.h" +#include "bufhelp.h" +#include "cipher-selftest.h" + +/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */ +#undef USE_AMD64_ASM +#if defined(__x86_64__) +# define USE_AMD64_ASM 1 +#endif #define CAST5_BLOCKSIZE 8 @@ -56,8 +64,12 @@ static void decrypt_block (void *c, byte *outbuf, const byte *inbuf); +#define s1 _gcry_cast5_s1to4[0] +#define s2 _gcry_cast5_s1to4[1] +#define s3 _gcry_cast5_s1to4[2] +#define s4 _gcry_cast5_s1to4[3] -static const u32 s1[256] = { +const u32 _gcry_cast5_s1to4[4][256] = { { 0x30fb40d4, 0x9fa0ff0b, 0x6beccd2f, 0x3f258c7a, 0x1e213f2f, 0x9c004dd3, 0x6003e540, 0xcf9fc949, 0xbfd4af27, 0x88bbbdb5, 0xe2034090, 0x98d09675, 0x6e63a0e0, 0x15c361d2, 0xc2e7661d, 0x22d4ff8e, 0x28683b6f, 0xc07fd059, 0xff2379c8, 0x775f50e2, 0x43c340d3, 0xdf2f8656, 0x887ca41a, 0xa2d2bd2d, @@ -90,8 +102,7 @@ static const u32 s1[256] = { 0x474d6ad7, 0x7c0c5e5c, 0xd1231959, 0x381b7298, 0xf5d2f4db, 0xab838653, 0x6e2f1e23, 0x83719c9e, 0xbd91e046, 0x9a56456e, 0xdc39200c, 0x20c8c571, 0x962bda1c, 0xe1e696ff, 0xb141ab08, 0x7cca89b9, 0x1a69e783, 0x02cc4843, 0xa2f7c579, 0x429ef47d, 0x427b169c, 0x5ac9f049, 0xdd8f0f00, 0x5c8165bf -}; -static const u32 s2[256] = { +}, { 0x1f201094, 0xef0ba75b, 0x69e3cf7e, 0x393f4380, 0xfe61cf7a, 0xeec5207a, 0x55889c94, 0x72fc0651, 0xada7ef79, 0x4e1d7235, 0xd55a63ce, 0xde0436ba, 0x99c430ef, 0x5f0c0794, 0x18dcdb7d, 0xa1d6eff3, 0xa0b52f7b, 0x59e83605, 0xee15b094, 0xe9ffd909, 0xdc440086, 0xef944459, 0xba83ccb3, 0xe0c3cdfb, @@ -124,8 +135,7 @@ static const u32 s2[256] = { 0xb284600c, 0xd835731d, 0xdcb1c647, 0xac4c56ea, 0x3ebd81b3, 0x230eabb0, 0x6438bc87, 0xf0b5b1fa, 0x8f5ea2b3, 0xfc184642, 0x0a036b7a, 0x4fb089bd, 0x649da589, 0xa345415e, 0x5c038323, 0x3e5d3bb9, 0x43d79572, 0x7e6dd07c, 0x06dfdf1e, 0x6c6cc4ef, 0x7160a539, 0x73bfbe70, 0x83877605, 0x4523ecf1 -}; -static const u32 s3[256] = { +}, { 0x8defc240, 0x25fa5d9f, 0xeb903dbf, 0xe810c907, 0x47607fff, 0x369fe44b, 0x8c1fc644, 0xaececa90, 0xbeb1f9bf, 0xeefbcaea, 0xe8cf1950, 0x51df07ae, 0x920e8806, 0xf0ad0548, 0xe13c8d83, 0x927010d5, 0x11107d9f, 0x07647db9, 0xb2e3e4d4, 0x3d4f285e, 0xb9afa820, 0xfade82e0, 0xa067268b, 0x8272792e, @@ -158,8 +168,7 @@ static const u32 s3[256] = { 0x5727c148, 0x2be98a1d, 0x8ab41738, 0x20e1be24, 0xaf96da0f, 0x68458425, 0x99833be5, 0x600d457d, 0x282f9350, 0x8334b362, 0xd91d1120, 0x2b6d8da0, 0x642b1e31, 0x9c305a00, 0x52bce688, 0x1b03588a, 0xf7baefd5, 0x4142ed9c, 0xa4315c11, 0x83323ec5, 0xdfef4636, 0xa133c501, 0xe9d3531c, 0xee353783 -}; -static const u32 s4[256] = { +}, { 0x9db30420, 0x1fb6e9de, 0xa7be7bef, 0xd273a298, 0x4a4f7bdb, 0x64ad8c57, 0x85510443, 0xfa020ed1, 0x7e287aff, 0xe60fb663, 0x095f35a1, 0x79ebf120, 0xfd059d43, 0x6497b7b1, 0xf3641f63, 0x241e4adf, 0x28147f5f, 0x4fa2b8cd, 0xc9430040, 0x0cc32220, 0xfdd30b30, 0xc0a5374f, 0x1d2d00d9, 0x24147b15, @@ -192,7 +201,7 @@ static const u32 s4[256] = { 0xb5676e69, 0x9bd3ddda, 0xdf7e052f, 0xdb25701c, 0x1b5e51ee, 0xf65324e6, 0x6afce36c, 0x0316cc04, 0x8644213e, 0xb7dc59d0, 0x7965291f, 0xccd6fd43, 0x41823979, 0x932bcdf6, 0xb657c34d, 0x4edfd282, 0x7ae5290c, 0x3cb9536b, 0x851e20fe, 0x9833557e, 0x13ecf0b0, 0xd3ffb372, 0x3f85c5c1, 0x0aef7ed2 -}; +} }; static const u32 s5[256] = { 0x7ec90c04, 0x2c6e74b9, 0x9b0e66df, 0xa6337911, 0xb86a7fff, 0x1dd358f5, 0x44dd9d44, 0x1731167f, 0x08fbf1fa, 0xe7f511cc, 0xd2051b00, 0x735aba00, 0x2ab722d8, 0x386381cb, 0xacf6243a, 0x69befd7a, @@ -331,6 +340,53 @@ static const u32 s8[256] = { }; +#ifdef USE_AMD64_ASM + +/* Assembly implementations of CAST5. */ +extern void _gcry_cast5_amd64_encrypt_block(CAST5_context *c, byte *outbuf, + const byte *inbuf); + +extern void _gcry_cast5_amd64_decrypt_block(CAST5_context *c, byte *outbuf, + const byte *inbuf); + +/* These assembly implementations process four blocks in parallel. */ +extern void _gcry_cast5_amd64_ctr_enc(CAST5_context *ctx, byte *out, + const byte *in, byte *ctr); + +extern void _gcry_cast5_amd64_cbc_dec(CAST5_context *ctx, byte *out, + const byte *in, byte *iv); + +extern void _gcry_cast5_amd64_cfb_dec(CAST5_context *ctx, byte *out, + const byte *in, byte *iv); + +static void +do_encrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf) +{ + _gcry_cast5_amd64_encrypt_block (context, outbuf, inbuf); +} + +static void +do_decrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf) +{ + _gcry_cast5_amd64_decrypt_block (context, outbuf, inbuf); +} + +static void encrypt_block (void *context , byte *outbuf, const byte *inbuf) +{ + CAST5_context *c = (CAST5_context *) context; + do_encrypt_block (c, outbuf, inbuf); + _gcry_burn_stack (2*8); +} + +static void decrypt_block (void *context, byte *outbuf, const byte *inbuf) +{ + CAST5_context *c = (CAST5_context *) context; + _gcry_cast5_amd64_decrypt_block (c, outbuf, inbuf); + _gcry_burn_stack (2*8); +} + +#else /*USE_AMD64_ASM*/ + #if defined(__GNUC__) && defined(__i386__) static inline u32 rol(int n, u32 x) @@ -463,6 +519,201 @@ decrypt_block (void *context, byte *outbuf, const byte *inbuf) _gcry_burn_stack (20+4*sizeof(void*)); } +#endif /*!USE_AMD64_ASM*/ + + +/* Bulk encryption of complete blocks in CTR mode. This function is only + intended for the bulk encryption feature of cipher.c. CTR is expected to be + of size CAST5_BLOCKSIZE. */ +void +_gcry_cast5_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg, + const void *inbuf_arg, unsigned int nblocks) +{ + CAST5_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned char tmpbuf[CAST5_BLOCKSIZE]; + int burn_stack_depth = (20 + 4 * sizeof(void*)) + 2 * CAST5_BLOCKSIZE; + + int i; + +#ifdef USE_AMD64_ASM + { + if (nblocks >= 4) + burn_stack_depth += 8 * sizeof(void*); + + /* Process data in 4 block chunks. */ + while (nblocks >= 4) + { + _gcry_cast5_amd64_ctr_enc(ctx, outbuf, inbuf, ctr); + + nblocks -= 4; + outbuf += 4 * CAST5_BLOCKSIZE; + inbuf += 4 * CAST5_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + /* TODO: use caching instead? */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + /* Encrypt the counter. */ + do_encrypt_block(ctx, tmpbuf, ctr); + /* XOR the input with the encrypted counter and store in output. */ + buf_xor(outbuf, tmpbuf, inbuf, CAST5_BLOCKSIZE); + outbuf += CAST5_BLOCKSIZE; + inbuf += CAST5_BLOCKSIZE; + /* Increment the counter. */ + for (i = CAST5_BLOCKSIZE; i > 0; i--) + { + ctr[i-1]++; + if (ctr[i-1]) + break; + } + } + + wipememory(tmpbuf, sizeof(tmpbuf)); + _gcry_burn_stack(burn_stack_depth); +} + + +/* Bulk decryption of complete blocks in CBC mode. This function is only + intended for the bulk encryption feature of cipher.c. */ +void +_gcry_cast5_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg, + const void *inbuf_arg, unsigned int nblocks) +{ + CAST5_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned char savebuf[CAST5_BLOCKSIZE]; + int burn_stack_depth = (20 + 4 * sizeof(void*)) + 2 * CAST5_BLOCKSIZE; + +#ifdef USE_AMD64_ASM + { + if (nblocks >= 4) + burn_stack_depth += 8 * sizeof(void*); + + /* Process data in 4 block chunks. */ + while (nblocks >= 4) + { + _gcry_cast5_amd64_cbc_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 4; + outbuf += 4 * CAST5_BLOCKSIZE; + inbuf += 4 * CAST5_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + /* We need to save INBUF away because it may be identical to + OUTBUF. */ + memcpy(savebuf, inbuf, CAST5_BLOCKSIZE); + + do_decrypt_block (ctx, outbuf, inbuf); + + buf_xor(outbuf, outbuf, iv, CAST5_BLOCKSIZE); + memcpy(iv, savebuf, CAST5_BLOCKSIZE); + inbuf += CAST5_BLOCKSIZE; + outbuf += CAST5_BLOCKSIZE; + } + + wipememory(savebuf, sizeof(savebuf)); + _gcry_burn_stack(burn_stack_depth); +} + +/* Bulk decryption of complete blocks in CFB mode. This function is only + intended for the bulk encryption feature of cipher.c. */ +void +_gcry_cast5_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg, + const void *inbuf_arg, unsigned int nblocks) +{ + CAST5_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = (20 + 4 * sizeof(void*)) + 2 * CAST5_BLOCKSIZE; + +#ifdef USE_AMD64_ASM + { + if (nblocks >= 4) + burn_stack_depth += 8 * sizeof(void*); + + /* Process data in 4 block chunks. */ + while (nblocks >= 4) + { + _gcry_cast5_amd64_cfb_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 4; + outbuf += 4 * CAST5_BLOCKSIZE; + inbuf += 4 * CAST5_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + do_encrypt_block(ctx, iv, iv); + buf_xor_n_copy(outbuf, iv, inbuf, CAST5_BLOCKSIZE); + outbuf += CAST5_BLOCKSIZE; + inbuf += CAST5_BLOCKSIZE; + } + + _gcry_burn_stack(burn_stack_depth); +} + + +/* Run the self-tests for CAST5-CTR, tests IV increment of bulk CTR + encryption. Returns NULL on success. */ +static const char * +selftest_ctr (void) +{ + const int nblocks = 4+1; + const int blocksize = CAST5_BLOCKSIZE; + const int context_size = sizeof(CAST5_context); + + return _gcry_selftest_helper_ctr("CAST5", &cast_setkey, + &encrypt_block, &_gcry_cast5_ctr_enc, nblocks, blocksize, + context_size); +} + + +/* Run the self-tests for CAST5-CBC, tests bulk CBC decryption. + Returns NULL on success. */ +static const char * +selftest_cbc (void) +{ + const int nblocks = 4+2; + const int blocksize = CAST5_BLOCKSIZE; + const int context_size = sizeof(CAST5_context); + + return _gcry_selftest_helper_cbc("CAST5", &cast_setkey, + &encrypt_block, &_gcry_cast5_cbc_dec, nblocks, blocksize, + context_size); +} + + +/* Run the self-tests for CAST5-CFB, tests bulk CBC decryption. + Returns NULL on success. */ +static const char * +selftest_cfb (void) +{ + const int nblocks = 4+2; + const int blocksize = CAST5_BLOCKSIZE; + const int context_size = sizeof(CAST5_context); + + return _gcry_selftest_helper_cfb("CAST5", &cast_setkey, + &encrypt_block, &_gcry_cast5_cfb_dec, nblocks, blocksize, + context_size); +} + static const char* selftest(void) @@ -473,6 +724,7 @@ selftest(void) byte plain[8] = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF }; byte cipher[8]= { 0x23, 0x8B, 0x4F, 0xE5, 0x84, 0x7E, 0x44, 0xB2 }; byte buffer[8]; + const char *r; cast_setkey( &c, key, 16 ); encrypt_block( &c, buffer, plain ); @@ -507,6 +759,16 @@ selftest(void) } #endif + + if ( (r = selftest_cbc ()) ) + return r; + + if ( (r = selftest_cfb ()) ) + return r; + + if ( (r = selftest_ctr ()) ) + return r; + return NULL; } diff --git a/cipher/cipher.c b/cipher/cipher.c index 652d7953..79ca7553 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -718,6 +718,13 @@ gcry_cipher_open (gcry_cipher_hd_t *handle, h->bulk.ctr_enc = _gcry_aes_ctr_enc; break; #endif /*USE_AES*/ +#ifdef USE_CAST5 + case GCRY_CIPHER_CAST5: + h->bulk.cfb_dec = _gcry_cast5_cfb_dec; + h->bulk.cbc_dec = _gcry_cast5_cbc_dec; + h->bulk.ctr_enc = _gcry_cast5_ctr_enc; + break; +#endif /*USE_CAMELLIA*/ #ifdef USE_CAMELLIA case GCRY_CIPHER_CAMELLIA128: case GCRY_CIPHER_CAMELLIA192: diff --git a/configure.ac b/configure.ac index 3fec8bc9..113c71f4 100644 --- a/configure.ac +++ b/configure.ac @@ -1195,6 +1195,13 @@ LIST_MEMBER(cast5, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5.lo" AC_DEFINE(USE_CAST5, 1, [Defined if this module should be included]) + + case "${host}" in + x86_64-*-*) + # Build with the assembly implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5-amd64.lo" + ;; + esac fi LIST_MEMBER(des, $enabled_ciphers) diff --git a/src/cipher.h b/src/cipher.h index 9d6cc015..1742003c 100644 --- a/src/cipher.h +++ b/src/cipher.h @@ -95,6 +95,19 @@ void _gcry_aes_ctr_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, unsigned int nblocks); +/*-- cast5.c --*/ +void _gcry_cast5_cfb_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks); + +void _gcry_cast5_cbc_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks); + +void _gcry_cast5_ctr_enc (void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks); + /*-- camellia-glue.c --*/ void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, |