commit     110077505acacae62cec3d09b32a084b9cee0368 (patch)
author     Jussi Kivilinna <jussi.kivilinna@iki.fi>    2019-12-22 16:44:07 +0200
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>    2019-12-23 11:28:32 +0200
tree       9597d2db3c79198b215628f34ed1af6d5dcc2aa7 /cipher
parent     0837d7e6be3e604c1f7b86d18c582d8aa7ed858c (diff)
rijndael-ppc: performance improvements
* cipher/rijndael-ppc.c (ALIGNED_LOAD, ALIGNED_STORE, VEC_LOAD_BE)
(VEC_STORE_BE): Rewrite.
(VEC_BE_SWAP, VEC_LOAD_BE_NOSWAP, VEC_STORE_BE_NOSWAP): New.
(PRELOAD_ROUND_KEYS, AES_ENCRYPT, AES_DECRYPT): Adjust to new
input parameters for vector load macros.
(ROUND_KEY_VARIABLES_ALL, PRELOAD_ROUND_KEYS_ALL)
(AES_ENCRYPT_ALL): New.
(vec_bswap32_const_neg): New.
(vec_aligned_ld, vec_aligned_st, vec_load_be_const): Rename to...
(asm_aligned_ld, asm_aligned_st, asm_load_be_const): ...these.
(asm_be_swap, asm_vperm1, asm_load_be_noswap)
(asm_store_be_noswap): New.
(vec_add_uint128): Rename to...
(asm_add_uint128): ...this.
(asm_xor, asm_cipher_be, asm_cipherlast_be, asm_ncipher_be)
(asm_ncipherlast_be): New inline assembly functions with the volatile
keyword to allow manual instruction ordering (see the sketch below).
(_gcry_aes_ppc8_setkey, aes_ppc8_prepare_decryption)
(_gcry_aes_ppc8_encrypt, _gcry_aes_ppc8_decrypt)
(_gcry_aes_ppc8_cfb_enc, _gcry_aes_ppc8_cbc_enc)
(_gcry_aes_ppc8_ocb_auth): Update to use new/rewritten helper macros.
(_gcry_aes_ppc8_cfb_dec, _gcry_aes_ppc8_cbc_dec)
(_gcry_aes_ppc8_ctr_enc, _gcry_aes_ppc8_ocb_crypt)
(_gcry_aes_ppc8_xts_crypt): Update to use new/rewritten helper
macros; tune 8-block parallel paths with manual instruction ordering.
--
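For reference, the shape of the rewrite: vector loads/stores now take a
block offset, the big-endian byte-swap is split out of the load/store so
it can be scheduled independently, and the AES/VSX operations are wrapped
in volatile inline assembly so that GCC keeps the hand-chosen interleaving
in the 8-block paths. A condensed sketch of these helpers (the real file
uses its ASM_FUNC_ATTR_INLINE attribute and also adds aligned load/store
and no-swap store variants):

  #include <altivec.h>
  #include <stdint.h>

  typedef vector unsigned char block;  /* stand-in for the file's typedef */

  /* Unswapped VSX load; caller byte-swaps with asm_be_swap() when needed. */
  static inline block
  asm_load_be_noswap (unsigned long offset, const void *ptr)
  {
    block vec;
    __asm__ volatile ("lxvw4x %x0,%1,%2\n\t"
                      : "=wa" (vec)
                      : "r" (offset), "r" ((uintptr_t)ptr)
                      : "memory", "r0");
    return vec;
  }

  /* Byte-swap via vperm on little-endian; no-op on big-endian builds. */
  static inline block
  asm_be_swap (block vec, block be_bswap_const)
  {
  #ifndef WORDS_BIGENDIAN
    block o;
    __asm__ volatile ("vperm %0,%1,%1,%2\n\t"
                      : "=v" (o)
                      : "v" (vec), "v" (be_bswap_const));
    return o;
  #else
    (void)be_bswap_const;
    return vec;
  #endif
  }

  /* One AES round; the volatile asm keeps the compiler from re-ordering
   * these statements, so manual instruction scheduling survives.  */
  static inline block
  asm_cipher_be (block b, block rk)
  {
    block o;
    __asm__ volatile ("vcipher %0, %1, %2\n\t"
                      : "=v" (o)
                      : "v" (b), "v" (rk));
    return o;
  }

  /* Offset-taking load macros; load and swap are separate steps so the
   * 8-block loops can interleave loads, swaps and cipher rounds.  */
  #define VEC_BE_SWAP(vec, bige_const)  (asm_be_swap ((vec), (bige_const)))
  #define VEC_LOAD_BE_NOSWAP(in_ptr, offs) \
    (asm_load_be_noswap ((offs) * 16, (const unsigned char *)(in_ptr)))
  #define VEC_LOAD_BE(in_ptr, offs, bige_const) \
    (asm_be_swap (asm_load_be_noswap ((offs) * 16, (const void *)(in_ptr)), \
                  bige_const))

Previously the intrinsics-based helpers (vec_cipher_be & co.) left
instruction scheduling entirely to GCC; the volatile wrappers make the
source order of loads, byte-swaps and rounds stick, which is what the
tuned 8-block parallel paths rely on.
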
Benchmarks on POWER8 (ppc64le, ~3.8 GHz):
Before:
AES | nanosecs/byte mebibytes/sec cycles/byte
CBC enc | 1.06 ns/B 902.2 MiB/s 4.02 c/B
CBC dec | 0.208 ns/B 4585 MiB/s 0.790 c/B
CFB enc | 1.06 ns/B 900.4 MiB/s 4.02 c/B
CFB dec | 0.208 ns/B 4588 MiB/s 0.790 c/B
CTR enc | 0.238 ns/B 4007 MiB/s 0.904 c/B
CTR dec | 0.238 ns/B 4009 MiB/s 0.904 c/B
XTS enc | 0.492 ns/B 1937 MiB/s 1.87 c/B
XTS dec | 0.488 ns/B 1955 MiB/s 1.85 c/B
OCB enc | 0.243 ns/B 3928 MiB/s 0.922 c/B
OCB dec | 0.247 ns/B 3858 MiB/s 0.939 c/B
OCB auth | 0.213 ns/B 4482 MiB/s 0.809 c/B
After (cbc-dec & cfb-dec & xts & ocb ~6% faster, ctr ~11% faster):
AES | nanosecs/byte mebibytes/sec cycles/byte
CBC enc | 1.06 ns/B 902.1 MiB/s 4.02 c/B
CBC dec | 0.196 ns/B 4877 MiB/s 0.743 c/B
CFB enc | 1.06 ns/B 902.2 MiB/s 4.02 c/B
CFB dec | 0.195 ns/B 4889 MiB/s 0.741 c/B
CTR enc | 0.214 ns/B 4448 MiB/s 0.815 c/B
CTR dec | 0.214 ns/B 4452 MiB/s 0.814 c/B
XTS enc | 0.461 ns/B 2067 MiB/s 1.75 c/B
XTS dec | 0.456 ns/B 2092 MiB/s 1.73 c/B
OCB enc | 0.227 ns/B 4200 MiB/s 0.863 c/B
OCB dec | 0.234 ns/B 4072 MiB/s 0.890 c/B
OCB auth | 0.207 ns/B 4604 MiB/s 0.787 c/B
Benchmarks on POWER9 (ppc64le, ~3.8 GHz):
Before:
AES | nanosecs/byte mebibytes/sec cycles/byte
CBC enc | 1.04 ns/B 918.7 MiB/s 3.94 c/B
CBC dec | 0.240 ns/B 3982 MiB/s 0.910 c/B
CFB enc | 1.04 ns/B 917.6 MiB/s 3.95 c/B
CFB dec | 0.241 ns/B 3963 MiB/s 0.914 c/B
CTR enc | 0.249 ns/B 3835 MiB/s 0.945 c/B
CTR dec | 0.252 ns/B 3787 MiB/s 0.957 c/B
XTS enc | 0.505 ns/B 1889 MiB/s 1.92 c/B
XTS dec | 0.495 ns/B 1926 MiB/s 1.88 c/B
OCB enc | 0.303 ns/B 3152 MiB/s 1.15 c/B
OCB dec | 0.305 ns/B 3129 MiB/s 1.16 c/B
OCB auth | 0.265 ns/B 3595 MiB/s 1.01 c/B
After (cbc-dec & cfb-dec ~6% faster, ctr ~11% faster, ocb ~4% faster):
AES | nanosecs/byte mebibytes/sec cycles/byte
CBC enc | 1.04 ns/B 917.3 MiB/s 3.95 c/B
CBC dec | 0.225 ns/B 4234 MiB/s 0.856 c/B
CFB enc | 1.04 ns/B 917.8 MiB/s 3.95 c/B
CFB dec | 0.226 ns/B 4214 MiB/s 0.860 c/B
CTR enc | 0.221 ns/B 4306 MiB/s 0.842 c/B
CTR dec | 0.223 ns/B 4271 MiB/s 0.848 c/B
XTS enc | 0.503 ns/B 1897 MiB/s 1.91 c/B
XTS dec | 0.495 ns/B 1928 MiB/s 1.88 c/B
OCB enc | 0.288 ns/B 3309 MiB/s 1.10 c/B
OCB dec | 0.292 ns/B 3266 MiB/s 1.11 c/B
OCB auth | 0.267 ns/B 3570 MiB/s 1.02 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher')
-rw-r--r--   cipher/rijndael-ppc.c | 1839
1 file changed, 1112 insertions, 727 deletions
diff --git a/cipher/rijndael-ppc.c b/cipher/rijndael-ppc.c index 48a47edd..a8bcae46 100644 --- a/cipher/rijndael-ppc.c +++ b/cipher/rijndael-ppc.c @@ -51,17 +51,27 @@ typedef union #define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE -#define ALIGNED_LOAD(in_ptr) \ - (vec_aligned_ld (0, (const unsigned char *)(in_ptr))) +#define ALIGNED_LOAD(in_ptr, offs) \ + (asm_aligned_ld ((offs) * 16, (const void *)(in_ptr))) -#define ALIGNED_STORE(out_ptr, vec) \ - (vec_aligned_st ((vec), 0, (unsigned char *)(out_ptr))) +#define ALIGNED_STORE(out_ptr, offs, vec) \ + (asm_aligned_st ((vec), (offs) * 16, (void *)(out_ptr))) -#define VEC_LOAD_BE(in_ptr, bige_const) \ - (vec_load_be (0, (const unsigned char *)(in_ptr), bige_const)) +#define VEC_BE_SWAP(vec, bige_const) (asm_be_swap ((vec), (bige_const))) -#define VEC_STORE_BE(out_ptr, vec, bige_const) \ - (vec_store_be ((vec), 0, (unsigned char *)(out_ptr), bige_const)) +#define VEC_LOAD_BE(in_ptr, offs, bige_const) \ + (asm_be_swap (asm_load_be_noswap ((offs) * 16, (const void *)(in_ptr)), \ + bige_const)) + +#define VEC_LOAD_BE_NOSWAP(in_ptr, offs) \ + (asm_load_be_noswap ((offs) * 16, (const unsigned char *)(in_ptr))) + +#define VEC_STORE_BE(out_ptr, offs, vec, bige_const) \ + (asm_store_be_noswap (asm_be_swap ((vec), (bige_const)), (offs) * 16, \ + (void *)(out_ptr))) + +#define VEC_STORE_BE_NOSWAP(out_ptr, offs, vec) \ + (asm_store_be_noswap ((vec), (offs) * 16, (void *)(out_ptr))) #define ROUND_KEY_VARIABLES \ @@ -69,166 +79,257 @@ typedef union #define PRELOAD_ROUND_KEYS(nrounds) \ do { \ - rkey0 = ALIGNED_LOAD(&rk[0]); \ - rkeylast = ALIGNED_LOAD(&rk[nrounds]); \ + rkey0 = ALIGNED_LOAD (rk, 0); \ + rkeylast = ALIGNED_LOAD (rk, nrounds); \ } while (0) - #define AES_ENCRYPT(blk, nrounds) \ do { \ blk ^= rkey0; \ - blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[1])); \ - blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[2])); \ - blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[3])); \ - blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[4])); \ - blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[5])); \ - blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[6])); \ - blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[7])); \ - blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[8])); \ - blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[9])); \ + blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 1)); \ + blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 2)); \ + blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 3)); \ + blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 4)); \ + blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 5)); \ + blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 6)); \ + blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 7)); \ + blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 8)); \ + blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 9)); \ if (nrounds >= 12) \ { \ - blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[10])); \ - blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[11])); \ + blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 10)); \ + blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 11)); \ if (rounds > 12) \ { \ - blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[12])); \ - blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[13])); \ + blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 12)); \ + blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 13)); \ } \ } \ - blk = vec_cipherlast_be (blk, rkeylast); \ + blk = asm_cipherlast_be (blk, rkeylast); \ } while (0) - #define AES_DECRYPT(blk, nrounds) \ do { \ blk ^= rkey0; \ - blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[1])); \ - blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[2])); \ - blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[3])); \ - 
blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[4])); \ - blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[5])); \ - blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[6])); \ - blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[7])); \ - blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[8])); \ - blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[9])); \ + blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 1)); \ + blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 2)); \ + blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 3)); \ + blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 4)); \ + blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 5)); \ + blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 6)); \ + blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 7)); \ + blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 8)); \ + blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 9)); \ if (nrounds >= 12) \ { \ - blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[10])); \ - blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[11])); \ + blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 10)); \ + blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 11)); \ if (rounds > 12) \ { \ - blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[12])); \ - blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[13])); \ + blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 12)); \ + blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 13)); \ } \ } \ - blk = vec_ncipherlast_be (blk, rkeylast); \ + blk = asm_ncipherlast_be (blk, rkeylast); \ } while (0) +#define ROUND_KEY_VARIABLES_ALL \ + block rkey0, rkey1, rkey2, rkey3, rkey4, rkey5, rkey6, rkey7, rkey8, \ + rkey9, rkey10, rkey11, rkey12, rkey13, rkeylast + +#define PRELOAD_ROUND_KEYS_ALL(nrounds) \ + do { \ + rkey0 = ALIGNED_LOAD (rk, 0); \ + rkey1 = ALIGNED_LOAD (rk, 1); \ + rkey2 = ALIGNED_LOAD (rk, 2); \ + rkey3 = ALIGNED_LOAD (rk, 3); \ + rkey4 = ALIGNED_LOAD (rk, 4); \ + rkey5 = ALIGNED_LOAD (rk, 5); \ + rkey6 = ALIGNED_LOAD (rk, 6); \ + rkey7 = ALIGNED_LOAD (rk, 7); \ + rkey8 = ALIGNED_LOAD (rk, 8); \ + rkey9 = ALIGNED_LOAD (rk, 9); \ + if (nrounds >= 12) \ + { \ + rkey10 = ALIGNED_LOAD (rk, 10); \ + rkey11 = ALIGNED_LOAD (rk, 11); \ + if (rounds > 12) \ + { \ + rkey12 = ALIGNED_LOAD (rk, 12); \ + rkey13 = ALIGNED_LOAD (rk, 13); \ + } \ + } \ + rkeylast = ALIGNED_LOAD (rk, nrounds); \ + } while (0) + +#define AES_ENCRYPT_ALL(blk, nrounds) \ + do { \ + blk ^= rkey0; \ + blk = asm_cipher_be (blk, rkey1); \ + blk = asm_cipher_be (blk, rkey2); \ + blk = asm_cipher_be (blk, rkey3); \ + blk = asm_cipher_be (blk, rkey4); \ + blk = asm_cipher_be (blk, rkey5); \ + blk = asm_cipher_be (blk, rkey6); \ + blk = asm_cipher_be (blk, rkey7); \ + blk = asm_cipher_be (blk, rkey8); \ + blk = asm_cipher_be (blk, rkey9); \ + if (nrounds >= 12) \ + { \ + blk = asm_cipher_be (blk, rkey10); \ + blk = asm_cipher_be (blk, rkey11); \ + if (rounds > 12) \ + { \ + blk = asm_cipher_be (blk, rkey12); \ + blk = asm_cipher_be (blk, rkey13); \ + } \ + } \ + blk = asm_cipherlast_be (blk, rkeylast); \ + } while (0) + + +#ifdef WORDS_BIGENDIAN static const block vec_bswap32_const = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }; +#else +static const block vec_bswap32_const_neg = + { ~3, ~2, ~1, ~0, ~7, ~6, ~5, ~4, ~11, ~10, ~9, ~8, ~15, ~14, ~13, ~12 }; +#endif static ASM_FUNC_ATTR_INLINE block -vec_aligned_ld(unsigned long offset, const unsigned char *ptr) +asm_aligned_ld(unsigned long offset, const void *ptr) { -#ifndef WORDS_BIGENDIAN block vec; - __asm__ ("lvx %0,%1,%2\n\t" - : "=v" (vec) - : "r" (offset), "r" ((uintptr_t)ptr) - : "memory", "r0"); + __asm__ volatile ("lvx %0,%1,%2\n\t" + : "=v" (vec) + : "r" (offset), 
"r" ((uintptr_t)ptr) + : "memory", "r0"); return vec; -#else - return vec_vsx_ld (offset, ptr); -#endif } +static ASM_FUNC_ATTR_INLINE void +asm_aligned_st(block vec, unsigned long offset, void *ptr) +{ + __asm__ volatile ("stvx %0,%1,%2\n\t" + : + : "v" (vec), "r" (offset), "r" ((uintptr_t)ptr) + : "memory", "r0"); +} static ASM_FUNC_ATTR_INLINE block -vec_load_be_const(void) +asm_load_be_const(void) { #ifndef WORDS_BIGENDIAN - return ~ALIGNED_LOAD(&vec_bswap32_const); + return ALIGNED_LOAD (&vec_bswap32_const_neg, 0); #else static const block vec_dummy = { 0 }; return vec_dummy; #endif } - static ASM_FUNC_ATTR_INLINE block -vec_load_be(unsigned long offset, const unsigned char *ptr, - block be_bswap_const) +asm_vperm1(block vec, block mask) { -#ifndef WORDS_BIGENDIAN - block vec; - /* GCC vec_vsx_ld is generating two instructions on little-endian. Use - * lxvw4x directly instead. */ - __asm__ ("lxvw4x %x0,%1,%2\n\t" - : "=wa" (vec) - : "r" (offset), "r" ((uintptr_t)ptr) - : "memory", "r0"); - __asm__ ("vperm %0,%1,%1,%2\n\t" - : "=v" (vec) - : "v" (vec), "v" (be_bswap_const)); - return vec; -#else - (void)be_bswap_const; - return vec_vsx_ld (offset, ptr); -#endif + block o; + __asm__ volatile ("vperm %0,%1,%1,%2\n\t" + : "=v" (o) + : "v" (vec), "v" (mask)); + return o; } - -static ASM_FUNC_ATTR_INLINE void -vec_aligned_st(block vec, unsigned long offset, unsigned char *ptr) +static ASM_FUNC_ATTR_INLINE block +asm_be_swap(block vec, block be_bswap_const) { + (void)be_bswap_const; #ifndef WORDS_BIGENDIAN - __asm__ ("stvx %0,%1,%2\n\t" - : - : "v" (vec), "r" (offset), "r" ((uintptr_t)ptr) - : "memory", "r0"); + return asm_vperm1 (vec, be_bswap_const); #else - vec_vsx_st (vec, offset, ptr); + return vec; #endif } +static ASM_FUNC_ATTR_INLINE block +asm_load_be_noswap(unsigned long offset, const void *ptr) +{ + block vec; + __asm__ volatile ("lxvw4x %x0,%1,%2\n\t" + : "=wa" (vec) + : "r" (offset), "r" ((uintptr_t)ptr) + : "memory", "r0"); + /* NOTE: vec needs to be be-swapped using 'asm_be_swap' by caller */ + return vec; +} static ASM_FUNC_ATTR_INLINE void -vec_store_be(block vec, unsigned long offset, unsigned char *ptr, - block be_bswap_const) +asm_store_be_noswap(block vec, unsigned long offset, void *ptr) { -#ifndef WORDS_BIGENDIAN - /* GCC vec_vsx_st is generating two instructions on little-endian. Use - * stxvw4x directly instead. */ - __asm__ ("vperm %0,%1,%1,%2\n\t" - : "=v" (vec) - : "v" (vec), "v" (be_bswap_const)); - __asm__ ("stxvw4x %x0,%1,%2\n\t" - : - : "wa" (vec), "r" (offset), "r" ((uintptr_t)ptr) - : "memory", "r0"); -#else - (void)be_bswap_const; - vec_vsx_st (vec, offset, ptr); -#endif + /* NOTE: vec be-swapped using 'asm_be_swap' by caller */ + __asm__ volatile ("stxvw4x %x0,%1,%2\n\t" + : + : "wa" (vec), "r" (offset), "r" ((uintptr_t)ptr) + : "memory", "r0"); } +static ASM_FUNC_ATTR_INLINE block +asm_add_uint128(block a, block b) +{ + block res; + __asm__ volatile ("vadduqm %0,%1,%2\n\t" + : "=v" (res) + : "v" (a), "v" (b)); + return res; +} static ASM_FUNC_ATTR_INLINE block -vec_add_uint128(block a, block b) +asm_xor(block a, block b) { -#if 1 block res; - /* Use assembly as GCC (v8.3) generates slow code for vec_vadduqm. 
*/ - __asm__ ("vadduqm %0,%1,%2\n\t" - : "=v" (res) - : "v" (a), "v" (b)); + __asm__ volatile ("vxor %0,%1,%2\n\t" + : "=v" (res) + : "v" (a), "v" (b)); return res; -#else - return (block)vec_vadduqm((vector __uint128_t)a, (vector __uint128_t)b); -#endif +} + +static ASM_FUNC_ATTR_INLINE block +asm_cipher_be(block b, block rk) +{ + block o; + __asm__ volatile ("vcipher %0, %1, %2\n\t" + : "=v" (o) + : "v" (b), "v" (rk)); + return o; +} + +static ASM_FUNC_ATTR_INLINE block +asm_cipherlast_be(block b, block rk) +{ + block o; + __asm__ volatile ("vcipherlast %0, %1, %2\n\t" + : "=v" (o) + : "v" (b), "v" (rk)); + return o; +} + +static ASM_FUNC_ATTR_INLINE block +asm_ncipher_be(block b, block rk) +{ + block o; + __asm__ volatile ("vncipher %0, %1, %2\n\t" + : "=v" (o) + : "v" (b), "v" (rk)); + return o; +} + +static ASM_FUNC_ATTR_INLINE block +asm_ncipherlast_be(block b, block rk) +{ + block o; + __asm__ volatile ("vncipherlast %0, %1, %2\n\t" + : "=v" (o) + : "v" (b), "v" (rk)); + return o; } @@ -250,7 +351,7 @@ _gcry_aes_sbox4_ppc8(u32 fourbytes) void _gcry_aes_ppc8_setkey (RIJNDAEL_context *ctx, const byte *key) { - const block bige_const = vec_load_be_const(); + const block bige_const = asm_load_be_const(); union { PROPERLY_ALIGNED_TYPE dummy; @@ -345,11 +446,11 @@ _gcry_aes_ppc8_setkey (RIJNDAEL_context *ctx, const byte *key) for (r = 0; r <= rounds; r++) { #ifndef WORDS_BIGENDIAN - VEC_STORE_BE(&ekey[r], ALIGNED_LOAD(&ekey[r]), bige_const); + VEC_STORE_BE(ekey, r, ALIGNED_LOAD (ekey, r), bige_const); #else - block rvec = ALIGNED_LOAD(&ekey[r]); - ALIGNED_STORE(&ekey[r], - vec_perm(rvec, rvec, vec_bswap32_const)); + block rvec = ALIGNED_LOAD (ekey, r); + ALIGNED_STORE (ekey, r, + vec_perm(rvec, rvec, vec_bswap32_const)); (void)bige_const; #endif } @@ -378,7 +479,7 @@ aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx) rr = rounds; for (r = 0, rr = rounds; r <= rounds; r++, rr--) { - ALIGNED_STORE(&dkey[r], ALIGNED_LOAD(&ekey[rr])); + ALIGNED_STORE (dkey, r, ALIGNED_LOAD (ekey, rr)); } } @@ -394,18 +495,18 @@ unsigned int _gcry_aes_ppc8_encrypt (const RIJNDAEL_context *ctx, unsigned char *out, const unsigned char *in) { - const block bige_const = vec_load_be_const(); + const block bige_const = asm_load_be_const(); const u128_t *rk = (u128_t *)&ctx->keyschenc; int rounds = ctx->rounds; ROUND_KEY_VARIABLES; block b; - b = VEC_LOAD_BE (in, bige_const); + b = VEC_LOAD_BE (in, 0, bige_const); PRELOAD_ROUND_KEYS (rounds); AES_ENCRYPT (b, rounds); - VEC_STORE_BE (out, b, bige_const); + VEC_STORE_BE (out, 0, b, bige_const); return 0; /* does not use stack */ } @@ -415,18 +516,18 @@ unsigned int _gcry_aes_ppc8_decrypt (const RIJNDAEL_context *ctx, unsigned char *out, const unsigned char *in) { - const block bige_const = vec_load_be_const(); + const block bige_const = asm_load_be_const(); const u128_t *rk = (u128_t *)&ctx->keyschdec; int rounds = ctx->rounds; ROUND_KEY_VARIABLES; block b; - b = VEC_LOAD_BE (in, bige_const); + b = VEC_LOAD_BE (in, 0, bige_const); PRELOAD_ROUND_KEYS (rounds); AES_DECRYPT (b, rounds); - VEC_STORE_BE (out, b, bige_const); + VEC_STORE_BE (out, 0, b, bige_const); return 0; /* does not use stack */ } @@ -436,41 +537,41 @@ void _gcry_aes_ppc8_cfb_enc (void *context, unsigned char *iv_arg, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { - const block bige_const = vec_load_be_const(); + const block bige_const = asm_load_be_const(); RIJNDAEL_context *ctx = context; const u128_t *rk = (u128_t *)&ctx->keyschenc; const u128_t *in = (const u128_t *)inbuf_arg; u128_t 
*out = (u128_t *)outbuf_arg; int rounds = ctx->rounds; - ROUND_KEY_VARIABLES; + ROUND_KEY_VARIABLES_ALL; block rkeylast_orig; block iv; - iv = VEC_LOAD_BE (iv_arg, bige_const); + iv = VEC_LOAD_BE (iv_arg, 0, bige_const); - PRELOAD_ROUND_KEYS (rounds); + PRELOAD_ROUND_KEYS_ALL (rounds); rkeylast_orig = rkeylast; for (; nblocks; nblocks--) { - rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, bige_const); + rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, 0, bige_const); - AES_ENCRYPT (iv, rounds); + AES_ENCRYPT_ALL (iv, rounds); - VEC_STORE_BE (out, iv, bige_const); + VEC_STORE_BE (out, 0, iv, bige_const); out++; in++; } - VEC_STORE_BE (iv_arg, iv, bige_const); + VEC_STORE_BE (iv_arg, 0, iv, bige_const); } void _gcry_aes_ppc8_cfb_dec (void *context, unsigned char *iv_arg, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { - const block bige_const = vec_load_be_const(); + const block bige_const = asm_load_be_const(); RIJNDAEL_context *ctx = context; const u128_t *rk = (u128_t *)&ctx->keyschenc; const u128_t *in = (const u128_t *)inbuf_arg; @@ -483,7 +584,7 @@ void _gcry_aes_ppc8_cfb_dec (void *context, unsigned char *iv_arg, block b0, b1, b2, b3, b4, b5, b6, b7; block rkey; - iv = VEC_LOAD_BE (iv_arg, bige_const); + iv = VEC_LOAD_BE (iv_arg, 0, bige_const); PRELOAD_ROUND_KEYS (rounds); rkeylast_orig = rkeylast; @@ -491,34 +592,42 @@ void _gcry_aes_ppc8_cfb_dec (void *context, unsigned char *iv_arg, for (; nblocks >= 8; nblocks -= 8) { in0 = iv; - in1 = VEC_LOAD_BE (in + 0, bige_const); - in2 = VEC_LOAD_BE (in + 1, bige_const); - in3 = VEC_LOAD_BE (in + 2, bige_const); - in4 = VEC_LOAD_BE (in + 3, bige_const); - in5 = VEC_LOAD_BE (in + 4, bige_const); - in6 = VEC_LOAD_BE (in + 5, bige_const); - in7 = VEC_LOAD_BE (in + 6, bige_const); - iv = VEC_LOAD_BE (in + 7, bige_const); - - b0 = rkey0 ^ in0; - b1 = rkey0 ^ in1; - b2 = rkey0 ^ in2; - b3 = rkey0 ^ in3; - b4 = rkey0 ^ in4; - b5 = rkey0 ^ in5; - b6 = rkey0 ^ in6; - b7 = rkey0 ^ in7; + in1 = VEC_LOAD_BE_NOSWAP (in, 0); + in2 = VEC_LOAD_BE_NOSWAP (in, 1); + in3 = VEC_LOAD_BE_NOSWAP (in, 2); + in4 = VEC_LOAD_BE_NOSWAP (in, 3); + in1 = VEC_BE_SWAP (in1, bige_const); + in2 = VEC_BE_SWAP (in2, bige_const); + in5 = VEC_LOAD_BE_NOSWAP (in, 4); + in6 = VEC_LOAD_BE_NOSWAP (in, 5); + in3 = VEC_BE_SWAP (in3, bige_const); + in4 = VEC_BE_SWAP (in4, bige_const); + in7 = VEC_LOAD_BE_NOSWAP (in, 6); + iv = VEC_LOAD_BE_NOSWAP (in, 7); + in += 8; + in5 = VEC_BE_SWAP (in5, bige_const); + in6 = VEC_BE_SWAP (in6, bige_const); + b0 = asm_xor (rkey0, in0); + b1 = asm_xor (rkey0, in1); + in7 = VEC_BE_SWAP (in7, bige_const); + iv = VEC_BE_SWAP (iv, bige_const); + b2 = asm_xor (rkey0, in2); + b3 = asm_xor (rkey0, in3); + b4 = asm_xor (rkey0, in4); + b5 = asm_xor (rkey0, in5); + b6 = asm_xor (rkey0, in6); + b7 = asm_xor (rkey0, in7); #define DO_ROUND(r) \ - rkey = ALIGNED_LOAD(&rk[r]); \ - b0 = vec_cipher_be (b0, rkey); \ - b1 = vec_cipher_be (b1, rkey); \ - b2 = vec_cipher_be (b2, rkey); \ - b3 = vec_cipher_be (b3, rkey); \ - b4 = vec_cipher_be (b4, rkey); \ - b5 = vec_cipher_be (b5, rkey); \ - b6 = vec_cipher_be (b6, rkey); \ - b7 = vec_cipher_be (b7, rkey); + rkey = ALIGNED_LOAD (rk, r); \ + b0 = asm_cipher_be (b0, rkey); \ + b1 = asm_cipher_be (b1, rkey); \ + b2 = asm_cipher_be (b2, rkey); \ + b3 = asm_cipher_be (b3, rkey); \ + b4 = asm_cipher_be (b4, rkey); \ + b5 = asm_cipher_be (b5, rkey); \ + b6 = asm_cipher_be (b6, rkey); \ + b7 = asm_cipher_be (b7, rkey); DO_ROUND(1); DO_ROUND(2); @@ -542,48 +651,60 @@ void _gcry_aes_ppc8_cfb_dec (void *context, unsigned char 
*iv_arg, #undef DO_ROUND - rkey = rkeylast; - b0 = vec_cipherlast_be (b0, rkey ^ in1); - b1 = vec_cipherlast_be (b1, rkey ^ in2); - b2 = vec_cipherlast_be (b2, rkey ^ in3); - b3 = vec_cipherlast_be (b3, rkey ^ in4); - b4 = vec_cipherlast_be (b4, rkey ^ in5); - b5 = vec_cipherlast_be (b5, rkey ^ in6); - b6 = vec_cipherlast_be (b6, rkey ^ in7); - b7 = vec_cipherlast_be (b7, rkey ^ iv); - - VEC_STORE_BE (out + 0, b0, bige_const); - VEC_STORE_BE (out + 1, b1, bige_const); - VEC_STORE_BE (out + 2, b2, bige_const); - VEC_STORE_BE (out + 3, b3, bige_const); - VEC_STORE_BE (out + 4, b4, bige_const); - VEC_STORE_BE (out + 5, b5, bige_const); - VEC_STORE_BE (out + 6, b6, bige_const); - VEC_STORE_BE (out + 7, b7, bige_const); - - in += 8; + in1 = asm_xor (rkeylast, in1); + in2 = asm_xor (rkeylast, in2); + in3 = asm_xor (rkeylast, in3); + in4 = asm_xor (rkeylast, in4); + b0 = asm_cipherlast_be (b0, in1); + b1 = asm_cipherlast_be (b1, in2); + in5 = asm_xor (rkeylast, in5); + in6 = asm_xor (rkeylast, in6); + b2 = asm_cipherlast_be (b2, in3); + b3 = asm_cipherlast_be (b3, in4); + in7 = asm_xor (rkeylast, in7); + in0 = asm_xor (rkeylast, iv); + b0 = VEC_BE_SWAP (b0, bige_const); + b1 = VEC_BE_SWAP (b1, bige_const); + b4 = asm_cipherlast_be (b4, in5); + b5 = asm_cipherlast_be (b5, in6); + b2 = VEC_BE_SWAP (b2, bige_const); + b3 = VEC_BE_SWAP (b3, bige_const); + b6 = asm_cipherlast_be (b6, in7); + b7 = asm_cipherlast_be (b7, in0); + b4 = VEC_BE_SWAP (b4, bige_const); + b5 = VEC_BE_SWAP (b5, bige_const); + b6 = VEC_BE_SWAP (b6, bige_const); + b7 = VEC_BE_SWAP (b7, bige_const); + VEC_STORE_BE_NOSWAP (out, 0, b0); + VEC_STORE_BE_NOSWAP (out, 1, b1); + VEC_STORE_BE_NOSWAP (out, 2, b2); + VEC_STORE_BE_NOSWAP (out, 3, b3); + VEC_STORE_BE_NOSWAP (out, 4, b4); + VEC_STORE_BE_NOSWAP (out, 5, b5); + VEC_STORE_BE_NOSWAP (out, 6, b6); + VEC_STORE_BE_NOSWAP (out, 7, b7); out += 8; } if (nblocks >= 4) { in0 = iv; - in1 = VEC_LOAD_BE (in + 0, bige_const); - in2 = VEC_LOAD_BE (in + 1, bige_const); - in3 = VEC_LOAD_BE (in + 2, bige_const); - iv = VEC_LOAD_BE (in + 3, bige_const); + in1 = VEC_LOAD_BE (in, 0, bige_const); + in2 = VEC_LOAD_BE (in, 1, bige_const); + in3 = VEC_LOAD_BE (in, 2, bige_const); + iv = VEC_LOAD_BE (in, 3, bige_const); - b0 = rkey0 ^ in0; - b1 = rkey0 ^ in1; - b2 = rkey0 ^ in2; - b3 = rkey0 ^ in3; + b0 = asm_xor (rkey0, in0); + b1 = asm_xor (rkey0, in1); + b2 = asm_xor (rkey0, in2); + b3 = asm_xor (rkey0, in3); #define DO_ROUND(r) \ - rkey = ALIGNED_LOAD(&rk[r]); \ - b0 = vec_cipher_be (b0, rkey); \ - b1 = vec_cipher_be (b1, rkey); \ - b2 = vec_cipher_be (b2, rkey); \ - b3 = vec_cipher_be (b3, rkey); + rkey = ALIGNED_LOAD (rk, r); \ + b0 = asm_cipher_be (b0, rkey); \ + b1 = asm_cipher_be (b1, rkey); \ + b2 = asm_cipher_be (b2, rkey); \ + b3 = asm_cipher_be (b3, rkey); DO_ROUND(1); DO_ROUND(2); @@ -607,16 +728,18 @@ void _gcry_aes_ppc8_cfb_dec (void *context, unsigned char *iv_arg, #undef DO_ROUND - rkey = rkeylast; - b0 = vec_cipherlast_be (b0, rkey ^ in1); - b1 = vec_cipherlast_be (b1, rkey ^ in2); - b2 = vec_cipherlast_be (b2, rkey ^ in3); - b3 = vec_cipherlast_be (b3, rkey ^ iv); - - VEC_STORE_BE (out + 0, b0, bige_const); - VEC_STORE_BE (out + 1, b1, bige_const); - VEC_STORE_BE (out + 2, b2, bige_const); - VEC_STORE_BE (out + 3, b3, bige_const); + in1 = asm_xor (rkeylast, in1); + in2 = asm_xor (rkeylast, in2); + in3 = asm_xor (rkeylast, in3); + in0 = asm_xor (rkeylast, iv); + b0 = asm_cipherlast_be (b0, in1); + b1 = asm_cipherlast_be (b1, in2); + b2 = asm_cipherlast_be (b2, in3); + b3 = 
asm_cipherlast_be (b3, in0); + VEC_STORE_BE (out, 0, b0, bige_const); + VEC_STORE_BE (out, 1, b1, bige_const); + VEC_STORE_BE (out, 2, b2, bige_const); + VEC_STORE_BE (out, 3, b3, bige_const); in += 4; out += 4; @@ -625,20 +748,20 @@ void _gcry_aes_ppc8_cfb_dec (void *context, unsigned char *iv_arg, for (; nblocks; nblocks--) { - bin = VEC_LOAD_BE (in, bige_const); + bin = VEC_LOAD_BE (in, 0, bige_const); rkeylast = rkeylast_orig ^ bin; b = iv; iv = bin; AES_ENCRYPT (b, rounds); - VEC_STORE_BE (out, b, bige_const); + VEC_STORE_BE (out, 0, b, bige_const); out++; in++; } - VEC_STORE_BE (iv_arg, iv, bige_const); + VEC_STORE_BE (iv_arg, 0, iv, bige_const); } @@ -646,41 +769,41 @@ void _gcry_aes_ppc8_cbc_enc (void *context, unsigned char *iv_arg, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int cbc_mac) { - const block bige_const = vec_load_be_const(); + const block bige_const = asm_load_be_const(); RIJNDAEL_context *ctx = context; const u128_t *rk = (u128_t *)&ctx->keyschenc; const u128_t *in = (const u128_t *)inbuf_arg; u128_t *out = (u128_t *)outbuf_arg; int rounds = ctx->rounds; - ROUND_KEY_VARIABLES; + ROUND_KEY_VARIABLES_ALL; block lastiv, b; + unsigned int outadd = !cbc_mac; - lastiv = VEC_LOAD_BE (iv_arg, bige_const); + lastiv = VEC_LOAD_BE (iv_arg, 0, bige_const); - PRELOAD_ROUND_KEYS (rounds); + PRELOAD_ROUND_KEYS_ALL (rounds); for (; nblocks; nblocks--) { - b = lastiv ^ VEC_LOAD_BE (in, bige_const); + b = lastiv ^ VEC_LOAD_BE (in, 0, bige_const); - AES_ENCRYPT (b, rounds); + AES_ENCRYPT_ALL (b, rounds); lastiv = b; - VEC_STORE_BE (out, b, bige_const); + VEC_STORE_BE (out, 0, b, bige_const); in++; - if (!cbc_mac) - out++; + out += outadd; } - VEC_STORE_BE (iv_arg, lastiv, bige_const); + VEC_STORE_BE (iv_arg, 0, lastiv, bige_const); } void _gcry_aes_ppc8_cbc_dec (void *context, unsigned char *iv_arg, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { - const block bige_const = vec_load_be_const(); + const block bige_const = asm_load_be_const(); RIJNDAEL_context *ctx = context; const u128_t *rk = (u128_t *)&ctx->keyschdec; const u128_t *in = (const u128_t *)inbuf_arg; @@ -699,41 +822,49 @@ void _gcry_aes_ppc8_cbc_dec (void *context, unsigned char *iv_arg, ctx->decryption_prepared = 1; } - iv = VEC_LOAD_BE (iv_arg, bige_const); + iv = VEC_LOAD_BE (iv_arg, 0, bige_const); PRELOAD_ROUND_KEYS (rounds); rkeylast_orig = rkeylast; for (; nblocks >= 8; nblocks -= 8) { - in0 = VEC_LOAD_BE (in + 0, bige_const); - in1 = VEC_LOAD_BE (in + 1, bige_const); - in2 = VEC_LOAD_BE (in + 2, bige_const); - in3 = VEC_LOAD_BE (in + 3, bige_const); - in4 = VEC_LOAD_BE (in + 4, bige_const); - in5 = VEC_LOAD_BE (in + 5, bige_const); - in6 = VEC_LOAD_BE (in + 6, bige_const); - in7 = VEC_LOAD_BE (in + 7, bige_const); - - b0 = rkey0 ^ in0; - b1 = rkey0 ^ in1; - b2 = rkey0 ^ in2; - b3 = rkey0 ^ in3; - b4 = rkey0 ^ in4; - b5 = rkey0 ^ in5; - b6 = rkey0 ^ in6; - b7 = rkey0 ^ in7; + in0 = VEC_LOAD_BE_NOSWAP (in, 0); + in1 = VEC_LOAD_BE_NOSWAP (in, 1); + in2 = VEC_LOAD_BE_NOSWAP (in, 2); + in3 = VEC_LOAD_BE_NOSWAP (in, 3); + in0 = VEC_BE_SWAP (in0, bige_const); + in1 = VEC_BE_SWAP (in1, bige_const); + in4 = VEC_LOAD_BE_NOSWAP (in, 4); + in5 = VEC_LOAD_BE_NOSWAP (in, 5); + in2 = VEC_BE_SWAP (in2, bige_const); + in3 = VEC_BE_SWAP (in3, bige_const); + in6 = VEC_LOAD_BE_NOSWAP (in, 6); + in7 = VEC_LOAD_BE_NOSWAP (in, 7); + in += 8; + b0 = asm_xor (rkey0, in0); + b1 = asm_xor (rkey0, in1); + in4 = VEC_BE_SWAP (in4, bige_const); + in5 = VEC_BE_SWAP (in5, bige_const); + b2 = asm_xor (rkey0, in2); + b3 = 
asm_xor (rkey0, in3); + in6 = VEC_BE_SWAP (in6, bige_const); + in7 = VEC_BE_SWAP (in7, bige_const); + b4 = asm_xor (rkey0, in4); + b5 = asm_xor (rkey0, in5); + b6 = asm_xor (rkey0, in6); + b7 = asm_xor (rkey0, in7); #define DO_ROUND(r) \ - rkey = ALIGNED_LOAD(&rk[r]); \ - b0 = vec_ncipher_be (b0, rkey); \ - b1 = vec_ncipher_be (b1, rkey); \ - b2 = vec_ncipher_be (b2, rkey); \ - b3 = vec_ncipher_be (b3, rkey); \ - b4 = vec_ncipher_be (b4, rkey); \ - b5 = vec_ncipher_be (b5, rkey); \ - b6 = vec_ncipher_be (b6, rkey); \ - b7 = vec_ncipher_be (b7, rkey); + rkey = ALIGNED_LOAD (rk, r); \ + b0 = asm_ncipher_be (b0, rkey); \ + b1 = asm_ncipher_be (b1, rkey); \ + b2 = asm_ncipher_be (b2, rkey); \ + b3 = asm_ncipher_be (b3, rkey); \ + b4 = asm_ncipher_be (b4, rkey); \ + b5 = asm_ncipher_be (b5, rkey); \ + b6 = asm_ncipher_be (b6, rkey); \ + b7 = asm_ncipher_be (b7, rkey); DO_ROUND(1); DO_ROUND(2); @@ -757,48 +888,60 @@ void _gcry_aes_ppc8_cbc_dec (void *context, unsigned char *iv_arg, #undef DO_ROUND - rkey = rkeylast; - b0 = vec_ncipherlast_be (b0, rkey ^ iv); - b1 = vec_ncipherlast_be (b1, rkey ^ in0); - b2 = vec_ncipherlast_be (b2, rkey ^ in1); - b3 = vec_ncipherlast_be (b3, rkey ^ in2); - b4 = vec_ncipherlast_be (b4, rkey ^ in3); - b5 = vec_ncipherlast_be (b5, rkey ^ in4); - b6 = vec_ncipherlast_be (b6, rkey ^ in5); - b7 = vec_ncipherlast_be (b7, rkey ^ in6); + iv = asm_xor (rkeylast, iv); + in0 = asm_xor (rkeylast, in0); + in1 = asm_xor (rkeylast, in1); + in2 = asm_xor (rkeylast, in2); + b0 = asm_ncipherlast_be (b0, iv); iv = in7; - - VEC_STORE_BE (out + 0, b0, bige_const); - VEC_STORE_BE (out + 1, b1, bige_const); - VEC_STORE_BE (out + 2, b2, bige_const); - VEC_STORE_BE (out + 3, b3, bige_const); - VEC_STORE_BE (out + 4, b4, bige_const); - VEC_STORE_BE (out + 5, b5, bige_const); - VEC_STORE_BE (out + 6, b6, bige_const); - VEC_STORE_BE (out + 7, b7, bige_const); - - in += 8; + b1 = asm_ncipherlast_be (b1, in0); + in3 = asm_xor (rkeylast, in3); + in4 = asm_xor (rkeylast, in4); + b2 = asm_ncipherlast_be (b2, in1); + b3 = asm_ncipherlast_be (b3, in2); + in5 = asm_xor (rkeylast, in5); + in6 = asm_xor (rkeylast, in6); + b0 = VEC_BE_SWAP (b0, bige_const); + b1 = VEC_BE_SWAP (b1, bige_const); + b4 = asm_ncipherlast_be (b4, in3); + b5 = asm_ncipherlast_be (b5, in4); + b2 = VEC_BE_SWAP (b2, bige_const); + b3 = VEC_BE_SWAP (b3, bige_const); + b6 = asm_ncipherlast_be (b6, in5); + b7 = asm_ncipherlast_be (b7, in6); + b4 = VEC_BE_SWAP (b4, bige_const); + b5 = VEC_BE_SWAP (b5, bige_const); + b6 = VEC_BE_SWAP (b6, bige_const); + b7 = VEC_BE_SWAP (b7, bige_const); + VEC_STORE_BE_NOSWAP (out, 0, b0); + VEC_STORE_BE_NOSWAP (out, 1, b1); + VEC_STORE_BE_NOSWAP (out, 2, b2); + VEC_STORE_BE_NOSWAP (out, 3, b3); + VEC_STORE_BE_NOSWAP (out, 4, b4); + VEC_STORE_BE_NOSWAP (out, 5, b5); + VEC_STORE_BE_NOSWAP (out, 6, b6); + VEC_STORE_BE_NOSWAP (out, 7, b7); out += 8; } if (nblocks >= 4) { - in0 = VEC_LOAD_BE (in + 0, bige_const); - in1 = VEC_LOAD_BE (in + 1, bige_const); - in2 = VEC_LOAD_BE (in + 2, bige_const); - in3 = VEC_LOAD_BE (in + 3, bige_const); + in0 = VEC_LOAD_BE (in, 0, bige_const); + in1 = VEC_LOAD_BE (in, 1, bige_const); + in2 = VEC_LOAD_BE (in, 2, bige_const); + in3 = VEC_LOAD_BE (in, 3, bige_const); - b0 = rkey0 ^ in0; - b1 = rkey0 ^ in1; - b2 = rkey0 ^ in2; - b3 = rkey0 ^ in3; + b0 = asm_xor (rkey0, in0); + b1 = asm_xor (rkey0, in1); + b2 = asm_xor (rkey0, in2); + b3 = asm_xor (rkey0, in3); #define DO_ROUND(r) \ - rkey = ALIGNED_LOAD(&rk[r]); \ - b0 = vec_ncipher_be (b0, rkey); \ - b1 = vec_ncipher_be 
(b1, rkey); \ - b2 = vec_ncipher_be (b2, rkey); \ - b3 = vec_ncipher_be (b3, rkey); + rkey = ALIGNED_LOAD (rk, r); \ + b0 = asm_ncipher_be (b0, rkey); \ + b1 = asm_ncipher_be (b1, rkey); \ + b2 = asm_ncipher_be (b2, rkey); \ + b3 = asm_ncipher_be (b3, rkey); DO_ROUND(1); DO_ROUND(2); @@ -822,17 +965,21 @@ void _gcry_aes_ppc8_cbc_dec (void *context, unsigned char *iv_arg, #undef DO_ROUND - rkey = rkeylast; - b0 = vec_ncipherlast_be (b0, rkey ^ iv); - b1 = vec_ncipherlast_be (b1, rkey ^ in0); - b2 = vec_ncipherlast_be (b2, rkey ^ in1); - b3 = vec_ncipherlast_be (b3, rkey ^ in2); + iv = asm_xor (rkeylast, iv); + in0 = asm_xor (rkeylast, in0); + in1 = asm_xor (rkeylast, in1); + in2 = asm_xor (rkeylast, in2); + + b0 = asm_ncipherlast_be (b0, iv); iv = in3; + b1 = asm_ncipherlast_be (b1, in0); + b2 = asm_ncipherlast_be (b2, in1); + b3 = asm_ncipherlast_be (b3, in2); - VEC_STORE_BE (out + 0, b0, bige_const); - VEC_STORE_BE (out + 1, b1, bige_const); - VEC_STORE_BE (out + 2, b2, bige_const); - VEC_STORE_BE (out + 3, b3, bige_const); + VEC_STORE_BE (out, 0, b0, bige_const); + VEC_STORE_BE (out, 1, b1, bige_const); + VEC_STORE_BE (out, 2, b2, bige_const); + VEC_STORE_BE (out, 3, b3, bige_const); in += 4; out += 4; @@ -843,17 +990,17 @@ void _gcry_aes_ppc8_cbc_dec (void *context, unsigned char *iv_arg, { rkeylast = rkeylast_orig ^ iv; - iv = VEC_LOAD_BE (in, bige_const); + iv = VEC_LOAD_BE (in, 0, bige_const); b = iv; AES_DECRYPT (b, rounds); - VEC_STORE_BE (out, b, bige_const); + VEC_STORE_BE (out, 0, b, bige_const); in++; out++; } - VEC_STORE_BE (iv_arg, iv, bige_const); + VEC_STORE_BE (iv_arg, 0, iv, bige_const); } @@ -863,7 +1010,7 @@ void _gcry_aes_ppc8_ctr_enc (void *context, unsigned char *ctr_arg, { static const unsigned char vec_one_const[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }; - const block bige_const = vec_load_be_const(); + const block bige_const = asm_load_be_const(); RIJNDAEL_context *ctx = context; const u128_t *rk = (u128_t *)&ctx->keyschenc; const u128_t *in = (const u128_t *)inbuf_arg; @@ -873,56 +1020,80 @@ void _gcry_aes_ppc8_ctr_enc (void *context, unsigned char *ctr_arg, block rkeylast_orig; block ctr, b, one; - ctr = VEC_LOAD_BE (ctr_arg, bige_const); - one = VEC_LOAD_BE (&vec_one_const, bige_const); + ctr = VEC_LOAD_BE (ctr_arg, 0, bige_const); + one = VEC_LOAD_BE (&vec_one_const, 0, bige_const); PRELOAD_ROUND_KEYS (rounds); rkeylast_orig = rkeylast; if (nblocks >= 4) { + block in0, in1, in2, in3, in4, in5, in6, in7; block b0, b1, b2, b3, b4, b5, b6, b7; block two, three, four; - block ctr4; block rkey; - two = vec_add_uint128 (one, one); - three = vec_add_uint128 (two, one); - four = vec_add_uint128 (two, two); + two = asm_add_uint128 (one, one); + three = asm_add_uint128 (two, one); + four = asm_add_uint128 (two, two); for (; nblocks >= 8; nblocks -= 8) { - ctr4 = vec_add_uint128 (ctr, four); - b0 = rkey0 ^ ctr; - b1 = rkey0 ^ vec_add_uint128 (ctr, one); - b2 = rkey0 ^ vec_add_uint128 (ctr, two); - b3 = rkey0 ^ vec_add_uint128 (ctr, three); - b4 = rkey0 ^ ctr4; - b5 = rkey0 ^ vec_add_uint128 (ctr4, one); - b6 = rkey0 ^ vec_add_uint128 (ctr4, two); - b7 = rkey0 ^ vec_add_uint128 (ctr4, three); - ctr = vec_add_uint128 (ctr4, four); + b1 = asm_add_uint128 (ctr, one); + b2 = asm_add_uint128 (ctr, two); + b3 = asm_add_uint128 (ctr, three); + b4 = asm_add_uint128 (ctr, four); + b5 = asm_add_uint128 (b1, four); + b6 = asm_add_uint128 (b2, four); + b7 = asm_add_uint128 (b3, four); + b0 = asm_xor (rkey0, ctr); + rkey = ALIGNED_LOAD (rk, 1); + ctr = asm_add_uint128 
(b4, four); + b1 = asm_xor (rkey0, b1); + b2 = asm_xor (rkey0, b2); + b3 = asm_xor (rkey0, b3); + b0 = asm_cipher_be (b0, rkey); + b1 = asm_cipher_be (b1, rkey); + b2 = asm_cipher_be (b2, rkey); + b3 = asm_cipher_be (b3, rkey); + b4 = asm_xor (rkey0, b4); + b5 = asm_xor (rkey0, b5); + b6 = asm_xor (rkey0, b6); + b7 = asm_xor (rkey0, b7); + b4 = asm_cipher_be (b4, rkey); + b5 = asm_cipher_be (b5, rkey); + b6 = asm_cipher_be (b6, rkey); + b7 = asm_cipher_be (b7, rkey); #define DO_ROUND(r) \ - rkey = ALIGNED_LOAD(&rk[r]); \ - b0 = vec_cipher_be (b0, rkey); \ - b1 = vec_cipher_be (b1, rkey); \ - b2 = vec_cipher_be (b2, rkey); \ - b3 = vec_cipher_be (b3, rkey); \ - b4 = vec_cipher_be (b4, rkey); \ - b5 = vec_cipher_be (b5, rkey); \ - b6 = vec_cipher_be (b6, rkey); \ - b7 = vec_cipher_be (b7, rkey); - - DO_ROUND(1); + rkey = ALIGNED_LOAD (rk, r); \ + b0 = asm_cipher_be (b0, rkey); \ + b1 = asm_cipher_be (b1, rkey); \ + b2 = asm_cipher_be (b2, rkey); \ + b3 = asm_cipher_be (b3, rkey); \ + b4 = asm_cipher_be (b4, rkey); \ + b5 = asm_cipher_be (b5, rkey); \ + b6 = asm_cipher_be (b6, rkey); \ + b7 = asm_cipher_be (b7, rkey); + + in0 = VEC_LOAD_BE_NOSWAP (in, 0); DO_ROUND(2); + in1 = VEC_LOAD_BE_NOSWAP (in, 1); DO_ROUND(3); + in2 = VEC_LOAD_BE_NOSWAP (in, 2); DO_ROUND(4); + in3 = VEC_LOAD_BE_NOSWAP (in, 3); DO_ROUND(5); + in4 = VEC_LOAD_BE_NOSWAP (in, 4); DO_ROUND(6); + in5 = VEC_LOAD_BE_NOSWAP (in, 5); DO_ROUND(7); + in6 = VEC_LOAD_BE_NOSWAP (in, 6); DO_ROUND(8); + in7 = VEC_LOAD_BE_NOSWAP (in, 7); + in += 8; DO_ROUND(9); + if (rounds >= 12) { DO_ROUND(10); @@ -936,43 +1107,68 @@ void _gcry_aes_ppc8_ctr_enc (void *context, unsigned char *ctr_arg, #undef DO_ROUND - rkey = rkeylast; - b0 = vec_cipherlast_be (b0, rkey ^ VEC_LOAD_BE (in + 0, bige_const)); - b1 = vec_cipherlast_be (b1, rkey ^ VEC_LOAD_BE (in + 1, bige_const)); - b2 = vec_cipherlast_be (b2, rkey ^ VEC_LOAD_BE (in + 2, bige_const)); - b3 = vec_cipherlast_be (b3, rkey ^ VEC_LOAD_BE (in + 3, bige_const)); - b4 = vec_cipherlast_be (b4, rkey ^ VEC_LOAD_BE (in + 4, bige_const)); - b5 = vec_cipherlast_be (b5, rkey ^ VEC_LOAD_BE (in + 5, bige_const)); - b6 = vec_cipherlast_be (b6, rkey ^ VEC_LOAD_BE (in + 6, bige_const)); - b7 = vec_cipherlast_be (b7, rkey ^ VEC_LOAD_BE (in + 7, bige_const)); - - VEC_STORE_BE (out + 0, b0, bige_const); - VEC_STORE_BE (out + 1, b1, bige_const); - VEC_STORE_BE (out + 2, b2, bige_const); - VEC_STORE_BE (out + 3, b3, bige_const); - VEC_STORE_BE (out + 4, b4, bige_const); - VEC_STORE_BE (out + 5, b5, bige_const); - VEC_STORE_BE (out + 6, b6, bige_const); - VEC_STORE_BE (out + 7, b7, bige_const); - - in += 8; + in0 = VEC_BE_SWAP (in0, bige_const); + in1 = VEC_BE_SWAP (in1, bige_const); + in2 = VEC_BE_SWAP (in2, bige_const); + in3 = VEC_BE_SWAP (in3, bige_const); + in4 = VEC_BE_SWAP (in4, bige_const); + in5 = VEC_BE_SWAP (in5, bige_const); + in6 = VEC_BE_SWAP (in6, bige_const); + in7 = VEC_BE_SWAP (in7, bige_const); + + in0 = asm_xor (rkeylast, in0); + in1 = asm_xor (rkeylast, in1); + in2 = asm_xor (rkeylast, in2); + in3 = asm_xor (rkeylast, in3); + b0 = asm_cipherlast_be (b0, in0); + b1 = asm_cipherlast_be (b1, in1); + in4 = asm_xor (rkeylast, in4); + in5 = asm_xor (rkeylast, in5); + b2 = asm_cipherlast_be (b2, in2); + b3 = asm_cipherlast_be (b3, in3); + in6 = asm_xor (rkeylast, in6); + in7 = asm_xor (rkeylast, in7); + b4 = asm_cipherlast_be (b4, in4); + b5 = asm_cipherlast_be (b5, in5); + b6 = asm_cipherlast_be (b6, in6); + b7 = asm_cipherlast_be (b7, in7); + + b0 = VEC_BE_SWAP (b0, bige_const); + b1 = VEC_BE_SWAP 
(b1, bige_const); + b2 = VEC_BE_SWAP (b2, bige_const); + b3 = VEC_BE_SWAP (b3, bige_const); + b4 = VEC_BE_SWAP (b4, bige_const); + b5 = VEC_BE_SWAP (b5, bige_const); + b6 = VEC_BE_SWAP (b6, bige_const); + b7 = VEC_BE_SWAP (b7, bige_const); + VEC_STORE_BE_NOSWAP (out, 0, b0); + VEC_STORE_BE_NOSWAP (out, 1, b1); + VEC_STORE_BE_NOSWAP (out, 2, b2); + VEC_STORE_BE_NOSWAP (out, 3, b3); + VEC_STORE_BE_NOSWAP (out, 4, b4); + VEC_STORE_BE_NOSWAP (out, 5, b5); + VEC_STORE_BE_NOSWAP (out, 6, b6); + VEC_STORE_BE_NOSWAP (out, 7, b7); out += 8; } if (nblocks >= 4) { - b0 = rkey0 ^ ctr; - b1 = rkey0 ^ vec_add_uint128 (ctr, one); - b2 = rkey0 ^ vec_add_uint128 (ctr, two); - b3 = rkey0 ^ vec_add_uint128 (ctr, three); - ctr = vec_add_uint128 (ctr, four); + b1 = asm_add_uint128 (ctr, one); + b2 = asm_add_uint128 (ctr, two); + b3 = asm_add_uint128 (ctr, three); + b0 = asm_xor (rkey0, ctr); + ctr = asm_add_uint128 (ctr, four); + b1 = asm_xor (rkey0, b1); + b2 = asm_xor (rkey0, b2); + b3 = asm_xor (rkey0, b3); #define DO_ROUND(r) \ - rkey = ALIGNED_LOAD(&rk[r]); \ - b0 = vec_cipher_be (b0, rkey); \ - b1 = vec_cipher_be (b1, rkey); \ - b2 = vec_cipher_be (b2, rkey); \ - b3 = vec_cipher_be (b3, rkey); + rkey = ALIGNED_LOAD (rk, r); \ + b0 = asm_cipher_be (b0, rkey); \ + b1 = asm_cipher_be (b1, rkey); \ + b2 = asm_cipher_be (b2, rkey); \ + b3 = asm_cipher_be (b3, rkey); DO_ROUND(1); DO_ROUND(2); @@ -982,6 +1178,12 @@ void _gcry_aes_ppc8_ctr_enc (void *context, unsigned char *ctr_arg, DO_ROUND(6); DO_ROUND(7); DO_ROUND(8); + + in0 = VEC_LOAD_BE (in, 0, bige_const); + in1 = VEC_LOAD_BE (in, 1, bige_const); + in2 = VEC_LOAD_BE (in, 2, bige_const); + in3 = VEC_LOAD_BE (in, 3, bige_const); + DO_ROUND(9); if (rounds >= 12) { @@ -996,16 +1198,21 @@ void _gcry_aes_ppc8_ctr_enc (void *context, unsigned char *ctr_arg, #undef DO_ROUND - rkey = rkeylast; - b0 = vec_cipherlast_be (b0, rkey ^ VEC_LOAD_BE (in + 0, bige_const)); - b1 = vec_cipherlast_be (b1, rkey ^ VEC_LOAD_BE (in + 1, bige_const)); - b2 = vec_cipherlast_be (b2, rkey ^ VEC_LOAD_BE (in + 2, bige_const)); - b3 = vec_cipherlast_be (b3, rkey ^ VEC_LOAD_BE (in + 3, bige_const)); - - VEC_STORE_BE (out + 0, b0, bige_const); - VEC_STORE_BE (out + 1, b1, bige_const); - VEC_STORE_BE (out + 2, b2, bige_const); - VEC_STORE_BE (out + 3, b3, bige_const); + in0 = asm_xor (rkeylast, in0); + in1 = asm_xor (rkeylast, in1); + in2 = asm_xor (rkeylast, in2); + in3 = asm_xor (rkeylast, in3); + + b0 = asm_cipherlast_be (b0, in0); + b1 = asm_cipherlast_be (b1, in1); + b2 = asm_cipherlast_be (b2, in2); + b3 = asm_cipherlast_be (b3, in3); + + VEC_STORE_BE (out, 0, b0, bige_const); + VEC_STORE_BE (out, 1, b1, bige_const); + VEC_STORE_BE (out, 2, b2, bige_const); + VEC_STORE_BE (out, 3, b3, bige_const); + in += 4; out += 4; nblocks -= 4; @@ -1015,18 +1222,18 @@ void _gcry_aes_ppc8_ctr_enc (void *context, unsigned char *ctr_arg, for (; nblocks; nblocks--) { b = ctr; - ctr = vec_add_uint128 (ctr, one); - rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, bige_const); + ctr = asm_add_uint128 (ctr, one); + rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, 0, bige_const); AES_ENCRYPT (b, rounds); - VEC_STORE_BE (out, b, bige_const); + VEC_STORE_BE (out, 0, b, bige_const); out++; in++; } - VEC_STORE_BE (ctr_arg, ctr, bige_const); + VEC_STORE_BE (ctr_arg, 0, ctr, bige_const); } @@ -1034,7 +1241,7 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { - const block bige_const = vec_load_be_const(); + const block bige_const = 
asm_load_be_const(); RIJNDAEL_context *ctx = (void *)&c->context.c; const u128_t *in = (const u128_t *)inbuf_arg; u128_t *out = (u128_t *)outbuf_arg; @@ -1043,16 +1250,16 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, block l0, l1, l2, l; block b0, b1, b2, b3, b4, b5, b6, b7, b; block iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7; - block rkey; + block rkey, rkeylf; block ctr, iv; ROUND_KEY_VARIABLES; - iv = VEC_LOAD_BE (c->u_iv.iv, bige_const); - ctr = VEC_LOAD_BE (c->u_ctr.ctr, bige_const); + iv = VEC_LOAD_BE (c->u_iv.iv, 0, bige_const); + ctr = VEC_LOAD_BE (c->u_ctr.ctr, 0, bige_const); - l0 = VEC_LOAD_BE (c->u_mode.ocb.L[0], bige_const); - l1 = VEC_LOAD_BE (c->u_mode.ocb.L[1], bige_const); - l2 = VEC_LOAD_BE (c->u_mode.ocb.L[2], bige_const); + l0 = VEC_LOAD_BE (c->u_mode.ocb.L[0], 0, bige_const); + l1 = VEC_LOAD_BE (c->u_mode.ocb.L[1], 0, bige_const); + l2 = VEC_LOAD_BE (c->u_mode.ocb.L[2], 0, bige_const); if (encrypt) { @@ -1062,8 +1269,8 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, for (; nblocks >= 8 && data_nblocks % 8; nblocks--) { - l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), bige_const); - b = VEC_LOAD_BE (in, bige_const); + l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const); + b = VEC_LOAD_BE (in, 0, bige_const); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ iv ^= l; @@ -1074,7 +1281,7 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, AES_ENCRYPT (b, rounds); b ^= iv; - VEC_STORE_BE (out, b, bige_const); + VEC_STORE_BE (out, 0, b, bige_const); in += 1; out += 1; @@ -1082,16 +1289,25 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, for (; nblocks >= 8; nblocks -= 8) { - b0 = VEC_LOAD_BE (in + 0, bige_const); - b1 = VEC_LOAD_BE (in + 1, bige_const); - b2 = VEC_LOAD_BE (in + 2, bige_const); - b3 = VEC_LOAD_BE (in + 3, bige_const); - b4 = VEC_LOAD_BE (in + 4, bige_const); - b5 = VEC_LOAD_BE (in + 5, bige_const); - b6 = VEC_LOAD_BE (in + 6, bige_const); - b7 = VEC_LOAD_BE (in + 7, bige_const); - - l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 8), bige_const); + b0 = VEC_LOAD_BE_NOSWAP (in, 0); + b1 = VEC_LOAD_BE_NOSWAP (in, 1); + b2 = VEC_LOAD_BE_NOSWAP (in, 2); + b3 = VEC_LOAD_BE_NOSWAP (in, 3); + b4 = VEC_LOAD_BE_NOSWAP (in, 4); + b5 = VEC_LOAD_BE_NOSWAP (in, 5); + b6 = VEC_LOAD_BE_NOSWAP (in, 6); + b7 = VEC_LOAD_BE_NOSWAP (in, 7); + in += 8; + l = VEC_LOAD_BE_NOSWAP (ocb_get_l (c, data_nblocks += 8), 0); + b0 = VEC_BE_SWAP(b0, bige_const); + b1 = VEC_BE_SWAP(b1, bige_const); + b2 = VEC_BE_SWAP(b2, bige_const); + b3 = VEC_BE_SWAP(b3, bige_const); + b4 = VEC_BE_SWAP(b4, bige_const); + b5 = VEC_BE_SWAP(b5, bige_const); + b6 = VEC_BE_SWAP(b6, bige_const); + b7 = VEC_BE_SWAP(b7, bige_const); + l = VEC_BE_SWAP(l, bige_const); ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7; @@ -1117,15 +1333,15 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, iv = iv7 ^ rkey0; #define DO_ROUND(r) \ - rkey = ALIGNED_LOAD (&rk[r]); \ - b0 = vec_cipher_be (b0, rkey); \ - b1 = vec_cipher_be (b1, rkey); \ - b2 = vec_cipher_be (b2, rkey); \ - b3 = vec_cipher_be (b3, rkey); \ - b4 = vec_cipher_be (b4, rkey); \ - b5 = vec_cipher_be (b5, rkey); \ - b6 = vec_cipher_be (b6, rkey); \ - b7 = vec_cipher_be (b7, rkey); + rkey = ALIGNED_LOAD (rk, r); \ + b0 = asm_cipher_be (b0, rkey); \ + b1 = asm_cipher_be (b1, rkey); \ + b2 = asm_cipher_be (b2, rkey); \ + b3 = asm_cipher_be (b3, rkey); \ + b4 = asm_cipher_be (b4, rkey); \ + b5 = asm_cipher_be (b5, rkey); \ + b6 = asm_cipher_be (b6, 
rkey); \ + b7 = asm_cipher_be (b7, rkey); DO_ROUND(1); DO_ROUND(2); @@ -1134,7 +1350,20 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, DO_ROUND(5); DO_ROUND(6); DO_ROUND(7); + + rkeylf = asm_xor (rkeylast, rkey0); + DO_ROUND(8); + + iv0 = asm_xor (rkeylf, iv0); + iv1 = asm_xor (rkeylf, iv1); + iv2 = asm_xor (rkeylf, iv2); + iv3 = asm_xor (rkeylf, iv3); + iv4 = asm_xor (rkeylf, iv4); + iv5 = asm_xor (rkeylf, iv5); + iv6 = asm_xor (rkeylf, iv6); + iv7 = asm_xor (rkeylf, iv7); + DO_ROUND(9); if (rounds >= 12) { @@ -1149,37 +1378,42 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, #undef DO_ROUND - rkey = rkeylast ^ rkey0; - b0 = vec_cipherlast_be (b0, rkey ^ iv0); - b1 = vec_cipherlast_be (b1, rkey ^ iv1); - b2 = vec_cipherlast_be (b2, rkey ^ iv2); - b3 = vec_cipherlast_be (b3, rkey ^ iv3); - b4 = vec_cipherlast_be (b4, rkey ^ iv4); - b5 = vec_cipherlast_be (b5, rkey ^ iv5); - b6 = vec_cipherlast_be (b6, rkey ^ iv6); - b7 = vec_cipherlast_be (b7, rkey ^ iv7); - - VEC_STORE_BE (out + 0, b0, bige_const); - VEC_STORE_BE (out + 1, b1, bige_const); - VEC_STORE_BE (out + 2, b2, bige_const); - VEC_STORE_BE (out + 3, b3, bige_const); - VEC_STORE_BE (out + 4, b4, bige_const); - VEC_STORE_BE (out + 5, b5, bige_const); - VEC_STORE_BE (out + 6, b6, bige_const); - VEC_STORE_BE (out + 7, b7, bige_const); - - in += 8; + b0 = asm_cipherlast_be (b0, iv0); + b1 = asm_cipherlast_be (b1, iv1); + b2 = asm_cipherlast_be (b2, iv2); + b3 = asm_cipherlast_be (b3, iv3); + b4 = asm_cipherlast_be (b4, iv4); + b5 = asm_cipherlast_be (b5, iv5); + b6 = asm_cipherlast_be (b6, iv6); + b7 = asm_cipherlast_be (b7, iv7); + + b0 = VEC_BE_SWAP (b0, bige_const); + b1 = VEC_BE_SWAP (b1, bige_const); + b2 = VEC_BE_SWAP (b2, bige_const); + b3 = VEC_BE_SWAP (b3, bige_const); + b4 = VEC_BE_SWAP (b4, bige_const); + b5 = VEC_BE_SWAP (b5, bige_const); + b6 = VEC_BE_SWAP (b6, bige_const); + b7 = VEC_BE_SWAP (b7, bige_const); + VEC_STORE_BE_NOSWAP (out, 0, b0); + VEC_STORE_BE_NOSWAP (out, 1, b1); + VEC_STORE_BE_NOSWAP (out, 2, b2); + VEC_STORE_BE_NOSWAP (out, 3, b3); + VEC_STORE_BE_NOSWAP (out, 4, b4); + VEC_STORE_BE_NOSWAP (out, 5, b5); + VEC_STORE_BE_NOSWAP (out, 6, b6); + VEC_STORE_BE_NOSWAP (out, 7, b7); out += 8; } if (nblocks >= 4 && (data_nblocks % 4) == 0) { - b0 = VEC_LOAD_BE (in + 0, bige_const); - b1 = VEC_LOAD_BE (in + 1, bige_const); - b2 = VEC_LOAD_BE (in + 2, bige_const); - b3 = VEC_LOAD_BE (in + 3, bige_const); + b0 = VEC_LOAD_BE (in, 0, bige_const); + b1 = VEC_LOAD_BE (in, 1, bige_const); + b2 = VEC_LOAD_BE (in, 2, bige_const); + b3 = VEC_LOAD_BE (in, 3, bige_const); - l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), bige_const); + l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), 0, bige_const); ctr ^= b0 ^ b1 ^ b2 ^ b3; @@ -1197,11 +1431,11 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, iv = iv3 ^ rkey0; #define DO_ROUND(r) \ - rkey = ALIGNED_LOAD (&rk[r]); \ - b0 = vec_cipher_be (b0, rkey); \ - b1 = vec_cipher_be (b1, rkey); \ - b2 = vec_cipher_be (b2, rkey); \ - b3 = vec_cipher_be (b3, rkey); + rkey = ALIGNED_LOAD (rk, r); \ + b0 = asm_cipher_be (b0, rkey); \ + b1 = asm_cipher_be (b1, rkey); \ + b2 = asm_cipher_be (b2, rkey); \ + b3 = asm_cipher_be (b3, rkey); DO_ROUND(1); DO_ROUND(2); @@ -1226,15 +1460,15 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, #undef DO_ROUND rkey = rkeylast ^ rkey0; - b0 = vec_cipherlast_be (b0, rkey ^ iv0); - b1 = vec_cipherlast_be (b1, rkey ^ iv1); - b2 = vec_cipherlast_be (b2, rkey ^ 
iv2);
- b3 = vec_cipherlast_be (b3, rkey ^ iv3);
+ b0 = asm_cipherlast_be (b0, rkey ^ iv0);
+ b1 = asm_cipherlast_be (b1, rkey ^ iv1);
+ b2 = asm_cipherlast_be (b2, rkey ^ iv2);
+ b3 = asm_cipherlast_be (b3, rkey ^ iv3);
- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);
 in += 4;
 out += 4;
@@ -1243,8 +1477,8 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 for (; nblocks; nblocks--)
 {
- l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), bige_const);
- b = VEC_LOAD_BE (in, bige_const);
+ l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+ b = VEC_LOAD_BE (in, 0, bige_const);
 /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
 iv ^= l;
@@ -1255,7 +1489,7 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 AES_ENCRYPT (b, rounds);
 b ^= iv;
- VEC_STORE_BE (out, b, bige_const);
+ VEC_STORE_BE (out, 0, b, bige_const);
 in += 1;
 out += 1;
@@ -1275,8 +1509,8 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 for (; nblocks >= 8 && data_nblocks % 8; nblocks--)
 {
- l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), bige_const);
- b = VEC_LOAD_BE (in, bige_const);
+ l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+ b = VEC_LOAD_BE (in, 0, bige_const);
 /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
 iv ^= l;
@@ -1287,7 +1521,7 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 /* Checksum_i = Checksum_{i-1} xor P_i */
 ctr ^= b;
- VEC_STORE_BE (out, b, bige_const);
+ VEC_STORE_BE (out, 0, b, bige_const);
 in += 1;
 out += 1;
@@ -1295,16 +1529,25 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 for (; nblocks >= 8; nblocks -= 8)
 {
- b0 = VEC_LOAD_BE (in + 0, bige_const);
- b1 = VEC_LOAD_BE (in + 1, bige_const);
- b2 = VEC_LOAD_BE (in + 2, bige_const);
- b3 = VEC_LOAD_BE (in + 3, bige_const);
- b4 = VEC_LOAD_BE (in + 4, bige_const);
- b5 = VEC_LOAD_BE (in + 5, bige_const);
- b6 = VEC_LOAD_BE (in + 6, bige_const);
- b7 = VEC_LOAD_BE (in + 7, bige_const);
-
- l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 8), bige_const);
+ b0 = VEC_LOAD_BE_NOSWAP (in, 0);
+ b1 = VEC_LOAD_BE_NOSWAP (in, 1);
+ b2 = VEC_LOAD_BE_NOSWAP (in, 2);
+ b3 = VEC_LOAD_BE_NOSWAP (in, 3);
+ b4 = VEC_LOAD_BE_NOSWAP (in, 4);
+ b5 = VEC_LOAD_BE_NOSWAP (in, 5);
+ b6 = VEC_LOAD_BE_NOSWAP (in, 6);
+ b7 = VEC_LOAD_BE_NOSWAP (in, 7);
+ in += 8;
+ l = VEC_LOAD_BE_NOSWAP (ocb_get_l (c, data_nblocks += 8), 0);
+ b0 = VEC_BE_SWAP(b0, bige_const);
+ b1 = VEC_BE_SWAP(b1, bige_const);
+ b2 = VEC_BE_SWAP(b2, bige_const);
+ b3 = VEC_BE_SWAP(b3, bige_const);
+ b4 = VEC_BE_SWAP(b4, bige_const);
+ b5 = VEC_BE_SWAP(b5, bige_const);
+ b6 = VEC_BE_SWAP(b6, bige_const);
+ b7 = VEC_BE_SWAP(b7, bige_const);
+ l = VEC_BE_SWAP(l, bige_const);
 iv ^= rkey0;
@@ -1328,15 +1571,15 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 iv = iv7 ^ rkey0;
 #define DO_ROUND(r) \
- rkey = ALIGNED_LOAD (&rk[r]); \
- b0 = vec_ncipher_be (b0, rkey); \
- b1 = vec_ncipher_be (b1, rkey); \
- b2 = vec_ncipher_be (b2, rkey); \
- b3 = vec_ncipher_be (b3, rkey); \
- b4 = vec_ncipher_be (b4, rkey); \
- b5 = vec_ncipher_be (b5, rkey); \
- b6 = vec_ncipher_be (b6, rkey); \
- b7 = vec_ncipher_be (b7, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_ncipher_be (b0, rkey); \
+ b1 = asm_ncipher_be (b1, rkey); \
+ b2 = asm_ncipher_be (b2, rkey); \
+ b3 = asm_ncipher_be (b3, rkey); \
+ b4 = asm_ncipher_be (b4, rkey); \
+ b5 = asm_ncipher_be (b5, rkey); \
+ b6 = asm_ncipher_be (b6, rkey); \
+ b7 = asm_ncipher_be (b7, rkey);
 DO_ROUND(1);
 DO_ROUND(2);
@@ -1345,7 +1588,20 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 DO_ROUND(5);
 DO_ROUND(6);
 DO_ROUND(7);
+
+ rkeylf = asm_xor (rkeylast, rkey0);
+
 DO_ROUND(8);
+
+ iv0 = asm_xor (rkeylf, iv0);
+ iv1 = asm_xor (rkeylf, iv1);
+ iv2 = asm_xor (rkeylf, iv2);
+ iv3 = asm_xor (rkeylf, iv3);
+ iv4 = asm_xor (rkeylf, iv4);
+ iv5 = asm_xor (rkeylf, iv5);
+ iv6 = asm_xor (rkeylf, iv6);
+ iv7 = asm_xor (rkeylf, iv7);
+
 DO_ROUND(9);
 if (rounds >= 12)
 {
@@ -1360,39 +1616,44 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 #undef DO_ROUND
- rkey = rkeylast ^ rkey0;
- b0 = vec_ncipherlast_be (b0, rkey ^ iv0);
- b1 = vec_ncipherlast_be (b1, rkey ^ iv1);
- b2 = vec_ncipherlast_be (b2, rkey ^ iv2);
- b3 = vec_ncipherlast_be (b3, rkey ^ iv3);
- b4 = vec_ncipherlast_be (b4, rkey ^ iv4);
- b5 = vec_ncipherlast_be (b5, rkey ^ iv5);
- b6 = vec_ncipherlast_be (b6, rkey ^ iv6);
- b7 = vec_ncipherlast_be (b7, rkey ^ iv7);
-
- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
- VEC_STORE_BE (out + 4, b4, bige_const);
- VEC_STORE_BE (out + 5, b5, bige_const);
- VEC_STORE_BE (out + 6, b6, bige_const);
- VEC_STORE_BE (out + 7, b7, bige_const);
+ b0 = asm_ncipherlast_be (b0, iv0);
+ b1 = asm_ncipherlast_be (b1, iv1);
+ b2 = asm_ncipherlast_be (b2, iv2);
+ b3 = asm_ncipherlast_be (b3, iv3);
+ b4 = asm_ncipherlast_be (b4, iv4);
+ b5 = asm_ncipherlast_be (b5, iv5);
+ b6 = asm_ncipherlast_be (b6, iv6);
+ b7 = asm_ncipherlast_be (b7, iv7);
 ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;
- in += 8;
+ b0 = VEC_BE_SWAP (b0, bige_const);
+ b1 = VEC_BE_SWAP (b1, bige_const);
+ b2 = VEC_BE_SWAP (b2, bige_const);
+ b3 = VEC_BE_SWAP (b3, bige_const);
+ b4 = VEC_BE_SWAP (b4, bige_const);
+ b5 = VEC_BE_SWAP (b5, bige_const);
+ b6 = VEC_BE_SWAP (b6, bige_const);
+ b7 = VEC_BE_SWAP (b7, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 0, b0);
+ VEC_STORE_BE_NOSWAP (out, 1, b1);
+ VEC_STORE_BE_NOSWAP (out, 2, b2);
+ VEC_STORE_BE_NOSWAP (out, 3, b3);
+ VEC_STORE_BE_NOSWAP (out, 4, b4);
+ VEC_STORE_BE_NOSWAP (out, 5, b5);
+ VEC_STORE_BE_NOSWAP (out, 6, b6);
+ VEC_STORE_BE_NOSWAP (out, 7, b7);
 out += 8;
 }
 if (nblocks >= 4 && (data_nblocks % 4) == 0)
 {
- b0 = VEC_LOAD_BE (in + 0, bige_const);
- b1 = VEC_LOAD_BE (in + 1, bige_const);
- b2 = VEC_LOAD_BE (in + 2, bige_const);
- b3 = VEC_LOAD_BE (in + 3, bige_const);
+ b0 = VEC_LOAD_BE (in, 0, bige_const);
+ b1 = VEC_LOAD_BE (in, 1, bige_const);
+ b2 = VEC_LOAD_BE (in, 2, bige_const);
+ b3 = VEC_LOAD_BE (in, 3, bige_const);
- l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), bige_const);
+ l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), 0, bige_const);
 iv ^= rkey0;
@@ -1408,11 +1669,11 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 iv = iv3 ^ rkey0;
 #define DO_ROUND(r) \
- rkey = ALIGNED_LOAD (&rk[r]); \
- b0 = vec_ncipher_be (b0, rkey); \
- b1 = vec_ncipher_be (b1, rkey); \
- b2 = vec_ncipher_be (b2, rkey); \
- b3 = vec_ncipher_be (b3, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_ncipher_be (b0, rkey); \
+ b1 = asm_ncipher_be (b1, rkey); \
+ b2 = asm_ncipher_be (b2, rkey); \
+ b3 = asm_ncipher_be (b3, rkey);
 DO_ROUND(1);
 DO_ROUND(2);
@@ -1437,15 +1698,15 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 #undef DO_ROUND
 rkey = rkeylast ^ rkey0;
- b0 = vec_ncipherlast_be (b0, rkey ^ iv0);
- b1 = vec_ncipherlast_be (b1, rkey ^ iv1);
- b2 = vec_ncipherlast_be (b2, rkey ^ iv2);
- b3 = vec_ncipherlast_be (b3, rkey ^ iv3);
+ b0 = asm_ncipherlast_be (b0, rkey ^ iv0);
+ b1 = asm_ncipherlast_be (b1, rkey ^ iv1);
+ b2 = asm_ncipherlast_be (b2, rkey ^ iv2);
+ b3 = asm_ncipherlast_be (b3, rkey ^ iv3);
- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);
 ctr ^= b0 ^ b1 ^ b2 ^ b3;
@@ -1456,8 +1717,8 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 for (; nblocks; nblocks--)
 {
- l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), bige_const);
- b = VEC_LOAD_BE (in, bige_const);
+ l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+ b = VEC_LOAD_BE (in, 0, bige_const);
 /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
 iv ^= l;
@@ -1468,15 +1729,15 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 /* Checksum_i = Checksum_{i-1} xor P_i */
 ctr ^= b;
- VEC_STORE_BE (out, b, bige_const);
+ VEC_STORE_BE (out, 0, b, bige_const);
 in += 1;
 out += 1;
 }
 }
- VEC_STORE_BE (c->u_iv.iv, iv, bige_const);
- VEC_STORE_BE (c->u_ctr.ctr, ctr, bige_const);
+ VEC_STORE_BE (c->u_iv.iv, 0, iv, bige_const);
+ VEC_STORE_BE (c->u_ctr.ctr, 0, ctr, bige_const);
 c->u_mode.ocb.data_nblocks = data_nblocks;
 return 0;
@@ -1485,7 +1746,7 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
 size_t nblocks)
 {
- const block bige_const = vec_load_be_const();
+ const block bige_const = asm_load_be_const();
 RIJNDAEL_context *ctx = (void *)&c->context.c;
 const u128_t *rk = (u128_t *)&ctx->keyschenc;
 const u128_t *abuf = (const u128_t *)abuf_arg;
@@ -1498,19 +1759,19 @@ size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
 block ctr, iv;
 ROUND_KEY_VARIABLES;
- iv = VEC_LOAD_BE (c->u_mode.ocb.aad_offset, bige_const);
- ctr = VEC_LOAD_BE (c->u_mode.ocb.aad_sum, bige_const);
+ iv = VEC_LOAD_BE (c->u_mode.ocb.aad_offset, 0, bige_const);
+ ctr = VEC_LOAD_BE (c->u_mode.ocb.aad_sum, 0, bige_const);
- l0 = VEC_LOAD_BE (c->u_mode.ocb.L[0], bige_const);
- l1 = VEC_LOAD_BE (c->u_mode.ocb.L[1], bige_const);
- l2 = VEC_LOAD_BE (c->u_mode.ocb.L[2], bige_const);
+ l0 = VEC_LOAD_BE (c->u_mode.ocb.L[0], 0, bige_const);
+ l1 = VEC_LOAD_BE (c->u_mode.ocb.L[1], 0, bige_const);
+ l2 = VEC_LOAD_BE (c->u_mode.ocb.L[2], 0, bige_const);
 PRELOAD_ROUND_KEYS (rounds);
 for (; nblocks >= 8 && data_nblocks % 8; nblocks--)
 {
- l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), bige_const);
- b = VEC_LOAD_BE (abuf, bige_const);
+ l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+ b = VEC_LOAD_BE (abuf, 0, bige_const);
 /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
 iv ^= l;
@@ -1524,16 +1785,16 @@ size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
 for (; nblocks >= 8; nblocks -= 8)
 {
- b0 = VEC_LOAD_BE (abuf + 0, bige_const);
- b1 = VEC_LOAD_BE (abuf + 1, bige_const);
- b2 = VEC_LOAD_BE (abuf + 2, bige_const);
- b3 = VEC_LOAD_BE (abuf + 3, bige_const);
- b4 = VEC_LOAD_BE (abuf + 4, bige_const);
- b5 = VEC_LOAD_BE (abuf + 5, bige_const);
- b6 = VEC_LOAD_BE (abuf + 6, bige_const);
- b7 = VEC_LOAD_BE (abuf + 7, bige_const);
+ b0 = VEC_LOAD_BE (abuf, 0, bige_const);
+ b1 = VEC_LOAD_BE (abuf, 1, bige_const);
+ b2 = VEC_LOAD_BE (abuf, 2, bige_const);
+ b3 = VEC_LOAD_BE (abuf, 3, bige_const);
+ b4 = VEC_LOAD_BE (abuf, 4, bige_const);
+ b5 = VEC_LOAD_BE (abuf, 5, bige_const);
+ b6 = VEC_LOAD_BE (abuf, 6, bige_const);
+ b7 = VEC_LOAD_BE (abuf, 7, bige_const);
- l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 8), bige_const);
+ l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 8), 0, bige_const);
 frkey = rkey0;
 iv ^= frkey;
@@ -1558,15 +1819,15 @@ size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
 iv = iv7 ^ frkey;
 #define DO_ROUND(r) \
- rkey = ALIGNED_LOAD (&rk[r]); \
- b0 = vec_cipher_be (b0, rkey); \
- b1 = vec_cipher_be (b1, rkey); \
- b2 = vec_cipher_be (b2, rkey); \
- b3 = vec_cipher_be (b3, rkey); \
- b4 = vec_cipher_be (b4, rkey); \
- b5 = vec_cipher_be (b5, rkey); \
- b6 = vec_cipher_be (b6, rkey); \
- b7 = vec_cipher_be (b7, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey); \
+ b4 = asm_cipher_be (b4, rkey); \
+ b5 = asm_cipher_be (b5, rkey); \
+ b6 = asm_cipher_be (b6, rkey); \
+ b7 = asm_cipher_be (b7, rkey);
 DO_ROUND(1);
 DO_ROUND(2);
@@ -1591,14 +1852,14 @@ size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
 #undef DO_ROUND
 rkey = rkeylast;
- b0 = vec_cipherlast_be (b0, rkey);
- b1 = vec_cipherlast_be (b1, rkey);
- b2 = vec_cipherlast_be (b2, rkey);
- b3 = vec_cipherlast_be (b3, rkey);
- b4 = vec_cipherlast_be (b4, rkey);
- b5 = vec_cipherlast_be (b5, rkey);
- b6 = vec_cipherlast_be (b6, rkey);
- b7 = vec_cipherlast_be (b7, rkey);
+ b0 = asm_cipherlast_be (b0, rkey);
+ b1 = asm_cipherlast_be (b1, rkey);
+ b2 = asm_cipherlast_be (b2, rkey);
+ b3 = asm_cipherlast_be (b3, rkey);
+ b4 = asm_cipherlast_be (b4, rkey);
+ b5 = asm_cipherlast_be (b5, rkey);
+ b6 = asm_cipherlast_be (b6, rkey);
+ b7 = asm_cipherlast_be (b7, rkey);
 ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;
@@ -1607,12 +1868,12 @@ size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
 if (nblocks >= 4 && (data_nblocks % 4) == 0)
 {
- b0 = VEC_LOAD_BE (abuf + 0, bige_const);
- b1 = VEC_LOAD_BE (abuf + 1, bige_const);
- b2 = VEC_LOAD_BE (abuf + 2, bige_const);
- b3 = VEC_LOAD_BE (abuf + 3, bige_const);
+ b0 = VEC_LOAD_BE (abuf, 0, bige_const);
+ b1 = VEC_LOAD_BE (abuf, 1, bige_const);
+ b2 = VEC_LOAD_BE (abuf, 2, bige_const);
+ b3 = VEC_LOAD_BE (abuf, 3, bige_const);
- l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), bige_const);
+ l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), 0, bige_const);
 frkey = rkey0;
 iv ^= frkey;
@@ -1629,11 +1890,11 @@ size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
 iv = iv3 ^ frkey;
 #define DO_ROUND(r) \
- rkey = ALIGNED_LOAD (&rk[r]); \
- b0 = vec_cipher_be (b0, rkey); \
- b1 = vec_cipher_be (b1, rkey); \
- b2 = vec_cipher_be (b2, rkey); \
- b3 = vec_cipher_be (b3, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey);
 DO_ROUND(1);
 DO_ROUND(2);
@@ -1658,10 +1919,10 @@ size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
 #undef DO_ROUND
 rkey = rkeylast;
- b0 = vec_cipherlast_be (b0, rkey);
- b1 = vec_cipherlast_be (b1, rkey);
- b2 = vec_cipherlast_be (b2, rkey);
- b3 = vec_cipherlast_be (b3, rkey);
+ b0 = asm_cipherlast_be (b0, rkey);
+ b1 = asm_cipherlast_be (b1, rkey);
+ b2 = asm_cipherlast_be (b2, rkey);
+ b3 = asm_cipherlast_be (b3, rkey);
 ctr ^= b0 ^ b1 ^ b2 ^ b3;
@@ -1671,8 +1932,8 @@ size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
 for (; nblocks; nblocks--)
 {
- l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), bige_const);
- b = VEC_LOAD_BE (abuf, bige_const);
+ l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+ b = VEC_LOAD_BE (abuf, 0, bige_const);
 /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
 iv ^= l;
@@ -1684,8 +1945,8 @@ size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
 abuf += 1;
 }
- VEC_STORE_BE (c->u_mode.ocb.aad_offset, iv, bige_const);
- VEC_STORE_BE (c->u_mode.ocb.aad_sum, ctr, bige_const);
+ VEC_STORE_BE (c->u_mode.ocb.aad_offset, 0, iv, bige_const);
+ VEC_STORE_BE (c->u_mode.ocb.aad_sum, 0, ctr, bige_const);
 c->u_mode.ocb.aad_nblocks = data_nblocks;
 return 0;
@@ -1696,44 +1957,59 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,
 void *outbuf_arg, const void *inbuf_arg,
 size_t nblocks, int encrypt)
 {
+#ifdef WORDS_BIGENDIAN
 static const block vec_bswap64_const =
- { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+ { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 };
 static const block vec_bswap128_const =
 { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+#else
+ static const block vec_bswap64_const =
+ { ~8, ~9, ~10, ~11, ~12, ~13, ~14, ~15, ~0, ~1, ~2, ~3, ~4, ~5, ~6, ~7 };
+ static const block vec_bswap128_const =
+ { ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8, ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0 };
+ static const block vec_tweakin_swap_const =
+ { ~12, ~13, ~14, ~15, ~8, ~9, ~10, ~11, ~4, ~5, ~6, ~7, ~0, ~1, ~2, ~3 };
+#endif
 static const unsigned char vec_tweak_const[16] =
 { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0x87 };
 static const vector unsigned long long vec_shift63_const =
 { 63, 63 };
 static const vector unsigned long long vec_shift1_const =
 { 1, 1 };
- const block bige_const = vec_load_be_const();
+ const block bige_const = asm_load_be_const();
 RIJNDAEL_context *ctx = context;
 const u128_t *in = (const u128_t *)inbuf_arg;
 u128_t *out = (u128_t *)outbuf_arg;
 int rounds = ctx->rounds;
- block tweak_tmp, tweak_next, tweak;
- block b0, b1, b2, b3, b4, b5, b6, b7, b, rkey;
+ block tweak;
+ block b0, b1, b2, b3, b4, b5, b6, b7, b, rkey, rkeylf;
 block tweak0, tweak1, tweak2, tweak3, tweak4, tweak5, tweak6, tweak7;
 block tweak_const, bswap64_const, bswap128_const;
 vector unsigned long long shift63_const, shift1_const;
 ROUND_KEY_VARIABLES;
- tweak_const = VEC_LOAD_BE (&vec_tweak_const, bige_const);
- bswap64_const = ALIGNED_LOAD (&vec_bswap64_const);
- bswap128_const = ALIGNED_LOAD (&vec_bswap128_const);
- shift63_const = (vector unsigned long long)ALIGNED_LOAD (&vec_shift63_const);
- shift1_const = (vector unsigned long long)ALIGNED_LOAD (&vec_shift1_const);
+ tweak_const = VEC_LOAD_BE (&vec_tweak_const, 0, bige_const);
+ bswap64_const = ALIGNED_LOAD (&vec_bswap64_const, 0);
+ bswap128_const = ALIGNED_LOAD (&vec_bswap128_const, 0);
+ shift63_const = (vector unsigned long long)ALIGNED_LOAD (&vec_shift63_const, 0);
+ shift1_const = (vector unsigned long long)ALIGNED_LOAD (&vec_shift1_const, 0);
- tweak_next = VEC_LOAD_BE (tweak_arg, bige_const);
+#ifdef WORDS_BIGENDIAN
+ tweak = VEC_LOAD_BE (tweak_arg, 0, bige_const);
+ tweak = asm_vperm1 (tweak, bswap128_const);
+#else
+ tweak = VEC_LOAD_BE (tweak_arg, 0, vec_tweakin_swap_const);
+#endif
-#define GEN_TWEAK(tweak, tmp) /* Generate next tweak. */ \
- tmp = vec_vperm(tweak, tweak, bswap64_const); \
- tweak = vec_vperm(tweak, tweak, bswap128_const); \
- tmp = (block)(vec_sra((vector unsigned long long)tmp, shift63_const)) & \
- tweak_const; \
- tweak = (block)vec_sl((vector unsigned long long)tweak, shift1_const); \
- tweak = tweak ^ tmp; \
- tweak = vec_vperm(tweak, tweak, bswap128_const);
+#define GEN_TWEAK(tout, tin) /* Generate next tweak. */ \
+ do { \
+ block tmp1, tmp2; \
+ tmp1 = asm_vperm1((tin), bswap64_const); \
+ tmp2 = (block)vec_sl((vector unsigned long long)(tin), shift1_const); \
+ tmp1 = (block)(vec_sra((vector unsigned long long)tmp1, shift63_const)) & \
+ tweak_const; \
+ tout = asm_xor(tmp1, tmp2); \
+ } while (0)
 if (encrypt)
 {
@@ -1743,42 +2019,70 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,
 for (; nblocks >= 8; nblocks -= 8)
 {
- tweak0 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak1 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak2 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak3 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak4 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak5 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak6 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak7 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
-
- b0 = VEC_LOAD_BE (in + 0, bige_const) ^ tweak0 ^ rkey0;
- b1 = VEC_LOAD_BE (in + 1, bige_const) ^ tweak1 ^ rkey0;
- b2 = VEC_LOAD_BE (in + 2, bige_const) ^ tweak2 ^ rkey0;
- b3 = VEC_LOAD_BE (in + 3, bige_const) ^ tweak3 ^ rkey0;
- b4 = VEC_LOAD_BE (in + 4, bige_const) ^ tweak4 ^ rkey0;
- b5 = VEC_LOAD_BE (in + 5, bige_const) ^ tweak5 ^ rkey0;
- b6 = VEC_LOAD_BE (in + 6, bige_const) ^ tweak6 ^ rkey0;
- b7 = VEC_LOAD_BE (in + 7, bige_const) ^ tweak7 ^ rkey0;
+ b0 = VEC_LOAD_BE_NOSWAP (in, 0);
+ b1 = VEC_LOAD_BE_NOSWAP (in, 1);
+ b2 = VEC_LOAD_BE_NOSWAP (in, 2);
+ b3 = VEC_LOAD_BE_NOSWAP (in, 3);
+ tweak0 = tweak;
+ GEN_TWEAK (tweak1, tweak0);
+ tweak0 = asm_vperm1 (tweak0, bswap128_const);
+ b4 = VEC_LOAD_BE_NOSWAP (in, 4);
+ b5 = VEC_LOAD_BE_NOSWAP (in, 5);
+ GEN_TWEAK (tweak2, tweak1);
+ tweak1 = asm_vperm1 (tweak1, bswap128_const);
+ b6 = VEC_LOAD_BE_NOSWAP (in, 6);
+ b7 = VEC_LOAD_BE_NOSWAP (in, 7);
+ in += 8;
+
+ b0 = VEC_BE_SWAP(b0, bige_const);
+ b1 = VEC_BE_SWAP(b1, bige_const);
+ GEN_TWEAK (tweak3, tweak2);
+ tweak2 = asm_vperm1 (tweak2, bswap128_const);
+ GEN_TWEAK (tweak4, tweak3);
+ tweak3 = asm_vperm1 (tweak3, bswap128_const);
+ b2 = VEC_BE_SWAP(b2, bige_const);
+ b3 = VEC_BE_SWAP(b3, bige_const);
+ GEN_TWEAK (tweak5, tweak4);
+ tweak4 = asm_vperm1 (tweak4, bswap128_const);
+ GEN_TWEAK (tweak6, tweak5);
+ tweak5 = asm_vperm1 (tweak5, bswap128_const);
+ b4 = VEC_BE_SWAP(b4, bige_const);
+ b5 = VEC_BE_SWAP(b5, bige_const);
+ GEN_TWEAK (tweak7, tweak6);
+ tweak6 = asm_vperm1 (tweak6, bswap128_const);
+ GEN_TWEAK (tweak, tweak7);
+ tweak7 = asm_vperm1 (tweak7, bswap128_const);
+ b6 = VEC_BE_SWAP(b6, bige_const);
+ b7 = VEC_BE_SWAP(b7, bige_const);
+
+ tweak0 = asm_xor (tweak0, rkey0);
+ tweak1 = asm_xor (tweak1, rkey0);
+ tweak2 = asm_xor (tweak2, rkey0);
+ tweak3 = asm_xor (tweak3, rkey0);
+ tweak4 = asm_xor (tweak4, rkey0);
+ tweak5 = asm_xor (tweak5, rkey0);
+ tweak6 = asm_xor (tweak6, rkey0);
+ tweak7 = asm_xor (tweak7, rkey0);
+
+ b0 = asm_xor (b0, tweak0);
+ b1 = asm_xor (b1, tweak1);
+ b2 = asm_xor (b2, tweak2);
+ b3 = asm_xor (b3, tweak3);
+ b4 = asm_xor (b4, tweak4);
+ b5 = asm_xor (b5, tweak5);
+ b6 = asm_xor (b6, tweak6);
+ b7 = asm_xor (b7, tweak7);
 #define DO_ROUND(r) \
- rkey = ALIGNED_LOAD (&rk[r]); \
- b0 = vec_cipher_be (b0, rkey); \
- b1 = vec_cipher_be (b1, rkey); \
- b2 = vec_cipher_be (b2, rkey); \
- b3 = vec_cipher_be (b3, rkey); \
- b4 = vec_cipher_be (b4, rkey); \
- b5 = vec_cipher_be (b5, rkey); \
- b6 = vec_cipher_be (b6, rkey); \
- b7 = vec_cipher_be (b7, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey); \
+ b4 = asm_cipher_be (b4, rkey); \
+ b5 = asm_cipher_be (b5, rkey); \
+ b6 = asm_cipher_be (b6, rkey); \
+ b7 = asm_cipher_be (b7, rkey);
 DO_ROUND(1);
 DO_ROUND(2);
@@ -1787,7 +2091,20 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,
 DO_ROUND(5);
 DO_ROUND(6);
 DO_ROUND(7);
+
+ rkeylf = asm_xor (rkeylast, rkey0);
+
 DO_ROUND(8);
+
+ tweak0 = asm_xor (tweak0, rkeylf);
+ tweak1 = asm_xor (tweak1, rkeylf);
+ tweak2 = asm_xor (tweak2, rkeylf);
+ tweak3 = asm_xor (tweak3, rkeylf);
+ tweak4 = asm_xor (tweak4, rkeylf);
+ tweak5 = asm_xor (tweak5, rkeylf);
+ tweak6 = asm_xor (tweak6, rkeylf);
+ tweak7 = asm_xor (tweak7, rkeylf);
+
 DO_ROUND(9);
 if (rounds >= 12)
 {
@@ -1802,51 +2119,62 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,
 #undef DO_ROUND
- rkey = rkeylast;
- b0 = vec_cipherlast_be (b0, rkey ^ tweak0);
- b1 = vec_cipherlast_be (b1, rkey ^ tweak1);
- b2 = vec_cipherlast_be (b2, rkey ^ tweak2);
- b3 = vec_cipherlast_be (b3, rkey ^ tweak3);
- b4 = vec_cipherlast_be (b4, rkey ^ tweak4);
- b5 = vec_cipherlast_be (b5, rkey ^ tweak5);
- b6 = vec_cipherlast_be (b6, rkey ^ tweak6);
- b7 = vec_cipherlast_be (b7, rkey ^ tweak7);
-
- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
- VEC_STORE_BE (out + 4, b4, bige_const);
- VEC_STORE_BE (out + 5, b5, bige_const);
- VEC_STORE_BE (out + 6, b6, bige_const);
- VEC_STORE_BE (out + 7, b7, bige_const);
-
- in += 8;
+ b0 = asm_cipherlast_be (b0, tweak0);
+ b1 = asm_cipherlast_be (b1, tweak1);
+ b2 = asm_cipherlast_be (b2, tweak2);
+ b3 = asm_cipherlast_be (b3, tweak3);
+ b0 = VEC_BE_SWAP (b0, bige_const);
+ b1 = VEC_BE_SWAP (b1, bige_const);
+ b4 = asm_cipherlast_be (b4, tweak4);
+ b5 = asm_cipherlast_be (b5, tweak5);
+ b2 = VEC_BE_SWAP (b2, bige_const);
+ b3 = VEC_BE_SWAP (b3, bige_const);
+ b6 = asm_cipherlast_be (b6, tweak6);
+ b7 = asm_cipherlast_be (b7, tweak7);
+ VEC_STORE_BE_NOSWAP (out, 0, b0);
+ VEC_STORE_BE_NOSWAP (out, 1, b1);
+ b4 = VEC_BE_SWAP (b4, bige_const);
+ b5 = VEC_BE_SWAP (b5, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 2, b2);
+ VEC_STORE_BE_NOSWAP (out, 3, b3);
+ b6 = VEC_BE_SWAP (b6, bige_const);
+ b7 = VEC_BE_SWAP (b7, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 4, b4);
+ VEC_STORE_BE_NOSWAP (out, 5, b5);
+ VEC_STORE_BE_NOSWAP (out, 6, b6);
+ VEC_STORE_BE_NOSWAP (out, 7, b7);
 out += 8;
 }
 if (nblocks >= 4)
 {
- tweak0 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak1 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak2 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak3 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
-
- b0 = VEC_LOAD_BE (in + 0, bige_const) ^ tweak0 ^ rkey0;
- b1 = VEC_LOAD_BE (in + 1, bige_const) ^ tweak1 ^ rkey0;
- b2 = VEC_LOAD_BE (in + 2, bige_const) ^ tweak2 ^ rkey0;
- b3 = VEC_LOAD_BE (in + 3, bige_const) ^ tweak3 ^ rkey0;
+ tweak0 = tweak;
+ GEN_TWEAK (tweak1, tweak0);
+ GEN_TWEAK (tweak2, tweak1);
+ GEN_TWEAK (tweak3, tweak2);
+ GEN_TWEAK (tweak, tweak3);
+
+ b0 = VEC_LOAD_BE (in, 0, bige_const);
+ b1 = VEC_LOAD_BE (in, 1, bige_const);
+ b2 = VEC_LOAD_BE (in, 2, bige_const);
+ b3 = VEC_LOAD_BE (in, 3, bige_const);
+
+ tweak0 = asm_vperm1 (tweak0, bswap128_const);
+ tweak1 = asm_vperm1 (tweak1, bswap128_const);
+ tweak2 = asm_vperm1 (tweak2, bswap128_const);
+ tweak3 = asm_vperm1 (tweak3, bswap128_const);
+
+ b0 ^= tweak0 ^ rkey0;
+ b1 ^= tweak1 ^ rkey0;
+ b2 ^= tweak2 ^ rkey0;
+ b3 ^= tweak3 ^ rkey0;
 #define DO_ROUND(r) \
- rkey = ALIGNED_LOAD (&rk[r]); \
- b0 = vec_cipher_be (b0, rkey); \
- b1 = vec_cipher_be (b1, rkey); \
- b2 = vec_cipher_be (b2, rkey); \
- b3 = vec_cipher_be (b3, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey);
 DO_ROUND(1);
 DO_ROUND(2);
@@ -1871,15 +2199,15 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,
 #undef DO_ROUND
 rkey = rkeylast;
- b0 = vec_cipherlast_be (b0, rkey ^ tweak0);
- b1 = vec_cipherlast_be (b1, rkey ^ tweak1);
- b2 = vec_cipherlast_be (b2, rkey ^ tweak2);
- b3 = vec_cipherlast_be (b3, rkey ^ tweak3);
+ b0 = asm_cipherlast_be (b0, rkey ^ tweak0);
+ b1 = asm_cipherlast_be (b1, rkey ^ tweak1);
+ b2 = asm_cipherlast_be (b2, rkey ^ tweak2);
+ b3 = asm_cipherlast_be (b3, rkey ^ tweak3);
- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);
 in += 4;
 out += 4;
@@ -1888,18 +2216,18 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,
 for (; nblocks; nblocks--)
 {
- tweak = tweak_next;
+ tweak0 = asm_vperm1 (tweak, bswap128_const);
 /* Xor-Encrypt/Decrypt-Xor block. */
- b = VEC_LOAD_BE (in, bige_const) ^ tweak;
+ b = VEC_LOAD_BE (in, 0, bige_const) ^ tweak0;
 /* Generate next tweak. */
- GEN_TWEAK (tweak_next, tweak_tmp);
+ GEN_TWEAK (tweak, tweak);
 AES_ENCRYPT (b, rounds);
- b ^= tweak;
- VEC_STORE_BE (out, b, bige_const);
+ b ^= tweak0;
+ VEC_STORE_BE (out, 0, b, bige_const);
 in++;
 out++;
@@ -1919,42 +2247,70 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,
 for (; nblocks >= 8; nblocks -= 8)
 {
- tweak0 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak1 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak2 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak3 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak4 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak5 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak6 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak7 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
-
- b0 = VEC_LOAD_BE (in + 0, bige_const) ^ tweak0 ^ rkey0;
- b1 = VEC_LOAD_BE (in + 1, bige_const) ^ tweak1 ^ rkey0;
- b2 = VEC_LOAD_BE (in + 2, bige_const) ^ tweak2 ^ rkey0;
- b3 = VEC_LOAD_BE (in + 3, bige_const) ^ tweak3 ^ rkey0;
- b4 = VEC_LOAD_BE (in + 4, bige_const) ^ tweak4 ^ rkey0;
- b5 = VEC_LOAD_BE (in + 5, bige_const) ^ tweak5 ^ rkey0;
- b6 = VEC_LOAD_BE (in + 6, bige_const) ^ tweak6 ^ rkey0;
- b7 = VEC_LOAD_BE (in + 7, bige_const) ^ tweak7 ^ rkey0;
+ b0 = VEC_LOAD_BE_NOSWAP (in, 0);
+ b1 = VEC_LOAD_BE_NOSWAP (in, 1);
+ b2 = VEC_LOAD_BE_NOSWAP (in, 2);
+ b3 = VEC_LOAD_BE_NOSWAP (in, 3);
+ tweak0 = tweak;
+ GEN_TWEAK (tweak1, tweak0);
+ tweak0 = asm_vperm1 (tweak0, bswap128_const);
+ b4 = VEC_LOAD_BE_NOSWAP (in, 4);
+ b5 = VEC_LOAD_BE_NOSWAP (in, 5);
+ GEN_TWEAK (tweak2, tweak1);
+ tweak1 = asm_vperm1 (tweak1, bswap128_const);
+ b6 = VEC_LOAD_BE_NOSWAP (in, 6);
+ b7 = VEC_LOAD_BE_NOSWAP (in, 7);
+ in += 8;
+
+ b0 = VEC_BE_SWAP(b0, bige_const);
+ b1 = VEC_BE_SWAP(b1, bige_const);
+ GEN_TWEAK (tweak3, tweak2);
+ tweak2 = asm_vperm1 (tweak2, bswap128_const);
+ GEN_TWEAK (tweak4, tweak3);
+ tweak3 = asm_vperm1 (tweak3, bswap128_const);
+ b2 = VEC_BE_SWAP(b2, bige_const);
+ b3 = VEC_BE_SWAP(b3, bige_const);
+ GEN_TWEAK (tweak5, tweak4);
+ tweak4 = asm_vperm1 (tweak4, bswap128_const);
+ GEN_TWEAK (tweak6, tweak5);
+ tweak5 = asm_vperm1 (tweak5, bswap128_const);
+ b4 = VEC_BE_SWAP(b4, bige_const);
+ b5 = VEC_BE_SWAP(b5, bige_const);
+ GEN_TWEAK (tweak7, tweak6);
+ tweak6 = asm_vperm1 (tweak6, bswap128_const);
+ GEN_TWEAK (tweak, tweak7);
+ tweak7 = asm_vperm1 (tweak7, bswap128_const);
+ b6 = VEC_BE_SWAP(b6, bige_const);
+ b7 = VEC_BE_SWAP(b7, bige_const);
+
+ tweak0 = asm_xor (tweak0, rkey0);
+ tweak1 = asm_xor (tweak1, rkey0);
+ tweak2 = asm_xor (tweak2, rkey0);
+ tweak3 = asm_xor (tweak3, rkey0);
+ tweak4 = asm_xor (tweak4, rkey0);
+ tweak5 = asm_xor (tweak5, rkey0);
+ tweak6 = asm_xor (tweak6, rkey0);
+ tweak7 = asm_xor (tweak7, rkey0);
+
+ b0 = asm_xor (b0, tweak0);
+ b1 = asm_xor (b1, tweak1);
+ b2 = asm_xor (b2, tweak2);
+ b3 = asm_xor (b3, tweak3);
+ b4 = asm_xor (b4, tweak4);
+ b5 = asm_xor (b5, tweak5);
+ b6 = asm_xor (b6, tweak6);
+ b7 = asm_xor (b7, tweak7);
 #define DO_ROUND(r) \
- rkey = ALIGNED_LOAD (&rk[r]); \
- b0 = vec_ncipher_be (b0, rkey); \
- b1 = vec_ncipher_be (b1, rkey); \
- b2 = vec_ncipher_be (b2, rkey); \
- b3 = vec_ncipher_be (b3, rkey); \
- b4 = vec_ncipher_be (b4, rkey); \
- b5 = vec_ncipher_be (b5, rkey); \
- b6 = vec_ncipher_be (b6, rkey); \
- b7 = vec_ncipher_be (b7, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_ncipher_be (b0, rkey); \
+ b1 = asm_ncipher_be (b1, rkey); \
+ b2 = asm_ncipher_be (b2, rkey); \
+ b3 = asm_ncipher_be (b3, rkey); \
+ b4 = asm_ncipher_be (b4, rkey); \
+ b5 = asm_ncipher_be (b5, rkey); \
+ b6 = asm_ncipher_be (b6, rkey); \
+ b7 = asm_ncipher_be (b7, rkey);
 DO_ROUND(1);
 DO_ROUND(2);
@@ -1963,7 +2319,20 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,
 DO_ROUND(5);
 DO_ROUND(6);
 DO_ROUND(7);
+
+ rkeylf = asm_xor (rkeylast, rkey0);
+
 DO_ROUND(8);
+
+ tweak0 = asm_xor (tweak0, rkeylf);
+ tweak1 = asm_xor (tweak1, rkeylf);
+ tweak2 = asm_xor (tweak2, rkeylf);
+ tweak3 = asm_xor (tweak3, rkeylf);
+ tweak4 = asm_xor (tweak4, rkeylf);
+ tweak5 = asm_xor (tweak5, rkeylf);
+ tweak6 = asm_xor (tweak6, rkeylf);
+ tweak7 = asm_xor (tweak7, rkeylf);
+
 DO_ROUND(9);
 if (rounds >= 12)
 {
@@ -1978,51 +2347,62 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,
 #undef DO_ROUND
- rkey = rkeylast;
- b0 = vec_ncipherlast_be (b0, rkey ^ tweak0);
- b1 = vec_ncipherlast_be (b1, rkey ^ tweak1);
- b2 = vec_ncipherlast_be (b2, rkey ^ tweak2);
- b3 = vec_ncipherlast_be (b3, rkey ^ tweak3);
- b4 = vec_ncipherlast_be (b4, rkey ^ tweak4);
- b5 = vec_ncipherlast_be (b5, rkey ^ tweak5);
- b6 = vec_ncipherlast_be (b6, rkey ^ tweak6);
- b7 = vec_ncipherlast_be (b7, rkey ^ tweak7);
-
- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
- VEC_STORE_BE (out + 4, b4, bige_const);
- VEC_STORE_BE (out + 5, b5, bige_const);
- VEC_STORE_BE (out + 6, b6, bige_const);
- VEC_STORE_BE (out + 7, b7, bige_const);
-
- in += 8;
+ b0 = asm_ncipherlast_be (b0, tweak0);
+ b1 = asm_ncipherlast_be (b1, tweak1);
+ b2 = asm_ncipherlast_be (b2, tweak2);
+ b3 = asm_ncipherlast_be (b3, tweak3);
+ b0 = VEC_BE_SWAP (b0, bige_const);
+ b1 = VEC_BE_SWAP (b1, bige_const);
+ b4 = asm_ncipherlast_be (b4, tweak4);
+ b5 = asm_ncipherlast_be (b5, tweak5);
+ b2 = VEC_BE_SWAP (b2, bige_const);
+ b3 = VEC_BE_SWAP (b3, bige_const);
+ b6 = asm_ncipherlast_be (b6, tweak6);
+ b7 = asm_ncipherlast_be (b7, tweak7);
+ VEC_STORE_BE_NOSWAP (out, 0, b0);
+ VEC_STORE_BE_NOSWAP (out, 1, b1);
+ b4 = VEC_BE_SWAP (b4, bige_const);
+ b5 = VEC_BE_SWAP (b5, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 2, b2);
+ VEC_STORE_BE_NOSWAP (out, 3, b3);
+ b6 = VEC_BE_SWAP (b6, bige_const);
+ b7 = VEC_BE_SWAP (b7, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 4, b4);
+ VEC_STORE_BE_NOSWAP (out, 5, b5);
+ VEC_STORE_BE_NOSWAP (out, 6, b6);
+ VEC_STORE_BE_NOSWAP (out, 7, b7);
 out += 8;
 }
 if (nblocks >= 4)
 {
- tweak0 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak1 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak2 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak3 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
-
- b0 = VEC_LOAD_BE (in + 0, bige_const) ^ tweak0 ^ rkey0;
- b1 = VEC_LOAD_BE (in + 1, bige_const) ^ tweak1 ^ rkey0;
- b2 = VEC_LOAD_BE (in + 2, bige_const) ^ tweak2 ^ rkey0;
- b3 = VEC_LOAD_BE (in + 3, bige_const) ^ tweak3 ^ rkey0;
+ tweak0 = tweak;
+ GEN_TWEAK (tweak1, tweak0);
+ GEN_TWEAK (tweak2, tweak1);
+ GEN_TWEAK (tweak3, tweak2);
+ GEN_TWEAK (tweak, tweak3);
+
+ b0 = VEC_LOAD_BE (in, 0, bige_const);
+ b1 = VEC_LOAD_BE (in, 1, bige_const);
+ b2 = VEC_LOAD_BE (in, 2, bige_const);
+ b3 = VEC_LOAD_BE (in, 3, bige_const);
+
+ tweak0 = asm_vperm1 (tweak0, bswap128_const);
+ tweak1 = asm_vperm1 (tweak1, bswap128_const);
+ tweak2 = asm_vperm1 (tweak2, bswap128_const);
+ tweak3 = asm_vperm1 (tweak3, bswap128_const);
+
+ b0 ^= tweak0 ^ rkey0;
+ b1 ^= tweak1 ^ rkey0;
+ b2 ^= tweak2 ^ rkey0;
+ b3 ^= tweak3 ^ rkey0;
 #define DO_ROUND(r) \
- rkey = ALIGNED_LOAD (&rk[r]); \
- b0 = vec_ncipher_be (b0, rkey); \
- b1 = vec_ncipher_be (b1, rkey); \
- b2 = vec_ncipher_be (b2, rkey); \
- b3 = vec_ncipher_be (b3, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_ncipher_be (b0, rkey); \
+ b1 = asm_ncipher_be (b1, rkey); \
+ b2 = asm_ncipher_be (b2, rkey); \
+ b3 = asm_ncipher_be (b3, rkey);
 DO_ROUND(1);
 DO_ROUND(2);
@@ -2047,15 +2427,15 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,
 #undef DO_ROUND
 rkey = rkeylast;
- b0 = vec_ncipherlast_be (b0, rkey ^ tweak0);
- b1 = vec_ncipherlast_be (b1, rkey ^ tweak1);
- b2 = vec_ncipherlast_be (b2, rkey ^ tweak2);
- b3 = vec_ncipherlast_be (b3, rkey ^ tweak3);
+ b0 = asm_ncipherlast_be (b0, rkey ^ tweak0);
+ b1 = asm_ncipherlast_be (b1, rkey ^ tweak1);
+ b2 = asm_ncipherlast_be (b2, rkey ^ tweak2);
+ b3 = asm_ncipherlast_be (b3, rkey ^ tweak3);
- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);
 in += 4;
 out += 4;
@@ -2064,25 +2444,30 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,
 for (; nblocks; nblocks--)
 {
- tweak = tweak_next;
+ tweak0 = asm_vperm1 (tweak, bswap128_const);
 /* Xor-Encrypt/Decrypt-Xor block. */
- b = VEC_LOAD_BE (in, bige_const) ^ tweak;
+ b = VEC_LOAD_BE (in, 0, bige_const) ^ tweak0;
 /* Generate next tweak. */
- GEN_TWEAK (tweak_next, tweak_tmp);
+ GEN_TWEAK (tweak, tweak);
 AES_DECRYPT (b, rounds);
- b ^= tweak;
- VEC_STORE_BE (out, b, bige_const);
+ b ^= tweak0;
+ VEC_STORE_BE (out, 0, b, bige_const);
 in++;
 out++;
 }
 }
- VEC_STORE_BE (tweak_arg, tweak_next, bige_const);
+#ifdef WORDS_BIGENDIAN
+ tweak = asm_vperm1 (tweak, bswap128_const);
+ VEC_STORE_BE (tweak_arg, 0, tweak, bige_const);
+#else
+ VEC_STORE_BE (tweak_arg, 0, tweak, vec_tweakin_swap_const);
+#endif
 #undef GEN_TWEAK
 }
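For readers unfamiliar with XTS, the rewritten GEN_TWEAK macro above is a vectorized form of the standard XTS tweak update: multiply the 128-bit tweak by x in GF(2^128), reducing with the 0x87 constant. The vperm/shift/mask sequence only exists to do this without a cross-lane 128-bit shift. A minimal scalar sketch of the same update is shown below; it is illustrative only, not part of this patch, and the function name is invented for the example.

/* Illustrative scalar equivalent of one GEN_TWEAK step: multiply the
   XTS tweak (a little-endian 128-bit integer, as XTS defines it) by x
   in GF(2^128) with reduction polynomial x^128 + x^7 + x^2 + x + 1.  */
static void
xts_mul2 (unsigned char tweak[16])
{
  unsigned int carry = 0, i;

  /* Shift left by one bit, byte 0 first, carrying into the next byte.  */
  for (i = 0; i < 16; i++)
    {
      unsigned int b = tweak[i];

      tweak[i] = (unsigned char)((b << 1) | carry);
      carry = b >> 7;
    }

  /* If bit 127 fell off the top, fold it back in as 0x87.  */
  if (carry)
    tweak[0] ^= 0x87;
}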