summary | refs | log | tree | commit | diff
path: root/cipher/twofish-amd64.S
diff options
context:
space:
mode:
author: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2019-04-15 22:09:24 +0300
committer: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2019-04-15 22:09:32 +0300
commit: 0903b215ef5a18332b740a24e6e2bfbed9e1d97b (patch)
tree: 4b3a4a557563405f43061fe10106ccc3af049ca6 /cipher/twofish-amd64.S
parent: 2ffc689d4757f31f1e2c4961b94b0b0c8dc302b7 (diff)
download: libgcrypt-0903b215ef5a18332b740a24e6e2bfbed9e1d97b.tar.gz
twofish-amd64: do not use xchg instruction
* cipher/twofish-amd64.S (g1g2_3): Swap ab and cd registers using 'movq' instructions instead of 'xchgq'.
--
Avoiding the xchg instruction improves three-block parallel performance by ~3% on Intel Haswell.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/twofish-amd64.S')
-rw-r--r-- cipher/twofish-amd64.S | 12
1 files changed, 9 insertions, 3 deletions
diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S
index 7a836463..134d6401 100644
--- a/cipher/twofish-amd64.S
+++ b/cipher/twofish-amd64.S
@@ -368,15 +368,21 @@ ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;
/* G1,2 && G2,2 */ \
do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
- xchgq cd ## 0, ab ## 0; \
+ movq ab ## 0, RT0; \
+ movq cd ## 0, ab ## 0; \
+ movq RT0, cd ## 0; \
\
do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
- xchgq cd ## 1, ab ## 1; \
+ movq ab ## 1, RT0; \
+ movq cd ## 1, ab ## 1; \
+ movq RT0, cd ## 1; \
\
do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
- xchgq cd ## 2, ab ## 2;
+ movq ab ## 2, RT0; \
+ movq cd ## 2, ab ## 2; \
+ movq RT0, cd ## 2;
#define enc_round_end(ab, x, y, n) \
addl y ## d, x ## d; \