diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2013-07-03 11:14:56 +0300 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2013-07-10 21:53:25 +0300 |
commit | 71dda4507053379433dc8b0fc6462c15de7299df (patch) | |
tree | e656842061f5e946778f54c20c40d7814960351d /mpi/longlong.h | |
parent | 6540b84a6e9113813e7e49e3ad2024d4a0073300 (diff) | |
download | libgcrypt-71dda4507053379433dc8b0fc6462c15de7299df.tar.gz |
Tweak ARM inline assembly for mpi
mpi/longlong.h [__arm__]: Enable inline assembly if __thumb2__ is
defined.
[__arm__]: Use __ARM_ARCH when defined.
[__arm__] [__ARM_ARCH >= 5] (count_leading_zeros): New.
--
Current ARM Linux distributions use an EABI that enables Thumb-2, and therefore
inline assembly is disabled (because of the !defined(__thumb__) selector). However,
Thumb-2 allows the use of the assembly instructions that longlong.h contains for
ARM, so this patch enables inline assembly for ARM when __thumb2__ is defined
in addition to __thumb__.
The patch also adds an optimized count_leading_zeros() macro for ARM.
Results on Cortex-A8, 1 GHz:
===
Before:
Algorithm generate 100*sign 100*verify
------------------------------------------------
RSA 1024 bit 750ms 2780ms 110ms
RSA 2048 bit 14280ms 17250ms 300ms
RSA 3072 bit 38630ms 51300ms 650ms
RSA 4096 bit 60940ms 111430ms 1000ms
jussi@cubie:~/libgcrypt$ tests/benchmark dsa
Algorithm generate 100*sign 100*verify
------------------------------------------------
DSA 1024/160 - 1410ms 1680ms
DSA 2048/224 - 6100ms 7390ms
DSA 3072/256 - 14350ms 17120ms
jussi@cubie:~/libgcrypt$ tests/benchmark ecc
Algorithm generate 100*sign 100*verify
------------------------------------------------
ECDSA 192 bit 90ms 2160ms 3940ms
ECDSA 224 bit 110ms 2810ms 5400ms
ECDSA 256 bit 150ms 3570ms 6970ms
ECDSA 384 bit 340ms 8320ms 16420ms
ECDSA 521 bit 850ms 19760ms 38480ms
After:
jussi@cubie:~/libgcrypt$ tests/benchmark rsa
Algorithm generate 100*sign 100*verify
------------------------------------------------
RSA 1024 bit 590ms 2230ms 80ms
RSA 2048 bit 2320ms 13090ms 240ms
RSA 3072 bit 60580ms 38420ms 460ms
RSA 4096 bit 115130ms 82250ms 750ms
jussi@cubie:~/libgcrypt$ tests/benchmark dsa
Algorithm generate 100*sign 100*verify
------------------------------------------------
DSA 1024/160 - 1070ms 1290ms
DSA 2048/224 - 4500ms 5550ms
DSA 3072/256 - 10280ms 12200ms
jussi@cubie:~/libgcrypt$ tests/benchmark ecc
Algorithm generate 100*sign 100*verify
------------------------------------------------
ECDSA 192 bit 70ms 1900ms 3560ms
ECDSA 224 bit 100ms 2490ms 4750ms
ECDSA 256 bit 120ms 3140ms 5920ms
ECDSA 384 bit 270ms 6990ms 13790ms
ECDSA 521 bit 680ms 17080ms 33490ms
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'mpi/longlong.h')
-rw-r--r-- | mpi/longlong.h | 20 |
1 files changed, 16 insertions, 4 deletions
diff --git a/mpi/longlong.h b/mpi/longlong.h index 5dba7931..bb34fd7b 100644 --- a/mpi/longlong.h +++ b/mpi/longlong.h @@ -184,7 +184,8 @@ extern UDItype __udiv_qrnnd (); /*************************************** ************** ARM ****************** ***************************************/ -#if defined (__arm__) && W_TYPE_SIZE == 32 && !defined (__thumb__) +#if defined (__arm__) && W_TYPE_SIZE == 32 && \ + (!defined (__thumb__) || defined (__thumb2__)) #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("adds %1, %4, %5\n" \ "adc %0, %2, %3" \ @@ -203,7 +204,9 @@ extern UDItype __udiv_qrnnd (); "rI" ((USItype)(bh)), \ "r" ((USItype)(al)), \ "rI" ((USItype)(bl))) -#if defined __ARM_ARCH_2__ || defined __ARM_ARCH_3__ +/* The __ARM_ARCH define is provided by gcc 4.8 */ +#if (defined __ARM_ARCH && __ARM_ARCH <= 3) || \ + defined __ARM_ARCH_2__ || defined __ARM_ARCH_3__ #define umul_ppmm(xh, xl, a, b) \ __asm__ ("%@ Inlined umul_ppmm\n" \ "mov %|r0, %2, lsr #16 @ AAAA\n" \ @@ -223,7 +226,7 @@ extern UDItype __udiv_qrnnd (); : "r" ((USItype)(a)), \ "r" ((USItype)(b)) \ : "r0", "r1", "r2") -#else +#else /* __ARM_ARCH >= 4 */ #define umul_ppmm(xh, xl, a, b) \ __asm__ ("%@ Inlined umul_ppmm\n" \ "umull %r1, %r0, %r2, %r3" \ @@ -232,9 +235,18 @@ extern UDItype __udiv_qrnnd (); : "r" ((USItype)(a)), \ "r" ((USItype)(b)) \ : "r0", "r1") -#endif +#endif /* __ARM_ARCH >= 4 */ #define UMUL_TIME 20 #define UDIV_TIME 100 +/* The __ARM_ARCH define is provided by gcc 4.8 */ +#if (defined __ARM_ARCH && __ARM_ARCH >= 5) || !(defined __ARM_ARCH_2__ || \ + defined __ARM_ARCH_3__ || defined __ARM_ARCH_3M__ || __ARM_ARCH_4__ || \ + __ARM_ARCH_4T__) +#define count_leading_zeros(count, x) \ + __asm__ ("clz %0, %1" \ + : "=r" ((USItype)(count)) \ + : "r" ((USItype)(x))) +#endif /* __ARM_ARCH >= 5 */ #endif /* __arm__ */ /*************************************** |