diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/builtins/arm/udivmodsi4.S | 62 |
-rw-r--r-- | lib/builtins/arm/udivsi3.S | 65 |
-rw-r--r-- | lib/builtins/arm/umodsi3.S | 61 |
-rw-r--r-- | lib/builtins/assembly.h | 14 |
4 files changed, 182 insertions, 20 deletions
diff --git a/lib/builtins/arm/udivmodsi4.S b/lib/builtins/arm/udivmodsi4.S index ddc875219..b93fb0a3a 100644 --- a/lib/builtins/arm/udivmodsi4.S +++ b/lib/builtins/arm/udivmodsi4.S @@ -16,6 +16,9 @@ .syntax unified .text +#if __ARM_ARCH_ISA_THUMB == 2 + .thumb +#endif .p2align 2 DEFINE_COMPILERRT_FUNCTION(__udivmodsi4) @@ -38,11 +41,15 @@ DEFINE_COMPILERRT_FUNCTION(__udivmodsi4) * * r0 is the numerator, r1 the denominator. * + * ARM: * The code before JMP computes the correct shift I, so that * r0 and (r1 << I) have the highest bit set in the same position. * At the time of JMP, ip := .Ldiv0block - 12 * I. * This depends on the fixed instruction size of block. * + * Thumb 2: + * Uses a jumptable to jump to the appropriate block. + * * block(shift) implements the test-and-update-quotient core. * It assumes (r0 << shift) can be computed without overflow and * that (r0 << shift) < 2 * r1. The quotient is stored in r3. @@ -52,17 +59,59 @@ DEFINE_COMPILERRT_FUNCTION(__udivmodsi4) clz ip, r0 clz r3, r1 /* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. 
*/ +#if __ARM_ARCH_ISA_THUMB == 2 + sub ip, r3, ip + mov r3, #0 + tbb [pc, ip] +LOCAL_LABEL(JT): + .byte (LOCAL_LABEL( 0) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 1) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 2) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 3) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 4) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 5) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 6) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 7) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 8) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 9) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(10) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(11) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(12) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(13) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(14) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(15) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(16) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(17) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(18) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(19) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(20) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(21) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(22) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(23) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(24) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(25) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(26) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(27) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(28) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(29) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(30) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(31) - LOCAL_LABEL(JT)) / 2 +#else sub r3, r3, ip - adr ip, LOCAL_LABEL(div0block) + adr ip, LOCAL_LABEL(0) sub ip, ip, r3, lsl #2 sub ip, ip, r3, lsl #3 mov r3, #0 bx ip +#endif # else +#if __ARM_ARCH_ISA_THUMB == 2 +#error unsupported configuration +#endif str r4, [sp, #-8]! 
mov r4, r0 - adr ip, LOCAL_LABEL(div0block) + adr ip, LOCAL_LABEL(0) lsr r3, r4, #16 cmp r3, r1 @@ -96,9 +145,11 @@ DEFINE_COMPILERRT_FUNCTION(__udivmodsi4) #define IMM # -#define block(shift) \ - cmp r0, r1, lsl IMM shift; \ - addhs r3, r3, IMM (1 << shift); \ +#define block(shift) \ +LOCAL_LABEL(shift): \ + cmp r0, r1, lsl IMM shift; \ + ITT hs; \ + addhs r3, r3, IMM (1 << shift); \ subhs r0, r0, r1, lsl IMM shift block(31) @@ -132,7 +183,6 @@ DEFINE_COMPILERRT_FUNCTION(__udivmodsi4) block(3) block(2) block(1) -LOCAL_LABEL(div0block): block(0) str r0, [r2] diff --git a/lib/builtins/arm/udivsi3.S b/lib/builtins/arm/udivsi3.S index 8fb1dca0f..c184b513a 100644 --- a/lib/builtins/arm/udivsi3.S +++ b/lib/builtins/arm/udivsi3.S @@ -16,6 +16,9 @@ .syntax unified .text +#if __ARM_ARCH_ISA_THUMB == 2 + .thumb +#endif .p2align 2 DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_uidiv, __udivsi3) @@ -32,6 +35,7 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3) bcc LOCAL_LABEL(divby0) JMPc(lr, eq) cmp r0, r1 + IT cc movcc r0, #0 JMPc(lr, cc) /* @@ -39,11 +43,15 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3) * * r0 is the numerator, r1 the denominator. * + * ARM: * The code before JMP computes the correct shift I, so that * r0 and (r1 << I) have the highest bit set in the same position. * At the time of JMP, ip := .Ldiv0block - 12 * I. * This depends on the fixed instruction size of block. * + * Thumb 2: + * Uses a jumptable to jump to the appropriate block. + * * block(shift) implements the test-and-update-quotient core. * It assumes (r0 << shift) can be computed without overflow and * that (r0 << shift) < 2 * r1. The quotient is stored in r3. @@ -53,15 +61,57 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3) clz ip, r0 clz r3, r1 /* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. 
*/ +#if __ARM_ARCH_ISA_THUMB == 2 + sub ip, r3, ip + mov r3, #0 + tbb [pc, ip] +LOCAL_LABEL(JT): + .byte (LOCAL_LABEL( 0) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 1) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 2) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 3) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 4) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 5) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 6) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 7) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 8) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 9) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(10) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(11) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(12) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(13) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(14) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(15) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(16) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(17) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(18) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(19) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(20) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(21) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(22) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(23) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(24) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(25) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(26) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(27) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(28) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(29) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(30) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(31) - LOCAL_LABEL(JT)) / 2 +#else sub r3, r3, ip - adr ip, LOCAL_LABEL(div0block) + adr ip, LOCAL_LABEL(0) sub ip, ip, r3, lsl #2 sub ip, ip, r3, lsl #3 mov r3, #0 bx ip +#endif # else +#if __ARM_ARCH_ISA_THUMB == 2 +#error unsupported configuration +#endif mov r2, r0 - adr ip, LOCAL_LABEL(div0block) + adr ip, LOCAL_LABEL(0) lsr r3, r2, #16 cmp r3, r1 @@ -94,10 +144,12 @@ 
DEFINE_COMPILERRT_FUNCTION(__udivsi3) #define IMM # -#define block(shift) \ - cmp r0, r1, lsl IMM shift; \ - addhs r3, r3, IMM (1 << shift); \ - subhs r0, r0, r1, lsl IMM shift +#define block(shift) \ +LOCAL_LABEL(shift): \ + cmp r0, r1, lsl IMM shift; \ + ITT hs; \ + addhs r3, r3, IMM(1 << shift); \ + subhs r0, r0, r1, lsl IMM shift block(31) block(30) @@ -130,7 +182,6 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3) block(3) block(2) block(1) -LOCAL_LABEL(div0block): block(0) mov r0, r3 diff --git a/lib/builtins/arm/umodsi3.S b/lib/builtins/arm/umodsi3.S index 164646b1f..8a979e56c 100644 --- a/lib/builtins/arm/umodsi3.S +++ b/lib/builtins/arm/umodsi3.S @@ -16,6 +16,9 @@ .syntax unified .text +#if __ARM_ARCH_ISA_THUMB == 2 + .thumb +#endif .p2align 2 DEFINE_COMPILERRT_FUNCTION(__umodsi3) @@ -30,6 +33,7 @@ DEFINE_COMPILERRT_FUNCTION(__umodsi3) #else cmp r1, #1 bcc LOCAL_LABEL(divby0) + IT eq moveq r0, #0 JMPc(lr, eq) cmp r0, r1 @@ -39,11 +43,15 @@ DEFINE_COMPILERRT_FUNCTION(__umodsi3) * * r0 is the numerator, r1 the denominator. * + * For ARM: * The code before JMP computes the correct shift I, so that * r0 and (r1 << I) have the highest bit set in the same position. * At the time of JMP, ip := .Ldiv0block - 8 * I. * This depends on the fixed instruction size of block. * + * For Thumb: + * Uses a jumptable to jump to the appropriate block. + * * block(shift) implements the test-and-update-quotient core. * It assumes (r0 << shift) can be computed without overflow and * that (r0 << shift) < 2 * r1. The quotient is stored in r3. @@ -54,12 +62,52 @@ DEFINE_COMPILERRT_FUNCTION(__umodsi3) clz r3, r1 /* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. 
*/ sub r3, r3, ip - adr ip, LOCAL_LABEL(div0block) +#if __ARM_ARCH_ISA_THUMB == 2 + tbb [pc, r3] +LOCAL_LABEL(JT): + .byte (LOCAL_LABEL( 0) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 1) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 2) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 3) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 4) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 5) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 6) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 7) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 8) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL( 9) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(10) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(11) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(12) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(13) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(14) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(15) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(16) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(17) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(18) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(19) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(20) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(21) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(22) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(23) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(24) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(25) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(26) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(27) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(28) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(29) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(30) - LOCAL_LABEL(JT)) / 2 + .byte (LOCAL_LABEL(31) - LOCAL_LABEL(JT)) / 2 +#else + adr ip, LOCAL_LABEL(0) sub ip, ip, r3, lsl #3 bx ip +#endif # else +#if __ARM_ARCH_ISA_THUMB == 2 +#error unsupported configuration +#endif mov r2, r0 - adr ip, LOCAL_LABEL(div0block) + adr ip, LOCAL_LABEL(0) lsr r3, r2, #16 cmp r3, r1 @@ -90,9 +138,11 @@ DEFINE_COMPILERRT_FUNCTION(__umodsi3) #define IMM # -#define block(shift) \ - cmp 
r0, r1, lsl IMM shift; \ - subhs r0, r0, r1, lsl IMM shift +#define block(shift) \ +LOCAL_LABEL(shift): \ + cmp r0, r1, lsl IMM shift; \ + IT hs; \ + subhs r0, r0, r1, lsl IMM shift block(31) block(30) @@ -125,7 +175,6 @@ DEFINE_COMPILERRT_FUNCTION(__umodsi3) block(3) block(2) block(1) -LOCAL_LABEL(div0block): block(0) JMP(lr) #endif /* __ARM_ARCH_EXT_IDIV__ */ diff --git a/lib/builtins/assembly.h b/lib/builtins/assembly.h index d415a5f8d..b09fcd55a 100644 --- a/lib/builtins/assembly.h +++ b/lib/builtins/assembly.h @@ -22,6 +22,16 @@ #define SEPARATOR ; #endif +#if defined(__arm__) +#if __ARM_ARCH_ISA_THUMB == 2 +#define IT it +#define ITT itt +#else +#define IT @ +#define ITT @ +#endif +#endif + #if defined(__APPLE__) #define HIDDEN(name) .private_extern name #define LOCAL_LABEL(name) L_##name @@ -86,7 +96,9 @@ #ifdef ARM_HAS_BX #define JMP(r) bx r -#define JMPc(r, c) bx##c r +#define JMPc(r, c) \ + IT c; \ + bx##c r #else #define JMP(r) mov pc, r #define JMPc(r, c) mov##c pc, r |