diff options
author | Monty <xiphmont@xiph.org> | 2002-09-20 00:10:31 +0000 |
---|---|---|
committer | Monty <xiphmont@xiph.org> | 2002-09-20 00:10:31 +0000 |
commit | 1dad9587e30c61a8458fdda12da6248bf130c691 (patch) | |
tree | 028314f1b8071251d480ea836eb1a14e12c9f1f5 /asm_arm.h | |
parent | 55b25a7aadfb989f50a32cc7d146910de8d6f6cb (diff) | |
download | tremor-1dad9587e30c61a8458fdda12da6248bf130c691.tar.gz |
Latest improvements from Nicolas Pitre. Reviewed by Monty
From Nicolas's notes:
- Includes my previous patch with interpolation code for correct accuracy
with all block sizes.
- Interleaves sin and cos values in the lookup table to reduce register
pressure, since only one pointer is required to walk the table instead of
two. This also makes for better cache locality.
- Split the lookup table into two tables, since half of it (every other
value) is only used in a separate section of the code, and only with large
block sizes. Therefore the table size used for the common case is halved,
making for yet better cache usage.
- Abstracted all cross products throughout the code so they can be easily
optimized. First, this prevents redundant register reloads on ARM due to
the implicit memory access ordering; second, it allows hooking in some
inline assembly to perform the actual operation.
- Fix layout of current assembly in asm_arm.h to match GCC's output (more
enjoyable to read when inspecting the final assembly) plus some
constraint correctness issues.
- Added a memory barrier macro to force the compiler not to cache values
into registers or on the stack in some cases.
- Reordered some code for better ARM assembly generation by
the compiler.
git-svn-id: https://svn.xiph.org/trunk/Tremor@3923 0101bb08-14d6-0310-b084-bc0e0c8e3800
Diffstat (limited to 'asm_arm.h')
-rw-r--r-- | asm_arm.h | 85 |
1 files changed, 72 insertions, 13 deletions
@@ -21,9 +21,10 @@ static inline ogg_int32_t MULT32(ogg_int32_t x, ogg_int32_t y) { int lo,hi; - asm volatile("smull %0,%1,%2,%3;\n" + asm volatile("smull\t%0, %1, %2, %3" : "=&r"(lo),"=&r"(hi) - : "%r"(x),"r"(y)); + : "%r"(x),"r"(y) + : "cc"); return(hi); } @@ -37,26 +38,84 @@ static inline ogg_int32_t MULT30(ogg_int32_t x, ogg_int32_t y) { static inline ogg_int32_t MULT31_SHIFT15(ogg_int32_t x, ogg_int32_t y) { int lo,hi; - asm volatile("smull %0,%1,%2,%3;\n" - "mov %0,%0, lsr #15;\n" - "orr %1,%0,%1, lsl #17;\n" + asm volatile("smull %0, %1, %2, %3\n\t" + "movs %0, %0, lsr #15\n\t" + "adc %1, %0, %1, lsl #17\n\t" : "=&r"(lo),"=&r"(hi) - : "%r"(x),"r"(y)); + : "%r"(x),"r"(y) + : "cc"); return(hi); } static inline ogg_int32_t CLIP_TO_15(ogg_int32_t x) { - asm volatile("subs r0,%0,#32768;\n" - "movpl %0,#0x7f00;\n" - "orrpl %0,%0,#0xff;\n" - "adds r0,%0,#32768;\n" - "movmi %0,#0x8000;\n" - : "+r"(x) + int tmp; + asm volatile("subs %1, %0, #32768\n\t" + "movpl %0, #0x7f00\n\t" + "orrpl %0, %0, #0xff\n" + "adds %1, %0, #32768\n\t" + "movmi %0, #0x8000" + : "+r"(x),"=r"(tmp) : - : "r0","cc"); + : "cc"); return(x); } +#define MB() asm volatile ("" : : : "memory") + +static inline void XPROD32(ogg_int32_t a, ogg_int32_t b, + ogg_int32_t t, ogg_int32_t v, + ogg_int32_t *x, ogg_int32_t *y) +{ + int x1, y1, l; + asm( "smull %0, %1, %4, %6\n\t" + "smlal %0, %1, %5, %7\n\t" + "rsb %3, %4, #0\n\t" + "smull %0, %2, %5, %6\n\t" + "smlal %0, %2, %3, %7" + : "=&r" (l), "=&r" (x1), "=&r" (y1), "=r" (a) + : "3" (a), "r" (b), "r" (t), "r" (v) + : "cc" ); + *x = x1; + MB(); + *y = y1; +} + +static inline void XPROD31(ogg_int32_t a, ogg_int32_t b, + ogg_int32_t t, ogg_int32_t v, + ogg_int32_t *x, ogg_int32_t *y) +{ + int x1, y1, l; + asm( "smull %0, %1, %4, %6\n\t" + "smlal %0, %1, %5, %7\n\t" + "rsb %3, %4, #0\n\t" + "smull %0, %2, %5, %6\n\t" + "smlal %0, %2, %3, %7" + : "=&r" (l), "=&r" (x1), "=&r" (y1), "=r" (a) + : "3" (a), "r" (b), "r" (t), "r" (v) + : "cc" ); + *x = x1 << 1; + 
MB(); + *y = y1 << 1; +} + +static inline void XNPROD31(ogg_int32_t a, ogg_int32_t b, + ogg_int32_t t, ogg_int32_t v, + ogg_int32_t *x, ogg_int32_t *y) +{ + int x1, y1, l; + asm( "rsb %2, %4, #0\n\t" + "smull %0, %1, %3, %5\n\t" + "smlal %0, %1, %2, %6\n\t" + "smull %0, %2, %4, %5\n\t" + "smlal %0, %2, %3, %6" + : "=&r" (l), "=&r" (x1), "=&r" (y1) + : "r" (a), "r" (b), "r" (t), "r" (v) + : "cc" ); + *x = x1 << 1; + MB(); + *y = y1 << 1; +} + #endif #ifndef _V_LSP_MATH_ASM |