path: root/asm_arm.h
author     Monty <xiphmont@xiph.org>  2002-09-20 00:10:31 +0000
committer  Monty <xiphmont@xiph.org>  2002-09-20 00:10:31 +0000
commit     1dad9587e30c61a8458fdda12da6248bf130c691 (patch)
tree       028314f1b8071251d480ea836eb1a14e12c9f1f5 /asm_arm.h
parent     55b25a7aadfb989f50a32cc7d146910de8d6f6cb (diff)
download   tremor-1dad9587e30c61a8458fdda12da6248bf130c691.tar.gz
Latest improvements from Nicolas Pitre. Reviewed by Monty
From Nicolas's notes:

- Includes my previous patch with interpolation code for correct accuracy with all block sizes.
- Interlaces sin and cos values in the lookup table to reduce register pressure, since only one pointer is required to walk the table instead of two. This also gives better cache locality (see the illustrative sketch after the diff below).
- Splits the lookup table into two tables, since half of it (one value out of every two) is only used in a separate section of the code and only with large block sizes. The table size used for the common case is therefore halved, for yet better cache usage.
- Abstracts all cross products throughout the code so they can be easily optimized (a portable sketch of these helpers follows these notes). First, this prevents redundant register reloads on ARM due to the implicit memory access ordering; second, it opens the opportunity to hook some inline assembly to perform the actual operation.
- Fixes the layout of the existing assembly in asm_arm.h to match GCC's output (more enjoyable to read when inspecting the final assembly), plus some constraint-correctness issues.
- Adds a memory barrier macro to force the compiler not to cache values in registers or on the stack in some cases.
- Reorders some code for better ARM assembly generation by the compiler.

git-svn-id: https://svn.xiph.org/trunk/Tremor@3923 0101bb08-14d6-0310-b084-bc0e0c8e3800
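[Editor's note] The cross-product abstraction described above has a straightforward portable form. The following is a minimal sketch of what the generic (non-ARM) helpers could look like, assuming a fixed-point multiply MULT31(x, y) that returns roughly ((ogg_int64_t)x * y) >> 31, as used elsewhere in Tremor; the exact rounding and names here are assumptions, not part of this diff.

/* Portable sketch of the cross products the inline assembly below implements.
   Assumes MULT31(x, y) ~= ((ogg_int64_t)x * y) >> 31. */
static inline void XPROD31(ogg_int32_t  a, ogg_int32_t  b,
                           ogg_int32_t  t, ogg_int32_t  v,
                           ogg_int32_t *x, ogg_int32_t *y)
{
  *x = MULT31(a, t) + MULT31(b, v);   /* x = a*t + b*v */
  *y = MULT31(b, t) - MULT31(a, v);   /* y = b*t - a*v */
}

static inline void XNPROD31(ogg_int32_t  a, ogg_int32_t  b,
                            ogg_int32_t  t, ogg_int32_t  v,
                            ogg_int32_t *x, ogg_int32_t *y)
{
  *x = MULT31(a, t) - MULT31(b, v);   /* x = a*t - b*v */
  *y = MULT31(b, t) + MULT31(a, v);   /* y = b*t + a*v */
}

Hooking these into inline assembly lets each 64-bit accumulation become a single smull/smlal pair, and the MB() barrier placed between the two stores in the ARM versions tells the compiler not to cache or reorder memory accesses across that point, matching the note above about forcing values out of registers.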
Diffstat (limited to 'asm_arm.h')
-rw-r--r--  asm_arm.h  85
1 file changed, 72 insertions, 13 deletions
diff --git a/asm_arm.h b/asm_arm.h
index 4b6362a..e8a718f 100644
--- a/asm_arm.h
+++ b/asm_arm.h
@@ -21,9 +21,10 @@
static inline ogg_int32_t MULT32(ogg_int32_t x, ogg_int32_t y) {
int lo,hi;
- asm volatile("smull %0,%1,%2,%3;\n"
+ asm volatile("smull\t%0, %1, %2, %3"
: "=&r"(lo),"=&r"(hi)
- : "%r"(x),"r"(y));
+ : "%r"(x),"r"(y)
+ : "cc");
return(hi);
}
@@ -37,26 +38,84 @@ static inline ogg_int32_t MULT30(ogg_int32_t x, ogg_int32_t y) {
static inline ogg_int32_t MULT31_SHIFT15(ogg_int32_t x, ogg_int32_t y) {
int lo,hi;
- asm volatile("smull %0,%1,%2,%3;\n"
- "mov %0,%0, lsr #15;\n"
- "orr %1,%0,%1, lsl #17;\n"
+ asm volatile("smull %0, %1, %2, %3\n\t"
+ "movs %0, %0, lsr #15\n\t"
+ "adc %1, %0, %1, lsl #17\n\t"
: "=&r"(lo),"=&r"(hi)
- : "%r"(x),"r"(y));
+ : "%r"(x),"r"(y)
+ : "cc");
return(hi);
}
static inline ogg_int32_t CLIP_TO_15(ogg_int32_t x) {
- asm volatile("subs r0,%0,#32768;\n"
- "movpl %0,#0x7f00;\n"
- "orrpl %0,%0,#0xff;\n"
- "adds r0,%0,#32768;\n"
- "movmi %0,#0x8000;\n"
- : "+r"(x)
+ int tmp;
+ asm volatile("subs %1, %0, #32768\n\t"
+ "movpl %0, #0x7f00\n\t"
+ "orrpl %0, %0, #0xff\n"
+ "adds %1, %0, #32768\n\t"
+ "movmi %0, #0x8000"
+ : "+r"(x),"=r"(tmp)
:
- : "r0","cc");
+ : "cc");
return(x);
}
+#define MB() asm volatile ("" : : : "memory")
+
+static inline void XPROD32(ogg_int32_t a, ogg_int32_t b,
+ ogg_int32_t t, ogg_int32_t v,
+ ogg_int32_t *x, ogg_int32_t *y)
+{
+ int x1, y1, l;
+ asm( "smull %0, %1, %4, %6\n\t"
+ "smlal %0, %1, %5, %7\n\t"
+ "rsb %3, %4, #0\n\t"
+ "smull %0, %2, %5, %6\n\t"
+ "smlal %0, %2, %3, %7"
+ : "=&r" (l), "=&r" (x1), "=&r" (y1), "=r" (a)
+ : "3" (a), "r" (b), "r" (t), "r" (v)
+ : "cc" );
+ *x = x1;
+ MB();
+ *y = y1;
+}
+
+static inline void XPROD31(ogg_int32_t a, ogg_int32_t b,
+ ogg_int32_t t, ogg_int32_t v,
+ ogg_int32_t *x, ogg_int32_t *y)
+{
+ int x1, y1, l;
+ asm( "smull %0, %1, %4, %6\n\t"
+ "smlal %0, %1, %5, %7\n\t"
+ "rsb %3, %4, #0\n\t"
+ "smull %0, %2, %5, %6\n\t"
+ "smlal %0, %2, %3, %7"
+ : "=&r" (l), "=&r" (x1), "=&r" (y1), "=r" (a)
+ : "3" (a), "r" (b), "r" (t), "r" (v)
+ : "cc" );
+ *x = x1 << 1;
+ MB();
+ *y = y1 << 1;
+}
+
+static inline void XNPROD31(ogg_int32_t a, ogg_int32_t b,
+ ogg_int32_t t, ogg_int32_t v,
+ ogg_int32_t *x, ogg_int32_t *y)
+{
+ int x1, y1, l;
+ asm( "rsb %2, %4, #0\n\t"
+ "smull %0, %1, %3, %5\n\t"
+ "smlal %0, %1, %2, %6\n\t"
+ "smull %0, %2, %4, %5\n\t"
+ "smlal %0, %2, %3, %6"
+ : "=&r" (l), "=&r" (x1), "=&r" (y1)
+ : "r" (a), "r" (b), "r" (t), "r" (v)
+ : "cc" );
+ *x = x1 << 1;
+ MB();
+ *y = y1 << 1;
+}
+
#endif
#ifndef _V_LSP_MATH_ASM
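[Editor's note] To make the interleaving note in the commit message concrete, here is a hypothetical caller sketch; the function, table, and variable names are assumptions for illustration only, not code from this commit. With cos and sin interlaced in one table, a single pointer T walks the twiddle values and each butterfly reads an adjacent pair, instead of maintaining separate cos[] and sin[] pointers.

/* Hypothetical illustration only: walking an interlaced cos/sin table with a
   single pointer.  T[0] holds cos and T[1] holds sin for the current twiddle. */
static void butterfly_pass(ogg_int32_t *x, const ogg_int32_t *T,
                           int step, int n)
{
  int i;
  for (i = 0; i < n; i += 2) {
    ogg_int32_t xr, xi;
    XPROD31(x[i], x[i + 1], T[0], T[1], &xr, &xi); /* one pointer, one pair */
    x[i]     = xr;
    x[i + 1] = xi;
    T += step;   /* single pointer advance: less register pressure,
                    one sequential memory stream for better cache locality */
  }
}

Compared with two parallel tables, this keeps one fewer live address register in the inner loop and touches a single contiguous stream of memory, which is the register-pressure and cache-locality gain the notes describe.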