summaryrefslogtreecommitdiff
path: root/misc.h
diff options
context:
space:
mode:
authorMonty <xiphmont@xiph.org>2002-09-20 00:10:31 +0000
committerMonty <xiphmont@xiph.org>2002-09-20 00:10:31 +0000
commit1dad9587e30c61a8458fdda12da6248bf130c691 (patch)
tree028314f1b8071251d480ea836eb1a14e12c9f1f5 /misc.h
parent55b25a7aadfb989f50a32cc7d146910de8d6f6cb (diff)
downloadtremor-1dad9587e30c61a8458fdda12da6248bf130c691.tar.gz
Latest improvements from Nicolas Pitre. Reviewed by Monty
From Nicolas's notes: - Includes my previous patch with interpolation code for correct accuracy with all block sizes. - Interlaces sin and cos values in the lookup table to reduce register pressure since only one pointer is required to walk the table instead of two. This also accounts for better cache locality. - Split the lookup table into two tables since half of it (one value every two) is only used in separate section of the code and only with large block sizes. Therefore the table size used for the common case is reduced by 2 accounting for yet better cache usage. - Abstracted all cross products throughout the code so they can be easily optimized. First this prevents redundant register reloads on ARM due to the implicit memory access ordering, next this allowed for the opportunity to hook some inline assembly to perform the actual operation. - Fix layout of current assembly in asm_arm.h to match GCC's output (more enjoyable to read when inspecting the final assembly) plus some constraint correctness issues. - Added a memory barrier macro to force the compiler not to cache values into registers or on the stack in some cases. - Reordered some code for better ARM assembly generation by the compiler. git-svn-id: https://svn.xiph.org/trunk/Tremor@3923 0101bb08-14d6-0310-b084-bc0e0c8e3800
Diffstat (limited to 'misc.h')
-rw-r--r--misc.h57
1 files changed, 57 insertions, 0 deletions
diff --git a/misc.h b/misc.h
index 0b8dd28..5088d53 100644
--- a/misc.h
+++ b/misc.h
@@ -79,6 +79,63 @@ static inline ogg_int32_t CLIP_TO_15(ogg_int32_t x) {
return(ret);
}
+/*
+ * This should be used as a memory barrier, forcing all cached values in
+ * registers to be written back to memory.  Might or might not be beneficial
+ * depending on the architecture and compiler.
+ *
+ * As defined here the macro expands to nothing, so on its own it imposes
+ * no ordering at all.
+ */
+#define MB()
+
+/*
+ * The XPROD functions are meant to optimize the cross products found all
+ * over the place in mdct.c by forcing memory operation ordering to avoid
+ * unnecessary register reloads as soon as memory is being written to.
+ * However, this is only beneficial on CPUs with a sane number of general
+ * purpose registers, which excludes the Intel x86. On Intel, better let the
+ * compiler actually reload registers directly from original memory by using
+ * macros.
+ */
+
+#ifdef __i386__
+
+/*
+ * XPROD32 (macro form): stores
+ *     *(_x) = MULT32(_a,_t) + MULT32(_b,_v)
+ *     *(_y) = MULT32(_b,_t) - MULT32(_a,_v)
+ *
+ * _a, _b, _t and _v are each expanded twice, and *(_x) is written before
+ * the operands of *(_y) are read, so arguments must be free of side
+ * effects.
+ *
+ * The body is wrapped in do { } while (0) so that an invocation followed
+ * by ';' is exactly one statement and can be used in an unbraced if/else.
+ */
+#define XPROD32(_a, _b, _t, _v, _x, _y)                       \
+  do {                                                        \
+    *(_x) = MULT32((_a), (_t)) + MULT32((_b), (_v));          \
+    *(_y) = MULT32((_b), (_t)) - MULT32((_a), (_v));          \
+  } while (0)
+/*
+ * XPROD31 (macro form): same cross product as XPROD32, with each result
+ * shifted left by one bit before it is stored:
+ *     *(_x) = (MULT32(_a,_t) + MULT32(_b,_v)) << 1
+ *     *(_y) = (MULT32(_b,_t) - MULT32(_a,_v)) << 1
+ *
+ * _a, _b, _t and _v are each expanded twice, and *(_x) is written before
+ * the operands of *(_y) are read, so arguments must be free of side
+ * effects.
+ *
+ * The body is wrapped in do { } while (0) so that an invocation followed
+ * by ';' is exactly one statement and can be used in an unbraced if/else.
+ */
+#define XPROD31(_a, _b, _t, _v, _x, _y)                       \
+  do {                                                        \
+    *(_x) = (MULT32((_a), (_t)) + MULT32((_b), (_v))) << 1;   \
+    *(_y) = (MULT32((_b), (_t)) - MULT32((_a), (_v))) << 1;   \
+  } while (0)
+/*
+ * XNPROD31 (macro form): like XPROD31 but with the signs of the second
+ * products swapped:
+ *     *(_x) = (MULT32(_a,_t) - MULT32(_b,_v)) << 1
+ *     *(_y) = (MULT32(_b,_t) + MULT32(_a,_v)) << 1
+ *
+ * _a, _b, _t and _v are each expanded twice, and *(_x) is written before
+ * the operands of *(_y) are read, so arguments must be free of side
+ * effects.
+ *
+ * The body is wrapped in do { } while (0) so that an invocation followed
+ * by ';' is exactly one statement and can be used in an unbraced if/else.
+ */
+#define XNPROD31(_a, _b, _t, _v, _x, _y)                      \
+  do {                                                        \
+    *(_x) = (MULT32((_a), (_t)) - MULT32((_b), (_v))) << 1;   \
+    *(_y) = (MULT32((_b), (_t)) + MULT32((_a), (_v))) << 1;   \
+  } while (0)
+
+#else
+
+/*
+ * XPROD32 (function form): cross product of (a, b) with (t, v).
+ *
+ *   *x receives MULT32(a, t) + MULT32(b, v)
+ *   *y receives MULT32(b, t) - MULT32(a, v)
+ *
+ * The operands are taken by value, so both results are computed from the
+ * caller's original inputs even if x or y points at one of them.
+ */
+static inline void XPROD32(ogg_int32_t a, ogg_int32_t b,
+                           ogg_int32_t t, ogg_int32_t v,
+                           ogg_int32_t *x, ogg_int32_t *y)
+{
+  const ogg_int32_t sum_term  = MULT32(a, t) + MULT32(b, v);
+  const ogg_int32_t diff_term = MULT32(b, t) - MULT32(a, v);
+
+  *x = sum_term;
+  *y = diff_term;
+}
+
+/*
+ * XPROD31 (function form): same cross product as XPROD32, with each
+ * result shifted left by one bit before it is stored.
+ *
+ *   *x receives (MULT32(a, t) + MULT32(b, v)) << 1
+ *   *y receives (MULT32(b, t) - MULT32(a, v)) << 1
+ *
+ * The operands are taken by value, so both results are computed from the
+ * caller's original inputs even if x or y points at one of them.
+ */
+static inline void XPROD31(ogg_int32_t a, ogg_int32_t b,
+                           ogg_int32_t t, ogg_int32_t v,
+                           ogg_int32_t *x, ogg_int32_t *y)
+{
+  const ogg_int32_t sum_term  = (MULT32(a, t) + MULT32(b, v)) << 1;
+  const ogg_int32_t diff_term = (MULT32(b, t) - MULT32(a, v)) << 1;
+
+  *x = sum_term;
+  *y = diff_term;
+}
+
+/*
+ * XNPROD31 (function form): like XPROD31 but with the signs of the
+ * second products swapped.
+ *
+ *   *x receives (MULT32(a, t) - MULT32(b, v)) << 1
+ *   *y receives (MULT32(b, t) + MULT32(a, v)) << 1
+ *
+ * The operands are taken by value, so both results are computed from the
+ * caller's original inputs even if x or y points at one of them.
+ */
+static inline void XNPROD31(ogg_int32_t a, ogg_int32_t b,
+                            ogg_int32_t t, ogg_int32_t v,
+                            ogg_int32_t *x, ogg_int32_t *y)
+{
+  const ogg_int32_t diff_term = (MULT32(a, t) - MULT32(b, v)) << 1;
+  const ogg_int32_t sum_term  = (MULT32(b, t) + MULT32(a, v)) << 1;
+
+  *x = diff_term;
+  *y = sum_term;
+}
+
+#endif
+
#endif
static inline ogg_int32_t VFLOAT_MULT(ogg_int32_t a,ogg_int32_t ap,