summaryrefslogtreecommitdiff
path: root/libavcodec/x86/cabac.h
diff options
context:
space:
mode:
author: Roland Scheidegger <rscheidegger_lists@hispeed.ch> 2012-04-27 22:12:20 +0200
committer: Michael Niedermayer <michaelni@gmx.at> 2012-04-28 20:02:27 +0200
commit: 82c71913e46552f9b41ed3f63571b7777a999f68 (patch)
tree: a0b43714aba491eb6f4b6096b3c901d51645bb55 /libavcodec/x86/cabac.h
parent: 7f668cd2b5f13afa0de9f593948ce2f703ab5aaa (diff)
download: ffmpeg-82c71913e46552f9b41ed3f63571b7777a999f68.tar.gz
h264: new assembly version of get_cabac for x86_64 with PIC
This adds a hand-optimized assembly version of get_cabac, much like the existing one, but one that works when the table offsets are RIP-relative. Compared to the non-RIP-relative version, this adds 2 lea instructions and needs one extra register. There is a surprisingly large performance improvement over the C version (more so than the generated assembly seems to suggest) just in get_cabac: I measured roughly 40% faster for get_cabac on a K8. However, overall the difference is not that big; I measured roughly 5% on a test clip on a K8 and a Core2. Hopefully it still compiles on 32-bit x86... Now that only one table is used, there is some chance that even the Darwin assembler (as) compiles this (apparently the label arithmetic used previously doesn't work if it involves symbols defined in a different file; thanks to Ronald S. Bultje for helping me with this). Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86/cabac.h')
-rw-r--r--libavcodec/x86/cabac.h89
1 files changed, 81 insertions, 8 deletions
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index f532be3d8e..86801ff258 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -27,6 +27,69 @@
#include "libavutil/internal.h"
#include "config.h"
+#ifdef BROKEN_RELOCATIONS
+#define TABLES_ARG , "r"(tables) /* extra asm input operand appended in get_cabac_inline_x86: the tables pointer, referenced there as %11 */
+
+#if HAVE_FAST_CMOV
+#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) /* cmov/sbb variant; on entry ecx holds the value tmp had before BRANCHLESS_GET_CABAC shifted it left by 17 */ \
+ "cmp "low" , "tmp" \n\t" /* set flags from tmp - low */ \
+ "cmova %%ecx , "range" \n\t" /* tmp > low (unsigned): range = ecx */ \
+ "sbb %%rcx , %%rcx \n\t" /* rcx = all ones if tmp < low (borrow), else 0 */ \
+ "and %%ecx , "tmp" \n\t" /* tmp survives only when the mask is set */ \
+ "xor %%rcx , "retq" \n\t" /* invert all 64 bits of ret when the mask is set */ \
+ "sub "tmp" , "low" \n\t" /* low -= tmp (or 0 when the mask is clear) */
+#else /* HAVE_FAST_CMOV */
+#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) /* mask-based variant using only sub/sar/and/add/xor; ecx on entry as in the cmov variant */ \
+/* P4 Prescott has crappy cmov,sbb,64bit shift so avoid them */ \
+ "sub "low" , "tmp" \n\t" /* tmp -= low */ \
+ "sar $31 , "tmp" \n\t" /* with a 32-bit operand: tmp = all ones if the difference is negative, else 0 */ \
+ "sub %%ecx , "range" \n\t" /* this and the next two lines: range = mask ? range : ecx */ \
+ "and "tmp" , "range" \n\t"\
+ "add %%ecx , "range" \n\t"\
+ "shl $17 , %%ecx \n\t"\
+ "and "tmp" , %%ecx \n\t"\
+ "sub %%ecx , "low" \n\t" /* low -= mask & (ecx << 17) */ \
+ "xor "tmp" , "ret" \n\t" /* invert the 32-bit ret when the mask is set */ \
+ "movslq "ret" , "retq" \n\t" /* sign-extend ret into retq, which BRANCHLESS_GET_CABAC uses as a table index */
+#endif /* HAVE_FAST_CMOV */
+
+#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) /* BROKEN_RELOCATIONS variant: every table lookup is addressed as offset(tables, index); the end argument is not used in this body; ecx/rcx is used as scratch */ \
+ "movzbl "statep" , "ret" \n\t" /* ret = state byte, zero-extended */ \
+ "mov "range" , "tmp" \n\t" /* tmp = range */ \
+ "and $0xC0 , "range" \n\t" /* keep bits 6..7 of range */ \
+ "lea ("ret", "range", 2), %%ecx \n\t" /* ecx = ret + 2 * (range & 0xC0) */ \
+ "movzbl "lps_off"("tables", %%rcx), "range" \n\t" /* range = byte at tables + lps_off + rcx */ \
+ "sub "range" , "tmp" \n\t" /* tmp = old range - looked-up value */ \
+ "mov "tmp" , %%ecx \n\t" /* keep the unshifted difference in ecx for the update step */ \
+ "shl $17 , "tmp" \n\t"\
+ BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
+ "movzbl "norm_off"("tables", "rangeq"), %%ecx \n\t" /* ecx = shift count looked up at tables + norm_off + range */ \
+ "shl %%cl , "range" \n\t"\
+ "movzbl "mlps_off"+128("tables", "retq"), "tmp" \n\t" /* tmp = byte at tables + mlps_off + 128 + retq; the update step may have inverted ret, so retq can be negative here */ \
+ "shl %%cl , "low" \n\t"\
+ "mov "tmpbyte" , "statep" \n\t" /* store the looked-up byte back as the new state */ \
+ "test "lowword" , "lowword" \n\t"\
+ "jnz 2f \n\t" /* skip the refill below unless the low 16 bits of low are all zero */ \
+ "mov "byte" , %%"REG_c" \n\t" /* load the pointer stored at byte */ \
+ "add"OPSIZE" $2 , "byte" \n\t" /* advance the stored pointer by 2 */ \
+ "movzwl (%%"REG_c") , "tmp" \n\t" /* read 16 bits from the old pointer position */ \
+ "lea -1("low") , %%ecx \n\t"\
+ "xor "low" , %%ecx \n\t" /* ecx = low ^ (low - 1) */ \
+ "shr $15 , %%ecx \n\t"\
+ "bswap "tmp" \n\t"\
+ "shr $15 , "tmp" \n\t" /* bswap + shr 15: the two bytes just read, in big-endian order, shifted left by one */ \
+ "movzbl "norm_off"("tables", %%rcx), %%ecx \n\t"\
+ "sub $0xFFFF , "tmp" \n\t"\
+ "neg %%ecx \n\t"\
+ "add $7 , %%ecx \n\t" /* ecx = 7 - looked-up value */ \
+ "shl %%cl , "tmp" \n\t"\
+ "add "tmp" , "low" \n\t" /* merge the new bits into low */ \
+ "2: \n\t" /* jnz target: refill skipped */
+
+#else /* BROKEN_RELOCATIONS */
+#define TABLES_ARG /* empty: without BROKEN_RELOCATIONS no tables operand is appended to the asm inputs */
+#define RIP_ARG /* empty; NOTE(review): not referenced anywhere in the visible part of this patch */
+
#if HAVE_FAST_CMOV
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
"mov "tmp" , %%ecx \n\t"\
@@ -52,7 +115,7 @@
"xor "tmp" , "ret" \n\t"
#endif /* HAVE_FAST_CMOV */
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off) \
+#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
"movzbl "statep" , "ret" \n\t"\
"mov "range" , "tmp" \n\t"\
"and $0xC0 , "range" \n\t"\
@@ -82,31 +145,41 @@
"add "tmp" , "low" \n\t"\
"2: \n\t"
+#endif /* BROKEN_RELOCATIONS */
-#if HAVE_7REGS && !defined(BROKEN_RELOCATIONS) && !(defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
- && !(defined(__i386) && !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)
+
+#if HAVE_7REGS && !(defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
+ && !(defined(__i386) && !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)
#define get_cabac_inline get_cabac_inline_x86
/*
 * x86 inline-assembly implementation installed as get_cabac_inline.
 * Runs BRANCHLESS_GET_CABAC on c->low, c->range and the byte at state
 * (all updated in place) and returns the lowest bit of the asm result.
 */
static av_always_inline int get_cabac_inline_x86(CABACContext *c,
uint8_t *const state)
{
int bit, tmp;
+#ifdef BROKEN_RELOCATIONS
+ void *tables; /* receives the address of ff_h264_cabac_tables from the lea below */
+
+ __asm__ volatile(
+ "lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
+ : "=&r"(tables)
+ );
+#endif
__asm__ volatile(
- BRANCHLESS_GET_CABAC("%0", "(%4)", "%1", "%w1",
- "%2", "%3", "%b3",
- "%a6(%5)", "%a7(%5)", "%a8", "%a9", "%a10")
+ BRANCHLESS_GET_CABAC("%0", "%q0", "(%4)", "%1", "%w1",
+ "%2", "%q2", "%3", "%b3",
+ "%a6(%5)", "%a7(%5)", "%a8", "%a9", "%a10", "%11")
: "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp) /* operands %0..%3 */
: "r"(state), "r"(c), /* operands %4, %5 */
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end)),
"i"(H264_NORM_SHIFT_OFFSET),
"i"(H264_LPS_RANGE_OFFSET),
- "i"(H264_MLPS_STATE_OFFSET)
+ "i"(H264_MLPS_STATE_OFFSET) TABLES_ARG
: "%"REG_c, "memory" /* the macro uses the c register as scratch and writes through state and c */
);
return bit & 1; /* only the lowest bit of the asm result is returned */
}
-#endif /* HAVE_7REGS && !defined(BROKEN_RELOCATIONS) */
+#endif /* HAVE_7REGS */
#define get_cabac_bypass_sign get_cabac_bypass_sign_x86
static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)