VP8: ARM optimised decode_block_coeffs_internal

Approximately 5% faster on Cortex-A8. Signed-off-by: Mans Rullgard <mans@mansr.com>
author: Mans Rullgard <mans@mansr.com> 2011-02-01 22:38:15 +0000
committer: Mans Rullgard <mans@mansr.com> 2011-02-11 15:48:11 +0000
commit: a7878c9f73c12cfa685bd8af8f3afcca85f56a8b (patch)
tree: 8d78fafcffa217021277ffe000f35edaa9e9adae /libavcodec/arm
parent: 7da48fd0111adf504cfcfc5ebda7fd0681968041 (diff)
download: ffmpeg-a7878c9f73c12cfa685bd8af8f3afcca85f56a8b.tar.gz
3 files changed, 251 insertions, 0 deletions
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 15269ea676..d223703cfe 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -3,6 +3,7 @@ OBJS-$(CONFIG_DCA_DECODER)             += arm/dcadsp_init_arm.o         \
 OBJS-$(CONFIG_VP5_DECODER)             += arm/vp56dsp_init_arm.o
 OBJS-$(CONFIG_VP6_DECODER)             += arm/vp56dsp_init_arm.o
 OBJS-$(CONFIG_VP8_DECODER)             += arm/vp8dsp_init_arm.o
+ARMV6-OBJS-$(CONFIG_VP8_DECODER)       += arm/vp8_armv6.o
 
 OBJS-$(CONFIG_H264DSP)                 += arm/h264dsp_init_arm.o
 OBJS-$(CONFIG_H264PRED)                += arm/h264pred_init_arm.o
@@ -23,6 +24,7 @@ OBJS-$(HAVE_ARMV5TE)                   += arm/dsputil_init_armv5te.o    \
 OBJS-$(HAVE_ARMV6)                     += arm/dsputil_init_armv6.o      \
                                           arm/dsputil_armv6.o           \
                                           arm/simple_idct_armv6.o       \
+                                          $(ARMV6-OBJS-yes)
 
 VFP-OBJS-$(HAVE_ARMV6)                 += arm/fmtconvert_vfp.o          \
 
diff --git a/libavcodec/arm/vp8.h b/libavcodec/arm/vp8.h
new file mode 100644
index 0000000000..90e7344b62
--- /dev/null
+++ b/libavcodec/arm/vp8.h
@@ -0,0 +1,29 @@
+/**
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_VP8_H
+#define AVCODEC_ARM_VP8_H
+
+#if HAVE_ARMV6 /* only when the build has ARMv6 support */
+#define decode_block_coeffs_internal ff_decode_block_coeffs_armv6 /* the generic name now expands to the routine defined in arm/vp8_armv6.S */
+int ff_decode_block_coeffs_armv6(VP56RangeCoder *rc, DCTELEM block[16],
+                                 uint8_t probs[8][3][NUM_DCT_TOKENS-1],
+                                 int i, uint8_t *token_prob, int16_t qmul[2]);
+#endif /* HAVE_ARMV6 */
+
+#endif /* AVCODEC_ARM_VP8_H */
diff --git a/libavcodec/arm/vp8_armv6.S b/libavcodec/arm/vp8_armv6.S
new file mode 100644
index 0000000000..54c036b82a
--- /dev/null
+++ b/libavcodec/arm/vp8_armv6.S
@@ -0,0 +1,220 @@
+/**
+ * Copyright (C) 2010 Mans Rullgard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+        .syntax         unified
+
+.macro rac_get_prob     h, bs, buf, cw, pr, t0, t1
+        adds            \bs, \bs, \t0                   @ in: t0 = shift count, pr = probability; bs += t0, the carry out gates the refill (cs) steps below
+        lsl             \cw, \cw, \t0                   @ cw <<= shift
+        lsl             \t0, \h,  \t0                   @ t0 = h << shift
+        rsb             \h,  \pr, #256                  @ h = 256 - pr
+        ldrhcs          \t1, [\buf], #2                 @ on carry: t1 = next halfword from buf, buf += 2
+        smlabb          \h,  \t0, \pr, \h               @ h = t0 * pr + (256 - pr), using the low 16 bits of t0 and pr
+        rev16cs         \t1, \t1                        @ on carry: swap the two bytes of the loaded halfword
+        orrcs           \cw, \cw, \t1, lsl \bs          @ on carry: cw |= t1 << bs
+        subcs           \bs, \bs, #16                   @ on carry: bs -= 16
+        lsr             \h,  \h,  #8                    @ h = split = (t0 * pr + 256 - pr) >> 8
+        cmp             \cw, \h,  lsl #16               @ compare cw with split << 16; callers test the resulting ge/lt
+        subge           \cw, \cw, \h,  lsl #16          @ ge: cw -= split << 16
+        subge           \h,  \t0, \h                    @ ge: h = t0 - split; lt: h stays equal to split
+.endm
+
+.macro rac_get_128      h, bs, buf, cw, t0, t1
+        adds            \bs, \bs, \t0                   @ in: t0 = shift count; bs += t0, the carry out gates the refill (cs) steps below
+        lsl             \cw, \cw, \t0                   @ cw <<= shift
+        lsl             \t0, \h,  \t0                   @ t0 = h << shift
+        ldrhcs          \t1, [\buf], #2                 @ on carry: t1 = next halfword from buf, buf += 2
+        mov             \h,  #128                       @ h = 128
+        rev16cs         \t1, \t1                        @ on carry: swap the two bytes of the loaded halfword
+        add             \h,  \h,  \t0, lsl #7           @ h = 128 + (t0 << 7)
+        orrcs           \cw, \cw, \t1, lsl \bs          @ on carry: cw |= t1 << bs
+        subcs           \bs, \bs, #16                   @ on carry: bs -= 16
+        lsr             \h,  \h,  #8                    @ h = split = (128 + 128 * t0) >> 8
+        cmp             \cw, \h,  lsl #16               @ compare cw with split << 16; callers test the resulting ge/lt
+        subge           \cw, \cw, \h,  lsl #16          @ ge: cw -= split << 16
+        subge           \h,  \t0, \h                    @ ge: h = t0 - split; lt: h stays equal to split
+.endm
+
+function ff_decode_block_coeffs_armv6, export=1
+        push            {r0,r1,r4-r11,lr}               @ in (per the C prototype): r0 = rc, r1 = block, r2 = probs, r3 = i; 11 words pushed, rc kept at [sp], block at [sp, #4]
+        movrel          lr,  ff_vp56_norm_shift
+        ldrd            r4,  r5,  [sp, #44]             @ token_prob, qmul (stack arguments, just above the 44 bytes pushed)
+        cmp             r3,  #0                         @ flags for the pkhtbne below: is i nonzero?
+        ldr             r11, [r5]                       @ r11 = one 32-bit word covering both 16-bit qmul entries
+        ldm             r0,  {r5-r7}                    @ high, bits, buf
+        pkhtbne         r11, r11, r11, asr #16          @ i != 0: copy the top half of r11 into its bottom half
+        ldr             r8,  [r0, #16]                  @ code_word
+0:                                                      @ decode the next token; r4 = current token_prob row
+        ldrb            r9,  [lr, r5]                   @ r9 = ff_vp56_norm_shift[high] (lr holds the table address)
+        add             r3,  r3,  #1                    @ i++ up front: r3 is one past the position being decoded
+        ldrb            r0,  [r4, #1]                   @ r0 = token_prob[1], tested by the macro below
+        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
+        blt             2f                              @ lt: nothing is stored for this position
+
+        ldrb            r9,  [lr, r5]                   @ r9 = shift for the updated high
+        ldrb            r0,  [r4, #2]                   @ r0 = token_prob[2]
+        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
+        ldrb            r9,  [lr, r5]                   @ r9 = shift for the updated high (loads leave the flags alone)
+        bge             3f                              @ ge: magnitude is 2 or more
+
+        add             r4,  r3,  r3,  lsl #5           @ magnitude 1 path: r4 = 33 * r3
+        sxth            r12, r11                        @ r12 = sign-extended bottom half of r11 (1 times the factor)
+        add             r4,  r2,  r4                    @ r4 = probs + 33 * r3
+        adds            r6,  r6,  r9                    @ from here to the second subge: same steps as rac_get_128, interleaved with other work
+        add             r4,  r4,  #11                   @ r4 = probs + 33 * r3 + 11: token_prob row for the next token
+        lsl             r8,  r8,  r9                    @ code_word <<= shift
+        ldrhcs          r10, [r7], #2                   @ on carry: r10 = next halfword from buf, buf += 2
+        lsl             r9,  r5,  r9                    @ r9 = high << shift
+        mov             r5,  #128                       @ r5 = 128
+        rev16cs         r10, r10                        @ on carry: swap the two bytes of the halfword
+        add             r5,  r5,  r9,  lsl #7           @ r5 = 128 + (r9 << 7)
+        orrcs           r8,  r8,  r10, lsl r6           @ on carry: code_word |= r10 << bits
+        subcs           r6,  r6,  #16                   @ on carry: bits -= 16
+        lsr             r5,  r5,  #8                    @ r5 = split
+        cmp             r8,  r5,  lsl #16               @ ge selects the negation at rsbge below
+        movrel          r10, zigzag_scan-1
+        subge           r8,  r8,  r5,  lsl #16          @ ge: code_word -= split << 16
+        subge           r5,  r9,  r5                    @ ge: high = r9 - split
+        ldrb            r10, [r10, r3]                  @ r10 = zigzag_scan[r3 - 1], used as a byte offset into block
+        rsbge           r12, r12, #0                    @ ge: r12 = -r12
+        cmp             r3,  #16                        @ was this the last position?
+        strh            r12, [r1, r10]                  @ store the 16-bit coefficient
+        bge             6f                              @ r3 >= 16: finish
+5:                                                      @ after a stored coefficient: test token_prob[0] before going on
+        ldrb            r9,  [lr, r5]                   @ r9 = shift for the current high
+        ldrb            r0,  [r4]                       @ r0 = token_prob[0]
+        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
+        pkhtb           r11, r11, r11, asr #16          @ both halves of r11 now hold its former top half
+        bge             0b                              @ ge: decode another token; lt: fall through and return
+
+6:                                                      @ write the coder state back and return r3
+        ldr             r0,  [sp]                       @ r0 = rc as saved by the push
+        ldr             r9,  [r0, #12]                  @ r9 = word at rc + 12; NOTE(review): presumably the buffer end pointer, confirm against VP56RangeCoder
+        cmp             r7,  r9                         @ buf beyond that limit?
+        movhi           r7,  r9                         @ unsigned higher: clamp buf to the limit
+        stm             r0,  {r5-r7}                    @ high, bits, buf
+        str             r8,  [r0, #16]                  @ code_word
+
+        add             sp,  sp,  #8                    @ drop the saved r0 and r1
+        mov             r0,  r3                         @ return value = r3
+        pop             {r4-r11,pc}                     @ restore callee-saved registers and return
+2:                                                      @ token_prob[1] test came out lt
+        add             r4,  r3,  r3,  lsl #5           @ r4 = 33 * r3
+        cmp             r3,  #16                        @ all positions consumed?
+        add             r4,  r2,  r4                    @ r4 = probs + 33 * r3: token_prob row for the next token
+        pkhtb           r11, r11, r11, asr #16          @ both halves of r11 now hold its former top half
+        bne             0b                              @ r3 != 16: next token, skipping the token_prob[0] test
+        b               6b                              @ r3 == 16: finish
+3:                                                      @ magnitude >= 2; r9 = shift for the current high
+        ldrb            r0,  [r4, #3]                   @ r0 = token_prob[3]
+        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
+        ldrb            r9,  [lr, r5]                   @ r9 = shift for the updated high
+        bge             1f                              @ ge: magnitude is 5 or more
+
+        mov             r12, #2                         @ r12 = 2, raised by up to two more tests
+        ldrb            r0,  [r4, #4]                   @ r0 = token_prob[4]
+        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
+        addge           r12, #1                         @ ge: r12 = 3
+        ldrb            r9,  [lr, r5]                   @ r9 = shift for the updated high
+        blt             4f                              @ lt: magnitude is 2
+        ldrb            r0,  [r4, #5]                   @ r0 = token_prob[5]
+        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
+        addge           r12, #1                         @ ge: r12 = 4
+        ldrb            r9,  [lr, r5]                   @ r9 = shift for the updated high
+        b               4f                              @ magnitude is 3 or 4
+1:                                                      @ magnitude >= 5
+        ldrb            r0,  [r4, #6]                   @ r0 = token_prob[6]
+        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
+        ldrb            r9,  [lr, r5]                   @ r9 = shift for the updated high
+        bge             3f                              @ ge: table-driven categories
+
+        ldrb            r0,  [r4, #7]                   @ r0 = token_prob[7]
+        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
+        ldrb            r9,  [lr, r5]                   @ r9 = shift for the updated high
+        bge             2f                              @ ge: base magnitude 7
+
+        mov             r12, #5                         @ base magnitude 5
+        mov             r0,  #159                       @ constant probability for the single extra bit
+        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
+        addge           r12, r12, #1                    @ ge: r12 = 6
+        ldrb            r9,  [lr, r5]                   @ r9 = shift for the updated high
+        b               4f                              @ magnitude is 5 or 6
+2:                                                      @ base magnitude 7 plus two extra bits
+        mov             r12, #7                         @ r12 = 7
+        mov             r0,  #165                       @ constant probability for the first extra bit
+        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
+        addge           r12, r12, #2                    @ ge: first extra bit adds 2
+        ldrb            r9,  [lr, r5]                   @ r9 = shift for the updated high
+        mov             r0,  #145                       @ constant probability for the second extra bit
+        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
+        addge           r12, r12, #1                    @ ge: second extra bit adds 1 (r12 is 7..10)
+        ldrb            r9,  [lr, r5]                   @ r9 = shift for the updated high
+        b               4f                              @ go read the sign
+3:                                                      @ categories selected by two more bits, extra bits driven by ff_vp8_dct_cat_prob
+        ldrb            r0,  [r4, #8]                   @ r0 = token_prob[8]
+        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
+        addge           r4,  r4,  #1                    @ ge: the load at offset 9 below then reads token_prob[10] instead of [9]
+        ldrb            r9,  [lr, r5]                   @ r9 = shift for the updated high
+        movge           r12, #2                         @ ge: category index starts at 2
+        movlt           r12, #0                         @ lt: category index starts at 0
+        ldrb            r0,  [r4, #9]                   @ r0 = token_prob[9] or [10], see addge above
+        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
+        mov             r9,  #8                         @ r9 = 8 (mov leaves the flags for the addge below)
+        addge           r12, r12, #1                    @ ge: category index += 1, giving 0..3
+        movrel          r4,  ff_vp8_dct_cat_prob
+        lsl             r9,  r9,  r12                   @ r9 = 8 << category
+        ldr             r4,  [r4, r12, lsl #2]          @ r4 = ff_vp8_dct_cat_prob[category], a pointer to a byte list
+        add             r12, r9,  #3                    @ r12 = base magnitude = 3 + (8 << category)
+        mov             r1,  #0                         @ r1 (block pointer) is borrowed as accumulator for the extra bits
+        ldrb            r0,  [r4], #1                   @ r0 = first probability of the list, r4 advances
+1:                                                      @ one extra bit per list entry
+        ldrb            r9,  [lr, r5]                   @ r9 = shift for the current high
+        lsl             r1,  r1,  #1                    @ make room for the next bit
+        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
+        ldrb            r0,  [r4], #1                   @ r0 = next probability, r4 advances
+        addge           r1,  r1,  #1                    @ ge: the decoded bit is 1
+        cmp             r0,  #0                         @ a zero byte terminates the list
+        bne             1b                              @ more entries: decode another extra bit
+        ldrb            r9,  [lr, r5]                   @ r9 = shift for the updated high
+        add             r12, r12, r1                    @ magnitude = base + extra bits
+        ldr             r1,  [sp, #4]                   @ restore the block pointer saved by the push
+4:                                                      @ in: r12 = magnitude, r9 = shift for the current high
+        add             r4,  r3,  r3,  lsl #5           @ r4 = 33 * r3
+        add             r4,  r2,  r4                    @ r4 = probs + 33 * r3
+        add             r4,  r4,  #22                   @ r4 = probs + 33 * r3 + 22: token_prob row for the next token
+        rac_get_128     r5,  r6,  r7,  r8,  r9,  r10
+        rsbge           r12, r12, #0                    @ ge: negate the magnitude
+        smulbb          r12, r12, r11                   @ r12 = r12 * bottom half of r11 (signed 16-bit operands)
+        movrel          r9,  zigzag_scan-1
+        ldrb            r9,  [r9, r3]                   @ r9 = zigzag_scan[r3 - 1], used as a byte offset into block
+        cmp             r3,  #16                        @ was this the last position?
+        strh            r12, [r1, r9]                   @ store the 16-bit coefficient
+        bge             6b                              @ r3 >= 16: finish
+        b               5b                              @ otherwise test token_prob[0] and continue
+endfunc
+
+        .section        .rodata                         @ read-only table used by ff_decode_block_coeffs_armv6
+zigzag_scan:                                            @ read through base zigzag_scan-1 with the already incremented position counter
+        .byte            0,  2,  8, 16                  @ each entry is added to the block pointer as the byte offset of a halfword store
+        .byte           10,  4,  6, 12                  @ entries 4..7
+        .byte           18, 24, 26, 20                  @ entries 8..11
+        .byte           14, 22, 28, 30                  @ entries 12..15
author	Mans Rullgard <mans@mansr.com>	2011-02-01 22:38:15 +0000
committer	Mans Rullgard <mans@mansr.com>	2011-02-11 15:48:11 +0000
commit	a7878c9f73c12cfa685bd8af8f3afcca85f56a8b (patch)
tree	8d78fafcffa217021277ffe000f35edaa9e9adae /libavcodec/arm
parent	7da48fd0111adf504cfcfc5ebda7fd0681968041 (diff)
download	ffmpeg-a7878c9f73c12cfa685bd8af8f3afcca85f56a8b.tar.gz