summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Scott LaVarnway <slavarnway@google.com> 2011-11-17 12:54:42 -0500
committer Tero Rintaluoma <teror@google.com> 2011-11-25 09:24:04 +0200
commit 4a91541c946c1fc2655a942ec79033618f03c4ca (patch)
tree 70093355ebd25dd2c79515f7950c8490f6937355
parent 7b0feac4a4386eef3e1ea851e52e4f30935e255d (diff)
download libvpx-4a91541c946c1fc2655a942ec79033618f03c4ca.tar.gz
Modified the inverse Walsh transform to output directly to the dqcoeff or qcoeff buffer.

Previously, the encoder populated the DC coefficients of the Y blocks in a separate stage (recon_dcblock), and the decoder used a special version of the IDCT. This change eliminates the extra copy and reduces the code footprint.

[Tero] Added the needed changes to the ARMv6 and NEON assembly.

Change-Id: I83202ffdbaf83f6e5dd69f4ba2519fcf0b13b3ba
-rw-r--r-- vp8/common/arm/arm_systemdependent.c | 2
-rw-r--r-- vp8/common/arm/armv6/iwalsh_v6.asm | 124
-rw-r--r-- vp8/common/arm/idct_arm.h | 6
-rw-r--r-- vp8/common/arm/neon/iwalsh_neon.asm | 37
-rw-r--r-- vp8/common/idct.h | 4
-rw-r--r-- vp8/common/idctllm.c | 20
-rw-r--r-- vp8/common/invtrans.c | 16
-rw-r--r-- vp8/common/x86/idct_x86.h | 4
-rw-r--r-- vp8/common/x86/iwalsh_mmx.asm | 84
-rw-r--r-- vp8/common/x86/iwalsh_sse2.asm | 46
-rw-r--r-- vp8/common/x86/x86_systemdependent.c | 3
-rw-r--r-- vp8/decoder/arm/arm_dsystemdependent.c | 5
-rw-r--r-- vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm | 213
-rw-r--r-- vp8/decoder/arm/armv6/idct_blk_v6.c | 41
-rw-r--r-- vp8/decoder/arm/dequantize_arm.h | 16
-rw-r--r-- vp8/decoder/arm/neon/idct_blk_neon.c | 35
-rw-r--r-- vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm | 75
-rw-r--r-- vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm | 208
-rw-r--r-- vp8/decoder/decodframe.c | 68
-rw-r--r-- vp8/decoder/dequantize.c | 19
-rw-r--r-- vp8/decoder/dequantize.h | 27
-rw-r--r-- vp8/decoder/generic/dsystemdependent.c | 2
-rw-r--r-- vp8/decoder/idct_blk.c | 27
-rw-r--r-- vp8/decoder/threading.c | 101
-rw-r--r-- vp8/decoder/x86/dequantize_mmx.asm | 201
-rw-r--r-- vp8/decoder/x86/dequantize_x86.h | 12
-rw-r--r-- vp8/decoder/x86/idct_blk_mmx.c | 35
-rw-r--r-- vp8/decoder/x86/idct_blk_sse2.c | 37
-rw-r--r-- vp8/decoder/x86/x86_dsystemdependent.c | 4
-rw-r--r-- vp8/vp8dx_arm.mk | 3
30 files changed, 275 insertions, 1200 deletions
diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c
index b5f194d3d..cd55a6377 100644
--- a/vp8/common/arm/arm_systemdependent.c
+++ b/vp8/common/arm/arm_systemdependent.c
@@ -46,7 +46,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_armv6;
rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual;
- rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_v6;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_v6;
rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6;
@@ -80,7 +79,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_neon;
rtcd->idct.idct16 = vp8_short_idct4x4llm_neon;
- rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon;
rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_neon;
diff --git a/vp8/common/arm/armv6/iwalsh_v6.asm b/vp8/common/arm/armv6/iwalsh_v6.asm
index 463bff0f5..31ef09cad 100644
--- a/vp8/common/arm/armv6/iwalsh_v6.asm
+++ b/vp8/common/arm/armv6/iwalsh_v6.asm
@@ -9,7 +9,6 @@
;
EXPORT |vp8_short_inv_walsh4x4_v6|
- EXPORT |vp8_short_inv_walsh4x4_1_v6|
ARM
REQUIRE8
@@ -17,19 +16,19 @@
AREA |.text|, CODE, READONLY ; name this block of code
-;short vp8_short_inv_walsh4x4_v6(short *input, short *output)
+;short vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff)
|vp8_short_inv_walsh4x4_v6| PROC
- stmdb sp!, {r4 - r11, lr}
+ stmdb sp!, {r4 - r12, lr}
- ldr r2, [r0], #4 ; [1 | 0]
- ldr r3, [r0], #4 ; [3 | 2]
- ldr r4, [r0], #4 ; [5 | 4]
- ldr r5, [r0], #4 ; [7 | 6]
- ldr r6, [r0], #4 ; [9 | 8]
- ldr r7, [r0], #4 ; [11 | 10]
- ldr r8, [r0], #4 ; [13 | 12]
- ldr r9, [r0] ; [15 | 14]
+ ldr r2, [r0, #0] ; [1 | 0]
+ ldr r3, [r0, #4] ; [3 | 2]
+ ldr r4, [r0, #8] ; [5 | 4]
+ ldr r5, [r0, #12] ; [7 | 6]
+ ldr r6, [r0, #16] ; [9 | 8]
+ ldr r7, [r0, #20] ; [11 | 10]
+ ldr r8, [r0, #24] ; [13 | 12]
+ ldr r9, [r0, #28] ; [15 | 14]
qadd16 r10, r2, r8 ; a1 [1+13 | 0+12]
qadd16 r11, r4, r6 ; b1 [5+9 | 4+8]
@@ -69,24 +68,27 @@
qadd16 r4, r4, r10 ; [b2+3|c2+3]
qadd16 r5, r5, r10 ; [a2+3|d2+3]
- asr r12, r2, #3 ; [1 | x]
- pkhtb r12, r12, r3, asr #19; [1 | 0]
- lsl lr, r3, #16 ; [~3 | x]
- lsl r2, r2, #16 ; [~2 | x]
- asr lr, lr, #3 ; [3 | x]
- pkhtb lr, lr, r2, asr #19 ; [3 | 2]
-
- asr r2, r4, #3 ; [5 | x]
- pkhtb r2, r2, r5, asr #19 ; [5 | 4]
- lsl r3, r5, #16 ; [~7 | x]
- lsl r4, r4, #16 ; [~6 | x]
- asr r3, r3, #3 ; [7 | x]
- pkhtb r3, r3, r4, asr #19 ; [7 | 6]
-
- str r12, [r1], #4
- str lr, [r1], #4
- str r2, [r1], #4
- str r3, [r1], #4
+ asr r12, r3, #19 ; [0]
+ strh r12, [r1], #32
+ asr lr, r2, #19 ; [1]
+ strh lr, [r1], #32
+ sxth r2, r2
+ sxth r3, r3
+ asr r2, r2, #3 ; [2]
+ strh r2, [r1], #32
+ asr r3, r3, #3 ; [3]
+ strh r3, [r1], #32
+
+ asr r12, r5, #19 ; [4]
+ strh r12, [r1], #32
+ asr lr, r4, #19 ; [5]
+ strh lr, [r1], #32
+ sxth r4, r4
+ sxth r5, r5
+ asr r4, r4, #3 ; [6]
+ strh r4, [r1], #32
+ asr r5, r5, #3 ; [7]
+ strh r5, [r1], #32
qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11]
qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11]
@@ -103,50 +105,32 @@
qadd16 r8, r8, r10 ; [b2+3|c2+3]
qadd16 r9, r9, r10 ; [a2+3|d2+3]
- asr r2, r6, #3 ; [9 | x]
- pkhtb r2, r2, r7, asr #19 ; [9 | 8]
- lsl r3, r7, #16 ; [~11| x]
- lsl r4, r6, #16 ; [~10| x]
- asr r3, r3, #3 ; [11 | x]
- pkhtb r3, r3, r4, asr #19 ; [11 | 10]
-
- asr r4, r8, #3 ; [13 | x]
- pkhtb r4, r4, r9, asr #19 ; [13 | 12]
- lsl r5, r9, #16 ; [~15| x]
- lsl r6, r8, #16 ; [~14| x]
- asr r5, r5, #3 ; [15 | x]
- pkhtb r5, r5, r6, asr #19 ; [15 | 14]
-
- str r2, [r1], #4
- str r3, [r1], #4
- str r4, [r1], #4
- str r5, [r1]
-
- ldmia sp!, {r4 - r11, pc}
+ asr r12, r7, #19 ; [8]
+ strh r12, [r1], #32
+ asr lr, r6, #19 ; [9]
+ strh lr, [r1], #32
+ sxth r6, r6
+ sxth r7, r7
+ asr r6, r6, #3 ; [10]
+ strh r6, [r1], #32
+ asr r7, r7, #3 ; [11]
+ strh r7, [r1], #32
+
+ asr r12, r9, #19 ; [12]
+ strh r12, [r1], #32
+ asr lr, r8, #19 ; [13]
+ strh lr, [r1], #32
+ sxth r8, r8
+ sxth r9, r9
+ asr r8, r8, #3 ; [14]
+ strh r8, [r1], #32
+ asr r9, r9, #3 ; [15]
+ strh r9, [r1], #32
+
+ ldmia sp!, {r4 - r12, pc}
ENDP ; |vp8_short_inv_walsh4x4_v6|
-;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output)
-|vp8_short_inv_walsh4x4_1_v6| PROC
-
- ldrsh r2, [r0] ; [0]
- add r2, r2, #3 ; [0] + 3
- asr r2, r2, #3 ; a1 ([0]+3) >> 3
- lsl r2, r2, #16 ; [a1 | x]
- orr r2, r2, r2, lsr #16 ; [a1 | a1]
-
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1]
-
- bx lr
- ENDP ; |vp8_short_inv_walsh4x4_1_v6|
-
; Constant Pool
c0x00030003 DCD 0x00030003
END
diff --git a/vp8/common/arm/idct_arm.h b/vp8/common/arm/idct_arm.h
index c710c2eb0..68c0cad11 100644
--- a/vp8/common/arm/idct_arm.h
+++ b/vp8/common/arm/idct_arm.h
@@ -25,9 +25,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
#undef vp8_idct_idct1_scalar_add
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_v6
-#undef vp8_idct_iwalsh1
-#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_v6
-
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6
#endif
@@ -46,9 +43,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
#undef vp8_idct_idct1_scalar_add
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_neon
-#undef vp8_idct_iwalsh1
-#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon
-
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_neon
#endif
diff --git a/vp8/common/arm/neon/iwalsh_neon.asm b/vp8/common/arm/neon/iwalsh_neon.asm
index 01c79d937..e8ea2a619 100644
--- a/vp8/common/arm/neon/iwalsh_neon.asm
+++ b/vp8/common/arm/neon/iwalsh_neon.asm
@@ -8,7 +8,6 @@
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_inv_walsh4x4_neon|
- EXPORT |vp8_short_inv_walsh4x4_1_neon|
ARM
REQUIRE8
@@ -16,7 +15,7 @@
AREA |.text|, CODE, READONLY ; name this block of code
-;short vp8_short_inv_walsh4x4_neon(short *input, short *output)
+;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff)
|vp8_short_inv_walsh4x4_neon| PROC
; read in all four lines of values: d0->d3
@@ -59,22 +58,30 @@
vshr.s16 q0, q0, #3 ;e/f >> 3
vshr.s16 q1, q1, #3 ;g/h >> 3
- vst4.i16 {d0,d1,d2,d3}, [r1@128]
+ mov r2, #64
+ add r3, r1, #32
- bx lr
- ENDP ; |vp8_short_inv_walsh4x4_neon|
+ vst1.i16 d0[0], [r1],r2
+ vst1.i16 d1[0], [r3],r2
+ vst1.i16 d2[0], [r1],r2
+ vst1.i16 d3[0], [r3],r2
+
+ vst1.i16 d0[1], [r1],r2
+ vst1.i16 d1[1], [r3],r2
+ vst1.i16 d2[1], [r1],r2
+ vst1.i16 d3[1], [r3],r2
+ vst1.i16 d0[2], [r1],r2
+ vst1.i16 d1[2], [r3],r2
+ vst1.i16 d2[2], [r1],r2
+ vst1.i16 d3[2], [r3],r2
+
+ vst1.i16 d0[3], [r1],r2
+ vst1.i16 d1[3], [r3],r2
+ vst1.i16 d2[3], [r1]
+ vst1.i16 d3[3], [r3]
-;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
-|vp8_short_inv_walsh4x4_1_neon| PROC
- ldrsh r2, [r0] ; load input[0]
- add r3, r2, #3 ; add 3
- add r2, r1, #16 ; base for last 8 output
- asr r0, r3, #3 ; right shift 3
- vdup.16 q0, r0 ; load and duplicate
- vst1.16 {q0}, [r1@128] ; write back 8
- vst1.16 {q0}, [r2@128] ; write back last 8
bx lr
- ENDP ; |vp8_short_inv_walsh4x4_1_neon|
+ ENDP ; |vp8_short_inv_walsh4x4_neon|
END
diff --git a/vp8/common/idct.h b/vp8/common/idct.h
index 411a1b472..7371f85ff 100644
--- a/vp8/common/idct.h
+++ b/vp8/common/idct.h
@@ -37,6 +37,10 @@
#define vp8_idct_idct16 vp8_short_idct4x4llm_c
#endif
extern prototype_idct(vp8_idct_idct16);
+/* add this prototype to prevent compiler warning about implicit
+ * declaration of vp8_short_idct4x4llm_c function in dequantize.c
+ * when building, for example, neon optimized version */
+extern prototype_idct(vp8_short_idct4x4llm_c);
#ifndef vp8_idct_idct1_scalar_add
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_c
diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c
index 49496abef..47af52f04 100644
--- a/vp8/common/idctllm.c
+++ b/vp8/common/idctllm.c
@@ -137,8 +137,9 @@ void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
}
-void vp8_short_inv_walsh4x4_c(short *input, short *output)
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff)
{
+ short output[16];
int i;
int a1, b1, c1, d1;
int a2, b2, c2, d2;
@@ -183,22 +184,21 @@ void vp8_short_inv_walsh4x4_c(short *input, short *output)
ip += 4;
op += 4;
}
+
+ for(i = 0; i < 16; i++)
+ {
+ mb_dqcoeff[i * 16] = output[i];
+ }
}
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output)
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff)
{
int i;
int a1;
- short *op = output;
a1 = ((input[0] + 3) >> 3);
-
- for (i = 0; i < 4; i++)
+ for(i = 0; i < 16; i++)
{
- op[0] = a1;
- op[1] = a1;
- op[2] = a1;
- op[3] = a1;
- op += 4;
+ mb_dqcoeff[i * 16] = a1;
}
}
diff --git a/vp8/common/invtrans.c b/vp8/common/invtrans.c
index 478cb329f..95e6980fe 100644
--- a/vp8/common/invtrans.c
+++ b/vp8/common/invtrans.c
@@ -28,18 +28,6 @@ void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b,
}
-static void recon_dcblock(MACROBLOCKD *x)
-{
- BLOCKD *b = &x->block[24];
- int i;
-
- for (i = 0; i < 16; i++)
- {
- x->block[i].dqcoeff[0] = b->diff[i];
- }
-
-}
-
void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
{
int i;
@@ -47,9 +35,7 @@ void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *
if(x->mode_info_context->mbmi.mode != SPLITMV)
{
/* do 2nd order transform on the dc block */
- IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->block[24].diff);
-
- recon_dcblock(x);
+ IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->dqcoeff);
}
for (i = 0; i < 16; i++)
diff --git a/vp8/common/x86/idct_x86.h b/vp8/common/x86/idct_x86.h
index f9e3a794d..06e3ea4b5 100644
--- a/vp8/common/x86/idct_x86.h
+++ b/vp8/common/x86/idct_x86.h
@@ -24,7 +24,6 @@ extern prototype_idct(vp8_short_idct4x4llm_mmx);
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_mmx);
extern prototype_second_order(vp8_short_inv_walsh4x4_mmx);
-extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_idct_idct16
@@ -36,9 +35,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx);
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_mmx
-#undef vp8_idct_iwalsh1
-#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_mmx
-
#endif
#endif
diff --git a/vp8/common/x86/iwalsh_mmx.asm b/vp8/common/x86/iwalsh_mmx.asm
index 10b5274dc..3ab066ba4 100644
--- a/vp8/common/x86/iwalsh_mmx.asm
+++ b/vp8/common/x86/iwalsh_mmx.asm
@@ -11,42 +11,6 @@
%include "vpx_ports/x86_abi_support.asm"
-;void vp8_short_inv_walsh4x4_1_mmx(short *input, short *output)
-global sym(vp8_short_inv_walsh4x4_1_mmx)
-sym(vp8_short_inv_walsh4x4_1_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0)
- mov rax, 3
-
- mov rdi, arg(1)
- add rax, [rsi] ;input[0] + 3
-
- movd mm0, eax
-
- punpcklwd mm0, mm0 ;x x val val
-
- punpckldq mm0, mm0 ;val val val val
-
- psraw mm0, 3 ;(input[0] + 3) >> 3
-
- movq [rdi + 0], mm0
- movq [rdi + 8], mm0
- movq [rdi + 16], mm0
- movq [rdi + 24], mm0
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
global sym(vp8_short_inv_walsh4x4_mmx)
sym(vp8_short_inv_walsh4x4_mmx):
@@ -159,10 +123,50 @@ sym(vp8_short_inv_walsh4x4_mmx):
psraw mm2, 3
psraw mm3, 3
- movq [rdi + 0], mm0
- movq [rdi + 8], mm1
- movq [rdi + 16], mm2
- movq [rdi + 24], mm3
+; movq [rdi + 0], mm0
+; movq [rdi + 8], mm1
+; movq [rdi + 16], mm2
+; movq [rdi + 24], mm3
+
+ movd eax, mm0
+ psrlq mm0, 32
+ mov word ptr[rdi+32*0], ax
+ shr eax, 16
+ mov word ptr[rdi+32*1], ax
+ movd eax, mm0
+ mov word ptr[rdi+32*2], ax
+ shr eax, 16
+ mov word ptr[rdi+32*3], ax
+
+ movd ecx, mm1
+ psrlq mm1, 32
+ mov word ptr[rdi+32*4], cx
+ shr ecx, 16
+ mov word ptr[rdi+32*5], cx
+ movd ecx, mm1
+ mov word ptr[rdi+32*6], cx
+ shr ecx, 16
+ mov word ptr[rdi+32*7], cx
+
+ movd eax, mm2
+ psrlq mm2, 32
+ mov word ptr[rdi+32*8], ax
+ shr eax, 16
+ mov word ptr[rdi+32*9], ax
+ movd eax, mm2
+ mov word ptr[rdi+32*10], ax
+ shr eax, 16
+ mov word ptr[rdi+32*11], ax
+
+ movd ecx, mm3
+ psrlq mm3, 32
+ mov word ptr[rdi+32*12], cx
+ shr ecx, 16
+ mov word ptr[rdi+32*13], cx
+ movd ecx, mm3
+ mov word ptr[rdi+32*14], cx
+ shr ecx, 16
+ mov word ptr[rdi+32*15], cx
; begin epilog
pop rdi
diff --git a/vp8/common/x86/iwalsh_sse2.asm b/vp8/common/x86/iwalsh_sse2.asm
index 1da4fd8da..5a7133d6c 100644
--- a/vp8/common/x86/iwalsh_sse2.asm
+++ b/vp8/common/x86/iwalsh_sse2.asm
@@ -96,8 +96,50 @@ sym(vp8_short_inv_walsh4x4_sse2):
psraw xmm5, 3
psraw xmm1, 3
- movdqa [rdi + 0], xmm5
- movdqa [rdi + 16], xmm1
+;; movdqa [rdi + 0], xmm5
+;; movdqa [rdi + 16], xmm1
+
+ movd eax, xmm5
+ psrldq xmm5, 4
+ mov word ptr[rdi+32*0], ax
+ shr eax, 16
+ mov word ptr[rdi+32*1], ax
+ movd eax, xmm5
+ psrldq xmm5, 4
+ mov word ptr[rdi+32*2], ax
+ shr eax, 16
+ mov word ptr[rdi+32*3], ax
+
+ movd eax, xmm5
+ psrldq xmm5, 4
+ mov word ptr[rdi+32*4], ax
+ shr eax, 16
+ mov word ptr[rdi+32*5], ax
+ movd eax, xmm5
+ mov word ptr[rdi+32*6], ax
+ shr eax, 16
+ mov word ptr[rdi+32*7], ax
+
+ movd eax, xmm1
+ psrldq xmm1, 4
+ mov word ptr[rdi+32*8], ax
+ shr eax, 16
+ mov word ptr[rdi+32*9], ax
+ movd eax, xmm1
+ psrldq xmm1, 4
+ mov word ptr[rdi+32*10], ax
+ shr eax, 16
+ mov word ptr[rdi+32*11], ax
+
+ movd eax, xmm1
+ psrldq xmm1, 4
+ mov word ptr[rdi+32*12], ax
+ shr eax, 16
+ mov word ptr[rdi+32*13], ax
+ movd eax, xmm1
+ mov word ptr[rdi+32*14], ax
+ shr eax, 16
+ mov word ptr[rdi+32*15], ax
; begin epilog
pop rdi
diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c
index c4e616a67..b24cbe48f 100644
--- a/vp8/common/x86/x86_systemdependent.c
+++ b/vp8/common/x86/x86_systemdependent.c
@@ -40,9 +40,6 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
rtcd->idct.idct16 = vp8_short_idct4x4llm_mmx;
rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_mmx;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_mmx;
- rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_mmx;
-
-
rtcd->recon.copy8x8 = vp8_copy_mem8x8_mmx;
rtcd->recon.copy8x4 = vp8_copy_mem8x4_mmx;
diff --git a/vp8/decoder/arm/arm_dsystemdependent.c b/vp8/decoder/arm/arm_dsystemdependent.c
index 1b0091cdb..f802c5181 100644
--- a/vp8/decoder/arm/arm_dsystemdependent.c
+++ b/vp8/decoder/arm/arm_dsystemdependent.c
@@ -32,8 +32,6 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
{
pbi->dequant.block = vp8_dequantize_b_v6;
pbi->dequant.idct_add = vp8_dequant_idct_add_v6;
- pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_v6;
- pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_v6;
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_v6;
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_v6;
}
@@ -44,9 +42,6 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
{
pbi->dequant.block = vp8_dequantize_b_neon;
pbi->dequant.idct_add = vp8_dequant_idct_add_neon;
- /*This is not used: NEON always dequants two blocks at once.
- pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_neon;*/
- pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_neon;
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_neon;
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_neon;
}
diff --git a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm b/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
deleted file mode 100644
index 19f94e089..000000000
--- a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
+++ /dev/null
@@ -1,213 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
- EXPORT |vp8_dequant_dc_idct_add_v6|
-
- AREA |.text|, CODE, READONLY
-
-;void vp8_dequant_dc_idct_v6(short *input, short *dq,
-; unsigned char *dest, int stride, int Dc)
-; r0 = input
-; r1 = dq
-; r2 = dst
-; r3 = stride
-; sp + 36 = Dc
-
-
-|vp8_dequant_dc_idct_add_v6| PROC
- stmdb sp!, {r4-r11, lr}
-
- ldr r6, [sp, #36]
-
- ldr r4, [r0] ;input
- ldr r5, [r1], #4 ;dq
-
- sub sp, sp, #4
- str r3, [sp]
-
- smultt r7, r4, r5
-
- ldr r4, [r0, #4] ;input
- ldr r5, [r1], #4 ;dq
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- smulbb r6, r4, r5
- smultt r7, r4, r5
-
- ldr r4, [r0, #4] ;input
- ldr r5, [r1], #4 ;dq
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- mov r12, #3
-
-vp8_dequant_dc_add_loop
- smulbb r6, r4, r5
- smultt r7, r4, r5
-
- ldr r4, [r0, #4] ;input
- ldr r5, [r1], #4 ;dq
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- smulbb r6, r4, r5
- smultt r7, r4, r5
-
- subs r12, r12, #1
-
- ldrne r4, [r0, #4]
- ldrne r5, [r1], #4
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- bne vp8_dequant_dc_add_loop
-
- sub r0, r0, #32
- mov r1, r0
-
-; short_idct4x4llm_v6_dual
- ldr r3, cospi8sqrt2minus1
- ldr r4, sinpi8sqrt2
- ldr r6, [r0, #8]
- mov r5, #2
-vp8_dequant_dc_idct_loop1_v6
- ldr r12, [r0, #24]
- ldr r14, [r0, #16]
- smulwt r9, r3, r6
- smulwb r7, r3, r6
- smulwt r10, r4, r6
- smulwb r8, r4, r6
- pkhbt r7, r7, r9, lsl #16
- smulwt r11, r3, r12
- pkhbt r8, r8, r10, lsl #16
- uadd16 r6, r6, r7
- smulwt r7, r4, r12
- smulwb r9, r3, r12
- smulwb r10, r4, r12
- subs r5, r5, #1
- pkhbt r9, r9, r11, lsl #16
- ldr r11, [r0], #4
- pkhbt r10, r10, r7, lsl #16
- uadd16 r7, r12, r9
- usub16 r7, r8, r7
- uadd16 r6, r6, r10
- uadd16 r10, r11, r14
- usub16 r8, r11, r14
- uadd16 r9, r10, r6
- usub16 r10, r10, r6
- uadd16 r6, r8, r7
- usub16 r7, r8, r7
- str r6, [r1, #8]
- ldrne r6, [r0, #8]
- str r7, [r1, #16]
- str r10, [r1, #24]
- str r9, [r1], #4
- bne vp8_dequant_dc_idct_loop1_v6
-
- mov r5, #2
- sub r0, r1, #8
-vp8_dequant_dc_idct_loop2_v6
- ldr r6, [r0], #4
- ldr r7, [r0], #4
- ldr r8, [r0], #4
- ldr r9, [r0], #4
- smulwt r1, r3, r6
- smulwt r12, r4, r6
- smulwt lr, r3, r8
- smulwt r10, r4, r8
- pkhbt r11, r8, r6, lsl #16
- pkhbt r1, lr, r1, lsl #16
- pkhbt r12, r10, r12, lsl #16
- pkhtb r6, r6, r8, asr #16
- uadd16 r6, r1, r6
- pkhbt lr, r9, r7, lsl #16
- uadd16 r10, r11, lr
- usub16 lr, r11, lr
- pkhtb r8, r7, r9, asr #16
- subs r5, r5, #1
- smulwt r1, r3, r8
- smulwb r7, r3, r8
- smulwt r11, r4, r8
- smulwb r9, r4, r8
- pkhbt r1, r7, r1, lsl #16
- uadd16 r8, r1, r8
- pkhbt r11, r9, r11, lsl #16
- usub16 r1, r12, r8
- uadd16 r8, r11, r6
- ldr r9, c0x00040004
- ldr r12, [sp] ; get stride from stack
- uadd16 r6, r10, r8
- usub16 r7, r10, r8
- uadd16 r7, r7, r9
- uadd16 r6, r6, r9
- uadd16 r10, r14, r1
- usub16 r1, r14, r1
- uadd16 r10, r10, r9
- uadd16 r1, r1, r9
- ldr r11, [r2] ; load input from dst
- mov r8, r7, asr #3
- pkhtb r9, r8, r10, asr #19
- mov r8, r1, asr #3
- pkhtb r8, r8, r6, asr #19
- uxtb16 lr, r11, ror #8
- qadd16 r9, r9, lr
- uxtb16 lr, r11
- qadd16 r8, r8, lr
- usat16 r9, #8, r9
- usat16 r8, #8, r8
- orr r9, r8, r9, lsl #8
- ldr r11, [r2, r12] ; load input from dst
- mov r7, r7, lsl #16
- mov r1, r1, lsl #16
- mov r10, r10, lsl #16
- mov r6, r6, lsl #16
- mov r7, r7, asr #3
- pkhtb r7, r7, r10, asr #19
- mov r1, r1, asr #3
- pkhtb r1, r1, r6, asr #19
- uxtb16 r8, r11, ror #8
- qadd16 r7, r7, r8
- uxtb16 r8, r11
- qadd16 r1, r1, r8
- usat16 r7, #8, r7
- usat16 r1, #8, r1
- orr r1, r1, r7, lsl #8
- str r9, [r2], r12 ; store output to dst
- str r1, [r2], r12 ; store output to dst
- bne vp8_dequant_dc_idct_loop2_v6
-
-; vpx_memset
- sub r0, r0, #32
- add sp, sp, #4
-
- mov r12, #0
- str r12, [r0]
- str r12, [r0, #4]
- str r12, [r0, #8]
- str r12, [r0, #12]
- str r12, [r0, #16]
- str r12, [r0, #20]
- str r12, [r0, #24]
- str r12, [r0, #28]
-
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp8_dequant_dc_idct_add_v6|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x00004E7B
-sinpi8sqrt2 DCD 0x00008A8C
-c0x00040004 DCD 0x00040004
-
- END
diff --git a/vp8/decoder/arm/armv6/idct_blk_v6.c b/vp8/decoder/arm/armv6/idct_blk_v6.c
index 686bb737f..c1ef2852f 100644
--- a/vp8/decoder/arm/armv6/idct_blk_v6.c
+++ b/vp8/decoder/arm/armv6/idct_blk_v6.c
@@ -13,47 +13,6 @@
#include "vp8/decoder/dequantize.h"
-void vp8_dequant_dc_idct_add_y_block_v6(short *q, short *dq,
- unsigned char *dst, int stride,
- char *eobs, short *dc)
-{
- int i;
-
- for (i = 0; i < 4; i++)
- {
- if (eobs[0] > 1)
- vp8_dequant_dc_idct_add_v6 (q, dq, dst, stride, dc[0]);
- else if (eobs[0] == 1)
- vp8_dc_only_idct_add_v6 (dc[0], dst, stride, dst, stride);
-
- if (eobs[1] > 1)
- {
- vp8_dequant_dc_idct_add_v6 (q+16, dq, dst+4, stride, dc[1]);
- }
- else if (eobs[1] == 1)
- vp8_dc_only_idct_add_v6 (dc[1], dst+4, stride, dst+4, stride);
-
- if (eobs[2] > 1)
- {
- vp8_dequant_dc_idct_add_v6 (q+32, dq, dst+8, stride, dc[2]);
- }
- else if (eobs[2] == 1)
- vp8_dc_only_idct_add_v6 (dc[2], dst+8, stride, dst+8, stride);
-
- if (eobs[3] > 1)
- {
- vp8_dequant_dc_idct_add_v6 (q+48, dq, dst+12, stride, dc[3]);
- }
- else if (eobs[3] == 1)
- vp8_dc_only_idct_add_v6 (dc[3], dst+12, stride, dst+12, stride);
-
- q += 64;
- dc += 4;
- dst += 4*stride;
- eobs += 4;
- }
-}
-
void vp8_dequant_idct_add_y_block_v6(short *q, short *dq,
unsigned char *dst,
int stride, char *eobs)
diff --git a/vp8/decoder/arm/dequantize_arm.h b/vp8/decoder/arm/dequantize_arm.h
index c020c8530..1123e8446 100644
--- a/vp8/decoder/arm/dequantize_arm.h
+++ b/vp8/decoder/arm/dequantize_arm.h
@@ -15,8 +15,6 @@
#if HAVE_ARMV6
extern prototype_dequant_block(vp8_dequantize_b_v6);
extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6);
-extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6);
-extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_v6);
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_v6);
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
@@ -27,12 +25,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
#undef vp8_dequant_idct_add
#define vp8_dequant_idct_add vp8_dequant_idct_add_v6
-#undef vp8_dequant_dc_idct_add
-#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_v6
-
-#undef vp8_dequant_dc_idct_add_y_block
-#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_v6
-
#undef vp8_dequant_idct_add_y_block
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6
@@ -44,8 +36,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
#if HAVE_ARMV7
extern prototype_dequant_block(vp8_dequantize_b_neon);
extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon);
-extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon);
-extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_neon);
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon);
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
@@ -57,12 +47,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
#undef vp8_dequant_idct_add
#define vp8_dequant_idct_add vp8_dequant_idct_add_neon
-#undef vp8_dequant_dc_idct_add
-#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_neon
-
-#undef vp8_dequant_dc_idct_add_y_block
-#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_neon
-
#undef vp8_dequant_idct_add_y_block
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon
diff --git a/vp8/decoder/arm/neon/idct_blk_neon.c b/vp8/decoder/arm/neon/idct_blk_neon.c
index 086293114..185895f05 100644
--- a/vp8/decoder/arm/neon/idct_blk_neon.c
+++ b/vp8/decoder/arm/neon/idct_blk_neon.c
@@ -15,46 +15,11 @@
/* place these declarations here because we don't want to maintain them
* outside of this scope
*/
-void idct_dequant_dc_full_2x_neon(short *input, short *dq,
- unsigned char *dst,
- int stride, short *dc);
-void idct_dequant_dc_0_2x_neon(short *input, short *dq,
- unsigned char *dst,
- int stride, short *dc);
void idct_dequant_full_2x_neon(short *q, short *dq,
unsigned char *dst, int stride);
void idct_dequant_0_2x_neon(short *q, short dq,
unsigned char *dst, int stride);
-void vp8_dequant_dc_idct_add_y_block_neon(short *q, short *dq,
- unsigned char *dst,
- int stride, char *eobs, short *dc)
-{
- int i;
-
- for (i = 0; i < 4; i++)
- {
- if (((short *)(eobs))[0])
- {
- if (((short *)eobs)[0] & 0xfefe)
- idct_dequant_dc_full_2x_neon (q, dq, dst, stride, dc);
- else
- idct_dequant_dc_0_2x_neon(q, dq, dst, stride, dc);
- }
-
- if (((short *)(eobs))[1])
- {
- if (((short *)eobs)[1] & 0xfefe)
- idct_dequant_dc_full_2x_neon (q+32, dq, dst+8, stride, dc+2);
- else
- idct_dequant_dc_0_2x_neon(q+32, dq, dst+8, stride, dc+2);
- }
- q += 64;
- dc += 4;
- dst += 4*stride;
- eobs += 4;
- }
-}
void vp8_dequant_idct_add_y_block_neon(short *q, short *dq,
unsigned char *dst,
diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
deleted file mode 100644
index bf8d7ddcd..000000000
--- a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
+++ /dev/null
@@ -1,75 +0,0 @@
-;
-; Copyright (c) 2010 The Webm project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
- EXPORT |idct_dequant_dc_0_2x_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void idct_dequant_dc_0_2x_neon(short *q, short *dq,
-; unsigned char *dst, int stride);
-; r0 *q,
-; r1 *dq,
-; r2 *dst
-; r3 stride
-; sp *dc
-|idct_dequant_dc_0_2x_neon| PROC
-
- ; no q- or dq-coeffs, so r0 and r1 are free to use
- ldr r1, [sp] ; *dc
- add r12, r2, #4
- ldr r0, [r1]
-
- vld1.32 {d2[0]}, [r2], r3 ; lo
- vld1.32 {d8[0]}, [r12], r3 ; hi
- vld1.32 {d2[1]}, [r2], r3
- vld1.32 {d8[1]}, [r12], r3
- vld1.32 {d4[0]}, [r2], r3
- vld1.32 {d10[0]}, [r12], r3
- vld1.32 {d4[1]}, [r2], r3
- vld1.32 {d10[1]}, [r12]
-
- sxth r1, r0 ; lo *dc
- add r1, r1, #4
- asr r1, r1, #3
- vdup.16 q0, r1
- sxth r0, r0, ror #16 ; hi *dc
- add r0, r0, #4
- asr r0, r0, #3
- vdup.16 q3, r0
-
- vaddw.u8 q1, q0, d2 ; lo
- vaddw.u8 q2, q0, d4
- vaddw.u8 q4, q3, d8 ; hi
- vaddw.u8 q5, q3, d10
-
- vqmovun.s16 d2, q1 ; lo
- vqmovun.s16 d4, q2
- vqmovun.s16 d8, q4 ; hi
- vqmovun.s16 d10, q5
-
- sub r2, r2, r3, lsl #2 ; dst - 4*stride
- add r0, r2, #4
-
- vst1.32 {d2[0]}, [r2], r3 ; lo
- vst1.32 {d8[0]}, [r0], r3 ; hi
- vst1.32 {d2[1]}, [r2], r3
- vst1.32 {d8[1]}, [r0], r3
- vst1.32 {d4[0]}, [r2], r3
- vst1.32 {d10[0]}, [r0], r3
- vst1.32 {d4[1]}, [r2]
- vst1.32 {d10[1]}, [r0]
-
- bx lr
-
- ENDP ;|idct_dequant_dc_0_2x_neon|
- END
diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
deleted file mode 100644
index eea41f68c..000000000
--- a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
+++ /dev/null
@@ -1,208 +0,0 @@
-;
-; Copyright (c) 2010 The Webm project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |idct_dequant_dc_full_2x_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_dc_full_2x_neon(short *q, short *dq,
-; unsigned char *dst, int stride, short *dc);
-; r0 *q,
-; r1 *dq,
-; r2 *dst
-; r3 stride
-; sp *dc
-|idct_dequant_dc_full_2x_neon| PROC
- push {r4}
-
- vld1.16 {q0, q1}, [r1] ; dq (same l/r)
- vld1.16 {q2, q3}, [r0] ; l q
- add r0, r0, #32
- vld1.16 {q4, q5}, [r0] ; r q
- add r12, r2, #4
-
- ; interleave the predictors
- vld1.32 {d28[0]}, [r2], r3 ; l pre
- vld1.32 {d28[1]}, [r12], r3 ; r pre
- vld1.32 {d29[0]}, [r2], r3
- vld1.32 {d29[1]}, [r12], r3
- vld1.32 {d30[0]}, [r2], r3
- vld1.32 {d30[1]}, [r12], r3
- vld1.32 {d31[0]}, [r2], r3
- ldr r1, [sp, #4] ; *dc
- vld1.32 {d31[1]}, [r12]
-
- adr r4, cospi8sqrt2minus1 ; pointer to the first constant
-
- ldrh r12, [r1], #2 ; lo *dc
- ldrh r1, [r1] ; hi *dc
-
- ; dequant: q[i] = q[i] * dq[i]
- vmul.i16 q2, q2, q0
- vmul.i16 q3, q3, q1
- vmul.i16 q4, q4, q0
- vmul.i16 q5, q5, q1
-
- ; move dc up to neon and overwrite first element
- vmov.16 d4[0], r12
- vmov.16 d8[0], r1
-
- vld1.16 {d0}, [r4]
-
- ; q2: l0r0 q3: l8r8
- ; q4: l4r4 q5: l12r12
- vswp d5, d8
- vswp d7, d10
-
- ; _CONSTANTS_ * 4,12 >> 16
- ; q6: 4 * sinpi : c1/temp1
- ; q7: 12 * sinpi : d1/temp2
- ; q8: 4 * cospi
- ; q9: 12 * cospi
- vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
- vqdmulh.s16 q7, q5, d0[2]
- vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
- vqdmulh.s16 q9, q5, d0[0]
-
- vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
- vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
-
- ; vqdmulh only accepts signed values. this was a problem because
- ; our constant had the high bit set, and was treated as a negative value.
- ; vqdmulh also doubles the value before it shifts by 16. we need to
- ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
- ; so we can shift the constant without losing precision. this avoids
- ; shift again afterward, but also avoids the sign issue. win win!
- ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
- ; pre-shift it
- vshr.s16 q8, q8, #1
- vshr.s16 q9, q9, #1
-
- ; q4: 4 + 4 * cospi : d1/temp1
- ; q5: 12 + 12 * cospi : c1/temp2
- vqadd.s16 q4, q4, q8
- vqadd.s16 q5, q5, q9
-
- ; c1 = temp1 - temp2
- ; d1 = temp1 + temp2
- vqsub.s16 q2, q6, q5
- vqadd.s16 q3, q4, q7
-
- ; [0]: a1+d1
- ; [1]: b1+c1
- ; [2]: b1-c1
- ; [3]: a1-d1
- vqadd.s16 q4, q10, q3
- vqadd.s16 q5, q11, q2
- vqsub.s16 q6, q11, q2
- vqsub.s16 q7, q10, q3
-
- ; rotate
- vtrn.32 q4, q6
- vtrn.32 q5, q7
- vtrn.16 q4, q5
- vtrn.16 q6, q7
- ; idct loop 2
- ; q4: l 0, 4, 8,12 r 0, 4, 8,12
- ; q5: l 1, 5, 9,13 r 1, 5, 9,13
- ; q6: l 2, 6,10,14 r 2, 6,10,14
- ; q7: l 3, 7,11,15 r 3, 7,11,15
-
- ; q8: 1 * sinpi : c1/temp1
- ; q9: 3 * sinpi : d1/temp2
- ; q10: 1 * cospi
- ; q11: 3 * cospi
- vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
- vqdmulh.s16 q9, q7, d0[2]
- vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
- vqdmulh.s16 q11, q7, d0[0]
-
- vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
- vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
-
- ; see note on shifting above
- vshr.s16 q10, q10, #1
- vshr.s16 q11, q11, #1
-
- ; q10: 1 + 1 * cospi : d1/temp1
- ; q11: 3 + 3 * cospi : c1/temp2
- vqadd.s16 q10, q5, q10
- vqadd.s16 q11, q7, q11
-
- ; q8: c1 = temp1 - temp2
- ; q9: d1 = temp1 + temp2
- vqsub.s16 q8, q8, q11
- vqadd.s16 q9, q10, q9
-
- ; a1+d1
- ; b1+c1
- ; b1-c1
- ; a1-d1
- vqadd.s16 q4, q2, q9
- vqadd.s16 q5, q3, q8
- vqsub.s16 q6, q3, q8
- vqsub.s16 q7, q2, q9
-
- ; +4 >> 3 (rounding)
- vrshr.s16 q4, q4, #3 ; lo
- vrshr.s16 q5, q5, #3
- vrshr.s16 q6, q6, #3 ; hi
- vrshr.s16 q7, q7, #3
-
- vtrn.32 q4, q6
- vtrn.32 q5, q7
- vtrn.16 q4, q5
- vtrn.16 q6, q7
-
- ; adding pre
- ; input is still packed. pre was read interleaved
- vaddw.u8 q4, q4, d28
- vaddw.u8 q5, q5, d29
- vaddw.u8 q6, q6, d30
- vaddw.u8 q7, q7, d31
-
- vmov.i16 q14, #0
- vmov q15, q14
- vst1.16 {q14, q15}, [r0] ; write over high input
- sub r0, r0, #32
- vst1.16 {q14, q15}, [r0] ; write over low input
-
- sub r2, r2, r3, lsl #2 ; dst - 4*stride
- add r1, r2, #4 ; hi
-
- ;saturate and narrow
- vqmovun.s16 d0, q4 ; lo
- vqmovun.s16 d1, q5
- vqmovun.s16 d2, q6 ; hi
- vqmovun.s16 d3, q7
-
- vst1.32 {d0[0]}, [r2], r3 ; lo
- vst1.32 {d0[1]}, [r1], r3 ; hi
- vst1.32 {d1[0]}, [r2], r3
- vst1.32 {d1[1]}, [r1], r3
- vst1.32 {d2[0]}, [r2], r3
- vst1.32 {d2[1]}, [r1], r3
- vst1.32 {d3[0]}, [r2]
- vst1.32 {d3[1]}, [r1]
-
- pop {r4}
- bx lr
-
- ENDP ; |idct_dequant_dc_full_2x_neon|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x4e7b
-; because the lowest bit in 0x8a8c is 0, we can pre-shift this
-sinpi8sqrt2 DCD 0x4546
-
- END
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index e501b9ec7..1133efb54 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -232,45 +232,53 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
}
}
}
-
- }
- else if (mode == SPLITMV)
- {
- DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
- (xd->qcoeff, xd->block[0].dequant,
- xd->dst.y_buffer,
- xd->dst.y_stride, xd->eobs);
}
else
{
- BLOCKD *b = &xd->block[24];
+ short *DQC = xd->block[0].dequant;
- /* do 2nd order transform on the dc block */
- if (xd->eobs[24] > 1)
- {
- DEQUANT_INVOKE(&pbi->dequant, block)(b);
-
- IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
- ((int *)b->qcoeff)[0] = 0;
- ((int *)b->qcoeff)[1] = 0;
- ((int *)b->qcoeff)[2] = 0;
- ((int *)b->qcoeff)[3] = 0;
- ((int *)b->qcoeff)[4] = 0;
- ((int *)b->qcoeff)[5] = 0;
- ((int *)b->qcoeff)[6] = 0;
- ((int *)b->qcoeff)[7] = 0;
- }
- else
+ /* save the dc dequant constant in case it is overridden */
+ short dc_dequant_temp = DQC[0];
+
+ if (mode != SPLITMV)
{
- b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
- IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
- ((int *)b->qcoeff)[0] = 0;
+ BLOCKD *b = &xd->block[24];
+
+ /* do 2nd order transform on the dc block */
+ if (xd->eobs[24] > 1)
+ {
+ DEQUANT_INVOKE(&pbi->dequant, block)(b);
+
+ IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],
+ xd->qcoeff);
+ ((int *)b->qcoeff)[0] = 0;
+ ((int *)b->qcoeff)[1] = 0;
+ ((int *)b->qcoeff)[2] = 0;
+ ((int *)b->qcoeff)[3] = 0;
+ ((int *)b->qcoeff)[4] = 0;
+ ((int *)b->qcoeff)[5] = 0;
+ ((int *)b->qcoeff)[6] = 0;
+ ((int *)b->qcoeff)[7] = 0;
+ }
+ else
+ {
+ b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
+ IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0],
+ xd->qcoeff);
+ ((int *)b->qcoeff)[0] = 0;
+ }
+
+ /* override the dc dequant constant */
+ DQC[0] = 1;
}
- DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
+ DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
(xd->qcoeff, xd->block[0].dequant,
xd->dst.y_buffer,
- xd->dst.y_stride, xd->eobs, xd->block[24].diff);
+ xd->dst.y_stride, xd->eobs);
+
+ /* restore the dc dequant constant */
+ DQC[0] = dc_dequant_temp;
}
DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
diff --git a/vp8/decoder/dequantize.c b/vp8/decoder/dequantize.c
index 0861965eb..4a48a3192 100644
--- a/vp8/decoder/dequantize.c
+++ b/vp8/decoder/dequantize.c
@@ -42,22 +42,3 @@ void vp8_dequant_idct_add_c(short *input, short *dq,
vpx_memset(input, 0, 32);
}
-
-void vp8_dequant_dc_idct_add_c(short *input, short *dq,
- unsigned char *dest, int stride,
- int Dc)
-{
- int i;
-
- input[0] = (short)Dc;
-
- for (i = 1; i < 16; i++)
- {
- input[i] = dq[i] * input[i];
- }
-
- vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
-
- vpx_memset(input, 0, 32);
-
-}
diff --git a/vp8/decoder/dequantize.h b/vp8/decoder/dequantize.h
index 019b7f6d1..f66cf2bac 100644
--- a/vp8/decoder/dequantize.h
+++ b/vp8/decoder/dequantize.h
@@ -21,17 +21,6 @@
unsigned char *output, \
int stride)
-#define prototype_dequant_dc_idct_add(sym) \
- void sym(short *input, short *dq, \
- unsigned char *dst, \
- int stride, \
- int dc)
-
-#define prototype_dequant_dc_idct_add_y_block(sym) \
- void sym(short *q, short *dq, \
- unsigned char *dst, \
- int stride, char *eobs, short *dc)
-
#define prototype_dequant_idct_add_y_block(sym) \
void sym(short *q, short *dq, \
unsigned char *dst, \
@@ -60,16 +49,6 @@ extern prototype_dequant_block(vp8_dequant_block);
#endif
extern prototype_dequant_idct_add(vp8_dequant_idct_add);
-#ifndef vp8_dequant_dc_idct_add
-#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_c
-#endif
-extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add);
-
-#ifndef vp8_dequant_dc_idct_add_y_block
-#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_c
-#endif
-extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block);
-
#ifndef vp8_dequant_idct_add_y_block
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
#endif
@@ -85,10 +64,6 @@ typedef prototype_dequant_block((*vp8_dequant_block_fn_t));
typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t));
-typedef prototype_dequant_dc_idct_add((*vp8_dequant_dc_idct_add_fn_t));
-
-typedef prototype_dequant_dc_idct_add_y_block((*vp8_dequant_dc_idct_add_y_block_fn_t));
-
typedef prototype_dequant_idct_add_y_block((*vp8_dequant_idct_add_y_block_fn_t));
typedef prototype_dequant_idct_add_uv_block((*vp8_dequant_idct_add_uv_block_fn_t));
@@ -97,8 +72,6 @@ typedef struct
{
vp8_dequant_block_fn_t block;
vp8_dequant_idct_add_fn_t idct_add;
- vp8_dequant_dc_idct_add_fn_t dc_idct_add;
- vp8_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block;
vp8_dequant_idct_add_y_block_fn_t idct_add_y_block;
vp8_dequant_idct_add_uv_block_fn_t idct_add_uv_block;
} vp8_dequant_rtcd_vtable_t;
diff --git a/vp8/decoder/generic/dsystemdependent.c b/vp8/decoder/generic/dsystemdependent.c
index 9c42bc62d..d9f9ba3c8 100644
--- a/vp8/decoder/generic/dsystemdependent.c
+++ b/vp8/decoder/generic/dsystemdependent.c
@@ -23,8 +23,6 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
pbi->mb.rtcd = &pbi->common.rtcd;
pbi->dequant.block = vp8_dequantize_b_c;
pbi->dequant.idct_add = vp8_dequant_idct_add_c;
- pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_c;
- pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_c;
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_c;
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_c;
#endif
diff --git a/vp8/decoder/idct_blk.c b/vp8/decoder/idct_blk.c
index 1c16b92a9..249fad4ea 100644
--- a/vp8/decoder/idct_blk.c
+++ b/vp8/decoder/idct_blk.c
@@ -12,39 +12,12 @@
#include "vp8/common/idct.h"
#include "dequantize.h"
-void vp8_dequant_dc_idct_add_c(short *input, short *dq,
- unsigned char *dest, int stride,
- int Dc);
void vp8_dequant_idct_add_c(short *input, short *dq,
unsigned char *dest, int stride);
void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred,
int pred_stride, unsigned char *dst_ptr,
int dst_stride);
-void vp8_dequant_dc_idct_add_y_block_c
- (short *q, short *dq,
- unsigned char *dst, int stride, char *eobs, short *dc)
-{
- int i, j;
-
- for (i = 0; i < 4; i++)
- {
- for (j = 0; j < 4; j++)
- {
- if (*eobs++ > 1)
- vp8_dequant_dc_idct_add_c (q, dq, dst, stride, dc[0]);
- else
- vp8_dc_only_idct_add_c (dc[0], dst, stride, dst, stride);
-
- q += 16;
- dst += 4;
- dc ++;
- }
-
- dst += 4*stride - 16;
- }
-}
-
void vp8_dequant_idct_add_y_block_c
(short *q, short *dq,
unsigned char *dst, int stride, char *eobs)
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index eba5830d5..1967781eb 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -175,36 +175,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
#endif
/* dequantization and idct */
- if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV)
- {
- BLOCKD *b = &xd->block[24];
- DEQUANT_INVOKE(&pbi->dequant, block)(b);
-
- /* do 2nd order transform on the dc block */
- if (xd->eobs[24] > 1)
- {
- IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
- ((int *)b->qcoeff)[0] = 0;
- ((int *)b->qcoeff)[1] = 0;
- ((int *)b->qcoeff)[2] = 0;
- ((int *)b->qcoeff)[3] = 0;
- ((int *)b->qcoeff)[4] = 0;
- ((int *)b->qcoeff)[5] = 0;
- ((int *)b->qcoeff)[6] = 0;
- ((int *)b->qcoeff)[7] = 0;
- }
- else
- {
- IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
- ((int *)b->qcoeff)[0] = 0;
- }
-
- DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
- (xd->qcoeff, xd->block[0].dequant,
- xd->dst.y_buffer,
- xd->dst.y_stride, xd->eobs, xd->block[24].diff);
- }
- else if (xd->mode_info_context->mbmi.mode == B_PRED)
+ if (xd->mode_info_context->mbmi.mode == B_PRED)
{
for (i = 0; i < 16; i++)
{
@@ -214,26 +185,71 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
vp8mt_predict_intra4x4(pbi, xd, b_mode, *(b->base_dst) + b->dst,
b->dst_stride, mb_row, mb_col, i);
- if (xd->eobs[i] > 1)
+ if (xd->eobs[i] )
{
- DEQUANT_INVOKE(&pbi->dequant, idct_add)
- (b->qcoeff, b->dequant,
- *(b->base_dst) + b->dst, b->dst_stride);
+ if (xd->eobs[i] > 1)
+ {
+ DEQUANT_INVOKE(&pbi->dequant, idct_add)
+ (b->qcoeff, b->dequant,
+ *(b->base_dst) + b->dst, b->dst_stride);
+ }
+ else
+ {
+ IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
+ (b->qcoeff[0] * b->dequant[0],
+ *(b->base_dst) + b->dst, b->dst_stride,
+ *(b->base_dst) + b->dst, b->dst_stride);
+ ((int *)b->qcoeff)[0] = 0;
+ }
+ }
+ }
+ }
+ else
+ {
+ short *DQC = xd->block[0].dequant;
+
+ DECLARE_ALIGNED(16, short, local_dequant[16]);
+
+ if (xd->mode_info_context->mbmi.mode != SPLITMV)
+ {
+ BLOCKD *b = &xd->block[24];
+
+ /* do 2nd order transform on the dc block */
+ if (xd->eobs[24] > 1)
+ {
+ DEQUANT_INVOKE(&pbi->dequant, block)(b);
+
+ IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],
+ xd->qcoeff);
+ ((int *)b->qcoeff)[0] = 0;
+ ((int *)b->qcoeff)[1] = 0;
+ ((int *)b->qcoeff)[2] = 0;
+ ((int *)b->qcoeff)[3] = 0;
+ ((int *)b->qcoeff)[4] = 0;
+ ((int *)b->qcoeff)[5] = 0;
+ ((int *)b->qcoeff)[6] = 0;
+ ((int *)b->qcoeff)[7] = 0;
}
else
{
- IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
- (b->qcoeff[0] * b->dequant[0],
- *(b->base_dst) + b->dst, b->dst_stride,
- *(b->base_dst) + b->dst, b->dst_stride);
+ b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
+ IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], xd->qcoeff);
((int *)b->qcoeff)[0] = 0;
}
+
+ /* make a local copy of the dequant constants */
+ vpx_memcpy(local_dequant, xd->block[0].dequant,
+ sizeof(local_dequant));
+
+ /* override the dc dequant constant */
+ local_dequant[0] = 1;
+
+ /* use the new dequant constants */
+ DQC = local_dequant;
}
- }
- else
- {
+
DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
- (xd->qcoeff, xd->block[0].dequant,
+ (xd->qcoeff, DQC,
xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs);
}
@@ -244,7 +260,6 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
xd->dst.uv_stride, xd->eobs+16);
}
-
static THREAD_FUNCTION thread_decoding_proc(void *p_data)
{
int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
diff --git a/vp8/decoder/x86/dequantize_mmx.asm b/vp8/decoder/x86/dequantize_mmx.asm
index 648bde4c5..de9eba89f 100644
--- a/vp8/decoder/x86/dequantize_mmx.asm
+++ b/vp8/decoder/x86/dequantize_mmx.asm
@@ -246,207 +246,6 @@ sym(vp8_dequant_idct_add_mmx):
pop rbp
ret
-
-;void dequant_dc_idct_add_mmx(
-;short *input, 0
-;short *dq, 1
-;unsigned char *dest, 2
-;int stride, 3
-;int Dc) 4
-global sym(vp8_dequant_dc_idct_add_mmx)
-sym(vp8_dequant_dc_idct_add_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- ; end prolog
-
- mov rax, arg(0) ;input
- mov rdx, arg(1) ;dq
-
- movq mm0, [rax ]
- pmullw mm0, [rdx]
-
- movq mm1, [rax +8]
- pmullw mm1, [rdx +8]
-
- movq mm2, [rax+16]
- pmullw mm2, [rdx+16]
-
- movq mm3, [rax+24]
- pmullw mm3, [rdx+24]
-
- mov rdx, arg(2) ;pred
- pxor mm7, mm7
-
-
- movq [rax], mm7
- movq [rax+8], mm7
-
- movq [rax+16],mm7
- movq [rax+24],mm7
-
- ; move lower word of Dc to lower word of mm0
- psrlq mm0, 16
- movzx rcx, word ptr arg(4) ;Dc
- psllq mm0, 16
- movq mm7, rcx
- por mm0, mm7
-
- movsxd rax, dword ptr arg(3) ;stride
-
- psubw mm0, mm2 ; b1= 0-2
- paddw mm2, mm2 ;
-
- movq mm5, mm1
- paddw mm2, mm0 ; a1 =0+2
-
- pmulhw mm5, [GLOBAL(x_s1sqr2)];
- paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
-
- movq mm7, mm3 ;
- pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
-
- paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw mm7, mm5 ; c1
-
- movq mm5, mm1
- movq mm4, mm3
-
- pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
- paddw mm5, mm1
-
- pmulhw mm3, [GLOBAL(x_s1sqr2)]
- paddw mm3, mm4
-
- paddw mm3, mm5 ; d1
- movq mm6, mm2 ; a1
-
- movq mm4, mm0 ; b1
- paddw mm2, mm3 ;0
-
- paddw mm4, mm7 ;1
- psubw mm0, mm7 ;2
-
- psubw mm6, mm3 ;3
-
- movq mm1, mm2 ; 03 02 01 00
- movq mm3, mm4 ; 23 22 21 20
-
- punpcklwd mm1, mm0 ; 11 01 10 00
- punpckhwd mm2, mm0 ; 13 03 12 02
-
- punpcklwd mm3, mm6 ; 31 21 30 20
- punpckhwd mm4, mm6 ; 33 23 32 22
-
- movq mm0, mm1 ; 11 01 10 00
- movq mm5, mm2 ; 13 03 12 02
-
- punpckldq mm0, mm3 ; 30 20 10 00
- punpckhdq mm1, mm3 ; 31 21 11 01
-
- punpckldq mm2, mm4 ; 32 22 12 02
- punpckhdq mm5, mm4 ; 33 23 13 03
-
- movq mm3, mm5 ; 33 23 13 03
-
- psubw mm0, mm2 ; b1= 0-2
- paddw mm2, mm2 ;
-
- movq mm5, mm1
- paddw mm2, mm0 ; a1 =0+2
-
- pmulhw mm5, [GLOBAL(x_s1sqr2)];
- paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
-
- movq mm7, mm3 ;
- pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
-
- paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw mm7, mm5 ; c1
-
- movq mm5, mm1
- movq mm4, mm3
-
- pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
- paddw mm5, mm1
-
- pmulhw mm3, [GLOBAL(x_s1sqr2)]
- paddw mm3, mm4
-
- paddw mm3, mm5 ; d1
- paddw mm0, [GLOBAL(fours)]
-
- paddw mm2, [GLOBAL(fours)]
- movq mm6, mm2 ; a1
-
- movq mm4, mm0 ; b1
- paddw mm2, mm3 ;0
-
- paddw mm4, mm7 ;1
- psubw mm0, mm7 ;2
-
- psubw mm6, mm3 ;3
- psraw mm2, 3
-
- psraw mm0, 3
- psraw mm4, 3
-
- psraw mm6, 3
-
- movq mm1, mm2 ; 03 02 01 00
- movq mm3, mm4 ; 23 22 21 20
-
- punpcklwd mm1, mm0 ; 11 01 10 00
- punpckhwd mm2, mm0 ; 13 03 12 02
-
- punpcklwd mm3, mm6 ; 31 21 30 20
- punpckhwd mm4, mm6 ; 33 23 32 22
-
- movq mm0, mm1 ; 11 01 10 00
- movq mm5, mm2 ; 13 03 12 02
-
- punpckldq mm0, mm3 ; 30 20 10 00
- punpckhdq mm1, mm3 ; 31 21 11 01
-
- punpckldq mm2, mm4 ; 32 22 12 02
- punpckhdq mm5, mm4 ; 33 23 13 03
-
- pxor mm7, mm7
-
- movd mm4, [rdx]
- punpcklbw mm4, mm7
- paddsw mm0, mm4
- packuswb mm0, mm7
- movd [rdx], mm0
-
- movd mm4, [rdx+rax]
- punpcklbw mm4, mm7
- paddsw mm1, mm4
- packuswb mm1, mm7
- movd [rdx+rax], mm1
-
- movd mm4, [rdx+2*rax]
- punpcklbw mm4, mm7
- paddsw mm2, mm4
- packuswb mm2, mm7
- movd [rdx+rax*2], mm2
-
- add rdx, rax
-
- movd mm4, [rdx+2*rax]
- punpcklbw mm4, mm7
- paddsw mm5, mm4
- packuswb mm5, mm7
- movd [rdx+rax*2], mm5
-
- ; begin epilog
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
SECTION_RODATA
align 16
x_s1sqr2:
diff --git a/vp8/decoder/x86/dequantize_x86.h b/vp8/decoder/x86/dequantize_x86.h
index dc68daab3..49bcb7f19 100644
--- a/vp8/decoder/x86/dequantize_x86.h
+++ b/vp8/decoder/x86/dequantize_x86.h
@@ -22,8 +22,6 @@
#if HAVE_MMX
extern prototype_dequant_block(vp8_dequantize_b_mmx);
extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx);
-extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_mmx);
-extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_mmx);
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_mmx);
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);
@@ -34,12 +32,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);
#undef vp8_dequant_idct_add
#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
-#undef vp8_dequant_dc_idct_add
-#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_mmx
-
-#undef vp8_dequant_dc_idct_add_y_block
-#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_mmx
-
#undef vp8_dequant_idct_add_y_block
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_mmx
@@ -50,14 +42,10 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);
#endif
#if HAVE_SSE2
-extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_sse2);
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_sse2);
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_sse2);
#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_dequant_dc_idct_add_y_block
-#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_sse2
-
#undef vp8_dequant_idct_add_y_block
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2
diff --git a/vp8/decoder/x86/idct_blk_mmx.c b/vp8/decoder/x86/idct_blk_mmx.c
index 37de5b9fd..29276e5d7 100644
--- a/vp8/decoder/x86/idct_blk_mmx.c
+++ b/vp8/decoder/x86/idct_blk_mmx.c
@@ -12,41 +12,6 @@
#include "vp8/common/idct.h"
#include "vp8/decoder/dequantize.h"
-void vp8_dequant_dc_idct_add_y_block_mmx
- (short *q, short *dq,
- unsigned char *dst, int stride, char *eobs, short *dc)
-{
- int i;
-
- for (i = 0; i < 4; i++)
- {
- if (eobs[0] > 1)
- vp8_dequant_dc_idct_add_mmx (q, dq, dst, stride, dc[0]);
- else if (eobs[0] == 1)
- vp8_dc_only_idct_add_mmx (dc[0], dst, stride, dst, stride);
-
- if (eobs[1] > 1)
- vp8_dequant_dc_idct_add_mmx (q+16, dq, dst+4, stride, dc[1]);
- else if (eobs[1] == 1)
- vp8_dc_only_idct_add_mmx (dc[1], dst+4, stride, dst+4, stride);
-
- if (eobs[2] > 1)
- vp8_dequant_dc_idct_add_mmx (q+32, dq, dst+8, stride, dc[2]);
- else if (eobs[2] == 1)
- vp8_dc_only_idct_add_mmx (dc[2], dst+8, stride, dst+8, stride);
-
- if (eobs[3] > 1)
- vp8_dequant_dc_idct_add_mmx (q+48, dq, dst+12, stride, dc[3]);
- else if (eobs[3] == 1)
- vp8_dc_only_idct_add_mmx (dc[3], dst+12, stride, dst+12, stride);
-
- q += 64;
- dc += 4;
- dst += 4*stride;
- eobs += 4;
- }
-}
-
void vp8_dequant_idct_add_y_block_mmx
(short *q, short *dq,
unsigned char *dst, int stride, char *eobs)
diff --git a/vp8/decoder/x86/idct_blk_sse2.c b/vp8/decoder/x86/idct_blk_sse2.c
index 0495b0610..03c2878c1 100644
--- a/vp8/decoder/x86/idct_blk_sse2.c
+++ b/vp8/decoder/x86/idct_blk_sse2.c
@@ -12,13 +12,6 @@
#include "vp8/common/idct.h"
#include "vp8/decoder/dequantize.h"
-void vp8_idct_dequant_dc_0_2x_sse2
- (short *q, short *dq,
- unsigned char *dst, int dst_stride, short *dc);
-void vp8_idct_dequant_dc_full_2x_sse2
- (short *q, short *dq,
- unsigned char *dst, int dst_stride, short *dc);
-
void vp8_idct_dequant_0_2x_sse2
(short *q, short *dq ,
unsigned char *dst, int dst_stride);
@@ -26,36 +19,6 @@ void vp8_idct_dequant_full_2x_sse2
(short *q, short *dq ,
unsigned char *dst, int dst_stride);
-void vp8_dequant_dc_idct_add_y_block_sse2
- (short *q, short *dq,
- unsigned char *dst, int stride, char *eobs, short *dc)
-{
- int i;
-
- for (i = 0; i < 4; i++)
- {
- if (((short *)(eobs))[0])
- {
- if (((short *)(eobs))[0] & 0xfefe)
- vp8_idct_dequant_dc_full_2x_sse2 (q, dq, dst, stride, dc);
- else
- vp8_idct_dequant_dc_0_2x_sse2 (q, dq, dst, stride, dc);
- }
-
- if (((short *)(eobs))[1])
- {
- if (((short *)(eobs))[1] & 0xfefe)
- vp8_idct_dequant_dc_full_2x_sse2 (q+32, dq, dst+8, stride, dc+2);
- else
- vp8_idct_dequant_dc_0_2x_sse2 (q+32, dq, dst+8, stride, dc+2);
- }
- q += 64;
- dc += 4;
- dst += stride*4;
- eobs += 4;
- }
-}
-
void vp8_dequant_idct_add_y_block_sse2
(short *q, short *dq,
unsigned char *dst, int stride, char *eobs)
diff --git a/vp8/decoder/x86/x86_dsystemdependent.c b/vp8/decoder/x86/x86_dsystemdependent.c
index 443150483..424052c1b 100644
--- a/vp8/decoder/x86/x86_dsystemdependent.c
+++ b/vp8/decoder/x86/x86_dsystemdependent.c
@@ -43,8 +43,6 @@ void vp8_arch_x86_decode_init(VP8D_COMP *pbi)
{
pbi->dequant.block = vp8_dequantize_b_mmx;
pbi->dequant.idct_add = vp8_dequant_idct_add_mmx;
- pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_mmx;
- pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_mmx;
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_mmx;
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx;
}
@@ -52,8 +50,6 @@ void vp8_arch_x86_decode_init(VP8D_COMP *pbi)
#if HAVE_SSE2
if (flags & HAS_SSE2)
{
- pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_sse2;
- pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_sse2;
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2;
}
#endif
diff --git a/vp8/vp8dx_arm.mk b/vp8/vp8dx_arm.mk
index 6bde42f4c..b08f9464f 100644
--- a/vp8/vp8dx_arm.mk
+++ b/vp8/vp8dx_arm.mk
@@ -16,14 +16,11 @@ VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/dequantize_arm.c
VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/dequantize_arm.h
#File list for armv6
-VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_dc_idct_v6$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_idct_v6$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/idct_blk_v6.c
#File list for neon
-VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM)