Modified the inverse walsh to output directly to the dqcoeff or qcoeff buffer.

Previously, the encoder would populate the dc coeffs of the y blocks as a separate stage (recon_dcblock) and the decoder would use a special version of the idct. This change eliminates the extra copy and reduces the code footprint. [Tero] Added needed changes to armv6 and NEON assembly. Change-Id: I83202ffdbaf83f6e5dd69f4ba2519fcf0b13b3ba
author: Scott LaVarnway <slavarnway@google.com> 2011-11-17 12:54:42 -0500
committer: Tero Rintaluoma <teror@google.com> 2011-11-25 09:24:04 +0200
commit: 4a91541c946c1fc2655a942ec79033618f03c4ca (patch)
tree: 70093355ebd25dd2c79515f7950c8490f6937355
parent: 7b0feac4a4386eef3e1ea851e52e4f30935e255d (diff)
download: libvpx-4a91541c946c1fc2655a942ec79033618f03c4ca.tar.gz
30 files changed, 275 insertions, 1200 deletions
diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c
index b5f194d3d..cd55a6377 100644
--- a/vp8/common/arm/arm_systemdependent.c
+++ b/vp8/common/arm/arm_systemdependent.c
@@ -46,7 +46,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
         rtcd->subpix.bilinear4x4   = vp8_bilinear_predict4x4_armv6;
 
         rtcd->idct.idct16       = vp8_short_idct4x4llm_v6_dual;
-        rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_v6;
         rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_v6;
 
         rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6;
@@ -80,7 +79,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
         rtcd->subpix.bilinear4x4   = vp8_bilinear_predict4x4_neon;
 
         rtcd->idct.idct16       = vp8_short_idct4x4llm_neon;
-        rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_neon;
         rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_neon;
 
         rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_neon;
diff --git a/vp8/common/arm/armv6/iwalsh_v6.asm b/vp8/common/arm/armv6/iwalsh_v6.asm
index 463bff0f5..31ef09cad 100644
--- a/vp8/common/arm/armv6/iwalsh_v6.asm
+++ b/vp8/common/arm/armv6/iwalsh_v6.asm
@@ -9,7 +9,6 @@
 ;
 
     EXPORT |vp8_short_inv_walsh4x4_v6|
-    EXPORT |vp8_short_inv_walsh4x4_1_v6|
 
     ARM
     REQUIRE8
@@ -17,19 +16,19 @@
 
     AREA    |.text|, CODE, READONLY  ; name this block of code
 
-;short vp8_short_inv_walsh4x4_v6(short *input, short *output)
+;short vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff)
 |vp8_short_inv_walsh4x4_v6| PROC
 
-    stmdb       sp!, {r4 - r11, lr}
+    stmdb       sp!, {r4 - r12, lr}
 
-    ldr         r2, [r0], #4         ; [1  |  0]
-    ldr         r3, [r0], #4         ; [3  |  2]
-    ldr         r4, [r0], #4         ; [5  |  4]
-    ldr         r5, [r0], #4         ; [7  |  6]
-    ldr         r6, [r0], #4         ; [9  |  8]
-    ldr         r7, [r0], #4         ; [11 | 10]
-    ldr         r8, [r0], #4         ; [13 | 12]
-    ldr         r9, [r0]             ; [15 | 14]
+    ldr         r2, [r0, #0]         ; [1  |  0]
+    ldr         r3, [r0, #4]         ; [3  |  2]
+    ldr         r4, [r0, #8]         ; [5  |  4]
+    ldr         r5, [r0, #12]        ; [7  |  6]
+    ldr         r6, [r0, #16]        ; [9  |  8]
+    ldr         r7, [r0, #20]        ; [11 | 10]
+    ldr         r8, [r0, #24]        ; [13 | 12]
+    ldr         r9, [r0, #28]        ; [15 | 14]
 
     qadd16      r10, r2, r8          ; a1 [1+13  |  0+12]
     qadd16      r11, r4, r6          ; b1 [5+9   |  4+8]
@@ -69,24 +68,27 @@
     qadd16      r4, r4, r10          ; [b2+3|c2+3]
     qadd16      r5, r5, r10          ; [a2+3|d2+3]
 
-    asr         r12, r2, #3          ; [1  |  x]
-    pkhtb       r12, r12, r3, asr #19; [1  |  0]
-    lsl         lr, r3, #16          ; [~3 |  x]
-    lsl         r2, r2, #16          ; [~2 |  x]
-    asr         lr, lr, #3           ; [3  |  x]
-    pkhtb       lr, lr, r2, asr #19  ; [3  |  2]
-
-    asr         r2, r4, #3           ; [5  |  x]
-    pkhtb       r2, r2, r5, asr #19  ; [5  |  4]
-    lsl         r3, r5, #16          ; [~7 |  x]
-    lsl         r4, r4, #16          ; [~6 |  x]
-    asr         r3, r3, #3           ; [7  |  x]
-    pkhtb       r3, r3, r4, asr #19  ; [7  |  6]
-
-    str         r12, [r1], #4
-    str         lr, [r1], #4
-    str         r2, [r1], #4
-    str         r3, [r1], #4
+    asr         r12, r3, #19         ; [0]
+    strh        r12, [r1], #32
+    asr         lr, r2, #19          ; [1]
+    strh        lr, [r1], #32
+    sxth        r2, r2
+    sxth        r3, r3
+    asr         r2, r2, #3           ; [2]
+    strh        r2, [r1], #32
+    asr         r3, r3, #3           ; [3]
+    strh        r3, [r1], #32
+
+    asr         r12, r5, #19         ; [4]
+    strh        r12, [r1], #32
+    asr         lr, r4, #19          ; [5]
+    strh        lr, [r1], #32
+    sxth        r4, r4
+    sxth        r5, r5
+    asr         r4, r4, #3           ; [6]
+    strh        r4, [r1], #32
+    asr         r5, r5, #3           ; [7]
+    strh        r5, [r1], #32
 
     qsubaddx    r2, r6, r7           ; [c1|a1] [9-10  |  8+11]
     qaddsubx    r3, r6, r7           ; [b1|d1] [9+10  |  8-11]
@@ -103,50 +105,32 @@
     qadd16      r8, r8, r10          ; [b2+3|c2+3]
     qadd16      r9, r9, r10          ; [a2+3|d2+3]
 
-    asr         r2, r6, #3           ; [9  |  x]
-    pkhtb       r2, r2, r7, asr #19  ; [9  |  8]
-    lsl         r3, r7, #16          ; [~11|  x]
-    lsl         r4, r6, #16          ; [~10|  x]
-    asr         r3, r3, #3           ; [11 |  x]
-    pkhtb       r3, r3, r4, asr #19  ; [11 | 10]
-
-    asr         r4, r8, #3           ; [13 |  x]
-    pkhtb       r4, r4, r9, asr #19  ; [13 | 12]
-    lsl         r5, r9, #16          ; [~15|  x]
-    lsl         r6, r8, #16          ; [~14|  x]
-    asr         r5, r5, #3           ; [15 |  x]
-    pkhtb       r5, r5, r6, asr #19  ; [15 | 14]
-
-    str         r2, [r1], #4
-    str         r3, [r1], #4
-    str         r4, [r1], #4
-    str         r5, [r1]
-
-    ldmia       sp!, {r4 - r11, pc}
+    asr         r12, r7, #19         ; [8]
+    strh        r12, [r1], #32
+    asr         lr, r6, #19          ; [9]
+    strh        lr, [r1], #32
+    sxth        r6, r6
+    sxth        r7, r7
+    asr         r6, r6, #3           ; [10]
+    strh        r6, [r1], #32
+    asr         r7, r7, #3           ; [11]
+    strh        r7, [r1], #32
+
+    asr         r12, r9, #19         ; [12]
+    strh        r12, [r1], #32
+    asr         lr, r8, #19          ; [13]
+    strh        lr, [r1], #32
+    sxth        r8, r8
+    sxth        r9, r9
+    asr         r8, r8, #3           ; [14]
+    strh        r8, [r1], #32
+    asr         r9, r9, #3           ; [15]
+    strh        r9, [r1], #32
+
+    ldmia       sp!, {r4 - r12, pc}
     ENDP        ; |vp8_short_inv_walsh4x4_v6|
 
 
-;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output)
-|vp8_short_inv_walsh4x4_1_v6| PROC
-
-    ldrsh       r2, [r0]             ; [0]
-    add         r2, r2, #3           ; [0] + 3
-    asr         r2, r2, #3           ; a1 ([0]+3) >> 3
-    lsl         r2, r2, #16          ; [a1 |  x]
-    orr         r2, r2, r2, lsr #16  ; [a1 | a1]
-
-    str         r2, [r1], #4
-    str         r2, [r1], #4
-    str         r2, [r1], #4
-    str         r2, [r1], #4
-    str         r2, [r1], #4
-    str         r2, [r1], #4
-    str         r2, [r1], #4
-    str         r2, [r1]
-
-    bx          lr
-    ENDP        ; |vp8_short_inv_walsh4x4_1_v6|
-
 ; Constant Pool
 c0x00030003 DCD 0x00030003
     END
diff --git a/vp8/common/arm/idct_arm.h b/vp8/common/arm/idct_arm.h
index c710c2eb0..68c0cad11 100644
--- a/vp8/common/arm/idct_arm.h
+++ b/vp8/common/arm/idct_arm.h
@@ -25,9 +25,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
 #undef  vp8_idct_idct1_scalar_add
 #define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_v6
 
-#undef  vp8_idct_iwalsh1
-#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_v6
-
 #undef  vp8_idct_iwalsh16
 #define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6
 #endif
@@ -46,9 +43,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
 #undef  vp8_idct_idct1_scalar_add
 #define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_neon
 
-#undef  vp8_idct_iwalsh1
-#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon
-
 #undef  vp8_idct_iwalsh16
 #define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_neon
 #endif
diff --git a/vp8/common/arm/neon/iwalsh_neon.asm b/vp8/common/arm/neon/iwalsh_neon.asm
index 01c79d937..e8ea2a619 100644
--- a/vp8/common/arm/neon/iwalsh_neon.asm
+++ b/vp8/common/arm/neon/iwalsh_neon.asm
@@ -8,7 +8,6 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
     EXPORT  |vp8_short_inv_walsh4x4_neon|
-    EXPORT  |vp8_short_inv_walsh4x4_1_neon|
 
     ARM
     REQUIRE8
@@ -16,7 +15,7 @@
 
     AREA    |.text|, CODE, READONLY  ; name this block of code
 
-;short vp8_short_inv_walsh4x4_neon(short *input, short *output)
+;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff)
 |vp8_short_inv_walsh4x4_neon| PROC
 
     ; read in all four lines of values: d0->d3
@@ -59,22 +58,30 @@
     vshr.s16 q0, q0, #3 ;e/f >> 3
     vshr.s16 q1, q1, #3 ;g/h >> 3
 
-    vst4.i16 {d0,d1,d2,d3}, [r1@128]
+    mov      r2, #64
+    add      r3, r1, #32
 
-    bx lr
-    ENDP    ; |vp8_short_inv_walsh4x4_neon|
+    vst1.i16 d0[0], [r1],r2
+    vst1.i16 d1[0], [r3],r2
+    vst1.i16 d2[0], [r1],r2
+    vst1.i16 d3[0], [r3],r2
+
+    vst1.i16 d0[1], [r1],r2
+    vst1.i16 d1[1], [r3],r2
+    vst1.i16 d2[1], [r1],r2
+    vst1.i16 d3[1], [r3],r2
 
+    vst1.i16 d0[2], [r1],r2
+    vst1.i16 d1[2], [r3],r2
+    vst1.i16 d2[2], [r1],r2
+    vst1.i16 d3[2], [r3],r2
+
+    vst1.i16 d0[3], [r1],r2
+    vst1.i16 d1[3], [r3],r2
+    vst1.i16 d2[3], [r1]
+    vst1.i16 d3[3], [r3]
 
-;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
-|vp8_short_inv_walsh4x4_1_neon| PROC
-    ldrsh r2, [r0]          ; load input[0]
-    add r3, r2, #3          ; add 3
-    add r2, r1, #16         ; base for last 8 output
-    asr r0, r3, #3          ; right shift 3
-    vdup.16 q0, r0          ; load and duplicate
-    vst1.16 {q0}, [r1@128]  ; write back 8
-    vst1.16 {q0}, [r2@128]  ; write back last 8
     bx lr
-    ENDP    ; |vp8_short_inv_walsh4x4_1_neon|
+    ENDP    ; |vp8_short_inv_walsh4x4_neon|
 
     END
diff --git a/vp8/common/idct.h b/vp8/common/idct.h
index 411a1b472..7371f85ff 100644
--- a/vp8/common/idct.h
+++ b/vp8/common/idct.h
@@ -37,6 +37,10 @@
 #define vp8_idct_idct16 vp8_short_idct4x4llm_c
 #endif
 extern prototype_idct(vp8_idct_idct16);
+/* Add this prototype to prevent a compiler warning about the implicit
+ * declaration of vp8_short_idct4x4llm_c in dequantize.c when building,
+ * for example, the NEON-optimized version. */
+extern prototype_idct(vp8_short_idct4x4llm_c);
 
 #ifndef vp8_idct_idct1_scalar_add
 #define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_c
diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c
index 49496abef..47af52f04 100644
--- a/vp8/common/idctllm.c
+++ b/vp8/common/idctllm.c
@@ -137,8 +137,9 @@ void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
 
 }
 
-void vp8_short_inv_walsh4x4_c(short *input, short *output)
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff)
 {
+    short output[16];
     int i;
     int a1, b1, c1, d1;
     int a2, b2, c2, d2;
@@ -183,22 +184,21 @@ void vp8_short_inv_walsh4x4_c(short *input, short *output)
         ip += 4;
         op += 4;
     }
+
+    for(i = 0; i < 16; i++)
+    {
+        mb_dqcoeff[i * 16] = output[i];
+    }
 }
 
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output)
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff)
 {
     int i;
     int a1;
-    short *op = output;
 
     a1 = ((input[0] + 3) >> 3);
-
-    for (i = 0; i < 4; i++)
+    for(i = 0; i < 16; i++)
     {
-        op[0] = a1;
-        op[1] = a1;
-        op[2] = a1;
-        op[3] = a1;
-        op += 4;
+        mb_dqcoeff[i * 16] = a1;
     }
 }
diff --git a/vp8/common/invtrans.c b/vp8/common/invtrans.c
index 478cb329f..95e6980fe 100644
--- a/vp8/common/invtrans.c
+++ b/vp8/common/invtrans.c
@@ -28,18 +28,6 @@ void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b,
 
 }
 
-static void recon_dcblock(MACROBLOCKD *x)
-{
-    BLOCKD *b = &x->block[24];
-    int i;
-
-    for (i = 0; i < 16; i++)
-    {
-        x->block[i].dqcoeff[0] = b->diff[i];
-    }
-
-}
-
 void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
 {
     int i;
@@ -47,9 +35,7 @@ void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *
     if(x->mode_info_context->mbmi.mode != SPLITMV)
     {
         /* do 2nd order transform on the dc block */
-        IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->block[24].diff);
-
-        recon_dcblock(x);
+        IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->dqcoeff);
     }
 
     for (i = 0; i < 16; i++)
diff --git a/vp8/common/x86/idct_x86.h b/vp8/common/x86/idct_x86.h
index f9e3a794d..06e3ea4b5 100644
--- a/vp8/common/x86/idct_x86.h
+++ b/vp8/common/x86/idct_x86.h
@@ -24,7 +24,6 @@ extern prototype_idct(vp8_short_idct4x4llm_mmx);
 extern prototype_idct_scalar_add(vp8_dc_only_idct_add_mmx);
 
 extern prototype_second_order(vp8_short_inv_walsh4x4_mmx);
-extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_idct_idct16
@@ -36,9 +35,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx);
 #undef vp8_idct_iwalsh16
 #define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_mmx
 
-#undef vp8_idct_iwalsh1
-#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_mmx
-
 #endif
 #endif
 
diff --git a/vp8/common/x86/iwalsh_mmx.asm b/vp8/common/x86/iwalsh_mmx.asm
index 10b5274dc..3ab066ba4 100644
--- a/vp8/common/x86/iwalsh_mmx.asm
+++ b/vp8/common/x86/iwalsh_mmx.asm
@@ -11,42 +11,6 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
-;void vp8_short_inv_walsh4x4_1_mmx(short *input, short *output)
-global sym(vp8_short_inv_walsh4x4_1_mmx)
-sym(vp8_short_inv_walsh4x4_1_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov     rsi, arg(0)
-    mov     rax, 3
-
-    mov     rdi, arg(1)
-    add     rax, [rsi]          ;input[0] + 3
-
-    movd    mm0, eax
-
-    punpcklwd mm0, mm0          ;x x val val
-
-    punpckldq mm0, mm0          ;val val val val
-
-    psraw   mm0, 3            ;(input[0] + 3) >> 3
-
-    movq  [rdi + 0], mm0
-    movq  [rdi + 8], mm0
-    movq  [rdi + 16], mm0
-    movq  [rdi + 24], mm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 ;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
 global sym(vp8_short_inv_walsh4x4_mmx)
 sym(vp8_short_inv_walsh4x4_mmx):
@@ -159,10 +123,50 @@ sym(vp8_short_inv_walsh4x4_mmx):
     psraw   mm2, 3
     psraw   mm3, 3
 
-    movq  [rdi + 0], mm0
-    movq  [rdi + 8], mm1
-    movq  [rdi + 16], mm2
-    movq  [rdi + 24], mm3
+;    movq  [rdi + 0], mm0
+;    movq  [rdi + 8], mm1
+;    movq  [rdi + 16], mm2
+;    movq  [rdi + 24], mm3
+
+    movd    eax, mm0
+    psrlq   mm0, 32
+    mov     word ptr[rdi+32*0], ax
+    shr     eax, 16
+    mov     word ptr[rdi+32*1], ax
+    movd    eax, mm0
+    mov     word ptr[rdi+32*2], ax
+    shr     eax, 16
+    mov     word ptr[rdi+32*3], ax
+
+    movd    ecx, mm1
+    psrlq   mm1, 32
+    mov     word ptr[rdi+32*4], cx
+    shr     ecx, 16
+    mov     word ptr[rdi+32*5], cx
+    movd    ecx, mm1
+    mov     word ptr[rdi+32*6], cx
+    shr     ecx, 16
+    mov     word ptr[rdi+32*7], cx
+
+    movd    eax, mm2
+    psrlq   mm2, 32
+    mov     word ptr[rdi+32*8], ax
+    shr     eax, 16
+    mov     word ptr[rdi+32*9], ax
+    movd    eax, mm2
+    mov     word ptr[rdi+32*10], ax
+    shr     eax, 16
+    mov     word ptr[rdi+32*11], ax
+
+    movd    ecx, mm3
+    psrlq   mm3, 32
+    mov     word ptr[rdi+32*12], cx
+    shr     ecx, 16
+    mov     word ptr[rdi+32*13], cx
+    movd    ecx, mm3
+    mov     word ptr[rdi+32*14], cx
+    shr     ecx, 16
+    mov     word ptr[rdi+32*15], cx
 
     ; begin epilog
     pop rdi
diff --git a/vp8/common/x86/iwalsh_sse2.asm b/vp8/common/x86/iwalsh_sse2.asm
index 1da4fd8da..5a7133d6c 100644
--- a/vp8/common/x86/iwalsh_sse2.asm
+++ b/vp8/common/x86/iwalsh_sse2.asm
@@ -96,8 +96,50 @@ sym(vp8_short_inv_walsh4x4_sse2):
     psraw   xmm5, 3
     psraw   xmm1, 3
 
-    movdqa  [rdi + 0], xmm5
-    movdqa  [rdi + 16], xmm1
+;;    movdqa  [rdi + 0], xmm5
+;;    movdqa  [rdi + 16], xmm1
+
+    movd    eax, xmm5
+    psrldq   xmm5, 4
+    mov     word ptr[rdi+32*0], ax
+    shr     eax, 16
+    mov     word ptr[rdi+32*1], ax
+    movd    eax, xmm5
+    psrldq   xmm5, 4
+    mov     word ptr[rdi+32*2], ax
+    shr     eax, 16
+    mov     word ptr[rdi+32*3], ax
+
+    movd    eax, xmm5
+    psrldq   xmm5, 4
+    mov     word ptr[rdi+32*4], ax
+    shr     eax, 16
+    mov     word ptr[rdi+32*5], ax
+    movd    eax, xmm5
+    mov     word ptr[rdi+32*6], ax
+    shr     eax, 16
+    mov     word ptr[rdi+32*7], ax
+
+    movd    eax, xmm1
+    psrldq   xmm1, 4
+    mov     word ptr[rdi+32*8], ax
+    shr     eax, 16
+    mov     word ptr[rdi+32*9], ax
+    movd    eax, xmm1
+    psrldq   xmm1, 4
+    mov     word ptr[rdi+32*10], ax
+    shr     eax, 16
+    mov     word ptr[rdi+32*11], ax
+
+    movd    eax, xmm1
+    psrldq   xmm1, 4
+    mov     word ptr[rdi+32*12], ax
+    shr     eax, 16
+    mov     word ptr[rdi+32*13], ax
+    movd    eax, xmm1
+    mov     word ptr[rdi+32*14], ax
+    shr     eax, 16
+    mov     word ptr[rdi+32*15], ax
 
     ; begin epilog
     pop rdi
diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c
index c4e616a67..b24cbe48f 100644
--- a/vp8/common/x86/x86_systemdependent.c
+++ b/vp8/common/x86/x86_systemdependent.c
@@ -40,9 +40,6 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
         rtcd->idct.idct16       = vp8_short_idct4x4llm_mmx;
         rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_mmx;
         rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_mmx;
-        rtcd->idct.iwalsh1     = vp8_short_inv_walsh4x4_1_mmx;
-
-
 
         rtcd->recon.copy8x8     = vp8_copy_mem8x8_mmx;
         rtcd->recon.copy8x4     = vp8_copy_mem8x4_mmx;
diff --git a/vp8/decoder/arm/arm_dsystemdependent.c b/vp8/decoder/arm/arm_dsystemdependent.c
index 1b0091cdb..f802c5181 100644
--- a/vp8/decoder/arm/arm_dsystemdependent.c
+++ b/vp8/decoder/arm/arm_dsystemdependent.c
@@ -32,8 +32,6 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
     {
         pbi->dequant.block               = vp8_dequantize_b_v6;
         pbi->dequant.idct_add            = vp8_dequant_idct_add_v6;
-        pbi->dequant.dc_idct_add         = vp8_dequant_dc_idct_add_v6;
-        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_v6;
         pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_v6;
         pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_v6;
     }
@@ -44,9 +42,6 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
     {
         pbi->dequant.block               = vp8_dequantize_b_neon;
         pbi->dequant.idct_add            = vp8_dequant_idct_add_neon;
-        /*This is not used: NEON always dequants two blocks at once.
-        pbi->dequant.dc_idct_add         = vp8_dequant_dc_idct_add_neon;*/
-        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_neon;
         pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_neon;
         pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_neon;
     }
diff --git a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm b/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
deleted file mode 100644
index 19f94e089..000000000
--- a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
+++ /dev/null
@@ -1,213 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT |vp8_dequant_dc_idct_add_v6|
-
-    AREA |.text|, CODE, READONLY
-
-;void vp8_dequant_dc_idct_v6(short *input, short *dq,
-;                            unsigned char *dest, int stride, int Dc)
-; r0 = input
-; r1 = dq
-; r2 = dst
-; r3 = stride
-; sp + 36 = Dc
-
-
-|vp8_dequant_dc_idct_add_v6| PROC
-    stmdb   sp!, {r4-r11, lr}
-
-    ldr     r6, [sp, #36]
-
-    ldr     r4, [r0]                ;input
-    ldr     r5, [r1], #4            ;dq
-
-    sub     sp, sp, #4
-    str     r3, [sp]
-
-    smultt  r7, r4, r5
-
-    ldr     r4, [r0, #4]            ;input
-    ldr     r5, [r1], #4            ;dq
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    smulbb  r6, r4, r5
-    smultt  r7, r4, r5
-
-    ldr     r4, [r0, #4]            ;input
-    ldr     r5, [r1], #4            ;dq
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    mov     r12, #3
-
-vp8_dequant_dc_add_loop
-    smulbb  r6, r4, r5
-    smultt  r7, r4, r5
-
-    ldr     r4, [r0, #4]            ;input
-    ldr     r5, [r1], #4            ;dq
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    smulbb  r6, r4, r5
-    smultt  r7, r4, r5
-
-    subs    r12, r12, #1
-
-    ldrne   r4, [r0, #4]
-    ldrne   r5, [r1], #4
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    bne     vp8_dequant_dc_add_loop
-
-    sub     r0, r0, #32
-    mov     r1, r0
-
-; short_idct4x4llm_v6_dual
-    ldr     r3, cospi8sqrt2minus1
-    ldr     r4, sinpi8sqrt2
-    ldr     r6, [r0, #8]
-    mov     r5, #2
-vp8_dequant_dc_idct_loop1_v6
-    ldr     r12, [r0, #24]
-    ldr     r14, [r0, #16]
-    smulwt  r9, r3, r6
-    smulwb  r7, r3, r6
-    smulwt  r10, r4, r6
-    smulwb  r8, r4, r6
-    pkhbt   r7, r7, r9, lsl #16
-    smulwt  r11, r3, r12
-    pkhbt   r8, r8, r10, lsl #16
-    uadd16  r6, r6, r7
-    smulwt  r7, r4, r12
-    smulwb  r9, r3, r12
-    smulwb  r10, r4, r12
-    subs    r5, r5, #1
-    pkhbt   r9, r9, r11, lsl #16
-    ldr     r11, [r0], #4
-    pkhbt   r10, r10, r7, lsl #16
-    uadd16  r7, r12, r9
-    usub16  r7, r8, r7
-    uadd16  r6, r6, r10
-    uadd16  r10, r11, r14
-    usub16  r8, r11, r14
-    uadd16  r9, r10, r6
-    usub16  r10, r10, r6
-    uadd16  r6, r8, r7
-    usub16  r7, r8, r7
-    str     r6, [r1, #8]
-    ldrne   r6, [r0, #8]
-    str     r7, [r1, #16]
-    str     r10, [r1, #24]
-    str     r9, [r1], #4
-    bne     vp8_dequant_dc_idct_loop1_v6
-
-    mov     r5, #2
-    sub     r0, r1, #8
-vp8_dequant_dc_idct_loop2_v6
-    ldr     r6, [r0], #4
-    ldr     r7, [r0], #4
-    ldr     r8, [r0], #4
-    ldr     r9, [r0], #4
-    smulwt  r1, r3, r6
-    smulwt  r12, r4, r6
-    smulwt  lr, r3, r8
-    smulwt  r10, r4, r8
-    pkhbt   r11, r8, r6, lsl #16
-    pkhbt   r1, lr, r1, lsl #16
-    pkhbt   r12, r10, r12, lsl #16
-    pkhtb   r6, r6, r8, asr #16
-    uadd16  r6, r1, r6
-    pkhbt   lr, r9, r7, lsl #16
-    uadd16  r10, r11, lr
-    usub16  lr, r11, lr
-    pkhtb   r8, r7, r9, asr #16
-    subs    r5, r5, #1
-    smulwt  r1, r3, r8
-    smulwb  r7, r3, r8
-    smulwt  r11, r4, r8
-    smulwb  r9, r4, r8
-    pkhbt   r1, r7, r1, lsl #16
-    uadd16  r8, r1, r8
-    pkhbt   r11, r9, r11, lsl #16
-    usub16  r1, r12, r8
-    uadd16  r8, r11, r6
-    ldr     r9, c0x00040004
-    ldr     r12, [sp]               ; get stride from stack
-    uadd16  r6, r10, r8
-    usub16  r7, r10, r8
-    uadd16  r7, r7, r9
-    uadd16  r6, r6, r9
-    uadd16  r10, r14, r1
-    usub16  r1, r14, r1
-    uadd16  r10, r10, r9
-    uadd16  r1, r1, r9
-    ldr     r11, [r2]               ; load input from dst
-    mov     r8, r7, asr #3
-    pkhtb   r9, r8, r10, asr #19
-    mov     r8, r1, asr #3
-    pkhtb   r8, r8, r6, asr #19
-    uxtb16  lr, r11, ror #8
-    qadd16  r9, r9, lr
-    uxtb16  lr, r11
-    qadd16  r8, r8, lr
-    usat16  r9, #8, r9
-    usat16  r8, #8, r8
-    orr     r9, r8, r9, lsl #8
-    ldr     r11, [r2, r12]          ; load input from dst
-    mov     r7, r7, lsl #16
-    mov     r1, r1, lsl #16
-    mov     r10, r10, lsl #16
-    mov     r6, r6, lsl #16
-    mov     r7, r7, asr #3
-    pkhtb   r7, r7, r10, asr #19
-    mov     r1, r1, asr #3
-    pkhtb   r1, r1, r6, asr #19
-    uxtb16  r8, r11, ror #8
-    qadd16  r7, r7, r8
-    uxtb16  r8, r11
-    qadd16  r1, r1, r8
-    usat16  r7, #8, r7
-    usat16  r1, #8, r1
-    orr     r1, r1, r7, lsl #8
-    str     r9, [r2], r12           ; store output to dst
-    str     r1, [r2], r12           ; store output to dst
-    bne     vp8_dequant_dc_idct_loop2_v6
-
-; vpx_memset
-    sub     r0, r0, #32
-    add     sp, sp, #4
-
-    mov     r12, #0
-    str     r12, [r0]
-    str     r12, [r0, #4]
-    str     r12, [r0, #8]
-    str     r12, [r0, #12]
-    str     r12, [r0, #16]
-    str     r12, [r0, #20]
-    str     r12, [r0, #24]
-    str     r12, [r0, #28]
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ; |vp8_dequant_dc_idct_add_v6|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x00004E7B
-sinpi8sqrt2       DCD 0x00008A8C
-c0x00040004       DCD 0x00040004
-
-    END
diff --git a/vp8/decoder/arm/armv6/idct_blk_v6.c b/vp8/decoder/arm/armv6/idct_blk_v6.c
index 686bb737f..c1ef2852f 100644
--- a/vp8/decoder/arm/armv6/idct_blk_v6.c
+++ b/vp8/decoder/arm/armv6/idct_blk_v6.c
@@ -13,47 +13,6 @@
 #include "vp8/decoder/dequantize.h"
 
 
-void vp8_dequant_dc_idct_add_y_block_v6(short *q, short *dq,
-                                        unsigned char *dst, int stride,
-                                        char *eobs, short *dc)
-{
-    int i;
-
-    for (i = 0; i < 4; i++)
-    {
-        if (eobs[0] > 1)
-            vp8_dequant_dc_idct_add_v6 (q, dq, dst, stride, dc[0]);
-        else if (eobs[0] == 1)
-            vp8_dc_only_idct_add_v6 (dc[0], dst, stride, dst, stride);
-
-        if (eobs[1] > 1)
-        {
-            vp8_dequant_dc_idct_add_v6 (q+16, dq, dst+4, stride, dc[1]);
-        }
-        else if (eobs[1] == 1)
-            vp8_dc_only_idct_add_v6 (dc[1], dst+4, stride, dst+4, stride);
-
-        if (eobs[2] > 1)
-        {
-            vp8_dequant_dc_idct_add_v6 (q+32, dq, dst+8, stride, dc[2]);
-        }
-        else if (eobs[2] == 1)
-            vp8_dc_only_idct_add_v6 (dc[2], dst+8, stride, dst+8, stride);
-
-        if (eobs[3] > 1)
-        {
-            vp8_dequant_dc_idct_add_v6 (q+48, dq, dst+12, stride, dc[3]);
-        }
-        else if (eobs[3] == 1)
-            vp8_dc_only_idct_add_v6 (dc[3], dst+12, stride, dst+12, stride);
-
-        q    += 64;
-        dc   += 4;
-        dst  += 4*stride;
-        eobs += 4;
-    }
-}
-
 void vp8_dequant_idct_add_y_block_v6(short *q, short *dq,
                                      unsigned char *dst,
                                      int stride, char *eobs)
diff --git a/vp8/decoder/arm/dequantize_arm.h b/vp8/decoder/arm/dequantize_arm.h
index c020c8530..1123e8446 100644
--- a/vp8/decoder/arm/dequantize_arm.h
+++ b/vp8/decoder/arm/dequantize_arm.h
@@ -15,8 +15,6 @@
 #if HAVE_ARMV6
 extern prototype_dequant_block(vp8_dequantize_b_v6);
 extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6);
-extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6);
-extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_v6);
 extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_v6);
 extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
 
@@ -27,12 +25,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
 #undef vp8_dequant_idct_add
 #define vp8_dequant_idct_add vp8_dequant_idct_add_v6
 
-#undef vp8_dequant_dc_idct_add
-#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_v6
-
-#undef vp8_dequant_dc_idct_add_y_block
-#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_v6
-
 #undef vp8_dequant_idct_add_y_block
 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6
 
@@ -44,8 +36,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
 #if HAVE_ARMV7
 extern prototype_dequant_block(vp8_dequantize_b_neon);
 extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon);
-extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon);
-extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_neon);
 extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon);
 extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
 
@@ -57,12 +47,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
 #undef vp8_dequant_idct_add
 #define vp8_dequant_idct_add vp8_dequant_idct_add_neon
 
-#undef vp8_dequant_dc_idct_add
-#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_neon
-
-#undef vp8_dequant_dc_idct_add_y_block
-#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_neon
-
 #undef vp8_dequant_idct_add_y_block
 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon
 
diff --git a/vp8/decoder/arm/neon/idct_blk_neon.c b/vp8/decoder/arm/neon/idct_blk_neon.c
index 086293114..185895f05 100644
--- a/vp8/decoder/arm/neon/idct_blk_neon.c
+++ b/vp8/decoder/arm/neon/idct_blk_neon.c
@@ -15,46 +15,11 @@
 /* place these declarations here because we don't want to maintain them
  * outside of this scope
  */
-void idct_dequant_dc_full_2x_neon(short *input, short *dq,
-                                  unsigned char *dst,
-                                  int stride, short *dc);
-void idct_dequant_dc_0_2x_neon(short *input, short *dq,
-                               unsigned char *dst,
-                               int stride, short *dc);
 void idct_dequant_full_2x_neon(short *q, short *dq,
                                unsigned char *dst, int stride);
 void idct_dequant_0_2x_neon(short *q, short dq,
                             unsigned char *dst, int stride);
 
-void vp8_dequant_dc_idct_add_y_block_neon(short *q, short *dq,
-                                          unsigned char *dst,
-                                          int stride, char *eobs, short *dc)
-{
-    int i;
-
-    for (i = 0; i < 4; i++)
-    {
-        if (((short *)(eobs))[0])
-        {
-            if (((short *)eobs)[0] & 0xfefe)
-                idct_dequant_dc_full_2x_neon (q, dq, dst, stride, dc);
-            else
-                idct_dequant_dc_0_2x_neon(q, dq, dst, stride, dc);
-        }
-
-        if (((short *)(eobs))[1])
-        {
-            if (((short *)eobs)[1] & 0xfefe)
-                idct_dequant_dc_full_2x_neon (q+32, dq, dst+8, stride, dc+2);
-            else
-                idct_dequant_dc_0_2x_neon(q+32, dq, dst+8, stride, dc+2);
-        }
-        q    += 64;
-        dc   += 4;
-        dst  += 4*stride;
-        eobs += 4;
-    }
-}
 
 void vp8_dequant_idct_add_y_block_neon(short *q, short *dq,
                                        unsigned char *dst,
diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
deleted file mode 100644
index bf8d7ddcd..000000000
--- a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
+++ /dev/null
@@ -1,75 +0,0 @@
-;
-;  Copyright (c) 2010 The Webm project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |idct_dequant_dc_0_2x_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void idct_dequant_dc_0_2x_neon(short *q, short *dq,
-;                               unsigned char *dst, int stride);
-; r0    *q,
-; r1    *dq,
-; r2    *dst
-; r3    stride
-; sp    *dc
-|idct_dequant_dc_0_2x_neon| PROC
-
-    ; no q- or dq-coeffs, so r0 and r1 are free to use
-    ldr             r1, [sp]                ; *dc
-    add             r12, r2, #4
-    ldr             r0, [r1]
-
-    vld1.32         {d2[0]}, [r2], r3       ; lo
-    vld1.32         {d8[0]}, [r12], r3      ; hi
-    vld1.32         {d2[1]}, [r2], r3
-    vld1.32         {d8[1]}, [r12], r3
-    vld1.32         {d4[0]}, [r2], r3
-    vld1.32         {d10[0]}, [r12], r3
-    vld1.32         {d4[1]}, [r2], r3
-    vld1.32         {d10[1]}, [r12]
-
-    sxth            r1, r0                  ; lo *dc
-    add             r1, r1, #4
-    asr             r1, r1, #3
-    vdup.16         q0, r1
-    sxth            r0, r0, ror #16         ; hi *dc
-    add             r0, r0, #4
-    asr             r0, r0, #3
-    vdup.16         q3, r0
-
-    vaddw.u8        q1, q0, d2              ; lo
-    vaddw.u8        q2, q0, d4
-    vaddw.u8        q4, q3, d8              ; hi
-    vaddw.u8        q5, q3, d10
-
-    vqmovun.s16     d2, q1                  ; lo
-    vqmovun.s16     d4, q2
-    vqmovun.s16     d8, q4                  ; hi
-    vqmovun.s16     d10, q5
-
-    sub             r2, r2, r3, lsl #2      ; dst - 4*stride
-    add             r0, r2, #4
-
-    vst1.32         {d2[0]}, [r2], r3       ; lo
-    vst1.32         {d8[0]}, [r0], r3       ; hi
-    vst1.32         {d2[1]}, [r2], r3
-    vst1.32         {d8[1]}, [r0], r3
-    vst1.32         {d4[0]}, [r2], r3
-    vst1.32         {d10[0]}, [r0], r3
-    vst1.32         {d4[1]}, [r2]
-    vst1.32         {d10[1]}, [r0]
-
-    bx             lr
-
-    ENDP           ;|idct_dequant_dc_0_2x_neon|
-    END
diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
deleted file mode 100644
index eea41f68c..000000000
--- a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
+++ /dev/null
@@ -1,208 +0,0 @@
-;
-;  Copyright (c) 2010 The Webm project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |idct_dequant_dc_full_2x_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_dc_full_2x_neon(short *q, short *dq,
-;                                  unsigned char *dst, int stride, short *dc);
-; r0    *q,
-; r1    *dq,
-; r2    *dst
-; r3    stride
-; sp    *dc
-|idct_dequant_dc_full_2x_neon| PROC
-    push            {r4}
-
-    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
-    vld1.16         {q2, q3}, [r0]          ; l q
-    add             r0, r0, #32
-    vld1.16         {q4, q5}, [r0]          ; r q
-    add             r12, r2, #4
-
-    ; interleave the predictors
-    vld1.32         {d28[0]}, [r2], r3      ; l pre
-    vld1.32         {d28[1]}, [r12], r3     ; r pre
-    vld1.32         {d29[0]}, [r2], r3
-    vld1.32         {d29[1]}, [r12], r3
-    vld1.32         {d30[0]}, [r2], r3
-    vld1.32         {d30[1]}, [r12], r3
-    vld1.32         {d31[0]}, [r2], r3
-    ldr             r1, [sp, #4]            ; *dc
-    vld1.32         {d31[1]}, [r12]
-
-    adr             r4, cospi8sqrt2minus1   ; pointer to the first constant
-
-    ldrh            r12, [r1], #2           ; lo *dc
-    ldrh            r1, [r1]                ; hi *dc
-
-    ; dequant: q[i] = q[i] * dq[i]
-    vmul.i16        q2, q2, q0
-    vmul.i16        q3, q3, q1
-    vmul.i16        q4, q4, q0
-    vmul.i16        q5, q5, q1
-
-    ; move dc up to neon and overwrite first element
-    vmov.16         d4[0], r12
-    vmov.16         d8[0], r1
-
-    vld1.16         {d0}, [r4]
-
-    ; q2: l0r0  q3: l8r8
-    ; q4: l4r4  q5: l12r12
-    vswp            d5, d8
-    vswp            d7, d10
-
-    ; _CONSTANTS_ * 4,12 >> 16
-    ; q6:  4 * sinpi : c1/temp1
-    ; q7: 12 * sinpi : d1/temp2
-    ; q8:  4 * cospi
-    ; q9: 12 * cospi
-    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
-    vqdmulh.s16     q7, q5, d0[2]
-    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
-    vqdmulh.s16     q9, q5, d0[0]
-
-    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
-    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8
-
-    ; vqdmulh only accepts signed values. this was a problem because
-    ; our constant had the high bit set, and was treated as a negative value.
-    ; vqdmulh also doubles the value before it shifts by 16. we need to
-    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
-    ; so we can shift the constant without losing precision. this avoids
-    ; shift again afterward, but also avoids the sign issue. win win!
-    ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
-    ; pre-shift it
-    vshr.s16        q8, q8, #1
-    vshr.s16        q9, q9, #1
-
-    ; q4:  4 +  4 * cospi : d1/temp1
-    ; q5: 12 + 12 * cospi : c1/temp2
-    vqadd.s16       q4, q4, q8
-    vqadd.s16       q5, q5, q9
-
-    ; c1 = temp1 - temp2
-    ; d1 = temp1 + temp2
-    vqsub.s16       q2, q6, q5
-    vqadd.s16       q3, q4, q7
-
-    ; [0]: a1+d1
-    ; [1]: b1+c1
-    ; [2]: b1-c1
-    ; [3]: a1-d1
-    vqadd.s16       q4, q10, q3
-    vqadd.s16       q5, q11, q2
-    vqsub.s16       q6, q11, q2
-    vqsub.s16       q7, q10, q3
-
-    ; rotate
-    vtrn.32         q4, q6
-    vtrn.32         q5, q7
-    vtrn.16         q4, q5
-    vtrn.16         q6, q7
-    ; idct loop 2
-    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
-    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
-    ; q6: l 2, 6,10,14 r 2, 6,10,14
-    ; q7: l 3, 7,11,15 r 3, 7,11,15
-
-    ; q8:  1 * sinpi : c1/temp1
-    ; q9:  3 * sinpi : d1/temp2
-    ; q10: 1 * cospi
-    ; q11: 3 * cospi
-    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
-    vqdmulh.s16     q9, q7, d0[2]
-    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
-    vqdmulh.s16     q11, q7, d0[0]
-
-    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2
-    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2
-
-    ; see note on shifting above
-    vshr.s16        q10, q10, #1
-    vshr.s16        q11, q11, #1
-
-    ; q10: 1 + 1 * cospi : d1/temp1
-    ; q11: 3 + 3 * cospi : c1/temp2
-    vqadd.s16       q10, q5, q10
-    vqadd.s16       q11, q7, q11
-
-    ; q8: c1 = temp1 - temp2
-    ; q9: d1 = temp1 + temp2
-    vqsub.s16       q8, q8, q11
-    vqadd.s16       q9, q10, q9
-
-    ; a1+d1
-    ; b1+c1
-    ; b1-c1
-    ; a1-d1
-    vqadd.s16       q4, q2, q9
-    vqadd.s16       q5, q3, q8
-    vqsub.s16       q6, q3, q8
-    vqsub.s16       q7, q2, q9
-
-    ; +4 >> 3 (rounding)
-    vrshr.s16       q4, q4, #3              ; lo
-    vrshr.s16       q5, q5, #3
-    vrshr.s16       q6, q6, #3              ; hi
-    vrshr.s16       q7, q7, #3
-
-    vtrn.32         q4, q6
-    vtrn.32         q5, q7
-    vtrn.16         q4, q5
-    vtrn.16         q6, q7
-
-    ; adding pre
-    ; input is still packed. pre was read interleaved
-    vaddw.u8        q4, q4, d28
-    vaddw.u8        q5, q5, d29
-    vaddw.u8        q6, q6, d30
-    vaddw.u8        q7, q7, d31
-
-    vmov.i16        q14, #0
-    vmov            q15, q14
-    vst1.16         {q14, q15}, [r0]        ; write over high input
-    sub             r0, r0, #32
-    vst1.16         {q14, q15}, [r0]        ; write over low input
-
-    sub             r2, r2, r3, lsl #2      ; dst - 4*stride
-    add             r1, r2, #4              ; hi
-
-    ;saturate and narrow
-    vqmovun.s16     d0, q4                  ; lo
-    vqmovun.s16     d1, q5
-    vqmovun.s16     d2, q6                  ; hi
-    vqmovun.s16     d3, q7
-
-    vst1.32         {d0[0]}, [r2], r3       ; lo
-    vst1.32         {d0[1]}, [r1], r3       ; hi
-    vst1.32         {d1[0]}, [r2], r3
-    vst1.32         {d1[1]}, [r1], r3
-    vst1.32         {d2[0]}, [r2], r3
-    vst1.32         {d2[1]}, [r1], r3
-    vst1.32         {d3[0]}, [r2]
-    vst1.32         {d3[1]}, [r1]
-
-    pop             {r4}
-    bx              lr
-
-    ENDP            ; |idct_dequant_dc_full_2x_neon|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x4e7b
-; because the lowest bit in 0x8a8c is 0, we can pre-shift this
-sinpi8sqrt2       DCD 0x4546
-
-    END
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index e501b9ec7..1133efb54 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -232,45 +232,53 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
                 }
             }
         }
-
-    }
-    else if (mode == SPLITMV)
-    {
-        DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
-                        (xd->qcoeff, xd->block[0].dequant,
-                         xd->dst.y_buffer,
-                         xd->dst.y_stride, xd->eobs);
     }
     else
     {
-        BLOCKD *b = &xd->block[24];
+        short *DQC = xd->block[0].dequant;
 
-        /* do 2nd order transform on the dc block */
-        if (xd->eobs[24] > 1)
-        {
-            DEQUANT_INVOKE(&pbi->dequant, block)(b);
-
-            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
-            ((int *)b->qcoeff)[0] = 0;
-            ((int *)b->qcoeff)[1] = 0;
-            ((int *)b->qcoeff)[2] = 0;
-            ((int *)b->qcoeff)[3] = 0;
-            ((int *)b->qcoeff)[4] = 0;
-            ((int *)b->qcoeff)[5] = 0;
-            ((int *)b->qcoeff)[6] = 0;
-            ((int *)b->qcoeff)[7] = 0;
-        }
-        else
+        /* save the dc dequant constant in case it is overridden */
+        short dc_dequant_temp = DQC[0];
+
+        if (mode != SPLITMV)
         {
-            b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
-            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
-            ((int *)b->qcoeff)[0] = 0;
+            BLOCKD *b = &xd->block[24];
+
+            /* do 2nd order transform on the dc block */
+            if (xd->eobs[24] > 1)
+            {
+                DEQUANT_INVOKE(&pbi->dequant, block)(b);
+
+                IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],
+                    xd->qcoeff);
+                ((int *)b->qcoeff)[0] = 0;
+                ((int *)b->qcoeff)[1] = 0;
+                ((int *)b->qcoeff)[2] = 0;
+                ((int *)b->qcoeff)[3] = 0;
+                ((int *)b->qcoeff)[4] = 0;
+                ((int *)b->qcoeff)[5] = 0;
+                ((int *)b->qcoeff)[6] = 0;
+                ((int *)b->qcoeff)[7] = 0;
+            }
+            else
+            {
+                b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
+                IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0],
+                    xd->qcoeff);
+                ((int *)b->qcoeff)[0] = 0;
+            }
+
+            /* dc is already dequantized; override the dc dequant constant */
+            DQC[0] = 1;
         }
 
-        DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
+        DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
                         (xd->qcoeff, xd->block[0].dequant,
                          xd->dst.y_buffer,
-                         xd->dst.y_stride, xd->eobs, xd->block[24].diff);
+                         xd->dst.y_stride, xd->eobs);
+
+        /* restore the dc dequant constant */
+        DQC[0] = dc_dequant_temp;
     }
 
     DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
diff --git a/vp8/decoder/dequantize.c b/vp8/decoder/dequantize.c
index 0861965eb..4a48a3192 100644
--- a/vp8/decoder/dequantize.c
+++ b/vp8/decoder/dequantize.c
@@ -42,22 +42,3 @@ void vp8_dequant_idct_add_c(short *input, short *dq,
     vpx_memset(input, 0, 32);
 
 }
-
-void vp8_dequant_dc_idct_add_c(short *input, short *dq,
-                               unsigned char *dest, int stride,
-                               int Dc)
-{
-    int i;
-
-    input[0] = (short)Dc;
-
-    for (i = 1; i < 16; i++)
-    {
-        input[i] = dq[i] * input[i];
-    }
-
-    vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
-
-    vpx_memset(input, 0, 32);
-
-}
diff --git a/vp8/decoder/dequantize.h b/vp8/decoder/dequantize.h
index 019b7f6d1..f66cf2bac 100644
--- a/vp8/decoder/dequantize.h
+++ b/vp8/decoder/dequantize.h
@@ -21,17 +21,6 @@
              unsigned char *output, \
              int stride)
 
-#define prototype_dequant_dc_idct_add(sym) \
-    void sym(short *input, short *dq, \
-             unsigned char *dst, \
-             int stride, \
-             int dc)
-
-#define prototype_dequant_dc_idct_add_y_block(sym) \
-    void sym(short *q, short *dq, \
-             unsigned char *dst, \
-             int stride, char *eobs, short *dc)
-
 #define prototype_dequant_idct_add_y_block(sym) \
     void sym(short *q, short *dq, \
              unsigned char *dst, \
@@ -60,16 +49,6 @@ extern prototype_dequant_block(vp8_dequant_block);
 #endif
 extern prototype_dequant_idct_add(vp8_dequant_idct_add);
 
-#ifndef vp8_dequant_dc_idct_add
-#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_c
-#endif
-extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add);
-
-#ifndef vp8_dequant_dc_idct_add_y_block
-#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_c
-#endif
-extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block);
-
 #ifndef vp8_dequant_idct_add_y_block
 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
 #endif
@@ -85,10 +64,6 @@ typedef prototype_dequant_block((*vp8_dequant_block_fn_t));
 
 typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t));
 
-typedef prototype_dequant_dc_idct_add((*vp8_dequant_dc_idct_add_fn_t));
-
-typedef prototype_dequant_dc_idct_add_y_block((*vp8_dequant_dc_idct_add_y_block_fn_t));
-
 typedef prototype_dequant_idct_add_y_block((*vp8_dequant_idct_add_y_block_fn_t));
 
 typedef prototype_dequant_idct_add_uv_block((*vp8_dequant_idct_add_uv_block_fn_t));
@@ -97,8 +72,6 @@ typedef struct
 {
     vp8_dequant_block_fn_t               block;
     vp8_dequant_idct_add_fn_t            idct_add;
-    vp8_dequant_dc_idct_add_fn_t         dc_idct_add;
-    vp8_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block;
     vp8_dequant_idct_add_y_block_fn_t    idct_add_y_block;
     vp8_dequant_idct_add_uv_block_fn_t   idct_add_uv_block;
 } vp8_dequant_rtcd_vtable_t;
diff --git a/vp8/decoder/generic/dsystemdependent.c b/vp8/decoder/generic/dsystemdependent.c
index 9c42bc62d..d9f9ba3c8 100644
--- a/vp8/decoder/generic/dsystemdependent.c
+++ b/vp8/decoder/generic/dsystemdependent.c
@@ -23,8 +23,6 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
     pbi->mb.rtcd                     = &pbi->common.rtcd;
     pbi->dequant.block               = vp8_dequantize_b_c;
     pbi->dequant.idct_add            = vp8_dequant_idct_add_c;
-    pbi->dequant.dc_idct_add         = vp8_dequant_dc_idct_add_c;
-    pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_c;
     pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_c;
     pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_c;
 #endif
diff --git a/vp8/decoder/idct_blk.c b/vp8/decoder/idct_blk.c
index 1c16b92a9..249fad4ea 100644
--- a/vp8/decoder/idct_blk.c
+++ b/vp8/decoder/idct_blk.c
@@ -12,39 +12,12 @@
 #include "vp8/common/idct.h"
 #include "dequantize.h"
 
-void vp8_dequant_dc_idct_add_c(short *input, short *dq,
-                               unsigned char *dest, int stride,
-                               int Dc);
 void vp8_dequant_idct_add_c(short *input, short *dq,
                             unsigned char *dest, int stride);
 void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred,
                             int pred_stride, unsigned char *dst_ptr,
                             int dst_stride);
 
-void vp8_dequant_dc_idct_add_y_block_c
-            (short *q, short *dq,
-             unsigned char *dst, int stride, char *eobs, short *dc)
-{
-    int i, j;
-
-    for (i = 0; i < 4; i++)
-    {
-        for (j = 0; j < 4; j++)
-        {
-            if (*eobs++ > 1)
-                vp8_dequant_dc_idct_add_c (q, dq, dst, stride, dc[0]);
-            else
-                vp8_dc_only_idct_add_c (dc[0], dst, stride, dst, stride);
-
-            q   += 16;
-            dst += 4;
-            dc  ++;
-        }
-
-        dst += 4*stride - 16;
-    }
-}
-
 void vp8_dequant_idct_add_y_block_c
             (short *q, short *dq,
              unsigned char *dst, int stride, char *eobs)
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index eba5830d5..1967781eb 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -175,36 +175,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
 #endif
 
     /* dequantization and idct */
-    if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV)
-    {
-        BLOCKD *b = &xd->block[24];
-        DEQUANT_INVOKE(&pbi->dequant, block)(b);
-
-        /* do 2nd order transform on the dc block */
-        if (xd->eobs[24] > 1)
-        {
-            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
-            ((int *)b->qcoeff)[0] = 0;
-            ((int *)b->qcoeff)[1] = 0;
-            ((int *)b->qcoeff)[2] = 0;
-            ((int *)b->qcoeff)[3] = 0;
-            ((int *)b->qcoeff)[4] = 0;
-            ((int *)b->qcoeff)[5] = 0;
-            ((int *)b->qcoeff)[6] = 0;
-            ((int *)b->qcoeff)[7] = 0;
-        }
-        else
-        {
-            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
-            ((int *)b->qcoeff)[0] = 0;
-        }
-
-        DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
-                        (xd->qcoeff, xd->block[0].dequant,
-                         xd->dst.y_buffer,
-                         xd->dst.y_stride, xd->eobs, xd->block[24].diff);
-    }
-    else if (xd->mode_info_context->mbmi.mode == B_PRED)
+    if (xd->mode_info_context->mbmi.mode == B_PRED)
     {
         for (i = 0; i < 16; i++)
         {
@@ -214,26 +185,71 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
             vp8mt_predict_intra4x4(pbi, xd, b_mode, *(b->base_dst) + b->dst,
                                    b->dst_stride, mb_row, mb_col, i);
 
-            if (xd->eobs[i] > 1)
+            if (xd->eobs[i] )
             {
-                DEQUANT_INVOKE(&pbi->dequant, idct_add)
-                    (b->qcoeff, b->dequant,
-                    *(b->base_dst) + b->dst, b->dst_stride);
+                if (xd->eobs[i] > 1)
+                {
+                    DEQUANT_INVOKE(&pbi->dequant, idct_add)
+                        (b->qcoeff, b->dequant,
+                        *(b->base_dst) + b->dst, b->dst_stride);
+                }
+                else
+                {
+                    IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
+                        (b->qcoeff[0] * b->dequant[0],
+                        *(b->base_dst) + b->dst, b->dst_stride,
+                        *(b->base_dst) + b->dst, b->dst_stride);
+                    ((int *)b->qcoeff)[0] = 0;
+                }
+            }
+        }
+    }
+    else
+    {
+        short *DQC = xd->block[0].dequant;
+
+        DECLARE_ALIGNED(16, short, local_dequant[16]);
+
+        if (xd->mode_info_context->mbmi.mode != SPLITMV)
+        {
+            BLOCKD *b = &xd->block[24];
+
+            /* do 2nd order transform on the dc block */
+            if (xd->eobs[24] > 1)
+            {
+                DEQUANT_INVOKE(&pbi->dequant, block)(b);
+
+                IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],
+                    xd->qcoeff);
+                ((int *)b->qcoeff)[0] = 0;
+                ((int *)b->qcoeff)[1] = 0;
+                ((int *)b->qcoeff)[2] = 0;
+                ((int *)b->qcoeff)[3] = 0;
+                ((int *)b->qcoeff)[4] = 0;
+                ((int *)b->qcoeff)[5] = 0;
+                ((int *)b->qcoeff)[6] = 0;
+                ((int *)b->qcoeff)[7] = 0;
             }
             else
             {
-                IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
-                    (b->qcoeff[0] * b->dequant[0],
-                    *(b->base_dst) + b->dst, b->dst_stride,
-                    *(b->base_dst) + b->dst, b->dst_stride);
+                b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
+                IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], xd->qcoeff);
                 ((int *)b->qcoeff)[0] = 0;
             }
+
+            /* make a local copy of the dequant constants */
+            vpx_memcpy(local_dequant, xd->block[0].dequant,
+                       sizeof(local_dequant));
+
+            /* dc is already dequantized; override the dc dequant constant */
+            local_dequant[0] = 1;
+
+            /* use the new dequant constants */
+            DQC = local_dequant;
         }
-    }
-    else
-    {
+
         DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
-                        (xd->qcoeff, xd->block[0].dequant,
+                        (xd->qcoeff, DQC,
                          xd->dst.y_buffer,
                          xd->dst.y_stride, xd->eobs);
     }
@@ -244,7 +260,6 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
                      xd->dst.uv_stride, xd->eobs+16);
 }
 
-
 static THREAD_FUNCTION thread_decoding_proc(void *p_data)
 {
     int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
diff --git a/vp8/decoder/x86/dequantize_mmx.asm b/vp8/decoder/x86/dequantize_mmx.asm
index 648bde4c5..de9eba89f 100644
--- a/vp8/decoder/x86/dequantize_mmx.asm
+++ b/vp8/decoder/x86/dequantize_mmx.asm
@@ -246,207 +246,6 @@ sym(vp8_dequant_idct_add_mmx):
     pop         rbp
     ret
 
-
-;void dequant_dc_idct_add_mmx(
-;short *input,          0
-;short *dq,             1
-;unsigned char *dest,   2
-;int stride,            3
-;int Dc)                4
-global sym(vp8_dequant_dc_idct_add_mmx)
-sym(vp8_dequant_dc_idct_add_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    ; end prolog
-
-        mov         rax,    arg(0) ;input
-        mov         rdx,    arg(1) ;dq
-
-        movq        mm0,    [rax   ]
-        pmullw      mm0,    [rdx]
-
-        movq        mm1,    [rax +8]
-        pmullw      mm1,    [rdx +8]
-
-        movq        mm2,    [rax+16]
-        pmullw      mm2,    [rdx+16]
-
-        movq        mm3,    [rax+24]
-        pmullw      mm3,    [rdx+24]
-
-        mov         rdx,    arg(2) ;pred
-        pxor        mm7,    mm7
-
-
-        movq        [rax],   mm7
-        movq        [rax+8], mm7
-
-        movq        [rax+16],mm7
-        movq        [rax+24],mm7
-
-        ; move lower word of Dc to lower word of mm0
-        psrlq       mm0,    16
-        movzx       rcx,    word ptr arg(4) ;Dc
-        psllq       mm0,    16
-        movq        mm7,    rcx
-        por         mm0,    mm7
-
-        movsxd      rax,            dword ptr arg(3) ;stride
-
-        psubw       mm0,            mm2             ; b1= 0-2
-        paddw       mm2,            mm2             ;
-
-        movq        mm5,            mm1
-        paddw       mm2,            mm0             ; a1 =0+2
-
-        pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
-        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
-
-        movq        mm7,            mm3             ;
-        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
-
-        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       mm7,            mm5             ; c1
-
-        movq        mm5,            mm1
-        movq        mm4,            mm3
-
-        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
-        paddw       mm5,            mm1
-
-        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
-        paddw       mm3,            mm4
-
-        paddw       mm3,            mm5             ; d1
-        movq        mm6,            mm2             ; a1
-
-        movq        mm4,            mm0             ; b1
-        paddw       mm2,            mm3             ;0
-
-        paddw       mm4,            mm7             ;1
-        psubw       mm0,            mm7             ;2
-
-        psubw       mm6,            mm3             ;3
-
-        movq        mm1,            mm2             ; 03 02 01 00
-        movq        mm3,            mm4             ; 23 22 21 20
-
-        punpcklwd   mm1,            mm0             ; 11 01 10 00
-        punpckhwd   mm2,            mm0             ; 13 03 12 02
-
-        punpcklwd   mm3,            mm6             ; 31 21 30 20
-        punpckhwd   mm4,            mm6             ; 33 23 32 22
-
-        movq        mm0,            mm1             ; 11 01 10 00
-        movq        mm5,            mm2             ; 13 03 12 02
-
-        punpckldq   mm0,            mm3             ; 30 20 10 00
-        punpckhdq   mm1,            mm3             ; 31 21 11 01
-
-        punpckldq   mm2,            mm4             ; 32 22 12 02
-        punpckhdq   mm5,            mm4             ; 33 23 13 03
-
-        movq        mm3,            mm5             ; 33 23 13 03
-
-        psubw       mm0,            mm2             ; b1= 0-2
-        paddw       mm2,            mm2             ;
-
-        movq        mm5,            mm1
-        paddw       mm2,            mm0             ; a1 =0+2
-
-        pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
-        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
-
-        movq        mm7,            mm3             ;
-        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
-
-        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       mm7,            mm5             ; c1
-
-        movq        mm5,            mm1
-        movq        mm4,            mm3
-
-        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
-        paddw       mm5,            mm1
-
-        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
-        paddw       mm3,            mm4
-
-        paddw       mm3,            mm5             ; d1
-        paddw       mm0,            [GLOBAL(fours)]
-
-        paddw       mm2,            [GLOBAL(fours)]
-        movq        mm6,            mm2             ; a1
-
-        movq        mm4,            mm0             ; b1
-        paddw       mm2,            mm3             ;0
-
-        paddw       mm4,            mm7             ;1
-        psubw       mm0,            mm7             ;2
-
-        psubw       mm6,            mm3             ;3
-        psraw       mm2,            3
-
-        psraw       mm0,            3
-        psraw       mm4,            3
-
-        psraw       mm6,            3
-
-        movq        mm1,            mm2             ; 03 02 01 00
-        movq        mm3,            mm4             ; 23 22 21 20
-
-        punpcklwd   mm1,            mm0             ; 11 01 10 00
-        punpckhwd   mm2,            mm0             ; 13 03 12 02
-
-        punpcklwd   mm3,            mm6             ; 31 21 30 20
-        punpckhwd   mm4,            mm6             ; 33 23 32 22
-
-        movq        mm0,            mm1             ; 11 01 10 00
-        movq        mm5,            mm2             ; 13 03 12 02
-
-        punpckldq   mm0,            mm3             ; 30 20 10 00
-        punpckhdq   mm1,            mm3             ; 31 21 11 01
-
-        punpckldq   mm2,            mm4             ; 32 22 12 02
-        punpckhdq   mm5,            mm4             ; 33 23 13 03
-
-        pxor        mm7,            mm7
-
-        movd        mm4,            [rdx]
-        punpcklbw   mm4,            mm7
-        paddsw      mm0,            mm4
-        packuswb    mm0,            mm7
-        movd        [rdx],          mm0
-
-        movd        mm4,            [rdx+rax]
-        punpcklbw   mm4,            mm7
-        paddsw      mm1,            mm4
-        packuswb    mm1,            mm7
-        movd        [rdx+rax],      mm1
-
-        movd        mm4,            [rdx+2*rax]
-        punpcklbw   mm4,            mm7
-        paddsw      mm2,            mm4
-        packuswb    mm2,            mm7
-        movd        [rdx+rax*2],    mm2
-
-        add         rdx,            rax
-
-        movd        mm4,            [rdx+2*rax]
-        punpcklbw   mm4,            mm7
-        paddsw      mm5,            mm4
-        packuswb    mm5,            mm7
-        movd        [rdx+rax*2],    mm5
-
-    ; begin epilog
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 SECTION_RODATA
 align 16
 x_s1sqr2:
diff --git a/vp8/decoder/x86/dequantize_x86.h b/vp8/decoder/x86/dequantize_x86.h
index dc68daab3..49bcb7f19 100644
--- a/vp8/decoder/x86/dequantize_x86.h
+++ b/vp8/decoder/x86/dequantize_x86.h
@@ -22,8 +22,6 @@
 #if HAVE_MMX
 extern prototype_dequant_block(vp8_dequantize_b_mmx);
 extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx);
-extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_mmx);
-extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_mmx);
 extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_mmx);
 extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);
 
@@ -34,12 +32,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);
 #undef  vp8_dequant_idct_add
 #define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
 
-#undef  vp8_dequant_dc_idct_add
-#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_mmx
-
-#undef vp8_dequant_dc_idct_add_y_block
-#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_mmx
-
 #undef vp8_dequant_idct_add_y_block
 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_mmx
 
@@ -50,14 +42,10 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);
 #endif
 
 #if HAVE_SSE2
-extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_sse2);
 extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_sse2);
 extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_sse2);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_dequant_dc_idct_add_y_block
-#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_sse2
-
 #undef vp8_dequant_idct_add_y_block
 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2
 
diff --git a/vp8/decoder/x86/idct_blk_mmx.c b/vp8/decoder/x86/idct_blk_mmx.c
index 37de5b9fd..29276e5d7 100644
--- a/vp8/decoder/x86/idct_blk_mmx.c
+++ b/vp8/decoder/x86/idct_blk_mmx.c
@@ -12,41 +12,6 @@
 #include "vp8/common/idct.h"
 #include "vp8/decoder/dequantize.h"
 
-void vp8_dequant_dc_idct_add_y_block_mmx
-            (short *q, short *dq,
-             unsigned char *dst, int stride, char *eobs, short *dc)
-{
-    int i;
-
-    for (i = 0; i < 4; i++)
-    {
-        if (eobs[0] > 1)
-            vp8_dequant_dc_idct_add_mmx (q, dq, dst, stride, dc[0]);
-        else if (eobs[0] == 1)
-            vp8_dc_only_idct_add_mmx (dc[0], dst, stride, dst, stride);
-
-        if (eobs[1] > 1)
-            vp8_dequant_dc_idct_add_mmx (q+16, dq, dst+4, stride, dc[1]);
-        else if (eobs[1] == 1)
-            vp8_dc_only_idct_add_mmx (dc[1], dst+4, stride, dst+4, stride);
-
-        if (eobs[2] > 1)
-            vp8_dequant_dc_idct_add_mmx (q+32, dq, dst+8, stride, dc[2]);
-        else if (eobs[2] == 1)
-            vp8_dc_only_idct_add_mmx (dc[2], dst+8, stride, dst+8, stride);
-
-        if (eobs[3] > 1)
-            vp8_dequant_dc_idct_add_mmx (q+48, dq, dst+12, stride, dc[3]);
-        else if (eobs[3] == 1)
-            vp8_dc_only_idct_add_mmx (dc[3], dst+12, stride, dst+12, stride);
-
-        q    += 64;
-        dc   += 4;
-        dst  += 4*stride;
-        eobs += 4;
-    }
-}
-
 void vp8_dequant_idct_add_y_block_mmx
             (short *q, short *dq,
              unsigned char *dst, int stride, char *eobs)
diff --git a/vp8/decoder/x86/idct_blk_sse2.c b/vp8/decoder/x86/idct_blk_sse2.c
index 0495b0610..03c2878c1 100644
--- a/vp8/decoder/x86/idct_blk_sse2.c
+++ b/vp8/decoder/x86/idct_blk_sse2.c
@@ -12,13 +12,6 @@
 #include "vp8/common/idct.h"
 #include "vp8/decoder/dequantize.h"
 
-void vp8_idct_dequant_dc_0_2x_sse2
-            (short *q, short *dq,
-             unsigned char *dst, int dst_stride, short *dc);
-void vp8_idct_dequant_dc_full_2x_sse2
-            (short *q, short *dq,
-             unsigned char *dst, int dst_stride, short *dc);
-
 void vp8_idct_dequant_0_2x_sse2
             (short *q, short *dq ,
              unsigned char *dst, int dst_stride);
@@ -26,36 +19,6 @@ void vp8_idct_dequant_full_2x_sse2
             (short *q, short *dq ,
              unsigned char *dst, int dst_stride);
 
-void vp8_dequant_dc_idct_add_y_block_sse2
-            (short *q, short *dq,
-             unsigned char *dst, int stride, char *eobs, short *dc)
-{
-    int i;
-
-    for (i = 0; i < 4; i++)
-    {
-        if (((short *)(eobs))[0])
-        {
-            if (((short *)(eobs))[0] & 0xfefe)
-                vp8_idct_dequant_dc_full_2x_sse2 (q, dq, dst, stride, dc);
-            else
-                vp8_idct_dequant_dc_0_2x_sse2 (q, dq, dst, stride, dc);
-        }
-
-        if (((short *)(eobs))[1])
-        {
-            if (((short *)(eobs))[1] & 0xfefe)
-                vp8_idct_dequant_dc_full_2x_sse2 (q+32, dq, dst+8, stride, dc+2);
-            else
-                vp8_idct_dequant_dc_0_2x_sse2 (q+32, dq, dst+8, stride, dc+2);
-        }
-        q    += 64;
-        dc   += 4;
-        dst  += stride*4;
-        eobs += 4;
-    }
-}
-
 void vp8_dequant_idct_add_y_block_sse2
             (short *q, short *dq,
              unsigned char *dst, int stride, char *eobs)
diff --git a/vp8/decoder/x86/x86_dsystemdependent.c b/vp8/decoder/x86/x86_dsystemdependent.c
index 443150483..424052c1b 100644
--- a/vp8/decoder/x86/x86_dsystemdependent.c
+++ b/vp8/decoder/x86/x86_dsystemdependent.c
@@ -43,8 +43,6 @@ void vp8_arch_x86_decode_init(VP8D_COMP *pbi)
     {
         pbi->dequant.block               = vp8_dequantize_b_mmx;
         pbi->dequant.idct_add            = vp8_dequant_idct_add_mmx;
-        pbi->dequant.dc_idct_add         = vp8_dequant_dc_idct_add_mmx;
-        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_mmx;
         pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_mmx;
         pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_mmx;
     }
@@ -52,8 +50,6 @@ void vp8_arch_x86_decode_init(VP8D_COMP *pbi)
 #if HAVE_SSE2
     if (flags & HAS_SSE2)
     {
-        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_sse2;
-        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_sse2;
         pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_sse2;
     }
 #endif
diff --git a/vp8/vp8dx_arm.mk b/vp8/vp8dx_arm.mk
index 6bde42f4c..b08f9464f 100644
--- a/vp8/vp8dx_arm.mk
+++ b/vp8/vp8dx_arm.mk
@@ -16,14 +16,11 @@ VP8_DX_SRCS-$(ARCH_ARM)  += decoder/arm/dequantize_arm.c
 VP8_DX_SRCS-$(ARCH_ARM)  += decoder/arm/dequantize_arm.h
 
 #File list for armv6
-VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequant_dc_idct_v6$(ASM)
 VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequant_idct_v6$(ASM)
 VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequantize_v6$(ASM)
 VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/idct_blk_v6.c
 
 #File list for neon
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM)
 VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/dequant_idct_neon$(ASM)
 VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM)
 VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM)
author	Scott LaVarnway <slavarnway@google.com>	2011-11-17 12:54:42 -0500
committer	Tero Rintaluoma <teror@google.com>	2011-11-25 09:24:04 +0200
commit	4a91541c946c1fc2655a942ec79033618f03c4ca (patch)
tree	70093355ebd25dd2c79515f7950c8490f6937355
parent	7b0feac4a4386eef3e1ea851e52e4f30935e255d (diff)
download	libvpx-4a91541c946c1fc2655a942ec79033618f03c4ca.tar.gz