Diffstat (limited to 'libavcodec/x86/vp3dsp.asm')
-rw-r--r-- | libavcodec/x86/vp3dsp.asm | 123
1 files changed, 82 insertions, 41 deletions
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index 078e9db99a..fc8a047224 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -501,22 +501,22 @@ cglobal vp3_h_loop_filter, 3, 4
 
 ; at this point, function has completed dequantization + dezigzag +
 ; partial transposition; now do the idct itself
-%define I(x) [%1+16* x     ]
-%define J(x) [%1+16*(x-4)+8]
+%define I(x) [%1+16*x]
+%define J(x) [%1+16*x]
     RowIDCT
     Transpose
 
-%define I(x) [%1+16* x   +64]
-%define J(x) [%1+16*(x-4)+72]
+%define I(x) [%1+16*x+8]
+%define J(x) [%1+16*x+8]
     RowIDCT
     Transpose
 
-%define I(x) [%1+16*x]
-%define J(x) [%1+16*x]
+%define I(x) [%1+16* x     ]
+%define J(x) [%1+16*(x-4)+8]
     ColumnIDCT
 
-%define I(x) [%1+16*x+8]
-%define J(x) [%1+16*x+8]
+%define I(x) [%1+16* x   +64]
+%define J(x) [%1+16*(x-4)+72]
     ColumnIDCT
 %endif ; mmsize == 16/8
 %endmacro
@@ -534,10 +534,17 @@ cglobal vp3_idct_put, 3, 4, 9
     mova         m1, [r2+mmsize*2+%%i]
     mova         m2, [r2+mmsize*4+%%i]
     mova         m3, [r2+mmsize*6+%%i]
+%if mmsize == 8
+    packsswb     m0, [r2+mmsize*8+%%i]
+    packsswb     m1, [r2+mmsize*10+%%i]
+    packsswb     m2, [r2+mmsize*12+%%i]
+    packsswb     m3, [r2+mmsize*14+%%i]
+%else
     packsswb     m0, [r2+mmsize*1+%%i]
     packsswb     m1, [r2+mmsize*3+%%i]
     packsswb     m2, [r2+mmsize*5+%%i]
     packsswb     m3, [r2+mmsize*7+%%i]
+%endif
     paddb        m0, m4
     paddb        m1, m4
     paddb        m2, m4
@@ -561,7 +568,7 @@ cglobal vp3_idct_put, 3, 4, 9
     movq   [r0+r1*2], m3
     movhps [r0+r3  ], m3
 %endif
-%assign %%i %%i+64
+%assign %%i %%i+8
 %endrep
 
     pxor         m0, m0
@@ -575,47 +582,81 @@ cglobal vp3_idct_put, 3, 4, 9
 cglobal vp3_idct_add, 3, 4, 9
     VP3_IDCT     r2
 
-    mov          r3, 4
-    pxor         m4, m4
     movsxdifnidn r1, r1d
-.loop:
+    lea          r3, [r1*3]
+    pxor         m4, m4
+%if mmsize == 16
+%assign %%i 0
+%rep 2
     movq         m0, [r0]
     movq         m1, [r0+r1]
-%if mmsize == 8
-    mova         m2, m0
-    mova         m3, m1
-%endif
+    movq         m2, [r0+r1*2]
+    movq         m3, [r0+r3]
     punpcklbw    m0, m4
     punpcklbw    m1, m4
-%if mmsize == 8
-    punpckhbw    m2, m4
-    punpckhbw    m3, m4
-%endif
-    paddsw       m0, [r2+ 0]
-    paddsw       m1, [r2+16]
-%if mmsize == 8
-    paddsw       m2, [r2+ 8]
-    paddsw       m3, [r2+24]
-    packuswb     m0, m2
-    packuswb     m1, m3
-%else ; mmsize == 16
+    punpcklbw    m2, m4
+    punpcklbw    m3, m4
+    paddsw       m0, [r2+ 0+%%i]
+    paddsw       m1, [r2+16+%%i]
+    paddsw       m2, [r2+32+%%i]
+    paddsw       m3, [r2+48+%%i]
     packuswb     m0, m1
+    packuswb     m2, m3
+    movq   [r0     ], m0
+    movhps [r0+r1  ], m0
+    movq   [r0+r1*2], m2
+    movhps [r0+r3  ], m2
+%if %%i == 0
+    lea          r0, [r0+r1*4]
 %endif
-    movq   [r0   ], m0
-%if mmsize == 8
-    movq   [r0+r1], m1
-%else ; mmsize == 16
-    movhps [r0+r1], m0
+%assign %%i %%i+64
+%endrep
+%else
+%assign %%i 0
+%rep 2
+    movq         m0, [r0]
+    movq         m1, [r0+r1]
+    movq         m2, [r0+r1*2]
+    movq         m3, [r0+r3]
+    movq         m5, m0
+    movq         m6, m1
+    movq         m7, m2
+    punpcklbw    m0, m4
+    punpcklbw    m1, m4
+    punpcklbw    m2, m4
+    punpckhbw    m5, m4
+    punpckhbw    m6, m4
+    punpckhbw    m7, m4
+    paddsw       m0, [r2+ 0+%%i]
+    paddsw       m1, [r2+16+%%i]
+    paddsw       m2, [r2+32+%%i]
+    paddsw       m5, [r2+64+%%i]
+    paddsw       m6, [r2+80+%%i]
+    paddsw       m7, [r2+96+%%i]
+    packuswb     m0, m5
+    movq         m5, m3
+    punpcklbw    m3, m4
+    punpckhbw    m5, m4
+    packuswb     m1, m6
+    paddsw       m3, [r2+48+%%i]
+    paddsw       m5, [r2+112+%%i]
+    packuswb     m2, m7
+    packuswb     m3, m5
+    movq   [r0     ], m0
+    movq   [r0+r1  ], m1
+    movq   [r0+r1*2], m2
+    movq   [r0+r3  ], m3
+%if %%i == 0
+    lea          r0, [r0+r1*4]
 %endif
-    lea          r0, [r0+r1*2]
-%assign %%offset 0
-%rep 32/mmsize
-    mova [r2+%%offset], m4
-%assign %%offset %%offset+mmsize
+%assign %%i %%i+8
+%endrep
+%endif
+%assign %%i 0
+%rep 128/mmsize
+    mova [r2+%%i], m4
+%assign %%i %%i+mmsize
 %endrep
-    add          r2, 32
-    dec          r3
-    jg .loop
     RET
 %endmacro
 
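Notes on the change: vp3_idct_add drops its counted loop (mov r3, 4 with dec r3 / jg .loop, and r2 bumped by 32 per iteration) in favour of two unrolled %rep passes of four rows each, using lea r3, [r1*3] so the rows are addressed as r0, r0+r1, r0+r1*2 and r0+r3. The SSE2 (mmsize == 16) and MMX (mmsize == 8) bodies are split into separate branches instead of interleaving %if mmsize == 8 blocks, and the clearing of the coefficient buffer is hoisted out of the pixel loop into a single %rep 128/mmsize sweep. The first hunk swaps the I()/J() addressing macros between the RowIDCT and ColumnIDCT passes of the MMX path; read together with the new offsets (the %%i step in vp3_idct_put changing from 64 to 8, and the [r2+64..112+%%i] reads here), this appears to keep the intermediate coefficients in the layout the unrolled store code expects.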
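As a reading aid only, here is a minimal C sketch of what the rewritten add stage computes per call. It is an assumption-laden illustration, not FFmpeg's C reference code: clip_uint8 and vp3_idct_add_sketch are hypothetical names, the real function runs the IDCT (VP3_IDCT r2) before this stage, and the asm indexes the residual in its own transposed/split layout rather than the plain row-major order used below.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Clamp to 0..255, the net effect of paddsw followed by packuswb. */
static uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* Hypothetical model: add the 8x8 IDCT residual to the prediction with
 * saturation, then zero the 128-byte coefficient block, mirroring the
 * trailing "mova [r2+%%i], m4" sweep in the asm. */
static void vp3_idct_add_sketch(uint8_t *dest, ptrdiff_t stride,
                                int16_t block[64])
{
    for (int y = 0; y < 8; y++) {   /* the asm covers four rows per %rep pass */
        for (int x = 0; x < 8; x++)
            dest[x] = clip_uint8(dest[x] + block[8 * y + x]);
        dest += stride;
    }
    memset(block, 0, 64 * sizeof(*block));
}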
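The likely motivation is the usual unrolling trade: code size grows (hence the diffstat's 82 insertions against 41 deletions), but the dec/jg loop-carried dependency and the add r2, 32 pointer bump disappear, r0 advances once per pass via lea r0, [r0+r1*4] (guarded by %if %%i == 0 so it is skipped after the final pass), and the coefficient clear runs as one straight pass over all 128 bytes.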