 libavcodec/x86/vp3dsp.asm | 123 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 82 insertions(+), 41 deletions(-)
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index 078e9db99a..fc8a047224 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -501,22 +501,22 @@ cglobal vp3_h_loop_filter, 3, 4
; at this point, function has completed dequantization + dezigzag +
; partial transposition; now do the idct itself
-%define I(x) [%1+16* x ]
-%define J(x) [%1+16*(x-4)+8]
+%define I(x) [%1+16*x]
+%define J(x) [%1+16*x]
RowIDCT
Transpose
-%define I(x) [%1+16* x +64]
-%define J(x) [%1+16*(x-4)+72]
+%define I(x) [%1+16*x+8]
+%define J(x) [%1+16*x+8]
RowIDCT
Transpose
-%define I(x) [%1+16*x]
-%define J(x) [%1+16*x]
+%define I(x) [%1+16* x]
+%define J(x) [%1+16*(x-4)+8]
ColumnIDCT
-%define I(x) [%1+16*x+8]
-%define J(x) [%1+16*x+8]
+%define I(x) [%1+16* x +64]
+%define J(x) [%1+16*(x-4)+72]
ColumnIDCT
%endif ; mmsize == 16/8
%endmacro
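
[Annotation: the hunk above swaps the I()/J() addressing schemes between the row and
column IDCT passes, which moves where the intermediate 16-bit coefficients live in the
128-byte block buffer. As a reading aid (not part of the patch), here is a minimal C
translation of the two schemes, with the asm's byte offsets converted to int16_t
indices; the I_linear/J_linear/I_split/J_split names are made up for illustration:

    #include <stdint.h>

    /* Each 16-byte line of the buffer holds 8 int16_t coefficients.
     * "Linear": both macros address line x directly.
     * "Split":  rows 4-7 are folded into the high 8 bytes of lines 0-3,
     * so one 16-byte access touches halves of two different rows.     */
    static int16_t *I_linear(int16_t *b, int x) { return b + 8*x; }         /* [%1+16*x]       */
    static int16_t *J_linear(int16_t *b, int x) { return b + 8*x; }         /* [%1+16*x]       */
    static int16_t *I_split (int16_t *b, int x) { return b + 8*x; }         /* [%1+16* x ]     */
    static int16_t *J_split (int16_t *b, int x) { return b + 8*(x-4) + 4; } /* [%1+16*(x-4)+8] */

The +8, +64 and +72 displacements used by the second passes are these same two schemes
shifted by 8 or 64 bytes. After the patch the two RowIDCT passes read linearly and the
two ColumnIDCT passes take over the split layout; the consumers in vp3_idct_put and
vp3_idct_add are updated to match in the hunks below.]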
@@ -534,10 +534,17 @@ cglobal vp3_idct_put, 3, 4, 9
mova m1, [r2+mmsize*2+%%i]
mova m2, [r2+mmsize*4+%%i]
mova m3, [r2+mmsize*6+%%i]
+%if mmsize == 8
+ packsswb m0, [r2+mmsize*8+%%i]
+ packsswb m1, [r2+mmsize*10+%%i]
+ packsswb m2, [r2+mmsize*12+%%i]
+ packsswb m3, [r2+mmsize*14+%%i]
+%else
packsswb m0, [r2+mmsize*1+%%i]
packsswb m1, [r2+mmsize*3+%%i]
packsswb m2, [r2+mmsize*5+%%i]
packsswb m3, [r2+mmsize*7+%%i]
+%endif
paddb m0, m4
paddb m1, m4
paddb m2, m4
@@ -561,7 +568,7 @@ cglobal vp3_idct_put, 3, 4, 9
movq [r0+r1*2], m3
movhps [r0+r3 ], m3
%endif
-%assign %%i %%i+64
+%assign %%i %%i+8
%endrep
pxor m0, m0
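
[Annotation: packsswb narrows signed 16-bit words to signed bytes with saturation,
taking the low bytes from the destination and the high bytes from the source operand.
A scalar C model of the MMX form (hypothetical helper names, not FFmpeg API):

    #include <stdint.h>

    static int8_t sat_s8(int v) { return v < -128 ? -128 : v > 127 ? 127 : (int8_t)v; }

    /* packsswb mm_dst, mm_src: bytes 0-3 come from dst's four words,
     * bytes 4-7 from src's four words, both saturated to [-128,127].  */
    static void packsswb_mmx(int8_t dst[8], const int16_t lo[4], const int16_t hi[4])
    {
        for (int i = 0; i < 4; i++) {
            dst[i]     = sat_s8(lo[i]);
            dst[i + 4] = sat_s8(hi[i]);
        }
    }

As the new offsets read, the relocated layout puts the matching high half of each MMX
row 64 bytes (mmsize*8) after its low half rather than 8 bytes, and the %%rep now steps
%%i across the two 8-byte halves of a 16-byte line instead of jumping 64 bytes per
iteration; the SSE2 path keeps the old adjacent-line pairing in the %else branch.]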
@@ -575,47 +582,81 @@ cglobal vp3_idct_put, 3, 4, 9
cglobal vp3_idct_add, 3, 4, 9
VP3_IDCT r2
- mov r3, 4
- pxor m4, m4
movsxdifnidn r1, r1d
-.loop:
+ lea r3, [r1*3]
+ pxor m4, m4
+%if mmsize == 16
+%assign %%i 0
+%rep 2
movq m0, [r0]
movq m1, [r0+r1]
-%if mmsize == 8
- mova m2, m0
- mova m3, m1
-%endif
+ movq m2, [r0+r1*2]
+ movq m3, [r0+r3]
punpcklbw m0, m4
punpcklbw m1, m4
-%if mmsize == 8
- punpckhbw m2, m4
- punpckhbw m3, m4
-%endif
- paddsw m0, [r2+ 0]
- paddsw m1, [r2+16]
-%if mmsize == 8
- paddsw m2, [r2+ 8]
- paddsw m3, [r2+24]
- packuswb m0, m2
- packuswb m1, m3
-%else ; mmsize == 16
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+ paddsw m0, [r2+ 0+%%i]
+ paddsw m1, [r2+16+%%i]
+ paddsw m2, [r2+32+%%i]
+ paddsw m3, [r2+48+%%i]
packuswb m0, m1
+ packuswb m2, m3
+ movq [r0 ], m0
+ movhps [r0+r1 ], m0
+ movq [r0+r1*2], m2
+ movhps [r0+r3 ], m2
+%if %%i == 0
+ lea r0, [r0+r1*4]
%endif
- movq [r0 ], m0
-%if mmsize == 8
- movq [r0+r1], m1
-%else ; mmsize == 16
- movhps [r0+r1], m0
+%assign %%i %%i+64
+%endrep
+%else
+%assign %%i 0
+%rep 2
+ movq m0, [r0]
+ movq m1, [r0+r1]
+ movq m2, [r0+r1*2]
+ movq m3, [r0+r3]
+ movq m5, m0
+ movq m6, m1
+ movq m7, m2
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpckhbw m5, m4
+ punpckhbw m6, m4
+ punpckhbw m7, m4
+ paddsw m0, [r2+ 0+%%i]
+ paddsw m1, [r2+16+%%i]
+ paddsw m2, [r2+32+%%i]
+ paddsw m5, [r2+64+%%i]
+ paddsw m6, [r2+80+%%i]
+ paddsw m7, [r2+96+%%i]
+ packuswb m0, m5
+ movq m5, m3
+ punpcklbw m3, m4
+ punpckhbw m5, m4
+ packuswb m1, m6
+ paddsw m3, [r2+48+%%i]
+ paddsw m5, [r2+112+%%i]
+ packuswb m2, m7
+ packuswb m3, m5
+ movq [r0 ], m0
+ movq [r0+r1 ], m1
+ movq [r0+r1*2], m2
+ movq [r0+r3 ], m3
+%if %%i == 0
+ lea r0, [r0+r1*4]
%endif
- lea r0, [r0+r1*2]
-%assign %%offset 0
-%rep 32/mmsize
- mova [r2+%%offset], m4
-%assign %%offset %%offset+mmsize
+%assign %%i %%i+8
+%endrep
+%endif
+%assign %%i 0
+%rep 128/mmsize
+ mova [r2+%%i], m4
+%assign %%i %%i+mmsize
%endrep
- add r2, 32
- dec r3
- jg .loop
RET
%endmacro
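
[Annotation: taken together, the last hunk drops vp3_idct_add's counted .loop (four
iterations of two pixel rows, clearing 32 bytes of coefficients each) in favour of two
unrolled four-row iterations, with lea r3, [r1*3] precomputing stride*3 so all four rows
are addressable without pointer updates, followed by a single pass that zeroes all 128
bytes. Ignoring the half-interleaved coefficient layout the asm actually reads, a scalar
C model of the function's net effect, assuming a plain row-major int16_t block (names
are illustrative, not FFmpeg's):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

    /* Add the IDCT residual to the destination with unsigned saturation
     * (punpcklbw/paddsw/packuswb in the asm), then zero the residual
     * buffer so the next block starts from cleared coefficients.       */
    static void idct_add_model(uint8_t *dst, ptrdiff_t stride, int16_t block[64])
    {
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                dst[y*stride + x] = clip_u8(dst[y*stride + x] + block[y*8 + x]);
        memset(block, 0, 64 * sizeof(*block));
    }
]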