diff options
Diffstat (limited to 'libavcodec/x86/v210enc.asm')
-rw-r--r-- | libavcodec/x86/v210enc.asm | 38 |
1 files changed, 22 insertions, 16 deletions
diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index 0db0196313..965f2bea3c 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -2,20 +2,20 @@
 ;* V210 SIMD pack
 ;* Copyright (c) 2014 Kieran Kunhya <kierank@obe.tv>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -23,8 +23,9 @@
 
 SECTION_RODATA 32
 
-v210_enc_min_10: times 32 dw 0x4
-v210_enc_max_10: times 32 dw 0x3fb
+cextern pw_4
+%define v210_enc_min_10 pw_4
+v210_enc_max_10: times 16 dw 0x3fb
 
 v210_enc_luma_mult_10: times 2 dw 4,1,16,4,1,16,0,0
 v210_enc_luma_shuf_10: times 2 db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11
@@ -32,16 +33,19 @@ v210_enc_luma_shuf_10: times 2 db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11
 v210_enc_chroma_mult_10: times 2 dw 1,4,16,0,16,1,4,0
 v210_enc_chroma_shuf_10: times 2 db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1
 
-v210_enc_min_8: times 32 db 0x1
-v210_enc_max_8: times 32 db 0xfe
+cextern pb_1
+%define v210_enc_min_8 pb_1
+cextern pb_FE
+%define v210_enc_max_8 pb_FE
 
-v210_enc_luma_mult_8: times 2 dw 16,4,64,16,4,64,0,0
 v210_enc_luma_shuf_8: times 2 db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1
+v210_enc_luma_mult_8: times 2 dw 16,4,64,16,4,64,0,0
 
-v210_enc_chroma_mult_8: times 2 dw 4,16,64,0,64,4,16,0
 v210_enc_chroma_shuf1_8: times 2 db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1
 v210_enc_chroma_shuf2_8: times 2 db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1
 
+v210_enc_chroma_mult_8: times 2 dw 4,16,64,0,64,4,16,0
+
 SECTION .text
 
 %macro v210_planar_pack_10 0
@@ -59,16 +63,16 @@ cglobal v210_planar_pack_10, 5, 5, 4+cpuflag(avx2), y, u, v, dst, width
 .loop:
     movu xm0, [yq+2*widthq]
 %if cpuflag(avx2)
-    vinserti128 m0, m0, [yq+2*widthq+12], 1
+    vinserti128 m0, m0, [yq+widthq*2+12], 1
 %endif
     CLIPW m0, m2, m3
 
-    movq xm1, [uq+widthq]
-    movhps xm1, [vq+widthq]
+    movq xm1, [uq+widthq]
+    movhps xm1, [vq+widthq]
 %if cpuflag(avx2)
     movq xm4, [uq+widthq+6]
     movhps xm4, [vq+widthq+6]
-    vinserti128 m1, m1, xm4, 1
+    vinserti128 m1, m1, xm4, 1
 %endif
     CLIPW m1, m2, m3
 
@@ -93,6 +97,7 @@ cglobal v210_planar_pack_10, 5, 5, 4+cpuflag(avx2), y, u, v, dst, width
 INIT_XMM ssse3
 v210_planar_pack_10
 %endif
+
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
 v210_planar_pack_10
@@ -113,9 +118,9 @@ cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width
     pxor m6, m6
 
 .loop:
-    movu xm1, [yq+2*widthq]
+    movu xm1, [yq+widthq*2]
 %if cpuflag(avx2)
-    vinserti128 m1, m1, [yq+2*widthq+12], 1
+    vinserti128 m1, m1, [yq+widthq*2+12], 1
 %endif
     CLIPUB m1, m4, m5
 
@@ -172,6 +177,7 @@ v210_planar_pack_8
 INIT_XMM avx
 v210_planar_pack_8
 %endif
+
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
 v210_planar_pack_8