Diffstat (limited to 'libavcodec/x86/hevc_mc.asm')
-rw-r--r-- | libavcodec/x86/hevc_mc.asm | 2273
1 file changed, 1547 insertions(+), 726 deletions(-)
diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm index 16e5eefc69..ff6ed0711a 100644 --- a/libavcodec/x86/hevc_mc.asm +++ b/libavcodec/x86/hevc_mc.asm @@ -1,851 +1,1672 @@ -;***************************************************************************** -;* x86-optimized HEVC MC -;* Copyright 2015 Anton Khirnov -;* -;* This file is part of Libav. -;* -;* Libav is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* Libav is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - +; /* +; * Provide SSE luma and chroma mc functions for HEVC decoding +; * Copyright (c) 2013 Pierre-Edouard LEPERE +; * +; * This file is part of FFmpeg. +; * +; * FFmpeg is free software; you can redistribute it and/or +; * modify it under the terms of the GNU Lesser General Public +; * License as published by the Free Software Foundation; either +; * version 2.1 of the License, or (at your option) any later version. +; * +; * FFmpeg is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; * Lesser General Public License for more details. 
+; * +; * You should have received a copy of the GNU Lesser General Public +; * License along with FFmpeg; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +; */ %include "libavutil/x86/x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 +cextern pw_255 +cextern pw_512 +cextern pw_2048 +cextern pw_8192 +cextern pw_1023 +cextern pw_1024 +cextern pw_4096 +%define pw_8 pw_512 +%define pw_10 pw_2048 +%define pw_12 pw_8192 +%define pw_bi_10 pw_1024 +%define pw_bi_12 pw_4096 +%define max_pixels_8 pw_255 +%define max_pixels_10 pw_1023 +pw_bi_8: times 16 dw (1 << 8) +max_pixels_12: times 16 dw ((1 << 12)-1) +cextern pd_1 +cextern pb_0 + +%macro EPEL_TABLE 4 +hevc_epel_filters_%4_%1 times %2 d%3 -2, 58 + times %2 d%3 10, -2 + times %2 d%3 -4, 54 + times %2 d%3 16, -2 + times %2 d%3 -6, 46 + times %2 d%3 28, -4 + times %2 d%3 -4, 36 + times %2 d%3 36, -4 + times %2 d%3 -4, 28 + times %2 d%3 46, -6 + times %2 d%3 -2, 16 + times %2 d%3 54, -4 + times %2 d%3 -2, 10 + times %2 d%3 58, -2 +%endmacro -pw_1023: times 8 dw 1023 -cextern hevc_qpel_coeffs -cextern hevc_qpel_coeffs8 +EPEL_TABLE 8,16, b, avx2 +EPEL_TABLE 10, 8, w, avx2 + +EPEL_TABLE 8, 8, b, sse4 +EPEL_TABLE 10, 4, w, sse4 +EPEL_TABLE 12, 4, w, sse4 + +%macro QPEL_TABLE 4 +hevc_qpel_filters_%4_%1 times %2 d%3 -1, 4 + times %2 d%3 -10, 58 + times %2 d%3 17, -5 + times %2 d%3 1, 0 + times %2 d%3 -1, 4 + times %2 d%3 -11, 40 + times %2 d%3 40,-11 + times %2 d%3 4, -1 + times %2 d%3 0, 1 + times %2 d%3 -5, 17 + times %2 d%3 58,-10 + times %2 d%3 4, -1 +%endmacro -cextern hevc_epel_coeffs -cextern hevc_epel_coeffs8 +QPEL_TABLE 8, 8, b, sse4 +QPEL_TABLE 10, 4, w, sse4 +QPEL_TABLE 12, 4, w, sse4 -cextern pw_8 -cextern pw_16 -cextern pw_32 -cextern pw_64 +QPEL_TABLE 8,16, b, avx2 +QPEL_TABLE 10, 8, w, avx2 SECTION .text -; %1: width -; %2: bit depth -%macro COMMON_DEFS 2 - %assign blocksize 8 - %assign nb_blocks ((%1 + blocksize - 1) / blocksize) - %define last_block_truncated (blocksize * nb_blocks > %1) - %if %2 > 8 - %define LOAD_BLOCK movu - %define LOAD_HALFBLOCK movq - %assign pixelsize 2 - %else - %define LOAD_BLOCK movq - %define LOAD_HALFBLOCK movd - %assign pixelsize 1 - %endif - %define STORE_BLOCK mova - %define STORE_HALFBLOCK movq -%endmacro - -; %1: block index -%macro BLOCK_DEFS 1 - %if last_block_truncated && %1 == nb_blocks - 1 - %define block_truncated 1 - %define LOAD LOAD_HALFBLOCK - %define STORE STORE_HALFBLOCK - %else - %define block_truncated 0 - %define LOAD LOAD_BLOCK - %define STORE STORE_BLOCK - %endif -%endmacro - - -; hevc_get_pixels_<w>_<d>(int16_t *dst, ptrdiff_t dststride, -; pixel *src, ptrdiff_t srcstride, -; int height, int mx, int my, int *mcbuffer) - -; %1: block width -; %2: bit depth -; %3: log2 of height unroll -%macro GET_PIXELS 3 -cglobal hevc_get_pixels_ %+ %1 %+ _ %+ %2, 5, 5, 2, dst, dststride, src, srcstride, height ; rest of the args unused - - %assign shift 14 - %2 - COMMON_DEFS %1, %2 - -%if pixelsize == 1 - pxor m0, m0 -%endif - - shr heightd, %3 - -.loop: +%define MAX_PB_SIZE 64 -%assign i 0 -%rep (1 << %3) +%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10 -%assign j 0 -%rep nb_blocks +%define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10 - BLOCK_DEFS j +%if ARCH_X86_64 - LOAD m1, [srcq + j * pixelsize * blocksize] -%if pixelsize == 1 - punpcklbw m1, m0 +%macro SIMPLE_BILOAD 4 ;width, tab, r1, r2 +%if %1 <= 4 + movq %3, [%2] ; load data from source2 +%elif %1 <= 8 + movdqa %3, [%2] ; load data from source2 +%elif %1 <= 
12 +%if cpuflag(avx2) + mova %3, [%2] +%else + movdqa %3, [%2] ; load data from source2 + movq %4, [%2+16] ; load data from source2 +%endif ;avx +%elif %1 <= 16 +%if cpuflag(avx2) + mova %3, [%2] +%else + movdqa %3, [%2] ; load data from source2 + movdqa %4, [%2+16] ; load data from source2 +%endif ; avx +%else ; %1 = 32 + mova %3, [%2] + mova %4, [%2+32] %endif - psllw m1, shift - STORE [dstq + j * 2 * blocksize], m1 - -%assign j (j + 1) -%endrep +%endmacro - add dstq, dststrideq - add srcq, srcstrideq +%macro SIMPLE_LOAD 4 ;width, bitd, tab, r1 +%if %1 == 2 || (%2 == 8 && %1 <= 4) + movd %4, [%3] ; load data from source +%elif %1 == 4 || (%2 == 8 && %1 <= 8) + movq %4, [%3] ; load data from source +%elif notcpuflag(avx) + movu %4, [%3] ; load data from source +%elif %1 <= 8 || (%2 == 8 && %1 <= 16) + movdqu %4, [%3] +%else + movu %4, [%3] +%endif +%endmacro -%assign i (i + 1) -%endrep - dec heightd - jg .loop - RET +%macro EPEL_FILTER 5 ; bit depth, filter index, xmma, xmmb, gprtmp +%if cpuflag(avx2) +%assign %%offset 32 +%ifdef PIC + lea %5q, [hevc_epel_filters_avx2_%1] + %define FILTER %5q +%else + %define FILTER hevc_epel_filters_avx2_%1 +%endif +%else +%assign %%offset 16 +%ifdef PIC + lea %5q, [hevc_epel_filters_sse4_%1] + %define FILTER %5q +%else + %define FILTER hevc_epel_filters_sse4_%1 +%endif +%endif ;cpuflag(avx2) + sub %2q, 1 +%if cpuflag(avx2) + shl %2q, 6 ; multiply by 64 + %else + shl %2q, 5 ; multiply by 32 +%endif + mova %3, [FILTER + %2q] ; get 2 first values of filters + mova %4, [FILTER + %2q+%%offset] ; get 2 last values of filters %endmacro -INIT_XMM sse2 -GET_PIXELS 4, 8, 1 -GET_PIXELS 8, 8, 1 -GET_PIXELS 12, 8, 3 -GET_PIXELS 16, 8, 2 -GET_PIXELS 24, 8, 3 -GET_PIXELS 32, 8, 3 -GET_PIXELS 48, 8, 3 -GET_PIXELS 64, 8, 3 - -GET_PIXELS 4, 10, 1 -GET_PIXELS 8, 10, 1 -GET_PIXELS 12, 10, 3 -GET_PIXELS 16, 10, 2 -GET_PIXELS 24, 10, 3 -GET_PIXELS 32, 10, 3 -GET_PIXELS 48, 10, 3 -GET_PIXELS 64, 10, 3 - -; hevc_qpel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride, -; uint8_t *src, ptrdiff_t srcstride, -; int height, int mx, int my, int *mcbuffer) - -; 8-bit qpel interpolation -; %1: block width -; %2: 0 - horizontal; 1 - vertical -%macro QPEL_8 2 -%if %2 - %define postfix v - %define mvfrac myq - %define coeffsaddr r5q - %define pixstride srcstrideq - %define pixstride3 r5q - %define src_m3 r6q -%else - %define postfix h - %define mvfrac mxq - %define coeffsaddr r6q - %define pixstride 1 - %define pixstride3 3 - %define src_m3 (srcq - 3) -%endif - - COMMON_DEFS %1, 8 - -cglobal hevc_qpel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 7, dst, dststride, src, srcstride, height, mx, my - and mvfrac, 0x3 - dec mvfrac - shl mvfrac, 4 - lea coeffsaddr, [hevc_qpel_coeffs8] - mova m0, [coeffsaddr + mvfrac] - - SPLATW m1, m0, 1 - SPLATW m2, m0, 2 - SPLATW m3, m0, 3 - SPLATW m0, m0, 0 - -%if %2 - lea pixstride3, [srcstrideq + 2 * srcstrideq] - mov src_m3, srcq - sub src_m3, pixstride3 +%macro EPEL_HV_FILTER 1 +%if cpuflag(avx2) +%assign %%offset 32 +%assign %%shift 6 +%define %%table hevc_epel_filters_avx2_%1 +%else +%assign %%offset 16 +%assign %%shift 5 +%define %%table hevc_epel_filters_sse4_%1 %endif -.loop: - -%assign i 0 -%rep nb_blocks +%ifdef PIC + lea r3srcq, [%%table] + %define FILTER r3srcq +%else + %define FILTER %%table +%endif + sub mxq, 1 + sub myq, 1 + shl mxq, %%shift ; multiply by 32 + shl myq, %%shift ; multiply by 32 + mova m14, [FILTER + mxq] ; get 2 first values of filters + mova m15, [FILTER + mxq+%%offset] ; get 2 last values of filters + +%if cpuflag(avx2) +%define %%table 
hevc_epel_filters_avx2_10 +%else +%define %%table hevc_epel_filters_sse4_10 +%endif +%ifdef PIC + lea r3srcq, [%%table] + %define FILTER r3srcq +%else + %define FILTER %%table +%endif + mova m12, [FILTER + myq] ; get 2 first values of filters + mova m13, [FILTER + myq+%%offset] ; get 2 last values of filters + lea r3srcq, [srcstrideq*3] +%endmacro - BLOCK_DEFS i +%macro QPEL_FILTER 2 - LOAD m4, [src_m3 + i * blocksize] - LOAD m5, [src_m3 + i * blocksize + 1 * pixstride] - punpcklbw m4, m5 - pmaddubsw m4, m0 +%if cpuflag(avx2) +%assign %%offset 32 +%assign %%shift 7 +%define %%table hevc_qpel_filters_avx2_%1 +%else +%assign %%offset 16 +%assign %%shift 6 +%define %%table hevc_qpel_filters_sse4_%1 +%endif - LOAD m5, [src_m3 + i * blocksize + 2 * pixstride] - LOAD m6, [srcq + i * blocksize] - punpcklbw m5, m6 - pmaddubsw m5, m1 - paddsw m4, m5 +%ifdef PIC + lea rfilterq, [%%table] +%else + %define rfilterq %%table +%endif + sub %2q, 1 + shl %2q, %%shift ; multiply by 32 + mova m12, [rfilterq + %2q] ; get 4 first values of filters + mova m13, [rfilterq + %2q + %%offset] ; get 4 first values of filters + mova m14, [rfilterq + %2q + 2*%%offset] ; get 4 first values of filters + mova m15, [rfilterq + %2q + 3*%%offset] ; get 4 first values of filters +%endmacro - LOAD m5, [srcq + i * blocksize + 1 * pixstride] - LOAD m6, [srcq + i * blocksize + 2 * pixstride] - punpcklbw m5, m6 - pmaddubsw m5, m2 - paddsw m4, m5 +%macro EPEL_LOAD 4 +%if (%1 == 8 && %4 <= 4) +%define %%load movd +%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4) +%define %%load movq +%else +%define %%load movdqu +%endif - LOAD m5, [srcq + i * blocksize + pixstride3] - LOAD m6, [srcq + i * blocksize + 4 * pixstride] - punpcklbw m5, m6 - pmaddubsw m5, m3 - paddsw m4, m5 + %%load m0, [%2q ] +%ifnum %3 + %%load m1, [%2q+ %3] + %%load m2, [%2q+2*%3] + %%load m3, [%2q+3*%3] +%else + %%load m1, [%2q+ %3q] + %%load m2, [%2q+2*%3q] + %%load m3, [%2q+r3srcq] +%endif +%if %1 == 8 +%if %4 > 8 + SBUTTERFLY bw, 0, 1, 7 + SBUTTERFLY bw, 2, 3, 7 +%else + punpcklbw m0, m1 + punpcklbw m2, m3 +%endif +%else +%if %4 > 4 + SBUTTERFLY wd, 0, 1, 7 + SBUTTERFLY wd, 2, 3, 7 +%else + punpcklwd m0, m1 + punpcklwd m2, m3 +%endif +%endif +%endmacro - STORE [dstq + i * 2 * blocksize], m4 -%assign i (i + 1) -%endrep +%macro QPEL_H_LOAD 4 +%assign %%stride (%1+7)/8 +%if %1 == 8 +%if %3 <= 4 +%define %%load movd +%elif %3 == 8 +%define %%load movq +%else +%define %%load movu +%endif +%else +%if %3 == 2 +%define %%load movd +%elif %3 == 4 +%define %%load movq +%else +%define %%load movu +%endif +%endif + %%load m0, [%2-3*%%stride] ;load data from source + %%load m1, [%2-2*%%stride] + %%load m2, [%2-%%stride ] + %%load m3, [%2 ] + %%load m4, [%2+%%stride ] + %%load m5, [%2+2*%%stride] + %%load m6, [%2+3*%%stride] + %%load m7, [%2+4*%%stride] + +%if %1 == 8 +%if %3 > 8 + SBUTTERFLY wd, 0, 1, %4 + SBUTTERFLY wd, 2, 3, %4 + SBUTTERFLY wd, 4, 5, %4 + SBUTTERFLY wd, 6, 7, %4 +%else + punpcklbw m0, m1 + punpcklbw m2, m3 + punpcklbw m4, m5 + punpcklbw m6, m7 +%endif +%else +%if %3 > 4 + SBUTTERFLY dq, 0, 1, %4 + SBUTTERFLY dq, 2, 3, %4 + SBUTTERFLY dq, 4, 5, %4 + SBUTTERFLY dq, 6, 7, %4 +%else + punpcklwd m0, m1 + punpcklwd m2, m3 + punpcklwd m4, m5 + punpcklwd m6, m7 +%endif +%endif +%endmacro - add dstq, dststrideq - add srcq, srcstrideq -%if %2 - add src_m3, srcstrideq +%macro QPEL_V_LOAD 5 + lea %5q, [%2] + sub %5q, r3srcq + movu m0, [%5q ] ;load x- 3*srcstride + movu m1, [%5q+ %3q ] ;load x- 2*srcstride + movu m2, [%5q+ 2*%3q ] ;load x-srcstride + movu m3, [%2 ] ;load 
x + movu m4, [%2+ %3q] ;load x+stride + movu m5, [%2+ 2*%3q] ;load x+2*stride + movu m6, [%2+r3srcq] ;load x+3*stride + movu m7, [%2+ 4*%3q] ;load x+4*stride +%if %1 == 8 +%if %4 > 8 + SBUTTERFLY bw, 0, 1, 8 + SBUTTERFLY bw, 2, 3, 8 + SBUTTERFLY bw, 4, 5, 8 + SBUTTERFLY bw, 6, 7, 8 +%else + punpcklbw m0, m1 + punpcklbw m2, m3 + punpcklbw m4, m5 + punpcklbw m6, m7 %endif +%else +%if %4 > 4 + SBUTTERFLY wd, 0, 1, 8 + SBUTTERFLY wd, 2, 3, 8 + SBUTTERFLY wd, 4, 5, 8 + SBUTTERFLY wd, 6, 7, 8 +%else + punpcklwd m0, m1 + punpcklwd m2, m3 + punpcklwd m4, m5 + punpcklwd m6, m7 +%endif +%endif +%endmacro - dec heightd - jg .loop - RET +%macro PEL_12STORE2 3 + movd [%1], %2 +%endmacro +%macro PEL_12STORE4 3 + movq [%1], %2 +%endmacro +%macro PEL_12STORE6 3 + movq [%1], %2 + psrldq %2, 8 + movd [%1+8], %2 +%endmacro +%macro PEL_12STORE8 3 + movdqa [%1], %2 +%endmacro +%macro PEL_12STORE12 3 + movdqa [%1], %2 + movq [%1+16], %3 +%endmacro +%macro PEL_12STORE16 3 + PEL_12STORE8 %1, %2, %3 + movdqa [%1+16], %3 %endmacro -INIT_XMM ssse3 -QPEL_8 4, 0 -QPEL_8 8, 0 -QPEL_8 12, 0 -QPEL_8 16, 0 -QPEL_8 24, 0 -QPEL_8 32, 0 -QPEL_8 48, 0 -QPEL_8 64, 0 - -QPEL_8 4, 1 -QPEL_8 8, 1 -QPEL_8 12, 1 -QPEL_8 16, 1 -QPEL_8 24, 1 -QPEL_8 32, 1 -QPEL_8 48, 1 -QPEL_8 64, 1 - -; 16-bit qpel interpolation -; %1: block width -; %2: shift applied to the result -; %3: 0 - horizontal; 1 - vertical -%macro QPEL_16 3 -%if %3 - %define mvfrac myq - %define pixstride srcstrideq - %define pixstride3 sstride3q - %define src_m3 srcm3q -%else - %define mvfrac mxq - %define pixstride 2 - %define pixstride3 6 - %define src_m3 (srcq - 6) -%endif - - COMMON_DEFS %1, 16 - - and mvfrac, 0x3 - dec mvfrac - shl mvfrac, 4 - lea coeffsregq, [hevc_qpel_coeffs] - mova m0, [coeffsregq + mvfrac] - - pshufd m1, m0, 0x55 - pshufd m2, m0, 0xaa - pshufd m3, m0, 0xff - pshufd m0, m0, 0x00 - -%if %3 - lea sstride3q, [srcstrideq + 2 * srcstrideq] - mov srcm3q, srcq - sub srcm3q, sstride3q +%macro PEL_10STORE2 3 + movd [%1], %2 +%endmacro +%macro PEL_10STORE4 3 + movq [%1], %2 +%endmacro +%macro PEL_10STORE6 3 + movq [%1], %2 + psrldq %2, 8 + movd [%1+8], %2 +%endmacro +%macro PEL_10STORE8 3 + movdqa [%1], %2 +%endmacro +%macro PEL_10STORE12 3 + movdqa [%1], %2 + movq [%1+16], %3 +%endmacro +%macro PEL_10STORE16 3 +%if cpuflag(avx2) + movu [%1], %2 +%else + PEL_10STORE8 %1, %2, %3 + movdqa [%1+16], %3 %endif +%endmacro -.loop: +%macro PEL_10STORE32 3 + PEL_10STORE16 %1, %2, %3 + movu [%1+32], %3 +%endmacro -%assign i 0 -%rep nb_blocks +%macro PEL_8STORE2 3 + pextrw [%1], %2, 0 +%endmacro +%macro PEL_8STORE4 3 + movd [%1], %2 +%endmacro +%macro PEL_8STORE6 3 + movd [%1], %2 + pextrw [%1+4], %2, 2 +%endmacro +%macro PEL_8STORE8 3 + movq [%1], %2 +%endmacro +%macro PEL_8STORE12 3 + movq [%1], %2 + psrldq %2, 8 + movd [%1+8], %2 +%endmacro +%macro PEL_8STORE16 3 +%if cpuflag(avx2) + movdqu [%1], %2 +%else + mova [%1], %2 +%endif ; avx +%endmacro +%macro PEL_8STORE32 3 + movu [%1], %2 +%endmacro - BLOCK_DEFS i +%macro LOOP_END 3 + add %1q, 2*MAX_PB_SIZE ; dst += dststride + add %2q, %3q ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop +%endmacro - LOAD m4, [src_m3 + i * 2 * blocksize] - LOAD m5, [src_m3 + i * 2 * blocksize + 1 * pixstride] - LOAD m6, [src_m3 + i * 2 * blocksize + 2 * pixstride] - LOAD m7, [srcq + i * 2 * blocksize + 0 * pixstride] - LOAD m8, [srcq + i * 2 * blocksize + 1 * pixstride] - LOAD m9, [srcq + i * 2 * blocksize + 2 * pixstride] - LOAD m10, [srcq + i * 2 * blocksize + pixstride3] - LOAD m11, [srcq + i * 2 * blocksize 
+ 4 * pixstride] - punpcklwd m12, m4, m5 - pmaddwd m12, m0 +%macro MC_PIXEL_COMPUTE 2-3 ;width, bitdepth +%if %2 == 8 +%if cpuflag(avx2) && %0 ==3 +%if %1 > 16 + vextracti128 xm1, m0, 1 + pmovzxbw m1, xm1 + psllw m1, 14-%2 +%endif + pmovzxbw m0, xm0 +%else ; not avx +%if %1 > 8 + punpckhbw m1, m0, m2 + psllw m1, 14-%2 +%endif + punpcklbw m0, m2 +%endif +%endif ;avx + psllw m0, 14-%2 +%endmacro + +%macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3 +%if %0 == 8 +%define %%reg0 %5 +%define %%reg2 %6 +%define %%reg1 %7 +%define %%reg3 %8 +%else +%define %%reg0 m0 +%define %%reg2 m2 +%define %%reg1 m1 +%define %%reg3 m3 +%endif +%if %1 == 8 +%if cpuflag(avx2) && (%0 == 5) +%if %2 > 16 + vperm2i128 m10, m0, m1, q0301 +%endif + vinserti128 m0, m0, xm1, 1 + mova m1, m10 +%if %2 > 16 + vperm2i128 m10, m2, m3, q0301 +%endif + vinserti128 m2, m2, xm3, 1 + mova m3, m10 +%endif + pmaddubsw %%reg0, %3 ;x1*c1+x2*c2 + pmaddubsw %%reg2, %4 ;x3*c3+x4*c4 + paddw %%reg0, %%reg2 +%if %2 > 8 + pmaddubsw %%reg1, %3 + pmaddubsw %%reg3, %4 + paddw %%reg1, %%reg3 +%endif +%else + pmaddwd %%reg0, %3 + pmaddwd %%reg2, %4 + paddd %%reg0, %%reg2 +%if %2 > 4 + pmaddwd %%reg1, %3 + pmaddwd %%reg3, %4 + paddd %%reg1, %%reg3 +%if %1 != 8 + psrad %%reg1, %1-8 +%endif +%endif +%if %1 != 8 + psrad %%reg0, %1-8 +%endif + packssdw %%reg0, %%reg1 +%endif +%endmacro - punpcklwd m13, m6, m7 - pmaddwd m13, m1 - paddd m12, m13 +%macro QPEL_HV_COMPUTE 4 ; width, bitdepth, filter idx - punpcklwd m13, m8, m9 - pmaddwd m13, m2 - paddd m12, m13 +%if cpuflag(avx2) +%assign %%offset 32 +%define %%table hevc_qpel_filters_avx2_%2 +%else +%assign %%offset 16 +%define %%table hevc_qpel_filters_sse4_%2 +%endif - punpcklwd m13, m10, m11 - pmaddwd m13, m3 - paddd m12, m13 - psrad m12, %2 +%ifdef PIC + lea rfilterq, [%%table] +%else + %define rfilterq %%table +%endif - %if block_truncated == 0 - punpckhwd m4, m5 - pmaddwd m4, m0 +%if %2 == 8 + pmaddubsw m0, [rfilterq + %3q*8 ] ;x1*c1+x2*c2 + pmaddubsw m2, [rfilterq + %3q*8+%%offset] ;x3*c3+x4*c4 + pmaddubsw m4, [rfilterq + %3q*8+2*%%offset] ;x5*c5+x6*c6 + pmaddubsw m6, [rfilterq + %3q*8+3*%%offset] ;x7*c7+x8*c8 + paddw m0, m2 + paddw m4, m6 + paddw m0, m4 +%else + pmaddwd m0, [rfilterq + %3q*8 ] + pmaddwd m2, [rfilterq + %3q*8+%%offset] + pmaddwd m4, [rfilterq + %3q*8+2*%%offset] + pmaddwd m6, [rfilterq + %3q*8+3*%%offset] + paddd m0, m2 + paddd m4, m6 + paddd m0, m4 +%if %2 != 8 + psrad m0, %2-8 +%endif +%if %1 > 4 + pmaddwd m1, [rfilterq + %3q*8 ] + pmaddwd m3, [rfilterq + %3q*8+%%offset] + pmaddwd m5, [rfilterq + %3q*8+2*%%offset] + pmaddwd m7, [rfilterq + %3q*8+3*%%offset] + paddd m1, m3 + paddd m5, m7 + paddd m1, m5 +%if %2 != 8 + psrad m1, %2-8 +%endif +%endif + p%4 m0, m1 +%endif +%endmacro - punpckhwd m6, m7 - pmaddwd m6, m1 - paddd m4, m6 +%macro QPEL_COMPUTE 2-3 ; width, bitdepth +%if %2 == 8 +%if cpuflag(avx2) && (%0 == 3) - punpckhwd m8, m9 - pmaddwd m8, m2 - paddd m4, m8 + vperm2i128 m10, m0, m1, q0301 + vinserti128 m0, m0, xm1, 1 + SWAP 1, 10 - punpckhwd m10, m11 - pmaddwd m10, m3 - paddd m4, m10 + vperm2i128 m10, m2, m3, q0301 + vinserti128 m2, m2, xm3, 1 + SWAP 3, 10 - psrad m4, %2 - %endif - packssdw m12, m4 - STORE [dstq + i * 2 * blocksize], m12 -%assign i (i + 1) -%endrep + vperm2i128 m10, m4, m5, q0301 + vinserti128 m4, m4, xm5, 1 + SWAP 5, 10 - add dstq, dststrideq - add srcq, srcstrideq -%if %3 - add srcm3q, srcstrideq + vperm2i128 m10, m6, m7, q0301 + vinserti128 m6, m6, xm7, 1 + SWAP 7, 10 %endif - dec heightd - jg .loop - RET + pmaddubsw m0, 
m12 ;x1*c1+x2*c2 + pmaddubsw m2, m13 ;x3*c3+x4*c4 + pmaddubsw m4, m14 ;x5*c5+x6*c6 + pmaddubsw m6, m15 ;x7*c7+x8*c8 + paddw m0, m2 + paddw m4, m6 + paddw m0, m4 +%if %1 > 8 + pmaddubsw m1, m12 + pmaddubsw m3, m13 + pmaddubsw m5, m14 + pmaddubsw m7, m15 + paddw m1, m3 + paddw m5, m7 + paddw m1, m5 +%endif +%else + pmaddwd m0, m12 + pmaddwd m2, m13 + pmaddwd m4, m14 + pmaddwd m6, m15 + paddd m0, m2 + paddd m4, m6 + paddd m0, m4 +%if %2 != 8 + psrad m0, %2-8 +%endif +%if %1 > 4 + pmaddwd m1, m12 + pmaddwd m3, m13 + pmaddwd m5, m14 + pmaddwd m7, m15 + paddd m1, m3 + paddd m5, m7 + paddd m1, m5 +%if %2 != 8 + psrad m1, %2-8 +%endif +%endif +%endif %endmacro -%if ARCH_X86_64 - -%macro QPEL_H_10 1 -cglobal hevc_qpel_h_ %+ %1 %+ _10, 7, 9, 14, dst, dststride, src, srcstride, height, mx, my, mcbuffer, coeffsreg -QPEL_16 %1, 2, 0 -%endmacro - -INIT_XMM avx -QPEL_H_10 4 -QPEL_H_10 8 -QPEL_H_10 12 -QPEL_H_10 16 -QPEL_H_10 24 -QPEL_H_10 32 -QPEL_H_10 48 -QPEL_H_10 64 - -%macro QPEL_V_10 1 -cglobal hevc_qpel_v_ %+ %1 %+ _10, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg -QPEL_16 %1, 2, 1 -%endmacro - -INIT_XMM avx -QPEL_V_10 4 -QPEL_V_10 8 -QPEL_V_10 12 -QPEL_V_10 16 -QPEL_V_10 24 -QPEL_V_10 32 -QPEL_V_10 48 -QPEL_V_10 64 - -; hevc_qpel_hv_<w>(int16_t *dst, ptrdiff_t dststride, -; uint8_t *src, ptrdiff_t srcstride, -; int height, int mx, int my, int *mcbuffer) - -%macro QPEL_HV 1 -cglobal hevc_qpel_hv_ %+ %1, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg -QPEL_16 %1, 6, 1 -%endmacro - -INIT_XMM avx -QPEL_HV 4 -QPEL_HV 8 -QPEL_HV 12 -QPEL_HV 16 -QPEL_HV 24 -QPEL_HV 32 -QPEL_HV 48 -QPEL_HV 64 - -%endif ; ARCH_X86_64 - -; hevc_epel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride, -; uint8_t *src, ptrdiff_t srcstride, -; int height, int mx, int my, int *mcbuffer) +%macro BI_COMPUTE 7-8 ; width, bitd, src1l, src1h, scr2l, scr2h, pw + paddsw %3, %5 +%if %1 > 8 + paddsw %4, %6 +%endif + UNI_COMPUTE %1, %2, %3, %4, %7 +%if %0 == 8 && cpuflag(avx2) && (%2 == 8) + vpermq %3, %3, 216 + vpermq %4, %4, 216 +%endif +%endmacro -; 8-bit epel interpolation -; %1: block width -; %2: 0 - horizontal; 1 - vertical -%macro EPEL_8 2 -%if %2 - %define postfix v - %define mvfrac myq - %define coeffsaddr r5q - %define pixstride srcstrideq - %define pixstride3 r5q +%macro UNI_COMPUTE 5 + pmulhrsw %3, %5 +%if %1 > 8 || (%2 > 8 && %1 > 4) + pmulhrsw %4, %5 +%endif +%if %2 == 8 + packuswb %3, %4 %else - %define postfix h - %define mvfrac mxq - %define coeffsaddr r6q - %define pixstride 1 - %define pixstride3 3 + CLIPW %3, [pb_0], [max_pixels_%2] +%if (%1 > 8 && notcpuflag(avx)) || %1 > 16 + CLIPW %4, [pb_0], [max_pixels_%2] +%endif %endif +%endmacro + - COMMON_DEFS %1, 8 +; ****************************** +; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride, +; uint8_t *_src, ptrdiff_t _srcstride, +; int height, int mx, int my) +; ****************************** -cglobal hevc_epel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 6, dst, dststride, src, srcstride, height, mx, my - and mvfrac, 0x7 - dec mvfrac - shl mvfrac, 4 - lea coeffsaddr, [hevc_epel_coeffs8] - movq m0, [coeffsaddr + mvfrac] +%macro HEVC_PUT_HEVC_PEL_PIXELS 2 +HEVC_PEL_PIXELS %1, %2 +HEVC_UNI_PEL_PIXELS %1, %2 +HEVC_BI_PEL_PIXELS %1, %2 +%endmacro - SPLATW m1, m0, 1 - SPLATW m0, m0, 0 +%macro HEVC_PEL_PIXELS 2 +cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height + pxor m2, m2 +.loop: + SIMPLE_LOAD %1, %2, srcq, m0 + MC_PIXEL_COMPUTE %1, %2, 1 + PEL_10STORE%1 dstq, m0, m1 + 
LOOP_END dst, src, srcstride + RET + %endmacro -%if %2 - lea pixstride3, [srcstrideq + 2 * srcstrideq] -%endif - sub srcq, pixstride +%macro HEVC_UNI_PEL_PIXELS 2 +cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height +.loop: + SIMPLE_LOAD %1, %2, srcq, m0 + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET +%endmacro +%macro HEVC_BI_PEL_PIXELS 2 +cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height + pxor m2, m2 + movdqa m5, [pw_bi_%2] .loop: + SIMPLE_LOAD %1, %2, srcq, m0 + SIMPLE_BILOAD %1, src2q, m3, m4 + MC_PIXEL_COMPUTE %1, %2, 1 + BI_COMPUTE %1, %2, m0, m1, m3, m4, m5, 1 + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + add src2q, 2*MAX_PB_SIZE ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET +%endmacro -%assign i 0 -%rep nb_blocks - BLOCK_DEFS i +; ****************************** +; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride, +; uint8_t *_src, ptrdiff_t _srcstride, +; int height, int mx, int my, int width); +; ****************************** - LOAD m2, [srcq + i * blocksize + 0 * pixstride] - LOAD m3, [srcq + i * blocksize + 1 * pixstride] - LOAD m4, [srcq + i * blocksize + 2 * pixstride] - LOAD m5, [srcq + i * blocksize + pixstride3] - punpcklbw m2, m3 - punpcklbw m4, m5 +%macro HEVC_PUT_HEVC_EPEL 2 +%if cpuflag(avx2) +%define XMM_REGS 11 +%else +%define XMM_REGS 8 +%endif + +cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, XMM_REGS, dst, src, srcstride, height, mx, rfilter +%assign %%stride ((%2 + 7)/8) + EPEL_FILTER %2, mx, m4, m5, rfilter +.loop: + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m4, m5, 1 + PEL_10STORE%1 dstq, m0, m1 + LOOP_END dst, src, srcstride + RET - pmaddubsw m2, m0 - pmaddubsw m4, m1 +cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, mx, rfilter +%assign %%stride ((%2 + 7)/8) + movdqa m6, [pw_%2] + EPEL_FILTER %2, mx, m4, m5, rfilter +.loop: + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m4, m5 + UNI_COMPUTE %1, %2, m0, m1, m6 + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET - paddsw m2, m4 +cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter + movdqa m6, [pw_bi_%2] + EPEL_FILTER %2, mx, m4, m5, rfilter +.loop: + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m4, m5, 1 + SIMPLE_BILOAD %1, src2q, m2, m3 + BI_COMPUTE %1, %2, m0, m1, m2, m3, m6, 1 + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + add src2q, 2*MAX_PB_SIZE ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET - STORE [dstq + i * 2 * blocksize], m2 +; ****************************** +; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride, +; uint8_t *_src, ptrdiff_t _srcstride, +; int height, int mx, int my, int width) +; ****************************** + +cglobal hevc_put_hevc_epel_v%1_%2, 4, 6, XMM_REGS, dst, src, srcstride, height, r3src, my + movifnidn myd, mym + sub srcq, srcstrideq + EPEL_FILTER %2, my, m4, m5, r3src + lea r3srcq, [srcstrideq*3] +.loop: + EPEL_LOAD %2, srcq, srcstride, %1 + EPEL_COMPUTE %2, %1, m4, m5, 1 + 
PEL_10STORE%1 dstq, m0, m1 + LOOP_END dst, src, srcstride + RET -%assign i (i + 1) -%endrep +cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, my + movifnidn myd, mym + movdqa m6, [pw_%2] + sub srcq, srcstrideq + EPEL_FILTER %2, my, m4, m5, r3src + lea r3srcq, [srcstrideq*3] +.loop: + EPEL_LOAD %2, srcq, srcstride, %1 + EPEL_COMPUTE %2, %1, m4, m5 + UNI_COMPUTE %1, %2, m0, m1, m6 + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET - add dstq, dststrideq - add srcq, srcstrideq - dec heightd - jg .loop +cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my + movifnidn myd, mym + movdqa m6, [pw_bi_%2] + sub srcq, srcstrideq + EPEL_FILTER %2, my, m4, m5, r3src + lea r3srcq, [srcstrideq*3] +.loop: + EPEL_LOAD %2, srcq, srcstride, %1 + EPEL_COMPUTE %2, %1, m4, m5, 1 + SIMPLE_BILOAD %1, src2q, m2, m3 + BI_COMPUTE %1, %2, m0, m1, m2, m3, m6, 1 + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + add src2q, 2*MAX_PB_SIZE ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop RET %endmacro -INIT_XMM ssse3 -EPEL_8 4, 0 -EPEL_8 8, 0 -EPEL_8 12, 0 -EPEL_8 16, 0 -EPEL_8 24, 0 -EPEL_8 32, 0 - -EPEL_8 4, 1 -EPEL_8 8, 1 -EPEL_8 12, 1 -EPEL_8 16, 1 -EPEL_8 24, 1 -EPEL_8 32, 1 -%macro EPEL_16 3 -%if %3 - %define mvfrac myq - %define pixstride srcstrideq - %define pixstride3 sstride3q +; ****************************** +; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride, +; uint8_t *_src, ptrdiff_t _srcstride, +; int height, int mx, int my, int width) +; ****************************** + +%macro HEVC_PUT_HEVC_EPEL_HV 2 +cglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16 , dst, src, srcstride, height, mx, my, r3src +%assign %%stride ((%2 + 7)/8) + sub srcq, srcstrideq + EPEL_HV_FILTER %2 + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m8, m1 +%endif + SWAP m4, m0 + add srcq, srcstrideq + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m9, m1 +%endif + SWAP m5, m0 + add srcq, srcstrideq + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m10, m1 +%endif + SWAP m6, m0 + add srcq, srcstrideq +.loop: + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m11, m1 +%endif + SWAP m7, m0 + punpcklwd m0, m4, m5 + punpcklwd m2, m6, m7 +%if %1 > 4 + punpckhwd m1, m4, m5 + punpckhwd m3, m6, m7 +%endif + EPEL_COMPUTE 14, %1, m12, m13 +%if (%1 > 8 && (%2 == 8)) + punpcklwd m4, m8, m9 + punpcklwd m2, m10, m11 + punpckhwd m8, m8, m9 + punpckhwd m3, m10, m11 + EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3 +%if cpuflag(avx2) + vinserti128 m2, m0, xm4, 1 + vperm2i128 m3, m0, m4, q0301 + PEL_10STORE%1 dstq, m2, m3 %else - %define mvfrac mxq - %define pixstride 2 - %define pixstride3 6 + PEL_10STORE%1 dstq, m0, m4 %endif +%else + PEL_10STORE%1 dstq, m0, m1 +%endif + movdqa m4, m5 + movdqa m5, m6 + movdqa m6, m7 +%if (%1 > 8 && (%2 == 8)) + mova m8, m9 + mova m9, m10 + mova m10, m11 +%endif + LOOP_END dst, src, srcstride + RET - COMMON_DEFS %1, 16 +cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, height, mx, my, r3src +%assign %%stride ((%2 + 7)/8) + sub srcq, 
srcstrideq + EPEL_HV_FILTER %2 + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m8, m1 +%endif + SWAP m4, m0 + add srcq, srcstrideq + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m9, m1 +%endif + SWAP m5, m0 + add srcq, srcstrideq + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m10, m1 +%endif + SWAP m6, m0 + add srcq, srcstrideq +.loop: + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m11, m1 +%endif + mova m7, m0 + punpcklwd m0, m4, m5 + punpcklwd m2, m6, m7 +%if %1 > 4 + punpckhwd m1, m4, m5 + punpckhwd m3, m6, m7 +%endif + EPEL_COMPUTE 14, %1, m12, m13 +%if (%1 > 8 && (%2 == 8)) + punpcklwd m4, m8, m9 + punpcklwd m2, m10, m11 + punpckhwd m8, m8, m9 + punpckhwd m3, m10, m11 + EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3 + UNI_COMPUTE %1, %2, m0, m4, [pw_%2] +%else + UNI_COMPUTE %1, %2, m0, m1, [pw_%2] +%endif + PEL_%2STORE%1 dstq, m0, m1 + mova m4, m5 + mova m5, m6 + mova m6, m7 +%if (%1 > 8 && (%2 == 8)) + mova m8, m9 + mova m9, m10 + mova m10, m11 +%endif + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET - and mvfrac, 0x7 - dec mvfrac - shl mvfrac, 5 - lea coeffsregq, [hevc_epel_coeffs] - mova m0, [coeffsregq + mvfrac] +cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src +%assign %%stride ((%2 + 7)/8) + sub srcq, srcstrideq + EPEL_HV_FILTER %2 + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m8, m1 +%endif + SWAP m4, m0 + add srcq, srcstrideq + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m9, m1 +%endif + SWAP m5, m0 + add srcq, srcstrideq + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m10, m1 +%endif + SWAP m6, m0 + add srcq, srcstrideq +.loop: + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m11, m1 +%endif + SWAP m7, m0 + punpcklwd m0, m4, m5 + punpcklwd m2, m6, m7 +%if %1 > 4 + punpckhwd m1, m4, m5 + punpckhwd m3, m6, m7 +%endif + EPEL_COMPUTE 14, %1, m12, m13 +%if (%1 > 8 && (%2 == 8)) + punpcklwd m4, m8, m9 + punpcklwd m2, m10, m11 + punpckhwd m8, m8, m9 + punpckhwd m3, m10, m11 + EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3 + SIMPLE_BILOAD %1, src2q, m8, m3 +%if cpuflag(avx2) + vinserti128 m1, m8, xm3, 1 + vperm2i128 m2, m8, m3, q0301 + BI_COMPUTE %1, %2, m0, m4, m1, m2, [pw_bi_%2] +%else + BI_COMPUTE %1, %2, m0, m4, m8, m3, [pw_bi_%2] +%endif +%else + SIMPLE_BILOAD %1, src2q, m8, m9 + BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2] +%endif + PEL_%2STORE%1 dstq, m0, m4 + mova m4, m5 + mova m5, m6 + mova m6, m7 +%if (%1 > 8 && (%2 == 8)) + mova m8, m9 + mova m9, m10 + mova m10, m11 +%endif + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + add src2q, 2*MAX_PB_SIZE ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET +%endmacro - pshufd m1, m0, 0x55 - pshufd m0, m0, 0x00 +; ****************************** +; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride, +; uint8_t *_src, ptrdiff_t _srcstride, +; int height, int mx, int my, int width) +; ****************************** -%if %3 
- lea sstride3q, [srcstrideq + 2 * srcstrideq] +%macro HEVC_PUT_HEVC_QPEL 2 +cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter + QPEL_FILTER %2, mx +.loop: + QPEL_H_LOAD %2, srcq, %1, 10 + QPEL_COMPUTE %1, %2, 1 +%if %2 > 8 + packssdw m0, m1 %endif - sub srcq, pixstride + PEL_10STORE%1 dstq, m0, m1 + LOOP_END dst, src, srcstride + RET +cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter + mova m9, [pw_%2] + QPEL_FILTER %2, mx .loop: - -%assign i 0 -%rep nb_blocks - - BLOCK_DEFS i - - LOAD m2, [srcq + i * 2 * blocksize + 0 * pixstride] - LOAD m3, [srcq + i * 2 * blocksize + 1 * pixstride] - LOAD m4, [srcq + i * 2 * blocksize + 2 * pixstride] - LOAD m5, [srcq + i * 2 * blocksize + pixstride3] - - punpcklwd m6, m2, m3 - punpcklwd m7, m4, m5 - pmaddwd m6, m0 - pmaddwd m7, m1 - paddd m6, m7 - psrad m6, %2 - - %if block_truncated == 0 - punpckhwd m2, m3 - punpckhwd m4, m5 - pmaddwd m2, m0 - pmaddwd m4, m1 - paddd m2, m4 - psrad m2, %2 - %endif - packssdw m6, m2 - STORE [dstq + i * 2 * blocksize], m6 - -%assign i (i + 1) -%endrep - - add dstq, dststrideq - add srcq, srcstrideq - - dec heightd - jg .loop + QPEL_H_LOAD %2, srcq, %1, 10 + QPEL_COMPUTE %1, %2 +%if %2 > 8 + packssdw m0, m1 +%endif + UNI_COMPUTE %1, %2, m0, m1, m9 + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop RET -%endmacro -%if ARCH_X86_64 - -%macro EPEL_H_10 1 -cglobal hevc_epel_h_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg -EPEL_16 %1, 2, 0 -%endmacro +cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter + movdqa m9, [pw_bi_%2] + QPEL_FILTER %2, mx +.loop: + QPEL_H_LOAD %2, srcq, %1, 10 + QPEL_COMPUTE %1, %2, 1 +%if %2 > 8 + packssdw m0, m1 +%endif + SIMPLE_BILOAD %1, src2q, m10, m11 + BI_COMPUTE %1, %2, m0, m1, m10, m11, m9, 1 + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + add src2q, 2*MAX_PB_SIZE ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET -INIT_XMM avx -EPEL_H_10 4 -EPEL_H_10 8 -EPEL_H_10 12 -EPEL_H_10 16 -EPEL_H_10 24 -EPEL_H_10 32 -%macro EPEL_V_10 1 -cglobal hevc_epel_v_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg -EPEL_16 %1, 2, 1 -%endmacro +; ****************************** +; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride, +; uint8_t *_src, ptrdiff_t _srcstride, +; int height, int mx, int my, int width) +; ****************************** -INIT_XMM avx -EPEL_V_10 4 -EPEL_V_10 8 -EPEL_V_10 12 -EPEL_V_10 16 -EPEL_V_10 24 -EPEL_V_10 32 +cglobal hevc_put_hevc_qpel_v%1_%2, 4, 8, 16, dst, src, srcstride, height, r3src, my, rfilter + movifnidn myd, mym + lea r3srcq, [srcstrideq*3] + QPEL_FILTER %2, my +.loop: + QPEL_V_LOAD %2, srcq, srcstride, %1, r7 + QPEL_COMPUTE %1, %2, 1 +%if %2 > 8 + packssdw m0, m1 +%endif + PEL_10STORE%1 dstq, m0, m1 + LOOP_END dst, src, srcstride + RET -; hevc_epel_hv_<w>_8(int16_t *dst, ptrdiff_t dststride, -; int16_t *src, ptrdiff_t srcstride, -; int height, int mx, int my, int *mcbuffer) +cglobal hevc_put_hevc_uni_qpel_v%1_%2, 5, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter + movifnidn myd, mym + movdqa m9, [pw_%2] + lea r3srcq, [srcstrideq*3] + QPEL_FILTER %2, my +.loop: + QPEL_V_LOAD %2, srcq, srcstride, %1, r8 + 
QPEL_COMPUTE %1, %2 +%if %2 > 8 + packssdw m0, m1 +%endif + UNI_COMPUTE %1, %2, m0, m1, m9 + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET -%macro EPEL_HV 1 -cglobal hevc_epel_hv_ %+ %1, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg -EPEL_16 %1, 6, 1 +cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter + movifnidn myd, mym + movdqa m9, [pw_bi_%2] + lea r3srcq, [srcstrideq*3] + QPEL_FILTER %2, my +.loop: + QPEL_V_LOAD %2, srcq, srcstride, %1, r9 + QPEL_COMPUTE %1, %2, 1 +%if %2 > 8 + packssdw m0, m1 +%endif + SIMPLE_BILOAD %1, src2q, m10, m11 + BI_COMPUTE %1, %2, m0, m1, m10, m11, m9, 1 + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + add src2q, 2*MAX_PB_SIZE ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET %endmacro -INIT_XMM avx -EPEL_HV 4 -EPEL_HV 8 -EPEL_HV 12 -EPEL_HV 16 -EPEL_HV 24 -EPEL_HV 32 -%endif ; ARCH_X86_64 +; ****************************** +; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride, +; uint8_t *_src, ptrdiff_t _srcstride, +; int height, int mx, int my) +; ****************************** +%macro HEVC_PUT_HEVC_QPEL_HV 2 +cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter +%if cpuflag(avx2) +%assign %%shift 4 +%else +%assign %%shift 3 +%endif + sub mxq, 1 + sub myq, 1 + shl mxq, %%shift ; multiply by 32 + shl myq, %%shift ; multiply by 32 + lea r3srcq, [srcstrideq*3] + sub srcq, r3srcq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m8, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m9, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m10, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m11, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m12, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m13, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m14, m0 + add srcq, srcstrideq +.loop: + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m15, m0 + punpcklwd m0, m8, m9 + punpcklwd m2, m10, m11 + punpcklwd m4, m12, m13 + punpcklwd m6, m14, m15 +%if %1 > 4 + punpckhwd m1, m8, m9 + punpckhwd m3, m10, m11 + punpckhwd m5, m12, m13 + punpckhwd m7, m14, m15 +%endif + QPEL_HV_COMPUTE %1, 14, my, ackssdw + PEL_10STORE%1 dstq, m0, m1 +%if %1 <= 4 + movq m8, m9 + movq m9, m10 + movq m10, m11 + movq m11, m12 + movq m12, m13 + movq m13, m14 + movq m14, m15 +%else + movdqa m8, m9 + movdqa m9, m10 + movdqa m10, m11 + movdqa m11, m12 + movdqa m12, m13 + movdqa m13, m14 + movdqa m14, m15 +%endif + LOOP_END dst, src, srcstride + RET -; hevc_put_unweighted_pred_<w>_<d>(pixel *dst, ptrdiff_t dststride, -; int16_t *src, ptrdiff_t srcstride, -; int height) +cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter +%if cpuflag(avx2) +%assign %%shift 4 +%else +%assign %%shift 3 +%endif + sub mxq, 1 + sub myq, 1 + shl mxq, %%shift ; multiply by 32 + shl myq, %%shift ; multiply by 32 + lea r3srcq, [srcstrideq*3] + 
sub srcq, r3srcq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m8, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m9, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m10, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m11, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m12, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m13, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m14, m0 + add srcq, srcstrideq +.loop: + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m15, m0 + punpcklwd m0, m8, m9 + punpcklwd m2, m10, m11 + punpcklwd m4, m12, m13 + punpcklwd m6, m14, m15 +%if %1 > 4 + punpckhwd m1, m8, m9 + punpckhwd m3, m10, m11 + punpckhwd m5, m12, m13 + punpckhwd m7, m14, m15 +%endif + QPEL_HV_COMPUTE %1, 14, my, ackusdw + UNI_COMPUTE %1, %2, m0, m1, [pw_%2] + PEL_%2STORE%1 dstq, m0, m1 + +%if %1 <= 4 + movq m8, m9 + movq m9, m10 + movq m10, m11 + movq m11, m12 + movq m12, m13 + movq m13, m14 + movq m14, m15 +%else + mova m8, m9 + mova m9, m10 + mova m10, m11 + mova m11, m12 + mova m12, m13 + mova m13, m14 + mova m14, m15 +%endif + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET -%macro AVG 5 - %if %3 - %if %4 == 4 - movq %5, %2 - paddsw %1, %5 - %else - paddsw %1, %2 - %endif - %endif +cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter +%if cpuflag(avx2) +%assign %%shift 4 +%else +%assign %%shift 3 +%endif + sub mxq, 1 + sub myq, 1 + shl mxq, %%shift ; multiply by 32 + shl myq, %%shift ; multiply by 32 + lea r3srcq, [srcstrideq*3] + sub srcq, r3srcq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m8, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m9, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m10, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m11, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m12, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m13, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m14, m0 + add srcq, srcstrideq +.loop: + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m15, m0 + punpcklwd m0, m8, m9 + punpcklwd m2, m10, m11 + punpcklwd m4, m12, m13 + punpcklwd m6, m14, m15 +%if %1 > 4 + punpckhwd m1, m8, m9 + punpckhwd m3, m10, m11 + punpckhwd m5, m12, m13 + punpckhwd m7, m14, m15 +%endif + QPEL_HV_COMPUTE %1, 14, my, ackssdw + SIMPLE_BILOAD %1, src2q, m8, m9 ;m9 not used in this case + BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2] + PEL_%2STORE%1 dstq, m0, m1 + +%if %1 <= 4 + movq m8, m9 + movq m9, m10 + movq m10, m11 + movq m11, m12 + movq m12, m13 + movq m13, m14 + movq m14, m15 +%else + movdqa m8, m9 + movdqa m9, m10 + movdqa m10, m11 + movdqa m11, m12 + movdqa m12, m13 + movdqa m13, m14 + movdqa m14, m15 +%endif + add dstq, dststrideq ; dst += dststride + add 
srcq, srcstrideq ; src += srcstride + add src2q, 2*MAX_PB_SIZE ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET %endmacro -; %1: 0 - one source; 1 - two sources -; %2: width -; %3: bit depth -%macro PUT_PRED 3 -%if %1 -cglobal hevc_put_unweighted_pred_avg_ %+ %2 %+ _ %+ %3, 6, 6, 4, dst, dststride, src, src2, srcstride, height +%macro WEIGHTING_FUNCS 2 +%if WIN64 || ARCH_X86_32 +cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, height, denom, wx, ox + mov r4d, denomm +%define SHIFT r4d %else -cglobal hevc_put_unweighted_pred_ %+ %2 %+ _ %+ %3, 5, 5, 4, dst, dststride, src, srcstride, height +cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, height, denom, wx, ox +%define SHIFT denomd %endif + lea SHIFT, [SHIFT+14-%2] ; shift = 14 - bitd + denom +%if %1 <= 4 + pxor m1, m1 +%endif + movd m2, wxm ; WX + movd m4, SHIFT ; shift +%if %1 <= 4 + punpcklwd m2, m1 +%else + punpcklwd m2, m2 +%endif + dec SHIFT + movdqu m5, [pd_1] + movd m6, SHIFT + pshufd m2, m2, 0 + mov SHIFT, oxm + pslld m5, m6 +%if %2 != 8 + shl SHIFT, %2-8 ; ox << (bitd - 8) +%endif + movd m3, SHIFT ; OX + pshufd m3, m3, 0 +%if WIN64 || ARCH_X86_32 + mov SHIFT, heightm +%endif +.loop: + SIMPLE_LOAD %1, 10, srcq, m0 +%if %1 <= 4 + punpcklwd m0, m1 + pmaddwd m0, m2 + paddd m0, m5 + psrad m0, m4 + paddd m0, m3 +%else + pmulhw m6, m0, m2 + pmullw m0, m2 + punpckhwd m1, m0, m6 + punpcklwd m0, m6 + paddd m0, m5 + paddd m1, m5 + psrad m0, m4 + psrad m1, m4 + paddd m0, m3 + paddd m1, m3 +%endif + packssdw m0, m1 +%if %2 == 8 + packuswb m0, m0 +%else + CLIPW m0, [pb_0], [max_pixels_%2] +%endif + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, 2*MAX_PB_SIZE ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET -%assign shift 14 + %1 - %3 -%assign offset (1 << (shift - 1)) -%define offset_data pw_ %+ offset - - mova m0, [offset_data] - -%if %3 > 8 - %define STORE_BLOCK movu - %define STORE_HALF movq - - %assign pixel_max ((1 << %3) - 1) - %define pw_pixel_max pw_ %+ pixel_max - pxor m1, m1 - mova m2, [pw_pixel_max] +cglobal hevc_put_hevc_bi_w%1_%2, 4, 6, 10, dst, dststride, src, src2, height, denom, wx0, wx1, ox0, ox1 + movifnidn r5d, denomm +%if %1 <= 4 + pxor m1, m1 +%endif + movd m2, wx0m ; WX0 + lea r5d, [r5d+14-%2] ; shift = 14 - bitd + denom + movd m3, wx1m ; WX1 + movd m0, r5d ; shift +%if %1 <= 4 + punpcklwd m2, m1 + punpcklwd m3, m1 +%else + punpcklwd m2, m2 + punpcklwd m3, m3 +%endif + inc r5d + movd m5, r5d ; shift+1 + pshufd m2, m2, 0 + mov r5d, ox0m + pshufd m3, m3, 0 + add r5d, ox1m +%if %2 != 8 + shl r5d, %2-8 ; ox << (bitd - 8) +%endif + inc r5d + movd m4, r5d ; offset + pshufd m4, m4, 0 +%if UNIX64 +%define h heightd %else - %define STORE_BLOCK movq - %define STORE_HALF movd + mov r5d, heightm +%define h r5d %endif + pslld m4, m0 .loop: -%assign i 0 -%rep (%2 + 7) / 8 - - %if (i + 1) * 8 > %2 - %define LOAD movq - %define STORE STORE_HALF - %else - %define LOAD mova - %define STORE STORE_BLOCK - %endif - - LOAD m3, [srcq + 16 * i] - AVG m3, [src2q + 16 * i], %1, %3 - i * 8, m4 - - paddsw m3, m0 - psraw m3, shift - - %if %3 == 8 - packuswb m3, m3 - STORE [dstq + 8 * i], m3 - %else - CLIPW m3, m1, m2 - STORE [dstq + 16 * i], m3 - %endif -%assign i (i + 1) -%endrep - - add dstq, dststrideq - add srcq, srcstrideq -%if %1 - add src2q, srcstrideq -%endif - - dec heightd - jg .loop + SIMPLE_LOAD %1, 10, srcq, m0 + SIMPLE_LOAD %1, 10, src2q, m8 +%if %1 <= 4 + punpcklwd m0, m1 + punpcklwd m8, m1 + pmaddwd m0, m3 + 
pmaddwd m8, m2 + paddd m0, m4 + paddd m0, m8 + psrad m0, m5 +%else + pmulhw m6, m0, m3 + pmullw m0, m3 + pmulhw m7, m8, m2 + pmullw m8, m2 + punpckhwd m1, m0, m6 + punpcklwd m0, m6 + punpckhwd m9, m8, m7 + punpcklwd m8, m7 + paddd m0, m8 + paddd m1, m9 + paddd m0, m4 + paddd m1, m4 + psrad m0, m5 + psrad m1, m5 +%endif + packssdw m0, m1 +%if %2 == 8 + packuswb m0, m0 +%else + CLIPW m0, [pb_0], [max_pixels_%2] +%endif + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, 2*MAX_PB_SIZE ; src += srcstride + add src2q, 2*MAX_PB_SIZE ; src2 += srcstride + dec h ; cmp height + jnz .loop ; height loop RET %endmacro -INIT_XMM sse2 -PUT_PRED 0, 4, 8 -PUT_PRED 1, 4, 8 -PUT_PRED 0, 8, 8 -PUT_PRED 1, 8, 8 -PUT_PRED 0, 12, 8 -PUT_PRED 1, 12, 8 -PUT_PRED 0, 16, 8 -PUT_PRED 1, 16, 8 -PUT_PRED 0, 24, 8 -PUT_PRED 1, 24, 8 -PUT_PRED 0, 32, 8 -PUT_PRED 1, 32, 8 -PUT_PRED 0, 48, 8 -PUT_PRED 1, 48, 8 -PUT_PRED 0, 64, 8 -PUT_PRED 1, 64, 8 - -PUT_PRED 0, 4, 10 -PUT_PRED 1, 4, 10 -PUT_PRED 0, 8, 10 -PUT_PRED 1, 8, 10 -PUT_PRED 0, 12, 10 -PUT_PRED 1, 12, 10 -PUT_PRED 0, 16, 10 -PUT_PRED 1, 16, 10 -PUT_PRED 0, 24, 10 -PUT_PRED 1, 24, 10 -PUT_PRED 0, 32, 10 -PUT_PRED 1, 32, 10 -PUT_PRED 0, 48, 10 -PUT_PRED 1, 48, 10 -PUT_PRED 0, 64, 10 -PUT_PRED 1, 64, 10 - -%macro PUT_WEIGHTED_PRED 3 -%if %1 -cglobal hevc_put_weighted_pred_avg_ %+ %2 %+ _ %+ %3, 11, 11, 8, denom, weight0, weight1, offset0, offset1, dst, dststride, src0, src1, srcstride, height -%else -cglobal hevc_put_weighted_pred_ %+ %2 %+ _ %+ %3, 8, 8, 8, denom, weight0, offset0, dst, dststride, src0, srcstride, height -%endif - - and denomd, 0xff - movsx weight0d, weight0w - movsx offset0d, offset0w -%if %1 - movsx weight1d, weight1w - movsx offset1d, offset1w -%endif - - add denomd, 14 + %1 - %3 - movd m0, denomd +INIT_XMM sse4 ; adds ff_ and _sse4 to function name + +WEIGHTING_FUNCS 2, 8 +WEIGHTING_FUNCS 4, 8 +WEIGHTING_FUNCS 6, 8 +WEIGHTING_FUNCS 8, 8 + +WEIGHTING_FUNCS 2, 10 +WEIGHTING_FUNCS 4, 10 +WEIGHTING_FUNCS 6, 10 +WEIGHTING_FUNCS 8, 10 -%if %3 > 8 - %assign pixel_max ((1 << %3) - 1) - %define pw_pixel_max pw_ %+ pixel_max - pxor m4, m4 - mova m5, [pw_pixel_max] +WEIGHTING_FUNCS 2, 12 +WEIGHTING_FUNCS 4, 12 +WEIGHTING_FUNCS 6, 12 +WEIGHTING_FUNCS 8, 12 - shl offset0d, %3 - 8 -%if %1 - shl offset1d, %3 - 8 -%endif -%endif +HEVC_PUT_HEVC_PEL_PIXELS 2, 8 +HEVC_PUT_HEVC_PEL_PIXELS 4, 8 +HEVC_PUT_HEVC_PEL_PIXELS 6, 8 +HEVC_PUT_HEVC_PEL_PIXELS 8, 8 +HEVC_PUT_HEVC_PEL_PIXELS 12, 8 +HEVC_PUT_HEVC_PEL_PIXELS 16, 8 -%if %1 - lea offset0d, [offset0d + offset1d + 1] -%else - lea offset0d, [2 * offset0d + 1] -%endif - movd m1, offset0d - SPLATD m1 - pslld m1, m0 - psrad m1, 1 +HEVC_PUT_HEVC_PEL_PIXELS 2, 10 +HEVC_PUT_HEVC_PEL_PIXELS 4, 10 +HEVC_PUT_HEVC_PEL_PIXELS 6, 10 +HEVC_PUT_HEVC_PEL_PIXELS 8, 10 - movd m2, weight0d - SPLATD m2 -%if %1 - movd m3, weight1d - SPLATD m3 -%endif +HEVC_PUT_HEVC_PEL_PIXELS 2, 12 +HEVC_PUT_HEVC_PEL_PIXELS 4, 12 +HEVC_PUT_HEVC_PEL_PIXELS 6, 12 +HEVC_PUT_HEVC_PEL_PIXELS 8, 12 -.loop: -%assign i 0 -%rep (%2 + 3) / 4 +HEVC_PUT_HEVC_EPEL 2, 8 +HEVC_PUT_HEVC_EPEL 4, 8 +HEVC_PUT_HEVC_EPEL 6, 8 +HEVC_PUT_HEVC_EPEL 8, 8 +HEVC_PUT_HEVC_EPEL 12, 8 +HEVC_PUT_HEVC_EPEL 16, 8 - pmovsxwd m6, [src0q + 8 * i] - pmulld m6, m2 -%if %1 - pmovsxwd m7, [src1q + 8 * i] - pmulld m7, m3 - paddd m6, m7 -%endif +HEVC_PUT_HEVC_EPEL 2, 10 +HEVC_PUT_HEVC_EPEL 4, 10 +HEVC_PUT_HEVC_EPEL 6, 10 +HEVC_PUT_HEVC_EPEL 8, 10 - paddd m6, m1 - psrad m6, m0 +HEVC_PUT_HEVC_EPEL 2, 12 +HEVC_PUT_HEVC_EPEL 4, 12 +HEVC_PUT_HEVC_EPEL 6, 12 
+HEVC_PUT_HEVC_EPEL 8, 12 - packssdw m6, m6 +HEVC_PUT_HEVC_EPEL_HV 2, 8 +HEVC_PUT_HEVC_EPEL_HV 4, 8 +HEVC_PUT_HEVC_EPEL_HV 6, 8 +HEVC_PUT_HEVC_EPEL_HV 8, 8 +HEVC_PUT_HEVC_EPEL_HV 16, 8 -%if %3 > 8 - CLIPW m6, m4, m5 - movq [dstq + 8 * i], m6 -%else - packuswb m6, m6 - movd [dstq + 4 * i], m6 -%endif +HEVC_PUT_HEVC_EPEL_HV 2, 10 +HEVC_PUT_HEVC_EPEL_HV 4, 10 +HEVC_PUT_HEVC_EPEL_HV 6, 10 +HEVC_PUT_HEVC_EPEL_HV 8, 10 -%assign i (i + 1) -%endrep +HEVC_PUT_HEVC_EPEL_HV 2, 12 +HEVC_PUT_HEVC_EPEL_HV 4, 12 +HEVC_PUT_HEVC_EPEL_HV 6, 12 +HEVC_PUT_HEVC_EPEL_HV 8, 12 - add dstq, dststrideq - add src0q, srcstrideq -%if %1 - add src1q, srcstrideq -%endif +HEVC_PUT_HEVC_QPEL 4, 8 +HEVC_PUT_HEVC_QPEL 8, 8 +HEVC_PUT_HEVC_QPEL 12, 8 +HEVC_PUT_HEVC_QPEL 16, 8 - dec heightd - jg .loop - RET -%endmacro +HEVC_PUT_HEVC_QPEL 4, 10 +HEVC_PUT_HEVC_QPEL 8, 10 -%if ARCH_X86_64 -INIT_XMM sse4 -PUT_WEIGHTED_PRED 0, 4, 8 -PUT_WEIGHTED_PRED 1, 4, 8 -PUT_WEIGHTED_PRED 0, 8, 8 -PUT_WEIGHTED_PRED 1, 8, 8 -PUT_WEIGHTED_PRED 0, 12, 8 -PUT_WEIGHTED_PRED 1, 12, 8 -PUT_WEIGHTED_PRED 0, 16, 8 -PUT_WEIGHTED_PRED 1, 16, 8 -PUT_WEIGHTED_PRED 0, 24, 8 -PUT_WEIGHTED_PRED 1, 24, 8 -PUT_WEIGHTED_PRED 0, 32, 8 -PUT_WEIGHTED_PRED 1, 32, 8 -PUT_WEIGHTED_PRED 0, 48, 8 -PUT_WEIGHTED_PRED 1, 48, 8 -PUT_WEIGHTED_PRED 0, 64, 8 -PUT_WEIGHTED_PRED 1, 64, 8 - -PUT_WEIGHTED_PRED 0, 4, 10 -PUT_WEIGHTED_PRED 1, 4, 10 -PUT_WEIGHTED_PRED 0, 8, 10 -PUT_WEIGHTED_PRED 1, 8, 10 -PUT_WEIGHTED_PRED 0, 12, 10 -PUT_WEIGHTED_PRED 1, 12, 10 -PUT_WEIGHTED_PRED 0, 16, 10 -PUT_WEIGHTED_PRED 1, 16, 10 -PUT_WEIGHTED_PRED 0, 24, 10 -PUT_WEIGHTED_PRED 1, 24, 10 -PUT_WEIGHTED_PRED 0, 32, 10 -PUT_WEIGHTED_PRED 1, 32, 10 -PUT_WEIGHTED_PRED 0, 48, 10 -PUT_WEIGHTED_PRED 1, 48, 10 -PUT_WEIGHTED_PRED 0, 64, 10 -PUT_WEIGHTED_PRED 1, 64, 10 +HEVC_PUT_HEVC_QPEL 4, 12 +HEVC_PUT_HEVC_QPEL 8, 12 + +HEVC_PUT_HEVC_QPEL_HV 2, 8 +HEVC_PUT_HEVC_QPEL_HV 4, 8 +HEVC_PUT_HEVC_QPEL_HV 6, 8 +HEVC_PUT_HEVC_QPEL_HV 8, 8 + +HEVC_PUT_HEVC_QPEL_HV 2, 10 +HEVC_PUT_HEVC_QPEL_HV 4, 10 +HEVC_PUT_HEVC_QPEL_HV 6, 10 +HEVC_PUT_HEVC_QPEL_HV 8, 10 + +HEVC_PUT_HEVC_QPEL_HV 2, 12 +HEVC_PUT_HEVC_QPEL_HV 4, 12 +HEVC_PUT_HEVC_QPEL_HV 6, 12 +HEVC_PUT_HEVC_QPEL_HV 8, 12 + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 ; adds ff_ and _avx2 to function name & enables 256b registers : m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0 + +HEVC_PUT_HEVC_PEL_PIXELS 32, 8 +HEVC_PUT_HEVC_PEL_PIXELS 16, 10 + +HEVC_PUT_HEVC_EPEL 32, 8 +HEVC_PUT_HEVC_EPEL 16, 10 + +HEVC_PUT_HEVC_EPEL_HV 16, 10 +HEVC_PUT_HEVC_EPEL_HV 32, 8 + +HEVC_PUT_HEVC_QPEL 32, 8 + +HEVC_PUT_HEVC_QPEL 16, 10 + +HEVC_PUT_HEVC_QPEL_HV 16, 10 +%endif ;AVX2 %endif ; ARCH_X86_64 |
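
For reference, the pel_pixels kernels above do no filtering at all: MC_PIXEL_COMPUTE widens each source sample and shifts it left by 14 - bitdepth into the 14-bit intermediate format, and LOOP_END advances dst by 2*MAX_PB_SIZE bytes, i.e. one row of the fixed 64-sample-wide intermediate buffer. A minimal scalar sketch of the same computation (helper name hypothetical; C used for illustration):

    #include <stdint.h>
    #include <stddef.h>

    #define MAX_PB_SIZE 64  /* matches "%define MAX_PB_SIZE 64" above */

    /* Hypothetical scalar equivalent of hevc_put_hevc_pel_pixels<W>_<D>:
     * dst is the int16_t intermediate buffer with a fixed row stride of
     * MAX_PB_SIZE samples; srcstride is in bytes, as in the asm. */
    static void put_pel_pixels_ref(int16_t *dst, const uint8_t *src,
                                   ptrdiff_t srcstride, int width, int height,
                                   int bitdepth)
    {
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++) {
                int sample = bitdepth == 8 ? src[x]
                                           : ((const uint16_t *)src)[x];
                dst[x] = sample << (14 - bitdepth);  /* psllw m0, 14-%2 */
            }
            dst += MAX_PB_SIZE;   /* LOOP_END: add dstq, 2*MAX_PB_SIZE */
            src += srcstride;
        }
    }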
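The uni and bi output paths avoid an explicit add-and-shift: UNI_COMPUTE rounds with a single pmulhrsw against the power-of-two constants aliased at the top of the file (pw_8 = pw_512, pw_10 = pw_2048, pw_12 = pw_8192, pw_bi_8 = 256, and so on). Since pmulhrsw computes (x*w + (1 << 14)) >> 15, multiplying by 2^(15-s) is exactly a rounded arithmetic shift by s. A one-line scalar sanity check:

    /* pmulhrsw x, 512 == (x + 32) >> 6, the 14-8 = 6 bit rounding shift for
     * 8-bit uni output; 2048 gives >> 4 (10-bit), 8192 gives >> 2 (12-bit),
     * and the pw_bi_* constants give the one-bit-larger bi shifts. */
    static int16_t pmulhrsw_scalar(int16_t x, int16_t w)
    {
        return (int16_t)((x * (int32_t)w + (1 << 14)) >> 15);
    }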
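Likewise, the uni_w weighting kernels compute dst = clip(((src * wx + (1 << (shift-1))) >> shift) + (ox << (bitdepth-8))) with shift = denom + 14 - bitdepth, as set up by the lea/pslld/psrad sequence in WEIGHTING_FUNCS. A scalar sketch under the same conventions (names hypothetical; dst written sample-addressed for brevity, whereas the 8-bit asm stores bytes):

    static int clip_pixel(int v, int bitdepth)
    {
        int max = (1 << bitdepth) - 1;
        return v < 0 ? 0 : v > max ? max : v;
    }

    /* Hypothetical scalar equivalent of hevc_put_hevc_uni_w<W>_<D>;
     * src rows are MAX_PB_SIZE int16_t apart, as in the asm
     * ("add srcq, 2*MAX_PB_SIZE"). */
    static void uni_weight_ref(uint16_t *dst, ptrdiff_t dststride,
                               const int16_t *src, int width, int height,
                               int denom, int wx, int ox, int bitdepth)
    {
        int shift  = denom + 14 - bitdepth;  /* lea SHIFT, [SHIFT+14-%2] */
        int offset = 1 << (shift - 1);       /* pd_1 shifted left by shift-1 */
        ox <<= bitdepth - 8;                 /* shl SHIFT, %2-8 (no-op at 8 bit) */
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++)
                dst[x] = clip_pixel(((src[x] * wx + offset) >> shift) + ox,
                                    bitdepth);
            dst += dststride;
            src += MAX_PB_SIZE;
        }
    }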