diff options
author | Pierre Edouard Lepere <pierre-edouard.lepere@insa-rennes.fr> | 2015-02-05 19:20:39 +0000 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2015-02-06 17:20:47 +0100 |
commit | a0d1300f7112e9e4577379669bc44660bb20bda2 (patch) | |
tree | 248ee3cb6d9dfc88a6ac5f23c874b779c3ade685 /libavcodec/x86/hevcdsp.h | |
parent | f968166439e4d4fc9f352ea20b8922d42ca5c7b1 (diff) | |
download | ffmpeg-a0d1300f7112e9e4577379669bc44660bb20bda2.tar.gz |
x86: hevc_mc: add AVX2 optimizations
before:
33304 decicycles in luma_bi_1, 523066 runs, 1222 skips
38138 decicycles in luma_bi_2, 523427 runs, 861 skips
13490 decicycles in luma_uni, 516138 runs, 8150 skips
after:
20185 decicycles in luma_bi_1, 519970 runs, 4318 skips
24620 decicycles in luma_bi_2, 521024 runs, 3264 skips
10397 decicycles in luma_uni, 515715 runs, 8573 skips
Conflicts:
libavcodec/x86/hevc_mc.asm
libavcodec/x86/hevcdsp_init.c
Reviewed-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86/hevcdsp.h')
-rw-r--r-- | libavcodec/x86/hevcdsp.h | 105 |
1 files changed, 105 insertions, 0 deletions
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index 8dea1428f0..7864163bca 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -96,6 +96,40 @@ void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dstst
 EPEL_PROTOTYPES(pel_pixels , 8, sse4);
 EPEL_PROTOTYPES(pel_pixels , 10, sse4);
 EPEL_PROTOTYPES(pel_pixels , 12, sse4);
+
+void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+
+
+void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); //used for 10bit
+void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);//used for 10bit
+
+
+void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
 ///////////////////////////////////////////////////////////////////////////////
 // EPEL
 ///////////////////////////////////////////////////////////////////////////////
@@ -111,6 +145,42 @@ EPEL_PROTOTYPES(epel_hv , 8, sse4);
 EPEL_PROTOTYPES(epel_hv , 10, sse4);
 EPEL_PROTOTYPES(epel_hv , 12, sse4);
 
+PEL_PROTOTYPE(epel_h16, 8, avx2);
+PEL_PROTOTYPE(epel_h24, 8, avx2);
+PEL_PROTOTYPE(epel_h32, 8, avx2);
+PEL_PROTOTYPE(epel_h48, 8, avx2);
+PEL_PROTOTYPE(epel_h64, 8, avx2);
+
+PEL_PROTOTYPE(epel_h16,10, avx2);
+PEL_PROTOTYPE(epel_h24,10, avx2);
+PEL_PROTOTYPE(epel_h32,10, avx2);
+PEL_PROTOTYPE(epel_h48,10, avx2);
+PEL_PROTOTYPE(epel_h64,10, avx2);
+
+PEL_PROTOTYPE(epel_v16, 8, avx2);
+PEL_PROTOTYPE(epel_v24, 8, avx2);
+PEL_PROTOTYPE(epel_v32, 8, avx2);
+PEL_PROTOTYPE(epel_v48, 8, avx2);
+PEL_PROTOTYPE(epel_v64, 8, avx2);
+
+PEL_PROTOTYPE(epel_v16,10, avx2);
+PEL_PROTOTYPE(epel_v24,10, avx2);
+PEL_PROTOTYPE(epel_v32,10, avx2);
+PEL_PROTOTYPE(epel_v48,10, avx2);
+PEL_PROTOTYPE(epel_v64,10, avx2);
+
+PEL_PROTOTYPE(epel_hv16, 8, avx2);
+PEL_PROTOTYPE(epel_hv24, 8, avx2);
+PEL_PROTOTYPE(epel_hv32, 8, avx2);
+PEL_PROTOTYPE(epel_hv48, 8, avx2);
+PEL_PROTOTYPE(epel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(epel_hv16,10, avx2);
+PEL_PROTOTYPE(epel_hv24,10, avx2);
+PEL_PROTOTYPE(epel_hv32,10, avx2);
+PEL_PROTOTYPE(epel_hv48,10, avx2);
+PEL_PROTOTYPE(epel_hv64,10, avx2);
+
 ///////////////////////////////////////////////////////////////////////////////
 // QPEL
 ///////////////////////////////////////////////////////////////////////////////
@@ -126,6 +196,41 @@ QPEL_PROTOTYPES(qpel_hv, 8, sse4);
 QPEL_PROTOTYPES(qpel_hv, 10, sse4);
 QPEL_PROTOTYPES(qpel_hv, 12, sse4);
 
+PEL_PROTOTYPE(qpel_h16, 8, avx2);
+PEL_PROTOTYPE(qpel_h24, 8, avx2);
+PEL_PROTOTYPE(qpel_h32, 8, avx2);
+PEL_PROTOTYPE(qpel_h48, 8, avx2);
+PEL_PROTOTYPE(qpel_h64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_h16,10, avx2);
+PEL_PROTOTYPE(qpel_h24,10, avx2);
+PEL_PROTOTYPE(qpel_h32,10, avx2);
+PEL_PROTOTYPE(qpel_h48,10, avx2);
+PEL_PROTOTYPE(qpel_h64,10, avx2);
+
+PEL_PROTOTYPE(qpel_v16, 8, avx2);
+PEL_PROTOTYPE(qpel_v24, 8, avx2);
+PEL_PROTOTYPE(qpel_v32, 8, avx2);
+PEL_PROTOTYPE(qpel_v48, 8, avx2);
+PEL_PROTOTYPE(qpel_v64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_v16,10, avx2);
+PEL_PROTOTYPE(qpel_v24,10, avx2);
+PEL_PROTOTYPE(qpel_v32,10, avx2);
+PEL_PROTOTYPE(qpel_v48,10, avx2);
+PEL_PROTOTYPE(qpel_v64,10, avx2);
+
+PEL_PROTOTYPE(qpel_hv16, 8, avx2);
+PEL_PROTOTYPE(qpel_hv24, 8, avx2);
+PEL_PROTOTYPE(qpel_hv32, 8, avx2);
+PEL_PROTOTYPE(qpel_hv48, 8, avx2);
+PEL_PROTOTYPE(qpel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_hv16,10, avx2);
+PEL_PROTOTYPE(qpel_hv24,10, avx2);
+PEL_PROTOTYPE(qpel_hv32,10, avx2);
+PEL_PROTOTYPE(qpel_hv48,10, avx2);
+PEL_PROTOTYPE(qpel_hv64,10, avx2);
 WEIGHTING_PROTOTYPES(8, sse4);
 WEIGHTING_PROTOTYPES(10, sse4);