path: root/libavcodec/x86
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--  libavcodec/x86/Makefile | 102
-rw-r--r--  libavcodec/x86/aacencdsp.asm | 86
-rw-r--r--  libavcodec/x86/aacencdsp_init.c | 43
-rw-r--r--  libavcodec/x86/aacpsdsp.asm | 209
-rw-r--r--  libavcodec/x86/aacpsdsp_init.c | 55
-rw-r--r--  libavcodec/x86/ac3dsp.asm | 10
-rw-r--r--  libavcodec/x86/ac3dsp_downmix.asm | 8
-rw-r--r--  libavcodec/x86/ac3dsp_init.c | 8
-rw-r--r--  libavcodec/x86/alacdsp.asm | 133
-rw-r--r--  libavcodec/x86/alacdsp_init.c | 44
-rw-r--r--  libavcodec/x86/audiodsp.asm | 23
-rw-r--r--  libavcodec/x86/audiodsp_init.c | 9
-rw-r--r--  libavcodec/x86/blockdsp.asm | 86
-rw-r--r--  libavcodec/x86/blockdsp.c | 118
-rw-r--r--  libavcodec/x86/blockdsp_init.c | 54
-rw-r--r--  libavcodec/x86/bswapdsp.asm | 17
-rw-r--r--  libavcodec/x86/bswapdsp_init.c | 8
-rw-r--r--  libavcodec/x86/cabac.h | 60
-rw-r--r--  libavcodec/x86/cavsdsp.c | 147
-rw-r--r--  libavcodec/x86/constants.c | 71
-rw-r--r--  libavcodec/x86/constants.h | 46
-rw-r--r--  libavcodec/x86/dca.h | 60
-rw-r--r--  libavcodec/x86/dcadsp.asm | 537
-rw-r--r--  libavcodec/x86/dcadsp_init.c | 88
-rw-r--r--  libavcodec/x86/dct32.asm | 10
-rw-r--r--  libavcodec/x86/dct_init.c | 8
-rw-r--r--  libavcodec/x86/dirac_dwt.asm | 307
-rw-r--r--  libavcodec/x86/dirac_dwt_init.c | 229
-rw-r--r--  libavcodec/x86/diracdsp.asm | 347
-rw-r--r--  libavcodec/x86/diracdsp_init.c | 195
-rw-r--r--  libavcodec/x86/dnxhdenc.asm | 8
-rw-r--r--  libavcodec/x86/dnxhdenc_init.c | 8
-rw-r--r--  libavcodec/x86/fdct.c | 12
-rw-r--r--  libavcodec/x86/fdct.h | 8
-rw-r--r--  libavcodec/x86/fdctdsp_init.c | 8
-rw-r--r--  libavcodec/x86/fft.asm | 158
-rw-r--r--  libavcodec/x86/fft.h | 16
-rw-r--r--  libavcodec/x86/fft_init.c | 18
-rw-r--r--  libavcodec/x86/flac_dsp_gpl.asm | 101
-rw-r--r--  libavcodec/x86/flacdsp.asm | 313
-rw-r--r--  libavcodec/x86/flacdsp_init.c | 115
-rw-r--r--  libavcodec/x86/fmtconvert.asm | 55
-rw-r--r--  libavcodec/x86/fmtconvert_init.c | 14
-rw-r--r--  libavcodec/x86/fpel.asm | 134
-rw-r--r--  libavcodec/x86/fpel.h | 18
-rw-r--r--  libavcodec/x86/fpel_mmx.c | 140
-rw-r--r--  libavcodec/x86/g722dsp.asm | 54
-rw-r--r--  libavcodec/x86/g722dsp_init.c | 35
-rw-r--r--  libavcodec/x86/h263_loopfilter.asm | 10
-rw-r--r--  libavcodec/x86/h263dsp_init.c | 8
-rw-r--r--  libavcodec/x86/h264_chromamc.asm | 8
-rw-r--r--  libavcodec/x86/h264_chromamc_10bit.asm | 12
-rw-r--r--  libavcodec/x86/h264_deblock.asm | 404
-rw-r--r--  libavcodec/x86/h264_deblock_10bit.asm | 174
-rw-r--r--  libavcodec/x86/h264_i386.h | 44
-rw-r--r--  libavcodec/x86/h264_idct.asm | 40
-rw-r--r--  libavcodec/x86/h264_idct_10bit.asm | 86
-rw-r--r--  libavcodec/x86/h264_intrapred.asm | 50
-rw-r--r--  libavcodec/x86/h264_intrapred_10bit.asm | 55
-rw-r--r--  libavcodec/x86/h264_intrapred_init.c | 15
-rw-r--r--  libavcodec/x86/h264_qpel.c | 48
-rw-r--r--  libavcodec/x86/h264_qpel_10bit.asm | 16
-rw-r--r--  libavcodec/x86/h264_qpel_8bit.asm | 8
-rw-r--r--  libavcodec/x86/h264_weight.asm | 8
-rw-r--r--  libavcodec/x86/h264_weight_10bit.asm | 11
-rw-r--r--  libavcodec/x86/h264chroma_init.c | 8
-rw-r--r--  libavcodec/x86/h264dsp_init.c | 136
-rw-r--r--  libavcodec/x86/hevc_add_res.asm | 66
-rw-r--r--  libavcodec/x86/hevc_deblock.asm | 456
-rw-r--r--  libavcodec/x86/hevc_idct.asm | 15
-rw-r--r--  libavcodec/x86/hevc_mc.asm | 2273
-rw-r--r--  libavcodec/x86/hevc_sao.asm | 340
-rw-r--r--  libavcodec/x86/hevc_sao_10bit.asm | 370
-rw-r--r--  libavcodec/x86/hevcdsp.h | 259
-rw-r--r--  libavcodec/x86/hevcdsp_init.c | 1318
-rw-r--r--  libavcodec/x86/hpeldsp.asm | 318
-rw-r--r--  libavcodec/x86/hpeldsp.h | 25
-rw-r--r--  libavcodec/x86/hpeldsp_init.c | 86
-rw-r--r--  libavcodec/x86/hpeldsp_mmx.c | 53
-rw-r--r--  libavcodec/x86/hpeldsp_rnd_template.c | 57
-rw-r--r--  libavcodec/x86/hpeldsp_vp3.asm | 8
-rw-r--r--  libavcodec/x86/hpeldsp_vp3_init.c | 22
-rw-r--r--  libavcodec/x86/huffyuvdsp.asm | 255
-rw-r--r--  libavcodec/x86/huffyuvdsp_init.c | 121
-rw-r--r--  libavcodec/x86/huffyuvencdsp.asm | 143
-rw-r--r--  libavcodec/x86/huffyuvencdsp_init.c | 54
-rw-r--r--  libavcodec/x86/idctdsp.asm | 183
-rw-r--r--  libavcodec/x86/idctdsp.h | 15
-rw-r--r--  libavcodec/x86/idctdsp_init.c | 62
-rw-r--r--  libavcodec/x86/idctdsp_mmx.c | 168
-rw-r--r--  libavcodec/x86/imdct36.asm | 42
-rw-r--r--  libavcodec/x86/inline_asm.h | 10
-rw-r--r--  libavcodec/x86/jpeg2000dsp.asm | 144
-rw-r--r--  libavcodec/x86/jpeg2000dsp_init.c | 50
-rw-r--r--  libavcodec/x86/lossless_audiodsp.asm (renamed from libavcodec/x86/apedsp.asm) | 55
-rw-r--r--  libavcodec/x86/lossless_audiodsp_init.c (renamed from libavcodec/x86/apedsp_init.c) | 21
-rw-r--r--  libavcodec/x86/lossless_videodsp.asm | 290
-rw-r--r--  libavcodec/x86/lossless_videodsp_init.c | 118
-rw-r--r--  libavcodec/x86/lossless_videoencdsp.asm | 150
-rw-r--r--  libavcodec/x86/lossless_videoencdsp_init.c (renamed from libavcodec/x86/huffyuvencdsp_mmx.c) | 69
-rw-r--r--  libavcodec/x86/lpc.c | 14
-rw-r--r--  libavcodec/x86/mathops.h | 10
-rw-r--r--  libavcodec/x86/mdct.h | 32
-rw-r--r--  libavcodec/x86/mdct_init.c | 51
-rw-r--r--  libavcodec/x86/me_cmp.asm | 646
-rw-r--r--  libavcodec/x86/me_cmp_init.c | 938
-rw-r--r--  libavcodec/x86/mlpdsp.asm | 196
-rw-r--r--  libavcodec/x86/mlpdsp_init.c (renamed from libavcodec/x86/mlpdsp.c) | 43
-rw-r--r--  libavcodec/x86/mpegaudiodsp.c | 51
-rw-r--r--  libavcodec/x86/mpegvideo.c | 75
-rw-r--r--  libavcodec/x86/mpegvideodsp.c | 22
-rw-r--r--  libavcodec/x86/mpegvideoenc.c | 26
-rw-r--r--  libavcodec/x86/mpegvideoenc_qns_template.c | 12
-rw-r--r--  libavcodec/x86/mpegvideoenc_template.c | 71
-rw-r--r--  libavcodec/x86/mpegvideoencdsp.asm | 151
-rw-r--r--  libavcodec/x86/mpegvideoencdsp_init.c | 49
-rw-r--r--  libavcodec/x86/pixblockdsp.asm | 59
-rw-r--r--  libavcodec/x86/pixblockdsp_init.c | 11
-rw-r--r--  libavcodec/x86/pngdsp.asm | 22
-rw-r--r--  libavcodec/x86/pngdsp_init.c | 8
-rw-r--r--  libavcodec/x86/proresdsp.asm | 421
-rw-r--r--  libavcodec/x86/proresdsp_init.c | 17
-rw-r--r--  libavcodec/x86/qpel.asm | 8
-rw-r--r--  libavcodec/x86/qpeldsp.asm | 13
-rw-r--r--  libavcodec/x86/qpeldsp_init.c | 18
-rw-r--r--  libavcodec/x86/rnd_template.c | 52
-rw-r--r--  libavcodec/x86/rv34dsp.asm | 27
-rw-r--r--  libavcodec/x86/rv34dsp_init.c | 13
-rw-r--r--  libavcodec/x86/rv40dsp.asm | 8
-rw-r--r--  libavcodec/x86/rv40dsp_init.c | 74
-rw-r--r--  libavcodec/x86/sbrdsp.asm | 317
-rw-r--r--  libavcodec/x86/sbrdsp_init.c | 38
-rw-r--r--  libavcodec/x86/simple_idct.c | 12
-rw-r--r--  libavcodec/x86/simple_idct.h | 20
-rw-r--r--  libavcodec/x86/simple_idct10.asm | 100
-rw-r--r--  libavcodec/x86/simple_idct10_template.asm | 315
-rw-r--r--  libavcodec/x86/snowdsp.c | 908
-rw-r--r--  libavcodec/x86/svq1enc.asm | 61
-rw-r--r--  libavcodec/x86/svq1enc.c | 73
-rw-r--r--  libavcodec/x86/svq1enc_init.c | 42
-rw-r--r--  libavcodec/x86/synth_filter.asm | 246
-rw-r--r--  libavcodec/x86/synth_filter_init.c | 74
-rw-r--r--  libavcodec/x86/takdsp.asm | 116
-rw-r--r--  libavcodec/x86/takdsp_init.c | 45
-rw-r--r--  libavcodec/x86/ttadsp.asm | 119
-rw-r--r--  libavcodec/x86/ttadsp_init.c | 42
-rw-r--r--  libavcodec/x86/ttaencdsp.asm | 119
-rw-r--r--  libavcodec/x86/ttaencdsp_init.c | 42
-rw-r--r--  libavcodec/x86/v210-init.c | 48
-rw-r--r--  libavcodec/x86/v210.asm | 90
-rw-r--r--  libavcodec/x86/v210enc.asm | 38
-rw-r--r--  libavcodec/x86/v210enc_init.c | 8
-rw-r--r--  libavcodec/x86/vc1dsp.h | 8
-rw-r--r--  libavcodec/x86/vc1dsp_init.c | 50
-rw-r--r--  libavcodec/x86/vc1dsp_loopfilter.asm (renamed from libavcodec/x86/vc1dsp.asm) | 10
-rw-r--r--  libavcodec/x86/vc1dsp_mc.asm | 292
-rw-r--r--  libavcodec/x86/vc1dsp_mmx.c | 436
-rw-r--r--  libavcodec/x86/videodsp.asm | 100
-rw-r--r--  libavcodec/x86/videodsp_init.c | 95
-rw-r--r--  libavcodec/x86/vorbisdsp.asm | 8
-rw-r--r--  libavcodec/x86/vorbisdsp_init.c | 8
-rw-r--r--  libavcodec/x86/vp3dsp.asm | 52
-rw-r--r--  libavcodec/x86/vp3dsp_init.c | 20
-rw-r--r--  libavcodec/x86/vp56_arith.h | 23
-rw-r--r--  libavcodec/x86/vp6dsp.asm | 8
-rw-r--r--  libavcodec/x86/vp6dsp_init.c | 8
-rw-r--r--  libavcodec/x86/vp8dsp.asm | 27
-rw-r--r--  libavcodec/x86/vp8dsp_init.c | 25
-rw-r--r--  libavcodec/x86/vp8dsp_loopfilter.asm | 8
-rw-r--r--  libavcodec/x86/vp9dsp_init.c | 527
-rw-r--r--  libavcodec/x86/vp9dsp_init.h | 189
-rw-r--r--  libavcodec/x86/vp9dsp_init_10bpp.c | 25
-rw-r--r--  libavcodec/x86/vp9dsp_init_12bpp.c | 25
-rw-r--r--  libavcodec/x86/vp9dsp_init_16bpp.c | 141
-rw-r--r--  libavcodec/x86/vp9dsp_init_16bpp_template.c | 240
-rw-r--r--  libavcodec/x86/vp9intrapred.asm | 2044
-rw-r--r--  libavcodec/x86/vp9intrapred_16bpp.asm | 2174
-rw-r--r--  libavcodec/x86/vp9itxfm.asm | 3197
-rw-r--r--  libavcodec/x86/vp9itxfm_16bpp.asm | 2044
-rw-r--r--  libavcodec/x86/vp9itxfm_template.asm | 142
-rw-r--r--  libavcodec/x86/vp9lpf.asm | 251
-rw-r--r--  libavcodec/x86/vp9lpf_16bpp.asm | 823
-rw-r--r--  libavcodec/x86/vp9mc.asm | 105
-rw-r--r--  libavcodec/x86/vp9mc_16bpp.asm | 431
-rw-r--r--  libavcodec/x86/w64xmmtest.c | 8
-rw-r--r--  libavcodec/x86/xvididct.asm | 983
-rw-r--r--  libavcodec/x86/xvididct.h | 12
-rw-r--r--  libavcodec/x86/xvididct_init.c | 56
-rw-r--r--  libavcodec/x86/xvididct_mmx.c | 548
-rw-r--r--  libavcodec/x86/xvididct_sse2.c | 405
190 files changed, 27963 insertions, 6686 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 094c1fa517..0295a9f8f7 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -3,11 +3,14 @@ OBJS += x86/constants.o \
# subsystems
OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o
OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp_init.o
-OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp.o
+OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_init.o
OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o
OBJS-$(CONFIG_DCT) += x86/dct_init.o
+OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \
+ x86/dirac_dwt_init.o
OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o
OBJS-$(CONFIG_FFT) += x86/fft_init.o
+OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o
OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert_init.o
OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o
OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
@@ -15,11 +18,13 @@ OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o
OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o
OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o
OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o
+OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp_init.o
+OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp_init.o
+OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp_init.o
OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o
-OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_mmx.o
+OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_init.o
OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o
OBJS-$(CONFIG_LPC) += x86/lpc.o
-OBJS-$(CONFIG_MDCT) += x86/mdct_init.o
OBJS-$(CONFIG_ME_CMP) += x86/me_cmp_init.o
OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o
OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o \
@@ -36,44 +41,56 @@ OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp_init.o
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
# decoders/encoders
-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o
-OBJS-$(CONFIG_APE_DECODER) += x86/apedsp_init.o
+OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp_init.o \
+ x86/sbrdsp_init.o
+OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp_init.o
+OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp_init.o
+OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp_init.o
+OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp_init.o
+OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o
+OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o
-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
+OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o
+OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp_init.o
OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_init.o
OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o
+OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o
-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o
-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
+OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o
+OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o
+OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o
+OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp_init.o
+OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp_init.o
+OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o
OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc_init.o
OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o
OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3_init.o
OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o
-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o
+OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o \
+ x86/vp9dsp_init_10bpp.o \
+ x86/vp9dsp_init_12bpp.o \
+ x86/vp9dsp_init_16bpp.o
+OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o
# GCC inline assembly optimizations
# subsystems
MMX-OBJS-$(CONFIG_FDCTDSP) += x86/fdct.o
-MMX-OBJS-$(CONFIG_HPELDSP) += x86/fpel_mmx.o \
- x86/hpeldsp_mmx.o
-MMX-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_mmx.o \
- x86/simple_idct.o
-MMX-OBJS-$(CONFIG_QPELDSP) += x86/fpel_mmx.o
+MMX-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct.o
MMX-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_mmx.o
# decoders/encoders
-MMX-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_mmx.o \
- x86/xvididct_sse2.o
+MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o
+MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o
# subsystems
YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o \
x86/ac3dsp_downmix.o
YASM-OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp.o
+YASM-OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp.o
YASM-OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp.o
YASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
YASM-OBJS-$(CONFIG_FFT) += x86/fft.o
@@ -96,6 +113,11 @@ YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
YASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \
x86/hpeldsp.o
YASM-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp.o
+YASM-OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp.o
+YASM-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp.o
+YASM-OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp.o
+YASM-OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp.o
+YASM-OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp.o
YASM-OBJS-$(CONFIG_ME_CMP) += x86/me_cmp.o
YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
YASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o
@@ -104,27 +126,59 @@ YASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \
x86/fpel.o \
x86/qpel.o
YASM-OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp.o
-YASM-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp.o
+YASM-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_loopfilter.o \
+ x86/vc1dsp_mc.o
+YASM-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct10.o
YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
YASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o \
x86/vp8dsp_loopfilter.o
# decoders/encoders
-YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
-YASM-OBJS-$(CONFIG_APE_DECODER) += x86/apedsp.o
-YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o
+YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp.o \
+ x86/sbrdsp.o
+YASM-OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp.o
+YASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
+YASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
+YASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o
+YASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o
+YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o
+YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \
+ x86/dirac_dwt.o
YASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
+YASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o
+ifdef CONFIG_GPL
+YASM-OBJS-$(CONFIG_FLAC_ENCODER) += x86/flac_dsp_gpl.o
+endif
YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o \
x86/hevc_deblock.o \
x86/hevc_idct.o \
- x86/hevc_mc.o
+ x86/hevc_mc.o \
+ x86/hevc_sao.o \
+ x86/hevc_sao_10bit.o
+YASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
+YASM-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
+YASM-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct.o
YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
+YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o
+YASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o
+YASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o
+YASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
+YASM-OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp.o
+YASM-OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp.o
YASM-OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc.o
+YASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
YASM-OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3.o
YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
-YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9mc.o \
- x86/vp9lpf.o
+YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
+ x86/vp9intrapred_16bpp.o \
+ x86/vp9itxfm.o \
+ x86/vp9itxfm_16bpp.o \
+ x86/vp9lpf.o \
+ x86/vp9lpf_16bpp.o \
+ x86/vp9mc.o \
+ x86/vp9mc_16bpp.o
+YASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o
diff --git a/libavcodec/x86/aacencdsp.asm b/libavcodec/x86/aacencdsp.asm
new file mode 100644
index 0000000000..97af571ec8
--- /dev/null
+++ b/libavcodec/x86/aacencdsp.asm
@@ -0,0 +1,86 @@
+;******************************************************************************
+;* SIMD optimized AAC encoder DSP functions
+;*
+;* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+float_abs_mask: times 4 dd 0x7fffffff
+
+SECTION .text
+
+;*******************************************************************
+;void ff_abs_pow34(float *out, const float *in, const int size);
+;*******************************************************************
+INIT_XMM sse
+cglobal abs_pow34, 3, 3, 3, out, in, size
+ mova m2, [float_abs_mask]
+ shl sizeq, 2
+ add inq, sizeq
+ add outq, sizeq
+ neg sizeq
+.loop:
+ andps m0, m2, [inq+sizeq]
+ sqrtps m1, m0
+ mulps m0, m1
+ sqrtps m0, m0
+ mova [outq+sizeq], m0
+ add sizeq, mmsize
+ jl .loop
+ RET
+
+;*******************************************************************
+;void ff_aac_quantize_bands(int *out, const float *in, const float *scaled,
+; int size, int is_signed, int maxval, const float Q34,
+; const float rounding)
+;*******************************************************************
+INIT_XMM sse2
+cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding
+%if UNIX64 == 0
+ movss m0, Q34m
+ movss m1, roundingm
+ cvtsi2ss m3, dword maxvalm
+%else
+ cvtsi2ss m3, maxvald
+%endif
+ shufps m0, m0, 0
+ shufps m1, m1, 0
+ shufps m3, m3, 0
+ shl is_signedd, 31
+ movd m4, is_signedd
+ shufps m4, m4, 0
+ shl sized, 2
+ add inq, sizeq
+ add outq, sizeq
+ add scaledq, sizeq
+ neg sizeq
+.loop:
+ mulps m2, m0, [scaledq+sizeq]
+ addps m2, m1
+ minps m2, m3
+ andps m5, m4, [inq+sizeq]
+ orps m2, m5
+ cvttps2dq m2, m2
+ mova [outq+sizeq], m2
+ add sizeq, mmsize
+ jl .loop
+ RET
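
The SSE loop above avoids a pow() call by computing |x|^(3/4) as sqrt(|x| * sqrt(|x|)). For reference, a scalar C sketch of the same math (illustrative only, not part of the patch; the function name is made up):

#include <math.h>

/* Scalar equivalent of the ff_abs_pow34 SSE loop: |x| * sqrt(|x|) gives
 * |x|^1.5, and a second sqrt yields |x|^0.75. */
static void abs_pow34_scalar(float *out, const float *in, int size)
{
    for (int i = 0; i < size; i++) {
        float a = fabsf(in[i]);
        out[i] = sqrtf(a * sqrtf(a));
    }
}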
diff --git a/libavcodec/x86/aacencdsp_init.c b/libavcodec/x86/aacencdsp_init.c
new file mode 100644
index 0000000000..d761c3c5e6
--- /dev/null
+++ b/libavcodec/x86/aacencdsp_init.c
@@ -0,0 +1,43 @@
+/*
+ * AAC encoder assembly optimizations
+ * Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/float_dsp.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/aacenc.h"
+
+void ff_abs_pow34_sse(float *out, const float *in, const int size);
+
+void ff_aac_quantize_bands_sse2(int *out, const float *in, const float *scaled,
+ int size, int is_signed, int maxval, const float Q34,
+ const float rounding);
+
+av_cold void ff_aac_dsp_init_x86(AACEncContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSE(cpu_flags))
+ s->abs_pow34 = ff_abs_pow34_sse;
+
+ if (EXTERNAL_SSE2(cpu_flags))
+ s->quant_bands = ff_aac_quantize_bands_sse2;
+}
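
The EXTERNAL_SSE()/EXTERNAL_SSE2() checks come from libavutil/x86/cpu.h and pass only when the CPU reports the feature and the build actually contains the external (yasm/nasm) assembly. A simplified sketch of the idea, not the real macro definitions:

/* Illustrative only; the real EXTERNAL_* macros test the HAVE_*_EXTERNAL
 * configure results rather than a bare HAVE_YASM check. */
int cpu_flags = av_get_cpu_flags();
if (HAVE_YASM && (cpu_flags & AV_CPU_FLAG_SSE))
    s->abs_pow34   = ff_abs_pow34_sse;
if (HAVE_YASM && (cpu_flags & AV_CPU_FLAG_SSE2))
    s->quant_bands = ff_aac_quantize_bands_sse2;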
diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm
new file mode 100644
index 0000000000..e92cbbce08
--- /dev/null
+++ b/libavcodec/x86/aacpsdsp.asm
@@ -0,0 +1,209 @@
+;******************************************************************************
+;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
+;*
+;* Copyright (C) 2015 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000
+
+SECTION .text
+
+;*************************************************************************
+;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
+;*************************************************************************
+%macro PS_ADD_SQUARES 1
+cglobal ps_add_squares, 3, 3, %1, dst, src, n
+ shl nd, 3
+ add srcq, nq
+ neg nq
+
+align 16
+.loop:
+ movaps m0, [srcq+nq]
+ movaps m1, [srcq+nq+mmsize]
+ mulps m0, m0
+ mulps m1, m1
+ HADDPS m0, m1, m2
+ addps m0, [dstq]
+ movaps [dstq], m0
+ add dstq, mmsize
+ add nq, mmsize*2
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse
+PS_ADD_SQUARES 2
+INIT_XMM sse3
+PS_ADD_SQUARES 3
+
+;*******************************************************************
+;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
+; float *src1, int n);
+;*******************************************************************
+INIT_XMM sse
+cglobal ps_mul_pair_single, 4, 5, 4, dst, src1, src2, n
+ xor r4q, r4q
+
+.loop:
+ movu m0, [src1q+r4q]
+ movu m1, [src1q+r4q+mmsize]
+ mova m2, [src2q]
+ mova m3, m2
+ unpcklps m2, m2
+ unpckhps m3, m3
+ mulps m0, m2
+ mulps m1, m3
+ mova [dstq+r4q], m0
+ mova [dstq+r4q+mmsize], m1
+ add src2q, mmsize
+ add r4q, mmsize*2
+ sub nd, mmsize/4
+ jg .loop
+ REP_RET
+
+;***********************************************************************
+;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
+; float h[2][4], float h_step[2][4],
+; int len);
+;***********************************************************************
+INIT_XMM sse3
+cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
+ movaps m0, [hq]
+ movaps m1, [h_stepq]
+ cmp nd, 0
+ jle .ret
+ shl nd, 3
+ add lq, nq
+ add rq, nq
+ neg nq
+
+align 16
+.loop:
+ addps m0, m1
+ movddup m2, [lq+nq]
+ movddup m3, [rq+nq]
+ movaps m4, m0
+ movaps m5, m0
+ unpcklps m4, m4
+ unpckhps m5, m5
+ mulps m2, m4
+ mulps m3, m5
+ addps m2, m3
+ movsd [lq+nq], m2
+ movhps [rq+nq], m2
+ add nq, 8
+ jl .loop
+.ret:
+ REP_RET
+
+;*******************************************************************
+;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
+; const float (*filter)[8][2],
+; int stride, int n);
+;*******************************************************************
+%macro PS_HYBRID_ANALYSIS_LOOP 3
+ movu %1, [inq+mmsize*%3]
+ movu m1, [inq+mmsize*(5-%3)+8]
+%if cpuflag(sse3)
+ pshufd %2, %1, q2301
+ pshufd m4, m1, q0123
+ pshufd m1, m1, q1032
+ pshufd m2, [filterq+nq+mmsize*%3], q2301
+ addsubps %2, m4
+ addsubps %1, m1
+%else
+ mova m2, [filterq+nq+mmsize*%3]
+ mova %2, %1
+ mova m4, m1
+ shufps %2, %2, q2301
+ shufps m4, m4, q0123
+ shufps m1, m1, q1032
+ shufps m2, m2, q2301
+ xorps m4, m7
+ xorps m1, m7
+ subps %2, m4
+ subps %1, m1
+%endif
+ mulps %2, m2
+ mulps %1, m2
+%if %3
+ addps m3, %2
+ addps m0, %1
+%endif
+%endmacro
+
+%macro PS_HYBRID_ANALYSIS 0
+cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n
+%if cpuflag(sse3)
+%define MOVH movsd
+%else
+%define MOVH movlps
+%endif
+ shl strided, 3
+ shl nd, 6
+ add filterq, nq
+ neg nq
+ mova m7, [ps_p1m1p1m1]
+
+align 16
+.loop:
+ PS_HYBRID_ANALYSIS_LOOP m0, m3, 0
+ PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
+ PS_HYBRID_ANALYSIS_LOOP m5, m6, 2
+
+%if cpuflag(sse3)
+ pshufd m3, m3, q2301
+ xorps m0, m7
+ hsubps m3, m0
+ pshufd m1, m3, q0020
+ pshufd m3, m3, q0031
+ addps m1, m3
+ movsd m2, [inq+6*8]
+%else
+ mova m1, m3
+ mova m2, m0
+ shufps m1, m1, q2301
+ shufps m2, m2, q2301
+ subps m1, m3
+ addps m2, m0
+ unpcklps m3, m1, m2
+ unpckhps m1, m2
+ addps m1, m3
+ movu m2, [inq+6*8] ; faster than movlps and no risk of overread
+%endif
+ movss m3, [filterq+nq+8*6]
+ SPLATD m3
+ mulps m2, m3
+ addps m1, m2
+ MOVH [outq], m1
+ add outq, strideq
+ add nq, 64
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse
+PS_HYBRID_ANALYSIS
+INIT_XMM sse3
+PS_HYBRID_ANALYSIS
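
ps_add_squares accumulates the squared magnitude of each complex sample into dst; the HADDPS-based loop above corresponds to this scalar sketch (illustrative only, not part of the patch; the name is made up):

/* dst[i] += |src[i]|^2 for each complex (re, im) pair */
static void ps_add_squares_scalar(float *dst, const float (*src)[2], int n)
{
    for (int i = 0; i < n; i++)
        dst[i] += src[i][0] * src[i][0] + src[i][1] * src[i][1];
}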
diff --git a/libavcodec/x86/aacpsdsp_init.c b/libavcodec/x86/aacpsdsp_init.c
new file mode 100644
index 0000000000..f6d6c039c3
--- /dev/null
+++ b/libavcodec/x86/aacpsdsp_init.c
@@ -0,0 +1,55 @@
+/*
+ * SIMD optimized MPEG-4 Parametric Stereo decoding functions
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/x86/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/aacpsdsp.h"
+
+void ff_ps_add_squares_sse (float *dst, const float (*src)[2], int n);
+void ff_ps_add_squares_sse3 (float *dst, const float (*src)[2], int n);
+void ff_ps_mul_pair_single_sse (float (*dst)[2], float (*src0)[2],
+ float *src1, int n);
+void ff_ps_hybrid_analysis_sse (float (*out)[2], float (*in)[2],
+ const float (*filter)[8][2],
+ int stride, int n);
+void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2],
+ const float (*filter)[8][2],
+ int stride, int n);
+void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
+ float h[2][4], float h_step[2][4],
+ int len);
+
+av_cold void ff_psdsp_init_x86(PSDSPContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSE(cpu_flags)) {
+ s->add_squares = ff_ps_add_squares_sse;
+ s->mul_pair_single = ff_ps_mul_pair_single_sse;
+ s->hybrid_analysis = ff_ps_hybrid_analysis_sse;
+ }
+ if (EXTERNAL_SSE3(cpu_flags)) {
+ s->add_squares = ff_ps_add_squares_sse3;
+ s->stereo_interpolate[0] = ff_ps_stereo_interpolate_sse3;
+ s->hybrid_analysis = ff_ps_hybrid_analysis_sse3;
+ }
+}
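
Of the prototypes above, ps_mul_pair_single is the simplest to read back into C: each complex pair in src0 is scaled by the matching real value in src1. A scalar sketch (illustrative only, not part of the patch):

static void ps_mul_pair_single_scalar(float (*dst)[2], float (*src0)[2],
                                      float *src1, int n)
{
    for (int i = 0; i < n; i++) {
        dst[i][0] = src0[i][0] * src1[i];
        dst[i][1] = src0[i][1] * src1[i];
    }
}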
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index 817d5a319c..675ade3101 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -2,20 +2,20 @@
;* x86-optimized AC-3 DSP functions
;* Copyright (c) 2011 Justin Ruggles
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -32,7 +32,7 @@ pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
; used in ff_ac3_extract_exponents()
-pd_1: times 4 dd 1
+cextern pd_1
pd_151: times 4 dd 151
; used in ff_apply_window_int16()
diff --git a/libavcodec/x86/ac3dsp_downmix.asm b/libavcodec/x86/ac3dsp_downmix.asm
index b085035ce3..057cc6061c 100644
--- a/libavcodec/x86/ac3dsp_downmix.asm
+++ b/libavcodec/x86/ac3dsp_downmix.asm
@@ -2,20 +2,20 @@
;* x86-optimized AC-3 downmixing
;* Copyright (c) 2012 Justin Ruggles
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/ac3dsp_init.c b/libavcodec/x86/ac3dsp_init.c
index 6d049b37cf..2e7e2fb6da 100644
--- a/libavcodec/x86/ac3dsp_init.c
+++ b/libavcodec/x86/ac3dsp_init.c
@@ -2,20 +2,20 @@
* x86-optimized AC-3 DSP functions
* Copyright (c) 2011 Justin Ruggles
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/alacdsp.asm b/libavcodec/x86/alacdsp.asm
new file mode 100644
index 0000000000..bb2069f785
--- /dev/null
+++ b/libavcodec/x86/alacdsp.asm
@@ -0,0 +1,133 @@
+;******************************************************************************
+;* ALAC DSP SIMD optimizations
+;*
+;* Copyright (C) 2015 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+INIT_XMM sse4
+%if ARCH_X86_64
+cglobal alac_decorrelate_stereo, 2, 5, 8, buf0, len, shift, weight, buf1
+%else
+cglobal alac_decorrelate_stereo, 2, 3, 8, buf0, len, shift, weight
+%define buf1q r2q
+%endif
+ movd m6, shiftm
+ movd m7, weightm
+ SPLATD m7
+ shl lend, 2
+ mov buf1q, [buf0q + gprsize]
+ mov buf0q, [buf0q]
+ add buf1q, lenq
+ add buf0q, lenq
+ neg lenq
+
+align 16
+.loop:
+ mova m0, [buf0q + lenq]
+ mova m1, [buf0q + lenq + mmsize]
+ mova m2, [buf1q + lenq]
+ mova m3, [buf1q + lenq + mmsize]
+ pmulld m4, m2, m7
+ pmulld m5, m3, m7
+ psrad m4, m6
+ psrad m5, m6
+ psubd m0, m4
+ psubd m1, m5
+ paddd m2, m0
+ paddd m3, m1
+ mova [buf1q + lenq], m0
+ mova [buf1q + lenq + mmsize], m1
+ mova [buf0q + lenq], m2
+ mova [buf0q + lenq + mmsize], m3
+
+ add lenq, mmsize*2
+ jl .loop
+ RET
+
+INIT_XMM sse2
+cglobal alac_append_extra_bits_stereo, 2, 5, 5, buf0, exbuf0, buf1, exbuf1, len
+ movifnidn lend, lenm
+ movd m4, r2m ; exbits
+ shl lend, 2
+ mov buf1q, [buf0q + gprsize]
+ mov buf0q, [buf0q]
+ mov exbuf1q, [exbuf0q + gprsize]
+ mov exbuf0q, [exbuf0q]
+ add buf1q, lenq
+ add buf0q, lenq
+ add exbuf1q, lenq
+ add exbuf0q, lenq
+ neg lenq
+
+align 16
+.loop:
+ mova m0, [buf0q + lenq]
+ mova m1, [buf0q + lenq + mmsize]
+ pslld m0, m4
+ pslld m1, m4
+ mova m2, [buf1q + lenq]
+ mova m3, [buf1q + lenq + mmsize]
+ pslld m2, m4
+ pslld m3, m4
+ por m0, [exbuf0q + lenq]
+ por m1, [exbuf0q + lenq + mmsize]
+ por m2, [exbuf1q + lenq]
+ por m3, [exbuf1q + lenq + mmsize]
+ mova [buf0q + lenq ], m0
+ mova [buf0q + lenq + mmsize], m1
+ mova [buf1q + lenq ], m2
+ mova [buf1q + lenq + mmsize], m3
+
+ add lenq, mmsize*2
+ jl .loop
+ REP_RET
+
+%if ARCH_X86_64
+cglobal alac_append_extra_bits_mono, 2, 5, 3, buf, exbuf, exbits, ch, len
+%else
+cglobal alac_append_extra_bits_mono, 2, 3, 3, buf, exbuf, len
+%define exbitsm r2m
+%endif
+ movifnidn lend, r4m
+ movd m2, exbitsm
+ shl lend, 2
+ mov bufq, [bufq]
+ mov exbufq, [exbufq]
+ add bufq, lenq
+ add exbufq, lenq
+ neg lenq
+
+align 16
+.loop:
+ mova m0, [bufq + lenq]
+ mova m1, [bufq + lenq + mmsize]
+ pslld m0, m2
+ pslld m1, m2
+ por m0, [exbufq + lenq]
+ por m1, [exbufq + lenq + mmsize]
+ mova [bufq + lenq], m0
+ mova [bufq + lenq + mmsize], m1
+
+ add lenq, mmsize*2
+ jl .loop
+ REP_RET
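
The SSE4 routine above vectorizes ALAC's stereo decorrelation; a scalar sketch of the same arithmetic (illustrative only, not part of the patch; the name is made up):

#include <stdint.h>

static void alac_decorrelate_stereo_scalar(int32_t *buffer[2], int nb_samples,
                                           int decorr_shift, int decorr_left_weight)
{
    for (int i = 0; i < nb_samples; i++) {
        int32_t a = buffer[0][i];
        int32_t b = buffer[1][i];

        a -= (b * decorr_left_weight) >> decorr_shift;
        b += a;
        buffer[0][i] = b;
        buffer[1][i] = a;
    }
}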
diff --git a/libavcodec/x86/alacdsp_init.c b/libavcodec/x86/alacdsp_init.c
new file mode 100644
index 0000000000..de5dae6c77
--- /dev/null
+++ b/libavcodec/x86/alacdsp_init.c
@@ -0,0 +1,44 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/alacdsp.h"
+#include "config.h"
+
+void ff_alac_decorrelate_stereo_sse4(int32_t *buffer[2], int nb_samples,
+ int decorr_shift, int decorr_left_weight);
+void ff_alac_append_extra_bits_stereo_sse2(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
+ int extra_bits, int channels, int nb_samples);
+void ff_alac_append_extra_bits_mono_sse2(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
+ int extra_bits, int channels, int nb_samples);
+
+av_cold void ff_alacdsp_init_x86(ALACDSPContext *c)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->append_extra_bits[0] = ff_alac_append_extra_bits_mono_sse2;
+ c->append_extra_bits[1] = ff_alac_append_extra_bits_stereo_sse2;
+ }
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ c->decorrelate_stereo = ff_alac_decorrelate_stereo_sse4;
+ }
+#endif /* HAVE_YASM */
+}
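
The append_extra_bits routines shift each decoded sample up and OR in the stored low-order extra bits. Shown here for one channel as a scalar sketch (illustrative only, not part of the patch); the stereo variant does the same for both channels:

#include <stdint.h>

static void append_extra_bits_scalar(int32_t *buf, const int32_t *exbuf,
                                     int extra_bits, int nb_samples)
{
    for (int i = 0; i < nb_samples; i++)
        buf[i] = (buf[i] << extra_bits) | exbuf[i];
}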
diff --git a/libavcodec/x86/audiodsp.asm b/libavcodec/x86/audiodsp.asm
index e038c18bd8..3973808ca5 100644
--- a/libavcodec/x86/audiodsp.asm
+++ b/libavcodec/x86/audiodsp.asm
@@ -2,20 +2,20 @@
;* optimized audio functions
;* Copyright (c) 2008 Loren Merritt
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -40,15 +40,11 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
paddd m2, m1
add orderq, mmsize*2
jl .loop
-%if mmsize == 16
- movhlps m0, m2
- paddd m2, m0
- pshuflw m0, m2, 0x4e
-%else
- pshufw m0, m2, 0x4e
-%endif
- paddd m2, m0
+ HADDD m2, m0
movd eax, m2
+%if mmsize == 8
+ emms
+%endif
RET
%endmacro
@@ -144,7 +140,8 @@ cglobal vector_clipf, 3, 3, 6, dst, src, len, min, max
VBROADCASTSS m0, minm
VBROADCASTSS m1, maxm
%elif WIN64
- VBROADCASTSS m0, m3
+ SWAP 0, 3
+ VBROADCASTSS m0, m0
VBROADCASTSS m1, maxm
%else ; 64bit sysv
VBROADCASTSS m0, m0
diff --git a/libavcodec/x86/audiodsp_init.c b/libavcodec/x86/audiodsp_init.c
index 093f3f0672..98e296c264 100644
--- a/libavcodec/x86/audiodsp_init.c
+++ b/libavcodec/x86/audiodsp_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -37,7 +37,6 @@ void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
-
void ff_vector_clipf_sse(float *dst, const float *src,
int len, float min, float max);
diff --git a/libavcodec/x86/blockdsp.asm b/libavcodec/x86/blockdsp.asm
new file mode 100644
index 0000000000..7cbfa3a843
--- /dev/null
+++ b/libavcodec/x86/blockdsp.asm
@@ -0,0 +1,86 @@
+;******************************************************************************
+;* SIMD-optimized clear block functions
+;* Copyright (c) 2002 Michael Niedermayer
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2009 Fiona Glaser
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+;----------------------------------------
+; void ff_clear_block(int16_t *blocks);
+;----------------------------------------
+; %1 = number of xmm registers used
+; %2 = number of inline store loops
+%macro CLEAR_BLOCK 2
+cglobal clear_block, 1, 1, %1, blocks
+ ZERO m0, m0
+%assign %%i 0
+%rep %2
+ mova [blocksq+mmsize*(0+%%i)], m0
+ mova [blocksq+mmsize*(1+%%i)], m0
+ mova [blocksq+mmsize*(2+%%i)], m0
+ mova [blocksq+mmsize*(3+%%i)], m0
+ mova [blocksq+mmsize*(4+%%i)], m0
+ mova [blocksq+mmsize*(5+%%i)], m0
+ mova [blocksq+mmsize*(6+%%i)], m0
+ mova [blocksq+mmsize*(7+%%i)], m0
+%assign %%i %%i+8
+%endrep
+ RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCK 0, 2
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCK 1, 1
+
+;-----------------------------------------
+; void ff_clear_blocks(int16_t *blocks);
+;-----------------------------------------
+; %1 = number of xmm registers used
+%macro CLEAR_BLOCKS 1
+cglobal clear_blocks, 1, 2, %1, blocks, len
+ add blocksq, 768
+ mov lenq, -768
+ ZERO m0, m0
+.loop:
+ mova [blocksq+lenq+mmsize*0], m0
+ mova [blocksq+lenq+mmsize*1], m0
+ mova [blocksq+lenq+mmsize*2], m0
+ mova [blocksq+lenq+mmsize*3], m0
+ mova [blocksq+lenq+mmsize*4], m0
+ mova [blocksq+lenq+mmsize*5], m0
+ mova [blocksq+lenq+mmsize*6], m0
+ mova [blocksq+lenq+mmsize*7], m0
+ add lenq, mmsize*8
+ js .loop
+ RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCKS 0
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCKS 1
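
Functionally, the two macros above just zero one 64-coefficient int16_t block (128 bytes) or six consecutive blocks (768 bytes, the constant seen in CLEAR_BLOCKS). A plain C sketch (illustrative only, not part of the patch):

#include <stdint.h>
#include <string.h>

static void clear_block_ref(int16_t *block)
{
    memset(block, 0, 64 * sizeof(int16_t));      /* 128 bytes */
}

static void clear_blocks_ref(int16_t *blocks)
{
    memset(blocks, 0, 6 * 64 * sizeof(int16_t)); /* 768 bytes */
}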
diff --git a/libavcodec/x86/blockdsp.c b/libavcodec/x86/blockdsp.c
deleted file mode 100644
index 9bb5185b89..0000000000
--- a/libavcodec/x86/blockdsp.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "libavutil/internal.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/blockdsp.h"
-#include "libavcodec/version.h"
-
-#if HAVE_INLINE_ASM
-
-#define CLEAR_BLOCKS(name, n) \
-static void name(int16_t *blocks) \
-{ \
- __asm__ volatile ( \
- "pxor %%mm7, %%mm7 \n\t" \
- "mov %1, %%"FF_REG_a" \n\t" \
- "1: \n\t" \
- "movq %%mm7, (%0, %%"FF_REG_a") \n\t" \
- "movq %%mm7, 8(%0, %%"FF_REG_a") \n\t" \
- "movq %%mm7, 16(%0, %%"FF_REG_a") \n\t" \
- "movq %%mm7, 24(%0, %%"FF_REG_a") \n\t" \
- "add $32, %%"FF_REG_a" \n\t" \
- "js 1b \n\t" \
- :: "r"(((uint8_t *) blocks) + 128 * n), \
- "i"(-128 * n) \
- : "%"FF_REG_a); \
-}
-CLEAR_BLOCKS(clear_blocks_mmx, 6)
-CLEAR_BLOCKS(clear_block_mmx, 1)
-
-static void clear_block_sse(int16_t *block)
-{
- __asm__ volatile (
- "xorps %%xmm0, %%xmm0 \n"
- "movaps %%xmm0, (%0) \n"
- "movaps %%xmm0, 16(%0) \n"
- "movaps %%xmm0, 32(%0) \n"
- "movaps %%xmm0, 48(%0) \n"
- "movaps %%xmm0, 64(%0) \n"
- "movaps %%xmm0, 80(%0) \n"
- "movaps %%xmm0, 96(%0) \n"
- "movaps %%xmm0, 112(%0) \n"
- :: "r" (block)
- : "memory");
-}
-
-static void clear_blocks_sse(int16_t *blocks)
-{
- __asm__ volatile (
- "xorps %%xmm0, %%xmm0 \n"
- "mov %1, %%"FF_REG_a" \n"
- "1: \n"
- "movaps %%xmm0, (%0, %%"FF_REG_a") \n"
- "movaps %%xmm0, 16(%0, %%"FF_REG_a") \n"
- "movaps %%xmm0, 32(%0, %%"FF_REG_a") \n"
- "movaps %%xmm0, 48(%0, %%"FF_REG_a") \n"
- "movaps %%xmm0, 64(%0, %%"FF_REG_a") \n"
- "movaps %%xmm0, 80(%0, %%"FF_REG_a") \n"
- "movaps %%xmm0, 96(%0, %%"FF_REG_a") \n"
- "movaps %%xmm0, 112(%0, %%"FF_REG_a") \n"
- "add $128, %%"FF_REG_a" \n"
- "js 1b \n"
- :: "r"(((uint8_t *) blocks) + 128 * 6), "i"(-128 * 6)
- : "%"FF_REG_a);
-}
-
-#endif /* HAVE_INLINE_ASM */
-
-#if FF_API_XVMC
-av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
- AVCodecContext *avctx)
-#else
-av_cold void ff_blockdsp_init_x86(BlockDSPContext *c)
-#endif /* FF_API_XVMC */
-{
-#if HAVE_INLINE_ASM
- int cpu_flags = av_get_cpu_flags();
-
- if (INLINE_MMX(cpu_flags)) {
- c->clear_block = clear_block_mmx;
- c->clear_blocks = clear_blocks_mmx;
- }
-
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
- /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
- if (CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)
- return;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
-
- if (INLINE_SSE(cpu_flags)) {
- c->clear_block = clear_block_sse;
- c->clear_blocks = clear_blocks_sse;
- }
-#endif /* HAVE_INLINE_ASM */
-}
diff --git a/libavcodec/x86/blockdsp_init.c b/libavcodec/x86/blockdsp_init.c
new file mode 100644
index 0000000000..21599934ff
--- /dev/null
+++ b/libavcodec/x86/blockdsp_init.c
@@ -0,0 +1,54 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/blockdsp.h"
+#include "libavcodec/version.h"
+
+void ff_clear_block_mmx(int16_t *block);
+void ff_clear_block_sse(int16_t *block);
+void ff_clear_blocks_mmx(int16_t *blocks);
+void ff_clear_blocks_sse(int16_t *blocks);
+
+av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
+ AVCodecContext *avctx)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(cpu_flags)) {
+ c->clear_block = ff_clear_block_mmx;
+ c->clear_blocks = ff_clear_blocks_mmx;
+ }
+
+ /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
+ if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
+ return;
+
+ if (EXTERNAL_SSE(cpu_flags)) {
+ c->clear_block = ff_clear_block_sse;
+ c->clear_blocks = ff_clear_blocks_sse;
+ }
+#endif /* HAVE_YASM */
+}
diff --git a/libavcodec/x86/bswapdsp.asm b/libavcodec/x86/bswapdsp.asm
index 4810867921..56d8083622 100644
--- a/libavcodec/x86/bswapdsp.asm
+++ b/libavcodec/x86/bswapdsp.asm
@@ -1,21 +1,23 @@
;******************************************************************************
;* optimized bswap buffer functions
;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -24,6 +26,8 @@
SECTION_RODATA
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+cextern pb_80
+
SECTION .text
; %1 = aligned/unaligned
@@ -84,11 +88,14 @@ SECTION .text
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3
+ mov r3, r1
mova m2, [pb_bswap32]
%else
cglobal bswap32_buf, 3,4,5
+ mov r3, r1
%endif
- test r1, 15
+ or r3, r0
+ test r3, 15
jz .start_align
BSWAP_LOOPS u
jmp .left
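
The added mov/or/test sequence widens the alignment check so the fast mova path is only taken when both buffers are 16-byte aligned; in C terms (illustrative only, not part of the patch; the helper name is made up):

#include <stdint.h>

/* "or r3, r0; test r3, 15" in the asm above: one test covers both pointers. */
static int both_16byte_aligned(const void *a, const void *b)
{
    return (((uintptr_t)a | (uintptr_t)b) & 15) == 0;
}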
diff --git a/libavcodec/x86/bswapdsp_init.c b/libavcodec/x86/bswapdsp_init.c
index ba40f2dbe1..c042e56371 100644
--- a/libavcodec/x86/bswapdsp_init.c
+++ b/libavcodec/x86/bswapdsp_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 205511ef61..cfd3b759c9 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -27,8 +27,28 @@
#include "libavutil/x86/asm.h"
#include "config.h"
+#if (defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
+ || ( !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)\
+ || (defined(__INTEL_COMPILER) && defined(_MSC_VER))
+# define BROKEN_COMPILER 1
+#else
+# define BROKEN_COMPILER 0
+#endif
+
#if HAVE_INLINE_ASM
+#ifndef UNCHECKED_BITSTREAM_READER
+#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
+#endif
+
+#if UNCHECKED_BITSTREAM_READER
+#define END_CHECK(end) ""
+#else
+#define END_CHECK(end) \
+ "cmp "end" , %%"FF_REG_c" \n\t"\
+ "jge 1f \n\t"
+#endif
+
#ifdef BROKEN_RELOCATIONS
#define TABLES_ARG , "r"(tables)
@@ -73,11 +93,10 @@
"test "lowword" , "lowword" \n\t"\
"jnz 2f \n\t"\
"mov "byte" , %%"FF_REG_c" \n\t"\
- "cmp "end" , %%"FF_REG_c" \n\t"\
- "jge 1f \n\t"\
+ END_CHECK(end)\
"add"FF_OPSIZE" $2 , "byte" \n\t"\
"1: \n\t"\
- "movzwl (%%"FF_REG_c"), "tmp" \n\t"\
+ "movzwl (%%"FF_REG_c") , "tmp" \n\t"\
"lea -1("low") , %%ecx \n\t"\
"xor "low" , %%ecx \n\t"\
"shr $15 , %%ecx \n\t"\
@@ -92,7 +111,8 @@
"2: \n\t"
#else /* BROKEN_RELOCATIONS */
-#define TABLES_ARG
+#define TABLES_ARG NAMED_CONSTRAINTS_ARRAY_ADD(ff_h264_cabac_tables)
+#define RIP_ARG
#if HAVE_FAST_CMOV
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
@@ -134,8 +154,7 @@
"test "lowword" , "lowword" \n\t"\
" jnz 2f \n\t"\
"mov "byte" , %%"FF_REG_c" \n\t"\
- "cmp "end" , %%"FF_REG_c" \n\t"\
- "jge 1f \n\t"\
+ END_CHECK(end)\
"add"FF_OPSIZE" $2 , "byte" \n\t"\
"1: \n\t"\
"movzwl (%%"FF_REG_c") , "tmp" \n\t"\
@@ -154,8 +173,7 @@
#endif /* BROKEN_RELOCATIONS */
-
-#if HAVE_7REGS
+#if HAVE_7REGS && !BROKEN_COMPILER
#define get_cabac_inline get_cabac_inline_x86
static av_always_inline int get_cabac_inline_x86(CABACContext *c,
uint8_t *const state)
@@ -167,6 +185,7 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
+ : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
);
#endif
@@ -178,17 +197,19 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%8")
- : "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp)
+ : "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp)
: "r"(state), "r"(c),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end))
TABLES_ARG
+ ,"1"(c->low), "2"(c->range)
: "%"FF_REG_c, "memory"
);
return bit & 1;
}
-#endif /* HAVE_7REGS */
+#endif /* HAVE_7REGS && !BROKEN_COMPILER */
+#if !BROKEN_COMPILER
#define get_cabac_bypass_sign get_cabac_bypass_sign_x86
static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
@@ -199,7 +220,7 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
"sub %k1, %%eax \n\t"
- "cltd \n\t"
+ "cdq \n\t"
"and %%edx, %k1 \n\t"
"add %k1, %%eax \n\t"
"xor %%edx, %%ecx \n\t"
@@ -211,10 +232,16 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
"movzwl (%1), %%edx \n\t"
"bswap %%edx \n\t"
"shrl $15, %%edx \n\t"
+#if UNCHECKED_BITSTREAM_READER
+ "add $2, %1 \n\t"
+ "addl %%edx, %%eax \n\t"
+ "mov %1, %c4(%2) \n\t"
+#else
"addl %%edx, %%eax \n\t"
"cmp %c5(%2), %1 \n\t"
"jge 1f \n\t"
"add"FF_OPSIZE" $2, %c4(%2) \n\t"
+#endif
"1: \n\t"
"movl %%eax, %c3(%2) \n\t"
@@ -240,7 +267,7 @@ static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
"sub %k1, %%eax \n\t"
- "cltd \n\t"
+ "cdq \n\t"
"and %%edx, %k1 \n\t"
"add %k1, %%eax \n\t"
"inc %%edx \n\t"
@@ -268,6 +295,7 @@ static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
);
return res;
}
+#endif /* !BROKEN_COMPILER */
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_CABAC_H */
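
For reference, the END_CHECK() macro introduced above only changes how the bytestream pointer is advanced during renormalization. A minimal C sketch of the two behaviours, assuming nothing beyond the bytestream/bytestream_end fields referenced by the offsetof() constraints (the struct and function names here are illustrative, not FFmpeg API):

    #include <stdint.h>

    typedef struct {
        const uint8_t *bytestream;
        const uint8_t *bytestream_end;
    } MiniCABAC;                          /* stand-in for the relevant CABACContext fields */

    static void advance_bytestream(MiniCABAC *c, int unchecked)
    {
        if (unchecked) {
            /* UNCHECKED_BITSTREAM_READER: END_CHECK expands to nothing,
             * the pointer is always bumped by two bytes. */
            c->bytestream += 2;
        } else if (c->bytestream < c->bytestream_end) {
            /* safe reader: the cmp/jge pair skips the increment once the
             * pointer has reached bytestream_end. */
            c->bytestream += 2;
        }
    }
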
diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c
index b5406ef16d..ecb9b231f1 100644
--- a/libavcodec/x86/cavsdsp.c
+++ b/libavcodec/x86/cavsdsp.c
@@ -5,20 +5,20 @@
* MMX-optimized DSP functions, based on H.264 optimizations by
* Michael Niedermayer and Loren Merritt
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -139,7 +139,7 @@ static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
{
int i;
- DECLARE_ALIGNED(8, int16_t, b2)[64];
+ LOCAL_ALIGNED(16, int16_t, b2, [64]);
for(i=0; i<2; i++){
cavs_idct8_1d(block + 4 * i, ff_pw_4.a);
@@ -196,7 +196,7 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
);
}
- ff_add_pixels_clamped_mmx(b2, dst, stride);
+ ff_add_pixels_clamped(b2, dst, stride);
}
#endif /* HAVE_MMX_INLINE */
@@ -210,10 +210,10 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
****************************************************************************/
/* vertical filter [-1 -2 96 42 -7 0] */
-#define QPEL_CAVSV1(A,B,C,D,E,F,OP,MUL2) \
+#define QPEL_CAVSV1(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
- "pmullw %5, %%mm6 \n\t"\
+ "pmullw "MANGLE(MUL1)", %%mm6\n\t"\
"movq "#D", %%mm7 \n\t"\
"pmullw "MANGLE(MUL2)", %%mm7\n\t"\
"psllw $3, "#E" \n\t"\
@@ -228,35 +228,35 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
"psubw "#B", %%mm6 \n\t"\
"psraw $1, "#B" \n\t"\
"psubw "#A", %%mm6 \n\t"\
- "paddw %4, %%mm6 \n\t"\
+ "paddw "MANGLE(ADD)", %%mm6 \n\t"\
"psraw $7, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
"add %3, %1 \n\t"
/* vertical filter [ 0 -1 5 5 -1 0] */
-#define QPEL_CAVSV2(A,B,C,D,E,F,OP,MUL2) \
+#define QPEL_CAVSV2(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"paddw "#D", %%mm6 \n\t"\
- "pmullw %5, %%mm6 \n\t"\
+ "pmullw "MANGLE(MUL1)", %%mm6\n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psubw "#E", %%mm6 \n\t"\
- "paddw %4, %%mm6 \n\t"\
+ "paddw "MANGLE(ADD)", %%mm6 \n\t"\
"psraw $3, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
"add %3, %1 \n\t"
/* vertical filter [ 0 -7 42 96 -2 -1] */
-#define QPEL_CAVSV3(A,B,C,D,E,F,OP,MUL2) \
+#define QPEL_CAVSV3(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"pmullw "MANGLE(MUL2)", %%mm6\n\t"\
"movq "#D", %%mm7 \n\t"\
- "pmullw %5, %%mm7 \n\t"\
+ "pmullw "MANGLE(MUL1)", %%mm7\n\t"\
"psllw $3, "#B" \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psraw $3, "#B" \n\t"\
@@ -269,7 +269,7 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
"psubw "#E", %%mm6 \n\t"\
"psraw $1, "#E" \n\t"\
"psubw "#F", %%mm6 \n\t"\
- "paddw %4, %%mm6 \n\t"\
+ "paddw "MANGLE(ADD)", %%mm6 \n\t"\
"psraw $7, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
@@ -298,32 +298,34 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
- VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
- VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
- VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
- VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
- VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
- VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
- VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
- VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
+ VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
\
: "+a"(src), "+c"(dst)\
- : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
+ : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
+ NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
: "memory"\
);\
if(h==16){\
__asm__ volatile(\
- VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
- VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
- VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
- VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
- VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
- VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
- VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
- VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
+ VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+ VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
\
: "+a"(src), "+c"(dst)\
- : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
+ : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
+ NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
: "memory"\
);\
}\
@@ -337,7 +339,7 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptr
int h=8;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
- "movq %5, %%mm6 \n\t"\
+ "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"movq 1(%0), %%mm2 \n\t"\
@@ -363,7 +365,7 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptr
"paddw %%mm3, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm5, %%mm1 \n\t"\
- "movq %6, %%mm5 \n\t"\
+ "movq "MANGLE(ff_pw_4)", %%mm5\n\t"\
"paddw %%mm5, %%mm0 \n\t"\
"paddw %%mm5, %%mm1 \n\t"\
"psraw $3, %%mm0 \n\t"\
@@ -375,7 +377,8 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptr
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+m"(h)\
- : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
+ : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
+ NAMED_CONSTRAINTS_ADD(ff_pw_4,ff_pw_5)\
: "memory"\
);\
}\
@@ -387,7 +390,7 @@ static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, const uint8
\
static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
{ \
- QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5) \
+ QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_42) \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
@@ -468,7 +471,7 @@ static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uin
#endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */
-#if HAVE_MMX_INLINE
+#if HAVE_MMX_EXTERNAL
static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
@@ -481,6 +484,12 @@ static void avg_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
ff_avg_pixels8_mmx(dst, src, stride, 8);
}
+static void avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ ff_avg_pixels8_mmxext(dst, src, stride, 8);
+}
+
static void put_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
@@ -493,18 +502,40 @@ static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
ff_avg_pixels16_mmx(dst, src, stride, 16);
}
+static void avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ ff_avg_pixels16_mmxext(dst, src, stride, 16);
+}
+
+static void put_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ ff_put_pixels16_sse2(dst, src, stride, 16);
+}
+
+static void avg_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ ff_avg_pixels16_sse2(dst, src, stride, 16);
+}
+#endif
+
static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
AVCodecContext *avctx)
{
+#if HAVE_MMX_EXTERNAL
c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx;
c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx;
c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx;
c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx;
+#endif
+#if HAVE_MMX_INLINE
c->cavs_idct8_add = cavs_idct8_add_mmx;
c->idct_perm = FF_IDCT_PERM_TRANSPOSE;
-}
#endif /* HAVE_MMX_INLINE */
+}
#define DSPFUNC(PFX, IDX, NUM, EXT) \
c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \
@@ -520,15 +551,6 @@ CAVS_MC(put_, 8, mmxext)
CAVS_MC(put_, 16, mmxext)
CAVS_MC(avg_, 8, mmxext)
CAVS_MC(avg_, 16, mmxext)
-
-static av_cold void cavsdsp_init_mmxext(CAVSDSPContext *c,
- AVCodecContext *avctx)
-{
- DSPFUNC(put, 0, 16, mmxext);
- DSPFUNC(put, 1, 8, mmxext);
- DSPFUNC(avg, 0, 16, mmxext);
- DSPFUNC(avg, 1, 8, mmxext);
-}
#endif /* HAVE_MMXEXT_INLINE */
#if HAVE_AMD3DNOW_INLINE
@@ -552,18 +574,33 @@ static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c,
av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
{
-#if HAVE_MMX_INLINE
- int cpu_flags = av_get_cpu_flags();
+ av_unused int cpu_flags = av_get_cpu_flags();
- if (INLINE_MMX(cpu_flags))
+ if (X86_MMX(cpu_flags))
cavsdsp_init_mmx(c, avctx);
-#endif /* HAVE_MMX_INLINE */
+
#if HAVE_AMD3DNOW_INLINE
if (INLINE_AMD3DNOW(cpu_flags))
cavsdsp_init_3dnow(c, avctx);
#endif /* HAVE_AMD3DNOW_INLINE */
#if HAVE_MMXEXT_INLINE
- if (INLINE_MMXEXT(cpu_flags))
- cavsdsp_init_mmxext(c, avctx);
-#endif /* HAVE_MMXEXT_INLINE */
+ if (INLINE_MMXEXT(cpu_flags)) {
+ DSPFUNC(put, 0, 16, mmxext);
+ DSPFUNC(put, 1, 8, mmxext);
+ DSPFUNC(avg, 0, 16, mmxext);
+ DSPFUNC(avg, 1, 8, mmxext);
+ }
+#endif
+#if HAVE_MMX_EXTERNAL
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmxext;
+ c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmxext;
+ }
+#endif
+#if HAVE_SSE2_EXTERNAL
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_sse2;
+ c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_sse2;
+ }
+#endif
}
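
The QPEL_CAVSV1/V2/V3 macros changed above evaluate the three 6-tap vertical filters named in their comments, with the rounding constant passed in as ADD (the +64/>>7 pair for the 128-scaled filters is inferred from the tap sums; only the +4/>>3 pair of V2 is visible in this hunk). A scalar sketch of one output pixel, for orientation only:

    #include <stdint.h>

    static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    /* A..F are six vertically adjacent source pixels. */
    static uint8_t cavs_v1(int A, int B, int C, int D, int E, int F)
    {   /* taps [-1 -2 96 42 -7 0], sum 128 -> +64, >>7 */
        return clip_u8((-A - 2*B + 96*C + 42*D - 7*E + 0*F + 64) >> 7);
    }

    static uint8_t cavs_v2(int A, int B, int C, int D, int E, int F)
    {   /* taps [ 0 -1  5  5 -1  0], sum 8 -> +4 (ff_pw_4), >>3 */
        return clip_u8((0*A - B + 5*C + 5*D - E + 0*F + 4) >> 3);
    }

    static uint8_t cavs_v3(int A, int B, int C, int D, int E, int F)
    {   /* taps [ 0 -7 42 96 -2 -1], mirror of V1 */
        return clip_u8((0*A - 7*B + 42*C + 96*D - 2*E - F + 64) >> 7);
    }
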
diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c
index 6f7dd7346e..11002ee61e 100644
--- a/libavcodec/x86/constants.c
+++ b/libavcodec/x86/constants.c
@@ -1,20 +1,20 @@
/*
- * MMX/SSE constants used across x86 dsp optimizations.
+ * MMX/SSE/AVX constants used across x86 dsp optimizations.
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -22,12 +22,13 @@
#include "libavutil/x86/asm.h" // for xmm_reg
#include "constants.h"
-DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
-
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL, 0x0001000100010001ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL,
+ 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL,
+ 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
@@ -35,22 +36,58 @@ DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_20) = { 0x0014001400140014ULL, 0x0014001400140014ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_255) = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+ 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_256) = { 0x0100010001000100ULL, 0x0100010001000100ULL,
0x0100010001000100ULL, 0x0100010001000100ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL,
+ 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_m1) = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1023) = { 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL,
+ 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL};
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL,
+ 0x0400040004000400ULL, 0x0400040004000400ULL};
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL,
+ 0x0800080008000800ULL, 0x0800080008000800ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL,
+ 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL,
+ 0x1000100010001000ULL, 0x1000100010001000ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL,
+ 0x2000200020002000ULL, 0x2000200020002000ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_m1) = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL,
+ 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL, 0x0000000000000000ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL,
+ 0x0101010101010101ULL, 0x0101010101010101ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pb_2) = { 0x0202020202020202ULL, 0x0202020202020202ULL,
+ 0x0202020202020202ULL, 0x0202020202020202ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL,
+ 0x0303030303030303ULL, 0x0303030303030303ULL };
+DECLARE_ALIGNED(32, const xmm_reg, ff_pb_15) = { 0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL,
+ 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
+
+DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = { 0x8000000080000000ULL, 0x8000000080000000ULL };
+
+DECLARE_ALIGNED(32, const ymm_reg, ff_pd_1) = { 0x0000000100000001ULL, 0x0000000100000001ULL,
+ 0x0000000100000001ULL, 0x0000000100000001ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = { 0x0000001000000010ULL, 0x0000001000000010ULL,
+ 0x0000001000000010ULL, 0x0000001000000010ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pd_32) = { 0x0000002000000020ULL, 0x0000002000000020ULL,
+ 0x0000002000000020ULL, 0x0000002000000020ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pd_8192) = { 0x0000200000002000ULL, 0x0000200000002000ULL,
+ 0x0000200000002000ULL, 0x0000200000002000ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pd_65535)= { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
+ 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL };
diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h
index 59ff94725d..bbb0ef844a 100644
--- a/libavcodec/x86/constants.h
+++ b/libavcodec/x86/constants.h
@@ -1,20 +1,20 @@
/*
* MMX/SSE constants used across x86 dsp optimizations.
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -25,30 +25,48 @@
#include "libavutil/x86/asm.h"
-extern const uint64_t ff_wtwo;
-
+extern const ymm_reg ff_pw_1;
+extern const ymm_reg ff_pw_2;
extern const xmm_reg ff_pw_3;
-extern const xmm_reg ff_pw_4;
+extern const ymm_reg ff_pw_4;
extern const xmm_reg ff_pw_5;
extern const xmm_reg ff_pw_8;
+extern const xmm_reg ff_pw_9;
extern const uint64_t ff_pw_15;
extern const xmm_reg ff_pw_16;
extern const xmm_reg ff_pw_18;
-extern const uint64_t ff_pw_20;
+extern const xmm_reg ff_pw_20;
extern const xmm_reg ff_pw_32;
extern const uint64_t ff_pw_42;
extern const uint64_t ff_pw_53;
extern const xmm_reg ff_pw_64;
extern const uint64_t ff_pw_96;
extern const uint64_t ff_pw_128;
-extern const uint64_t ff_pw_255;
+extern const ymm_reg ff_pw_255;
extern const ymm_reg ff_pw_256;
-extern const xmm_reg ff_pw_512;
-extern const xmm_reg ff_pw_m1;
+extern const ymm_reg ff_pw_512;
+extern const ymm_reg ff_pw_1023;
+extern const ymm_reg ff_pw_1024;
+extern const ymm_reg ff_pw_2048;
+extern const ymm_reg ff_pw_4095;
+extern const ymm_reg ff_pw_4096;
+extern const ymm_reg ff_pw_8192;
+extern const ymm_reg ff_pw_m1;
-extern const xmm_reg ff_pb_1;
-extern const xmm_reg ff_pb_3;
-extern const xmm_reg ff_pb_F8;
+extern const ymm_reg ff_pb_0;
+extern const ymm_reg ff_pb_1;
+extern const ymm_reg ff_pb_2;
+extern const ymm_reg ff_pb_3;
+extern const xmm_reg ff_pb_80;
+extern const ymm_reg ff_pb_FE;
extern const uint64_t ff_pb_FC;
+extern const xmm_reg ff_ps_neg;
+
+extern const ymm_reg ff_pd_1;
+extern const ymm_reg ff_pd_16;
+extern const ymm_reg ff_pd_32;
+extern const ymm_reg ff_pd_8192;
+extern const ymm_reg ff_pd_65535;
+
#endif /* AVCODEC_X86_CONSTANTS_H */
diff --git a/libavcodec/x86/dca.h b/libavcodec/x86/dca.h
deleted file mode 100644
index 11d45ae61c..0000000000
--- a/libavcodec/x86/dca.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_DCA_H
-#define AVCODEC_X86_DCA_H
-
-#include "config.h"
-
-#if ARCH_X86_64 && HAVE_SSE2_INLINE
-# include "libavutil/x86/asm.h"
-# include "libavutil/mem.h"
-#include "libavcodec/dcadsp.h"
-
-# define int8x8_fmul_int32 int8x8_fmul_int32
-static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp,
- float *dst, const int8_t *src, int scale)
-{
- DECLARE_ALIGNED(16, static const uint32_t, inverse16) = 0x3D800000;
- __asm__ volatile (
- "cvtsi2ss %2, %%xmm0 \n\t"
- "mulss %3, %%xmm0 \n\t"
- "movq (%1), %%xmm1 \n\t"
- "punpcklbw %%xmm1, %%xmm1 \n\t"
- "movaps %%xmm1, %%xmm2 \n\t"
- "punpcklwd %%xmm1, %%xmm1 \n\t"
- "punpckhwd %%xmm2, %%xmm2 \n\t"
- "psrad $24, %%xmm1 \n\t"
- "psrad $24, %%xmm2 \n\t"
- "shufps $0, %%xmm0, %%xmm0 \n\t"
- "cvtdq2ps %%xmm1, %%xmm1 \n\t"
- "cvtdq2ps %%xmm2, %%xmm2 \n\t"
- "mulps %%xmm0, %%xmm1 \n\t"
- "mulps %%xmm0, %%xmm2 \n\t"
- "movaps %%xmm1, 0(%0) \n\t"
- "movaps %%xmm2, 16(%0) \n\t"
- :: "r"(dst), "r"(src), "m"(scale), "m"(inverse16)
- XMM_CLOBBERS_ONLY("xmm0", "xmm1", "xmm2")
- );
-}
-
-#endif /* ARCH_X86_64 && HAVE_SSE2_INLINE */
-
-#endif /* AVCODEC_X86_DCA_H */
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index fa8d3cb66a..055361a765 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -1,336 +1,301 @@
;******************************************************************************
-;* SSE-optimized functions for the DCA decoder
-;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+;* SIMD-optimized functions for the DCA decoder
+;* Copyright (C) 2016 James Almer
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
-pf_inv16: times 4 dd 0x3D800000 ; 1/16
-
SECTION .text
-; %1=v0/v1 %2=in1 %3=in2
-%macro FIR_LOOP 2-3
-.loop%1:
-%define va m1
-%define vb m2
-%if %1
-%define OFFSET 0
-%else
-%define OFFSET NUM_COEF*count
-%endif
-; for v0, incrementing and for v1, decrementing
- mova va, [cf0q + OFFSET]
- mova vb, [cf0q + OFFSET + 4*NUM_COEF]
-%if %0 == 3
- mova m4, [cf0q + OFFSET + mmsize]
- mova m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
-%endif
- mulps va, %2
- mulps vb, %2
-%if %0 == 3
- mulps m4, %3
- mulps m0, %3
- addps va, m4
- addps vb, m0
-%endif
- ; va = va1 va2 va3 va4
- ; vb = vb1 vb2 vb3 vb4
-%if %1
- SWAP va, vb
-%endif
- mova m4, va
- unpcklps va, vb ; va3 vb3 va4 vb4
- unpckhps m4, vb ; va1 vb1 va2 vb2
- addps m4, va ; va1+3 vb1+3 va2+4 vb2+4
- movhlps vb, m4 ; va1+3 vb1+3
- addps vb, m4 ; va0..4 vb0..4
- movlps [outq + count], vb
-%if %1
- sub cf0q, 8*NUM_COEF
-%endif
- add count, 8
- jl .loop%1
-%endmacro
+%define sizeof_float 4
+%define FMA3_OFFSET (8 * cpuflag(fma3))
-; void dca_lfe_fir(float *out, float *in, float *coefs)
-%macro DCA_LFE_FIR 1
-cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
-%define IN1 m3
-%define IN2 m5
-%define count inq
-%define NUM_COEF 4*(2-%1)
-%define NUM_OUT 32*(%1+1)
+%macro LFE_FIR0_FLOAT 0
+cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks, cnt1, cnt2
+ shr nblocksd, 1
+ sub lfeq, 7*sizeof_float
+ mov cnt1d, 32*sizeof_float
+ mov cnt2d, 32*sizeof_float-8-FMA3_OFFSET
+ lea coeffq, [coeffq+cnt1q*8]
+ add samplesq, cnt1q
+ neg cnt1q
- movu IN1, [inq + 4 - 1*mmsize]
- shufps IN1, IN1, q0123
-%if %1 == 0
- movu IN2, [inq + 4 - 2*mmsize]
- shufps IN2, IN2, q0123
-%endif
-
- mov count, -4*NUM_OUT
- add cf0q, 4*NUM_COEF*NUM_OUT
- add outq, 4*NUM_OUT
- ; compute v0 first
-%if %1 == 0
- FIR_LOOP 0, IN1, IN2
-%else
- FIR_LOOP 0, IN1
-%endif
- shufps IN1, IN1, q0123
- mov count, -4*NUM_OUT
- ; cf1 already correctly positioned
- add outq, 4*NUM_OUT ; outq now at out2
- sub cf0q, 8*NUM_COEF
-%if %1 == 0
- shufps IN2, IN2, q0123
- FIR_LOOP 1, IN2, IN1
+.loop:
+%if cpuflag(avx)
+ cvtdq2ps m4, [lfeq+16]
+ cvtdq2ps m5, [lfeq ]
+ shufps m7, m4, m4, q0123
+ shufps m6, m5, m5, q0123
+%elif cpuflag(sse2)
+ movu m4, [lfeq+16]
+ movu m5, [lfeq ]
+ cvtdq2ps m4, m4
+ cvtdq2ps m5, m5
+ pshufd m7, m4, q0123
+ pshufd m6, m5, q0123
%else
- FIR_LOOP 1, IN1
+ cvtpi2ps m4, [lfeq+16]
+ cvtpi2ps m0, [lfeq+24]
+ cvtpi2ps m5, [lfeq ]
+ cvtpi2ps m1, [lfeq+8 ]
+ shufps m4, m0, q1010
+ shufps m5, m1, q1010
+ shufps m7, m4, m4, q0123
+ shufps m6, m5, m5, q0123
%endif
- RET
-%endmacro
-INIT_XMM sse
-DCA_LFE_FIR 0
-DCA_LFE_FIR 1
+.inner_loop:
+%if ARCH_X86_64
+ movaps m8, [coeffq+cnt1q*8 ]
+ movaps m9, [coeffq+cnt1q*8+16]
+ movaps m10, [coeffq+cnt1q*8+32]
+ movaps m11, [coeffq+cnt1q*8+48]
+%if cpuflag(fma3)
+ movaps m12, [coeffq+cnt1q*8+64]
+ movaps m13, [coeffq+cnt1q*8+80]
+ movaps m14, [coeffq+cnt1q*8+96]
+ movaps m15, [coeffq+cnt1q*8+112]
+ mulps m0, m7, m8
+ mulps m1, m7, m10
+ mulps m2, m7, m12
+ mulps m3, m7, m14
+ fmaddps m0, m6, m9, m0
+ fmaddps m1, m6, m11, m1
+ fmaddps m2, m6, m13, m2
+ fmaddps m3, m6, m15, m3
-%macro SETZERO 1
-%if cpuflag(sse2) && notcpuflag(avx)
- pxor %1, %1
+ haddps m0, m1
+ haddps m2, m3
+ haddps m0, m2
+ movaps [samplesq+cnt1q], m0
%else
- xorps %1, %1, %1
-%endif
-%endmacro
+ mulps m0, m7, m8
+ mulps m1, m6, m9
+ mulps m2, m7, m10
+ mulps m3, m6, m11
+ addps m0, m1
+ addps m2, m3
-%macro SHUF 3
-%if cpuflag(avx)
- mova %3, [%2 - 16]
- vperm2f128 %1, %3, %3, 1
- vshufps %1, %1, %1, q0123
-%elif cpuflag(sse2)
- pshufd %1, [%2], q0123
+ unpckhps m3, m0, m2
+ unpcklps m0, m2
+ addps m3, m0
+ movhlps m2, m3
+ addps m2, m3
+ movlps [samplesq+cnt1q], m2
+%endif
+%else ; ARCH_X86_32
+%if cpuflag(fma3)
+ mulps m0, m7, [coeffq+cnt1q*8 ]
+ mulps m1, m7, [coeffq+cnt1q*8+32 ]
+ mulps m2, m7, [coeffq+cnt1q*8+64 ]
+ mulps m3, m7, [coeffq+cnt1q*8+96 ]
+ fmaddps m0, m6, [coeffq+cnt1q*8+16 ], m0
+ fmaddps m1, m6, [coeffq+cnt1q*8+48 ], m1
+ fmaddps m2, m6, [coeffq+cnt1q*8+80 ], m2
+ fmaddps m3, m6, [coeffq+cnt1q*8+112], m3
+
+ haddps m0, m1
+ haddps m2, m3
+ haddps m0, m2
+ movaps [samplesq+cnt1q], m0
%else
- mova %1, [%2]
- shufps %1, %1, q0123
-%endif
-%endmacro
+ mulps m0, m7, [coeffq+cnt1q*8 ]
+ mulps m1, m6, [coeffq+cnt1q*8+16]
+ mulps m2, m7, [coeffq+cnt1q*8+32]
+ mulps m3, m6, [coeffq+cnt1q*8+48]
+ addps m0, m1
+ addps m2, m3
+
+ unpckhps m3, m0, m2
+ unpcklps m0, m2
+ addps m3, m0
+ movhlps m2, m3
+ addps m2, m3
+ movlps [samplesq+cnt1q], m2
+%endif
+%endif; ARCH
-%macro INNER_LOOP 1
- ; reading backwards: ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
- ;~ a += window[i + j] * (-synth_buf[15 - i + j])
- ;~ b += window[i + j + 16] * (synth_buf[i + j])
- SHUF m5, ptr2 + j + (15 - 3) * 4, m6
- mova m6, [ptr1 + j]
-%if ARCH_X86_64
- SHUF m11, ptr2 + j + (15 - 3) * 4 - mmsize, m12
- mova m12, [ptr1 + j + mmsize]
-%endif
-%if cpuflag(fma3)
- fmaddps m2, m6, [win + %1 + j + 16 * 4], m2
- fnmaddps m1, m5, [win + %1 + j], m1
-%if ARCH_X86_64
- fmaddps m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
- fnmaddps m7, m11, [win + %1 + j + mmsize], m7
-%endif
-%else ; non-FMA
- mulps m6, m6, [win + %1 + j + 16 * 4]
- mulps m5, m5, [win + %1 + j]
-%if ARCH_X86_64
- mulps m12, m12, [win + %1 + j + mmsize + 16 * 4]
- mulps m11, m11, [win + %1 + j + mmsize]
-%endif
- addps m2, m2, m6
- subps m1, m1, m5
-%if ARCH_X86_64
- addps m8, m8, m12
- subps m7, m7, m11
-%endif
-%endif ; cpuflag(fma3)
- ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
- ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
- SHUF m6, ptr2 + j + (31 - 3) * 4, m5
- mova m5, [ptr1 + j + 16 * 4]
%if ARCH_X86_64
- SHUF m12, ptr2 + j + (31 - 3) * 4 - mmsize, m11
- mova m11, [ptr1 + j + mmsize + 16 * 4]
-%endif
%if cpuflag(fma3)
- fmaddps m3, m5, [win + %1 + j + 32 * 4], m3
- fmaddps m4, m6, [win + %1 + j + 48 * 4], m4
-%if ARCH_X86_64
- fmaddps m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
- fmaddps m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
-%endif
-%else ; non-FMA
- mulps m5, m5, [win + %1 + j + 32 * 4]
- mulps m6, m6, [win + %1 + j + 48 * 4]
-%if ARCH_X86_64
- mulps m11, m11, [win + %1 + j + mmsize + 32 * 4]
- mulps m12, m12, [win + %1 + j + mmsize + 48 * 4]
-%endif
- addps m3, m3, m5
- addps m4, m4, m6
-%if ARCH_X86_64
- addps m9, m9, m11
- addps m10, m10, m12
-%endif
-%endif ; cpuflag(fma3)
- sub j, 64 * 4
-%endmacro
+ mulps m8, m5
+ mulps m10, m5
+ mulps m12, m5
+ mulps m14, m5
+ fmaddps m8, m4, m9, m8
+ fmaddps m10, m4, m11, m10
+ fmaddps m12, m4, m13, m12
+ fmaddps m14, m4, m15, m14
-; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
-; const float window[512], float out[32],
-; intptr_t offset, float scale)
-%macro SYNTH_FILTER 0
-cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
- synth_buf, synth_buf2, window, out, off, scale
-%define scale m0
-%if ARCH_X86_32 || WIN64
-%if cpuflag(sse2) && notcpuflag(avx)
- movd scale, scalem
- SPLATD m0
-%else
- VBROADCASTSS m0, scalem
-%endif
-; Make sure offset is in a register and not on the stack
-%define OFFQ r4q
+ haddps m10, m8
+ haddps m14, m12
+ haddps m14, m10
+ movaps [samplesq+cnt2q], m14
%else
- SPLATD xmm0
-%if cpuflag(avx)
- vinsertf128 m0, m0, xmm0, 1
-%endif
-%define OFFQ offq
-%endif
- ; prepare inner counter limit 1
- mov r5q, 480
- sub r5q, offmp
- and r5q, -64
- shl r5q, 2
-%if ARCH_X86_32 || notcpuflag(avx)
- mov OFFQ, r5q
-%define i r5q
- mov i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize ; main loop counter
+ mulps m8, m5
+ mulps m9, m4
+ mulps m10, m5
+ mulps m11, m4
+ addps m8, m9
+ addps m10, m11
+
+ unpckhps m11, m10, m8
+ unpcklps m10, m8
+ addps m11, m10
+ movhlps m8, m11
+ addps m8, m11
+ movlps [samplesq+cnt2q], m8
+%endif
+%else ; ARCH_X86_32
+%if cpuflag(fma3)
+ mulps m0, m5, [coeffq+cnt1q*8 ]
+ mulps m1, m5, [coeffq+cnt1q*8+32 ]
+ mulps m2, m5, [coeffq+cnt1q*8+64 ]
+ mulps m3, m5, [coeffq+cnt1q*8+96 ]
+ fmaddps m0, m4, [coeffq+cnt1q*8+16 ], m0
+ fmaddps m1, m4, [coeffq+cnt1q*8+48 ], m1
+ fmaddps m2, m4, [coeffq+cnt1q*8+80 ], m2
+ fmaddps m3, m4, [coeffq+cnt1q*8+112], m3
+
+ haddps m1, m0
+ haddps m3, m2
+ haddps m3, m1
+ movaps [samplesq+cnt2q], m3
%else
-%define i 0
-%define OFFQ r5q
-%endif
+ mulps m0, m5, [coeffq+cnt1q*8 ]
+ mulps m1, m4, [coeffq+cnt1q*8+16]
+ mulps m2, m5, [coeffq+cnt1q*8+32]
+ mulps m3, m4, [coeffq+cnt1q*8+48]
+ addps m0, m1
+ addps m2, m3
+
+ unpckhps m3, m2, m0
+ unpcklps m2, m0
+ addps m3, m2
+ movhlps m0, m3
+ addps m0, m3
+ movlps [samplesq+cnt2q], m0
+%endif
+%endif; ARCH
+
+ sub cnt2d, 8 + FMA3_OFFSET
+ add cnt1q, 8 + FMA3_OFFSET
+ jl .inner_loop
+
+ add lfeq, 4
+ add samplesq, 64*sizeof_float
+ mov cnt1q, -32*sizeof_float
+ mov cnt2d, 32*sizeof_float-8-FMA3_OFFSET
+ sub nblocksd, 1
+ jg .loop
+ RET
+%endmacro
-%define buf2 synth_buf2q
-%if ARCH_X86_32
- mov buf2, synth_buf2mp
-%endif
-.mainloop:
- ; m1 = a m2 = b m3 = c m4 = d
- SETZERO m3
- SETZERO m4
- mova m1, [buf2 + i]
- mova m2, [buf2 + i + 16 * 4]
%if ARCH_X86_32
-%define ptr1 r0q
-%define ptr2 r1q
-%define win r2q
-%define j r3q
- mov win, windowm
- mov ptr1, synth_bufm
-%if ARCH_X86_32 || notcpuflag(avx)
- add win, i
- add ptr1, i
+INIT_XMM sse
+LFE_FIR0_FLOAT
%endif
-%else ; ARCH_X86_64
-%define ptr1 r6q
-%define ptr2 r7q ; must be loaded
-%define win r8q
-%define j r9q
- SETZERO m9
- SETZERO m10
- mova m7, [buf2 + i + mmsize]
- mova m8, [buf2 + i + mmsize + 16 * 4]
- lea win, [windowq + i]
- lea ptr1, [synth_bufq + i]
+INIT_XMM sse2
+LFE_FIR0_FLOAT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+LFE_FIR0_FLOAT
%endif
- mov ptr2, synth_bufmp
- ; prepare the inner loop counter
- mov j, OFFQ
-%if ARCH_X86_32 || notcpuflag(avx)
- sub ptr2, i
+%if HAVE_FMA3_EXTERNAL
+INIT_XMM fma3
+LFE_FIR0_FLOAT
%endif
-.loop1:
- INNER_LOOP 0
- jge .loop1
- mov j, 448 * 4
- sub j, OFFQ
- jz .end
- sub ptr1, j
- sub ptr2, j
- add win, OFFQ ; now at j-64, so define OFFSET
- sub j, 64 * 4
-.loop2:
- INNER_LOOP 64 * 4
- jge .loop2
+%macro LFE_FIR1_FLOAT 0
+cglobal lfe_fir1_float, 4, 6, 10, samples, lfe, coeff, nblocks, cnt1, cnt2
+ shr nblocksd, 2
+ sub lfeq, 3*sizeof_float
+ mov cnt1d, 64*sizeof_float
+ mov cnt2d, 64*sizeof_float-16
+ lea coeffq, [coeffq+cnt1q*4]
+ add samplesq, cnt1q
+ neg cnt1q
-.end:
-%if ARCH_X86_32
- mov buf2, synth_buf2m ; needed for next iteration anyway
- mov outq, outmp ; j, which will be set again during it
-%endif
- ;~ out[i] = a * scale;
- ;~ out[i + 16] = b * scale;
- mulps m1, m1, scale
- mulps m2, m2, scale
-%if ARCH_X86_64
- mulps m7, m7, scale
- mulps m8, m8, scale
+.loop:
+%if cpuflag(avx)
+ cvtdq2ps m4, [lfeq]
+ shufps m5, m4, m4, q0123
+%elif cpuflag(sse2)
+ movu m4, [lfeq]
+ cvtdq2ps m4, m4
+ pshufd m5, m4, q0123
%endif
- ;~ synth_buf2[i] = c;
- ;~ synth_buf2[i + 16] = d;
- mova [buf2 + i + 0 * 4], m3
- mova [buf2 + i + 16 * 4], m4
+
+.inner_loop:
+ movaps m6, [coeffq+cnt1q*4 ]
+ movaps m7, [coeffq+cnt1q*4+16]
+ mulps m0, m5, m6
+ mulps m1, m5, m7
%if ARCH_X86_64
- mova [buf2 + i + 0 * 4 + mmsize], m9
- mova [buf2 + i + 16 * 4 + mmsize], m10
+ movaps m8, [coeffq+cnt1q*4+32]
+ movaps m9, [coeffq+cnt1q*4+48]
+ mulps m2, m5, m8
+ mulps m3, m5, m9
+%else
+ mulps m2, m5, [coeffq+cnt1q*4+32]
+ mulps m3, m5, [coeffq+cnt1q*4+48]
%endif
- ;~ out[i] = a;
- ;~ out[i + 16] = a;
- mova [outq + i + 0 * 4], m1
- mova [outq + i + 16 * 4], m2
+
+ haddps m0, m1
+ haddps m2, m3
+ haddps m0, m2
+ movaps [samplesq+cnt1q], m0
+
+ mulps m6, m4
+ mulps m7, m4
%if ARCH_X86_64
- mova [outq + i + 0 * 4 + mmsize], m7
- mova [outq + i + 16 * 4 + mmsize], m8
-%endif
-%if ARCH_X86_32 || notcpuflag(avx)
- sub i, (ARCH_X86_64 + 1) * mmsize
- jge .mainloop
+ mulps m8, m4
+ mulps m9, m4
+
+ haddps m6, m7
+ haddps m8, m9
+ haddps m6, m8
+%else
+ mulps m2, m4, [coeffq+cnt1q*4+32]
+ mulps m3, m4, [coeffq+cnt1q*4+48]
+
+ haddps m6, m7
+ haddps m2, m3
+ haddps m6, m2
%endif
+ movaps [samplesq+cnt2q], m6
+
+ sub cnt2d, 16
+ add cnt1q, 16
+ jl .inner_loop
+
+ add lfeq, sizeof_float
+ add samplesq, 128*sizeof_float
+ mov cnt1q, -64*sizeof_float
+ mov cnt2d, 64*sizeof_float-16
+ sub nblocksd, 1
+ jg .loop
RET
%endmacro
-%if ARCH_X86_32
-INIT_XMM sse
-SYNTH_FILTER
+INIT_XMM sse3
+LFE_FIR1_FLOAT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+LFE_FIR1_FLOAT
%endif
-INIT_XMM sse2
-SYNTH_FILTER
-INIT_YMM avx
-SYNTH_FILTER
-INIT_YMM fma3
-SYNTH_FILTER
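
The rewritten LFE FIR loops above compute four dot products per iteration; on the haddps-capable paths (SSE3/AVX/FMA3) the reduction is three horizontal adds, while the plain SSE2 path reaches the same result with unpck/movhlps. An illustrative C intrinsics fragment of the haddps folding (not FFmpeg code, only the standard SSE3 intrinsic):

    #include <pmmintrin.h>   /* SSE3: _mm_hadd_ps */

    /* p0..p3 hold the element-wise products of four coefficient rows with
     * the replicated LFE input; the result packs their four sums. */
    static __m128 four_dot_products(__m128 p0, __m128 p1, __m128 p2, __m128 p3)
    {
        __m128 s01 = _mm_hadd_ps(p0, p1); /* { p0a+p0b, p0c+p0d, p1a+p1b, p1c+p1d } */
        __m128 s23 = _mm_hadd_ps(p2, p3);
        return _mm_hadd_ps(s01, s23);     /* { sum p0, sum p1, sum p2, sum p3 } */
    }
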
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index 8632c4a98f..fc10fb8bc5 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -1,20 +1,18 @@
/*
- * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+ * This file is part of FFmpeg.
*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -23,66 +21,32 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/dcadsp.h"
-void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs);
-
-av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (EXTERNAL_SSE(cpu_flags)) {
- s->lfe_fir[0] = ff_dca_lfe_fir0_sse;
- s->lfe_fir[1] = ff_dca_lfe_fir1_sse;
- }
-}
-
+#define LFE_FIR_FLOAT_FUNC(opt) \
+void ff_lfe_fir0_float_##opt(float *pcm_samples, int32_t *lfe_samples, \
+ const float *filter_coeff, ptrdiff_t npcmblocks); \
+void ff_lfe_fir1_float_##opt(float *pcm_samples, int32_t *lfe_samples, \
+ const float *filter_coeff, ptrdiff_t npcmblocks);
-#define SYNTH_FILTER_FUNC(opt) \
-void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32], \
- const float window[512], \
- float out[32], intptr_t offset, float scale); \
-static void synth_filter_##opt(FFTContext *imdct, \
- float *synth_buf_ptr, int *synth_buf_offset, \
- float synth_buf2[32], const float window[512], \
- float out[32], const float in[32], float scale) \
-{ \
- float *synth_buf= synth_buf_ptr + *synth_buf_offset; \
- \
- imdct->imdct_half(imdct, synth_buf, in); \
- \
- ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window, \
- out, *synth_buf_offset, scale); \
- \
- *synth_buf_offset = (*synth_buf_offset - 32) & 511; \
-} \
+LFE_FIR_FLOAT_FUNC(sse)
+LFE_FIR_FLOAT_FUNC(sse2)
+LFE_FIR_FLOAT_FUNC(sse3)
+LFE_FIR_FLOAT_FUNC(avx)
+LFE_FIR_FLOAT_FUNC(fma3)
-#if HAVE_YASM
-#if ARCH_X86_32
-SYNTH_FILTER_FUNC(sse)
-#endif
-SYNTH_FILTER_FUNC(sse2)
-SYNTH_FILTER_FUNC(avx)
-SYNTH_FILTER_FUNC(fma3)
-#endif /* HAVE_YASM */
-
-av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
+av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
{
-#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
-#if ARCH_X86_32
- if (EXTERNAL_SSE(cpu_flags)) {
- s->synth_filter_float = synth_filter_sse;
- }
-#endif
- if (EXTERNAL_SSE2(cpu_flags)) {
- s->synth_filter_float = synth_filter_sse2;
- }
- if (EXTERNAL_AVX_FAST(cpu_flags)) {
- s->synth_filter_float = synth_filter_avx;
- }
- if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
- s->synth_filter_float = synth_filter_fma3;
+ if (ARCH_X86_32 && EXTERNAL_SSE(cpu_flags))
+ s->lfe_fir_float[0] = ff_lfe_fir0_float_sse;
+ if (EXTERNAL_SSE2(cpu_flags))
+ s->lfe_fir_float[0] = ff_lfe_fir0_float_sse2;
+ if (EXTERNAL_SSE3(cpu_flags))
+ s->lfe_fir_float[1] = ff_lfe_fir1_float_sse3;
+ if (EXTERNAL_AVX(cpu_flags)) {
+ s->lfe_fir_float[0] = ff_lfe_fir0_float_avx;
+ s->lfe_fir_float[1] = ff_lfe_fir1_float_avx;
}
-#endif /* HAVE_YASM */
+ if (EXTERNAL_FMA3(cpu_flags))
+ s->lfe_fir_float[0] = ff_lfe_fir0_float_fma3;
}
diff --git a/libavcodec/x86/dct32.asm b/libavcodec/x86/dct32.asm
index 2c4c32eb11..4e657b5460 100644
--- a/libavcodec/x86/dct32.asm
+++ b/libavcodec/x86/dct32.asm
@@ -2,20 +2,20 @@
;* 32 point SSE-optimized DCT transform
;* Copyright (c) 2010 Vitor Sessak
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -192,6 +192,7 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
INIT_YMM avx
SECTION .text
+%if HAVE_AVX_EXTERNAL
; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
cglobal dct32_float, 2,3,8, out, in, tmp
; pass 1
@@ -264,6 +265,7 @@ cglobal dct32_float, 2,3,8, out, in, tmp
INIT_XMM
PASS6_AND_PERMUTE
RET
+%endif
%if ARCH_X86_64
%define SPILL SWAP
diff --git a/libavcodec/x86/dct_init.c b/libavcodec/x86/dct_init.c
index b2e43a9be1..c31ef92238 100644
--- a/libavcodec/x86/dct_init.c
+++ b/libavcodec/x86/dct_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/dirac_dwt.asm b/libavcodec/x86/dirac_dwt.asm
new file mode 100644
index 0000000000..89806899a2
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt.asm
@@ -0,0 +1,307 @@
+;******************************************************************************
+;* x86 optimized discrete wavelet transform
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_1991: times 4 dw 9,-1
+
+cextern pw_1
+cextern pw_2
+cextern pw_8
+cextern pw_16
+
+section .text
+
+; %1 -= (%2 + %3 + 2)>>2 %4 is pw_2
+%macro COMPOSE_53iL0 4
+ paddw %2, %3
+ paddw %2, %4
+ psraw %2, 2
+ psubw %1, %2
+%endm
+
+; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
+; if %4 is supplied, %1 is loaded unaligned from there
+; m2: clobbered m3: pw_8 m4: pw_1991
+%macro COMPOSE_DD97iH0 3-4
+ paddw m0, %3
+ paddw m1, %2
+ psubw m0, m3
+ mova m2, m1
+ punpcklwd m1, m0
+ punpckhwd m2, m0
+ pmaddwd m1, m4
+ pmaddwd m2, m4
+%if %0 > 3
+ movu %1, %4
+%endif
+ psrad m1, 4
+ psrad m2, 4
+ packssdw m1, m2
+ paddw m1, %1
+%endm
+
+%macro COMPOSE_VERTICAL 1
+; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+; int width)
+cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
+ mova m2, [pw_2]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m1, [b0q+2*widthq]
+ mova m0, [b1q+2*widthq]
+ COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
+ mova [b1q+2*widthq], m0
+ jg .loop
+ REP_RET
+
+; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+; int width)
+cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
+ mova m1, [pw_1]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m0, [b0q+2*widthq]
+ paddw m0, [b2q+2*widthq]
+ paddw m0, m1
+ psraw m0, 1
+ paddw m0, [b1q+2*widthq]
+ mova [b1q+2*widthq], m0
+ jg .loop
+ REP_RET
+
+; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+; IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
+ mova m3, [pw_8]
+ mova m4, [pw_1991]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m0, [b0q+2*widthq]
+ mova m1, [b1q+2*widthq]
+ COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
+ mova [b2q+2*widthq], m1
+ jg .loop
+ REP_RET
+
+; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+; IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
+ mova m3, [pw_16]
+ mova m4, [pw_1991]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m0, [b0q+2*widthq]
+ mova m1, [b1q+2*widthq]
+ mova m5, [b2q+2*widthq]
+ paddw m0, [b4q+2*widthq]
+ paddw m1, [b3q+2*widthq]
+ psubw m0, m3
+ mova m2, m1
+ punpcklwd m1, m0
+ punpckhwd m2, m0
+ pmaddwd m1, m4
+ pmaddwd m2, m4
+ psrad m1, 5
+ psrad m2, 5
+ packssdw m1, m2
+ psubw m5, m1
+ mova [b2q+2*widthq], m5
+ jg .loop
+ REP_RET
+
+; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
+cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
+ mova m3, [pw_1]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m1, [b1q+2*widthq]
+ mova m0, [b0q+2*widthq]
+ mova m2, m1
+ paddw m1, m3
+ psraw m1, 1
+ psubw m0, m1
+ mova [b0q+2*widthq], m0
+ paddw m2, m0
+ mova [b1q+2*widthq], m2
+ jg .loop
+ REP_RET
+%endmacro
+
+; extend the left and right edges of the tmp array by %1 and %2 respectively
+%macro EDGE_EXTENSION 3
+ mov %3, [tmpq]
+%assign %%i 1
+%rep %1
+ mov [tmpq-2*%%i], %3
+ %assign %%i %%i+1
+%endrep
+ mov %3, [tmpq+2*w2q-2]
+%assign %%i 0
+%rep %2
+ mov [tmpq+2*w2q+2*%%i], %3
+ %assign %%i %%i+1
+%endrep
+%endmacro
+
+
+%macro HAAR_HORIZONTAL 2
+; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
+ mov w2d, wd
+ xor xq, xq
+ shr w2d, 1
+ lea b_w2q, [bq+wq]
+ mova m3, [pw_1]
+.lowpass_loop:
+ movu m1, [b_w2q + 2*xq]
+ mova m0, [bq + 2*xq]
+ paddw m1, m3
+ psraw m1, 1
+ psubw m0, m1
+ mova [tmpq + 2*xq], m0
+ add xq, mmsize/2
+ cmp xq, w2q
+ jl .lowpass_loop
+
+ xor xq, xq
+ and w2q, ~(mmsize/2 - 1)
+ cmp w2q, mmsize/2
+ jl .end
+
+.highpass_loop:
+ movu m1, [b_w2q + 2*xq]
+ mova m0, [tmpq + 2*xq]
+ paddw m1, m0
+
+ ; shift and interleave
+%if %2 == 1
+ paddw m0, m3
+ paddw m1, m3
+ psraw m0, 1
+ psraw m1, 1
+%endif
+ mova m2, m0
+ punpcklwd m0, m1
+ punpckhwd m2, m1
+ mova [bq+4*xq], m0
+ mova [bq+4*xq+mmsize], m2
+
+ add xq, mmsize/2
+ cmp xq, w2q
+ jl .highpass_loop
+.end:
+ REP_RET
+%endmacro
+
+
+INIT_XMM
+; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
+ mov w2d, wd
+ xor xd, xd
+ shr w2d, 1
+ lea b_w2q, [bq+wq]
+ movu m4, [bq+wq]
+ mova m7, [pw_2]
+ pslldq m4, 14
+.lowpass_loop:
+ movu m1, [b_w2q + 2*xq]
+ mova m0, [bq + 2*xq]
+ mova m2, m1
+ palignr m1, m4, 14
+ mova m4, m2
+ COMPOSE_53iL0 m0, m1, m2, m7
+ mova [tmpq + 2*xq], m0
+ add xd, mmsize/2
+ cmp xd, w2d
+ jl .lowpass_loop
+
+ EDGE_EXTENSION 1, 2, xw
+ ; leave the last up to 7 (sse) or 3 (mmx) values for C
+ xor xd, xd
+ and w2d, ~(mmsize/2 - 1)
+ cmp w2d, mmsize/2
+ jl .end
+
+ mova m7, [tmpq-mmsize]
+ mova m0, [tmpq]
+ mova m5, [pw_1]
+ mova m3, [pw_8]
+ mova m4, [pw_1991]
+.highpass_loop:
+ mova m6, m0
+ palignr m0, m7, 14
+ mova m7, [tmpq + 2*xq + 16]
+ mova m1, m7
+ mova m2, m7
+ palignr m1, m6, 2
+ palignr m2, m6, 4
+ COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
+ mova m0, m7
+ mova m7, m6
+
+ ; shift and interleave
+ paddw m6, m5
+ paddw m1, m5
+ psraw m6, 1
+ psraw m1, 1
+ mova m2, m6
+ punpcklwd m6, m1
+ punpckhwd m2, m1
+ mova [bq+4*xq], m6
+ mova [bq+4*xq+mmsize], m2
+
+ add xd, mmsize/2
+ cmp xd, w2d
+ jl .highpass_loop
+.end:
+ REP_RET
+
+
+%if ARCH_X86_64 == 0
+INIT_MMX
+COMPOSE_VERTICAL mmx
+HAAR_HORIZONTAL mmx, 0
+HAAR_HORIZONTAL mmx, 1
+%endif
+
+;;INIT_XMM
+INIT_XMM
+COMPOSE_VERTICAL sse2
+HAAR_HORIZONTAL sse2, 0
+HAAR_HORIZONTAL sse2, 1
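
The vertical compose macros above implement the inverse lifting steps spelled out in their comments; the scalar definitions that the wrappers in dirac_dwt_init.c fall back to live in dirac_dwt.h. A C restatement of the three update rules, written from the comments (function names here are illustrative):

    #include <stdint.h>

    /* COMPOSE_53iL0: %1 -= (%2 + %3 + 2) >> 2 */
    static int16_t compose_53iL0(int16_t b0, int16_t b1, int16_t b2)
    {
        return b1 - ((b0 + b2 + 2) >> 2);
    }

    /* vertical_compose_dirac53iH0: b1 += (b0 + b2 + 1) >> 1 */
    static int16_t compose_dirac53iH0(int16_t b0, int16_t b1, int16_t b2)
    {
        return b1 + ((b0 + b2 + 1) >> 1);
    }

    /* COMPOSE_DD97iH0: b2 += (-b0 + 9*b1 + 9*b3 - b4 + 8) >> 4,
     * the 9/-1 pairs being the pw_1991 pmaddwd constant. */
    static int16_t compose_dd97iH0(int16_t b0, int16_t b1, int16_t b2,
                                   int16_t b3, int16_t b4)
    {
        return b2 + ((-b0 + 9*b1 + 9*b3 - b4 + 8) >> 4);
    }
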
diff --git a/libavcodec/x86/dirac_dwt_init.c b/libavcodec/x86/dirac_dwt_init.c
new file mode 100644
index 0000000000..afdf0a1415
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt_init.c
@@ -0,0 +1,229 @@
+/*
+ * x86 optimized discrete wavelet transform
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/dirac_dwt.h"
+
+#define COMPOSE_VERTICAL(ext, align) \
+void ff_vertical_compose53iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
+void ff_vertical_compose_dirac53iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
+void ff_vertical_compose_dd137iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
+void ff_vertical_compose_dd97iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
+void ff_vertical_compose_haar##ext(int16_t *b0, int16_t *b1, int width); \
+void ff_horizontal_compose_haar0i##ext(int16_t *b, int16_t *tmp, int w);\
+void ff_horizontal_compose_haar1i##ext(int16_t *b, int16_t *tmp, int w);\
+\
+static void vertical_compose53iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
+{ \
+ int i, width_align = width&~(align-1); \
+ int16_t *b0 = (int16_t *)_b0; \
+ int16_t *b1 = (int16_t *)_b1; \
+ int16_t *b2 = (int16_t *)_b2; \
+\
+ for(i=width_align; i<width; i++) \
+ b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
+\
+ ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
+} \
+\
+static void vertical_compose_dirac53iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
+{ \
+ int i, width_align = width&~(align-1); \
+ int16_t *b0 = (int16_t *)_b0; \
+ int16_t *b1 = (int16_t *)_b1; \
+ int16_t *b2 = (int16_t *)_b2; \
+\
+ for(i=width_align; i<width; i++) \
+ b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
+\
+ ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
+} \
+\
+static void vertical_compose_dd137iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
+ uint8_t *_b3, uint8_t *_b4, int width) \
+{ \
+ int i, width_align = width&~(align-1); \
+ int16_t *b0 = (int16_t *)_b0; \
+ int16_t *b1 = (int16_t *)_b1; \
+ int16_t *b2 = (int16_t *)_b2; \
+ int16_t *b3 = (int16_t *)_b3; \
+ int16_t *b4 = (int16_t *)_b4; \
+\
+ for(i=width_align; i<width; i++) \
+ b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
+\
+ ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
+} \
+\
+static void vertical_compose_dd97iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
+ uint8_t *_b3, uint8_t *_b4, int width) \
+{ \
+ int i, width_align = width&~(align-1); \
+ int16_t *b0 = (int16_t *)_b0; \
+ int16_t *b1 = (int16_t *)_b1; \
+ int16_t *b2 = (int16_t *)_b2; \
+ int16_t *b3 = (int16_t *)_b3; \
+ int16_t *b4 = (int16_t *)_b4; \
+\
+ for(i=width_align; i<width; i++) \
+ b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
+\
+ ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
+} \
+static void vertical_compose_haar##ext(uint8_t *_b0, uint8_t *_b1, int width) \
+{ \
+ int i, width_align = width&~(align-1); \
+ int16_t *b0 = (int16_t *)_b0; \
+ int16_t *b1 = (int16_t *)_b1; \
+\
+ for(i=width_align; i<width; i++) { \
+ b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
+ b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
+ } \
+\
+ ff_vertical_compose_haar##ext(b0, b1, width_align); \
+} \
+static void horizontal_compose_haar0i##ext(uint8_t *_b, uint8_t *_tmp, int w)\
+{\
+ int w2= w>>1;\
+ int x= w2 - (w2&(align-1));\
+ int16_t *b = (int16_t *)_b; \
+ int16_t *tmp = (int16_t *)_tmp; \
+\
+ ff_horizontal_compose_haar0i##ext(b, tmp, w);\
+\
+ for (; x < w2; x++) {\
+ b[2*x ] = tmp[x];\
+ b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\
+ }\
+}\
+static void horizontal_compose_haar1i##ext(uint8_t *_b, uint8_t *_tmp, int w)\
+{\
+ int w2= w>>1;\
+ int x= w2 - (w2&(align-1));\
+ int16_t *b = (int16_t *)_b; \
+ int16_t *tmp = (int16_t *)_tmp; \
+\
+ ff_horizontal_compose_haar1i##ext(b, tmp, w);\
+\
+ for (; x < w2; x++) {\
+ b[2*x ] = (tmp[x] + 1)>>1;\
+ b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\
+ }\
+}\
+\
+
+#if HAVE_YASM
+#if !ARCH_X86_64
+COMPOSE_VERTICAL(_mmx, 4)
+#endif
+COMPOSE_VERTICAL(_sse2, 8)
+
+
+void ff_horizontal_compose_dd97i_ssse3(int16_t *_b, int16_t *_tmp, int w);
+
+static void horizontal_compose_dd97i_ssse3(uint8_t *_b, uint8_t *_tmp, int w)
+{
+ int w2= w>>1;
+ int x= w2 - (w2&7);
+ int16_t *b = (int16_t *)_b;
+ int16_t *tmp = (int16_t *)_tmp;
+
+ ff_horizontal_compose_dd97i_ssse3(b, tmp, w);
+
+ for (; x < w2; x++) {
+ b[2*x ] = (tmp[x] + 1)>>1;
+ b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
+ }
+}
+#endif
+
+void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type)
+{
+#if HAVE_YASM
+ int mm_flags = av_get_cpu_flags();
+
+#if !ARCH_X86_64
+ if (!(mm_flags & AV_CPU_FLAG_MMX))
+ return;
+
+ switch (type) {
+ case DWT_DIRAC_DD9_7:
+ d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
+ d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
+ break;
+ case DWT_DIRAC_LEGALL5_3:
+ d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
+ d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx;
+ break;
+ case DWT_DIRAC_DD13_7:
+ d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx;
+ d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
+ break;
+ case DWT_DIRAC_HAAR0:
+ d->vertical_compose = (void*)vertical_compose_haar_mmx;
+ d->horizontal_compose = horizontal_compose_haar0i_mmx;
+ break;
+ case DWT_DIRAC_HAAR1:
+ d->vertical_compose = (void*)vertical_compose_haar_mmx;
+ d->horizontal_compose = horizontal_compose_haar1i_mmx;
+ break;
+ }
+#endif
+
+ if (!(mm_flags & AV_CPU_FLAG_SSE2))
+ return;
+
+ switch (type) {
+ case DWT_DIRAC_DD9_7:
+ d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
+ d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
+ break;
+ case DWT_DIRAC_LEGALL5_3:
+ d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
+ d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
+ break;
+ case DWT_DIRAC_DD13_7:
+ d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
+ d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
+ break;
+ case DWT_DIRAC_HAAR0:
+ d->vertical_compose = (void*)vertical_compose_haar_sse2;
+ d->horizontal_compose = horizontal_compose_haar0i_sse2;
+ break;
+ case DWT_DIRAC_HAAR1:
+ d->vertical_compose = (void*)vertical_compose_haar_sse2;
+ d->horizontal_compose = horizontal_compose_haar1i_sse2;
+ break;
+ }
+
+ if (!(mm_flags & AV_CPU_FLAG_SSSE3))
+ return;
+
+ switch (type) {
+ case DWT_DIRAC_DD9_7:
+ d->horizontal_compose = horizontal_compose_dd97i_ssse3;
+ break;
+ }
+#endif // HAVE_YASM
+}
diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
new file mode 100644
index 0000000000..6b3f780e41
--- /dev/null
+++ b/libavcodec/x86/diracdsp.asm
@@ -0,0 +1,347 @@
+;******************************************************************************
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_7: times 8 dw 7
+convert_to_unsigned_10bit: times 4 dd 0x200
+clip_10bit: times 8 dw 0x3ff
+
+cextern pw_3
+cextern pw_16
+cextern pw_32
+cextern pb_80
+
+section .text
+
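+; Unpack two 8-bit sources to 16 bits and add them: %1 receives the sums of
+; the low halves, %2 the sums of the high halves. %5/%6 select aligned or
+; unaligned loads; m7 must be zero.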
+%macro UNPACK_ADD 6
+ mov%5 %1, %3
+ mov%6 m5, %4
+ mova m4, %1
+ mova %2, m5
+ punpcklbw %1, m7
+ punpcklbw m5, m7
+ punpckhbw m4, m7
+ punpckhbw %2, m7
+ paddw %1, m5
+ paddw %2, m4
+%endmacro
+
+%macro HPEL_FILTER 1
+; dirac_hpel_filter_v_%1(uint8_t *dst, uint8_t *src, int stride, int width);
+cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
+ mov src0q, srcq
+ lea stridex3q, [3*strideq]
+ sub src0q, stridex3q
+ pxor m7, m7
+.loop:
+ ; 7*(src[0] + src[1])
+ UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
+ pmullw m0, [pw_7]
+ pmullw m1, [pw_7]
+
+ ; 3*( ... + src[-2] + src[3])
+ UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
+ paddw m0, m2
+ paddw m1, m3
+ pmullw m0, [pw_3]
+ pmullw m1, [pw_3]
+
+ ; ... - 7*(src[-1] + src[2])
+ UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
+ pmullw m2, [pw_7]
+ pmullw m3, [pw_7]
+ psubw m0, m2
+ psubw m1, m3
+
+ ; ... - (src[-3] + src[4])
+ UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
+ psubw m0, m2
+ psubw m1, m3
+
+ paddw m0, [pw_16]
+ paddw m1, [pw_16]
+ psraw m0, 5
+ psraw m1, 5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, mmsize
+ add srcq, mmsize
+ add src0q, mmsize
+ sub widthd, mmsize
+ jg .loop
+ RET
+
+; dirac_hpel_filter_h_%1(uint8_t *dst, uint8_t *src, int width);
+cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
+ dec widthd
+ pxor m7, m7
+ and widthd, ~(mmsize-1)
+.loop:
+ ; 7*(src[0] + src[1])
+ UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
+ pmullw m0, [pw_7]
+ pmullw m1, [pw_7]
+
+ ; 3*( ... + src[-2] + src[3])
+ UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
+ paddw m0, m2
+ paddw m1, m3
+ pmullw m0, [pw_3]
+ pmullw m1, [pw_3]
+
+ ; ... - 7*(src[-1] + src[2])
+ UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
+ pmullw m2, [pw_7]
+ pmullw m3, [pw_7]
+ psubw m0, m2
+ psubw m1, m3
+
+ ; ... - (src[-3] + src[4])
+ UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
+ psubw m0, m2
+ psubw m1, m3
+
+ paddw m0, [pw_16]
+ paddw m1, [pw_16]
+ psraw m0, 5
+ psraw m1, 5
+ packuswb m0, m1
+ mova [dstq + widthq], m0
+ sub widthd, mmsize
+ jge .loop
+ RET
+%endmacro
+
+%macro PUT_RECT 1
+; void put_signed_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
+cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
+ mova m0, [pb_80]
+ add wd, (mmsize-1)
+ and wd, ~(mmsize-1)
+
+%if ARCH_X86_64
+ movsxd dst_strideq, dst_strided
+ movsxd src_strideq, src_strided
+ mov r7d, r5m
+ mov r8d, wd
+ %define wspill r8d
+ %define hd r7d
+%else
+ mov r4m, wd
+ %define wspill r4m
+ %define hd r5mp
+%endif
+
+.loopy:
+ lea src2q, [srcq+src_strideq]
+ lea dst2q, [dstq+dst_strideq]
+.loopx:
+ sub wd, mmsize
+ mova m1, [srcq +2*wq]
+ mova m2, [src2q+2*wq]
+ packsswb m1, [srcq +2*wq+mmsize]
+ packsswb m2, [src2q+2*wq+mmsize]
+ paddb m1, m0
+ paddb m2, m0
+ mova [dstq +wq], m1
+ mova [dst2q+wq], m2
+ jg .loopx
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+ sub hd, 2
+ mov wd, wspill
+ jg .loopy
+ RET
+%endm
+
+%macro ADD_RECT 1
+; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
+cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
+ mova m0, [pw_32]
+ add wd, (mmsize-1)
+ and wd, ~(mmsize-1)
+
+%if ARCH_X86_64
+ movsxd strideq, strided
+ movsxd idwt_strideq, idwt_strided
+ mov r8d, wd
+ %define wspill r8d
+%else
+ mov r5m, wd
+ %define wspill r5m
+%endif
+
+.loop:
+ sub wd, mmsize
+ movu m1, [srcq +2*wq] ; FIXME: ensure alignment
+ paddw m1, m0
+ psraw m1, 6
+ movu m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
+ paddw m2, m0
+ psraw m2, 6
+ paddw m1, [idwtq+2*wq]
+ paddw m2, [idwtq+2*wq+mmsize]
+ packuswb m1, m2
+ mova [dstq +wq], m1
+ jg .loop
+
+ lea srcq, [srcq + 2*strideq]
+ add dstq, strideq
+ lea idwtq, [idwtq+ 2*idwt_strideq]
+ sub hd, 1
+ mov wd, wspill
+ jg .loop
+ RET
+%endm
+
+%macro ADD_OBMC 2
+; void add_dirac_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
+cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
+ pxor m4, m4
+.loop:
+%assign i 0
+%rep %1 / mmsize
+ mova m0, [srcq+i]
+ mova m1, m0
+ punpcklbw m0, m4
+ punpckhbw m1, m4
+ mova m2, [obmcq+i]
+ mova m3, m2
+ punpcklbw m2, m4
+ punpckhbw m3, m4
+ pmullw m0, m2
+ pmullw m1, m3
+ movu m2, [dstq+2*i]
+ movu m3, [dstq+2*i+mmsize]
+ paddw m0, m2
+ paddw m1, m3
+ movu [dstq+2*i], m0
+ movu [dstq+2*i+mmsize], m1
+%assign i i+mmsize
+%endrep
+ lea srcq, [srcq+strideq]
+ lea dstq, [dstq+2*strideq]
+ add obmcq, 32
+ sub yblend, 1
+ jg .loop
+ RET
+%endm
+
+INIT_MMX
+%if ARCH_X86_64 == 0
+PUT_RECT mmx
+ADD_RECT mmx
+
+HPEL_FILTER mmx
+ADD_OBMC 32, mmx
+ADD_OBMC 16, mmx
+%endif
+ADD_OBMC 8, mmx
+
+INIT_XMM
+PUT_RECT sse2
+ADD_RECT sse2
+
+HPEL_FILTER sse2
+ADD_OBMC 32, sse2
+ADD_OBMC 16, sse2
+
+INIT_XMM sse4
+
+; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)
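+; dst[i] = sign(src[i]) * ((abs(src[i]) * qf + qs) >> 2)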
+cglobal dequant_subband_32, 7, 7, 4, src, dst, stride, qf, qs, tot_v, tot_h
+ movd m2, qfd
+ movd m3, qsd
+ SPLATD m2
+ SPLATD m3
+ mov r4, tot_hq
+ mov r3, dstq
+
+ .loop_v:
+ mov tot_hq, r4
+ mov dstq, r3
+
+ .loop_h:
+ movu m0, [srcq]
+
+ pabsd m1, m0
+ pmulld m1, m2
+ paddd m1, m3
+ psrld m1, 2
+ psignd m1, m0
+
+ movu [dstq], m1
+
+ add srcq, mmsize
+ add dstq, mmsize
+ sub tot_hd, 4
+ jg .loop_h
+
+ add r3, strideq
+ dec tot_vd
+ jg .loop_v
+
+ RET
+
+INIT_XMM sse4
+; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
+%if ARCH_X86_64
+cglobal put_signed_rect_clamped_10, 6, 8, 5, dst, dst_stride, src, src_stride, w, h, t1, t2
+%else
+cglobal put_signed_rect_clamped_10, 5, 7, 5, dst, dst_stride, src, src_stride, w, t1, t2
+ %define hd r5mp
+%endif
+ shl wd, 2
+ add srcq, wq
+ neg wq
+ mov t2q, dstq
+ mov t1q, wq
+ pxor m2, m2
+ mova m3, [clip_10bit]
+ mova m4, [convert_to_unsigned_10bit]
+
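+ ; bias the signed 10-bit samples by 512, then clamp the result to [0, 1023]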
+ .loop_h:
+ mov dstq, t2q
+ mov wq, t1q
+
+ .loop_w:
+ movu m0, [srcq+wq+0*mmsize]
+ movu m1, [srcq+wq+1*mmsize]
+
+ paddd m0, m4
+ paddd m1, m4
+ packusdw m0, m0, m1
+ CLIPW m0, m2, m3 ; packusdw saturates so it's fine
+
+ movu [dstq], m0
+
+ add dstq, 1*mmsize
+ add wq, 2*mmsize
+ jl .loop_w
+
+ add srcq, src_strideq
+ add t2q, dst_strideq
+ sub hd, 1
+ jg .loop_h
+
+ RET
diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
new file mode 100644
index 0000000000..b195113789
--- /dev/null
+++ b/libavcodec/x86/diracdsp_init.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (C) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/diracdsp.h"
+#include "fpel.h"
+
+DECL_DIRAC_PIXOP(put, mmx);
+DECL_DIRAC_PIXOP(avg, mmx);
+DECL_DIRAC_PIXOP(avg, mmxext);
+
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+
+void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
+void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
+
+void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
+
+void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h);
+
+#if HAVE_YASM
+
+#define HPEL_FILTER(MMSIZE, EXT) \
+ void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int); \
+ void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int); \
+ \
+ static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \
+ const uint8_t *src, int stride, int width, int height) \
+ { \
+ while( height-- ) \
+ { \
+ ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \
+ ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width); \
+ ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width); \
+ \
+ dsth += stride; \
+ dstv += stride; \
+ dstc += stride; \
+ src += stride; \
+ } \
+ }
+
+#define PIXFUNC(PFX, IDX, EXT) \
+ /* MMX DISABLED: c->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT; */ \
+ c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \
+ c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT
+
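+/* The SIMD pixel ops process four rows per iteration, so heights that are
+ * not a multiple of 4 fall back to the C versions. */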
+#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
+void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+ if (h&3)\
+ ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
+ else\
+ OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+ if (h&3)\
+ ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
+ else\
+ OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+ if (h&3) {\
+ ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
+ } else {\
+ OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
+ OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
+ }\
+}
+
+DIRAC_PIXOP(put, ff_put, mmx)
+DIRAC_PIXOP(avg, ff_avg, mmx)
+DIRAC_PIXOP(avg, ff_avg, mmxext)
+
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+ if (h&3)
+ ff_put_dirac_pixels16_c(dst, src, stride, h);
+ else
+ ff_put_pixels16_sse2(dst, src[0], stride, h);
+}
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+ if (h&3)
+ ff_avg_dirac_pixels16_c(dst, src, stride, h);
+ else
+ ff_avg_pixels16_sse2(dst, src[0], stride, h);
+}
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+ if (h&3) {
+ ff_put_dirac_pixels32_c(dst, src, stride, h);
+ } else {
+ ff_put_pixels16_sse2(dst , src[0] , stride, h);
+ ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
+ }
+}
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+ if (h&3) {
+ ff_avg_dirac_pixels32_c(dst, src, stride, h);
+ } else {
+ ff_avg_pixels16_sse2(dst , src[0] , stride, h);
+ ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
+ }
+}
+
+#else // HAVE_YASM
+
+#define HPEL_FILTER(MMSIZE, EXT) \
+ void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \
+ const uint8_t *src, int stride, int width, int height);
+
+#define PIXFUNC(PFX, IDX, EXT) do {} while (0)
+
+#endif // HAVE_YASM
+
+#if !ARCH_X86_64
+HPEL_FILTER(8, mmx)
+#endif
+HPEL_FILTER(16, sse2)
+
+void ff_diracdsp_init_x86(DiracDSPContext* c)
+{
+ int mm_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(mm_flags)) {
+ c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
+#if !ARCH_X86_64
+ c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx;
+ c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx;
+ c->dirac_hpel_filter = dirac_hpel_filter_mmx;
+ c->add_rect_clamped = ff_add_rect_clamped_mmx;
+ c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_mmx;
+#endif
+ PIXFUNC(put, 0, mmx);
+ PIXFUNC(avg, 0, mmx);
+ }
+
+ if (EXTERNAL_MMXEXT(mm_flags)) {
+ PIXFUNC(avg, 0, mmxext);
+ }
+
+ if (EXTERNAL_SSE2(mm_flags)) {
+ c->dirac_hpel_filter = dirac_hpel_filter_sse2;
+ c->add_rect_clamped = ff_add_rect_clamped_sse2;
+ c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_sse2;
+
+ c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
+ c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
+
+ c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2;
+ c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2;
+ c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
+ c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
+ }
+
+ if (EXTERNAL_SSE4(mm_flags)) {
+ c->dequant_subband[1] = ff_dequant_subband_32_sse4;
+ c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
+ }
+}
diff --git a/libavcodec/x86/dnxhdenc.asm b/libavcodec/x86/dnxhdenc.asm
index d39b07b9f4..9dd6d51ee6 100644
--- a/libavcodec/x86/dnxhdenc.asm
+++ b/libavcodec/x86/dnxhdenc.asm
@@ -3,20 +3,20 @@
;* Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
;* Copyright (c) 2014 Tiancheng "Timothy" Gu <timothygu99@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/dnxhdenc_init.c b/libavcodec/x86/dnxhdenc_init.c
index f1ff7bd986..fd6f15005a 100644
--- a/libavcodec/x86/dnxhdenc_init.c
+++ b/libavcodec/x86/dnxhdenc_init.c
@@ -4,20 +4,20 @@
*
* VC-3 encoder funded by the British Broadcasting Corporation
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/fdct.c b/libavcodec/x86/fdct.c
index 6528b57361..112566ded0 100644
--- a/libavcodec/x86/fdct.c
+++ b/libavcodec/x86/fdct.c
@@ -13,20 +13,20 @@
* a page about fdct at http://www.geocities.com/ssavekar/dct.htm
* Skal's fdct at http://skal.planet-d.net/coding/dct.html
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -70,7 +70,7 @@ DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };
-static struct
+static const struct
{
DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
} fdct_r_row_sse2 =
@@ -153,7 +153,7 @@ DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct
29692, -12299, 26722, -31521,
};
-static struct
+static const struct
{
DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
} tab_frw_01234567_sse2 =
diff --git a/libavcodec/x86/fdct.h b/libavcodec/x86/fdct.h
index c94a977e8f..648cdc5350 100644
--- a/libavcodec/x86/fdct.h
+++ b/libavcodec/x86/fdct.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/fdctdsp_init.c b/libavcodec/x86/fdctdsp_init.c
index 4e8e4eb60d..0cb5fd625b 100644
--- a/libavcodec/x86/fdctdsp_init.c
+++ b/libavcodec/x86/fdctdsp_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/fft.asm b/libavcodec/x86/fft.asm
index ef007f4eec..cdbfd66e82 100644
--- a/libavcodec/x86/fft.asm
+++ b/libavcodec/x86/fft.asm
@@ -6,20 +6,20 @@
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -51,13 +51,12 @@ struc FFTContext
.imdcthalf:pointer 1
endstruc
-SECTION_RODATA
+SECTION_RODATA 32
%define M_SQRT1_2 0.70710678118654752440
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509
-align 32
ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
@@ -69,11 +68,12 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
-ps_m1m1m1m1: times 4 dd 1<<31
ps_m1p1: dd 1<<31, 0
+cextern ps_neg
+
%assign i 16
-%rep 13
+%rep 14
cextern cos_ %+ i
%assign i i<<1
%endrep
@@ -305,6 +305,7 @@ IF%1 mova Z(1), m5
INIT_YMM avx
+%if HAVE_AVX_EXTERNAL
align 16
fft8_avx:
mova m0, Z(0)
@@ -394,6 +395,8 @@ fft32_interleave_avx:
jg .deint_loop
ret
+%endif
+
INIT_XMM sse
align 16
@@ -537,6 +540,7 @@ DEFINE_ARGS zc, w, n, o1, o3
INIT_YMM avx
+%if HAVE_AVX_EXTERNAL
%macro INTERL_AVX 5
vunpckhps %3, %2, %1
vunpcklps %2, %2, %1
@@ -558,6 +562,7 @@ cglobal fft_calc, 2,5,8
FFT_DISPATCH _interleave %+ SUFFIX, r1
REP_RET
+%endif
INIT_XMM sse
@@ -655,6 +660,68 @@ cglobal fft_permute, 2,7,1
jl .loopcopy
REP_RET
+%macro IMDCT_CALC_FUNC 0
+cglobal imdct_calc, 3,5,3
+ mov r3d, [r0 + FFTContext.mdctsize]
+ mov r4, [r0 + FFTContext.imdcthalf]
+ add r1, r3
+ PUSH r3
+ PUSH r1
+%if ARCH_X86_32
+ push r2
+ push r1
+ push r0
+%else
+ sub rsp, 8+32*WIN64 ; allocate win64 shadow space
+%endif
+ call r4
+%if ARCH_X86_32
+ add esp, 12
+%else
+ add rsp, 8+32*WIN64
+%endif
+ POP r1
+ POP r3
+ lea r0, [r1 + 2*r3]
+ mov r2, r3
+ sub r3, mmsize
+ neg r2
+ mova m2, [ps_neg]
+.loop:
+%if mmsize == 8
+ PSWAPD m0, [r1 + r3]
+ PSWAPD m1, [r0 + r2]
+ pxor m0, m2
+%else
+ mova m0, [r1 + r3]
+ mova m1, [r0 + r2]
+ shufps m0, m0, 0x1b
+ shufps m1, m1, 0x1b
+ xorps m0, m2
+%endif
+ mova [r0 + r3], m1
+ mova [r1 + r2], m0
+ sub r3, mmsize
+ add r2, mmsize
+ jl .loop
+%if cpuflag(3dnow)
+ femms
+ RET
+%else
+ REP_RET
+%endif
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX 3dnow
+IMDCT_CALC_FUNC
+INIT_MMX 3dnowext
+IMDCT_CALC_FUNC
+%endif
+
+INIT_XMM sse
+IMDCT_CALC_FUNC
+
%if ARCH_X86_32
INIT_MMX 3dnow
%define mulps pfmul
@@ -689,7 +756,7 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%endif
%assign n 1<<%1
-%rep 17-%1
+%rep 18-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
@@ -714,9 +781,11 @@ align 8
dispatch_tab %+ fullsuffix: pointer list_of_fft
%endmacro ; DECL_FFT
+%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECL_FFT 6
DECL_FFT 6, _interleave
+%endif
INIT_XMM sse
DECL_FFT 5
DECL_FFT 5, _interleave
@@ -729,70 +798,6 @@ DECL_FFT 4
DECL_FFT 4, _interleave
%endif
-%if CONFIG_MDCT
-
-%macro IMDCT_CALC_FUNC 0
-cglobal imdct_calc, 3,5,3
- mov r3d, [r0 + FFTContext.mdctsize]
- mov r4, [r0 + FFTContext.imdcthalf]
- add r1, r3
- PUSH r3
- PUSH r1
-%if ARCH_X86_32
- push r2
- push r1
- push r0
-%else
- sub rsp, 8+32*WIN64 ; allocate win64 shadow space
-%endif
- call r4
-%if ARCH_X86_32
- add esp, 12
-%else
- add rsp, 8+32*WIN64
-%endif
- POP r1
- POP r3
- lea r0, [r1 + 2*r3]
- mov r2, r3
- sub r3, mmsize
- neg r2
- mova m2, [ps_m1m1m1m1]
-.loop:
-%if mmsize == 8
- PSWAPD m0, [r1 + r3]
- PSWAPD m1, [r0 + r2]
- pxor m0, m2
-%else
- mova m0, [r1 + r3]
- mova m1, [r0 + r2]
- shufps m0, m0, 0x1b
- shufps m1, m1, 0x1b
- xorps m0, m2
-%endif
- mova [r0 + r3], m1
- mova [r1 + r2], m0
- sub r3, mmsize
- add r2, mmsize
- jl .loop
-%if cpuflag(3dnow)
- femms
- RET
-%else
- REP_RET
-%endif
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX 3dnow
-IMDCT_CALC_FUNC
-INIT_MMX 3dnowext
-IMDCT_CALC_FUNC
-%endif
-
-INIT_XMM sse
-IMDCT_CALC_FUNC
-
INIT_XMM sse
%undef mulps
%undef addps
@@ -994,7 +999,7 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
sub r4, r3
%endif
%if notcpuflag(3dnowext) && mmsize == 8
- movd m7, [ps_m1m1m1m1]
+ movd m7, [ps_neg]
%endif
.pre:
%if ARCH_X86_64 == 0
@@ -1082,6 +1087,7 @@ DECL_IMDCT POSROTATESHUF_3DNOW
%endif
INIT_YMM avx
-DECL_IMDCT POSROTATESHUF_AVX
-%endif ; CONFIG_MDCT
+%if HAVE_AVX_EXTERNAL
+DECL_IMDCT POSROTATESHUF_AVX
+%endif
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index 94405d0cb4..398091eb1f 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -27,4 +27,12 @@ void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z);
+void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
+
#endif /* AVCODEC_X86_FFT_H */
diff --git a/libavcodec/x86/fft_init.c b/libavcodec/x86/fft_init.c
index ed1290997b..928f1dcda7 100644
--- a/libavcodec/x86/fft_init.c
+++ b/libavcodec/x86/fft_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -28,23 +28,33 @@ av_cold void ff_fft_init_x86(FFTContext *s)
{
int cpu_flags = av_get_cpu_flags();
+ if (s->nbits > 16)
+ return;
+
#if ARCH_X86_32
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
+ s->imdct_calc = ff_imdct_calc_3dnow;
+ s->imdct_half = ff_imdct_half_3dnow;
s->fft_calc = ff_fft_calc_3dnow;
}
if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
+ s->imdct_calc = ff_imdct_calc_3dnowext;
+ s->imdct_half = ff_imdct_half_3dnowext;
s->fft_calc = ff_fft_calc_3dnowext;
}
#endif /* ARCH_X86_32 */
if (EXTERNAL_SSE(cpu_flags)) {
+ s->imdct_calc = ff_imdct_calc_sse;
+ s->imdct_half = ff_imdct_half_sse;
s->fft_permute = ff_fft_permute_sse;
s->fft_calc = ff_fft_calc_sse;
s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
}
if (EXTERNAL_AVX_FAST(cpu_flags) && s->nbits >= 5) {
+ s->imdct_half = ff_imdct_half_avx;
s->fft_calc = ff_fft_calc_avx;
s->fft_permutation = FF_FFT_PERM_AVX;
}
diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
new file mode 100644
index 0000000000..e285158185
--- /dev/null
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -0,0 +1,101 @@
+;******************************************************************************
+;* FLAC DSP functions
+;*
+;* Copyright (c) 2014 James Darnley <james.darnley@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+INIT_XMM sse4
+%if ARCH_X86_64
+ cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs
+ DECLARE_REG_TMP 5, 6
+ %define length r2d
+
+ movsxd orderq, orderd
+%else
+ cglobal flac_enc_lpc_16, 5, 6, 8, 0, res, smp, len, order, coefs
+ DECLARE_REG_TMP 2, 5
+ %define length r2mp
+%endif
+
+; Here we assume that the maximum order value is 32. This means that we only
+; need to copy a maximum of 32 samples. Therefore we let the preprocessor
+; unroll this loop and copy all 32.
+%assign iter 0
+%rep 32/(mmsize/4)
+ movu m0, [smpq+iter]
+ movu [resq+iter], m0
+ %assign iter iter+mmsize
+%endrep
+
+lea resq, [resq+orderq*4]
+lea smpq, [smpq+orderq*4]
+lea coefsq, [coefsq+orderq*4]
+sub length, orderd
+movd m3, r5m
+neg orderq
+
+%define posj t0q
+%define negj t1q
+
+.looplen:
+ pxor m0, m0
+ pxor m4, m4
+ pxor m6, m6
+ mov posj, orderq
+ xor negj, negj
+
+ .looporder:
+ movd m2, [coefsq+posj*4] ; c = coefs[j]
+ SPLATD m2
+ movu m1, [smpq+negj*4-4] ; s = smp[i-j-1]
+ movu m5, [smpq+negj*4-4+mmsize]
+ movu m7, [smpq+negj*4-4+mmsize*2]
+ pmulld m1, m2
+ pmulld m5, m2
+ pmulld m7, m2
+ paddd m0, m1 ; p += c * s
+ paddd m4, m5
+ paddd m6, m7
+
+ dec negj
+ inc posj
+ jnz .looporder
+
+ psrad m0, m3 ; p >>= shift
+ psrad m4, m3
+ psrad m6, m3
+ movu m1, [smpq]
+ movu m5, [smpq+mmsize]
+ movu m7, [smpq+mmsize*2]
+ psubd m1, m0 ; smp[i] - p
+ psubd m5, m4
+ psubd m7, m6
+ movu [resq], m1 ; res[i] = smp[i] - (p >> shift)
+ movu [resq+mmsize], m5
+ movu [resq+mmsize*2], m7
+
+ add resq, 3*mmsize
+ add smpq, 3*mmsize
+ sub length, (3*mmsize)/4
+jg .looplen
+RET
diff --git a/libavcodec/x86/flacdsp.asm b/libavcodec/x86/flacdsp.asm
new file mode 100644
index 0000000000..7138611526
--- /dev/null
+++ b/libavcodec/x86/flacdsp.asm
@@ -0,0 +1,313 @@
+;******************************************************************************
+;* FLAC DSP SIMD optimizations
+;*
+;* Copyright (C) 2014 Loren Merritt
+;* Copyright (C) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
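+; Multiply-accumulate of packed signed dwords into qwords: %1 += %2 * %3
+; (low dword of each qword lane). Uses XOP's pmacsdql when available; the
+; SSE4 fallback clobbers %2.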
+%macro PMACSDQL 5
+%if cpuflag(xop)
+ pmacsdql %1, %2, %3, %1
+%else
+ pmuldq %2, %3
+ paddq %1, %2
+%endif
+%endmacro
+
+%macro LPC_32 1
+INIT_XMM %1
+cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
+ sub lend, pred_orderd
+ jle .ret
+ lea decodedq, [decodedq+pred_orderq*4-8]
+ lea coeffsq, [coeffsq+pred_orderq*4]
+ neg pred_orderq
+ movd m4, qlevelm
+ALIGN 16
+.loop_sample:
+ movd m0, [decodedq+pred_orderq*4+8]
+ add decodedq, 8
+ movd m1, [coeffsq+pred_orderq*4]
+ pxor m2, m2
+ pxor m3, m3
+ lea jq, [pred_orderq+1]
+ test jq, jq
+ jz .end_order
+.loop_order:
+ PMACSDQL m2, m0, m1, m2, m0
+ movd m0, [decodedq+jq*4]
+ PMACSDQL m3, m1, m0, m3, m1
+ movd m1, [coeffsq+jq*4]
+ inc jq
+ jl .loop_order
+.end_order:
+ PMACSDQL m2, m0, m1, m2, m0
+ psrlq m2, m4
+ movd m0, [decodedq]
+ paddd m0, m2
+ movd [decodedq], m0
+ sub lend, 2
+ jl .ret
+ PMACSDQL m3, m1, m0, m3, m1
+ psrlq m3, m4
+ movd m1, [decodedq+4]
+ paddd m1, m3
+ movd [decodedq+4], m1
+ jg .loop_sample
+.ret:
+ REP_RET
+%endmacro
+
+%if HAVE_XOP_EXTERNAL
+LPC_32 xop
+%endif
+LPC_32 sse4
+
+;----------------------------------------------------------------------------------
+;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
+; int len, int shift);
+;----------------------------------------------------------------------------------
+%macro FLAC_DECORRELATE_16 3-4
+cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
+%if ARCH_X86_32
+ mov lend, lenm
+%endif
+ movd m3, r4m
+ shl lend, 2
+ mov in1q, [in0q + gprsize]
+ mov in0q, [in0q]
+ mov outq, [outq]
+ add in1q, lenq
+ add in0q, lenq
+ add outq, lenq
+ neg lenq
+
+align 16
+.loop:
+ mova m0, [in0q + lenq]
+ mova m1, [in1q + lenq]
+%ifidn %1, ms
+ psrad m2, m1, 1
+ psubd m0, m2
+%endif
+%ifnidn %1, indep2
+ p%4d m2, m0, m1
+%endif
+ packssdw m%2, m%2
+ packssdw m%3, m%3
+ punpcklwd m%2, m%3
+ psllw m%2, m3
+ mova [outq + lenq], m%2
+ add lenq, 16
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_16 ls, 0, 2, sub
+FLAC_DECORRELATE_16 rs, 2, 1, add
+FLAC_DECORRELATE_16 ms, 2, 0, add
+
+;----------------------------------------------------------------------------------
+;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
+; int len, int shift);
+;----------------------------------------------------------------------------------
+%macro FLAC_DECORRELATE_32 5
+cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
+%if ARCH_X86_32
+ mov lend, lenm
+%endif
+ movd m3, r4m
+ mov in1q, [in0q + gprsize]
+ mov in0q, [in0q]
+ mov outq, [outq]
+ sub in1q, in0q
+
+align 16
+.loop:
+ mova m0, [in0q]
+ mova m1, [in0q + in1q]
+%ifidn %1, ms
+ psrad m2, m1, 1
+ psubd m0, m2
+%endif
+ p%5d m2, m0, m1
+ pslld m%2, m3
+ pslld m%3, m3
+
+ SBUTTERFLY dq, %2, %3, %4
+
+ mova [outq ], m%2
+ mova [outq + mmsize], m%3
+
+ add in0q, mmsize
+ add outq, mmsize*2
+ sub lend, mmsize/4
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
+FLAC_DECORRELATE_32 rs, 2, 1, 0, add
+FLAC_DECORRELATE_32 ms, 2, 0, 1, add
+
+;-----------------------------------------------------------------------------------------
+;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
+; int len, int shift);
+;-----------------------------------------------------------------------------------------
+;%1 = bps
+;%2 = channels
+;%3 = last xmm reg used
+;%4 = word/dword (shift instruction)
+%macro FLAC_DECORRELATE_INDEP 4
+%define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
+cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
+%if ARCH_X86_32
+%if %2 == 6
+ DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
+ %define lend dword r3m
+%else
+ mov lend, lenm
+%endif
+%endif
+ movd m%3, r4m
+
+%assign %%i 1
+%rep %2-1
+ mov in %+ %%i %+ q, [in0q+%%i*gprsize]
+%assign %%i %%i+1
+%endrep
+
+ mov in0q, [in0q]
+ mov outq, [outq]
+
+%assign %%i 1
+%rep %2-1
+ sub in %+ %%i %+ q, in0q
+%assign %%i %%i+1
+%endrep
+
+align 16
+.loop:
+ mova m0, [in0q]
+
+%assign %%i 1
+%rep REPCOUNT-1
+ mova m %+ %%i, [in0q + in %+ %%i %+ q]
+%assign %%i %%i+1
+%endrep
+
+%if %1 == 32
+
+%if %2 == 8
+ TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
+%elif %2 == 6
+ SBUTTERFLY dq, 0, 1, 6
+ SBUTTERFLY dq, 2, 3, 6
+ SBUTTERFLY dq, 4, 5, 6
+
+ punpcklqdq m6, m0, m2
+ punpckhqdq m2, m4
+ shufps m4, m0, 0xe4
+ punpcklqdq m0, m1, m3
+ punpckhqdq m3, m5
+ shufps m5, m1, 0xe4
+ SWAP 0,6,1,4,5,3
+%elif %2 == 4
+ TRANSPOSE4x4D 0, 1, 2, 3, 4
+%else ; %2 == 2
+ SBUTTERFLY dq, 0, 1, 2
+%endif
+
+%else ; %1 == 16
+
+%if %2 == 8
+ packssdw m0, [in0q + in4q]
+ packssdw m1, [in0q + in5q]
+ packssdw m2, [in0q + in6q]
+ packssdw m3, [in0q + in7q]
+ TRANSPOSE2x4x4W 0, 1, 2, 3, 4
+%elif %2 == 6
+ packssdw m0, [in0q + in3q]
+ packssdw m1, [in0q + in4q]
+ packssdw m2, [in0q + in5q]
+ pshufd m3, m0, q1032
+ punpcklwd m0, m1
+ punpckhwd m1, m2
+ punpcklwd m2, m3
+
+ shufps m3, m0, m2, q2020
+ shufps m0, m1, q2031
+ shufps m2, m1, q3131
+ shufps m1, m2, m3, q3120
+ shufps m3, m0, q0220
+ shufps m0, m2, q3113
+ SWAP 2, 0, 3
+%else ; %2 == 4
+ packssdw m0, [in0q + in2q]
+ packssdw m1, [in0q + in3q]
+ SBUTTERFLY wd, 0, 1, 2
+ SBUTTERFLY dq, 0, 1, 2
+%endif
+
+%endif
+
+%assign %%i 0
+%rep REPCOUNT
+ psll%4 m %+ %%i, m%3
+%assign %%i %%i+1
+%endrep
+
+%assign %%i 0
+%rep REPCOUNT
+ mova [outq + %%i*mmsize], m %+ %%i
+%assign %%i %%i+1
+%endrep
+
+ add in0q, mmsize
+ add outq, mmsize*REPCOUNT
+ sub lend, mmsize/4
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
+FLAC_DECORRELATE_INDEP 32, 2, 3, d
+FLAC_DECORRELATE_INDEP 16, 4, 3, w
+FLAC_DECORRELATE_INDEP 32, 4, 5, d
+FLAC_DECORRELATE_INDEP 16, 6, 4, w
+FLAC_DECORRELATE_INDEP 32, 6, 7, d
+%if ARCH_X86_64
+FLAC_DECORRELATE_INDEP 16, 8, 5, w
+FLAC_DECORRELATE_INDEP 32, 8, 9, d
+%endif
+
+INIT_XMM avx
+FLAC_DECORRELATE_INDEP 32, 4, 5, d
+FLAC_DECORRELATE_INDEP 32, 6, 7, d
+%if ARCH_X86_64
+FLAC_DECORRELATE_INDEP 16, 8, 5, w
+FLAC_DECORRELATE_INDEP 32, 8, 9, d
+%endif
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
new file mode 100644
index 0000000000..e28c5c9322
--- /dev/null
+++ b/libavcodec/x86/flacdsp_init.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/flacdsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
+ int qlevel, int len);
+void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
+ int qlevel, int len);
+
+void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
+
+#define DECORRELATE_FUNCS(fmt, opt) \
+void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift); \
+void ff_flac_decorrelate_rs_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift); \
+void ff_flac_decorrelate_ms_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift); \
+void ff_flac_decorrelate_indep2_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift); \
+void ff_flac_decorrelate_indep4_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift); \
+void ff_flac_decorrelate_indep6_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift); \
+void ff_flac_decorrelate_indep8_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift)
+
+DECORRELATE_FUNCS(16, sse2);
+DECORRELATE_FUNCS(16, avx);
+DECORRELATE_FUNCS(32, sse2);
+DECORRELATE_FUNCS(32, avx);
+
+av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
+ int bps)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+#if CONFIG_FLAC_DECODER
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ if (fmt == AV_SAMPLE_FMT_S16) {
+ if (channels == 2)
+ c->decorrelate[0] = ff_flac_decorrelate_indep2_16_sse2;
+ else if (channels == 4)
+ c->decorrelate[0] = ff_flac_decorrelate_indep4_16_sse2;
+ else if (channels == 6)
+ c->decorrelate[0] = ff_flac_decorrelate_indep6_16_sse2;
+ else if (ARCH_X86_64 && channels == 8)
+ c->decorrelate[0] = ff_flac_decorrelate_indep8_16_sse2;
+ c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2;
+ c->decorrelate[2] = ff_flac_decorrelate_rs_16_sse2;
+ c->decorrelate[3] = ff_flac_decorrelate_ms_16_sse2;
+ } else if (fmt == AV_SAMPLE_FMT_S32) {
+ if (channels == 2)
+ c->decorrelate[0] = ff_flac_decorrelate_indep2_32_sse2;
+ else if (channels == 4)
+ c->decorrelate[0] = ff_flac_decorrelate_indep4_32_sse2;
+ else if (channels == 6)
+ c->decorrelate[0] = ff_flac_decorrelate_indep6_32_sse2;
+ else if (ARCH_X86_64 && channels == 8)
+ c->decorrelate[0] = ff_flac_decorrelate_indep8_32_sse2;
+ c->decorrelate[1] = ff_flac_decorrelate_ls_32_sse2;
+ c->decorrelate[2] = ff_flac_decorrelate_rs_32_sse2;
+ c->decorrelate[3] = ff_flac_decorrelate_ms_32_sse2;
+ }
+ }
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ c->lpc32 = ff_flac_lpc_32_sse4;
+ }
+ if (EXTERNAL_AVX(cpu_flags)) {
+ if (fmt == AV_SAMPLE_FMT_S16) {
+ if (ARCH_X86_64 && channels == 8)
+ c->decorrelate[0] = ff_flac_decorrelate_indep8_16_avx;
+ } else if (fmt == AV_SAMPLE_FMT_S32) {
+ if (channels == 4)
+ c->decorrelate[0] = ff_flac_decorrelate_indep4_32_avx;
+ else if (channels == 6)
+ c->decorrelate[0] = ff_flac_decorrelate_indep6_32_avx;
+ else if (ARCH_X86_64 && channels == 8)
+ c->decorrelate[0] = ff_flac_decorrelate_indep8_32_avx;
+ }
+ }
+ if (EXTERNAL_XOP(cpu_flags)) {
+ c->lpc32 = ff_flac_lpc_32_xop;
+ }
+#endif
+
+#if CONFIG_FLAC_ENCODER
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ if (CONFIG_GPL)
+ c->lpc16_encode = ff_flac_enc_lpc_16_sse4;
+ }
+#endif
+#endif /* HAVE_YASM */
+}
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 2a3e4a5f74..8f62a0a093 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -2,20 +2,20 @@
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -75,3 +75,50 @@ INIT_XMM sse
INT32_TO_FLOAT_FMUL_SCALAR 5
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_SCALAR 3
+
+;------------------------------------------------------------------------------
+; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src,
+; const float *mul, int len);
+;------------------------------------------------------------------------------
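+; Converts int32 samples to float and scales each block of 8 samples by mul[i].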
+%macro INT32_TO_FLOAT_FMUL_ARRAY8 0
+cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len
+ shl lend, 2
+ add srcq, lenq
+ add dstq, lenq
+ neg lenq
+.loop:
+ movss m0, [mulq]
+ SPLATD m0
+%if cpuflag(sse2)
+ cvtdq2ps m1, [srcq+lenq ]
+ cvtdq2ps m2, [srcq+lenq+16]
+%else
+ cvtpi2ps m1, [srcq+lenq ]
+ cvtpi2ps m3, [srcq+lenq+ 8]
+ cvtpi2ps m2, [srcq+lenq+16]
+ cvtpi2ps m4, [srcq+lenq+24]
+ movlhps m1, m3
+ movlhps m2, m4
+%endif
+ mulps m1, m0
+ mulps m2, m0
+ mova [dstq+lenq ], m1
+ mova [dstq+lenq+16], m2
+ add mulq, 4
+ add lenq, 32
+ jl .loop
+%if notcpuflag(sse2)
+ ;; cvtpi2ps supposedly switches to MMX even if the source is a memory
+ ;; location, but every tested CPU disagrees, so this is possibly an error
+ ;; in the documentation. Use emms anyway, since the vast majority of
+ ;; machines will use the SSE2 variant.
+ emms
+%endif
+ RET
+%endmacro
+
+INIT_XMM sse
+INT32_TO_FLOAT_FMUL_ARRAY8
+INIT_XMM sse2
+INT32_TO_FLOAT_FMUL_ARRAY8
+
diff --git a/libavcodec/x86/fmtconvert_init.c b/libavcodec/x86/fmtconvert_init.c
index 2f10db4fc1..6d35377a77 100644
--- a/libavcodec/x86/fmtconvert_init.c
+++ b/libavcodec/x86/fmtconvert_init.c
@@ -5,20 +5,20 @@
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -31,6 +31,10 @@
void ff_int32_to_float_fmul_scalar_sse (float *dst, const int32_t *src, float mul, int len);
void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int32_t *src, float mul, int len);
+void ff_int32_to_float_fmul_array8_sse (FmtConvertContext *c, float *dst, const int32_t *src,
+ const float *mul, int len);
+void ff_int32_to_float_fmul_array8_sse2(FmtConvertContext *c, float *dst, const int32_t *src,
+ const float *mul, int len);
#endif /* HAVE_YASM */
@@ -41,9 +45,11 @@ av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx
if (EXTERNAL_SSE(cpu_flags)) {
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
+ c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
+ c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse2;
}
#endif /* HAVE_YASM */
}
diff --git a/libavcodec/x86/fpel.asm b/libavcodec/x86/fpel.asm
index b1be289a92..961a1587a7 100644
--- a/libavcodec/x86/fpel.asm
+++ b/libavcodec/x86/fpel.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -25,84 +25,82 @@
SECTION .text
-INIT_MMX mmxext
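+; pavgb emulation for plain MMX: (a | b) - (((a ^ b) & 0xfe..fe) >> 1),
+; i.e. the rounded average (a + b + 1) >> 1.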
+%macro PAVGB_MMX 4
+ LOAD %3, %1
+ por %3, %2
+ pxor %2, %1
+ pand %2, %4
+ psrlq %2, 1
+ psubb %3, %2
+ SWAP %2, %3
+%endmacro
+
; void ff_put/avg_pixels(uint8_t *block, const uint8_t *pixels,
; ptrdiff_t line_size, int h)
-%macro PIXELS48 2
-%if %2 == 4
-%define OP movh
+%macro OP_PIXELS 2
+%if %2 == mmsize/2
+%define LOAD movh
+%define SAVE movh
+%define LEN mmsize
%else
-%define OP mova
+%define LOAD movu
+%define SAVE mova
+%define LEN %2
%endif
-cglobal %1_pixels%2, 4,5
+cglobal %1_pixels%2, 4,5,4
lea r4, [r2*3]
+%ifidn %1, avg
+%if notcpuflag(mmxext)
+ pcmpeqd m6, m6
+ paddb m6, m6
+%endif
+%endif
.loop:
- OP m0, [r1]
- OP m1, [r1+r2]
- OP m2, [r1+r2*2]
- OP m3, [r1+r4]
- lea r1, [r1+r2*4]
+%assign %%i 0
+%rep LEN/mmsize
+ LOAD m0, [r1 + %%i]
+ LOAD m1, [r1+r2 + %%i]
+ LOAD m2, [r1+r2*2 + %%i]
+ LOAD m3, [r1+r4 + %%i]
%ifidn %1, avg
- pavgb m0, [r0]
- pavgb m1, [r0+r2]
- pavgb m2, [r0+r2*2]
- pavgb m3, [r0+r4]
+%if notcpuflag(mmxext)
+ PAVGB_MMX [r0 + %%i], m0, m4, m6
+ PAVGB_MMX [r0+r2 + %%i], m1, m5, m6
+ PAVGB_MMX [r0+r2*2 + %%i], m2, m4, m6
+ PAVGB_MMX [r0+r4 + %%i], m3, m5, m6
+%else
+ pavgb m0, [r0 + %%i]
+ pavgb m1, [r0+r2 + %%i]
+ pavgb m2, [r0+r2*2 + %%i]
+ pavgb m3, [r0+r4 + %%i]
+%endif
%endif
- OP [r0], m0
- OP [r0+r2], m1
- OP [r0+r2*2], m2
- OP [r0+r4], m3
+ SAVE [r0 + %%i], m0
+ SAVE [r0+r2 + %%i], m1
+ SAVE [r0+r2*2 + %%i], m2
+ SAVE [r0+r4 + %%i], m3
+%assign %%i %%i+mmsize
+%endrep
sub r3d, 4
+ lea r1, [r1+r2*4]
lea r0, [r0+r2*4]
jne .loop
RET
%endmacro
-PIXELS48 put, 4
-PIXELS48 avg, 4
-PIXELS48 put, 8
-PIXELS48 avg, 8
+INIT_MMX mmx
+OP_PIXELS put, 4
+OP_PIXELS avg, 4
+OP_PIXELS put, 8
+OP_PIXELS avg, 8
+OP_PIXELS put, 16
+OP_PIXELS avg, 16
+INIT_MMX mmxext
+OP_PIXELS avg, 4
+OP_PIXELS avg, 8
+OP_PIXELS avg, 16
INIT_XMM sse2
-; void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
-; ptrdiff_t line_size, int h)
-cglobal put_pixels16, 4,5,4
- lea r4, [r2*3]
-.loop:
- movu m0, [r1]
- movu m1, [r1+r2]
- movu m2, [r1+r2*2]
- movu m3, [r1+r4]
- lea r1, [r1+r2*4]
- mova [r0], m0
- mova [r0+r2], m1
- mova [r0+r2*2], m2
- mova [r0+r4], m3
- sub r3d, 4
- lea r0, [r0+r2*4]
- jnz .loop
- REP_RET
-
-; void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
-; ptrdiff_t line_size, int h)
-cglobal avg_pixels16, 4,5,4
- lea r4, [r2*3]
-.loop:
- movu m0, [r1]
- movu m1, [r1+r2]
- movu m2, [r1+r2*2]
- movu m3, [r1+r4]
- lea r1, [r1+r2*4]
- pavgb m0, [r0]
- pavgb m1, [r0+r2]
- pavgb m2, [r0+r2*2]
- pavgb m3, [r0+r4]
- mova [r0], m0
- mova [r0+r2], m1
- mova [r0+r2*2], m2
- mova [r0+r4], m3
- sub r3d, 4
- lea r0, [r0+r2*4]
- jnz .loop
- REP_RET
+OP_PIXELS put, 16
+OP_PIXELS avg, 16
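
A note on the new PAVGB_MMX helper above: plain MMX has no pavgb, so the macro
builds the rounded byte average from bitwise operations. The 0xFE mask (set up
with pcmpeqd/paddb into m6) clears each byte's low bit so the 64-bit psrlq
cannot leak bits across byte lanes. A scalar sketch of the identity it relies on:

    #include <stdint.h>

    /* pavgb computes (a + b + 1) >> 1 per byte; PAVGB_MMX reaches the same
     * result as (a | b) - (((a ^ b) & 0xFE) >> 1) via por/pxor/pand/psrlq/psubb. */
    static inline uint8_t avg_round_up(uint8_t a, uint8_t b)
    {
        return (uint8_t)((a | b) - (((a ^ b) & 0xFE) >> 1));
    }
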
diff --git a/libavcodec/x86/fpel.h b/libavcodec/x86/fpel.h
index 88d1415ade..4e83cf71c3 100644
--- a/libavcodec/x86/fpel.h
+++ b/libavcodec/x86/fpel.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -22,18 +22,24 @@
#include <stddef.h>
#include <stdint.h>
+void ff_avg_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
-void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
diff --git a/libavcodec/x86/fpel_mmx.c b/libavcodec/x86/fpel_mmx.c
deleted file mode 100644
index 813bcc2b37..0000000000
--- a/libavcodec/x86/fpel_mmx.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * MMX-optimized avg/put pixel routines
- *
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "config.h"
-#include "fpel.h"
-#include "inline_asm.h"
-
-#if HAVE_MMX_INLINE
-
-// in case more speed is needed - unrolling would certainly help
-void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- JUMPALIGN();
- do {
- __asm__ volatile(
- "movq %0, %%mm0 \n\t"
- "movq %1, %%mm1 \n\t"
- PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
- "movq %%mm2, %0 \n\t"
- :"+m"(*block)
- :"m"(*pixels)
- :"memory");
- pixels += line_size;
- block += line_size;
- }
- while (--h);
-}
-
-void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- JUMPALIGN();
- do {
- __asm__ volatile(
- "movq %0, %%mm0 \n\t"
- "movq %1, %%mm1 \n\t"
- PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
- "movq %%mm2, %0 \n\t"
- "movq 8%0, %%mm0 \n\t"
- "movq 8%1, %%mm1 \n\t"
- PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
- "movq %%mm2, 8%0 \n\t"
- :"+m"(*block)
- :"m"(*pixels)
- :"memory");
- pixels += line_size;
- block += line_size;
- }
- while (--h);
-}
-
-void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h)
-{
- __asm__ volatile (
- "lea (%3, %3), %%"FF_REG_a" \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1 ), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
- "movq (%1 ), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- : "+g"(h), "+r"(pixels), "+r"(block)
- : "r"((x86_reg)line_size)
- : "%"FF_REG_a, "memory"
- );
-}
-
-void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h)
-{
- __asm__ volatile (
- "lea (%3, %3), %%"FF_REG_a" \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1 ), %%mm0 \n\t"
- "movq 8(%1 ), %%mm4 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq 8(%1, %3), %%mm5 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm4, 8(%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "movq %%mm5, 8(%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
- "movq (%1 ), %%mm0 \n\t"
- "movq 8(%1 ), %%mm4 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq 8(%1, %3), %%mm5 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm4, 8(%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "movq %%mm5, 8(%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- : "+g"(h), "+r"(pixels), "+r"(block)
- : "r"((x86_reg)line_size)
- : "%"FF_REG_a, "memory"
- );
-}
-
-#endif /* HAVE_MMX_INLINE */
diff --git a/libavcodec/x86/g722dsp.asm b/libavcodec/x86/g722dsp.asm
new file mode 100644
index 0000000000..a529422262
--- /dev/null
+++ b/libavcodec/x86/g722dsp.asm
@@ -0,0 +1,54 @@
+;******************************************************************************
+;* SIMD optimized DSP functions for G722 coding
+;*
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_qmf_coeffs: dw 3, -210, -11, -805, -11, 951, 53, 3876
+pw_qmf_coeffs2: dw 12, 3876, -156, 951, 32, -805, 362, -210
+pw_qmf_coeffs3: dw 362, 0 , 32, 0, -156, 0, 12, 0
+pw_qmf_coeffs4: dw 53, 0, -11, 0, -11, 0, 3, 0
+
+SECTION .text
+
+INIT_XMM sse2
+cglobal g722_apply_qmf, 2, 2, 5, prev, out
+ movu m0, [prevq+mmsize*0]
+ movu m1, [prevq+mmsize*1]
+ movu m2, [prevq+mmsize*2]
+ punpcklwd m3, m0, m1
+ punpckhwd m0, m1
+ punpcklwd m4, m2, m2
+ punpckhwd m2, m2
+ pmaddwd m3, [pw_qmf_coeffs ]
+ pmaddwd m0, [pw_qmf_coeffs2]
+ pmaddwd m4, [pw_qmf_coeffs3]
+ pmaddwd m2, [pw_qmf_coeffs4]
+ paddd m0, m3
+ paddd m2, m4
+ paddd m0, m2
+ pshufd m2, m0, q0032
+ paddd m0, m2
+ pshufd m0, m0, q0001
+ movq [outq], m0
+ RET
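
The SSE2 kernel above evaluates both 12-tap QMF dot products in a single pass
over the 24-sample history. A rough scalar model (the flat qmf_coeffs[12]
parameter is an assumption for illustration; the asm keeps the taps
pre-interleaved in pw_qmf_coeffs..pw_qmf_coeffs4 so they line up with pmaddwd):

    static void g722_apply_qmf_c(const int16_t *prev_samples, int xout[2],
                                 const int16_t qmf_coeffs[12])
    {
        int i;
        xout[0] = xout[1] = 0;
        for (i = 0; i < 12; i++) {
            /* even history samples feed one sub-band, odd samples the other */
            xout[1] += prev_samples[2 * i]     * qmf_coeffs[i];
            xout[0] += prev_samples[2 * i + 1] * qmf_coeffs[11 - i];
        }
    }
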
diff --git a/libavcodec/x86/g722dsp_init.c b/libavcodec/x86/g722dsp_init.c
new file mode 100644
index 0000000000..614695193b
--- /dev/null
+++ b/libavcodec/x86/g722dsp_init.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/g722dsp.h"
+
+void ff_g722_apply_qmf_sse2(const int16_t *prev_samples, int xout[2]);
+
+av_cold void ff_g722dsp_init_x86(G722DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSE2(cpu_flags))
+ dsp->apply_qmf = ff_g722_apply_qmf_sse2;
+}
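
Typical wiring on the generic side (a sketch only; g722dsp.c is not part of this
diff and the C fallback name is assumed):

    av_cold void ff_g722dsp_init(G722DSPContext *c)
    {
        c->apply_qmf = g722_apply_qmf;  /* C fallback, name assumed */

        if (ARCH_X86)
            ff_g722dsp_init_x86(c);     /* overrides with SSE2 when available */
    }
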
diff --git a/libavcodec/x86/h263_loopfilter.asm b/libavcodec/x86/h263_loopfilter.asm
index cd726ba86d..77c8cf154d 100644
--- a/libavcodec/x86/h263_loopfilter.asm
+++ b/libavcodec/x86/h263_loopfilter.asm
@@ -1,20 +1,22 @@
;******************************************************************************
;* MMX-optimized H.263 loop filter
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/h263dsp_init.c b/libavcodec/x86/h263dsp_init.c
index d4fab981bf..ab81063233 100644
--- a/libavcodec/x86/h263dsp_init.c
+++ b/libavcodec/x86/h263dsp_init.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2013 Diego Biurrun <diego@biurrun.de>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm
index a9cac59f0f..b5a78b537d 100644
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -3,20 +3,20 @@
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;* 2005-2008 Loren Merritt
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/h264_chromamc_10bit.asm b/libavcodec/x86/h264_chromamc_10bit.asm
index ff53b91c42..34bc41969b 100644
--- a/libavcodec/x86/h264_chromamc_10bit.asm
+++ b/libavcodec/x86/h264_chromamc_10bit.asm
@@ -5,20 +5,20 @@
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -249,8 +249,10 @@ cglobal %1_h264_chroma_mc2_10, 6,7
%define CHROMAMC_AVG NOTHING
INIT_XMM sse2
CHROMA_MC8 put
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 put
+%endif
INIT_MMX mmxext
CHROMA_MC4 put
CHROMA_MC2 put
@@ -258,8 +260,10 @@ CHROMA_MC2 put
%define CHROMAMC_AVG AVG
INIT_XMM sse2
CHROMA_MC8 avg
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 avg
+%endif
INIT_MMX mmxext
CHROMA_MC4 avg
CHROMA_MC2 avg
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 33fd5a9dd7..6702ae98d4 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -7,20 +7,20 @@
;* Fiona Glaser <fiona@x264.com>
;* Oskar Arvidsson <oskar@irock.se>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -37,11 +37,6 @@ cextern pb_0
cextern pb_1
cextern pb_3
-; expands to [base],...,[base+7*stride]
-%define PASS8ROWS(base, base3, stride, stride3) \
- [base], [base+stride], [base+stride*2], [base3], \
- [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
-
%define PASS8ROWS(base, base3, stride, stride3, offset) \
PASS8ROWS(base+offset, base3+offset, stride, stride3)
@@ -287,18 +282,18 @@ cextern pb_3
; int8_t *tc0)
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 0
-cglobal deblock_v_luma_8, 5,5,10
+cglobal deblock_v_luma_8, 5,5,10, pix_, stride_, alpha_, beta_, base3_
movd m8, [r4] ; tc0
- lea r4, [r1*3]
- dec r2d ; alpha-1
+ lea r4, [stride_q*3]
+ dec alpha_d ; alpha-1
neg r4
- dec r3d ; beta-1
- add r4, r0 ; pix-3*stride
+ dec beta_d ; beta-1
+ add base3_q, pix_q ; pix-3*stride
- mova m0, [r4+r1] ; p1
- mova m1, [r4+2*r1] ; p0
- mova m2, [r0] ; q0
- mova m3, [r0+r1] ; q1
+ mova m0, [base3_q + stride_q] ; p1
+ mova m1, [base3_q + 2*stride_q] ; p0
+ mova m2, [pix_q] ; q0
+ mova m3, [pix_q + stride_q] ; q1
LOAD_MASK r2d, r3d
punpcklbw m8, m8
@@ -308,24 +303,24 @@ cglobal deblock_v_luma_8, 5,5,10
pandn m9, m7
pand m8, m9
- movdqa m3, [r4] ; p2
+ movdqa m3, [base3_q] ; p2
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
pand m6, m9
psubb m7, m8, m6
pand m6, m8
- LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
+ LUMA_Q1 m0, m3, [base3_q], [base3_q + stride_q], m6, m4
- movdqa m4, [r0+2*r1] ; q2
+ movdqa m4, [pix_q + 2*stride_q] ; q2
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
pand m6, m9
pand m8, m6
psubb m7, m6
- mova m3, [r0+r1]
- LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
+ mova m3, [pix_q + stride_q]
+ LUMA_Q1 m3, m4, [pix_q + 2*stride_q], [pix_q + stride_q], m8, m6
DEBLOCK_P0_Q0
- mova [r4+2*r1], m1
- mova [r0], m2
+ mova [base3_q + 2*stride_q], m1
+ mova [pix_q], m2
RET
;-----------------------------------------------------------------------------
@@ -382,10 +377,101 @@ cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
RET
%endmacro
+%macro DEBLOCK_H_LUMA_MBAFF 0
+
+cglobal deblock_h_luma_mbaff_8, 5, 9, 10, 8*16, pix_, stride_, alpha_, beta_, tc0_, base3_, stride3_
+ movsxd stride_q, stride_d
+ dec alpha_d
+ dec beta_d
+ mov base3_q, pix_q
+ lea stride3_q, [3*stride_q]
+ add base3_q, stride3_q
+
+ movq m0, [pix_q - 4]
+ movq m1, [pix_q + stride_q - 4]
+ movq m2, [pix_q + 2*stride_q - 4]
+ movq m3, [base3_q - 4]
+ movq m4, [base3_q + stride_q - 4]
+ movq m5, [base3_q + 2*stride_q - 4]
+ movq m6, [base3_q + stride3_q - 4]
+ movq m7, [base3_q + 4*stride_q - 4]
+
+ TRANSPOSE_8X8B 0,1,2,3,4,5,6,7
+
+ %assign i 0
+ %rep 8
+ movq [rsp + 16*i], m %+ i
+ %assign i i+1
+ %endrep
+
+ ; p2 = m1 [rsp + 16]
+ ; p1 = m2 [rsp + 32]
+ ; p0 = m3 [rsp + 48]
+ ; q0 = m4 [rsp + 64]
+ ; q1 = m5 [rsp + 80]
+ ; q2 = m6 [rsp + 96]
+
+ SWAP 0, 2
+ SWAP 1, 3
+ SWAP 2, 4
+ SWAP 3, 5
+
+ LOAD_MASK alpha_d, beta_d
+ movd m8, [tc0_q]
+ punpcklbw m8, m8
+ pcmpeqb m9, m9
+ pcmpeqb m9, m8
+ pandn m9, m7
+ pand m8, m9
+
+ movdqa m3, [rsp + 16] ; p2
+ DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
+ pand m6, m9
+ psubb m7, m8, m6
+ pand m6, m8
+ LUMA_Q1 m0, m3, [rsp + 16], [rsp + 32], m6, m4
+
+ movdqa m4, [rsp + 96] ; q2
+ DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
+ pand m6, m9
+ pand m8, m6
+ psubb m7, m6
+ mova m3, [rsp + 80]
+ LUMA_Q1 m3, m4, [rsp + 96], [rsp + 80], m8, m6
+
+ DEBLOCK_P0_Q0
+ SWAP 1, 3
+ SWAP 2, 4
+ movq m0, [rsp]
+ movq m1, [rsp + 16]
+ movq m2, [rsp + 32]
+ movq m5, [rsp + 80]
+ movq m6, [rsp + 96]
+ movq m7, [rsp + 112]
+
+ TRANSPOSE_8X8B 0,1,2,3,4,5,6,7
+ movq [pix_q - 4], m0
+ movq [pix_q + stride_q - 4], m1
+ movq [pix_q + 2*stride_q - 4], m2
+ movq [base3_q - 4], m3
+ movq [base3_q + stride_q - 4], m4
+ movq [base3_q + 2*stride_q - 4], m5
+ movq [base3_q + stride3_q - 4], m6
+ movq [base3_q + 4*stride_q - 4], m7
+
+RET
+
+%endmacro
+
INIT_XMM sse2
+DEBLOCK_H_LUMA_MBAFF
DEBLOCK_LUMA
+
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
+DEBLOCK_H_LUMA_MBAFF
DEBLOCK_LUMA
+%endif
%else
@@ -499,8 +585,10 @@ INIT_MMX mmxext
DEBLOCK_LUMA v8, 8
INIT_XMM sse2
DEBLOCK_LUMA v, 16
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA v, 16
+%endif
%endif ; ARCH
@@ -772,8 +860,10 @@ cglobal deblock_h_luma_intra_8, 2,4,8,0x80
INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
+%endif
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA_INTRA v8
@@ -836,7 +926,11 @@ cglobal deblock_h_chroma_8, 5,7
TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
movq buf0, m0
movq buf1, m3
- call ff_chroma_inter_body_mmxext
+ LOAD_MASK r2d, r3d
+ movd m6, [r4] ; tc0
+ punpcklbw m6, m6
+ pand m7, m6
+ DEBLOCK_P0_Q0
movq m0, buf0
movq m3, buf1
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
@@ -854,7 +948,52 @@ ff_chroma_inter_body_mmxext:
DEBLOCK_P0_Q0
ret
+%define t5 r4
+%define t6 r5
+
+cglobal deblock_h_chroma422_8, 5, 6
+ SUB rsp, (1+ARCH_X86_64*2)*mmsize
+ %if ARCH_X86_64
+ %define buf0 [rsp+16]
+ %define buf1 [rsp+8]
+ %else
+ %define buf0 r0m
+ %define buf1 r2m
+ %endif
+
+ movd m6, [r4]
+ punpcklbw m6, m6
+ movq [rsp], m6
+ CHROMA_H_START
+ TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6)
+ movq buf0, m0
+ movq buf1, m3
+ LOAD_MASK r2d, r3d
+ movd m6, [rsp]
+ punpcklwd m6, m6
+ pand m7, m6
+ DEBLOCK_P0_Q0
+ movq m0, buf0
+ movq m3, buf1
+ TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+
+ lea r0, [r0+r1*8]
+ lea t5, [t5+r1*8]
+
+ TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6)
+ movq buf0, m0
+ movq buf1, m3
+ LOAD_MASK r2d, r3d
+ movd m6, [rsp+4]
+ punpcklwd m6, m6
+ pand m7, m6
+ DEBLOCK_P0_Q0
+ movq m0, buf0
+ movq m3, buf1
+ TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+ ADD rsp, (1+ARCH_X86_64*2)*mmsize
+RET
; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
@@ -867,9 +1006,6 @@ ff_chroma_inter_body_mmxext:
pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro
-%define t5 r4
-%define t6 r5
-
;------------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
@@ -894,6 +1030,20 @@ cglobal deblock_h_chroma_intra_8, 4,6
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
RET
+cglobal deblock_h_chroma422_intra_8, 4, 6
+ CHROMA_H_START
+ TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
+ call ff_chroma_intra_body_mmxext
+ TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+
+ lea r0, [r0+r1*8]
+ lea t5, [t5+r1*8]
+
+ TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
+ call ff_chroma_intra_body_mmxext
+ TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+RET
+
ALIGN 16
ff_chroma_intra_body_mmxext:
LOAD_MASK r2d, r3d
@@ -909,6 +1059,202 @@ ff_chroma_intra_body_mmxext:
paddb m2, m6
ret
+%macro LOAD_8_ROWS 8
+ movd m0, %1
+ movd m1, %2
+ movd m2, %3
+ movd m3, %4
+ movd m4, %5
+ movd m5, %6
+ movd m6, %7
+ movd m7, %8
+%endmacro
+
+%macro STORE_8_ROWS 8
+ movd %1, m0
+ movd %2, m1
+ movd %3, m2
+ movd %4, m3
+ movd %5, m4
+ movd %6, m5
+ movd %7, m6
+ movd %8, m7
+%endmacro
+
+%macro TRANSPOSE_8x4B_XMM 0
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ punpcklbw m4, m5
+ punpcklbw m6, m7
+ punpcklwd m0, m2
+ punpcklwd m4, m6
+ punpckhdq m2, m0, m4
+ punpckldq m0, m4
+ MOVHL m1, m0
+ MOVHL m3, m2
+%endmacro
+
+%macro TRANSPOSE_4x8B_XMM 0
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ punpckhwd m4, m0, m2
+ punpcklwd m0, m2
+ MOVHL m6, m4
+ MOVHL m2, m0
+ pshufd m1, m0, 1
+ pshufd m3, m2, 1
+ pshufd m5, m4, 1
+ pshufd m7, m6, 1
+%endmacro
+
+%macro CHROMA_INTER_BODY_XMM 1
+ LOAD_MASK alpha_d, beta_d
+ movd m6, [tc0_q]
+ %rep %1
+ punpcklbw m6, m6
+ %endrep
+ pand m7, m6
+ DEBLOCK_P0_Q0
+%endmacro
+
+%macro CHROMA_INTRA_BODY_XMM 0
+ LOAD_MASK alpha_d, beta_d
+ mova m5, m1
+ mova m6, m2
+ pxor m4, m1, m3
+ pand m4, [pb_1]
+ pavgb m1, m3
+ psubusb m1, m4
+ pavgb m1, m0
+ pxor m4, m2, m0
+ pand m4, [pb_1]
+ pavgb m2, m0
+ psubusb m2, m4
+ pavgb m2, m3
+ psubb m1, m5
+ psubb m2, m6
+ pand m1, m7
+ pand m2, m7
+ paddb m1, m5
+ paddb m2, m6
+%endmacro
+
+%macro CHROMA_V_START_XMM 1
+ movsxdifnidn stride_q, stride_d
+ dec alpha_d
+ dec beta_d
+ mov %1, pix_q
+ sub %1, stride_q
+ sub %1, stride_q
+%endmacro
+
+%macro CHROMA_H_START_XMM 2
+ movsxdifnidn stride_q, stride_d
+ dec alpha_d
+ dec beta_d
+ lea %2, [3*stride_q]
+ mov %1, pix_q
+ add %1, %2
+%endmacro
+
+%macro DEBLOCK_CHROMA_XMM 1
+
+INIT_XMM %1
+
+cglobal deblock_v_chroma_8, 5, 6, 8, pix_, stride_, alpha_, beta_, tc0_
+ CHROMA_V_START_XMM r5
+ movq m0, [r5]
+ movq m1, [r5 + stride_q]
+ movq m2, [pix_q]
+ movq m3, [pix_q + stride_q]
+ CHROMA_INTER_BODY_XMM 1
+ movq [r5 + stride_q], m1
+ movq [pix_q], m2
+RET
+
+cglobal deblock_h_chroma_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
+ CHROMA_H_START_XMM r5, r6
+ LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+ TRANSPOSE_8x4B_XMM
+ movq [rsp], m0
+ movq [rsp + 8], m3
+ CHROMA_INTER_BODY_XMM 1
+ movq m0, [rsp]
+ movq m3, [rsp + 8]
+ TRANSPOSE_4x8B_XMM
+ STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+RET
+
+cglobal deblock_h_chroma422_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_,
+ CHROMA_H_START_XMM r5, r6
+ LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+ TRANSPOSE_8x4B_XMM
+ movq [rsp], m0
+ movq [rsp + 8], m3
+ CHROMA_INTER_BODY_XMM 2
+ movq m0, [rsp]
+ movq m3, [rsp + 8]
+ TRANSPOSE_4x8B_XMM
+ STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+
+ lea pix_q, [pix_q + 8*stride_q]
+ lea r5, [r5 + 8*stride_q]
+ add tc0_q, 2
+
+ LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+ TRANSPOSE_8x4B_XMM
+ movq [rsp], m0
+ movq [rsp + 8], m3
+ CHROMA_INTER_BODY_XMM 2
+ movq m0, [rsp]
+ movq m3, [rsp + 8]
+ TRANSPOSE_4x8B_XMM
+ STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+RET
+
+cglobal deblock_v_chroma_intra_8, 4, 5, 8, pix_, stride_, alpha_, beta_
+ CHROMA_V_START_XMM r4
+ movq m0, [r4]
+ movq m1, [r4 + stride_q]
+ movq m2, [pix_q]
+ movq m3, [pix_q + stride_q]
+ CHROMA_INTRA_BODY_XMM
+ movq [r4 + stride_q], m1
+ movq [pix_q], m2
+RET
+
+cglobal deblock_h_chroma_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
+ CHROMA_H_START_XMM r4, r5
+ LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+ TRANSPOSE_8x4B_XMM
+ CHROMA_INTRA_BODY_XMM
+ TRANSPOSE_4x8B_XMM
+ STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+RET
+
+cglobal deblock_h_chroma422_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
+ CHROMA_H_START_XMM r4, r5
+ LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+ TRANSPOSE_8x4B_XMM
+ CHROMA_INTRA_BODY_XMM
+ TRANSPOSE_4x8B_XMM
+ STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+
+ lea pix_q, [pix_q + 8*stride_q]
+ lea r4, [r4 + 8*stride_q]
+
+ LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+ TRANSPOSE_8x4B_XMM
+ CHROMA_INTRA_BODY_XMM
+ TRANSPOSE_4x8B_XMM
+ STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+RET
+
+%endmacro ; DEBLOCK_CHROMA_XMM
+
+DEBLOCK_CHROMA_XMM sse2
+DEBLOCK_CHROMA_XMM avx
+
;-----------------------------------------------------------------------------
; void ff_h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
; int8_t ref[2][40], int16_t mv[2][40][2],
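
For orientation on the chroma paths added above: CHROMA_INTER_BODY_XMM and the
inlined mmxext body both end in DEBLOCK_P0_Q0, whose scalar equivalent is the
standard H.264 normal-filter edge update (a sketch; the SIMD code reaches the
same result with pavgb/pxor arithmetic rather than literal multiplies):

    #include <stdint.h>

    static inline int clip3(int x, int lo, int hi)
    {
        return x < lo ? lo : x > hi ? hi : x;
    }

    /* p1/q1 are the neighbouring pixels, tc the per-edge clipping threshold */
    static void filter_p0_q0(uint8_t *p0, uint8_t *q0, int p1, int q1, int tc)
    {
        int delta = clip3((((*q0 - *p0) * 4) + (p1 - q1) + 4) >> 3, -tc, tc);
        *p0 = (uint8_t)clip3(*p0 + delta, 0, 255);
        *q0 = (uint8_t)clip3(*q0 - delta, 0, 255);
    }
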
diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm
index d049c62bf2..1af3257a67 100644
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -7,34 +7,32 @@
;* Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
-
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-
SECTION .text
cextern pw_2
cextern pw_3
cextern pw_4
+cextern pw_1023
+%define pw_pixel_max pw_1023
; out: %4 = |%1-%2|-%3
; clobbers: %5
@@ -418,9 +416,11 @@ cglobal deblock_h_luma_10, 5,7,15
INIT_XMM sse2
DEBLOCK_LUMA_64
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_64
%endif
+%endif
%macro SWAPMOVA 2
%ifid %1
@@ -715,8 +715,10 @@ cglobal deblock_h_luma_intra_10, 4,7,16
INIT_XMM sse2
DEBLOCK_LUMA_INTRA_64
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA_64
+%endif
%endif
@@ -802,10 +804,12 @@ DEBLOCK_LUMA_INTRA
INIT_XMM sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif
+%endif
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
@@ -839,6 +843,83 @@ DEBLOCK_LUMA_INTRA
mova [r0+2*r1], m2
%endmacro
+; in: 8 rows of 4 words in %1..%8
+; out: 4 rows of 8 words in m0..m3
+%macro TRANSPOSE4x8W_LOAD 8
+ movq m0, %1
+ movq m2, %2
+ movq m1, %3
+ movq m3, %4
+
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ punpckhdq m2, m0, m1
+ punpckldq m0, m1
+
+ movq m4, %5
+ movq m6, %6
+ movq m5, %7
+ movq m3, %8
+
+ punpcklwd m4, m6
+ punpcklwd m5, m3
+ punpckhdq m6, m4, m5
+ punpckldq m4, m5
+
+ punpckhqdq m1, m0, m4
+ punpcklqdq m0, m4
+ punpckhqdq m3, m2, m6
+ punpcklqdq m2, m6
+%endmacro
+
+; in: 4 rows of 8 words in m0..m3
+; out: 8 rows of 4 words in %1..%8
+%macro TRANSPOSE8x4W_STORE 8
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ movq %1, m0
+ movhps %2, m0
+ movq %3, m1
+ movhps %4, m1
+ movq %5, m2
+ movhps %6, m2
+ movq %7, m3
+ movhps %8, m3
+%endmacro
+
+; %1 = base + 3*stride
+; %2 = 3*stride (unused on mmx)
+; %3, %4 = place to store p1 and q1 values
+%macro CHROMA_H_LOAD 4
+ %if mmsize == 8
+ movq m0, [pix_q - 4]
+ movq m1, [pix_q + stride_q - 4]
+ movq m2, [pix_q + 2*stride_q - 4]
+ movq m3, [%1 - 4]
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ %else
+ TRANSPOSE4x8W_LOAD PASS8ROWS(pix_q-4, %1-4, stride_q, %2)
+ %endif
+ mova %3, m0
+ mova %4, m3
+%endmacro
+
+; %1 = base + 3*stride
+; %2 = 3*stride (unused on mmx)
+; %3, %4 = place to load p1 and q1 values
+%macro CHROMA_H_STORE 4
+ mova m0, %3
+ mova m3, %4
+ %if mmsize == 8
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ movq [pix_q - 4], m0
+ movq [pix_q + stride_q - 4], m1
+ movq [pix_q + 2*stride_q - 4], m2
+ movq [%1 - 4], m3
+ %else
+ TRANSPOSE8x4W_STORE PASS8ROWS(pix_q-4, %1-4, stride_q, %2)
+ %endif
+%endmacro
+
%macro CHROMA_V_LOAD_TC 2
movd %1, [%2]
punpcklbw %1, %1
@@ -910,6 +991,81 @@ cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
%else
RET
%endif
+
+;-----------------------------------------------------------------------------
+; void ff_deblock_h_chroma_10(uint16_t *pix, int stride, int alpha, int beta,
+; int8_t *tc0)
+;-----------------------------------------------------------------------------
+cglobal deblock_h_chroma_10, 5, 7, 8, 0-2*mmsize, pix_, stride_, alpha_, beta_, tc0_
+ shl alpha_d, 2
+ shl beta_d, 2
+ mov r5, pix_q
+ lea r6, [3*stride_q]
+ add r5, r6
+%if mmsize == 8
+ mov r6d, 2
+ .loop:
+%endif
+
+ CHROMA_H_LOAD r5, r6, [rsp], [rsp + mmsize]
+ LOAD_AB m4, m5, alpha_d, beta_d
+ LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
+ pxor m4, m4
+ CHROMA_V_LOAD_TC m6, tc0_q
+ psubw m6, [pw_3]
+ pmaxsw m6, m4
+ pand m7, m6
+ DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
+ CHROMA_H_STORE r5, r6, [rsp], [rsp + mmsize]
+
+%if mmsize == 8
+ lea pix_q, [pix_q + 4*stride_q]
+ lea r5, [r5 + 4*stride_q]
+ add tc0_q, 2
+ dec r6d
+ jg .loop
+%endif
+RET
+
+;-----------------------------------------------------------------------------
+; void ff_deblock_h_chroma422_10(uint16_t *pix, int stride, int alpha, int beta,
+; int8_t *tc0)
+;-----------------------------------------------------------------------------
+cglobal deblock_h_chroma422_10, 5, 7, 8, 0-3*mmsize, pix_, stride_, alpha_, beta_, tc0_
+ shl alpha_d, 2
+ shl beta_d, 2
+
+ movd m0, [tc0_q]
+ punpcklbw m0, m0
+ psraw m0, 6
+ movq [rsp], m0
+
+ mov r5, pix_q
+ lea r6, [3*stride_q]
+ add r5, r6
+
+ mov r4, -8
+ .loop:
+
+ CHROMA_H_LOAD r5, r6, [rsp + 1*mmsize], [rsp + 2*mmsize]
+ LOAD_AB m4, m5, alpha_d, beta_d
+ LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
+ pxor m4, m4
+ movd m6, [rsp + r4 + 8]
+ punpcklwd m6, m6
+ punpcklwd m6, m6
+ psubw m6, [pw_3]
+ pmaxsw m6, m4
+ pand m7, m6
+ DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
+ CHROMA_H_STORE r5, r6, [rsp + 1*mmsize], [rsp + 2*mmsize]
+
+ lea pix_q, [pix_q + (mmsize/2)*stride_q]
+ lea r5, [r5 + (mmsize/2)*stride_q]
+ add r4, (mmsize/4)
+ jl .loop
+RET
+
%endmacro
%if ARCH_X86_64 == 0
@@ -918,5 +1074,7 @@ DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_CHROMA
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_CHROMA
+%endif
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index ad57aa91ab..19cd128381 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -2,20 +2,20 @@
* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -36,9 +36,15 @@
#if HAVE_INLINE_ASM
+#if ARCH_X86_64
+#define REG64 "r"
+#else
+#define REG64 "m"
+#endif
+
//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
//as that would make optimization work hard)
-#if HAVE_7REGS
+#if HAVE_7REGS && !BROKEN_COMPILER
#define decode_significance decode_significance_x86
static int decode_significance_x86(CABACContext *c, int max_coeff,
uint8_t *significant_coeff_ctx_base,
@@ -55,6 +61,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
+ : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
);
#endif
@@ -130,6 +137,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
+ : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
);
#endif
@@ -138,7 +146,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
"3: \n\t"
"mov %10, %0 \n\t"
- "movzbl (%0, %6), %k6 \n\t"
+ "movzb (%0, %6), %6 \n\t"
"add %9, %6 \n\t"
BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
@@ -149,14 +157,14 @@ static int decode_significance_8x8_x86(CABACContext *c,
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%15")
- "mov %1, %k6 \n\t"
+ "mov %1, %6 \n\t"
"test $1, %4 \n\t"
" jz 4f \n\t"
#ifdef BROKEN_RELOCATIONS
- "movzbl %c14(%15, %q6), %k6\n\t"
+ "movzb %c14(%15, %q6), %6\n\t"
#else
- "movzbl "MANGLE(ff_h264_cabac_tables)"+%c14(%k6), %k6\n\t"
+ "movzb "MANGLE(ff_h264_cabac_tables)"+%c14(%6), %6\n\t"
#endif
"add %11, %6 \n\t"
@@ -169,8 +177,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
"%15")
"mov %2, %0 \n\t"
- "mov %1, %k6 \n\t"
- "movl %k6, (%0) \n\t"
+ "mov %1, %6 \n\t"
+ "mov %k6, (%0) \n\t"
"test $1, %4 \n\t"
" jnz 5f \n\t"
@@ -178,19 +186,19 @@ static int decode_significance_8x8_x86(CABACContext *c,
"add"FF_OPSIZE" $4, %2 \n\t"
"4: \n\t"
- "addl $1, %k6 \n\t"
- "mov %k6, %1 \n\t"
- "cmpl $63, %k6 \n\t"
+ "add $1, %6 \n\t"
+ "mov %6, %1 \n\t"
+ "cmp $63, %6 \n\t"
" jb 3b \n\t"
"mov %2, %0 \n\t"
- "movl %k6, (%0) \n\t"
+ "mov %k6, (%0) \n\t"
"5: \n\t"
"addl %8, %k0 \n\t"
"shr $2, %k0 \n\t"
- : "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low),
+ : "=&q"(coeff_count), "+"REG64(last), "+"REG64(index), "+&r"(c->low),
"=&r"(bit), "+&r"(c->range), "=&r"(state)
: "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
- "m"(sig_off), "m"(last_coeff_ctx_base),
+ REG64(sig_off), REG64(last_coeff_ctx_base),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end)),
"i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG
@@ -198,7 +206,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
);
return coeff_count;
}
-#endif /* HAVE_7REGS && !defined(BROKEN_RELOCATIONS) */
+#endif /* HAVE_7REGS && BROKEN_COMPILER */
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_H264_I386_H */
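
The REG64 change above selects the asm constraint by word size: on x86-64 the
operands may live in registers, while 32-bit builds keep them in memory since
the block already uses up the free GPRs. A tiny standalone illustration of the
string pasting this relies on (using the compiler's __x86_64__ macro here,
rather than the config.h ARCH_X86_64 the real header uses):

    #include <stdio.h>

    #if defined(__x86_64__)
    #define REG64 "r"
    #else
    #define REG64 "m"
    #endif

    int main(void)
    {
        /* adjacent string literals concatenate, so "+"REG64(last) becomes
         * "+r"(last) on x86-64 and "+m"(last) on 32-bit x86 */
        puts("+" REG64);
        return 0;
    }
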
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index eb99476a85..c36fea59b0 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -9,20 +9,20 @@
;* Holger Lubitz <hal@duncan.ol.sub.de>
;* Min Chen <chenm001.163.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
@@ -697,6 +697,38 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride,
call h264_idct_add8_mmx_plane
RET
+cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+ movsxdifnidn r3, r3d
+%ifdef PIC
+ lea picregq, [scan8_mem]
+%endif
+%if ARCH_X86_64
+ mov dst2q, r0
+%endif
+
+ mov r5, 16 ; i
+ add r2, 512 ; i * 16 * sizeof(dctcoef) ; #define dctcoef int16_t
+
+ call h264_idct_add8_mmx_plane
+ add r5, 4
+ call h264_idct_add8_mmx_plane
+
+%if ARCH_X86_64
+ add dst2q, gprsize ; dest[1]
+%else
+ add r0mp, gprsize
+%endif
+
+ add r5, 4 ; set to 32
+ add r2, 256 ; set to i * 16 * sizeof(dctcoef)
+
+ call h264_idct_add8_mmx_plane
+ add r5, 4
+ call h264_idct_add8_mmx_plane
+
+ RET
+
h264_idct_add8_mmxext_plane:
movsxdifnidn r3, r3d
.nextblock:
diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
index 432d74b1f4..9fd05abb2b 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -5,32 +5,31 @@
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
-
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-pd_32: times 4 dd 32
-
SECTION .text
+cextern pw_1023
+%define pw_pixel_max pw_1023
+cextern pd_32
+
;-----------------------------------------------------------------------------
; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
@@ -84,8 +83,10 @@ cglobal h264_idct_add_10, 3,3
INIT_XMM sse2
IDCT_ADD_10
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD_10
+%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct_add16_10(pixel *dst, const int *block_offset,
@@ -118,9 +119,11 @@ add4x4_idct %+ SUFFIX:
INIT_XMM sse2
ALIGN 16
ADD4x4IDCT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
ALIGN 16
ADD4x4IDCT
+%endif
%macro ADD16_OP 2
cmp byte [r4+%2], 0
@@ -157,8 +160,10 @@ cglobal h264_idct_add16_10, 5,6
INIT_XMM sse2
IDCT_ADD16_10
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16_10
+%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
@@ -224,8 +229,10 @@ cglobal h264_idct8_dc_add_10,3,4,7
INIT_XMM sse2
IDCT8_DC_ADD
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_DC_ADD
+%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct_add16intra_10(pixel *dst, const int *block_offset,
@@ -298,8 +305,10 @@ cglobal h264_idct_add16intra_10,5,7,8
INIT_XMM sse2
IDCT_ADD16INTRA_10
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16INTRA_10
+%endif
%assign last_block 36
;-----------------------------------------------------------------------------
@@ -336,8 +345,63 @@ cglobal h264_idct_add8_10,5,8,7
INIT_XMM sse2
IDCT_ADD8
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_h264_idct_add8_422_10(pixel **dst, const int *block_offset,
+; int16_t *block, int stride,
+; const uint8_t nnzc[6*8])
+;-----------------------------------------------------------------------------
+%assign last_block 44
+
+%macro IDCT_ADD8_422 0
+
+cglobal h264_idct_add8_422_10, 5, 8, 7
+ movsxdifnidn r3, r3d
+%if ARCH_X86_64
+ mov r7, r0
+%endif
+
+ add r2, 1024
+ mov r0, [r0]
+ ADD16_OP_INTRA 16, 4+ 6*8
+ ADD16_OP_INTRA 18, 4+ 7*8
+ ADD16_OP_INTRA 24, 4+ 8*8 ; i+4
+ ADD16_OP_INTRA 26, 4+ 9*8 ; i+4
+ add r2, 1024-128*4
+
+%if ARCH_X86_64
+ mov r0, [r7+gprsize]
+%else
+ mov r0, r0m
+ mov r0, [r0+gprsize]
+%endif
+
+ ADD16_OP_INTRA 32, 4+11*8
+ ADD16_OP_INTRA 34, 4+12*8
+ ADD16_OP_INTRA 40, 4+13*8 ; i+4
+ ADD16_OP_INTRA 42, 4+14*8 ; i+4
+REP_RET
+ AC 16
+ AC 18
+ AC 24 ; i+4
+ AC 26 ; i+4
+ AC 32
+ AC 34
+ AC 40 ; i+4
+ AC 42 ; i+4
+
+%endmacro
+
+INIT_XMM sse2
+IDCT_ADD8_422
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT_ADD8_422
+%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct8_add_10(pixel *dst, int16_t *block, int stride)
@@ -544,8 +608,10 @@ h264_idct8_add1_10 %+ SUFFIX:
INIT_XMM sse2
IDCT8_ADD
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD
+%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct8_add4_10(pixel **dst, const int *block_offset,
@@ -585,5 +651,7 @@ cglobal h264_idct8_add4_10, 0,7,16
INIT_XMM sse2
IDCT8_ADD4
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD4
+%endif
diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm
index 1ea97fa1ca..f3aa3172f0 100644
--- a/libavcodec/x86/h264_intrapred.asm
+++ b/libavcodec/x86/h264_intrapred.asm
@@ -5,20 +5,20 @@
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -268,6 +268,43 @@ cglobal pred16x16_tm_vp8_8, 2,6,6
jg .loop
REP_RET
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration
+ sub dstq, strideq
+ pmovzxbw m0, [dstq]
+ vpbroadcastb xm1, [r0-1]
+ pmovzxbw m1, xm1
+ psubw m0, m1
+ mov iterationd, 4
+ lea stride3q, [strideq*3]
+.loop:
+ vpbroadcastb xm1, [dstq+strideq*1-1]
+ vpbroadcastb xm2, [dstq+strideq*2-1]
+ vpbroadcastb xm3, [dstq+stride3q-1]
+ vpbroadcastb xm4, [dstq+strideq*4-1]
+ pmovzxbw m1, xm1
+ pmovzxbw m2, xm2
+ pmovzxbw m3, xm3
+ pmovzxbw m4, xm4
+ paddw m1, m0
+ paddw m2, m0
+ paddw m3, m0
+ paddw m4, m0
+ vpackuswb m1, m1, m2
+ vpackuswb m3, m3, m4
+ vpermq m1, m1, q3120
+ vpermq m3, m3, q3120
+ movdqa [dstq+strideq*1], xm1
+ vextracti128 [dstq+strideq*2], m1, 1
+ movdqa [dstq+stride3q*1], xm3
+ vextracti128 [dstq+strideq*4], m3, 1
+ lea dstq, [dstq+strideq*4]
+ dec iterationd
+ jg .loop
+ REP_RET
+%endif
+
;-----------------------------------------------------------------------------
; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
@@ -2498,10 +2535,7 @@ cglobal pred4x4_tm_vp8_8, 3,3
pshufb mm3, mm6
pshufb mm4, mm6
pshufb mm5, mm6
- psubw mm2, mm7
- psubw mm3, mm7
- psubw mm4, mm7
- psubw mm5, mm7
+ psubw mm0, mm7
paddw mm2, mm0
paddw mm3, mm0
paddw mm4, mm0
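
Both predictors touched here are VP8 "TrueMotion" (TM) modes: the new AVX2
pred16x16 path and the pred4x4_tm_vp8 hunk just above (which now performs the
psubw against mm7 once on mm0 instead of on each of mm2..mm5). A scalar model
of the 16x16 case for reference (a sketch, with per-pixel clipping):

    #include <stddef.h>
    #include <stdint.h>

    /* Every pixel is top[x] + left[y] - topleft, saturated to [0,255].
     * The AVX2 version keeps (top[x] - topleft) as a row of words in m0 and
     * broadcasts left[y] per row before packing with unsigned saturation. */
    static void pred16x16_tm_c(uint8_t *dst, ptrdiff_t stride)
    {
        const uint8_t *top = dst - stride;
        int topleft = top[-1];
        int x, y;

        for (y = 0; y < 16; y++) {
            int left = dst[y * stride - 1];
            for (x = 0; x < 16; x++) {
                int v = top[x] + left - topleft;
                dst[y * stride + x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
            }
        }
    }
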
diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm
index 7ba9828e17..629e0a72e3 100644
--- a/libavcodec/x86/h264_intrapred_10bit.asm
+++ b/libavcodec/x86/h264_intrapred_10bit.asm
@@ -5,20 +5,20 @@
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -26,18 +26,19 @@
SECTION_RODATA
+cextern pw_1023
+%define pw_pixel_max pw_1023
cextern pw_512
cextern pw_16
cextern pw_8
cextern pw_4
cextern pw_2
cextern pw_1
+cextern pd_16
pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3: times 8 dw -3
-pw_pixel_max: times 8 dw ((1 << 10)-1)
pd_17: times 4 dd 17
-pd_16: times 4 dd 16
SECTION .text
@@ -83,8 +84,10 @@ INIT_XMM sse2
PRED4x4_DR
INIT_XMM ssse3
PRED4x4_DR
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DR
+%endif
;------------------------------------------------------------------------------
; void ff_pred4x4_vertical_right_10(pixel *src, const pixel *topright,
@@ -121,8 +124,10 @@ INIT_XMM sse2
PRED4x4_VR
INIT_XMM ssse3
PRED4x4_VR
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VR
+%endif
;-------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_down_10(pixel *src, const pixel *topright,
@@ -162,28 +167,14 @@ INIT_XMM sse2
PRED4x4_HD
INIT_XMM ssse3
PRED4x4_HD
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_HD
+%endif
;-----------------------------------------------------------------------------
; void ff_pred4x4_dc_10(pixel *src, const pixel *topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
-%macro HADDD 2 ; sum junk
-%if mmsize == 16
- movhlps %2, %1
- paddd %1, %2
- pshuflw %2, %1, 0xE
- paddd %1, %2
-%else
- pshufw %2, %1, 0xE
- paddd %1, %2
-%endif
-%endmacro
-
-%macro HADDW 2
- pmaddwd %1, [pw_1]
- HADDD %1, %2
-%endmacro
INIT_MMX mmxext
cglobal pred4x4_dc_10, 3, 3
@@ -232,8 +223,10 @@ cglobal pred4x4_down_left_10, 3, 3
INIT_XMM sse2
PRED4x4_DL
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DL
+%endif
;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_left_10(pixel *src, const pixel *topright,
@@ -260,8 +253,10 @@ cglobal pred4x4_vertical_left_10, 3, 3
INIT_XMM sse2
PRED4x4_VL
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VL
+%endif
;-----------------------------------------------------------------------------
; void ff_pred4x4_horizontal_up_10(pixel *src, const pixel *topright,
@@ -571,8 +566,10 @@ cglobal pred8x8l_top_dc_10, 4, 4, 6
INIT_XMM sse2
PRED8x8L_TOP_DC
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_TOP_DC
+%endif
;-------------------------------------------------------------------------------
; void ff_pred8x8l_dc_10(pixel *src, int has_topleft, int has_topright,
@@ -629,8 +626,10 @@ cglobal pred8x8l_dc_10, 4, 6, 6
INIT_XMM sse2
PRED8x8L_DC
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DC
+%endif
;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_10(pixel *src, int has_topleft, int has_topright,
@@ -663,8 +662,10 @@ cglobal pred8x8l_vertical_10, 4, 4, 6
INIT_XMM sse2
PRED8x8L_VERTICAL
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL
+%endif
;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_10(uint8_t *src, int has_topleft,
@@ -718,8 +719,10 @@ INIT_XMM sse2
PRED8x8L_HORIZONTAL
INIT_XMM ssse3
PRED8x8L_HORIZONTAL
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL
+%endif
;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_left_10(pixel *src, int has_topleft, int has_topright,
@@ -785,8 +788,10 @@ INIT_XMM sse2
PRED8x8L_DOWN_LEFT
INIT_XMM ssse3
PRED8x8L_DOWN_LEFT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_LEFT
+%endif
;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_right_10(pixel *src, int has_topleft,
@@ -858,8 +863,10 @@ INIT_XMM sse2
PRED8x8L_DOWN_RIGHT
INIT_XMM ssse3
PRED8x8L_DOWN_RIGHT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_RIGHT
+%endif
;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_right_10(pixel *src, int has_topleft,
@@ -927,8 +934,10 @@ INIT_XMM sse2
PRED8x8L_VERTICAL_RIGHT
INIT_XMM ssse3
PRED8x8L_VERTICAL_RIGHT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL_RIGHT
+%endif
;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_up_10(pixel *src, int has_topleft,
@@ -987,8 +996,10 @@ INIT_XMM sse2
PRED8x8L_HORIZONTAL_UP
INIT_XMM ssse3
PRED8x8L_HORIZONTAL_UP
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL_UP
+%endif
;-----------------------------------------------------------------------------
diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c
index 0e572b1226..bdd5125d68 100644
--- a/libavcodec/x86/h264_intrapred_init.c
+++ b/libavcodec/x86/h264_intrapred_init.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -127,6 +127,7 @@ PRED16x16(plane_svq3, 8, ssse3)
PRED16x16(tm_vp8, 8, mmx)
PRED16x16(tm_vp8, 8, mmxext)
PRED16x16(tm_vp8, 8, sse2)
+PRED16x16(tm_vp8, 8, avx2)
PRED8x8(top_dc, 8, mmxext)
PRED8x8(dc_rv40, 8, mmxext)
@@ -323,6 +324,12 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
}
}
}
+
+ if(EXTERNAL_AVX2(cpu_flags)){
+ if (codec_id == AV_CODEC_ID_VP8) {
+ h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_avx2;
+ }
+ }
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext;
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 74458259a6..6b7ecf52b3 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -2,20 +2,20 @@
* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
* Copyright (c) 2011 Daniel Kang
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -29,10 +29,6 @@
#include "fpel.h"
#if HAVE_YASM
-void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
@@ -49,9 +45,9 @@ void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t
#define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext
#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
-
-CALL_2X_PIXELS(ff_avg_pixels16_mmxext, ff_avg_pixels8_mmxext, 8)
-CALL_2X_PIXELS(ff_put_pixels16_mmxext, ff_put_pixels8_mmxext, 8)
+#define ff_put_pixels16_mmxext ff_put_pixels16_mmx
+#define ff_put_pixels8_mmxext ff_put_pixels8_mmx
+#define ff_put_pixels4_mmxext ff_put_pixels4_mmx
#define DEF_QPEL(OPNAME)\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
@@ -284,7 +280,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, const uin
#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
@@ -296,7 +292,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uin
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\
@@ -304,74 +300,74 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uin
#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
+ LOCAL_ALIGNED(ALIGN, uint16_t, temp, [SIZE*(SIZE<8?12:24)]);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+ LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- assert(((int)temp & 7) == 0);\
+ av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+ LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- assert(((int)temp & 7) == 0);\
+ av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+ LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- assert(((int)temp & 7) == 0);\
+ av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+ LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- assert(((int)temp & 7) == 0);\
+ av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\
diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
index 4557e5e209..872268300a 100644
--- a/libavcodec/x86/h264_qpel_10bit.asm
+++ b/libavcodec/x86/h264_qpel_10bit.asm
@@ -5,20 +5,20 @@
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -26,12 +26,13 @@
SECTION_RODATA 32
+cextern pd_65535
+cextern pw_1023
+%define pw_pixel_max pw_1023
cextern pw_16
cextern pw_1
cextern pb_0
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-
pad10: times 8 dw 10*1023
pad20: times 8 dw 20*1023
pad30: times 8 dw 30*1023
@@ -42,7 +43,6 @@ unpad: times 8 dw 16*1022/32 ; needs to be mod 16
tap1: times 4 dw 1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5, 1
-pd_0f: times 4 dd 0xffff
SECTION .text
@@ -708,7 +708,7 @@ h%1_loop_op:
psrad m1, 10
psrad m2, 10
pslld m2, 16
- pand m1, [pd_0f]
+ pand m1, [pd_65535]
por m1, m2
%if num_mmregs <= 8
pxor m0, m0
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index bc6c72541b..2d287ba443 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -6,20 +6,20 @@
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm
index f9da05b215..0975d74fcf 100644
--- a/libavcodec/x86/h264_weight.asm
+++ b/libavcodec/x86/h264_weight.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm
index 961ec8ca45..f924e55854 100644
--- a/libavcodec/x86/h264_weight_10bit.asm
+++ b/libavcodec/x86/h264_weight_10bit.asm
@@ -5,20 +5,20 @@
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -26,11 +26,12 @@
SECTION_RODATA 32
-pw_pixel_max: times 8 dw ((1 << 10)-1)
sq_1: dq 1
dq 0
cextern pw_1
+cextern pw_1023
+%define pw_pixel_max pw_1023
SECTION .text
diff --git a/libavcodec/x86/h264chroma_init.c b/libavcodec/x86/h264chroma_init.c
index 0d5ff3d159..36bf29df02 100644
--- a/libavcodec/x86/h264chroma_init.c
+++ b/libavcodec/x86/h264chroma_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 134d594ca9..0643b37362 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -78,6 +78,11 @@ IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, avx)
+IDCT_ADD_REP_FUNC2(, 8_422, 8, mmx)
+
+IDCT_ADD_REP_FUNC2(, 8_422, 10, sse2)
+IDCT_ADD_REP_FUNC2(, 8_422, 10, avx)
+
void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul);
void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul);
@@ -103,28 +108,37 @@ void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
int beta);
#define LF_FUNCS(type, depth) \
-LF_FUNC(h, chroma, depth, mmxext) \
-LF_IFUNC(h, chroma_intra, depth, mmxext) \
-LF_FUNC(v, chroma, depth, mmxext) \
-LF_IFUNC(v, chroma_intra, depth, mmxext) \
-LF_FUNC(h, luma, depth, mmxext) \
-LF_IFUNC(h, luma_intra, depth, mmxext) \
-LF_FUNC(h, luma, depth, sse2) \
-LF_IFUNC(h, luma_intra, depth, sse2) \
-LF_FUNC(v, luma, depth, sse2) \
-LF_IFUNC(v, luma_intra, depth, sse2) \
-LF_FUNC(h, chroma, depth, sse2) \
-LF_IFUNC(h, chroma_intra, depth, sse2) \
-LF_FUNC(v, chroma, depth, sse2) \
-LF_IFUNC(v, chroma_intra, depth, sse2) \
-LF_FUNC(h, luma, depth, avx) \
-LF_IFUNC(h, luma_intra, depth, avx) \
-LF_FUNC(v, luma, depth, avx) \
-LF_IFUNC(v, luma_intra, depth, avx) \
-LF_FUNC(h, chroma, depth, avx) \
-LF_IFUNC(h, chroma_intra, depth, avx) \
-LF_FUNC(v, chroma, depth, avx) \
-LF_IFUNC(v, chroma_intra, depth, avx)
+LF_FUNC(h, chroma, depth, mmxext) \
+LF_IFUNC(h, chroma_intra, depth, mmxext) \
+LF_FUNC(h, chroma422, depth, mmxext) \
+LF_IFUNC(h, chroma422_intra, depth, mmxext) \
+LF_FUNC(v, chroma, depth, mmxext) \
+LF_IFUNC(v, chroma_intra, depth, mmxext) \
+LF_FUNC(h, luma, depth, mmxext) \
+LF_IFUNC(h, luma_intra, depth, mmxext) \
+LF_FUNC(h, luma, depth, sse2) \
+LF_IFUNC(h, luma_intra, depth, sse2) \
+LF_FUNC(v, luma, depth, sse2) \
+LF_IFUNC(v, luma_intra, depth, sse2) \
+LF_FUNC(h, chroma, depth, sse2) \
+LF_IFUNC(h, chroma_intra, depth, sse2) \
+LF_FUNC(h, chroma422, depth, sse2) \
+LF_IFUNC(h, chroma422_intra, depth, sse2) \
+LF_FUNC(v, chroma, depth, sse2) \
+LF_IFUNC(v, chroma_intra, depth, sse2) \
+LF_FUNC(h, luma, depth, avx) \
+LF_IFUNC(h, luma_intra, depth, avx) \
+LF_FUNC(v, luma, depth, avx) \
+LF_IFUNC(v, luma_intra, depth, avx) \
+LF_FUNC(h, chroma, depth, avx) \
+LF_IFUNC(h, chroma_intra, depth, avx) \
+LF_FUNC(h, chroma422, depth, avx) \
+LF_IFUNC(h, chroma422_intra, depth, avx) \
+LF_FUNC(v, chroma, depth, avx) \
+LF_IFUNC(v, chroma_intra, depth, avx)
+
+LF_FUNC(h, luma_mbaff, 8, sse2)
+LF_FUNC(h, luma_mbaff, 8, avx)
LF_FUNCS(uint8_t, 8)
LF_FUNCS(uint16_t, 10)
@@ -155,13 +169,13 @@ LF_IFUNC(v, luma_intra, 10, mmxext)
/* weighted prediction */
#define H264_WEIGHT(W, OPT) \
-void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, int stride, \
+void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, ptrdiff_t stride, \
int height, int log2_denom, \
int weight, int offset);
#define H264_BIWEIGHT(W, OPT) \
void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \
- int stride, int height, \
+ ptrdiff_t stride, int height, \
int log2_denom, int weightd, \
int weights, int offset);
@@ -181,7 +195,7 @@ H264_BIWEIGHT_MMX(4)
#define H264_WEIGHT_10(W, DEPTH, OPT) \
void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
- int stride, \
+ ptrdiff_t stride, \
int height, \
int log2_denom, \
int weight, \
@@ -190,7 +204,7 @@ void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
uint8_t *src, \
- int stride, \
+ ptrdiff_t stride, \
int height, \
int log2_denom, \
int weightd, \
@@ -210,6 +224,7 @@ H264_BIWEIGHT_10_SSE(4, 10)
av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
+#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMXEXT(cpu_flags) && chroma_format_idc <= 1)
@@ -224,8 +239,11 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
- if (chroma_format_idc <= 1)
+ if (chroma_format_idc <= 1) {
c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
+ } else {
+ c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmx;
+ }
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
if (cpu_flags & AV_CPU_FLAG_CMOV)
c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
@@ -244,6 +262,9 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmxext;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
+ } else {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_mmxext;
+ c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_mmxext;
}
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
c->h264_v_loop_filter_luma = deblock_v_luma_8_mmxext;
@@ -279,6 +300,20 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
+
+#if ARCH_X86_64
+ c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_sse2;
+#endif
+
+ c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_sse2;
+ c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_sse2;
+ if (chroma_format_idc <= 1) {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_sse2;
+ c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_sse2;
+ } else {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_sse2;
+ c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_sse2;
+ }
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
@@ -289,12 +324,30 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
+#if ARCH_X86_64
+ c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_avx;
+#endif
+
+ c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_avx;
+ c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_avx;
+ if (chroma_format_idc <= 1) {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_avx;
+ c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_avx;
+ } else {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_avx;
+ c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx;
+ }
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
#if ARCH_X86_32
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmxext;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
+ if (chroma_format_idc <= 1) {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_mmxext;
+ } else {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_mmxext;
+ }
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
@@ -307,8 +360,11 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;
c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
- if (chroma_format_idc <= 1)
+ if (chroma_format_idc <= 1) {
c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
+ } else {
+ c->h264_idct_add8 = ff_h264_idct_add8_422_10_sse2;
+ }
c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2;
#if HAVE_ALIGNED_STACK
c->h264_idct8_add = ff_h264_idct8_add_10_sse2;
@@ -325,6 +381,11 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_sse2;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
+ if (chroma_format_idc <= 1) {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_sse2;
+ } else {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_sse2;
+ }
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
@@ -347,8 +408,11 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;
c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
- if (chroma_format_idc <= 1)
+ if (chroma_format_idc <= 1) {
c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
+ } else {
+ c->h264_idct_add8 = ff_h264_idct_add8_422_10_avx;
+ }
c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx;
#if HAVE_ALIGNED_STACK
c->h264_idct8_add = ff_h264_idct8_add_10_avx;
@@ -357,6 +421,11 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_avx;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
+ if (chroma_format_idc <= 1) {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_avx;
+ } else {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_avx;
+ }
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
@@ -365,4 +434,5 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
#endif /* HAVE_ALIGNED_STACK */
}
}
+#endif
}
diff --git a/libavcodec/x86/hevc_add_res.asm b/libavcodec/x86/hevc_add_res.asm
index 66b929c594..d97e4abddb 100644
--- a/libavcodec/x86/hevc_add_res.asm
+++ b/libavcodec/x86/hevc_add_res.asm
@@ -2,49 +2,51 @@
; * Provide SIMD optimizations for add_residual functions for HEVC decoding
; * Copyright (c) 2014 Pierre-Edouard LEPERE
; *
-; * This file is part of Libav.
+; * This file is part of FFmpeg.
; *
-; * Libav is free software; you can redistribute it and/or
+; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
-; * Libav is distributed in the hope that it will be useful,
+; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
-; * License along with Libav; if not, write to the Free Software
+; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; ******************************************************************************
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA 32
-max_pixels_10: times 16 dw ((1 << 10)-1)
-
SECTION .text
+cextern pw_1023
+%define max_pixels_10 pw_1023
+
; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
%macro ADD_RES_MMX_4_8 0
- mova m0, [r1]
- mova m2, [r1+8]
- pxor m1, m1
+ mova m2, [r1]
+ mova m4, [r1+8]
pxor m3, m3
- psubw m1, m0
psubw m3, m2
- packuswb m0, m2
- packuswb m1, m3
-
- movd m2, [r0]
- movd m3, [r0+r2]
- punpckldq m2, m3
+ packuswb m2, m2
+ packuswb m3, m3
+ pxor m5, m5
+ psubw m5, m4
+ packuswb m4, m4
+ packuswb m5, m5
+
+ movh m0, [r0]
+ movh m1, [r0+r2]
paddusb m0, m2
- psubusb m0, m1
- movd [r0], m0
- psrlq m0, 32
- movd [r0+r2], m0
+ paddusb m1, m4
+ psubusb m0, m3
+ psubusb m1, m5
+ movh [r0], m0
+ movh [r0+r2], m1
%endmacro
@@ -93,8 +95,15 @@ cglobal hevc_add_residual_4_8, 3, 3, 6
vinserti128 m2, m2, [r1+%1+32], 1
vinserti128 m6, m6, [r1+%1+48], 1
%endif
+%if cpuflag(avx)
psubw m1, m0, m2
psubw m5, m0, m6
+%else
+ mova m1, m0
+ mova m5, m0
+ psubw m1, m2
+ psubw m5, m6
+%endif
packuswb m2, m6
packuswb m1, m5
@@ -104,8 +113,15 @@ cglobal hevc_add_residual_4_8, 3, 3, 6
vinserti128 m4, m4, [r1+%1+96 ], 1
vinserti128 m6, m6, [r1+%1+112], 1
%endif
+%if cpuflag(avx)
psubw m3, m0, m4
psubw m5, m0, m6
+%else
+ mova m3, m0
+ mova m5, m0
+ psubw m3, m4
+ psubw m5, m6
+%endif
packuswb m4, m6
packuswb m3, m5
@@ -176,7 +192,7 @@ cglobal hevc_add_residual_32_8, 3, 5, 7
dec r4d
jg .loop
RET
-%endif ;HAVE_AVX2_EXTERNAL
+%endif
%macro ADD_RES_SSE_8_10 4
mova m0, [%4]
@@ -330,7 +346,7 @@ cglobal hevc_add_residual_32_10, 3, 5, 6
mova m5, [max_pixels_10]
mov r4d, 32
-.loop
+.loop:
ADD_RES_SSE_32_10 r0, r1
lea r0, [r0+r2]
add r1, 64
@@ -346,7 +362,7 @@ cglobal hevc_add_residual_16_10, 3, 5, 6
lea r3, [r2*3]
mov r4d, 4
-.loop
+.loop:
ADD_RES_AVX2_16_10 r0, r2, r3, r1
lea r0, [r0+r2*4]
add r1, 128
@@ -359,7 +375,7 @@ cglobal hevc_add_residual_32_10, 3, 5, 6
mova m5, [max_pixels_10]
mov r4d, 16
-.loop
+.loop:
ADD_RES_AVX2_32_10 r0, r2, r1
lea r0, [r0+r2*2]
add r1, 128
diff --git a/libavcodec/x86/hevc_deblock.asm b/libavcodec/x86/hevc_deblock.asm
index 153eaf7f94..85ee4800bb 100644
--- a/libavcodec/x86/hevc_deblock.asm
+++ b/libavcodec/x86/hevc_deblock.asm
@@ -5,20 +5,20 @@
;*
;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -26,9 +26,11 @@
SECTION_RODATA
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-pw_m2: times 8 dw -2
-pd_1 : times 4 dd 1
+cextern pw_1023
+%define pw_pixel_max_10 pw_1023
+pw_pixel_max_12: times 8 dw ((1 << 12)-1)
+pw_m2: times 8 dw -2
+pd_1 : times 4 dd 1
cextern pw_4
cextern pw_8
@@ -37,11 +39,6 @@ cextern pw_m1
SECTION .text
INIT_XMM sse2
-; expands to [base],...,[base+7*stride]
-%define PASS8ROWS(base, base3, stride, stride3) \
- [base], [base+stride], [base+stride*2], [base3], \
- [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
-
; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8B_LOAD 8
@@ -57,10 +54,10 @@ INIT_XMM sse2
movd m4, %5
movd m6, %6
movd m5, %7
- movd m7, %8
+ movd m3, %8
punpcklbw m4, m6
- punpcklbw m5, m7
+ punpcklbw m5, m3
punpcklwd m4, m5
punpckhdq m2, m0, m4
@@ -76,16 +73,10 @@ INIT_XMM sse2
; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
- packuswb m0, m0
- packuswb m1, m1
- packuswb m2, m2
- packuswb m3, m3
-
- punpcklbw m0, m1
- punpcklbw m2, m3
-
- punpckhwd m6, m0, m2
- punpcklwd m0, m2
+ packuswb m0, m2
+ packuswb m1, m3
+ SBUTTERFLY bw, 0, 1, 2
+ SBUTTERFLY wd, 0, 1, 2
movd %1, m0
pshufd m0, m0, 0x39
@@ -95,13 +86,13 @@ INIT_XMM sse2
pshufd m0, m0, 0x39
movd %4, m0
- movd %5, m6
- pshufd m6, m6, 0x39
- movd %6, m6
- pshufd m6, m6, 0x39
- movd %7, m6
- pshufd m6, m6, 0x39
- movd %8, m6
+ movd %5, m1
+ pshufd m1, m1, 0x39
+ movd %6, m1
+ pshufd m1, m1, 0x39
+ movd %7, m1
+ pshufd m1, m1, 0x39
+ movd %8, m1
%endmacro
; in: 8 rows of 4 words in %4..%11
@@ -120,10 +111,10 @@ INIT_XMM sse2
movq m4, %5
movq m6, %6
movq m5, %7
- movq m7, %8
+ movq m3, %8
punpcklwd m4, m6
- punpcklwd m5, m7
+ punpcklwd m5, m3
punpckhdq m6, m4, m5
punpckldq m4, m5
@@ -136,32 +127,23 @@ INIT_XMM sse2
; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 words in %1..%8
-%macro TRANSPOSE8x4W_STORE 8
- pxor m5, m5; zeros reg
- CLIPW m0, m5, [pw_pixel_max]
- CLIPW m1, m5, [pw_pixel_max]
- CLIPW m2, m5, [pw_pixel_max]
- CLIPW m3, m5, [pw_pixel_max]
+%macro TRANSPOSE8x4W_STORE 9
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
- punpckhwd m4, m0, m1
- punpcklwd m0, m1
- punpckhwd m5, m2, m3
- punpcklwd m2, m3
- punpckhdq m6, m0, m2
- punpckldq m0, m2
+ pxor m5, m5; zeros reg
+ CLIPW m0, m5, %9
+ CLIPW m1, m5, %9
+ CLIPW m2, m5, %9
+ CLIPW m3, m5, %9
movq %1, m0
movhps %2, m0
- movq %3, m6
- movhps %4, m6
-
- punpckhdq m6, m4, m5
- punpckldq m4, m5
-
- movq %5, m4
- movhps %6, m4
- movq %7, m6
- movhps %8, m6
+ movq %3, m1
+ movhps %4, m1
+ movq %5, m2
+ movhps %6, m2
+ movq %7, m3
+ movhps %8, m3
%endmacro
; in: 8 rows of 8 bytes in %1..%8
@@ -212,40 +194,20 @@ INIT_XMM sse2
; in: 8 rows of 8 words in m0..m8
; out: 8 rows of 8 bytes in %1..%8
%macro TRANSPOSE8x8B_STORE 8
- packuswb m0, m0
- packuswb m1, m1
- packuswb m2, m2
- packuswb m3, m3
- packuswb m4, m4
- packuswb m5, m5
- packuswb m6, m6
- packuswb m7, m7
-
- punpcklbw m0, m1
- punpcklbw m2, m3
-
- punpckhwd m8, m0, m2
- punpcklwd m0, m2
+ packuswb m0, m4
+ packuswb m1, m5
+ packuswb m2, m6
+ packuswb m3, m7
+ TRANSPOSE2x4x4B 0, 1, 2, 3, 4
- punpcklbw m4, m5
- punpcklbw m6, m7
-
- punpckhwd m9, m4, m6
- punpcklwd m4, m6
-
- punpckhdq m10, m0, m4; 2, 3
- punpckldq m0, m4; 0, 1
-
- punpckldq m11, m8, m9; 4, 5
- punpckhdq m8, m9; 6, 7
movq %1, m0
movhps %2, m0
- movq %3, m10
- movhps %4, m10
- movq %5, m11
- movhps %6, m11
- movq %7, m8
- movhps %8, m8
+ movq %3, m1
+ movhps %4, m1
+ movq %5, m2
+ movhps %6, m2
+ movq %7, m3
+ movhps %8, m3
%endmacro
; in: 8 rows of 8 words in %1..%8
@@ -264,18 +226,18 @@ INIT_XMM sse2
; in: 8 rows of 8 words in m0..m8
; out: 8 rows of 8 words in %1..%8
-%macro TRANSPOSE8x8W_STORE 8
+%macro TRANSPOSE8x8W_STORE 9
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
pxor m8, m8
- CLIPW m0, m8, [pw_pixel_max]
- CLIPW m1, m8, [pw_pixel_max]
- CLIPW m2, m8, [pw_pixel_max]
- CLIPW m3, m8, [pw_pixel_max]
- CLIPW m4, m8, [pw_pixel_max]
- CLIPW m5, m8, [pw_pixel_max]
- CLIPW m6, m8, [pw_pixel_max]
- CLIPW m7, m8, [pw_pixel_max]
+ CLIPW m0, m8, %9
+ CLIPW m1, m8, %9
+ CLIPW m2, m8, %9
+ CLIPW m3, m8, %9
+ CLIPW m4, m8, %9
+ CLIPW m5, m8, %9
+ CLIPW m6, m8, %9
+ CLIPW m7, m8, %9
movdqu %1, m0
movdqu %2, m1
@@ -318,13 +280,14 @@ ALIGN 16
paddw m5, m4;
;tc calculations
- movd m6, [r2]; tc0
- add r2, 4;
+ movq m6, [tcq]; tc0
punpcklwd m6, m6
- movd m7, [r2]; tc1
- punpcklwd m7, m7
- shufps m6, m7, 0; tc0, tc1
+ pshufd m6, m6, 0xA0; tc0, tc1
+%if cpuflag(ssse3)
+ psignw m4, m6, [pw_m1]; -tc0, -tc1
+%else
pmullw m4, m6, [pw_m1]; -tc0, -tc1
+%endif
;end tc calculations
paddw m5, [pw_4]; +4
@@ -362,11 +325,11 @@ ALIGN 16
paddw m9, m10, m11; 0d0, 0d3 , 1d0, 1d3
- pshufhw m14, m9, q0033 ;0b00001111; 0d3 0d3 0d0 0d0 in high
- pshuflw m14, m14, q0033 ;0b00001111; 1d3 1d3 1d0 1d0 in low
+ pshufhw m14, m9, 0x0f ;0b00001111; 0d3 0d3 0d0 0d0 in high
+ pshuflw m14, m14, 0x0f ;0b00001111; 1d3 1d3 1d0 1d0 in low
- pshufhw m9, m9, q3300 ;0b11110000; 0d0 0d0 0d3 0d3
- pshuflw m9, m9, q3300 ;0b11110000; 1d0 1d0 1d3 1d3
+ pshufhw m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
+ pshuflw m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3
paddw m14, m9; 0d0+0d3, 1d0+1d3
@@ -380,7 +343,7 @@ ALIGN 16
psraw m15, m13, 2; beta >> 2
psllw m8, m9, 1;
pcmpgtw m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
- movmskps r14, m15;
+ movmskps r6, m15;
;end weak / strong decision
; weak filter nd_p/q calculation
@@ -388,19 +351,15 @@ ALIGN 16
psrld m8, 16
paddw m8, m10
movd r7d, m8
- and r7, 0xffff; 1dp0 + 1dp3
pshufd m8, m8, 0x4E
movd r8d, m8
- and r8, 0xffff; 0dp0 + 0dp3
pshufd m8, m11, 0x31
psrld m8, 16
paddw m8, m11
movd r9d, m8
- and r9, 0xffff; 1dq0 + 1dq3
pshufd m8, m8, 0x4E
movd r10d, m8
- and r10, 0xffff; 0dq0 + 0dq3
; end calc for weak filter
; filtering mask
@@ -422,14 +381,13 @@ ALIGN 16
shl r11, %1 - 8
%endif
movd m8, r11d; tc0
- add tcq, 4;
- mov r3d, [tcq];
+ mov r3d, [tcq+4];
%if %1 > 8
shl r3, %1 - 8
%endif
- movd m9, r3d; tc1
add r11d, r3d; tc0 + tc1
jz .bypassluma
+ movd m9, r3d; tc1
punpcklwd m8, m8
punpcklwd m9, m9
shufps m8, m9, 0; tc0, tc1
@@ -453,7 +411,7 @@ ALIGN 16
psraw m13, 3; beta >> 3
pcmpgtw m13, m12;
movmskps r11, m13;
- and r14, r11; strong mask , beta_2 and beta_3 comparisons
+ and r6, r11; strong mask , beta_2 and beta_3 comparisons
;----beta_3 comparison end-----
;----tc25 comparison---
psubw m12, m3, m4; p0 - q0
@@ -464,23 +422,23 @@ ALIGN 16
pcmpgtw m8, m12; tc25 comparisons
movmskps r11, m8;
- and r14, r11; strong mask, beta_2, beta_3 and tc25 comparisons
+ and r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons
;----tc25 comparison end---
- mov r11, r14;
+ mov r11, r6;
shr r11, 1;
- and r14, r11; strong mask, bits 2 and 0
+ and r6, r11; strong mask, bits 2 and 0
pmullw m14, m9, [pw_m2]; -tc * 2
paddw m9, m9
- and r14, 5; 0b101
- mov r11, r14; strong mask
- shr r14, 2;
- movd m12, r14d; store to xmm for mask generation
- shl r14, 1
+ and r6, 5; 0b101
+ mov r11, r6; strong mask
+ shr r6, 2;
+ movd m12, r6d; store to xmm for mask generation
+ shl r6, 1
and r11, 1
movd m10, r11d; store to xmm for mask generation
- or r14, r11; final strong mask, bits 1 and 0
+ or r6, r11; final strong mask, bits 1 and 0
jz .weakfilter
shufps m10, m12, 0
@@ -565,16 +523,16 @@ ALIGN 16
MASKED_COPY m3, m12
.weakfilter:
- not r14; strong mask -> weak mask
- and r14, r13; final weak filtering mask, bits 0 and 1
+ not r6; strong mask -> weak mask
+ and r6, r13; final weak filtering mask, bits 0 and 1
jz .store
; weak filtering mask
- mov r11, r14
+ mov r11, r6
shr r11, 1
movd m12, r11d
- and r14, 1
- movd m11, r14d
+ and r6, 1
+ movd m11, r6d
shufps m11, m12, 0
pcmpeqd m11, [pd_1]; filtering mask
@@ -609,7 +567,11 @@ ALIGN 16
pminsw m12, m9; av_clip(delta0, -tc, tc)
psraw m9, 1; tc -> tc / 2
+%if cpuflag(ssse3)
+ psignw m14, m9, [pw_m1]; -tc / 2
+%else
pmullw m14, m9, [pw_m1]; -tc / 2
+%endif
pavgw m15, m1, m3; (p2 + p0 + 1) >> 1
psubw m15, m2; ((p2 + p0 + 1) >> 1) - p1
@@ -658,117 +620,161 @@ ALIGN 16
MASKED_COPY m4, m8
%endmacro
-INIT_XMM sse2
;-----------------------------------------------------------------------------
-; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc,
+; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
; uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
-cglobal hevc_v_loop_filter_chroma_8, 3, 6, 8
- sub r0, 2
- lea r5, [3 * r1]
- mov r4, r0
- add r0, r5
- TRANSPOSE4x8B_LOAD PASS8ROWS(r4, r0, r1, r5)
+%macro LOOP_FILTER_CHROMA 0
+cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
+ sub pixq, 2
+ lea r3strideq, [3*strideq]
+ mov pix0q, pixq
+ add pixq, r3strideq
+ TRANSPOSE4x8B_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq)
CHROMA_DEBLOCK_BODY 8
- TRANSPOSE8x4B_STORE PASS8ROWS(r4, r0, r1, r5)
+ TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
RET
-cglobal hevc_v_loop_filter_chroma_10, 3, 6, 8
- sub r0, 4
- lea r5, [3 * r1]
- mov r4, r0
- add r0, r5
- TRANSPOSE4x8W_LOAD PASS8ROWS(r4, r0, r1, r5)
+cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
+ sub pixq, 4
+ lea r3strideq, [3*strideq]
+ mov pix0q, pixq
+ add pixq, r3strideq
+ TRANSPOSE4x8W_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq)
CHROMA_DEBLOCK_BODY 10
- TRANSPOSE8x4W_STORE PASS8ROWS(r4, r0, r1, r5)
+ TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10]
+ RET
+
+cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride
+ sub pixq, 4
+ lea r3strideq, [3*strideq]
+ mov pix0q, pixq
+ add pixq, r3strideq
+ TRANSPOSE4x8W_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq)
+ CHROMA_DEBLOCK_BODY 12
+ TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12]
RET
;-----------------------------------------------------------------------------
-; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc,
+; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
; uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
-cglobal hevc_h_loop_filter_chroma_8, 3, 6, 8
- mov r5, r0; pix
- sub r5, r1
- sub r5, r1
- movh m0, [r5]; p1
- movh m1, [r5 + r1]; p0
- movh m2, [r0]; q0
- movh m3, [r0 + r1]; q1
+cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
+ mov pix0q, pixq
+ sub pix0q, strideq
+ sub pix0q, strideq
+ movq m0, [pix0q]; p1
+ movq m1, [pix0q+strideq]; p0
+ movq m2, [pixq]; q0
+ movq m3, [pixq+strideq]; q1
pxor m5, m5; zeros reg
punpcklbw m0, m5
punpcklbw m1, m5
punpcklbw m2, m5
punpcklbw m3, m5
CHROMA_DEBLOCK_BODY 8
- packuswb m1, m2
- movh [r5 + r1], m1
- movhps [r0], m1
+ packuswb m1, m2
+ movh[pix0q+strideq], m1
+ movhps [pixq], m1
RET
-cglobal hevc_h_loop_filter_chroma_10, 3, 6, 8
- mov r5, r0; pix
- sub r5, r1
- sub r5, r1
- movdqu m0, [r5]; p1
- movdqu m1, [r5+r1]; p0
- movdqu m2, [r0]; q0
- movdqu m3, [r0 + r1]; q1
+cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
+ mov pix0q, pixq
+ sub pix0q, strideq
+ sub pix0q, strideq
+ movu m0, [pix0q]; p1
+ movu m1, [pix0q+strideq]; p0
+ movu m2, [pixq]; q0
+ movu m3, [pixq+strideq]; q1
CHROMA_DEBLOCK_BODY 10
pxor m5, m5; zeros reg
- CLIPW m1, m5, [pw_pixel_max]
- CLIPW m2, m5, [pw_pixel_max]
- movdqu [r5 + r1], m1
- movdqu [r0], m2
+ CLIPW m1, m5, [pw_pixel_max_10]
+ CLIPW m2, m5, [pw_pixel_max_10]
+ movu [pix0q+strideq], m1
+ movu [pixq], m2
+ RET
+
+cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0
+ mov pix0q, pixq
+ sub pix0q, strideq
+ sub pix0q, strideq
+ movu m0, [pix0q]; p1
+ movu m1, [pix0q+strideq]; p0
+ movu m2, [pixq]; q0
+ movu m3, [pixq+strideq]; q1
+ CHROMA_DEBLOCK_BODY 12
+ pxor m5, m5; zeros reg
+ CLIPW m1, m5, [pw_pixel_max_12]
+ CLIPW m2, m5, [pw_pixel_max_12]
+ movu [pix0q+strideq], m1
+ movu [pixq], m2
RET
+%endmacro
+
+INIT_XMM sse2
+LOOP_FILTER_CHROMA
+INIT_XMM avx
+LOOP_FILTER_CHROMA
%if ARCH_X86_64
-INIT_XMM ssse3
+%macro LOOP_FILTER_LUMA 0
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
-; int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+; int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
-cglobal hevc_v_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc
- sub r0, 4
- lea r5, [3 * r1]
- mov r6, r0
- add r0, r5
- TRANSPOSE8x8B_LOAD PASS8ROWS(r6, r0, r1, r5)
+cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+ sub pixq, 4
+ lea pix0q, [3 * r1]
+ mov src3strideq, pixq
+ add pixq, pix0q
+ TRANSPOSE8x8B_LOAD PASS8ROWS(src3strideq, pixq, r1, pix0q)
LUMA_DEBLOCK_BODY 8, v
.store:
- TRANSPOSE8x8B_STORE PASS8ROWS(r6, r0, r1, r5)
+ TRANSPOSE8x8B_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q)
.bypassluma:
RET
-cglobal hevc_v_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc
+cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
sub pixq, 8
- lea r5, [3 * strideq]
- mov r6, pixq
- add pixq, r5
- TRANSPOSE8x8W_LOAD PASS8ROWS(r6, pixq, strideq, r5)
+ lea pix0q, [3 * strideq]
+ mov src3strideq, pixq
+ add pixq, pix0q
+ TRANSPOSE8x8W_LOAD PASS8ROWS(src3strideq, pixq, strideq, pix0q)
LUMA_DEBLOCK_BODY 10, v
.store:
- TRANSPOSE8x8W_STORE PASS8ROWS(r6, r0, r1, r5)
+ TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_10]
+.bypassluma:
+ RET
+
+cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+ sub pixq, 8
+ lea pix0q, [3 * strideq]
+ mov src3strideq, pixq
+ add pixq, pix0q
+ TRANSPOSE8x8W_LOAD PASS8ROWS(src3strideq, pixq, strideq, pix0q)
+ LUMA_DEBLOCK_BODY 12, v
+.store:
+ TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_12]
.bypassluma:
RET
;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
-; int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+; int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
-cglobal hevc_h_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
lea src3strideq, [3 * strideq]
mov pix0q, pixq
sub pix0q, src3strideq
sub pix0q, strideq
- movdqu m0, [pix0q]; p3
- movdqu m1, [pix0q + strideq]; p2
- movdqu m2, [pix0q + 2 * strideq]; p1
- movdqu m3, [pix0q + src3strideq]; p0
- movdqu m4, [pixq]; q0
- movdqu m5, [pixq + strideq]; q1
- movdqu m6, [pixq + 2 * strideq]; q2
- movdqu m7, [pixq + src3strideq]; q3
+ movq m0, [pix0q]; p3
+ movq m1, [pix0q + strideq]; p2
+ movq m2, [pix0q + 2 * strideq]; p1
+ movq m3, [pix0q + src3strideq]; p0
+ movq m4, [pixq]; q0
+ movq m5, [pixq + strideq]; q1
+ movq m6, [pixq + 2 * strideq]; q2
+ movq m7, [pixq + src3strideq]; q3
pxor m8, m8
punpcklbw m0, m8
punpcklbw m1, m8
@@ -783,16 +789,16 @@ cglobal hevc_h_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc, count, pix0
packuswb m1, m2
packuswb m3, m4
packuswb m5, m6
- movh [r5 + r1], m1
- movhps [r5 + 2 * r1], m1
- movh [r5 + r6], m3
- movhps [r0 ], m3
- movh [r0 + r1], m5
- movhps [r0 + 2 * r1], m5
+ movh [pix0q + strideq], m1
+ movhps [pix0q + 2 * strideq], m1
+ movh [pix0q + src3strideq], m3
+ movhps [pixq ], m3
+ movh [pixq + strideq], m5
+ movhps [pixq + 2 * strideq], m5
.bypassluma:
RET
-cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
lea src3strideq, [3 * strideq]
mov pix0q, pixq
sub pix0q, src3strideq
@@ -808,12 +814,12 @@ cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix
LUMA_DEBLOCK_BODY 10, h
.store:
pxor m8, m8; zeros reg
- CLIPW m1, m8, [pw_pixel_max]
- CLIPW m2, m8, [pw_pixel_max]
- CLIPW m3, m8, [pw_pixel_max]
- CLIPW m4, m8, [pw_pixel_max]
- CLIPW m5, m8, [pw_pixel_max]
- CLIPW m6, m8, [pw_pixel_max]
+ CLIPW m1, m8, [pw_pixel_max_10]
+ CLIPW m2, m8, [pw_pixel_max_10]
+ CLIPW m3, m8, [pw_pixel_max_10]
+ CLIPW m4, m8, [pw_pixel_max_10]
+ CLIPW m5, m8, [pw_pixel_max_10]
+ CLIPW m6, m8, [pw_pixel_max_10]
movdqu [pix0q + strideq], m1; p2
movdqu [pix0q + 2 * strideq], m2; p1
movdqu [pix0q + src3strideq], m3; p0
@@ -822,4 +828,44 @@ cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix
movdqu [pixq + 2 * strideq], m6; q2
.bypassluma:
RET
+
+cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+ lea src3strideq, [3 * strideq]
+ mov pix0q, pixq
+ sub pix0q, src3strideq
+ sub pix0q, strideq
+ movdqu m0, [pix0q]; p3
+ movdqu m1, [pix0q + strideq]; p2
+ movdqu m2, [pix0q + 2 * strideq]; p1
+ movdqu m3, [pix0q + src3strideq]; p0
+ movdqu m4, [pixq]; q0
+ movdqu m5, [pixq + strideq]; q1
+ movdqu m6, [pixq + 2 * strideq]; q2
+ movdqu m7, [pixq + src3strideq]; q3
+ LUMA_DEBLOCK_BODY 12, h
+.store:
+ pxor m8, m8; zeros reg
+ CLIPW m1, m8, [pw_pixel_max_12]
+ CLIPW m2, m8, [pw_pixel_max_12]
+ CLIPW m3, m8, [pw_pixel_max_12]
+ CLIPW m4, m8, [pw_pixel_max_12]
+ CLIPW m5, m8, [pw_pixel_max_12]
+ CLIPW m6, m8, [pw_pixel_max_12]
+ movdqu [pix0q + strideq], m1; p2
+ movdqu [pix0q + 2 * strideq], m2; p1
+ movdqu [pix0q + src3strideq], m3; p0
+ movdqu [pixq ], m4; q0
+ movdqu [pixq + strideq], m5; q1
+ movdqu [pixq + 2 * strideq], m6; q2
+.bypassluma:
+ RET
+
+%endmacro
+
+INIT_XMM sse2
+LOOP_FILTER_LUMA
+INIT_XMM ssse3
+LOOP_FILTER_LUMA
+INIT_XMM avx
+LOOP_FILTER_LUMA
%endif
diff --git a/libavcodec/x86/hevc_idct.asm b/libavcodec/x86/hevc_idct.asm
index f397cc1097..1eb1973f27 100644
--- a/libavcodec/x86/hevc_idct.asm
+++ b/libavcodec/x86/hevc_idct.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2014 James Almer
;* Copyright (c) 2016 Alexandra Hájková
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -234,9 +234,9 @@ times 4 dw 78, -82
times 4 dw 85, -88
times 4 dw 90, -90
-section .text
+SECTION .text
-; void ff_hevc_idctHxW_dc_{8,10}_<opt>(int16_t *coeffs)
+; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
; %1 = HxW
; %2 = number of loops
; %3 = bitdepth
@@ -844,7 +844,10 @@ IDCT_4x4 %1
INIT_IDCT_DC 8
INIT_IDCT_DC 10
+INIT_IDCT_DC 12
INIT_IDCT 8, sse2
INIT_IDCT 8, avx
INIT_IDCT 10, sse2
INIT_IDCT 10, avx
+;INIT_IDCT 12, sse2
+;INIT_IDCT 12, avx
diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index 16e5eefc69..ff6ed0711a 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -1,851 +1,1672 @@
-;*****************************************************************************
-;* x86-optimized HEVC MC
-;* Copyright 2015 Anton Khirnov
-;*
-;* This file is part of Libav.
-;*
-;* Libav is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* Libav is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
+; /*
+; * Provide SSE luma and chroma mc functions for HEVC decoding
+; * Copyright (c) 2013 Pierre-Edouard LEPERE
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
+SECTION_RODATA 32
+cextern pw_255
+cextern pw_512
+cextern pw_2048
+cextern pw_8192
+cextern pw_1023
+cextern pw_1024
+cextern pw_4096
+%define pw_8 pw_512
+%define pw_10 pw_2048
+%define pw_12 pw_8192
+%define pw_bi_10 pw_1024
+%define pw_bi_12 pw_4096
+%define max_pixels_8 pw_255
+%define max_pixels_10 pw_1023
+pw_bi_8: times 16 dw (1 << 8)
+max_pixels_12: times 16 dw ((1 << 12)-1)
+cextern pd_1
+cextern pb_0
+
+%macro EPEL_TABLE 4
+hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
+ times %2 d%3 10, -2
+ times %2 d%3 -4, 54
+ times %2 d%3 16, -2
+ times %2 d%3 -6, 46
+ times %2 d%3 28, -4
+ times %2 d%3 -4, 36
+ times %2 d%3 36, -4
+ times %2 d%3 -4, 28
+ times %2 d%3 46, -6
+ times %2 d%3 -2, 16
+ times %2 d%3 54, -4
+ times %2 d%3 -2, 10
+ times %2 d%3 58, -2
+%endmacro
-pw_1023: times 8 dw 1023
-cextern hevc_qpel_coeffs
-cextern hevc_qpel_coeffs8
+EPEL_TABLE 8,16, b, avx2
+EPEL_TABLE 10, 8, w, avx2
+
+EPEL_TABLE 8, 8, b, sse4
+EPEL_TABLE 10, 4, w, sse4
+EPEL_TABLE 12, 4, w, sse4
+
+%macro QPEL_TABLE 4
+hevc_qpel_filters_%4_%1 times %2 d%3 -1, 4
+ times %2 d%3 -10, 58
+ times %2 d%3 17, -5
+ times %2 d%3 1, 0
+ times %2 d%3 -1, 4
+ times %2 d%3 -11, 40
+ times %2 d%3 40,-11
+ times %2 d%3 4, -1
+ times %2 d%3 0, 1
+ times %2 d%3 -5, 17
+ times %2 d%3 58,-10
+ times %2 d%3 4, -1
+%endmacro
-cextern hevc_epel_coeffs
-cextern hevc_epel_coeffs8
+QPEL_TABLE 8, 8, b, sse4
+QPEL_TABLE 10, 4, w, sse4
+QPEL_TABLE 12, 4, w, sse4
-cextern pw_8
-cextern pw_16
-cextern pw_32
-cextern pw_64
+QPEL_TABLE 8,16, b, avx2
+QPEL_TABLE 10, 8, w, avx2
SECTION .text
-; %1: width
-; %2: bit depth
-%macro COMMON_DEFS 2
- %assign blocksize 8
- %assign nb_blocks ((%1 + blocksize - 1) / blocksize)
- %define last_block_truncated (blocksize * nb_blocks > %1)
- %if %2 > 8
- %define LOAD_BLOCK movu
- %define LOAD_HALFBLOCK movq
- %assign pixelsize 2
- %else
- %define LOAD_BLOCK movq
- %define LOAD_HALFBLOCK movd
- %assign pixelsize 1
- %endif
- %define STORE_BLOCK mova
- %define STORE_HALFBLOCK movq
-%endmacro
-
-; %1: block index
-%macro BLOCK_DEFS 1
- %if last_block_truncated && %1 == nb_blocks - 1
- %define block_truncated 1
- %define LOAD LOAD_HALFBLOCK
- %define STORE STORE_HALFBLOCK
- %else
- %define block_truncated 0
- %define LOAD LOAD_BLOCK
- %define STORE STORE_BLOCK
- %endif
-%endmacro
-
-
-; hevc_get_pixels_<w>_<d>(int16_t *dst, ptrdiff_t dststride,
-; pixel *src, ptrdiff_t srcstride,
-; int height, int mx, int my, int *mcbuffer)
-
-; %1: block width
-; %2: bit depth
-; %3: log2 of height unroll
-%macro GET_PIXELS 3
-cglobal hevc_get_pixels_ %+ %1 %+ _ %+ %2, 5, 5, 2, dst, dststride, src, srcstride, height ; rest of the args unused
-
- %assign shift 14 - %2
- COMMON_DEFS %1, %2
-
-%if pixelsize == 1
- pxor m0, m0
-%endif
-
- shr heightd, %3
-
-.loop:
+%define MAX_PB_SIZE 64
-%assign i 0
-%rep (1 << %3)
+%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
-%assign j 0
-%rep nb_blocks
+%define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10
- BLOCK_DEFS j
+%if ARCH_X86_64
- LOAD m1, [srcq + j * pixelsize * blocksize]
-%if pixelsize == 1
- punpcklbw m1, m0
+%macro SIMPLE_BILOAD 4 ;width, tab, r1, r2
+%if %1 <= 4
+ movq %3, [%2] ; load data from source2
+%elif %1 <= 8
+ movdqa %3, [%2] ; load data from source2
+%elif %1 <= 12
+%if cpuflag(avx2)
+ mova %3, [%2]
+%else
+ movdqa %3, [%2] ; load data from source2
+ movq %4, [%2+16] ; load data from source2
+%endif ;avx
+%elif %1 <= 16
+%if cpuflag(avx2)
+ mova %3, [%2]
+%else
+ movdqa %3, [%2] ; load data from source2
+ movdqa %4, [%2+16] ; load data from source2
+%endif ; avx
+%else ; %1 = 32
+ mova %3, [%2]
+ mova %4, [%2+32]
%endif
- psllw m1, shift
- STORE [dstq + j * 2 * blocksize], m1
-
-%assign j (j + 1)
-%endrep
+%endmacro
- add dstq, dststrideq
- add srcq, srcstrideq
+%macro SIMPLE_LOAD 4 ;width, bitd, tab, r1
+%if %1 == 2 || (%2 == 8 && %1 <= 4)
+ movd %4, [%3] ; load data from source
+%elif %1 == 4 || (%2 == 8 && %1 <= 8)
+ movq %4, [%3] ; load data from source
+%elif notcpuflag(avx)
+ movu %4, [%3] ; load data from source
+%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
+ movdqu %4, [%3]
+%else
+ movu %4, [%3]
+%endif
+%endmacro
-%assign i (i + 1)
-%endrep
- dec heightd
- jg .loop
- RET
+%macro EPEL_FILTER 5 ; bit depth, filter index, xmma, xmmb, gprtmp
+%if cpuflag(avx2)
+%assign %%offset 32
+%ifdef PIC
+ lea %5q, [hevc_epel_filters_avx2_%1]
+ %define FILTER %5q
+%else
+ %define FILTER hevc_epel_filters_avx2_%1
+%endif
+%else
+%assign %%offset 16
+%ifdef PIC
+ lea %5q, [hevc_epel_filters_sse4_%1]
+ %define FILTER %5q
+%else
+ %define FILTER hevc_epel_filters_sse4_%1
+%endif
+%endif ;cpuflag(avx2)
+ sub %2q, 1
+%if cpuflag(avx2)
+ shl %2q, 6 ; multiply by 64
+%else
+ shl %2q, 5 ; multiply by 32
+%endif
+ mova %3, [FILTER + %2q] ; taps 1-2, repeated across the register
+ mova %4, [FILTER + %2q+%%offset] ; taps 3-4, repeated across the register
%endmacro
-INIT_XMM sse2
-GET_PIXELS 4, 8, 1
-GET_PIXELS 8, 8, 1
-GET_PIXELS 12, 8, 3
-GET_PIXELS 16, 8, 2
-GET_PIXELS 24, 8, 3
-GET_PIXELS 32, 8, 3
-GET_PIXELS 48, 8, 3
-GET_PIXELS 64, 8, 3
-
-GET_PIXELS 4, 10, 1
-GET_PIXELS 8, 10, 1
-GET_PIXELS 12, 10, 3
-GET_PIXELS 16, 10, 2
-GET_PIXELS 24, 10, 3
-GET_PIXELS 32, 10, 3
-GET_PIXELS 48, 10, 3
-GET_PIXELS 64, 10, 3
-
-; hevc_qpel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
-; uint8_t *src, ptrdiff_t srcstride,
-; int height, int mx, int my, int *mcbuffer)
-
-; 8-bit qpel interpolation
-; %1: block width
-; %2: 0 - horizontal; 1 - vertical
-%macro QPEL_8 2
-%if %2
- %define postfix v
- %define mvfrac myq
- %define coeffsaddr r5q
- %define pixstride srcstrideq
- %define pixstride3 r5q
- %define src_m3 r6q
-%else
- %define postfix h
- %define mvfrac mxq
- %define coeffsaddr r6q
- %define pixstride 1
- %define pixstride3 3
- %define src_m3 (srcq - 3)
-%endif
-
- COMMON_DEFS %1, 8
-
-cglobal hevc_qpel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 7, dst, dststride, src, srcstride, height, mx, my
- and mvfrac, 0x3
- dec mvfrac
- shl mvfrac, 4
- lea coeffsaddr, [hevc_qpel_coeffs8]
- mova m0, [coeffsaddr + mvfrac]
-
- SPLATW m1, m0, 1
- SPLATW m2, m0, 2
- SPLATW m3, m0, 3
- SPLATW m0, m0, 0
-
-%if %2
- lea pixstride3, [srcstrideq + 2 * srcstrideq]
- mov src_m3, srcq
- sub src_m3, pixstride3
+%macro EPEL_HV_FILTER 1
+%if cpuflag(avx2)
+%assign %%offset 32
+%assign %%shift 6
+%define %%table hevc_epel_filters_avx2_%1
+%else
+%assign %%offset 16
+%assign %%shift 5
+%define %%table hevc_epel_filters_sse4_%1
%endif
-.loop:
-
-%assign i 0
-%rep nb_blocks
+%ifdef PIC
+ lea r3srcq, [%%table]
+ %define FILTER r3srcq
+%else
+ %define FILTER %%table
+%endif
+ sub mxq, 1
+ sub myq, 1
+ shl mxq, %%shift ; multiply by 32 (64 for avx2)
+ shl myq, %%shift ; multiply by 32 (64 for avx2)
+ mova m14, [FILTER + mxq] ; taps 1-2, repeated across the register
+ mova m15, [FILTER + mxq+%%offset] ; taps 3-4, repeated across the register
+
+%if cpuflag(avx2)
+%define %%table hevc_epel_filters_avx2_10
+%else
+%define %%table hevc_epel_filters_sse4_10
+%endif
+%ifdef PIC
+ lea r3srcq, [%%table]
+ %define FILTER r3srcq
+%else
+ %define FILTER %%table
+%endif
+ mova m12, [FILTER + myq] ; taps 1-2, repeated across the register
+ mova m13, [FILTER + myq+%%offset] ; taps 3-4, repeated across the register
+ lea r3srcq, [srcstrideq*3]
+%endmacro
- BLOCK_DEFS i
+%macro QPEL_FILTER 2
- LOAD m4, [src_m3 + i * blocksize]
- LOAD m5, [src_m3 + i * blocksize + 1 * pixstride]
- punpcklbw m4, m5
- pmaddubsw m4, m0
+%if cpuflag(avx2)
+%assign %%offset 32
+%assign %%shift 7
+%define %%table hevc_qpel_filters_avx2_%1
+%else
+%assign %%offset 16
+%assign %%shift 6
+%define %%table hevc_qpel_filters_sse4_%1
+%endif
- LOAD m5, [src_m3 + i * blocksize + 2 * pixstride]
- LOAD m6, [srcq + i * blocksize]
- punpcklbw m5, m6
- pmaddubsw m5, m1
- paddsw m4, m5
+%ifdef PIC
+ lea rfilterq, [%%table]
+%else
+ %define rfilterq %%table
+%endif
+ sub %2q, 1
+ shl %2q, %%shift ; multiply by 64 (128 for avx2)
+ mova m12, [rfilterq + %2q] ; taps 1-2, repeated across the register
+ mova m13, [rfilterq + %2q + %%offset] ; taps 3-4, repeated across the register
+ mova m14, [rfilterq + %2q + 2*%%offset] ; taps 5-6, repeated across the register
+ mova m15, [rfilterq + %2q + 3*%%offset] ; taps 7-8, repeated across the register
+%endmacro
- LOAD m5, [srcq + i * blocksize + 1 * pixstride]
- LOAD m6, [srcq + i * blocksize + 2 * pixstride]
- punpcklbw m5, m6
- pmaddubsw m5, m2
- paddsw m4, m5
+%macro EPEL_LOAD 4
+%if (%1 == 8 && %4 <= 4)
+%define %%load movd
+%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
+%define %%load movq
+%else
+%define %%load movdqu
+%endif
- LOAD m5, [srcq + i * blocksize + pixstride3]
- LOAD m6, [srcq + i * blocksize + 4 * pixstride]
- punpcklbw m5, m6
- pmaddubsw m5, m3
- paddsw m4, m5
+ %%load m0, [%2q ]
+%ifnum %3
+ %%load m1, [%2q+ %3]
+ %%load m2, [%2q+2*%3]
+ %%load m3, [%2q+3*%3]
+%else
+ %%load m1, [%2q+ %3q]
+ %%load m2, [%2q+2*%3q]
+ %%load m3, [%2q+r3srcq]
+%endif
+%if %1 == 8
+%if %4 > 8
+ SBUTTERFLY bw, 0, 1, 7
+ SBUTTERFLY bw, 2, 3, 7
+%else
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+%endif
+%else
+%if %4 > 4
+ SBUTTERFLY wd, 0, 1, 7
+ SBUTTERFLY wd, 2, 3, 7
+%else
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+%endif
+%endif
+%endmacro
- STORE [dstq + i * 2 * blocksize], m4
-%assign i (i + 1)
-%endrep
+%macro QPEL_H_LOAD 4
+%assign %%stride (%1+7)/8
+%if %1 == 8
+%if %3 <= 4
+%define %%load movd
+%elif %3 == 8
+%define %%load movq
+%else
+%define %%load movu
+%endif
+%else
+%if %3 == 2
+%define %%load movd
+%elif %3 == 4
+%define %%load movq
+%else
+%define %%load movu
+%endif
+%endif
+ %%load m0, [%2-3*%%stride] ;load data from source
+ %%load m1, [%2-2*%%stride]
+ %%load m2, [%2-%%stride ]
+ %%load m3, [%2 ]
+ %%load m4, [%2+%%stride ]
+ %%load m5, [%2+2*%%stride]
+ %%load m6, [%2+3*%%stride]
+ %%load m7, [%2+4*%%stride]
+
+%if %1 == 8
+%if %3 > 8
+ SBUTTERFLY wd, 0, 1, %4
+ SBUTTERFLY wd, 2, 3, %4
+ SBUTTERFLY wd, 4, 5, %4
+ SBUTTERFLY wd, 6, 7, %4
+%else
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ punpcklbw m4, m5
+ punpcklbw m6, m7
+%endif
+%else
+%if %3 > 4
+ SBUTTERFLY dq, 0, 1, %4
+ SBUTTERFLY dq, 2, 3, %4
+ SBUTTERFLY dq, 4, 5, %4
+ SBUTTERFLY dq, 6, 7, %4
+%else
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ punpcklwd m4, m5
+ punpcklwd m6, m7
+%endif
+%endif
+%endmacro
- add dstq, dststrideq
- add srcq, srcstrideq
-%if %2
- add src_m3, srcstrideq
+%macro QPEL_V_LOAD 5
+ lea %5q, [%2]
+ sub %5q, r3srcq
+ movu m0, [%5q ] ; load x - 3*srcstride
+ movu m1, [%5q+ %3q ] ; load x - 2*srcstride
+ movu m2, [%5q+ 2*%3q ] ; load x - srcstride
+ movu m3, [%2 ] ; load x
+ movu m4, [%2+ %3q] ; load x + srcstride
+ movu m5, [%2+ 2*%3q] ; load x + 2*srcstride
+ movu m6, [%2+r3srcq] ; load x + 3*srcstride
+ movu m7, [%2+ 4*%3q] ; load x + 4*srcstride
+%if %1 == 8
+%if %4 > 8
+ SBUTTERFLY bw, 0, 1, 8
+ SBUTTERFLY bw, 2, 3, 8
+ SBUTTERFLY bw, 4, 5, 8
+ SBUTTERFLY bw, 6, 7, 8
+%else
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ punpcklbw m4, m5
+ punpcklbw m6, m7
%endif
+%else
+%if %4 > 4
+ SBUTTERFLY wd, 0, 1, 8
+ SBUTTERFLY wd, 2, 3, 8
+ SBUTTERFLY wd, 4, 5, 8
+ SBUTTERFLY wd, 6, 7, 8
+%else
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ punpcklwd m4, m5
+ punpcklwd m6, m7
+%endif
+%endif
+%endmacro
- dec heightd
- jg .loop
- RET
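+; PEL_<depth>STORE<width> writes <width> output samples of the given bit depth
+; (8-bit variants pack to bytes, 10/12-bit variants store words), using
+; movd/movq/pextrw partial stores for the narrow widths.  The plain put_*
+; functions always use the 10STORE forms, since their destination is the
+; int16 intermediate buffer.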
+%macro PEL_12STORE2 3
+ movd [%1], %2
+%endmacro
+%macro PEL_12STORE4 3
+ movq [%1], %2
+%endmacro
+%macro PEL_12STORE6 3
+ movq [%1], %2
+ psrldq %2, 8
+ movd [%1+8], %2
+%endmacro
+%macro PEL_12STORE8 3
+ movdqa [%1], %2
+%endmacro
+%macro PEL_12STORE12 3
+ movdqa [%1], %2
+ movq [%1+16], %3
+%endmacro
+%macro PEL_12STORE16 3
+ PEL_12STORE8 %1, %2, %3
+ movdqa [%1+16], %3
%endmacro
-INIT_XMM ssse3
-QPEL_8 4, 0
-QPEL_8 8, 0
-QPEL_8 12, 0
-QPEL_8 16, 0
-QPEL_8 24, 0
-QPEL_8 32, 0
-QPEL_8 48, 0
-QPEL_8 64, 0
-
-QPEL_8 4, 1
-QPEL_8 8, 1
-QPEL_8 12, 1
-QPEL_8 16, 1
-QPEL_8 24, 1
-QPEL_8 32, 1
-QPEL_8 48, 1
-QPEL_8 64, 1
-
-; 16-bit qpel interpolation
-; %1: block width
-; %2: shift applied to the result
-; %3: 0 - horizontal; 1 - vertical
-%macro QPEL_16 3
-%if %3
- %define mvfrac myq
- %define pixstride srcstrideq
- %define pixstride3 sstride3q
- %define src_m3 srcm3q
-%else
- %define mvfrac mxq
- %define pixstride 2
- %define pixstride3 6
- %define src_m3 (srcq - 6)
-%endif
-
- COMMON_DEFS %1, 16
-
- and mvfrac, 0x3
- dec mvfrac
- shl mvfrac, 4
- lea coeffsregq, [hevc_qpel_coeffs]
- mova m0, [coeffsregq + mvfrac]
-
- pshufd m1, m0, 0x55
- pshufd m2, m0, 0xaa
- pshufd m3, m0, 0xff
- pshufd m0, m0, 0x00
-
-%if %3
- lea sstride3q, [srcstrideq + 2 * srcstrideq]
- mov srcm3q, srcq
- sub srcm3q, sstride3q
+%macro PEL_10STORE2 3
+ movd [%1], %2
+%endmacro
+%macro PEL_10STORE4 3
+ movq [%1], %2
+%endmacro
+%macro PEL_10STORE6 3
+ movq [%1], %2
+ psrldq %2, 8
+ movd [%1+8], %2
+%endmacro
+%macro PEL_10STORE8 3
+ movdqa [%1], %2
+%endmacro
+%macro PEL_10STORE12 3
+ movdqa [%1], %2
+ movq [%1+16], %3
+%endmacro
+%macro PEL_10STORE16 3
+%if cpuflag(avx2)
+ movu [%1], %2
+%else
+ PEL_10STORE8 %1, %2, %3
+ movdqa [%1+16], %3
%endif
+%endmacro
-.loop:
+%macro PEL_10STORE32 3
+ PEL_10STORE16 %1, %2, %3
+ movu [%1+32], %3
+%endmacro
-%assign i 0
-%rep nb_blocks
+%macro PEL_8STORE2 3
+ pextrw [%1], %2, 0
+%endmacro
+%macro PEL_8STORE4 3
+ movd [%1], %2
+%endmacro
+%macro PEL_8STORE6 3
+ movd [%1], %2
+ pextrw [%1+4], %2, 2
+%endmacro
+%macro PEL_8STORE8 3
+ movq [%1], %2
+%endmacro
+%macro PEL_8STORE12 3
+ movq [%1], %2
+ psrldq %2, 8
+ movd [%1+8], %2
+%endmacro
+%macro PEL_8STORE16 3
+%if cpuflag(avx2)
+ movdqu [%1], %2
+%else
+ mova [%1], %2
+%endif ; avx
+%endmacro
+%macro PEL_8STORE32 3
+ movu [%1], %2
+%endmacro
- BLOCK_DEFS i
+%macro LOOP_END 3
+ add %1q, 2*MAX_PB_SIZE ; dst += MAX_PB_SIZE int16 elements (intermediate stride)
+ add %2q, %3q ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+%endmacro
- LOAD m4, [src_m3 + i * 2 * blocksize]
- LOAD m5, [src_m3 + i * 2 * blocksize + 1 * pixstride]
- LOAD m6, [src_m3 + i * 2 * blocksize + 2 * pixstride]
- LOAD m7, [srcq + i * 2 * blocksize + 0 * pixstride]
- LOAD m8, [srcq + i * 2 * blocksize + 1 * pixstride]
- LOAD m9, [srcq + i * 2 * blocksize + 2 * pixstride]
- LOAD m10, [srcq + i * 2 * blocksize + pixstride3]
- LOAD m11, [srcq + i * 2 * blocksize + 4 * pixstride]
- punpcklwd m12, m4, m5
- pmaddwd m12, m0
+%macro MC_PIXEL_COMPUTE 2-3 ;width, bitdepth
+%if %2 == 8
+%if cpuflag(avx2) && %0 ==3
+%if %1 > 16
+ vextracti128 xm1, m0, 1
+ pmovzxbw m1, xm1
+ psllw m1, 14-%2
+%endif
+ pmovzxbw m0, xm0
+%else ; not avx2
+%if %1 > 8
+ punpckhbw m1, m0, m2
+ psllw m1, 14-%2
+%endif
+ punpcklbw m0, m2
+%endif
+%endif ; %2 == 8
+ psllw m0, 14-%2
+%endmacro
+
+%macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3
+%if %0 == 8
+%define %%reg0 %5
+%define %%reg2 %6
+%define %%reg1 %7
+%define %%reg3 %8
+%else
+%define %%reg0 m0
+%define %%reg2 m2
+%define %%reg1 m1
+%define %%reg3 m3
+%endif
+%if %1 == 8
+%if cpuflag(avx2) && (%0 == 5)
+%if %2 > 16
+ vperm2i128 m10, m0, m1, q0301
+%endif
+ vinserti128 m0, m0, xm1, 1
+ mova m1, m10
+%if %2 > 16
+ vperm2i128 m10, m2, m3, q0301
+%endif
+ vinserti128 m2, m2, xm3, 1
+ mova m3, m10
+%endif
+ pmaddubsw %%reg0, %3 ;x1*c1+x2*c2
+ pmaddubsw %%reg2, %4 ;x3*c3+x4*c4
+ paddw %%reg0, %%reg2
+%if %2 > 8
+ pmaddubsw %%reg1, %3
+ pmaddubsw %%reg3, %4
+ paddw %%reg1, %%reg3
+%endif
+%else
+ pmaddwd %%reg0, %3
+ pmaddwd %%reg2, %4
+ paddd %%reg0, %%reg2
+%if %2 > 4
+ pmaddwd %%reg1, %3
+ pmaddwd %%reg3, %4
+ paddd %%reg1, %%reg3
+%if %1 != 8
+ psrad %%reg1, %1-8
+%endif
+%endif
+%if %1 != 8
+ psrad %%reg0, %1-8
+%endif
+ packssdw %%reg0, %%reg1
+%endif
+%endmacro
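+; EPEL_COMPUTE applies the 4-tap filter: two multiply-accumulates (taps 1-2 and
+; taps 3-4) followed by an add.  For word input the 32-bit sums are shifted
+; right by (bit depth - 8) and packed back to signed words.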
- punpcklwd m13, m6, m7
- pmaddwd m13, m1
- paddd m12, m13
+%macro QPEL_HV_COMPUTE 4 ; width, bitdepth, filter idx
- punpcklwd m13, m8, m9
- pmaddwd m13, m2
- paddd m12, m13
+%if cpuflag(avx2)
+%assign %%offset 32
+%define %%table hevc_qpel_filters_avx2_%2
+%else
+%assign %%offset 16
+%define %%table hevc_qpel_filters_sse4_%2
+%endif
- punpcklwd m13, m10, m11
- pmaddwd m13, m3
- paddd m12, m13
- psrad m12, %2
+%ifdef PIC
+ lea rfilterq, [%%table]
+%else
+ %define rfilterq %%table
+%endif
- %if block_truncated == 0
- punpckhwd m4, m5
- pmaddwd m4, m0
+%if %2 == 8
+ pmaddubsw m0, [rfilterq + %3q*8 ] ;x1*c1+x2*c2
+ pmaddubsw m2, [rfilterq + %3q*8+%%offset] ;x3*c3+x4*c4
+ pmaddubsw m4, [rfilterq + %3q*8+2*%%offset] ;x5*c5+x6*c6
+ pmaddubsw m6, [rfilterq + %3q*8+3*%%offset] ;x7*c7+x8*c8
+ paddw m0, m2
+ paddw m4, m6
+ paddw m0, m4
+%else
+ pmaddwd m0, [rfilterq + %3q*8 ]
+ pmaddwd m2, [rfilterq + %3q*8+%%offset]
+ pmaddwd m4, [rfilterq + %3q*8+2*%%offset]
+ pmaddwd m6, [rfilterq + %3q*8+3*%%offset]
+ paddd m0, m2
+ paddd m4, m6
+ paddd m0, m4
+%if %2 != 8
+ psrad m0, %2-8
+%endif
+%if %1 > 4
+ pmaddwd m1, [rfilterq + %3q*8 ]
+ pmaddwd m3, [rfilterq + %3q*8+%%offset]
+ pmaddwd m5, [rfilterq + %3q*8+2*%%offset]
+ pmaddwd m7, [rfilterq + %3q*8+3*%%offset]
+ paddd m1, m3
+ paddd m5, m7
+ paddd m1, m5
+%if %2 != 8
+ psrad m1, %2-8
+%endif
+%endif
+ p%4 m0, m1
+%endif
+%endmacro
- punpckhwd m6, m7
- pmaddwd m6, m1
- paddd m4, m6
+%macro QPEL_COMPUTE 2-3 ; width, bitdepth
+%if %2 == 8
+%if cpuflag(avx2) && (%0 == 3)
- punpckhwd m8, m9
- pmaddwd m8, m2
- paddd m4, m8
+ vperm2i128 m10, m0, m1, q0301
+ vinserti128 m0, m0, xm1, 1
+ SWAP 1, 10
- punpckhwd m10, m11
- pmaddwd m10, m3
- paddd m4, m10
+ vperm2i128 m10, m2, m3, q0301
+ vinserti128 m2, m2, xm3, 1
+ SWAP 3, 10
- psrad m4, %2
- %endif
- packssdw m12, m4
- STORE [dstq + i * 2 * blocksize], m12
-%assign i (i + 1)
-%endrep
+ vperm2i128 m10, m4, m5, q0301
+ vinserti128 m4, m4, xm5, 1
+ SWAP 5, 10
- add dstq, dststrideq
- add srcq, srcstrideq
-%if %3
- add srcm3q, srcstrideq
+ vperm2i128 m10, m6, m7, q0301
+ vinserti128 m6, m6, xm7, 1
+ SWAP 7, 10
%endif
- dec heightd
- jg .loop
- RET
+ pmaddubsw m0, m12 ;x1*c1+x2*c2
+ pmaddubsw m2, m13 ;x3*c3+x4*c4
+ pmaddubsw m4, m14 ;x5*c5+x6*c6
+ pmaddubsw m6, m15 ;x7*c7+x8*c8
+ paddw m0, m2
+ paddw m4, m6
+ paddw m0, m4
+%if %1 > 8
+ pmaddubsw m1, m12
+ pmaddubsw m3, m13
+ pmaddubsw m5, m14
+ pmaddubsw m7, m15
+ paddw m1, m3
+ paddw m5, m7
+ paddw m1, m5
+%endif
+%else
+ pmaddwd m0, m12
+ pmaddwd m2, m13
+ pmaddwd m4, m14
+ pmaddwd m6, m15
+ paddd m0, m2
+ paddd m4, m6
+ paddd m0, m4
+%if %2 != 8
+ psrad m0, %2-8
+%endif
+%if %1 > 4
+ pmaddwd m1, m12
+ pmaddwd m3, m13
+ pmaddwd m5, m14
+ pmaddwd m7, m15
+ paddd m1, m3
+ paddd m5, m7
+ paddd m1, m5
+%if %2 != 8
+ psrad m1, %2-8
+%endif
+%endif
+%endif
%endmacro
-%if ARCH_X86_64
-
-%macro QPEL_H_10 1
-cglobal hevc_qpel_h_ %+ %1 %+ _10, 7, 9, 14, dst, dststride, src, srcstride, height, mx, my, mcbuffer, coeffsreg
-QPEL_16 %1, 2, 0
-%endmacro
-
-INIT_XMM avx
-QPEL_H_10 4
-QPEL_H_10 8
-QPEL_H_10 12
-QPEL_H_10 16
-QPEL_H_10 24
-QPEL_H_10 32
-QPEL_H_10 48
-QPEL_H_10 64
-
-%macro QPEL_V_10 1
-cglobal hevc_qpel_v_ %+ %1 %+ _10, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
-QPEL_16 %1, 2, 1
-%endmacro
-
-INIT_XMM avx
-QPEL_V_10 4
-QPEL_V_10 8
-QPEL_V_10 12
-QPEL_V_10 16
-QPEL_V_10 24
-QPEL_V_10 32
-QPEL_V_10 48
-QPEL_V_10 64
-
-; hevc_qpel_hv_<w>(int16_t *dst, ptrdiff_t dststride,
-; uint8_t *src, ptrdiff_t srcstride,
-; int height, int mx, int my, int *mcbuffer)
-
-%macro QPEL_HV 1
-cglobal hevc_qpel_hv_ %+ %1, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
-QPEL_16 %1, 6, 1
-%endmacro
-
-INIT_XMM avx
-QPEL_HV 4
-QPEL_HV 8
-QPEL_HV 12
-QPEL_HV 16
-QPEL_HV 24
-QPEL_HV 32
-QPEL_HV 48
-QPEL_HV 64
-
-%endif ; ARCH_X86_64
-
-; hevc_epel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
-; uint8_t *src, ptrdiff_t srcstride,
-; int height, int mx, int my, int *mcbuffer)
+%macro BI_COMPUTE 7-8 ; width, bitd, src1l, src1h, src2l, src2h, pw
+ paddsw %3, %5
+%if %1 > 8
+ paddsw %4, %6
+%endif
+ UNI_COMPUTE %1, %2, %3, %4, %7
+%if %0 == 8 && cpuflag(avx2) && (%2 == 8)
+ vpermq %3, %3, 216
+ vpermq %4, %4, 216
+%endif
+%endmacro
-; 8-bit epel interpolation
-; %1: block width
-; %2: 0 - horizontal; 1 - vertical
-%macro EPEL_8 2
-%if %2
- %define postfix v
- %define mvfrac myq
- %define coeffsaddr r5q
- %define pixstride srcstrideq
- %define pixstride3 r5q
+%macro UNI_COMPUTE 5
+ pmulhrsw %3, %5
+%if %1 > 8 || (%2 > 8 && %1 > 4)
+ pmulhrsw %4, %5
+%endif
+%if %2 == 8
+ packuswb %3, %4
%else
- %define postfix h
- %define mvfrac mxq
- %define coeffsaddr r6q
- %define pixstride 1
- %define pixstride3 3
+ CLIPW %3, [pb_0], [max_pixels_%2]
+%if (%1 > 8 && notcpuflag(avx)) || %1 > 16
+ CLIPW %4, [pb_0], [max_pixels_%2]
+%endif
%endif
+%endmacro
+
- COMMON_DEFS %1, 8
+; ******************************
+; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
+; uint8_t *_src, ptrdiff_t _srcstride,
+; int height, int mx, int my)
+; ******************************
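+; Three variants are generated: the plain put stores the source shifted up to
+; 14-bit precision into the int16 destination, the uni variant copies straight
+; to the output picture, and the bi variant adds the int16 src2 block and
+; scales/rounds with pmulhrsw before storing pixels.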
-cglobal hevc_epel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 6, dst, dststride, src, srcstride, height, mx, my
- and mvfrac, 0x7
- dec mvfrac
- shl mvfrac, 4
- lea coeffsaddr, [hevc_epel_coeffs8]
- movq m0, [coeffsaddr + mvfrac]
+%macro HEVC_PUT_HEVC_PEL_PIXELS 2
+HEVC_PEL_PIXELS %1, %2
+HEVC_UNI_PEL_PIXELS %1, %2
+HEVC_BI_PEL_PIXELS %1, %2
+%endmacro
- SPLATW m1, m0, 1
- SPLATW m0, m0, 0
+%macro HEVC_PEL_PIXELS 2
+cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
+ pxor m2, m2
+.loop:
+ SIMPLE_LOAD %1, %2, srcq, m0
+ MC_PIXEL_COMPUTE %1, %2, 1
+ PEL_10STORE%1 dstq, m0, m1
+ LOOP_END dst, src, srcstride
+ RET
+%endmacro
-%if %2
- lea pixstride3, [srcstrideq + 2 * srcstrideq]
-%endif
- sub srcq, pixstride
+%macro HEVC_UNI_PEL_PIXELS 2
+cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
+.loop:
+ SIMPLE_LOAD %1, %2, srcq, m0
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
+%endmacro
+%macro HEVC_BI_PEL_PIXELS 2
+cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
+ pxor m2, m2
+ movdqa m5, [pw_bi_%2]
.loop:
+ SIMPLE_LOAD %1, %2, srcq, m0
+ SIMPLE_BILOAD %1, src2q, m3, m4
+ MC_PIXEL_COMPUTE %1, %2, 1
+ BI_COMPUTE %1, %2, m0, m1, m3, m4, m5, 1
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += MAX_PB_SIZE int16 elements
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
+%endmacro
-%assign i 0
-%rep nb_blocks
- BLOCK_DEFS i
+; ******************************
+; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
+; uint8_t *_src, ptrdiff_t _srcstride,
+; int height, int mx, int my, int width);
+; ******************************
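+; 4-tap horizontal (chroma) filter: mx selects one of the seven fractional
+; filters, EPEL_FILTER splats taps 1-2/3-4 into two registers, and EPEL_LOAD
+; interleaves neighbouring samples so each pmaddubsw/pmaddwd handles one pair.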
- LOAD m2, [srcq + i * blocksize + 0 * pixstride]
- LOAD m3, [srcq + i * blocksize + 1 * pixstride]
- LOAD m4, [srcq + i * blocksize + 2 * pixstride]
- LOAD m5, [srcq + i * blocksize + pixstride3]
- punpcklbw m2, m3
- punpcklbw m4, m5
+%macro HEVC_PUT_HEVC_EPEL 2
+%if cpuflag(avx2)
+%define XMM_REGS 11
+%else
+%define XMM_REGS 8
+%endif
+
+cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, XMM_REGS, dst, src, srcstride, height, mx, rfilter
+%assign %%stride ((%2 + 7)/8)
+ EPEL_FILTER %2, mx, m4, m5, rfilter
+.loop:
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m4, m5, 1
+ PEL_10STORE%1 dstq, m0, m1
+ LOOP_END dst, src, srcstride
+ RET
- pmaddubsw m2, m0
- pmaddubsw m4, m1
+cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, mx, rfilter
+%assign %%stride ((%2 + 7)/8)
+ movdqa m6, [pw_%2]
+ EPEL_FILTER %2, mx, m4, m5, rfilter
+.loop:
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m4, m5
+ UNI_COMPUTE %1, %2, m0, m1, m6
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
- paddsw m2, m4
+cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter
+ movdqa m6, [pw_bi_%2]
+ EPEL_FILTER %2, mx, m4, m5, rfilter
+.loop:
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m4, m5, 1
+ SIMPLE_BILOAD %1, src2q, m2, m3
+ BI_COMPUTE %1, %2, m0, m1, m2, m3, m6, 1
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += MAX_PB_SIZE int16 elements
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
- STORE [dstq + i * 2 * blocksize], m2
+; ******************************
+; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
+; uint8_t *_src, ptrdiff_t _srcstride,
+; int height, int mx, int my, int width)
+; ******************************
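+; Same 4-tap filter applied vertically: src is rewound by one line and r3src
+; caches 3*srcstride, so the taps cover rows -1..+2 around each sample.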
+
+cglobal hevc_put_hevc_epel_v%1_%2, 4, 6, XMM_REGS, dst, src, srcstride, height, r3src, my
+ movifnidn myd, mym
+ sub srcq, srcstrideq
+ EPEL_FILTER %2, my, m4, m5, r3src
+ lea r3srcq, [srcstrideq*3]
+.loop:
+ EPEL_LOAD %2, srcq, srcstride, %1
+ EPEL_COMPUTE %2, %1, m4, m5, 1
+ PEL_10STORE%1 dstq, m0, m1
+ LOOP_END dst, src, srcstride
+ RET
-%assign i (i + 1)
-%endrep
+cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, my
+ movifnidn myd, mym
+ movdqa m6, [pw_%2]
+ sub srcq, srcstrideq
+ EPEL_FILTER %2, my, m4, m5, r3src
+ lea r3srcq, [srcstrideq*3]
+.loop:
+ EPEL_LOAD %2, srcq, srcstride, %1
+ EPEL_COMPUTE %2, %1, m4, m5
+ UNI_COMPUTE %1, %2, m0, m1, m6
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd
- jg .loop
+cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my
+ movifnidn myd, mym
+ movdqa m6, [pw_bi_%2]
+ sub srcq, srcstrideq
+ EPEL_FILTER %2, my, m4, m5, r3src
+ lea r3srcq, [srcstrideq*3]
+.loop:
+ EPEL_LOAD %2, srcq, srcstride, %1
+ EPEL_COMPUTE %2, %1, m4, m5, 1
+ SIMPLE_BILOAD %1, src2q, m2, m3
+ BI_COMPUTE %1, %2, m0, m1, m2, m3, m6, 1
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += MAX_PB_SIZE int16 elements
+ dec heightd ; cmp height
+ jnz .loop ; height loop
RET
%endmacro
-INIT_XMM ssse3
-EPEL_8 4, 0
-EPEL_8 8, 0
-EPEL_8 12, 0
-EPEL_8 16, 0
-EPEL_8 24, 0
-EPEL_8 32, 0
-
-EPEL_8 4, 1
-EPEL_8 8, 1
-EPEL_8 12, 1
-EPEL_8 16, 1
-EPEL_8 24, 1
-EPEL_8 32, 1
-%macro EPEL_16 3
-%if %3
- %define mvfrac myq
- %define pixstride srcstrideq
- %define pixstride3 sstride3q
+; ******************************
+; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
+; uint8_t *_src, ptrdiff_t _srcstride,
+; int height, int mx, int my, int width)
+; ******************************
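+; Separable 4-tap filter: each source row is first filtered horizontally with
+; m14/m15 into a 14-bit intermediate.  Three rows are primed before the loop
+; (m4..m6, plus m8..m10 for the high halves of wide 8-bit blocks); every
+; iteration then filters one new row and runs the vertical pass at depth 14
+; with m12/m13 before sliding the row window down by one register.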
+
+%macro HEVC_PUT_HEVC_EPEL_HV 2
+cglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16 , dst, src, srcstride, height, mx, my, r3src
+%assign %%stride ((%2 + 7)/8)
+ sub srcq, srcstrideq
+ EPEL_HV_FILTER %2
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m8, m1
+%endif
+ SWAP m4, m0
+ add srcq, srcstrideq
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m9, m1
+%endif
+ SWAP m5, m0
+ add srcq, srcstrideq
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m10, m1
+%endif
+ SWAP m6, m0
+ add srcq, srcstrideq
+.loop:
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m11, m1
+%endif
+ SWAP m7, m0
+ punpcklwd m0, m4, m5
+ punpcklwd m2, m6, m7
+%if %1 > 4
+ punpckhwd m1, m4, m5
+ punpckhwd m3, m6, m7
+%endif
+ EPEL_COMPUTE 14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+ punpcklwd m4, m8, m9
+ punpcklwd m2, m10, m11
+ punpckhwd m8, m8, m9
+ punpckhwd m3, m10, m11
+ EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
+%if cpuflag(avx2)
+ vinserti128 m2, m0, xm4, 1
+ vperm2i128 m3, m0, m4, q0301
+ PEL_10STORE%1 dstq, m2, m3
%else
- %define mvfrac mxq
- %define pixstride 2
- %define pixstride3 6
+ PEL_10STORE%1 dstq, m0, m4
%endif
+%else
+ PEL_10STORE%1 dstq, m0, m1
+%endif
+ movdqa m4, m5
+ movdqa m5, m6
+ movdqa m6, m7
+%if (%1 > 8 && (%2 == 8))
+ mova m8, m9
+ mova m9, m10
+ mova m10, m11
+%endif
+ LOOP_END dst, src, srcstride
+ RET
- COMMON_DEFS %1, 16
+cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, height, mx, my, r3src
+%assign %%stride ((%2 + 7)/8)
+ sub srcq, srcstrideq
+ EPEL_HV_FILTER %2
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m8, m1
+%endif
+ SWAP m4, m0
+ add srcq, srcstrideq
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m9, m1
+%endif
+ SWAP m5, m0
+ add srcq, srcstrideq
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m10, m1
+%endif
+ SWAP m6, m0
+ add srcq, srcstrideq
+.loop:
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m11, m1
+%endif
+ mova m7, m0
+ punpcklwd m0, m4, m5
+ punpcklwd m2, m6, m7
+%if %1 > 4
+ punpckhwd m1, m4, m5
+ punpckhwd m3, m6, m7
+%endif
+ EPEL_COMPUTE 14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+ punpcklwd m4, m8, m9
+ punpcklwd m2, m10, m11
+ punpckhwd m8, m8, m9
+ punpckhwd m3, m10, m11
+ EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
+ UNI_COMPUTE %1, %2, m0, m4, [pw_%2]
+%else
+ UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
+%endif
+ PEL_%2STORE%1 dstq, m0, m1
+ mova m4, m5
+ mova m5, m6
+ mova m6, m7
+%if (%1 > 8 && (%2 == 8))
+ mova m8, m9
+ mova m9, m10
+ mova m10, m11
+%endif
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
- and mvfrac, 0x7
- dec mvfrac
- shl mvfrac, 5
- lea coeffsregq, [hevc_epel_coeffs]
- mova m0, [coeffsregq + mvfrac]
+cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src
+%assign %%stride ((%2 + 7)/8)
+ sub srcq, srcstrideq
+ EPEL_HV_FILTER %2
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m8, m1
+%endif
+ SWAP m4, m0
+ add srcq, srcstrideq
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m9, m1
+%endif
+ SWAP m5, m0
+ add srcq, srcstrideq
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m10, m1
+%endif
+ SWAP m6, m0
+ add srcq, srcstrideq
+.loop:
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m11, m1
+%endif
+ SWAP m7, m0
+ punpcklwd m0, m4, m5
+ punpcklwd m2, m6, m7
+%if %1 > 4
+ punpckhwd m1, m4, m5
+ punpckhwd m3, m6, m7
+%endif
+ EPEL_COMPUTE 14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+ punpcklwd m4, m8, m9
+ punpcklwd m2, m10, m11
+ punpckhwd m8, m8, m9
+ punpckhwd m3, m10, m11
+ EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
+ SIMPLE_BILOAD %1, src2q, m8, m3
+%if cpuflag(avx2)
+ vinserti128 m1, m8, xm3, 1
+ vperm2i128 m2, m8, m3, q0301
+ BI_COMPUTE %1, %2, m0, m4, m1, m2, [pw_bi_%2]
+%else
+ BI_COMPUTE %1, %2, m0, m4, m8, m3, [pw_bi_%2]
+%endif
+%else
+ SIMPLE_BILOAD %1, src2q, m8, m9
+ BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
+%endif
+ PEL_%2STORE%1 dstq, m0, m4
+ mova m4, m5
+ mova m5, m6
+ mova m6, m7
+%if (%1 > 8 && (%2 == 8))
+ mova m8, m9
+ mova m9, m10
+ mova m10, m11
+%endif
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += MAX_PB_SIZE int16 elements
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
+%endmacro
- pshufd m1, m0, 0x55
- pshufd m0, m0, 0x00
+; ******************************
+; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
+; uint8_t *_src, ptrdiff_t _srcstride,
+; int height, int mx, int my, int width)
+; ******************************
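+; 8-tap horizontal (luma) filter: QPEL_FILTER loads the four coefficient pairs
+; for the selected mx into m12..m15, QPEL_H_LOAD gathers samples -3..+4 around
+; each position, and QPEL_COMPUTE sums the four multiply-accumulates; word
+; input is narrowed with packssdw before the store.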
-%if %3
- lea sstride3q, [srcstrideq + 2 * srcstrideq]
+%macro HEVC_PUT_HEVC_QPEL 2
+cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter
+ QPEL_FILTER %2, mx
+.loop:
+ QPEL_H_LOAD %2, srcq, %1, 10
+ QPEL_COMPUTE %1, %2, 1
+%if %2 > 8
+ packssdw m0, m1
%endif
- sub srcq, pixstride
+ PEL_10STORE%1 dstq, m0, m1
+ LOOP_END dst, src, srcstride
+ RET
+cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter
+ mova m9, [pw_%2]
+ QPEL_FILTER %2, mx
.loop:
-
-%assign i 0
-%rep nb_blocks
-
- BLOCK_DEFS i
-
- LOAD m2, [srcq + i * 2 * blocksize + 0 * pixstride]
- LOAD m3, [srcq + i * 2 * blocksize + 1 * pixstride]
- LOAD m4, [srcq + i * 2 * blocksize + 2 * pixstride]
- LOAD m5, [srcq + i * 2 * blocksize + pixstride3]
-
- punpcklwd m6, m2, m3
- punpcklwd m7, m4, m5
- pmaddwd m6, m0
- pmaddwd m7, m1
- paddd m6, m7
- psrad m6, %2
-
- %if block_truncated == 0
- punpckhwd m2, m3
- punpckhwd m4, m5
- pmaddwd m2, m0
- pmaddwd m4, m1
- paddd m2, m4
- psrad m2, %2
- %endif
- packssdw m6, m2
- STORE [dstq + i * 2 * blocksize], m6
-
-%assign i (i + 1)
-%endrep
-
- add dstq, dststrideq
- add srcq, srcstrideq
-
- dec heightd
- jg .loop
+ QPEL_H_LOAD %2, srcq, %1, 10
+ QPEL_COMPUTE %1, %2
+%if %2 > 8
+ packssdw m0, m1
+%endif
+ UNI_COMPUTE %1, %2, m0, m1, m9
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
RET
-%endmacro
-%if ARCH_X86_64
-
-%macro EPEL_H_10 1
-cglobal hevc_epel_h_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
-EPEL_16 %1, 2, 0
-%endmacro
+cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter
+ movdqa m9, [pw_bi_%2]
+ QPEL_FILTER %2, mx
+.loop:
+ QPEL_H_LOAD %2, srcq, %1, 10
+ QPEL_COMPUTE %1, %2, 1
+%if %2 > 8
+ packssdw m0, m1
+%endif
+ SIMPLE_BILOAD %1, src2q, m10, m11
+ BI_COMPUTE %1, %2, m0, m1, m10, m11, m9, 1
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += MAX_PB_SIZE int16 elements
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
-INIT_XMM avx
-EPEL_H_10 4
-EPEL_H_10 8
-EPEL_H_10 12
-EPEL_H_10 16
-EPEL_H_10 24
-EPEL_H_10 32
-%macro EPEL_V_10 1
-cglobal hevc_epel_v_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
-EPEL_16 %1, 2, 1
-%endmacro
+; ******************************
+; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
+; uint8_t *_src, ptrdiff_t _srcstride,
+; int height, int mx, int my, int width)
+; ******************************
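+; Vertical 8-tap filter: r3src holds 3*srcstride and QPEL_V_LOAD reads rows
+; -3..+4, interleaving adjacent rows so QPEL_COMPUTE can reuse the same
+; coefficient layout as the horizontal case.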
-INIT_XMM avx
-EPEL_V_10 4
-EPEL_V_10 8
-EPEL_V_10 12
-EPEL_V_10 16
-EPEL_V_10 24
-EPEL_V_10 32
+cglobal hevc_put_hevc_qpel_v%1_%2, 4, 8, 16, dst, src, srcstride, height, r3src, my, rfilter
+ movifnidn myd, mym
+ lea r3srcq, [srcstrideq*3]
+ QPEL_FILTER %2, my
+.loop:
+ QPEL_V_LOAD %2, srcq, srcstride, %1, r7
+ QPEL_COMPUTE %1, %2, 1
+%if %2 > 8
+ packssdw m0, m1
+%endif
+ PEL_10STORE%1 dstq, m0, m1
+ LOOP_END dst, src, srcstride
+ RET
-; hevc_epel_hv_<w>_8(int16_t *dst, ptrdiff_t dststride,
-; int16_t *src, ptrdiff_t srcstride,
-; int height, int mx, int my, int *mcbuffer)
+cglobal hevc_put_hevc_uni_qpel_v%1_%2, 5, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter
+ movifnidn myd, mym
+ movdqa m9, [pw_%2]
+ lea r3srcq, [srcstrideq*3]
+ QPEL_FILTER %2, my
+.loop:
+ QPEL_V_LOAD %2, srcq, srcstride, %1, r8
+ QPEL_COMPUTE %1, %2
+%if %2 > 8
+ packssdw m0, m1
+%endif
+ UNI_COMPUTE %1, %2, m0, m1, m9
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
-%macro EPEL_HV 1
-cglobal hevc_epel_hv_ %+ %1, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
-EPEL_16 %1, 6, 1
+cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
+ movifnidn myd, mym
+ movdqa m9, [pw_bi_%2]
+ lea r3srcq, [srcstrideq*3]
+ QPEL_FILTER %2, my
+.loop:
+ QPEL_V_LOAD %2, srcq, srcstride, %1, r9
+ QPEL_COMPUTE %1, %2, 1
+%if %2 > 8
+ packssdw m0, m1
+%endif
+ SIMPLE_BILOAD %1, src2q, m10, m11
+ BI_COMPUTE %1, %2, m0, m1, m10, m11, m9, 1
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += MAX_PB_SIZE int16 elements
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
%endmacro
-INIT_XMM avx
-EPEL_HV 4
-EPEL_HV 8
-EPEL_HV 12
-EPEL_HV 16
-EPEL_HV 24
-EPEL_HV 32
-%endif ; ARCH_X86_64
+; ******************************
+; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
+; uint8_t *_src, ptrdiff_t _srcstride,
+; int height, int mx, int my)
+; ******************************
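+; Separable 8-tap filter: seven rows are filtered horizontally (mx filter) to
+; prime m8..m14, then each iteration filters one more row into m15, interleaves
+; the eight row results and applies the vertical 8-tap pass at depth 14 (my
+; filter), finally shifting the row window down by one register.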
+%macro HEVC_PUT_HEVC_QPEL_HV 2
+cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
+%if cpuflag(avx2)
+%assign %%shift 4
+%else
+%assign %%shift 3
+%endif
+ sub mxq, 1
+ sub myq, 1
+ shl mxq, %%shift ; multiply by 8 (16 for avx2)
+ shl myq, %%shift ; multiply by 8 (16 for avx2)
+ lea r3srcq, [srcstrideq*3]
+ sub srcq, r3srcq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m8, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m9, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m10, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m11, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m12, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m13, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m14, m0
+ add srcq, srcstrideq
+.loop:
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m15, m0
+ punpcklwd m0, m8, m9
+ punpcklwd m2, m10, m11
+ punpcklwd m4, m12, m13
+ punpcklwd m6, m14, m15
+%if %1 > 4
+ punpckhwd m1, m8, m9
+ punpckhwd m3, m10, m11
+ punpckhwd m5, m12, m13
+ punpckhwd m7, m14, m15
+%endif
+ QPEL_HV_COMPUTE %1, 14, my, ackssdw
+ PEL_10STORE%1 dstq, m0, m1
+%if %1 <= 4
+ movq m8, m9
+ movq m9, m10
+ movq m10, m11
+ movq m11, m12
+ movq m12, m13
+ movq m13, m14
+ movq m14, m15
+%else
+ movdqa m8, m9
+ movdqa m9, m10
+ movdqa m10, m11
+ movdqa m11, m12
+ movdqa m12, m13
+ movdqa m13, m14
+ movdqa m14, m15
+%endif
+ LOOP_END dst, src, srcstride
+ RET
-; hevc_put_unweighted_pred_<w>_<d>(pixel *dst, ptrdiff_t dststride,
-; int16_t *src, ptrdiff_t srcstride,
-; int height)
+cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
+%if cpuflag(avx2)
+%assign %%shift 4
+%else
+%assign %%shift 3
+%endif
+ sub mxq, 1
+ sub myq, 1
+ shl mxq, %%shift ; multiply by 8 (16 for avx2)
+ shl myq, %%shift ; multiply by 8 (16 for avx2)
+ lea r3srcq, [srcstrideq*3]
+ sub srcq, r3srcq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m8, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m9, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m10, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m11, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m12, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m13, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m14, m0
+ add srcq, srcstrideq
+.loop:
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m15, m0
+ punpcklwd m0, m8, m9
+ punpcklwd m2, m10, m11
+ punpcklwd m4, m12, m13
+ punpcklwd m6, m14, m15
+%if %1 > 4
+ punpckhwd m1, m8, m9
+ punpckhwd m3, m10, m11
+ punpckhwd m5, m12, m13
+ punpckhwd m7, m14, m15
+%endif
+ QPEL_HV_COMPUTE %1, 14, my, ackusdw
+ UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
+ PEL_%2STORE%1 dstq, m0, m1
+
+%if %1 <= 4
+ movq m8, m9
+ movq m9, m10
+ movq m10, m11
+ movq m11, m12
+ movq m12, m13
+ movq m13, m14
+ movq m14, m15
+%else
+ mova m8, m9
+ mova m9, m10
+ mova m10, m11
+ mova m11, m12
+ mova m12, m13
+ mova m13, m14
+ mova m14, m15
+%endif
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
-%macro AVG 5
- %if %3
- %if %4 == 4
- movq %5, %2
- paddsw %1, %5
- %else
- paddsw %1, %2
- %endif
- %endif
+cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
+%if cpuflag(avx2)
+%assign %%shift 4
+%else
+%assign %%shift 3
+%endif
+ sub mxq, 1
+ sub myq, 1
+ shl mxq, %%shift ; multiply by 8 (16 for avx2)
+ shl myq, %%shift ; multiply by 8 (16 for avx2)
+ lea r3srcq, [srcstrideq*3]
+ sub srcq, r3srcq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m8, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m9, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m10, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m11, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m12, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m13, m0
+ add srcq, srcstrideq
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m14, m0
+ add srcq, srcstrideq
+.loop:
+ QPEL_H_LOAD %2, srcq, %1, 15
+ QPEL_HV_COMPUTE %1, %2, mx, ackssdw
+ SWAP m15, m0
+ punpcklwd m0, m8, m9
+ punpcklwd m2, m10, m11
+ punpcklwd m4, m12, m13
+ punpcklwd m6, m14, m15
+%if %1 > 4
+ punpckhwd m1, m8, m9
+ punpckhwd m3, m10, m11
+ punpckhwd m5, m12, m13
+ punpckhwd m7, m14, m15
+%endif
+ QPEL_HV_COMPUTE %1, 14, my, ackssdw
+ SIMPLE_BILOAD %1, src2q, m8, m9 ;m9 not used in this case
+ BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
+ PEL_%2STORE%1 dstq, m0, m1
+
+%if %1 <= 4
+ movq m8, m9
+ movq m9, m10
+ movq m10, m11
+ movq m11, m12
+ movq m12, m13
+ movq m13, m14
+ movq m14, m15
+%else
+ movdqa m8, m9
+ movdqa m9, m10
+ movdqa m10, m11
+ movdqa m11, m12
+ movdqa m12, m13
+ movdqa m13, m14
+ movdqa m14, m15
+%endif
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += MAX_PB_SIZE int16 elements
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
%endmacro
-; %1: 0 - one source; 1 - two sources
-; %2: width
-; %3: bit depth
-%macro PUT_PRED 3
-%if %1
-cglobal hevc_put_unweighted_pred_avg_ %+ %2 %+ _ %+ %3, 6, 6, 4, dst, dststride, src, src2, srcstride, height
+%macro WEIGHTING_FUNCS 2
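+; Explicit weighted prediction.  uni_w computes
+;   dst = clip(((src * wx + (1 << (shift - 1))) >> shift) + (ox << (bitd - 8)))
+; with shift = denom + 14 - bitd; bi_w combines two int16 sources with their
+; own weights and a joint rounding offset before the final (shift + 1) shift.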
+%if WIN64 || ARCH_X86_32
+cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, height, denom, wx, ox
+ mov r4d, denomm
+%define SHIFT r4d
%else
-cglobal hevc_put_unweighted_pred_ %+ %2 %+ _ %+ %3, 5, 5, 4, dst, dststride, src, srcstride, height
+cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, height, denom, wx, ox
+%define SHIFT denomd
%endif
+ lea SHIFT, [SHIFT+14-%2] ; shift = 14 - bitd + denom
+%if %1 <= 4
+ pxor m1, m1
+%endif
+ movd m2, wxm ; WX
+ movd m4, SHIFT ; shift
+%if %1 <= 4
+ punpcklwd m2, m1
+%else
+ punpcklwd m2, m2
+%endif
+ dec SHIFT
+ movdqu m5, [pd_1]
+ movd m6, SHIFT
+ pshufd m2, m2, 0
+ mov SHIFT, oxm
+ pslld m5, m6
+%if %2 != 8
+ shl SHIFT, %2-8 ; ox << (bitd - 8)
+%endif
+ movd m3, SHIFT ; OX
+ pshufd m3, m3, 0
+%if WIN64 || ARCH_X86_32
+ mov SHIFT, heightm
+%endif
+.loop:
+ SIMPLE_LOAD %1, 10, srcq, m0
+%if %1 <= 4
+ punpcklwd m0, m1
+ pmaddwd m0, m2
+ paddd m0, m5
+ psrad m0, m4
+ paddd m0, m3
+%else
+ pmulhw m6, m0, m2
+ pmullw m0, m2
+ punpckhwd m1, m0, m6
+ punpcklwd m0, m6
+ paddd m0, m5
+ paddd m1, m5
+ psrad m0, m4
+ psrad m1, m4
+ paddd m0, m3
+ paddd m1, m3
+%endif
+ packssdw m0, m1
+%if %2 == 8
+ packuswb m0, m0
+%else
+ CLIPW m0, [pb_0], [max_pixels_%2]
+%endif
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, 2*MAX_PB_SIZE ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
-%assign shift 14 + %1 - %3
-%assign offset (1 << (shift - 1))
-%define offset_data pw_ %+ offset
-
- mova m0, [offset_data]
-
-%if %3 > 8
- %define STORE_BLOCK movu
- %define STORE_HALF movq
-
- %assign pixel_max ((1 << %3) - 1)
- %define pw_pixel_max pw_ %+ pixel_max
- pxor m1, m1
- mova m2, [pw_pixel_max]
+cglobal hevc_put_hevc_bi_w%1_%2, 4, 6, 10, dst, dststride, src, src2, height, denom, wx0, wx1, ox0, ox1
+ movifnidn r5d, denomm
+%if %1 <= 4
+ pxor m1, m1
+%endif
+ movd m2, wx0m ; WX0
+ lea r5d, [r5d+14-%2] ; shift = 14 - bitd + denom
+ movd m3, wx1m ; WX1
+ movd m0, r5d ; shift
+%if %1 <= 4
+ punpcklwd m2, m1
+ punpcklwd m3, m1
+%else
+ punpcklwd m2, m2
+ punpcklwd m3, m3
+%endif
+ inc r5d
+ movd m5, r5d ; shift+1
+ pshufd m2, m2, 0
+ mov r5d, ox0m
+ pshufd m3, m3, 0
+ add r5d, ox1m
+%if %2 != 8
+ shl r5d, %2-8 ; ox << (bitd - 8)
+%endif
+ inc r5d
+ movd m4, r5d ; offset
+ pshufd m4, m4, 0
+%if UNIX64
+%define h heightd
%else
- %define STORE_BLOCK movq
- %define STORE_HALF movd
+ mov r5d, heightm
+%define h r5d
%endif
+ pslld m4, m0
.loop:
-%assign i 0
-%rep (%2 + 7) / 8
-
- %if (i + 1) * 8 > %2
- %define LOAD movq
- %define STORE STORE_HALF
- %else
- %define LOAD mova
- %define STORE STORE_BLOCK
- %endif
-
- LOAD m3, [srcq + 16 * i]
- AVG m3, [src2q + 16 * i], %1, %3 - i * 8, m4
-
- paddsw m3, m0
- psraw m3, shift
-
- %if %3 == 8
- packuswb m3, m3
- STORE [dstq + 8 * i], m3
- %else
- CLIPW m3, m1, m2
- STORE [dstq + 16 * i], m3
- %endif
-%assign i (i + 1)
-%endrep
-
- add dstq, dststrideq
- add srcq, srcstrideq
-%if %1
- add src2q, srcstrideq
-%endif
-
- dec heightd
- jg .loop
+ SIMPLE_LOAD %1, 10, srcq, m0
+ SIMPLE_LOAD %1, 10, src2q, m8
+%if %1 <= 4
+ punpcklwd m0, m1
+ punpcklwd m8, m1
+ pmaddwd m0, m3
+ pmaddwd m8, m2
+ paddd m0, m4
+ paddd m0, m8
+ psrad m0, m5
+%else
+ pmulhw m6, m0, m3
+ pmullw m0, m3
+ pmulhw m7, m8, m2
+ pmullw m8, m2
+ punpckhwd m1, m0, m6
+ punpcklwd m0, m6
+ punpckhwd m9, m8, m7
+ punpcklwd m8, m7
+ paddd m0, m8
+ paddd m1, m9
+ paddd m0, m4
+ paddd m1, m4
+ psrad m0, m5
+ psrad m1, m5
+%endif
+ packssdw m0, m1
+%if %2 == 8
+ packuswb m0, m0
+%else
+ CLIPW m0, [pb_0], [max_pixels_%2]
+%endif
+ PEL_%2STORE%1 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, 2*MAX_PB_SIZE ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += srcstride
+ dec h ; cmp height
+ jnz .loop ; height loop
RET
%endmacro
-INIT_XMM sse2
-PUT_PRED 0, 4, 8
-PUT_PRED 1, 4, 8
-PUT_PRED 0, 8, 8
-PUT_PRED 1, 8, 8
-PUT_PRED 0, 12, 8
-PUT_PRED 1, 12, 8
-PUT_PRED 0, 16, 8
-PUT_PRED 1, 16, 8
-PUT_PRED 0, 24, 8
-PUT_PRED 1, 24, 8
-PUT_PRED 0, 32, 8
-PUT_PRED 1, 32, 8
-PUT_PRED 0, 48, 8
-PUT_PRED 1, 48, 8
-PUT_PRED 0, 64, 8
-PUT_PRED 1, 64, 8
-
-PUT_PRED 0, 4, 10
-PUT_PRED 1, 4, 10
-PUT_PRED 0, 8, 10
-PUT_PRED 1, 8, 10
-PUT_PRED 0, 12, 10
-PUT_PRED 1, 12, 10
-PUT_PRED 0, 16, 10
-PUT_PRED 1, 16, 10
-PUT_PRED 0, 24, 10
-PUT_PRED 1, 24, 10
-PUT_PRED 0, 32, 10
-PUT_PRED 1, 32, 10
-PUT_PRED 0, 48, 10
-PUT_PRED 1, 48, 10
-PUT_PRED 0, 64, 10
-PUT_PRED 1, 64, 10
-
-%macro PUT_WEIGHTED_PRED 3
-%if %1
-cglobal hevc_put_weighted_pred_avg_ %+ %2 %+ _ %+ %3, 11, 11, 8, denom, weight0, weight1, offset0, offset1, dst, dststride, src0, src1, srcstride, height
-%else
-cglobal hevc_put_weighted_pred_ %+ %2 %+ _ %+ %3, 8, 8, 8, denom, weight0, offset0, dst, dststride, src0, srcstride, height
-%endif
-
- and denomd, 0xff
- movsx weight0d, weight0w
- movsx offset0d, offset0w
-%if %1
- movsx weight1d, weight1w
- movsx offset1d, offset1w
-%endif
-
- add denomd, 14 + %1 - %3
- movd m0, denomd
+INIT_XMM sse4 ; adds ff_ and _sse4 to function name
+
+WEIGHTING_FUNCS 2, 8
+WEIGHTING_FUNCS 4, 8
+WEIGHTING_FUNCS 6, 8
+WEIGHTING_FUNCS 8, 8
+
+WEIGHTING_FUNCS 2, 10
+WEIGHTING_FUNCS 4, 10
+WEIGHTING_FUNCS 6, 10
+WEIGHTING_FUNCS 8, 10
-%if %3 > 8
- %assign pixel_max ((1 << %3) - 1)
- %define pw_pixel_max pw_ %+ pixel_max
- pxor m4, m4
- mova m5, [pw_pixel_max]
+WEIGHTING_FUNCS 2, 12
+WEIGHTING_FUNCS 4, 12
+WEIGHTING_FUNCS 6, 12
+WEIGHTING_FUNCS 8, 12
- shl offset0d, %3 - 8
-%if %1
- shl offset1d, %3 - 8
-%endif
-%endif
+HEVC_PUT_HEVC_PEL_PIXELS 2, 8
+HEVC_PUT_HEVC_PEL_PIXELS 4, 8
+HEVC_PUT_HEVC_PEL_PIXELS 6, 8
+HEVC_PUT_HEVC_PEL_PIXELS 8, 8
+HEVC_PUT_HEVC_PEL_PIXELS 12, 8
+HEVC_PUT_HEVC_PEL_PIXELS 16, 8
-%if %1
- lea offset0d, [offset0d + offset1d + 1]
-%else
- lea offset0d, [2 * offset0d + 1]
-%endif
- movd m1, offset0d
- SPLATD m1
- pslld m1, m0
- psrad m1, 1
+HEVC_PUT_HEVC_PEL_PIXELS 2, 10
+HEVC_PUT_HEVC_PEL_PIXELS 4, 10
+HEVC_PUT_HEVC_PEL_PIXELS 6, 10
+HEVC_PUT_HEVC_PEL_PIXELS 8, 10
- movd m2, weight0d
- SPLATD m2
-%if %1
- movd m3, weight1d
- SPLATD m3
-%endif
+HEVC_PUT_HEVC_PEL_PIXELS 2, 12
+HEVC_PUT_HEVC_PEL_PIXELS 4, 12
+HEVC_PUT_HEVC_PEL_PIXELS 6, 12
+HEVC_PUT_HEVC_PEL_PIXELS 8, 12
-.loop:
-%assign i 0
-%rep (%2 + 3) / 4
+HEVC_PUT_HEVC_EPEL 2, 8
+HEVC_PUT_HEVC_EPEL 4, 8
+HEVC_PUT_HEVC_EPEL 6, 8
+HEVC_PUT_HEVC_EPEL 8, 8
+HEVC_PUT_HEVC_EPEL 12, 8
+HEVC_PUT_HEVC_EPEL 16, 8
- pmovsxwd m6, [src0q + 8 * i]
- pmulld m6, m2
-%if %1
- pmovsxwd m7, [src1q + 8 * i]
- pmulld m7, m3
- paddd m6, m7
-%endif
+HEVC_PUT_HEVC_EPEL 2, 10
+HEVC_PUT_HEVC_EPEL 4, 10
+HEVC_PUT_HEVC_EPEL 6, 10
+HEVC_PUT_HEVC_EPEL 8, 10
- paddd m6, m1
- psrad m6, m0
+HEVC_PUT_HEVC_EPEL 2, 12
+HEVC_PUT_HEVC_EPEL 4, 12
+HEVC_PUT_HEVC_EPEL 6, 12
+HEVC_PUT_HEVC_EPEL 8, 12
- packssdw m6, m6
+HEVC_PUT_HEVC_EPEL_HV 2, 8
+HEVC_PUT_HEVC_EPEL_HV 4, 8
+HEVC_PUT_HEVC_EPEL_HV 6, 8
+HEVC_PUT_HEVC_EPEL_HV 8, 8
+HEVC_PUT_HEVC_EPEL_HV 16, 8
-%if %3 > 8
- CLIPW m6, m4, m5
- movq [dstq + 8 * i], m6
-%else
- packuswb m6, m6
- movd [dstq + 4 * i], m6
-%endif
+HEVC_PUT_HEVC_EPEL_HV 2, 10
+HEVC_PUT_HEVC_EPEL_HV 4, 10
+HEVC_PUT_HEVC_EPEL_HV 6, 10
+HEVC_PUT_HEVC_EPEL_HV 8, 10
-%assign i (i + 1)
-%endrep
+HEVC_PUT_HEVC_EPEL_HV 2, 12
+HEVC_PUT_HEVC_EPEL_HV 4, 12
+HEVC_PUT_HEVC_EPEL_HV 6, 12
+HEVC_PUT_HEVC_EPEL_HV 8, 12
- add dstq, dststrideq
- add src0q, srcstrideq
-%if %1
- add src1q, srcstrideq
-%endif
+HEVC_PUT_HEVC_QPEL 4, 8
+HEVC_PUT_HEVC_QPEL 8, 8
+HEVC_PUT_HEVC_QPEL 12, 8
+HEVC_PUT_HEVC_QPEL 16, 8
- dec heightd
- jg .loop
- RET
-%endmacro
+HEVC_PUT_HEVC_QPEL 4, 10
+HEVC_PUT_HEVC_QPEL 8, 10
-%if ARCH_X86_64
-INIT_XMM sse4
-PUT_WEIGHTED_PRED 0, 4, 8
-PUT_WEIGHTED_PRED 1, 4, 8
-PUT_WEIGHTED_PRED 0, 8, 8
-PUT_WEIGHTED_PRED 1, 8, 8
-PUT_WEIGHTED_PRED 0, 12, 8
-PUT_WEIGHTED_PRED 1, 12, 8
-PUT_WEIGHTED_PRED 0, 16, 8
-PUT_WEIGHTED_PRED 1, 16, 8
-PUT_WEIGHTED_PRED 0, 24, 8
-PUT_WEIGHTED_PRED 1, 24, 8
-PUT_WEIGHTED_PRED 0, 32, 8
-PUT_WEIGHTED_PRED 1, 32, 8
-PUT_WEIGHTED_PRED 0, 48, 8
-PUT_WEIGHTED_PRED 1, 48, 8
-PUT_WEIGHTED_PRED 0, 64, 8
-PUT_WEIGHTED_PRED 1, 64, 8
-
-PUT_WEIGHTED_PRED 0, 4, 10
-PUT_WEIGHTED_PRED 1, 4, 10
-PUT_WEIGHTED_PRED 0, 8, 10
-PUT_WEIGHTED_PRED 1, 8, 10
-PUT_WEIGHTED_PRED 0, 12, 10
-PUT_WEIGHTED_PRED 1, 12, 10
-PUT_WEIGHTED_PRED 0, 16, 10
-PUT_WEIGHTED_PRED 1, 16, 10
-PUT_WEIGHTED_PRED 0, 24, 10
-PUT_WEIGHTED_PRED 1, 24, 10
-PUT_WEIGHTED_PRED 0, 32, 10
-PUT_WEIGHTED_PRED 1, 32, 10
-PUT_WEIGHTED_PRED 0, 48, 10
-PUT_WEIGHTED_PRED 1, 48, 10
-PUT_WEIGHTED_PRED 0, 64, 10
-PUT_WEIGHTED_PRED 1, 64, 10
+HEVC_PUT_HEVC_QPEL 4, 12
+HEVC_PUT_HEVC_QPEL 8, 12
+
+HEVC_PUT_HEVC_QPEL_HV 2, 8
+HEVC_PUT_HEVC_QPEL_HV 4, 8
+HEVC_PUT_HEVC_QPEL_HV 6, 8
+HEVC_PUT_HEVC_QPEL_HV 8, 8
+
+HEVC_PUT_HEVC_QPEL_HV 2, 10
+HEVC_PUT_HEVC_QPEL_HV 4, 10
+HEVC_PUT_HEVC_QPEL_HV 6, 10
+HEVC_PUT_HEVC_QPEL_HV 8, 10
+
+HEVC_PUT_HEVC_QPEL_HV 2, 12
+HEVC_PUT_HEVC_QPEL_HV 4, 12
+HEVC_PUT_HEVC_QPEL_HV 6, 12
+HEVC_PUT_HEVC_QPEL_HV 8, 12
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2 ; adds ff_ and _avx2 to function name & enables 256b registers : m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0
+
+HEVC_PUT_HEVC_PEL_PIXELS 32, 8
+HEVC_PUT_HEVC_PEL_PIXELS 16, 10
+
+HEVC_PUT_HEVC_EPEL 32, 8
+HEVC_PUT_HEVC_EPEL 16, 10
+
+HEVC_PUT_HEVC_EPEL_HV 16, 10
+HEVC_PUT_HEVC_EPEL_HV 32, 8
+
+HEVC_PUT_HEVC_QPEL 32, 8
+
+HEVC_PUT_HEVC_QPEL 16, 10
+
+HEVC_PUT_HEVC_QPEL_HV 16, 10
+%endif ;AVX2
%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/hevc_sao.asm b/libavcodec/x86/hevc_sao.asm
new file mode 100644
index 0000000000..888a28afa7
--- /dev/null
+++ b/libavcodec/x86/hevc_sao.asm
@@ -0,0 +1,340 @@
+;******************************************************************************
+;* SIMD optimized SAO functions for HEVC 8bit decoding
+;*
+;* Copyright (c) 2013 Pierre-Edouard LEPERE
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pb_edge_shuffle: times 2 db 1, 2, 0, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+pb_eo: db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
+cextern pb_1
+cextern pb_2
+
+SECTION .text
+
+;******************************************************************************
+;SAO Band Filter
+;******************************************************************************
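+; Each sample is classified into one of 32 bands by its five most significant
+; bits (psraw by 3 for 8-bit input); samples falling into the four bands
+; starting at sao_left_class get the matching offset from sao_offset_val
+; added, all other samples pass through unchanged.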
+
+%macro HEVC_SAO_BAND_FILTER_INIT 0
+ and leftq, 31
+ movd xm0, leftd
+ add leftq, 1
+ and leftq, 31
+ movd xm1, leftd
+ add leftq, 1
+ and leftq, 31
+ movd xm2, leftd
+ add leftq, 1
+ and leftq, 31
+ movd xm3, leftd
+
+ SPLATW m0, xm0
+ SPLATW m1, xm1
+ SPLATW m2, xm2
+ SPLATW m3, xm3
+%if mmsize > 16
+ SPLATW m4, [offsetq + 2]
+ SPLATW m5, [offsetq + 4]
+ SPLATW m6, [offsetq + 6]
+ SPLATW m7, [offsetq + 8]
+%else
+ movq m7, [offsetq + 2]
+ SPLATW m4, m7, 0
+ SPLATW m5, m7, 1
+ SPLATW m6, m7, 2
+ SPLATW m7, m7, 3
+%endif
+
+%if ARCH_X86_64
+ pxor m14, m14
+
+%else ; ARCH_X86_32
+ mova [rsp+mmsize*0], m0
+ mova [rsp+mmsize*1], m1
+ mova [rsp+mmsize*2], m2
+ mova [rsp+mmsize*3], m3
+ mova [rsp+mmsize*4], m4
+ mova [rsp+mmsize*5], m5
+ mova [rsp+mmsize*6], m6
+ pxor m0, m0
+ %assign MMSIZE mmsize
+ %define m14 m0
+ %define m13 m1
+ %define m9 m2
+ %define m8 m3
+%endif ; ARCH
+DEFINE_ARGS dst, src, dststride, srcstride, offset, height
+ mov heightd, r7m
+%endmacro
+
+%macro HEVC_SAO_BAND_FILTER_COMPUTE 2
+ psraw %1, %2, 3
+%if ARCH_X86_64
+ pcmpeqw m10, %1, m0
+ pcmpeqw m11, %1, m1
+ pcmpeqw m12, %1, m2
+ pcmpeqw %1, m3
+ pand m10, m4
+ pand m11, m5
+ pand m12, m6
+ pand %1, m7
+ por m10, m11
+ por m12, %1
+ por m10, m12
+ paddw %2, m10
+%else ; ARCH_X86_32
+ pcmpeqw m4, %1, [rsp+MMSIZE*0]
+ pcmpeqw m5, %1, [rsp+MMSIZE*1]
+ pcmpeqw m6, %1, [rsp+MMSIZE*2]
+ pcmpeqw %1, [rsp+MMSIZE*3]
+ pand m4, [rsp+MMSIZE*4]
+ pand m5, [rsp+MMSIZE*5]
+ pand m6, [rsp+MMSIZE*6]
+ pand %1, m7
+ por m4, m5
+ por m6, %1
+ por m4, m6
+ paddw %2, m4
+%endif ; ARCH
+%endmacro
+
+;void ff_hevc_sao_band_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+; int16_t *sao_offset_val, int sao_left_class, int width, int height);
+%macro HEVC_SAO_BAND_FILTER 2
+cglobal hevc_sao_band_filter_%1_8, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
+ HEVC_SAO_BAND_FILTER_INIT
+
+align 16
+.loop:
+%if %1 == 8
+ movq m8, [srcq]
+ punpcklbw m8, m14
+ HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
+ packuswb m8, m14
+ movq [dstq], m8
+%endif ; %1 == 8
+
+%assign i 0
+%rep %2
+ mova m13, [srcq + i]
+ punpcklbw m8, m13, m14
+ HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
+ punpckhbw m13, m14
+ HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
+ packuswb m8, m13
+ mova [dstq + i], m8
+%assign i i+mmsize
+%endrep
+
+%if %1 == 48
+INIT_XMM cpuname
+
+ mova m13, [srcq + i]
+ punpcklbw m8, m13, m14
+ HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
+ punpckhbw m13, m14
+ HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
+ packuswb m8, m13
+ mova [dstq + i], m8
+%if cpuflag(avx2)
+INIT_YMM cpuname
+%endif
+%endif ; %1 == 48
+
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ REP_RET
+%endmacro
+
+
+%macro HEVC_SAO_BAND_FILTER_FUNCS 0
+HEVC_SAO_BAND_FILTER 8, 0
+HEVC_SAO_BAND_FILTER 16, 1
+HEVC_SAO_BAND_FILTER 32, 2
+HEVC_SAO_BAND_FILTER 48, 2
+HEVC_SAO_BAND_FILTER 64, 4
+%endmacro
+
+INIT_XMM sse2
+HEVC_SAO_BAND_FILTER_FUNCS
+INIT_XMM avx
+HEVC_SAO_BAND_FILTER_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+HEVC_SAO_BAND_FILTER 8, 0
+HEVC_SAO_BAND_FILTER 16, 1
+INIT_YMM avx2
+HEVC_SAO_BAND_FILTER 32, 1
+HEVC_SAO_BAND_FILTER 48, 1
+HEVC_SAO_BAND_FILTER 64, 2
+%endif
+
+;******************************************************************************
+;SAO Edge Filter
+;******************************************************************************
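+; pb_eo holds the (x, y) offsets of the two neighbours for each of the four
+; edge-offset directions.  Every sample is compared with both neighbours; the
+; two comparison results form an edge index 0..4 that selects one of five
+; offsets via pshufb, and pmaddubsw adds the signed offset to the unsigned
+; sample with saturation before packing back to bytes.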
+
+%define MAX_PB_SIZE 64
+%define PADDING_SIZE 32 ; AV_INPUT_BUFFER_PADDING_SIZE
+%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE
+
+%macro HEVC_SAO_EDGE_FILTER_INIT 0
+%if WIN64
+ movsxd eoq, dword eom
+%elif ARCH_X86_64
+ movsxd eoq, eod
+%else
+ mov eoq, r4m
+%endif
+ lea tmp2q, [pb_eo]
+ movsx a_strideq, byte [tmp2q+eoq*4+1]
+ movsx b_strideq, byte [tmp2q+eoq*4+3]
+ imul a_strideq, EDGE_SRCSTRIDE
+ imul b_strideq, EDGE_SRCSTRIDE
+ movsx tmpq, byte [tmp2q+eoq*4]
+ add a_strideq, tmpq
+ movsx tmpq, byte [tmp2q+eoq*4+2]
+ add b_strideq, tmpq
+%endmacro
+
+%macro HEVC_SAO_EDGE_FILTER_COMPUTE 1
+ pminub m4, m1, m2
+ pminub m5, m1, m3
+ pcmpeqb m2, m4
+ pcmpeqb m3, m5
+ pcmpeqb m4, m1
+ pcmpeqb m5, m1
+ psubb m4, m2
+ psubb m5, m3
+ paddb m4, m6
+ paddb m4, m5
+
+ pshufb m2, m0, m4
+%if %1 > 8
+ punpckhbw m5, m7, m1
+ punpckhbw m4, m2, m7
+ punpcklbw m3, m7, m1
+ punpcklbw m2, m7
+ pmaddubsw m5, m4
+ pmaddubsw m3, m2
+ packuswb m3, m5
+%else
+ punpcklbw m3, m7, m1
+ punpcklbw m2, m7
+ pmaddubsw m3, m2
+ packuswb m3, m3
+%endif
+%endmacro
+
+;void ff_hevc_sao_edge_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
+; int eo, int width, int height);
+%macro HEVC_SAO_EDGE_FILTER 2-3
+%if ARCH_X86_64
+cglobal hevc_sao_edge_filter_%1_8, 4, 9, 8, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
+%define tmp2q heightq
+ HEVC_SAO_EDGE_FILTER_INIT
+ mov heightd, r6m
+
+%else ; ARCH_X86_32
+cglobal hevc_sao_edge_filter_%1_8, 1, 6, 8, dst, src, dststride, a_stride, b_stride, height
+%define eoq srcq
+%define tmpq heightq
+%define tmp2q dststrideq
+%define offsetq heightq
+ HEVC_SAO_EDGE_FILTER_INIT
+ mov srcq, srcm
+ mov offsetq, r3m
+ mov dststrideq, dststridem
+%endif ; ARCH
+
+%if mmsize > 16
+ vbroadcasti128 m0, [offsetq]
+%else
+ movu m0, [offsetq]
+%endif
+ mova m1, [pb_edge_shuffle]
+ packsswb m0, m0
+ mova m7, [pb_1]
+ pshufb m0, m1
+ mova m6, [pb_2]
+%if ARCH_X86_32
+ mov heightd, r6m
+%endif
+
+align 16
+.loop:
+
+%if %1 == 8
+ movq m1, [srcq]
+ movq m2, [srcq + a_strideq]
+ movq m3, [srcq + b_strideq]
+ HEVC_SAO_EDGE_FILTER_COMPUTE %1
+ movq [dstq], m3
+%endif
+
+%assign i 0
+%rep %2
+ mova m1, [srcq + i]
+ movu m2, [srcq + a_strideq + i]
+ movu m3, [srcq + b_strideq + i]
+ HEVC_SAO_EDGE_FILTER_COMPUTE %1
+ mov%3 [dstq + i], m3
+%assign i i+mmsize
+%endrep
+
+%if %1 == 48
+INIT_XMM cpuname
+
+ mova m1, [srcq + i]
+ movu m2, [srcq + a_strideq + i]
+ movu m3, [srcq + b_strideq + i]
+ HEVC_SAO_EDGE_FILTER_COMPUTE %1
+ mova [dstq + i], m3
+%if cpuflag(avx2)
+INIT_YMM cpuname
+%endif
+%endif
+
+ add dstq, dststrideq
+ add srcq, EDGE_SRCSTRIDE
+ dec heightd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM ssse3
+HEVC_SAO_EDGE_FILTER 8, 0
+HEVC_SAO_EDGE_FILTER 16, 1, a
+HEVC_SAO_EDGE_FILTER 32, 2, a
+HEVC_SAO_EDGE_FILTER 48, 2, a
+HEVC_SAO_EDGE_FILTER 64, 4, a
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+HEVC_SAO_EDGE_FILTER 32, 1, a
+HEVC_SAO_EDGE_FILTER 48, 1, u
+HEVC_SAO_EDGE_FILTER 64, 2, a
+%endif
diff --git a/libavcodec/x86/hevc_sao_10bit.asm b/libavcodec/x86/hevc_sao_10bit.asm
new file mode 100644
index 0000000000..f81e2d5033
--- /dev/null
+++ b/libavcodec/x86/hevc_sao_10bit.asm
@@ -0,0 +1,370 @@
+;******************************************************************************
+;* SIMD optimized SAO functions for HEVC 10/12bit decoding
+;*
+;* Copyright (c) 2013 Pierre-Edouard LEPERE
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pw_m2: times 16 dw -2
+pw_mask10: times 16 dw 0x03FF
+pw_mask12: times 16 dw 0x0FFF
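+; one (a_x, a_y, b_x, b_y) byte quadruple per edge-offset class:
+; horizontal, vertical, 135-degree and 45-degree diagonal neighbours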
+pb_eo: db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
+cextern pw_m1
+cextern pw_1
+cextern pw_2
+
+SECTION .text
+
+;******************************************************************************
+;SAO Band Filter
+;******************************************************************************
+
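+; Broadcast the four band indices (sao_left_class + 0..3, wrapped mod 32) into
+; m0-m3 and the corresponding offsets into m4-m7; %1 picks the pw_mask10/12
+; clip limit. On x86_32 the constants are spilled to the stack for the loop.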
+%macro HEVC_SAO_BAND_FILTER_INIT 1
+ and leftq, 31
+ movd xm0, leftd
+ add leftq, 1
+ and leftq, 31
+ movd xm1, leftd
+ add leftq, 1
+ and leftq, 31
+ movd xm2, leftd
+ add leftq, 1
+ and leftq, 31
+ movd xm3, leftd
+
+ SPLATW m0, xm0
+ SPLATW m1, xm1
+ SPLATW m2, xm2
+ SPLATW m3, xm3
+%if mmsize > 16
+ SPLATW m4, [offsetq + 2]
+ SPLATW m5, [offsetq + 4]
+ SPLATW m6, [offsetq + 6]
+ SPLATW m7, [offsetq + 8]
+%else
+ movq m7, [offsetq + 2]
+ SPLATW m4, m7, 0
+ SPLATW m5, m7, 1
+ SPLATW m6, m7, 2
+ SPLATW m7, m7, 3
+%endif
+
+%if ARCH_X86_64
+ mova m13, [pw_mask %+ %1]
+ pxor m14, m14
+
+%else ; ARCH_X86_32
+ mova [rsp+mmsize*0], m0
+ mova [rsp+mmsize*1], m1
+ mova [rsp+mmsize*2], m2
+ mova [rsp+mmsize*3], m3
+ mova [rsp+mmsize*4], m4
+ mova [rsp+mmsize*5], m5
+ mova [rsp+mmsize*6], m6
+ mova m1, [pw_mask %+ %1]
+ pxor m0, m0
+ %define m14 m0
+ %define m13 m1
+ %define m9 m2
+ %define m8 m3
+%endif ; ARCH
+DEFINE_ARGS dst, src, dststride, srcstride, offset, height
+ mov heightd, r7m
+%endmacro
+
+;void ff_hevc_sao_band_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+; int16_t *sao_offset_val, int sao_left_class, int width, int height);
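+; %1 = bit depth, %2 = block width, %3 = vector blocks per row. Each sample is
+; shifted right by %1 - 5 to get its band (0..31), compared against the four
+; selected bands, and the matching offset is added before clipping to
+; [0, 2^%1 - 1].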
+%macro HEVC_SAO_BAND_FILTER 3
+cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
+ HEVC_SAO_BAND_FILTER_INIT %1
+
+align 16
+.loop:
+
+%assign i 0
+%assign j 0
+%rep %3
+%assign k 8+(j&1)
+%assign l 9-(j&1)
+ mova m %+ k, [srcq + i]
+ psraw m %+ l, m %+ k, %1-5
+%if ARCH_X86_64
+ pcmpeqw m10, m %+ l, m0
+ pcmpeqw m11, m %+ l, m1
+ pcmpeqw m12, m %+ l, m2
+ pcmpeqw m %+ l, m3
+ pand m10, m4
+ pand m11, m5
+ pand m12, m6
+ pand m %+ l, m7
+ por m10, m11
+ por m12, m %+ l
+ por m10, m12
+ paddw m %+ k, m10
+%else ; ARCH_X86_32
+ pcmpeqw m4, m %+ l, [rsp+mmsize*0]
+ pcmpeqw m5, m %+ l, [rsp+mmsize*1]
+ pcmpeqw m6, m %+ l, [rsp+mmsize*2]
+ pcmpeqw m %+ l, [rsp+mmsize*3]
+ pand m4, [rsp+mmsize*4]
+ pand m5, [rsp+mmsize*5]
+ pand m6, [rsp+mmsize*6]
+ pand m %+ l, m7
+ por m4, m5
+ por m6, m %+ l
+ por m4, m6
+ paddw m %+ k, m4
+%endif ; ARCH
+ CLIPW m %+ k, m14, m13
+ mova [dstq + i], m %+ k
+%assign i i+mmsize
+%assign j j+1
+%endrep
+
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd
+ jg .loop
+ REP_RET
+%endmacro
+
+%macro HEVC_SAO_BAND_FILTER_FUNCS 0
+HEVC_SAO_BAND_FILTER 10, 8, 1
+HEVC_SAO_BAND_FILTER 10, 16, 2
+HEVC_SAO_BAND_FILTER 10, 32, 4
+HEVC_SAO_BAND_FILTER 10, 48, 6
+HEVC_SAO_BAND_FILTER 10, 64, 8
+
+HEVC_SAO_BAND_FILTER 12, 8, 1
+HEVC_SAO_BAND_FILTER 12, 16, 2
+HEVC_SAO_BAND_FILTER 12, 32, 4
+HEVC_SAO_BAND_FILTER 12, 48, 6
+HEVC_SAO_BAND_FILTER 12, 64, 8
+%endmacro
+
+INIT_XMM sse2
+HEVC_SAO_BAND_FILTER_FUNCS
+INIT_XMM avx
+HEVC_SAO_BAND_FILTER_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+HEVC_SAO_BAND_FILTER 10, 8, 1
+INIT_YMM avx2
+HEVC_SAO_BAND_FILTER 10, 16, 1
+HEVC_SAO_BAND_FILTER 10, 32, 2
+HEVC_SAO_BAND_FILTER 10, 48, 3
+HEVC_SAO_BAND_FILTER 10, 64, 4
+
+INIT_XMM avx2
+HEVC_SAO_BAND_FILTER 12, 8, 1
+INIT_YMM avx2
+HEVC_SAO_BAND_FILTER 12, 16, 1
+HEVC_SAO_BAND_FILTER 12, 32, 2
+HEVC_SAO_BAND_FILTER 12, 48, 3
+HEVC_SAO_BAND_FILTER 12, 64, 4
+%endif
+
+;******************************************************************************
+;SAO Edge Filter
+;******************************************************************************
+
+%define MAX_PB_SIZE 64
+%define PADDING_SIZE 32 ; AV_INPUT_BUFFER_PADDING_SIZE
+%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE
+
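+; %1 = min(%2, %3) on unsigned words, %4 = scratch; without SSE4's pminuw this
+; is emulated as %2 - max(%2 - %3, 0) using a saturating psubusw.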
+%macro PMINUW 4
+%if cpuflag(sse4)
+ pminuw %1, %2, %3
+%else
+ psubusw %4, %2, %3
+ psubw %1, %2, %4
+%endif
+%endmacro
+
+%macro HEVC_SAO_EDGE_FILTER_INIT 0
+%if WIN64
+ movsxd eoq, dword eom
+%elif ARCH_X86_64
+ movsxd eoq, eod
+%else
+ mov eoq, r4m
+%endif
+ lea tmp2q, [pb_eo]
+ movsx a_strideq, byte [tmp2q+eoq*4+1]
+ movsx b_strideq, byte [tmp2q+eoq*4+3]
+ imul a_strideq, EDGE_SRCSTRIDE >> 1
+ imul b_strideq, EDGE_SRCSTRIDE >> 1
+ movsx tmpq, byte [tmp2q+eoq*4]
+ add a_strideq, tmpq
+ movsx tmpq, byte [tmp2q+eoq*4+2]
+ add b_strideq, tmpq
+%endmacro
+
+;void ff_hevc_sao_edge_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
+; int eo, int width, int height);
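+; %1 = bit depth, %2 = block width, %3 = vector blocks per row. The neighbour
+; comparisons yield a per-sample sum in -2..2, which is matched against
+; pw_m2/pw_m1/0/pw_1/pw_2 to select one of the five offsets; the result is
+; clipped with pw_mask10/12.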
+%macro HEVC_SAO_EDGE_FILTER 3
+%if ARCH_X86_64
+cglobal hevc_sao_edge_filter_%2_%1, 4, 9, 16, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
+%define tmp2q heightq
+ HEVC_SAO_EDGE_FILTER_INIT
+ mov heightd, r6m
+ add a_strideq, a_strideq
+ add b_strideq, b_strideq
+
+%else ; ARCH_X86_32
+cglobal hevc_sao_edge_filter_%2_%1, 1, 6, 8, 5*mmsize, dst, src, dststride, a_stride, b_stride, height
+%define eoq srcq
+%define tmpq heightq
+%define tmp2q dststrideq
+%define offsetq heightq
+%define m8 m1
+%define m9 m2
+%define m10 m3
+%define m11 m4
+%define m12 m5
+ HEVC_SAO_EDGE_FILTER_INIT
+ mov srcq, srcm
+ mov offsetq, r3m
+ mov dststrideq, dststridem
+ add a_strideq, a_strideq
+ add b_strideq, b_strideq
+
+%endif ; ARCH
+
+%if mmsize > 16
+ SPLATW m8, [offsetq+2]
+ SPLATW m9, [offsetq+4]
+ SPLATW m10, [offsetq+0]
+ SPLATW m11, [offsetq+6]
+ SPLATW m12, [offsetq+8]
+%else
+ movq m10, [offsetq+0]
+ movd m12, [offsetq+6]
+ SPLATW m8, xm10, 1
+ SPLATW m9, xm10, 2
+ SPLATW m10, xm10, 0
+ SPLATW m11, xm12, 0
+ SPLATW m12, xm12, 1
+%endif
+ pxor m0, m0
+%if ARCH_X86_64
+ mova m13, [pw_m1]
+ mova m14, [pw_1]
+ mova m15, [pw_2]
+%else
+ mov heightd, r6m
+ mova [rsp+mmsize*0], m8
+ mova [rsp+mmsize*1], m9
+ mova [rsp+mmsize*2], m10
+ mova [rsp+mmsize*3], m11
+ mova [rsp+mmsize*4], m12
+%endif
+
+align 16
+.loop:
+
+%assign i 0
+%rep %3
+ mova m1, [srcq + i]
+ movu m2, [srcq+a_strideq + i]
+ movu m3, [srcq+b_strideq + i]
+ PMINUW m4, m1, m2, m6
+ PMINUW m5, m1, m3, m7
+ pcmpeqw m2, m4
+ pcmpeqw m3, m5
+ pcmpeqw m4, m1
+ pcmpeqw m5, m1
+ psubw m4, m2
+ psubw m5, m3
+
+ paddw m4, m5
+ pcmpeqw m2, m4, [pw_m2]
+%if ARCH_X86_64
+ pcmpeqw m3, m4, m13
+ pcmpeqw m5, m4, m0
+ pcmpeqw m6, m4, m14
+ pcmpeqw m7, m4, m15
+ pand m2, m8
+ pand m3, m9
+ pand m5, m10
+ pand m6, m11
+ pand m7, m12
+%else
+ pcmpeqw m3, m4, [pw_m1]
+ pcmpeqw m5, m4, m0
+ pcmpeqw m6, m4, [pw_1]
+ pcmpeqw m7, m4, [pw_2]
+ pand m2, [rsp+mmsize*0]
+ pand m3, [rsp+mmsize*1]
+ pand m5, [rsp+mmsize*2]
+ pand m6, [rsp+mmsize*3]
+ pand m7, [rsp+mmsize*4]
+%endif
+ paddw m2, m3
+ paddw m5, m6
+ paddw m2, m7
+ paddw m2, m1
+ paddw m2, m5
+ CLIPW m2, m0, [pw_mask %+ %1]
+ mova [dstq + i], m2
+%assign i i+mmsize
+%endrep
+
+ add dstq, dststrideq
+ add srcq, EDGE_SRCSTRIDE
+ dec heightd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+HEVC_SAO_EDGE_FILTER 10, 8, 1
+HEVC_SAO_EDGE_FILTER 10, 16, 2
+HEVC_SAO_EDGE_FILTER 10, 32, 4
+HEVC_SAO_EDGE_FILTER 10, 48, 6
+HEVC_SAO_EDGE_FILTER 10, 64, 8
+
+HEVC_SAO_EDGE_FILTER 12, 8, 1
+HEVC_SAO_EDGE_FILTER 12, 16, 2
+HEVC_SAO_EDGE_FILTER 12, 32, 4
+HEVC_SAO_EDGE_FILTER 12, 48, 6
+HEVC_SAO_EDGE_FILTER 12, 64, 8
+
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+HEVC_SAO_EDGE_FILTER 10, 8, 1
+INIT_YMM avx2
+HEVC_SAO_EDGE_FILTER 10, 16, 1
+HEVC_SAO_EDGE_FILTER 10, 32, 2
+HEVC_SAO_EDGE_FILTER 10, 48, 3
+HEVC_SAO_EDGE_FILTER 10, 64, 4
+
+INIT_XMM avx2
+HEVC_SAO_EDGE_FILTER 12, 8, 1
+INIT_YMM avx2
+HEVC_SAO_EDGE_FILTER 12, 16, 1
+HEVC_SAO_EDGE_FILTER 12, 32, 2
+HEVC_SAO_EDGE_FILTER 12, 48, 3
+HEVC_SAO_EDGE_FILTER 12, 64, 4
+%endif
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
new file mode 100644
index 0000000000..67be0a9059
--- /dev/null
+++ b/libavcodec/x86/hevcdsp.h
@@ -0,0 +1,259 @@
+/*
+ * HEVC video decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
+ *
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_HEVCDSP_H
+#define AVCODEC_X86_HEVCDSP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+
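+/* PEL_LINK wires one width/bit-depth/ISA entry into the five MC function
+ * pointer tables at once: plain, bi, uni, weighted uni (uni_w) and weighted
+ * bi (bi_w) prediction. */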
+#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
+dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
+dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
+dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
+dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
+dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt
+
+
+#define PEL_PROTOTYPE(name, D, opt) \
+void ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); \
+void ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+
+
+///////////////////////////////////////////////////////////////////////////////
+// MC functions
+///////////////////////////////////////////////////////////////////////////////
+
+#define EPEL_PROTOTYPES(fname, bitd, opt) \
+ PEL_PROTOTYPE(fname##4, bitd, opt); \
+ PEL_PROTOTYPE(fname##6, bitd, opt); \
+ PEL_PROTOTYPE(fname##8, bitd, opt); \
+ PEL_PROTOTYPE(fname##12, bitd, opt); \
+ PEL_PROTOTYPE(fname##16, bitd, opt); \
+ PEL_PROTOTYPE(fname##24, bitd, opt); \
+ PEL_PROTOTYPE(fname##32, bitd, opt); \
+ PEL_PROTOTYPE(fname##48, bitd, opt); \
+ PEL_PROTOTYPE(fname##64, bitd, opt)
+
+#define QPEL_PROTOTYPES(fname, bitd, opt) \
+ PEL_PROTOTYPE(fname##4, bitd, opt); \
+ PEL_PROTOTYPE(fname##8, bitd, opt); \
+ PEL_PROTOTYPE(fname##12, bitd, opt); \
+ PEL_PROTOTYPE(fname##16, bitd, opt); \
+ PEL_PROTOTYPE(fname##24, bitd, opt); \
+ PEL_PROTOTYPE(fname##32, bitd, opt); \
+ PEL_PROTOTYPE(fname##48, bitd, opt); \
+ PEL_PROTOTYPE(fname##64, bitd, opt)
+
+#define WEIGHTING_PROTOTYPE(width, bitd, opt) \
+void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int height, int denom, int _wx, int _ox); \
+void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int16_t *_src2, int height, int denom, int _wx0, int _wx1, int _ox0, int _ox1)
+
+#define WEIGHTING_PROTOTYPES(bitd, opt) \
+ WEIGHTING_PROTOTYPE(2, bitd, opt); \
+ WEIGHTING_PROTOTYPE(4, bitd, opt); \
+ WEIGHTING_PROTOTYPE(6, bitd, opt); \
+ WEIGHTING_PROTOTYPE(8, bitd, opt); \
+ WEIGHTING_PROTOTYPE(12, bitd, opt); \
+ WEIGHTING_PROTOTYPE(16, bitd, opt); \
+ WEIGHTING_PROTOTYPE(24, bitd, opt); \
+ WEIGHTING_PROTOTYPE(32, bitd, opt); \
+ WEIGHTING_PROTOTYPE(48, bitd, opt); \
+ WEIGHTING_PROTOTYPE(64, bitd, opt)
+
+
+///////////////////////////////////////////////////////////////////////////////
+// QPEL_PIXELS EPEL_PIXELS
+///////////////////////////////////////////////////////////////////////////////
+EPEL_PROTOTYPES(pel_pixels , 8, sse4);
+EPEL_PROTOTYPES(pel_pixels , 10, sse4);
+EPEL_PROTOTYPES(pel_pixels , 12, sse4);
+
+void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+
+
+void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); // used for 10bit
+void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); // used for 10bit
+
+
+void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+///////////////////////////////////////////////////////////////////////////////
+// EPEL
+///////////////////////////////////////////////////////////////////////////////
+EPEL_PROTOTYPES(epel_h , 8, sse4);
+EPEL_PROTOTYPES(epel_h , 10, sse4);
+EPEL_PROTOTYPES(epel_h , 12, sse4);
+
+EPEL_PROTOTYPES(epel_v , 8, sse4);
+EPEL_PROTOTYPES(epel_v , 10, sse4);
+EPEL_PROTOTYPES(epel_v , 12, sse4);
+
+EPEL_PROTOTYPES(epel_hv , 8, sse4);
+EPEL_PROTOTYPES(epel_hv , 10, sse4);
+EPEL_PROTOTYPES(epel_hv , 12, sse4);
+
+PEL_PROTOTYPE(epel_h16, 8, avx2);
+PEL_PROTOTYPE(epel_h24, 8, avx2);
+PEL_PROTOTYPE(epel_h32, 8, avx2);
+PEL_PROTOTYPE(epel_h48, 8, avx2);
+PEL_PROTOTYPE(epel_h64, 8, avx2);
+
+PEL_PROTOTYPE(epel_h16,10, avx2);
+PEL_PROTOTYPE(epel_h24,10, avx2);
+PEL_PROTOTYPE(epel_h32,10, avx2);
+PEL_PROTOTYPE(epel_h48,10, avx2);
+PEL_PROTOTYPE(epel_h64,10, avx2);
+
+PEL_PROTOTYPE(epel_v16, 8, avx2);
+PEL_PROTOTYPE(epel_v24, 8, avx2);
+PEL_PROTOTYPE(epel_v32, 8, avx2);
+PEL_PROTOTYPE(epel_v48, 8, avx2);
+PEL_PROTOTYPE(epel_v64, 8, avx2);
+
+PEL_PROTOTYPE(epel_v16,10, avx2);
+PEL_PROTOTYPE(epel_v24,10, avx2);
+PEL_PROTOTYPE(epel_v32,10, avx2);
+PEL_PROTOTYPE(epel_v48,10, avx2);
+PEL_PROTOTYPE(epel_v64,10, avx2);
+
+PEL_PROTOTYPE(epel_hv16, 8, avx2);
+PEL_PROTOTYPE(epel_hv24, 8, avx2);
+PEL_PROTOTYPE(epel_hv32, 8, avx2);
+PEL_PROTOTYPE(epel_hv48, 8, avx2);
+PEL_PROTOTYPE(epel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(epel_hv16,10, avx2);
+PEL_PROTOTYPE(epel_hv24,10, avx2);
+PEL_PROTOTYPE(epel_hv32,10, avx2);
+PEL_PROTOTYPE(epel_hv48,10, avx2);
+PEL_PROTOTYPE(epel_hv64,10, avx2);
+
+///////////////////////////////////////////////////////////////////////////////
+// QPEL
+///////////////////////////////////////////////////////////////////////////////
+QPEL_PROTOTYPES(qpel_h , 8, sse4);
+QPEL_PROTOTYPES(qpel_h , 10, sse4);
+QPEL_PROTOTYPES(qpel_h , 12, sse4);
+
+QPEL_PROTOTYPES(qpel_v, 8, sse4);
+QPEL_PROTOTYPES(qpel_v, 10, sse4);
+QPEL_PROTOTYPES(qpel_v, 12, sse4);
+
+QPEL_PROTOTYPES(qpel_hv, 8, sse4);
+QPEL_PROTOTYPES(qpel_hv, 10, sse4);
+QPEL_PROTOTYPES(qpel_hv, 12, sse4);
+
+PEL_PROTOTYPE(qpel_h16, 8, avx2);
+PEL_PROTOTYPE(qpel_h24, 8, avx2);
+PEL_PROTOTYPE(qpel_h32, 8, avx2);
+PEL_PROTOTYPE(qpel_h48, 8, avx2);
+PEL_PROTOTYPE(qpel_h64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_h16,10, avx2);
+PEL_PROTOTYPE(qpel_h24,10, avx2);
+PEL_PROTOTYPE(qpel_h32,10, avx2);
+PEL_PROTOTYPE(qpel_h48,10, avx2);
+PEL_PROTOTYPE(qpel_h64,10, avx2);
+
+PEL_PROTOTYPE(qpel_v16, 8, avx2);
+PEL_PROTOTYPE(qpel_v24, 8, avx2);
+PEL_PROTOTYPE(qpel_v32, 8, avx2);
+PEL_PROTOTYPE(qpel_v48, 8, avx2);
+PEL_PROTOTYPE(qpel_v64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_v16,10, avx2);
+PEL_PROTOTYPE(qpel_v24,10, avx2);
+PEL_PROTOTYPE(qpel_v32,10, avx2);
+PEL_PROTOTYPE(qpel_v48,10, avx2);
+PEL_PROTOTYPE(qpel_v64,10, avx2);
+
+PEL_PROTOTYPE(qpel_hv16, 8, avx2);
+PEL_PROTOTYPE(qpel_hv24, 8, avx2);
+PEL_PROTOTYPE(qpel_hv32, 8, avx2);
+PEL_PROTOTYPE(qpel_hv48, 8, avx2);
+PEL_PROTOTYPE(qpel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_hv16,10, avx2);
+PEL_PROTOTYPE(qpel_hv24,10, avx2);
+PEL_PROTOTYPE(qpel_hv32,10, avx2);
+PEL_PROTOTYPE(qpel_hv48,10, avx2);
+PEL_PROTOTYPE(qpel_hv64,10, avx2);
+
+WEIGHTING_PROTOTYPES(8, sse4);
+WEIGHTING_PROTOTYPES(10, sse4);
+WEIGHTING_PROTOTYPES(12, sse4);
+
+///////////////////////////////////////////////////////////////////////////////
+// TRANSFORM_ADD
+///////////////////////////////////////////////////////////////////////////////
+
+void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+#endif // AVCODEC_X86_HEVCDSP_H
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index a95fa30a95..17cd2332aa 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -2,29 +2,31 @@
* Copyright (c) 2013 Seppo Tomperi
* Copyright (c) 2013 - 2014 Pierre-Edouard Lepere
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
-
+#include "libavcodec/get_bits.h" /* required for hevcdsp.h GetBitContext */
#include "libavcodec/hevcdsp.h"
+#include "libavcodec/x86/hevcdsp.h"
#define LFC_FUNC(DIR, DEPTH, OPT) \
void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int *tc, uint8_t *no_p, uint8_t *no_q);
@@ -32,43 +34,34 @@ void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix,
#define LFL_FUNC(DIR, DEPTH, OPT) \
void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int beta, int *tc, uint8_t *no_p, uint8_t *no_q);
-#define LFC_FUNCS(type, depth) \
- LFC_FUNC(h, depth, sse2) \
- LFC_FUNC(v, depth, sse2)
-
-#define LFL_FUNCS(type, depth) \
- LFL_FUNC(h, depth, ssse3) \
- LFL_FUNC(v, depth, ssse3)
-
-LFC_FUNCS(uint8_t, 8)
-LFC_FUNCS(uint8_t, 10)
-LFL_FUNCS(uint8_t, 8)
-LFL_FUNCS(uint8_t, 10)
-
-#define idct_dc_proto(size, bitd, opt) \
- void ff_hevc_idct_ ## size ## _dc_add_ ## bitd ## _ ## opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-
-idct_dc_proto(4, 8,mmxext);
-idct_dc_proto(8, 8,mmxext);
-idct_dc_proto(16,8, sse2);
-idct_dc_proto(32,8, sse2);
-
-idct_dc_proto(32,8, avx2);
+#define LFC_FUNCS(type, depth, opt) \
+ LFC_FUNC(h, depth, opt) \
+ LFC_FUNC(v, depth, opt)
-idct_dc_proto(4, 10,mmxext);
-idct_dc_proto(8, 10, sse2);
-idct_dc_proto(16,10, sse2);
-idct_dc_proto(32,10, sse2);
-idct_dc_proto(8, 10, avx);
-idct_dc_proto(16,10, avx);
-idct_dc_proto(32,10, avx);
+#define LFL_FUNCS(type, depth, opt) \
+ LFL_FUNC(h, depth, opt) \
+ LFL_FUNC(v, depth, opt)
-idct_dc_proto(16,10, avx2);
-idct_dc_proto(32,10, avx2);
+LFC_FUNCS(uint8_t, 8, sse2)
+LFC_FUNCS(uint8_t, 10, sse2)
+LFC_FUNCS(uint8_t, 12, sse2)
+LFC_FUNCS(uint8_t, 8, avx)
+LFC_FUNCS(uint8_t, 10, avx)
+LFC_FUNCS(uint8_t, 12, avx)
+LFL_FUNCS(uint8_t, 8, sse2)
+LFL_FUNCS(uint8_t, 10, sse2)
+LFL_FUNCS(uint8_t, 12, sse2)
+LFL_FUNCS(uint8_t, 8, ssse3)
+LFL_FUNCS(uint8_t, 10, ssse3)
+LFL_FUNCS(uint8_t, 12, ssse3)
+LFL_FUNCS(uint8_t, 8, avx)
+LFL_FUNCS(uint8_t, 10, avx)
+LFL_FUNCS(uint8_t, 12, avx)
#define IDCT_DC_FUNCS(W, opt) \
void ff_hevc_idct_ ## W ## _dc_8_ ## opt(int16_t *coeffs); \
-void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs)
+void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs); \
+void ff_hevc_idct_ ## W ## _dc_12_ ## opt(int16_t *coeffs)
IDCT_DC_FUNCS(4x4, mmxext);
IDCT_DC_FUNCS(8x8, mmxext);
@@ -91,208 +84,631 @@ void ff_hevc_idct_32x32_10_ ## opt(int16_t *coeffs, int col_limit);
IDCT_FUNCS(sse2)
IDCT_FUNCS(avx)
-void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-
-void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-
-void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-
-void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-
-void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-
-#define GET_PIXELS(width, depth, cf) \
-void ff_hevc_get_pixels_ ## width ## _ ## depth ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
- uint8_t *src, ptrdiff_t srcstride, \
- int height, int mx, int my, int16_t *mcbuffer);
-
-GET_PIXELS(4, 8, sse2)
-GET_PIXELS(8, 8, sse2)
-GET_PIXELS(12, 8, sse2)
-GET_PIXELS(16, 8, sse2)
-GET_PIXELS(24, 8, sse2)
-GET_PIXELS(32, 8, sse2)
-GET_PIXELS(48, 8, sse2)
-GET_PIXELS(64, 8, sse2)
-
-GET_PIXELS(4, 10, sse2)
-GET_PIXELS(8, 10, sse2)
-GET_PIXELS(12, 10, sse2)
-GET_PIXELS(16, 10, sse2)
-GET_PIXELS(24, 10, sse2)
-GET_PIXELS(32, 10, sse2)
-GET_PIXELS(48, 10, sse2)
-GET_PIXELS(64, 10, sse2)
-
-/* those are independent of the bit depth, so declared separately */
-#define INTERP_HV_FUNC(width, cf) \
-void ff_hevc_qpel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
- int16_t *src, ptrdiff_t srcstride, \
- int height, int mx, int my, int16_t *mcbuffer); \
-void ff_hevc_epel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
- int16_t *src, ptrdiff_t srcstride, \
- int height, int mx, int my, int16_t *mcbuffer);
-
-INTERP_HV_FUNC(4, avx)
-INTERP_HV_FUNC(8, avx)
-INTERP_HV_FUNC(12, avx)
-INTERP_HV_FUNC(16, avx)
-INTERP_HV_FUNC(24, avx)
-INTERP_HV_FUNC(32, avx)
-INTERP_HV_FUNC(48, avx)
-INTERP_HV_FUNC(64, avx)
-
-#if ARCH_X86_64 && HAVE_AVX_EXTERNAL
-#define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv) \
-static void hevc_qpel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride, \
- uint8_t *src, ptrdiff_t srcstride, \
- int height, int mx, int my, int16_t *mcbuffer) \
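+/* The mc_rep_* macros build the wide block sizes from a narrower SIMD kernel:
+ * a W-wide function is synthesized by stepping across the row in step-pixel
+ * columns, with (bitd + 7) / 8 converting the pixel index into a byte offset
+ * for 8-bit vs. 10/12-bit planes. The *_func2 variants instead split an odd
+ * width such as 12 into two differently sized calls (8 + 4). */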
+#define mc_rep_func(name, bitd, step, W, opt) \
+void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, \
+ uint8_t *_src, ptrdiff_t _srcstride, int height, \
+ intptr_t mx, intptr_t my, int width) \
+{ \
+ int i; \
+ uint8_t *src; \
+ int16_t *dst; \
+ for (i = 0; i < W; i += step) { \
+ src = _src + (i * ((bitd + 7) / 8)); \
+ dst = _dst + i; \
+ ff_hevc_put_hevc_##name##step##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width); \
+ } \
+}
+#define mc_rep_uni_func(name, bitd, step, W, opt) \
+void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, \
+ uint8_t *_src, ptrdiff_t _srcstride, int height, \
+ intptr_t mx, intptr_t my, int width) \
+{ \
+ int i; \
+ uint8_t *src; \
+ uint8_t *dst; \
+ for (i = 0; i < W; i += step) { \
+ src = _src + (i * ((bitd + 7) / 8)); \
+ dst = _dst + (i * ((bitd + 7) / 8)); \
+ ff_hevc_put_hevc_uni_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, \
+ height, mx, my, width); \
+ } \
+}
+#define mc_rep_bi_func(name, bitd, step, W, opt) \
+void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, uint8_t *_src, \
+ ptrdiff_t _srcstride, int16_t* _src2, \
+ int height, intptr_t mx, intptr_t my, int width) \
+{ \
+ int i; \
+ uint8_t *src; \
+ uint8_t *dst; \
+ int16_t *src2; \
+ for (i = 0; i < W ; i += step) { \
+ src = _src + (i * ((bitd + 7) / 8)); \
+ dst = _dst + (i * ((bitd + 7) / 8)); \
+ src2 = _src2 + i; \
+ ff_hevc_put_hevc_bi_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, \
+ height, mx, my, width); \
+ } \
+}
+
+#define mc_rep_funcs(name, bitd, step, W, opt) \
+ mc_rep_func(name, bitd, step, W, opt) \
+ mc_rep_uni_func(name, bitd, step, W, opt) \
+ mc_rep_bi_func(name, bitd, step, W, opt)
+
+#define mc_rep_func2(name, bitd, step1, step2, W, opt) \
+void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *dst, \
+ uint8_t *src, ptrdiff_t _srcstride, int height, \
+ intptr_t mx, intptr_t my, int width) \
+{ \
+ ff_hevc_put_hevc_##name##step1##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width); \
+ ff_hevc_put_hevc_##name##step2##_##bitd##_##opt(dst + step1, src + (step1 * ((bitd + 7) / 8)), \
+ _srcstride, height, mx, my, width); \
+}
+#define mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \
+void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, \
+ uint8_t *src, ptrdiff_t _srcstride, int height, \
+ intptr_t mx, intptr_t my, int width) \
+{ \
+ ff_hevc_put_hevc_uni_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, height, mx, my, width);\
+ ff_hevc_put_hevc_uni_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride, \
+ src + (step1 * ((bitd + 7) / 8)), _srcstride, \
+ height, mx, my, width); \
+}
+#define mc_rep_bi_func2(name, bitd, step1, step2, W, opt) \
+void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
+ ptrdiff_t _srcstride, int16_t* src2, \
+ int height, intptr_t mx, intptr_t my, int width) \
+{ \
+ ff_hevc_put_hevc_bi_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, height, mx, my, width);\
+ ff_hevc_put_hevc_bi_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride, \
+ src + (step1 * ((bitd + 7) / 8)), _srcstride, \
+ src2 + step1, height, mx, my, width); \
+}
+
+#define mc_rep_funcs2(name, bitd, step1, step2, W, opt) \
+ mc_rep_func2(name, bitd, step1, step2, W, opt) \
+ mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \
+ mc_rep_bi_func2(name, bitd, step1, step2, W, opt)
+
+#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
+
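+/* mc_rep_mix_* cover the widths that are not a multiple of the AVX2 block
+ * size (24 and 48): the first part runs through the AVX2 kernel, the
+ * remainder through the SSE4 one; width4 is the byte offset of that
+ * remainder for the 10-bit variants. */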
+#define mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
+void ff_hevc_put_hevc_##name##width1##_10_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride, \
+ int height, intptr_t mx, intptr_t my, int width) \
+ \
+{ \
+ ff_hevc_put_hevc_##name##width2##_10_##opt1(dst, src, _srcstride, height, mx, my, width); \
+ ff_hevc_put_hevc_##name##width3##_10_##opt2(dst+ width2, src+ width4, _srcstride, height, mx, my, width); \
+}
+
+#define mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
+void ff_hevc_put_hevc_bi_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
+ ptrdiff_t _srcstride, int16_t *src2, \
+ int height, intptr_t mx, intptr_t my, int width) \
{ \
- const ptrdiff_t stride = FFALIGN(width + 7, 8); \
- ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - 3 * srcstride, srcstride, \
- height + 7, mx, my, mcbuffer); \
- ff_hevc_qpel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + 3 * stride, 2 * stride, \
- height, mx, my, mcbuffer); \
+ ff_hevc_put_hevc_bi_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, src2, \
+ height, mx, my, width); \
+ ff_hevc_put_hevc_bi_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, src2+width2,\
+ height, mx, my, width); \
}
-#else
-#define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
-#endif /* ARCH_X86_64 && HAVE_AVX_EXTERNAL */
-
-#define QPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv) \
-void ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride, \
- uint8_t *src, ptrdiff_t srcstride, \
- int height, int mx, int my, int16_t *mcbuffer); \
-void ff_hevc_qpel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride, \
- uint8_t *src, ptrdiff_t srcstride, \
- int height, int mx, int my, int16_t *mcbuffer); \
-QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
-
-QPEL_FUNCS(4, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(8, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(12, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(16, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(24, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(32, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(48, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(64, 8, ssse3, ssse3, avx)
-
-QPEL_FUNCS(4, 10, avx, avx, avx)
-QPEL_FUNCS(8, 10, avx, avx, avx)
-QPEL_FUNCS(12, 10, avx, avx, avx)
-QPEL_FUNCS(16, 10, avx, avx, avx)
-QPEL_FUNCS(24, 10, avx, avx, avx)
-QPEL_FUNCS(32, 10, avx, avx, avx)
-QPEL_FUNCS(48, 10, avx, avx, avx)
-QPEL_FUNCS(64, 10, avx, avx, avx)
-
-#if ARCH_X86_64 && HAVE_AVX_EXTERNAL
-#define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv) \
-static void hevc_epel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride, \
- uint8_t *src, ptrdiff_t srcstride, \
- int height, int mx, int my, int16_t *mcbuffer) \
+
+#define mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
+void ff_hevc_put_hevc_uni_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, \
+ uint8_t *src, ptrdiff_t _srcstride, int height, \
+ intptr_t mx, intptr_t my, int width) \
{ \
- const ptrdiff_t stride = FFALIGN(width + 3, 8); \
- ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - srcstride, srcstride, \
- height + 3, mx, my, mcbuffer); \
- ff_hevc_epel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + stride, 2 * stride, \
- height, mx, my, mcbuffer); \
+ ff_hevc_put_hevc_uni_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, \
+ height, mx, my, width); \
+ ff_hevc_put_hevc_uni_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, \
+ height, mx, my, width); \
}
-#else
-#define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
-#endif /* ARCH_X86_64 && HAVE_AVX_EXTERNAL */
-
-#define EPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv) \
-void ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride, \
- uint8_t *src, ptrdiff_t srcstride, \
- int height, int mx, int my, int16_t *mcbuffer); \
-void ff_hevc_epel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride, \
- uint8_t *src, ptrdiff_t srcstride, \
- int height, int mx, int my, int16_t *mcbuffer); \
-EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
-
-EPEL_FUNCS(4, 8, ssse3, ssse3, avx)
-EPEL_FUNCS(8, 8, ssse3, ssse3, avx)
-EPEL_FUNCS(12, 8, ssse3, ssse3, avx)
-EPEL_FUNCS(16, 8, ssse3, ssse3, avx)
-EPEL_FUNCS(24, 8, ssse3, ssse3, avx)
-EPEL_FUNCS(32, 8, ssse3, ssse3, avx)
-
-EPEL_FUNCS(4, 10, avx, avx, avx)
-EPEL_FUNCS(8, 10, avx, avx, avx)
-EPEL_FUNCS(12, 10, avx, avx, avx)
-EPEL_FUNCS(16, 10, avx, avx, avx)
-EPEL_FUNCS(24, 10, avx, avx, avx)
-EPEL_FUNCS(32, 10, avx, avx, avx)
-
-#define PUT_PRED(width, depth, cf_uw, cf_w) \
-void ff_hevc_put_unweighted_pred_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride, \
- int16_t *src, ptrdiff_t srcstride, \
- int height); \
-void ff_hevc_put_unweighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride, \
- int16_t *src1, int16_t *src2, \
- ptrdiff_t srcstride, int height); \
-void ff_hevc_put_weighted_pred_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight, int16_t offset, \
- uint8_t *dst, ptrdiff_t dststride, \
- int16_t *src, ptrdiff_t srcstride, \
- int height); \
-void ff_hevc_put_weighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight0, int16_t weight1, \
- int16_t offset0, int16_t offset1, \
- uint8_t *dst, ptrdiff_t dststride, \
- int16_t *src0, int16_t *src1, ptrdiff_t srcstride, \
- int height);
-
-PUT_PRED(4, 8, sse2, sse4)
-PUT_PRED(8, 8, sse2, sse4)
-PUT_PRED(12, 8, sse2, sse4)
-PUT_PRED(16, 8, sse2, sse4)
-PUT_PRED(24, 8, sse2, sse4)
-PUT_PRED(32, 8, sse2, sse4)
-PUT_PRED(48, 8, sse2, sse4)
-PUT_PRED(64, 8, sse2, sse4)
-
-PUT_PRED(4, 10, sse2, sse4)
-PUT_PRED(8, 10, sse2, sse4)
-PUT_PRED(12, 10, sse2, sse4)
-PUT_PRED(16, 10, sse2, sse4)
-PUT_PRED(24, 10, sse2, sse4)
-PUT_PRED(32, 10, sse2, sse4)
-PUT_PRED(48, 10, sse2, sse4)
-PUT_PRED(64, 10, sse2, sse4)
+
+#define mc_rep_mixs_10(name, width1, width2, width3, opt1, opt2, width4) \
+mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
+mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
+mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)
+
+#define mc_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
+void ff_hevc_put_hevc_##name##width1##_8_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride, \
+ int height, intptr_t mx, intptr_t my, int width) \
+ \
+{ \
+ ff_hevc_put_hevc_##name##width2##_8_##opt1(dst, src, _srcstride, height, mx, my, width); \
+ ff_hevc_put_hevc_##name##width3##_8_##opt2(dst+ width2, src+ width2, _srcstride, height, mx, my, width); \
+}
+
+#define mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
+void ff_hevc_put_hevc_bi_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
+ ptrdiff_t _srcstride, int16_t* src2, \
+ int height, intptr_t mx, intptr_t my, int width) \
+{ \
+ ff_hevc_put_hevc_bi_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \
+ src2, height, mx, my, width); \
+ ff_hevc_put_hevc_bi_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride, \
+ src2+width2, height, mx, my, width); \
+}
+
+#define mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
+void ff_hevc_put_hevc_uni_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, \
+ uint8_t *src, ptrdiff_t _srcstride, int height, \
+ intptr_t mx, intptr_t my, int width) \
+{ \
+ ff_hevc_put_hevc_uni_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \
+ height, mx, my, width); \
+ ff_hevc_put_hevc_uni_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride, \
+ height, mx, my, width); \
+}
+
+#define mc_rep_mixs_8(name, width1, width2, width3, opt1, opt2) \
+mc_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
+mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
+mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)
+
+#if HAVE_AVX2_EXTERNAL
+
+mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4)
+mc_rep_mixs_8(epel_hv, 48, 32, 16, avx2, sse4)
+mc_rep_mixs_8(epel_h , 48, 32, 16, avx2, sse4)
+mc_rep_mixs_8(epel_v , 48, 32, 16, avx2, sse4)
+
+mc_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32)
+mc_bi_rep_mix_10(pel_pixels,24, 16, 8, avx2, sse4, 32)
+mc_rep_mixs_10(epel_hv, 24, 16, 8, avx2, sse4, 32)
+mc_rep_mixs_10(epel_h , 24, 16, 8, avx2, sse4, 32)
+mc_rep_mixs_10(epel_v , 24, 16, 8, avx2, sse4, 32)
+
+
+mc_rep_mixs_10(qpel_h , 24, 16, 8, avx2, sse4, 32)
+mc_rep_mixs_10(qpel_v , 24, 16, 8, avx2, sse4, 32)
+mc_rep_mixs_10(qpel_hv, 24, 16, 8, avx2, sse4, 32)
+
+
+mc_rep_uni_func(pel_pixels, 8, 64, 128, avx2) // used for 10bit
+mc_rep_uni_func(pel_pixels, 8, 32, 96, avx2)  // used for 10bit
+
+mc_rep_funcs(pel_pixels, 8, 32, 64, avx2)
+
+mc_rep_func(pel_pixels, 10, 16, 32, avx2)
+mc_rep_func(pel_pixels, 10, 16, 48, avx2)
+mc_rep_func(pel_pixels, 10, 32, 64, avx2)
+
+mc_rep_bi_func(pel_pixels, 10, 16, 32, avx2)
+mc_rep_bi_func(pel_pixels, 10, 16, 48, avx2)
+mc_rep_bi_func(pel_pixels, 10, 32, 64, avx2)
+
+mc_rep_funcs(epel_h, 8, 32, 64, avx2)
+
+mc_rep_funcs(epel_v, 8, 32, 64, avx2)
+
+mc_rep_funcs(epel_h, 10, 16, 32, avx2)
+mc_rep_funcs(epel_h, 10, 16, 48, avx2)
+mc_rep_funcs(epel_h, 10, 32, 64, avx2)
+
+mc_rep_funcs(epel_v, 10, 16, 32, avx2)
+mc_rep_funcs(epel_v, 10, 16, 48, avx2)
+mc_rep_funcs(epel_v, 10, 32, 64, avx2)
+
+
+mc_rep_funcs(epel_hv, 8, 32, 64, avx2)
+
+mc_rep_funcs(epel_hv, 10, 16, 32, avx2)
+mc_rep_funcs(epel_hv, 10, 16, 48, avx2)
+mc_rep_funcs(epel_hv, 10, 32, 64, avx2)
+
+mc_rep_funcs(qpel_h, 8, 32, 64, avx2)
+mc_rep_mixs_8(qpel_h , 48, 32, 16, avx2, sse4)
+
+mc_rep_funcs(qpel_v, 8, 32, 64, avx2)
+mc_rep_mixs_8(qpel_v, 48, 32, 16, avx2, sse4)
+
+mc_rep_funcs(qpel_h, 10, 16, 32, avx2)
+mc_rep_funcs(qpel_h, 10, 16, 48, avx2)
+mc_rep_funcs(qpel_h, 10, 32, 64, avx2)
+
+mc_rep_funcs(qpel_v, 10, 16, 32, avx2)
+mc_rep_funcs(qpel_v, 10, 16, 48, avx2)
+mc_rep_funcs(qpel_v, 10, 32, 64, avx2)
+
+mc_rep_funcs(qpel_hv, 10, 16, 32, avx2)
+mc_rep_funcs(qpel_hv, 10, 16, 48, avx2)
+mc_rep_funcs(qpel_hv, 10, 32, 64, avx2)
+
+#endif //AVX2
+
+mc_rep_funcs(pel_pixels, 8, 16, 64, sse4)
+mc_rep_funcs(pel_pixels, 8, 16, 48, sse4)
+mc_rep_funcs(pel_pixels, 8, 16, 32, sse4)
+mc_rep_funcs(pel_pixels, 8, 8, 24, sse4)
+mc_rep_funcs(pel_pixels,10, 8, 64, sse4)
+mc_rep_funcs(pel_pixels,10, 8, 48, sse4)
+mc_rep_funcs(pel_pixels,10, 8, 32, sse4)
+mc_rep_funcs(pel_pixels,10, 8, 24, sse4)
+mc_rep_funcs(pel_pixels,10, 8, 16, sse4)
+mc_rep_funcs(pel_pixels,10, 4, 12, sse4)
+mc_rep_funcs(pel_pixels,12, 8, 64, sse4)
+mc_rep_funcs(pel_pixels,12, 8, 48, sse4)
+mc_rep_funcs(pel_pixels,12, 8, 32, sse4)
+mc_rep_funcs(pel_pixels,12, 8, 24, sse4)
+mc_rep_funcs(pel_pixels,12, 8, 16, sse4)
+mc_rep_funcs(pel_pixels,12, 4, 12, sse4)
+
+mc_rep_funcs(epel_h, 8, 16, 64, sse4)
+mc_rep_funcs(epel_h, 8, 16, 48, sse4)
+mc_rep_funcs(epel_h, 8, 16, 32, sse4)
+mc_rep_funcs(epel_h, 8, 8, 24, sse4)
+mc_rep_funcs(epel_h,10, 8, 64, sse4)
+mc_rep_funcs(epel_h,10, 8, 48, sse4)
+mc_rep_funcs(epel_h,10, 8, 32, sse4)
+mc_rep_funcs(epel_h,10, 8, 24, sse4)
+mc_rep_funcs(epel_h,10, 8, 16, sse4)
+mc_rep_funcs(epel_h,10, 4, 12, sse4)
+mc_rep_funcs(epel_h,12, 8, 64, sse4)
+mc_rep_funcs(epel_h,12, 8, 48, sse4)
+mc_rep_funcs(epel_h,12, 8, 32, sse4)
+mc_rep_funcs(epel_h,12, 8, 24, sse4)
+mc_rep_funcs(epel_h,12, 8, 16, sse4)
+mc_rep_funcs(epel_h,12, 4, 12, sse4)
+mc_rep_funcs(epel_v, 8, 16, 64, sse4)
+mc_rep_funcs(epel_v, 8, 16, 48, sse4)
+mc_rep_funcs(epel_v, 8, 16, 32, sse4)
+mc_rep_funcs(epel_v, 8, 8, 24, sse4)
+mc_rep_funcs(epel_v,10, 8, 64, sse4)
+mc_rep_funcs(epel_v,10, 8, 48, sse4)
+mc_rep_funcs(epel_v,10, 8, 32, sse4)
+mc_rep_funcs(epel_v,10, 8, 24, sse4)
+mc_rep_funcs(epel_v,10, 8, 16, sse4)
+mc_rep_funcs(epel_v,10, 4, 12, sse4)
+mc_rep_funcs(epel_v,12, 8, 64, sse4)
+mc_rep_funcs(epel_v,12, 8, 48, sse4)
+mc_rep_funcs(epel_v,12, 8, 32, sse4)
+mc_rep_funcs(epel_v,12, 8, 24, sse4)
+mc_rep_funcs(epel_v,12, 8, 16, sse4)
+mc_rep_funcs(epel_v,12, 4, 12, sse4)
+mc_rep_funcs(epel_hv, 8, 16, 64, sse4)
+mc_rep_funcs(epel_hv, 8, 16, 48, sse4)
+mc_rep_funcs(epel_hv, 8, 16, 32, sse4)
+mc_rep_funcs(epel_hv, 8, 8, 24, sse4)
+mc_rep_funcs2(epel_hv,8, 8, 4, 12, sse4)
+mc_rep_funcs(epel_hv,10, 8, 64, sse4)
+mc_rep_funcs(epel_hv,10, 8, 48, sse4)
+mc_rep_funcs(epel_hv,10, 8, 32, sse4)
+mc_rep_funcs(epel_hv,10, 8, 24, sse4)
+mc_rep_funcs(epel_hv,10, 8, 16, sse4)
+mc_rep_funcs(epel_hv,10, 4, 12, sse4)
+mc_rep_funcs(epel_hv,12, 8, 64, sse4)
+mc_rep_funcs(epel_hv,12, 8, 48, sse4)
+mc_rep_funcs(epel_hv,12, 8, 32, sse4)
+mc_rep_funcs(epel_hv,12, 8, 24, sse4)
+mc_rep_funcs(epel_hv,12, 8, 16, sse4)
+mc_rep_funcs(epel_hv,12, 4, 12, sse4)
+
+mc_rep_funcs(qpel_h, 8, 16, 64, sse4)
+mc_rep_funcs(qpel_h, 8, 16, 48, sse4)
+mc_rep_funcs(qpel_h, 8, 16, 32, sse4)
+mc_rep_funcs(qpel_h, 8, 8, 24, sse4)
+mc_rep_funcs(qpel_h,10, 8, 64, sse4)
+mc_rep_funcs(qpel_h,10, 8, 48, sse4)
+mc_rep_funcs(qpel_h,10, 8, 32, sse4)
+mc_rep_funcs(qpel_h,10, 8, 24, sse4)
+mc_rep_funcs(qpel_h,10, 8, 16, sse4)
+mc_rep_funcs(qpel_h,10, 4, 12, sse4)
+mc_rep_funcs(qpel_h,12, 8, 64, sse4)
+mc_rep_funcs(qpel_h,12, 8, 48, sse4)
+mc_rep_funcs(qpel_h,12, 8, 32, sse4)
+mc_rep_funcs(qpel_h,12, 8, 24, sse4)
+mc_rep_funcs(qpel_h,12, 8, 16, sse4)
+mc_rep_funcs(qpel_h,12, 4, 12, sse4)
+mc_rep_funcs(qpel_v, 8, 16, 64, sse4)
+mc_rep_funcs(qpel_v, 8, 16, 48, sse4)
+mc_rep_funcs(qpel_v, 8, 16, 32, sse4)
+mc_rep_funcs(qpel_v, 8, 8, 24, sse4)
+mc_rep_funcs(qpel_v,10, 8, 64, sse4)
+mc_rep_funcs(qpel_v,10, 8, 48, sse4)
+mc_rep_funcs(qpel_v,10, 8, 32, sse4)
+mc_rep_funcs(qpel_v,10, 8, 24, sse4)
+mc_rep_funcs(qpel_v,10, 8, 16, sse4)
+mc_rep_funcs(qpel_v,10, 4, 12, sse4)
+mc_rep_funcs(qpel_v,12, 8, 64, sse4)
+mc_rep_funcs(qpel_v,12, 8, 48, sse4)
+mc_rep_funcs(qpel_v,12, 8, 32, sse4)
+mc_rep_funcs(qpel_v,12, 8, 24, sse4)
+mc_rep_funcs(qpel_v,12, 8, 16, sse4)
+mc_rep_funcs(qpel_v,12, 4, 12, sse4)
+mc_rep_funcs(qpel_hv, 8, 8, 64, sse4)
+mc_rep_funcs(qpel_hv, 8, 8, 48, sse4)
+mc_rep_funcs(qpel_hv, 8, 8, 32, sse4)
+mc_rep_funcs(qpel_hv, 8, 8, 24, sse4)
+mc_rep_funcs(qpel_hv, 8, 8, 16, sse4)
+mc_rep_funcs2(qpel_hv,8, 8, 4, 12, sse4)
+mc_rep_funcs(qpel_hv,10, 8, 64, sse4)
+mc_rep_funcs(qpel_hv,10, 8, 48, sse4)
+mc_rep_funcs(qpel_hv,10, 8, 32, sse4)
+mc_rep_funcs(qpel_hv,10, 8, 24, sse4)
+mc_rep_funcs(qpel_hv,10, 8, 16, sse4)
+mc_rep_funcs(qpel_hv,10, 4, 12, sse4)
+mc_rep_funcs(qpel_hv,12, 8, 64, sse4)
+mc_rep_funcs(qpel_hv,12, 8, 48, sse4)
+mc_rep_funcs(qpel_hv,12, 8, 32, sse4)
+mc_rep_funcs(qpel_hv,12, 8, 24, sse4)
+mc_rep_funcs(qpel_hv,12, 8, 16, sse4)
+mc_rep_funcs(qpel_hv,12, 4, 12, sse4)
+
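+/* Same pattern for the weighted prediction kernels: mc_rep_uni_w and
+ * mc_rep_bi_w below repeat the fixed-step asm kernel across the row to cover
+ * the wider block sizes. */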
+#define mc_rep_uni_w(bitd, step, W, opt) \
+void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \
+ int height, int denom, int _wx, int _ox) \
+{ \
+ int i; \
+ int16_t *src; \
+ uint8_t *dst; \
+ for (i = 0; i < W; i += step) { \
+ src= _src + i; \
+ dst= _dst + (i * ((bitd + 7) / 8)); \
+ ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(dst, dststride, src, \
+ height, denom, _wx, _ox); \
+ } \
+}
+
+mc_rep_uni_w(8, 6, 12, sse4)
+mc_rep_uni_w(8, 8, 16, sse4)
+mc_rep_uni_w(8, 8, 24, sse4)
+mc_rep_uni_w(8, 8, 32, sse4)
+mc_rep_uni_w(8, 8, 48, sse4)
+mc_rep_uni_w(8, 8, 64, sse4)
+
+mc_rep_uni_w(10, 6, 12, sse4)
+mc_rep_uni_w(10, 8, 16, sse4)
+mc_rep_uni_w(10, 8, 24, sse4)
+mc_rep_uni_w(10, 8, 32, sse4)
+mc_rep_uni_w(10, 8, 48, sse4)
+mc_rep_uni_w(10, 8, 64, sse4)
+
+mc_rep_uni_w(12, 6, 12, sse4)
+mc_rep_uni_w(12, 8, 16, sse4)
+mc_rep_uni_w(12, 8, 24, sse4)
+mc_rep_uni_w(12, 8, 32, sse4)
+mc_rep_uni_w(12, 8, 48, sse4)
+mc_rep_uni_w(12, 8, 64, sse4)
+
+#define mc_rep_bi_w(bitd, step, W, opt) \
+void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \
+ int16_t *_src2, int height, \
+ int denom, int _wx0, int _wx1, int _ox0, int _ox1) \
+{ \
+ int i; \
+ int16_t *src; \
+ int16_t *src2; \
+ uint8_t *dst; \
+ for (i = 0; i < W; i += step) { \
+ src = _src + i; \
+ src2 = _src2 + i; \
+ dst = _dst + (i * ((bitd + 7) / 8)); \
+ ff_hevc_put_hevc_bi_w##step##_##bitd##_##opt(dst, dststride, src, src2, \
+ height, denom, _wx0, _wx1, _ox0, _ox1); \
+ } \
+}
+
+mc_rep_bi_w(8, 6, 12, sse4)
+mc_rep_bi_w(8, 8, 16, sse4)
+mc_rep_bi_w(8, 8, 24, sse4)
+mc_rep_bi_w(8, 8, 32, sse4)
+mc_rep_bi_w(8, 8, 48, sse4)
+mc_rep_bi_w(8, 8, 64, sse4)
+
+mc_rep_bi_w(10, 6, 12, sse4)
+mc_rep_bi_w(10, 8, 16, sse4)
+mc_rep_bi_w(10, 8, 24, sse4)
+mc_rep_bi_w(10, 8, 32, sse4)
+mc_rep_bi_w(10, 8, 48, sse4)
+mc_rep_bi_w(10, 8, 64, sse4)
+
+mc_rep_bi_w(12, 6, 12, sse4)
+mc_rep_bi_w(12, 8, 16, sse4)
+mc_rep_bi_w(12, 8, 24, sse4)
+mc_rep_bi_w(12, 8, 32, sse4)
+mc_rep_bi_w(12, 8, 48, sse4)
+mc_rep_bi_w(12, 8, 64, sse4)
+
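+/* mc_uni_w_func chains the two stages of weighted uni prediction: the plain
+ * MC kernel writes the intermediate 16-bit samples into a local temp buffer
+ * and the uni_w kernel then applies denom/weight/offset while storing the
+ * final pixels. */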
+#define mc_uni_w_func(name, bitd, W, opt) \
+void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \
+ uint8_t *_src, ptrdiff_t _srcstride, \
+ int height, int denom, \
+ int _wx, int _ox, \
+ intptr_t mx, intptr_t my, int width) \
+{ \
+ LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]); \
+ ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width); \
+ ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, temp, height, denom, _wx, _ox);\
+}
+
+#define mc_uni_w_funcs(name, bitd, opt) \
+ mc_uni_w_func(name, bitd, 4, opt) \
+ mc_uni_w_func(name, bitd, 8, opt) \
+ mc_uni_w_func(name, bitd, 12, opt) \
+ mc_uni_w_func(name, bitd, 16, opt) \
+ mc_uni_w_func(name, bitd, 24, opt) \
+ mc_uni_w_func(name, bitd, 32, opt) \
+ mc_uni_w_func(name, bitd, 48, opt) \
+ mc_uni_w_func(name, bitd, 64, opt)
+
+mc_uni_w_funcs(pel_pixels, 8, sse4)
+mc_uni_w_func(pel_pixels, 8, 6, sse4)
+mc_uni_w_funcs(epel_h, 8, sse4)
+mc_uni_w_func(epel_h, 8, 6, sse4)
+mc_uni_w_funcs(epel_v, 8, sse4)
+mc_uni_w_func(epel_v, 8, 6, sse4)
+mc_uni_w_funcs(epel_hv, 8, sse4)
+mc_uni_w_func(epel_hv, 8, 6, sse4)
+mc_uni_w_funcs(qpel_h, 8, sse4)
+mc_uni_w_funcs(qpel_v, 8, sse4)
+mc_uni_w_funcs(qpel_hv, 8, sse4)
+
+mc_uni_w_funcs(pel_pixels, 10, sse4)
+mc_uni_w_func(pel_pixels, 10, 6, sse4)
+mc_uni_w_funcs(epel_h, 10, sse4)
+mc_uni_w_func(epel_h, 10, 6, sse4)
+mc_uni_w_funcs(epel_v, 10, sse4)
+mc_uni_w_func(epel_v, 10, 6, sse4)
+mc_uni_w_funcs(epel_hv, 10, sse4)
+mc_uni_w_func(epel_hv, 10, 6, sse4)
+mc_uni_w_funcs(qpel_h, 10, sse4)
+mc_uni_w_funcs(qpel_v, 10, sse4)
+mc_uni_w_funcs(qpel_hv, 10, sse4)
+
+mc_uni_w_funcs(pel_pixels, 12, sse4)
+mc_uni_w_func(pel_pixels, 12, 6, sse4)
+mc_uni_w_funcs(epel_h, 12, sse4)
+mc_uni_w_func(epel_h, 12, 6, sse4)
+mc_uni_w_funcs(epel_v, 12, sse4)
+mc_uni_w_func(epel_v, 12, 6, sse4)
+mc_uni_w_funcs(epel_hv, 12, sse4)
+mc_uni_w_func(epel_hv, 12, 6, sse4)
+mc_uni_w_funcs(qpel_h, 12, sse4)
+mc_uni_w_funcs(qpel_v, 12, sse4)
+mc_uni_w_funcs(qpel_hv, 12, sse4)
+
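+/* mc_bi_w_func does the same for weighted bi prediction: the first reference
+ * goes through the temp buffer while the second is passed in as the
+ * ready-made 16-bit _src2 plane. */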
+#define mc_bi_w_func(name, bitd, W, opt) \
+void ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \
+ uint8_t *_src, ptrdiff_t _srcstride, \
+ int16_t *_src2, \
+ int height, int denom, \
+ int _wx0, int _wx1, int _ox0, int _ox1, \
+ intptr_t mx, intptr_t my, int width) \
+{ \
+ LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]); \
+ ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width); \
+ ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(_dst, _dststride, temp, _src2, \
+ height, denom, _wx0, _wx1, _ox0, _ox1); \
+}
+
+#define mc_bi_w_funcs(name, bitd, opt) \
+ mc_bi_w_func(name, bitd, 4, opt) \
+ mc_bi_w_func(name, bitd, 8, opt) \
+ mc_bi_w_func(name, bitd, 12, opt) \
+ mc_bi_w_func(name, bitd, 16, opt) \
+ mc_bi_w_func(name, bitd, 24, opt) \
+ mc_bi_w_func(name, bitd, 32, opt) \
+ mc_bi_w_func(name, bitd, 48, opt) \
+ mc_bi_w_func(name, bitd, 64, opt)
+
+mc_bi_w_funcs(pel_pixels, 8, sse4)
+mc_bi_w_func(pel_pixels, 8, 6, sse4)
+mc_bi_w_funcs(epel_h, 8, sse4)
+mc_bi_w_func(epel_h, 8, 6, sse4)
+mc_bi_w_funcs(epel_v, 8, sse4)
+mc_bi_w_func(epel_v, 8, 6, sse4)
+mc_bi_w_funcs(epel_hv, 8, sse4)
+mc_bi_w_func(epel_hv, 8, 6, sse4)
+mc_bi_w_funcs(qpel_h, 8, sse4)
+mc_bi_w_funcs(qpel_v, 8, sse4)
+mc_bi_w_funcs(qpel_hv, 8, sse4)
+
+mc_bi_w_funcs(pel_pixels, 10, sse4)
+mc_bi_w_func(pel_pixels, 10, 6, sse4)
+mc_bi_w_funcs(epel_h, 10, sse4)
+mc_bi_w_func(epel_h, 10, 6, sse4)
+mc_bi_w_funcs(epel_v, 10, sse4)
+mc_bi_w_func(epel_v, 10, 6, sse4)
+mc_bi_w_funcs(epel_hv, 10, sse4)
+mc_bi_w_func(epel_hv, 10, 6, sse4)
+mc_bi_w_funcs(qpel_h, 10, sse4)
+mc_bi_w_funcs(qpel_v, 10, sse4)
+mc_bi_w_funcs(qpel_hv, 10, sse4)
+
+mc_bi_w_funcs(pel_pixels, 12, sse4)
+mc_bi_w_func(pel_pixels, 12, 6, sse4)
+mc_bi_w_funcs(epel_h, 12, sse4)
+mc_bi_w_func(epel_h, 12, 6, sse4)
+mc_bi_w_funcs(epel_v, 12, sse4)
+mc_bi_w_func(epel_v, 12, 6, sse4)
+mc_bi_w_funcs(epel_hv, 12, sse4)
+mc_bi_w_func(epel_hv, 12, 6, sse4)
+mc_bi_w_funcs(qpel_h, 12, sse4)
+mc_bi_w_funcs(qpel_v, 12, sse4)
+mc_bi_w_funcs(qpel_hv, 12, sse4)
+#endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL
+
+#define SAO_BAND_FILTER_FUNCS(bitd, opt) \
+void ff_hevc_sao_band_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+ int16_t *sao_offset_val, int sao_left_class, int width, int height); \
+void ff_hevc_sao_band_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+ int16_t *sao_offset_val, int sao_left_class, int width, int height); \
+void ff_hevc_sao_band_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+ int16_t *sao_offset_val, int sao_left_class, int width, int height); \
+void ff_hevc_sao_band_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+ int16_t *sao_offset_val, int sao_left_class, int width, int height); \
+void ff_hevc_sao_band_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+
+SAO_BAND_FILTER_FUNCS(8, sse2)
+SAO_BAND_FILTER_FUNCS(10, sse2)
+SAO_BAND_FILTER_FUNCS(12, sse2)
+SAO_BAND_FILTER_FUNCS(8, avx)
+SAO_BAND_FILTER_FUNCS(10, avx)
+SAO_BAND_FILTER_FUNCS(12, avx)
+SAO_BAND_FILTER_FUNCS(8, avx2)
+SAO_BAND_FILTER_FUNCS(10, avx2)
+SAO_BAND_FILTER_FUNCS(12, avx2)
+
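+/* sao_band_filter[0..4] (and sao_edge_filter below) are indexed by block
+ * width: 8, 16, 32, 48 and 64 samples. */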
+#define SAO_BAND_INIT(bitd, opt) do { \
+ c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_##bitd##_##opt; \
+ c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_##bitd##_##opt; \
+ c->sao_band_filter[2] = ff_hevc_sao_band_filter_32_##bitd##_##opt; \
+ c->sao_band_filter[3] = ff_hevc_sao_band_filter_48_##bitd##_##opt; \
+ c->sao_band_filter[4] = ff_hevc_sao_band_filter_64_##bitd##_##opt; \
+} while (0)
+
+#define SAO_EDGE_FILTER_FUNCS(bitd, opt) \
+void ff_hevc_sao_edge_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+ int eo, int width, int height); \
+void ff_hevc_sao_edge_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+ int eo, int width, int height); \
+void ff_hevc_sao_edge_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+ int eo, int width, int height); \
+void ff_hevc_sao_edge_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+ int eo, int width, int height); \
+void ff_hevc_sao_edge_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+ int eo, int width, int height); \
+
+SAO_EDGE_FILTER_FUNCS(8, ssse3)
+SAO_EDGE_FILTER_FUNCS(8, avx2)
+SAO_EDGE_FILTER_FUNCS(10, sse2)
+SAO_EDGE_FILTER_FUNCS(10, avx2)
+SAO_EDGE_FILTER_FUNCS(12, sse2)
+SAO_EDGE_FILTER_FUNCS(12, avx2)
+
+#define SAO_EDGE_INIT(bitd, opt) do { \
+ c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_8_##bitd##_##opt; \
+ c->sao_edge_filter[1] = ff_hevc_sao_edge_filter_16_##bitd##_##opt; \
+ c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_##bitd##_##opt; \
+ c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_##bitd##_##opt; \
+ c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_##bitd##_##opt; \
+} while (0)
+
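+/* EPEL_LINKS/QPEL_LINKS fill one (my, mx) slot of the width-indexed MC
+ * tables; indices 1..9 map to widths 4, 6, 8, 12, 16, 24, 32, 48 and 64
+ * (QPEL_LINKS has no width-6 entry). */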
+#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt ) \
+ PEL_LINK(pointer, 1, my , mx , fname##4 , bitd, opt ); \
+ PEL_LINK(pointer, 2, my , mx , fname##6 , bitd, opt ); \
+ PEL_LINK(pointer, 3, my , mx , fname##8 , bitd, opt ); \
+ PEL_LINK(pointer, 4, my , mx , fname##12, bitd, opt ); \
+ PEL_LINK(pointer, 5, my , mx , fname##16, bitd, opt ); \
+ PEL_LINK(pointer, 6, my , mx , fname##24, bitd, opt ); \
+ PEL_LINK(pointer, 7, my , mx , fname##32, bitd, opt ); \
+ PEL_LINK(pointer, 8, my , mx , fname##48, bitd, opt ); \
+ PEL_LINK(pointer, 9, my , mx , fname##64, bitd, opt )
+#define QPEL_LINKS(pointer, my, mx, fname, bitd, opt) \
+ PEL_LINK(pointer, 1, my , mx , fname##4 , bitd, opt ); \
+ PEL_LINK(pointer, 3, my , mx , fname##8 , bitd, opt ); \
+ PEL_LINK(pointer, 4, my , mx , fname##12, bitd, opt ); \
+ PEL_LINK(pointer, 5, my , mx , fname##16, bitd, opt ); \
+ PEL_LINK(pointer, 6, my , mx , fname##24, bitd, opt ); \
+ PEL_LINK(pointer, 7, my , mx , fname##32, bitd, opt ); \
+ PEL_LINK(pointer, 8, my , mx , fname##48, bitd, opt ); \
+ PEL_LINK(pointer, 9, my , mx , fname##64, bitd, opt )
void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
{
int cpu_flags = av_get_cpu_flags();
-#define SET_LUMA_FUNCS(tabname, funcname, depth, cf) \
- c->tabname[0] = funcname ## _4_ ## depth ## _ ## cf; \
- c->tabname[1] = funcname ## _8_ ## depth ## _ ## cf; \
- c->tabname[2] = funcname ## _12_ ## depth ## _ ## cf; \
- c->tabname[3] = funcname ## _16_ ## depth ## _ ## cf; \
- c->tabname[4] = funcname ## _24_ ## depth ## _ ## cf; \
- c->tabname[5] = funcname ## _32_ ## depth ## _ ## cf; \
- c->tabname[6] = funcname ## _48_ ## depth ## _ ## cf; \
- c->tabname[7] = funcname ## _64_ ## depth ## _ ## cf;
-
-#define SET_CHROMA_FUNCS(tabname, funcname, depth, cf) \
- c->tabname[1] = funcname ## _4_ ## depth ## _ ## cf; \
- c->tabname[3] = funcname ## _8_ ## depth ## _ ## cf; \
- c->tabname[4] = funcname ## _12_ ## depth ## _ ## cf; \
- c->tabname[5] = funcname ## _16_ ## depth ## _ ## cf; \
- c->tabname[6] = funcname ## _24_ ## depth ## _ ## cf; \
- c->tabname[7] = funcname ## _32_ ## depth ## _ ## cf;
-
-#define SET_QPEL_FUNCS(v, h, depth, cf, name) SET_LUMA_FUNCS (put_hevc_qpel[v][h], name, depth, cf)
-#define SET_EPEL_FUNCS(v, h, depth, cf, name) SET_CHROMA_FUNCS(put_hevc_epel[v][h], name, depth, cf)
-
if (bit_depth == 8) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext;
@@ -303,10 +719,14 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
if (EXTERNAL_SSE2(cpu_flags)) {
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
- c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
- c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
- c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
+ c->idct[2] = ff_hevc_idct_16x16_8_sse2;
+ c->idct[3] = ff_hevc_idct_32x32_8_sse2;
+ }
+ SAO_BAND_INIT(8, sse2);
c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
@@ -315,41 +735,166 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->idct[0] = ff_hevc_idct_4x4_8_sse2;
c->idct[1] = ff_hevc_idct_8x8_8_sse2;
- SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
- SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
-
- SET_LUMA_FUNCS(put_unweighted_pred, ff_hevc_put_unweighted_pred, 8, sse2);
- SET_LUMA_FUNCS(put_unweighted_pred_avg, ff_hevc_put_unweighted_pred_avg, 8, sse2);
- SET_CHROMA_FUNCS(put_unweighted_pred_chroma, ff_hevc_put_unweighted_pred, 8, sse2);
- SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 8, sse2);
+ c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
+ c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
+ c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
- SET_QPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_qpel_h);
- SET_QPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_qpel_v);
- SET_EPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_epel_h);
- SET_EPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_epel_v);
+ if(ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
+ }
+ SAO_EDGE_INIT(8, ssse3);
+ }
+ if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+ EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 8, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 8, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 8, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 8, sse4);
+
+ QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 8, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4);
}
if (EXTERNAL_AVX(cpu_flags)) {
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
+
+ c->idct[2] = ff_hevc_idct_16x16_8_avx;
+ c->idct[3] = ff_hevc_idct_32x32_8_avx;
+ }
+ SAO_BAND_INIT(8, avx);
+
c->idct[0] = ff_hevc_idct_4x4_8_avx;
c->idct[1] = ff_hevc_idct_8x8_8_avx;
+
c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
}
if (EXTERNAL_AVX2(cpu_flags)) {
+ c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
+ c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2;
+ }
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
+ c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
+ if (ARCH_X86_64) {
+ c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
+ c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
+ c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
+
+ c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
+ c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
+ c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
+
+ c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+ c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+ c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+
+ c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+ c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+ c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+
+ c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
+ c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
+ c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
+
+ c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
+ c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
+ c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
+
+ c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
+ c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
+ c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
+
+ c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
+ c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
+ c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
+
+ c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
+ c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
+ c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
+
+ c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
+ c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
+ c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
+
+ c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
+ c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
+ c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
+
+ c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
+ c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
+ c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
+
+ c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
+ c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
+ c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
+
+ c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
+ c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
+ c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
+
+ c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
+ c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
+ c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
+
+ c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
+ c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
+ c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
+
+ c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
+ c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
+ c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
+
+ c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
+ c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
+ c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
+
+ c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
+ c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
+ c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
+
+ c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
+ c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
+ c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
+
+ c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
+ c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
+ c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
+ }
+ SAO_BAND_INIT(8, avx2);
+
+ c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
+ c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
+ c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
+
c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
+ c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext;
-
- c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
+
+ c->idct[2] = ff_hevc_idct_16x16_10_sse2;
+ c->idct[3] = ff_hevc_idct_32x32_10_sse2;
+ }
+ SAO_BAND_INIT(10, sse2);
+ SAO_EDGE_INIT(10, sse2);
c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_sse2;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_sse2;
@@ -357,89 +902,250 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->idct[0] = ff_hevc_idct_4x4_10_sse2;
c->idct[1] = ff_hevc_idct_8x8_10_sse2;
- SET_QPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
- SET_EPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
-
- SET_LUMA_FUNCS(put_unweighted_pred, ff_hevc_put_unweighted_pred, 10, sse2);
- SET_LUMA_FUNCS(put_unweighted_pred_avg, ff_hevc_put_unweighted_pred_avg, 10, sse2);
- SET_CHROMA_FUNCS(put_unweighted_pred_chroma, ff_hevc_put_unweighted_pred, 10, sse2);
- SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 10, sse2);
c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
}
+ if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
+ }
+ if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+ EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 10, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 10, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 10, sse4);
+
+ QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 10, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 10, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 10, sse4);
+ }
if (EXTERNAL_AVX(cpu_flags)) {
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
+
+ c->idct[2] = ff_hevc_idct_16x16_10_avx;
+ c->idct[3] = ff_hevc_idct_32x32_10_avx;
+ }
+
c->idct[0] = ff_hevc_idct_4x4_10_avx;
c->idct[1] = ff_hevc_idct_8x8_10_avx;
+
+ SAO_BAND_INIT(10, avx);
}
if (EXTERNAL_AVX2(cpu_flags)) {
- c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
- c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
+ c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2;
}
- }
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
+ c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
+ if (ARCH_X86_64) {
+ c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
+ c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
+ c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
+ c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
+ c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
-#if ARCH_X86_64
- if (bit_depth == 8) {
- if (EXTERNAL_SSE2(cpu_flags)) {
- c->idct[2] = ff_hevc_idct_16x16_8_sse2;
- c->idct[3] = ff_hevc_idct_32x32_8_sse2;
- }
- if (EXTERNAL_SSSE3(cpu_flags)) {
- c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
- c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
- }
+ c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
+ c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
+ c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
+ c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
+ c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
- if (EXTERNAL_SSE4(cpu_flags)) {
- SET_LUMA_FUNCS(weighted_pred, ff_hevc_put_weighted_pred, 8, sse4);
- SET_CHROMA_FUNCS(weighted_pred_chroma, ff_hevc_put_weighted_pred, 8, sse4);
- SET_LUMA_FUNCS(weighted_pred_avg, ff_hevc_put_weighted_pred_avg, 8, sse4);
- SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 8, sse4);
- }
+ c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+ c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+ c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+ c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
+ c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
- if (EXTERNAL_AVX(cpu_flags)) {
-#if HAVE_AVX_EXTERNAL
- SET_QPEL_FUNCS(1, 1, 8, avx, hevc_qpel_hv);
- SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv);
-#endif /* HAVE_AVX_EXTERNAL */
- c->idct[2] = ff_hevc_idct_16x16_8_avx;
- c->idct[3] = ff_hevc_idct_32x32_8_avx;
+ c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+ c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+ c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+ c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
+ c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
+
+ c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
+ c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
+ c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
+ c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
+ c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
+ c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
+ c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
+ c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
+ c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
+ c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
+
+ c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
+ c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
+ c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
+ c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
+ c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
+
+ c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
+ c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
+ c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
+ c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
+ c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
+
+ c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
+ c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
+ c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
+ c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
+ c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
+
+ c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
+ c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
+ c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
+ c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
+ c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
+
+ c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
+ c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
+ c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
+ c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
+ c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
+
+ c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
+ c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
+ c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
+ c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
+ c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
+
+ c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
+ c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
+ c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
+ c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
+ c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
+
+ c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
+ c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
+ c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
+ c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
+ c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
+
+ c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
+ c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
+ c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
+ c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
+ c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
+
+ c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
+ c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
+ c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
+ c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
+ c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
+
+ c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
+ c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
+ c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
+ c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
+ c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
+
+ c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
+ c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
+ c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
+ c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
+ c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
+
+ c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
+ c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
+ c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
+ c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
+ c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
+
+ c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
+ c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
+ c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
+ c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
+ c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
+
+ c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
+ c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
+ c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
+ c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
+ c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
+
+ c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
+ c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
+ c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
+ c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
+ c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
+
+ c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
+ c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
+ c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
+ c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
+ c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
+
+ c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
+ c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
+ c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
+ c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
+ c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
+ }
+ SAO_BAND_INIT(10, avx2);
+ SAO_EDGE_INIT(10, avx2);
+
+ c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
+ c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
}
- if (EXTERNAL_AVX2(cpu_flags)) {
- c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
- c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
+ } else if (bit_depth == 12) {
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ c->idct_dc[0] = ff_hevc_idct_4x4_dc_12_mmxext;
+ c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_mmxext;
}
- } else if (bit_depth == 10) {
if (EXTERNAL_SSE2(cpu_flags)) {
- c->idct[2] = ff_hevc_idct_16x16_10_sse2;
- c->idct[3] = ff_hevc_idct_32x32_10_sse2;
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
+ }
+ SAO_BAND_INIT(12, sse2);
+ SAO_EDGE_INIT(12, sse2);
+
+ c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_sse2;
+ c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
+ c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
}
- if (EXTERNAL_SSSE3(cpu_flags)) {
- c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
- c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
+ if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
}
- if (EXTERNAL_SSE4(cpu_flags)) {
- SET_LUMA_FUNCS(weighted_pred, ff_hevc_put_weighted_pred, 10, sse4);
- SET_CHROMA_FUNCS(weighted_pred_chroma, ff_hevc_put_weighted_pred, 10, sse4);
- SET_LUMA_FUNCS(weighted_pred_avg, ff_hevc_put_weighted_pred_avg, 10, sse4);
- SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 10, sse4);
+ if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+ EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 12, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 12, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 12, sse4);
+
+ QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 12, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 12, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 12, sse4);
}
if (EXTERNAL_AVX(cpu_flags)) {
-#if HAVE_AVX_EXTERNAL
- SET_QPEL_FUNCS(0, 1, 10, avx, ff_hevc_qpel_h);
- SET_QPEL_FUNCS(1, 0, 10, avx, ff_hevc_qpel_v);
- SET_QPEL_FUNCS(1, 1, 10, avx, hevc_qpel_hv);
- SET_EPEL_FUNCS(0, 1, 10, avx, ff_hevc_epel_h);
- SET_EPEL_FUNCS(1, 0, 10, avx, ff_hevc_epel_v);
- SET_EPEL_FUNCS(1, 1, 10, avx, hevc_epel_hv);
-#endif /* HAVE_AVX_EXTERNAL */
- c->idct[2] = ff_hevc_idct_16x16_10_avx;
- c->idct[3] = ff_hevc_idct_32x32_10_avx;
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
+ }
+ SAO_BAND_INIT(12, avx);
}
if (EXTERNAL_AVX2(cpu_flags)) {
- c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
- c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
+ c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2;
+ }
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_avx2;
+ c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_avx2;
+
+ SAO_BAND_INIT(12, avx2);
+ SAO_EDGE_INIT(12, avx2);
}
}
-#endif /* ARCH_X86_64 */
}
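
The rewritten ff_hevc_dsp_init_x86() above follows the usual FFmpeg dispatch pattern: within each bit-depth branch the CPU-feature checks run from weakest to strongest, so a later, faster implementation simply overwrites the pointer installed by an earlier one, and helpers such as SAO_BAND_INIT/SAO_EDGE_INIT are do { ... } while (0) macros so they act as single statements inside an if. A minimal standalone sketch of that pattern, using purely hypothetical names rather than the FFmpeg API:

#include <stdio.h>

typedef void (*sao_fn)(unsigned char *dst, const unsigned char *src, int width, int height);

static void sao_band_8_c(unsigned char *d, const unsigned char *s, int w, int h)    { (void)d; (void)s; (void)w; (void)h; puts("C");    }
static void sao_band_8_sse2(unsigned char *d, const unsigned char *s, int w, int h) { (void)d; (void)s; (void)w; (void)h; puts("SSE2"); }
static void sao_band_8_avx2(unsigned char *d, const unsigned char *s, int w, int h) { (void)d; (void)s; (void)w; (void)h; puts("AVX2"); }

enum { FLAG_SSE2 = 1, FLAG_AVX2 = 2 };

/* do { ... } while (0) mirrors SAO_BAND_INIT: the initializer can be used
 * as a single statement inside an if without extra braces. */
#define INIT_SAO_TABLE(tab, fn) do { (tab)[0] = (fn); } while (0)

static void init_sao(sao_fn *tab, int cpu_flags)
{
    INIT_SAO_TABLE(tab, sao_band_8_c);        /* always-valid C fallback       */
    if (cpu_flags & FLAG_SSE2)
        INIT_SAO_TABLE(tab, sao_band_8_sse2); /* overwritten by AVX2 below     */
    if (cpu_flags & FLAG_AVX2)
        INIT_SAO_TABLE(tab, sao_band_8_avx2); /* strongest available flag wins */
}

int main(void)
{
    sao_fn tab[1];
    init_sao(tab, FLAG_SSE2 | FLAG_AVX2);
    tab[0](0, 0, 8, 8);                       /* prints "AVX2"                 */
    return 0;
}
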
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 8e211140ca..ce5d7a4e28 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -1,20 +1,27 @@
;******************************************************************************
+;*
+;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
+;* Copyright (c) Nick Kurshev <nickols_k@mail.ru>
+;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
+;* Copyright (c) 2013 Daniel Kang
+;*
;* SIMD-optimized halfpel functions
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -22,26 +29,49 @@
SECTION_RODATA
cextern pb_1
+cextern pw_2
+pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+pb_interleave8: db 0, 4, 1, 5, 2, 6, 3, 7
+
+cextern pw_8192
SECTION .text
; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS8_X2 0
+%if cpuflag(sse2)
+cglobal put_pixels16_x2, 4,5,4
+%else
cglobal put_pixels8_x2, 4,5
+%endif
lea r4, [r2*2]
.loop:
- mova m0, [r1]
- mova m1, [r1+r2]
- PAVGB m0, [r1+1]
- PAVGB m1, [r1+r2+1]
+ movu m0, [r1+1]
+ movu m1, [r1+r2+1]
+%if cpuflag(sse2)
+ movu m2, [r1]
+ movu m3, [r1+r2]
+ pavgb m0, m2
+ pavgb m1, m3
+%else
+ PAVGB m0, [r1]
+ PAVGB m1, [r1+r2]
+%endif
mova [r0], m0
mova [r0+r2], m1
add r1, r4
add r0, r4
- mova m0, [r1]
- mova m1, [r1+r2]
- PAVGB m0, [r1+1]
- PAVGB m1, [r1+r2+1]
+ movu m0, [r1+1]
+ movu m1, [r1+r2+1]
+%if cpuflag(sse2)
+ movu m2, [r1]
+ movu m3, [r1+r2]
+ pavgb m0, m2
+ pavgb m1, m3
+%else
+ PAVGB m0, [r1]
+ PAVGB m1, [r1+r2]
+%endif
add r1, r4
mova [r0], m0
mova [r0+r2], m1
@@ -99,6 +129,9 @@ INIT_MMX mmxext
PUT_PIXELS_16
INIT_MMX 3dnow
PUT_PIXELS_16
+; The PUT_PIXELS8_X2 macro above also generates the 16-pixel SSE2 version
+INIT_XMM sse2
+PUT_PIXELS8_X2
; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -144,20 +177,24 @@ PUT_NO_RND_PIXELS8_X2
; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS8_Y2 0
+%if cpuflag(sse2)
+cglobal put_pixels16_y2, 4,5,3
+%else
cglobal put_pixels8_y2, 4,5
+%endif
lea r4, [r2*2]
- mova m0, [r1]
+ movu m0, [r1]
sub r0, r2
.loop:
- mova m1, [r1+r2]
- mova m2, [r1+r4]
+ movu m1, [r1+r2]
+ movu m2, [r1+r4]
add r1, r4
PAVGB m0, m1
PAVGB m1, m2
mova [r0+r2], m0
mova [r0+r4], m1
- mova m1, [r1+r2]
- mova m0, [r1+r4]
+ movu m1, [r1+r2]
+ movu m0, [r1+r4]
add r0, r4
add r1, r4
PAVGB m2, m1
@@ -174,6 +211,9 @@ INIT_MMX mmxext
PUT_PIXELS8_Y2
INIT_MMX 3dnow
PUT_PIXELS8_Y2
+; actually, put_pixels16_y2_sse2
+INIT_XMM sse2
+PUT_PIXELS8_Y2
; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -245,26 +285,48 @@ AVG_PIXELS8
; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8_X2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_x2, 4,5,4
+%else
cglobal avg_pixels8_x2, 4,5
+%endif
lea r4, [r2*2]
+%if notcpuflag(mmxext)
+ pcmpeqd m5, m5
+ paddb m5, m5
+%endif
.loop:
- mova m0, [r1]
- mova m2, [r1+r2]
- PAVGB m0, [r1+1]
- PAVGB m2, [r1+r2+1]
- PAVGB m0, [r0]
- PAVGB m2, [r0+r2]
+ movu m0, [r1]
+ movu m2, [r1+r2]
+%if cpuflag(sse2)
+ movu m1, [r1+1]
+ movu m3, [r1+r2+1]
+ pavgb m0, m1
+ pavgb m2, m3
+%else
+ PAVGB m0, [r1+1], m3, m5
+ PAVGB m2, [r1+r2+1], m4, m5
+%endif
+ PAVGB m0, [r0], m3, m5
+ PAVGB m2, [r0+r2], m4, m5
add r1, r4
mova [r0], m0
mova [r0+r2], m2
- mova m0, [r1]
- mova m2, [r1+r2]
- PAVGB m0, [r1+1]
- PAVGB m2, [r1+r2+1]
+ movu m0, [r1]
+ movu m2, [r1+r2]
+%if cpuflag(sse2)
+ movu m1, [r1+1]
+ movu m3, [r1+r2+1]
+ pavgb m0, m1
+ pavgb m2, m3
+%else
+ PAVGB m0, [r1+1], m3, m5
+ PAVGB m2, [r1+r2+1], m4, m5
+%endif
add r0, r4
add r1, r4
- PAVGB m0, [r0]
- PAVGB m2, [r0+r2]
+ PAVGB m0, [r0], m3, m5
+ PAVGB m2, [r0+r2], m4, m5
mova [r0], m0
mova [r0+r2], m2
add r0, r4
@@ -273,40 +335,45 @@ cglobal avg_pixels8_x2, 4,5
REP_RET
%endmacro
+INIT_MMX mmx
+AVG_PIXELS8_X2
INIT_MMX mmxext
AVG_PIXELS8_X2
INIT_MMX 3dnow
AVG_PIXELS8_X2
+; actually avg_pixels16_x2
+INIT_XMM sse2
+AVG_PIXELS8_X2
; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8_Y2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_y2, 4,5,3
+%else
cglobal avg_pixels8_y2, 4,5
+%endif
lea r4, [r2*2]
- mova m0, [r1]
+ movu m0, [r1]
sub r0, r2
.loop:
- mova m1, [r1+r2]
- mova m2, [r1+r4]
+ movu m1, [r1+r2]
+ movu m2, [r1+r4]
add r1, r4
PAVGB m0, m1
PAVGB m1, m2
- mova m3, [r0+r2]
- mova m4, [r0+r4]
- PAVGB m0, m3
- PAVGB m1, m4
+ PAVGB m0, [r0+r2]
+ PAVGB m1, [r0+r4]
mova [r0+r2], m0
mova [r0+r4], m1
- mova m1, [r1+r2]
- mova m0, [r1+r4]
+ movu m1, [r1+r2]
+ movu m0, [r1+r4]
PAVGB m2, m1
PAVGB m1, m0
add r0, r4
add r1, r4
- mova m3, [r0+r2]
- mova m4, [r0+r4]
- PAVGB m2, m3
- PAVGB m1, m4
+ PAVGB m2, [r0+r2]
+ PAVGB m1, [r0+r4]
mova [r0+r2], m2
mova [r0+r4], m1
add r0, r4
@@ -319,11 +386,16 @@ INIT_MMX mmxext
AVG_PIXELS8_Y2
INIT_MMX 3dnow
AVG_PIXELS8_Y2
+; actually avg_pixels16_y2
+INIT_XMM sse2
+AVG_PIXELS8_Y2
; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro AVG_PIXELS8_XY2 0
-cglobal avg_pixels8_xy2, 4,5
+; Note: this averaging is not correctly rounded, so it is only used
+; when bit-exact output is not required
+%macro AVG_APPROX_PIXELS8_XY2 0
+cglobal avg_approx_pixels8_xy2, 4,5
mova m6, [pb_1]
lea r4, [r2*2]
mova m0, [r1]
@@ -360,6 +432,160 @@ cglobal avg_pixels8_xy2, 4,5
%endmacro
INIT_MMX mmxext
-AVG_PIXELS8_XY2
+AVG_APPROX_PIXELS8_XY2
+INIT_MMX 3dnow
+AVG_APPROX_PIXELS8_XY2
+
+
+; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro SET_PIXELS_XY2 1
+%if cpuflag(sse2)
+cglobal %1_pixels16_xy2, 4,5,8
+%else
+cglobal %1_pixels8_xy2, 4,5
+%endif
+ pxor m7, m7
+ mova m6, [pw_2]
+ movu m0, [r1]
+ movu m4, [r1+1]
+ mova m1, m0
+ mova m5, m4
+ punpcklbw m0, m7
+ punpcklbw m4, m7
+ punpckhbw m1, m7
+ punpckhbw m5, m7
+ paddusw m4, m0
+ paddusw m5, m1
+ xor r4, r4
+ add r1, r2
+.loop:
+ movu m0, [r1+r4]
+ movu m2, [r1+r4+1]
+ mova m1, m0
+ mova m3, m2
+ punpcklbw m0, m7
+ punpcklbw m2, m7
+ punpckhbw m1, m7
+ punpckhbw m3, m7
+ paddusw m0, m2
+ paddusw m1, m3
+ paddusw m4, m6
+ paddusw m5, m6
+ paddusw m4, m0
+ paddusw m5, m1
+ psrlw m4, 2
+ psrlw m5, 2
+%ifidn %1, avg
+ mova m3, [r0+r4]
+ packuswb m4, m5
+ PAVGB m4, m3
+%else
+ packuswb m4, m5
+%endif
+ mova [r0+r4], m4
+ add r4, r2
+
+ movu m2, [r1+r4]
+ movu m4, [r1+r4+1]
+ mova m3, m2
+ mova m5, m4
+ punpcklbw m2, m7
+ punpcklbw m4, m7
+ punpckhbw m3, m7
+ punpckhbw m5, m7
+ paddusw m4, m2
+ paddusw m5, m3
+ paddusw m0, m6
+ paddusw m1, m6
+ paddusw m0, m4
+ paddusw m1, m5
+ psrlw m0, 2
+ psrlw m1, 2
+%ifidn %1, avg
+ mova m3, [r0+r4]
+ packuswb m0, m1
+ PAVGB m0, m3
+%else
+ packuswb m0, m1
+%endif
+ mova [r0+r4], m0
+ add r4, r2
+ sub r3d, 2
+ jnz .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+SET_PIXELS_XY2 avg
INIT_MMX 3dnow
-AVG_PIXELS8_XY2
+SET_PIXELS_XY2 avg
+INIT_XMM sse2
+SET_PIXELS_XY2 put
+SET_PIXELS_XY2 avg
+
+%macro SSSE3_PIXELS_XY2 1-2
+%if %0 == 2 ; second arg given: XMM (16-pixel) variant
+cglobal %1_pixels16_xy2, 4,5,%2
+ mova m4, [pb_interleave16]
+%else
+cglobal %1_pixels8_xy2, 4,5
+ mova m4, [pb_interleave8]
+%endif
+ mova m5, [pb_1]
+ movu m0, [r1]
+ movu m1, [r1+1]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ xor r4, r4
+ add r1, r2
+.loop:
+ movu m2, [r1+r4]
+ movu m3, [r1+r4+1]
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ paddusw m0, m2
+ paddusw m1, m3
+ pmulhrsw m0, [pw_8192]
+ pmulhrsw m1, [pw_8192]
+%ifidn %1, avg
+ mova m6, [r0+r4]
+ packuswb m0, m1
+ pshufb m0, m4
+ pavgb m0, m6
+%else
+ packuswb m0, m1
+ pshufb m0, m4
+%endif
+ mova [r0+r4], m0
+ add r4, r2
+
+ movu m0, [r1+r4]
+ movu m1, [r1+r4+1]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ paddusw m2, m0
+ paddusw m3, m1
+ pmulhrsw m2, [pw_8192]
+ pmulhrsw m3, [pw_8192]
+%ifidn %1, avg
+ mova m6, [r0+r4]
+ packuswb m2, m3
+ pshufb m2, m4
+ pavgb m2, m6
+%else
+ packuswb m2, m3
+ pshufb m2, m4
+%endif
+ mova [r0+r4], m2
+ add r4, r2
+ sub r3d, 2
+ jnz .loop
+ REP_RET
+%endmacro
+
+INIT_MMX ssse3
+SSSE3_PIXELS_XY2 put
+SSSE3_PIXELS_XY2 avg
+INIT_XMM ssse3
+SSSE3_PIXELS_XY2 put, 6
+SSSE3_PIXELS_XY2 avg, 7
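
The new SET_PIXELS_XY2 and SSSE3_PIXELS_XY2 macros above implement half-pel interpolation in both dimensions: each output byte is the rounded mean of a 2x2 source neighbourhood, computed with a pw_2 bias and a shift by 2 in the SSE2 path, and with pmaddubsw plus pmulhrsw against pw_8192 (which effectively rounds and shifts right by 2 in one step) in the SSSE3 path. A scalar reference of what those kernels are expected to produce, offered as an illustration rather than the FFmpeg implementation:

#include <stddef.h>
#include <stdint.h>

static void put_pixels_xy2_c(uint8_t *dst, const uint8_t *src,
                             ptrdiff_t stride, int w, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int a = src[x],          b = src[x + 1];
            int c = src[x + stride], d = src[x + stride + 1];
            dst[x] = (uint8_t)((a + b + c + d + 2) >> 2);  /* rounded 2x2 average */
        }
        src += stride;
        dst += stride;
    }
}

static void avg_pixels_xy2_c(uint8_t *dst, const uint8_t *src,
                             ptrdiff_t stride, int w, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int a = src[x],          b = src[x + 1];
            int c = src[x + stride], d = src[x + stride + 1];
            int p = (a + b + c + d + 2) >> 2;
            dst[x] = (uint8_t)((dst[x] + p + 1) >> 1);     /* blend with existing dst */
        }
        src += stride;
        dst += stride;
    }
}

int main(void)
{
    uint8_t src[9 * 10] = {0}, dst[9 * 10] = {0};
    put_pixels_xy2_c(dst, src, 9, 8, 8);   /* stride must cover width + 1 */
    avg_pixels_xy2_c(dst, src, 9, 8, 8);
    return 0;
}
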
diff --git a/libavcodec/x86/hpeldsp.h b/libavcodec/x86/hpeldsp.h
index 566e518a09..bf97029b57 100644
--- a/libavcodec/x86/hpeldsp.h
+++ b/libavcodec/x86/hpeldsp.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -29,14 +29,29 @@ void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
void ff_avg_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+
void ff_avg_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_put_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
-void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags);
+void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int flags);
#endif /* AVCODEC_X86_HPELDSP_H */
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 6a8d4205fa..58e27e3542 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -3,20 +3,20 @@
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
@@ -39,6 +39,14 @@ void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
@@ -61,10 +69,12 @@ void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
-void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
#define avg_pixels8_mmx ff_avg_pixels8_mmx
#define avg_pixels8_x2_mmx ff_avg_pixels8_x2_mmx
@@ -98,11 +108,13 @@ void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
#undef PAVGB
#undef STATIC
+#if HAVE_MMX
CALL_2X_PIXELS(avg_no_rnd_pixels16_y2_mmx, avg_no_rnd_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
+#endif
/***********************************/
/* MMX rounding */
@@ -125,11 +137,13 @@ CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
#undef PAVGBP
#undef PAVGB
+#if HAVE_MMX
CALL_2X_PIXELS(avg_pixels16_y2_mmx, avg_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(put_pixels16_y2_mmx, put_pixels8_y2_mmx, 8)
CALL_2X_PIXELS_EXPORT(ff_avg_pixels16_xy2_mmx, ff_avg_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
+#endif
#endif /* HAVE_INLINE_ASM */
@@ -143,32 +157,49 @@ CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS(avg_pixels16 ## CPUEXT, ff_avg_pixels8 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16_x2 ## CPUEXT, ff_avg_pixels8_x2 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16_y2 ## CPUEXT, ff_avg_pixels8_y2 ## CPUEXT, 8) \
- CALL_2X_PIXELS(avg_pixels16_xy2 ## CPUEXT, ff_avg_pixels8_xy2 ## CPUEXT, 8)
+ CALL_2X_PIXELS(avg_pixels16_xy2 ## CPUEXT, ff_avg_pixels8_xy2 ## CPUEXT, 8) \
+ CALL_2X_PIXELS(avg_approx_pixels16_xy2## CPUEXT, ff_avg_approx_pixels8_xy2## CPUEXT, 8)
HPELDSP_AVG_PIXELS16(_3dnow)
HPELDSP_AVG_PIXELS16(_mmxext)
#endif /* HAVE_YASM */
+#define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \
+ if (HAVE_MMX_EXTERNAL) \
+ c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU;
+
+#if HAVE_MMX_INLINE
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
do { \
- c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
+ SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \
c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
} while (0)
+#else
+#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
+ do { \
+ SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \
+ } while (0)
+#endif
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
{
-#if HAVE_MMX_INLINE
SET_HPEL_FUNCS(put, [0], 16, mmx);
SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
SET_HPEL_FUNCS(avg, [0], 16, mmx);
SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
SET_HPEL_FUNCS(put, [1], 8, mmx);
SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
- SET_HPEL_FUNCS(avg, [1], 8, mmx);
-#endif /* HAVE_MMX_INLINE */
+ if (HAVE_MMX_EXTERNAL) {
+ c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmx;
+ c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmx;
+ }
+#if HAVE_MMX_INLINE
+ c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
+ c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmx;
+#endif
}
static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
@@ -180,6 +211,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
+ c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
@@ -187,6 +219,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
+ c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
@@ -194,8 +227,8 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
- c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
- c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
+ c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_mmxext;
+ c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext;
}
#endif /* HAVE_MMXEXT_EXTERNAL */
}
@@ -209,6 +242,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags)
c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
+ c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
@@ -216,6 +250,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags)
c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
+ c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
if (!(flags & AV_CODEC_FLAG_BITEXACT)){
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
@@ -223,8 +258,8 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags)
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
- c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
- c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
+ c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_3dnow;
+ c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_3dnow;
}
#endif /* HAVE_AMD3DNOW_EXTERNAL */
}
@@ -234,10 +269,26 @@ static void hpeldsp_init_sse2_fast(HpelDSPContext *c, int flags)
#if HAVE_SSE2_EXTERNAL
c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
+ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_sse2;
+ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_sse2;
+ c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_sse2;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
+ c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2;
+ c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2;
+ c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}
+static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags)
+{
+#if HAVE_SSSE3_EXTERNAL
+ c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_ssse3;
+ c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3;
+ c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3;
+ c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3;
+#endif
+}
+
av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
{
int cpu_flags = av_get_cpu_flags();
@@ -254,6 +305,9 @@ av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
if (EXTERNAL_SSE2_FAST(cpu_flags))
hpeldsp_init_sse2_fast(c, flags);
+ if (EXTERNAL_SSSE3(cpu_flags))
+ hpeldsp_init_ssse3(c, flags);
+
if (CONFIG_VP3_DECODER)
- ff_hpeldsp_vp3_init_x86(c, cpu_flags);
+ ff_hpeldsp_vp3_init_x86(c, cpu_flags, flags);
}
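
Several of the 16-pixel table entries above are built with CALL_2X_PIXELS rather than dedicated kernels. As far as these hunks show, the idea is to synthesize a 16-wide function from two invocations of the 8-wide kernel at byte offsets 0 and 8; the sketch below illustrates that wrapper pattern with illustrative names and a trivial copy kernel, and is an assumption about the macro rather than a copy of it.

#include <stddef.h>
#include <stdint.h>

#define DEFINE_2X_PIXELS(wrapper, kernel8, offset)                        \
static void wrapper(uint8_t *block, const uint8_t *pixels,                \
                    ptrdiff_t line_size, int h)                           \
{                                                                         \
    kernel8(block,            pixels,            line_size, h);          \
    kernel8(block + (offset), pixels + (offset), line_size, h);          \
}

/* Example kernel: plain 8-byte copy per row (stand-in for a put_pixels8). */
static void put_pixels8_c(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h)
{
    while (h--) {
        for (int i = 0; i < 8; i++)
            block[i] = pixels[i];
        block  += line_size;
        pixels += line_size;
    }
}

DEFINE_2X_PIXELS(put_pixels16_c, put_pixels8_c, 8)

int main(void)
{
    uint8_t src[16 * 4] = {0}, dst[16 * 4];
    put_pixels16_c(dst, src, 16, 4);   /* a table slot would point here */
    return 0;
}
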
diff --git a/libavcodec/x86/hpeldsp_mmx.c b/libavcodec/x86/hpeldsp_mmx.c
deleted file mode 100644
index c93c78e40e..0000000000
--- a/libavcodec/x86/hpeldsp_mmx.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * MMX-optimized avg/put pixel routines
- *
- * Copyright (c) 2001 Fabrice Bellard
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "config.h"
-#include "hpeldsp.h"
-#include "inline_asm.h"
-
-#if HAVE_MMX_INLINE
-
-void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- JUMPALIGN();
- do {
- __asm__ volatile(
- "movq %1, %%mm0 \n\t"
- "movq 1%1, %%mm1 \n\t"
- "movq %0, %%mm3 \n\t"
- PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
- PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
- "movq %%mm0, %0 \n\t"
- :"+m"(*block)
- :"m"(*pixels)
- :"memory");
- pixels += line_size;
- block += line_size;
- } while (--h);
-}
-
-#endif /* HAVE_MMX_INLINE */
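
The deleted hpeldsp_mmx.c provided an inline-asm ff_avg_pixels8_x2_mmx; an equivalent now comes from the yasm code above. Its scalar meaning, as read from the removed PAVGB_MMX sequence, is to average each pixel with its right neighbour (the x2 half-pel position) and then average that result into the destination. A hedged C illustration, not the removed implementation:

#include <stddef.h>
#include <stdint.h>

static void avg_pixels8_x2_c(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    while (h--) {
        for (int i = 0; i < 8; i++) {
            int hpel = (pixels[i] + pixels[i + 1] + 1) >> 1;  /* x2: right neighbour */
            block[i] = (uint8_t)((block[i] + hpel + 1) >> 1); /* blend into dst      */
        }
        block  += line_size;
        pixels += line_size;
    }
}

int main(void)
{
    uint8_t src[9 * 4] = {0}, dst[9 * 4] = {0};
    avg_pixels8_x2_c(dst, src, 9, 4);
    return 0;
}
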
diff --git a/libavcodec/x86/hpeldsp_rnd_template.c b/libavcodec/x86/hpeldsp_rnd_template.c
index 82231ad13d..2bff2d2766 100644
--- a/libavcodec/x86/hpeldsp_rnd_template.c
+++ b/libavcodec/x86/hpeldsp_rnd_template.c
@@ -7,20 +7,20 @@
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
* and improved by Zdenek Kabelac <kabi@users.sf.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -28,7 +28,7 @@
#include <stdint.h>
// put_pixels
-static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
@@ -60,7 +60,7 @@ static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_
:FF_REG_a, "memory");
}
-static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
@@ -106,7 +106,7 @@ static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff
:FF_REG_a, "memory");
}
-static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
@@ -115,14 +115,14 @@ static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"FF_REG_a"),%%mm2 \n\t"
+ "movq (%1, %%"FF_REG_a"),%%mm2\n\t"
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"FF_REG_a"),%%mm0 \n\t"
+ "movq (%1, %%"FF_REG_a"),%%mm0\n\t"
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
@@ -135,33 +135,34 @@ static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_
:FF_REG_a, "memory");
}
-static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
- JUMPALIGN();
- do {
__asm__ volatile(
- "movq %1, %%mm0 \n\t"
- "movq 1%1, %%mm1 \n\t"
- "movq %0, %%mm3 \n\t"
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%2), %%mm3 \n\t"
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
- "movq %%mm0, %0 \n\t"
- "movq 8%1, %%mm0 \n\t"
- "movq 9%1, %%mm1 \n\t"
- "movq 8%0, %%mm3 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq 8(%1), %%mm0 \n\t"
+ "movq 9(%1), %%mm1 \n\t"
+ "movq 8(%2), %%mm3 \n\t"
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
- "movq %%mm0, 8%0 \n\t"
- :"+m"(*block)
- :"m"(*pixels)
+ "movq %%mm0, 8(%2) \n\t"
+ "add %3, %1 \n\t"
+ "add %3, %2 \n\t"
+ "subl $1, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r"((x86_reg)line_size)
:"memory");
- pixels += line_size;
- block += line_size;
- } while (--h);
}
-static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
@@ -170,7 +171,7 @@ static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
"movq (%2), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6)
@@ -182,7 +183,7 @@ static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_
"add %%"FF_REG_a", %2 \n\t"
"movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
"movq (%2), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6)
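
The rounding templates keep averaging packed bytes without SSE's pavgb by way of a SWAR identity, (a + b + 1) >> 1 == (a | b) - (((a ^ b) & 0xfe) >> 1), applied to all eight bytes of an MMX register at once; the 0xfe-per-byte mask loaded by MOVQ_BFE keeps carries from crossing byte lanes. A small scalar check of that identity, shown as background for the PAVGB/PAVGB_MMX macros rather than as their implementation:

#include <assert.h>
#include <stdint.h>

static uint8_t avg_rnd(uint8_t a, uint8_t b)
{
    /* rounded byte average without widening to 16 bits */
    return (uint8_t)((a | b) - (((a ^ b) & 0xfe) >> 1));
}

int main(void)
{
    for (int a = 0; a < 256; a++)
        for (int b = 0; b < 256; b++)
            assert(avg_rnd((uint8_t)a, (uint8_t)b) == ((a + b + 1) >> 1));
    return 0;
}
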
diff --git a/libavcodec/x86/hpeldsp_vp3.asm b/libavcodec/x86/hpeldsp_vp3.asm
index 513f14e4b4..cba96d06cb 100644
--- a/libavcodec/x86/hpeldsp_vp3.asm
+++ b/libavcodec/x86/hpeldsp_vp3.asm
@@ -1,20 +1,20 @@
;******************************************************************************
;* SIMD-optimized halfpel functions for VP3
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/hpeldsp_vp3_init.c b/libavcodec/x86/hpeldsp_vp3_init.c
index cc1f5e4566..5979f4123c 100644
--- a/libavcodec/x86/hpeldsp_vp3_init.c
+++ b/libavcodec/x86/hpeldsp_vp3_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -38,15 +38,19 @@ void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
-av_cold void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags)
+av_cold void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int flags)
{
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
- c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
- c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
+ if (flags & AV_CODEC_FLAG_BITEXACT) {
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
+ }
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
- c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
- c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
+ if (flags & AV_CODEC_FLAG_BITEXACT) {
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
+ }
}
}
diff --git a/libavcodec/x86/huffyuvdsp.asm b/libavcodec/x86/huffyuvdsp.asm
index 692162b5b6..0d8cae354a 100644
--- a/libavcodec/x86/huffyuvdsp.asm
+++ b/libavcodec/x86/huffyuvdsp.asm
@@ -1,48 +1,155 @@
;******************************************************************************
;* SIMD-optimized HuffYUV functions
;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2014 Christophe Gisquet
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
-pb_f: times 16 db 15
-pb_zzzzzzzz77777777: times 8 db -1
-pb_7: times 8 db 7
-pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
-pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
-
SECTION .text
-; void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
-; const uint8_t *diff, int w,
-; int *left, int *left_top)
+
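+; Shared add/sub loop for 16-bit samples: the bit-depth mask is replicated
+; into every word of m4 and w is converted from words to bytes. Trailing
+; words beyond a multiple of 2*mmsize bytes are handled one at a time by a
+; scalar word loop; the main loop then processes two vectors per iteration,
+; masking each result back to the sample bit depth.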
+%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
+ movd m4, maskd
+ SPLATW m4, m4
+ add wd, wd
+ test wq, 2*mmsize - 1
+ jz %%.tomainloop
+ push tmpq
+%%.wordloop:
+ sub wq, 2
+%ifidn %2, add
+ mov tmpw, [srcq+wq]
+ add tmpw, [dstq+wq]
+%else
+ mov tmpw, [src1q+wq]
+ sub tmpw, [src2q+wq]
+%endif
+ and tmpw, maskw
+ mov [dstq+wq], tmpw
+ test wq, 2*mmsize - 1
+ jnz %%.wordloop
+ pop tmpq
+%%.tomainloop:
+%ifidn %2, add
+ add srcq, wq
+%else
+ add src1q, wq
+ add src2q, wq
+%endif
+ add dstq, wq
+ neg wq
+ jz %%.end
+%%.loop:
+%ifidn %2, add
+ mov%1 m0, [srcq+wq]
+ mov%1 m1, [dstq+wq]
+ mov%1 m2, [srcq+wq+mmsize]
+ mov%1 m3, [dstq+wq+mmsize]
+%else
+ mov%1 m0, [src1q+wq]
+ mov%1 m1, [src2q+wq]
+ mov%1 m2, [src1q+wq+mmsize]
+ mov%1 m3, [src2q+wq+mmsize]
+%endif
+ p%2w m0, m1
+ p%2w m2, m3
+ pand m0, m4
+ pand m2, m4
+ mov%1 [dstq+wq] , m0
+ mov%1 [dstq+wq+mmsize], m2
+ add wq, 2*mmsize
+ jl %%.loop
+%%.end:
+ RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
+ INT16_LOOP a, add
+%endif
+
+INIT_XMM sse2
+cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
+ test srcq, mmsize-1
+ jnz .unaligned
+ test dstq, mmsize-1
+ jnz .unaligned
+ INT16_LOOP a, add
+.unaligned:
+ INT16_LOOP u, add
+
+; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
+; intptr_t w, uint8_t *left)
+%macro LEFT_BGR32 0
+cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
+ shl wq, 2
+ movd m0, [leftq]
+ lea dstq, [dstq + wq]
+ lea srcq, [srcq + wq]
+ LSHIFT m0, mmsize-4
+ neg wq
+.loop:
+ movu m1, [srcq+wq]
+ mova m2, m1
+%if mmsize == 8
+ punpckhdq m0, m0
+%endif
+ LSHIFT m1, 4
+ paddb m1, m2
+%if mmsize == 16
+ pshufd m0, m0, q3333
+ mova m2, m1
+ LSHIFT m1, 8
+ paddb m1, m2
+%endif
+ paddb m0, m1
+ movu [dstq+wq], m0
+ add wq, mmsize
+ jl .loop
+ movd m0, [dstq-4]
+ movd [leftq], m0
+ REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+LEFT_BGR32
+%endif
+INIT_XMM sse2
+LEFT_BGR32
+
+; void ff_add_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top)
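+; 16-bit variant of the median predictor: for each word the prediction is
+; mid(l, t, l + t - tl), the residual from diffq is added, and every
+; intermediate value is masked to the sample bit depth (mm6).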
INIT_MMX mmxext
-cglobal add_hfyu_median_pred, 6,6,0, dst, top, diff, w, left, left_top
+cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
+ add wd, wd
+ movd mm6, maskd
+ SPLATW mm6, mm6
movq mm0, [topq]
movq mm2, mm0
movd mm4, [left_topq]
- psllq mm2, 8
+ psllq mm2, 16
movq mm1, mm0
por mm4, mm2
movd mm3, [leftq]
- psubb mm0, mm4 ; t-tl
+ psubw mm0, mm4 ; t-tl
add dstq, wq
add topq, wq
add diffq, wq
@@ -51,115 +158,45 @@ cglobal add_hfyu_median_pred, 6,6,0, dst, top, diff, w, left, left_top
.loop:
movq mm4, [topq+wq]
movq mm0, mm4
- psllq mm4, 8
+ psllq mm4, 16
por mm4, mm1
movq mm1, mm0 ; t
- psubb mm0, mm4 ; t-tl
+ psubw mm0, mm4 ; t-tl
.skip:
movq mm2, [diffq+wq]
%assign i 0
-%rep 8
+%rep 4
movq mm4, mm0
- paddb mm4, mm3 ; t-tl+l
+ paddw mm4, mm3 ; t-tl+l
+ pand mm4, mm6
movq mm5, mm3
- pmaxub mm3, mm1
- pminub mm5, mm1
- pminub mm3, mm4
- pmaxub mm3, mm5 ; median
- paddb mm3, mm2 ; +residual
+ pmaxsw mm3, mm1
+ pminsw mm5, mm1
+ pminsw mm3, mm4
+ pmaxsw mm3, mm5 ; median
+ paddw mm3, mm2 ; +residual
+ pand mm3, mm6
%if i==0
movq mm7, mm3
- psllq mm7, 56
+ psllq mm7, 48
%else
- movq mm6, mm3
- psrlq mm7, 8
- psllq mm6, 56
- por mm7, mm6
+ movq mm4, mm3
+ psrlq mm7, 16
+ psllq mm4, 48
+ por mm7, mm4
%endif
-%if i<7
- psrlq mm0, 8
- psrlq mm1, 8
- psrlq mm2, 8
+%if i<3
+ psrlq mm0, 16
+ psrlq mm1, 16
+ psrlq mm2, 16
%endif
%assign i i+1
%endrep
movq [dstq+wq], mm7
add wq, 8
jl .loop
- movzx r2d, byte [dstq-1]
+ movzx r2d, word [dstq-2]
mov [leftq], r2d
- movzx r2d, byte [topq-1]
+ movzx r2d, word [topq-2]
mov [left_topq], r2d
RET
-
-
-%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
- add srcq, wq
- add dstq, wq
- neg wq
-%%.loop:
-%if %2
- mova m1, [srcq+wq]
-%else
- movu m1, [srcq+wq]
-%endif
- mova m2, m1
- psllw m1, 8
- paddb m1, m2
- mova m2, m1
- pshufb m1, m3
- paddb m1, m2
- pshufb m0, m5
- mova m2, m1
- pshufb m1, m4
- paddb m1, m2
-%if mmsize == 16
- mova m2, m1
- pshufb m1, m6
- paddb m1, m2
-%endif
- paddb m0, m1
-%if %1
- mova [dstq+wq], m0
-%else
- movq [dstq+wq], m0
- movhps [dstq+wq+8], m0
-%endif
- add wq, mmsize
- jl %%.loop
- mov eax, mmsize-1
- sub eax, wd
- movd m1, eax
- pshufb m0, m1
- movd eax, m0
- RET
-%endmacro
-
-; int ff_add_hfyu_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
-INIT_MMX ssse3
-cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
-.skip_prologue:
- mova m5, [pb_7]
- mova m4, [pb_zzzz3333zzzzbbbb]
- mova m3, [pb_zz11zz55zz99zzdd]
- movd m0, leftm
- psllq m0, 56
- ADD_HFYU_LEFT_LOOP 1, 1
-
-INIT_XMM ssse3
-cglobal add_hfyu_left_pred_unaligned, 3,3,7, dst, src, w, left
- mova m5, [pb_f]
- mova m6, [pb_zzzzzzzz77777777]
- mova m4, [pb_zzzz3333zzzzbbbb]
- mova m3, [pb_zz11zz55zz99zzdd]
- movd m0, leftm
- pslldq m0, 15
- test srcq, 15
- jnz .src_unaligned
- test dstq, 15
- jnz .dst_unaligned
- ADD_HFYU_LEFT_LOOP 1, 1
-.dst_unaligned:
- ADD_HFYU_LEFT_LOOP 0, 1
-.src_unaligned:
- ADD_HFYU_LEFT_LOOP 0, 0
diff --git a/libavcodec/x86/huffyuvdsp_init.c b/libavcodec/x86/huffyuvdsp_init.c
index 80e6cfbb12..26cf6214d8 100644
--- a/libavcodec/x86/huffyuvdsp_init.c
+++ b/libavcodec/x86/huffyuvdsp_init.c
@@ -1,132 +1,55 @@
/*
* Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
+#include "libavutil/pixdesc.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/huffyuvdsp.h"
-void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
- const uint8_t *diff, int w,
- int *left, int *left_top);
+void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+void ff_add_hfyu_left_pred_bgr32_mmx(uint8_t *dst, const uint8_t *src,
+ intptr_t w, uint8_t *left);
+void ff_add_hfyu_left_pred_bgr32_sse2(uint8_t *dst, const uint8_t *src,
+ intptr_t w, uint8_t *left);
+void ff_add_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
-int ff_add_hfyu_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
- int w, int left);
-int ff_add_hfyu_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
- int w, int left);
-
-#if HAVE_INLINE_ASM
-
-#if HAVE_7REGS
-static void add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
- const uint8_t *diff, int w,
- int *left, int *left_top)
-{
- x86_reg w2 = -w;
- x86_reg x;
- int l = *left & 0xff;
- int tl = *left_top & 0xff;
- int t;
- __asm__ volatile (
- "mov %7, %3 \n"
- "1: \n"
- "movzbl (%3, %4), %2 \n"
- "mov %2, %k3 \n"
- "sub %b1, %b3 \n"
- "add %b0, %b3 \n"
- "mov %2, %1 \n"
- "cmp %0, %2 \n"
- "cmovg %0, %2 \n"
- "cmovg %1, %0 \n"
- "cmp %k3, %0 \n"
- "cmovg %k3, %0 \n"
- "mov %7, %3 \n"
- "cmp %2, %0 \n"
- "cmovl %2, %0 \n"
- "add (%6, %4), %b0 \n"
- "mov %b0, (%5, %4) \n"
- "inc %4 \n"
- "jl 1b \n"
- : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
- : "r"(dst + w), "r"(diff + w), "rm"(top + w)
- );
- *left = l;
- *left_top = tl;
-}
-#endif /* HAVE_7REGS */
-
-static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
-{
- x86_reg i = 0;
-
- __asm__ volatile (
- "jmp 2f \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq (%2, %0), %%mm1 \n\t"
- "paddb %%mm0, %%mm1 \n\t"
- "movq %%mm1, (%2, %0) \n\t"
- "movq 8(%1, %0), %%mm0 \n\t"
- "movq 8(%2, %0), %%mm1 \n\t"
- "paddb %%mm0, %%mm1 \n\t"
- "movq %%mm1, 8(%2, %0) \n\t"
- "add $16, %0 \n\t"
- "2: \n\t"
- "cmp %3, %0 \n\t"
- "js 1b \n\t"
- : "+r" (i)
- : "r" (src), "r" (dst), "r" ((x86_reg) w - 15));
-
- for (; i < w; i++)
- dst[i + 0] += src[i + 0];
-}
-
-#endif /* HAVE_INLINE_ASM */
-
-av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
+av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c, enum AVPixelFormat pix_fmt)
{
int cpu_flags = av_get_cpu_flags();
+ const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(pix_fmt);
-#if HAVE_INLINE_ASM
-#if HAVE_7REGS
- if (cpu_flags & AV_CPU_FLAG_CMOV)
- c->add_hfyu_median_pred = add_hfyu_median_pred_cmov;
-#endif /* HAVE_7REGS */
-
- if (INLINE_MMX(cpu_flags))
- c->add_bytes = add_bytes_mmx;
-#endif /* HAVE_INLINE_ASM */
-
- if (EXTERNAL_MMXEXT(cpu_flags)) {
- /* slower than cmov version on AMD */
- if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
- c->add_hfyu_median_pred = ff_add_hfyu_median_pred_mmxext;
+ if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
+ c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_mmx;
+ c->add_int16 = ff_add_int16_mmx;
}
- if (EXTERNAL_SSSE3(cpu_flags)) {
- c->add_hfyu_left_pred = ff_add_hfyu_left_pred_ssse3;
+ if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
+ c->add_hfyu_median_pred_int16 = ff_add_hfyu_median_pred_int16_mmxext;
}
- if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
- c->add_hfyu_left_pred = ff_add_hfyu_left_pred_unaligned_ssse3;
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->add_int16 = ff_add_int16_sse2;
+ c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_sse2;
}
}
diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm
new file mode 100644
index 0000000000..1228aa8355
--- /dev/null
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -0,0 +1,143 @@
+;************************************************************************
+;* SIMD-optimized HuffYUV encoding functions
+;* Copyright (c) 2000, 2001 Fabrice Bellard
+;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+;*
+;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+section .text
+
+; void ff_diff_int16(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+;                    unsigned mask, int w);
+%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
+ movd m4, maskd
+ SPLATW m4, m4
+ add wd, wd
+ test wq, 2*mmsize - 1
+ jz %%.tomainloop
+ push tmpq
+%%.wordloop:
+ sub wq, 2
+%ifidn %2, add
+ mov tmpw, [srcq+wq]
+ add tmpw, [dstq+wq]
+%else
+ mov tmpw, [src1q+wq]
+ sub tmpw, [src2q+wq]
+%endif
+ and tmpw, maskw
+ mov [dstq+wq], tmpw
+ test wq, 2*mmsize - 1
+ jnz %%.wordloop
+ pop tmpq
+%%.tomainloop:
+%ifidn %2, add
+ add srcq, wq
+%else
+ add src1q, wq
+ add src2q, wq
+%endif
+ add dstq, wq
+ neg wq
+ jz %%.end
+%%.loop:
+%ifidn %2, add
+ mov%1 m0, [srcq+wq]
+ mov%1 m1, [dstq+wq]
+ mov%1 m2, [srcq+wq+mmsize]
+ mov%1 m3, [dstq+wq+mmsize]
+%else
+ mov%1 m0, [src1q+wq]
+ mov%1 m1, [src2q+wq]
+ mov%1 m2, [src1q+wq+mmsize]
+ mov%1 m3, [src2q+wq+mmsize]
+%endif
+ p%2w m0, m1
+ p%2w m2, m3
+ pand m0, m4
+ pand m2, m4
+ mov%1 [dstq+wq] , m0
+ mov%1 [dstq+wq+mmsize], m2
+ add wq, 2*mmsize
+ jl %%.loop
+%%.end:
+ RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
+ INT16_LOOP a, sub
+%endif
+
+INIT_XMM sse2
+cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
+ test src1q, mmsize-1
+ jnz .unaligned
+ test src2q, mmsize-1
+ jnz .unaligned
+ test dstq, mmsize-1
+ jnz .unaligned
+ INT16_LOOP a, sub
+.unaligned:
+ INT16_LOOP u, sub
+
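+
+; void ff_sub_hfyu_median_pred_int16(uint16_t *dst, const uint16_t *src1,
+;                                    const uint16_t *src2, unsigned mask,
+;                                    int w, int *left, int *left_top)
+; Encoder-side median predictor: dst = src2 - mid(l, t, l + t - tl), where t
+; is the current src1 word, l the previous src2 word and tl the previous src1
+; word, with every word masked to the sample bit depth.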
+INIT_MMX mmxext
+cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top
+ add wd, wd
+ movd mm7, maskd
+ SPLATW mm7, mm7
+ movq mm0, [src1q]
+ movq mm2, [src2q]
+ psllq mm0, 16
+ psllq mm2, 16
+ movd mm6, [left_topq]
+ por mm0, mm6
+ movd mm6, [leftq]
+ por mm2, mm6
+ xor maskq, maskq
+.loop:
+ movq mm1, [src1q + maskq]
+ movq mm3, [src2q + maskq]
+ movq mm4, mm2
+ psubw mm2, mm0
+ paddw mm2, mm1
+ pand mm2, mm7
+ movq mm5, mm4
+ pmaxsw mm4, mm1
+ pminsw mm1, mm5
+ pminsw mm4, mm2
+ pmaxsw mm4, mm1
+ psubw mm3, mm4
+ pand mm3, mm7
+ movq [dstq + maskq], mm3
+ add maskq, 8
+ movq mm0, [src1q + maskq - 2]
+ movq mm2, [src2q + maskq - 2]
+ cmp maskq, wq
+ jb .loop
+ movzx maskd, word [src1q + wq - 2]
+ mov [left_topq], maskd
+ movzx maskd, word [src2q + wq - 2]
+ mov [leftq], maskd
+ RET
diff --git a/libavcodec/x86/huffyuvencdsp_init.c b/libavcodec/x86/huffyuvencdsp_init.c
new file mode 100644
index 0000000000..f66bc8c4f0
--- /dev/null
+++ b/libavcodec/x86/huffyuvencdsp_init.c
@@ -0,0 +1,54 @@
+/*
+ * SIMD-optimized HuffYUV encoding functions
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/huffyuvencdsp.h"
+
+void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+ unsigned mask, int w);
+void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+ unsigned mask, int w);
+void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+ unsigned mask, int w, int *left, int *left_top);
+
+av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, AVCodecContext *avctx)
+{
+ av_unused int cpu_flags = av_get_cpu_flags();
+ const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt);
+
+ if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
+ c->diff_int16 = ff_diff_int16_mmx;
+ }
+
+ if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
+ c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->diff_int16 = ff_diff_int16_sse2;
+ }
+}
diff --git a/libavcodec/x86/idctdsp.asm b/libavcodec/x86/idctdsp.asm
new file mode 100644
index 0000000000..089425a9ab
--- /dev/null
+++ b/libavcodec/x86/idctdsp.asm
@@ -0,0 +1,183 @@
+;******************************************************************************
+;* SIMD-optimized IDCT-related routines
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pb_80
+
+SECTION .text
+
+;--------------------------------------------------------------------------
+;void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
+; ptrdiff_t line_size)
+;--------------------------------------------------------------------------
+
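+; Each half converts four rows of the 8x8 block: the 16-bit coefficients are
+; packed to bytes with signed saturation and biased by 0x80 (pb_80) into the
+; unsigned pixel range before being stored as 8-byte rows.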
+%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1
+ mova m1, [blockq+mmsize*0+%1]
+ mova m2, [blockq+mmsize*2+%1]
+%if mmsize == 8
+ mova m3, [blockq+mmsize*4+%1]
+ mova m4, [blockq+mmsize*6+%1]
+%endif
+ packsswb m1, [blockq+mmsize*1+%1]
+ packsswb m2, [blockq+mmsize*3+%1]
+%if mmsize == 8
+ packsswb m3, [blockq+mmsize*5+%1]
+ packsswb m4, [blockq+mmsize*7+%1]
+%endif
+ paddb m1, m0
+ paddb m2, m0
+%if mmsize == 8
+ paddb m3, m0
+ paddb m4, m0
+ movq [pixelsq+lsizeq*0], m1
+ movq [pixelsq+lsizeq*1], m2
+ movq [pixelsq+lsizeq*2], m3
+ movq [pixelsq+lsize3q ], m4
+%else
+ movq [pixelsq+lsizeq*0], m1
+ movhps [pixelsq+lsizeq*1], m1
+ movq [pixelsq+lsizeq*2], m2
+ movhps [pixelsq+lsize3q ], m2
+%endif
+%endmacro
+
+%macro PUT_SIGNED_PIXELS_CLAMPED 1
+cglobal put_signed_pixels_clamped, 3, 4, %1, block, pixels, lsize, lsize3
+ mova m0, [pb_80]
+ lea lsize3q, [lsizeq*3]
+ PUT_SIGNED_PIXELS_CLAMPED_HALF 0
+ lea pixelsq, [pixelsq+lsizeq*4]
+ PUT_SIGNED_PIXELS_CLAMPED_HALF 64
+ RET
+%endmacro
+
+INIT_MMX mmx
+PUT_SIGNED_PIXELS_CLAMPED 0
+INIT_XMM sse2
+PUT_SIGNED_PIXELS_CLAMPED 3
+
+;--------------------------------------------------------------------------
+; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels,
+; ptrdiff_t line_size);
+;--------------------------------------------------------------------------
+; %1 = block offset
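+; Each half packs four rows of 16-bit coefficients to unsigned bytes with
+; saturation (packuswb) and stores them as 8-byte rows.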
+%macro PUT_PIXELS_CLAMPED_HALF 1
+ mova m0, [blockq+mmsize*0+%1]
+ mova m1, [blockq+mmsize*2+%1]
+%if mmsize == 8
+ mova m2, [blockq+mmsize*4+%1]
+ mova m3, [blockq+mmsize*6+%1]
+%endif
+ packuswb m0, [blockq+mmsize*1+%1]
+ packuswb m1, [blockq+mmsize*3+%1]
+%if mmsize == 8
+ packuswb m2, [blockq+mmsize*5+%1]
+ packuswb m3, [blockq+mmsize*7+%1]
+ movq [pixelsq], m0
+ movq [lsizeq+pixelsq], m1
+ movq [2*lsizeq+pixelsq], m2
+ movq [lsize3q+pixelsq], m3
+%else
+ movq [pixelsq], m0
+ movhps [lsizeq+pixelsq], m0
+ movq [2*lsizeq+pixelsq], m1
+ movhps [lsize3q+pixelsq], m1
+%endif
+%endmacro
+
+%macro PUT_PIXELS_CLAMPED 0
+cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3
+ lea lsize3q, [lsizeq*3]
+ PUT_PIXELS_CLAMPED_HALF 0
+ lea pixelsq, [pixelsq+lsizeq*4]
+ PUT_PIXELS_CLAMPED_HALF 64
+ RET
+%endmacro
+
+INIT_MMX mmx
+PUT_PIXELS_CLAMPED
+INIT_XMM sse2
+PUT_PIXELS_CLAMPED
+
+;--------------------------------------------------------------------------
+; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels,
+; ptrdiff_t line_size);
+;--------------------------------------------------------------------------
+; %1 = block offset
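+; Each invocation handles two rows: the 8-bit pixels are unpacked to words
+; against the zero register m4, added to the 16-bit block with signed
+; saturation, and repacked to bytes with unsigned saturation.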
+%macro ADD_PIXELS_CLAMPED 1
+ mova m0, [blockq+mmsize*0+%1]
+ mova m1, [blockq+mmsize*1+%1]
+%if mmsize == 8
+ mova m5, [blockq+mmsize*2+%1]
+ mova m6, [blockq+mmsize*3+%1]
+%endif
+ movq m2, [pixelsq]
+ movq m3, [pixelsq+lsizeq]
+%if mmsize == 8
+ mova m7, m2
+ punpcklbw m2, m4
+ punpckhbw m7, m4
+ paddsw m0, m2
+ paddsw m1, m7
+ mova m7, m3
+ punpcklbw m3, m4
+ punpckhbw m7, m4
+ paddsw m5, m3
+ paddsw m6, m7
+%else
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+ paddsw m0, m2
+ paddsw m1, m3
+%endif
+ packuswb m0, m1
+%if mmsize == 8
+ packuswb m5, m6
+ movq [pixelsq], m0
+ movq [pixelsq+lsizeq], m5
+%else
+ movq [pixelsq], m0
+ movhps [pixelsq+lsizeq], m0
+%endif
+%endmacro
+
+%macro ADD_PIXELS_CLAMPED 0
+cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize
+ pxor m4, m4
+ ADD_PIXELS_CLAMPED 0
+ lea pixelsq, [pixelsq+lsizeq*2]
+ ADD_PIXELS_CLAMPED 32
+ lea pixelsq, [pixelsq+lsizeq*2]
+ ADD_PIXELS_CLAMPED 64
+ lea pixelsq, [pixelsq+lsizeq*2]
+ ADD_PIXELS_CLAMPED 96
+ RET
+%endmacro
+
+INIT_MMX mmx
+ADD_PIXELS_CLAMPED
+INIT_XMM sse2
+ADD_PIXELS_CLAMPED
diff --git a/libavcodec/x86/idctdsp.h b/libavcodec/x86/idctdsp.h
index 6e6c68857d..0d0bdb5f57 100644
--- a/libavcodec/x86/idctdsp.h
+++ b/libavcodec/x86/idctdsp.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -24,9 +24,16 @@
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
+void ff_add_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+ ptrdiff_t line_size);
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
+void ff_put_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+ ptrdiff_t line_size);
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
+void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+ ptrdiff_t line_size);
+
#endif /* AVCODEC_X86_IDCTDSP_H */
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 853c6a3661..bcf7e5be0e 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -64,12 +64,10 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags)) {
- c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
- c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
- c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
-
if (!high_bit_depth &&
+ avctx->lowres == 0 &&
(avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
c->idct_put = ff_simple_idct_put_mmx;
c->idct_add = ff_simple_idct_add_mmx;
@@ -77,4 +75,52 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
c->perm_type = FF_IDCT_PERM_SIMPLE;
}
}
+ if (EXTERNAL_MMX(cpu_flags)) {
+ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
+ c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
+ c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
+ }
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
+ c->put_pixels_clamped = ff_put_pixels_clamped_sse2;
+ c->add_pixels_clamped = ff_add_pixels_clamped_sse2;
+ }
+
+ if (ARCH_X86_64 && avctx->lowres == 0) {
+ if (avctx->bits_per_raw_sample == 10 &&
+ (avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLE)) {
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->idct_put = ff_simple_idct10_put_sse2;
+ c->idct_add = NULL;
+ c->idct = ff_simple_idct10_sse2;
+ c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+
+ }
+ if (EXTERNAL_AVX(cpu_flags)) {
+ c->idct_put = ff_simple_idct10_put_avx;
+ c->idct_add = NULL;
+ c->idct = ff_simple_idct10_avx;
+ c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+ }
+ }
+
+ if (avctx->bits_per_raw_sample == 12 &&
+ (avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->idct_put = ff_simple_idct12_put_sse2;
+ c->idct_add = NULL;
+ c->idct = ff_simple_idct12_sse2;
+ c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+ }
+ if (EXTERNAL_AVX(cpu_flags)) {
+ c->idct_put = ff_simple_idct12_put_avx;
+ c->idct_add = NULL;
+ c->idct = ff_simple_idct12_avx;
+ c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+ }
+ }
+ }
}
diff --git a/libavcodec/x86/idctdsp_mmx.c b/libavcodec/x86/idctdsp_mmx.c
deleted file mode 100644
index 523f36816b..0000000000
--- a/libavcodec/x86/idctdsp_mmx.c
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * SIMD-optimized IDCT-related routines
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "idctdsp.h"
-#include "inline_asm.h"
-
-#if HAVE_INLINE_ASM
-
-void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
- ptrdiff_t line_size)
-{
- const int16_t *p;
- uint8_t *pix;
-
- /* read the pixels */
- p = block;
- pix = pixels;
- /* unrolled loop */
- __asm__ volatile (
- "movq (%3), %%mm0 \n\t"
- "movq 8(%3), %%mm1 \n\t"
- "movq 16(%3), %%mm2 \n\t"
- "movq 24(%3), %%mm3 \n\t"
- "movq 32(%3), %%mm4 \n\t"
- "movq 40(%3), %%mm5 \n\t"
- "movq 48(%3), %%mm6 \n\t"
- "movq 56(%3), %%mm7 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "packuswb %%mm5, %%mm4 \n\t"
- "packuswb %%mm7, %%mm6 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm2, (%0, %1) \n\t"
- "movq %%mm4, (%0, %1, 2) \n\t"
- "movq %%mm6, (%0, %2) \n\t"
- :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
- "r" (p)
- : "memory");
- pix += line_size * 4;
- p += 32;
-
- // if here would be an exact copy of the code above
- // compiler would generate some very strange code
- // thus using "r"
- __asm__ volatile (
- "movq (%3), %%mm0 \n\t"
- "movq 8(%3), %%mm1 \n\t"
- "movq 16(%3), %%mm2 \n\t"
- "movq 24(%3), %%mm3 \n\t"
- "movq 32(%3), %%mm4 \n\t"
- "movq 40(%3), %%mm5 \n\t"
- "movq 48(%3), %%mm6 \n\t"
- "movq 56(%3), %%mm7 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "packuswb %%mm5, %%mm4 \n\t"
- "packuswb %%mm7, %%mm6 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm2, (%0, %1) \n\t"
- "movq %%mm4, (%0, %1, 2) \n\t"
- "movq %%mm6, (%0, %2) \n\t"
- :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
- "r" (p)
- : "memory");
-}
-
-#define put_signed_pixels_clamped_mmx_half(off) \
- "movq "#off"(%2), %%mm1 \n\t" \
- "movq 16 + "#off"(%2), %%mm2 \n\t" \
- "movq 32 + "#off"(%2), %%mm3 \n\t" \
- "movq 48 + "#off"(%2), %%mm4 \n\t" \
- "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
- "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
- "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
- "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
- "paddb %%mm0, %%mm1 \n\t" \
- "paddb %%mm0, %%mm2 \n\t" \
- "paddb %%mm0, %%mm3 \n\t" \
- "paddb %%mm0, %%mm4 \n\t" \
- "movq %%mm1, (%0) \n\t" \
- "movq %%mm2, (%0, %3) \n\t" \
- "movq %%mm3, (%0, %3, 2) \n\t" \
- "movq %%mm4, (%0, %1) \n\t"
-
-void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
- ptrdiff_t line_size)
-{
- x86_reg line_skip = line_size;
- x86_reg line_skip3;
-
- __asm__ volatile (
- "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
- "lea (%3, %3, 2), %1 \n\t"
- put_signed_pixels_clamped_mmx_half(0)
- "lea (%0, %3, 4), %0 \n\t"
- put_signed_pixels_clamped_mmx_half(64)
- : "+&r" (pixels), "=&r" (line_skip3)
- : "r" (block), "r" (line_skip)
- : "memory");
-}
-
-void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
- ptrdiff_t line_size)
-{
- const int16_t *p;
- uint8_t *pix;
- int i;
-
- /* read the pixels */
- p = block;
- pix = pixels;
- MOVQ_ZERO(mm7);
- i = 4;
- do {
- __asm__ volatile (
- "movq (%2), %%mm0 \n\t"
- "movq 8(%2), %%mm1 \n\t"
- "movq 16(%2), %%mm2 \n\t"
- "movq 24(%2), %%mm3 \n\t"
- "movq %0, %%mm4 \n\t"
- "movq %1, %%mm6 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddsw %%mm4, %%mm0 \n\t"
- "paddsw %%mm5, %%mm1 \n\t"
- "movq %%mm6, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm6 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddsw %%mm6, %%mm2 \n\t"
- "paddsw %%mm5, %%mm3 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "movq %%mm0, %0 \n\t"
- "movq %%mm2, %1 \n\t"
- : "+m" (*pix), "+m" (*(pix + line_size))
- : "r" (p)
- : "memory");
- pix += line_size * 2;
- p += 16;
- } while (--i);
-}
-
-#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/imdct36.asm b/libavcodec/x86/imdct36.asm
index f85e2e4cc3..960eabdda5 100644
--- a/libavcodec/x86/imdct36.asm
+++ b/libavcodec/x86/imdct36.asm
@@ -2,20 +2,20 @@
;* 36 point SSE-optimized IMDCT transform
;* Copyright (c) 2011 Vitor Sessak
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -50,7 +50,7 @@ ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461
dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349
dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896
dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991
- dd 1.0, 0.70710678118654752439, 0.0, 0.0
+ dd 1.0, -0.70710678118654752439, 0.0, 0.0
costabs: times 4 dd 0.98480773
times 4 dd 0.93969262
@@ -129,7 +129,26 @@ SECTION .text
%endif
%endmacro
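+; Final half-width butterfly, factored out of the main body below: multiply
+; by the last cosh constants, swap lanes pairwise, then combine with addsubps
+; on SSE3 or an xorps/addps pair otherwise.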
+%macro BUTTERF2 3
+%if cpuflag(sse3)
+ mulps %1, %1, [ps_cosh_sse3 + %3]
+ PSHUFD %2, %1, 0xe1
+ addsubps %1, %1, %2
+%else
+ mulps %1, [ps_cosh + %3]
+ PSHUFD %2, %1, 0xe1
+ xorps %1, [ps_p1m1p1m1]
+ addps %1, %2
+%endif
+%endmacro
+
%macro STORE 4
+%if cpuflag(sse4)
+ movss [%3 ], %1
+ extractps dword [%3 + %4], %1, 1
+ extractps dword [%3 + 2*%4], %1, 2
+ extractps dword [%3 + 3*%4], %1, 3
+%else
movhlps %2, %1
movss [%3 ], %1
movss [%3 + 2*%4], %2
@@ -137,6 +156,7 @@ SECTION .text
movss [%3 + %4], %1
movhlps %2, %1
movss [%3 + 3*%4], %2
+%endif
%endmacro
%macro LOAD 4
@@ -279,11 +299,7 @@ cglobal imdct36_float, 4,4,9, out, buf, in, win
BUTTERF m7, m2, 16
BUTTERF m3, m6, 32
BUTTERF m4, m1, 48
-
- mulps m5, m5, [ps_cosh + 64]
- PSHUFD m1, m5, 0xe1
- xorps m5, m5, [ps_p1m1p1m1]
- addps m5, m5, m1
+ BUTTERF2 m5, m1, 64
; permutates:
; m0 0 1 2 3 => 2 6 10 14 m1
@@ -358,8 +374,10 @@ cglobal imdct36_float, 4,4,9, out, buf, in, win
RET
%endmacro
+%if ARCH_X86_32
INIT_XMM sse
DEFINE_IMDCT
+%endif
INIT_XMM sse2
DEFINE_IMDCT
@@ -370,8 +388,10 @@ DEFINE_IMDCT
INIT_XMM ssse3
DEFINE_IMDCT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_IMDCT
+%endif
INIT_XMM sse
@@ -716,5 +736,7 @@ cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
INIT_XMM sse
DEFINE_FOUR_IMDCT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_FOUR_IMDCT
+%endif
diff --git a/libavcodec/x86/inline_asm.h b/libavcodec/x86/inline_asm.h
index fc554bfc8d..0198746719 100644
--- a/libavcodec/x86/inline_asm.h
+++ b/libavcodec/x86/inline_asm.h
@@ -1,20 +1,20 @@
/*
* inline assembly helper macros
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -37,7 +37,7 @@
"paddb %%"#regd", %%"#regd" \n\t" ::)
#ifndef PIC
-#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
+#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_pw_2))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
diff --git a/libavcodec/x86/jpeg2000dsp.asm b/libavcodec/x86/jpeg2000dsp.asm
new file mode 100644
index 0000000000..56b5fbd606
--- /dev/null
+++ b/libavcodec/x86/jpeg2000dsp.asm
@@ -0,0 +1,144 @@
+;******************************************************************************
+;* SIMD-optimized JPEG2000 DSP functions
+;* Copyright (c) 2014 Nicolas Bertrand
+;* Copyright (c) 2015 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pf_ict0: times 8 dd 1.402
+pf_ict1: times 8 dd 0.34413
+pf_ict2: times 8 dd 0.71414
+pf_ict3: times 8 dd 1.772
+
+SECTION .text
+
+;***********************************************************************
+; ff_ict_float_<opt>(float *src0, float *src1, float *src2, int csize)
+;***********************************************************************
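+; Inverse irreversible color transform: on input src0/src1/src2 hold Y/Cb/Cr,
+; on output they hold R = Y + 1.402*Cr, G = Y - 0.34413*Cb - 0.71414*Cr and
+; B = Y + 1.772*Cb respectively.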
+%macro ICT_FLOAT 1
+cglobal ict_float, 4, 4, %1, src0, src1, src2, csize
+ shl csized, 2
+ add src0q, csizeq
+ add src1q, csizeq
+ add src2q, csizeq
+ neg csizeq
+ movaps m6, [pf_ict0]
+ movaps m7, [pf_ict1]
+ %define ICT0 m6
+ %define ICT1 m7
+
+%if ARCH_X86_64
+ movaps m8, [pf_ict2]
+ %define ICT2 m8
+%if cpuflag(avx)
+ movaps m3, [pf_ict3]
+ %define ICT3 m3
+%else
+ movaps m9, [pf_ict3]
+ %define ICT3 m9
+%endif
+
+%else ; ARCH_X86_32
+ %define ICT2 [pf_ict2]
+%if cpuflag(avx)
+ movaps m3, [pf_ict3]
+ %define ICT3 m3
+%else
+ %define ICT3 [pf_ict3]
+%endif
+
+%endif ; ARCH
+
+align 16
+.loop:
+ movaps m0, [src0q+csizeq]
+ movaps m1, [src1q+csizeq]
+ movaps m2, [src2q+csizeq]
+
+%if cpuflag(avx)
+ mulps m5, m1, ICT1
+ mulps m4, m2, ICT0
+ mulps m1, m1, ICT3
+ mulps m2, m2, ICT2
+ subps m5, m0, m5
+%else ; sse
+ movaps m3, m1
+ movaps m4, m2
+ movaps m5, m0
+ mulps m3, ICT1
+ mulps m4, ICT0
+ mulps m1, ICT3
+ mulps m2, ICT2
+ subps m5, m3
+%endif
+ addps m4, m4, m0
+ addps m0, m0, m1
+ subps m5, m5, m2
+
+ movaps [src0q+csizeq], m4
+ movaps [src2q+csizeq], m0
+ movaps [src1q+csizeq], m5
+ add csizeq, mmsize
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse
+ICT_FLOAT 10
+INIT_YMM avx
+ICT_FLOAT 9
+
+;***************************************************************************
+; ff_rct_int_<opt>(int32_t *src0, int32_t *src1, int32_t *src2, int csize)
+;***************************************************************************
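+; Inverse reversible color transform: G = src0 - ((src1 + src2) >> 2), then
+; R = src2 + G and B = src1 + G; the results are stored back to src0 (R),
+; src1 (G) and src2 (B).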
+%macro RCT_INT 0
+cglobal rct_int, 4, 4, 4, src0, src1, src2, csize
+ shl csized, 2
+ add src0q, csizeq
+ add src1q, csizeq
+ add src2q, csizeq
+ neg csizeq
+
+align 16
+.loop:
+ mova m1, [src1q+csizeq]
+ mova m2, [src2q+csizeq]
+ mova m0, [src0q+csizeq]
+ paddd m3, m1, m2
+ psrad m3, 2
+ psubd m0, m3
+ paddd m1, m0
+ paddd m2, m0
+ mova [src1q+csizeq], m0
+ mova [src2q+csizeq], m1
+ mova [src0q+csizeq], m2
+ add csizeq, mmsize
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+RCT_INT
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+RCT_INT
+%endif
diff --git a/libavcodec/x86/jpeg2000dsp_init.c b/libavcodec/x86/jpeg2000dsp_init.c
new file mode 100644
index 0000000000..baa81383ea
--- /dev/null
+++ b/libavcodec/x86/jpeg2000dsp_init.c
@@ -0,0 +1,50 @@
+/*
+ * SIMD optimized JPEG 2000 DSP functions
+ * Copyright (c) 2015 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/jpeg2000dsp.h"
+
+void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize);
+void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize);
+void ff_rct_int_sse2 (void *src0, void *src1, void *src2, int csize);
+void ff_rct_int_avx2 (void *src0, void *src1, void *src2, int csize);
+
+av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+ if (EXTERNAL_SSE(cpu_flags)) {
+ c->mct_decode[FF_DWT97] = ff_ict_float_sse;
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->mct_decode[FF_DWT53] = ff_rct_int_sse2;
+ }
+
+ if (EXTERNAL_AVX_FAST(cpu_flags)) {
+ c->mct_decode[FF_DWT97] = ff_ict_float_avx;
+ }
+
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->mct_decode[FF_DWT53] = ff_rct_int_avx2;
+ }
+}
diff --git a/libavcodec/x86/apedsp.asm b/libavcodec/x86/lossless_audiodsp.asm
index d6abd982e8..063d7b41af 100644
--- a/libavcodec/x86/apedsp.asm
+++ b/libavcodec/x86/lossless_audiodsp.asm
@@ -1,20 +1,20 @@
;******************************************************************************
;* Copyright (c) 2008 Loren Merritt
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -58,14 +58,7 @@ cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
mova [v1q + orderq + mmsize], m3
add orderq, mmsize*2
jl .loop
-%if mmsize == 16
- movhlps m0, m6
- paddd m6, m0
- pshuflw m0, m6, 0x4e
-%else
- pshufw m0, m6, 0x4e
-%endif
- paddd m6, m0
+ HADDD m6, m0
movd eax, m6
RET
%endmacro
@@ -75,6 +68,39 @@ SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT
+INIT_XMM sse4
+; int ff_scalarproduct_and_madd_int32(int16_t *v1, int32_t *v2, int16_t *v3,
+; int order, int mul)
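+; Returns the dot product of the original 16-bit v1 with the 32-bit v2 while
+; updating v1 in place: v1[i] += mul * v3[i] with 16-bit wraparound. The v1
+; words are sign-extended with pmovsxwd before the 32-bit multiply.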
+cglobal scalarproduct_and_madd_int32, 4,4,8, v1, v2, v3, order, mul
+ shl orderq, 1
+ movd m7, mulm
+ SPLATW m7, m7
+ pxor m6, m6
+ add v1q, orderq
+ lea v2q, [v2q + 2*orderq]
+ add v3q, orderq
+ neg orderq
+.loop:
+ mova m3, [v1q + orderq]
+ movu m0, [v2q + 2*orderq]
+ pmovsxwd m4, m3
+ movu m1, [v2q + 2*orderq + mmsize]
+ movhlps m5, m3
+ movu m2, [v3q + orderq]
+ pmovsxwd m5, m5
+ pmullw m2, m7
+ pmulld m0, m4
+ pmulld m1, m5
+ paddw m2, m3
+ paddd m6, m0
+ paddd m6, m1
+ mova [v1q + orderq], m2
+ add orderq, 16
+ jl .loop
+ HADDD m6, m0
+ movd eax, m6
+ RET
+
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
@@ -159,9 +185,6 @@ SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
- movhlps m0, m6
- paddd m6, m0
- pshuflw m0, m6, 0x4e
- paddd m6, m0
+ HADDD m6, m0
movd eax, m6
RET
diff --git a/libavcodec/x86/apedsp_init.c b/libavcodec/x86/lossless_audiodsp_init.c
index f692c2b9b6..10b6a65622 100644
--- a/libavcodec/x86/apedsp_init.c
+++ b/libavcodec/x86/lossless_audiodsp_init.c
@@ -1,25 +1,25 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
-#include "libavcodec/apedsp.h"
+#include "libavcodec/lossless_audiodsp.h"
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
const int16_t *v3,
@@ -31,8 +31,13 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
-av_cold void ff_apedsp_init_x86(APEDSPContext *c)
+int32_t ff_scalarproduct_and_madd_int32_sse4(int16_t *v1, const int32_t *v2,
+ const int16_t *v3,
+ int order, int mul);
+
+av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c)
{
+#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMXEXT(cpu_flags))
@@ -44,4 +49,8 @@ av_cold void ff_apedsp_init_x86(APEDSPContext *c)
if (EXTERNAL_SSSE3(cpu_flags) &&
!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+
+ if (EXTERNAL_SSE4(cpu_flags))
+ c->scalarproduct_and_madd_int32 = ff_scalarproduct_and_madd_int32_sse4;
+#endif
}
diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
new file mode 100644
index 0000000000..443fe02951
--- /dev/null
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -0,0 +1,290 @@
+;******************************************************************************
+;* SIMD lossless video DSP utils
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2014 Michael Niedermayer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pb_15
+pb_zzzzzzzz77777777: times 8 db -1
+pb_7: times 8 db 7
+pb_ef: times 8 db 14,15
+pb_67: times 8 db 6, 7
+pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
+pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
+pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
+pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7
+
+SECTION .text
+
+; void ff_add_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
+;                                const uint8_t *diff, ptrdiff_t w,
+; int *left, int *left_top)
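+; Per byte: prediction = mid(l, t, l + t - tl) and the residual from diffq is
+; added; l is the previous output byte and tl the previous top byte, so the
+; %rep below processes one byte of each vector at a time.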
+%macro MEDIAN_PRED 0
+cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
+ movu m0, [topq]
+ mova m2, m0
+ movd m4, [left_topq]
+ LSHIFT m2, 1
+ mova m1, m0
+ por m4, m2
+ movd m3, [leftq]
+ psubb m0, m4 ; t-tl
+ add dstq, wq
+ add topq, wq
+ add diffq, wq
+ neg wq
+ jmp .skip
+.loop:
+ movu m4, [topq+wq]
+ mova m0, m4
+ LSHIFT m4, 1
+ por m4, m1
+ mova m1, m0 ; t
+ psubb m0, m4 ; t-tl
+.skip:
+ movu m2, [diffq+wq]
+%assign i 0
+%rep mmsize
+ mova m4, m0
+ paddb m4, m3 ; t-tl+l
+ mova m5, m3
+ pmaxub m3, m1
+ pminub m5, m1
+ pminub m3, m4
+ pmaxub m3, m5 ; median
+ paddb m3, m2 ; +residual
+%if i==0
+ mova m7, m3
+ LSHIFT m7, mmsize-1
+%else
+ mova m6, m3
+ RSHIFT m7, 1
+ LSHIFT m6, mmsize-1
+ por m7, m6
+%endif
+%if i<mmsize-1
+ RSHIFT m0, 1
+ RSHIFT m1, 1
+ RSHIFT m2, 1
+%endif
+%assign i i+1
+%endrep
+ movu [dstq+wq], m7
+ add wq, mmsize
+ jl .loop
+ movzx r2d, byte [dstq-1]
+ mov [leftq], r2d
+ movzx r2d, byte [topq-1]
+ mov [left_topq], r2d
+ RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmxext
+MEDIAN_PRED
+%endif
+INIT_XMM sse2
+MEDIAN_PRED
+
+
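+; Left prediction over bytes as a prefix sum: within each vector the running
+; sum is built with shift-and-add steps (psllw/pshufb), and m0 carries the
+; last byte of the previous vector, broadcast with pshufb, into the next one.
+; The final accumulated left value is returned in eax.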
+%macro ADD_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
+ add srcq, wq
+ add dstq, wq
+ neg wq
+%%.loop:
+%if %2
+ mova m1, [srcq+wq]
+%else
+ movu m1, [srcq+wq]
+%endif
+ mova m2, m1
+ psllw m1, 8
+ paddb m1, m2
+ mova m2, m1
+ pshufb m1, m3
+ paddb m1, m2
+ pshufb m0, m5
+ mova m2, m1
+ pshufb m1, m4
+ paddb m1, m2
+%if mmsize == 16
+ mova m2, m1
+ pshufb m1, m6
+ paddb m1, m2
+%endif
+ paddb m0, m1
+%if %1
+ mova [dstq+wq], m0
+%else
+ movq [dstq+wq], m0
+ movhps [dstq+wq+8], m0
+%endif
+ add wq, mmsize
+ jl %%.loop
+ mov eax, mmsize-1
+ sub eax, wd
+ movd m1, eax
+ pshufb m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+; int ff_add_left_pred(uint8_t *dst, const uint8_t *src, ptrdiff_t w, int left)
+INIT_MMX ssse3
+cglobal add_left_pred, 3,3,7, dst, src, w, left
+.skip_prologue:
+ mova m5, [pb_7]
+ mova m4, [pb_zzzz3333zzzzbbbb]
+ mova m3, [pb_zz11zz55zz99zzdd]
+ movd m0, leftm
+ psllq m0, 56
+ ADD_LEFT_LOOP 1, 1
+
+INIT_XMM ssse3
+cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
+ mova m5, [pb_15]
+ mova m6, [pb_zzzzzzzz77777777]
+ mova m4, [pb_zzzz3333zzzzbbbb]
+ mova m3, [pb_zz11zz55zz99zzdd]
+ movd m0, leftm
+ pslldq m0, 15
+ test srcq, 15
+ jnz .src_unaligned
+ test dstq, 15
+ jnz .dst_unaligned
+ ADD_LEFT_LOOP 1, 1
+.dst_unaligned:
+ ADD_LEFT_LOOP 0, 1
+.src_unaligned:
+ ADD_LEFT_LOOP 0, 0
+
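+; dst[i] += src[i] for w bytes: the bulk is done two vectors at a time, any
+; remainder with a scalar byte loop.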
+%macro ADD_BYTES 0
+cglobal add_bytes, 3,4,2, dst, src, w, size
+ mov sizeq, wq
+ and sizeq, -2*mmsize
+ jz .2
+ add dstq, sizeq
+ add srcq, sizeq
+ neg sizeq
+.1:
+ mova m0, [srcq + sizeq]
+ mova m1, [srcq + sizeq + mmsize]
+ paddb m0, [dstq + sizeq]
+ paddb m1, [dstq + sizeq + mmsize]
+ mova [dstq + sizeq], m0
+ mova [dstq + sizeq + mmsize], m1
+ add sizeq, 2*mmsize
+ jl .1
+.2:
+ and wq, 2*mmsize-1
+ jz .end
+ add dstq, wq
+ add srcq, wq
+ neg wq
+.3:
+ mov sizeb, [srcq + wq]
+ add [dstq + wq], sizeb
+ inc wq
+ jl .3
+.end:
+ REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+ADD_BYTES
+%endif
+INIT_XMM sse2
+ADD_BYTES
+
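+; 16-bit variant of the left-prediction prefix sum: words are accumulated
+; with pslld/pshufb steps, masked to the sample bit depth (m7), and the last
+; word of each vector is carried into the next one through m0.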
+%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
+ add wd, wd
+ add srcq, wq
+ add dstq, wq
+ neg wq
+%%.loop:
+ mov%2 m1, [srcq+wq]
+ mova m2, m1
+ pslld m1, 16
+ paddw m1, m2
+ mova m2, m1
+
+ pshufb m1, m3
+ paddw m1, m2
+ pshufb m0, m5
+%if mmsize == 16
+ mova m2, m1
+ pshufb m1, m4
+ paddw m1, m2
+%endif
+ paddw m0, m1
+ pand m0, m7
+%ifidn %1, a
+ mova [dstq+wq], m0
+%else
+ movq [dstq+wq], m0
+ movhps [dstq+wq+8], m0
+%endif
+ add wq, mmsize
+ jl %%.loop
+ mov eax, mmsize-1
+ sub eax, wd
+ mov wd, eax
+ shl wd, 8
+ lea eax, [wd+eax-1]
+ movd m1, eax
+ pshufb m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+; int ff_add_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc)
+INIT_MMX ssse3
+cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
+.skip_prologue:
+ mova m5, [pb_67]
+ mova m3, [pb_zzzz2323zzzzabab]
+ movd m0, leftm
+ psllq m0, 48
+ movd m7, maskm
+    SPLATW          m7, m7
+ ADD_HFYU_LEFT_LOOP_INT16 a, a
+
+INIT_XMM sse4
+cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
+ mova m5, [pb_ef]
+ mova m4, [pb_zzzzzzzz67676767]
+ mova m3, [pb_zzzz2323zzzzabab]
+ movd m0, leftm
+ pslldq m0, 14
+ movd m7, maskm
+    SPLATW          m7, m7
+ test srcq, 15
+ jnz .src_unaligned
+ test dstq, 15
+ jnz .dst_unaligned
+ ADD_HFYU_LEFT_LOOP_INT16 a, a
+.dst_unaligned:
+ ADD_HFYU_LEFT_LOOP_INT16 u, a
+.src_unaligned:
+ ADD_HFYU_LEFT_LOOP_INT16 u, u
diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c
new file mode 100644
index 0000000000..21bbd12bd2
--- /dev/null
+++ b/libavcodec/x86/lossless_videodsp_init.c
@@ -0,0 +1,118 @@
+/*
+ * Lossless video DSP utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/x86/asm.h"
+#include "../lossless_videodsp.h"
+#include "libavutil/x86/cpu.h"
+
+void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t w);
+void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t w);
+
+void ff_add_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
+ const uint8_t *diff, ptrdiff_t w,
+ int *left, int *left_top);
+void ff_add_median_pred_sse2(uint8_t *dst, const uint8_t *top,
+ const uint8_t *diff, ptrdiff_t w,
+ int *left, int *left_top);
+
+int ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t w, int left);
+int ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t w, int left);
+
+int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
+int ff_add_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
+
+#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
+static void add_median_pred_cmov(uint8_t *dst, const uint8_t *top,
+ const uint8_t *diff, ptrdiff_t w,
+ int *left, int *left_top)
+{
+ x86_reg w2 = -w;
+ x86_reg x;
+ int l = *left & 0xff;
+ int tl = *left_top & 0xff;
+ int t;
+ __asm__ volatile (
+ "mov %7, %3 \n"
+ "1: \n"
+ "movzbl (%3, %4), %2 \n"
+ "mov %2, %k3 \n"
+ "sub %b1, %b3 \n"
+ "add %b0, %b3 \n"
+ "mov %2, %1 \n"
+ "cmp %0, %2 \n"
+ "cmovg %0, %2 \n"
+ "cmovg %1, %0 \n"
+ "cmp %k3, %0 \n"
+ "cmovg %k3, %0 \n"
+ "mov %7, %3 \n"
+ "cmp %2, %0 \n"
+ "cmovl %2, %0 \n"
+ "add (%6, %4), %b0 \n"
+ "mov %b0, (%5, %4) \n"
+ "inc %4 \n"
+ "jl 1b \n"
+ : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
+ : "r"(dst + w), "r"(diff + w), "rm"(top + w)
+ );
+ *left = l;
+ *left_top = tl;
+}
+#endif
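The cmov loop above implements the classic HuffYUV median predictor: each output byte is the stored difference plus the median of left, top and left+top-topleft. A scalar sketch of that recurrence (helper names illustrative, usual mid-pred semantics assumed):

#include <stddef.h>
#include <stdint.h>

static int mid_pred3(int a, int b, int c)
{
    /* median of three values */
    if (a > b) { int t = a; a = b; b = t; }
    return c < a ? a : c > b ? b : c;
}

/* Scalar sketch of add_median_pred. */
static void add_median_pred_ref(uint8_t *dst, const uint8_t *top,
                                const uint8_t *diff, ptrdiff_t w,
                                int *left, int *left_top)
{
    int l = *left & 0xff, lt = *left_top & 0xff;
    ptrdiff_t i;
    for (i = 0; i < w; i++) {
        l      = (mid_pred3(l, top[i], (l + top[i] - lt) & 0xff) + diff[i]) & 0xff;
        lt     = top[i];
        dst[i] = l;
    }
    *left     = l;
    *left_top = lt;
}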
+
+void ff_llviddsp_init_x86(LLVidDSPContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
+ if (cpu_flags & AV_CPU_FLAG_CMOV)
+ c->add_median_pred = add_median_pred_cmov;
+#endif
+
+ if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
+ c->add_bytes = ff_add_bytes_mmx;
+ }
+
+ if (ARCH_X86_32 && EXTERNAL_MMXEXT(cpu_flags)) {
+ /* slower than cmov version on AMD */
+ if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
+ c->add_median_pred = ff_add_median_pred_mmxext;
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->add_bytes = ff_add_bytes_sse2;
+ c->add_median_pred = ff_add_median_pred_sse2;
+ }
+
+ if (EXTERNAL_SSSE3(cpu_flags)) {
+ c->add_left_pred = ff_add_left_pred_ssse3;
+ c->add_left_pred_int16 = ff_add_left_pred_int16_ssse3;
+ }
+
+ if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
+ c->add_left_pred = ff_add_left_pred_unaligned_ssse3;
+ }
+
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ c->add_left_pred_int16 = ff_add_left_pred_int16_sse4;
+ }
+}
diff --git a/libavcodec/x86/lossless_videoencdsp.asm b/libavcodec/x86/lossless_videoencdsp.asm
new file mode 100644
index 0000000000..63fd72174a
--- /dev/null
+++ b/libavcodec/x86/lossless_videoencdsp.asm
@@ -0,0 +1,150 @@
+;************************************************************************
+;* SIMD-optimized lossless video encoding functions
+;* Copyright (c) 2000, 2001 Fabrice Bellard
+;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+;*
+;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+section .text
+
+; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+; intptr_t w);
+%macro DIFF_BYTES_PROLOGUE 0
+%if ARCH_X86_32
+cglobal diff_bytes, 3,5,2, dst, src1, src2
+%define wq r4q
+ DECLARE_REG_TMP 3
+ mov wq, r3mp
+%else
+cglobal diff_bytes, 4,5,2, dst, src1, src2, w
+ DECLARE_REG_TMP 4
+%endif ; ARCH_X86_32
+%define i t0q
+%endmacro
+
+; label to jump to if w < regsize
+%macro DIFF_BYTES_LOOP_PREP 1
+ mov i, wq
+ and i, -2 * regsize
+ jz %1
+ add dstq, i
+ add src1q, i
+ add src2q, i
+ neg i
+%endmacro
+
+; mov type used for src1q, dstq, first reg, second reg
+%macro DIFF_BYTES_LOOP_CORE 4
+%if mmsize != 16
+ mov%1 %3, [src1q + i]
+ mov%1 %4, [src1q + i + regsize]
+ psubb %3, [src2q + i]
+ psubb %4, [src2q + i + regsize]
+ mov%2 [dstq + i], %3
+ mov%2 [regsize + dstq + i], %4
+%else
+ ; SSE enforces alignment of psubb operand
+ mov%1 %3, [src1q + i]
+ movu %4, [src2q + i]
+ psubb %3, %4
+ mov%2 [dstq + i], %3
+ mov%1 %3, [src1q + i + regsize]
+ movu %4, [src2q + i + regsize]
+ psubb %3, %4
+ mov%2 [regsize + dstq + i], %3
+%endif
+%endmacro
+
+%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
+ %define regsize mmsize
+.loop_%1%2:
+ DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
+ add i, 2 * regsize
+ jl .loop_%1%2
+.skip_main_%1%2:
+ and wq, 2 * regsize - 1
+ jz .end_%1%2
+%if mmsize > 16
+ ; fall back to narrower xmm
+ %define regsize mmsize / 2
+ DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa
+.loop2_%1%2:
+ DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
+ add i, 2 * regsize
+ jl .loop2_%1%2
+.setup_loop_gpr_%1%2:
+ and wq, 2 * regsize - 1
+ jz .end_%1%2
+%endif
+ add dstq, wq
+ add src1q, wq
+ add src2q, wq
+ neg wq
+.loop_gpr_%1%2:
+ mov t0b, [src1q + wq]
+ sub t0b, [src2q + wq]
+ mov [dstq + wq], t0b
+ inc wq
+ jl .loop_gpr_%1%2
+.end_%1%2:
+ REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+DIFF_BYTES_PROLOGUE
+ %define regsize mmsize
+ DIFF_BYTES_LOOP_PREP .skip_main_aa
+ DIFF_BYTES_BODY a, a
+%undef i
+%endif
+
+INIT_XMM sse2
+DIFF_BYTES_PROLOGUE
+ %define regsize mmsize
+ DIFF_BYTES_LOOP_PREP .skip_main_aa
+ test dstq, regsize - 1
+ jnz .loop_uu
+ test src1q, regsize - 1
+ jnz .loop_ua
+ DIFF_BYTES_BODY a, a
+ DIFF_BYTES_BODY u, a
+ DIFF_BYTES_BODY u, u
+%undef i
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+DIFF_BYTES_PROLOGUE
+ %define regsize mmsize
+ ; Directly using unaligned SSE2 version is marginally faster than
+ ; branching based on arguments.
+ DIFF_BYTES_LOOP_PREP .skip_main_uu
+ test dstq, regsize - 1
+ jnz .loop_uu
+ test src1q, regsize - 1
+ jnz .loop_ua
+ DIFF_BYTES_BODY a, a
+ DIFF_BYTES_BODY u, a
+ DIFF_BYTES_BODY u, u
+%undef i
+%endif
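diff_bytes is the encoder-side inverse of add_bytes; a scalar sketch of what all three variants compute (illustrative helper name):

#include <stdint.h>

/* Scalar sketch of diff_bytes: dst[i] = src1[i] - src2[i], wrapping mod 256. */
static void diff_bytes_ref(uint8_t *dst, const uint8_t *src1,
                           const uint8_t *src2, intptr_t w)
{
    intptr_t i;
    for (i = 0; i < w; i++)
        dst[i] = src1[i] - src2[i];
}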
diff --git a/libavcodec/x86/huffyuvencdsp_mmx.c b/libavcodec/x86/lossless_videoencdsp_init.c
index 8ffaced37d..fc728c9fd1 100644
--- a/libavcodec/x86/huffyuvencdsp_mmx.c
+++ b/libavcodec/x86/lossless_videoencdsp_init.c
@@ -1,24 +1,24 @@
/*
- * SIMD-optimized HuffYUV encoding functions
+ * SIMD-optimized lossless video encoding functions
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -26,38 +26,21 @@
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
-#include "libavcodec/huffyuvencdsp.h"
+#include "libavcodec/lossless_videoencdsp.h"
#include "libavcodec/mathops.h"
-#if HAVE_INLINE_ASM
-
-static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
-{
- x86_reg i = 0;
-
- __asm__ volatile (
- "1: \n\t"
- "movq (%2, %0), %%mm0 \n\t"
- "movq (%1, %0), %%mm1 \n\t"
- "psubb %%mm0, %%mm1 \n\t"
- "movq %%mm1, (%3, %0) \n\t"
- "movq 8(%2, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- "psubb %%mm0, %%mm1 \n\t"
- "movq %%mm1, 8(%3, %0) \n\t"
- "add $16, %0 \n\t"
- "cmp %4, %0 \n\t"
- " jb 1b \n\t"
- : "+r" (i)
- : "r" (src1), "r" (src2), "r" (dst), "r" ((x86_reg) w - 15));
+void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+ intptr_t w);
+void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+ intptr_t w);
+void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+ intptr_t w);
- for (; i < w; i++)
- dst[i + 0] = src1[i + 0] - src2[i + 0];
-}
+#if HAVE_INLINE_ASM
-static void sub_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *src1,
- const uint8_t *src2, int w,
- int *left, int *left_top)
+static void sub_median_pred_mmxext(uint8_t *dst, const uint8_t *src1,
+ const uint8_t *src2, intptr_t w,
+ int *left, int *left_top)
{
x86_reg i = 0;
uint8_t l, lt;
@@ -97,17 +80,25 @@ static void sub_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *src1,
#endif /* HAVE_INLINE_ASM */
-av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c)
+av_cold void ff_llvidencdsp_init_x86(LLVidEncDSPContext *c)
{
-#if HAVE_INLINE_ASM
- int cpu_flags = av_get_cpu_flags();
+ av_unused int cpu_flags = av_get_cpu_flags();
- if (INLINE_MMX(cpu_flags)) {
- c->diff_bytes = diff_bytes_mmx;
+ if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
+ c->diff_bytes = ff_diff_bytes_mmx;
}
+#if HAVE_INLINE_ASM
if (INLINE_MMXEXT(cpu_flags)) {
- c->sub_hfyu_median_pred = sub_hfyu_median_pred_mmxext;
+ c->sub_median_pred = sub_median_pred_mmxext;
}
#endif /* HAVE_INLINE_ASM */
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->diff_bytes = ff_diff_bytes_sse2;
+ }
+
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->diff_bytes = ff_diff_bytes_avx2;
+ }
}
diff --git a/libavcodec/x86/lpc.c b/libavcodec/x86/lpc.c
index e8cce42af4..6c72e21bac 100644
--- a/libavcodec/x86/lpc.c
+++ b/libavcodec/x86/lpc.c
@@ -2,26 +2,25 @@
* SIMD-optimized LPC functions
* Copyright (c) 2007 Loren Merritt
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
-#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
@@ -73,6 +72,7 @@ static void lpc_apply_welch_window_sse2(const int32_t *data, int len,
"3: \n\t"
:"+&r"(i), "+&r"(j)
:"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len)
+ NAMED_CONSTRAINTS_ARRAY_ADD(pd_1,pd_2)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm5", "%xmm6", "%xmm7")
);
@@ -117,6 +117,7 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
"movsd %%xmm2, 16(%1) \n\t"
:"+&r"(i)
:"r"(autoc+j), "r"(data+len), "r"(data+len-j)
+ NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
:"memory"
);
} else {
@@ -140,6 +141,7 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
"movsd %%xmm1, %2 \n\t"
:"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
:"r"(data+len), "r"(data+len-j)
+ NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
);
}
}
@@ -152,7 +154,7 @@ av_cold void ff_lpc_init_x86(LPCContext *c)
#if HAVE_SSE2_INLINE
int cpu_flags = av_get_cpu_flags();
- if (INLINE_SSE2_SLOW(cpu_flags)) {
+ if (INLINE_SSE2(cpu_flags) || INLINE_SSE2_SLOW(cpu_flags)) {
c->lpc_apply_welch_window = lpc_apply_welch_window_sse2;
c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
}
diff --git a/libavcodec/x86/mathops.h b/libavcodec/x86/mathops.h
index 2c04d9d1bd..6298f5ed19 100644
--- a/libavcodec/x86/mathops.h
+++ b/libavcodec/x86/mathops.h
@@ -2,20 +2,20 @@
* simple math operations
* Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -105,7 +105,7 @@ __asm__ volatile(\
#endif /* HAVE_I686 */
#define MASK_ABS(mask, level) \
- __asm__ ("cltd \n\t" \
+ __asm__ ("cdq \n\t" \
"xorl %1, %0 \n\t" \
"subl %1, %0 \n\t" \
: "+a"(level), "=&d"(mask))
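The cdq/xorl/subl sequence above is the standard branchless absolute value: cdq spreads the sign of the value into the mask register, and xor+sub then negates negative inputs. A C sketch of the two results MASK_ABS leaves behind (illustrative helper, assumes 32-bit int and arithmetic right shift):

/* Sketch of MASK_ABS(mask, level): mask = sign mask of level, level = |level|. */
static inline void mask_abs_ref(int *mask, int *level)
{
    int m  = *level >> 31;        /* 0 for non-negative, -1 for negative (cdq) */
    *level = (*level ^ m) - m;    /* xorl + subl */
    *mask  = m;
}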
diff --git a/libavcodec/x86/mdct.h b/libavcodec/x86/mdct.h
deleted file mode 100644
index cc107cb86a..0000000000
--- a/libavcodec/x86/mdct.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_MDCT_H
-#define AVCODEC_X86_MDCT_H
-
-#include "libavcodec/fft.h"
-
-void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
-
-#endif /* AVCODEC_X86_MDCT_H */
diff --git a/libavcodec/x86/mdct_init.c b/libavcodec/x86/mdct_init.c
deleted file mode 100644
index db642d863d..0000000000
--- a/libavcodec/x86/mdct_init.c
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/cpu.h"
-
-#include "mdct.h"
-
-av_cold void ff_mdct_init_x86(FFTContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
-#if ARCH_X86_32
- if (EXTERNAL_AMD3DNOW(cpu_flags)) {
- s->imdct_calc = ff_imdct_calc_3dnow;
- s->imdct_half = ff_imdct_half_3dnow;
- }
-
- if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
- s->imdct_calc = ff_imdct_calc_3dnowext;
- s->imdct_half = ff_imdct_half_3dnowext;
- }
-#endif /* ARCH_X86_32 */
-
- if (EXTERNAL_SSE(cpu_flags)) {
- s->imdct_calc = ff_imdct_calc_sse;
- s->imdct_half = ff_imdct_half_sse;
- }
-
- if (EXTERNAL_AVX_FAST(cpu_flags) && s->nbits >= 5) {
- s->imdct_half = ff_imdct_half_avx;
- }
-}
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 1a87f37b39..ad06d485ab 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -4,25 +4,30 @@
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
%include "libavutil/x86/x86util.asm"
+SECTION_RODATA
+
+cextern pb_1
+cextern pb_80
+
SECTION .text
%macro DIFF_PIXELS_1 4
@@ -210,7 +215,7 @@ hadamard8_16_wrapper %1, 3
%elif cpuflag(mmx)
ALIGN 16
; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,
-; uint8_t *src2, int stride, int h)
+; uint8_t *src2, ptrdiff_t stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so 16x16
; can simply call this 2x2x (and that's why we access rsp+gprsize
@@ -274,19 +279,27 @@ INIT_XMM ssse3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9
-INIT_XMM sse2
-; int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-; int line_size, int h);
-cglobal sse16, 5, 5, 8
- shr r4d, 1
+; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+; ptrdiff_t line_size, int h)
+
+%macro SUM_SQUARED_ERRORS 1
+cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
+%if %1 == mmsize
+ shr hd, 1
+%endif
pxor m0, m0 ; mm0 = 0
pxor m7, m7 ; mm7 holds the sum
.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
- movu m1, [r1 ] ; mm1 = pix1[0][0-15]
- movu m2, [r2 ] ; mm2 = pix2[0][0-15]
- movu m3, [r1+r3] ; mm3 = pix1[1][0-15]
- movu m4, [r2+r3] ; mm4 = pix2[1][0-15]
+ movu m1, [pix1q] ; m1 = pix1[0][0-15], [0-7] for mmx
+ movu m2, [pix2q] ; m2 = pix2[0][0-15], [0-7] for mmx
+%if %1 == mmsize
+ movu m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx
+ movu m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx
+%else ; %1 / 2 == mmsize; mmx only
+ mova m3, [pix1q+8] ; m3 = pix1[0][8-15]
+ mova m4, [pix2q+8] ; m4 = pix2[0][8-15]
+%endif
; todo: mm1-mm2, mm3-mm4
; algo: subtract mm1 from mm2 with saturation and vice versa
@@ -315,22 +328,607 @@ cglobal sse16, 5, 5, 8
pmaddwd m1, m1
pmaddwd m3, m3
- lea r1, [r1+r3*2] ; pix1 += 2*line_size
- lea r2, [r2+r3*2] ; pix2 += 2*line_size
-
paddd m1, m2
paddd m3, m4
paddd m7, m1
paddd m7, m3
- dec r4
+%if %1 == mmsize
+ lea pix1q, [pix1q + 2*lsizeq]
+ lea pix2q, [pix2q + 2*lsizeq]
+%else
+ add pix1q, lsizeq
+ add pix2q, lsizeq
+%endif
+ dec hd
jnz .next2lines
- mova m1, m7
- psrldq m7, 8 ; shift hi qword to lo
- paddd m7, m1
- mova m1, m7
- psrldq m7, 4 ; shift hi dword to lo
- paddd m7, m1
+ HADDD m7, m1
movd eax, m7 ; return value
RET
+%endmacro
+
+INIT_MMX mmx
+SUM_SQUARED_ERRORS 8
+
+INIT_MMX mmx
+SUM_SQUARED_ERRORS 16
+
+INIT_XMM sse2
+SUM_SQUARED_ERRORS 16
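These kernels return the sum of squared pixel differences over an 8- or 16-pixel-wide block; a scalar sketch of the metric (illustrative helper name):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of ff_sse{8,16}: sum of squared differences over a w x h block. */
static int sse_block_ref(const uint8_t *pix1, const uint8_t *pix2,
                         ptrdiff_t line_size, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int d = pix1[x] - pix2[x];
            sum  += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}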
+
+;-----------------------------------------------
+;int ff_sum_abs_dctelem(int16_t *block)
+;-----------------------------------------------
+; %1 = number of xmm registers used
+; %2 = number of inline loops
+
+%macro SUM_ABS_DCTELEM 2
+cglobal sum_abs_dctelem, 1, 1, %1, block
+ pxor m0, m0
+ pxor m1, m1
+%assign %%i 0
+%rep %2
+ mova m2, [blockq+mmsize*(0+%%i)]
+ mova m3, [blockq+mmsize*(1+%%i)]
+ mova m4, [blockq+mmsize*(2+%%i)]
+ mova m5, [blockq+mmsize*(3+%%i)]
+ ABS1_SUM m2, m6, m0
+ ABS1_SUM m3, m6, m1
+ ABS1_SUM m4, m6, m0
+ ABS1_SUM m5, m6, m1
+%assign %%i %%i+4
+%endrep
+ paddusw m0, m1
+ HSUM m0, m1, eax
+ and eax, 0xFFFF
+ RET
+%endmacro
+
+INIT_MMX mmx
+SUM_ABS_DCTELEM 0, 4
+INIT_MMX mmxext
+SUM_ABS_DCTELEM 0, 4
+INIT_XMM sse2
+SUM_ABS_DCTELEM 7, 2
+INIT_XMM ssse3
+SUM_ABS_DCTELEM 6, 2
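sum_abs_dctelem sums the absolute values of all 64 coefficients of an 8x8 transform block; a scalar sketch (illustrative; the asm accumulates with unsigned saturation and masks the result to 16 bits):

#include <stdint.h>

/* Scalar sketch of ff_sum_abs_dctelem. */
static int sum_abs_dctelem_ref(const int16_t *block)
{
    int sum = 0;
    for (int i = 0; i < 64; i++)
        sum += block[i] < 0 ? -block[i] : block[i];
    return sum & 0xffff;
}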
+
+;------------------------------------------------------------------------------
+; int ff_hf_noise*_mmx(uint8_t *pix1, ptrdiff_t lsize, int h)
+;------------------------------------------------------------------------------
+; %1 = 8/16. %2-5=m#
+%macro HF_NOISE_PART1 5
+ mova m%2, [pix1q]
+%if %1 == 8
+ mova m%3, m%2
+ psllq m%2, 8
+ psrlq m%3, 8
+ psrlq m%2, 8
+%else
+ mova m%3, [pix1q+1]
+%endif
+ mova m%4, m%2
+ mova m%5, m%3
+ punpcklbw m%2, m7
+ punpcklbw m%3, m7
+ punpckhbw m%4, m7
+ punpckhbw m%5, m7
+ psubw m%2, m%3
+ psubw m%4, m%5
+%endmacro
+
+; %1-4 = m#
+%macro HF_NOISE_PART2 4
+ psubw m%1, m%3
+ psubw m%2, m%4
+ pxor m3, m3
+ pxor m1, m1
+ pcmpgtw m3, m%1
+ pcmpgtw m1, m%2
+ pxor m%1, m3
+ pxor m%2, m1
+ psubw m%1, m3
+ psubw m%2, m1
+ paddw m%2, m%1
+ paddw m6, m%2
+%endmacro
+
+; %1 = 8/16
+%macro HF_NOISE 1
+cglobal hf_noise%1, 3,3,0, pix1, lsize, h
+ sub hd, 2
+ pxor m7, m7
+ pxor m6, m6
+ HF_NOISE_PART1 %1, 0, 1, 2, 3
+ add pix1q, lsizeq
+ HF_NOISE_PART1 %1, 4, 1, 5, 3
+ HF_NOISE_PART2 0, 2, 4, 5
+ add pix1q, lsizeq
+.loop:
+ HF_NOISE_PART1 %1, 0, 1, 2, 3
+ HF_NOISE_PART2 4, 5, 0, 2
+ add pix1q, lsizeq
+ HF_NOISE_PART1 %1, 4, 1, 5, 3
+ HF_NOISE_PART2 0, 2, 4, 5
+ add pix1q, lsizeq
+ sub hd, 2
+ jne .loop
+
+ mova m0, m6
+ punpcklwd m0, m7
+ punpckhwd m6, m7
+ paddd m6, m0
+ mova m0, m6
+ psrlq m6, 32
+ paddd m0, m6
+ movd eax, m0 ; eax = result of hf_noise8;
+ REP_RET ; return eax;
+%endmacro
+
+INIT_MMX mmx
+HF_NOISE 8
+HF_NOISE 16
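Roughly, the hf_noise kernels measure high-frequency content as the sum of absolute second differences: a horizontal first difference per row, then the absolute vertical difference of those, accumulated over the block. A scalar sketch of that quantity (illustrative only; edge handling in the asm differs slightly):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of the quantity hf_noise accumulates. */
static int hf_noise_ref(const uint8_t *pix, ptrdiff_t lsize, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h - 1; y++) {
        for (int x = 0; x < w - 1; x++) {
            int d0 = pix[x]         - pix[x + 1];
            int d1 = pix[lsize + x] - pix[lsize + x + 1];
            sum   += d0 > d1 ? d0 - d1 : d1 - d0;
        }
        pix += lsize;
    }
    return sum;
}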
+
+;---------------------------------------------------------------------------------------
+;int ff_sad_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
+;---------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD 1
+cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
+ movu m2, [pix2q]
+ movu m1, [pix2q+strideq]
+ psadbw m2, [pix1q]
+ psadbw m1, [pix1q+strideq]
+ paddw m2, m1
+%if %1 != mmsize
+ movu m0, [pix2q+8]
+ movu m1, [pix2q+strideq+8]
+ psadbw m0, [pix1q+8]
+ psadbw m1, [pix1q+strideq+8]
+ paddw m2, m0
+ paddw m2, m1
+%endif
+ sub hd, 2
+
+align 16
+.loop:
+ lea pix1q, [pix1q+strideq*2]
+ lea pix2q, [pix2q+strideq*2]
+ movu m0, [pix2q]
+ movu m1, [pix2q+strideq]
+ psadbw m0, [pix1q]
+ psadbw m1, [pix1q+strideq]
+ paddw m2, m0
+ paddw m2, m1
+%if %1 != mmsize
+ movu m0, [pix2q+8]
+ movu m1, [pix2q+strideq+8]
+ psadbw m0, [pix1q+8]
+ psadbw m1, [pix1q+strideq+8]
+ paddw m2, m0
+ paddw m2, m1
+%endif
+ sub hd, 2
+ jg .loop
+%if mmsize == 16
+ movhlps m0, m2
+ paddw m2, m0
+%endif
+ movd eax, m2
+ RET
+%endmacro
+
+INIT_MMX mmxext
+SAD 8
+SAD 16
+INIT_XMM sse2
+SAD 16
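These are plain SADs: the sum of absolute pixel differences over the block. A scalar sketch (illustrative helper name):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of ff_sad{8,16}: sum of absolute differences over a w x h block. */
static int sad_ref(const uint8_t *pix1, const uint8_t *pix2,
                   ptrdiff_t stride, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int d = pix1[x] - pix2[x];
            sum  += d < 0 ? -d : d;
        }
        pix1 += stride;
        pix2 += stride;
    }
    return sum;
}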
+
+;------------------------------------------------------------------------------------------
+;int ff_sad_x2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
+;------------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD_X2 1
+cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h
+ movu m0, [pix2q]
+ movu m2, [pix2q+strideq]
+%if mmsize == 16
+ movu m3, [pix2q+1]
+ movu m4, [pix2q+strideq+1]
+ pavgb m0, m3
+ pavgb m2, m4
+%else
+ pavgb m0, [pix2q+1]
+ pavgb m2, [pix2q+strideq+1]
+%endif
+ psadbw m0, [pix1q]
+ psadbw m2, [pix1q+strideq]
+ paddw m0, m2
+%if %1 != mmsize
+ movu m1, [pix2q+8]
+ movu m2, [pix2q+strideq+8]
+ pavgb m1, [pix2q+9]
+ pavgb m2, [pix2q+strideq+9]
+ psadbw m1, [pix1q+8]
+ psadbw m2, [pix1q+strideq+8]
+ paddw m0, m1
+ paddw m0, m2
+%endif
+ sub hd, 2
+
+align 16
+.loop:
+ lea pix1q, [pix1q+2*strideq]
+ lea pix2q, [pix2q+2*strideq]
+ movu m1, [pix2q]
+ movu m2, [pix2q+strideq]
+%if mmsize == 16
+ movu m3, [pix2q+1]
+ movu m4, [pix2q+strideq+1]
+ pavgb m1, m3
+ pavgb m2, m4
+%else
+ pavgb m1, [pix2q+1]
+ pavgb m2, [pix2q+strideq+1]
+%endif
+ psadbw m1, [pix1q]
+ psadbw m2, [pix1q+strideq]
+ paddw m0, m1
+ paddw m0, m2
+%if %1 != mmsize
+ movu m1, [pix2q+8]
+ movu m2, [pix2q+strideq+8]
+ pavgb m1, [pix2q+9]
+ pavgb m2, [pix2q+strideq+9]
+ psadbw m1, [pix1q+8]
+ psadbw m2, [pix1q+strideq+8]
+ paddw m0, m1
+ paddw m0, m2
+%endif
+ sub hd, 2
+ jg .loop
+%if mmsize == 16
+ movhlps m1, m0
+ paddw m0, m1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_X2 8
+SAD_X2 16
+INIT_XMM sse2
+SAD_X2 16
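The _x2 variants compare pix1 against the horizontal half-pel interpolation of pix2, i.e. each reference sample is the rounded average of a pixel and its right neighbour (pavgb). A scalar sketch (illustrative; like the asm it reads one column past the block width):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of ff_sad*_x2. */
static int sad_x2_ref(const uint8_t *pix1, const uint8_t *pix2,
                      ptrdiff_t stride, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int avg = (pix2[x] + pix2[x + 1] + 1) >> 1;  /* pavgb rounding */
            int d   = pix1[x] - avg;
            sum    += d < 0 ? -d : d;
        }
        pix1 += stride;
        pix2 += stride;
    }
    return sum;
}

The _y2 and _approx_xy2 kernels below follow the same pattern with vertical and diagonal half-pel references.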
+
+;------------------------------------------------------------------------------------------
+;int ff_sad_y2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
+;------------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD_Y2 1
+cglobal sad%1_y2, 5, 5, 4, v, pix1, pix2, stride, h
+ movu m1, [pix2q]
+ movu m0, [pix2q+strideq]
+ movu m3, [pix2q+2*strideq]
+ pavgb m1, m0
+ pavgb m0, m3
+ psadbw m1, [pix1q]
+ psadbw m0, [pix1q+strideq]
+ paddw m0, m1
+ mova m1, m3
+%if %1 != mmsize
+ movu m4, [pix2q+8]
+ movu m5, [pix2q+strideq+8]
+ movu m6, [pix2q+2*strideq+8]
+ pavgb m4, m5
+ pavgb m5, m6
+ psadbw m4, [pix1q+8]
+ psadbw m5, [pix1q+strideq+8]
+ paddw m0, m4
+ paddw m0, m5
+ mova m4, m6
+%endif
+ add pix2q, strideq
+ sub hd, 2
+
+align 16
+.loop:
+ lea pix1q, [pix1q+2*strideq]
+ lea pix2q, [pix2q+2*strideq]
+ movu m2, [pix2q]
+ movu m3, [pix2q+strideq]
+ pavgb m1, m2
+ pavgb m2, m3
+ psadbw m1, [pix1q]
+ psadbw m2, [pix1q+strideq]
+ paddw m0, m1
+ paddw m0, m2
+ mova m1, m3
+%if %1 != mmsize
+ movu m5, [pix2q+8]
+ movu m6, [pix2q+strideq+8]
+ pavgb m4, m5
+ pavgb m5, m6
+ psadbw m4, [pix1q+8]
+ psadbw m5, [pix1q+strideq+8]
+ paddw m0, m4
+ paddw m0, m5
+ mova m4, m6
+%endif
+ sub hd, 2
+ jg .loop
+%if mmsize == 16
+ movhlps m1, m0
+ paddw m0, m1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_Y2 8
+SAD_Y2 16
+INIT_XMM sse2
+SAD_Y2 16
+
+;-------------------------------------------------------------------------------------------
+;int ff_sad_approx_xy2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
+;-------------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD_APPROX_XY2 1
+cglobal sad%1_approx_xy2, 5, 5, 7, v, pix1, pix2, stride, h
+ mova m4, [pb_1]
+ movu m1, [pix2q]
+ movu m0, [pix2q+strideq]
+ movu m3, [pix2q+2*strideq]
+%if mmsize == 16
+ movu m5, [pix2q+1]
+ movu m6, [pix2q+strideq+1]
+ movu m2, [pix2q+2*strideq+1]
+ pavgb m1, m5
+ pavgb m0, m6
+ pavgb m3, m2
+%else
+ pavgb m1, [pix2q+1]
+ pavgb m0, [pix2q+strideq+1]
+ pavgb m3, [pix2q+2*strideq+1]
+%endif
+ psubusb m0, m4
+ pavgb m1, m0
+ pavgb m0, m3
+ psadbw m1, [pix1q]
+ psadbw m0, [pix1q+strideq]
+ paddw m0, m1
+ mova m1, m3
+%if %1 != mmsize
+ movu m5, [pix2q+8]
+ movu m6, [pix2q+strideq+8]
+ movu m7, [pix2q+2*strideq+8]
+ pavgb m5, [pix2q+1+8]
+ pavgb m6, [pix2q+strideq+1+8]
+ pavgb m7, [pix2q+2*strideq+1+8]
+ psubusb m6, m4
+ pavgb m5, m6
+ pavgb m6, m7
+ psadbw m5, [pix1q+8]
+ psadbw m6, [pix1q+strideq+8]
+ paddw m0, m5
+ paddw m0, m6
+ mova m5, m7
+%endif
+ add pix2q, strideq
+ sub hd, 2
+
+align 16
+.loop:
+ lea pix1q, [pix1q+2*strideq]
+ lea pix2q, [pix2q+2*strideq]
+ movu m2, [pix2q]
+ movu m3, [pix2q+strideq]
+%if mmsize == 16
+ movu m5, [pix2q+1]
+ movu m6, [pix2q+strideq+1]
+ pavgb m2, m5
+ pavgb m3, m6
+%else
+ pavgb m2, [pix2q+1]
+ pavgb m3, [pix2q+strideq+1]
+%endif
+ psubusb m2, m4
+ pavgb m1, m2
+ pavgb m2, m3
+ psadbw m1, [pix1q]
+ psadbw m2, [pix1q+strideq]
+ paddw m0, m1
+ paddw m0, m2
+ mova m1, m3
+%if %1 != mmsize
+ movu m6, [pix2q+8]
+ movu m7, [pix2q+strideq+8]
+ pavgb m6, [pix2q+8+1]
+ pavgb m7, [pix2q+strideq+8+1]
+ psubusb m6, m4
+ pavgb m5, m6
+ pavgb m6, m7
+ psadbw m5, [pix1q+8]
+ psadbw m6, [pix1q+strideq+8]
+ paddw m0, m5
+ paddw m0, m6
+ mova m5, m7
+%endif
+ sub hd, 2
+ jg .loop
+%if mmsize == 16
+ movhlps m1, m0
+ paddw m0, m1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_APPROX_XY2 8
+SAD_APPROX_XY2 16
+INIT_XMM sse2
+SAD_APPROX_XY2 16
+
+;--------------------------------------------------------------------
+;int ff_vsad_intra(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+; ptrdiff_t line_size, int h);
+;--------------------------------------------------------------------
+; %1 = 8/16
+%macro VSAD_INTRA 1
+cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
+ mova m0, [pix1q]
+%if %1 == mmsize
+ mova m2, [pix1q+lsizeq]
+ psadbw m0, m2
+%else
+ mova m2, [pix1q+lsizeq]
+ mova m3, [pix1q+8]
+ mova m4, [pix1q+lsizeq+8]
+ psadbw m0, m2
+ psadbw m3, m4
+ paddw m0, m3
+%endif
+ sub hd, 2
+
+.loop:
+ lea pix1q, [pix1q + 2*lsizeq]
+%if %1 == mmsize
+ mova m1, [pix1q]
+ psadbw m2, m1
+ paddw m0, m2
+ mova m2, [pix1q+lsizeq]
+ psadbw m1, m2
+ paddw m0, m1
+%else
+ mova m1, [pix1q]
+ mova m3, [pix1q+8]
+ psadbw m2, m1
+ psadbw m4, m3
+ paddw m0, m2
+ paddw m0, m4
+ mova m2, [pix1q+lsizeq]
+ mova m4, [pix1q+lsizeq+8]
+ psadbw m1, m2
+ psadbw m3, m4
+ paddw m0, m1
+ paddw m0, m3
+%endif
+ sub hd, 2
+ jg .loop
+
+%if mmsize == 16
+ pshufd m1, m0, 0xe
+ paddd m0, m1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+VSAD_INTRA 8
+VSAD_INTRA 16
+INIT_XMM sse2
+VSAD_INTRA 16
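vsad_intra measures vertical activity within a single block: the SAD between each row and the row below it. A scalar sketch (illustrative helper name):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of ff_vsad_intra{8,16}. */
static int vsad_intra_ref(const uint8_t *pix, ptrdiff_t line_size, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h - 1; y++) {
        for (int x = 0; x < w; x++) {
            int d = pix[x] - pix[line_size + x];
            sum  += d < 0 ? -d : d;
        }
        pix += line_size;
    }
    return sum;
}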
+
+;---------------------------------------------------------------------
+;int ff_vsad_approx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+; ptrdiff_t line_size, int h);
+;---------------------------------------------------------------------
+; %1 = 8/16
+%macro VSAD_APPROX 1
+cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
+ mova m1, [pb_80]
+ mova m0, [pix1q]
+%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
+ mova m4, [pix1q+lsizeq]
+%if mmsize == 16
+ movu m3, [pix2q]
+ movu m2, [pix2q+lsizeq]
+ psubb m0, m3
+ psubb m4, m2
+%else
+ psubb m0, [pix2q]
+ psubb m4, [pix2q+lsizeq]
+%endif
+ pxor m0, m1
+ pxor m4, m1
+ psadbw m0, m4
+%else ; vsad16_mmxext
+ mova m3, [pix1q+8]
+ psubb m0, [pix2q]
+ psubb m3, [pix2q+8]
+ pxor m0, m1
+ pxor m3, m1
+ mova m4, [pix1q+lsizeq]
+ mova m5, [pix1q+lsizeq+8]
+ psubb m4, [pix2q+lsizeq]
+ psubb m5, [pix2q+lsizeq+8]
+ pxor m4, m1
+ pxor m5, m1
+ psadbw m0, m4
+ psadbw m3, m5
+ paddw m0, m3
+%endif
+ sub hd, 2
+
+.loop:
+ lea pix1q, [pix1q + 2*lsizeq]
+ lea pix2q, [pix2q + 2*lsizeq]
+ mova m2, [pix1q]
+%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
+%if mmsize == 16
+ movu m3, [pix2q]
+ psubb m2, m3
+%else
+ psubb m2, [pix2q]
+%endif
+ pxor m2, m1
+ psadbw m4, m2
+ paddw m0, m4
+ mova m4, [pix1q+lsizeq]
+ movu m3, [pix2q+lsizeq]
+ psubb m4, m3
+ pxor m4, m1
+ psadbw m2, m4
+ paddw m0, m2
+%else ; vsad16_mmxext
+ mova m3, [pix1q+8]
+ psubb m2, [pix2q]
+ psubb m3, [pix2q+8]
+ pxor m2, m1
+ pxor m3, m1
+ psadbw m4, m2
+ psadbw m5, m3
+ paddw m0, m4
+ paddw m0, m5
+ mova m4, [pix1q+lsizeq]
+ mova m5, [pix1q+lsizeq+8]
+ psubb m4, [pix2q+lsizeq]
+ psubb m5, [pix2q+lsizeq+8]
+ pxor m4, m1
+ pxor m5, m1
+ psadbw m2, m4
+ psadbw m3, m5
+ paddw m0, m2
+ paddw m0, m3
+%endif
+ sub hd, 2
+ jg .loop
+
+%if mmsize == 16
+ pshufd m1, m0, 0xe
+ paddd m0, m1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+VSAD_APPROX 8
+VSAD_APPROX 16
+INIT_XMM sse2
+VSAD_APPROX 16
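vsad_approx estimates the vertical activity of the residual pix1 - pix2; the asm forms the per-pixel residual with wrapping byte subtraction plus a 0x80 bias before psadbw, hence "approx". A scalar sketch of the quantity it approximates (illustrative helper name):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of the metric ff_vsad*_approx estimates. */
static int vsad_approx_ref(const uint8_t *pix1, const uint8_t *pix2,
                           ptrdiff_t line_size, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h - 1; y++) {
        for (int x = 0; x < w; x++) {
            int d = (pix1[x] - pix2[x]) - (pix1[line_size + x] - pix2[line_size + x]);
            sum  += d < 0 ? -d : d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}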
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index ee5f559547..dc3e6f8668 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -5,20 +5,20 @@
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -29,382 +29,67 @@
#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideo.h"
-#if HAVE_INLINE_ASM
-
-static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
- ptrdiff_t stride, int h)
-{
- int tmp;
-
- __asm__ volatile (
- "movl %4, %%ecx \n"
- "shr $1, %%ecx \n"
- "pxor %%mm0, %%mm0 \n" /* mm0 = 0 */
- "pxor %%mm7, %%mm7 \n" /* mm7 holds the sum */
- "1: \n"
- "movq (%0), %%mm1 \n" /* mm1 = pix1[0][0 - 7] */
- "movq (%1), %%mm2 \n" /* mm2 = pix2[0][0 - 7] */
- "movq (%0, %3), %%mm3 \n" /* mm3 = pix1[1][0 - 7] */
- "movq (%1, %3), %%mm4 \n" /* mm4 = pix2[1][0 - 7] */
-
- /* todo: mm1-mm2, mm3-mm4 */
- /* algo: subtract mm1 from mm2 with saturation and vice versa */
- /* OR the results to get absolute difference */
- "movq %%mm1, %%mm5 \n"
- "movq %%mm3, %%mm6 \n"
- "psubusb %%mm2, %%mm1 \n"
- "psubusb %%mm4, %%mm3 \n"
- "psubusb %%mm5, %%mm2 \n"
- "psubusb %%mm6, %%mm4 \n"
-
- "por %%mm1, %%mm2 \n"
- "por %%mm3, %%mm4 \n"
-
- /* now convert to 16-bit vectors so we can square them */
- "movq %%mm2, %%mm1 \n"
- "movq %%mm4, %%mm3 \n"
-
- "punpckhbw %%mm0, %%mm2 \n"
- "punpckhbw %%mm0, %%mm4 \n"
- "punpcklbw %%mm0, %%mm1 \n" /* mm1 now spread over (mm1, mm2) */
- "punpcklbw %%mm0, %%mm3 \n" /* mm4 now spread over (mm3, mm4) */
-
- "pmaddwd %%mm2, %%mm2 \n"
- "pmaddwd %%mm4, %%mm4 \n"
- "pmaddwd %%mm1, %%mm1 \n"
- "pmaddwd %%mm3, %%mm3 \n"
-
- "lea (%0, %3, 2), %0 \n" /* pix1 += 2 * stride */
- "lea (%1, %3, 2), %1 \n" /* pix2 += 2 * stride */
-
- "paddd %%mm2, %%mm1 \n"
- "paddd %%mm4, %%mm3 \n"
- "paddd %%mm1, %%mm7 \n"
- "paddd %%mm3, %%mm7 \n"
-
- "decl %%ecx \n"
- "jnz 1b \n"
-
- "movq %%mm7, %%mm1 \n"
- "psrlq $32, %%mm7 \n" /* shift hi dword to lo */
- "paddd %%mm7, %%mm1 \n"
- "movd %%mm1, %2 \n"
- : "+r" (pix1), "+r" (pix2), "=r" (tmp)
- : "r" (stride), "m" (h)
- : "%ecx");
-
- return tmp;
-}
-
-static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
- ptrdiff_t stride, int h)
-{
- int tmp;
-
- __asm__ volatile (
- "movl %4, %%ecx\n"
- "pxor %%mm0, %%mm0\n" /* mm0 = 0 */
- "pxor %%mm7, %%mm7\n" /* mm7 holds the sum */
- "1:\n"
- "movq (%0), %%mm1\n" /* mm1 = pix1[0 - 7] */
- "movq (%1), %%mm2\n" /* mm2 = pix2[0 - 7] */
- "movq 8(%0), %%mm3\n" /* mm3 = pix1[8 - 15] */
- "movq 8(%1), %%mm4\n" /* mm4 = pix2[8 - 15] */
-
- /* todo: mm1-mm2, mm3-mm4 */
- /* algo: subtract mm1 from mm2 with saturation and vice versa */
- /* OR the results to get absolute difference */
- "movq %%mm1, %%mm5\n"
- "movq %%mm3, %%mm6\n"
- "psubusb %%mm2, %%mm1\n"
- "psubusb %%mm4, %%mm3\n"
- "psubusb %%mm5, %%mm2\n"
- "psubusb %%mm6, %%mm4\n"
-
- "por %%mm1, %%mm2\n"
- "por %%mm3, %%mm4\n"
-
- /* now convert to 16-bit vectors so we can square them */
- "movq %%mm2, %%mm1\n"
- "movq %%mm4, %%mm3\n"
-
- "punpckhbw %%mm0, %%mm2\n"
- "punpckhbw %%mm0, %%mm4\n"
- "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */
- "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */
-
- "pmaddwd %%mm2, %%mm2\n"
- "pmaddwd %%mm4, %%mm4\n"
- "pmaddwd %%mm1, %%mm1\n"
- "pmaddwd %%mm3, %%mm3\n"
-
- "add %3, %0\n"
- "add %3, %1\n"
-
- "paddd %%mm2, %%mm1\n"
- "paddd %%mm4, %%mm3\n"
- "paddd %%mm1, %%mm7\n"
- "paddd %%mm3, %%mm7\n"
-
- "decl %%ecx\n"
- "jnz 1b\n"
-
- "movq %%mm7, %%mm1\n"
- "psrlq $32, %%mm7\n" /* shift hi dword to lo */
- "paddd %%mm7, %%mm1\n"
- "movd %%mm1, %2\n"
- : "+r" (pix1), "+r" (pix2), "=r" (tmp)
- : "r" (stride), "m" (h)
- : "%ecx");
-
- return tmp;
-}
-
-static int hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
-{
- int tmp;
-
- __asm__ volatile (
- "movl %3, %%ecx\n"
- "pxor %%mm7, %%mm7\n"
- "pxor %%mm6, %%mm6\n"
-
- "movq (%0), %%mm0\n"
- "movq %%mm0, %%mm1\n"
- "psllq $8, %%mm0\n"
- "psrlq $8, %%mm1\n"
- "psrlq $8, %%mm0\n"
- "movq %%mm0, %%mm2\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm0\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm2\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm0\n"
- "psubw %%mm3, %%mm2\n"
-
- "add %2, %0\n"
-
- "movq (%0), %%mm4\n"
- "movq %%mm4, %%mm1\n"
- "psllq $8, %%mm4\n"
- "psrlq $8, %%mm1\n"
- "psrlq $8, %%mm4\n"
- "movq %%mm4, %%mm5\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm4\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm5\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm4\n"
- "psubw %%mm3, %%mm5\n"
- "psubw %%mm4, %%mm0\n"
- "psubw %%mm5, %%mm2\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm0, %%mm3\n\t"
- "pcmpgtw %%mm2, %%mm1\n\t"
- "pxor %%mm3, %%mm0\n"
- "pxor %%mm1, %%mm2\n"
- "psubw %%mm3, %%mm0\n"
- "psubw %%mm1, %%mm2\n"
- "paddw %%mm0, %%mm2\n"
- "paddw %%mm2, %%mm6\n"
-
- "add %2, %0\n"
- "1:\n"
-
- "movq (%0), %%mm0\n"
- "movq %%mm0, %%mm1\n"
- "psllq $8, %%mm0\n"
- "psrlq $8, %%mm1\n"
- "psrlq $8, %%mm0\n"
- "movq %%mm0, %%mm2\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm0\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm2\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm0\n"
- "psubw %%mm3, %%mm2\n"
- "psubw %%mm0, %%mm4\n"
- "psubw %%mm2, %%mm5\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm4, %%mm3\n\t"
- "pcmpgtw %%mm5, %%mm1\n\t"
- "pxor %%mm3, %%mm4\n"
- "pxor %%mm1, %%mm5\n"
- "psubw %%mm3, %%mm4\n"
- "psubw %%mm1, %%mm5\n"
- "paddw %%mm4, %%mm5\n"
- "paddw %%mm5, %%mm6\n"
-
- "add %2, %0\n"
-
- "movq (%0), %%mm4\n"
- "movq %%mm4, %%mm1\n"
- "psllq $8, %%mm4\n"
- "psrlq $8, %%mm1\n"
- "psrlq $8, %%mm4\n"
- "movq %%mm4, %%mm5\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm4\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm5\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm4\n"
- "psubw %%mm3, %%mm5\n"
- "psubw %%mm4, %%mm0\n"
- "psubw %%mm5, %%mm2\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm0, %%mm3\n\t"
- "pcmpgtw %%mm2, %%mm1\n\t"
- "pxor %%mm3, %%mm0\n"
- "pxor %%mm1, %%mm2\n"
- "psubw %%mm3, %%mm0\n"
- "psubw %%mm1, %%mm2\n"
- "paddw %%mm0, %%mm2\n"
- "paddw %%mm2, %%mm6\n"
-
- "add %2, %0\n"
- "subl $2, %%ecx\n"
- " jnz 1b\n"
-
- "movq %%mm6, %%mm0\n"
- "punpcklwd %%mm7, %%mm0\n"
- "punpckhwd %%mm7, %%mm6\n"
- "paddd %%mm0, %%mm6\n"
-
- "movq %%mm6, %%mm0\n"
- "psrlq $32, %%mm6\n"
- "paddd %%mm6, %%mm0\n"
- "movd %%mm0, %1\n"
- : "+r" (pix1), "=r" (tmp)
- : "r" (stride), "g" (h - 2)
- : "%ecx");
-
- return tmp;
-}
-
-static int hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
-{
- int tmp;
- uint8_t *pix = pix1;
-
- __asm__ volatile (
- "movl %3, %%ecx\n"
- "pxor %%mm7, %%mm7\n"
- "pxor %%mm6, %%mm6\n"
-
- "movq (%0), %%mm0\n"
- "movq 1(%0), %%mm1\n"
- "movq %%mm0, %%mm2\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm0\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm2\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm0\n"
- "psubw %%mm3, %%mm2\n"
-
- "add %2, %0\n"
-
- "movq (%0), %%mm4\n"
- "movq 1(%0), %%mm1\n"
- "movq %%mm4, %%mm5\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm4\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm5\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm4\n"
- "psubw %%mm3, %%mm5\n"
- "psubw %%mm4, %%mm0\n"
- "psubw %%mm5, %%mm2\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm0, %%mm3\n\t"
- "pcmpgtw %%mm2, %%mm1\n\t"
- "pxor %%mm3, %%mm0\n"
- "pxor %%mm1, %%mm2\n"
- "psubw %%mm3, %%mm0\n"
- "psubw %%mm1, %%mm2\n"
- "paddw %%mm0, %%mm2\n"
- "paddw %%mm2, %%mm6\n"
-
- "add %2, %0\n"
- "1:\n"
-
- "movq (%0), %%mm0\n"
- "movq 1(%0), %%mm1\n"
- "movq %%mm0, %%mm2\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm0\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm2\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm0\n"
- "psubw %%mm3, %%mm2\n"
- "psubw %%mm0, %%mm4\n"
- "psubw %%mm2, %%mm5\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm4, %%mm3\n\t"
- "pcmpgtw %%mm5, %%mm1\n\t"
- "pxor %%mm3, %%mm4\n"
- "pxor %%mm1, %%mm5\n"
- "psubw %%mm3, %%mm4\n"
- "psubw %%mm1, %%mm5\n"
- "paddw %%mm4, %%mm5\n"
- "paddw %%mm5, %%mm6\n"
-
- "add %2, %0\n"
-
- "movq (%0), %%mm4\n"
- "movq 1(%0), %%mm1\n"
- "movq %%mm4, %%mm5\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm4\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm5\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm4\n"
- "psubw %%mm3, %%mm5\n"
- "psubw %%mm4, %%mm0\n"
- "psubw %%mm5, %%mm2\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm0, %%mm3\n\t"
- "pcmpgtw %%mm2, %%mm1\n\t"
- "pxor %%mm3, %%mm0\n"
- "pxor %%mm1, %%mm2\n"
- "psubw %%mm3, %%mm0\n"
- "psubw %%mm1, %%mm2\n"
- "paddw %%mm0, %%mm2\n"
- "paddw %%mm2, %%mm6\n"
-
- "add %2, %0\n"
- "subl $2, %%ecx\n"
- " jnz 1b\n"
-
- "movq %%mm6, %%mm0\n"
- "punpcklwd %%mm7, %%mm0\n"
- "punpckhwd %%mm7, %%mm6\n"
- "paddd %%mm0, %%mm6\n"
+int ff_sum_abs_dctelem_mmx(int16_t *block);
+int ff_sum_abs_dctelem_mmxext(int16_t *block);
+int ff_sum_abs_dctelem_sse2(int16_t *block);
+int ff_sum_abs_dctelem_ssse3(int16_t *block);
+int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
+int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
+int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
- "movq %%mm6, %%mm0\n"
- "psrlq $32, %%mm6\n"
- "paddd %%mm6, %%mm0\n"
- "movd %%mm0, %1\n"
- : "+r" (pix1), "=r" (tmp)
- : "r" (stride), "g" (h - 2)
- : "%ecx");
+#define hadamard_func(cpu) \
+ int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
+ uint8_t *src2, ptrdiff_t stride, int h); \
+ int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
+ uint8_t *src2, ptrdiff_t stride, int h);
- return tmp + hf_noise8_mmx(pix + 8, stride, h);
-}
+hadamard_func(mmx)
+hadamard_func(mmxext)
+hadamard_func(sse2)
+hadamard_func(ssse3)
+#if HAVE_YASM
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h)
{
@@ -413,9 +98,9 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
if (c)
score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
else
- score1 = sse16_mmx(c, pix1, pix2, stride, h);
- score2 = hf_noise16_mmx(pix1, stride, h) -
- hf_noise16_mmx(pix2, stride, h);
+ score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
+ score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
+ - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);
if (c)
return score1 + FFABS(score2) * c->avctx->nsse_weight;
@@ -426,9 +111,9 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h)
{
- int score1 = sse8_mmx(c, pix1, pix2, stride, h);
- int score2 = hf_noise8_mmx(pix1, stride, h) -
- hf_noise8_mmx(pix2, stride, h);
+ int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
+ int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
+ ff_hf_noise8_mmx(pix2, stride, h);
if (c)
return score1 + FFABS(score2) * c->avctx->nsse_weight;
@@ -436,13 +121,17 @@ static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
return score1 + FFABS(score2) * 8;
}
+#endif /* HAVE_YASM */
+
+#if HAVE_INLINE_ASM
+
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
ptrdiff_t stride, int h)
{
int tmp;
- assert((((int) pix) & 7) == 0);
- assert((stride & 7) == 0);
+ av_assert2((((int) pix) & 7) == 0);
+ av_assert2((stride & 7) == 0);
#define SUM(in0, in1, out0, out1) \
"movq (%0), %%mm2\n" \
@@ -500,57 +189,14 @@ static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
}
#undef SUM
-static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
- ptrdiff_t stride, int h)
-{
- int tmp;
-
- assert((((int) pix) & 7) == 0);
- assert((stride & 7) == 0);
-
-#define SUM(in0, in1, out0, out1) \
- "movq (%0), " #out0 "\n" \
- "movq 8(%0), " #out1 "\n" \
- "add %2, %0\n" \
- "psadbw " #out0 ", " #in0 "\n" \
- "psadbw " #out1 ", " #in1 "\n" \
- "paddw " #in1 ", " #in0 "\n" \
- "paddw " #in0 ", %%mm6\n"
-
- __asm__ volatile (
- "movl %3, %%ecx\n"
- "pxor %%mm6, %%mm6\n"
- "pxor %%mm7, %%mm7\n"
- "movq (%0), %%mm0\n"
- "movq 8(%0), %%mm1\n"
- "add %2, %0\n"
- "jmp 2f\n"
- "1:\n"
-
- SUM(%%mm4, %%mm5, %%mm0, %%mm1)
- "2:\n"
- SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
- "subl $2, %%ecx\n"
- "jnz 1b\n"
-
- "movd %%mm6, %1\n"
- : "+r" (pix), "=r" (tmp)
- : "r" (stride), "m" (h)
- : "%ecx");
-
- return tmp;
-}
-#undef SUM
-
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h)
{
int tmp;
- assert((((int) pix1) & 7) == 0);
- assert((((int) pix2) & 7) == 0);
- assert((stride & 7) == 0);
+ av_assert2((((int) pix1) & 7) == 0);
+ av_assert2((((int) pix2) & 7) == 0);
+ av_assert2((stride & 7) == 0);
#define SUM(in0, in1, out0, out1) \
"movq (%0), %%mm2\n" \
@@ -624,191 +270,16 @@ static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
}
#undef SUM
-static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
- ptrdiff_t stride, int h)
-{
- int tmp;
-
- assert((((int) pix1) & 7) == 0);
- assert((((int) pix2) & 7) == 0);
- assert((stride & 7) == 0);
-
-#define SUM(in0, in1, out0, out1) \
- "movq (%0), " #out0 "\n" \
- "movq (%1), %%mm2\n" \
- "movq 8(%0), " #out1 "\n" \
- "movq 8(%1), %%mm3\n" \
- "add %3, %0\n" \
- "add %3, %1\n" \
- "psubb %%mm2, " #out0 "\n" \
- "psubb %%mm3, " #out1 "\n" \
- "pxor %%mm7, " #out0 "\n" \
- "pxor %%mm7, " #out1 "\n" \
- "psadbw " #out0 ", " #in0 "\n" \
- "psadbw " #out1 ", " #in1 "\n" \
- "paddw " #in1 ", " #in0 "\n" \
- "paddw " #in0 ", %%mm6\n "
-
- __asm__ volatile (
- "movl %4, %%ecx\n"
- "pxor %%mm6, %%mm6\n"
- "pcmpeqw %%mm7, %%mm7\n"
- "psllw $15, %%mm7\n"
- "packsswb %%mm7, %%mm7\n"
- "movq (%0), %%mm0\n"
- "movq (%1), %%mm2\n"
- "movq 8(%0), %%mm1\n"
- "movq 8(%1), %%mm3\n"
- "add %3, %0\n"
- "add %3, %1\n"
- "psubb %%mm2, %%mm0\n"
- "psubb %%mm3, %%mm1\n"
- "pxor %%mm7, %%mm0\n"
- "pxor %%mm7, %%mm1\n"
- "jmp 2f\n"
- "1:\n"
-
- SUM(%%mm4, %%mm5, %%mm0, %%mm1)
- "2:\n"
- SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
- "subl $2, %%ecx\n"
- "jnz 1b\n"
-
- "movd %%mm6, %2\n"
- : "+r" (pix1), "+r" (pix2), "=r" (tmp)
- : "r" (stride), "m" (h)
- : "%ecx");
-
- return tmp;
-}
-#undef SUM
-
-#define MMABS_MMX(a,z) \
- "pxor " #z ", " #z " \n\t" \
- "pcmpgtw " #a ", " #z " \n\t" \
- "pxor " #z ", " #a " \n\t" \
- "psubw " #z ", " #a " \n\t"
-
-#define MMABS_MMXEXT(a, z) \
- "pxor " #z ", " #z " \n\t" \
- "psubw " #a ", " #z " \n\t" \
- "pmaxsw " #z ", " #a " \n\t"
-
-#define MMABS_SSSE3(a,z) \
- "pabsw " #a ", " #a " \n\t"
-
-#define MMABS_SUM(a,z, sum) \
- MMABS(a,z) \
- "paddusw " #a ", " #sum " \n\t"
-
-/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
- * up to about 100k on extreme inputs. But that's very unlikely to occur in
- * natural video, and it's even more unlikely to not have any alternative
- * mvs/modes with lower cost. */
-#define HSUM_MMX(a, t, dst) \
- "movq " #a ", " #t " \n\t" \
- "psrlq $32, " #a " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "movq " #a ", " #t " \n\t" \
- "psrlq $16, " #a " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "movd " #a ", " #dst " \n\t" \
-
-#define HSUM_MMXEXT(a, t, dst) \
- "pshufw $0x0E, " #a ", " #t " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "pshufw $0x01, " #a ", " #t " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "movd " #a ", " #dst " \n\t" \
-
-#define HSUM_SSE2(a, t, dst) \
- "movhlps " #a ", " #t " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "pshuflw $0x0E, " #a ", " #t " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "pshuflw $0x01, " #a ", " #t " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "movd " #a ", " #dst " \n\t" \
-
-#define DCT_SAD4(m, mm, o) \
- "mov"#m" "#o" + 0(%1), " #mm "2 \n\t" \
- "mov"#m" "#o" + 16(%1), " #mm "3 \n\t" \
- "mov"#m" "#o" + 32(%1), " #mm "4 \n\t" \
- "mov"#m" "#o" + 48(%1), " #mm "5 \n\t" \
- MMABS_SUM(mm ## 2, mm ## 6, mm ## 0) \
- MMABS_SUM(mm ## 3, mm ## 7, mm ## 1) \
- MMABS_SUM(mm ## 4, mm ## 6, mm ## 0) \
- MMABS_SUM(mm ## 5, mm ## 7, mm ## 1) \
-
-#define DCT_SAD_MMX \
- "pxor %%mm0, %%mm0 \n\t" \
- "pxor %%mm1, %%mm1 \n\t" \
- DCT_SAD4(q, %%mm, 0) \
- DCT_SAD4(q, %%mm, 8) \
- DCT_SAD4(q, %%mm, 64) \
- DCT_SAD4(q, %%mm, 72) \
- "paddusw %%mm1, %%mm0 \n\t" \
- HSUM(%%mm0, %%mm1, %0)
-
-#define DCT_SAD_SSE2 \
- "pxor %%xmm0, %%xmm0 \n\t" \
- "pxor %%xmm1, %%xmm1 \n\t" \
- DCT_SAD4(dqa, %%xmm, 0) \
- DCT_SAD4(dqa, %%xmm, 64) \
- "paddusw %%xmm1, %%xmm0 \n\t" \
- HSUM(%%xmm0, %%xmm1, %0)
-
-#define DCT_SAD_FUNC(cpu) \
-static int sum_abs_dctelem_ ## cpu(int16_t *block) \
-{ \
- int sum; \
- __asm__ volatile ( \
- DCT_SAD \
- :"=r"(sum) \
- :"r"(block)); \
- return sum & 0xFFFF; \
-}
-
-#define DCT_SAD DCT_SAD_MMX
-#define HSUM(a, t, dst) HSUM_MMX(a, t, dst)
-#define MMABS(a, z) MMABS_MMX(a, z)
-DCT_SAD_FUNC(mmx)
-#undef MMABS
-#undef HSUM
-
-#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst)
-#define MMABS(a, z) MMABS_MMXEXT(a, z)
-DCT_SAD_FUNC(mmxext)
-#undef HSUM
-#undef DCT_SAD
-
-#define DCT_SAD DCT_SAD_SSE2
-#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst)
-DCT_SAD_FUNC(sse2)
-#undef MMABS
-
-#if HAVE_SSSE3_INLINE
-#define MMABS(a, z) MMABS_SSSE3(a, z)
-DCT_SAD_FUNC(ssse3)
-#undef MMABS
-#endif
-#undef HSUM
-#undef DCT_SAD
-
-
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
0x0000000000000000ULL,
0x0001000100010001ULL,
0x0002000200020002ULL,
};
-DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;
-
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h)
{
- x86_reg len = -(stride * h);
+ x86_reg len = -stride * h;
__asm__ volatile (
".p2align 4 \n\t"
"1: \n\t"
@@ -841,133 +312,10 @@ static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
: "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
}
-static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
- ptrdiff_t stride, int h)
-{
- __asm__ volatile (
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "psadbw (%2), %%mm0 \n\t"
- "psadbw (%2, %3), %%mm1 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "paddw %%mm1, %%mm6 \n\t"
- "lea (%1,%3,2), %1 \n\t"
- "lea (%2,%3,2), %2 \n\t"
- "sub $2, %0 \n\t"
- " jg 1b \n\t"
- : "+r" (h), "+r" (blk1), "+r" (blk2)
- : "r" (stride));
-}
-
-static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
- ptrdiff_t stride, int h)
-{
- int ret;
- __asm__ volatile (
- "pxor %%xmm2, %%xmm2 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movdqu (%1), %%xmm0 \n\t"
- "movdqu (%1, %4), %%xmm1 \n\t"
- "psadbw (%2), %%xmm0 \n\t"
- "psadbw (%2, %4), %%xmm1 \n\t"
- "paddw %%xmm0, %%xmm2 \n\t"
- "paddw %%xmm1, %%xmm2 \n\t"
- "lea (%1,%4,2), %1 \n\t"
- "lea (%2,%4,2), %2 \n\t"
- "sub $2, %0 \n\t"
- " jg 1b \n\t"
- "movhlps %%xmm2, %%xmm0 \n\t"
- "paddw %%xmm0, %%xmm2 \n\t"
- "movd %%xmm2, %3 \n\t"
- : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
- : "r" (stride));
- return ret;
-}
-
-static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
- ptrdiff_t stride, int h)
-{
- __asm__ volatile (
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "pavgb 1(%1), %%mm0 \n\t"
- "pavgb 1(%1, %3), %%mm1 \n\t"
- "psadbw (%2), %%mm0 \n\t"
- "psadbw (%2, %3), %%mm1 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "paddw %%mm1, %%mm6 \n\t"
- "lea (%1,%3,2), %1 \n\t"
- "lea (%2,%3,2), %2 \n\t"
- "sub $2, %0 \n\t"
- " jg 1b \n\t"
- : "+r" (h), "+r" (blk1), "+r" (blk2)
- : "r" (stride));
-}
-
-static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
- ptrdiff_t stride, int h)
-{
- __asm__ volatile (
- "movq (%1), %%mm0 \n\t"
- "add %3, %1 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "pavgb %%mm1, %%mm0 \n\t"
- "pavgb %%mm2, %%mm1 \n\t"
- "psadbw (%2), %%mm0 \n\t"
- "psadbw (%2, %3), %%mm1 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "paddw %%mm1, %%mm6 \n\t"
- "movq %%mm2, %%mm0 \n\t"
- "lea (%1,%3,2), %1 \n\t"
- "lea (%2,%3,2), %2 \n\t"
- "sub $2, %0 \n\t"
- " jg 1b \n\t"
- : "+r" (h), "+r" (blk1), "+r" (blk2)
- : "r" (stride));
-}
-
-static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
- ptrdiff_t stride, int h)
-{
- __asm__ volatile (
- "movq "MANGLE(bone)", %%mm5 \n\t"
- "movq (%1), %%mm0 \n\t"
- "pavgb 1(%1), %%mm0 \n\t"
- "add %3, %1 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1), %%mm1 \n\t"
- "movq (%1,%3), %%mm2 \n\t"
- "pavgb 1(%1), %%mm1 \n\t"
- "pavgb 1(%1,%3), %%mm2 \n\t"
- "psubusb %%mm5, %%mm1 \n\t"
- "pavgb %%mm1, %%mm0 \n\t"
- "pavgb %%mm2, %%mm1 \n\t"
- "psadbw (%2), %%mm0 \n\t"
- "psadbw (%2,%3), %%mm1 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "paddw %%mm1, %%mm6 \n\t"
- "movq %%mm2, %%mm0 \n\t"
- "lea (%1,%3,2), %1 \n\t"
- "lea (%2,%3,2), %2 \n\t"
- "sub $2, %0 \n\t"
- " jg 1b \n\t"
- : "+r" (h), "+r" (blk1), "+r" (blk2)
- : "r" (stride));
-}
-
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
ptrdiff_t stride, int h)
{
- x86_reg len = -(stride * h);
+ x86_reg len = -stride * h;
__asm__ volatile (
".p2align 4 \n\t"
"1: \n\t"
@@ -1006,7 +354,7 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h)
{
- x86_reg len = -(stride * h);
+ x86_reg len = -stride * h;
__asm__ volatile (
"movq (%1, %%"FF_REG_a"), %%mm0\n\t"
"movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
@@ -1030,7 +378,7 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
"punpckhbw %%mm7, %%mm5 \n\t"
"paddw %%mm4, %%mm2 \n\t"
"paddw %%mm5, %%mm3 \n\t"
- "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
+ "movq %5, %%mm5 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm3, %%mm1 \n\t"
"paddw %%mm5, %%mm0 \n\t"
@@ -1054,7 +402,7 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
" js 1b \n\t"
: "+a" (len)
: "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
- "r" (stride));
+ "r" (stride), "m" (round_tab[2]));
}
static inline int sum_mmx(void)
@@ -1072,15 +420,6 @@ static inline int sum_mmx(void)
return ret & 0xFFFF;
}
-static inline int sum_mmxext(void)
-{
- int ret;
- __asm__ volatile (
- "movd %%mm6, %0 \n\t"
- : "=r" (ret));
- return ret;
-}
-
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h)
{
@@ -1097,7 +436,7 @@ static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
- assert(h == 8); \
+ av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
@@ -1111,7 +450,7 @@ static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
- assert(h == 8); \
+ av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
@@ -1126,7 +465,7 @@ static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
- assert(h == 8); \
+ av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
@@ -1141,7 +480,7 @@ static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
- assert(h == 8); \
+ av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
@@ -1211,32 +550,15 @@ static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
} \
PIX_SAD(mmx)
-PIX_SAD(mmxext)
#endif /* HAVE_INLINE_ASM */
-int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
- ptrdiff_t stride, int h);
-
-#define hadamard_func(cpu) \
- int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
- uint8_t *src2, ptrdiff_t stride, int h); \
- int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
- uint8_t *src2, ptrdiff_t stride, int h);
-
-hadamard_func(mmx)
-hadamard_func(mmxext)
-hadamard_func(sse2)
-hadamard_func(ssse3)
-
av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_INLINE_ASM
if (INLINE_MMX(cpu_flags)) {
- c->sum_abs_dctelem = sum_abs_dctelem_mmx;
-
c->pix_abs[0][0] = sad16_mmx;
c->pix_abs[0][1] = sad16_x2_mmx;
c->pix_abs[0][2] = sad16_y2_mmx;
@@ -1249,77 +571,81 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
c->sad[0] = sad16_mmx;
c->sad[1] = sad8_mmx;
- c->sse[0] = sse16_mmx;
- c->sse[1] = sse8_mmx;
c->vsad[4] = vsad_intra16_mmx;
- c->nsse[0] = nsse16_mmx;
- c->nsse[1] = nsse8_mmx;
-
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->vsad[0] = vsad16_mmx;
}
}
- if (INLINE_MMXEXT(cpu_flags)) {
- c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
-
- c->vsad[4] = vsad_intra16_mmxext;
-
- c->pix_abs[0][0] = sad16_mmxext;
- c->pix_abs[1][0] = sad8_mmxext;
-
- c->sad[0] = sad16_mmxext;
- c->sad[1] = sad8_mmxext;
-
- if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
- c->pix_abs[0][1] = sad16_x2_mmxext;
- c->pix_abs[0][2] = sad16_y2_mmxext;
- c->pix_abs[0][3] = sad16_xy2_mmxext;
- c->pix_abs[1][1] = sad8_x2_mmxext;
- c->pix_abs[1][2] = sad8_y2_mmxext;
- c->pix_abs[1][3] = sad8_xy2_mmxext;
-
- c->vsad[0] = vsad16_mmxext;
- }
- }
-
- if (INLINE_SSE2(cpu_flags)) {
- c->sum_abs_dctelem = sum_abs_dctelem_sse2;
- }
-
- if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW)) {
- c->sad[0] = sad16_sse2;
- }
-
-#if HAVE_SSSE3_INLINE
- if (INLINE_SSSE3(cpu_flags)) {
- c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
- }
-#endif
#endif /* HAVE_INLINE_ASM */
if (EXTERNAL_MMX(cpu_flags)) {
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
+ c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx;
+ c->sse[0] = ff_sse16_mmx;
+ c->sse[1] = ff_sse8_mmx;
+#if HAVE_YASM
+ c->nsse[0] = nsse16_mmx;
+ c->nsse[1] = nsse8_mmx;
+#endif
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
+ c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext;
+
+ c->sad[0] = ff_sad16_mmxext;
+ c->sad[1] = ff_sad8_mmxext;
+
+ c->pix_abs[0][0] = ff_sad16_mmxext;
+ c->pix_abs[0][1] = ff_sad16_x2_mmxext;
+ c->pix_abs[0][2] = ff_sad16_y2_mmxext;
+ c->pix_abs[1][0] = ff_sad8_mmxext;
+ c->pix_abs[1][1] = ff_sad8_x2_mmxext;
+ c->pix_abs[1][2] = ff_sad8_y2_mmxext;
+
+ c->vsad[4] = ff_vsad_intra16_mmxext;
+ c->vsad[5] = ff_vsad_intra8_mmxext;
+
+ if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
+ c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
+ c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;
+
+ c->vsad[0] = ff_vsad16_approx_mmxext;
+ c->vsad[1] = ff_vsad8_approx_mmxext;
+ }
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->sse[0] = ff_sse16_sse2;
+ c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
#if HAVE_ALIGNED_STACK
c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
+ if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
+ c->sad[0] = ff_sad16_sse2;
+ c->pix_abs[0][0] = ff_sad16_sse2;
+ c->pix_abs[0][1] = ff_sad16_x2_sse2;
+ c->pix_abs[0][2] = ff_sad16_y2_sse2;
+
+ c->vsad[4] = ff_vsad_intra16_sse2;
+ if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
+ c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
+ c->vsad[0] = ff_vsad16_approx_sse2;
+ }
+ }
}
- if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
+ if (EXTERNAL_SSSE3(cpu_flags)) {
+ c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
+#if HAVE_ALIGNED_STACK
c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
+#endif
}
}
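
For reference, the sad/pix_abs pointers installed above all compute the same quantity: the sum of absolute byte differences over an 8- or 16-pixel-wide block of h rows. A minimal scalar sketch of the 16-wide case follows; the context argument is kept as an opaque void * so the snippet stays self-contained, whereas the real prototypes take an MpegEncContext * that the SIMD versions likewise ignore.

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

/* Scalar sketch of a sad16-style kernel: sum of |blk1[x] - blk2[x]| over a
 * 16-pixel-wide, h-row block. */
static int sad16_c_sketch(void *ctx, const uint8_t *blk2, const uint8_t *blk1,
                          ptrdiff_t stride, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++)
            sum += abs(blk1[x] - blk2[x]);
        blk1 += stride;
        blk2 += stride;
    }
    return sum;
}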
diff --git a/libavcodec/x86/mlpdsp.asm b/libavcodec/x86/mlpdsp.asm
new file mode 100644
index 0000000000..3dc641e89e
--- /dev/null
+++ b/libavcodec/x86/mlpdsp.asm
@@ -0,0 +1,196 @@
+;******************************************************************************
+;* SIMD-optimized MLP DSP functions
+;* Copyright (c) 2014 James Almer <jamrial@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%if ARCH_X86_64
+
+%macro SHLX 2
+%if cpuflag(bmi2)
+ shlx %1, %1, %2q
+%else
+ shl %1, %2b
+%endif
+%endmacro
+
+%macro REMATRIX 0
+ movdqa m0, [samplesq]
+ movdqa m1, [coeffsq ]
+ pshufd m2, m0, q2301
+ pshufd m3, m1, q2301
+ pmuldq m0, m1
+ pmuldq m3, m2
+ paddq m0, m3
+%if notcpuflag(avx2)
+ movdqa m1, [samplesq + 16]
+ movdqa m2, [coeffsq + 16]
+ pshufd m3, m1, q2301
+ pshufd m4, m2, q2301
+ pmuldq m1, m2
+ pmuldq m4, m3
+ paddq m0, m1
+ paddq m0, m4
+%else
+ vextracti128 xm1, m0, 1
+ paddq xm0, xm1
+%endif
+%endmacro
+
+%macro LOOP_END 0
+ pshufd xm1, xm0, q0032
+ paddq xm0, xm1
+ movq accumq, xm0
+ movzx blsbsd, byte [blsbs_ptrq] ; load *bypassed_lsbs
+ sar accumq, 14 ; accum >>= 14
+ and accumd, maskd ; accum &= mask
+ add accumd, blsbsd ; accum += *bypassed_lsbs
+ mov [samplesq + dest_chq], accumd ; samples[dest_ch] = accum
+ add blsbs_ptrq, 8 ; bypassed_lsbs += MAX_CHANNELS;
+ add samplesq, 32 ; samples += MAX_CHANNELS;
+ cmp blsbs_ptrq, cntq
+%endmacro
+
+%macro LOOP_SHIFT_END 0
+ pshufd xm1, xm0, q0032
+ paddq xm0, xm1
+ movq accumq, xm0
+ and indexd, auspd ; index &= access_unit_size_pow2;
+ movsx noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index]
+ add indexd, index2d ; index += index2
+ SHLX noiseq, mns ; noise_buffer[index] <<= matrix_noise_shift
+ add accumq, noiseq ; accum += noise_buffer[index]
+ movzx noised, byte [blsbs_ptrq] ; load *bypassed_lsbs (reuse tmp noise register)
+ sar accumq, 14 ; accum >>= 14
+ and accumd, maskd ; accum &= mask
+ add accumd, noised ; accum += *bypassed_lsbs
+ mov [samplesq + dest_chq], accumd ; samples[dest_ch] = accum
+ add blsbs_ptrq, 8 ; bypassed_lsbs += MAX_CHANNELS;
+ add samplesq, 32 ; samples += MAX_CHANNELS;
+ cmp blsbs_ptrq, cntq
+%endmacro
+
+;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
+; const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
+; int index, unsigned int dest_ch, uint16_t blockpos,
+; unsigned int maxchan, int matrix_noise_shift,
+; int access_unit_size_pow2, int32_t mask)
+%macro MLP_REMATRIX_CHANNEL 0
+cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
+ index, dest_ch, blockpos, maxchan, mns, \
+ accum, mask, cnt
+ mov mnsd, mnsm ; load matrix_noise_shift
+ movzx blockposq, word blockposm ; load and zero extend blockpos (16bit)
+ mov maxchand, maxchanm ; load maxchan
+ mov maskd, maskm ; load mask
+%if WIN64
+ mov dest_chd, dest_chm ; load dest_chd (not needed on UNIX64)
+%endif
+ shl dest_chd, 2
+ lea cntq, [blsbs_ptrq + blockposq*8]
+ test mnsd, mnsd ; is matrix_noise_shift != 0?
+ jne .shift ; jump if true
+ cmp maxchand, 4 ; is maxchan < 4?
+ jl .loop4 ; jump if true
+
+align 16
+.loop8:
+ ; Process 5 or more channels
+ REMATRIX
+ LOOP_END
+ jne .loop8
+ RET
+
+align 16
+.loop4:
+ ; Process up to 4 channels
+ movdqa xm0, [samplesq]
+ movdqa xm1, [coeffsq ]
+ pshufd xm2, xm0, q2301
+ pshufd xm3, xm1, q2301
+ pmuldq xm0, xm1
+ pmuldq xm3, xm2
+ paddq xm0, xm3
+ LOOP_END
+ jne .loop4
+ RET
+
+.shift:
+%if WIN64
+ mov indexd, indexm ; load index (not needed on UNIX64)
+%endif
+ mov r9d, r9m ; load access_unit_size_pow2
+%if cpuflag(bmi2)
+ ; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
+ DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
+ index, dest_ch, accum, index2, mns, \
+ ausp, mask, cnt, noise
+ add mnsd, 7 ; matrix_noise_shift += 7
+%else ; sse4
+ mov r6, rcx ; move rcx elsewhere so we can use cl for matrix_noise_shift
+%if WIN64
+ ; r0 = rcx
+ DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
+ index2, accum, ausp, mask, cnt, noise
+%else ; UNIX64
+ ; r3 = rcx
+ DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
+ index2, accum, ausp, mask, cnt, noise
+%endif
+ lea mnsd, [r8 + 7] ; rcx = matrix_noise_shift + 7
+%endif ; cpuflag
+ sub auspd, 1 ; access_unit_size_pow2 -= 1
+ cmp r7d, 4 ; is maxchan < 4?
+ lea index2q, [indexq*2 + 1] ; index2 = 2 * index + 1;
+ jl .loop4_shift ; jump if maxchan < 4
+
+align 16
+.loop8_shift:
+ ; Process 5 or more channels
+ REMATRIX
+ LOOP_SHIFT_END
+ jne .loop8_shift
+ RET
+
+align 16
+.loop4_shift:
+ ; Process up to 4 channels
+ movdqa xm0, [samplesq]
+ movdqa xm1, [coeffsq ]
+ pshufd xm2, xm0, q2301
+ pshufd xm3, xm1, q2301
+ pmuldq xm0, xm1
+ pmuldq xm3, xm2
+ paddq xm0, xm3
+ LOOP_SHIFT_END
+ jne .loop4_shift
+ RET
+%endmacro
+
+INIT_XMM sse4
+MLP_REMATRIX_CHANNEL
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2, bmi2
+MLP_REMATRIX_CHANNEL
+%endif
+
+%endif ; ARCH_X86_64
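
The per-sample arithmetic is spelled out in the LOOP_END/LOOP_SHIFT_END comments above. The C sketch below is reconstructed from those comments only (it is not the canonical code in libavcodec/mlpdsp.c): MAX_CHANNELS is assumed to be 8 to match the +8/+32 pointer strides, bypassed_lsbs is assumed to already point at the destination channel's column, and the scalar loop stops at maxchan where the SIMD code processes 4 or 8 lanes at a time (presumably with zero coefficients in the unused lanes).

#include <stdint.h>

#define MAX_CHANNELS 8   /* assumption: matches the +8 / +32 strides above */

static void mlp_rematrix_channel_sketch(int32_t *samples, const int32_t *coeffs,
                                        const uint8_t *bypassed_lsbs,
                                        const int8_t *noise_buffer,
                                        int index, unsigned int dest_ch,
                                        uint16_t blockpos, unsigned int maxchan,
                                        int matrix_noise_shift,
                                        int access_unit_size_pow2, int32_t mask)
{
    int index2 = 2 * index + 1;                     /* lea index2q, [indexq*2 + 1] */

    for (unsigned int i = 0; i < blockpos; i++) {
        int64_t accum = 0;
        for (unsigned int ch = 0; ch <= maxchan; ch++)
            accum += (int64_t)samples[ch] * coeffs[ch];

        if (matrix_noise_shift) {
            index &= access_unit_size_pow2 - 1;     /* sub auspd, 1 / and indexd, auspd */
            accum += (int64_t)noise_buffer[index] << (matrix_noise_shift + 7);
            index += index2;
        }

        accum >>= 14;                               /* sar accumq, 14 */
        samples[dest_ch] = ((int32_t)accum & mask) + *bypassed_lsbs;

        bypassed_lsbs += MAX_CHANNELS;              /* add blsbs_ptrq, 8  */
        samples       += MAX_CHANNELS;              /* add samplesq, 32   */
    }
}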
diff --git a/libavcodec/x86/mlpdsp.c b/libavcodec/x86/mlpdsp_init.c
index 72fc637764..7f5e6b11d5 100644
--- a/libavcodec/x86/mlpdsp.c
+++ b/libavcodec/x86/mlpdsp_init.c
@@ -2,32 +2,47 @@
* MLP DSP functions x86-optimized
* Copyright (c) 2009 Ramiro Polla
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
-#include "libavutil/internal.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mlpdsp.h"
#include "libavcodec/mlp.h"
-#if HAVE_7REGS && HAVE_INLINE_ASM
+#define REMATRIX_CHANNEL_FUNC(opt) \
+void ff_mlp_rematrix_channel_##opt(int32_t *samples, \
+ const int32_t *coeffs, \
+ const uint8_t *bypassed_lsbs, \
+ const int8_t *noise_buffer, \
+ int index, \
+ unsigned int dest_ch, \
+ uint16_t blockpos, \
+ unsigned int maxchan, \
+ int matrix_noise_shift, \
+ int access_unit_size_pow2, \
+ int32_t mask);
+
+REMATRIX_CHANNEL_FUNC(sse4)
+REMATRIX_CHANNEL_FUNC(avx2_bmi2)
+
+#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
extern char ff_mlp_firorder_8;
extern char ff_mlp_firorder_7;
@@ -45,12 +60,12 @@ extern char ff_mlp_iirorder_2;
extern char ff_mlp_iirorder_1;
extern char ff_mlp_iirorder_0;
-static const void *firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
+static const void * const firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
&ff_mlp_firorder_2, &ff_mlp_firorder_3,
&ff_mlp_firorder_4, &ff_mlp_firorder_5,
&ff_mlp_firorder_6, &ff_mlp_firorder_7,
&ff_mlp_firorder_8 };
-static const void *iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
+static const void * const iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
&ff_mlp_iirorder_2, &ff_mlp_iirorder_3,
&ff_mlp_iirorder_4 };
@@ -133,8 +148,8 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
FIRMUL (ff_mlp_firorder_6, 0x14 )
FIRMUL (ff_mlp_firorder_5, 0x10 )
FIRMUL (ff_mlp_firorder_4, 0x0c )
- FIRMULREG(ff_mlp_firorder_3, 0x08,10)
- FIRMULREG(ff_mlp_firorder_2, 0x04, 9)
+ FIRMUL (ff_mlp_firorder_3, 0x08 )
+ FIRMUL (ff_mlp_firorder_2, 0x04 )
FIRMULREG(ff_mlp_firorder_1, 0x00, 8)
LABEL_MANGLE(ff_mlp_firorder_0)":\n\t"
"jmp *%6 \n\t"
@@ -163,8 +178,6 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
: /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump),
/* 6*/"r"(iirjump) , /* 7*/"c"(filter_shift)
, /* 8*/"r"((int64_t)coeff[0])
- , /* 9*/"r"((int64_t)coeff[1])
- , /*10*/"r"((int64_t)coeff[2])
: "rax", "rdx", "rsi"
#else /* ARCH_X86_32 */
/* 3*/"+m"(blocksize)
@@ -179,9 +192,13 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c)
{
-#if HAVE_7REGS && HAVE_INLINE_ASM
int cpu_flags = av_get_cpu_flags();
+#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
if (INLINE_MMX(cpu_flags))
c->mlp_filter_channel = mlp_filter_channel_x86;
#endif
+ if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags))
+ c->mlp_rematrix_channel = ff_mlp_rematrix_channel_sse4;
+ if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags) && cpu_flags & AV_CPU_FLAG_BMI2)
+ c->mlp_rematrix_channel = ff_mlp_rematrix_channel_avx2_bmi2;
}
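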
diff --git a/libavcodec/x86/mpegaudiodsp.c b/libavcodec/x86/mpegaudiodsp.c
index 591f5270bd..9499141eea 100644
--- a/libavcodec/x86/mpegaudiodsp.c
+++ b/libavcodec/x86/mpegaudiodsp.c
@@ -2,20 +2,20 @@
* SIMD-optimized MP3 decoding functions
* Copyright (c) 2010 Vitor Sessak
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -26,11 +26,20 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegaudiodsp.h"
-void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win);
+#define DECL(CPU)\
+static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
+void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);
+
+#if HAVE_YASM
+#if ARCH_X86_32
+DECL(sse)
+#endif
+DECL(sse2)
+DECL(sse3)
+DECL(ssse3)
+DECL(avx)
+#endif /* HAVE_YASM */
+
void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
@@ -38,7 +47,7 @@ void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
-#if HAVE_SSE2_INLINE
+#if HAVE_6REGS && HAVE_SSE_INLINE
#define MACS(rt, ra, rb) rt+=(ra)*(rb)
#define MLSS(rt, ra, rb) rt-=(ra)*(rb)
@@ -182,7 +191,7 @@ static void apply_window_mp3(float *in, float *win, int *unused, float *out,
*out = sum;
}
-#endif /* HAVE_SSE2_INLINE */
+#endif /* HAVE_6REGS && HAVE_SSE_INLINE */
#if HAVE_YASM
#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \
@@ -217,16 +226,22 @@ static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \
} \
}
+#if HAVE_SSE
+#if ARCH_X86_32
DECL_IMDCT_BLOCKS(sse,sse)
+#endif
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
+#endif
+#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
+#endif
#endif /* HAVE_YASM */
av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
{
- int cpu_flags = av_get_cpu_flags();
+ av_unused int cpu_flags = av_get_cpu_flags();
int i, j;
for (j = 0; j < 4; j++) {
@@ -242,16 +257,19 @@ av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
}
}
-#if HAVE_SSE2_INLINE
- if (INLINE_SSE2(cpu_flags)) {
+#if HAVE_6REGS && HAVE_SSE_INLINE
+ if (INLINE_SSE(cpu_flags)) {
s->apply_window_float = apply_window_mp3;
}
-#endif /* HAVE_SSE2_INLINE */
+#endif /* HAVE_6REGS && HAVE_SSE_INLINE */
#if HAVE_YASM
+#if HAVE_SSE
+#if ARCH_X86_32
if (EXTERNAL_SSE(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse;
}
+#endif
if (EXTERNAL_SSE2(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse2;
}
@@ -261,8 +279,11 @@ av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
if (EXTERNAL_SSSE3(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_ssse3;
}
+#endif
+#if HAVE_AVX_EXTERNAL
if (EXTERNAL_AVX(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_avx;
}
+#endif
#endif /* HAVE_YASM */
}
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 6c0493e6b8..35a8264804 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -2,20 +2,20 @@
* Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
* H.263, MPEG-1, MPEG-2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -25,8 +25,9 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideo.h"
+#include "libavcodec/mpegvideodata.h"
-#if HAVE_INLINE_ASM
+#if HAVE_MMX_INLINE
static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
@@ -35,7 +36,7 @@ static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
qmul = qscale << 1;
- assert(s->block_last_index[n]>=0 || s->h263_aic);
+ av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
if (!s->h263_aic) {
if (n < 4)
@@ -111,7 +112,7 @@ static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
qmul = qscale << 1;
qadd = (qscale - 1) | 1;
- assert(s->block_last_index[n]>=0 || s->h263_aic);
+ av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
@@ -171,7 +172,7 @@ static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
const uint16_t *quant_matrix;
int block0;
- assert(s->block_last_index[n]>=0);
+ av_assert2(s->block_last_index[n]>=0);
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
@@ -190,9 +191,9 @@ __asm__ volatile(
"mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
- "movq (%0, %%"FF_REG_a"), %%mm0\n\t"
+ "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
- "movq (%1, %%"FF_REG_a"), %%mm4\n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
@@ -208,7 +209,7 @@ __asm__ volatile(
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
- "pcmpeqw (%0, %%"FF_REG_a"), %%mm4\n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $3, %%mm0 \n\t"
"psraw $3, %%mm1 \n\t"
@@ -222,7 +223,7 @@ __asm__ volatile(
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
- "movq %%mm4, (%0, %%"FF_REG_a")\n\t"
+ "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
"add $16, %%"FF_REG_a" \n\t"
@@ -239,7 +240,7 @@ static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
x86_reg nCoeffs;
const uint16_t *quant_matrix;
- assert(s->block_last_index[n]>=0);
+ av_assert2(s->block_last_index[n]>=0);
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
@@ -253,9 +254,9 @@ __asm__ volatile(
"mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
- "movq (%0, %%"FF_REG_a"), %%mm0\n\t"
+ "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
- "movq (%1, %%"FF_REG_a"), %%mm4\n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
@@ -275,7 +276,7 @@ __asm__ volatile(
"pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
- "pcmpeqw (%0, %%"FF_REG_a"), %%mm4\n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $4, %%mm0 \n\t"
"psraw $4, %%mm1 \n\t"
@@ -289,7 +290,7 @@ __asm__ volatile(
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
- "movq %%mm4, (%0, %%"FF_REG_a")\n\t"
+ "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
"add $16, %%"FF_REG_a" \n\t"
@@ -306,7 +307,10 @@ static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
const uint16_t *quant_matrix;
int block0;
- assert(s->block_last_index[n]>=0);
+ av_assert2(s->block_last_index[n]>=0);
+
+ if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
+ else qscale <<= 1;
if(s->alternate_scan) nCoeffs= 63; //FIXME
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
@@ -325,9 +329,9 @@ __asm__ volatile(
"mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
- "movq (%0, %%"FF_REG_a"), %%mm0\n\t"
+ "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
- "movq (%1, %%"FF_REG_a"), %%mm4\n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
@@ -343,17 +347,17 @@ __asm__ volatile(
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
- "pcmpeqw (%0, %%"FF_REG_a"), %%mm4\n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
- "psraw $3, %%mm0 \n\t"
- "psraw $3, %%mm1 \n\t"
+ "psraw $4, %%mm0 \n\t"
+ "psraw $4, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
- "movq %%mm4, (%0, %%"FF_REG_a")\n\t"
+ "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
"add $16, %%"FF_REG_a" \n\t"
@@ -371,7 +375,10 @@ static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
x86_reg nCoeffs;
const uint16_t *quant_matrix;
- assert(s->block_last_index[n]>=0);
+ av_assert2(s->block_last_index[n]>=0);
+
+ if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
+ else qscale <<= 1;
if(s->alternate_scan) nCoeffs= 63; //FIXME
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
@@ -386,9 +393,9 @@ __asm__ volatile(
"mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
- "movq (%0, %%"FF_REG_a"), %%mm0\n\t"
+ "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
- "movq (%1, %%"FF_REG_a"), %%mm4\n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
@@ -408,10 +415,10 @@ __asm__ volatile(
"paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
- "pcmpeqw (%0, %%"FF_REG_a"), %%mm4\n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
- "psrlw $4, %%mm0 \n\t"
- "psrlw $4, %%mm1 \n\t"
+ "psrlw $5, %%mm0 \n\t"
+ "psrlw $5, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
@@ -420,7 +427,7 @@ __asm__ volatile(
"pandn %%mm1, %%mm5 \n\t"
"pxor %%mm4, %%mm7 \n\t"
"pxor %%mm5, %%mm7 \n\t"
- "movq %%mm4, (%0, %%"FF_REG_a")\n\t"
+ "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
"add $16, %%"FF_REG_a" \n\t"
@@ -442,11 +449,11 @@ __asm__ volatile(
);
}
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_MMX_INLINE */
av_cold void ff_mpv_common_init_x86(MpegEncContext *s)
{
-#if HAVE_INLINE_ASM
+#if HAVE_MMX_INLINE
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags)) {
@@ -458,5 +465,5 @@ av_cold void ff_mpv_common_init_x86(MpegEncContext *s)
s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
}
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_MMX_INLINE */
}
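
Because qscale is now pre-scaled (mapped through ff_mpeg2_non_linear_qscale[] or doubled) before being splatted into mm6, the arithmetic right shifts in the MPEG-2 paths grow by one bit. A hedged scalar sketch of the intra dequant the asm mirrors, reconstructed from its own comments ("q=qscale*quant_matrix[i]", "abs(block[i])*q", psraw $4):

#include <stdint.h>
#include <stdlib.h>

/* Scalar sketch of dct_unquantize_mpeg2_intra as implemented above; qscale is
 * assumed to be the pre-scaled value and the DC coefficient (block0) is
 * handled separately, as in the asm. */
static void unquantize_mpeg2_intra_sketch(int16_t *block, int nCoeffs,
                                          const uint16_t *quant_matrix,
                                          int qscale, int block0)
{
    block[0] = block0;
    for (int i = 1; i <= nCoeffs; i++) {
        int level = block[i];
        if (level) {
            int negative = level < 0;
            level = (abs(level) * qscale * quant_matrix[i]) >> 4;   /* psraw $4 */
            block[i] = negative ? -level : level;
        }
    }
}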
diff --git a/libavcodec/x86/mpegvideodsp.c b/libavcodec/x86/mpegvideodsp.c
index b701ef8cc7..e0498f3849 100644
--- a/libavcodec/x86/mpegvideodsp.c
+++ b/libavcodec/x86/mpegvideodsp.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -21,6 +21,7 @@
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegvideodsp.h"
+#include "libavcodec/videodsp.h"
#if HAVE_INLINE_ASM
@@ -42,20 +43,24 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
const uint64_t shift2 = 2 * shift;
+#define MAX_STRIDE 4096U
+#define MAX_H 8U
+ uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
int x, y;
const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
const int dxh = dxy * (h - 1);
const int dyw = dyx * (w - 1);
+ int need_emu = (unsigned) ix >= width - w ||
+ (unsigned) iy >= height - h;
if ( // non-constant fullpel offset (3% of blocks)
((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
(oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift) ||
// uses more than 16 bits of subpel mv (only at huge resolution)
(dxx | dxy | dyx | dyy) & 15 ||
- (unsigned) ix >= width - w ||
- (unsigned) iy >= height - h) {
+ (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
// FIXME could still use mmx for some of the rows
ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
shift, r, width, height);
@@ -63,6 +68,10 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
}
src += ix + iy * stride;
+ if (need_emu) {
+ ff_emulated_edge_mc_8(edge_buf, src, stride, stride, w + 1, h + 1, ix, iy, width, height);
+ src = edge_buf;
+ }
__asm__ volatile (
"movd %0, %%mm6 \n\t"
@@ -149,4 +158,3 @@ av_cold void ff_mpegvideodsp_init_x86(MpegVideoDSPContext *c)
c->gmc = gmc_mmx;
#endif /* HAVE_INLINE_ASM */
}
-
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index 47349d17ec..67b26178a8 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -2,20 +2,20 @@
* The simplest mpeg encoder (well, it was the simplest!)
* Copyright (c) 2000,2001 Fabrice Bellard
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -30,6 +30,8 @@
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED(16, static uint16_t, inv_zigzag_direct16)[64];
+#if HAVE_6REGS
+
#if HAVE_MMX_INLINE
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 0
@@ -81,7 +83,10 @@ DECLARE_ALIGNED(16, static uint16_t, inv_zigzag_direct16)[64];
#include "mpegvideoenc_template.c"
#endif /* HAVE_SSSE3_INLINE */
+#endif /* HAVE_6REGS */
+
#if HAVE_INLINE_ASM
+#if HAVE_MMX_INLINE
static void denoise_dct_mmx(MpegEncContext *s, int16_t *block){
const int intra= s->mb_intra;
int *sum= s->dct_error_sum[intra];
@@ -135,7 +140,9 @@ static void denoise_dct_mmx(MpegEncContext *s, int16_t *block){
: "r"(block+64)
);
}
+#endif /* HAVE_MMX_INLINE */
+#if HAVE_SSE2_INLINE
static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){
const int intra= s->mb_intra;
int *sum= s->dct_error_sum[intra];
@@ -191,9 +198,10 @@ static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}
+#endif /* HAVE_SSE2_INLINE */
#endif /* HAVE_INLINE_ASM */
-av_cold void ff_mpv_encode_init_x86(MpegEncContext *s)
+av_cold void ff_dct_encode_init_x86(MpegEncContext *s)
{
const int dct_algo = s->avctx->dct_algo;
int i;
@@ -205,21 +213,25 @@ av_cold void ff_mpv_encode_init_x86(MpegEncContext *s)
#if HAVE_MMX_INLINE
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags)) {
+#if HAVE_6REGS
s->dct_quantize = dct_quantize_mmx;
+#endif
s->denoise_dct = denoise_dct_mmx;
}
#endif
-#if HAVE_MMXEXT_INLINE
+#if HAVE_6REGS && HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags))
s->dct_quantize = dct_quantize_mmxext;
#endif
#if HAVE_SSE2_INLINE
if (INLINE_SSE2(cpu_flags)) {
+#if HAVE_6REGS
s->dct_quantize = dct_quantize_sse2;
+#endif
s->denoise_dct = denoise_dct_sse2;
}
#endif
-#if HAVE_SSSE3_INLINE
+#if HAVE_6REGS && HAVE_SSSE3_INLINE
if (INLINE_SSSE3(cpu_flags))
s->dct_quantize = dct_quantize_ssse3;
#endif
diff --git a/libavcodec/x86/mpegvideoenc_qns_template.c b/libavcodec/x86/mpegvideoenc_qns_template.c
index 8d8d68762a..882d486205 100644
--- a/libavcodec/x86/mpegvideoenc_qns_template.c
+++ b/libavcodec/x86/mpegvideoenc_qns_template.c
@@ -5,26 +5,26 @@
* MMX optimization by Michael Niedermayer <michaelni@gmx.at>
* 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include <assert.h>
#include <stdint.h>
+#include "libavutil/avassert.h"
#include "libavutil/common.h"
#include "libavutil/x86/asm.h"
@@ -36,7 +36,7 @@ static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[
{
x86_reg i=0;
- assert(FFABS(scale) < MAX_ABS);
+ av_assert2(FFABS(scale) < MAX_ABS);
scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
SET_RND(mm6);
diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index 72df76b749..b2512744ca 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -3,20 +3,20 @@
*
* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -108,7 +108,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
const uint16_t *qmat, *bias;
LOCAL_ALIGNED_16(int16_t, temp_block, [64]);
- assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
+ av_assert2((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
//s->fdct (block);
RENAME_FDCT(ff_fdct)(block); // cannot be anything else ...
@@ -118,10 +118,15 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
if (s->mb_intra) {
int dummy;
- if (n < 4)
+ if (n < 4){
q = s->y_dc_scale;
- else
+ bias = s->q_intra_matrix16[qscale][1];
+ qmat = s->q_intra_matrix16[qscale][0];
+ }else{
q = s->c_dc_scale;
+ bias = s->q_chroma_intra_matrix16[qscale][1];
+ qmat = s->q_chroma_intra_matrix16[qscale][0];
+ }
/* note: block[0] is assumed to be positive */
if (!s->h263_aic) {
__asm__ volatile (
@@ -136,8 +141,6 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
block[0]=0; //avoid fake overflow
// temp_block[0] = (block[0] + (q >> 1)) / q;
last_non_zero_p1 = 1;
- bias = s->q_intra_matrix16[qscale][1];
- qmat = s->q_intra_matrix16[qscale][0];
} else {
last_non_zero_p1 = 0;
bias = s->q_inter_matrix16[qscale][1];
@@ -173,7 +176,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
" js 1b \n\t"
PMAX(MM"3", MM"0")
"movd "MM"3, %%"FF_REG_a" \n\t"
- "movzb %%al, %%"FF_REG_a" \n\t" // last_non_zero_p1
+ "movzbl %%al, %%eax \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
: "r" (block+64), "r" (qmat), "r" (bias),
"r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
@@ -207,7 +210,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
" js 1b \n\t"
PMAX(MM"3", MM"0")
"movd "MM"3, %%"FF_REG_a" \n\t"
- "movzb %%al, %%"FF_REG_a" \n\t" // last_non_zero_p1
+ "movzbl %%al, %%eax \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
: "r" (block+64), "r" (qmat+64), "r" (bias+64),
"r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
@@ -221,7 +224,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
"psubusw "MM"1, "MM"4 \n\t"
"packuswb "MM"4, "MM"4 \n\t"
#if COMPILE_TEMPLATE_SSE2
- "packuswb "MM"4, "MM"4 \n\t"
+ "packsswb "MM"4, "MM"4 \n\t"
#endif
"movd "MM"4, %0 \n\t" // *overflow
: "=g" (*overflow)
@@ -275,6 +278,50 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36];
block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37];
block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
+ }else if(s->idsp.perm_type == FF_IDCT_PERM_LIBMPEG2){
+ if(last_non_zero_p1 <= 1) goto end;
+ block[0x04] = temp_block[0x01];
+ block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
+ if(last_non_zero_p1 <= 4) goto end;
+ block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02];
+ block[0x05] = temp_block[0x03];
+ if(last_non_zero_p1 <= 7) goto end;
+ block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11];
+ block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
+ if(last_non_zero_p1 <= 11) goto end;
+ block[0x1C] = temp_block[0x19];
+ block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B];
+ block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05];
+ if(last_non_zero_p1 <= 16) goto end;
+ block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13];
+ block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21];
+ block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
+ block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22];
+ if(last_non_zero_p1 <= 24) goto end;
+ block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14];
+ block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06];
+ block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E];
+ block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C];
+ if(last_non_zero_p1 <= 32) goto end;
+ block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A];
+ block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38];
+ block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32];
+ block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24];
+ if(last_non_zero_p1 <= 40) goto end;
+ block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16];
+ block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
+ block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25];
+ block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33];
+ if(last_non_zero_p1 <= 48) goto end;
+ block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B];
+ block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D];
+ block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
+ block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E];
+ if(last_non_zero_p1 <= 56) goto end;
+ block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C];
+ block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36];
+ block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
+ block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
}else{
if(last_non_zero_p1 <= 1) goto end;
block[0x01] = temp_block[0x01];
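
Before those permutation tables scatter the results back into block[] in the IDCT's coefficient order, the template quantizes temp_block[] in place. A hedged scalar sketch of that step (not the exact template code): 16-bit fixed point with an implied QMAT_SHIFT_MMX of 16, the bias added before the multiply-high, and last_non_zero_p1 tracked through inv_zigzag_direct16 so the copy loops above can bail out early.

#include <stdint.h>
#include <stdlib.h>

/* Sketch of the quantize loop: qmat/bias are the 16-bit tables selected in the
 * hunk above (q_intra_matrix16 / q_chroma_intra_matrix16 / q_inter_matrix16). */
static int dct_quantize_sketch(const int16_t *block, int16_t *temp_block,
                               const uint16_t *qmat, const uint16_t *bias,
                               const uint16_t *inv_zigzag, int start)
{
    int last_non_zero_p1 = start;
    for (int i = start; i < 64; i++) {
        int level = (abs(block[i]) + bias[i]) * qmat[i] >> 16;
        if (level) {
            if (block[i] < 0)
                level = -level;
            if (inv_zigzag[i] > last_non_zero_p1)
                last_non_zero_p1 = inv_zigzag[i];
        }
        temp_block[i] = level;
    }
    return last_non_zero_p1;
}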
diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
index 9326ee776d..aec73f82dc 100644
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -4,92 +4,151 @@
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
%include "libavutil/x86/x86util.asm"
-SECTION .text
+SECTION_RODATA
-INIT_MMX mmx
+cextern pw_1
+
+SECTION .text
; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
-cglobal pix_sum16, 2, 3
+; %1 = number of loops
+; %2 = number of GPRs used
+%macro PIX_SUM16 3
+cglobal pix_sum16, 2, %2, 6
movsxdifnidn r1, r1d
- mov r2, r1
- neg r2
- shl r2, 4
- sub r0, r2
- pxor m7, m7
- pxor m6, m6
+ mov r2, %1
+%if mmsize == 16
+ lea r3, [r1*3]
+%endif
+%if notcpuflag(xop)
+ pxor m5, m5
+%endif
+ pxor m4, m4
.loop:
- mova m0, [r0+r2+0]
- mova m1, [r0+r2+0]
- mova m2, [r0+r2+8]
- mova m3, [r0+r2+8]
- punpcklbw m0, m7
- punpckhbw m1, m7
- punpcklbw m2, m7
- punpckhbw m3, m7
+%if cpuflag(xop)
+ vphaddubq m0, [r0]
+ vphaddubq m1, [r0+r1]
+ vphaddubq m2, [r0+r1*2]
+ vphaddubq m3, [r0+r3]
+%else
+ mova m0, [r0]
+%if mmsize == 8
+ mova m1, [r0+8]
+%if cpuflag(mmxext)
+ mova m2, [r0+r1]
+ mova m3, [r0+r1+8]
+%endif
+%else ; sse2
+ mova m1, [r0+r1]
+ mova m2, [r0+r1*2]
+ mova m3, [r0+r3]
+%endif
+%if cpuflag(mmxext)
+ psadbw m0, m5
+ psadbw m1, m5
+ psadbw m2, m5
+ psadbw m3, m5
+%else ; mmx
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif ; cpuflag(mmxext)
+%endif ; cpuflag(xop)
paddw m1, m0
paddw m3, m2
paddw m3, m1
- paddw m6, m3
- add r2, r1
- js .loop
- mova m5, m6
- psrlq m6, 32
- paddw m6, m5
- mova m5, m6
- psrlq m6, 16
- paddw m6, m5
- movd eax, m6
- and eax, 0xffff
+ paddw m4, m3
+%if cpuflag(mmxext)
+ lea r0, [r0+r1*%3]
+%else
+ add r0, r1
+%endif
+ dec r2
+ jne .loop
+%if mmsize == 16
+ pshufd m0, m4, q0032
+ paddd m4, m0
+%elif notcpuflag(mmxext)
+ HADDW m4, m5
+%endif
+ movd eax, m4
RET
+%endmacro
+%if ARCH_X86_32
INIT_MMX mmx
+PIX_SUM16 16, 3, 0
+INIT_MMX mmxext
+PIX_SUM16 8, 4, 2
+%endif
+INIT_XMM sse2
+PIX_SUM16 4, 4, 4
+%if HAVE_XOP_EXTERNAL
+INIT_XMM xop
+PIX_SUM16 4, 4, 4
+%endif
+
; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
-cglobal pix_norm1, 2, 4
+; %1 = number of xmm registers used
+; %2 = number of loops
+%macro PIX_NORM1 2
+cglobal pix_norm1, 2, 3, %1
movsxdifnidn r1, r1d
- mov r2, 16
+ mov r2, %2
pxor m0, m0
- pxor m7, m7
+ pxor m5, m5
.loop:
mova m2, [r0+0]
+%if mmsize == 8
mova m3, [r0+8]
- mova m1, m2
- punpckhbw m1, m0
+%else
+ mova m3, [r0+r1]
+%endif
+ punpckhbw m1, m2, m0
punpcklbw m2, m0
- mova m4, m3
- punpckhbw m3, m0
- punpcklbw m4, m0
+ punpckhbw m4, m3, m0
+ punpcklbw m3, m0
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
pmaddwd m4, m4
paddd m2, m1
paddd m4, m3
- paddd m7, m2
+ paddd m5, m2
+ paddd m5, m4
+%if mmsize == 8
add r0, r1
- paddd m7, m4
+%else
+ lea r0, [r0+r1*2]
+%endif
dec r2
jne .loop
- mova m1, m7
- psrlq m7, 32
- paddd m1, m7
- movd eax, m1
+ HADDD m5, m1
+ movd eax, m5
RET
+%endmacro
+
+INIT_MMX mmx
+PIX_NORM1 0, 16
+INIT_XMM sse2
+PIX_NORM1 6, 8
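
The two macros implement simple per-block statistics: pix_sum16 adds all pixels of a 16x16 block, pix_norm1 adds their squares. Scalar sketches for reference:

#include <stdint.h>

static int pix_sum16_sketch(const uint8_t *pix, int line_size)
{
    int sum = 0;
    for (int y = 0; y < 16; y++, pix += line_size)
        for (int x = 0; x < 16; x++)
            sum += pix[x];
    return sum;
}

static int pix_norm1_sketch(const uint8_t *pix, int line_size)
{
    int sum = 0;
    for (int y = 0; y < 16; y++, pix += line_size)
        for (int x = 0; x < 16; x++)
            sum += pix[x] * pix[x];
    return sum;
}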
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index 71fbf2874f..532836cec9 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -1,29 +1,34 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideoencdsp.h"
int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
+int ff_pix_sum16_mmxext(uint8_t *pix, int line_size);
+int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
+int ff_pix_sum16_xop(uint8_t *pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
+int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
#if HAVE_INLINE_ASM
@@ -123,7 +128,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
: "+r" (ptr)
: "r" ((x86_reg) wrap), "r" ((x86_reg) width),
"r" (ptr + wrap * height));
- } else {
+ } else if (w == 16) {
__asm__ volatile (
"1: \n\t"
"movd (%0), %%mm0 \n\t"
@@ -141,6 +146,25 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
"add %1, %0 \n\t"
"cmp %3, %0 \n\t"
"jb 1b \n\t"
+ : "+r"(ptr)
+ : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
+ );
+ } else {
+ av_assert1(w == 4);
+ __asm__ volatile (
+ "1: \n\t"
+ "movd (%0), %%mm0 \n\t"
+ "punpcklbw %%mm0, %%mm0 \n\t"
+ "punpcklwd %%mm0, %%mm0 \n\t"
+ "movd %%mm0, -4(%0) \n\t"
+ "movd -4(%0, %2), %%mm1 \n\t"
+ "punpcklbw %%mm1, %%mm1 \n\t"
+ "punpckhwd %%mm1, %%mm1 \n\t"
+ "punpckhdq %%mm1, %%mm1 \n\t"
+ "movd %%mm1, (%0, %2) \n\t"
+ "add %1, %0 \n\t"
+ "cmp %3, %0 \n\t"
+ "jb 1b \n\t"
: "+r" (ptr)
: "r" ((x86_reg) wrap), "r" ((x86_reg) width),
"r" (ptr + wrap * height));
@@ -195,11 +219,26 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
{
int cpu_flags = av_get_cpu_flags();
+#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
c->pix_sum = ff_pix_sum16_mmx;
c->pix_norm1 = ff_pix_norm1_mmx;
}
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ c->pix_sum = ff_pix_sum16_mmxext;
+ }
+#endif
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->pix_sum = ff_pix_sum16_sse2;
+ c->pix_norm1 = ff_pix_norm1_sse2;
+ }
+
+ if (EXTERNAL_XOP(cpu_flags)) {
+ c->pix_sum = ff_pix_sum16_xop;
+ }
+
#if HAVE_INLINE_ASM
if (INLINE_MMX(cpu_flags)) {
diff --git a/libavcodec/x86/pixblockdsp.asm b/libavcodec/x86/pixblockdsp.asm
index 871244297c..440fe29bcc 100644
--- a/libavcodec/x86/pixblockdsp.asm
+++ b/libavcodec/x86/pixblockdsp.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
@@ -50,7 +50,7 @@ cglobal get_pixels, 3,4
REP_RET
INIT_XMM sse2
-cglobal get_pixels, 3, 4
+cglobal get_pixels, 3, 4, 5
lea r3, [r2*3]
pxor m4, m4
movh m0, [r1]
@@ -80,28 +80,49 @@ cglobal get_pixels, 3, 4
mova [r0+0x70], m3
RET
-INIT_MMX mmx
; void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
; ptrdiff_t stride);
-cglobal diff_pixels, 4,5
- pxor m7, m7
+%macro DIFF_PIXELS 0
+cglobal diff_pixels, 4,5,5
+ pxor m4, m4
add r0, 128
mov r4, -128
.loop:
- mova m0, [r1]
- mova m2, [r2]
- mova m1, m0
- mova m3, m2
- punpcklbw m0, m7
- punpckhbw m1, m7
- punpcklbw m2, m7
- punpckhbw m3, m7
+ movq m0, [r1]
+ movq m2, [r2]
+%if mmsize == 8
+ movq m1, m0
+ movq m3, m2
+ punpcklbw m0, m4
+ punpckhbw m1, m4
+ punpcklbw m2, m4
+ punpckhbw m3, m4
+%else
+ movq m1, [r1+r3]
+ movq m3, [r2+r3]
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+%endif
psubw m0, m2
psubw m1, m3
mova [r0+r4+0], m0
- mova [r0+r4+8], m1
+ mova [r0+r4+mmsize], m1
+%if mmsize == 8
add r1, r3
add r2, r3
- add r4, 16
+%else
+ lea r1, [r1+r3*2]
+ lea r2, [r2+r3*2]
+%endif
+ add r4, 2 * mmsize
jne .loop
- REP_RET
+ RET
+%endmacro
+
+INIT_MMX mmx
+DIFF_PIXELS
+
+INIT_XMM sse2
+DIFF_PIXELS
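
Both DIFF_PIXELS instantiations compute the same 8x8 kernel: the byte-wise difference of two source blocks, widened to int16_t. Scalar sketch:

#include <stddef.h>
#include <stdint.h>

static void diff_pixels_sketch(int16_t *block, const uint8_t *s1,
                               const uint8_t *s2, ptrdiff_t stride)
{
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++)
            block[x] = s1[x] - s2[x];
        block += 8;
        s1    += stride;
        s2    += stride;
    }
}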
diff --git a/libavcodec/x86/pixblockdsp_init.c b/libavcodec/x86/pixblockdsp_init.c
index faa5141327..fa9578a2d3 100644
--- a/libavcodec/x86/pixblockdsp_init.c
+++ b/libavcodec/x86/pixblockdsp_init.c
@@ -1,20 +1,20 @@
/*
* SIMD-optimized pixel operations
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -27,6 +27,8 @@ void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, ptrdiff_t stride);
void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, ptrdiff_t stride);
void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride);
+void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
+ ptrdiff_t stride);
av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c,
AVCodecContext *avctx,
@@ -43,5 +45,6 @@ av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c,
if (EXTERNAL_SSE2(cpu_flags)) {
if (!high_bit_depth)
c->get_pixels = ff_get_pixels_sse2;
+ c->diff_pixels = ff_diff_pixels_sse2;
}
}
diff --git a/libavcodec/x86/pngdsp.asm b/libavcodec/x86/pngdsp.asm
index 722caf0fd1..50e4255dec 100644
--- a/libavcodec/x86/pngdsp.asm
+++ b/libavcodec/x86/pngdsp.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -42,12 +42,12 @@ cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
and waq, ~(mmsize*2-1)
jmp .end_v
.loop_v:
- mova m0, [src1q+iq]
- mova m1, [src1q+iq+mmsize]
- paddb m0, [src2q+iq]
- paddb m1, [src2q+iq+mmsize]
- mova [dstq+iq ], m0
- mova [dstq+iq+mmsize], m1
+ movu m0, [src2q+iq]
+ movu m1, [src2q+iq+mmsize]
+ paddb m0, [src1q+iq]
+ paddb m1, [src1q+iq+mmsize]
+ movu [dstq+iq ], m0
+ movu [dstq+iq+mmsize], m1
add iq, mmsize*2
.end_v:
cmp iq, waq
@@ -157,7 +157,7 @@ cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
movh [dstq], m3
add dstq, bppq
cmp dstq, endq
- jle .loop
+ jl .loop
mov dstq, [rsp]
dec cntrq
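
Two changes are folded into this pngdsp hunk: add_bytes_l2 now uses unaligned movu loads/stores for src2 and dst (presumably because only src1 is still guaranteed to be suitably aligned, so it stays as the paddb memory operand), and the Paeth loop exit is tightened from jle to jl, which looks like an off-by-one fix (the old condition ran one extra iteration once dst had reached end). For reference, a scalar sketch of what add_bytes_l2 computes; add_bytes_l2_c is an illustrative name only:

#include <stdint.h>

/* Byte-wise, wrapping sum of two rows, matching the paddb in the loop
 * above; used when undoing PNG row filtering. */
static void add_bytes_l2_c(uint8_t *dst, const uint8_t *src1,
                           const uint8_t *src2, int w)
{
    for (int i = 0; i < w; i++)
        dst[i] = src1[i] + src2[i];
}
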
diff --git a/libavcodec/x86/pngdsp_init.c b/libavcodec/x86/pngdsp_init.c
index 34a3da36d7..7dca62c675 100644
--- a/libavcodec/x86/pngdsp_init.c
+++ b/libavcodec/x86/pngdsp_init.c
@@ -2,20 +2,20 @@
* x86 PNG optimizations.
 * Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm
index 9613fa1448..16fc262aeb 100644
--- a/libavcodec/x86/proresdsp.asm
+++ b/libavcodec/x86/proresdsp.asm
@@ -1,427 +1,66 @@
;******************************************************************************
;* x86-SIMD-optimized IDCT for prores
-;* this is identical to "simple" IDCT except for the clip range
+;* this is identical to "simple" IDCT written by Michael Niedermayer
+;* except for the clip range
;*
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
-%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1
-%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1
-%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2
-%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1
-%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1
-%define W6sh2 8867 ; W6 = 35468 = 8867<<2
-%define W7sh2 4520 ; W7 = 18081 = 4520<<2 + 1
-
%if ARCH_X86_64
SECTION_RODATA
-w4_plus_w2: times 4 dw W4sh2, +W2sh2
-w4_min_w2: times 4 dw W4sh2, -W2sh2
-w4_plus_w6: times 4 dw W4sh2, +W6sh2
-w4_min_w6: times 4 dw W4sh2, -W6sh2
-w1_plus_w3: times 4 dw W1sh2, +W3sh2
-w3_min_w1: times 4 dw W3sh2, -W1sh2
-w7_plus_w3: times 4 dw W7sh2, +W3sh2
-w3_min_w7: times 4 dw W3sh2, -W7sh2
-w1_plus_w5: times 4 dw W1sh2, +W5sh2
-w5_min_w1: times 4 dw W5sh2, -W1sh2
-w5_plus_w7: times 4 dw W5sh2, +W7sh2
-w7_min_w5: times 4 dw W7sh2, -W5sh2
-row_round: times 8 dw (1<<14)
-
+pw_88: times 8 dw 0x2008
+cextern pw_1
cextern pw_4
-cextern pw_8
-cextern pw_512
cextern pw_1019
+; The constants below are defined in simple_idct10.asm, which is built when idctdsp is selected
+cextern w4_plus_w2
+cextern w4_min_w2
+cextern w4_plus_w6
+cextern w4_min_w6
+cextern w1_plus_w3
+cextern w3_min_w1
+cextern w7_plus_w3
+cextern w3_min_w7
+cextern w1_plus_w5
+cextern w5_min_w1
+cextern w5_plus_w7
+cextern w7_min_w5
+
+%include "libavcodec/x86/simple_idct10_template.asm"
SECTION .text
-; interleave data while maintaining source
-; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
-%macro SBUTTERFLY3 5
- punpckl%1 m%2, m%4, m%5
- punpckh%1 m%3, m%4, m%5
-%endmacro
-
-; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
-; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
-; %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
-%macro SUMSUB_SHPK 7
- psubd %3, %1, %5 ; { a0 - b0 }[0-3]
- psubd %4, %2, %6 ; { a0 - b0 }[4-7]
- paddd %1, %5 ; { a0 + b0 }[0-3]
- paddd %2, %6 ; { a0 + b0 }[4-7]
- psrad %1, %7
- psrad %2, %7
- psrad %3, %7
- psrad %4, %7
- packssdw %1, %2 ; row[0]
- packssdw %3, %4 ; row[7]
-%endmacro
-
-; %1 = row or col (for rounding variable)
-; %2 = number of bits to shift at the end
-%macro IDCT_1D 2
- ; a0 = (W4 * row[0]) + (1 << (15 - 1));
- ; a1 = a0;
- ; a2 = a0;
- ; a3 = a0;
- ; a0 += W2 * row[2];
- ; a1 += W6 * row[2];
- ; a2 -= W6 * row[2];
- ; a3 -= W2 * row[2];
-%ifidn %1, col
- paddw m10,[pw_8]
-%endif
- SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7]
-%ifidn %1, row
- psubw m10,[row_round]
-%endif
- SIGNEXTEND m8, m9, m14 ; { row[2] }[0-3] / [4-7]
- SIGNEXTEND m10, m11, m14 ; { row[0] }[0-3] / [4-7]
- pmaddwd m2, m0, [w4_plus_w6]
- pmaddwd m3, m1, [w4_plus_w6]
- pmaddwd m4, m0, [w4_min_w6]
- pmaddwd m5, m1, [w4_min_w6]
- pmaddwd m6, m0, [w4_min_w2]
- pmaddwd m7, m1, [w4_min_w2]
- pmaddwd m0, [w4_plus_w2]
- pmaddwd m1, [w4_plus_w2]
- pslld m2, 2
- pslld m3, 2
- pslld m4, 2
- pslld m5, 2
- pslld m6, 2
- pslld m7, 2
- pslld m0, 2
- pslld m1, 2
-
- ; a0: -1*row[0]-1*row[2]
- ; a1: -1*row[0]
- ; a2: -1*row[0]
- ; a3: -1*row[0]+1*row[2]
- psubd m2, m10 ; a1[0-3]
- psubd m3, m11 ; a1[4-7]
- psubd m4, m10 ; a2[0-3]
- psubd m5, m11 ; a2[4-7]
- psubd m0, m10
- psubd m1, m11
- psubd m6, m10
- psubd m7, m11
- psubd m0, m8 ; a0[0-3]
- psubd m1, m9 ; a0[4-7]
- paddd m6, m8 ; a3[0-3]
- paddd m7, m9 ; a3[4-7]
-
- ; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4]
- ; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
- ; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
- ; a3 += W4*row[4] - W6*row[6]; i.e. -1*row[4]
- SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
- SIGNEXTEND m13, m14, m10 ; { row[4] }[0-3] / [4-7]
- pmaddwd m10, m8, [w4_plus_w6]
- pmaddwd m11, m9, [w4_plus_w6]
- pslld m10, 2
- pslld m11, 2
- psubd m10, m13
- psubd m11, m14
- paddd m0, m10 ; a0[0-3]
- paddd m1, m11 ; a0[4-7]
- pmaddwd m10, m8, [w4_min_w6]
- pmaddwd m11, m9, [w4_min_w6]
- pslld m10, 2
- pslld m11, 2
- psubd m10, m13
- psubd m11, m14
- paddd m6, m10 ; a3[0-3]
- paddd m7, m11 ; a3[4-7]
- pmaddwd m10, m8, [w4_min_w2]
- pmaddwd m11, m9, [w4_min_w2]
- pmaddwd m8, [w4_plus_w2]
- pmaddwd m9, [w4_plus_w2]
- pslld m10, 2
- pslld m11, 2
- pslld m8, 2
- pslld m9, 2
- psubd m10, m13
- psubd m11, m14
- psubd m8, m13
- psubd m9, m14
- psubd m4, m10 ; a2[0-3] intermediate
- psubd m5, m11 ; a2[4-7] intermediate
- psubd m2, m8 ; a1[0-3] intermediate
- psubd m3, m9 ; a1[4-7] intermediate
- SIGNEXTEND m12, m13, m10 ; { row[6] }[0-3] / [4-7]
- psubd m4, m12 ; a2[0-3]
- psubd m5, m13 ; a2[4-7]
- paddd m2, m12 ; a1[0-3]
- paddd m3, m13 ; a1[4-7]
-
- ; load/store
- mova [r2+ 0], m0
- mova [r2+ 32], m2
- mova [r2+ 64], m4
- mova [r2+ 96], m6
- mova m10,[r2+ 16] ; { row[1] }[0-7]
- mova m8, [r2+ 48] ; { row[3] }[0-7]
- mova m13,[r2+ 80] ; { row[5] }[0-7]
- mova m14,[r2+112] ; { row[7] }[0-7]
- mova [r2+ 16], m1
- mova [r2+ 48], m3
- mova [r2+ 80], m5
- mova [r2+112], m7
-%ifidn %1, row
- pmullw m10,[r3+ 16]
- pmullw m8, [r3+ 48]
- pmullw m13,[r3+ 80]
- pmullw m14,[r3+112]
-%endif
-
- ; b0 = MUL(W1, row[1]);
- ; MAC(b0, W3, row[3]);
- ; b1 = MUL(W3, row[1]);
- ; MAC(b1, -W7, row[3]);
- ; b2 = MUL(W5, row[1]);
- ; MAC(b2, -W1, row[3]);
- ; b3 = MUL(W7, row[1]);
- ; MAC(b3, -W5, row[3]);
- SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7]
- SIGNEXTEND m10, m11, m12 ; { row[1] }[0-3] / [4-7]
- SIGNEXTEND m8, m9, m12 ; { row[3] }[0-3] / [4-7]
- pmaddwd m2, m0, [w3_min_w7]
- pmaddwd m3, m1, [w3_min_w7]
- pmaddwd m4, m0, [w5_min_w1]
- pmaddwd m5, m1, [w5_min_w1]
- pmaddwd m6, m0, [w7_min_w5]
- pmaddwd m7, m1, [w7_min_w5]
- pmaddwd m0, [w1_plus_w3]
- pmaddwd m1, [w1_plus_w3]
- pslld m2, 2
- pslld m3, 2
- pslld m4, 2
- pslld m5, 2
- pslld m6, 2
- pslld m7, 2
- pslld m0, 2
- pslld m1, 2
-
- ; b0: +1*row[1]+2*row[3]
- ; b1: +2*row[1]-1*row[3]
- ; b2: -1*row[1]-1*row[3]
- ; b3: +1*row[1]+1*row[3]
- psubd m2, m8
- psubd m3, m9
- paddd m0, m8
- paddd m1, m9
- paddd m8, m10 ; { row[1] + row[3] }[0-3]
- paddd m9, m11 ; { row[1] + row[3] }[4-7]
- paddd m10, m10
- paddd m11, m11
- paddd m0, m8 ; b0[0-3]
- paddd m1, m9 ; b0[4-7]
- paddd m2, m10 ; b1[0-3]
- paddd m3, m11 ; b2[4-7]
- psubd m4, m8 ; b2[0-3]
- psubd m5, m9 ; b2[4-7]
- paddd m6, m8 ; b3[0-3]
- paddd m7, m9 ; b3[4-7]
-
- ; MAC(b0, W5, row[5]);
- ; MAC(b0, W7, row[7]);
- ; MAC(b1, -W1, row[5]);
- ; MAC(b1, -W5, row[7]);
- ; MAC(b2, W7, row[5]);
- ; MAC(b2, W3, row[7]);
- ; MAC(b3, W3, row[5]);
- ; MAC(b3, -W1, row[7]);
- SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]
- SIGNEXTEND m13, m12, m11 ; { row[5] }[0-3] / [4-7]
- SIGNEXTEND m14, m11, m10 ; { row[7] }[0-3] / [4-7]
-
- ; b0: -1*row[5]+1*row[7]
- ; b1: -1*row[5]+1*row[7]
- ; b2: +1*row[5]+2*row[7]
- ; b3: +2*row[5]-1*row[7]
- paddd m4, m13
- paddd m5, m12
- paddd m6, m13
- paddd m7, m12
- psubd m13, m14 ; { row[5] - row[7] }[0-3]
- psubd m12, m11 ; { row[5] - row[7] }[4-7]
- paddd m14, m14
- paddd m11, m11
- psubd m0, m13
- psubd m1, m12
- psubd m2, m13
- psubd m3, m12
- paddd m4, m14
- paddd m5, m11
- paddd m6, m13
- paddd m7, m12
-
- pmaddwd m10, m8, [w1_plus_w5]
- pmaddwd m11, m9, [w1_plus_w5]
- pmaddwd m12, m8, [w5_plus_w7]
- pmaddwd m13, m9, [w5_plus_w7]
- pslld m10, 2
- pslld m11, 2
- pslld m12, 2
- pslld m13, 2
- psubd m2, m10 ; b1[0-3]
- psubd m3, m11 ; b1[4-7]
- paddd m0, m12 ; b0[0-3]
- paddd m1, m13 ; b0[4-7]
- pmaddwd m12, m8, [w7_plus_w3]
- pmaddwd m13, m9, [w7_plus_w3]
- pmaddwd m8, [w3_min_w1]
- pmaddwd m9, [w3_min_w1]
- pslld m12, 2
- pslld m13, 2
- pslld m8, 2
- pslld m9, 2
- paddd m4, m12 ; b2[0-3]
- paddd m5, m13 ; b2[4-7]
- paddd m6, m8 ; b3[0-3]
- paddd m7, m9 ; b3[4-7]
-
- ; row[0] = (a0 + b0) >> 15;
- ; row[7] = (a0 - b0) >> 15;
- ; row[1] = (a1 + b1) >> 15;
- ; row[6] = (a1 - b1) >> 15;
- ; row[2] = (a2 + b2) >> 15;
- ; row[5] = (a2 - b2) >> 15;
- ; row[3] = (a3 + b3) >> 15;
- ; row[4] = (a3 - b3) >> 15;
- mova m8, [r2+ 0] ; a0[0-3]
- mova m9, [r2+16] ; a0[4-7]
- SUMSUB_SHPK m8, m9, m10, m11, m0, m1, %2
- mova m0, [r2+32] ; a1[0-3]
- mova m1, [r2+48] ; a1[4-7]
- SUMSUB_SHPK m0, m1, m9, m11, m2, m3, %2
- mova m1, [r2+64] ; a2[0-3]
- mova m2, [r2+80] ; a2[4-7]
- SUMSUB_SHPK m1, m2, m11, m3, m4, m5, %2
- mova m2, [r2+96] ; a3[0-3]
- mova m3, [r2+112] ; a3[4-7]
- SUMSUB_SHPK m2, m3, m4, m5, m6, m7, %2
-%endmacro
-
-; void ff_prores_idct_put_10_<opt>(uint8_t *pixels, ptrdiff_t linesize,
-; int16_t *block, const int16_t *qmat);
-%macro idct_put_fn 1
-cglobal prores_idct_put_10, 4, 4, %1
- pxor m15, m15 ; zero
-
- ; for (i = 0; i < 8; i++)
- ; idctRowCondDC(block + i*8);
- mova m10,[r2+ 0] ; { row[0] }[0-7]
- mova m8, [r2+32] ; { row[2] }[0-7]
- mova m13,[r2+64] ; { row[4] }[0-7]
- mova m12,[r2+96] ; { row[6] }[0-7]
-
- pmullw m10,[r3+ 0]
- pmullw m8, [r3+32]
- pmullw m13,[r3+64]
- pmullw m12,[r3+96]
-
- IDCT_1D row, 17
-
- ; transpose for second part of IDCT
- TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
- mova [r2+ 16], m0
- mova [r2+ 48], m2
- mova [r2+ 80], m11
- mova [r2+112], m10
- SWAP 8, 10
- SWAP 1, 8
- SWAP 4, 13
- SWAP 9, 12
-
- ; for (i = 0; i < 8; i++)
- ; idctSparseColAdd(dest + i, line_size, block + i);
- IDCT_1D col, 20
-
- ; clip/store
- mova m6, [pw_512]
- mova m3, [pw_4]
- mova m5, [pw_1019]
- paddw m8, m6
- paddw m0, m6
- paddw m1, m6
- paddw m2, m6
- paddw m4, m6
- paddw m11, m6
- paddw m9, m6
- paddw m10, m6
- pmaxsw m8, m3
- pmaxsw m0, m3
- pmaxsw m1, m3
- pmaxsw m2, m3
- pmaxsw m4, m3
- pmaxsw m11, m3
- pmaxsw m9, m3
- pmaxsw m10, m3
- pminsw m8, m5
- pminsw m0, m5
- pminsw m1, m5
- pminsw m2, m5
- pminsw m4, m5
- pminsw m11, m5
- pminsw m9, m5
- pminsw m10, m5
-
- lea r2, [r1*3]
- mova [r0 ], m8
- mova [r0+r1 ], m0
- mova [r0+r1*2], m1
- mova [r0+r2 ], m2
- lea r0, [r0+r1*4]
- mova [r0 ], m4
- mova [r0+r1 ], m11
- mova [r0+r1*2], m9
- mova [r0+r2 ], m10
+%macro idct_fn 0
+cglobal prores_idct_put_10, 4, 4, 15
+ IDCT_FN pw_1, 15, pw_88, 18, pw_4, pw_1019, r3
RET
%endmacro
-%macro SIGNEXTEND 2-3
-%if cpuflag(sse4) ; dstlow, dsthigh
- movhlps %2, %1
- pmovsxwd %1, %1
- pmovsxwd %2, %2
-%elif cpuflag(sse2) ; dstlow, dsthigh, tmp
- pxor %3, %3
- pcmpgtw %3, %1
- mova %2, %1
- punpcklwd %1, %3
- punpckhwd %2, %3
-%endif
-%endmacro
-
INIT_XMM sse2
-idct_put_fn 16
-INIT_XMM sse4
-idct_put_fn 16
+idct_fn
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
-idct_put_fn 16
+idct_fn
+%endif
%endif
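
The bulk of this proresdsp.asm change is deletion: the open-coded 10-bit IDCT is replaced by the shared IDCT_FN macro from simple_idct10_template.asm (added later in this patch), parameterized with the ProRes-specific clip constants pw_4 and pw_1019. What stays ProRes-specific is essentially the final clamp of each output sample; a sketch of that one step (prores_clip10 is an illustrative helper, not an FFmpeg symbol):

#include <stdint.h>

/* Clamp one reconstructed sample to [4, 1019], the range the removed
 * pmaxsw/pminsw pair (pw_4 / pw_1019) enforced for 10-bit ProRes. */
static inline uint16_t prores_clip10(int v)
{
    if (v <    4) return    4;
    if (v > 1019) return 1019;
    return (uint16_t)v;
}
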
diff --git a/libavcodec/x86/proresdsp_init.c b/libavcodec/x86/proresdsp_init.c
index ff4d39836b..8ca4d4d9b3 100644
--- a/libavcodec/x86/proresdsp_init.c
+++ b/libavcodec/x86/proresdsp_init.c
@@ -3,20 +3,20 @@
*
* Copyright (c) 2010-2011 Maxim Poliakovski
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -27,12 +27,10 @@
void ff_prores_idct_put_10_sse2(uint16_t *dst, ptrdiff_t linesize,
int16_t *block, const int16_t *qmat);
-void ff_prores_idct_put_10_sse4(uint16_t *dst, ptrdiff_t linesize,
- int16_t *block, const int16_t *qmat);
void ff_prores_idct_put_10_avx (uint16_t *dst, ptrdiff_t linesize,
int16_t *block, const int16_t *qmat);
-av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp)
+av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp, AVCodecContext *avctx)
{
#if ARCH_X86_64
int cpu_flags = av_get_cpu_flags();
@@ -42,11 +40,6 @@ av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp)
dsp->idct_put = ff_prores_idct_put_10_sse2;
}
- if (EXTERNAL_SSE4(cpu_flags)) {
- dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE;
- dsp->idct_put = ff_prores_idct_put_10_sse4;
- }
-
if (EXTERNAL_AVX(cpu_flags)) {
dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE;
dsp->idct_put = ff_prores_idct_put_10_avx;
diff --git a/libavcodec/x86/qpel.asm b/libavcodec/x86/qpel.asm
index 27a1c63b8a..4e72d5084f 100644
--- a/libavcodec/x86/qpel.asm
+++ b/libavcodec/x86/qpel.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/qpeldsp.asm b/libavcodec/x86/qpeldsp.asm
index ef5f1d8826..282faed14f 100644
--- a/libavcodec/x86/qpeldsp.asm
+++ b/libavcodec/x86/qpeldsp.asm
@@ -1,22 +1,23 @@
;******************************************************************************
-;* quarterpel DSP functions
-;*
+;* mpeg4 qpel
+;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2013 Daniel Kang
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c
index cdefe50a3c..3268d907ab 100644
--- a/libavcodec/x86/qpeldsp_init.c
+++ b/libavcodec/x86/qpeldsp_init.c
@@ -1,20 +1,22 @@
/*
* quarterpel DSP functions
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -77,13 +79,13 @@ void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst,
const uint8_t *src,
int dstStride, int srcStride);
-#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
-#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
+#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmx
+#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmx
#if HAVE_YASM
-CALL_2X_PIXELS(ff_avg_pixels16_mmxext, ff_avg_pixels8_mmxext, 8)
-CALL_2X_PIXELS(ff_put_pixels16_mmxext, ff_put_pixels8_mmxext, 8)
+#define ff_put_pixels16_mmxext ff_put_pixels16_mmx
+#define ff_put_pixels8_mmxext ff_put_pixels8_mmx
#define QPEL_OP(OPNAME, RND, MMX) \
static void OPNAME ## qpel8_mc00_ ## MMX(uint8_t *dst, \
diff --git a/libavcodec/x86/rnd_template.c b/libavcodec/x86/rnd_template.c
index 0c76d91647..09946bd23f 100644
--- a/libavcodec/x86/rnd_template.c
+++ b/libavcodec/x86/rnd_template.c
@@ -7,20 +7,20 @@
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
* and improved by Zdenek Kabelac <kabi@users.sf.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -30,7 +30,7 @@
#include "inline_asm.h"
// put_pixels
-STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
+av_unused STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
MOVQ_ZERO(mm7);
@@ -46,12 +46,12 @@ STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
"punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t"
- "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
+ "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
"add %3, %1 \n\t"
".p2align 3 \n\t"
"1: \n\t"
- "movq (%1, %%"FF_REG_a"), %%mm0\n\t"
- "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
+ "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
@@ -67,11 +67,11 @@ STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
"psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm5 \n\t"
"packuswb %%mm5, %%mm4 \n\t"
- "movq %%mm4, (%2, %%"FF_REG_a") \n\t"
- "add %3, %%"FF_REG_a" \n\t"
+ "movq %%mm4, (%2, %%"FF_REG_a") \n\t"
+ "add %3, %%"FF_REG_a" \n\t"
- "movq (%1, %%"FF_REG_a"), %%mm2\n\t" // 0 <-> 2 1 <-> 3
- "movq 1(%1, %%"FF_REG_a"), %%mm4\n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
+ "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
@@ -87,8 +87,8 @@ STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
- "movq %%mm0, (%2, %%"FF_REG_a") \n\t"
- "add %3, %%"FF_REG_a" \n\t"
+ "movq %%mm0, (%2, %%"FF_REG_a") \n\t"
+ "add %3, %%"FF_REG_a" \n\t"
"subl $2, %0 \n\t"
"jnz 1b \n\t"
@@ -99,7 +99,7 @@ STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
// avg_pixels
// this routine is 'slightly' suboptimal but mostly unused
-STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
+av_unused STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
MOVQ_ZERO(mm7);
@@ -115,12 +115,12 @@ STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
"punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t"
- "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
+ "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
"add %3, %1 \n\t"
".p2align 3 \n\t"
"1: \n\t"
- "movq (%1, %%"FF_REG_a"), %%mm0\n\t"
- "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
+ "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
@@ -135,16 +135,16 @@ STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
"paddusw %%mm1, %%mm5 \n\t"
"psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm5 \n\t"
- "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
+ "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
"packuswb %%mm5, %%mm4 \n\t"
"pcmpeqd %%mm2, %%mm2 \n\t"
"paddb %%mm2, %%mm2 \n\t"
PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2)
- "movq %%mm5, (%2, %%"FF_REG_a") \n\t"
- "add %3, %%"FF_REG_a" \n\t"
+ "movq %%mm5, (%2, %%"FF_REG_a") \n\t"
+ "add %3, %%"FF_REG_a" \n\t"
- "movq (%1, %%"FF_REG_a"), %%mm2\n\t" // 0 <-> 2 1 <-> 3
- "movq 1(%1, %%"FF_REG_a"), %%mm4\n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
+ "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
@@ -159,13 +159,13 @@ STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
"paddusw %%mm5, %%mm1 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm1 \n\t"
- "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
+ "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"pcmpeqd %%mm2, %%mm2 \n\t"
"paddb %%mm2, %%mm2 \n\t"
PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2)
- "movq %%mm1, (%2, %%"FF_REG_a") \n\t"
- "add %3, %%"FF_REG_a" \n\t"
+ "movq %%mm1, (%2, %%"FF_REG_a") \n\t"
+ "add %3, %%"FF_REG_a" \n\t"
"subl $2, %0 \n\t"
"jnz 1b \n\t"
diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm
index 4d9c35b600..692b4acfcd 100644
--- a/libavcodec/x86/rv34dsp.asm
+++ b/libavcodec/x86/rv34dsp.asm
@@ -2,20 +2,20 @@
;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -64,6 +64,7 @@ rv34_idct dc
rv34_idct dc_noround
; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
+%if ARCH_X86_32
INIT_MMX mmx
cglobal rv34_idct_dc_add, 3, 3
; calculate DC
@@ -97,6 +98,7 @@ cglobal rv34_idct_dc_add, 3, 3
movh [r2], m4
movh [r2+r1], m5
RET
+%endif
; Load coeffs and perform row transform
; Output: coeffs in mm[0467], rounder in mm5
@@ -167,7 +169,7 @@ cglobal rv34_idct_add, 3,3,0, d, s, b
ret
; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
-INIT_XMM sse4
+%macro RV34_IDCT_DC_ADD 0
cglobal rv34_idct_dc_add, 3, 3, 6
; load data
IDCT_DC_ROUND r2
@@ -190,7 +192,22 @@ cglobal rv34_idct_dc_add, 3, 3, 6
paddw m4, m0
packuswb m2, m4
movd [r0], m2
+%if cpuflag(sse4)
pextrd [r0+r1], m2, 1
pextrd [r2], m2, 2
pextrd [r2+r1], m2, 3
+%else
+ psrldq m2, 4
+ movd [r0+r1], m2
+ psrldq m2, 4
+ movd [r2], m2
+ psrldq m2, 4
+ movd [r2+r1], m2
+%endif
RET
+%endmacro
+
+INIT_XMM sse2
+RV34_IDCT_DC_ADD
+INIT_XMM sse4
+RV34_IDCT_DC_ADD
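
This rv34dsp.asm hunk turns the SSE4-only rv34_idct_dc_add into a macro instantiated for both SSE2 and SSE4: the only SSE4-specific instruction was pextrd, which the SSE2 path replaces with psrldq/movd pairs, while the old MMX version is now built for x86_32 only. In scalar terms the function adds a pre-scaled DC value to a 4x4 block with saturation; the DC scaling itself is done by the IDCT_DC_ROUND macro and is not reproduced in this sketch (rv34_idct_dc_add_c and clip_uint8 are illustrative names):

#include <stddef.h>
#include <stdint.h>

static inline uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Add the DC term to every pixel of a 4x4 block, clamping to 8 bits. */
static void rv34_idct_dc_add_c(uint8_t *dst, ptrdiff_t stride, int dc)
{
    for (int y = 0; y < 4; y++) {
        for (int x = 0; x < 4; x++)
            dst[x] = clip_uint8(dst[x] + dc);
        dst += stride;
    }
}
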
diff --git a/libavcodec/x86/rv34dsp_init.c b/libavcodec/x86/rv34dsp_init.c
index 32d4c1aac2..7310122458 100644
--- a/libavcodec/x86/rv34dsp_init.c
+++ b/libavcodec/x86/rv34dsp_init.c
@@ -2,20 +2,20 @@
* RV30/40 MMX/SSE2 optimizations
* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -27,6 +27,7 @@
void ff_rv34_idct_dc_mmxext(int16_t *block);
void ff_rv34_idct_dc_noround_mmxext(int16_t *block);
void ff_rv34_idct_dc_add_mmx(uint8_t *dst, ptrdiff_t stride, int dc);
+void ff_rv34_idct_dc_add_sse2(uint8_t *dst, ptrdiff_t stride, int dc);
void ff_rv34_idct_dc_add_sse4(uint8_t *dst, ptrdiff_t stride, int dc);
void ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
@@ -34,12 +35,14 @@ av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c)
{
int cpu_flags = av_get_cpu_flags();
- if (EXTERNAL_MMX(cpu_flags))
+ if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags))
c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmxext;
c->rv34_idct_add = ff_rv34_idct_add_mmxext;
}
+ if (EXTERNAL_SSE2(cpu_flags))
+ c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse2;
if (EXTERNAL_SSE4(cpu_flags))
c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4;
}
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm
index 77f6ddb25d..d0c3af0f8d 100644
--- a/libavcodec/x86/rv40dsp.asm
+++ b/libavcodec/x86/rv40dsp.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index f6d4165452..340173d063 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -2,20 +2,20 @@
* RV40 decoder motion compensation functions x86-optimised
* Copyright (c) 2008 Konstantin Shishkov
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -32,6 +32,13 @@
#include "libavutil/x86/cpu.h"
#include "hpeldsp.h"
+#define DEFINE_FN(op, size, insn) \
+static void op##_rv40_qpel##size##_mc33_##insn(uint8_t *dst, const uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ ff_##op##_pixels##size##_xy2_##insn(dst, src, stride, size); \
+}
+
#if HAVE_YASM
void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
@@ -75,7 +82,7 @@ static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst, \
{ \
int i; \
if (PH && PV) { \
- DECLARE_ALIGNED(16, uint8_t, tmp)[SIZE * (SIZE + 5)]; \
+ LOCAL_ALIGNED(16, uint8_t, tmp, [SIZE * (SIZE + 5)]); \
uint8_t *tmpptr = tmp + SIZE * 2; \
src -= stride * 2; \
\
@@ -127,8 +134,8 @@ QPEL_FUNCS_DECL(OP, 3, 2, OPT)
/** @} */
#define LOOPSIZE 8
-#define HCOFF(x) (32 * (x - 1))
-#define VCOFF(x) (32 * (x - 1))
+#define HCOFF(x) (32 * ((x) - 1))
+#define VCOFF(x) (32 * ((x) - 1))
QPEL_MC_DECL(put_, _ssse3)
QPEL_MC_DECL(avg_, _ssse3)
@@ -136,8 +143,8 @@ QPEL_MC_DECL(avg_, _ssse3)
#undef HCOFF
#undef VCOFF
#define LOOPSIZE 8
-#define HCOFF(x) (64 * (x - 1))
-#define VCOFF(x) (64 * (x - 1))
+#define HCOFF(x) (64 * ((x) - 1))
+#define VCOFF(x) (64 * ((x) - 1))
QPEL_MC_DECL(put_, _sse2)
QPEL_MC_DECL(avg_, _sse2)
@@ -146,8 +153,8 @@ QPEL_MC_DECL(avg_, _sse2)
#undef HCOFF
#undef VCOFF
#define LOOPSIZE 4
-#define HCOFF(x) (64 * (x - 1))
-#define VCOFF(x) (64 * (x - 1))
+#define HCOFF(x) (64 * ((x) - 1))
+#define VCOFF(x) (64 * ((x) - 1))
QPEL_MC_DECL(put_, _mmx)
@@ -186,34 +193,28 @@ QPEL_FUNCS_SET (OP, 3, 1, OPT) \
QPEL_FUNCS_SET (OP, 3, 2, OPT)
/** @} */
+DEFINE_FN(put, 8, ssse3)
+
+DEFINE_FN(put, 16, sse2)
+DEFINE_FN(put, 16, ssse3)
+
+DEFINE_FN(avg, 8, mmxext)
+DEFINE_FN(avg, 8, ssse3)
+
+DEFINE_FN(avg, 16, sse2)
+DEFINE_FN(avg, 16, ssse3)
#endif /* HAVE_YASM */
#if HAVE_MMX_INLINE
-static void put_rv40_qpel8_mc33_mmx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride)
-{
- ff_put_pixels8_xy2_mmx(dst, src, stride, 8);
-}
-static void put_rv40_qpel16_mc33_mmx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride)
-{
- ff_put_pixels16_xy2_mmx(dst, src, stride, 16);
-}
-static void avg_rv40_qpel8_mc33_mmx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride)
-{
- ff_avg_pixels8_xy2_mmx(dst, src, stride, 8);
-}
-static void avg_rv40_qpel16_mc33_mmx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride)
-{
- ff_avg_pixels16_xy2_mmx(dst, src, stride, 16);
-}
-#endif /* HAVE_MMX_INLINE */
+DEFINE_FN(put, 8, mmx)
+DEFINE_FN(avg, 8, mmx)
+DEFINE_FN(put, 16, mmx)
+DEFINE_FN(avg, 16, mmx)
+#endif
av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
{
- int cpu_flags = av_get_cpu_flags();
+ av_unused int cpu_flags = av_get_cpu_flags();
#if HAVE_MMX_INLINE
if (INLINE_MMX(cpu_flags)) {
@@ -240,6 +241,7 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
#endif
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
+ c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_mmxext;
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmxext;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmxext;
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext;
@@ -251,6 +253,8 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
#endif
}
if (EXTERNAL_SSE2(cpu_flags)) {
+ c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_sse2;
+ c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_sse2;
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
@@ -259,6 +263,10 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
QPEL_MC_SET(avg_, _sse2)
}
if (EXTERNAL_SSSE3(cpu_flags)) {
+ c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_ssse3;
+ c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_ssse3;
+ c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_ssse3;
+ c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_ssse3;
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
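
The DEFINE_FN macro introduced at the top of rv40dsp_init.c regenerates the hand-written mc33 wrappers that this hunk deletes; for example, DEFINE_FN(put, 8, mmx) expands (modulo whitespace) to the same function the patch removes:

/* Expansion of DEFINE_FN(put, 8, mmx); ff_put_pixels8_xy2_mmx is an
 * existing hpeldsp entry point declared via this file's includes. */
static void put_rv40_qpel8_mc33_mmx(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride)
{
    ff_put_pixels8_xy2_mmx(dst, src, stride, 8);
}
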
diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
index b449de5f9a..07a412b2ae 100644
--- a/libavcodec/x86/sbrdsp.asm
+++ b/libavcodec/x86/sbrdsp.asm
@@ -2,20 +2,20 @@
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -25,7 +25,14 @@ SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
ps_mask times 2 dd 1<<31, 0
ps_mask2 times 2 dd 0, 1<<31
-ps_neg times 4 dd 1<<31
+ps_mask3 dd 0, 0, 0, 1<<31
+ps_noise0 times 2 dd 1.0, 0.0
+ps_noise2 times 2 dd -1.0, 0.0
+ps_noise13 dd 0.0, 1.0, 0.0, -1.0
+ dd 0.0, -1.0, 0.0, 1.0
+ dd 0.0, 1.0, 0.0, -1.0
+cextern sbr_noise_table
+cextern ps_neg
SECTION .text
@@ -136,7 +143,6 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1)
mova m3, m1
mova m4, m2
- mova m7, [ps_mask]
; Set pointers
%if ARCH_X86_64 == 0 || WIN64
@@ -156,30 +162,28 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
shl start, 3 ; offset from num loops
mova m0, [X_lowq + start]
- movlhps m1, m1 ; (a2 a3 a2 a3)
- movlhps m2, m2 ; (a0 a1 a0 a1)
- shufps m3, m3, q0101 ; (a3 a2 a3 a2)
- shufps m4, m4, q0101 ; (a1 a0 a1 a0)
- xorps m3, m7 ; (-a3 a2 -a3 a2)
- xorps m4, m7 ; (-a1 a0 -a1 a0)
+ shufps m3, m3, q1111
+ shufps m4, m4, q1111
+ xorps m3, [ps_mask]
+ shufps m1, m1, q0000
+ shufps m2, m2, q0000
+ xorps m4, [ps_mask]
.loop2:
- mova m5, m0
+ movu m7, [X_lowq + start + 8] ; BbCc
mova m6, m0
- shufps m0, m0, q2200 ; {Xl[-2][0],",Xl[-1][0],"}
- shufps m5, m5, q3311 ; {Xl[-2][1],",Xl[-1][1],"}
- mulps m0, m2
- mulps m5, m4
- mova m7, m6
- addps m5, m0
- mova m0, [X_lowq + start + 2*2*4]
- shufps m6, m0, q0022 ; {Xl[-1][0],",Xl[0][0],"}
- shufps m7, m0, q1133 ; {Xl[-1][1],",Xl[1][1],"}
- mulps m6, m1
+ mova m5, m7
+ shufps m0, m0, q2301 ; aAbB
+ shufps m7, m7, q2301 ; bBcC
+ mulps m0, m4
mulps m7, m3
- addps m5, m6
+ mulps m6, m2
+ mulps m5, m1
+ addps m7, m0
+ mova m0, [X_lowq + start +16] ; CcDd
addps m7, m0
- addps m5, m7
- mova [X_highq + start], m5
+ addps m6, m5
+ addps m7, m6
+ mova [X_highq + start], m7
add start, 16
jnz .loop2
RET
@@ -246,33 +250,47 @@ cglobal sbr_neg_odd_64, 1,2,4,z
jne .loop
REP_RET
-INIT_XMM sse2
; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
+%macro SBR_QMF_DEINT_BFLY 0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
mov cq, 64*4-2*mmsize
lea vrevq, [vq + 64*4]
.loop:
mova m0, [src0q+cq]
mova m1, [src1q]
- mova m2, [src0q+cq+mmsize]
- mova m3, [src1q+mmsize]
- pshufd m4, m0, q0123
- pshufd m5, m1, q0123
- pshufd m6, m2, q0123
- pshufd m7, m3, q0123
- addps m3, m4
+ mova m4, [src0q+cq+mmsize]
+ mova m5, [src1q+mmsize]
+%if cpuflag(sse2)
+ pshufd m2, m0, q0123
+ pshufd m3, m1, q0123
+ pshufd m6, m4, q0123
+ pshufd m7, m5, q0123
+%else
+ shufps m2, m0, m0, q0123
+ shufps m3, m1, m1, q0123
+ shufps m6, m4, m4, q0123
+ shufps m7, m5, m5, q0123
+%endif
+ addps m5, m2
subps m0, m7
addps m1, m6
- subps m2, m5
+ subps m4, m3
mova [vrevq], m1
- mova [vrevq+mmsize], m3
+ mova [vrevq+mmsize], m5
mova [vq+cq], m0
- mova [vq+cq+mmsize], m2
+ mova [vq+cq+mmsize], m4
add src1q, 2*mmsize
add vrevq, 2*mmsize
sub cq, 2*mmsize
jge .loop
REP_RET
+%endmacro
+
+INIT_XMM sse
+SBR_QMF_DEINT_BFLY
+
+INIT_XMM sse2
+SBR_QMF_DEINT_BFLY
INIT_XMM sse2
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
@@ -303,3 +321,228 @@ cglobal sbr_qmf_pre_shuffle, 1,4,6,z
movq m2, [zq]
movq [r2q], m2
REP_RET
+
+%ifdef PIC
+%define NREGS 1
+%if UNIX64
+%define NOISE_TABLE r6q ; r5q is m_max
+%else
+%define NOISE_TABLE r5q
+%endif
+%else
+%define NREGS 0
+%define NOISE_TABLE sbr_noise_table
+%endif
+
+%macro LOAD_NST 1
+%ifdef PIC
+ lea NOISE_TABLE, [%1]
+ mova m0, [kxq + NOISE_TABLE]
+%else
+ mova m0, [kxq + %1]
+%endif
+%endmacro
+
+INIT_XMM sse2
+; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
+; const float *q_filt, int noise,
+; int kx, int m_max)
+cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+ mova m0, [ps_noise0]
+ jmp apply_noise_main
+
+; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
+; const float *q_filt, int noise,
+; int kx, int m_max)
+cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+ and kxq, 1
+ shl kxq, 4
+ LOAD_NST ps_noise13
+ jmp apply_noise_main
+
+; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
+; const float *q_filt, int noise,
+; int kx, int m_max)
+cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+ mova m0, [ps_noise2]
+ jmp apply_noise_main
+
+; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
+; const float *q_filt, int noise,
+; int kx, int m_max)
+cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+ and kxq, 1
+ shl kxq, 4
+ LOAD_NST ps_noise13+16
+
+apply_noise_main:
+%if ARCH_X86_64 == 0 || WIN64
+ mov kxd, m_maxm
+%define count kxq
+%else
+%define count m_maxq
+%endif
+ movsxdifnidn noiseq, noised
+ dec noiseq
+ shl count, 2
+%ifdef PIC
+ lea NOISE_TABLE, [sbr_noise_table]
+%endif
+ lea Yq, [Yq + 2*count]
+ add s_mq, count
+ add q_filtq, count
+ shl noiseq, 3
+ pxor m5, m5
+ neg count
+.loop:
+ mova m1, [q_filtq + count]
+ movu m3, [noiseq + NOISE_TABLE + 1*mmsize]
+ movu m4, [noiseq + NOISE_TABLE + 2*mmsize]
+ add noiseq, 2*mmsize
+ and noiseq, 0x1ff<<3
+ punpckhdq m2, m1, m1
+ punpckldq m1, m1
+    mulps             m1, m3            ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
+ mulps m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
+ mova m3, [s_mq + count]
+ ; TODO: replace by a vpermd in AVX2
+ punpckhdq m4, m3, m3
+ punpckldq m3, m3
+ pcmpeqd m6, m3, m5 ; m6 == 0
+ pcmpeqd m7, m4, m5 ; m7 == 0
+ mulps m3, m0 ; s_m[m] * phi_sign
+ mulps m4, m0 ; s_m[m] * phi_sign
+ pand m1, m6
+ pand m2, m7
+ movu m6, [Yq + 2*count]
+ movu m7, [Yq + 2*count + mmsize]
+ addps m3, m1
+ addps m4, m2
+ addps m6, m3
+ addps m7, m4
+ movu [Yq + 2*count], m6
+ movu [Yq + 2*count + mmsize], m7
+ add count, mmsize
+ jl .loop
+ RET
+
+INIT_XMM sse
+cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
+%define COUNT 32*4
+%define OFFSET 32*4
+ mov cq, -COUNT
+ lea vrevq, [vq + OFFSET + COUNT]
+ add vq, OFFSET-mmsize
+ add srcq, 2*COUNT
+ mova m3, [ps_neg]
+.loop:
+ mova m0, [srcq + 2*cq + 0*mmsize]
+ mova m1, [srcq + 2*cq + 1*mmsize]
+ shufps m2, m0, m1, q2020
+ shufps m1, m0, q1313
+ xorps m2, m3
+ mova [vq], m1
+ mova [vrevq + cq], m2
+ sub vq, mmsize
+ add cq, mmsize
+ jl .loop
+ REP_RET
+
+%macro SBR_AUTOCORRELATE 0
+cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
+ mov cntq, 37*8
+ add xq, cntq
+ neg cntq
+
+%if cpuflag(sse3)
+%define MOVH movsd
+ movddup m5, [xq+cntq]
+%else
+%define MOVH movlps
+ movlps m5, [xq+cntq]
+ movlhps m5, m5
+%endif
+ MOVH m7, [xq+cntq+8 ]
+ MOVH m1, [xq+cntq+16]
+ shufps m7, m7, q0110
+ shufps m1, m1, q0110
+ mulps m3, m5, m7 ; x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
+ mulps m4, m5, m5 ; x[0][0] * x[0][0], x[0][1] * x[0][1];
+ mulps m5, m1 ; real_sum2 = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0]
+ movaps [rsp ], m3
+ movaps [rsp+16], m4
+ add cntq, 8
+
+ MOVH m2, [xq+cntq+16]
+ movlhps m7, m7
+ shufps m2, m2, q0110
+ mulps m6, m7, m1 ; real_sum1 = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 += x[1][0] * x[2][1], x[1][1] * x[2][0]
+ mulps m4, m7, m2
+ mulps m7, m7 ; real_sum0 = x[1][0] * x[1][0], x[1][1] * x[1][1];
+ addps m5, m4 ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0]
+
+align 16
+.loop:
+ add cntq, 8
+ MOVH m0, [xq+cntq+16]
+ movlhps m1, m1
+ shufps m0, m0, q0110
+ mulps m3, m1, m2
+ mulps m4, m1, m0
+ mulps m1, m1
+ addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
+ addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
+ addps m7, m1 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
+ add cntq, 8
+ MOVH m1, [xq+cntq+16]
+ movlhps m2, m2
+ shufps m1, m1, q0110
+ mulps m3, m2, m0
+ mulps m4, m2, m1
+ mulps m2, m2
+ addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
+ addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
+ addps m7, m2 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
+ add cntq, 8
+ MOVH m2, [xq+cntq+16]
+ movlhps m0, m0
+ shufps m2, m2, q0110
+ mulps m3, m0, m1
+ mulps m4, m0, m2
+ mulps m0, m0
+ addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
+ addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
+ addps m7, m0 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
+ jl .loop
+
+ movlhps m1, m1
+ mulps m2, m1
+ mulps m1, m1
+ addps m2, m6 ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0];
+ addps m1, m7 ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1];
+ addps m6, [rsp ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0];
+ addps m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1];
+
+ xorps m2, [ps_mask3]
+ xorps m5, [ps_mask3]
+ xorps m6, [ps_mask3]
+ HADDPS m2, m5, m3
+ HADDPS m7, m6, m4
+%if cpuflag(sse3)
+ movshdup m0, m1
+%else
+ movss m0, m1
+ shufps m1, m1, q0001
+%endif
+ addss m1, m0
+ movaps [phiq ], m2
+ movhps [phiq+0x18], m7
+ movss [phiq+0x28], m7
+ movss [phiq+0x10], m1
+ RET
+%endmacro
+
+INIT_XMM sse
+SBR_AUTOCORRELATE
+INIT_XMM sse3
+SBR_AUTOCORRELATE
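
The largest addition to sbrdsp.asm is the family of hf_apply_noise kernels (plus qmf_deint_neg and the autocorrelate routines). Below is a scalar model of the noise kernels, reconstructed from the asm above and therefore only a sketch: the function name and the explicit phi0/phi1 and noise_table parameters are illustrative, and the real variants 0-3 differ only in the ps_noise* sign pattern (with variants 1 and 3 also keyed on the parity of kx).

/* The noise contribution is masked out wherever s_m[m] != 0, matching the
 * pcmpeqd/pand select in the SIMD loop; the table index wraps modulo the
 * 512-entry noise table ("and noiseq, 0x1ff<<3" above). */
static void sbr_hf_apply_noise_c(float (*Y)[2], const float *s_m,
                                 const float *q_filt, int noise,
                                 float phi0, float phi1, int m_max,
                                 const float (*noise_table)[2])
{
    for (int m = 0; m < m_max; m++) {
        noise = (noise + 1) & 0x1ff;
        float nr = s_m[m] ? 0.0f : q_filt[m] * noise_table[noise][0];
        float ni = s_m[m] ? 0.0f : q_filt[m] * noise_table[noise][1];
        Y[m][0] += s_m[m] * phi0 + nr;
        Y[m][1] += s_m[m] * phi1 + ni;
    }
}
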
diff --git a/libavcodec/x86/sbrdsp_init.c b/libavcodec/x86/sbrdsp_init.c
index 9600852163..6911a1a515 100644
--- a/libavcodec/x86/sbrdsp_init.c
+++ b/libavcodec/x86/sbrdsp_init.c
@@ -2,20 +2,20 @@
* AAC Spectral Band Replication decoding functions
* Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -34,9 +34,28 @@ void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
float bw, int start, int end);
void ff_sbr_neg_odd_64_sse(float *z);
void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z);
+void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1);
void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1);
void ff_sbr_qmf_pre_shuffle_sse2(float *z);
+void ff_sbr_hf_apply_noise_0_sse2(float (*Y)[2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_1_sse2(float (*Y)[2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_2_sse2(float (*Y)[2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_3_sse2(float (*Y)[2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+
+void ff_sbr_qmf_deint_neg_sse(float *v, const float *src);
+
+void ff_sbr_autocorrelate_sse (const float x[40][2], float phi[3][2][2]);
+void ff_sbr_autocorrelate_sse3(const float x[40][2], float phi[3][2][2]);
+
av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
@@ -48,10 +67,21 @@ av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
s->hf_g_filt = ff_sbr_hf_g_filt_sse;
s->hf_gen = ff_sbr_hf_gen_sse;
s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse;
+ s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse;
+ s->qmf_deint_neg = ff_sbr_qmf_deint_neg_sse;
+ s->autocorrelate = ff_sbr_autocorrelate_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse2;
s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_sse2;
+ s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse2;
+ s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse2;
+ s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse2;
+ s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse2;
+ }
+
+ if (EXTERNAL_SSE3(cpu_flags)) {
+ s->autocorrelate = ff_sbr_autocorrelate_sse3;
}
}
diff --git a/libavcodec/x86/simple_idct.c b/libavcodec/x86/simple_idct.c
index 0939a49a3a..d3a19fa60d 100644
--- a/libavcodec/x86/simple_idct.c
+++ b/libavcodec/x86/simple_idct.c
@@ -3,24 +3,23 @@
*
* Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
@@ -86,7 +85,7 @@ DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
static inline void idct(int16_t *block)
{
- DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
+ LOCAL_ALIGNED_8(int64_t, align_tmp, [16]);
int16_t * const temp= (int16_t*)align_tmp;
__asm__ volatile(
@@ -893,6 +892,7 @@ Temp
"9: \n\t"
:: "r" (block), "r" (temp), "r" (coeffs)
+ NAMED_CONSTRAINTS_ADD(wm1010,d40000)
: "%eax"
);
}
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index 15784a9501..ad76bafd85 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -26,4 +26,16 @@ void ff_simple_idct_mmx(int16_t *block);
void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct10_sse2(int16_t *block);
+void ff_simple_idct10_avx(int16_t *block);
+
+void ff_simple_idct10_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct10_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+
+void ff_simple_idct12_sse2(int16_t *block);
+void ff_simple_idct12_avx(int16_t *block);
+
+void ff_simple_idct12_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct12_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+
#endif /* AVCODEC_X86_SIMPLE_IDCT_H */
diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm
new file mode 100644
index 0000000000..5dee533de0
--- /dev/null
+++ b/libavcodec/x86/simple_idct10.asm
@@ -0,0 +1,100 @@
+;******************************************************************************
+;* x86-SIMD-optimized IDCT for prores
+;* this is identical to "simple" IDCT written by Michael Niedermayer
+;* except for the clip range
+;*
+;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
+;* Copyright (c) 2015 Christophe Gisquet
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA
+
+cextern pw_2
+cextern pw_16
+cextern pw_1023
+cextern pw_4095
+pd_round_12: times 4 dd 1<<(12-1)
+pd_round_15: times 4 dd 1<<(15-1)
+pd_round_19: times 4 dd 1<<(19-1)
+
+%macro CONST_DEC 3
+const %1
+times 4 dw %2, %3
+%endmacro
+
+%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1
+%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1
+%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2
+%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1
+%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1
+%define W6sh2 8867 ; W6 = 35468 = 8867<<2
+%define W7sh2 4520 ; W7 = 18081 = 4520<<2 + 1
+
+CONST_DEC w4_plus_w2, W4sh2, +W2sh2
+CONST_DEC w4_min_w2, W4sh2, -W2sh2
+CONST_DEC w4_plus_w6, W4sh2, +W6sh2
+CONST_DEC w4_min_w6, W4sh2, -W6sh2
+CONST_DEC w1_plus_w3, W1sh2, +W3sh2
+CONST_DEC w3_min_w1, W3sh2, -W1sh2
+CONST_DEC w7_plus_w3, W7sh2, +W3sh2
+CONST_DEC w3_min_w7, W3sh2, -W7sh2
+CONST_DEC w1_plus_w5, W1sh2, +W5sh2
+CONST_DEC w5_min_w1, W5sh2, -W1sh2
+CONST_DEC w5_plus_w7, W5sh2, +W7sh2
+CONST_DEC w7_min_w5, W7sh2, -W5sh2
+
+%include "libavcodec/x86/simple_idct10_template.asm"
+
+SECTION .text
+
+%macro idct_fn 0
+cglobal simple_idct10, 1, 1, 16
+ IDCT_FN "", 12, "", 19
+ RET
+
+cglobal simple_idct10_put, 3, 3, 16
+ IDCT_FN "", 12, "", 19, 0, pw_1023
+ RET
+
+cglobal simple_idct12, 1, 1, 16
+ ; coeffs are already 15bits, adding the offset would cause
+ ; overflow in the input
+ IDCT_FN "", 15, pw_2, 16
+ RET
+
+cglobal simple_idct12_put, 3, 3, 16
+ ; range isn't known, so the C simple_idct range is used
+ ; Also, using a bias on input overflows, so use the bias
+ ; on output of the first butterfly instead
+ IDCT_FN "", 15, pw_2, 16, 0, pw_4095
+ RET
+%endmacro
+
+INIT_XMM sse2
+idct_fn
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+idct_fn
+%endif
+
+%endif
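A note on the constant layout above: CONST_DEC emits each pair of weights interleaved word by word (times 4 dw %2, %3), which matches how SBUTTERFLY3 in the template interleaves two input rows, so a single pmaddwd yields weight0*rowA + weight1*rowB per 32-bit lane. The sketch below is a scalar model of one such lane using the W4sh2/W2sh2 values from the table; it shows only the arithmetic, not the register flow, and the names are taken from the comments above.

    #include <stdint.h>

    /* One pmaddwd lane: two signed 16-bit products summed into 32 bits. */
    static int32_t pmaddwd_lane(int16_t c0, int16_t c1, int16_t x0, int16_t x1)
    {
        return (int32_t)c0 * x0 + (int32_t)c1 * x1;
    }

    /* With the w4_plus_w2 constant this computes W4sh2*row0 + W2sh2*row2.
     * Per the comments above, each stored weight is the full weight shifted
     * right by 2 (the "sh2" suffix) so it fits a signed 16-bit operand. */
    static int32_t a0_partial(int16_t row0, int16_t row2)
    {
        enum { W4sh2 = 16384, W2sh2 = 21407 };
        return pmaddwd_lane(W4sh2, W2sh2, row0, row2);
    }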
diff --git a/libavcodec/x86/simple_idct10_template.asm b/libavcodec/x86/simple_idct10_template.asm
new file mode 100644
index 0000000000..9d323d99b3
--- /dev/null
+++ b/libavcodec/x86/simple_idct10_template.asm
@@ -0,0 +1,315 @@
+;******************************************************************************
+;* x86-SIMD-optimized IDCT for prores
+;* this is identical to the "simple" IDCT written by Michael Niedermayer,
+;* except for the clip range
+;*
+;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+; add SECTION_RODATA and proper include before including this file!
+
+%if ARCH_X86_64
+
+; interleave data while maintaining source
+; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
+%macro SBUTTERFLY3 5
+ punpckl%1 m%2, m%4, m%5
+ punpckh%1 m%3, m%4, m%5
+%endmacro
+
+; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
+; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
+; %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
+%macro SUMSUB_SHPK 7
+ psubd %3, %1, %5 ; { a0 - b0 }[0-3]
+ psubd %4, %2, %6 ; { a0 - b0 }[4-7]
+ paddd %1, %5 ; { a0 + b0 }[0-3]
+ paddd %2, %6 ; { a0 + b0 }[4-7]
+ psrad %1, %7
+ psrad %2, %7
+ psrad %3, %7
+ psrad %4, %7
+ packssdw %1, %2 ; row[0]
+ packssdw %3, %4 ; row[7]
+%endmacro
+
+; %1 = initial bias ("" to apply the pd_round_* dword rounding instead of a word bias)
+; %2 = number of bits to shift at the end
+; %3 = qmat (for prores)
+%macro IDCT_1D 2-3
+ ; a0 = (W4 * row[0]) + (1 << (15 - 1));
+ ; a1 = a0;
+ ; a2 = a0;
+ ; a3 = a0;
+ ; a0 += W2 * row[2];
+ ; a1 += W6 * row[2];
+ ; a2 -= W6 * row[2];
+ ; a3 -= W2 * row[2];
+%ifstr %1
+ mova m15, [pd_round_ %+ %2]
+%else
+ paddw m10, [%1]
+%endif
+ SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7]
+ pmaddwd m2, m0, [w4_plus_w6]
+ pmaddwd m3, m1, [w4_plus_w6]
+ pmaddwd m4, m0, [w4_min_w6]
+ pmaddwd m5, m1, [w4_min_w6]
+ pmaddwd m6, m0, [w4_min_w2]
+ pmaddwd m7, m1, [w4_min_w2]
+ pmaddwd m0, [w4_plus_w2]
+ pmaddwd m1, [w4_plus_w2]
+%ifstr %1
+ ; add the 1<<(%2-1) rounding bias for values of 15 bits or more
+ paddd m2, m15
+ paddd m3, m15
+ paddd m4, m15
+ paddd m5, m15
+ paddd m6, m15
+ paddd m7, m15
+ paddd m0, m15
+ paddd m1, m15
+%endif
+
+ ; a0: -1*row[0]-1*row[2]
+ ; a1: -1*row[0]
+ ; a2: -1*row[0]
+ ; a3: -1*row[0]+1*row[2]
+
+ ; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4]
+ ; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
+ ; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
+ ; a3 += W4*row[4] - W6*row[6]; i.e. -1*row[4]
+ SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
+ pmaddwd m10, m8, [w4_plus_w6]
+ pmaddwd m11, m9, [w4_plus_w6]
+ paddd m0, m10 ; a0[0-3]
+ paddd m1, m11 ; a0[4-7]
+ pmaddwd m10, m8, [w4_min_w6]
+ pmaddwd m11, m9, [w4_min_w6]
+ paddd m6, m10 ; a3[0-3]
+ paddd m7, m11 ; a3[4-7]
+ pmaddwd m10, m8, [w4_min_w2]
+ pmaddwd m11, m9, [w4_min_w2]
+ pmaddwd m8, [w4_plus_w2]
+ pmaddwd m9, [w4_plus_w2]
+ psubd m4, m10 ; a2[0-3] intermediate
+ psubd m5, m11 ; a2[4-7] intermediate
+ psubd m2, m8 ; a1[0-3] intermediate
+ psubd m3, m9 ; a1[4-7] intermediate
+
+ ; load/store
+ mova [COEFFS+ 0], m0
+ mova [COEFFS+ 32], m2
+ mova [COEFFS+ 64], m4
+ mova [COEFFS+ 96], m6
+ mova m10,[COEFFS+ 16] ; { row[1] }[0-7]
+ mova m8, [COEFFS+ 48] ; { row[3] }[0-7]
+ mova m13,[COEFFS+ 80] ; { row[5] }[0-7]
+ mova m14,[COEFFS+112] ; { row[7] }[0-7]
+ mova [COEFFS+ 16], m1
+ mova [COEFFS+ 48], m3
+ mova [COEFFS+ 80], m5
+ mova [COEFFS+112], m7
+%if %0 == 3
+ pmullw m10,[%3+ 16]
+ pmullw m8, [%3+ 48]
+ pmullw m13,[%3+ 80]
+ pmullw m14,[%3+112]
+%endif
+
+ ; b0 = MUL(W1, row[1]);
+ ; MAC(b0, W3, row[3]);
+ ; b1 = MUL(W3, row[1]);
+ ; MAC(b1, -W7, row[3]);
+ ; b2 = MUL(W5, row[1]);
+ ; MAC(b2, -W1, row[3]);
+ ; b3 = MUL(W7, row[1]);
+ ; MAC(b3, -W5, row[3]);
+ SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7]
+ pmaddwd m2, m0, [w3_min_w7]
+ pmaddwd m3, m1, [w3_min_w7]
+ pmaddwd m4, m0, [w5_min_w1]
+ pmaddwd m5, m1, [w5_min_w1]
+ pmaddwd m6, m0, [w7_min_w5]
+ pmaddwd m7, m1, [w7_min_w5]
+ pmaddwd m0, [w1_plus_w3]
+ pmaddwd m1, [w1_plus_w3]
+
+ ; b0: +1*row[1]+2*row[3]
+ ; b1: +2*row[1]-1*row[3]
+ ; b2: -1*row[1]-1*row[3]
+ ; b3: +1*row[1]+1*row[3]
+
+ ; MAC(b0, W5, row[5]);
+ ; MAC(b0, W7, row[7]);
+ ; MAC(b1, -W1, row[5]);
+ ; MAC(b1, -W5, row[7]);
+ ; MAC(b2, W7, row[5]);
+ ; MAC(b2, W3, row[7]);
+ ; MAC(b3, W3, row[5]);
+ ; MAC(b3, -W1, row[7]);
+ SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]
+
+ ; b0: -1*row[5]+1*row[7]
+ ; b1: -1*row[5]+1*row[7]
+ ; b2: +1*row[5]+2*row[7]
+ ; b3: +2*row[5]-1*row[7]
+
+ pmaddwd m10, m8, [w1_plus_w5]
+ pmaddwd m11, m9, [w1_plus_w5]
+ pmaddwd m12, m8, [w5_plus_w7]
+ pmaddwd m13, m9, [w5_plus_w7]
+ psubd m2, m10 ; b1[0-3]
+ psubd m3, m11 ; b1[4-7]
+ paddd m0, m12 ; b0[0-3]
+ paddd m1, m13 ; b0[4-7]
+ pmaddwd m12, m8, [w7_plus_w3]
+ pmaddwd m13, m9, [w7_plus_w3]
+ pmaddwd m8, [w3_min_w1]
+ pmaddwd m9, [w3_min_w1]
+ paddd m4, m12 ; b2[0-3]
+ paddd m5, m13 ; b2[4-7]
+ paddd m6, m8 ; b3[0-3]
+ paddd m7, m9 ; b3[4-7]
+
+ ; row[0] = (a0 + b0) >> 15;
+ ; row[7] = (a0 - b0) >> 15;
+ ; row[1] = (a1 + b1) >> 15;
+ ; row[6] = (a1 - b1) >> 15;
+ ; row[2] = (a2 + b2) >> 15;
+ ; row[5] = (a2 - b2) >> 15;
+ ; row[3] = (a3 + b3) >> 15;
+ ; row[4] = (a3 - b3) >> 15;
+ mova m8, [COEFFS+ 0] ; a0[0-3]
+ mova m9, [COEFFS+16] ; a0[4-7]
+ SUMSUB_SHPK m8, m9, m10, m11, m0, m1, %2
+ mova m0, [COEFFS+32] ; a1[0-3]
+ mova m1, [COEFFS+48] ; a1[4-7]
+ SUMSUB_SHPK m0, m1, m9, m11, m2, m3, %2
+ mova m1, [COEFFS+64] ; a2[0-3]
+ mova m2, [COEFFS+80] ; a2[4-7]
+ SUMSUB_SHPK m1, m2, m11, m3, m4, m5, %2
+ mova m2, [COEFFS+96] ; a3[0-3]
+ mova m3, [COEFFS+112] ; a3[4-7]
+ SUMSUB_SHPK m2, m3, m4, m5, m6, m7, %2
+%endmacro
+
+; void ff_prores_idct_put_10_<opt>(uint8_t *pixels, ptrdiff_t stride,
+; int16_t *block, const int16_t *qmat);
+
+; %1 = row bias macro ("" to use the pd_round_* rounding constant)
+; %2 = row shift
+; %3 = column bias macro ("" to use the pd_round_* rounding constant)
+; %4 = column shift
+; %5 = min pixel value
+; %6 = max pixel value
+; %7 = qmat (for prores)
+
+%macro IDCT_FN 4-7
+%if %0 == 4
+ ; No clamping means a pure idct
+%xdefine COEFFS r0
+%else
+ movsxd r1, r1d
+%xdefine COEFFS r2
+%endif
+
+ ; for (i = 0; i < 8; i++)
+ ; idctRowCondDC(block + i*8);
+ mova m10,[COEFFS+ 0] ; { row[0] }[0-7]
+ mova m8, [COEFFS+32] ; { row[2] }[0-7]
+ mova m13,[COEFFS+64] ; { row[4] }[0-7]
+ mova m12,[COEFFS+96] ; { row[6] }[0-7]
+
+%if %0 == 7
+ pmullw m10,[%7+ 0]
+ pmullw m8, [%7+32]
+ pmullw m13,[%7+64]
+ pmullw m12,[%7+96]
+
+ IDCT_1D %1, %2, %7
+%else
+ IDCT_1D %1, %2
+%endif
+
+ ; transpose for second part of IDCT
+ TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
+ mova [COEFFS+ 16], m0
+ mova [COEFFS+ 48], m2
+ mova [COEFFS+ 80], m11
+ mova [COEFFS+112], m10
+ SWAP 8, 10
+ SWAP 1, 8
+ SWAP 4, 13
+ SWAP 9, 12
+
+ ; for (i = 0; i < 8; i++)
+ ; idctSparseColAdd(dest + i, line_size, block + i);
+ IDCT_1D %3, %4
+
+ ; clip/store
+%if %0 == 4
+ ; No clamping means a pure idct
+ mova [r0+ 0], m8
+ mova [r0+ 16], m0
+ mova [r0+ 32], m1
+ mova [r0+ 48], m2
+ mova [r0+ 64], m4
+ mova [r0+ 80], m11
+ mova [r0+ 96], m9
+ mova [r0+112], m10
+%else
+%ifidn %5, 0
+ pxor m3, m3
+%else
+ mova m3, [%5]
+%endif
+ mova m5, [%6]
+ pmaxsw m8, m3
+ pmaxsw m0, m3
+ pmaxsw m1, m3
+ pmaxsw m2, m3
+ pmaxsw m4, m3
+ pmaxsw m11, m3
+ pmaxsw m9, m3
+ pmaxsw m10, m3
+ pminsw m8, m5
+ pminsw m0, m5
+ pminsw m1, m5
+ pminsw m2, m5
+ pminsw m4, m5
+ pminsw m11, m5
+ pminsw m9, m5
+ pminsw m10, m5
+
+ lea r2, [r1*3]
+ mova [r0 ], m8
+ mova [r0+r1 ], m0
+ mova [r0+r1*2], m1
+ mova [r0+r2 ], m2
+ lea r0, [r0+r1*4]
+ mova [r0 ], m4
+ mova [r0+r1 ], m11
+ mova [r0+r1*2], m9
+ mova [r0+r2 ], m10
+%endif
+%endmacro
+
+%endif
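For reference, here is a scalar model of one 1D pass, written from the commented pseudocode inside IDCT_1D and SUMSUB_SHPK. It uses the full-precision weights quoted in simple_idct10.asm's comments and 64-bit accumulators so the structure is easy to follow; the asm instead works on the >>2 constants in 32-bit lanes, and the pw_2 bias variant adds its offset to row[0] before the multiplies rather than to the accumulators. IDCT_FN then runs this over the eight rows, transposes, runs it over the eight columns, and either stores the result raw or clamps it to [%5, %6].

    #include <stdint.h>

    /* shift: 12/15 for the row pass, 19/16 for the column pass;
     * bias:  the rounding term, 1 << (shift - 1) in the pd_round_* case. */
    static void idct10_1d_model(int32_t row[8], int shift, int64_t bias)
    {
        static const int64_t W1 = 90901, W2 = 85627, W3 = 77062, W4 = 65535,
                             W5 = 51491, W6 = 35468, W7 = 18081;
        int64_t a0 = W4 * row[0] + bias, a1 = a0, a2 = a0, a3 = a0;
        int64_t b0, b1, b2, b3;

        a0 += W2 * row[2] + W4 * row[4] + W6 * row[6];
        a1 += W6 * row[2] - W4 * row[4] - W2 * row[6];
        a2 -= W6 * row[2] + W4 * row[4] - W2 * row[6];
        a3 -= W2 * row[2] - W4 * row[4] + W6 * row[6];

        b0 = W1 * row[1] + W3 * row[3] + W5 * row[5] + W7 * row[7];
        b1 = W3 * row[1] - W7 * row[3] - W1 * row[5] - W5 * row[7];
        b2 = W5 * row[1] - W1 * row[3] + W7 * row[5] + W3 * row[7];
        b3 = W7 * row[1] - W5 * row[3] + W3 * row[5] - W1 * row[7];

        row[0] = (int32_t)((a0 + b0) >> shift);
        row[7] = (int32_t)((a0 - b0) >> shift);
        row[1] = (int32_t)((a1 + b1) >> shift);
        row[6] = (int32_t)((a1 - b1) >> shift);
        row[2] = (int32_t)((a2 + b2) >> shift);
        row[5] = (int32_t)((a2 - b2) >> shift);
        row[3] = (int32_t)((a3 + b3) >> shift);
        row[4] = (int32_t)((a3 - b3) >> shift);
    }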
diff --git a/libavcodec/x86/snowdsp.c b/libavcodec/x86/snowdsp.c
new file mode 100644
index 0000000000..218e6864db
--- /dev/null
+++ b/libavcodec/x86/snowdsp.c
@@ -0,0 +1,908 @@
+/*
+ * MMX and SSE2 optimized snow DSP utils
+ * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/snow.h"
+#include "libavcodec/snow_dwt.h"
+
+#if HAVE_INLINE_ASM
+
+static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){
+ const int w2= (width+1)>>1;
+ const int w_l= (width>>1);
+ const int w_r= w2 - 1;
+ int i;
+
+ { // Lift 0
+ IDWTELEM * const ref = b + w2 - 1;
+ IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
+ // (the first time erroneously), we allow the SSE2 code to run an extra pass.
+ // The savings in code and time are well worth having to store this value and
+ // calculate b[0] correctly afterwards.
+
+ i = 0;
+ __asm__ volatile(
+ "pcmpeqd %%xmm7, %%xmm7 \n\t"
+ "pcmpeqd %%xmm3, %%xmm3 \n\t"
+ "psllw $1, %%xmm3 \n\t"
+ "paddw %%xmm7, %%xmm3 \n\t"
+ "psllw $13, %%xmm3 \n\t"
+ ::);
+ for(; i<w_l-15; i+=16){
+ __asm__ volatile(
+ "movdqu (%1), %%xmm1 \n\t"
+ "movdqu 16(%1), %%xmm5 \n\t"
+ "movdqu 2(%1), %%xmm2 \n\t"
+ "movdqu 18(%1), %%xmm6 \n\t"
+ "paddw %%xmm1, %%xmm2 \n\t"
+ "paddw %%xmm5, %%xmm6 \n\t"
+ "paddw %%xmm7, %%xmm2 \n\t"
+ "paddw %%xmm7, %%xmm6 \n\t"
+ "pmulhw %%xmm3, %%xmm2 \n\t"
+ "pmulhw %%xmm3, %%xmm6 \n\t"
+ "paddw (%0), %%xmm2 \n\t"
+ "paddw 16(%0), %%xmm6 \n\t"
+ "movdqa %%xmm2, (%0) \n\t"
+ "movdqa %%xmm6, 16(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+ b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+ }
+
+ { // Lift 1
+ IDWTELEM * const dst = b+w2;
+
+ i = 0;
+ for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
+ dst[i] = dst[i] - (b[i] + b[i + 1]);
+ }
+ for(; i<w_r-15; i+=16){
+ __asm__ volatile(
+ "movdqu (%1), %%xmm1 \n\t"
+ "movdqu 16(%1), %%xmm5 \n\t"
+ "movdqu 2(%1), %%xmm2 \n\t"
+ "movdqu 18(%1), %%xmm6 \n\t"
+ "paddw %%xmm1, %%xmm2 \n\t"
+ "paddw %%xmm5, %%xmm6 \n\t"
+ "movdqa (%0), %%xmm0 \n\t"
+ "movdqa 16(%0), %%xmm4 \n\t"
+ "psubw %%xmm2, %%xmm0 \n\t"
+ "psubw %%xmm6, %%xmm4 \n\t"
+ "movdqa %%xmm0, (%0) \n\t"
+ "movdqa %%xmm4, 16(%0) \n\t"
+ :: "r"(&dst[i]), "r"(&b[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+ }
+
+ { // Lift 2
+ IDWTELEM * const ref = b+w2 - 1;
+ IDWTELEM b_0 = b[0];
+
+ i = 0;
+ __asm__ volatile(
+ "psllw $15, %%xmm7 \n\t"
+ "pcmpeqw %%xmm6, %%xmm6 \n\t"
+ "psrlw $13, %%xmm6 \n\t"
+ "paddw %%xmm7, %%xmm6 \n\t"
+ ::);
+ for(; i<w_l-15; i+=16){
+ __asm__ volatile(
+ "movdqu (%1), %%xmm0 \n\t"
+ "movdqu 16(%1), %%xmm4 \n\t"
+ "movdqu 2(%1), %%xmm1 \n\t"
+ "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
+ "paddw %%xmm6, %%xmm0 \n\t"
+ "paddw %%xmm6, %%xmm4 \n\t"
+ "paddw %%xmm7, %%xmm1 \n\t"
+ "paddw %%xmm7, %%xmm5 \n\t"
+ "pavgw %%xmm1, %%xmm0 \n\t"
+ "pavgw %%xmm5, %%xmm4 \n\t"
+ "psubw %%xmm7, %%xmm0 \n\t"
+ "psubw %%xmm7, %%xmm4 \n\t"
+ "psraw $1, %%xmm0 \n\t"
+ "psraw $1, %%xmm4 \n\t"
+ "movdqa (%0), %%xmm1 \n\t"
+ "movdqa 16(%0), %%xmm5 \n\t"
+ "paddw %%xmm1, %%xmm0 \n\t"
+ "paddw %%xmm5, %%xmm4 \n\t"
+ "psraw $2, %%xmm0 \n\t"
+ "psraw $2, %%xmm4 \n\t"
+ "paddw %%xmm1, %%xmm0 \n\t"
+ "paddw %%xmm5, %%xmm4 \n\t"
+ "movdqa %%xmm0, (%0) \n\t"
+ "movdqa %%xmm4, 16(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+ b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
+ }
+
+ { // Lift 3
+ IDWTELEM * const src = b+w2;
+
+ i = 0;
+ for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
+ temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
+ }
+ for(; i<w_r-7; i+=8){
+ __asm__ volatile(
+ "movdqu 2(%1), %%xmm2 \n\t"
+ "movdqu 18(%1), %%xmm6 \n\t"
+ "paddw (%1), %%xmm2 \n\t"
+ "paddw 16(%1), %%xmm6 \n\t"
+ "movdqu (%0), %%xmm0 \n\t"
+ "movdqu 16(%0), %%xmm4 \n\t"
+ "paddw %%xmm2, %%xmm0 \n\t"
+ "paddw %%xmm6, %%xmm4 \n\t"
+ "psraw $1, %%xmm2 \n\t"
+ "psraw $1, %%xmm6 \n\t"
+ "paddw %%xmm0, %%xmm2 \n\t"
+ "paddw %%xmm4, %%xmm6 \n\t"
+ "movdqa %%xmm2, (%2) \n\t"
+ "movdqa %%xmm6, 16(%2) \n\t"
+ :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
+ }
+
+ {
+ snow_interleave_line_header(&i, width, b, temp);
+
+ for (; (i & 0x3E) != 0x3E; i-=2){
+ b[i+1] = temp[i>>1];
+ b[i] = b[i>>1];
+ }
+ for (i-=62; i>=0; i-=64){
+ __asm__ volatile(
+ "movdqa (%1), %%xmm0 \n\t"
+ "movdqa 16(%1), %%xmm2 \n\t"
+ "movdqa 32(%1), %%xmm4 \n\t"
+ "movdqa 48(%1), %%xmm6 \n\t"
+ "movdqa (%1), %%xmm1 \n\t"
+ "movdqa 16(%1), %%xmm3 \n\t"
+ "movdqa 32(%1), %%xmm5 \n\t"
+ "movdqa 48(%1), %%xmm7 \n\t"
+ "punpcklwd (%2), %%xmm0 \n\t"
+ "punpcklwd 16(%2), %%xmm2 \n\t"
+ "punpcklwd 32(%2), %%xmm4 \n\t"
+ "punpcklwd 48(%2), %%xmm6 \n\t"
+ "movdqa %%xmm0, (%0) \n\t"
+ "movdqa %%xmm2, 32(%0) \n\t"
+ "movdqa %%xmm4, 64(%0) \n\t"
+ "movdqa %%xmm6, 96(%0) \n\t"
+ "punpckhwd (%2), %%xmm1 \n\t"
+ "punpckhwd 16(%2), %%xmm3 \n\t"
+ "punpckhwd 32(%2), %%xmm5 \n\t"
+ "punpckhwd 48(%2), %%xmm7 \n\t"
+ "movdqa %%xmm1, 16(%0) \n\t"
+ "movdqa %%xmm3, 48(%0) \n\t"
+ "movdqa %%xmm5, 80(%0) \n\t"
+ "movdqa %%xmm7, 112(%0) \n\t"
+ :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
+ : "memory"
+ );
+ }
+ }
+}
+
+static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
+ const int w2= (width+1)>>1;
+ const int w_l= (width>>1);
+ const int w_r= w2 - 1;
+ int i;
+
+ { // Lift 0
+ IDWTELEM * const ref = b + w2 - 1;
+
+ i = 1;
+ b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+ __asm__ volatile(
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "pcmpeqw %%mm3, %%mm3 \n\t"
+ "psllw $1, %%mm3 \n\t"
+ "paddw %%mm7, %%mm3 \n\t"
+ "psllw $13, %%mm3 \n\t"
+ ::);
+ for(; i<w_l-7; i+=8){
+ __asm__ volatile(
+ "movq (%1), %%mm2 \n\t"
+ "movq 8(%1), %%mm6 \n\t"
+ "paddw 2(%1), %%mm2 \n\t"
+ "paddw 10(%1), %%mm6 \n\t"
+ "paddw %%mm7, %%mm2 \n\t"
+ "paddw %%mm7, %%mm6 \n\t"
+ "pmulhw %%mm3, %%mm2 \n\t"
+ "pmulhw %%mm3, %%mm6 \n\t"
+ "paddw (%0), %%mm2 \n\t"
+ "paddw 8(%0), %%mm6 \n\t"
+ "movq %%mm2, (%0) \n\t"
+ "movq %%mm6, 8(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+ }
+
+ { // Lift 1
+ IDWTELEM * const dst = b+w2;
+
+ i = 0;
+ for(; i<w_r-7; i+=8){
+ __asm__ volatile(
+ "movq (%1), %%mm2 \n\t"
+ "movq 8(%1), %%mm6 \n\t"
+ "paddw 2(%1), %%mm2 \n\t"
+ "paddw 10(%1), %%mm6 \n\t"
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm4 \n\t"
+ "psubw %%mm2, %%mm0 \n\t"
+ "psubw %%mm6, %%mm4 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm4, 8(%0) \n\t"
+ :: "r"(&dst[i]), "r"(&b[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+ }
+
+ { // Lift 2
+ IDWTELEM * const ref = b+w2 - 1;
+
+ i = 1;
+ b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
+ __asm__ volatile(
+ "psllw $15, %%mm7 \n\t"
+ "pcmpeqw %%mm6, %%mm6 \n\t"
+ "psrlw $13, %%mm6 \n\t"
+ "paddw %%mm7, %%mm6 \n\t"
+ ::);
+ for(; i<w_l-7; i+=8){
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm4 \n\t"
+ "movq 2(%1), %%mm1 \n\t"
+ "movq 10(%1), %%mm5 \n\t"
+ "paddw %%mm6, %%mm0 \n\t"
+ "paddw %%mm6, %%mm4 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+ "paddw %%mm7, %%mm5 \n\t"
+ "pavgw %%mm1, %%mm0 \n\t"
+ "pavgw %%mm5, %%mm4 \n\t"
+ "psubw %%mm7, %%mm0 \n\t"
+ "psubw %%mm7, %%mm4 \n\t"
+ "psraw $1, %%mm0 \n\t"
+ "psraw $1, %%mm4 \n\t"
+ "movq (%0), %%mm1 \n\t"
+ "movq 8(%0), %%mm5 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm5, %%mm4 \n\t"
+ "psraw $2, %%mm0 \n\t"
+ "psraw $2, %%mm4 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm5, %%mm4 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm4, 8(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+ }
+
+ { // Lift 3
+ IDWTELEM * const src = b+w2;
+ i = 0;
+
+ for(; i<w_r-7; i+=8){
+ __asm__ volatile(
+ "movq 2(%1), %%mm2 \n\t"
+ "movq 10(%1), %%mm6 \n\t"
+ "paddw (%1), %%mm2 \n\t"
+ "paddw 8(%1), %%mm6 \n\t"
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm4 \n\t"
+ "paddw %%mm2, %%mm0 \n\t"
+ "paddw %%mm6, %%mm4 \n\t"
+ "psraw $1, %%mm2 \n\t"
+ "psraw $1, %%mm6 \n\t"
+ "paddw %%mm0, %%mm2 \n\t"
+ "paddw %%mm4, %%mm6 \n\t"
+ "movq %%mm2, (%2) \n\t"
+ "movq %%mm6, 8(%2) \n\t"
+ :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
+ }
+
+ {
+ snow_interleave_line_header(&i, width, b, temp);
+
+ for (; (i & 0x1E) != 0x1E; i-=2){
+ b[i+1] = temp[i>>1];
+ b[i] = b[i>>1];
+ }
+ for (i-=30; i>=0; i-=32){
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm2 \n\t"
+ "movq 16(%1), %%mm4 \n\t"
+ "movq 24(%1), %%mm6 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "movq 8(%1), %%mm3 \n\t"
+ "movq 16(%1), %%mm5 \n\t"
+ "movq 24(%1), %%mm7 \n\t"
+ "punpcklwd (%2), %%mm0 \n\t"
+ "punpcklwd 8(%2), %%mm2 \n\t"
+ "punpcklwd 16(%2), %%mm4 \n\t"
+ "punpcklwd 24(%2), %%mm6 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm2, 16(%0) \n\t"
+ "movq %%mm4, 32(%0) \n\t"
+ "movq %%mm6, 48(%0) \n\t"
+ "punpckhwd (%2), %%mm1 \n\t"
+ "punpckhwd 8(%2), %%mm3 \n\t"
+ "punpckhwd 16(%2), %%mm5 \n\t"
+ "punpckhwd 24(%2), %%mm7 \n\t"
+ "movq %%mm1, 8(%0) \n\t"
+ "movq %%mm3, 24(%0) \n\t"
+ "movq %%mm5, 40(%0) \n\t"
+ "movq %%mm7, 56(%0) \n\t"
+ :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
+ : "memory"
+ );
+ }
+ }
+}
+
+#if HAVE_7REGS
+#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
+ ""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\
+ ""op" 16("r",%%"FF_REG_d"), %%"t1" \n\t"\
+ ""op" 32("r",%%"FF_REG_d"), %%"t2" \n\t"\
+ ""op" 48("r",%%"FF_REG_d"), %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
+ snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
+ snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "psubw %%"s0", %%"t0" \n\t"\
+ "psubw %%"s1", %%"t1" \n\t"\
+ "psubw %%"s2", %%"t2" \n\t"\
+ "psubw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
+ "movdqa %%"s0", ("w",%%"FF_REG_d") \n\t"\
+ "movdqa %%"s1", 16("w",%%"FF_REG_d") \n\t"\
+ "movdqa %%"s2", 32("w",%%"FF_REG_d") \n\t"\
+ "movdqa %%"s3", 48("w",%%"FF_REG_d") \n\t"
+
+#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
+ "psraw $"n", %%"t0" \n\t"\
+ "psraw $"n", %%"t1" \n\t"\
+ "psraw $"n", %%"t2" \n\t"\
+ "psraw $"n", %%"t3" \n\t"
+
+#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "paddw %%"s0", %%"t0" \n\t"\
+ "paddw %%"s1", %%"t1" \n\t"\
+ "paddw %%"s2", %%"t2" \n\t"\
+ "paddw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "pmulhw %%"s0", %%"t0" \n\t"\
+ "pmulhw %%"s1", %%"t1" \n\t"\
+ "pmulhw %%"s2", %%"t2" \n\t"\
+ "pmulhw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "movdqa %%"s0", %%"t0" \n\t"\
+ "movdqa %%"s1", %%"t1" \n\t"\
+ "movdqa %%"s2", %%"t2" \n\t"\
+ "movdqa %%"s3", %%"t3" \n\t"
+
+static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
+ x86_reg i = width;
+
+ while(i & 0x1F)
+ {
+ i--;
+ b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+ b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+ b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+ b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+ }
+ i+=i;
+
+ __asm__ volatile (
+ "jmp 2f \n\t"
+ "1: \n\t"
+ snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")
+
+
+ "pcmpeqw %%xmm0, %%xmm0 \n\t"
+ "pcmpeqw %%xmm2, %%xmm2 \n\t"
+ "paddw %%xmm2, %%xmm2 \n\t"
+ "paddw %%xmm0, %%xmm2 \n\t"
+ "psllw $13, %%xmm2 \n\t"
+ snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
+
+ "pcmpeqw %%xmm7, %%xmm7 \n\t"
+ "pcmpeqw %%xmm5, %%xmm5 \n\t"
+ "psllw $15, %%xmm7 \n\t"
+ "psrlw $13, %%xmm5 \n\t"
+ "paddw %%xmm7, %%xmm5 \n\t"
+ snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
+ "movq (%2,%%"FF_REG_d"), %%xmm1 \n\t"
+ "movq 8(%2,%%"FF_REG_d"), %%xmm3 \n\t"
+ "paddw %%xmm7, %%xmm1 \n\t"
+ "paddw %%xmm7, %%xmm3 \n\t"
+ "pavgw %%xmm1, %%xmm0 \n\t"
+ "pavgw %%xmm3, %%xmm2 \n\t"
+ "movq 16(%2,%%"FF_REG_d"), %%xmm1 \n\t"
+ "movq 24(%2,%%"FF_REG_d"), %%xmm3 \n\t"
+ "paddw %%xmm7, %%xmm1 \n\t"
+ "paddw %%xmm7, %%xmm3 \n\t"
+ "pavgw %%xmm1, %%xmm4 \n\t"
+ "pavgw %%xmm3, %%xmm6 \n\t"
+ snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
+
+ snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")
+
+ "2: \n\t"
+ "sub $64, %%"FF_REG_d" \n\t"
+ "jge 1b \n\t"
+ :"+d"(i)
+ :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
+}
+
+#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
+ ""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\
+ ""op" 8("r",%%"FF_REG_d"), %%"t1" \n\t"\
+ ""op" 16("r",%%"FF_REG_d"), %%"t2" \n\t"\
+ ""op" 24("r",%%"FF_REG_d"), %%"t3" \n\t"
+
+#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
+ snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
+ snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
+ "movq %%"s0", ("w",%%"FF_REG_d") \n\t"\
+ "movq %%"s1", 8("w",%%"FF_REG_d") \n\t"\
+ "movq %%"s2", 16("w",%%"FF_REG_d") \n\t"\
+ "movq %%"s3", 24("w",%%"FF_REG_d") \n\t"
+
+#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "movq %%"s0", %%"t0" \n\t"\
+ "movq %%"s1", %%"t1" \n\t"\
+ "movq %%"s2", %%"t2" \n\t"\
+ "movq %%"s3", %%"t3" \n\t"
+
+
+static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
+ x86_reg i = width;
+ while(i & 15)
+ {
+ i--;
+ b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+ b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+ b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+ b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+ }
+ i+=i;
+ __asm__ volatile(
+ "jmp 2f \n\t"
+ "1: \n\t"
+
+ snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
+ "pcmpeqw %%mm0, %%mm0 \n\t"
+ "pcmpeqw %%mm2, %%mm2 \n\t"
+ "paddw %%mm2, %%mm2 \n\t"
+ "paddw %%mm0, %%mm2 \n\t"
+ "psllw $13, %%mm2 \n\t"
+ snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "pcmpeqw %%mm5, %%mm5 \n\t"
+ "psllw $15, %%mm7 \n\t"
+ "psrlw $13, %%mm5 \n\t"
+ "paddw %%mm7, %%mm5 \n\t"
+ snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
+ "movq (%2,%%"FF_REG_d"), %%mm1 \n\t"
+ "movq 8(%2,%%"FF_REG_d"), %%mm3 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+ "paddw %%mm7, %%mm3 \n\t"
+ "pavgw %%mm1, %%mm0 \n\t"
+ "pavgw %%mm3, %%mm2 \n\t"
+ "movq 16(%2,%%"FF_REG_d"), %%mm1 \n\t"
+ "movq 24(%2,%%"FF_REG_d"), %%mm3 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+ "paddw %%mm7, %%mm3 \n\t"
+ "pavgw %%mm1, %%mm4 \n\t"
+ "pavgw %%mm3, %%mm6 \n\t"
+ snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
+
+ snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
+
+ "2: \n\t"
+ "sub $32, %%"FF_REG_d" \n\t"
+ "jge 1b \n\t"
+ :"+d"(i)
+ :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
+}
+#endif //HAVE_7REGS
+
+#if HAVE_6REGS
+#define snow_inner_add_yblock_sse2_header \
+ IDWTELEM * * dst_array = sb->line + src_y;\
+ x86_reg tmp;\
+ __asm__ volatile(\
+ "mov %7, %%"FF_REG_c" \n\t"\
+ "mov %6, %2 \n\t"\
+ "mov %4, %%"FF_REG_S" \n\t"\
+ "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
+ "pcmpeqd %%xmm3, %%xmm3 \n\t"\
+ "psllw $15, %%xmm3 \n\t"\
+ "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\
+ "1: \n\t"\
+ "mov %1, %%"FF_REG_D" \n\t"\
+ "mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
+ "add %3, %%"FF_REG_D" \n\t"
+
+#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
+ "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
+ "movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\
+ "movq (%%"FF_REG_d", %%"FF_REG_c"), %%"out_reg2" \n\t"\
+ "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
+ "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
+ "movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\
+ "movq "s_offset"+16(%%"FF_REG_S"), %%xmm4 \n\t"\
+ "punpcklbw %%xmm7, %%xmm0 \n\t"\
+ "punpcklbw %%xmm7, %%xmm4 \n\t"\
+ "pmullw %%xmm0, %%"out_reg1" \n\t"\
+ "pmullw %%xmm4, %%"out_reg2" \n\t"
+
+#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
+ "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
+ "movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\
+ "movq 8(%%"FF_REG_d"), %%"out_reg2" \n\t"\
+ "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
+ "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
+ "movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\
+ "movq "s_offset"+8(%%"FF_REG_S"), %%xmm4 \n\t"\
+ "punpcklbw %%xmm7, %%xmm0 \n\t"\
+ "punpcklbw %%xmm7, %%xmm4 \n\t"\
+ "pmullw %%xmm0, %%"out_reg1" \n\t"\
+ "pmullw %%xmm4, %%"out_reg2" \n\t"
+
+#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
+ snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
+ "paddusw %%xmm2, %%xmm1 \n\t"\
+ "paddusw %%xmm6, %%xmm5 \n\t"
+
+#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
+ snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
+ "paddusw %%xmm2, %%xmm1 \n\t"\
+ "paddusw %%xmm6, %%xmm5 \n\t"
+
+#define snow_inner_add_yblock_sse2_end_common1\
+ "add $32, %%"FF_REG_S" \n\t"\
+ "add %%"FF_REG_c", %0 \n\t"\
+ "add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
+ "add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
+ "add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
+ "add %%"FF_REG_c", (%%"FF_REG_a") \n\t"
+
+#define snow_inner_add_yblock_sse2_end_common2\
+ "jnz 1b \n\t"\
+ :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
+ :\
+ "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
+ XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", )\
+ "%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");
+
+#define snow_inner_add_yblock_sse2_end_8\
+ "sal $1, %%"FF_REG_c" \n\t"\
+ "add"FF_OPSIZE" $"FF_PTR_SIZE"*2, %1 \n\t"\
+ snow_inner_add_yblock_sse2_end_common1\
+ "sar $1, %%"FF_REG_c" \n\t"\
+ "sub $2, %2 \n\t"\
+ snow_inner_add_yblock_sse2_end_common2
+
+#define snow_inner_add_yblock_sse2_end_16\
+ "add"FF_OPSIZE" $"FF_PTR_SIZE"*1, %1 \n\t"\
+ snow_inner_add_yblock_sse2_end_common1\
+ "dec %2 \n\t"\
+ snow_inner_add_yblock_sse2_end_common2
+
+static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+ int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_sse2_header
+snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
+snow_inner_add_yblock_sse2_accum_8("2", "8")
+snow_inner_add_yblock_sse2_accum_8("1", "128")
+snow_inner_add_yblock_sse2_accum_8("0", "136")
+
+ "mov %0, %%"FF_REG_d" \n\t"
+ "movdqa (%%"FF_REG_D"), %%xmm0 \n\t"
+ "movdqa %%xmm1, %%xmm2 \n\t"
+
+ "punpckhwd %%xmm7, %%xmm1 \n\t"
+ "punpcklwd %%xmm7, %%xmm2 \n\t"
+ "paddd %%xmm2, %%xmm0 \n\t"
+ "movdqa 16(%%"FF_REG_D"), %%xmm2\n\t"
+ "paddd %%xmm1, %%xmm2 \n\t"
+ "paddd %%xmm3, %%xmm0 \n\t"
+ "paddd %%xmm3, %%xmm2 \n\t"
+
+ "mov %1, %%"FF_REG_D" \n\t"
+ "mov "FF_PTR_SIZE"(%%"FF_REG_D"), %%"FF_REG_D"; \n\t"
+ "add %3, %%"FF_REG_D" \n\t"
+
+ "movdqa (%%"FF_REG_D"), %%xmm4 \n\t"
+ "movdqa %%xmm5, %%xmm6 \n\t"
+ "punpckhwd %%xmm7, %%xmm5 \n\t"
+ "punpcklwd %%xmm7, %%xmm6 \n\t"
+ "paddd %%xmm6, %%xmm4 \n\t"
+ "movdqa 16(%%"FF_REG_D"), %%xmm6\n\t"
+ "paddd %%xmm5, %%xmm6 \n\t"
+ "paddd %%xmm3, %%xmm4 \n\t"
+ "paddd %%xmm3, %%xmm6 \n\t"
+
+ "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
+ "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */
+ "packssdw %%xmm2, %%xmm0 \n\t"
+ "packuswb %%xmm7, %%xmm0 \n\t"
+ "movq %%xmm0, (%%"FF_REG_d") \n\t"
+
+ "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
+ "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */
+ "packssdw %%xmm6, %%xmm4 \n\t"
+ "packuswb %%xmm7, %%xmm4 \n\t"
+ "movq %%xmm4, (%%"FF_REG_d",%%"FF_REG_c"); \n\t"
+snow_inner_add_yblock_sse2_end_8
+}
+
+static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+ int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_sse2_header
+snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
+snow_inner_add_yblock_sse2_accum_16("2", "16")
+snow_inner_add_yblock_sse2_accum_16("1", "512")
+snow_inner_add_yblock_sse2_accum_16("0", "528")
+
+ "mov %0, %%"FF_REG_d" \n\t"
+ "psrlw $4, %%xmm1 \n\t"
+ "psrlw $4, %%xmm5 \n\t"
+ "paddw (%%"FF_REG_D"), %%xmm1 \n\t"
+ "paddw 16(%%"FF_REG_D"), %%xmm5 \n\t"
+ "paddw %%xmm3, %%xmm1 \n\t"
+ "paddw %%xmm3, %%xmm5 \n\t"
+ "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */
+ "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */
+ "packuswb %%xmm5, %%xmm1 \n\t"
+
+ "movdqu %%xmm1, (%%"FF_REG_d") \n\t"
+
+snow_inner_add_yblock_sse2_end_16
+}
+
+#define snow_inner_add_yblock_mmx_header \
+ IDWTELEM * * dst_array = sb->line + src_y;\
+ x86_reg tmp;\
+ __asm__ volatile(\
+ "mov %7, %%"FF_REG_c" \n\t"\
+ "mov %6, %2 \n\t"\
+ "mov %4, %%"FF_REG_S" \n\t"\
+ "pxor %%mm7, %%mm7 \n\t" /* 0 */\
+ "pcmpeqd %%mm3, %%mm3 \n\t"\
+ "psllw $15, %%mm3 \n\t"\
+ "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\
+ "1: \n\t"\
+ "mov %1, %%"FF_REG_D" \n\t"\
+ "mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
+ "add %3, %%"FF_REG_D" \n\t"
+
+#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
+ "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
+ "movd "d_offset"(%%"FF_REG_d"), %%"out_reg1" \n\t"\
+ "movd "d_offset"+4(%%"FF_REG_d"), %%"out_reg2" \n\t"\
+ "punpcklbw %%mm7, %%"out_reg1" \n\t"\
+ "punpcklbw %%mm7, %%"out_reg2" \n\t"\
+ "movd "s_offset"(%%"FF_REG_S"), %%mm0 \n\t"\
+ "movd "s_offset"+4(%%"FF_REG_S"), %%mm4 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ "pmullw %%mm0, %%"out_reg1" \n\t"\
+ "pmullw %%mm4, %%"out_reg2" \n\t"
+
+#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
+ snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
+ "paddusw %%mm2, %%mm1 \n\t"\
+ "paddusw %%mm6, %%mm5 \n\t"
+
+#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
+ "mov %0, %%"FF_REG_d" \n\t"\
+ "psrlw $4, %%mm1 \n\t"\
+ "psrlw $4, %%mm5 \n\t"\
+ "paddw "read_offset"(%%"FF_REG_D"), %%mm1 \n\t"\
+ "paddw "read_offset"+8(%%"FF_REG_D"), %%mm5 \n\t"\
+ "paddw %%mm3, %%mm1 \n\t"\
+ "paddw %%mm3, %%mm5 \n\t"\
+ "psraw $4, %%mm1 \n\t"\
+ "psraw $4, %%mm5 \n\t"\
+ "packuswb %%mm5, %%mm1 \n\t"\
+ "movq %%mm1, "write_offset"(%%"FF_REG_d") \n\t"
+
+#define snow_inner_add_yblock_mmx_end(s_step)\
+ "add $"s_step", %%"FF_REG_S" \n\t"\
+ "add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
+ "add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
+ "add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
+ "add %%"FF_REG_c", (%%"FF_REG_a") \n\t"\
+ "add"FF_OPSIZE " $"FF_PTR_SIZE"*1, %1 \n\t"\
+ "add %%"FF_REG_c", %0 \n\t"\
+ "dec %2 \n\t"\
+ "jnz 1b \n\t"\
+ :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
+ :\
+ "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
+ "%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");
+
+static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+ int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_mmx_header
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
+snow_inner_add_yblock_mmx_accum("2", "8", "0")
+snow_inner_add_yblock_mmx_accum("1", "128", "0")
+snow_inner_add_yblock_mmx_accum("0", "136", "0")
+snow_inner_add_yblock_mmx_mix("0", "0")
+snow_inner_add_yblock_mmx_end("16")
+}
+
+static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+ int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_mmx_header
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
+snow_inner_add_yblock_mmx_accum("2", "16", "0")
+snow_inner_add_yblock_mmx_accum("1", "512", "0")
+snow_inner_add_yblock_mmx_accum("0", "528", "0")
+snow_inner_add_yblock_mmx_mix("0", "0")
+
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
+snow_inner_add_yblock_mmx_accum("2", "24", "8")
+snow_inner_add_yblock_mmx_accum("1", "520", "8")
+snow_inner_add_yblock_mmx_accum("0", "536", "8")
+snow_inner_add_yblock_mmx_mix("16", "8")
+snow_inner_add_yblock_mmx_end("32")
+}
+
+static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+ int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+
+ if (b_w == 16)
+ inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else if (b_w == 8 && obmc_stride == 16) {
+ if (!(b_h & 1))
+ inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else
+ inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ } else
+ ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}
+
+static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+ int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+ if (b_w == 16)
+ inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else if (b_w == 8 && obmc_stride == 16)
+ inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else
+ ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}
+#endif /* HAVE_6REGS */
+
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void ff_dwt_init_x86(SnowDWTContext *c)
+{
+#if HAVE_INLINE_ASM
+ int mm_flags = av_get_cpu_flags();
+
+ if (mm_flags & AV_CPU_FLAG_MMX) {
+ if(mm_flags & AV_CPU_FLAG_SSE2 & 0){
+ c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
+#if HAVE_7REGS
+ c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
+#endif
+#if HAVE_6REGS
+ c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
+#endif
+ }
+ else{
+ if (mm_flags & AV_CPU_FLAG_MMXEXT) {
+ c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
+#if HAVE_7REGS
+ c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
+#endif
+ }
+#if HAVE_6REGS
+ c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
+#endif
+ }
+ }
+#endif /* HAVE_INLINE_ASM */
+}
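Both the horizontal and the vertical SIMD routines above implement the same four integer lifting steps of the snow 9/7 wavelet; the scalar lead-in loop of ff_snow_vertical_compose97i_sse2 spells them out. A plain C version of the vertical case, copied from that loop, looks like the sketch below (IDWTELEM and the W_* constants come from the snow headers included at the top of the file).

    #include "libavcodec/snow_dwt.h"

    static void snow_vertical_compose97i_scalar(IDWTELEM *b0, IDWTELEM *b1,
                                                IDWTELEM *b2, IDWTELEM *b3,
                                                IDWTELEM *b4, IDWTELEM *b5,
                                                int width)
    {
        int i;

        for (i = 0; i < width; i++) {
            b4[i] -= (W_DM * (b3[i] + b5[i]) + W_DO) >> W_DS;
            b3[i] -= (W_CM * (b2[i] + b4[i]) + W_CO) >> W_CS;
            b2[i] += (W_BM * (b1[i] + b3[i]) + 4 * b2[i] + W_BO) >> W_BS;
            b1[i] += (W_AM * (b0[i] + b2[i]) + W_AO) >> W_AS;
        }
    }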
diff --git a/libavcodec/x86/svq1enc.asm b/libavcodec/x86/svq1enc.asm
new file mode 100644
index 0000000000..a87632836d
--- /dev/null
+++ b/libavcodec/x86/svq1enc.asm
@@ -0,0 +1,61 @@
+;******************************************************************************
+;* SIMD-optimized SVQ1 encoder functions
+;* Copyright (c) 2007 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro SSD_INT8_VS_INT16 0
+cglobal ssd_int8_vs_int16, 3, 3, 3, pix1, pix2, size
+ pxor m0, m0
+.loop:
+ sub sizeq, 8
+ movq m1, [pix1q + sizeq]
+ mova m2, [pix2q + sizeq*2]
+%if mmsize == 8
+ movq m3, [pix2q + sizeq*2 + mmsize]
+ punpckhbw m4, m1
+ punpcklbw m1, m1
+ psraw m4, 8
+ psraw m1, 8
+ psubw m3, m4
+ psubw m2, m1
+ pmaddwd m3, m3
+ pmaddwd m2, m2
+ paddd m0, m3
+ paddd m0, m2
+%else
+ punpcklbw m1, m1
+ psraw m1, 8
+ psubw m2, m1
+ pmaddwd m2, m2
+ paddd m0, m2
+%endif
+ jg .loop
+ HADDD m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_MMX mmx
+SSD_INT8_VS_INT16
+INIT_XMM sse2
+SSD_INT8_VS_INT16
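For reference, the MMX/SSE2 loops above compute a sum of squared differences between an int8 block and an int16 block, walking both buffers backwards eight samples at a time. A scalar equivalent, matching the C fallback the encoder otherwise uses, is simply:

    #include <stdint.h>

    static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                                   intptr_t size)
    {
        int score = 0;

        /* psubw + pmaddwd in the asm: difference, square, accumulate. */
        while (size--)
            score += (pix1[size] - pix2[size]) * (pix1[size] - pix2[size]);
        return score;
    }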
diff --git a/libavcodec/x86/svq1enc.c b/libavcodec/x86/svq1enc.c
deleted file mode 100644
index 02b0a84b8c..0000000000
--- a/libavcodec/x86/svq1enc.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/svq1enc.h"
-
-#if HAVE_INLINE_ASM
-
-static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
- int size)
-{
- int sum;
- x86_reg i = size;
-
- __asm__ volatile (
- "pxor %%mm4, %%mm4 \n"
- "1: \n"
- "sub $8, %0 \n"
- "movq (%2, %0), %%mm2 \n"
- "movq (%3, %0, 2), %%mm0 \n"
- "movq 8(%3, %0, 2), %%mm1 \n"
- "punpckhbw %%mm2, %%mm3 \n"
- "punpcklbw %%mm2, %%mm2 \n"
- "psraw $8, %%mm3 \n"
- "psraw $8, %%mm2 \n"
- "psubw %%mm3, %%mm1 \n"
- "psubw %%mm2, %%mm0 \n"
- "pmaddwd %%mm1, %%mm1 \n"
- "pmaddwd %%mm0, %%mm0 \n"
- "paddd %%mm1, %%mm4 \n"
- "paddd %%mm0, %%mm4 \n"
- "jg 1b \n"
- "movq %%mm4, %%mm3 \n"
- "psrlq $32, %%mm3 \n"
- "paddd %%mm3, %%mm4 \n"
- "movd %%mm4, %1 \n"
- : "+r" (i), "=r" (sum)
- : "r" (pix1), "r" (pix2));
-
- return sum;
-}
-
-#endif /* HAVE_INLINE_ASM */
-
-av_cold void ff_svq1enc_init_x86(SVQ1EncContext *c)
-{
-#if HAVE_INLINE_ASM
- int cpu_flags = av_get_cpu_flags();
-
- if (INLINE_MMX(cpu_flags)) {
- c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
- }
-#endif /* HAVE_INLINE_ASM */
-}
diff --git a/libavcodec/x86/svq1enc_init.c b/libavcodec/x86/svq1enc_init.c
new file mode 100644
index 0000000000..40b4b0e183
--- /dev/null
+++ b/libavcodec/x86/svq1enc_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2007 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/svq1enc.h"
+
+int ff_ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
+ intptr_t size);
+int ff_ssd_int8_vs_int16_sse2(const int8_t *pix1, const int16_t *pix2,
+ intptr_t size);
+
+av_cold void ff_svq1enc_init_x86(SVQ1EncContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(cpu_flags)) {
+ c->ssd_int8_vs_int16 = ff_ssd_int8_vs_int16_mmx;
+ }
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->ssd_int8_vs_int16 = ff_ssd_int8_vs_int16_sse2;
+ }
+}
diff --git a/libavcodec/x86/synth_filter.asm b/libavcodec/x86/synth_filter.asm
new file mode 100644
index 0000000000..bc1a48f409
--- /dev/null
+++ b/libavcodec/x86/synth_filter.asm
@@ -0,0 +1,246 @@
+;******************************************************************************
+;* SSE-optimized functions for the DCA decoder
+;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro SETZERO 1
+%if cpuflag(sse2) && notcpuflag(avx)
+ pxor %1, %1
+%else
+ xorps %1, %1, %1
+%endif
+%endmacro
+
+%macro SHUF 3
+%if cpuflag(avx)
+ mova %3, [%2 - 16]
+ vperm2f128 %1, %3, %3, 1
+ vshufps %1, %1, %1, q0123
+%elif cpuflag(sse2)
+ pshufd %1, [%2], q0123
+%else
+ mova %1, [%2]
+ shufps %1, %1, q0123
+%endif
+%endmacro
+
+%macro INNER_LOOP 1
+ ; reading backwards: ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
+ ;~ a += window[i + j] * (-synth_buf[15 - i + j])
+ ;~ b += window[i + j + 16] * (synth_buf[i + j])
+ SHUF m5, ptr2 + j + (15 - 3) * 4, m6
+ mova m6, [ptr1 + j]
+%if ARCH_X86_64
+ SHUF m11, ptr2 + j + (15 - 3) * 4 - mmsize, m12
+ mova m12, [ptr1 + j + mmsize]
+%endif
+%if cpuflag(fma3)
+ fmaddps m2, m6, [win + %1 + j + 16 * 4], m2
+ fnmaddps m1, m5, [win + %1 + j], m1
+%if ARCH_X86_64
+ fmaddps m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
+ fnmaddps m7, m11, [win + %1 + j + mmsize], m7
+%endif
+%else ; non-FMA
+ mulps m6, m6, [win + %1 + j + 16 * 4]
+ mulps m5, m5, [win + %1 + j]
+%if ARCH_X86_64
+ mulps m12, m12, [win + %1 + j + mmsize + 16 * 4]
+ mulps m11, m11, [win + %1 + j + mmsize]
+%endif
+ addps m2, m2, m6
+ subps m1, m1, m5
+%if ARCH_X86_64
+ addps m8, m8, m12
+ subps m7, m7, m11
+%endif
+%endif ; cpuflag(fma3)
+ ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
+ ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
+ SHUF m6, ptr2 + j + (31 - 3) * 4, m5
+ mova m5, [ptr1 + j + 16 * 4]
+%if ARCH_X86_64
+ SHUF m12, ptr2 + j + (31 - 3) * 4 - mmsize, m11
+ mova m11, [ptr1 + j + mmsize + 16 * 4]
+%endif
+%if cpuflag(fma3)
+ fmaddps m3, m5, [win + %1 + j + 32 * 4], m3
+ fmaddps m4, m6, [win + %1 + j + 48 * 4], m4
+%if ARCH_X86_64
+ fmaddps m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
+ fmaddps m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
+%endif
+%else ; non-FMA
+ mulps m5, m5, [win + %1 + j + 32 * 4]
+ mulps m6, m6, [win + %1 + j + 48 * 4]
+%if ARCH_X86_64
+ mulps m11, m11, [win + %1 + j + mmsize + 32 * 4]
+ mulps m12, m12, [win + %1 + j + mmsize + 48 * 4]
+%endif
+ addps m3, m3, m5
+ addps m4, m4, m6
+%if ARCH_X86_64
+ addps m9, m9, m11
+ addps m10, m10, m12
+%endif
+%endif ; cpuflag(fma3)
+ sub j, 64 * 4
+%endmacro
+
+; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
+; const float window[512], float out[32],
+; intptr_t offset, float scale)
+%macro SYNTH_FILTER 0
+cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
+ synth_buf, synth_buf2, window, out, off, scale
+%define scale m0
+%if ARCH_X86_32 || WIN64
+%if cpuflag(sse2) && notcpuflag(avx)
+ movd scale, scalem
+ SPLATD m0
+%else
+ VBROADCASTSS m0, scalem
+%endif
+; Make sure offset is in a register and not on the stack
+%define OFFQ r4q
+%else
+ SPLATD xmm0
+%if cpuflag(avx)
+ vinsertf128 m0, m0, xmm0, 1
+%endif
+%define OFFQ offq
+%endif
+ ; prepare inner counter limit 1
+ mov r5q, 480
+ sub r5q, offmp
+ and r5q, -64
+ shl r5q, 2
+%if ARCH_X86_32 || notcpuflag(avx)
+ mov OFFQ, r5q
+%define i r5q
+ mov i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize ; main loop counter
+%else
+%define i 0
+%define OFFQ r5q
+%endif
+
+%define buf2 synth_buf2q
+%if ARCH_X86_32
+ mov buf2, synth_buf2mp
+%endif
+.mainloop:
+ ; m1 = a m2 = b m3 = c m4 = d
+ SETZERO m3
+ SETZERO m4
+ mova m1, [buf2 + i]
+ mova m2, [buf2 + i + 16 * 4]
+%if ARCH_X86_32
+%define ptr1 r0q
+%define ptr2 r1q
+%define win r2q
+%define j r3q
+ mov win, windowm
+ mov ptr1, synth_bufm
+%if ARCH_X86_32 || notcpuflag(avx)
+ add win, i
+ add ptr1, i
+%endif
+%else ; ARCH_X86_64
+%define ptr1 r6q
+%define ptr2 r7q ; must be loaded
+%define win r8q
+%define j r9q
+ SETZERO m9
+ SETZERO m10
+ mova m7, [buf2 + i + mmsize]
+ mova m8, [buf2 + i + mmsize + 16 * 4]
+ lea win, [windowq + i]
+ lea ptr1, [synth_bufq + i]
+%endif
+ mov ptr2, synth_bufmp
+ ; prepare the inner loop counter
+ mov j, OFFQ
+%if ARCH_X86_32 || notcpuflag(avx)
+ sub ptr2, i
+%endif
+.loop1:
+ INNER_LOOP 0
+ jge .loop1
+
+ mov j, 448 * 4
+ sub j, OFFQ
+ jz .end
+ sub ptr1, j
+ sub ptr2, j
+ add win, OFFQ ; now at j-64, so define OFFSET
+ sub j, 64 * 4
+.loop2:
+ INNER_LOOP 64 * 4
+ jge .loop2
+
+.end:
+%if ARCH_X86_32
+ mov buf2, synth_buf2m ; needed for next iteration anyway
+ mov outq, outmp ; j, which will be set again during it
+%endif
+ ;~ out[i] = a * scale;
+ ;~ out[i + 16] = b * scale;
+ mulps m1, m1, scale
+ mulps m2, m2, scale
+%if ARCH_X86_64
+ mulps m7, m7, scale
+ mulps m8, m8, scale
+%endif
+ ;~ synth_buf2[i] = c;
+ ;~ synth_buf2[i + 16] = d;
+ mova [buf2 + i + 0 * 4], m3
+ mova [buf2 + i + 16 * 4], m4
+%if ARCH_X86_64
+ mova [buf2 + i + 0 * 4 + mmsize], m9
+ mova [buf2 + i + 16 * 4 + mmsize], m10
+%endif
+ ;~ out[i] = a;
+ ;~ out[i + 16] = b;
+ mova [outq + i + 0 * 4], m1
+ mova [outq + i + 16 * 4], m2
+%if ARCH_X86_64
+ mova [outq + i + 0 * 4 + mmsize], m7
+ mova [outq + i + 16 * 4 + mmsize], m8
+%endif
+%if ARCH_X86_32 || notcpuflag(avx)
+ sub i, (ARCH_X86_64 + 1) * mmsize
+ jge .mainloop
+%endif
+ RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_XMM sse
+SYNTH_FILTER
+%endif
+INIT_XMM sse2
+SYNTH_FILTER
+INIT_YMM avx
+SYNTH_FILTER
+INIT_YMM fma3
+SYNTH_FILTER
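A scalar sketch of what ff_synth_filter_inner_<opt> computes, reconstructed from the ;~ comments in INNER_LOOP and from the prototype used below in synth_filter_init.c. The second j loop corresponds to .loop2 (the wrap-around pass); the exact -512 index adjustment there is an assumption taken from the usual C synthesis filter rather than from this diff.

    #include <stdint.h>

    static void synth_filter_inner_c(float *synth_buf, float synth_buf2[32],
                                     const float window[512], float out[32],
                                     intptr_t offset, float scale)
    {
        int i, j;

        for (i = 0; i < 16; i++) {
            float a = synth_buf2[i];
            float b = synth_buf2[i + 16];
            float c = 0, d = 0;

            for (j = 0; j < 512 - offset; j += 64) {
                a += window[i + j]      * -synth_buf[15 - i + j];
                b += window[i + j + 16] *  synth_buf[i + j];
                c += window[i + j + 32] *  synth_buf[16 + i + j];
                d += window[i + j + 48] *  synth_buf[31 - i + j];
            }
            for (; j < 512; j += 64) {   /* wrap-around pass (.loop2) */
                a += window[i + j]      * -synth_buf[15 - i + j - 512];
                b += window[i + j + 16] *  synth_buf[i + j - 512];
                c += window[i + j + 32] *  synth_buf[16 + i + j - 512];
                d += window[i + j + 48] *  synth_buf[31 - i + j - 512];
            }
            out[i]             = a * scale;
            out[i + 16]        = b * scale;
            synth_buf2[i]      = c;
            synth_buf2[i + 16] = d;
        }
    }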
diff --git a/libavcodec/x86/synth_filter_init.c b/libavcodec/x86/synth_filter_init.c
new file mode 100644
index 0000000000..9ef00cdb0a
--- /dev/null
+++ b/libavcodec/x86/synth_filter_init.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/synth_filter.h"
+
+#define SYNTH_FILTER_FUNC(opt) \
+void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32], \
+ const float window[512], \
+ float out[32], intptr_t offset, float scale); \
+static void synth_filter_##opt(FFTContext *imdct, \
+ float *synth_buf_ptr, int *synth_buf_offset, \
+ float synth_buf2[32], const float window[512], \
+ float out[32], const float in[32], float scale) \
+{ \
+ float *synth_buf= synth_buf_ptr + *synth_buf_offset; \
+ \
+ imdct->imdct_half(imdct, synth_buf, in); \
+ \
+ ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window, \
+ out, *synth_buf_offset, scale); \
+ \
+ *synth_buf_offset = (*synth_buf_offset - 32) & 511; \
+} \
+
+#if HAVE_YASM
+#if ARCH_X86_32
+SYNTH_FILTER_FUNC(sse)
+#endif
+SYNTH_FILTER_FUNC(sse2)
+SYNTH_FILTER_FUNC(avx)
+SYNTH_FILTER_FUNC(fma3)
+#endif /* HAVE_YASM */
+
+av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+#if ARCH_X86_32
+ if (EXTERNAL_SSE(cpu_flags)) {
+ s->synth_filter_float = synth_filter_sse;
+ }
+#endif
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ s->synth_filter_float = synth_filter_sse2;
+ }
+ if (EXTERNAL_AVX_FAST(cpu_flags)) {
+ s->synth_filter_float = synth_filter_avx;
+ }
+ if (EXTERNAL_FMA3_FAST(cpu_flags)) {
+ s->synth_filter_float = synth_filter_fma3;
+ }
+#endif /* HAVE_YASM */
+}
diff --git a/libavcodec/x86/takdsp.asm b/libavcodec/x86/takdsp.asm
new file mode 100644
index 0000000000..5f3ded3ea2
--- /dev/null
+++ b/libavcodec/x86/takdsp.asm
@@ -0,0 +1,116 @@
+;******************************************************************************
+;* TAK DSP SIMD optimizations
+;*
+;* Copyright (C) 2015 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pd_128: times 4 dd 128
+
+SECTION .text
+
+INIT_XMM sse2
+cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length
+ shl lengthd, 2
+ add p1q, lengthq
+ add p2q, lengthq
+ neg lengthq
+.loop:
+ mova m0, [p1q+lengthq+mmsize*0]
+ mova m1, [p1q+lengthq+mmsize*1]
+ paddd m0, [p2q+lengthq+mmsize*0]
+ paddd m1, [p2q+lengthq+mmsize*1]
+ mova [p2q+lengthq+mmsize*0], m0
+ mova [p2q+lengthq+mmsize*1], m1
+ add lengthq, mmsize*2
+ jl .loop
+ REP_RET
+
+cglobal tak_decorrelate_sr, 3, 3, 2, p1, p2, length
+ shl lengthd, 2
+ add p1q, lengthq
+ add p2q, lengthq
+ neg lengthq
+
+.loop:
+ mova m0, [p2q+lengthq+mmsize*0]
+ mova m1, [p2q+lengthq+mmsize*1]
+ psubd m0, [p1q+lengthq+mmsize*0]
+ psubd m1, [p1q+lengthq+mmsize*1]
+ mova [p1q+lengthq+mmsize*0], m0
+ mova [p1q+lengthq+mmsize*1], m1
+ add lengthq, mmsize*2
+ jl .loop
+ REP_RET
+
+cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
+ shl lengthd, 2
+ add p1q, lengthq
+ add p2q, lengthq
+ neg lengthq
+
+.loop:
+ mova m0, [p1q+lengthq]
+ mova m1, [p2q+lengthq]
+ mova m3, [p1q+lengthq+mmsize]
+ mova m4, [p2q+lengthq+mmsize]
+ mova m2, m1
+ mova m5, m4
+ psrad m2, 1
+ psrad m5, 1
+ psubd m0, m2
+ psubd m3, m5
+ paddd m1, m0
+ paddd m4, m3
+ mova [p1q+lengthq], m0
+ mova [p2q+lengthq], m1
+ mova [p1q+lengthq+mmsize], m3
+ mova [p2q+lengthq+mmsize], m4
+ add lengthq, mmsize*2
+ jl .loop
+ REP_RET
+
+INIT_XMM sse4
+cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor
+ shl lengthd, 2
+ add p1q, lengthq
+ add p2q, lengthq
+ neg lengthq
+
+ movd m2, dshiftm
+ movd m3, dfactorm
+ pshufd m3, m3, 0
+ mova m4, [pd_128]
+
+.loop:
+ mova m0, [p1q+lengthq]
+ mova m1, [p2q+lengthq]
+ psrad m1, m2
+ pmulld m1, m3
+ paddd m1, m4
+ psrad m1, 8
+ pslld m1, m2
+ psubd m1, m0
+ mova [p1q+lengthq], m1
+ add lengthq, mmsize
+ jl .loop
+ REP_RET
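
For readers who prefer not to decode the SIMD, here is a rough scalar sketch of what the four decorrelation kernels above compute, derived from the instruction sequences themselves (function names are illustrative; the C reference in libavcodec/takdsp.c remains authoritative, e.g. for overflow behaviour). The SSE2 kernels consume 8 samples per iteration and the SSE4 one 4, so any scalar tail is presumably handled by the caller.

    #include <stdint.h>

    /* tak_decorrelate_ls: paddd of p1/p2, result written back to p2 */
    static void tak_decorrelate_ls_c(int32_t *p1, int32_t *p2, int length)
    {
        for (int i = 0; i < length; i++)
            p2[i] += p1[i];
    }

    /* tak_decorrelate_sr: psubd, result written back to p1 */
    static void tak_decorrelate_sr_c(int32_t *p1, int32_t *p2, int length)
    {
        for (int i = 0; i < length; i++)
            p1[i] = p2[i] - p1[i];
    }

    /* tak_decorrelate_sm: psrad 1 + psubd, then paddd */
    static void tak_decorrelate_sm_c(int32_t *p1, int32_t *p2, int length)
    {
        for (int i = 0; i < length; i++) {
            p1[i] -= p2[i] >> 1;
            p2[i] += p1[i];
        }
    }

    /* tak_decorrelate_sf: psrad/pmulld/paddd 128/psrad 8/pslld, then psubd */
    static void tak_decorrelate_sf_c(int32_t *p1, int32_t *p2, int length,
                                     int dshift, int dfactor)
    {
        for (int i = 0; i < length; i++) {
            int32_t b = p2[i] >> dshift;
            b = ((b * dfactor + 128) >> 8) << dshift;
            p1[i] = b - p1[i];
        }
    }
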
diff --git a/libavcodec/x86/takdsp_init.c b/libavcodec/x86/takdsp_init.c
new file mode 100644
index 0000000000..555d0649c9
--- /dev/null
+++ b/libavcodec/x86/takdsp_init.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2015 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/takdsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_tak_decorrelate_ls_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sr_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sm_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sf_sse4(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor);
+
+av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->decorrelate_ls = ff_tak_decorrelate_ls_sse2;
+ c->decorrelate_sr = ff_tak_decorrelate_sr_sse2;
+ c->decorrelate_sm = ff_tak_decorrelate_sm_sse2;
+ }
+
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ c->decorrelate_sf = ff_tak_decorrelate_sf_sse4;
+ }
+#endif
+}
diff --git a/libavcodec/x86/ttadsp.asm b/libavcodec/x86/ttadsp.asm
new file mode 100644
index 0000000000..db12a32eca
--- /dev/null
+++ b/libavcodec/x86/ttadsp.asm
@@ -0,0 +1,119 @@
+;******************************************************************************
+;* TTA DSP SIMD optimizations
+;*
+;* Copyright (C) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pd_n0113: dd ~0, ~1, ~1, ~3
+pd_1224: dd 1, 2, 2, 4
+
+SECTION .text
+
+%macro TTA_FILTER 2
+INIT_XMM %1
+cglobal tta_filter_process, 5,5,%2, qm, dx, dl, error, in, shift, round
+ mova m2, [qmq ]
+ mova m3, [qmq + 0x10]
+ mova m4, [dxq ]
+ mova m5, [dxq + 0x10]
+
+ movd m6, [errorq] ; if (filter->error < 0) {
+ SPLATD m6 ; for (int i = 0; i < 8; i++)
+ psignd m0, m4, m6 ; filter->qm[i] -= filter->dx[i];
+ psignd m1, m5, m6 ; } else if (filter->error > 0) {
+ paddd m2, m0 ; for (int i = 0; i < 8; i++)
+ paddd m3, m1 ; filter->qm[i] += filter->dx[i];
+ mova [qmq ], m2 ; }
+ mova [qmq + 0x10], m3 ;
+
+ mova m0, [dlq ]
+ mova m1, [dlq + 0x10]
+
+%if cpuflag(sse4)
+ pmulld m2, m0
+ pmulld m3, m1
+%else
+ pshufd m6, m0, 0xb1
+ pshufd m7, m2, 0xb1
+ pmuludq m6, m7
+ pshufd m6, m6, 0xd8
+ pmuludq m2, m0
+ pshufd m2, m2, 0xd8
+ punpckldq m2, m6
+
+ pshufd m6, m1, 0xb1
+ pshufd m7, m3, 0xb1
+ pmuludq m6, m7
+ pshufd m6, m6, 0xd8
+ pmuludq m3, m1
+ pshufd m3, m3, 0xd8
+ punpckldq m3, m6
+%endif
+ ; Using horizontal add (phaddd) seems to be slower than shuffling stuff around
+ paddd m2, m3 ; int sum = filter->round +
+ ; filter->dl[0] * filter->qm[0] +
+ pshufd m3, m2, 0xe ; filter->dl[1] * filter->qm[1] +
+ paddd m2, m3 ; filter->dl[2] * filter->qm[2] +
+ ; filter->dl[3] * filter->qm[3] +
+ movd m6, roundm ; filter->dl[4] * filter->qm[4] +
+ paddd m6, m2 ; filter->dl[5] * filter->qm[5] +
+ pshufd m2, m2, 0x1 ; filter->dl[6] * filter->qm[6] +
+ paddd m6, m2 ; filter->dl[7] * filter->qm[7];
+
+ palignr m5, m4, 4 ; filter->dx[0] = filter->dx[1]; filter->dx[1] = filter->dx[2];
+ ; filter->dx[2] = filter->dx[3]; filter->dx[3] = filter->dx[4];
+
+ palignr m2, m1, m0, 4 ; filter->dl[0] = filter->dl[1]; filter->dl[1] = filter->dl[2];
+ ; filter->dl[2] = filter->dl[3]; filter->dl[3] = filter->dl[4];
+
+ psrad m4, m1, 30 ; filter->dx[4] = ((filter->dl[4] >> 30) | 1);
+ por m4, [pd_1224 ] ; filter->dx[5] = ((filter->dl[5] >> 30) | 2) & ~1;
+ pand m4, [pd_n0113] ; filter->dx[6] = ((filter->dl[6] >> 30) | 2) & ~1;
+ ; filter->dx[7] = ((filter->dl[7] >> 30) | 4) & ~3;
+
+ mova [dlq ], m2
+ mova [dxq ], m5
+ mova [dxq + 0x10], m4
+ movd m0, [inq] ; filter->error = *in;
+ movd [errorq], m0 ;
+
+ movd m2, shiftm ; *in += (sum >> filter->shift);
+ psrad m6, m2 ;
+ paddd m0, m6 ;
+ movd [inq], m0 ;
+
+ psrldq m1, 4 ;
+ pslldq m0, 12 ; filter->dl[4] = -filter->dl[5];
+ pshufd m0, m0, 0xf0 ; filter->dl[5] = -filter->dl[6];
+ psubd m0, m1 ; filter->dl[6] = *in - filter->dl[7];
+ psrldq m1, m0, 4 ; filter->dl[7] = *in;
+ pshufd m1, m1, 0xf4 ; filter->dl[5] += filter->dl[6];
+ paddd m0, m1 ; filter->dl[4] += filter->dl[5];
+ psrldq m1, 4 ;
+ paddd m0, m1 ;
+ mova [dlq + 0x10], m0 ;
+ RET
+%endmacro
+
+TTA_FILTER ssse3, 8
+TTA_FILTER sse4, 7
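
The C statements interleaved as comments above correspond to the following scalar routine; a minimal sketch matching the prototype declared in ttadsp_init.c, using 32-bit arithmetic throughout as the SIMD code does:

    #include <stdint.h>

    static void tta_filter_process_c(int32_t *qm, int32_t *dx, int32_t *dl,
                                     int32_t *error, int32_t *in,
                                     int32_t shift, int32_t round)
    {
        int32_t sum = round;

        /* adapt the quantized coefficients by the sign of the previous
         * error (the psignd + paddd block above) */
        if (*error < 0) {
            for (int i = 0; i < 8; i++)
                qm[i] -= dx[i];
        } else if (*error > 0) {
            for (int i = 0; i < 8; i++)
                qm[i] += dx[i];
        }

        /* 8-tap prediction (pmulld/pmuludq plus the shuffle/add reduction) */
        for (int i = 0; i < 8; i++)
            sum += qm[i] * dl[i];

        /* shift the adaptation and history delay lines */
        dx[0] = dx[1]; dx[1] = dx[2]; dx[2] = dx[3]; dx[3] = dx[4];
        dx[4] = ((dl[4] >> 30) | 1);
        dx[5] = ((dl[5] >> 30) | 2) & ~1;
        dx[6] = ((dl[6] >> 30) | 2) & ~1;
        dx[7] = ((dl[7] >> 30) | 4) & ~3;
        dl[0] = dl[1]; dl[1] = dl[2]; dl[2] = dl[3]; dl[3] = dl[4];

        *error = *in;            /* sign source for the next adaptation */
        *in   += sum >> shift;   /* reconstruct the sample              */

        /* second-order history update on the reconstructed sample */
        dl[4] = -dl[5]; dl[5] = -dl[6];
        dl[6] = *in - dl[7]; dl[7] = *in;
        dl[5] += dl[6]; dl[4] += dl[5];
    }
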
diff --git a/libavcodec/x86/ttadsp_init.c b/libavcodec/x86/ttadsp_init.c
new file mode 100644
index 0000000000..aa998c1afd
--- /dev/null
+++ b/libavcodec/x86/ttadsp_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/ttadsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_tta_filter_process_ssse3(int32_t *qm, int32_t *dx, int32_t *dl,
+ int32_t *error, int32_t *in, int32_t shift,
+ int32_t round);
+void ff_tta_filter_process_sse4(int32_t *qm, int32_t *dx, int32_t *dl,
+ int32_t *error, int32_t *in, int32_t shift,
+ int32_t round);
+
+av_cold void ff_ttadsp_init_x86(TTADSPContext *c)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSSE3(cpu_flags))
+ c->filter_process = ff_tta_filter_process_ssse3;
+ if (EXTERNAL_SSE4(cpu_flags))
+ c->filter_process = ff_tta_filter_process_sse4;
+#endif
+}
diff --git a/libavcodec/x86/ttaencdsp.asm b/libavcodec/x86/ttaencdsp.asm
new file mode 100644
index 0000000000..c9cbd49874
--- /dev/null
+++ b/libavcodec/x86/ttaencdsp.asm
@@ -0,0 +1,119 @@
+;******************************************************************************
+;* TTA Encoder DSP SIMD optimizations
+;*
+;* Copyright (C) 2014-2016 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pd_n0113: dd ~0, ~1, ~1, ~3
+pd_1224: dd 1, 2, 2, 4
+
+SECTION .text
+
+%macro TTAENC_FILTER 2
+INIT_XMM %1
+cglobal ttaenc_filter_process, 5,5,%2, qm, dx, dl, error, in, shift, round
+ mova m2, [qmq ]
+ mova m3, [qmq + 0x10]
+ mova m4, [dxq ]
+ mova m5, [dxq + 0x10]
+
+ movd m6, [errorq] ; if (filter->error < 0) {
+ SPLATD m6 ; for (int i = 0; i < 8; i++)
+ psignd m0, m4, m6 ; filter->qm[i] -= filter->dx[i];
+ psignd m1, m5, m6 ; } else if (filter->error > 0) {
+ paddd m2, m0 ; for (int i = 0; i < 8; i++)
+ paddd m3, m1 ; filter->qm[i] += filter->dx[i];
+ mova [qmq ], m2 ; }
+ mova [qmq + 0x10], m3 ;
+
+ mova m0, [dlq ]
+ mova m1, [dlq + 0x10]
+
+%if cpuflag(sse4)
+ pmulld m2, m0
+ pmulld m3, m1
+%else
+ pshufd m6, m0, 0xb1
+ pshufd m7, m2, 0xb1
+ pmuludq m6, m7
+ pshufd m6, m6, 0xd8
+ pmuludq m2, m0
+ pshufd m2, m2, 0xd8
+ punpckldq m2, m6
+
+ pshufd m6, m1, 0xb1
+ pshufd m7, m3, 0xb1
+ pmuludq m6, m7
+ pshufd m6, m6, 0xd8
+ pmuludq m3, m1
+ pshufd m3, m3, 0xd8
+ punpckldq m3, m6
+%endif
+ ; Using horizontal add (phaddd) seems to be slower than shuffling stuff around
+ paddd m2, m3 ; int sum = filter->round +
+ ; filter->dl[0] * filter->qm[0] +
+ pshufd m3, m2, 0xe ; filter->dl[1] * filter->qm[1] +
+ paddd m2, m3 ; filter->dl[2] * filter->qm[2] +
+ ; filter->dl[3] * filter->qm[3] +
+ movd m6, roundm ; filter->dl[4] * filter->qm[4] +
+ paddd m6, m2 ; filter->dl[5] * filter->qm[5] +
+ pshufd m2, m2, 0x1 ; filter->dl[6] * filter->qm[6] +
+ paddd m6, m2 ; filter->dl[7] * filter->qm[7];
+
+ palignr m5, m4, 4 ; filter->dx[0] = filter->dx[1]; filter->dx[1] = filter->dx[2];
+ ; filter->dx[2] = filter->dx[3]; filter->dx[3] = filter->dx[4];
+
+ palignr m2, m1, m0, 4 ; filter->dl[0] = filter->dl[1]; filter->dl[1] = filter->dl[2];
+ ; filter->dl[2] = filter->dl[3]; filter->dl[3] = filter->dl[4];
+
+ psrad m4, m1, 30 ; filter->dx[4] = ((filter->dl[4] >> 30) | 1);
+ por m4, [pd_1224 ] ; filter->dx[5] = ((filter->dl[5] >> 30) | 2) & ~1;
+ pand m4, [pd_n0113] ; filter->dx[6] = ((filter->dl[6] >> 30) | 2) & ~1;
+ ; filter->dx[7] = ((filter->dl[7] >> 30) | 4) & ~3;
+
+ mova [dlq ], m2
+ mova [dxq ], m5
+ mova [dxq + 0x10], m4
+
+ movd m2, shiftm ;
+ movd m0, [inq] ;
+ psrad m6, m2 ;
+ psubd m3, m0, m6 ;
+ movd [inq], m3 ; *in -= (sum >> filter->shift);
+ movd [errorq], m3 ; filter->error = *in;
+
+ psrldq m1, 4 ;
+ pslldq m0, 12 ; filter->dl[4] = -filter->dl[5];
+ pshufd m0, m0, 0xf0 ; filter->dl[5] = -filter->dl[6];
+ psubd m0, m1 ; filter->dl[6] = *in - filter->dl[7];
+ psrldq m1, m0, 4 ; filter->dl[7] = *in;
+ pshufd m1, m1, 0xf4 ; filter->dl[5] += filter->dl[6];
+ paddd m0, m1 ; filter->dl[4] += filter->dl[5];
+ psrldq m1, 4 ;
+ paddd m0, m1 ;
+ mova [dlq + 0x10], m0 ;
+ RET
+%endmacro
+
+TTAENC_FILTER ssse3, 8
+TTAENC_FILTER sse4, 7
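
This is the same hybrid filter as in ttadsp.asm; per the final stores above, the only behavioural difference is that the encoder feeds the delay line with the unmodified input sample, subtracts the prediction instead of adding it, and takes the adaptation error from the resulting residual. Relative to the decoder sketch after ttadsp.asm, the tail becomes roughly:

    /* dl[] is updated from the original *in, then: */
    *in    -= sum >> shift;   /* emit the prediction residual        */
    *error  = *in;            /* adapt on the residual next time     */
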
diff --git a/libavcodec/x86/ttaencdsp_init.c b/libavcodec/x86/ttaencdsp_init.c
new file mode 100644
index 0000000000..c1a02fdac9
--- /dev/null
+++ b/libavcodec/x86/ttaencdsp_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2014-2016 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/ttaencdsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_ttaenc_filter_process_ssse3(int32_t *qm, int32_t *dx, int32_t *dl,
+ int32_t *error, int32_t *in, int32_t shift,
+ int32_t round);
+void ff_ttaenc_filter_process_sse4(int32_t *qm, int32_t *dx, int32_t *dl,
+ int32_t *error, int32_t *in, int32_t shift,
+ int32_t round);
+
+av_cold void ff_ttaencdsp_init_x86(TTAEncDSPContext *c)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSSE3(cpu_flags))
+ c->filter_process = ff_ttaenc_filter_process_ssse3;
+ if (EXTERNAL_SSE4(cpu_flags))
+ c->filter_process = ff_ttaenc_filter_process_sse4;
+#endif
+}
diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c
new file mode 100644
index 0000000000..f579307aa0
--- /dev/null
+++ b/libavcodec/x86/v210-init.c
@@ -0,0 +1,48 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavcodec/v210dec.h"
+
+extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+av_cold void ff_v210_x86_init(V210DecContext *s)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (s->aligned_input) {
+ if (cpu_flags & AV_CPU_FLAG_SSSE3)
+ s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3;
+
+ if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
+ s->unpack_frame = ff_v210_planar_unpack_aligned_avx;
+ }
+ else {
+ if (cpu_flags & AV_CPU_FLAG_SSSE3)
+ s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3;
+
+ if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
+ s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;
+ }
+#endif
+}
diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm
new file mode 100644
index 0000000000..c24c765e5b
--- /dev/null
+++ b/libavcodec/x86/v210.asm
@@ -0,0 +1,90 @@
+;******************************************************************************
+;* V210 SIMD unpack
+;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu>
+;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+v210_mask: times 4 dd 0x3ff
+v210_mult: dw 64,4,64,4,64,4,64,4
+v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
+v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
+
+SECTION .text
+
+%macro v210_planar_unpack 1
+
+; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
+cglobal v210_planar_unpack_%1, 5, 5, 7
+ movsxdifnidn r4, r4d
+ lea r1, [r1+2*r4]
+ add r2, r4
+ add r3, r4
+ neg r4
+
+ mova m3, [v210_mult]
+ mova m4, [v210_mask]
+ mova m5, [v210_luma_shuf]
+ mova m6, [v210_chroma_shuf]
+.loop:
+%ifidn %1, unaligned
+ movu m0, [r0]
+%else
+ mova m0, [r0]
+%endif
+
+ pmullw m1, m0, m3
+ psrld m0, 10
+ psrlw m1, 6 ; u0 v0 y1 y2 v1 u2 y4 y5
+ pand m0, m4 ; y0 __ u1 __ y3 __ v2 __
+
+ shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __
+ pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __
+ movu [r1+2*r4], m2
+
+ shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __
+ pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __
+ movq [r2+r4], m1
+ movhps [r3+r4], m1
+
+ add r0, mmsize
+ add r4, 6
+ jl .loop
+
+ REP_RET
+%endmacro
+
+INIT_XMM ssse3
+v210_planar_unpack unaligned
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+v210_planar_unpack unaligned
+%endif
+
+INIT_XMM ssse3
+v210_planar_unpack aligned
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+v210_planar_unpack aligned
+%endif
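
v210 stores three 10-bit components per little-endian 32-bit word, so a group of four words carries six pixels; the shuffles above rebuild contiguous y/u/v planes from that layout. A scalar sketch of the same unpacking, assuming the usual v210 word order and a width that is a multiple of 6 (the scalar fallback in libavcodec/v210dec.c is the reference):

    #include <stdint.h>

    static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y,
                                     uint16_t *u, uint16_t *v, int width)
    {
        for (int w = 0; w < width; w += 6) {
            uint32_t a = src[0], b = src[1], c = src[2], d = src[3];
            src += 4;

            /* components sit in bits 0-9, 10-19 and 20-29 of each word */
            *u++ = a & 0x3ff; *y++ = (a >> 10) & 0x3ff; *v++ = (a >> 20) & 0x3ff;
            *y++ = b & 0x3ff; *u++ = (b >> 10) & 0x3ff; *y++ = (b >> 20) & 0x3ff;
            *v++ = c & 0x3ff; *y++ = (c >> 10) & 0x3ff; *u++ = (c >> 20) & 0x3ff;
            *y++ = d & 0x3ff; *v++ = (d >> 10) & 0x3ff; *y++ = (d >> 20) & 0x3ff;
        }
    }
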
diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index 0db0196313..965f2bea3c 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -2,20 +2,20 @@
;* V210 SIMD pack
;* Copyright (c) 2014 Kieran Kunhya <kierank@obe.tv>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -23,8 +23,9 @@
SECTION_RODATA 32
-v210_enc_min_10: times 32 dw 0x4
-v210_enc_max_10: times 32 dw 0x3fb
+cextern pw_4
+%define v210_enc_min_10 pw_4
+v210_enc_max_10: times 16 dw 0x3fb
v210_enc_luma_mult_10: times 2 dw 4,1,16,4,1,16,0,0
v210_enc_luma_shuf_10: times 2 db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11
@@ -32,16 +33,19 @@ v210_enc_luma_shuf_10: times 2 db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11
v210_enc_chroma_mult_10: times 2 dw 1,4,16,0,16,1,4,0
v210_enc_chroma_shuf_10: times 2 db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1
-v210_enc_min_8: times 32 db 0x1
-v210_enc_max_8: times 32 db 0xfe
+cextern pb_1
+%define v210_enc_min_8 pb_1
+cextern pb_FE
+%define v210_enc_max_8 pb_FE
-v210_enc_luma_mult_8: times 2 dw 16,4,64,16,4,64,0,0
v210_enc_luma_shuf_8: times 2 db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1
+v210_enc_luma_mult_8: times 2 dw 16,4,64,16,4,64,0,0
-v210_enc_chroma_mult_8: times 2 dw 4,16,64,0,64,4,16,0
v210_enc_chroma_shuf1_8: times 2 db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1
v210_enc_chroma_shuf2_8: times 2 db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1
+v210_enc_chroma_mult_8: times 2 dw 4,16,64,0,64,4,16,0
+
SECTION .text
%macro v210_planar_pack_10 0
@@ -59,16 +63,16 @@ cglobal v210_planar_pack_10, 5, 5, 4+cpuflag(avx2), y, u, v, dst, width
.loop:
movu xm0, [yq+2*widthq]
%if cpuflag(avx2)
- vinserti128 m0, m0, [yq+2*widthq+12], 1
+ vinserti128 m0, m0, [yq+widthq*2+12], 1
%endif
CLIPW m0, m2, m3
- movq xm1, [uq+widthq]
- movhps xm1, [vq+widthq]
+ movq xm1, [uq+widthq]
+ movhps xm1, [vq+widthq]
%if cpuflag(avx2)
movq xm4, [uq+widthq+6]
movhps xm4, [vq+widthq+6]
- vinserti128 m1, m1, xm4, 1
+ vinserti128 m1, m1, xm4, 1
%endif
CLIPW m1, m2, m3
@@ -93,6 +97,7 @@ cglobal v210_planar_pack_10, 5, 5, 4+cpuflag(avx2), y, u, v, dst, width
INIT_XMM ssse3
v210_planar_pack_10
%endif
+
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
v210_planar_pack_10
@@ -113,9 +118,9 @@ cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width
pxor m6, m6
.loop:
- movu xm1, [yq+2*widthq]
+ movu xm1, [yq+widthq*2]
%if cpuflag(avx2)
- vinserti128 m1, m1, [yq+2*widthq+12], 1
+ vinserti128 m1, m1, [yq+widthq*2+12], 1
%endif
CLIPUB m1, m4, m5
@@ -172,6 +177,7 @@ v210_planar_pack_8
INIT_XMM avx
v210_planar_pack_8
%endif
+
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
v210_planar_pack_8
diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c
index c4d2745b6f..e997b4b67a 100644
--- a/libavcodec/x86/v210enc_init.c
+++ b/libavcodec/x86/v210enc_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/vc1dsp.h b/libavcodec/x86/vc1dsp.h
index 9b6c8ada26..fdd4de1813 100644
--- a/libavcodec/x86/vc1dsp.h
+++ b/libavcodec/x86/vc1dsp.h
@@ -1,20 +1,20 @@
/*
* VC-1 and WMV3 decoder - X86 DSP init functions
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c
index 8982ff908a..79d22a294f 100644
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@ -27,6 +27,7 @@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
+#include "libavutil/x86/asm.h"
#include "libavcodec/vc1dsp.h"
#include "fpel.h"
#include "vc1dsp.h"
@@ -63,11 +64,22 @@ static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq);
}
-static void avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd)
-{
- ff_avg_pixels8_mmxext(dst, src, stride, 8);
-}
+#define DECLARE_FUNCTION(OP, DEPTH, INSN) \
+ static void OP##vc1_mspel_mc00_##DEPTH##INSN(uint8_t *dst, \
+ const uint8_t *src, ptrdiff_t stride, int rnd) \
+ { \
+ ff_ ## OP ## pixels ## DEPTH ## INSN(dst, src, stride, DEPTH); \
+ }
+
+DECLARE_FUNCTION(put_, 8, _mmx)
+DECLARE_FUNCTION(put_, 16, _mmx)
+DECLARE_FUNCTION(avg_, 8, _mmx)
+DECLARE_FUNCTION(avg_, 16, _mmx)
+DECLARE_FUNCTION(avg_, 8, _mmxext)
+DECLARE_FUNCTION(avg_, 16, _mmxext)
+DECLARE_FUNCTION(put_, 16, _sse2)
+DECLARE_FUNCTION(avg_, 16, _sse2)
+
#endif /* HAVE_YASM */
void ff_put_vc1_chroma_mc8_nornd_mmx (uint8_t *dst, uint8_t *src,
@@ -80,16 +92,24 @@ void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
+void ff_vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
+ int16_t *block);
+void ff_vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
+ int16_t *block);
+void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
+ int16_t *block);
+void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
+ int16_t *block);
av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
- if (INLINE_MMX(cpu_flags))
+ if (HAVE_6REGS && INLINE_MMX(cpu_flags) && EXTERNAL_MMX(cpu_flags))
ff_vc1dsp_init_mmx(dsp);
- if (INLINE_MMXEXT(cpu_flags))
+ if (HAVE_6REGS && INLINE_MMXEXT(cpu_flags) && EXTERNAL_MMXEXT(cpu_flags))
ff_vc1dsp_init_mmxext(dsp);
#define ASSIGN_LF(EXT) \
@@ -103,6 +123,11 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
#if HAVE_YASM
if (EXTERNAL_MMX(cpu_flags)) {
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_mmx;
+
+ dsp->put_vc1_mspel_pixels_tab[1][0] = put_vc1_mspel_mc00_8_mmx;
+ dsp->put_vc1_mspel_pixels_tab[0][0] = put_vc1_mspel_mc00_16_mmx;
+ dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_vc1_mspel_mc00_8_mmx;
+ dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_mmx;
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_3dnow;
@@ -111,13 +136,22 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
ASSIGN_LF(mmxext);
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[0] = avg_vc1_mspel_mc00_mmxext;
+ dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_vc1_mspel_mc00_8_mmxext;
+ dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_mmxext;
+
+ dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_mmxext;
+ dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_mmxext;
+ dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_mmxext;
+ dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2;
dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse2;
dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2;
dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2;
+
+ dsp->put_vc1_mspel_pixels_tab[0][0] = put_vc1_mspel_mc00_16_sse2;
+ dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
ASSIGN_LF(ssse3);
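
For reference, the DECLARE_FUNCTION macro added above only generates thin no-shift (mc00) wrappers around the fpel copy/average helpers declared in fpel.h; one expansion, written out by hand:

    static void avg_vc1_mspel_mc00_16_sse2(uint8_t *dst, const uint8_t *src,
                                           ptrdiff_t stride, int rnd)
    {
        ff_avg_pixels16_sse2(dst, src, stride, 16);   /* rnd is unused */
    }
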
diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp_loopfilter.asm
index adf08d7d84..1838f6f235 100644
--- a/libavcodec/x86/vc1dsp.asm
+++ b/libavcodec/x86/vc1dsp_loopfilter.asm
@@ -1,21 +1,21 @@
;******************************************************************************
-;* VC1 deblocking optimizations
+;* VC1 loopfilter optimizations
;* Copyright (c) 2009 David Conrad
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm
new file mode 100644
index 0000000000..2850ca861d
--- /dev/null
+++ b/libavcodec/x86/vc1dsp_mc.asm
@@ -0,0 +1,292 @@
+;******************************************************************************
+;* VC1 motion compensation optimizations
+;* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+cextern pw_9
+cextern pw_128
+
+section .text
+
+%if HAVE_MMX_INLINE
+
+; XXX some of these macros are not used right now, but they will be once
+; more functions are ported.
+
+%macro OP_PUT 2 ; dst, src
+%endmacro
+
+%macro OP_AVG 2 ; dst, src
+ pavgb %1, %2
+%endmacro
+
+%macro NORMALIZE_MMX 1 ; shift
+ paddw m3, m7 ; +bias-r
+ paddw m4, m7 ; +bias-r
+ psraw m3, %1
+ psraw m4, %1
+%endmacro
+
+%macro TRANSFER_DO_PACK 2 ; op, dst
+ packuswb m3, m4
+ %1 m3, [%2]
+ mova [%2], m3
+%endmacro
+
+%macro TRANSFER_DONT_PACK 2 ; op, dst
+ %1 m3, [%2]
+ %1 m3, [%2 + mmsize]
+ mova [%2], m3
+ mova [mmsize + %2], m4
+%endmacro
+
+; see MSPEL_FILTER13_CORE for use as UNPACK macro
+%macro DO_UNPACK 1 ; reg
+ punpcklbw %1, m0
+%endmacro
+%macro DONT_UNPACK 1 ; reg
+%endmacro
+
+; Compute the rounder 32-r or 8-r and unpack it to m7
+%macro LOAD_ROUNDER_MMX 1 ; round
+ movd m7, %1
+ punpcklwd m7, m7
+ punpckldq m7, m7
+%endmacro
+
+%macro SHIFT2_LINE 5 ; off, r0, r1, r2, r3
+ paddw m%3, m%4
+ movh m%2, [srcq + stride_neg2]
+ pmullw m%3, m6
+ punpcklbw m%2, m0
+ movh m%5, [srcq + strideq]
+ psubw m%3, m%2
+ punpcklbw m%5, m0
+ paddw m%3, m7
+ psubw m%3, m%5
+ psraw m%3, shift
+ movu [dstq + %1], m%3
+ add srcq, strideq
+%endmacro
+
+INIT_MMX mmx
+; void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst, const uint8_t *src,
+; x86_reg stride, int rnd, int64_t shift)
+; Sacrificing m6 makes it possible to pipeline loads from src
+%if ARCH_X86_32
+cglobal vc1_put_ver_16b_shift2, 3,6,0, dst, src, stride
+ DECLARE_REG_TMP 3, 4, 5
+ %define rnd r3mp
+ %define shift qword r4m
+%else ; X86_64
+cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
+ DECLARE_REG_TMP 4, 5, 6
+ %define rnd r3d
+ ; We need shift either in memory or in a mm reg as it's used in psraw
+ ; On WIN64, the arg is already on the stack
+ ; On UNIX64, m5 doesn't seem to be used
+%if WIN64
+ %define shift r4mp
+%else ; UNIX64
+ %define shift m5
+ mova shift, r4q
+%endif ; WIN64
+%endif ; X86_32
+%define stride_neg2 t0q
+%define stride_9minus4 t1q
+%define i t2q
+ mov stride_neg2, strideq
+ neg stride_neg2
+ add stride_neg2, stride_neg2
+ lea stride_9minus4, [strideq * 9 - 4]
+ mov i, 3
+ LOAD_ROUNDER_MMX rnd
+ mova m6, [pw_9]
+ pxor m0, m0
+.loop:
+ movh m2, [srcq]
+ add srcq, strideq
+ movh m3, [srcq]
+ punpcklbw m2, m0
+ punpcklbw m3, m0
+ SHIFT2_LINE 0, 1, 2, 3, 4
+ SHIFT2_LINE 24, 2, 3, 4, 1
+ SHIFT2_LINE 48, 3, 4, 1, 2
+ SHIFT2_LINE 72, 4, 1, 2, 3
+ SHIFT2_LINE 96, 1, 2, 3, 4
+ SHIFT2_LINE 120, 2, 3, 4, 1
+ SHIFT2_LINE 144, 3, 4, 1, 2
+ SHIFT2_LINE 168, 4, 1, 2, 3
+ sub srcq, stride_9minus4
+ add dstq, 8
+ dec i
+ jnz .loop
+ REP_RET
+%undef rnd
+%undef shift
+%undef stride_neg2
+%undef stride_9minus4
+%undef i
+
+; void ff_vc1_*_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
+; const int16_t *src, int rnd);
+; Data is already unpacked, so some operations can be performed directly on
+; memory operands.
+%macro HOR_16B_SHIFT2 2 ; op, opname
+cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h
+ mov hq, 8
+ sub srcq, 2
+ sub rndd, (-1+9+9-1) * 1024 ; add -1024 bias
+ LOAD_ROUNDER_MMX rndd
+ mova m5, [pw_9]
+ mova m6, [pw_128]
+ pxor m0, m0
+
+.loop:
+ mova m1, [srcq + 2 * 0]
+ mova m2, [srcq + 2 * 0 + mmsize]
+ mova m3, [srcq + 2 * 1]
+ mova m4, [srcq + 2 * 1 + mmsize]
+ paddw m3, [srcq + 2 * 2]
+ paddw m4, [srcq + 2 * 2 + mmsize]
+ paddw m1, [srcq + 2 * 3]
+ paddw m2, [srcq + 2 * 3 + mmsize]
+ pmullw m3, m5
+ pmullw m4, m5
+ psubw m3, m1
+ psubw m4, m2
+ NORMALIZE_MMX 7
+ ; remove bias
+ paddw m3, m6
+ paddw m4, m6
+ TRANSFER_DO_PACK %1, dstq
+ add srcq, 24
+ add dstq, strideq
+ dec hq
+ jnz .loop
+
+ RET
+%endmacro
+
+INIT_MMX mmx
+HOR_16B_SHIFT2 OP_PUT, put
+
+INIT_MMX mmxext
+HOR_16B_SHIFT2 OP_AVG, avg
+%endif ; HAVE_MMX_INLINE
+
+%macro INV_TRANS_INIT 0
+ movsxdifnidn linesizeq, linesized
+ movd m0, blockd
+ SPLATW m0, m0
+ pxor m1, m1
+ psubw m1, m0
+ packuswb m0, m0
+ packuswb m1, m1
+
+ DEFINE_ARGS dest, linesize, linesize3
+ lea linesize3q, [linesizeq*3]
+%endmacro
+
+%macro INV_TRANS_PROCESS 1
+ mov%1 m2, [destq+linesizeq*0]
+ mov%1 m3, [destq+linesizeq*1]
+ mov%1 m4, [destq+linesizeq*2]
+ mov%1 m5, [destq+linesize3q]
+ paddusb m2, m0
+ paddusb m3, m0
+ paddusb m4, m0
+ paddusb m5, m0
+ psubusb m2, m1
+ psubusb m3, m1
+ psubusb m4, m1
+ psubusb m5, m1
+ mov%1 [linesizeq*0+destq], m2
+ mov%1 [linesizeq*1+destq], m3
+ mov%1 [linesizeq*2+destq], m4
+ mov%1 [linesize3q +destq], m5
+%endmacro
+
+; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
+INIT_MMX mmxext
+cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block
+ movsx r3d, WORD [blockq]
+ mov blockd, r3d ; dc
+ shl blockd, 4 ; 16 * dc
+ lea blockd, [blockq+r3+4] ; 17 * dc + 4
+ sar blockd, 3 ; >> 3
+ mov r3d, blockd ; dc
+ shl blockd, 4 ; 16 * dc
+ lea blockd, [blockq+r3+64] ; 17 * dc + 64
+ sar blockd, 7 ; >> 7
+
+ INV_TRANS_INIT
+
+ INV_TRANS_PROCESS h
+ RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_4x8_dc, 3,4,0, dest, linesize, block
+ movsx r3d, WORD [blockq]
+ mov blockd, r3d ; dc
+ shl blockd, 4 ; 16 * dc
+ lea blockd, [blockq+r3+4] ; 17 * dc + 4
+ sar blockd, 3 ; >> 3
+ shl blockd, 2 ; 4 * dc
+ lea blockd, [blockq*3+64] ; 12 * dc + 64
+ sar blockd, 7 ; >> 7
+
+ INV_TRANS_INIT
+
+ INV_TRANS_PROCESS h
+ lea destq, [destq+linesizeq*4]
+ INV_TRANS_PROCESS h
+ RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_8x4_dc, 3,4,0, dest, linesize, block
+ movsx blockd, WORD [blockq] ; dc
+ lea blockd, [blockq*3+1] ; 3 * dc + 1
+ sar blockd, 1 ; >> 1
+ mov r3d, blockd ; dc
+ shl blockd, 4 ; 16 * dc
+ lea blockd, [blockq+r3+64] ; 17 * dc + 64
+ sar blockd, 7 ; >> 7
+
+ INV_TRANS_INIT
+
+ INV_TRANS_PROCESS a
+ RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_8x8_dc, 3,3,0, dest, linesize, block
+ movsx blockd, WORD [blockq] ; dc
+ lea blockd, [blockq*3+1] ; 3 * dc + 1
+ sar blockd, 1 ; >> 1
+ lea blockd, [blockq*3+16] ; 3 * dc + 16
+ sar blockd, 5 ; >> 5
+
+ INV_TRANS_INIT
+
+ INV_TRANS_PROCESS a
+ lea destq, [destq+linesizeq*4]
+ INV_TRANS_PROCESS a
+ RET
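
The *_dc kernels above cover the common case where only the DC coefficient of a block is non-zero: the two 1-D transform passes then collapse into two scalings of that value, which is added to the destination with saturation (the paddusb/psubusb pair in INV_TRANS_PROCESS). A scalar sketch of the 8x8 variant; the other sizes only swap in the scaling steps annotated next to the lea/sar instructions, e.g. (17*dc+4)>>3 and (17*dc+64)>>7 for 4x4:

    #include <stddef.h>
    #include <stdint.h>

    static inline uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    static void vc1_inv_trans_8x8_dc_c(uint8_t *dest, ptrdiff_t linesize,
                                       int16_t *block)
    {
        int dc = block[0];

        dc = (3 * dc +  1) >> 1;   /* row-pass gain    */
        dc = (3 * dc + 16) >> 5;   /* column-pass gain */

        for (int y = 0; y < 8; y++) {
            for (int x = 0; x < 8; x++)
                dest[x] = clip_u8(dest[x] + dc);
            dest += linesize;
        }
    }
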
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index d64ddf0174..45c8a68f29 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -25,7 +25,6 @@
*/
#include "libavutil/cpu.h"
-#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
@@ -34,7 +33,15 @@
#include "fpel.h"
#include "vc1dsp.h"
-#if HAVE_INLINE_ASM
+#if HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL
+
+void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
+ const uint8_t *src, x86_reg stride,
+ int rnd, int64_t shift);
+void ff_vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
+ const int16_t *src, int rnd);
+void ff_vc1_avg_hor_16b_shift2_mmxext(uint8_t *dst, x86_reg stride,
+ const int16_t *src, int rnd);
#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
@@ -67,102 +74,6 @@
"punpcklwd %%mm7, %%mm7 \n\t" \
"punpckldq %%mm7, %%mm7 \n\t"
-#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
- "paddw %%mm"#R2", %%mm"#R1" \n\t" \
- "movd (%0,%3), %%mm"#R0" \n\t" \
- "pmullw %%mm6, %%mm"#R1" \n\t" \
- "punpcklbw %%mm0, %%mm"#R0" \n\t" \
- "movd (%0,%2), %%mm"#R3" \n\t" \
- "psubw %%mm"#R0", %%mm"#R1" \n\t" \
- "punpcklbw %%mm0, %%mm"#R3" \n\t" \
- "paddw %%mm7, %%mm"#R1" \n\t" \
- "psubw %%mm"#R3", %%mm"#R1" \n\t" \
- "psraw %4, %%mm"#R1" \n\t" \
- "movq %%mm"#R1", "#OFF"(%1) \n\t" \
- "add %2, %0 \n\t"
-
-/** Sacrificing mm6 allows to pipeline loads from src */
-static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
- const uint8_t *src, x86_reg stride,
- int rnd, int64_t shift)
-{
- __asm__ volatile(
- "mov $3, %%"FF_REG_c" \n\t"
- LOAD_ROUNDER_MMX("%5")
- "movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
- "1: \n\t"
- "movd (%0), %%mm2 \n\t"
- "add %2, %0 \n\t"
- "movd (%0), %%mm3 \n\t"
- "punpcklbw %%mm0, %%mm2 \n\t"
- "punpcklbw %%mm0, %%mm3 \n\t"
- SHIFT2_LINE( 0, 1, 2, 3, 4)
- SHIFT2_LINE( 24, 2, 3, 4, 1)
- SHIFT2_LINE( 48, 3, 4, 1, 2)
- SHIFT2_LINE( 72, 4, 1, 2, 3)
- SHIFT2_LINE( 96, 1, 2, 3, 4)
- SHIFT2_LINE(120, 2, 3, 4, 1)
- SHIFT2_LINE(144, 3, 4, 1, 2)
- SHIFT2_LINE(168, 4, 1, 2, 3)
- "sub %6, %0 \n\t"
- "add $8, %1 \n\t"
- "dec %%"FF_REG_c" \n\t"
- "jnz 1b \n\t"
- : "+r"(src), "+r"(dst)
- : "r"(stride), "r"(-2*stride),
- "m"(shift), "m"(rnd), "r"(9*stride-4)
- : "%"FF_REG_c, "memory"
- );
-}
-
-/**
- * Data is already unpacked, so some operations can directly be made from
- * memory.
- */
-#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
-static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
- const int16_t *src, int rnd)\
-{\
- int h = 8;\
-\
- src -= 1;\
- rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
- __asm__ volatile(\
- LOAD_ROUNDER_MMX("%4")\
- "movq "MANGLE(ff_pw_128)", %%mm6\n\t"\
- "movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\
- "1: \n\t"\
- "movq 2*0+0(%1), %%mm1 \n\t"\
- "movq 2*0+8(%1), %%mm2 \n\t"\
- "movq 2*1+0(%1), %%mm3 \n\t"\
- "movq 2*1+8(%1), %%mm4 \n\t"\
- "paddw 2*3+0(%1), %%mm1 \n\t"\
- "paddw 2*3+8(%1), %%mm2 \n\t"\
- "paddw 2*2+0(%1), %%mm3 \n\t"\
- "paddw 2*2+8(%1), %%mm4 \n\t"\
- "pmullw %%mm5, %%mm3 \n\t"\
- "pmullw %%mm5, %%mm4 \n\t"\
- "psubw %%mm1, %%mm3 \n\t"\
- "psubw %%mm2, %%mm4 \n\t"\
- NORMALIZE_MMX("$7")\
- /* Remove bias */\
- "paddw %%mm6, %%mm3 \n\t"\
- "paddw %%mm6, %%mm4 \n\t"\
- TRANSFER_DO_PACK(OP)\
- "add $24, %1 \n\t"\
- "add %3, %2 \n\t"\
- "decl %0 \n\t"\
- "jnz 1b \n\t"\
- : "+r"(h), "+r" (src), "+r" (dst)\
- : "r"(stride), "m"(rnd)\
- : "memory"\
- );\
-}
-
-VC1_HOR_16b_SHIFT2(OP_PUT, put_)
-VC1_HOR_16b_SHIFT2(OP_AVG, avg_)
-
-
/**
* Purely vertical or horizontal 1/2 shift interpolation.
* Sacrifice mm6 for *9 factor.
@@ -213,6 +124,7 @@ static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
: "+r"(src), "+r"(dst)\
: "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
"g"(stride-offset)\
+ NAMED_CONSTRAINTS_ADD(ff_pw_9)\
: "%"FF_REG_c, "memory"\
);\
}
@@ -315,6 +227,7 @@ vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
: "+r"(h), "+r" (src), "+r" (dst) \
: "r"(src_stride), "r"(3*src_stride), \
"m"(rnd), "m"(shift) \
+ NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_53,ff_pw_18) \
: "memory" \
); \
}
@@ -352,6 +265,7 @@ OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
"jnz 1b \n\t" \
: "+r"(h), "+r" (src), "+r" (dst) \
: "r"(stride), "m"(rnd) \
+ NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_18,ff_pw_53,ff_pw_128) \
: "memory" \
); \
}
@@ -387,6 +301,7 @@ OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
"jnz 1b \n\t" \
: "+r"(h), "+r" (src), "+r" (dst) \
: "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
+ NAMED_CONSTRAINTS_ADD(ff_pw_53,ff_pw_18,ff_pw_3) \
: "memory" \
); \
}
@@ -420,14 +335,14 @@ typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_
* @param hmode Vertical filter.
* @param rnd Rounding bias.
*/
-#define VC1_MSPEL_MC(OP)\
+#define VC1_MSPEL_MC(OP, INSTR)\
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
int hmode, int vmode, int rnd)\
{\
static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
- { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
+ { NULL, vc1_put_ver_16b_shift1_mmx, ff_vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
- { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
+ { NULL, OP ## vc1_hor_16b_shift1_mmx, ff_vc1_ ## OP ## hor_16b_shift2_ ## INSTR, OP ## vc1_hor_16b_shift3_mmx };\
static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
{ NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
\
@@ -441,7 +356,7 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
static const int shift_value[] = { 0, 5, 1, 5 };\
int shift = (shift_value[hmode]+shift_value[vmode])>>1;\
int r;\
- DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
+ LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);\
\
r = (1<<(shift-1)) + rnd-1;\
vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
@@ -457,10 +372,19 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
\
/* Horizontal mode with no vertical mode */\
vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
+} \
+static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
+ int stride, int hmode, int vmode, int rnd)\
+{ \
+ OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
+ OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
+ dst += 8*stride; src += 8*stride; \
+ OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
+ OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
}
-VC1_MSPEL_MC(put_)
-VC1_MSPEL_MC(avg_)
+VC1_MSPEL_MC(put_, mmx)
+VC1_MSPEL_MC(avg_, mmxext)
/** Macro to ease bicubic filter interpolation functions declarations */
#define DECLARE_FUNCTION(a, b) \
@@ -477,6 +401,20 @@ static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \
int rnd) \
{ \
avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
+}\
+static void put_vc1_mspel_mc ## a ## b ## _16_mmx(uint8_t *dst, \
+ const uint8_t *src, \
+ ptrdiff_t stride, \
+ int rnd) \
+{ \
+ put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
+}\
+static void avg_vc1_mspel_mc ## a ## b ## _16_mmxext(uint8_t *dst, \
+ const uint8_t *src,\
+ ptrdiff_t stride, \
+ int rnd) \
+{ \
+ avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
}
DECLARE_FUNCTION(0, 1)
@@ -498,261 +436,51 @@ DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)
-static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, ptrdiff_t stride,
- int16_t *block)
-{
- int dc = block[0];
- dc = (17 * dc + 4) >> 3;
- dc = (17 * dc + 64) >> 7;
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
- );
- __asm__ volatile(
- "movd %0, %%mm2 \n\t"
- "movd %1, %%mm3 \n\t"
- "movd %2, %%mm4 \n\t"
- "movd %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movd %%mm2, %0 \n\t"
- "movd %%mm3, %1 \n\t"
- "movd %%mm4, %2 \n\t"
- "movd %%mm5, %3 \n\t"
- :"+m"(*(uint32_t *)(dest + 0 * stride)),
- "+m"(*(uint32_t *)(dest + 1 * stride)),
- "+m"(*(uint32_t *)(dest + 2 * stride)),
- "+m"(*(uint32_t *)(dest + 3 * stride))
- );
-}
-
-static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, ptrdiff_t stride,
- int16_t *block)
-{
- int dc = block[0];
- dc = (17 * dc + 4) >> 3;
- dc = (12 * dc + 64) >> 7;
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
- );
- __asm__ volatile(
- "movd %0, %%mm2 \n\t"
- "movd %1, %%mm3 \n\t"
- "movd %2, %%mm4 \n\t"
- "movd %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movd %%mm2, %0 \n\t"
- "movd %%mm3, %1 \n\t"
- "movd %%mm4, %2 \n\t"
- "movd %%mm5, %3 \n\t"
- :"+m"(*(uint32_t *)(dest + 0 * stride)),
- "+m"(*(uint32_t *)(dest + 1 * stride)),
- "+m"(*(uint32_t *)(dest + 2 * stride)),
- "+m"(*(uint32_t *)(dest + 3 * stride))
- );
- dest += 4 * stride;
- __asm__ volatile(
- "movd %0, %%mm2 \n\t"
- "movd %1, %%mm3 \n\t"
- "movd %2, %%mm4 \n\t"
- "movd %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movd %%mm2, %0 \n\t"
- "movd %%mm3, %1 \n\t"
- "movd %%mm4, %2 \n\t"
- "movd %%mm5, %3 \n\t"
- :"+m"(*(uint32_t *)(dest + 0 * stride)),
- "+m"(*(uint32_t *)(dest + 1 * stride)),
- "+m"(*(uint32_t *)(dest + 2 * stride)),
- "+m"(*(uint32_t *)(dest + 3 * stride))
- );
-}
-
-static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, ptrdiff_t stride,
- int16_t *block)
-{
- int dc = block[0];
- dc = ( 3 * dc + 1) >> 1;
- dc = (17 * dc + 64) >> 7;
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
- );
- __asm__ volatile(
- "movq %0, %%mm2 \n\t"
- "movq %1, %%mm3 \n\t"
- "movq %2, %%mm4 \n\t"
- "movq %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movq %%mm2, %0 \n\t"
- "movq %%mm3, %1 \n\t"
- "movq %%mm4, %2 \n\t"
- "movq %%mm5, %3 \n\t"
- :"+m"(*(uint32_t *)(dest + 0 * stride)),
- "+m"(*(uint32_t *)(dest + 1 * stride)),
- "+m"(*(uint32_t *)(dest + 2 * stride)),
- "+m"(*(uint32_t *)(dest + 3 * stride))
- );
-}
-
-static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, ptrdiff_t stride,
- int16_t *block)
-{
- int dc = block[0];
- dc = (3 * dc + 1) >> 1;
- dc = (3 * dc + 16) >> 5;
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
- );
- __asm__ volatile(
- "movq %0, %%mm2 \n\t"
- "movq %1, %%mm3 \n\t"
- "movq %2, %%mm4 \n\t"
- "movq %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movq %%mm2, %0 \n\t"
- "movq %%mm3, %1 \n\t"
- "movq %%mm4, %2 \n\t"
- "movq %%mm5, %3 \n\t"
- :"+m"(*(uint32_t *)(dest + 0 * stride)),
- "+m"(*(uint32_t *)(dest + 1 * stride)),
- "+m"(*(uint32_t *)(dest + 2 * stride)),
- "+m"(*(uint32_t *)(dest + 3 * stride))
- );
- dest += 4 * stride;
- __asm__ volatile(
- "movq %0, %%mm2 \n\t"
- "movq %1, %%mm3 \n\t"
- "movq %2, %%mm4 \n\t"
- "movq %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movq %%mm2, %0 \n\t"
- "movq %%mm3, %1 \n\t"
- "movq %%mm4, %2 \n\t"
- "movq %%mm5, %3 \n\t"
- :"+m"(*(uint32_t *)(dest + 0 * stride)),
- "+m"(*(uint32_t *)(dest + 1 * stride)),
- "+m"(*(uint32_t *)(dest + 2 * stride)),
- "+m"(*(uint32_t *)(dest + 3 * stride))
- );
-}
-
-static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd)
-{
- ff_put_pixels8_mmx(dst, src, stride, 8);
-}
+#define FN_ASSIGN(OP, X, Y, INSN) \
+ dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \
+ dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN
av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
{
- dsp->put_vc1_mspel_pixels_tab[ 0] = put_vc1_mspel_mc00_mmx;
- dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
- dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
- dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
-
- dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
- dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
- dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
- dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;
-
- dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
- dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
- dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
- dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;
-
- dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
- dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
- dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
- dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
+ FN_ASSIGN(put_, 0, 1, _mmx);
+ FN_ASSIGN(put_, 0, 2, _mmx);
+ FN_ASSIGN(put_, 0, 3, _mmx);
+
+ FN_ASSIGN(put_, 1, 0, _mmx);
+ FN_ASSIGN(put_, 1, 1, _mmx);
+ FN_ASSIGN(put_, 1, 2, _mmx);
+ FN_ASSIGN(put_, 1, 3, _mmx);
+
+ FN_ASSIGN(put_, 2, 0, _mmx);
+ FN_ASSIGN(put_, 2, 1, _mmx);
+ FN_ASSIGN(put_, 2, 2, _mmx);
+ FN_ASSIGN(put_, 2, 3, _mmx);
+
+ FN_ASSIGN(put_, 3, 0, _mmx);
+ FN_ASSIGN(put_, 3, 1, _mmx);
+ FN_ASSIGN(put_, 3, 2, _mmx);
+ FN_ASSIGN(put_, 3, 3, _mmx);
}
av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
{
- dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext;
-
- dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmxext;
-
- dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmxext;
-
- dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmxext;
-
- dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
- dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
- dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
- dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
+ FN_ASSIGN(avg_, 0, 1, _mmxext);
+ FN_ASSIGN(avg_, 0, 2, _mmxext);
+ FN_ASSIGN(avg_, 0, 3, _mmxext);
+
+ FN_ASSIGN(avg_, 1, 0, _mmxext);
+ FN_ASSIGN(avg_, 1, 1, _mmxext);
+ FN_ASSIGN(avg_, 1, 2, _mmxext);
+ FN_ASSIGN(avg_, 1, 3, _mmxext);
+
+ FN_ASSIGN(avg_, 2, 0, _mmxext);
+ FN_ASSIGN(avg_, 2, 1, _mmxext);
+ FN_ASSIGN(avg_, 2, 2, _mmxext);
+ FN_ASSIGN(avg_, 2, 3, _mmxext);
+
+ FN_ASSIGN(avg_, 3, 0, _mmxext);
+ FN_ASSIGN(avg_, 3, 1, _mmxext);
+ FN_ASSIGN(avg_, 3, 2, _mmxext);
+ FN_ASSIGN(avg_, 3, 3, _mmxext);
}
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL */
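
The FN_ASSIGN macro fills both halves of the now two-dimensional mspel table: index [1] holds the 8x8 functions and [0] the 16x16 wrappers built by OP##vc1_mspel_mc_16, stored at position hmode + 4*vmode. One expansion for illustration:

    /* FN_ASSIGN(put_, 2, 3, _mmx) expands to: */
    dsp->put_vc1_mspel_pixels_tab[1][2 + 4 * 3] = put_vc1_mspel_mc23_mmx;
    dsp->put_vc1_mspel_pixels_tab[0][2 + 4 * 3] = put_vc1_mspel_mc23_16_mmx;
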
diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm
index b22e0fec8b..e237860700 100644
--- a/libavcodec/x86/videodsp.asm
+++ b/libavcodec/x86/videodsp.asm
@@ -2,20 +2,20 @@
;* Core video DSP functions
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -54,13 +54,13 @@ SECTION .text
; | | <- bottom is copied from last line in body of source
; '----' <- bh
%if ARCH_X86_64
-cglobal emu_edge_vvar, 7, 8, 1, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \
start_y, end_y, bh, w
%else ; x86-32
cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
%define src_strideq r3mp
-%define dst_strideq r2mp
- mov srcq, r1mp
+%define dst_strideq r1mp
+ mov srcq, r2mp
mov start_yq, r4mp
mov end_yq, r5mp
mov bhq, r6mp
@@ -97,7 +97,10 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
neg n_wordsq
lea start_xq, [start_xq+n_wordsq*2]
.y_loop: ; do {
- ; FIXME also write a ssse3 version using pshufb
+%if cpuflag(avx2)
+ vpbroadcastb m0, [dstq+start_xq]
+ mov wq, n_wordsq ; initialize w
+%else
movzx wd, byte [dstq+start_xq] ; w = read(1)
imul wd, 0x01010101 ; w *= 0x01010101
movd m0, wd
@@ -107,6 +110,7 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
%else ; mmx
punpckldq m0, m0 ; splat
%endif ; mmx/sse
+%endif ; avx2
.x_loop: ; do {
movu [dstq+wq*2], m0 ; write($reg, $mmsize)
add wq, mmsize/2 ; w -= $mmsize/2
@@ -127,6 +131,11 @@ hvar_fn
INIT_XMM sse2
hvar_fn
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+hvar_fn
+%endif
+
; macro to read/write a horizontal number of pixels (%2) to/from registers
; on sse, - fills xmm0-15 for consecutive sets of 16 pixels
; - if (%2 & 8) fills 8 bytes into xmm$next
@@ -184,10 +193,10 @@ hvar_fn
mov valb, [srcq+%2-1]
%elif (%2-%%off) == 2
mov valw, [srcq+%2-2]
-%elifidn %1, body
- mov vald, [srcq+%2-3]
%else
- movd mm %+ %%mmx_idx, [srcq+%2-3]
+ mov valb, [srcq+%2-1]
+ ror vald, 16
+ mov valw, [srcq+%2-3]
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; READ_NUM_BYTES
@@ -240,15 +249,13 @@ hvar_fn
mov [dstq+%2-1], valb
%elif (%2-%%off) == 2
mov [dstq+%2-2], valw
-%elifidn %1, body
- mov [dstq+%2-3], valw
- shr vald, 16
- mov [dstq+%2-1], valb
%else
- movd vald, mm %+ %%mmx_idx
mov [dstq+%2-3], valw
- shr vald, 16
+ ror vald, 16
mov [dstq+%2-1], valb
+%ifnidn %1, body
+ ror vald, 16
+%endif
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; WRITE_NUM_BYTES
@@ -262,30 +269,30 @@ hvar_fn
%rep 1+%2-%1
%if %%n <= 3
%if ARCH_X86_64
-cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
start_y, end_y, val, bh
mov bhq, r6mp ; r6mp = bhmp
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
mov dstq, r0mp
- mov srcq, r1mp
+ mov srcq, r2mp
mov start_yq, r4mp
mov end_yq, r5mp
mov bhq, r6mp
-%define dst_strideq r2mp
+%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%else
%if ARCH_X86_64
-cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
start_y, end_y, bh
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
- mov srcq, r1mp
+ mov srcq, r2mp
mov start_yq, r4mp
mov end_yq, r5mp
mov bhq, r6mp
-%define dst_strideq r2mp
+%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%endif
@@ -344,9 +351,8 @@ VERTICAL_EXTEND 16, 22
; obviously not the same on both sides.
%macro READ_V_PIXEL 2
-%if %1 == 2
- movzx valw, byte %2
- imul valw, 0x0101
+%if cpuflag(avx2)
+ vpbroadcastb m0, %2
%else
movzx vald, byte %2
imul vald, 0x01010101
@@ -356,13 +362,16 @@ VERTICAL_EXTEND 16, 22
pshufd m0, m0, q0000
%else
punpckldq m0, m0
-%endif
-%endif ; %1 >= 8
-%endif
+%endif ; mmsize == 16
+%endif ; %1 > 16
+%endif ; avx2
%endmacro ; READ_V_PIXEL
%macro WRITE_V_PIXEL 2
%assign %%off 0
+
+%if %1 >= 8
+
%rep %1/mmsize
movu [%2+%%off], m0
%assign %%off %%off+mmsize
@@ -378,34 +387,44 @@ VERTICAL_EXTEND 16, 22
%assign %%off %%off+8
%endif
%endif ; %1-%%off >= 8
-%endif
+%endif ; mmsize == 16
%if %1-%%off >= 4
%if %1 > 8 && %1-%%off > 4
movq [%2+%1-8], m0
%assign %%off %1
-%elif %1 >= 8 && %1-%%off >= 4
- movd [%2+%%off], m0
-%assign %%off %%off+4
%else
- mov [%2+%%off], vald
+ movd [%2+%%off], m0
%assign %%off %%off+4
%endif
%endif ; %1-%%off >= 4
-%if %1-%%off >= 2
-%if %1 >= 8
- movd [%2+%1-4], m0
+%else ; %1 < 8
+
+%rep %1/4
+ mov [%2+%%off], vald
+%assign %%off %%off+4
+%endrep ; %1/4
+
+%endif ; %1 >=/< 8
+
+%if %1-%%off == 2
+%if cpuflag(avx2)
+ movd [%2+%%off-2], m0
%else
mov [%2+%%off], valw
-%endif
+%endif ; avx2
%endif ; (%1-%%off)/2
%endmacro ; WRITE_V_PIXEL
%macro H_EXTEND 2
%assign %%n %1
%rep 1+(%2-%1)/2
+%if cpuflag(avx2)
+cglobal emu_edge_hfix %+ %%n, 4, 4, 1, dst, dst_stride, start_x, bh
+%else
cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
+%endif
.loop_y: ; do {
READ_V_PIXEL %%n, [dstq+start_xq] ; $variable_regs = read($n)
WRITE_V_PIXEL %%n, dstq ; write($variable_regs, $n)
@@ -426,6 +445,11 @@ H_EXTEND 16, 22
INIT_XMM sse2
H_EXTEND 16, 22
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+H_EXTEND 8, 22
+%endif
+
%macro PREFETCH_FN 1
cglobal prefetch, 3, 3, 0, buf, stride, h
.loop:
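In the hvar loop above, the pre-AVX2 path splats the single edge byte by multiplying it with 0x01010101 and then duplicating the 32-bit result across the register (pshufd/punpckldq), while the new AVX2 path gets the same effect from one vpbroadcastb. A scalar sketch of the multiply trick:

#include <stdint.h>

/* One byte replicated into all four byte lanes of a 32-bit word; the SIMD
 * code then broadcasts that word across the vector register. */
static inline uint32_t splat_byte(uint8_t b)
{
    return (uint32_t)b * 0x01010101u;
}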
diff --git a/libavcodec/x86/videodsp_init.c b/libavcodec/x86/videodsp_init.c
index 8ee837096a..26e072bb12 100644
--- a/libavcodec/x86/videodsp_init.c
+++ b/libavcodec/x86/videodsp_init.c
@@ -1,25 +1,27 @@
/*
+ * Copyright (C) 2002-2012 Michael Niedermayer
* Copyright (C) 2012 Ronald S. Bultje
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
@@ -28,11 +30,11 @@
#include "libavcodec/videodsp.h"
#if HAVE_YASM
-typedef void emu_edge_vfix_func(uint8_t *dst, const uint8_t *src,
- x86_reg dst_stride, x86_reg src_stride,
+typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride,
+ const uint8_t *src, x86_reg src_stride,
x86_reg start_y, x86_reg end_y, x86_reg bh);
-typedef void emu_edge_vvar_func(uint8_t *dst, const uint8_t *src,
- x86_reg dst_stride, x86_reg src_stride,
+typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
+ const uint8_t *src, x86_reg src_stride,
x86_reg start_y, x86_reg end_y, x86_reg bh,
x86_reg w);
@@ -59,7 +61,7 @@ extern emu_edge_vfix_func ff_emu_edge_vfix20_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix21_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix22_mmx;
#if ARCH_X86_32
-static emu_edge_vfix_func *vfixtbl_mmx[22] = {
+static emu_edge_vfix_func * const vfixtbl_mmx[22] = {
&ff_emu_edge_vfix1_mmx, &ff_emu_edge_vfix2_mmx, &ff_emu_edge_vfix3_mmx,
&ff_emu_edge_vfix4_mmx, &ff_emu_edge_vfix5_mmx, &ff_emu_edge_vfix6_mmx,
&ff_emu_edge_vfix7_mmx, &ff_emu_edge_vfix8_mmx, &ff_emu_edge_vfix9_mmx,
@@ -78,7 +80,7 @@ extern emu_edge_vfix_func ff_emu_edge_vfix19_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix20_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix21_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix22_sse;
-static emu_edge_vfix_func *vfixtbl_sse[22] = {
+static emu_edge_vfix_func * const vfixtbl_sse[22] = {
ff_emu_edge_vfix1_mmx, ff_emu_edge_vfix2_mmx, ff_emu_edge_vfix3_mmx,
ff_emu_edge_vfix4_mmx, ff_emu_edge_vfix5_mmx, ff_emu_edge_vfix6_mmx,
ff_emu_edge_vfix7_mmx, ff_emu_edge_vfix8_mmx, ff_emu_edge_vfix9_mmx,
@@ -107,7 +109,7 @@ extern emu_edge_hfix_func ff_emu_edge_hfix18_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix20_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix22_mmx;
#if ARCH_X86_32
-static emu_edge_hfix_func *hfixtbl_mmx[11] = {
+static emu_edge_hfix_func * const hfixtbl_mmx[11] = {
ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_mmx, ff_emu_edge_hfix18_mmx,
@@ -119,13 +121,30 @@ extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2;
-static emu_edge_hfix_func *hfixtbl_sse2[11] = {
+static emu_edge_hfix_func * const hfixtbl_sse2[11] = {
ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
};
extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
+#if HAVE_AVX2_EXTERNAL
+extern emu_edge_hfix_func ff_emu_edge_hfix8_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix10_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix12_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix14_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix16_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2;
+static emu_edge_hfix_func * const hfixtbl_avx2[11] = {
+ ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
+ ff_emu_edge_hfix8_avx2, ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2,
+ ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2,
+ ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2
+};
+extern emu_edge_hvar_func ff_emu_edge_hvar_avx2;
+#endif
static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
ptrdiff_t dst_stride,
@@ -133,22 +152,26 @@ static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
x86_reg block_w, x86_reg block_h,
x86_reg src_x, x86_reg src_y,
x86_reg w, x86_reg h,
- emu_edge_vfix_func **vfix_tbl,
+ emu_edge_vfix_func * const *vfix_tbl,
emu_edge_vvar_func *v_extend_var,
- emu_edge_hfix_func **hfix_tbl,
+ emu_edge_hfix_func * const *hfix_tbl,
emu_edge_hvar_func *h_extend_var)
{
x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p;
if (!w || !h)
- return;
+ return;
+
+ av_assert2(block_w <= FFABS(dst_stride));
if (src_y >= h) {
- src -= src_y * src_stride;
- src_y = src_y_add = h - 1;
+ src -= src_y*src_stride;
+ src_y_add = h - 1;
+ src_y = h - 1;
} else if (src_y <= -block_h) {
- src -= src_y*src_stride;
- src_y = src_y_add = 1 - block_h;
+ src -= src_y*src_stride;
+ src_y_add = 1 - block_h;
+ src_y = 1 - block_h;
}
if (src_x >= w) {
src += w - 1 - src_x;
@@ -162,18 +185,17 @@ static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
start_x = FFMAX(0, -src_x);
end_y = FFMIN(block_h, h-src_y);
end_x = FFMIN(block_w, w-src_x);
- assert(start_x < end_x && block_w > 0);
- assert(start_y < end_y && block_h > 0);
+ av_assert2(start_x < end_x && block_w > 0);
+ av_assert2(start_y < end_y && block_h > 0);
// fill in the to-be-copied part plus all above/below
src += (src_y_add + start_y) * src_stride + start_x;
w = end_x - start_x;
if (w <= 22) {
- vfix_tbl[w - 1](dst + start_x, src,
- dst_stride, src_stride,
+ vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride,
start_y, end_y, block_h);
} else {
- v_extend_var(dst + start_x, src, dst_stride, src_stride,
+ v_extend_var(dst + start_x, dst_stride, src, src_stride,
start_y, end_y, block_h, w);
}
@@ -212,7 +234,7 @@ static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
}
-static av_noinline void emulated_edge_mc_sse(uint8_t * buf,const uint8_t *src,
+static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
ptrdiff_t buf_stride,
ptrdiff_t src_stride,
int block_w, int block_h,
@@ -231,10 +253,24 @@ static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src,
int src_x, int src_y, int w,
int h)
{
- emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, src_x,
- src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
+ emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
+ src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
}
+
+#if HAVE_AVX2_EXTERNAL
+static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src,
+ ptrdiff_t buf_stride,
+ ptrdiff_t src_stride,
+ int block_w, int block_h,
+ int src_x, int src_y, int w,
+ int h)
+{
+ emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
+ src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
+ hfixtbl_avx2, &ff_emu_edge_hvar_avx2);
+}
+#endif /* HAVE_AVX2_EXTERNAL */
#endif /* HAVE_YASM */
void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h);
@@ -264,5 +300,10 @@ av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) {
ctx->emulated_edge_mc = emulated_edge_mc_sse2;
}
+#if HAVE_AVX2_EXTERNAL
+ if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) {
+ ctx->emulated_edge_mc = emulated_edge_mc_avx2;
+ }
+#endif
#endif /* HAVE_YASM */
}
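For reference, the vfix/hfix/hvar kernels and the emulated_edge_mc() dispatcher above implement edge emulation: a block whose source rectangle partly lies outside the picture is filled by replicating the nearest edge pixels, with width-specialized kernels for w <= 22 and a variable-width fallback otherwise. A minimal scalar model of the overall effect (illustration only; here src points at the picture origin, which is not the FFmpeg calling convention):

#include <stddef.h>
#include <stdint.h>

/* Every destination pixel reads the nearest pixel inside the w x h picture;
 * the asm above produces the same result in row- and column-sized pieces. */
static void edge_emu_ref(uint8_t *dst, ptrdiff_t dst_stride,
                         const uint8_t *src, ptrdiff_t src_stride,
                         int block_w, int block_h,
                         int src_x, int src_y, int w, int h)
{
    for (int y = 0; y < block_h; y++) {
        int sy = src_y + y;
        sy = sy < 0 ? 0 : (sy >= h ? h - 1 : sy);
        for (int x = 0; x < block_w; x++) {
            int sx = src_x + x;
            sx = sx < 0 ? 0 : (sx >= w ? w - 1 : sx);
            dst[y * dst_stride + x] = src[(ptrdiff_t)sy * src_stride + sx];
        }
    }
}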
diff --git a/libavcodec/x86/vorbisdsp.asm b/libavcodec/x86/vorbisdsp.asm
index c54650eef5..b25d838868 100644
--- a/libavcodec/x86/vorbisdsp.asm
+++ b/libavcodec/x86/vorbisdsp.asm
@@ -2,20 +2,20 @@
;* Vorbis x86 optimizations
;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/vorbisdsp_init.c b/libavcodec/x86/vorbisdsp_init.c
index bbd83195cc..bc1cc43a18 100644
--- a/libavcodec/x86/vorbisdsp_init.c
+++ b/libavcodec/x86/vorbisdsp_init.c
@@ -1,20 +1,20 @@
/*
* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index 8587741f95..d88d5a1edf 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -2,20 +2,20 @@
;* MMX/SSE2-optimized functions for the VP3 decoder
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -40,6 +40,7 @@ pb_81: times 8 db 0x81
cextern pb_1
cextern pb_3
cextern pb_80
+cextern pb_FE
cextern pw_8
@@ -141,6 +142,49 @@ cglobal vp3_h_loop_filter, 3, 4
STORE_4_WORDS m3
RET
+%macro PAVGB_NO_RND 0
+ mova m4, m0
+ mova m5, m2
+ pand m4, m1
+ pand m5, m3
+ pxor m1, m0
+ pxor m3, m2
+ pand m1, m6
+ pand m3, m6
+ psrlq m1, 1
+ psrlq m3, 1
+ paddb m4, m1
+ paddb m5, m3
+%endmacro
+
+INIT_MMX mmx
+cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3
+ mova m6, [pb_FE]
+ lea stride3q,[strideq+strideq*2]
+.loop:
+ mova m0, [src1q]
+ mova m1, [src2q]
+ mova m2, [src1q+strideq]
+ mova m3, [src2q+strideq]
+ PAVGB_NO_RND
+ mova [dstq], m4
+ mova [dstq+strideq], m5
+
+ mova m0, [src1q+strideq*2]
+ mova m1, [src2q+strideq*2]
+ mova m2, [src1q+stride3q]
+ mova m3, [src2q+stride3q]
+ PAVGB_NO_RND
+ mova [dstq+strideq*2], m4
+ mova [dstq+stride3q], m5
+
+ lea src1q, [src1q+strideq*4]
+ lea src2q, [src2q+strideq*4]
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jnz .loop
+ RET
+
; from original comments: The Macro does IDct on 4 1-D Dcts
%macro BeginIDCT 0
movq m2, I(3)
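PAVGB_NO_RND above is the classic SWAR no-rounding average: (a & b) + (((a ^ b) & 0xFE) >> 1), with pb_FE masking the low bit of every byte so the psrlq cannot leak bits between neighbouring byte lanes. Scalar equivalent per byte:

#include <stdint.h>

/* Per-byte average rounded down, as PAVGB_NO_RND computes per lane:
 * (a & b) keeps the common bits, ((a ^ b) & 0xFE) >> 1 adds half of the
 * differing bits, giving (a + b) >> 1 without widening. */
static inline uint8_t avg_no_rnd(uint8_t a, uint8_t b)
{
    return (a & b) + (((a ^ b) & 0xFE) >> 1);
}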
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index 043e10f720..1ba9576431 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -1,18 +1,20 @@
/*
- * This file is part of Libav.
+ * Copyright (c) 2009 David Conrad <lessen42@gmail.com>
*
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -23,7 +25,6 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/vp3dsp.h"
-#include "config.h"
void ff_vp3_idct_put_mmx(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_idct_add_mmx(uint8_t *dest, ptrdiff_t stride, int16_t *block);
@@ -38,16 +39,21 @@ void ff_vp3_v_loop_filter_mmxext(uint8_t *src, ptrdiff_t stride,
void ff_vp3_h_loop_filter_mmxext(uint8_t *src, ptrdiff_t stride,
int *bounding_values);
+void ff_put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a,
+ const uint8_t *b, ptrdiff_t stride,
+ int h);
+
av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
{
int cpu_flags = av_get_cpu_flags();
-#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
+ c->put_no_rnd_pixels_l2 = ff_put_vp_no_rnd_pixels8_l2_mmx;
+#if ARCH_X86_32
c->idct_put = ff_vp3_idct_put_mmx;
c->idct_add = ff_vp3_idct_add_mmx;
- }
#endif
+ }
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
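The prototype added above is the entry point for the new asm in vp3dsp.asm: an 8-pixel-wide, h-row average of two reference blocks using the no-rounding average from the previous file, with a single stride shared by all three pointers. A scalar sketch matching that prototype:

#include <stddef.h>
#include <stdint.h>

/* Scalar model of put_vp_no_rnd_pixels8_l2: dst = no-rounding average of the
 * two sources a and b, 8 pixels per row, h rows, shared stride. */
static void put_no_rnd_pixels8_l2_ref(uint8_t *dst, const uint8_t *a,
                                      const uint8_t *b, ptrdiff_t stride, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++)
            dst[x] = (a[x] & b[x]) + (((a[x] ^ b[x]) & 0xFE) >> 1);
        dst += stride;
        a   += stride;
        b   += stride;
    }
}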
diff --git a/libavcodec/x86/vp56_arith.h b/libavcodec/x86/vp56_arith.h
index 0a693684af..810cc8dcd8 100644
--- a/libavcodec/x86/vp56_arith.h
+++ b/libavcodec/x86/vp56_arith.h
@@ -4,49 +4,46 @@
* Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org>
* Copyright (C) 2010 Eli Friedman
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_VP56_ARITH_H
#define AVCODEC_X86_VP56_ARITH_H
-#if HAVE_INLINE_ASM && HAVE_FAST_CMOV
+#if HAVE_INLINE_ASM && HAVE_FAST_CMOV && HAVE_6REGS
#define vp56_rac_get_prob vp56_rac_get_prob
static av_always_inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob)
{
unsigned int code_word = vp56_rac_renorm(c);
- unsigned int high = c->high;
- unsigned int low = 1 + (((high - 1) * prob) >> 8);
+ unsigned int low = 1 + (((c->high - 1) * prob) >> 8);
unsigned int low_shift = low << 16;
int bit = 0;
+ c->code_word = code_word;
__asm__(
"subl %4, %1 \n\t"
"subl %3, %2 \n\t"
- "leal (%2, %3), %3 \n\t"
"setae %b0 \n\t"
"cmovb %4, %1 \n\t"
- "cmovb %3, %2 \n\t"
- : "+q"(bit), "+r"(high), "+r"(code_word), "+r"(low_shift)
- : "r"(low)
+ "cmovb %5, %2 \n\t"
+ : "+q"(bit), "+&r"(c->high), "+&r"(c->code_word)
+ : "r"(low_shift), "r"(low), "r"(code_word)
);
- c->high = high;
- c->code_word = code_word;
return bit;
}
#endif
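The rewritten inline asm drops one lea/cmov pair and stores c->code_word before the asm block; functionally it is still the usual range-coder bit decode, with the compare done by sub/setae and the "else" side folded into two cmovb. Roughly the portable C logic it implements (a sketch, not the exact fallback path):

/* bit is the comparison result; the two updates pick either the subtracted
 * values or the split depending on it (cmovb in the asm).  prob is the
 * probability of a 0 bit, scaled to 1..255. */
static inline int rac_get_prob_sketch(unsigned *high, unsigned *code_word,
                                      unsigned prob)
{
    unsigned low       = 1 + (((*high - 1) * prob) >> 8);
    unsigned low_shift = low << 16;
    int      bit       = *code_word >= low_shift;

    *high      = bit ? *high - low            : low;
    *code_word = bit ? *code_word - low_shift : *code_word;
    return bit;
}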
diff --git a/libavcodec/x86/vp6dsp.asm b/libavcodec/x86/vp6dsp.asm
index b667d38aea..0be531e5c2 100644
--- a/libavcodec/x86/vp6dsp.asm
+++ b/libavcodec/x86/vp6dsp.asm
@@ -3,20 +3,20 @@
;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/vp6dsp_init.c b/libavcodec/x86/vp6dsp_init.c
index 6d98db19d4..ce498931d0 100644
--- a/libavcodec/x86/vp6dsp_init.c
+++ b/libavcodec/x86/vp6dsp_init.c
@@ -3,20 +3,20 @@
* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index b0f6b83767..e303b80293 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -3,20 +3,20 @@
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -906,6 +906,7 @@ cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
%4 [dst2q+strideq+%3], m5
%endmacro
+%if ARCH_X86_32
INIT_MMX mmx
cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
; load data
@@ -929,8 +930,9 @@ cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m1, 0, movh
RET
+%endif
-INIT_XMM sse4
+%macro VP8_IDCT_DC_ADD 0
cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
; load data
movd m0, [blockq]
@@ -956,10 +958,25 @@ cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
paddw m4, m0
packuswb m2, m4
movd [dst1q], m2
+%if cpuflag(sse4)
pextrd [dst1q+strideq], m2, 1
pextrd [dst2q], m2, 2
pextrd [dst2q+strideq], m2, 3
+%else
+ psrldq m2, 4
+ movd [dst1q+strideq], m2
+ psrldq m2, 4
+ movd [dst2q], m2
+ psrldq m2, 4
+ movd [dst2q+strideq], m2
+%endif
RET
+%endmacro
+
+INIT_XMM sse2
+VP8_IDCT_DC_ADD
+INIT_XMM sse4
+VP8_IDCT_DC_ADD
;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
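The new VP8_IDCT_DC_ADD template now serves both the sse2 and sse4 builds; they compute the same thing and differ only in how the four result rows are stored (psrldq+movd versus pextrd). What the kernel computes, as a scalar sketch assuming the (dc + 4) >> 3 rounding of the C reference:

#include <stddef.h>
#include <stdint.h>

/* DC-only inverse transform: the single DC coefficient becomes one rounded
 * offset that is added, with clamping, to every pixel of the 4x4 block;
 * the coefficient is cleared afterwards. */
static void vp8_idct_dc_add_sketch(uint8_t *dst, int16_t block[16],
                                   ptrdiff_t stride)
{
    int dc = (block[0] + 4) >> 3;
    block[0] = 0;
    for (int y = 0; y < 4; y++, dst += stride)
        for (int x = 0; x < 4; x++) {
            int v = dst[x] + dc;
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
}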
diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c
index 3e84bed424..814bf69382 100644
--- a/libavcodec/x86/vp8dsp_init.c
+++ b/libavcodec/x86/vp8dsp_init.c
@@ -3,20 +3,20 @@
* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -168,7 +168,7 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t srcstride, int height, int mx, int my) \
{ \
- DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \
+ LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + TAPNUMY - 1)]); \
uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
src -= srcstride * (TAPNUMY / 2 - 1); \
ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
@@ -213,7 +213,7 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t srcstride, int height, int mx, int my) \
{ \
- DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + 2)]; \
+ LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + 2)]); \
ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
tmp, SIZE, src, srcstride, height + 1, mx, my); \
ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
@@ -233,6 +233,8 @@ HVBILIN(ssse3, 8, 16, 16)
void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16],
ptrdiff_t stride);
+void ff_vp8_idct_dc_add_sse2(uint8_t *dst, int16_t block[16],
+ ptrdiff_t stride);
void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16],
@@ -346,7 +348,7 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
}
- if (EXTERNAL_SSE2_SLOW(cpu_flags)) {
+ if (EXTERNAL_SSE2(cpu_flags) || EXTERNAL_SSE2_SLOW(cpu_flags)) {
VP8_LUMA_MC_FUNC(0, 16, sse2);
VP8_MC_FUNC(1, 8, sse2);
VP8_BILINEAR_MC_FUNC(0, 16, sse2);
@@ -370,9 +372,9 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
- c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
#if ARCH_X86_32
+ c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
c->vp8_idct_add = ff_vp8_idct_add_mmx;
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
@@ -416,7 +418,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
}
- if (EXTERNAL_SSE2_SLOW(cpu_flags)) {
+ if (EXTERNAL_SSE2(cpu_flags) || EXTERNAL_SSE2_SLOW(cpu_flags)) {
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
@@ -427,9 +429,10 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
}
if (EXTERNAL_SSE2(cpu_flags)) {
+ c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse2;
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
- c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
+ c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
@@ -454,7 +457,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
}
if (EXTERNAL_SSE4(cpu_flags)) {
- c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
+ c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
diff --git a/libavcodec/x86/vp8dsp_loopfilter.asm b/libavcodec/x86/vp8dsp_loopfilter.asm
index 9ffd83a39b..caeb405267 100644
--- a/libavcodec/x86/vp8dsp_loopfilter.asm
+++ b/libavcodec/x86/vp8dsp_loopfilter.asm
@@ -3,20 +3,20 @@
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 59cde79d89..66363d46ad 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -1,221 +1,124 @@
/*
* VP9 SIMD optimizations
*
- * Copyright (c) 2013 Ronald S. Bultje <rsbultje@gmail.com>
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
-#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/x86/cpu.h"
-#include "libavcodec/vp9.h"
+#include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
#if HAVE_YASM
-#define fpel_func(avg, sz, opt) \
-void ff_vp9_ ## avg ## sz ## _ ## opt(uint8_t *dst, ptrdiff_t dst_stride, \
- const uint8_t *src, ptrdiff_t src_stride, \
- int h, int mx, int my)
-
-fpel_func(put, 4, mmx);
-fpel_func(put, 8, mmx);
-fpel_func(put, 16, sse);
-fpel_func(put, 32, sse);
-fpel_func(put, 64, sse);
-fpel_func(avg, 4, mmxext);
-fpel_func(avg, 8, mmxext);
-fpel_func(avg, 16, sse2);
-fpel_func(avg, 32, sse2);
-fpel_func(avg, 64, sse2);
-fpel_func(put, 32, avx);
-fpel_func(put, 64, avx);
-fpel_func(avg, 32, avx2);
-fpel_func(avg, 64, avx2);
-#undef fpel_func
-
-#define mc_func(avg, sz, dir, opt, type, f_sz) \
-void \
-ff_vp9_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst, \
- ptrdiff_t dst_stride, \
- const uint8_t *src, \
- ptrdiff_t src_stride, \
- int h, \
- const type (*filter)[f_sz])
-
-#define mc_funcs(sz, opt, type, f_sz) \
- mc_func(put, sz, h, opt, type, f_sz); \
- mc_func(avg, sz, h, opt, type, f_sz); \
- mc_func(put, sz, v, opt, type, f_sz); \
- mc_func(avg, sz, v, opt, type, f_sz)
-
-mc_funcs(4, mmxext, int16_t, 8);
-mc_funcs(8, sse2, int16_t, 8);
-mc_funcs(4, ssse3, int8_t, 32);
-mc_funcs(8, ssse3, int8_t, 32);
+decl_fpel_func(put, 4, , mmx);
+decl_fpel_func(put, 8, , mmx);
+decl_fpel_func(put, 16, , sse);
+decl_fpel_func(put, 32, , sse);
+decl_fpel_func(put, 64, , sse);
+decl_fpel_func(avg, 4, _8, mmxext);
+decl_fpel_func(avg, 8, _8, mmxext);
+decl_fpel_func(avg, 16, _8, sse2);
+decl_fpel_func(avg, 32, _8, sse2);
+decl_fpel_func(avg, 64, _8, sse2);
+decl_fpel_func(put, 32, , avx);
+decl_fpel_func(put, 64, , avx);
+decl_fpel_func(avg, 32, _8, avx2);
+decl_fpel_func(avg, 64, _8, avx2);
+
+decl_mc_funcs(4, mmxext, int16_t, 8, 8);
+decl_mc_funcs(8, sse2, int16_t, 8, 8);
+decl_mc_funcs(4, ssse3, int8_t, 32, 8);
+decl_mc_funcs(8, ssse3, int8_t, 32, 8);
#if ARCH_X86_64
-mc_funcs(16, ssse3, int8_t, 32);
-mc_funcs(32, avx2, int8_t, 32);
+decl_mc_funcs(16, ssse3, int8_t, 32, 8);
+decl_mc_funcs(32, avx2, int8_t, 32, 8);
#endif
-#undef mc_funcs
-#undef mc_func
-
-#define mc_rep_func(avg, sz, hsz, dir, opt, type, f_sz) \
-static av_always_inline void \
-ff_vp9_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst, \
- ptrdiff_t dst_stride, \
- const uint8_t *src, \
- ptrdiff_t src_stride, \
- int h, \
- const type (*filter)[f_sz]) \
-{ \
- ff_vp9_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst, \
- dst_stride, \
- src, \
- src_stride, \
- h, \
- filter); \
- ff_vp9_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst + hsz, \
- dst_stride, \
- src + hsz, \
- src_stride, \
- h, filter); \
-}
-
-#define mc_rep_funcs(sz, hsz, opt, type, f_sz) \
- mc_rep_func(put, sz, hsz, h, opt, type, f_sz) \
- mc_rep_func(avg, sz, hsz, h, opt, type, f_sz) \
- mc_rep_func(put, sz, hsz, v, opt, type, f_sz) \
- mc_rep_func(avg, sz, hsz, v, opt, type, f_sz)
-
-mc_rep_funcs(16, 8, sse2, int16_t, 8)
+mc_rep_funcs(16, 8, 8, sse2, int16_t, 8, 8)
#if ARCH_X86_32
-mc_rep_funcs(16, 8, ssse3, int8_t, 32)
+mc_rep_funcs(16, 8, 8, ssse3, int8_t, 32, 8)
#endif
-mc_rep_funcs(32, 16, sse2, int16_t, 8)
-mc_rep_funcs(32, 16, ssse3, int8_t, 32)
-mc_rep_funcs(64, 32, sse2, int16_t, 8)
-mc_rep_funcs(64, 32, ssse3, int8_t, 32)
+mc_rep_funcs(32, 16, 16, sse2, int16_t, 8, 8)
+mc_rep_funcs(32, 16, 16, ssse3, int8_t, 32, 8)
+mc_rep_funcs(64, 32, 32, sse2, int16_t, 8, 8)
+mc_rep_funcs(64, 32, 32, ssse3, int8_t, 32, 8)
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
-mc_rep_funcs(64, 32, avx2, int8_t, 32)
+mc_rep_funcs(64, 32, 32, avx2, int8_t, 32, 8)
#endif
-#undef mc_rep_funcs
-#undef mc_rep_func
-
extern const int8_t ff_filters_ssse3[3][15][4][32];
extern const int16_t ff_filters_sse2[3][15][8][8];
-#define filter_8tap_2d_fn(op, sz, f, f_opt, fname, align, opt) \
-static void \
-op ## _8tap_ ## fname ## _ ## sz ## hv_ ## opt(uint8_t *dst, \
- ptrdiff_t dst_stride, \
- const uint8_t *src, \
- ptrdiff_t src_stride, \
- int h, int mx, int my) \
-{ \
- LOCAL_ALIGNED_ ## align(uint8_t, temp, [71 * 64]); \
- ff_vp9_put_8tap_1d_h_ ## sz ## _ ## opt(temp, 64, \
- src - 3 * src_stride, \
- src_stride, h + 7, \
- ff_filters_ ## f_opt[f][mx - 1]); \
- ff_vp9_ ## op ## _8tap_1d_v_ ## sz ## _ ## opt(dst, dst_stride, \
- temp + 3 * 64, 64, h, \
- ff_filters_ ## f_opt[f][my - 1]); \
-}
-
-#define filters_8tap_2d_fn(op, sz, align, opt, f_opt) \
- filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, align, opt) \
- filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, align, opt) \
- filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, align, opt)
-
-#define filters_8tap_2d_fn2(op, align, opt4, opt8, f_opt) \
- filters_8tap_2d_fn(op, 64, align, opt8, f_opt) \
- filters_8tap_2d_fn(op, 32, align, opt8, f_opt) \
- filters_8tap_2d_fn(op, 16, align, opt8, f_opt) \
- filters_8tap_2d_fn(op, 8, align, opt8, f_opt) \
- filters_8tap_2d_fn(op, 4, align, opt4, f_opt)
-
-
-filters_8tap_2d_fn2(put, 16, mmxext, sse2, sse2)
-filters_8tap_2d_fn2(avg, 16, mmxext, sse2, sse2)
-filters_8tap_2d_fn2(put, 16, ssse3, ssse3, ssse3)
-filters_8tap_2d_fn2(avg, 16, ssse3, ssse3, ssse3)
+filters_8tap_2d_fn2(put, 16, 8, 1, mmxext, sse2, sse2)
+filters_8tap_2d_fn2(avg, 16, 8, 1, mmxext, sse2, sse2)
+filters_8tap_2d_fn2(put, 16, 8, 1, ssse3, ssse3, ssse3)
+filters_8tap_2d_fn2(avg, 16, 8, 1, ssse3, ssse3, ssse3)
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
-filters_8tap_2d_fn(put, 64, 32, avx2, ssse3)
-filters_8tap_2d_fn(put, 32, 32, avx2, ssse3)
-filters_8tap_2d_fn(avg, 64, 32, avx2, ssse3)
-filters_8tap_2d_fn(avg, 32, 32, avx2, ssse3)
+filters_8tap_2d_fn(put, 64, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(put, 32, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(avg, 64, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(avg, 32, 32, 8, 1, avx2, ssse3)
#endif
-#undef filters_8tap_2d_fn2
-#undef filters_8tap_2d_fn
-#undef filter_8tap_2d_fn
-
-#define filter_8tap_1d_fn(op, sz, f, f_opt, fname, dir, dvar, opt) \
-static void \
-op ## _8tap_ ## fname ## _ ## sz ## dir ## _ ## opt(uint8_t *dst, \
- ptrdiff_t dst_stride, \
- const uint8_t *src, \
- ptrdiff_t src_stride, \
- int h, int mx, \
- int my) \
-{ \
- ff_vp9_ ## op ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(dst, \
- dst_stride, \
- src, \
- src_stride, h,\
- ff_filters_ ## f_opt[f][dvar - 1]); \
-}
-
-#define filters_8tap_1d_fn(op, sz, dir, dvar, opt, f_opt) \
- filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, dir, dvar, opt) \
- filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, dir, dvar, opt) \
- filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, dir, dvar, opt)
-
-#define filters_8tap_1d_fn2(op, sz, opt, f_opt) \
- filters_8tap_1d_fn(op, sz, h, mx, opt, f_opt) \
- filters_8tap_1d_fn(op, sz, v, my, opt, f_opt)
-
-#define filters_8tap_1d_fn3(op, opt4, opt8, f_opt) \
- filters_8tap_1d_fn2(op, 64, opt8, f_opt) \
- filters_8tap_1d_fn2(op, 32, opt8, f_opt) \
- filters_8tap_1d_fn2(op, 16, opt8, f_opt) \
- filters_8tap_1d_fn2(op, 8, opt8, f_opt) \
- filters_8tap_1d_fn2(op, 4, opt4, f_opt)
-
-filters_8tap_1d_fn3(put, mmxext, sse2, sse2)
-filters_8tap_1d_fn3(avg, mmxext, sse2, sse2)
-filters_8tap_1d_fn3(put, ssse3, ssse3, ssse3)
-filters_8tap_1d_fn3(avg, ssse3, ssse3, ssse3)
+filters_8tap_1d_fn3(put, 8, mmxext, sse2, sse2)
+filters_8tap_1d_fn3(avg, 8, mmxext, sse2, sse2)
+filters_8tap_1d_fn3(put, 8, ssse3, ssse3, ssse3)
+filters_8tap_1d_fn3(avg, 8, ssse3, ssse3, ssse3)
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
-filters_8tap_1d_fn2(put, 64, avx2, ssse3)
-filters_8tap_1d_fn2(put, 32, avx2, ssse3)
-filters_8tap_1d_fn2(avg, 64, avx2, ssse3)
-filters_8tap_1d_fn2(avg, 32, avx2, ssse3)
+filters_8tap_1d_fn2(put, 64, 8, avx2, ssse3)
+filters_8tap_1d_fn2(put, 32, 8, avx2, ssse3)
+filters_8tap_1d_fn2(avg, 64, 8, avx2, ssse3)
+filters_8tap_1d_fn2(avg, 32, 8, avx2, ssse3)
#endif
-#undef filters_8tap_1d_fn
-#undef filters_8tap_1d_fn2
-#undef filters_8tap_1d_fn3
-#undef filter_8tap_1d_fn
+#define itxfm_func(typea, typeb, size, opt) \
+void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \
+ int16_t *block, int eob)
+#define itxfm_funcs(size, opt) \
+itxfm_func(idct, idct, size, opt); \
+itxfm_func(iadst, idct, size, opt); \
+itxfm_func(idct, iadst, size, opt); \
+itxfm_func(iadst, iadst, size, opt)
+
+itxfm_func(idct, idct, 4, mmxext);
+itxfm_func(idct, iadst, 4, sse2);
+itxfm_func(iadst, idct, 4, sse2);
+itxfm_func(iadst, iadst, 4, sse2);
+itxfm_funcs(4, ssse3);
+itxfm_funcs(8, sse2);
+itxfm_funcs(8, ssse3);
+itxfm_funcs(8, avx);
+itxfm_funcs(16, sse2);
+itxfm_funcs(16, ssse3);
+itxfm_funcs(16, avx);
+itxfm_func(idct, idct, 32, sse2);
+itxfm_func(idct, idct, 32, ssse3);
+itxfm_func(idct, idct, 32, avx);
+itxfm_func(iwht, iwht, 4, mmx);
+itxfm_funcs(16, avx2);
+itxfm_func(idct, idct, 32, avx2);
+
+#undef itxfm_func
+#undef itxfm_funcs
#define lpf_funcs(size1, size2, opt) \
void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
@@ -223,6 +126,8 @@ void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stri
void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
int E, int I, int H)
+lpf_funcs(4, 8, mmxext);
+lpf_funcs(8, 8, mmxext);
lpf_funcs(16, 16, sse2);
lpf_funcs(16, 16, ssse3);
lpf_funcs(16, 16, avx);
@@ -241,42 +146,88 @@ lpf_funcs(88, 16, avx);
#undef lpf_funcs
-#endif /* HAVE_YASM */
+#define ipred_func(size, type, opt) \
+void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \
+ const uint8_t *l, const uint8_t *a)
-av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
-{
-#if HAVE_YASM
- int cpu_flags = av_get_cpu_flags();
+ipred_func(8, v, mmx);
+
+#define ipred_dc_funcs(size, opt) \
+ipred_func(size, dc, opt); \
+ipred_func(size, dc_left, opt); \
+ipred_func(size, dc_top, opt)
+
+ipred_dc_funcs(4, mmxext);
+ipred_dc_funcs(8, mmxext);
+
+#define ipred_dir_tm_funcs(size, opt) \
+ipred_func(size, tm, opt); \
+ipred_func(size, dl, opt); \
+ipred_func(size, dr, opt); \
+ipred_func(size, hd, opt); \
+ipred_func(size, hu, opt); \
+ipred_func(size, vl, opt); \
+ipred_func(size, vr, opt)
+
+ipred_dir_tm_funcs(4, mmxext);
+
+ipred_func(16, v, sse);
+ipred_func(32, v, sse);
-#define init_fpel(idx1, idx2, sz, type, opt) \
- dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
- dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
- dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
- dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_ ## type ## sz ## _ ## opt
+ipred_dc_funcs(16, sse2);
+ipred_dc_funcs(32, sse2);
+#define ipred_dir_tm_h_funcs(size, opt) \
+ipred_dir_tm_funcs(size, opt); \
+ipred_func(size, h, opt)
-#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \
- dsp->mc[idx1][FILTER_8TAP_SMOOTH][idx2][idxh][idxv] = type ## _8tap_smooth_ ## sz ## dir ## _ ## opt; \
- dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type ## _8tap_regular_ ## sz ## dir ## _ ## opt; \
- dsp->mc[idx1][FILTER_8TAP_SHARP][idx2][idxh][idxv] = type ## _8tap_sharp_ ## sz ## dir ## _ ## opt
+ipred_dir_tm_h_funcs(8, sse2);
+ipred_dir_tm_h_funcs(16, sse2);
+ipred_dir_tm_h_funcs(32, sse2);
-#define init_subpel2(idx1, idx2, sz, type, opt) \
- init_subpel1(idx1, idx2, 1, 1, sz, hv, type, opt); \
- init_subpel1(idx1, idx2, 0, 1, sz, v, type, opt); \
- init_subpel1(idx1, idx2, 1, 0, sz, h, type, opt)
+ipred_func(4, h, sse2);
-#define init_subpel3_32_64(idx, type, opt) \
- init_subpel2(0, idx, 64, type, opt); \
- init_subpel2(1, idx, 32, type, opt)
+#define ipred_all_funcs(size, opt) \
+ipred_dc_funcs(size, opt); \
+ipred_dir_tm_h_funcs(size, opt)
-#define init_subpel3_8to64(idx, type, opt) \
- init_subpel3_32_64(idx, type, opt); \
- init_subpel2(2, idx, 16, type, opt); \
- init_subpel2(3, idx, 8, type, opt)
+// FIXME hd/vl_4x4_ssse3 does not exist
+ipred_all_funcs(4, ssse3);
+ipred_all_funcs(8, ssse3);
+ipred_all_funcs(16, ssse3);
+ipred_all_funcs(32, ssse3);
-#define init_subpel3(idx, type, opt) \
- init_subpel3_8to64(idx, type, opt); \
- init_subpel2(4, idx, 4, type, opt)
+ipred_dir_tm_h_funcs(8, avx);
+ipred_dir_tm_h_funcs(16, avx);
+ipred_dir_tm_h_funcs(32, avx);
+
+ipred_func(32, v, avx);
+
+ipred_dc_funcs(32, avx2);
+ipred_func(32, h, avx2);
+ipred_func(32, tm, avx2);
+
+#undef ipred_func
+#undef ipred_dir_tm_h_funcs
+#undef ipred_dir_tm_funcs
+#undef ipred_dc_funcs
+
+#endif /* HAVE_YASM */
+
+av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
+{
+#if HAVE_YASM
+ int cpu_flags;
+
+ if (bpp == 10) {
+ ff_vp9dsp_init_10bpp_x86(dsp, bitexact);
+ return;
+ } else if (bpp == 12) {
+ ff_vp9dsp_init_12bpp_x86(dsp, bitexact);
+ return;
+ }
+
+ cpu_flags = av_get_cpu_flags();
#define init_lpf(opt) do { \
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \
@@ -291,53 +242,169 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \
} while (0)
+#define init_ipred(sz, opt, t, e) \
+ dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt
+
+#define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext
+#define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext
+#define init_dir_tm_ipred(sz, opt) do { \
+ init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \
+ init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \
+ init_ipred(sz, opt, hd, HOR_DOWN); \
+ init_ipred(sz, opt, vl, VERT_LEFT); \
+ init_ipred(sz, opt, hu, HOR_UP); \
+ init_ipred(sz, opt, tm, TM_VP8); \
+ init_ipred(sz, opt, vr, VERT_RIGHT); \
+} while (0)
+#define init_dir_tm_h_ipred(sz, opt) do { \
+ init_dir_tm_ipred(sz, opt); \
+ init_ipred(sz, opt, h, HOR); \
+} while (0)
+#define init_dc_ipred(sz, opt) do { \
+ init_ipred(sz, opt, dc, DC); \
+ init_ipred(sz, opt, dc_left, LEFT_DC); \
+ init_ipred(sz, opt, dc_top, TOP_DC); \
+} while (0)
+#define init_all_ipred(sz, opt) do { \
+ init_dc_ipred(sz, opt); \
+ init_dir_tm_h_ipred(sz, opt); \
+} while (0)
+
if (EXTERNAL_MMX(cpu_flags)) {
- init_fpel(4, 0, 4, put, mmx);
- init_fpel(3, 0, 8, put, mmx);
+ init_fpel_func(4, 0, 4, put, , mmx);
+ init_fpel_func(3, 0, 8, put, , mmx);
+ if (!bitexact) {
+ dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
+ dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
+ dsp->itxfm_add[4 /* lossless */][DCT_ADST] =
+ dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx;
+ }
+ init_ipred(8, mmx, v, VERT);
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
- init_subpel2(4, 0, 4, put, mmxext);
- init_subpel2(4, 1, 4, avg, mmxext);
- init_fpel(4, 1, 4, avg, mmxext);
- init_fpel(3, 1, 8, avg, mmxext);
+ dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_mmxext;
+ dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_mmxext;
+ dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_mmxext;
+ dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_mmxext;
+ init_subpel2(4, 0, 4, put, 8, mmxext);
+ init_subpel2(4, 1, 4, avg, 8, mmxext);
+ init_fpel_func(4, 1, 4, avg, _8, mmxext);
+ init_fpel_func(3, 1, 8, avg, _8, mmxext);
+ dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext;
+ init_dc_ipred(4, mmxext);
+ init_dc_ipred(8, mmxext);
+ init_dir_tm_ipred(4, mmxext);
}
if (EXTERNAL_SSE(cpu_flags)) {
- init_fpel(2, 0, 16, put, sse);
- init_fpel(1, 0, 32, put, sse);
- init_fpel(0, 0, 64, put, sse);
+ init_fpel_func(2, 0, 16, put, , sse);
+ init_fpel_func(1, 0, 32, put, , sse);
+ init_fpel_func(0, 0, 64, put, , sse);
+ init_ipred(16, sse, v, VERT);
+ init_ipred(32, sse, v, VERT);
}
if (EXTERNAL_SSE2(cpu_flags)) {
- init_subpel3_8to64(0, put, sse2);
- init_subpel3_8to64(1, avg, sse2);
- init_fpel(2, 1, 16, avg, sse2);
- init_fpel(1, 1, 32, avg, sse2);
- init_fpel(0, 1, 64, avg, sse2);
+ init_subpel3_8to64(0, put, 8, sse2);
+ init_subpel3_8to64(1, avg, 8, sse2);
+ init_fpel_func(2, 1, 16, avg, _8, sse2);
+ init_fpel_func(1, 1, 32, avg, _8, sse2);
+ init_fpel_func(0, 1, 64, avg, _8, sse2);
init_lpf(sse2);
+ dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_sse2;
+ dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_sse2;
+ dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_sse2;
+ dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_sse2;
+ dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_sse2;
+ dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_sse2;
+ dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_sse2;
+ dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_sse2;
+ dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_sse2;
+ dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_sse2;
+ dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_sse2;
+ dsp->itxfm_add[TX_32X32][ADST_ADST] =
+ dsp->itxfm_add[TX_32X32][ADST_DCT] =
+ dsp->itxfm_add[TX_32X32][DCT_ADST] =
+ dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_sse2;
+ init_dc_ipred(16, sse2);
+ init_dc_ipred(32, sse2);
+ init_dir_tm_h_ipred(8, sse2);
+ init_dir_tm_h_ipred(16, sse2);
+ init_dir_tm_h_ipred(32, sse2);
+ init_ipred(4, sse2, h, HOR);
}
if (EXTERNAL_SSSE3(cpu_flags)) {
- init_subpel3(0, put, ssse3);
- init_subpel3(1, avg, ssse3);
+ init_subpel3(0, put, 8, ssse3);
+ init_subpel3(1, avg, 8, ssse3);
+ dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3;
+ dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_ssse3;
+ dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_ssse3;
+ dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3;
+ dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3;
+ dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_ssse3;
+ dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_ssse3;
+ dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_ssse3;
+ dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3;
+ dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_ssse3;
+ dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_ssse3;
+ dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3;
+ dsp->itxfm_add[TX_32X32][ADST_ADST] =
+ dsp->itxfm_add[TX_32X32][ADST_DCT] =
+ dsp->itxfm_add[TX_32X32][DCT_ADST] =
+ dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3;
init_lpf(ssse3);
+ init_all_ipred(4, ssse3);
+ init_all_ipred(8, ssse3);
+ init_all_ipred(16, ssse3);
+ init_all_ipred(32, ssse3);
}
if (EXTERNAL_AVX(cpu_flags)) {
- init_fpel(1, 0, 32, put, avx);
- init_fpel(0, 0, 64, put, avx);
+ dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx;
+ dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_avx;
+ dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_avx;
+ dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx;
+ dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx;
+ dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx;
+ dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx;
+ dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx;
+ dsp->itxfm_add[TX_32X32][ADST_ADST] =
+ dsp->itxfm_add[TX_32X32][ADST_DCT] =
+ dsp->itxfm_add[TX_32X32][DCT_ADST] =
+ dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
init_lpf(avx);
+ init_dir_tm_h_ipred(8, avx);
+ init_dir_tm_h_ipred(16, avx);
+ init_dir_tm_h_ipred(32, avx);
+ }
+ if (EXTERNAL_AVX_FAST(cpu_flags)) {
+ init_fpel_func(1, 0, 32, put, , avx);
+ init_fpel_func(0, 0, 64, put, , avx);
+ init_ipred(32, avx, v, VERT);
}
- if (EXTERNAL_AVX2(cpu_flags)) {
- init_fpel(1, 1, 32, avg, avx2);
- init_fpel(0, 1, 64, avg, avx2);
-
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ init_fpel_func(1, 1, 32, avg, _8, avx2);
+ init_fpel_func(0, 1, 64, avg, _8, avx2);
+ if (ARCH_X86_64) {
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
- init_subpel3_32_64(0, put, avx2);
- init_subpel3_32_64(1, avg, avx2);
-#endif /* ARCH_X86_64 && HAVE_AVX2_EXTERNAL */
+ dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx2;
+ dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx2;
+ dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx2;
+ dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx2;
+ dsp->itxfm_add[TX_32X32][ADST_ADST] =
+ dsp->itxfm_add[TX_32X32][ADST_DCT] =
+ dsp->itxfm_add[TX_32X32][DCT_ADST] =
+ dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx2;
+ init_subpel3_32_64(0, put, 8, avx2);
+ init_subpel3_32_64(1, avg, 8, avx2);
+#endif
+ }
+ init_dc_ipred(32, avx2);
+ init_ipred(32, avx2, h, HOR);
+ init_ipred(32, avx2, tm, TM_VP8);
}
#undef init_fpel
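The vp9dsp_init.c rewrite moves the wrapper-generating macros into the new vp9dsp_init.h (below) and gives them an explicit bits-per-pixel argument, so the same templates can emit _8, _10 and _12 variants, and the init function now forwards 10/12 bpp to ff_vp9dsp_init_10bpp_x86() / ff_vp9dsp_init_12bpp_x86() before touching any 8-bit pointers. The cat() helper in the header exists because ## pastes its operands without expanding them; routing the paste through an extra function-like macro is the standard way to get a macro such as BPC expanded first. A small standalone illustration of that idiom (hypothetical names, not the FFmpeg macros):

#include <stdio.h>

#define BPC 12

/* Direct pasting would produce the undefined token init_BPC_bpp; the extra
 * expansion level in CAT() lets BPC expand to 12 before the tokens are glued. */
#define CAT_DIRECT(a, bpp, b) a##bpp##b
#define CAT(a, bpp, b)        CAT_DIRECT(a, bpp, b)

static void init_12_bpp(void) { puts("12 bpp init"); }

int main(void)
{
    CAT(init_, BPC, _bpp)();  /* expands to init_12_bpp() */
    return 0;
}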
diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
new file mode 100644
index 0000000000..e410cab3a1
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init.h
@@ -0,0 +1,189 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_VP9DSP_INIT_H
+#define AVCODEC_X86_VP9DSP_INIT_H
+
+#include "libavcodec/vp9dsp.h"
+
+// hack to force-expand BPC
+#define cat(a, bpp, b) a##bpp##b
+
+#define decl_fpel_func(avg, sz, bpp, opt) \
+void ff_vp9_##avg##sz##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+
+#define decl_mc_func(avg, sz, dir, opt, type, f_sz, bpp) \
+void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, const type (*filter)[f_sz])
+
+#define decl_mc_funcs(sz, opt, type, fsz, bpp) \
+decl_mc_func(put, sz, h, opt, type, fsz, bpp); \
+decl_mc_func(avg, sz, h, opt, type, fsz, bpp); \
+decl_mc_func(put, sz, v, opt, type, fsz, bpp); \
+decl_mc_func(avg, sz, v, opt, type, fsz, bpp)
+
+#define decl_ipred_fn(type, sz, bpp, opt) \
+void ff_vp9_ipred_##type##_##sz##x##sz##_##bpp##_##opt(uint8_t *dst, \
+ ptrdiff_t stride, \
+ const uint8_t *l, \
+ const uint8_t *a)
+
+#define decl_ipred_fns(type, bpp, opt4, opt8_16_32) \
+decl_ipred_fn(type, 4, bpp, opt4); \
+decl_ipred_fn(type, 8, bpp, opt8_16_32); \
+decl_ipred_fn(type, 16, bpp, opt8_16_32); \
+decl_ipred_fn(type, 32, bpp, opt8_16_32)
+
+#define decl_itxfm_func(typea, typeb, size, bpp, opt) \
+void cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt)(uint8_t *dst, \
+ ptrdiff_t stride, \
+ int16_t *block, \
+ int eob)
+
+#define decl_itxfm_funcs(size, bpp, opt) \
+decl_itxfm_func(idct, idct, size, bpp, opt); \
+decl_itxfm_func(iadst, idct, size, bpp, opt); \
+decl_itxfm_func(idct, iadst, size, bpp, opt); \
+decl_itxfm_func(iadst, iadst, size, bpp, opt)
+
+#define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
+static av_always_inline void \
+ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, const type (*filter)[f_sz]) \
+{ \
+ ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst, dst_stride, src, \
+ src_stride, h, filter); \
+ ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst + hszb, dst_stride, src + hszb, \
+ src_stride, h, filter); \
+}
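+
+/* mc_rep_func synthesizes a sz-wide 8-tap 1D filter from two hsz-wide calls,
+ * the second one offset by hszb bytes into both src and dst */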
+
+#define mc_rep_funcs(sz, hsz, hszb, opt, type, fsz, bpp) \
+mc_rep_func(put, sz, hsz, hszb, h, opt, type, fsz, bpp) \
+mc_rep_func(avg, sz, hsz, hszb, h, opt, type, fsz, bpp) \
+mc_rep_func(put, sz, hsz, hszb, v, opt, type, fsz, bpp) \
+mc_rep_func(avg, sz, hsz, hszb, v, opt, type, fsz, bpp)
+
+#define filter_8tap_1d_fn(op, sz, f, f_opt, fname, dir, dvar, bpp, opt) \
+static void op##_8tap_##fname##_##sz##dir##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my) \
+{ \
+ ff_vp9_##op##_8tap_1d_##dir##_##sz##_##bpp##_##opt(dst, dst_stride, src, src_stride, \
+ h, ff_filters_##f_opt[f][dvar - 1]); \
+}
+
+#define filters_8tap_1d_fn(op, sz, dir, dvar, bpp, opt, f_opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, dir, dvar, bpp, opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, dir, dvar, bpp, opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, dir, dvar, bpp, opt)
+
+#define filters_8tap_1d_fn2(op, sz, bpp, opt, f_opt) \
+filters_8tap_1d_fn(op, sz, h, mx, bpp, opt, f_opt) \
+filters_8tap_1d_fn(op, sz, v, my, bpp, opt, f_opt)
+
+#define filters_8tap_1d_fn3(op, bpp, opt4, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 64, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 32, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 16, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 8, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 4, bpp, opt4, f_opt)
+
+#define filter_8tap_2d_fn(op, sz, f, f_opt, fname, align, bpp, bytes, opt) \
+static void op##_8tap_##fname##_##sz##hv_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my) \
+{ \
+ LOCAL_ALIGNED_##align(uint8_t, temp, [71 * 64 * bytes]); \
+ ff_vp9_put_8tap_1d_h_##sz##_##bpp##_##opt(temp, 64 * bytes, src - 3 * src_stride, \
+ src_stride, h + 7, \
+ ff_filters_##f_opt[f][mx - 1]); \
+ ff_vp9_##op##_8tap_1d_v_##sz##_##bpp##_##opt(dst, dst_stride, temp + 3 * bytes * 64, \
+ 64 * bytes, h, \
+ ff_filters_##f_opt[f][my - 1]); \
+}
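+
+/* the 2D (hv) case is a horizontal put pass into an aligned temporary buffer
+ * (h + 7 rows, covering the 8-tap vertical context) followed by the requested
+ * put/avg vertical pass from that buffer into dst */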
+
+#define filters_8tap_2d_fn(op, sz, align, bpp, bytes, opt, f_opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, align, bpp, bytes, opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, align, bpp, bytes, opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, align, bpp, bytes, opt)
+
+#define filters_8tap_2d_fn2(op, align, bpp, bytes, opt4, opt8, f_opt) \
+filters_8tap_2d_fn(op, 64, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 32, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 16, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 8, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
+
+#define init_fpel_func(idx1, idx2, sz, type, bpp, opt) \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##bpp##_##opt
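+
+/* fullpel (mx == my == 0) copy/avg ignores the filter type, so the same
+ * function is assigned to all four filter entries */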
+
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, bpp, opt) \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
+ type##_8tap_smooth_##sz##dir##_##bpp##_##opt; \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
+ type##_8tap_regular_##sz##dir##_##bpp##_##opt; \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \
+ type##_8tap_sharp_##sz##dir##_##bpp##_##opt
+
+#define init_subpel2(idx1, idx2, sz, type, bpp, opt) \
+ init_subpel1(idx1, idx2, 1, 1, sz, hv, type, bpp, opt); \
+ init_subpel1(idx1, idx2, 0, 1, sz, v, type, bpp, opt); \
+ init_subpel1(idx1, idx2, 1, 0, sz, h, type, bpp, opt)
+
+#define init_subpel3_32_64(idx, type, bpp, opt) \
+ init_subpel2(0, idx, 64, type, bpp, opt); \
+ init_subpel2(1, idx, 32, type, bpp, opt)
+
+#define init_subpel3_8to64(idx, type, bpp, opt) \
+ init_subpel3_32_64(idx, type, bpp, opt); \
+ init_subpel2(2, idx, 16, type, bpp, opt); \
+ init_subpel2(3, idx, 8, type, bpp, opt)
+
+#define init_subpel3(idx, type, bpp, opt) \
+ init_subpel3_8to64(idx, type, bpp, opt); \
+ init_subpel2(4, idx, 4, type, bpp, opt)
+
+#define init_ipred_func(type, enum, sz, bpp, opt) \
+ dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \
+ cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt)
+
+#define init_8_16_32_ipred_funcs(type, enum, bpp, opt) \
+ init_ipred_func(type, enum, 8, bpp, opt); \
+ init_ipred_func(type, enum, 16, bpp, opt); \
+ init_ipred_func(type, enum, 32, bpp, opt)
+
+#define init_ipred_funcs(type, enum, bpp, opt) \
+ init_ipred_func(type, enum, 4, bpp, opt); \
+ init_8_16_32_ipred_funcs(type, enum, bpp, opt)
+
+void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp, int bitexact);
+void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp, int bitexact);
+void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);
+
+#endif /* AVCODEC_X86_VP9DSP_INIT_H */
diff --git a/libavcodec/x86/vp9dsp_init_10bpp.c b/libavcodec/x86/vp9dsp_init_10bpp.c
new file mode 100644
index 0000000000..2694c06cb2
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_10bpp.c
@@ -0,0 +1,25 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPC 10
+#define INIT_FUNC ff_vp9dsp_init_10bpp_x86
+#include "vp9dsp_init_16bpp_template.c"
diff --git a/libavcodec/x86/vp9dsp_init_12bpp.c b/libavcodec/x86/vp9dsp_init_12bpp.c
new file mode 100644
index 0000000000..5da3bc1840
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_12bpp.c
@@ -0,0 +1,25 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPC 12
+#define INIT_FUNC ff_vp9dsp_init_12bpp_x86
+#include "vp9dsp_init_16bpp_template.c"
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
new file mode 100644
index 0000000000..4576ff1692
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -0,0 +1,141 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
+
+#if HAVE_YASM
+
+decl_fpel_func(put, 8, , mmx);
+decl_fpel_func(avg, 8, _16, mmxext);
+decl_fpel_func(put, 16, , sse);
+decl_fpel_func(put, 32, , sse);
+decl_fpel_func(put, 64, , sse);
+decl_fpel_func(put, 128, , sse);
+decl_fpel_func(avg, 16, _16, sse2);
+decl_fpel_func(avg, 32, _16, sse2);
+decl_fpel_func(avg, 64, _16, sse2);
+decl_fpel_func(avg, 128, _16, sse2);
+decl_fpel_func(put, 32, , avx);
+decl_fpel_func(put, 64, , avx);
+decl_fpel_func(put, 128, , avx);
+decl_fpel_func(avg, 32, _16, avx2);
+decl_fpel_func(avg, 64, _16, avx2);
+decl_fpel_func(avg, 128, _16, avx2);
+
+decl_ipred_fns(v, 16, mmx, sse);
+decl_ipred_fns(h, 16, mmxext, sse2);
+decl_ipred_fns(dc, 16, mmxext, sse2);
+decl_ipred_fns(dc_top, 16, mmxext, sse2);
+decl_ipred_fns(dc_left, 16, mmxext, sse2);
+decl_ipred_fn(dl, 16, 16, avx2);
+
+#define decl_ipred_dir_funcs(type) \
+decl_ipred_fns(type, 16, sse2, sse2); \
+decl_ipred_fns(type, 16, ssse3, ssse3); \
+decl_ipred_fns(type, 16, avx, avx)
+
+decl_ipred_dir_funcs(dl);
+decl_ipred_dir_funcs(dr);
+decl_ipred_dir_funcs(vl);
+decl_ipred_dir_funcs(vr);
+decl_ipred_dir_funcs(hu);
+decl_ipred_dir_funcs(hd);
+#endif /* HAVE_YASM */
+
+av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(cpu_flags)) {
+ init_fpel_func(4, 0, 8, put, , mmx);
+ init_ipred_func(v, VERT, 4, 16, mmx);
+ }
+
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ init_fpel_func(4, 1, 8, avg, _16, mmxext);
+ init_ipred_func(h, HOR, 4, 16, mmxext);
+ init_ipred_func(dc, DC, 4, 16, mmxext);
+ init_ipred_func(dc_top, TOP_DC, 4, 16, mmxext);
+ init_ipred_func(dc_left, LEFT_DC, 4, 16, mmxext);
+ }
+
+ if (EXTERNAL_SSE(cpu_flags)) {
+ init_fpel_func(3, 0, 16, put, , sse);
+ init_fpel_func(2, 0, 32, put, , sse);
+ init_fpel_func(1, 0, 64, put, , sse);
+ init_fpel_func(0, 0, 128, put, , sse);
+ init_8_16_32_ipred_funcs(v, VERT, 16, sse);
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ init_fpel_func(3, 1, 16, avg, _16, sse2);
+ init_fpel_func(2, 1, 32, avg, _16, sse2);
+ init_fpel_func(1, 1, 64, avg, _16, sse2);
+ init_fpel_func(0, 1, 128, avg, _16, sse2);
+ init_8_16_32_ipred_funcs(h, HOR, 16, sse2);
+ init_8_16_32_ipred_funcs(dc, DC, 16, sse2);
+ init_8_16_32_ipred_funcs(dc_top, TOP_DC, 16, sse2);
+ init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2);
+ init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, sse2);
+ init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, sse2);
+ init_ipred_funcs(vl, VERT_LEFT, 16, sse2);
+ init_ipred_funcs(vr, VERT_RIGHT, 16, sse2);
+ init_ipred_funcs(hu, HOR_UP, 16, sse2);
+ init_ipred_funcs(hd, HOR_DOWN, 16, sse2);
+ }
+
+ if (EXTERNAL_SSSE3(cpu_flags)) {
+ init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, ssse3);
+ init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, ssse3);
+ init_ipred_funcs(vl, VERT_LEFT, 16, ssse3);
+ init_ipred_funcs(vr, VERT_RIGHT, 16, ssse3);
+ init_ipred_funcs(hu, HOR_UP, 16, ssse3);
+ init_ipred_funcs(hd, HOR_DOWN, 16, ssse3);
+ }
+
+ if (EXTERNAL_AVX_FAST(cpu_flags)) {
+ init_fpel_func(2, 0, 32, put, , avx);
+ init_fpel_func(1, 0, 64, put, , avx);
+ init_fpel_func(0, 0, 128, put, , avx);
+ init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, avx);
+ init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, avx);
+ init_ipred_funcs(vl, VERT_LEFT, 16, avx);
+ init_ipred_funcs(vr, VERT_RIGHT, 16, avx);
+ init_ipred_funcs(hu, HOR_UP, 16, avx);
+ init_ipred_funcs(hd, HOR_DOWN, 16, avx);
+ }
+
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ init_fpel_func(2, 1, 32, avg, _16, avx2);
+ init_fpel_func(1, 1, 64, avg, _16, avx2);
+ init_fpel_func(0, 1, 128, avg, _16, avx2);
+ init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
+ }
+
+#endif /* HAVE_YASM */
+}
diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c
new file mode 100644
index 0000000000..4840b2844e
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c
@@ -0,0 +1,240 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
+
+#if HAVE_YASM
+
+extern const int16_t ff_filters_16bpp[3][15][4][16];
+
+decl_mc_funcs(4, sse2, int16_t, 16, BPC);
+decl_mc_funcs(8, sse2, int16_t, 16, BPC);
+decl_mc_funcs(16, avx2, int16_t, 16, BPC);
+
+mc_rep_funcs(16, 8, 16, sse2, int16_t, 16, BPC)
+mc_rep_funcs(32, 16, 32, sse2, int16_t, 16, BPC)
+mc_rep_funcs(64, 32, 64, sse2, int16_t, 16, BPC)
+#if HAVE_AVX2_EXTERNAL
+mc_rep_funcs(32, 16, 32, avx2, int16_t, 16, BPC)
+mc_rep_funcs(64, 32, 64, avx2, int16_t, 16, BPC)
+#endif
+
+filters_8tap_2d_fn2(put, 16, BPC, 2, sse2, sse2, 16bpp)
+filters_8tap_2d_fn2(avg, 16, BPC, 2, sse2, sse2, 16bpp)
+#if HAVE_AVX2_EXTERNAL
+filters_8tap_2d_fn(put, 64, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(avg, 64, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(put, 32, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(avg, 32, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(put, 16, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(avg, 16, 32, BPC, 2, avx2, 16bpp)
+#endif
+
+filters_8tap_1d_fn3(put, BPC, sse2, sse2, 16bpp)
+filters_8tap_1d_fn3(avg, BPC, sse2, sse2, 16bpp)
+#if HAVE_AVX2_EXTERNAL
+filters_8tap_1d_fn2(put, 64, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(avg, 64, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(put, 32, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(avg, 32, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(put, 16, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(avg, 16, BPC, avx2, 16bpp)
+#endif
+
+#define decl_lpf_func(dir, wd, bpp, opt) \
+void ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
+ int E, int I, int H)
+
+#define decl_lpf_funcs(dir, wd, bpp) \
+decl_lpf_func(dir, wd, bpp, sse2); \
+decl_lpf_func(dir, wd, bpp, ssse3); \
+decl_lpf_func(dir, wd, bpp, avx)
+
+#define decl_lpf_funcs_wd(dir) \
+decl_lpf_funcs(dir, 4, BPC); \
+decl_lpf_funcs(dir, 8, BPC); \
+decl_lpf_funcs(dir, 16, BPC)
+
+decl_lpf_funcs_wd(h);
+decl_lpf_funcs_wd(v);
+
+#define lpf_16_wrapper(dir, off, bpp, opt) \
+static void loop_filter_##dir##_16_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
+ int E, int I, int H) \
+{ \
+ ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst, stride, E, I, H); \
+ ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst + off, stride, E, I, H); \
+}
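+
+/* the underlying high-bitdepth assembly covers 8 pixels along the edge, so the
+ * full 16-pixel filter is two calls, offset by 8 rows (h) or 8 pixels,
+ * i.e. 16 bytes (v) */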
+
+#define lpf_16_wrappers(bpp, opt) \
+lpf_16_wrapper(h, 8 * stride, bpp, opt) \
+lpf_16_wrapper(v, 16, bpp, opt)
+
+lpf_16_wrappers(BPC, sse2)
+lpf_16_wrappers(BPC, ssse3)
+lpf_16_wrappers(BPC, avx)
+
+#define lpf_mix2_wrapper(dir, off, wd1, wd2, bpp, opt) \
+static void loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
+ int E, int I, int H) \
+{ \
+ ff_vp9_loop_filter_##dir##_##wd1##_##bpp##_##opt(dst, stride, \
+ E & 0xff, I & 0xff, H & 0xff); \
+ ff_vp9_loop_filter_##dir##_##wd2##_##bpp##_##opt(dst + off, stride, \
+ E >> 8, I >> 8, H >> 8); \
+}
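+
+/* the mix2 wrappers filter two adjacent 8-pixel edges with possibly different
+ * filter widths; the second edge's E/I/H thresholds are packed in the high
+ * byte of each parameter */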
+
+#define lpf_mix2_wrappers(wd1, wd2, bpp, opt) \
+lpf_mix2_wrapper(h, 8 * stride, wd1, wd2, bpp, opt) \
+lpf_mix2_wrapper(v, 16, wd1, wd2, bpp, opt)
+
+#define lpf_mix2_wrappers_set(bpp, opt) \
+lpf_mix2_wrappers(4, 4, bpp, opt) \
+lpf_mix2_wrappers(4, 8, bpp, opt) \
+lpf_mix2_wrappers(8, 4, bpp, opt) \
+lpf_mix2_wrappers(8, 8, bpp, opt)
+
+lpf_mix2_wrappers_set(BPC, sse2)
+lpf_mix2_wrappers_set(BPC, ssse3)
+lpf_mix2_wrappers_set(BPC, avx)
+
+decl_ipred_fns(tm, BPC, mmxext, sse2);
+
+decl_itxfm_func(iwht, iwht, 4, BPC, mmxext);
+#if BPC == 10
+decl_itxfm_func(idct, idct, 4, BPC, mmxext);
+decl_itxfm_funcs(4, BPC, ssse3);
+#else
+decl_itxfm_func(idct, idct, 4, BPC, sse2);
+#endif
+decl_itxfm_func(idct, iadst, 4, BPC, sse2);
+decl_itxfm_func(iadst, idct, 4, BPC, sse2);
+decl_itxfm_func(iadst, iadst, 4, BPC, sse2);
+decl_itxfm_funcs(8, BPC, sse2);
+decl_itxfm_funcs(16, BPC, sse2);
+decl_itxfm_func(idct, idct, 32, BPC, sse2);
+#endif /* HAVE_YASM */
+
+av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+#define init_lpf_8_func(idx1, idx2, dir, wd, bpp, opt) \
+ dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt
+#define init_lpf_16_func(idx, dir, bpp, opt) \
+ dsp->loop_filter_16[idx] = loop_filter_##dir##_16_##bpp##_##opt
+#define init_lpf_mix2_func(idx1, idx2, idx3, dir, wd1, wd2, bpp, opt) \
+ dsp->loop_filter_mix2[idx1][idx2][idx3] = loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt
+
+#define init_lpf_funcs(bpp, opt) \
+ init_lpf_8_func(0, 0, h, 4, bpp, opt); \
+ init_lpf_8_func(0, 1, v, 4, bpp, opt); \
+ init_lpf_8_func(1, 0, h, 8, bpp, opt); \
+ init_lpf_8_func(1, 1, v, 8, bpp, opt); \
+ init_lpf_8_func(2, 0, h, 16, bpp, opt); \
+ init_lpf_8_func(2, 1, v, 16, bpp, opt); \
+ init_lpf_16_func(0, h, bpp, opt); \
+ init_lpf_16_func(1, v, bpp, opt); \
+ init_lpf_mix2_func(0, 0, 0, h, 4, 4, bpp, opt); \
+ init_lpf_mix2_func(0, 1, 0, h, 4, 8, bpp, opt); \
+ init_lpf_mix2_func(1, 0, 0, h, 8, 4, bpp, opt); \
+ init_lpf_mix2_func(1, 1, 0, h, 8, 8, bpp, opt); \
+ init_lpf_mix2_func(0, 0, 1, v, 4, 4, bpp, opt); \
+ init_lpf_mix2_func(0, 1, 1, v, 4, 8, bpp, opt); \
+ init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \
+ init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt)
+
+#define init_itx_func(idxa, idxb, typea, typeb, size, bpp, opt) \
+ dsp->itxfm_add[idxa][idxb] = \
+ cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt);
+#define init_itx_func_one(idx, typea, typeb, size, bpp, opt) \
+ init_itx_func(idx, DCT_DCT, typea, typeb, size, bpp, opt); \
+ init_itx_func(idx, ADST_DCT, typea, typeb, size, bpp, opt); \
+ init_itx_func(idx, DCT_ADST, typea, typeb, size, bpp, opt); \
+ init_itx_func(idx, ADST_ADST, typea, typeb, size, bpp, opt)
+#define init_itx_funcs(idx, size, bpp, opt) \
+ init_itx_func(idx, DCT_DCT, idct, idct, size, bpp, opt); \
+ init_itx_func(idx, ADST_DCT, idct, iadst, size, bpp, opt); \
+ init_itx_func(idx, DCT_ADST, iadst, idct, size, bpp, opt); \
+    init_itx_func(idx, ADST_ADST, iadst, iadst, size, bpp, opt);
+
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ init_ipred_func(tm, TM_VP8, 4, BPC, mmxext);
+ if (!bitexact) {
+ init_itx_func_one(4 /* lossless */, iwht, iwht, 4, BPC, mmxext);
+#if BPC == 10
+ init_itx_func(TX_4X4, DCT_DCT, idct, idct, 4, 10, mmxext);
+#endif
+ }
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ init_subpel3(0, put, BPC, sse2);
+ init_subpel3(1, avg, BPC, sse2);
+ init_lpf_funcs(BPC, sse2);
+ init_8_16_32_ipred_funcs(tm, TM_VP8, BPC, sse2);
+#if BPC == 10
+ if (!bitexact) {
+ init_itx_func(TX_4X4, ADST_DCT, idct, iadst, 4, 10, sse2);
+ init_itx_func(TX_4X4, DCT_ADST, iadst, idct, 4, 10, sse2);
+ init_itx_func(TX_4X4, ADST_ADST, iadst, iadst, 4, 10, sse2);
+ }
+#else
+ init_itx_funcs(TX_4X4, 4, 12, sse2);
+#endif
+ init_itx_funcs(TX_8X8, 8, BPC, sse2);
+ init_itx_funcs(TX_16X16, 16, BPC, sse2);
+ init_itx_func_one(TX_32X32, idct, idct, 32, BPC, sse2);
+ }
+
+ if (EXTERNAL_SSSE3(cpu_flags)) {
+ init_lpf_funcs(BPC, ssse3);
+#if BPC == 10
+ if (!bitexact) {
+ init_itx_funcs(TX_4X4, 4, BPC, ssse3);
+ }
+#endif
+ }
+
+ if (EXTERNAL_AVX(cpu_flags)) {
+ init_lpf_funcs(BPC, avx);
+ }
+
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+#if HAVE_AVX2_EXTERNAL
+ init_subpel3_32_64(0, put, BPC, avx2);
+ init_subpel3_32_64(1, avg, BPC, avx2);
+ init_subpel2(2, 0, 16, put, BPC, avx2);
+ init_subpel2(2, 1, 16, avg, BPC, avx2);
+#endif
+ }
+
+#endif /* HAVE_YASM */
+
+ ff_vp9dsp_init_16bpp_x86(dsp);
+}
diff --git a/libavcodec/x86/vp9intrapred.asm b/libavcodec/x86/vp9intrapred.asm
new file mode 100644
index 0000000000..31f7d449fd
--- /dev/null
+++ b/libavcodec/x86/vp9intrapred.asm
@@ -0,0 +1,2044 @@
+;******************************************************************************
+;* VP9 Intra prediction SIMD optimizations
+;*
+;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* Parts based on:
+;* H.264 intra prediction asm optimizations
+;* Copyright (c) 2010 Fiona Glaser
+;* Copyright (c) 2010 Holger Lubitz
+;* Copyright (c) 2010 Loren Merritt
+;* Copyright (c) 2010 Ronald S. Bultje
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pw_m256: times 16 dw -256
+pw_m255: times 16 dw -255
+pw_4096: times 8 dw 4096
+
+pb_4x3_4x2_4x1_4x0: times 4 db 3
+ times 4 db 2
+ times 4 db 1
+ times 4 db 0
+pb_8x1_8x0: times 8 db 1
+ times 8 db 0
+pb_8x3_8x2: times 8 db 3
+ times 8 db 2
+pb_0to5_2x7: db 0, 1, 2, 3, 4, 5, 7, 7
+ times 8 db -1
+pb_0to6_9x7: db 0, 1, 2, 3, 4, 5, 6
+ times 9 db 7
+pb_1to6_10x7: db 1, 2, 3, 4, 5, 6
+ times 10 db 7
+pb_2to6_3x7:
+pb_2to6_11x7: db 2, 3, 4, 5, 6
+ times 11 db 7
+pb_1toE_2xF: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
+pb_2toE_3xF: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
+pb_13456_3xm1: db 1, 3, 4, 5, 6
+ times 3 db -1
+pb_6012_4xm1: db 6, 0, 1, 2
+ times 4 db -1
+pb_6xm1_246_8toE: times 6 db -1
+ db 2, 4, 6, 8, 9, 10, 11, 12, 13, 14
+pb_6xm1_BDF_0to6: times 6 db -1
+ db 11, 13, 15, 0, 1, 2, 3, 4, 5, 6
+pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+
+pb_15x0_1xm1: times 15 db 0
+ db -1
+pb_0to2_5x3: db 0, 1, 2
+ times 5 db 3
+pb_6xm1_2x0: times 6 db -1
+ times 2 db 0
+pb_6x0_2xm1: times 6 db 0
+ times 2 db -1
+
+cextern pb_1
+cextern pb_2
+cextern pb_3
+cextern pb_15
+cextern pw_2
+cextern pw_4
+cextern pw_8
+cextern pw_16
+cextern pw_32
+cextern pw_255
+cextern pw_512
+cextern pw_1024
+cextern pw_2048
+cextern pw_8192
+
+SECTION .text
+
+; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
+
+%macro DC_4to8_FUNCS 0
+cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a
+ movd m0, [lq]
+ punpckldq m0, [aq]
+ pxor m1, m1
+ psadbw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_4096]
+ pshufb m0, m1
+%else
+ paddw m0, [pw_4]
+ psraw m0, 3
+ punpcklbw m0, m0
+ pshufw m0, m0, q0000
+%endif
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ RET
+
+cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
+ movq m0, [lq]
+ movq m1, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ psadbw m1, m2
+ paddw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_2048]
+ pshufb m0, m2
+%else
+ paddw m0, [pw_8]
+ psraw m0, 4
+ punpcklbw m0, m0
+ pshufw m0, m0, q0000
+%endif
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+DC_4to8_FUNCS
+INIT_MMX ssse3
+DC_4to8_FUNCS
+
+%macro DC_16to32_FUNCS 0
+cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
+ mova m0, [lq]
+ mova m1, [aq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ psadbw m1, m2
+ paddw m0, m1
+ movhlps m1, m0
+ paddw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_1024]
+ pshufb m0, m2
+%else
+ paddw m0, [pw_16]
+ psraw m0, 5
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+%endif
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
+ mova m0, [lq]
+ mova m1, [lq+16]
+ mova m2, [aq]
+ mova m3, [aq+16]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m4, m4
+ psadbw m0, m4
+ psadbw m1, m4
+ psadbw m2, m4
+ psadbw m3, m4
+ paddw m0, m1
+ paddw m2, m3
+ paddw m0, m2
+ movhlps m1, m0
+ paddw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_512]
+ pshufb m0, m4
+%else
+ paddw m0, [pw_32]
+ psraw m0, 6
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+%endif
+ mov cntd, 8
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+DC_16to32_FUNCS
+INIT_XMM ssse3
+DC_16to32_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
+ mova m0, [lq]
+ mova m1, [aq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ psadbw m1, m2
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+ movhlps xm1, xm0
+ paddw xm0, xm1
+ pmulhrsw xm0, [pw_512]
+ vpbroadcastb m0, xm0
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+%endif
+
+; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
+
+%macro DC_1D_4to8_FUNCS 2 ; dir (top or left), arg (a or l)
+cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a
+ movd m0, [%2q]
+ pxor m1, m1
+ psadbw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_8192]
+ pshufb m0, m1
+%else
+ paddw m0, [pw_2]
+ psraw m0, 2
+ punpcklbw m0, m0
+ pshufw m0, m0, q0000
+%endif
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ RET
+
+cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a
+ movq m0, [%2q]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pxor m1, m1
+ psadbw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_4096]
+ pshufb m0, m1
+%else
+ paddw m0, [pw_4]
+ psraw m0, 3
+ punpcklbw m0, m0
+ pshufw m0, m0, q0000
+%endif
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+DC_1D_4to8_FUNCS top, a
+DC_1D_4to8_FUNCS left, l
+INIT_MMX ssse3
+DC_1D_4to8_FUNCS top, a
+DC_1D_4to8_FUNCS left, l
+
+%macro DC_1D_16to32_FUNCS 2 ; dir (top or left), arg (a or l)
+cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
+ mova m0, [%2q]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ movhlps m1, m0
+ paddw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_2048]
+ pshufb m0, m2
+%else
+ paddw m0, [pw_8]
+ psraw m0, 4
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+%endif
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
+ mova m0, [%2q]
+ mova m1, [%2q+16]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ psadbw m1, m2
+ paddw m0, m1
+ movhlps m1, m0
+ paddw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_1024]
+ pshufb m0, m2
+%else
+ paddw m0, [pw_16]
+ psraw m0, 5
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+%endif
+ mov cntd, 8
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+DC_1D_16to32_FUNCS top, a
+DC_1D_16to32_FUNCS left, l
+INIT_XMM ssse3
+DC_1D_16to32_FUNCS top, a
+DC_1D_16to32_FUNCS left, l
+
+%macro DC_1D_AVX2_FUNCS 2 ; dir (top or left), arg (a or l)
+%if HAVE_AVX2_EXTERNAL
+cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
+ mova m0, [%2q]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+ movhlps xm1, xm0
+ paddw xm0, xm1
+ pmulhrsw xm0, [pw_1024]
+ vpbroadcastb m0, xm0
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+%endif
+%endmacro
+
+INIT_YMM avx2
+DC_1D_AVX2_FUNCS top, a
+DC_1D_AVX2_FUNCS left, l
+
+; v
+
+INIT_MMX mmx
+cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
+ movq m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
+ mova m0, [aq]
+ mova m1, [aq+16]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 8
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m1
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_YMM avx
+cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+; h
+
+%macro H_XMM_FUNCS 2
+%if notcpuflag(avx)
+cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
+ movd m0, [lq]
+%if cpuflag(ssse3)
+ pshufb m0, [pb_4x3_4x2_4x1_4x0]
+%else
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0123
+ punpcklwd m0, m0
+%endif
+ lea stride3q, [strideq*3]
+ movd [dstq+strideq*0], m0
+ psrldq m0, 4
+ movd [dstq+strideq*1], m0
+ psrldq m0, 4
+ movd [dstq+strideq*2], m0
+ psrldq m0, 4
+ movd [dstq+stride3q ], m0
+ RET
+%endif
+
+cglobal vp9_ipred_h_8x8, 3, 5, %1, dst, stride, l, stride3, cnt
+%if cpuflag(ssse3)
+ mova m2, [pb_8x1_8x0]
+ mova m3, [pb_8x3_8x2]
+%endif
+ lea stride3q, [strideq*3]
+ mov cntq, 1
+.loop:
+ movd m0, [lq+cntq*4]
+%if cpuflag(ssse3)
+ pshufb m1, m0, m3
+ pshufb m0, m2
+%else
+ punpcklbw m0, m0
+ punpcklwd m0, m0
+ pshufd m1, m0, q2233
+ pshufd m0, m0, q0011
+%endif
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ movq [dstq+strideq*2], m0
+ movhps [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntq
+ jge .loop
+ RET
+
+cglobal vp9_ipred_h_16x16, 3, 5, %2, dst, stride, l, stride3, cnt
+%if cpuflag(ssse3)
+ mova m5, [pb_1]
+ mova m6, [pb_2]
+ mova m7, [pb_3]
+ pxor m4, m4
+%endif
+ lea stride3q, [strideq*3]
+ mov cntq, 3
+.loop:
+ movd m3, [lq+cntq*4]
+%if cpuflag(ssse3)
+ pshufb m0, m3, m7
+ pshufb m1, m3, m6
+%else
+ punpcklbw m3, m3
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+%endif
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+%if cpuflag(ssse3)
+ pshufb m2, m3, m5
+ pshufb m3, m4
+%else
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+%endif
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntq
+ jge .loop
+ RET
+
+cglobal vp9_ipred_h_32x32, 3, 5, %2, dst, stride, l, stride3, cnt
+%if cpuflag(ssse3)
+ mova m5, [pb_1]
+ mova m6, [pb_2]
+ mova m7, [pb_3]
+ pxor m4, m4
+%endif
+ lea stride3q, [strideq*3]
+ mov cntq, 7
+.loop:
+ movd m3, [lq+cntq*4]
+%if cpuflag(ssse3)
+ pshufb m0, m3, m7
+ pshufb m1, m3, m6
+%else
+ punpcklbw m3, m3
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+%endif
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m1
+ mova [dstq+strideq*1+16], m1
+%if cpuflag(ssse3)
+ pshufb m2, m3, m5
+ pshufb m3, m4
+%else
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+%endif
+ mova [dstq+strideq*2+ 0], m2
+ mova [dstq+strideq*2+16], m2
+ mova [dstq+stride3q + 0], m3
+ mova [dstq+stride3q +16], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntq
+ jge .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+H_XMM_FUNCS 2, 4
+INIT_XMM ssse3
+H_XMM_FUNCS 4, 8
+INIT_XMM avx
+H_XMM_FUNCS 4, 8
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
+ mova m5, [pb_1]
+ mova m6, [pb_2]
+ mova m7, [pb_3]
+ pxor m4, m4
+ lea stride3q, [strideq*3]
+ mov cntq, 7
+.loop:
+ movd xm3, [lq+cntq*4]
+ vinserti128 m3, m3, xm3, 1
+ pshufb m0, m3, m7
+ pshufb m1, m3, m6
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ pshufb m2, m3, m5
+ pshufb m3, m4
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntq
+ jge .loop
+ RET
+%endif
+
+; tm
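+; TrueMotion prediction: dst[y][x] = clip(left[y] + above[x] - topleft); the
+; code keeps (above - topleft) in words and adds the broadcast per-row left
+; value, with packuswb providing the final clip to [0,255]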
+
+%macro TM_MMX_FUNCS 0
+cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a
+ pxor m1, m1
+ movd m0, [aq]
+ pinsrw m2, [aq-1], 0
+ punpcklbw m0, m1
+ DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
+ mova m3, [pw_m256]
+ mova m1, [pw_m255]
+ pshufb m2, m3
+%else
+ punpcklbw m2, m1
+ pshufw m2, m2, q0000
+%endif
+ psubw m0, m2
+ mov cntq, 1
+.loop:
+ pinsrw m2, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+ pshufb m4, m2, m1
+ pshufb m2, m3
+%else
+ punpcklbw m2, m1
+ pshufw m4, m2, q1111
+ pshufw m2, m2, q0000
+%endif
+ paddw m4, m0
+ paddw m2, m0
+ packuswb m4, m4
+ packuswb m2, m2
+ movd [dstq+strideq*0], m4
+ movd [dstq+strideq*1], m2
+ lea dstq, [dstq+strideq*2]
+ dec cntq
+ jge .loop
+ RET
+%endmacro
+
+INIT_MMX mmxext
+TM_MMX_FUNCS
+INIT_MMX ssse3
+TM_MMX_FUNCS
+
+%macro TM_XMM_FUNCS 0
+cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a
+ pxor m1, m1
+ movh m0, [aq]
+ pinsrw m2, [aq-1], 0
+ punpcklbw m0, m1
+ DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
+ mova m3, [pw_m256]
+ mova m1, [pw_m255]
+ pshufb m2, m3
+%else
+ punpcklbw m2, m1
+ punpcklwd m2, m2
+ pshufd m2, m2, q0000
+%endif
+ psubw m0, m2
+ mov cntq, 3
+.loop:
+ pinsrw m2, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+ pshufb m4, m2, m1
+ pshufb m2, m3
+%else
+ punpcklbw m2, m1
+ punpcklwd m2, m2
+ pshufd m4, m2, q1111
+ pshufd m2, m2, q0000
+%endif
+ paddw m4, m0
+ paddw m2, m0
+ packuswb m4, m2
+ movh [dstq+strideq*0], m4
+ movhps [dstq+strideq*1], m4
+ lea dstq, [dstq+strideq*2]
+ dec cntq
+ jge .loop
+ RET
+
+cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a
+ pxor m3, m3
+ mova m0, [aq]
+ pinsrw m2, [aq-1], 0
+ punpckhbw m1, m0, m3
+ punpcklbw m0, m3
+ DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
+ mova m4, [pw_m256]
+ mova m3, [pw_m255]
+ pshufb m2, m4
+%else
+ punpcklbw m2, m3
+ punpcklwd m2, m2
+ pshufd m2, m2, q0000
+%endif
+ psubw m1, m2
+ psubw m0, m2
+ mov cntq, 7
+.loop:
+ pinsrw m7, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+ pshufb m5, m7, m3
+ pshufb m7, m4
+%else
+ punpcklbw m7, m3
+ punpcklwd m7, m7
+ pshufd m5, m7, q1111
+ pshufd m7, m7, q0000
+%endif
+ paddw m2, m5, m0
+ paddw m5, m1
+ paddw m6, m7, m0
+ paddw m7, m1
+ packuswb m2, m5
+ packuswb m6, m7
+ mova [dstq+strideq*0], m2
+ mova [dstq+strideq*1], m6
+ lea dstq, [dstq+strideq*2]
+ dec cntq
+ jge .loop
+ RET
+
+%if ARCH_X86_64
+%define mem 0
+%else
+%define mem 64
+%endif
+cglobal vp9_ipred_tm_32x32, 4, 4, 14, mem, dst, stride, l, a
+ pxor m5, m5
+ pinsrw m4, [aq-1], 0
+ mova m0, [aq]
+ mova m2, [aq+16]
+ DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
+%if ARCH_X86_64
+ mova m12, [pw_m256]
+ mova m13, [pw_m255]
+%define pw_m256_reg m12
+%define pw_m255_reg m13
+%else
+%define pw_m256_reg [pw_m256]
+%define pw_m255_reg [pw_m255]
+%endif
+ pshufb m4, pw_m256_reg
+%else
+ punpcklbw m4, m5
+ punpcklwd m4, m4
+ pshufd m4, m4, q0000
+%endif
+ punpckhbw m1, m0, m5
+ punpckhbw m3, m2, m5
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ psubw m1, m4
+ psubw m0, m4
+ psubw m3, m4
+ psubw m2, m4
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+%else
+ mova [rsp+0*16], m0
+ mova [rsp+1*16], m1
+ mova [rsp+2*16], m2
+ mova [rsp+3*16], m3
+%endif
+ mov cntq, 15
+.loop:
+ pinsrw m3, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+ pshufb m7, m3, pw_m255_reg
+ pshufb m3, pw_m256_reg
+%else
+ pxor m7, m7
+ punpcklbw m3, m7
+ punpcklwd m3, m3
+ pshufd m7, m3, q1111
+ pshufd m3, m3, q0000
+%endif
+%if ARCH_X86_64
+ paddw m4, m7, m8
+ paddw m5, m7, m9
+ paddw m6, m7, m10
+ paddw m7, m11
+ paddw m0, m3, m8
+ paddw m1, m3, m9
+ paddw m2, m3, m10
+ paddw m3, m11
+%else
+ paddw m4, m7, [rsp+0*16]
+ paddw m5, m7, [rsp+1*16]
+ paddw m6, m7, [rsp+2*16]
+ paddw m7, [rsp+3*16]
+ paddw m0, m3, [rsp+0*16]
+ paddw m1, m3, [rsp+1*16]
+ paddw m2, m3, [rsp+2*16]
+ paddw m3, [rsp+3*16]
+%endif
+ packuswb m4, m5
+ packuswb m6, m7
+ packuswb m0, m1
+ packuswb m2, m3
+ mova [dstq+strideq*0+ 0], m4
+ mova [dstq+strideq*0+16], m6
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m2
+ lea dstq, [dstq+strideq*2]
+ dec cntq
+ jge .loop
+ RET
+%undef pw_m256_reg
+%undef pw_m255_reg
+%undef mem
+%endmacro
+
+INIT_XMM sse2
+TM_XMM_FUNCS
+INIT_XMM ssse3
+TM_XMM_FUNCS
+INIT_XMM avx
+TM_XMM_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
+ pxor m3, m3
+ pinsrw xm2, [aq-1], 0
+ vinserti128 m2, m2, xm2, 1
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, l, cnt
+ mova m4, [pw_m256]
+ mova m5, [pw_m255]
+ pshufb m2, m4
+ punpckhbw m1, m0, m3
+ punpcklbw m0, m3
+ psubw m1, m2
+ psubw m0, m2
+ mov cntq, 15
+.loop:
+ pinsrw xm7, [lq+cntq*2], 0
+ vinserti128 m7, m7, xm7, 1
+ pshufb m3, m7, m5
+ pshufb m7, m4
+ paddw m2, m3, m0
+ paddw m3, m1
+ paddw m6, m7, m0
+ paddw m7, m1
+ packuswb m2, m3
+ packuswb m6, m7
+ mova [dstq+strideq*0], m2
+ mova [dstq+strideq*1], m6
+ lea dstq, [dstq+strideq*2]
+ dec cntq
+ jge .loop
+ RET
+%endif
+
+; dl
+
+%macro LOWPASS 4 ; left [dst], center, right, tmp
+ pxor m%4, m%1, m%3
+ pand m%4, [pb_1]
+ pavgb m%1, m%3
+ psubusb m%1, m%4
+ pavgb m%1, m%2
+%endmacro
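+; LOWPASS computes the 3-tap smoothing filter (left + 2*center + right + 2) >> 2
+; entirely in bytes: pavgb rounds up, so ((left ^ right) & 1) is subtracted to
+; get floor((left + right) / 2) before averaging with center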
+
+%macro DL_MMX_FUNCS 0
+cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a
+ movq m1, [aq]
+%if cpuflag(ssse3)
+ pshufb m0, m1, [pb_0to5_2x7]
+ pshufb m2, m1, [pb_2to6_3x7]
+%else
+ punpckhbw m3, m1, m1 ; 44556677
+ pand m0, m1, [pb_6xm1_2x0] ; 012345__
+ pand m3, [pb_6x0_2xm1] ; ______77
+ psrlq m2, m1, 16 ; 234567__
+ por m0, m3 ; 01234577
+ por m2, m3 ; 23456777
+%endif
+ psrlq m1, 8
+ LOWPASS 0, 1, 2, 3
+
+ pshufw m1, m0, q3321
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*2], m1
+ psrlq m0, 8
+ psrlq m1, 8
+ add dstq, strideq
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*2], m1
+ RET
+%endmacro
+
+INIT_MMX mmxext
+DL_MMX_FUNCS
+INIT_MMX ssse3
+DL_MMX_FUNCS
+
+%macro DL_XMM_FUNCS 0
+cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a
+ movq m0, [aq]
+ lea stride5q, [strideq*5]
+%if cpuflag(ssse3)
+ pshufb m1, m0, [pb_1to6_10x7]
+%else
+ punpcklbw m1, m0, m0 ; 0011223344556677
+ punpckhwd m1, m1 ; 4x4,4x5,4x6,4x7
+%endif
+ shufps m0, m1, q3310
+%if notcpuflag(ssse3)
+ psrldq m1, m0, 1
+ shufps m1, m0, q3210
+%endif
+ psrldq m2, m1, 1
+ LOWPASS 0, 1, 2, 3
+
+ pshufd m1, m0, q3321
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*4], m1
+ psrldq m0, 1
+ psrldq m1, 1
+ movq [dstq+strideq*1], m0
+ movq [dstq+stride5q ], m1
+ lea dstq, [dstq+strideq*2]
+ psrldq m0, 1
+ psrldq m1, 1
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*4], m1
+ psrldq m0, 1
+ psrldq m1, 1
+ movq [dstq+strideq*1], m0
+ movq [dstq+stride5q ], m1
+ RET
+
+cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a
+ mova m0, [aq]
+%if cpuflag(ssse3)
+ mova m5, [pb_1toE_2xF]
+ pshufb m1, m0, m5
+ pshufb m2, m1, m5
+ pshufb m4, m0, [pb_15]
+%else
+ pand m5, m0, [pb_15x0_1xm1] ; _______________F
+ psrldq m1, m0, 1 ; 123456789ABCDEF_
+ por m1, m5 ; 123456789ABCDEFF
+ psrldq m2, m1, 1 ; 23456789ABCDEFF_
+ por m2, m5 ; 23456789ABCDEFFF
+ pshufhw m4, m1, q3333 ; xxxxxxxxFFFFFFFF
+%endif
+ LOWPASS 0, 1, 2, 3
+ DEFINE_ARGS dst, stride, cnt, stride9
+ lea stride9q, [strideq+strideq*8]
+ mov cntd, 4
+
+.loop:
+ movhlps m4, m0
+ mova [dstq+strideq*0], m0
+%if cpuflag(ssse3)
+ pshufb m0, m5
+%else
+ psrldq m0, 1
+ por m0, m5
+%endif
+ mova [dstq+strideq*8], m4
+ movhlps m4, m0
+ mova [dstq+strideq*1], m0
+%if cpuflag(ssse3)
+ pshufb m0, m5
+%else
+ psrldq m0, 1
+ por m0, m5
+%endif
+ mova [dstq+stride9q ], m4
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
+ mova m0, [aq]
+ mova m1, [aq+16]
+ PALIGNR m2, m1, m0, 1, m4
+ PALIGNR m3, m1, m0, 2, m4
+ LOWPASS 0, 2, 3, 4
+%if cpuflag(ssse3)
+ mova m5, [pb_1toE_2xF]
+ pshufb m2, m1, m5
+ pshufb m3, m2, m5
+ pshufb m6, m1, [pb_15]
+ mova m7, m6
+%else
+ pand m5, m1, [pb_15x0_1xm1] ; _______________F
+ psrldq m2, m1, 1 ; 123456789ABCDEF_
+ por m2, m5 ; 123456789ABCDEFF
+ psrldq m3, m2, 1 ; 23456789ABCDEFF_
+ por m3, m5 ; 23456789ABCDEFFF
+ pshufhw m7, m2, q3333 ; xxxxxxxxFFFFFFFF
+ pshufd m6, m7, q3333
+%endif
+ LOWPASS 1, 2, 3, 4
+ lea dst16q, [dstq +strideq*8]
+ mov cntd, 8
+ lea dst16q, [dst16q+strideq*8]
+.loop:
+ movhlps m7, m1
+ mova [dstq +strideq*0+ 0], m0
+ mova [dstq +strideq*0+16], m1
+ movhps [dstq+strideq*8+ 0], m0
+ movq [dstq +strideq*8+ 8], m1
+ mova [dstq +strideq*8+16], m7
+ mova [dst16q+strideq*0+ 0], m1
+ mova [dst16q+strideq*0+16], m6
+ mova [dst16q+strideq*8+ 0], m7
+ mova [dst16q+strideq*8+16], m6
+%if cpuflag(avx)
+ vpalignr m0, m1, m0, 1
+ pshufb m1, m5
+%elif cpuflag(ssse3)
+ palignr m2, m1, m0, 1
+ pshufb m1, m5
+ mova m0, m2
+%else
+ mova m4, m1
+ psrldq m0, 1
+ pslldq m4, 15
+ psrldq m1, 1
+ por m0, m4
+ por m1, m5
+%endif
+ add dstq, strideq
+ add dst16q, strideq
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+DL_XMM_FUNCS
+INIT_XMM ssse3
+DL_XMM_FUNCS
+INIT_XMM avx
+DL_XMM_FUNCS
+
+; dr
+
+%macro DR_MMX_FUNCS 0
+cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a
+ movd m0, [lq]
+ punpckldq m0, [aq-1]
+ movd m1, [aq+3]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ PALIGNR m1, m0, 1, m3
+ psrlq m2, m1, 8
+ LOWPASS 0, 1, 2, 3
+
+ movd [dstq+stride3q ], m0
+ psrlq m0, 8
+ movd [dstq+strideq*2], m0
+ psrlq m0, 8
+ movd [dstq+strideq*1], m0
+ psrlq m0, 8
+ movd [dstq+strideq*0], m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+DR_MMX_FUNCS
+INIT_MMX ssse3
+DR_MMX_FUNCS
+
+%macro DR_XMM_FUNCS 0
+cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
+ movq m1, [lq]
+ movhps m1, [aq-1]
+ movd m2, [aq+7]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pslldq m0, m1, 1
+ PALIGNR m2, m1, 1, m3
+ LOWPASS 0, 1, 2, 3
+
+ movhps [dstq+strideq*0], m0
+ pslldq m0, 1
+ movhps [dstq+strideq*1], m0
+ pslldq m0, 1
+ movhps [dstq+strideq*2], m0
+ pslldq m0, 1
+ movhps [dstq+stride3q ], m0
+ pslldq m0, 1
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], m0
+ pslldq m0, 1
+ movhps [dstq+strideq*1], m0
+ pslldq m0, 1
+ movhps [dstq+strideq*2], m0
+ pslldq m0, 1
+ movhps [dstq+stride3q ], m0
+ RET
+
+cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a
+ mova m1, [lq]
+ movu m2, [aq-1]
+ movd m4, [aq+15]
+ DEFINE_ARGS dst, stride, stride9, cnt
+ lea stride9q, [strideq *3]
+ mov cntd, 4
+ lea stride9q, [stride9q*3]
+ PALIGNR m4, m2, 1, m5
+ PALIGNR m3, m2, m1, 15, m5
+ LOWPASS 3, 2, 4, 5
+ pslldq m0, m1, 1
+ PALIGNR m2, m1, 1, m4
+ LOWPASS 0, 1, 2, 4
+
+.loop:
+ mova [dstq+strideq*0 ], m3
+ movhps [dstq+strideq*8+0], m0
+ movq [dstq+strideq*8+8], m3
+ PALIGNR m3, m0, 15, m1
+ pslldq m0, 1
+ mova [dstq+strideq*1 ], m3
+ movhps [dstq+stride9q +0], m0
+ movq [dstq+stride9q +8], m3
+ PALIGNR m3, m0, 15, m1
+ pslldq m0, 1
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
+ mova m1, [lq]
+ mova m2, [lq+16]
+ movu m3, [aq-1]
+ movu m4, [aq+15]
+ movd m5, [aq+31]
+ DEFINE_ARGS dst, stride, stride8, cnt
+ lea stride8q, [strideq*8]
+ PALIGNR m5, m4, 1, m7
+ PALIGNR m6, m4, m3, 15, m7
+ LOWPASS 5, 4, 6, 7
+ PALIGNR m4, m3, 1, m7
+ PALIGNR m6, m3, m2, 15, m7
+ LOWPASS 4, 3, 6, 7
+ PALIGNR m3, m2, 1, m7
+ PALIGNR m6, m2, m1, 15, m7
+ LOWPASS 3, 2, 6, 7
+ PALIGNR m2, m1, 1, m6
+ pslldq m0, m1, 1
+ LOWPASS 2, 1, 0, 6
+ mov cntd, 16
+
+ ; out=m2/m3/m4/m5
+.loop:
+ mova [dstq+stride8q*0+ 0], m4
+ mova [dstq+stride8q*0+16], m5
+ mova [dstq+stride8q*2+ 0], m3
+ mova [dstq+stride8q*2+16], m4
+ PALIGNR m5, m4, 15, m6
+ PALIGNR m4, m3, 15, m6
+ PALIGNR m3, m2, 15, m6
+ pslldq m2, 1
+ add dstq, strideq
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+DR_XMM_FUNCS
+INIT_XMM ssse3
+DR_XMM_FUNCS
+INIT_XMM avx
+DR_XMM_FUNCS
+
+; vl
+
+INIT_MMX mmxext
+cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a
+ movq m0, [aq]
+ psrlq m1, m0, 8
+ psrlq m2, m1, 8
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+ movd [dstq+strideq*0], m1
+ movd [dstq+strideq*1], m2
+ lea dstq, [dstq+strideq*2]
+ psrlq m1, 8
+ psrlq m2, 8
+ movd [dstq+strideq*0], m1
+ movd [dstq+strideq*1], m2
+ RET
+
+%macro VL_XMM_FUNCS 0
+cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a
+ movq m0, [aq]
+%if cpuflag(ssse3)
+ pshufb m0, [pb_0to6_9x7]
+%else
+ punpcklbw m1, m0, m0
+ punpckhwd m1, m1
+ shufps m0, m1, q3310
+%endif
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psrldq m1, m0, 1
+ psrldq m2, m0, 2
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+
+ movq [dstq+strideq*0], m1
+ movq [dstq+strideq*1], m2
+ psrldq m1, 1
+ psrldq m2, 1
+ movq [dstq+strideq*2], m1
+ movq [dstq+stride3q ], m2
+ lea dstq, [dstq+strideq*4]
+ psrldq m1, 1
+ psrldq m2, 1
+ movq [dstq+strideq*0], m1
+ movq [dstq+strideq*1], m2
+ psrldq m1, 1
+ psrldq m2, 1
+ movq [dstq+strideq*2], m1
+ movq [dstq+stride3q ], m2
+ RET
+
+cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+%if cpuflag(ssse3)
+ mova m4, [pb_1toE_2xF]
+ pshufb m1, m0, m4
+ pshufb m2, m1, m4
+%else
+ pand m4, m0, [pb_15x0_1xm1] ; _______________F
+ psrldq m1, m0, 1 ; 123456789ABCDEF_
+ por m1, m4 ; 123456789ABCDEFF
+ psrldq m2, m1, 1 ; 23456789ABCDEFF_
+ por m2, m4 ; 23456789ABCDEFFF
+%endif
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+%if cpuflag(ssse3)
+ pshufb m1, m4
+ pshufb m2, m4
+%else
+ psrldq m1, 1
+ psrldq m2, 1
+ por m1, m4
+ por m2, m4
+%endif
+ mova [dstq+strideq*2], m1
+ mova [dstq+stride3q ], m2
+%if cpuflag(ssse3)
+ pshufb m1, m4
+ pshufb m2, m4
+%else
+ psrldq m1, 1
+ psrldq m2, 1
+ por m1, m4
+ por m2, m4
+%endif
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
+ mova m0, [aq]
+ mova m5, [aq+16]
+ DEFINE_ARGS dst, stride, dst16, cnt
+ PALIGNR m2, m5, m0, 1, m4
+ PALIGNR m3, m5, m0, 2, m4
+ lea dst16q, [dstq +strideq*8]
+ LOWPASS 3, 2, 0, 6
+ pavgb m2, m0
+%if cpuflag(ssse3)
+ mova m4, [pb_1toE_2xF]
+ pshufb m0, m5, m4
+ pshufb m1, m0, m4
+%else
+ pand m4, m5, [pb_15x0_1xm1] ; _______________F
+ psrldq m0, m5, 1 ; 123456789ABCDEF_
+ por m0, m4 ; 123456789ABCDEFF
+ psrldq m1, m0, 1 ; 23456789ABCDEFF_
+ por m1, m4 ; 23456789ABCDEFFF
+%endif
+ lea dst16q, [dst16q+strideq*8]
+ LOWPASS 1, 0, 5, 6
+ pavgb m0, m5
+%if cpuflag(ssse3)
+ pshufb m5, [pb_15]
+%else
+ punpckhbw m5, m4, m4
+ pshufhw m5, m5, q3333
+ punpckhqdq m5, m5
+%endif
+ mov cntd, 8
+
+.loop:
+%macro %%write 3
+ mova [dstq+stride%1+ 0], %2
+ mova [dstq+stride%1+16], %3
+ movhps [dst16q+stride%1 ], %2
+ movu [dst16q+stride%1+ 8], %3
+ movq [dst16q+stride%1+24], m5
+%if cpuflag(avx)
+ palignr %2, %3, %2, 1
+ pshufb %3, m4
+%elif cpuflag(ssse3)
+ palignr m6, %3, %2, 1
+ pshufb %3, m4
+ mova %2, m6
+%else
+ pslldq m6, %3, 15
+ psrldq %3, 1
+ psrldq %2, 1
+ por %3, m4
+ por %2, m6
+%endif
+%endmacro
+
+ %%write q*0, m2, m0
+ %%write q*1, m3, m1
+ lea dstq, [dstq +strideq*2]
+ lea dst16q, [dst16q+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+VL_XMM_FUNCS
+INIT_XMM ssse3
+VL_XMM_FUNCS
+INIT_XMM avx
+VL_XMM_FUNCS
+
+; vr
+
+%macro VR_MMX_FUNCS 0
+cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
+ movq m1, [aq-1]
+ punpckldq m2, [lq]
+ movd m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pavgb m0, m1
+ PALIGNR m1, m2, 5, m3
+ psrlq m2, m1, 8
+ psllq m3, m1, 8
+ LOWPASS 2, 1, 3, 4
+
+ ; ABCD <- for the following predictor:
+ ; EFGH
+ ; IABC | m0 contains ABCDxxxx
+ ; JEFG | m2 contains xJIEFGHx
+
+%if cpuflag(ssse3)
+ punpckldq m0, m2
+ pshufb m2, [pb_13456_3xm1]
+ movd [dstq+strideq*0], m0
+ pshufb m0, [pb_6012_4xm1]
+ movd [dstq+stride3q ], m2
+ psrlq m2, 8
+ movd [dstq+strideq*2], m0
+ movd [dstq+strideq*1], m2
+%else
+ psllq m1, m2, 40
+ psrlq m2, 24
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m2
+ PALIGNR m0, m1, 7, m3
+ psllq m1, 8
+ PALIGNR m2, m1, 7, m3
+ movd [dstq+strideq*2], m0
+ movd [dstq+stride3q ], m2
+%endif
+ RET
+%endmacro
+
+INIT_MMX mmxext
+VR_MMX_FUNCS
+INIT_MMX ssse3
+VR_MMX_FUNCS
+
+%macro VR_XMM_FUNCS 1 ; n_xmm_regs for 16x16
+cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
+ movu m1, [aq-1]
+ movhps m2, [lq]
+ movq m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pavgb m0, m1
+ PALIGNR m1, m2, 9, m3
+ pslldq m2, m1, 1
+ pslldq m3, m1, 2
+ LOWPASS 1, 2, 3, 4
+
+ ; ABCDEFGH <- for the following predictor:
+ ; IJKLMNOP
+ ; QABCDEFG | m0 contains ABCDEFGHxxxxxxxx
+ ; RIJKLMNO | m1 contains xxVUTSRQIJKLMNOP
+ ; SQABCDEF
+ ; TRIJKLMN
+ ; USQABCDE
+ ; VTRIJKLM
+
+%if cpuflag(ssse3)
+ punpcklqdq m0, m1 ; ABCDEFGHxxVUTSRQ
+%endif
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m1
+%if cpuflag(ssse3)
+ pshufb m0, [pb_6xm1_BDF_0to6] ; xxxxxxUSQABCDEFG
+ pshufb m1, [pb_6xm1_246_8toE] ; xxxxxxVTRIJKLMNO
+%else
+ psrlw m2, m1, 8 ; x_U_S_Q_xxxxxxxx
+ pand m3, m1, [pw_255] ; x_V_T_R_xxxxxxxx
+ packuswb m3, m2 ; xVTRxxxxxUSQxxxx
+ pslldq m3, 4 ; xxxxxVTRxxxxxUSQ
+ PALIGNR m0, m3, 7, m4 ; xxxxxxUSQABCDEFG
+ psrldq m1, 8
+ pslldq m3, 8
+ PALIGNR m1, m3, 7, m4 ; xxxxxxVTRIJKLMNO
+%endif
+ movhps [dstq+strideq*2], m0
+ movhps [dstq+stride3q ], m1
+ lea dstq, [dstq+strideq*4]
+ pslldq m0, 1
+ pslldq m1, 1
+ movhps [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m1
+ pslldq m0, 1
+ pslldq m1, 1
+ movhps [dstq+strideq*2], m0
+ movhps [dstq+stride3q ], m1
+ RET
+
+cglobal vp9_ipred_vr_16x16, 4, 4, %1, dst, stride, l, a
+ mova m0, [aq]
+ movu m1, [aq-1]
+ mova m2, [lq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ PALIGNR m3, m1, m2, 15, m6
+ LOWPASS 3, 1, 0, 4
+ pavgb m0, m1
+ PALIGNR m1, m2, 1, m6
+ pslldq m4, m2, 1
+ LOWPASS 1, 2, 4, 5
+%if cpuflag(ssse3)
+ pshufb m1, [pb_02468ACE_13579BDF]
+%else
+ psrlw m5, m1, 8
+ pand m1, [pw_255]
+ packuswb m1, m5
+%endif
+ mov cntd, 4
+
+.loop:
+ movlhps m2, m1
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m3
+ PALIGNR m4, m0, m1, 15, m6
+ PALIGNR m5, m3, m2, 15, m6
+ mova [dstq+strideq*2], m4
+ mova [dstq+stride3q ], m5
+ lea dstq, [dstq+strideq*4]
+ PALIGNR m0, m1, 14, m6
+ PALIGNR m3, m2, 14, m6
+ pslldq m1, 2
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
+ mova m0, [aq]
+ mova m2, [aq+16]
+ movu m1, [aq-1]
+ PALIGNR m3, m2, m0, 15, m6
+ PALIGNR m4, m2, m0, 14, m6
+ LOWPASS 4, 3, 2, 5
+ pavgb m3, m2
+ mova m2, [lq+16]
+ PALIGNR m5, m1, m2, 15, m6
+ LOWPASS 5, 1, 0, 6
+ pavgb m0, m1
+ mova m6, [lq]
+%if ARCH_X86_64
+ SWAP 0, 8
+%else
+ mova [dstq], m0
+%endif
+ PALIGNR m1, m2, 1, m0
+ PALIGNR m7, m2, m6, 15, m0
+ LOWPASS 1, 2, 7, 0
+ PALIGNR m2, m6, 1, m0
+ pslldq m7, m6, 1
+ LOWPASS 2, 6, 7, 0
+%if cpuflag(ssse3)
+ pshufb m1, [pb_02468ACE_13579BDF]
+ pshufb m2, [pb_02468ACE_13579BDF]
+%else
+ psrlw m0, m1, 8
+ psrlw m6, m2, 8
+ pand m1, [pw_255]
+ pand m2, [pw_255]
+ packuswb m1, m0
+ packuswb m2, m6
+%endif
+ DEFINE_ARGS dst, stride, dst16, cnt
+ lea dst16q, [dstq +strideq*8]
+ lea dst16q, [dst16q+strideq*8]
+ SBUTTERFLY qdq, 2, 1, 6
+%if ARCH_X86_64
+ SWAP 0, 8
+%else
+ mova m0, [dstq]
+%endif
+ mov cntd, 8
+
+.loop:
+ ; even lines (0, 2, 4, ...): m1 | m0, m3
+ ; odd lines (1, 3, 5, ...): m2 | m5, m4
+%macro %%write 4
+ mova [dstq+stride%1+ 0], %3
+ mova [dstq+stride%1+16], %4
+ movhps [dst16q+stride%1 ], %2
+ movu [dst16q+stride%1+ 8], %3
+ movq [dst16q+stride%1+24], %4
+ PALIGNR %4, %3, 15, m6
+ PALIGNR %3, %2, 15, m6
+ pslldq %2, 1
+%endmacro
+
+ %%write q*0, m1, m0, m3
+ %%write q*1, m2, m5, m4
+ lea dstq, [dstq +strideq*2]
+ lea dst16q, [dst16q+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+VR_XMM_FUNCS 7
+INIT_XMM ssse3
+VR_XMM_FUNCS 6
+INIT_XMM avx
+VR_XMM_FUNCS 6
+
+; hd
+
+INIT_MMX mmxext
+cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a
+ movd m0, [lq]
+ punpckldq m0, [aq-1]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psrlq m1, m0, 8
+ psrlq m2, m1, 8
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+
+ ; DHIJ <- for the following predictor:
+ ; CGDH
+ ; BFCG | m1 contains ABCDxxxx
+ ; AEBF | m2 contains EFGHIJxx
+
+ punpcklbw m1, m2
+ punpckhdq m0, m1, m2
+
+ ; m1 contains AEBFCGDH
+ ; m0 contains CGDHIJxx
+
+ movd [dstq+stride3q ], m1
+ movd [dstq+strideq*1], m0
+ psrlq m1, 16
+ psrlq m0, 16
+ movd [dstq+strideq*2], m1
+ movd [dstq+strideq*0], m0
+ RET
+
+%macro HD_XMM_FUNCS 0
+cglobal vp9_ipred_hd_8x8, 4, 4, 5, dst, stride, l, a
+ movq m0, [lq]
+ movhps m0, [aq-1]
+ DEFINE_ARGS dst, stride, stride3, dst4
+ lea stride3q, [strideq*3]
+ lea dst4q, [dstq+strideq*4]
+ psrldq m1, m0, 1
+ psrldq m2, m1, 1
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+
+ ; HPQRSTUV <- for the following predictor
+ ; GOHPQRST
+ ; FNGOHPQR | m1 contains ABCDEFGHxxxxxxxx
+ ; EMFNGOHP | m2 contains IJKLMNOPQRSTUVxx
+ ; DLEMFNGO
+ ; CKDLEMFN
+ ; BJCKDLEM
+ ; AIBJCKDL
+
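+ ; interleave the 2-tap averages (A..H) with the 3-tap lowpass values
+ ; (I..P): each hd row consists of such (avg,filter) pairs walking down
+ ; the left edge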
+ punpcklbw m1, m2
+ movhlps m2, m2
+
+ ; m1 contains AIBJCKDLEMFNGOHP
+ ; m2 contains QRSTUVxxxxxxxxxx
+
+ movhps [dstq +stride3q ], m1
+ movq [dst4q+stride3q ], m1
+ PALIGNR m3, m2, m1, 2, m4
+ movhps [dstq +strideq*2], m3
+ movq [dst4q+strideq*2], m3
+ PALIGNR m3, m2, m1, 4, m4
+ movhps [dstq +strideq*1], m3
+ movq [dst4q+strideq*1], m3
+ PALIGNR m2, m1, 6, m4
+ movhps [dstq +strideq*0], m2
+ movq [dst4q+strideq*0], m2
+ RET
+
+cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a
+ mova m0, [lq]
+ movu m3, [aq-1]
+ DEFINE_ARGS dst, stride, stride4, dst4, dst8, dst12
+ lea stride4q, [strideq*4]
+ lea dst4q, [dstq +stride4q]
+ lea dst8q, [dst4q+stride4q]
+ lea dst12q, [dst8q+stride4q]
+ psrldq m4, m3, 1
+ psrldq m5, m3, 2
+ LOWPASS 5, 4, 3, 6
+ PALIGNR m1, m3, m0, 1, m6
+ PALIGNR m2, m3, m0, 2, m6
+ LOWPASS 2, 1, 0, 6
+ pavgb m1, m0
+ SBUTTERFLY bw, 1, 2, 6
+
+ ; I PROBABLY INVERTED L0 and L16 here
+ ; m1, m2, m5
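+ ; (m1/m2 hold the interleaved left samples, m5 the filtered above row;
+ ; every iteration shifts them along by one (avg,filter) pair)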
+.loop:
+ sub stride4q, strideq
+ movhps [dstq +stride4q +0], m2
+ movq [dstq +stride4q +8], m5
+ mova [dst4q+stride4q ], m2
+ movhps [dst8q+stride4q +0], m1
+ movq [dst8q+stride4q +8], m2
+ mova [dst12q+stride4q ], m1
+%if cpuflag(avx)
+ palignr m1, m2, m1, 2
+ palignr m2, m5, m2, 2
+%elif cpuflag(ssse3)
+ palignr m3, m2, m1, 2
+ palignr m0, m5, m2, 2
+ mova m1, m3
+ mova m2, m0
+%else
+ ; slightly modified version of PALIGNR
+ mova m6, m2
+ mova m4, m5
+ pslldq m6, 14
+ pslldq m4, 14
+ psrldq m1, 2
+ psrldq m2, 2
+ por m1, m6
+ por m2, m4
+%endif
+ psrldq m5, 2
+ jg .loop
+ RET
+
+cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
+ mova m0, [lq]
+ mova m1, [lq+16]
+ movu m2, [aq-1]
+ movu m3, [aq+15]
+ DEFINE_ARGS dst, stride, stride8, dst8, dst16, dst24
+ lea stride8q, [strideq*8]
+ lea dst8q, [dstq +stride8q]
+ lea dst16q, [dst8q +stride8q]
+ lea dst24q, [dst16q+stride8q]
+ psrldq m4, m3, 1
+ psrldq m5, m3, 2
+ LOWPASS 5, 4, 3, 6
+ PALIGNR m4, m3, m2, 2, m6
+ PALIGNR m3, m2, 1, m6
+ LOWPASS 4, 3, 2, 6
+ PALIGNR m3, m2, m1, 2, m6
+ PALIGNR m2, m1, 1, m6
+ LOWPASS 3, 2, 1, 6
+ pavgb m2, m1
+ PALIGNR m6, m1, m0, 1, m7
+ PALIGNR m1, m0, 2, m7
+ LOWPASS 1, 6, 0, 7
+ pavgb m0, m6
+ SBUTTERFLY bw, 2, 3, 6
+ SBUTTERFLY bw, 0, 1, 6
+
+ ; m0, m1, m2, m3, m4, m5
+.loop:
+ sub stride8q, strideq
+ mova [dstq +stride8q+ 0], m3
+ mova [dstq +stride8q+16], m4
+ mova [dst8q +stride8q+ 0], m2
+ mova [dst8q +stride8q+16], m3
+ mova [dst16q+stride8q+ 0], m1
+ mova [dst16q+stride8q+16], m2
+ mova [dst24q+stride8q+ 0], m0
+ mova [dst24q+stride8q+16], m1
+%if cpuflag(avx)
+ palignr m0, m1, m0, 2
+ palignr m1, m2, m1, 2
+ palignr m2, m3, m2, 2
+ palignr m3, m4, m3, 2
+ palignr m4, m5, m4, 2
+ psrldq m5, 2
+%elif cpuflag(ssse3)
+ psrldq m6, m5, 2
+ palignr m5, m4, 2
+ palignr m4, m3, 2
+ palignr m3, m2, 2
+ palignr m2, m1, 2
+ palignr m1, m0, 2
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ mova m3, m4
+ mova m4, m5
+ mova m5, m6
+%else
+ ; sort of a half-integrated version of PALIGNR
+ pslldq m7, m4, 14
+ pslldq m6, m5, 14
+ psrldq m4, 2
+ psrldq m5, 2
+ por m4, m6
+ pslldq m6, m3, 14
+ psrldq m3, 2
+ por m3, m7
+ pslldq m7, m2, 14
+ psrldq m2, 2
+ por m2, m6
+ pslldq m6, m1, 14
+ psrldq m1, 2
+ por m1, m7
+ psrldq m0, 2
+ por m0, m6
+%endif
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+HD_XMM_FUNCS
+INIT_XMM ssse3
+HD_XMM_FUNCS
+INIT_XMM avx
+HD_XMM_FUNCS
+
+%macro HU_MMX_FUNCS 0
+cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l
+ movd m0, [lq]
+%if cpuflag(ssse3)
+ pshufb m0, [pb_0to2_5x3]
+%else
+ punpcklbw m1, m0, m0 ; 00112233
+ pshufw m1, m1, q3333 ; 33333333
+ punpckldq m0, m1 ; 01233333
+%endif
+ psrlq m1, m0, 8
+ psrlq m2, m1, 8
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ SBUTTERFLY bw, 1, 2, 0
+ PALIGNR m2, m1, 2, m0
+ movd [dstq+strideq*0], m1
+ movd [dstq+strideq*1], m2
+ punpckhdq m1, m1
+ punpckhdq m2, m2
+ movd [dstq+strideq*2], m1
+ movd [dstq+stride3q ], m2
+ RET
+%endmacro
+
+INIT_MMX mmxext
+HU_MMX_FUNCS
+INIT_MMX ssse3
+HU_MMX_FUNCS
+
+%macro HU_XMM_FUNCS 1 ; n_xmm_regs in hu_32x32
+cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l
+ movq m0, [lq]
+%if cpuflag(ssse3)
+ pshufb m0, [pb_0to6_9x7]
+%else
+ punpcklbw m1, m0, m0 ; 0011223344556677
+ punpckhwd m1, m1 ; 4444555566667777
+ shufps m0, m1, q3310 ; 0123456777777777
+%endif
+ psrldq m1, m0, 1
+ psrldq m2, m1, 1
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+ DEFINE_ARGS dst, stride, stride3, dst4
+ lea stride3q, [strideq*3]
+ lea dst4q, [dstq+strideq*4]
+ SBUTTERFLY bw, 1, 2, 0
+ movq [dstq +strideq*0], m1
+ movhps [dst4q+strideq*0], m1
+ PALIGNR m0, m2, m1, 2, m3
+ movq [dstq +strideq*1], m0
+ movhps [dst4q+strideq*1], m0
+ PALIGNR m0, m2, m1, 4, m3
+ movq [dstq +strideq*2], m0
+ movhps [dst4q+strideq*2], m0
+ PALIGNR m2, m1, 6, m3
+ movq [dstq +stride3q ], m2
+ movhps [dst4q+stride3q ], m2
+ RET
+
+cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l
+ mova m0, [lq]
+%if cpuflag(ssse3)
+ mova m3, [pb_2toE_3xF]
+ pshufb m1, m0, [pb_1toE_2xF]
+ pshufb m2, m0, m3
+%else
+ pand m3, m0, [pb_15x0_1xm1]
+ psrldq m1, m0, 1
+ por m1, m3
+ punpckhbw m3, m3
+ psrldq m2, m0, 2
+ por m2, m3
+%endif
+ LOWPASS 2, 1, 0, 4
+ pavgb m1, m0
+ DEFINE_ARGS dst, stride, stride9, cnt
+ lea stride9q, [strideq*8+strideq]
+ mov cntd, 4
+ SBUTTERFLY bw, 1, 2, 0
+
+.loop:
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*8], m2
+ PALIGNR m0, m2, m1, 2, m4
+%if cpuflag(ssse3)
+ pshufb m2, m3
+%else
+ psrldq m2, 2
+ por m2, m3
+%endif
+ mova [dstq+strideq*1], m0
+ mova [dstq+stride9q ], m2
+ PALIGNR m1, m2, m0, 2, m4
+%if cpuflag(ssse3)
+ pshufb m2, m3
+%else
+ psrldq m2, 2
+ por m2, m3
+%endif
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_hu_32x32, 3, 7, %1, dst, stride, l
+ mova m1, [lq]
+ mova m0, [lq+16]
+ PALIGNR m2, m0, m1, 1, m5
+ PALIGNR m3, m0, m1, 2, m5
+ LOWPASS 3, 2, 1, 5
+ pavgb m2, m1
+%if cpuflag(ssse3)
+ mova m4, [pb_2toE_3xF]
+ pshufb m5, m0, [pb_1toE_2xF]
+ pshufb m1, m0, m4
+%else
+ pand m4, m0, [pb_15x0_1xm1]
+ psrldq m5, m0, 1
+ por m5, m4
+ punpckhbw m4, m4
+ psrldq m1, m0, 2
+ por m1, m4
+%endif
+ LOWPASS 1, 5, 0, 6
+ pavgb m0, m5
+ DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24
+ mov cntd, 8
+ xor stride0q, stride0q
+ lea dst8q, [dstq +strideq*8]
+ lea dst16q, [dst8q +strideq*8]
+ lea dst24q, [dst16q+strideq*8]
+ SBUTTERFLY bw, 0, 1, 5
+ SBUTTERFLY bw, 2, 3, 5
+%if cpuflag(ssse3)
+ pshufb m6, m1, [pb_15]
+%else
+ pshufhw m6, m4, q3333
+ punpckhqdq m6, m6
+%endif
+
+.loop:
+ mova [dstq +stride0q+ 0], m2
+ mova [dstq +stride0q+16], m3
+ mova [dst8q +stride0q+ 0], m3
+ mova [dst8q +stride0q+16], m0
+ mova [dst16q+stride0q+ 0], m0
+ mova [dst16q+stride0q+16], m1
+ mova [dst24q+stride0q+ 0], m1
+ mova [dst24q+stride0q+16], m6
+%if cpuflag(avx)
+ palignr m2, m3, m2, 2
+ palignr m3, m0, m3, 2
+ palignr m0, m1, m0, 2
+ pshufb m1, m4
+%elif cpuflag(ssse3)
+ pshufb m5, m1, m4
+ palignr m1, m0, 2
+ palignr m0, m3, 2
+ palignr m3, m2, 2
+ mova m2, m3
+ mova m3, m0
+ mova m0, m1
+ mova m1, m5
+%else
+ ; half-integrated version of PALIGNR
+ pslldq m5, m1, 14
+ pslldq m7, m0, 14
+ psrldq m1, 2
+ psrldq m0, 2
+ por m1, m4
+ por m0, m5
+ pslldq m5, m3, 14
+ psrldq m3, 2
+ por m3, m7
+ psrldq m2, 2
+ por m2, m5
+%endif
+ add stride0q, strideq
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+HU_XMM_FUNCS 8
+INIT_XMM ssse3
+HU_XMM_FUNCS 7
+INIT_XMM avx
+HU_XMM_FUNCS 7
+
+; FIXME 127, 128, 129 ?
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
new file mode 100644
index 0000000000..212e4130e8
--- /dev/null
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -0,0 +1,2174 @@
+;******************************************************************************
+;* VP9 Intra prediction SIMD optimizations
+;*
+;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
+;* Copyright (c) 2015 Henrik Gramner <henrik gramner com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pd_2: times 8 dd 2
+pd_4: times 8 dd 4
+pd_8: times 8 dd 8
+
+pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15
+pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0
+pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7
+
+cextern pw_1
+cextern pw_1023
+cextern pw_4095
+cextern pd_16
+cextern pd_32
+cextern pd_65535
+
+; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take
+; only 3 registers on x86-32, which would make it one cycle faster, but that
+; would make the code quite a bit uglier...
+
+SECTION .text
+
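+; SCRATCH/UNSCRATCH spill a register to the given stack slot on x86-32
+; (which only has xmm0-7) and simply SWAP it with a spare high register on
+; x86-64; PRELOAD likewise keeps a constant in a register on x86-64 and
+; falls back to a memory operand on x86-32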
+%macro SCRATCH 3-4
+%if ARCH_X86_64
+ SWAP %1, %2
+%if %0 == 4
+%define reg_%4 m%2
+%endif
+%else
+ mova [%3], m%1
+%if %0 == 4
+%define reg_%4 [%3]
+%endif
+%endif
+%endmacro
+
+%macro UNSCRATCH 3-4
+%if ARCH_X86_64
+ SWAP %1, %2
+%else
+ mova m%1, [%3]
+%endif
+%if %0 == 4
+%undef reg_%4
+%endif
+%endmacro
+
+%macro PRELOAD 2-3
+%if ARCH_X86_64
+ mova m%1, [%2]
+%if %0 == 3
+%define reg_%3 m%1
+%endif
+%elif %0 == 3
+%define reg_%3 [%2]
+%endif
+%endmacro
+
+INIT_MMX mmx
+cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq]
+ mova m1, [aq+mmsize]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m1
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq+mmsize*0]
+ mova m1, [aq+mmsize*1]
+ mova m2, [aq+mmsize*2]
+ mova m3, [aq+mmsize*3]
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 16
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*0+32], m2
+ mova [dstq+strideq*0+48], m3
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*1+32], m2
+ mova [dstq+strideq*1+48], m3
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_MMX mmxext
+cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a
+ mova m3, [lq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pshufw m0, m3, q3333
+ pshufw m1, m3, q2222
+ pshufw m2, m3, q1111
+ pshufw m3, m3, q0000
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a
+ mova m2, [lq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ punpckhwd m3, m2, m2
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ pshufd m0, m3, q1111
+ pshufd m1, m3, q0000
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m1
+ lea dstq, [dstq+strideq*4]
+ punpcklwd m2, m2
+ pshufd m0, m2, q3333
+ pshufd m1, m2, q2222
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ pshufd m0, m2, q1111
+ pshufd m1, m2, q0000
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m1
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt
+ mov cntd, 3
+ lea stride3q, [strideq*3]
+.loop:
+ movh m3, [lq+cntq*8]
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m1
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*2+ 0], m2
+ mova [dstq+strideq*2+16], m2
+ mova [dstq+stride3q + 0], m3
+ mova [dstq+stride3q +16], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jge .loop
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt
+ mov cntd, 7
+ lea stride3q, [strideq*3]
+.loop:
+ movh m3, [lq+cntq*8]
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*0+32], m0
+ mova [dstq+strideq*0+48], m0
+ mova [dstq+strideq*1+ 0], m1
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*1+32], m1
+ mova [dstq+strideq*1+48], m1
+ mova [dstq+strideq*2+ 0], m2
+ mova [dstq+strideq*2+16], m2
+ mova [dstq+strideq*2+32], m2
+ mova [dstq+strideq*2+48], m2
+ mova [dstq+stride3q + 0], m3
+ mova [dstq+stride3q +16], m3
+ mova [dstq+stride3q +32], m3
+ mova [dstq+stride3q +48], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jge .loop
+ RET
+
+INIT_MMX mmxext
+cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [lq]
+ paddw m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pmaddwd m0, [pw_1]
+ pshufw m1, m0, q3232
+ paddd m0, [pd_4]
+ paddd m0, m1
+ psrad m0, 3
+ pshufw m0, m0, q0000
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [lq]
+ paddw m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, [pd_8]
+ paddd m0, m1
+ psrad m0, 4
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [lq]
+ paddw m0, [lq+mmsize]
+ paddw m0, [aq]
+ paddw m0, [aq+mmsize]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, [pd_16]
+ paddd m0, m1
+ psrad m0, 5
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [lq+mmsize*0]
+ paddw m0, [lq+mmsize*1]
+ paddw m0, [lq+mmsize*2]
+ paddw m0, [lq+mmsize*3]
+ paddw m0, [aq+mmsize*0]
+ paddw m0, [aq+mmsize*1]
+ paddw m0, [aq+mmsize*2]
+ paddw m0, [aq+mmsize*3]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 16
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, [pd_32]
+ paddd m0, m1
+ psrad m0, 6
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*0+32], m0
+ mova [dstq+strideq*0+48], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*1+32], m0
+ mova [dstq+strideq*1+48], m0
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+
+%macro DC_1D_FNS 2
+INIT_MMX mmxext
+cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [%2]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pmaddwd m0, [pw_1]
+ pshufw m1, m0, q3232
+ paddd m0, [pd_2]
+ paddd m0, m1
+ psrad m0, 2
+ pshufw m0, m0, q0000
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [%2]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, [pd_4]
+ paddd m0, m1
+ psrad m0, 3
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [%2]
+ paddw m0, [%2+mmsize]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, [pd_8]
+ paddd m0, m1
+ psrad m0, 4
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [%2+mmsize*0]
+ paddw m0, [%2+mmsize*1]
+ paddw m0, [%2+mmsize*2]
+ paddw m0, [%2+mmsize*3]
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 16
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, [pd_16]
+ paddd m0, m1
+ psrad m0, 5
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*0+32], m0
+ mova [dstq+strideq*0+48], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*1+32], m0
+ mova [dstq+strideq*1+48], m0
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+DC_1D_FNS top, aq
+DC_1D_FNS left, lq
+
+INIT_MMX mmxext
+cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a
+ mova m5, [pw_1023]
+.body:
+ mova m4, [aq]
+ mova m3, [lq]
+ movd m0, [aq-4]
+ pshufw m0, m0, q1111
+ psubw m4, m0
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pshufw m0, m3, q3333
+ pshufw m1, m3, q2222
+ pshufw m2, m3, q1111
+ pshufw m3, m3, q0000
+ paddw m0, m4
+ paddw m1, m4
+ paddw m2, m4
+ paddw m3, m4
+ pxor m4, m4
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pmaxsw m2, m4
+ pmaxsw m3, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ pminsw m2, m5
+ pminsw m3, m5
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ RET
+
+cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a
+ mova m5, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a
+ mova m4, [pw_1023]
+.body:
+ pxor m6, m6
+ mova m5, [aq]
+ movd m0, [aq-4]
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ psubw m5, m0
+ DEFINE_ARGS dst, stride, l, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 1
+.loop:
+ movh m3, [lq+cntq*8]
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ paddw m0, m5
+ paddw m1, m5
+ paddw m2, m5
+ paddw m3, m5
+ pmaxsw m0, m6
+ pmaxsw m1, m6
+ pmaxsw m2, m6
+ pmaxsw m3, m6
+ pminsw m0, m4
+ pminsw m1, m4
+ pminsw m2, m4
+ pminsw m3, m4
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jge .loop
+ RET
+
+cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a
+ mova m4, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a
+ mova m7, [pw_1023]
+.body:
+ pxor m6, m6
+ mova m4, [aq]
+ mova m5, [aq+mmsize]
+ movd m0, [aq-4]
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ psubw m4, m0
+ psubw m5, m0
+ DEFINE_ARGS dst, stride, l, cnt
+ mov cntd, 7
+.loop:
+ movd m3, [lq+cntq*4]
+ punpcklwd m3, m3
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ paddw m0, m2, m4
+ paddw m2, m5
+ paddw m1, m3, m4
+ paddw m3, m5
+ pmaxsw m0, m6
+ pmaxsw m2, m6
+ pmaxsw m1, m6
+ pmaxsw m3, m6
+ pminsw m0, m7
+ pminsw m2, m7
+ pminsw m1, m7
+ pminsw m3, m7
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m2
+ mova [dstq+strideq*1+ 0], m1
+ mova [dstq+strideq*1+16], m3
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jge .loop
+ RET
+
+cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a
+ mova m7, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
+ mova m0, [pw_1023]
+.body:
+ pxor m1, m1
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+%define reg_min m9
+%define reg_max m8
+%else
+ mova [rsp+ 0], m0
+ mova [rsp+16], m1
+%define reg_min [rsp+16]
+%define reg_max [rsp+ 0]
+%endif
+
+ mova m4, [aq+mmsize*0]
+ mova m5, [aq+mmsize*1]
+ mova m6, [aq+mmsize*2]
+ mova m7, [aq+mmsize*3]
+ movd m0, [aq-4]
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ psubw m4, m0
+ psubw m5, m0
+ psubw m6, m0
+ psubw m7, m0
+ DEFINE_ARGS dst, stride, l, cnt
+ mov cntd, 31
+.loop:
+ pinsrw m3, [lq+cntq*2], 0
+ punpcklwd m3, m3
+ pshufd m3, m3, q0000
+ paddw m0, m3, m4
+ paddw m1, m3, m5
+ paddw m2, m3, m6
+ paddw m3, m7
+ pmaxsw m0, reg_min
+ pmaxsw m1, reg_min
+ pmaxsw m2, reg_min
+ pmaxsw m3, reg_min
+ pminsw m0, reg_max
+ pminsw m1, reg_max
+ pminsw m2, reg_max
+ pminsw m3, reg_max
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*0+32], m2
+ mova [dstq+strideq*0+48], m3
+ add dstq, strideq
+ dec cntd
+ jge .loop
+ RET
+
+cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
+ mova m0, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body
+
+; Directional intra prediction functions
+;
+; in the functions below, 'abcdefgh' refers to above data (sometimes simply
+; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply
+; abbreviated as l[N-M]). * is top-left data. ABCDEFGH or A[N-M] is filtered
+; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered
+; top-left data.
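+;
+; Each of the directional predictors below first runs the relevant edge
+; samples through the 3-tap lowpass (and, where the predictor needs it, a
+; 2-tap average), then emits every output row as a progressively shifted
+; view of those filtered vectors.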
+
+; left=(left+2*center+right+2)>>2
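+; (implemented as ((left+right)>>1 + center + 1) >> 1, which gives the same
+; result as the formula above and keeps all intermediates within 16 bits
+; for 10/12 bpp input)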
+%macro LOWPASS 3 ; left [dst], center, right
+ paddw m%1, m%3
+ psraw m%1, 1
+ pavgw m%1, m%2
+%endmacro
+
+; abcdefgh (src) -> bcdefghh (dst)
+; dst/src can be the same register
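+; (the last sample is repeated because samples beyond the top-right edge
+; are clamped to the last available above pixel)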
+%macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg]
+%if cpuflag(ssse3)
+ pshufb %1, %2, %3 ; abcdefgh -> bcdefghh
+%else
+ psrldq %1, %2, 2 ; abcdefgh -> bcdefgh.
+ pshufhw %1, %1, q2210 ; bcdefgh. -> bcdefghh
+%endif
+%endmacro
+
+; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2)
+%macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg]
+%if cpuflag(ssse3)
+ pshufb %1, %3, %4 ; abcdefgh -> bcdefghh
+ pshufb %2, %1, %4 ; bcdefghh -> cdefghhh
+%else
+ psrldq %1, %3, 2 ; abcdefgh -> bcdefgh.
+ psrldq %2, %3, 4 ; abcdefgh -> cdefgh..
+ pshufhw %1, %1, q2210 ; bcdefgh. -> bcdefghh
+ pshufhw %2, %2, q1110 ; cdefgh.. -> cdefghhh
+%endif
+%endmacro
+
+%macro DL_FUNCS 0
+cglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a
+ movifnidn aq, amp
+ movu m1, [aq] ; abcdefgh
+ pshufhw m0, m1, q3310 ; abcdefhh
+ SHIFT_RIGHT m1, m1 ; bcdefghh
+ psrldq m2, m1, 2 ; cdefghh.
+ LOWPASS 0, 1, 2 ; BCDEFGh.
+ pshufd m1, m0, q3321 ; DEFGh...
+ movh [dstq+strideq*0], m0
+ movh [dstq+strideq*2], m1
+ add dstq, strideq
+ psrldq m0, 2 ; CDEFGh..
+ psrldq m1, 2 ; EFGh....
+ movh [dstq+strideq*0], m0
+ movh [dstq+strideq*2], m1
+ RET
+
+cglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq] ; abcdefgh
+%if cpuflag(ssse3)
+ mova m4, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m1, m2, m0, m4 ; bcdefghh/cdefghhh
+ LOWPASS 0, 1, 2 ; BCDEFGHh
+ shufps m1, m0, m2, q3332 ; FGHhhhhh
+ shufps m3, m0, m1, q2121 ; DEFGHhhh
+ DEFINE_ARGS dst, stride, stride5
+ lea stride5q, [strideq*5]
+
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*4], m1
+ SHIFT_RIGHT m0, m0, m4 ; CDEFGHhh
+ pshuflw m1, m1, q3321 ; GHhhhhhh
+ pshufd m2, m0, q3321 ; EFGHhhhh
+ mova [dstq+strideq*1], m0
+ mova [dstq+stride5q ], m1
+ lea dstq, [dstq+strideq*2]
+ pshuflw m1, m1, q3321 ; Hhhhhhhh
+ mova [dstq+strideq*0], m3
+ mova [dstq+strideq*4], m1
+ pshuflw m1, m1, q3321 ; hhhhhhhh
+ mova [dstq+strideq*1], m2
+ mova [dstq+stride5q ], m1
+ RET
+
+cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq] ; abcdefgh
+ mova m3, [aq+mmsize] ; ijklmnop
+ PALIGNR m1, m3, m0, 2, m4 ; bcdefghi
+ PALIGNR m2, m3, m0, 4, m4 ; cdefghij
+ LOWPASS 0, 1, 2 ; BCDEFGHI
+%if cpuflag(ssse3)
+ mova m4, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m2, m1, m3, m4 ; jklmnopp/klmnoppp
+ LOWPASS 1, 2, 3 ; JKLMNOPp
+ pshufd m2, m2, q3333 ; pppppppp
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 8
+
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*8+ 0], m1
+ mova [dstq+strideq*8+16], m2
+ add dstq, strideq
+%if cpuflag(avx)
+ vpalignr m0, m1, m0, 2
+%else
+ PALIGNR m3, m1, m0, 2, m4
+ mova m0, m3
+%endif
+ SHIFT_RIGHT m1, m1, m4
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq+mmsize*0] ; abcdefgh
+ mova m1, [aq+mmsize*1] ; ijklmnop
+ mova m2, [aq+mmsize*2] ; qrstuvwx
+ mova m3, [aq+mmsize*3] ; yz012345
+ PALIGNR m4, m1, m0, 2, m6
+ PALIGNR m5, m1, m0, 4, m6
+ LOWPASS 0, 4, 5 ; BCDEFGHI
+ PALIGNR m4, m2, m1, 2, m6
+ PALIGNR m5, m2, m1, 4, m6
+ LOWPASS 1, 4, 5 ; JKLMNOPQ
+ PALIGNR m4, m3, m2, 2, m6
+ PALIGNR m5, m3, m2, 4, m6
+ LOWPASS 2, 4, 5 ; RSTUVWXY
+%if cpuflag(ssse3)
+ mova m6, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m4, m5, m3, m6
+ LOWPASS 3, 4, 5 ; Z0123455
+ pshufd m4, m4, q3333 ; 55555555
+ DEFINE_ARGS dst, stride, stride8, stride24, cnt
+ mov cntd, 8
+ lea stride8q, [strideq*8]
+ lea stride24q, [stride8q*3]
+
+.loop:
+ mova [dstq+stride8q*0+ 0], m0
+ mova [dstq+stride8q*0+16], m1
+ mova [dstq+stride8q*0+32], m2
+ mova [dstq+stride8q*0+48], m3
+ mova [dstq+stride8q*1+ 0], m1
+ mova [dstq+stride8q*1+16], m2
+ mova [dstq+stride8q*1+32], m3
+ mova [dstq+stride8q*1+48], m4
+ mova [dstq+stride8q*2+ 0], m2
+ mova [dstq+stride8q*2+16], m3
+ mova [dstq+stride8q*2+32], m4
+ mova [dstq+stride8q*2+48], m4
+ mova [dstq+stride24q + 0], m3
+ mova [dstq+stride24q +16], m4
+ mova [dstq+stride24q +32], m4
+ mova [dstq+stride24q +48], m4
+ add dstq, strideq
+%if cpuflag(avx)
+ vpalignr m0, m1, m0, 2
+ vpalignr m1, m2, m1, 2
+ vpalignr m2, m3, m2, 2
+%else
+ PALIGNR m5, m1, m0, 2, m6
+ mova m0, m5
+ PALIGNR m5, m2, m1, 2, m6
+ mova m1, m5
+ PALIGNR m5, m3, m2, 2, m6
+ mova m2, m5
+%endif
+ SHIFT_RIGHT m3, m3, m6
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+DL_FUNCS
+INIT_XMM ssse3
+DL_FUNCS
+INIT_XMM avx
+DL_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq] ; abcdefghijklmnop
+ vpbroadcastw xm1, [aq+30] ; pppppppp
+ vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp
+ vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp
+ vpalignr m4, m2, m0, 4 ; cdefghijklmnoppp
+ LOWPASS 0, 3, 4 ; BCDEFGHIJKLMNOPp
+ vperm2i128 m2, m0, m1, q0201 ; JKLMNOPppppppppp
+ DEFINE_ARGS dst, stride, stride3, cnt
+ mov cntd, 2
+ lea stride3q, [strideq*3]
+.loop:
+ mova [dstq+strideq*0], m0
+ vpalignr m3, m2, m0, 2
+ vpalignr m4, m2, m0, 4
+ mova [dstq+strideq*1], m3
+ mova [dstq+strideq*2], m4
+ vpalignr m3, m2, m0, 6
+ vpalignr m4, m2, m0, 8
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m4
+ vpalignr m3, m2, m0, 10
+ vpalignr m4, m2, m0, 12
+ mova [dstq+strideq*1], m3
+ mova [dstq+strideq*2], m4
+ vpalignr m3, m2, m0, 14
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ mova m0, m2
+ vperm2i128 m2, m2, m2, q0101 ; pppppppppppppppp
+ dec cntd
+ jg .loop
+ RET
+%endif
+
+%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
+cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
+ movh m0, [lq] ; wxyz....
+ movhps m0, [aq-2] ; wxyz*abc
+ movd m1, [aq+6] ; d.......
+ PALIGNR m1, m0, 2, m2 ; xyz*abcd
+ psrldq m2, m1, 2 ; yz*abcd.
+ LOWPASS 0, 1, 2 ; XYZ#ABC.
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ movh [dstq+stride3q ], m0
+ psrldq m0, 2 ; YZ#ABC..
+ movh [dstq+strideq*2], m0
+ psrldq m0, 2 ; Z#ABC...
+ movh [dstq+strideq*1], m0
+ psrldq m0, 2 ; #ABC....
+ movh [dstq+strideq*0], m0
+ RET
+
+cglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a
+ mova m0, [lq] ; stuvwxyz
+ movu m1, [aq-2] ; *abcdefg
+ mova m2, [aq] ; abcdefgh
+ psrldq m3, m2, 2 ; bcdefgh.
+ LOWPASS 3, 2, 1 ; ABCDEFG.
+ PALIGNR m1, m0, 2, m4 ; tuvwxyz*
+ PALIGNR m2, m1, 2, m4 ; uvwxyz*a
+ LOWPASS 2, 1, 0 ; TUVWXYZ#
+ DEFINE_ARGS dst, stride, dst4, stride3
+ lea stride3q, [strideq*3]
+ lea dst4q, [dstq+strideq*4]
+
+ movhps [dstq +stride3q +0], m2
+ movh [dstq+ stride3q +8], m3
+ mova [dst4q+stride3q +0], m2
+ PALIGNR m1, m3, m2, 2, m0
+ psrldq m3, 2
+ movhps [dstq +strideq*2+0], m1
+ movh [dstq+ strideq*2+8], m3
+ mova [dst4q+strideq*2+0], m1
+ PALIGNR m2, m3, m1, 2, m0
+ psrldq m3, 2
+ movhps [dstq +strideq*1+0], m2
+ movh [dstq+ strideq*1+8], m3
+ mova [dst4q+strideq*1+0], m2
+ PALIGNR m1, m3, m2, 2, m0
+ psrldq m3, 2
+ movhps [dstq +strideq*0+0], m1
+ movh [dstq+ strideq*0+8], m3
+ mova [dst4q+strideq*0+0], m1
+ RET
+
+cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a
+ mova m0, [lq] ; klmnopqr
+ mova m1, [lq+mmsize] ; stuvwxyz
+ movu m2, [aq-2] ; *abcdefg
+ movu m3, [aq+mmsize-2] ; hijklmno
+ mova m4, [aq] ; abcdefgh
+ mova m5, [aq+mmsize] ; ijklmnop
+ psrldq m6, m5, 2 ; jklmnop.
+ LOWPASS 6, 5, 3 ; IJKLMNO.
+ PALIGNR m5, m4, 2, m3 ; bcdefghi
+ LOWPASS 5, 4, 2 ; ABCDEFGH
+ PALIGNR m2, m1, 2, m3 ; tuvwxyz*
+ PALIGNR m4, m2, 2, m3 ; uvwxyz*a
+ LOWPASS 4, 2, 1 ; TUVWXYZ#
+ PALIGNR m1, m0, 2, m3 ; lmnopqrs
+ PALIGNR m2, m1, 2, m3 ; mnopqrst
+ LOWPASS 2, 1, 0 ; LMNOPQRS
+ DEFINE_ARGS dst, stride, dst8, cnt
+ lea dst8q, [dstq+strideq*8]
+ mov cntd, 8
+
+.loop:
+ sub dst8q, strideq
+ mova [dst8q+strideq*0+ 0], m4
+ mova [dst8q+strideq*0+16], m5
+ mova [dst8q+strideq*8+ 0], m2
+ mova [dst8q+strideq*8+16], m4
+%if cpuflag(avx)
+ vpalignr m2, m4, m2, 2
+ vpalignr m4, m5, m4, 2
+ vpalignr m5, m6, m5, 2
+%else
+ PALIGNR m0, m4, m2, 2, m1
+ mova m2, m0
+ PALIGNR m0, m5, m4, 2, m1
+ mova m4, m0
+ PALIGNR m0, m6, m5, 2, m1
+ mova m5, m0
+%endif
+ psrldq m6, 2
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \
+ %1 * ARCH_X86_32 * -mmsize, dst, stride, l, a
+ mova m0, [aq+mmsize*3] ; a[24-31]
+ movu m1, [aq+mmsize*3-2] ; a[23-30]
+ psrldq m2, m0, 2 ; a[25-31].
+ LOWPASS 2, 0, 1 ; A[24-30].
+ mova m1, [aq+mmsize*2] ; a[16-23]
+ movu m3, [aq+mmsize*2-2] ; a[15-22]
+ PALIGNR m0, m1, 2, m4 ; a[17-24]
+ LOWPASS 0, 1, 3 ; A[16-23]
+ mova m3, [aq+mmsize*1] ; a[8-15]
+ movu m4, [aq+mmsize*1-2] ; a[7-14]
+ PALIGNR m1, m3, 2, m5 ; a[9-16]
+ LOWPASS 1, 3, 4 ; A[8-15]
+ mova m4, [aq+mmsize*0] ; a[0-7]
+ movu m5, [aq+mmsize*0-2] ; *a[0-6]
+ PALIGNR m3, m4, 2, m6 ; a[1-8]
+ LOWPASS 3, 4, 5 ; A[0-7]
+ SCRATCH 1, 8, rsp+0*mmsize
+ SCRATCH 3, 9, rsp+1*mmsize
+%if notcpuflag(ssse3)
+ SCRATCH 0, 10, rsp+2*mmsize
+%endif
+ mova m6, [lq+mmsize*3] ; l[24-31]
+ PALIGNR m5, m6, 2, m0 ; l[25-31]*
+ PALIGNR m4, m5, 2, m0 ; l[26-31]*a
+ LOWPASS 4, 5, 6 ; L[25-31]#
+ mova m7, [lq+mmsize*2] ; l[16-23]
+ PALIGNR m6, m7, 2, m0 ; l[17-24]
+ PALIGNR m5, m6, 2, m0 ; l[18-25]
+ LOWPASS 5, 6, 7 ; L[17-24]
+ mova m1, [lq+mmsize*1] ; l[8-15]
+ PALIGNR m7, m1, 2, m0 ; l[9-16]
+ PALIGNR m6, m7, 2, m0 ; l[10-17]
+ LOWPASS 6, 7, 1 ; L[9-16]
+ mova m3, [lq+mmsize*0] ; l[0-7]
+ PALIGNR m1, m3, 2, m0 ; l[1-8]
+ PALIGNR m7, m1, 2, m0 ; l[2-9]
+ LOWPASS 7, 1, 3 ; L[1-8]
+%if cpuflag(ssse3)
+%if cpuflag(avx)
+ UNSCRATCH 1, 8, rsp+0*mmsize
+%endif
+ UNSCRATCH 3, 9, rsp+1*mmsize
+%else
+ UNSCRATCH 0, 10, rsp+2*mmsize
+%endif
+ DEFINE_ARGS dst8, stride, stride8, stride24, cnt
+ lea stride8q, [strideq*8]
+ lea stride24q, [stride8q*3]
+ lea dst8q, [dst8q+strideq*8]
+ mov cntd, 8
+
+.loop:
+ sub dst8q, strideq
+%if notcpuflag(avx)
+ UNSCRATCH 1, 8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+ UNSCRATCH 3, 9, rsp+1*mmsize
+%endif
+%endif
+ mova [dst8q+stride8q*0+ 0], m4
+ mova [dst8q+stride8q*0+16], m3
+ mova [dst8q+stride8q*0+32], m1
+ mova [dst8q+stride8q*0+48], m0
+ mova [dst8q+stride8q*1+ 0], m5
+ mova [dst8q+stride8q*1+16], m4
+ mova [dst8q+stride8q*1+32], m3
+ mova [dst8q+stride8q*1+48], m1
+ mova [dst8q+stride8q*2+ 0], m6
+ mova [dst8q+stride8q*2+16], m5
+ mova [dst8q+stride8q*2+32], m4
+ mova [dst8q+stride8q*2+48], m3
+ mova [dst8q+stride24q + 0], m7
+ mova [dst8q+stride24q +16], m6
+ mova [dst8q+stride24q +32], m5
+ mova [dst8q+stride24q +48], m4
+%if cpuflag(avx)
+ vpalignr m7, m6, m7, 2
+ vpalignr m6, m5, m6, 2
+ vpalignr m5, m4, m5, 2
+ vpalignr m4, m3, m4, 2
+ vpalignr m3, m1, m3, 2
+ vpalignr m1, m0, m1, 2
+ vpalignr m0, m2, m0, 2
+%else
+ SCRATCH 2, 8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+ SCRATCH 0, 9, rsp+1*mmsize
+%endif
+ PALIGNR m2, m6, m7, 2, m0
+ mova m7, m2
+ PALIGNR m2, m5, m6, 2, m0
+ mova m6, m2
+ PALIGNR m2, m4, m5, 2, m0
+ mova m5, m2
+ PALIGNR m2, m3, m4, 2, m0
+ mova m4, m2
+ PALIGNR m2, m1, m3, 2, m0
+ mova m3, m2
+%if notcpuflag(ssse3)
+ UNSCRATCH 0, 9, rsp+1*mmsize
+ SCRATCH 3, 9, rsp+1*mmsize
+%endif
+ PALIGNR m2, m0, m1, 2, m3
+ mova m1, m2
+ UNSCRATCH 2, 8, rsp+0*mmsize
+ SCRATCH 1, 8, rsp+0*mmsize
+ PALIGNR m1, m2, m0, 2, m3
+ mova m0, m1
+%endif
+ psrldq m2, 2
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+DR_FUNCS 3
+INIT_XMM ssse3
+DR_FUNCS 2
+INIT_XMM avx
+DR_FUNCS 2
+
+%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
+cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
+ movifnidn aq, amp
+ movu m0, [aq] ; abcdefgh
+ psrldq m1, m0, 2 ; bcdefgh.
+ psrldq m2, m0, 4 ; cdefgh..
+ LOWPASS 2, 1, 0 ; BCDEFGH.
+ pavgw m1, m0 ; ABCDEFG.
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ movh [dstq+strideq*0], m1
+ movh [dstq+strideq*1], m2
+ psrldq m1, 2
+ psrldq m2, 2
+ movh [dstq+strideq*2], m1
+ movh [dstq+stride3q ], m2
+ RET
+
+cglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq] ; abcdefgh
+%if cpuflag(ssse3)
+ mova m3, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m1, m2, m0, m3 ; bcdefghh/cdefghhh
+ LOWPASS 2, 1, 0 ; BCDEFGHh
+ pavgw m1, m0 ; ABCDEFGh
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+ SHIFT_RIGHT m1, m1, m3
+ SHIFT_RIGHT m2, m2, m3
+ mova [dstq+strideq*2], m1
+ mova [dstq+stride3q ], m2
+ lea dstq, [dstq+strideq*4]
+ SHIFT_RIGHT m1, m1, m3
+ SHIFT_RIGHT m2, m2, m3
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+ SHIFT_RIGHT m1, m1, m3
+ SHIFT_RIGHT m2, m2, m3
+ mova [dstq+strideq*2], m1
+ mova [dstq+stride3q ], m2
+ RET
+
+cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq]
+ mova m1, [aq+mmsize]
+ PALIGNR m2, m1, m0, 2, m3
+ PALIGNR m3, m1, m0, 4, m4
+ LOWPASS 3, 2, 0
+ pavgw m2, m0
+%if cpuflag(ssse3)
+ mova m4, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m5, m0, m1, m4
+ LOWPASS 0, 5, 1
+ pavgw m1, m5
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 8
+
+.loop:
+ mova [dstq+strideq*0+ 0], m2
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*1+ 0], m3
+ mova [dstq+strideq*1+16], m0
+ lea dstq, [dstq+strideq*2]
+%if cpuflag(avx)
+ vpalignr m2, m1, m2, 2
+ vpalignr m3, m0, m3, 2
+%else
+ PALIGNR m5, m1, m2, 2, m4
+ mova m2, m5
+ PALIGNR m5, m0, m3, 2, m4
+ mova m3, m5
+%endif
+ SHIFT_RIGHT m1, m1, m4
+ SHIFT_RIGHT m0, m0, m4
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq+mmsize*0]
+ mova m1, [aq+mmsize*1]
+ mova m2, [aq+mmsize*2]
+ PALIGNR m6, m1, m0, 2, m5
+ PALIGNR m7, m1, m0, 4, m5
+ LOWPASS 7, 6, 0
+ pavgw m6, m0
+ SCRATCH 6, 8, rsp+0*mmsize
+ PALIGNR m4, m2, m1, 2, m0
+ PALIGNR m5, m2, m1, 4, m0
+ LOWPASS 5, 4, 1
+ pavgw m4, m1
+ mova m0, [aq+mmsize*3]
+ PALIGNR m1, m0, m2, 2, m6
+ PALIGNR m3, m0, m2, 4, m6
+ LOWPASS 3, 1, 2
+ pavgw m2, m1
+%if cpuflag(ssse3)
+ PRELOAD 10, pb_2to15_14_15, shuf
+%endif
+ SHIFT_RIGHTx2 m6, m1, m0, reg_shuf
+ LOWPASS 1, 6, 0
+ pavgw m0, m6
+%if ARCH_X86_64
+ pshufd m9, m6, q3333
+%endif
+%if cpuflag(avx)
+ UNSCRATCH 6, 8, rsp+0*mmsize
+%endif
+ DEFINE_ARGS dst, stride, cnt, stride16, stride17
+ mov stride16q, strideq
+ mov cntd, 8
+ shl stride16q, 4
+ lea stride17q, [stride16q+strideq]
+
+ ; FIXME m8 is unused for avx, so we could save one register here for win64
+.loop:
+%if notcpuflag(avx)
+ UNSCRATCH 6, 8, rsp+0*mmsize
+%endif
+ mova [dstq+strideq*0+ 0], m6
+ mova [dstq+strideq*0+16], m4
+ mova [dstq+strideq*0+32], m2
+ mova [dstq+strideq*0+48], m0
+ mova [dstq+strideq*1+ 0], m7
+ mova [dstq+strideq*1+16], m5
+ mova [dstq+strideq*1+32], m3
+ mova [dstq+strideq*1+48], m1
+ mova [dstq+stride16q+ 0], m4
+ mova [dstq+stride16q+16], m2
+ mova [dstq+stride16q+32], m0
+%if ARCH_X86_64
+ mova [dstq+stride16q+48], m9
+%endif
+ mova [dstq+stride17q+ 0], m5
+ mova [dstq+stride17q+16], m3
+ mova [dstq+stride17q+32], m1
+%if ARCH_X86_64
+ mova [dstq+stride17q+48], m9
+%endif
+ lea dstq, [dstq+strideq*2]
+%if cpuflag(avx)
+ vpalignr m6, m4, m6, 2
+ vpalignr m4, m2, m4, 2
+ vpalignr m2, m0, m2, 2
+ vpalignr m7, m5, m7, 2
+ vpalignr m5, m3, m5, 2
+ vpalignr m3, m1, m3, 2
+%else
+ SCRATCH 3, 8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+ SCRATCH 1, 10, rsp+1*mmsize
+%endif
+ PALIGNR m3, m4, m6, 2, m1
+ mova m6, m3
+ PALIGNR m3, m2, m4, 2, m1
+ mova m4, m3
+ PALIGNR m3, m0, m2, 2, m1
+ mova m2, m3
+ PALIGNR m3, m5, m7, 2, m1
+ mova m7, m3
+ UNSCRATCH 3, 8, rsp+0*mmsize
+ SCRATCH 6, 8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+ UNSCRATCH 1, 10, rsp+1*mmsize
+ SCRATCH 7, 10, rsp+1*mmsize
+%endif
+ PALIGNR m6, m3, m5, 2, m7
+ mova m5, m6
+ PALIGNR m6, m1, m3, 2, m7
+ mova m3, m6
+%if notcpuflag(ssse3)
+ UNSCRATCH 7, 10, rsp+1*mmsize
+%endif
+%endif
+ SHIFT_RIGHT m1, m1, reg_shuf
+ SHIFT_RIGHT m0, m0, reg_shuf
+ dec cntd
+ jg .loop
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+%assign %%n 0
+%rep 4
+ mova [dstq+strideq*0+48], m0
+ mova [dstq+strideq*1+48], m0
+ mova [dstq+strideq*2+48], m0
+ mova [dstq+stride3q +48], m0
+%if %%n < 3
+ lea dstq, [dstq+strideq*4]
+%endif
+%assign %%n (%%n+1)
+%endrep
+%endif
+ RET
+%endmacro
+
+INIT_XMM sse2
+VL_FUNCS 2
+INIT_XMM ssse3
+VL_FUNCS 1
+INIT_XMM avx
+VL_FUNCS 1
+
+%macro VR_FUNCS 0
+cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a
+ movu m0, [aq-2]
+ movhps m1, [lq]
+ PALIGNR m0, m1, 10, m2 ; xyz*abcd
+ pslldq m1, m0, 2 ; .xyz*abc
+ pslldq m2, m0, 4 ; ..xyz*ab
+ LOWPASS 2, 1, 0 ; ..YZ#ABC
+ pavgw m1, m0 ; ....#ABC
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ movhps [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m2
+ shufps m0, m2, m1, q3210
+%if cpuflag(ssse3)
+ pshufb m2, [pb_4_5_8to13_8x0]
+%else
+ pshuflw m2, m2, q2222
+ psrldq m2, 6
+%endif
+ psrldq m0, 6
+ movh [dstq+strideq*2], m0
+ movh [dstq+stride3q ], m2
+ RET
+
+cglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a
+ movu m1, [aq-2] ; *abcdefg
+ movu m2, [lq] ; stuvwxyz
+ mova m0, [aq] ; abcdefgh
+ PALIGNR m3, m1, m2, 14, m4 ; z*abcdef
+ LOWPASS 3, 1, 0
+ pavgw m0, m1
+ PALIGNR m1, m2, 2, m4 ; tuvwxyz*
+ pslldq m4, m2, 2 ; .stuvwxy
+ LOWPASS 4, 2, 1
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m3
+ PALIGNR m0, m4, 14, m1
+ pslldq m4, 2
+ PALIGNR m3, m4, 14, m1
+ pslldq m4, 2
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ PALIGNR m0, m4, 14, m1
+ pslldq m4, 2
+ PALIGNR m3, m4, 14, m1
+ pslldq m4, 2
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m3
+ PALIGNR m0, m4, 14, m1
+ pslldq m4, 2
+ PALIGNR m3, m4, 14, m4
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m3
+ RET
+
+cglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a
+ movu m1, [aq-2] ; *abcdefg
+ movu m2, [aq+mmsize-2] ; hijklmno
+ mova m3, [aq] ; abcdefgh
+ mova m4, [aq+mmsize] ; ijklmnop
+ mova m5, [lq+mmsize] ; stuvwxyz
+ PALIGNR m0, m1, m5, 14, m6 ; z*abcdef
+ movu m6, [aq+mmsize-4] ; ghijklmn
+ LOWPASS 6, 2, 4
+ pavgw m2, m4
+ LOWPASS 0, 1, 3
+ pavgw m3, m1
+ PALIGNR m1, m5, 2, m7 ; tuvwxyz*
+ movu m7, [lq+mmsize-2] ; rstuvwxy
+ LOWPASS 1, 5, 7
+ movu m5, [lq+2] ; lmnopqrs
+ pslldq m4, m5, 2 ; .lmnopqr
+ pslldq m7, m5, 4 ; ..lmnopq
+ LOWPASS 5, 4, 7
+ psrld m4, m1, 16
+ psrld m7, m5, 16
+ pand m1, [pd_65535]
+ pand m5, [pd_65535]
+ packssdw m7, m4
+ packssdw m5, m1
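+ ; m5/m7 now hold the even/odd words of the two filtered left vectors;
+ ; packssdw cannot saturate here since all values fit in 12 bits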
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 8
+
+.loop:
+ mova [dstq+strideq*0+ 0], m3
+ mova [dstq+strideq*0+16], m2
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m6
+ lea dstq, [dstq+strideq*2]
+ PALIGNR m2, m3, 14, m4
+ PALIGNR m3, m7, 14, m4
+ pslldq m7, 2
+ PALIGNR m6, m0, 14, m4
+ PALIGNR m0, m5, 14, m4
+ pslldq m5, 2
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a
+ movu m0, [aq+mmsize*0-2] ; *a[0-6]
+ movu m1, [aq+mmsize*1-2] ; a[7-14]
+ movu m2, [aq+mmsize*2-2] ; a[15-22]
+ movu m3, [aq+mmsize*3-2] ; a[23-30]
+ mova m4, [aq+mmsize*3+0] ; a[24-31]
+ movu m5, [aq+mmsize*3-4] ; a[22-29]
+ LOWPASS 5, 3, 4 ; A[23-30]
+ SCRATCH 5, 8, rsp+0*mmsize
+ pavgw m3, m4
+ mova m4, [aq+mmsize*2+0] ; a[16-23]
+ movu m6, [aq+mmsize*2-4] ; a[14-21]
+ LOWPASS 6, 2, 4 ; A[15-22]
+ SCRATCH 6, 9, rsp+1*mmsize
+ pavgw m2, m4
+ mova m4, [aq+mmsize*1+0] ; a[8-15]
+ movu m7, [aq+mmsize*1-4] ; a[6-13]
+ LOWPASS 7, 1, 4 ; A[7-14]
+ SCRATCH 7, 10, rsp+2*mmsize
+ pavgw m1, m4
+ mova m4, [aq+mmsize*0+0] ; a[0-7]
+ mova m5, [lq+mmsize*3+0] ; l[24-31]
+ PALIGNR m6, m0, m5, 14, m7 ; l[31]*a[0-5]
+ LOWPASS 6, 0, 4 ; #A[0-6]
+ SCRATCH 6, 11, rsp+3*mmsize
+ pavgw m4, m0
+ PALIGNR m0, m5, 2, m7 ; l[25-31]*
+ movu m7, [lq+mmsize*3-2] ; l[23-30]
+ LOWPASS 0, 5, 7 ; L[24-31]
+ movu m5, [lq+mmsize*2-2] ; l[15-22]
+ mova m7, [lq+mmsize*2+0] ; l[16-23]
+ movu m6, [lq+mmsize*2+2] ; l[17-24]
+ LOWPASS 5, 7, 6 ; L[16-23]
+ psrld m7, m0, 16
+ psrld m6, m5, 16
+ pand m0, [pd_65535]
+ pand m5, [pd_65535]
+ packssdw m6, m7
+ packssdw m5, m0
+ SCRATCH 5, 12, rsp+4*mmsize
+ SCRATCH 6, 13, rsp+5*mmsize
+ movu m6, [lq+mmsize*1-2] ; l[7-14]
+ mova m0, [lq+mmsize*1+0] ; l[8-15]
+ movu m5, [lq+mmsize*1+2] ; l[9-16]
+ LOWPASS 6, 0, 5 ; L[8-15]
+ movu m0, [lq+mmsize*0+2] ; l[1-8]
+ pslldq m5, m0, 2 ; .l[1-7]
+ pslldq m7, m0, 4 ; ..l[1-6]
+ LOWPASS 0, 5, 7
+ psrld m5, m6, 16
+ psrld m7, m0, 16
+ pand m6, [pd_65535]
+ pand m0, [pd_65535]
+ packssdw m7, m5
+ packssdw m0, m6
+ UNSCRATCH 6, 13, rsp+5*mmsize
+ DEFINE_ARGS dst, stride, stride16, cnt, stride17
+ mov stride16q, strideq
+ mov cntd, 8
+ shl stride16q, 4
+%if ARCH_X86_64
+ lea stride17q, [stride16q+strideq]
+%endif
+
+.loop:
+ mova [dstq+strideq*0+ 0], m4
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*0+32], m2
+ mova [dstq+strideq*0+48], m3
+%if ARCH_X86_64
+ mova [dstq+strideq*1+ 0], m11
+ mova [dstq+strideq*1+16], m10
+ mova [dstq+strideq*1+32], m9
+ mova [dstq+strideq*1+48], m8
+%endif
+ mova [dstq+stride16q+ 0], m6
+ mova [dstq+stride16q+16], m4
+ mova [dstq+stride16q+32], m1
+ mova [dstq+stride16q+48], m2
+%if ARCH_X86_64
+ mova [dstq+stride17q+ 0], m12
+ mova [dstq+stride17q+16], m11
+ mova [dstq+stride17q+32], m10
+ mova [dstq+stride17q+48], m9
+%endif
+ lea dstq, [dstq+strideq*2]
+ PALIGNR m3, m2, 14, m5
+ PALIGNR m2, m1, 14, m5
+ PALIGNR m1, m4, 14, m5
+ PALIGNR m4, m6, 14, m5
+ PALIGNR m6, m7, 14, m5
+ pslldq m7, 2
+%if ARCH_X86_64
+ PALIGNR m8, m9, 14, m5
+ PALIGNR m9, m10, 14, m5
+ PALIGNR m10, m11, 14, m5
+ PALIGNR m11, m12, 14, m5
+ PALIGNR m12, m0, 14, m5
+ pslldq m0, 2
+%endif
+ dec cntd
+ jg .loop
+
+%if ARCH_X86_32
+ UNSCRATCH 5, 12, rsp+4*mmsize
+ UNSCRATCH 4, 11, rsp+3*mmsize
+ UNSCRATCH 3, 10, rsp+2*mmsize
+ UNSCRATCH 2, 9, rsp+1*mmsize
+ UNSCRATCH 1, 8, rsp+0*mmsize
+ mov dstq, dstm
+ mov cntd, 8
+ add dstq, strideq
+.loop2:
+ mova [dstq+strideq*0+ 0], m4
+ mova [dstq+strideq*0+16], m3
+ mova [dstq+strideq*0+32], m2
+ mova [dstq+strideq*0+48], m1
+ mova [dstq+stride16q+ 0], m5
+ mova [dstq+stride16q+16], m4
+ mova [dstq+stride16q+32], m3
+ mova [dstq+stride16q+48], m2
+ lea dstq, [dstq+strideq*2]
+ PALIGNR m1, m2, 14, m6
+ PALIGNR m2, m3, 14, m6
+ PALIGNR m3, m4, 14, m6
+ PALIGNR m4, m5, 14, m6
+ PALIGNR m5, m0, 14, m6
+ pslldq m0, 2
+ dec cntd
+ jg .loop2
+%endif
+ RET
+%endmacro
+
+INIT_XMM sse2
+VR_FUNCS
+INIT_XMM ssse3
+VR_FUNCS
+INIT_XMM avx
+VR_FUNCS
+
+%macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function
+cglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a
+ movh m0, [lq] ; abcd
+%if cpuflag(ssse3)
+ pshufb m0, [pb_0to7_67x4] ; abcddddd
+%else
+ punpcklqdq m0, m0
+ pshufhw m0, m0, q3333 ; abcddddd
+%endif
+ psrldq m1, m0, 2 ; bcddddd.
+ psrldq m2, m0, 4 ; cddddd..
+ LOWPASS 2, 1, 0 ; BCDddd..
+ pavgw m1, m0 ; abcddddd
+ SBUTTERFLY wd, 1, 2, 0 ; aBbCcDdd, dddddddd
+ PALIGNR m2, m1, 4, m0 ; bCcDdddd
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ movh [dstq+strideq*0], m1 ; aBbC
+ movh [dstq+strideq*1], m2 ; bCcD
+ movhps [dstq+strideq*2], m1 ; cDdd
+ movhps [dstq+stride3q ], m2 ; dddd
+ RET
+
+cglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a
+ mova m0, [lq]
+%if cpuflag(ssse3)
+ mova m3, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m1, m2, m0, m3
+ LOWPASS 2, 1, 0
+ pavgw m1, m0
+ SBUTTERFLY wd, 1, 2, 0
+ shufps m0, m1, m2, q1032
+ pshufd m3, m2, q3332
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ mova [dstq+strideq *0], m1
+ mova [dstq+strideq *2], m0
+ mova [dstq+strideq *4], m2
+ mova [dstq+stride3q*2], m3
+ add dstq, strideq
+%if cpuflag(avx)
+ vpalignr m1, m2, m1, 4
+%else
+ PALIGNR m0, m2, m1, 4, m3
+ mova m1, m0
+%endif
+ pshufd m2, m2, q3321
+ shufps m0, m1, m2, q1032
+ pshufd m3, m2, q3332
+ mova [dstq+strideq *0], m1
+ mova [dstq+strideq *2], m0
+ mova [dstq+strideq *4], m2
+ mova [dstq+stride3q*2], m3
+ RET
+
+cglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a
+ mova m0, [lq]
+ mova m3, [lq+mmsize]
+ movu m1, [lq+2]
+ movu m2, [lq+4]
+ LOWPASS 2, 1, 0
+ pavgw m1, m0
+ SBUTTERFLY wd, 1, 2, 0
+%if cpuflag(ssse3)
+ mova m5, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m0, m4, m3, m5
+ LOWPASS 4, 0, 3
+ pavgw m3, m0
+ SBUTTERFLY wd, 3, 4, 5
+ pshufd m0, m0, q3333
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+
+.loop:
+ mova [dstq+strideq *0+ 0], m1
+ mova [dstq+strideq *0+16], m2
+ mova [dstq+strideq *4+ 0], m2
+ mova [dstq+strideq *4+16], m3
+ mova [dstq+strideq *8+ 0], m3
+ mova [dstq+strideq *8+16], m4
+ mova [dstq+stride3q*4+ 0], m4
+ mova [dstq+stride3q*4+16], m0
+ add dstq, strideq
+%if cpuflag(avx)
+ vpalignr m1, m2, m1, 4
+ vpalignr m2, m3, m2, 4
+ vpalignr m3, m4, m3, 4
+ vpalignr m4, m0, m4, 4
+%else
+ PALIGNR m5, m2, m1, 4, m6
+ mova m1, m5
+ PALIGNR m5, m3, m2, 4, m6
+ mova m2, m5
+ PALIGNR m5, m4, m3, 4, m6
+ mova m3, m5
+ PALIGNR m5, m0, m4, 4, m6
+ mova m4, m5
+%endif
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \
+ %1 * -mmsize * ARCH_X86_32, dst, stride, l, a
+ mova m2, [lq+mmsize*0+0]
+ movu m1, [lq+mmsize*0+2]
+ movu m0, [lq+mmsize*0+4]
+ LOWPASS 0, 1, 2
+ pavgw m1, m2
+ SBUTTERFLY wd, 1, 0, 2
+ SCRATCH 1, 8, rsp+0*mmsize
+ mova m4, [lq+mmsize*1+0]
+ movu m3, [lq+mmsize*1+2]
+ movu m2, [lq+mmsize*1+4]
+ LOWPASS 2, 3, 4
+ pavgw m3, m4
+ SBUTTERFLY wd, 3, 2, 4
+ mova m6, [lq+mmsize*2+0]
+ movu m5, [lq+mmsize*2+2]
+ movu m4, [lq+mmsize*2+4]
+ LOWPASS 4, 5, 6
+ pavgw m5, m6
+ SBUTTERFLY wd, 5, 4, 6
+ mova m7, [lq+mmsize*3+0]
+ SCRATCH 0, 9, rsp+1*mmsize
+%if cpuflag(ssse3)
+ mova m0, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m1, m6, m7, m0
+ LOWPASS 6, 1, 7
+ pavgw m7, m1
+ SBUTTERFLY wd, 7, 6, 0
+ pshufd m1, m1, q3333
+ UNSCRATCH 0, 9, rsp+1*mmsize
+ DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
+ lea stride3q, [strideq*3]
+ lea stride4q, [strideq*4]
+ lea stride28q, [stride4q*8]
+ lea stride20q, [stride4q*5]
+ sub stride28q, stride4q
+ mov cntd, 4
+
+.loop:
+%if ARCH_X86_64
+ SWAP 1, 8
+%else
+ mova [rsp+1*mmsize], m1
+ mova m1, [rsp+0*mmsize]
+%endif
+ mova [dstq+strideq *0+ 0], m1
+ mova [dstq+strideq *0+16], m0
+ mova [dstq+strideq *0+32], m3
+ mova [dstq+strideq *0+48], m2
+ mova [dstq+stride4q*1+ 0], m0
+ mova [dstq+stride4q*1+16], m3
+ mova [dstq+stride4q*1+32], m2
+ mova [dstq+stride4q*1+48], m5
+ mova [dstq+stride4q*2+ 0], m3
+ mova [dstq+stride4q*2+16], m2
+ mova [dstq+stride4q*2+32], m5
+ mova [dstq+stride4q*2+48], m4
+%if cpuflag(avx)
+ vpalignr m1, m0, m1, 4
+ vpalignr m0, m3, m0, 4
+ vpalignr m3, m2, m3, 4
+%else
+ SCRATCH 6, 9, rsp+2*mmsize
+%if notcpuflag(ssse3)
+ SCRATCH 7, 10, rsp+3*mmsize
+%endif
+ PALIGNR m6, m0, m1, 4, m7
+ mova m1, m6
+ PALIGNR m6, m3, m0, 4, m7
+ mova m0, m6
+ PALIGNR m6, m2, m3, 4, m7
+ mova m3, m6
+ UNSCRATCH 6, 9, rsp+2*mmsize
+ SCRATCH 0, 9, rsp+2*mmsize
+%if notcpuflag(ssse3)
+ UNSCRATCH 7, 10, rsp+3*mmsize
+ SCRATCH 3, 10, rsp+3*mmsize
+%endif
+%endif
+%if ARCH_X86_64
+ SWAP 1, 8
+%else
+ mova [rsp+0*mmsize], m1
+ mova m1, [rsp+1*mmsize]
+%endif
+ mova [dstq+stride3q*4+ 0], m2
+ mova [dstq+stride3q*4+16], m5
+ mova [dstq+stride3q*4+32], m4
+ mova [dstq+stride3q*4+48], m7
+ mova [dstq+stride4q*4+ 0], m5
+ mova [dstq+stride4q*4+16], m4
+ mova [dstq+stride4q*4+32], m7
+ mova [dstq+stride4q*4+48], m6
+ mova [dstq+stride20q + 0], m4
+ mova [dstq+stride20q +16], m7
+ mova [dstq+stride20q +32], m6
+ mova [dstq+stride20q +48], m1
+ mova [dstq+stride3q*8+ 0], m7
+ mova [dstq+stride3q*8+16], m6
+ mova [dstq+stride3q*8+32], m1
+ mova [dstq+stride3q*8+48], m1
+ mova [dstq+stride28q + 0], m6
+ mova [dstq+stride28q +16], m1
+ mova [dstq+stride28q +32], m1
+ mova [dstq+stride28q +48], m1
+%if cpuflag(avx)
+ vpalignr m2, m5, m2, 4
+ vpalignr m5, m4, m5, 4
+ vpalignr m4, m7, m4, 4
+ vpalignr m7, m6, m7, 4
+ vpalignr m6, m1, m6, 4
+%else
+ PALIGNR m0, m5, m2, 4, m3
+ mova m2, m0
+ PALIGNR m0, m4, m5, 4, m3
+ mova m5, m0
+ PALIGNR m0, m7, m4, 4, m3
+ mova m4, m0
+ PALIGNR m0, m6, m7, 4, m3
+ mova m7, m0
+ PALIGNR m0, m1, m6, 4, m3
+ mova m6, m0
+ UNSCRATCH 0, 9, rsp+2*mmsize
+%if notcpuflag(ssse3)
+ UNSCRATCH 3, 10, rsp+3*mmsize
+%endif
+%endif
+ add dstq, strideq
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+HU_FUNCS 4
+INIT_XMM ssse3
+HU_FUNCS 3
+INIT_XMM avx
+HU_FUNCS 2
+
+%macro HD_FUNCS 0
+cglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a
+ movh m0, [lq]
+ movhps m0, [aq-2]
+ psrldq m1, m0, 2
+ psrldq m2, m0, 4
+ LOWPASS 2, 1, 0
+ pavgw m1, m0
+ punpcklwd m1, m2
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ movh [dstq+stride3q ], m1
+ movhps [dstq+strideq*1], m1
+ movhlps m2, m2
+ PALIGNR m2, m1, 4, m0
+ movh [dstq+strideq*2], m2
+ movhps [dstq+strideq*0], m2
+ RET
+
+cglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, stride, l, a
+ mova m0, [lq]
+ movu m1, [aq-2]
+ PALIGNR m2, m1, m0, 2, m3
+ PALIGNR m3, m1, m0, 4, m4
+ LOWPASS 3, 2, 0
+ pavgw m2, m0
+ SBUTTERFLY wd, 2, 3, 0
+ psrldq m0, m1, 2
+ psrldq m4, m1, 4
+ LOWPASS 1, 0, 4
+ DEFINE_ARGS dst8, mstride, cnt
+ lea dst8q, [dst8q+mstrideq*8]
+ neg mstrideq
+ mov cntd, 4
+
+.loop:
+ add dst8q, mstrideq
+ mova [dst8q+mstrideq*0], m2
+ mova [dst8q+mstrideq*4], m3
+%if cpuflag(avx)
+ vpalignr m2, m3, m2, 4
+ vpalignr m3, m1, m3, 4
+%else
+ PALIGNR m0, m3, m2, 4, m4
+ mova m2, m0
+ PALIGNR m0, m1, m3, 4, m4
+ mova m3, m0
+%endif
+ psrldq m1, 4
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a
+ mova m2, [lq]
+ movu m1, [lq+2]
+ movu m0, [lq+4]
+ LOWPASS 0, 1, 2
+ pavgw m1, m2
+ mova m4, [lq+mmsize]
+ movu m5, [aq-2]
+ PALIGNR m3, m5, m4, 2, m6
+ PALIGNR m2, m5, m4, 4, m6
+ LOWPASS 2, 3, 4
+ pavgw m3, m4
+ SBUTTERFLY wd, 1, 0, 4
+ SBUTTERFLY wd, 3, 2, 4
+ mova m6, [aq]
+ movu m4, [aq+2]
+ LOWPASS 4, 6, 5
+ movu m5, [aq+mmsize-2]
+ psrldq m6, m5, 2
+ psrldq m7, m5, 4
+ LOWPASS 5, 6, 7
+ DEFINE_ARGS dst, mstride, mstride3, cnt
+ lea dstq, [dstq+mstrideq*8]
+ lea dstq, [dstq+mstrideq*8]
+ neg mstrideq
+ lea mstride3q, [mstrideq*3]
+ mov cntd, 4
+
+.loop:
+ add dstq, mstrideq
+ mova [dstq+mstride3q*4+ 0], m2
+ mova [dstq+mstride3q*4+16], m4
+ mova [dstq+mstrideq *8+ 0], m3
+ mova [dstq+mstrideq *8+16], m2
+ mova [dstq+mstrideq *4+ 0], m0
+ mova [dstq+mstrideq *4+16], m3
+ mova [dstq+mstrideq *0+ 0], m1
+ mova [dstq+mstrideq *0+16], m0
+%if cpuflag(avx)
+ vpalignr m1, m0, m1, 4
+ vpalignr m0, m3, m0, 4
+ vpalignr m3, m2, m3, 4
+ vpalignr m2, m4, m2, 4
+ vpalignr m4, m5, m4, 4
+%else
+ PALIGNR m6, m0, m1, 4, m7
+ mova m1, m6
+ PALIGNR m6, m3, m0, 4, m7
+ mova m0, m6
+ PALIGNR m6, m2, m3, 4, m7
+ mova m3, m6
+ PALIGNR m6, m4, m2, 4, m7
+ mova m2, m6
+ PALIGNR m6, m5, m4, 4, m7
+ mova m4, m6
+%endif
+ psrldq m5, 4
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \
+ 10 * -mmsize * ARCH_X86_32, dst, stride, l, a
+ mova m2, [lq+mmsize*0+0]
+ movu m1, [lq+mmsize*0+2]
+ movu m0, [lq+mmsize*0+4]
+ LOWPASS 0, 1, 2
+ pavgw m1, m2
+ SBUTTERFLY wd, 1, 0, 2
+ mova m4, [lq+mmsize*1+0]
+ movu m3, [lq+mmsize*1+2]
+ movu m2, [lq+mmsize*1+4]
+ LOWPASS 2, 3, 4
+ pavgw m3, m4
+ SBUTTERFLY wd, 3, 2, 4
+ SCRATCH 0, 8, rsp+0*mmsize
+ SCRATCH 1, 9, rsp+1*mmsize
+ SCRATCH 2, 10, rsp+2*mmsize
+ SCRATCH 3, 11, rsp+3*mmsize
+ mova m6, [lq+mmsize*2+0]
+ movu m5, [lq+mmsize*2+2]
+ movu m4, [lq+mmsize*2+4]
+ LOWPASS 4, 5, 6
+ pavgw m5, m6
+ SBUTTERFLY wd, 5, 4, 6
+ mova m0, [lq+mmsize*3+0]
+ movu m1, [aq+mmsize*0-2]
+ PALIGNR m7, m1, m0, 2, m2
+ PALIGNR m6, m1, m0, 4, m2
+ LOWPASS 6, 7, 0
+ pavgw m7, m0
+ SBUTTERFLY wd, 7, 6, 0
+ mova m2, [aq+mmsize*0+0]
+ movu m0, [aq+mmsize*0+2]
+ LOWPASS 0, 2, 1
+ movu m1, [aq+mmsize*1-2]
+ mova m2, [aq+mmsize*1+0]
+ movu m3, [aq+mmsize*1+2]
+ LOWPASS 1, 2, 3
+ SCRATCH 6, 12, rsp+6*mmsize
+ SCRATCH 7, 13, rsp+7*mmsize
+ movu m2, [aq+mmsize*2-2]
+ mova m3, [aq+mmsize*2+0]
+ movu m6, [aq+mmsize*2+2]
+ LOWPASS 2, 3, 6
+ movu m3, [aq+mmsize*3-2]
+ psrldq m6, m3, 2
+ psrldq m7, m3, 4
+ LOWPASS 3, 6, 7
+ UNSCRATCH 6, 12, rsp+6*mmsize
+ UNSCRATCH 7, 13, rsp+7*mmsize
+%if ARCH_X86_32
+ mova [rsp+4*mmsize], m4
+ mova [rsp+5*mmsize], m5
+ ; we already backed up m6/m7 earlier on x86-32 in SCRATCH, so we don't need
+ ; to do it again here
+%endif
+ DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
+ mov cntd, 4
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ lea stride4q, [strideq*4]
+ lea stride28q, [stride4q*8]
+ lea stride20q, [stride4q*5]
+ sub stride28q, stride4q
+%endif
+ add dstq, stride3q
+
+ ; x86-32 doesn't have enough registers, so on that platform, we split
+ ; the loop in 2... Otherwise you spend most of the loop (un)scratching
+.loop:
+%if ARCH_X86_64
+ mova [dstq+stride28q + 0], m9
+ mova [dstq+stride28q +16], m8
+ mova [dstq+stride28q +32], m11
+ mova [dstq+stride28q +48], m10
+ mova [dstq+stride3q*8+ 0], m8
+ mova [dstq+stride3q*8+16], m11
+ mova [dstq+stride3q*8+32], m10
+ mova [dstq+stride3q*8+48], m5
+ mova [dstq+stride20q + 0], m11
+ mova [dstq+stride20q +16], m10
+ mova [dstq+stride20q +32], m5
+ mova [dstq+stride20q +48], m4
+ mova [dstq+stride4q*4+ 0], m10
+ mova [dstq+stride4q*4+16], m5
+ mova [dstq+stride4q*4+32], m4
+ mova [dstq+stride4q*4+48], m7
+%endif
+ mova [dstq+stride3q*4+ 0], m5
+ mova [dstq+stride3q*4+16], m4
+ mova [dstq+stride3q*4+32], m7
+ mova [dstq+stride3q*4+48], m6
+ mova [dstq+strideq* 8+ 0], m4
+ mova [dstq+strideq* 8+16], m7
+ mova [dstq+strideq* 8+32], m6
+ mova [dstq+strideq* 8+48], m0
+ mova [dstq+strideq* 4+ 0], m7
+ mova [dstq+strideq* 4+16], m6
+ mova [dstq+strideq* 4+32], m0
+ mova [dstq+strideq* 4+48], m1
+ mova [dstq+strideq* 0+ 0], m6
+ mova [dstq+strideq* 0+16], m0
+ mova [dstq+strideq* 0+32], m1
+ mova [dstq+strideq* 0+48], m2
+ sub dstq, strideq
+%if cpuflag(avx)
+%if ARCH_X86_64
+ vpalignr m9, m8, m9, 4
+ vpalignr m8, m11, m8, 4
+ vpalignr m11, m10, m11, 4
+ vpalignr m10, m5, m10, 4
+%endif
+ vpalignr m5, m4, m5, 4
+ vpalignr m4, m7, m4, 4
+ vpalignr m7, m6, m7, 4
+ vpalignr m6, m0, m6, 4
+ vpalignr m0, m1, m0, 4
+ vpalignr m1, m2, m1, 4
+ vpalignr m2, m3, m2, 4
+%else
+%if ARCH_X86_64
+ PALIGNR m12, m8, m9, 4, m13
+ mova m9, m12
+ PALIGNR m12, m11, m8, 4, m13
+ mova m8, m12
+ PALIGNR m12, m10, m11, 4, m13
+ mova m11, m12
+ PALIGNR m12, m5, m10, 4, m13
+ mova m10, m12
+%endif
+ SCRATCH 3, 12, rsp+8*mmsize, sh
+%if notcpuflag(ssse3)
+ SCRATCH 2, 13, rsp+9*mmsize
+%endif
+ PALIGNR m3, m4, m5, 4, m2
+ mova m5, m3
+ PALIGNR m3, m7, m4, 4, m2
+ mova m4, m3
+ PALIGNR m3, m6, m7, 4, m2
+ mova m7, m3
+ PALIGNR m3, m0, m6, 4, m2
+ mova m6, m3
+ PALIGNR m3, m1, m0, 4, m2
+ mova m0, m3
+%if notcpuflag(ssse3)
+ UNSCRATCH 2, 13, rsp+9*mmsize
+ SCRATCH 0, 13, rsp+9*mmsize
+%endif
+ PALIGNR m3, m2, m1, 4, m0
+ mova m1, m3
+ PALIGNR m3, reg_sh, m2, 4, m0
+ mova m2, m3
+%if notcpuflag(ssse3)
+ UNSCRATCH 0, 13, rsp+9*mmsize
+%endif
+ UNSCRATCH 3, 12, rsp+8*mmsize, sh
+%endif
+ psrldq m3, 4
+ dec cntd
+ jg .loop
+
+%if ARCH_X86_32
+ UNSCRATCH 0, 8, rsp+0*mmsize
+ UNSCRATCH 1, 9, rsp+1*mmsize
+ UNSCRATCH 2, 10, rsp+2*mmsize
+ UNSCRATCH 3, 11, rsp+3*mmsize
+ mova m4, [rsp+4*mmsize]
+ mova m5, [rsp+5*mmsize]
+ mova m6, [rsp+6*mmsize]
+ mova m7, [rsp+7*mmsize]
+ DEFINE_ARGS dst, stride, stride5, stride3
+ lea stride5q, [strideq*5]
+ lea dstq, [dstq+stride5q*4]
+ DEFINE_ARGS dst, stride, cnt, stride3
+ mov cntd, 4
+.loop_2:
+ mova [dstq+stride3q*4+ 0], m1
+ mova [dstq+stride3q*4+16], m0
+ mova [dstq+stride3q*4+32], m3
+ mova [dstq+stride3q*4+48], m2
+ mova [dstq+strideq* 8+ 0], m0
+ mova [dstq+strideq* 8+16], m3
+ mova [dstq+strideq* 8+32], m2
+ mova [dstq+strideq* 8+48], m5
+ mova [dstq+strideq* 4+ 0], m3
+ mova [dstq+strideq* 4+16], m2
+ mova [dstq+strideq* 4+32], m5
+ mova [dstq+strideq* 4+48], m4
+ mova [dstq+strideq* 0+ 0], m2
+ mova [dstq+strideq* 0+16], m5
+ mova [dstq+strideq* 0+32], m4
+ mova [dstq+strideq* 0+48], m7
+ sub dstq, strideq
+%if cpuflag(avx)
+ vpalignr m1, m0, m1, 4
+ vpalignr m0, m3, m0, 4
+ vpalignr m3, m2, m3, 4
+ vpalignr m2, m5, m2, 4
+ vpalignr m5, m4, m5, 4
+ vpalignr m4, m7, m4, 4
+ vpalignr m7, m6, m7, 4
+%else
+ SCRATCH 6, 12, rsp+8*mmsize, sh
+%if notcpuflag(ssse3)
+ SCRATCH 7, 13, rsp+9*mmsize
+%endif
+ PALIGNR m6, m0, m1, 4, m7
+ mova m1, m6
+ PALIGNR m6, m3, m0, 4, m7
+ mova m0, m6
+ PALIGNR m6, m2, m3, 4, m7
+ mova m3, m6
+ PALIGNR m6, m5, m2, 4, m7
+ mova m2, m6
+ PALIGNR m6, m4, m5, 4, m7
+ mova m5, m6
+%if notcpuflag(ssse3)
+ UNSCRATCH 7, 13, rsp+9*mmsize
+ SCRATCH 5, 13, rsp+9*mmsize
+%endif
+ PALIGNR m6, m7, m4, 4, m5
+ mova m4, m6
+ PALIGNR m6, reg_sh, m7, 4, m5
+ mova m7, m6
+%if notcpuflag(ssse3)
+ UNSCRATCH 5, 13, rsp+9*mmsize
+%endif
+ UNSCRATCH 6, 12, rsp+8*mmsize, sh
+%endif
+ psrldq m6, 4
+ dec cntd
+ jg .loop_2
+%endif
+ RET
+%endmacro
+
+INIT_XMM sse2
+HD_FUNCS
+INIT_XMM ssse3
+HD_FUNCS
+INIT_XMM avx
+HD_FUNCS
diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
new file mode 100644
index 0000000000..2c63fe514a
--- /dev/null
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -0,0 +1,3197 @@
+;******************************************************************************
+;* VP9 IDCT SIMD optimizations
+;*
+;* Copyright (C) 2013 Clément Bœsch <u pkh me>
+;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+%include "vp9itxfm_template.asm"
+
+SECTION_RODATA 32
+
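+; emit the paired word constants (pw_m<a>_<b>, pw_<b>_<a> and, with %3==1,
+; additional sign/order combinations) used by the pmaddwd butterflies below,
+; plus doubled single constants for pmulhrsw, which shifts right by 15 and
+; therefore needs 2*coeff to give an effective /2^14 scale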
+%macro VP9_IDCT_COEFFS 2-3 0
+const pw_m%1_%2
+times 8 dw -%1, %2
+const pw_%2_%1
+times 8 dw %2, %1
+
+%if %3 == 1
+const pw_m%2_m%1
+times 8 dw -%2, -%1
+%if %1 != %2
+const pw_m%2_%1
+times 8 dw -%2, %1
+const pw_%1_%2
+times 8 dw %1, %2
+%endif
+%endif
+
+%if %1 < 11585
+pw_m%1x2: times 16 dw -%1*2
+%elif %1 > 11585
+pw_%1x2: times 16 dw %1*2
+%else
+const pw_%1x2
+times 16 dw %1*2
+%endif
+
+%if %2 != %1
+pw_%2x2: times 16 dw %2*2
+%endif
+%endmacro
+
+VP9_IDCT_COEFFS 16364, 804
+VP9_IDCT_COEFFS 16305, 1606
+VP9_IDCT_COEFFS 16069, 3196, 1
+VP9_IDCT_COEFFS 15893, 3981
+VP9_IDCT_COEFFS 15137, 6270, 1
+VP9_IDCT_COEFFS 14811, 7005
+VP9_IDCT_COEFFS 14449, 7723
+VP9_IDCT_COEFFS 13160, 9760
+VP9_IDCT_COEFFS 11585, 11585, 1
+VP9_IDCT_COEFFS 11003, 12140
+VP9_IDCT_COEFFS 10394, 12665
+VP9_IDCT_COEFFS 9102, 13623, 1
+VP9_IDCT_COEFFS 8423, 14053
+VP9_IDCT_COEFFS 5520, 15426
+VP9_IDCT_COEFFS 4756, 15679
+VP9_IDCT_COEFFS 2404, 16207
+
+const pw_5283_13377
+times 4 dw 5283, 13377
+const pw_9929_13377
+times 4 dw 9929, 13377
+const pw_15212_m13377
+times 4 dw 15212, -13377
+const pw_15212_9929
+times 4 dw 15212, 9929
+const pw_m5283_m15212
+times 4 dw -5283, -15212
+const pw_13377x2
+times 8 dw 13377*2
+const pw_m13377_13377
+times 4 dw -13377, 13377
+const pw_13377_0
+times 4 dw 13377, 0
+
+cextern pw_8
+cextern pw_16
+cextern pw_32
+cextern pw_512
+cextern pw_1024
+cextern pw_2048
+cextern pw_m1
+cextern pd_8192
+
+SECTION .text
+
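+; 32-bit butterfly rotation: interleave src1/src2 and pmaddwd with the paired
+; constants so that dst1/dst3 hold the low/high dwords of src1*mul2-src2*mul1
+; and dst2/dst4 the low/high dwords of src1*mul1+src2*mul2; rounding and
+; packing back to words is left to VP9_RND_SH_SUMSUB_BA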
+%macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2
+ punpckhwd m%4, m%2, m%1
+ punpcklwd m%2, m%1
+ pmaddwd m%3, m%4, [pw_m%5_%6]
+ pmaddwd m%4, [pw_%6_%5]
+ pmaddwd m%1, m%2, [pw_m%5_%6]
+ pmaddwd m%2, [pw_%6_%5]
+%endmacro
+
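+; sum/difference of the dword butterfly halves from above, then add the
+; rounding constant in %6, shift right by 14 and pack the results back to words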
+%macro VP9_RND_SH_SUMSUB_BA 6 ; dst1 [src1], dst2 [src2], src3, src4, tmp, round
+ SUMSUB_BA d, %1, %2, %5
+ SUMSUB_BA d, %3, %4, %5
+ paddd m%1, %6
+ paddd m%2, %6
+ paddd m%3, %6
+ paddd m%4, %6
+ psrad m%1, 14
+ psrad m%2, 14
+ psrad m%3, 14
+ psrad m%4, 14
+ packssdw m%1, m%3
+ packssdw m%2, m%4
+%endmacro
+
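+; add two rows of word residuals (reg1/reg2) to the destination pixels at
+; dst and dst+strideq, clipping with unsigned saturation on the way back to bytes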
+%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst
+%if mmsize == 32
+ pmovzxbw m%3, [%6]
+ pmovzxbw m%4, [%6+strideq]
+%else
+ movh m%3, [%6]
+ movh m%4, [%6+strideq]
+ punpcklbw m%3, m%5
+ punpcklbw m%4, m%5
+%endif
+ paddw m%3, m%1
+ paddw m%4, m%2
+%if mmsize == 32
+ packuswb m%3, m%4
+ ; packuswb on AVX2 packs within each 128-bit lane, interleaving the two
+ ; rows, so restore row order with vpermq
+ vpermq m%3, m%3, q3120
+ mova [%6], xm%3
+ vextracti128 [%6+strideq], m%3, 1
+%elif mmsize == 16
+ packuswb m%3, m%4
+ movh [%6], m%3
+ movhps [%6+strideq], m%3
+%else
+ packuswb m%3, m%5
+ packuswb m%4, m%5
+ movh [%6], m%3
+ movh [%6+strideq], m%4
+%endif
+%endmacro
+
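+; clear an nnzcpl x nnzcpl block of int16 coefficients (rows %2 bytes apart)
+; using the supplied zero register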
+%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
+%assign %%y 0
+%rep %3
+%assign %%x 0
+%rep %3*2/mmsize
+ mova [%1+%%y+%%x], %4
+%assign %%x (%%x+mmsize)
+%endrep
+%assign %%y (%%y+%2)
+%endrep
+%endmacro
+
+;-------------------------------------------------------------------------------------------
+; void vp9_iwht_iwht_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+INIT_MMX mmx
+cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob
+ mova m0, [blockq+0*8]
+ mova m1, [blockq+1*8]
+ mova m2, [blockq+2*8]
+ mova m3, [blockq+3*8]
+ psraw m0, 2
+ psraw m1, 2
+ psraw m2, 2
+ psraw m3, 2
+
+ VP9_IWHT4_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_IWHT4_1D
+
+ pxor m4, m4
+ VP9_STORE_2X 0, 1, 5, 6, 4
+ lea dstq, [dstq+strideq*2]
+ VP9_STORE_2X 2, 3, 5, 6, 4
+ ZERO_BLOCK blockq, 8, 4, m4
+ RET
+
+;-------------------------------------------------------------------------------------------
+; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+; 2x2 top left corner
+%macro VP9_IDCT4_2x2_1D 0
+ pmulhrsw m0, m5 ; m0=t1
+ mova m2, m0 ; m2=t0
+ mova m3, m1
+ pmulhrsw m1, m6 ; m1=t2
+ pmulhrsw m3, m7 ; m3=t3
+ VP9_IDCT4_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT4_WRITEOUT 0
+%if cpuflag(ssse3)
+ mova m5, [pw_2048]
+ pmulhrsw m0, m5 ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+ pmulhrsw m1, m5
+%else
+ mova m5, [pw_8]
+ paddw m0, m5
+ paddw m1, m5
+ psraw m0, 4
+ psraw m1, 4
+%endif
+ VP9_STORE_2X 0, 1, 6, 7, 4
+ lea dstq, [dstq+2*strideq]
+%if cpuflag(ssse3)
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+%else
+ paddw m2, m5
+ paddw m3, m5
+ psraw m2, 4
+ psraw m3, 4
+%endif
+ VP9_STORE_2X 2, 3, 6, 7, 4
+%endmacro
+
+%macro IDCT_4x4_FN 1
+INIT_MMX %1
+cglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob
+
+%if cpuflag(ssse3)
+ cmp eobd, 4 ; 2x2 or smaller
+ jg .idctfull
+
+ cmp eobd, 1 ; faster path for when only DC is set
+ jne .idct2x2
+%else
+ cmp eobd, 1
+ jg .idctfull
+%endif
+
+%if cpuflag(ssse3)
+ movd m0, [blockq]
+ mova m5, [pw_11585x2]
+ pmulhrsw m0, m5
+ pmulhrsw m0, m5
+%else
+ DEFINE_ARGS dst, stride, block, coef
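+ ; scalar equivalent of the ssse3 path above: two rounded multiplies by
+ ; 11585/2^14, with the final (x+8)>>4 output rounding folded into the
+ ; second shift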
+ movsx coefd, word [blockq]
+ imul coefd, 11585
+ add coefd, 8192
+ sar coefd, 14
+ imul coefd, 11585
+ add coefd, (8 << 14) + 8192
+ sar coefd, 14 + 4
+ movd m0, coefd
+%endif
+ pshufw m0, m0, 0
+ pxor m4, m4
+ movh [blockq], m4
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+%endif
+ VP9_STORE_2X 0, 0, 6, 7, 4
+ lea dstq, [dstq+2*strideq]
+ VP9_STORE_2X 0, 0, 6, 7, 4
+ RET
+
+%if cpuflag(ssse3)
+; faster path for when only top left 2x2 block is set
+.idct2x2:
+ movd m0, [blockq+0]
+ movd m1, [blockq+8]
+ mova m5, [pw_11585x2]
+ mova m6, [pw_6270x2]
+ mova m7, [pw_15137x2]
+ VP9_IDCT4_2x2_1D
+ ; partial 2x4 transpose
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ SBUTTERFLY dq, 0, 2, 1
+ SWAP 1, 2
+ VP9_IDCT4_2x2_1D
+ pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
+ movh [blockq+ 0], m4
+ movh [blockq+ 8], m4
+ VP9_IDCT4_WRITEOUT
+ RET
+%endif
+
+.idctfull: ; generic full 4x4 idct/idct
+ mova m0, [blockq+ 0]
+ mova m1, [blockq+ 8]
+ mova m2, [blockq+16]
+ mova m3, [blockq+24]
+%if cpuflag(ssse3)
+ mova m6, [pw_11585x2]
+%endif
+ mova m7, [pd_8192] ; rounding
+ VP9_IDCT4_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_IDCT4_1D
+ pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
+ mova [blockq+ 0], m4
+ mova [blockq+ 8], m4
+ mova [blockq+16], m4
+ mova [blockq+24], m4
+ VP9_IDCT4_WRITEOUT
+ RET
+%endmacro
+
+IDCT_4x4_FN mmxext
+IDCT_4x4_FN ssse3
+
+;-------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro IADST4_FN 5
+INIT_MMX %5
+cglobal vp9_%1_%3_4x4_add, 3, 3, 0, dst, stride, block, eob
+%if WIN64 && notcpuflag(ssse3)
+ WIN64_SPILL_XMM 8
+%endif
+ movdqa xmm5, [pd_8192]
+ mova m0, [blockq+ 0]
+ mova m1, [blockq+ 8]
+ mova m2, [blockq+16]
+ mova m3, [blockq+24]
+%if cpuflag(ssse3)
+ mova m6, [pw_11585x2]
+%endif
+%ifnidn %1%3, iadstiadst
+ movdq2q m7, xmm5
+%endif
+ VP9_%2_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_%4_1D
+ pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
+ mova [blockq+ 0], m4
+ mova [blockq+ 8], m4
+ mova [blockq+16], m4
+ mova [blockq+24], m4
+ VP9_IDCT4_WRITEOUT
+ RET
+%endmacro
+
+IADST4_FN idct, IDCT4, iadst, IADST4, sse2
+IADST4_FN iadst, IADST4, idct, IDCT4, sse2
+IADST4_FN iadst, IADST4, iadst, IADST4, sse2
+
+IADST4_FN idct, IDCT4, iadst, IADST4, ssse3
+IADST4_FN iadst, IADST4, idct, IDCT4, ssse3
+IADST4_FN iadst, IADST4, iadst, IADST4, ssse3
+
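+; on x86-64, SCRATCH/UNSCRATCH just SWAP the value into/out of one of the high
+; xmm registers; on x86-32, which only has 8 xmm registers, they spill to /
+; reload from the given memory slot instead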
+%macro SCRATCH 3
+%if ARCH_X86_64
+ SWAP %1, %2
+%else
+ mova [%3], m%1
+%endif
+%endmacro
+
+%macro UNSCRATCH 3
+%if ARCH_X86_64
+ SWAP %1, %2
+%else
+ mova m%1, [%3]
+%endif
+%endmacro
+
+;-------------------------------------------------------------------------------------------
+; void vp9_idct_idct_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro VP9_IDCT8_1D_FINALIZE 0
+ SUMSUB_BA w, 3, 6, 5 ; m3=t0+t7, m6=t0-t7
+ SUMSUB_BA w, 1, 2, 5 ; m1=t1+t6, m2=t1-t6
+ SUMSUB_BA w, 7, 0, 5 ; m7=t2+t5, m0=t2-t5
+
+ UNSCRATCH 5, 8, blockq+ 0
+ SCRATCH 2, 8, blockq+ 0
+
+ SUMSUB_BA w, 5, 4, 2 ; m5=t3+t4, m4=t3-t4
+ SWAP 7, 6, 2
+ SWAP 3, 5, 0
+
+%if ARCH_X86_64
+ SWAP 6, 8
+%endif
+%endmacro
+
+; x86-32:
+; - in: m0/m4 are in mem
+; - out: m6 is in mem
+; x86-64:
+; - everything is in registers (m0-7)
+%macro VP9_IDCT8_1D 0
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 4, 9
+%endif
+
+ VP9_UNPACK_MULSUB_2W_4X 5, 3, 9102, 13623, D_8192_REG, 0, 4 ; m5=t5a, m3=t6a
+ VP9_UNPACK_MULSUB_2W_4X 1, 7, 16069, 3196, D_8192_REG, 0, 4 ; m1=t4a, m7=t7a
+ SUMSUB_BA w, 5, 1, 0 ; m5=t4a+t5a (t4), m1=t4a-t5a (t5a)
+ SUMSUB_BA w, 3, 7, 0 ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a)
+%if cpuflag(ssse3)
+ SUMSUB_BA w, 1, 7, 0 ; m1=t6a+t5a (t6), m7=t6a-t5a (t5)
+ pmulhrsw m1, W_11585x2_REG ; m1=t6
+ pmulhrsw m7, W_11585x2_REG ; m7=t5
+%else
+ VP9_UNPACK_MULSUB_2W_4X 7, 1, 11585, 11585, D_8192_REG, 0, 4
+%endif
+ VP9_UNPACK_MULSUB_2W_4X 2, 6, 15137, 6270, D_8192_REG, 0, 4 ; m2=t2a, m6=t3a
+
+ UNSCRATCH 0, 8, blockq+ 0 ; IN(0)
+ UNSCRATCH 4, 9, blockq+64 ; IN(4)
+ SCRATCH 5, 8, blockq+ 0
+
+%if cpuflag(ssse3)
+ SUMSUB_BA w, 4, 0, 5 ; m4=IN(0)+IN(4) m0=IN(0)-IN(4)
+ pmulhrsw m4, W_11585x2_REG ; m4=t0a
+ pmulhrsw m0, W_11585x2_REG ; m0=t1a
+%else
+ SCRATCH 7, 9, blockq+64
+ VP9_UNPACK_MULSUB_2W_4X 0, 4, 11585, 11585, D_8192_REG, 5, 7
+ UNSCRATCH 7, 9, blockq+64
+%endif
+ SUMSUB_BA w, 6, 4, 5 ; m6=t0a+t3a (t0), m4=t0a-t3a (t3)
+ SUMSUB_BA w, 2, 0, 5 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
+
+ VP9_IDCT8_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT8_4x4_1D 0
+ pmulhrsw m0, W_11585x2_REG ; m0=t1a/t0a
+ pmulhrsw m6, m2, [pw_15137x2] ; m6=t3a
+ pmulhrsw m2, [pw_6270x2] ; m2=t2a
+ pmulhrsw m7, m1, [pw_16069x2] ; m7=t7a
+ pmulhrsw m1, [pw_3196x2] ; m1=t4a
+ pmulhrsw m5, m3, [pw_m9102x2] ; m5=t5a
+ pmulhrsw m3, [pw_13623x2] ; m3=t6a
+ SUMSUB_BA w, 5, 1, 4 ; m5=t4a+t5a (t4), m1=t4a-t5a (t5a)
+ SUMSUB_BA w, 3, 7, 4 ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a)
+ SUMSUB_BA w, 1, 7, 4 ; m1=t6a+t5a (t6), m7=t6a-t5a (t5)
+ pmulhrsw m1, W_11585x2_REG ; m1=t6
+ pmulhrsw m7, W_11585x2_REG ; m7=t5
+ psubw m4, m0, m6 ; m4=t0a-t3a (t3)
+ paddw m6, m0 ; m6=t0a+t3a (t0)
+ SCRATCH 5, 8, blockq+ 0
+ SUMSUB_BA w, 2, 0, 5 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
+ VP9_IDCT8_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT8_2x2_1D 1
+ pmulhrsw m0, W_11585x2_REG ; m0=t0
+ pmulhrsw m3, m1, W_16069x2_REG ; m3=t7
+ pmulhrsw m1, W_3196x2_REG ; m1=t4
+ psubw m7, m3, m1 ; t5 = t7a - t4a
+ paddw m5, m3, m1 ; t6 = t7a + t4a
+ pmulhrsw m7, W_11585x2_REG ; m7=t5
+ pmulhrsw m5, W_11585x2_REG ; m5=t6
+ SWAP 5, 1
+ ; merged VP9_IDCT8_1D_FINALIZE to make register-sharing w/ avx easier
+ psubw m6, m0, m3 ; m6=t0-t7
+ paddw m3, m0 ; m3=t0+t7
+ psubw m2, m0, m1 ; m2=t1-t6
+ paddw m1, m0 ; m1=t1+t6
+%if %1 == 1
+ punpcklwd m3, m1
+%define SCRATCH_REG 1
+%elif ARCH_X86_32
+ mova [blockq+ 0], m2
+%define SCRATCH_REG 2
+%else
+%define SCRATCH_REG 8
+%endif
+ psubw m4, m0, m5 ; m4=t3-t4
+ paddw m5, m0 ; m5=t3+t4
+ SUMSUB_BA w, 7, 0, SCRATCH_REG ; m7=t2+t5, m0=t2-t5
+ SWAP 7, 6, 2
+ SWAP 3, 5, 0
+%undef SCRATCH_REG
+%endmacro
+
+%macro VP9_IDCT8_WRITEx2 6-8 5 ; line1, line2, tmp1, tmp2, zero, pw_1024/pw_16, shift
+%if cpuflag(ssse3)
+ pmulhrsw m%1, %6 ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
+ pmulhrsw m%2, %6
+%else
+ paddw m%1, %6
+ paddw m%2, %6
+ psraw m%1, %7
+ psraw m%2, %7
+%endif
+%if %0 <= 7
+ VP9_STORE_2X %1, %2, %3, %4, %5
+%else
+ VP9_STORE_2X %1, %2, %3, %4, %5, %8
+%endif
+%endmacro
+
+; x86-32:
+; - m6 is in mem
+; x86-64:
+; - m8 holds m6 (SWAP)
+; - m6 holds zero
+%macro VP9_IDCT8_WRITEOUT 0
+%if ARCH_X86_64
+%if cpuflag(ssse3)
+ mova m9, [pw_1024]
+%else
+ mova m9, [pw_16]
+%endif
+%define ROUND_REG m9
+%else
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_1024]
+%else
+%define ROUND_REG [pw_16]
+%endif
+%endif
+ SCRATCH 5, 10, blockq+16
+ SCRATCH 7, 11, blockq+32
+ VP9_IDCT8_WRITEx2 0, 1, 5, 7, 6, ROUND_REG
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 2, 3, 5, 7, 6, ROUND_REG
+ lea dstq, [dstq+2*strideq]
+ UNSCRATCH 5, 10, blockq+16
+ UNSCRATCH 7, 11, blockq+32
+ VP9_IDCT8_WRITEx2 4, 5, 0, 1, 6, ROUND_REG
+ lea dstq, [dstq+2*strideq]
+ UNSCRATCH 5, 8, blockq+ 0
+ VP9_IDCT8_WRITEx2 5, 7, 0, 1, 6, ROUND_REG
+
+%undef ROUND_REG
+%endmacro
+
+%macro VP9_IDCT_IDCT_8x8_ADD_XMM 2
+INIT_XMM %1
+cglobal vp9_idct_idct_8x8_add, 4, 4, %2, dst, stride, block, eob
+
+%if cpuflag(ssse3)
+%if ARCH_X86_64
+ mova m12, [pw_11585x2] ; often used
+%define W_11585x2_REG m12
+%else
+%define W_11585x2_REG [pw_11585x2]
+%endif
+
+ cmp eobd, 12 ; top left half or less
+ jg .idctfull
+
+ cmp eobd, 3 ; top left corner or less
+ jg .idcthalf
+
+ cmp eobd, 1 ; faster path for when only DC is set
+ jne .idcttopleftcorner
+%else
+ cmp eobd, 1
+ jg .idctfull
+%endif
+
+%if cpuflag(ssse3)
+ movd m0, [blockq]
+ pmulhrsw m0, W_11585x2_REG
+ pmulhrsw m0, W_11585x2_REG
+%else
+ DEFINE_ARGS dst, stride, block, coef
+ movsx coefd, word [blockq]
+ imul coefd, 11585
+ add coefd, 8192
+ sar coefd, 14
+ imul coefd, 11585
+ add coefd, (16 << 14) + 8192
+ sar coefd, 14 + 5
+ movd m0, coefd
+%endif
+ SPLATW m0, m0, 0
+ pxor m4, m4
+ movd [blockq], m4
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_1024] ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
+%endif
+%rep 3
+ VP9_STORE_2X 0, 0, 6, 7, 4
+ lea dstq, [dstq+2*strideq]
+%endrep
+ VP9_STORE_2X 0, 0, 6, 7, 4
+ RET
+
+%if cpuflag(ssse3)
+; faster path for when only the top left corner is set (3 inputs: DC, the
+; coefficient right of DC and the one below it). Note: this also works for a
+; 2x2 block
+.idcttopleftcorner:
+ movd m0, [blockq+0]
+ movd m1, [blockq+16]
+%if ARCH_X86_64
+ mova m10, [pw_3196x2]
+ mova m11, [pw_16069x2]
+%define W_3196x2_REG m10
+%define W_16069x2_REG m11
+%else
+%define W_3196x2_REG [pw_3196x2]
+%define W_16069x2_REG [pw_16069x2]
+%endif
+ VP9_IDCT8_2x2_1D 1
+ ; partial 2x8 transpose
+ ; punpcklwd m0, m1 already done inside idct
+ punpcklwd m2, m3
+ punpcklwd m4, m5
+ punpcklwd m6, m7
+ punpckldq m0, m2
+ punpckldq m4, m6
+ SBUTTERFLY qdq, 0, 4, 1
+ SWAP 1, 4
+ VP9_IDCT8_2x2_1D 2
+%if ARCH_X86_64
+ SWAP 6, 8
+%endif
+ pxor m6, m6 ; used for the block reset, and VP9_STORE_2X
+ VP9_IDCT8_WRITEOUT
+%if ARCH_X86_64
+ movd [blockq+ 0], m6
+ movd [blockq+16], m6
+%else
+ mova [blockq+ 0], m6
+ mova [blockq+16], m6
+ mova [blockq+32], m6
+%endif
+ RET
+
+.idcthalf:
+ movh m0, [blockq + 0]
+ movh m1, [blockq +16]
+ movh m2, [blockq +32]
+ movh m3, [blockq +48]
+ VP9_IDCT8_4x4_1D
+ ; partial 4x8 transpose
+%if ARCH_X86_32
+ mova m6, [blockq+ 0]
+%endif
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ punpcklwd m4, m5
+ punpcklwd m6, m7
+ SBUTTERFLY dq, 0, 2, 1
+ SBUTTERFLY dq, 4, 6, 5
+ SBUTTERFLY qdq, 0, 4, 1
+ SBUTTERFLY qdq, 2, 6, 5
+ SWAP 1, 4
+ SWAP 3, 6
+ VP9_IDCT8_4x4_1D
+%if ARCH_X86_64
+ SWAP 6, 8
+%endif
+ pxor m6, m6
+ VP9_IDCT8_WRITEOUT
+%if ARCH_X86_64
+ movh [blockq+ 0], m6
+ movh [blockq+16], m6
+ movh [blockq+32], m6
+%else
+ mova [blockq+ 0], m6
+ mova [blockq+16], m6
+ mova [blockq+32], m6
+%endif
+ movh [blockq+48], m6
+ RET
+%endif
+
+.idctfull: ; generic full 8x8 idct/idct
+%if ARCH_X86_64
+ mova m0, [blockq+ 0] ; IN(0)
+%endif
+ mova m1, [blockq+ 16] ; IN(1)
+ mova m2, [blockq+ 32] ; IN(2)
+ mova m3, [blockq+ 48] ; IN(3)
+%if ARCH_X86_64
+ mova m4, [blockq+ 64] ; IN(4)
+%endif
+ mova m5, [blockq+ 80] ; IN(5)
+ mova m6, [blockq+ 96] ; IN(6)
+ mova m7, [blockq+112] ; IN(7)
+%if ARCH_X86_64
+ mova m11, [pd_8192] ; rounding
+%define D_8192_REG m11
+%else
+%define D_8192_REG [pd_8192]
+%endif
+ VP9_IDCT8_1D
+%if ARCH_X86_64
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+%else
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
+ mova [blockq+0], m0
+%endif
+ VP9_IDCT8_1D
+
+%if ARCH_X86_64
+ SWAP 6, 8
+%endif
+ pxor m6, m6 ; used for the block reset, and VP9_STORE_2X
+ VP9_IDCT8_WRITEOUT
+ ZERO_BLOCK blockq, 16, 8, m6
+ RET
+%undef W_11585x2_REG
+%endmacro
+
+VP9_IDCT_IDCT_8x8_ADD_XMM sse2, 12
+VP9_IDCT_IDCT_8x8_ADD_XMM ssse3, 13
+VP9_IDCT_IDCT_8x8_ADD_XMM avx, 13
+
+;---------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+; x86-32:
+; - in: m0/3/4/7 are in mem [blockq+N*16]
+; - out: m6 is in mem [blockq+0]
+; x86-64:
+; - everything is in registers
+%macro VP9_IADST8_1D 0 ; input/output=m0/1/2/3/4/5/6/7
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 3, 9
+ SWAP 4, 10
+ SWAP 7, 11
+%endif
+
+ VP9_UNPACK_MULSUB_2D_4X 5, 2, 0, 3, 14449, 7723 ; m5/2=t3[d], m2/4=t2[d]
+ VP9_UNPACK_MULSUB_2D_4X 1, 6, 4, 7, 4756, 15679 ; m1/4=t7[d], m6/7=t6[d]
+ SCRATCH 4, 12, blockq+1*16
+ VP9_RND_SH_SUMSUB_BA 6, 2, 7, 3, 4, D_8192_REG ; m6=t2[w], m2=t6[w]
+ UNSCRATCH 4, 12, blockq+1*16
+ VP9_RND_SH_SUMSUB_BA 1, 5, 4, 0, 3, D_8192_REG ; m1=t3[w], m5=t7[w]
+
+ UNSCRATCH 0, 8, blockq+16*0
+ UNSCRATCH 3, 9, blockq+16*3
+ UNSCRATCH 4, 10, blockq+16*4
+ UNSCRATCH 7, 11, blockq+16*7
+ SCRATCH 1, 8, blockq+16*1
+ SCRATCH 2, 9, blockq+16*2
+ SCRATCH 5, 10, blockq+16*5
+ SCRATCH 6, 11, blockq+16*6
+
+ VP9_UNPACK_MULSUB_2D_4X 7, 0, 1, 2, 16305, 1606 ; m7/1=t1[d], m0/2=t0[d]
+ VP9_UNPACK_MULSUB_2D_4X 3, 4, 5, 6, 10394, 12665 ; m3/5=t5[d], m4/6=t4[d]
+ SCRATCH 1, 12, blockq+ 0*16
+ VP9_RND_SH_SUMSUB_BA 4, 0, 6, 2, 1, D_8192_REG ; m4=t0[w], m0=t4[w]
+ UNSCRATCH 1, 12, blockq+ 0*16
+ VP9_RND_SH_SUMSUB_BA 3, 7, 5, 1, 2, D_8192_REG ; m3=t1[w], m7=t5[w]
+
+ UNSCRATCH 2, 9, blockq+16*2
+ UNSCRATCH 5, 10, blockq+16*5
+ SCRATCH 3, 9, blockq+16*3
+ SCRATCH 4, 10, blockq+16*4
+
+ ; m4=t0, m3=t1, m6=t2, m1=t3, m0=t4, m7=t5, m2=t6, m5=t7
+
+ VP9_UNPACK_MULSUB_2D_4X 0, 7, 1, 3, 15137, 6270 ; m0/1=t5[d], m7/3=t4[d]
+ VP9_UNPACK_MULSUB_2D_4X 5, 2, 4, 6, 6270, 15137 ; m5/4=t6[d], m2/6=t7[d]
+ SCRATCH 1, 12, blockq+ 0*16
+ VP9_RND_SH_SUMSUB_BA 5, 7, 4, 3, 1, D_8192_REG
+ UNSCRATCH 1, 12, blockq+ 0*16
+ PSIGNW m5, W_M1_REG ; m5=out1[w], m7=t6[w]
+ VP9_RND_SH_SUMSUB_BA 2, 0, 6, 1, 3, D_8192_REG ; m2=out6[w], m0=t7[w]
+
+ UNSCRATCH 1, 8, blockq+16*1
+ UNSCRATCH 3, 9, blockq+16*3
+ UNSCRATCH 4, 10, blockq+16*4
+ UNSCRATCH 6, 11, blockq+16*6
+ SCRATCH 2, 8, blockq+16*0
+
+ SUMSUB_BA w, 6, 4, 2 ; m6=out0[w], m4=t2[w]
+ SUMSUB_BA w, 1, 3, 2
+ PSIGNW m1, W_M1_REG ; m1=out7[w], m3=t3[w]
+
+ ; m6=out0, m5=out1, m4=t2, m3=t3, m7=t6, m0=t7, m2=out6, m1=out7
+
+ ; unfortunately, the code below overflows in some cases
+%if 0; cpuflag(ssse3)
+ SUMSUB_BA w, 3, 4, 2
+ SUMSUB_BA w, 0, 7, 2
+ pmulhrsw m3, W_11585x2_REG
+ pmulhrsw m7, W_11585x2_REG
+ pmulhrsw m4, W_11585x2_REG ; out4
+ pmulhrsw m0, W_11585x2_REG ; out2
+%else
+ SCRATCH 5, 9, blockq+16*1
+ VP9_UNPACK_MULSUB_2W_4X 4, 3, 11585, 11585, D_8192_REG, 2, 5
+ VP9_UNPACK_MULSUB_2W_4X 7, 0, 11585, 11585, D_8192_REG, 2, 5
+ UNSCRATCH 5, 9, blockq+16*1
+%endif
+ PSIGNW m3, W_M1_REG ; out3
+ PSIGNW m7, W_M1_REG ; out5
+
+ ; m6=out0, m5=out1, m0=out2, m3=out3, m4=out4, m7=out5, m2=out6, m1=out7
+
+%if ARCH_X86_64
+ SWAP 2, 8
+%endif
+ SWAP 0, 6, 2
+ SWAP 7, 1, 5
+%endmacro
+
+%macro IADST8_FN 6
+INIT_XMM %5
+cglobal vp9_%1_%3_8x8_add, 3, 3, %6, dst, stride, block, eob
+
+%ifidn %1, idct
+%define first_is_idct 1
+%else
+%define first_is_idct 0
+%endif
+
+%ifidn %3, idct
+%define second_is_idct 1
+%else
+%define second_is_idct 0
+%endif
+
+%if ARCH_X86_64
+ mova m0, [blockq+ 0] ; IN(0)
+%endif
+ mova m1, [blockq+ 16] ; IN(1)
+ mova m2, [blockq+ 32] ; IN(2)
+%if ARCH_X86_64 || first_is_idct
+ mova m3, [blockq+ 48] ; IN(3)
+%endif
+%if ARCH_X86_64
+ mova m4, [blockq+ 64] ; IN(4)
+%endif
+ mova m5, [blockq+ 80] ; IN(5)
+ mova m6, [blockq+ 96] ; IN(6)
+%if ARCH_X86_64 || first_is_idct
+ mova m7, [blockq+112] ; IN(7)
+%endif
+%if ARCH_X86_64
+%if cpuflag(ssse3)
+ mova m15, [pw_11585x2] ; often used
+%endif
+ mova m13, [pd_8192] ; rounding
+ mova m14, [pw_m1]
+%define W_11585x2_REG m15
+%define D_8192_REG m13
+%define W_M1_REG m14
+%else
+%define W_11585x2_REG [pw_11585x2]
+%define D_8192_REG [pd_8192]
+%define W_M1_REG [pw_m1]
+%endif
+
+ ; note different calling conventions for idct8 vs. iadst8 on x86-32
+ VP9_%2_1D
+%if ARCH_X86_64
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+%else
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
+ mova [blockq+ 0], m0
+%if second_is_idct == 0
+ mova [blockq+ 48], m3
+ mova [blockq+112], m7
+%endif
+%endif
+ VP9_%4_1D
+
+%if ARCH_X86_64
+ SWAP 6, 8
+%endif
+ pxor m6, m6 ; used for the block reset, and VP9_STORE_2X
+ VP9_IDCT8_WRITEOUT
+ ZERO_BLOCK blockq, 16, 8, m6
+ RET
+
+%undef W_11585x2_REG
+%undef first_is_idct
+%undef second_is_idct
+
+%endmacro
+
+IADST8_FN idct, IDCT8, iadst, IADST8, sse2, 15
+IADST8_FN iadst, IADST8, idct, IDCT8, sse2, 15
+IADST8_FN iadst, IADST8, iadst, IADST8, sse2, 15
+IADST8_FN idct, IDCT8, iadst, IADST8, ssse3, 16
+IADST8_FN idct, IDCT8, iadst, IADST8, avx, 16
+IADST8_FN iadst, IADST8, idct, IDCT8, ssse3, 16
+IADST8_FN iadst, IADST8, idct, IDCT8, avx, 16
+IADST8_FN iadst, IADST8, iadst, IADST8, ssse3, 16
+IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
+
+;---------------------------------------------------------------------------------------------
+; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+; x86-64:
+; at the end of this macro, m7 is stored in [%4+15*%5]
+; everything else (t0-6 and t8-15) is stored in m0-6 and m8-15
+; the following sumsubs have not been done yet:
+; SUMSUB_BA w, 6, 9, 15 ; t6, t9
+; SUMSUB_BA w, 7, 8, 15 ; t7, t8
+; or (x86-32) t0-t5 are in m0-m5, t10-t15 are in x11/9/7/5/3/1,
+; and the following sumsubs have not been done yet:
+; SUMSUB_BA w, x13, x14, 7 ; t6, t9
+; SUMSUB_BA w, x15, x12, 7 ; t7, t8
+
+%macro VP9_IDCT16_1D_START 6 ; src, nnzc, stride, scratch, scratch_stride, is_iadst
+%if %2 <= 4
+ mova m3, [%1+ 1*%3] ; IN(1)
+ mova m0, [%1+ 3*%3] ; IN(3)
+
+ pmulhrsw m4, m3, [pw_16305x2] ; t14-15
+ pmulhrsw m3, [pw_1606x2] ; t8-9
+ pmulhrsw m7, m0, [pw_m4756x2] ; t10-11
+ pmulhrsw m0, [pw_15679x2] ; t12-13
+
+ ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
+ ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
+
+ VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137, 6270, [pd_8192], 1, 6 ; t9, t14
+ SCRATCH 4, 10, %4+ 1*%5
+ SCRATCH 5, 11, %4+ 7*%5
+ VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 4, 5 ; t10, t13
+ UNSCRATCH 5, 11, %4+ 7*%5
+
+ ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+ ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
+%else
+ mova m5, [%1+ 1*%3] ; IN(1)
+ mova m4, [%1+ 7*%3] ; IN(7)
+%if %2 <= 8
+ pmulhrsw m2, m5, [pw_16305x2] ; t15
+ pmulhrsw m5, [pw_1606x2] ; t8
+ pmulhrsw m3, m4, [pw_m10394x2] ; t9
+ pmulhrsw m4, [pw_12665x2] ; t14
+%else
+ mova m3, [%1+ 9*%3] ; IN(9)
+ mova m2, [%1+15*%3] ; IN(15)
+
+ ; m10=in0, m5=in1, m14=in2, m6=in3, m9=in4, m7=in5, m15=in6, m4=in7
+ ; m11=in8, m3=in9, m12=in10 m0=in11, m8=in12, m1=in13, m13=in14, m2=in15
+
+ VP9_UNPACK_MULSUB_2W_4X 5, 2, 16305, 1606, [pd_8192], 0, 1 ; t8, t15
+ VP9_UNPACK_MULSUB_2W_4X 3, 4, 10394, 12665, [pd_8192], 0, 1 ; t9, t14
+%endif
+
+ SUMSUB_BA w, 3, 5, 0 ; t8, t9
+ SUMSUB_BA w, 4, 2, 0 ; t15, t14
+
+ VP9_UNPACK_MULSUB_2W_4X 2, 5, 15137, 6270, [pd_8192], 0, 1 ; t9, t14
+
+ SCRATCH 4, 10, %4+ 1*%5
+ SCRATCH 5, 11, %4+ 7*%5
+
+ mova m6, [%1+ 3*%3] ; IN(3)
+ mova m7, [%1+ 5*%3] ; IN(5)
+%if %2 <= 8
+ pmulhrsw m0, m7, [pw_14449x2] ; t13
+ pmulhrsw m7, [pw_7723x2] ; t10
+ pmulhrsw m1, m6, [pw_m4756x2] ; t11
+ pmulhrsw m6, [pw_15679x2] ; t12
+%else
+ mova m0, [%1+11*%3] ; IN(11)
+ mova m1, [%1+13*%3] ; IN(13)
+
+ VP9_UNPACK_MULSUB_2W_4X 7, 0, 14449, 7723, [pd_8192], 4, 5 ; t10, t13
+ VP9_UNPACK_MULSUB_2W_4X 1, 6, 4756, 15679, [pd_8192], 4, 5 ; t11, t12
+%endif
+
+ ; m11=t0, m10=t1, m9=t2, m8=t3, m14=t4, m12=t5, m15=t6, m13=t7
+ ; m5=t8, m3=t9, m7=t10, m1=t11, m6=t12, m0=t13, m4=t14, m2=t15
+
+ SUMSUB_BA w, 7, 1, 4 ; t11, t10
+ SUMSUB_BA w, 0, 6, 4 ; t12, t13
+
+ ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
+ ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
+
+ VP9_UNPACK_MULSUB_2W_4X 6, 1, 6270, m15137, [pd_8192], 4, 5 ; t10, t13
+
+ UNSCRATCH 5, 11, %4+ 7*%5
+%endif
+
+ ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7
+ ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15
+
+ SUMSUB_BA w, 7, 3, 4 ; t8, t11
+
+ ; backup first register
+ mova [%4+15*%5], m7
+
+ SUMSUB_BA w, 6, 2, 7 ; t9, t10
+ UNSCRATCH 4, 10, %4+ 1*%5
+ SUMSUB_BA w, 0, 4, 7 ; t15, t12
+ SUMSUB_BA w, 1, 5, 7 ; t14, t13
+
+ ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+ ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
+
+%if cpuflag(ssse3) && %6 == 0
+ SUMSUB_BA w, 2, 5, 7
+ SUMSUB_BA w, 3, 4, 7
+ pmulhrsw m5, [pw_11585x2] ; t10
+ pmulhrsw m4, [pw_11585x2] ; t11
+ pmulhrsw m3, [pw_11585x2] ; t12
+ pmulhrsw m2, [pw_11585x2] ; t13
+%else
+ SCRATCH 6, 10, %4+ 1*%5
+ VP9_UNPACK_MULSUB_2W_4X 5, 2, 11585, 11585, [pd_8192], 6, 7 ; t10, t13
+ VP9_UNPACK_MULSUB_2W_4X 4, 3, 11585, 11585, [pd_8192], 6, 7 ; t11, t12
+ UNSCRATCH 6, 10, %4+ 1*%5
+%endif
+
+ ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+ ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15
+
+ SCRATCH 0, 8, %4+ 1*%5
+ SCRATCH 1, 9, %4+ 3*%5
+ SCRATCH 2, 10, %4+ 5*%5
+ SCRATCH 3, 11, %4+ 7*%5
+ SCRATCH 4, 12, %4+ 9*%5
+ SCRATCH 5, 13, %4+11*%5
+ SCRATCH 6, 14, %4+13*%5
+
+ ; even (tx8x8)
+%if %2 <= 4
+ mova m3, [%1+ 0*%3] ; IN(0)
+ mova m4, [%1+ 2*%3] ; IN(2)
+
+ pmulhrsw m3, [pw_11585x2] ; t0-t3
+ pmulhrsw m7, m4, [pw_16069x2] ; t6-7
+ pmulhrsw m4, [pw_3196x2] ; t4-5
+
+%if 0 ; overflows :(
+ paddw m6, m7, m4
+ psubw m5, m7, m4
+ pmulhrsw m5, [pw_11585x2] ; t5
+ pmulhrsw m6, [pw_11585x2] ; t6
+%else
+ VP9_UNPACK_MULSUB_2W_4X 5, 6, 7, 4, 11585, 11585, [pd_8192], 0, 1 ; t5, t6
+%endif
+
+ psubw m0, m3, m7
+ paddw m7, m3
+ psubw m1, m3, m6
+ paddw m6, m3
+ psubw m2, m3, m5
+ paddw m5, m3
+
+%if ARCH_X86_32
+ SWAP 0, 7
+%endif
+ SCRATCH 7, 15, %4+12*%5
+%else
+ mova m6, [%1+ 2*%3] ; IN(2)
+ mova m1, [%1+ 4*%3] ; IN(4)
+ mova m7, [%1+ 6*%3] ; IN(6)
+%if %2 <= 8
+ pmulhrsw m0, m1, [pw_15137x2] ; t3
+ pmulhrsw m1, [pw_6270x2] ; t2
+ pmulhrsw m5, m6, [pw_16069x2] ; t7
+ pmulhrsw m6, [pw_3196x2] ; t4
+ pmulhrsw m4, m7, [pw_m9102x2] ; t5
+ pmulhrsw m7, [pw_13623x2] ; t6
+%else
+ mova m4, [%1+10*%3] ; IN(10)
+ mova m0, [%1+12*%3] ; IN(12)
+ mova m5, [%1+14*%3] ; IN(14)
+
+ VP9_UNPACK_MULSUB_2W_4X 1, 0, 15137, 6270, [pd_8192], 2, 3 ; t2, t3
+ VP9_UNPACK_MULSUB_2W_4X 6, 5, 16069, 3196, [pd_8192], 2, 3 ; t4, t7
+ VP9_UNPACK_MULSUB_2W_4X 4, 7, 9102, 13623, [pd_8192], 2, 3 ; t5, t6
+%endif
+
+ SUMSUB_BA w, 4, 6, 2 ; t4, t5
+ SUMSUB_BA w, 7, 5, 2 ; t7, t6
+
+%if cpuflag(ssse3) && %6 == 0
+ SUMSUB_BA w, 6, 5, 2
+ pmulhrsw m5, [pw_11585x2] ; t5
+ pmulhrsw m6, [pw_11585x2] ; t6
+%else
+ VP9_UNPACK_MULSUB_2W_4X 5, 6, 11585, 11585, [pd_8192], 2, 3 ; t5, t6
+%endif
+
+ SCRATCH 5, 15, %4+10*%5
+ mova m2, [%1+ 0*%3] ; IN(0)
+%if %2 <= 8
+ pmulhrsw m2, [pw_11585x2] ; t0 and t1
+ psubw m3, m2, m0
+ paddw m0, m2
+
+ SUMSUB_BA w, 7, 0, 5 ; t0, t7
+%else
+ mova m3, [%1+ 8*%3] ; IN(8)
+
+ ; from 3 stages back
+%if cpuflag(ssse3) && %6 == 0
+ SUMSUB_BA w, 3, 2, 5
+ pmulhrsw m3, [pw_11585x2] ; t0
+ pmulhrsw m2, [pw_11585x2] ; t1
+%else
+ mova [%1+ 0*%3], m0
+ VP9_UNPACK_MULSUB_2W_4X 2, 3, 11585, 11585, [pd_8192], 5, 0 ; t0, t1
+ mova m0, [%1+ 0*%3]
+%endif
+
+ ; from 2 stages back
+ SUMSUB_BA w, 0, 3, 5 ; t0, t3
+
+ SUMSUB_BA w, 7, 0, 5 ; t0, t7
+%endif
+ UNSCRATCH 5, 15, %4+10*%5
+%if ARCH_X86_32
+ SWAP 0, 7
+%endif
+ SCRATCH 7, 15, %4+12*%5
+ SUMSUB_BA w, 1, 2, 7 ; t1, t2
+
+ ; from 1 stage back
+ SUMSUB_BA w, 6, 1, 7 ; t1, t6
+ SUMSUB_BA w, 5, 2, 7 ; t2, t5
+%endif
+ SUMSUB_BA w, 4, 3, 7 ; t3, t4
+
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+ SWAP 4, 12
+ SWAP 5, 13
+ SWAP 6, 14
+
+ SUMSUB_BA w, 0, 15, 7 ; t0, t15
+ SUMSUB_BA w, 1, 14, 7 ; t1, t14
+ SUMSUB_BA w, 2, 13, 7 ; t2, t13
+ SUMSUB_BA w, 3, 12, 7 ; t3, t12
+ SUMSUB_BA w, 4, 11, 7 ; t4, t11
+ SUMSUB_BA w, 5, 10, 7 ; t5, t10
+%else
+ SWAP 1, 6
+ SWAP 2, 5
+ SWAP 3, 4
+ mova [%4+14*%5], m6
+
+%macro %%SUMSUB_BA_STORE 5 ; reg, from_mem, to_mem, scratch, scratch_stride
+ mova m6, [%4+%2*%5]
+ SUMSUB_BA w, 6, %1, 7
+ SWAP %1, 6
+ mova [%4+%3*%5], m6
+%endmacro
+
+ %%SUMSUB_BA_STORE 0, 1, 1, %4, %5 ; t0, t15
+ %%SUMSUB_BA_STORE 1, 3, 3, %4, %5 ; t1, t14
+ %%SUMSUB_BA_STORE 2, 5, 5, %4, %5 ; t2, t13
+ %%SUMSUB_BA_STORE 3, 7, 7, %4, %5 ; t3, t12
+ %%SUMSUB_BA_STORE 4, 9, 9, %4, %5 ; t4, t11
+ %%SUMSUB_BA_STORE 5, 11, 11, %4, %5 ; t5, t10
+%endif
+%endmacro
+
+%macro VP9_IDCT16_1D 2-4 16, 1 ; src, pass, nnzc, is_iadst
+%if %2 == 1
+ VP9_IDCT16_1D_START %1, %3, 32, tmpq, 16, %4
+
+%if ARCH_X86_64
+ ; backup a different register
+ mova m7, [tmpq+15*16]
+ mova [tmpq+ 1*16], m15
+
+ SUMSUB_BA w, 6, 9, 15 ; t6, t9
+ SUMSUB_BA w, 7, 8, 15 ; t7, t8
+
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 15
+ mova [tmpq+ 0], m0
+ mova [tmpq+ 32], m1
+ mova [tmpq+ 64], m2
+ mova [tmpq+ 96], m3
+ mova [tmpq+128], m4
+ mova [tmpq+160], m5
+ mova [tmpq+192], m6
+ mova [tmpq+224], m7
+
+ mova m15, [tmpq+ 1*16]
+ TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0
+ mova [tmpq+ 16], m8
+ mova [tmpq+ 48], m9
+ mova [tmpq+ 80], m10
+ mova [tmpq+112], m11
+ mova [tmpq+144], m12
+ mova [tmpq+176], m13
+ mova [tmpq+208], m14
+ mova [tmpq+240], m15
+%else
+ mova m6, [tmpq+13*16]
+ mova m7, [tmpq+14*16]
+ SUMSUB_BA w, 6, 7 ; t6, t9
+ mova [tmpq+14*16], m6
+ mova [tmpq+13*16], m7
+ mova m7, [tmpq+15*16]
+ mova m6, [tmpq+12*16]
+ SUMSUB_BA w, 7, 6 ; t7, t8
+ mova [tmpq+15*16], m6
+
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+14*16], [tmpq+ 8*16], 1
+ mova [tmpq+ 0*16], m0
+ mova [tmpq+ 2*16], m1
+ mova [tmpq+ 4*16], m2
+ mova [tmpq+ 6*16], m3
+ mova [tmpq+10*16], m5
+ mova [tmpq+12*16], m6
+ mova [tmpq+14*16], m7
+
+ mova m0, [tmpq+15*16]
+ mova m1, [tmpq+13*16]
+ mova m2, [tmpq+11*16]
+ mova m3, [tmpq+ 9*16]
+ mova m4, [tmpq+ 7*16]
+ mova m5, [tmpq+ 5*16]
+ mova m7, [tmpq+ 1*16]
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+ 3*16], [tmpq+ 9*16], 1
+ mova [tmpq+ 1*16], m0
+ mova [tmpq+ 3*16], m1
+ mova [tmpq+ 5*16], m2
+ mova [tmpq+ 7*16], m3
+ mova [tmpq+11*16], m5
+ mova [tmpq+13*16], m6
+ mova [tmpq+15*16], m7
+%endif
+%else ; %2 == 2
+ VP9_IDCT16_1D_START %1, %3, 32, %1, 32, %4
+
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_512]
+%else
+%define ROUND_REG [pw_32]
+%endif
+
+ pxor m7, m7
+%if ARCH_X86_64
+ ; backup more registers
+ mova [%1+ 2*32], m8
+ mova [%1+ 3*32], m9
+
+ VP9_IDCT8_WRITEx2 0, 1, 8, 9, 7, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 2, 3, 8, 9, 7, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 4, 5, 8, 9, 7, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ ; restore from cache
+ SWAP 0, 7 ; move zero from m7 to m0
+ mova m7, [%1+15*32]
+ mova m8, [%1+ 2*32]
+ mova m9, [%1+ 3*32]
+
+ SUMSUB_BA w, 6, 9, 3 ; t6, t9
+ SUMSUB_BA w, 7, 8, 3 ; t7, t8
+
+ VP9_IDCT8_WRITEx2 6, 7, 3, 4, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 8, 9, 3, 4, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 10, 11, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 12, 13, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 14, 15, 1, 2, 0, ROUND_REG, 6
+%else
+ mova [tmpq+ 0*32], m5
+
+ VP9_IDCT8_WRITEx2 0, 1, 5, 6, 7, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 2, 3, 5, 6, 7, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ SWAP 0, 7 ; move zero from m7 to m0
+ mova m5, [tmpq+ 0*32]
+
+ VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ mova m4, [tmpq+13*32]
+ mova m7, [tmpq+14*32]
+ mova m5, [tmpq+15*32]
+ mova m6, [tmpq+12*32]
+ SUMSUB_BADC w, 4, 7, 5, 6, 1
+
+ VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 6, 7, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ mova m4, [tmpq+11*32]
+ mova m5, [tmpq+ 9*32]
+ mova m6, [tmpq+ 7*32]
+ mova m7, [tmpq+ 5*32]
+
+ VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 6, 7, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ mova m4, [tmpq+ 3*32]
+ mova m5, [tmpq+ 1*32]
+
+ VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+%endif
+
+%undef ROUND_REG
+%endif ; %2 == 1/2
+%endmacro
+
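+; like VP9_STORE_2X, but adds the (splatted) dc value to a full 16-pixel row
+; of each of the two destination lines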
+%macro VP9_STORE_2XFULL 6-7 strideq ; dc, tmp1, tmp2, tmp3, tmp4, zero, stride
+ mova m%3, [dstq]
+ mova m%5, [dstq+%7]
+ punpcklbw m%2, m%3, m%6
+ punpckhbw m%3, m%6
+ punpcklbw m%4, m%5, m%6
+ punpckhbw m%5, m%6
+ paddw m%2, m%1
+ paddw m%3, m%1
+ paddw m%4, m%1
+ paddw m%5, m%1
+ packuswb m%2, m%3
+ packuswb m%4, m%5
+ mova [dstq], m%2
+ mova [dstq+%7], m%4
+%endmacro
+
+%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1
+INIT_XMM %1
+cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
+%if cpuflag(ssse3)
+ ; 2x2=eob=3, 4x4=eob=10
+ cmp eobd, 38
+ jg .idctfull
+ cmp eobd, 1 ; faster path for when only DC is set
+ jne .idct8x8
+%else
+ cmp eobd, 1 ; faster path for when only DC is set
+ jg .idctfull
+%endif
+
+ ; dc-only
+%if cpuflag(ssse3)
+ movd m0, [blockq]
+ mova m1, [pw_11585x2]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+%else
+ DEFINE_ARGS dst, stride, block, coef
+ movsx coefd, word [blockq]
+ imul coefd, 11585
+ add coefd, 8192
+ sar coefd, 14
+ imul coefd, 11585
+ add coefd, (32 << 14) + 8192
+ sar coefd, 14 + 6
+ movd m0, coefd
+%endif
+ SPLATW m0, m0, q0000
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_512]
+%endif
+ pxor m5, m5
+ movd [blockq], m5
+%rep 7
+ VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5
+ lea dstq, [dstq+2*strideq]
+%endrep
+ VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5
+ RET
+
+ DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp
+%if cpuflag(ssse3)
+.idct8x8:
+ mov tmpq, rsp
+ VP9_IDCT16_1D blockq, 1, 8, 0
+
+ mov cntd, 2
+ mov dst_bakq, dstq
+.loop2_8x8:
+ VP9_IDCT16_1D tmpq, 2, 8, 0
+ lea dstq, [dst_bakq+8]
+ add tmpq, 16
+ dec cntd
+ jg .loop2_8x8
+
+ ; at the end of the loop, m0 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 32, 8, m0
+ RET
+%endif
+
+.idctfull:
+ mov cntd, 2
+ mov tmpq, rsp
+.loop1_full:
+ VP9_IDCT16_1D blockq, 1, 16, 0
+ add blockq, 16
+ add tmpq, 256
+ dec cntd
+ jg .loop1_full
+ sub blockq, 32
+
+ mov cntd, 2
+ mov tmpq, rsp
+ mov dst_bakq, dstq
+.loop2_full:
+ VP9_IDCT16_1D tmpq, 2, 16, 0
+ lea dstq, [dst_bakq+8]
+ add tmpq, 16
+ dec cntd
+ jg .loop2_full
+
+ ; at the end of the loop, m0 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 32, 16, m0
+ RET
+%endmacro
+
+VP9_IDCT_IDCT_16x16_ADD_XMM sse2
+VP9_IDCT_IDCT_16x16_ADD_XMM ssse3
+VP9_IDCT_IDCT_16x16_ADD_XMM avx
+
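+; single 1D pass of the 16x16 idct on 16 ymm rows; m0 and m4 are expected in
+; [blockq+0] and [blockq+128] rather than in registers, and on exit one row
+; (m6 after the final SWAPs) is left in [blockq+192], mirroring the convention
+; documented for VP9_IADST16_YMM_1D below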
+%macro VP9_IDCT16_YMM_1D 0
+ VP9_UNPACK_MULSUB_2W_4X 1, 15, 16305, 1606, [pd_8192], 0, 4 ; t8, t15
+ VP9_UNPACK_MULSUB_2W_4X 9, 7, 10394, 12665, [pd_8192], 0, 4 ; t9, t14
+
+ SUMSUB_BA w, 9, 1, 0 ; t8, t9
+ SUMSUB_BA w, 7, 15, 0 ; t15, t14
+
+ VP9_UNPACK_MULSUB_2W_4X 15, 1, 15137, 6270, [pd_8192], 0, 4 ; t9, t14
+
+ VP9_UNPACK_MULSUB_2W_4X 5, 11, 14449, 7723, [pd_8192], 0, 4 ; t10, t13
+ VP9_UNPACK_MULSUB_2W_4X 13, 3, 4756, 15679, [pd_8192], 0, 4 ; t11, t12
+
+ SUMSUB_BA w, 5, 13, 0 ; t11, t10
+ SUMSUB_BA w, 11, 3, 0 ; t12, t13
+
+ VP9_UNPACK_MULSUB_2W_4X 3, 13, 6270, m15137, [pd_8192], 0, 4 ; t10, t13
+
+ SUMSUB_BA w, 5, 9, 0 ; t8, t11
+ SUMSUB_BA w, 3, 15, 0 ; t9, t10
+ SUMSUB_BA w, 11, 7, 0 ; t15, t12
+ SUMSUB_BA w, 13, 1, 0 ; t14, t13
+
+ SUMSUB_BA w, 15, 1, 0
+ SUMSUB_BA w, 9, 7, 0
+ pmulhrsw m1, [pw_11585x2] ; t10
+ pmulhrsw m7, [pw_11585x2] ; t11
+ pmulhrsw m9, [pw_11585x2] ; t12
+ pmulhrsw m15, [pw_11585x2] ; t13
+
+ ; even (tx8x8)
+ mova m4, [blockq+128]
+ mova [blockq+128], m5
+ VP9_UNPACK_MULSUB_2W_4X 4, 12, 15137, 6270, [pd_8192], 0, 5 ; t2, t3
+ VP9_UNPACK_MULSUB_2W_4X 2, 14, 16069, 3196, [pd_8192], 0, 5 ; t4, t7
+ VP9_UNPACK_MULSUB_2W_4X 10, 6, 9102, 13623, [pd_8192], 0, 5 ; t5, t6
+ mova m0, [blockq+ 0]
+ SUMSUB_BA w, 8, 0, 5
+ pmulhrsw m8, [pw_11585x2] ; t0
+ pmulhrsw m0, [pw_11585x2] ; t1
+
+ SUMSUB_BA w, 10, 2, 5 ; t4, t5
+ SUMSUB_BA w, 6, 14, 5 ; t7, t6
+ SUMSUB_BA w, 12, 8, 5 ; t0, t3
+ SUMSUB_BA w, 4, 0, 5 ; t1, t2
+
+ SUMSUB_BA w, 2, 14, 5
+ pmulhrsw m14, [pw_11585x2] ; t5
+ pmulhrsw m2, [pw_11585x2] ; t6
+
+ SUMSUB_BA w, 6, 12, 5 ; t0, t7
+ SUMSUB_BA w, 2, 4, 5 ; t1, t6
+ SUMSUB_BA w, 14, 0, 5 ; t2, t5
+ SUMSUB_BA w, 10, 8, 5 ; t3, t4
+
+ ; final stage
+ SUMSUB_BA w, 11, 6, 5 ; out0, out15
+ SUMSUB_BA w, 13, 2, 5 ; out1, out14
+ SUMSUB_BA w, 15, 14, 5 ; out2, out13
+ SUMSUB_BA w, 9, 10, 5 ; out3, out12
+ SUMSUB_BA w, 7, 8, 5 ; out4, out11
+ SUMSUB_BA w, 1, 0, 5 ; out5, out10
+ SUMSUB_BA w, 3, 4, 5 ; out6, out9
+ mova m5, [blockq+128]
+ mova [blockq+192], m3
+ SUMSUB_BA w, 5, 12, 3 ; out7, out8
+
+ SWAP 0, 11, 8, 12, 10
+ SWAP 1, 13, 14, 2, 15, 6, 3, 9, 4, 7, 5
+%endmacro
+
+; this is almost identical to VP9_STORE_2X, but it does two rows
+; for slightly improved interleaving, and it omits vpermq since the
+; input is DC so all values are identical
+%macro VP9_STORE_YMM_DC_4X 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero
+ mova xm%2, [dstq]
+ mova xm%4, [dstq+strideq*2]
+ vinserti128 m%2, m%2, [dstq+strideq], 1
+ vinserti128 m%4, m%4, [dstq+stride3q], 1
+ punpckhbw m%3, m%2, m%6
+ punpcklbw m%2, m%6
+ punpckhbw m%5, m%4, m%6
+ punpcklbw m%4, m%6
+ paddw m%3, m%1
+ paddw m%2, m%1
+ paddw m%5, m%1
+ paddw m%4, m%1
+ packuswb m%2, m%3
+ packuswb m%4, m%5
+ mova [dstq], xm%2
+ mova [dstq+strideq*2], xm%4
+ vextracti128 [dstq+strideq], m%2, 1
+ vextracti128 [dstq+stride3q], m%4, 1
+%endmacro
+
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_idct_idct_16x16_add, 4, 4, 16, dst, stride, block, eob
+ cmp eobd, 1 ; faster path for when only DC is set
+ jg .idctfull
+
+ ; dc-only
+ mova m1, [pw_11585x2]
+ vpbroadcastw m0, [blockq]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+ pxor m5, m5
+ pmulhrsw m0, [pw_512]
+ movd [blockq], xm5
+
+ DEFINE_ARGS dst, stride, stride3, cnt
+ mov cntd, 4
+ lea stride3q, [strideq*3]
+.loop_dc:
+ VP9_STORE_YMM_DC_4X 0, 1, 2, 3, 4, 5
+ lea dstq, [dstq+4*strideq]
+ dec cntd
+ jg .loop_dc
+ RET
+
+ DEFINE_ARGS dst, stride, block, eob
+.idctfull:
+ mova m1, [blockq+ 32]
+ mova m2, [blockq+ 64]
+ mova m3, [blockq+ 96]
+ mova m5, [blockq+160]
+ mova m6, [blockq+192]
+ mova m7, [blockq+224]
+ mova m8, [blockq+256]
+ mova m9, [blockq+288]
+ mova m10, [blockq+320]
+ mova m11, [blockq+352]
+ mova m12, [blockq+384]
+ mova m13, [blockq+416]
+ mova m14, [blockq+448]
+ mova m15, [blockq+480]
+
+ VP9_IDCT16_YMM_1D
+ TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
+ [blockq+192], [blockq+128], 1
+ mova [blockq+ 0], m0
+ VP9_IDCT16_YMM_1D
+
+ mova [blockq+224], m7
+
+ ; store
+ VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ mova m6, [blockq+192]
+ mova m7, [blockq+224]
+ VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+
+ ; at the end of the loop, m0 should still be zero
+ ; use that to zero out block coefficients
+ pxor m0, m0
+ ZERO_BLOCK blockq, 32, 16, m0
+ RET
+%endif
+
+;---------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+%macro VP9_IADST16_1D 2 ; src, pass
+%assign %%str 16*%2
+ mova m0, [%1+ 0*32] ; in0
+ mova m1, [%1+15*32] ; in15
+ mova m2, [%1+ 7*32] ; in7
+ mova m3, [%1+ 8*32] ; in8
+
+ VP9_UNPACK_MULSUB_2D_4X 1, 0, 4, 5, 16364, 804 ; m1/4=t1[d], m0/5=t0[d]
+ VP9_UNPACK_MULSUB_2D_4X 2, 3, 7, 6, 11003, 12140 ; m2/7=t9[d], m3/6=t8[d]
+ SCRATCH 4, 8, tmpq+ 0*%%str
+ VP9_RND_SH_SUMSUB_BA 3, 0, 6, 5, 4, [pd_8192] ; m3=t0[w], m0=t8[w]
+ UNSCRATCH 4, 8, tmpq+ 0*%%str
+ VP9_RND_SH_SUMSUB_BA 2, 1, 7, 4, 5, [pd_8192] ; m2=t1[w], m1=t9[w]
+
+ SCRATCH 0, 10, tmpq+ 0*%%str
+ SCRATCH 1, 11, tmpq+15*%%str
+ mova [tmpq+ 7*%%str], m2
+ mova [tmpq+ 8*%%str], m3
+
+ mova m1, [%1+ 2*32] ; in2
+ mova m0, [%1+13*32] ; in13
+ mova m3, [%1+ 5*32] ; in5
+ mova m2, [%1+10*32] ; in10
+
+ VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 15893, 3981 ; m0/6=t3[d], m1/7=t2[d]
+ VP9_UNPACK_MULSUB_2D_4X 3, 2, 4, 5, 8423, 14053 ; m3/4=t11[d], m2/5=t10[d]
+ SCRATCH 4, 12, tmpq+ 2*%%str
+ VP9_RND_SH_SUMSUB_BA 2, 1, 5, 7, 4, [pd_8192] ; m2=t2[w], m1=t10[w]
+ UNSCRATCH 4, 12, tmpq+ 2*%%str
+ VP9_RND_SH_SUMSUB_BA 3, 0, 4, 6, 5, [pd_8192] ; m3=t3[w], m0=t11[w]
+
+ SCRATCH 0, 12, tmpq+ 2*%%str
+ SCRATCH 1, 13, tmpq+13*%%str
+ mova [tmpq+ 5*%%str], m2
+ mova [tmpq+10*%%str], m3
+
+ mova m2, [%1+ 4*32] ; in4
+ mova m3, [%1+11*32] ; in11
+ mova m0, [%1+ 3*32] ; in3
+ mova m1, [%1+12*32] ; in12
+
+ VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 14811, 7005 ; m3/7=t5[d], m2/6=t4[d]
+ VP9_UNPACK_MULSUB_2D_4X 0, 1, 4, 5, 5520, 15426 ; m0/4=t13[d], m1/5=t12[d]
+ SCRATCH 4, 9, tmpq+ 4*%%str
+ VP9_RND_SH_SUMSUB_BA 1, 2, 5, 6, 4, [pd_8192] ; m1=t4[w], m2=t12[w]
+ UNSCRATCH 4, 9, tmpq+ 4*%%str
+ VP9_RND_SH_SUMSUB_BA 0, 3, 4, 7, 6, [pd_8192] ; m0=t5[w], m3=t13[w]
+
+ SCRATCH 0, 8, tmpq+ 4*%%str
+ mova [tmpq+11*%%str], m1 ; t4:m1->r11
+ UNSCRATCH 0, 10, tmpq+ 0*%%str
+ UNSCRATCH 1, 11, tmpq+15*%%str
+
+ ; round 2 interleaved part 1
+ VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 16069, 3196 ; m1/7=t8[d], m0/6=t9[d]
+ VP9_UNPACK_MULSUB_2D_4X 3, 2, 5, 4, 3196, 16069 ; m3/5=t12[d], m2/4=t13[d]
+ SCRATCH 4, 9, tmpq+ 3*%%str
+ VP9_RND_SH_SUMSUB_BA 3, 1, 5, 7, 4, [pd_8192] ; m3=t8[w], m1=t12[w]
+ UNSCRATCH 4, 9, tmpq+ 3*%%str
+ VP9_RND_SH_SUMSUB_BA 2, 0, 4, 6, 5, [pd_8192] ; m2=t9[w], m0=t13[w]
+
+ SCRATCH 0, 10, tmpq+ 0*%%str
+ SCRATCH 1, 11, tmpq+15*%%str
+ SCRATCH 2, 14, tmpq+ 3*%%str
+ SCRATCH 3, 15, tmpq+12*%%str
+
+ mova m2, [%1+ 6*32] ; in6
+ mova m3, [%1+ 9*32] ; in9
+ mova m0, [%1+ 1*32] ; in1
+ mova m1, [%1+14*32] ; in14
+
+ VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 13160, 9760 ; m3/7=t7[d], m2/6=t6[d]
+ VP9_UNPACK_MULSUB_2D_4X 0, 1, 4, 5, 2404, 16207 ; m0/4=t15[d], m1/5=t14[d]
+ SCRATCH 4, 9, tmpq+ 6*%%str
+ VP9_RND_SH_SUMSUB_BA 1, 2, 5, 6, 4, [pd_8192] ; m1=t6[w], m2=t14[w]
+ UNSCRATCH 4, 9, tmpq+ 6*%%str
+ VP9_RND_SH_SUMSUB_BA 0, 3, 4, 7, 6, [pd_8192] ; m0=t7[w], m3=t15[w]
+
+ ; r8=t0, r7=t1, r5=t2, r10=t3, r11=t4, m8|r4=t5, m1=t6, m0=t7
+ ; m10|r0=t8, m11|r15=t9, m13|r13=t10, m12|r2=t11, m14|r3=t12, m15|r12=t13, m2=t14, m3=t15
+
+ UNSCRATCH 4, 12, tmpq+ 2*%%str
+ UNSCRATCH 5, 13, tmpq+13*%%str
+ SCRATCH 0, 12, tmpq+ 1*%%str
+ SCRATCH 1, 13, tmpq+14*%%str
+
+ ; remainder of round 2 (rest of t8-15)
+ VP9_UNPACK_MULSUB_2D_4X 5, 4, 6, 7, 9102, 13623 ; m5/6=t11[d], m4/7=t10[d]
+ VP9_UNPACK_MULSUB_2D_4X 3, 2, 1, 0, 13623, 9102 ; m3/1=t14[d], m2/0=t15[d]
+ SCRATCH 0, 9, tmpq+ 6*%%str
+ VP9_RND_SH_SUMSUB_BA 3, 4, 1, 7, 0, [pd_8192] ; m3=t10[w], m4=t14[w]
+ UNSCRATCH 0, 9, tmpq+ 6*%%str
+ VP9_RND_SH_SUMSUB_BA 2, 5, 0, 6, 1, [pd_8192] ; m2=t11[w], m5=t15[w]
+
+ ; m15|r12=t8, m14|r3=t9, m3=t10, m2=t11, m11|r15=t12, m10|r0=t13, m4=t14, m5=t15
+
+ UNSCRATCH 6, 14, tmpq+ 3*%%str
+ UNSCRATCH 7, 15, tmpq+12*%%str
+
+ SUMSUB_BA w, 3, 7, 1
+ PSIGNW m3, [pw_m1] ; m3=out1[w], m7=t10[w]
+ SUMSUB_BA w, 2, 6, 1 ; m2=out14[w], m6=t11[w]
+
+ ; unfortunately, the code below overflows in some cases, e.g.
+ ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8.webm
+%if 0; cpuflag(ssse3)
+ SUMSUB_BA w, 7, 6, 1
+ pmulhrsw m7, [pw_11585x2] ; m7=out6[w]
+ pmulhrsw m6, [pw_11585x2] ; m6=out9[w]
+%else
+ VP9_UNPACK_MULSUB_2W_4X 6, 7, 11585, 11585, [pd_8192], 1, 0
+%endif
+
+ mova [tmpq+ 3*%%str], m6
+ mova [tmpq+ 6*%%str], m7
+ UNSCRATCH 6, 10, tmpq+ 0*%%str
+ UNSCRATCH 7, 11, tmpq+15*%%str
+ mova [tmpq+13*%%str], m2
+ SCRATCH 3, 11, tmpq+ 9*%%str
+
+ VP9_UNPACK_MULSUB_2D_4X 7, 6, 2, 3, 15137, 6270 ; m6/3=t13[d], m7/2=t12[d]
+ VP9_UNPACK_MULSUB_2D_4X 5, 4, 1, 0, 6270, 15137 ; m5/1=t14[d], m4/0=t15[d]
+ SCRATCH 0, 9, tmpq+ 2*%%str
+ VP9_RND_SH_SUMSUB_BA 5, 6, 1, 3, 0, [pd_8192] ; m5=out2[w], m6=t14[w]
+ UNSCRATCH 0, 9, tmpq+ 2*%%str
+ VP9_RND_SH_SUMSUB_BA 4, 7, 0, 2, 1, [pd_8192]
+ PSIGNW m4, [pw_m1] ; m4=out13[w], m7=t15[w]
+
+ ; unfortunately, the code below overflows in some cases
+%if 0; cpuflag(ssse3)
+ SUMSUB_BA w, 7, 6, 1
+ pmulhrsw m7, [pw_m11585x2] ; m7=out5[w]
+ pmulhrsw m6, [pw_11585x2] ; m6=out10[w]
+%else
+ PSIGNW m7, [pw_m1]
+ VP9_UNPACK_MULSUB_2W_4X 7, 6, 11585, 11585, [pd_8192], 1, 0
+%endif
+
+ ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, m6=out10, m4=out13, r2=out14
+
+ mova m2, [tmpq+ 8*%%str]
+ mova m3, [tmpq+ 7*%%str]
+ mova m1, [tmpq+11*%%str]
+ mova [tmpq+ 7*%%str], m6
+ mova [tmpq+11*%%str], m4
+ mova m4, [tmpq+ 5*%%str]
+ SCRATCH 5, 14, tmpq+ 5*%%str
+ SCRATCH 7, 15, tmpq+ 8*%%str
+ UNSCRATCH 6, 8, tmpq+ 4*%%str
+ UNSCRATCH 5, 12, tmpq+ 1*%%str
+ UNSCRATCH 7, 13, tmpq+14*%%str
+
+ ; m2=t0, m3=t1, m9=t2, m0=t3, m1=t4, m8=t5, m13=t6, m12=t7
+ ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14
+
+ SUMSUB_BA w, 1, 2, 0 ; m1=t0[w], m2=t4[w]
+ mova m0, [tmpq+10*%%str]
+ SCRATCH 1, 12, tmpq+ 1*%%str
+ SUMSUB_BA w, 6, 3, 1 ; m8=t1[w], m3=t5[w]
+ SCRATCH 6, 13, tmpq+ 4*%%str
+ SUMSUB_BA w, 7, 4, 1 ; m13=t2[w], m9=t6[w]
+ SCRATCH 7, 8, tmpq+10*%%str
+ SUMSUB_BA w, 5, 0, 1 ; m12=t3[w], m0=t7[w]
+ SCRATCH 5, 9, tmpq+14*%%str
+
+ VP9_UNPACK_MULSUB_2D_4X 2, 3, 7, 5, 15137, 6270 ; m2/6=t5[d], m3/10=t4[d]
+ VP9_UNPACK_MULSUB_2D_4X 0, 4, 1, 6, 6270, 15137 ; m0/14=t6[d], m9/15=t7[d]
+ SCRATCH 6, 10, tmpq+ 0*%%str
+ VP9_RND_SH_SUMSUB_BA 0, 3, 1, 5, 6, [pd_8192]
+ UNSCRATCH 6, 10, tmpq+ 0*%%str
+ PSIGNW m0, [pw_m1] ; m0=out3[w], m3=t6[w]
+ VP9_RND_SH_SUMSUB_BA 4, 2, 6, 7, 5, [pd_8192] ; m9=out12[w], m2=t7[w]
+
+ UNSCRATCH 1, 8, tmpq+10*%%str
+ UNSCRATCH 5, 9, tmpq+14*%%str
+ UNSCRATCH 6, 12, tmpq+ 1*%%str
+ UNSCRATCH 7, 13, tmpq+ 4*%%str
+ SCRATCH 4, 9, tmpq+14*%%str
+
+ SUMSUB_BA w, 1, 6, 4 ; m13=out0[w], m1=t2[w]
+ SUMSUB_BA w, 5, 7, 4
+ PSIGNW m5, [pw_m1] ; m12=out15[w], m8=t3[w]
+
+ ; unfortunately, the code below overflows in some cases, e.g.
+ ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm
+%if 0 ; cpuflag(ssse3)
+ SUMSUB_BA w, 7, 6, 4
+ pmulhrsw m7, [pw_m11585x2] ; m8=out7[w]
+ pmulhrsw m6, [pw_11585x2] ; m1=out8[w]
+ SWAP 6, 7
+ SUMSUB_BA w, 3, 2, 4
+ pmulhrsw m3, [pw_11585x2] ; m3=out4[w]
+ pmulhrsw m2, [pw_11585x2] ; m2=out11[w]
+%else
+ SCRATCH 5, 8, tmpq+10*%%str
+ VP9_UNPACK_MULSUB_2W_4X 6, 7, 11585, m11585, [pd_8192], 5, 4
+ VP9_UNPACK_MULSUB_2W_4X 2, 3, 11585, 11585, [pd_8192], 5, 4
+ UNSCRATCH 5, 8, tmpq+10*%%str
+%endif
+
+ ; m13=out0, m0=out3, m3=out4, m8=out7, m1=out8, m2=out11, m9=out12, m12=out15
+ ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14
+
+%if %2 == 1
+%if ARCH_X86_64
+ mova m13, [tmpq+ 6*%%str]
+ TRANSPOSE8x8W 1, 11, 14, 0, 3, 15, 13, 6, 10
+ mova [tmpq+ 0*16], m1
+ mova [tmpq+ 2*16], m11
+ mova [tmpq+ 4*16], m14
+ mova [tmpq+ 6*16], m0
+ mova m1, [tmpq+ 3*%%str]
+ mova m11, [tmpq+ 7*%%str]
+ mova m14, [tmpq+11*%%str]
+ mova m0, [tmpq+13*%%str]
+ mova [tmpq+ 8*16], m3
+ mova [tmpq+10*16], m15
+ mova [tmpq+12*16], m13
+ mova [tmpq+14*16], m6
+
+ TRANSPOSE8x8W 7, 1, 11, 2, 9, 14, 0, 5, 10
+ mova [tmpq+ 1*16], m7
+ mova [tmpq+ 3*16], m1
+ mova [tmpq+ 5*16], m11
+ mova [tmpq+ 7*16], m2
+ mova [tmpq+ 9*16], m9
+ mova [tmpq+11*16], m14
+ mova [tmpq+13*16], m0
+ mova [tmpq+15*16], m5
+%else
+ mova [tmpq+12*%%str], m2
+ mova [tmpq+ 1*%%str], m5
+ mova [tmpq+15*%%str], m7
+ mova m2, [tmpq+ 9*%%str]
+ mova m5, [tmpq+ 5*%%str]
+ mova m7, [tmpq+ 8*%%str]
+ TRANSPOSE8x8W 1, 2, 5, 0, 3, 7, 4, 6, [tmpq+ 6*%%str], [tmpq+ 8*%%str], 1
+ mova [tmpq+ 0*16], m1
+ mova [tmpq+ 2*16], m2
+ mova [tmpq+ 4*16], m5
+ mova [tmpq+ 6*16], m0
+ mova [tmpq+10*16], m7
+ mova m3, [tmpq+12*%%str]
+ mova [tmpq+12*16], m4
+ mova m4, [tmpq+14*%%str]
+ mova [tmpq+14*16], m6
+
+ mova m0, [tmpq+15*%%str]
+ mova m1, [tmpq+ 3*%%str]
+ mova m2, [tmpq+ 7*%%str]
+ mova m5, [tmpq+11*%%str]
+ mova m7, [tmpq+ 1*%%str]
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+13*%%str], [tmpq+ 9*%%str], 1
+ mova [tmpq+ 1*16], m0
+ mova [tmpq+ 3*16], m1
+ mova [tmpq+ 5*16], m2
+ mova [tmpq+ 7*16], m3
+ mova [tmpq+11*16], m5
+ mova [tmpq+13*16], m6
+ mova [tmpq+15*16], m7
+%endif
+%else
+ pxor m4, m4
+
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_512]
+%else
+%define ROUND_REG [pw_32]
+%endif
+
+%if ARCH_X86_64
+ mova m12, [tmpq+ 6*%%str]
+ VP9_IDCT8_WRITEx2 1, 11, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 14, 0, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 3, 15, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 12, 6, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ mova m1, [tmpq+ 3*%%str]
+ mova m11, [tmpq+ 7*%%str]
+ mova m14, [tmpq+11*%%str]
+ mova m0, [tmpq+13*%%str]
+
+ VP9_IDCT8_WRITEx2 7, 1, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 11, 2, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 9, 14, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 0, 5, 10, 8, 4, ROUND_REG, 6
+%else
+ mova [tmpq+ 0*%%str], m2
+ mova [tmpq+ 1*%%str], m5
+ mova [tmpq+ 2*%%str], m7
+ mova m2, [tmpq+ 9*%%str]
+ VP9_IDCT8_WRITEx2 1, 2, 5, 7, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ mova m5, [tmpq+ 5*%%str]
+ VP9_IDCT8_WRITEx2 5, 0, 1, 2, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ mova m5, [tmpq+ 8*%%str]
+ VP9_IDCT8_WRITEx2 3, 5, 1, 2, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ mova m5, [tmpq+ 6*%%str]
+ VP9_IDCT8_WRITEx2 5, 6, 1, 2, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ mova m0, [tmpq+ 2*%%str]
+ mova m3, [tmpq+ 3*%%str]
+ VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ mova m0, [tmpq+ 7*%%str]
+ mova m3, [tmpq+ 0*%%str]
+ VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ mova m0, [tmpq+14*%%str]
+ mova m3, [tmpq+11*%%str]
+ VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ mova m0, [tmpq+13*%%str]
+ mova m3, [tmpq+ 1*%%str]
+ VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6
+%endif
+
+ SWAP 0, 4 ; zero
+%undef ROUND_REG
+%endif
+%endmacro
+
+%macro IADST16_FN 5
+INIT_XMM %5
+cglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp
+ mov cntd, 2
+ mov tmpq, rsp
+.loop1_full:
+ VP9_%2_1D blockq, 1
+ add blockq, 16
+ add tmpq, 256
+ dec cntd
+ jg .loop1_full
+ sub blockq, 32
+
+ mov cntd, 2
+ mov tmpq, rsp
+ mov dst_bakq, dstq
+.loop2_full:
+ VP9_%4_1D tmpq, 2
+ lea dstq, [dst_bakq+8]
+ add tmpq, 16
+ dec cntd
+ jg .loop2_full
+
+ ; at the end of the loop, m0 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 32, 16, m0
+ RET
+%endmacro
+
+IADST16_FN idct, IDCT16, iadst, IADST16, sse2
+IADST16_FN iadst, IADST16, idct, IDCT16, sse2
+IADST16_FN iadst, IADST16, iadst, IADST16, sse2
+IADST16_FN idct, IDCT16, iadst, IADST16, ssse3
+IADST16_FN iadst, IADST16, idct, IDCT16, ssse3
+IADST16_FN iadst, IADST16, iadst, IADST16, ssse3
+IADST16_FN idct, IDCT16, iadst, IADST16, avx
+IADST16_FN iadst, IADST16, idct, IDCT16, avx
+IADST16_FN iadst, IADST16, iadst, IADST16, avx
+
+; in: data in m[0-15] except m0/m4, which are in [blockq+0] and [blockq+128]
+; out: m[0-15] except m6, which is in [blockq+192]
+; uses blockq as scratch space
+%macro VP9_IADST16_YMM_1D 0
+ mova [blockq+ 32], m3
+ mova [blockq+ 64], m7
+ mova [blockq+ 96], m8
+
+ ; first half of round 1
+ VP9_UNPACK_MULSUB_2D_4X 9, 6, 0, 3, 13160, 9760 ; m9/x=t7[d], m6/x=t6[d]
+ VP9_UNPACK_MULSUB_2D_4X 1, 14, 4, 7, 2404, 16207 ; m1/x=t15[d], m14/x=t14[d]
+ VP9_RND_SH_SUMSUB_BA 14, 6, 7, 3, 8, [pd_8192] ; m14=t6[w], m6=t14[w]
+ VP9_RND_SH_SUMSUB_BA 1, 9, 4, 0, 8, [pd_8192] ; m1=t7[w], m9=t15[w]
+
+ VP9_UNPACK_MULSUB_2D_4X 13, 2, 4, 7, 15893, 3981 ; m13/x=t3[d], m2/x=t2[d]
+ VP9_UNPACK_MULSUB_2D_4X 5, 10, 0, 3, 8423, 14053 ; m5/x=t11[d], m10/x=t10[d]
+ VP9_RND_SH_SUMSUB_BA 10, 2, 3, 7, 8, [pd_8192] ; m10=t2[w], m2=t10[w]
+ VP9_RND_SH_SUMSUB_BA 5, 13, 0, 4, 8, [pd_8192] ; m5=t3[w], m13=t11[w]
+
+ ; half of round 2 t8-15
+ VP9_UNPACK_MULSUB_2D_4X 2, 13, 4, 7, 9102, 13623 ; m2/x=t11[d], m13/x=t10[d]
+ VP9_UNPACK_MULSUB_2D_4X 9, 6, 3, 0, 13623, 9102 ; m9/x=t14[d], m6/x=t15[d]
+ VP9_RND_SH_SUMSUB_BA 9, 13, 3, 7, 8, [pd_8192] ; m9=t10[w], m13=t14[w]
+ VP9_RND_SH_SUMSUB_BA 6, 2, 0, 4, 8, [pd_8192] ; m6=t11[w], m2=t15[w]
+
+ SUMSUB_BA w, 14, 10, 8 ; m14=t2, m10=t6
+ SUMSUB_BA w, 1, 5, 8 ; m1=t3, m5=t7
+
+ mova m0, [blockq+ 0]
+ mova m4, [blockq+128]
+ mova m3, [blockq+ 32]
+ mova m7, [blockq+ 64]
+ mova m8, [blockq+ 96]
+ mova [blockq+ 0], m1
+ mova [blockq+128], m14
+ mova [blockq+ 32], m6
+ mova [blockq+ 64], m9
+ mova [blockq+ 96], m10
+
+ ; second half of round 1
+ VP9_UNPACK_MULSUB_2D_4X 15, 0, 1, 9, 16364, 804 ; m15/x=t1[d], m0/x=t0[d]
+ VP9_UNPACK_MULSUB_2D_4X 7, 8, 10, 6, 11003, 12140 ; m7/x=t9[d], m8/x=t8[d]
+ VP9_RND_SH_SUMSUB_BA 8, 0, 6, 9, 14, [pd_8192] ; m8=t0[w], m0=t8[w]
+ VP9_RND_SH_SUMSUB_BA 7, 15, 10, 1, 14, [pd_8192] ; m7=t1[w], m15=t9[w]
+
+ VP9_UNPACK_MULSUB_2D_4X 11, 4, 10, 6, 14811, 7005 ; m11/x=t5[d], m4/x=t4[d]
+ VP9_UNPACK_MULSUB_2D_4X 3, 12, 1, 9, 5520, 15426 ; m3/x=t13[d], m12/x=t12[d]
+ VP9_RND_SH_SUMSUB_BA 12, 4, 9, 6, 14, [pd_8192] ; m12=t4[w], m4=t12[w]
+ VP9_RND_SH_SUMSUB_BA 3, 11, 1, 10, 14, [pd_8192] ; m3=t5[w], m11=t13[w]
+
+ ; second half of round 2 t8-15
+ VP9_UNPACK_MULSUB_2D_4X 0, 15, 6, 10, 16069, 3196 ; m15/x=t8[d], m0/x=t9[d]
+ VP9_UNPACK_MULSUB_2D_4X 11, 4, 9, 1, 3196, 16069 ; m11/x=t12[d], m4/x=t13[d]
+ VP9_RND_SH_SUMSUB_BA 11, 15, 9, 10, 14, [pd_8192] ; m11=t8[w], m15=t12[w]
+ VP9_RND_SH_SUMSUB_BA 4, 0, 1, 6, 14, [pd_8192] ; m4=t9[w], m0=t13[w]
+
+ SUMSUB_BA w, 12, 8, 14 ; m12=t0, m8=t4
+ SUMSUB_BA w, 3, 7, 14 ; m3=t1, m7=t5
+
+ mova m10, [blockq+ 96]
+ mova [blockq+ 96], m12
+
+ ; round 3
+ VP9_UNPACK_MULSUB_2D_4X 15, 0, 9, 12, 15137, 6270 ; m15/x=t13[d], m0/x=t12[d]
+ VP9_UNPACK_MULSUB_2D_4X 2, 13, 1, 6, 6270, 15137 ; m2/x=t14[d], m13/x=t15[d]
+ VP9_RND_SH_SUMSUB_BA 2, 0, 1, 12, 14, [pd_8192] ; m2=out2[w], m0=t14a[w]
+ VP9_RND_SH_SUMSUB_BA 13, 15, 6, 9, 14, [pd_8192]
+ PSIGNW m13, [pw_m1] ; m13=out13[w], m15=t15a[w]
+
+ VP9_UNPACK_MULSUB_2D_4X 8, 7, 12, 9, 15137, 6270 ; m8/x=t5[d], m7/x=t4[d]
+ VP9_UNPACK_MULSUB_2D_4X 5, 10, 1, 6, 6270, 15137 ; m5/x=t6[d], m10/x=t7[d]
+ VP9_RND_SH_SUMSUB_BA 5, 7, 1, 9, 14, [pd_8192]
+ PSIGNW m5, [pw_m1] ; m5=out3[w], m7=t6[w]
+ VP9_RND_SH_SUMSUB_BA 10, 8, 6, 12, 14, [pd_8192] ; m10=out12[w], m8=t7[w]
+
+ mova m1, [blockq+ 0]
+ mova m14, [blockq+128]
+ mova m6, [blockq+ 32]
+ mova m9, [blockq+ 64]
+ mova m12, [blockq+ 96]
+ mova [blockq+ 0], m10
+ mova [blockq+128], m5
+
+ SUMSUB_BA w, 14, 12, 5 ; m14=out0, m12=t2a
+ SUMSUB_BA w, 1, 3, 5
+ PSIGNW m1, [pw_m1] ; m1=out15, m3=t3a
+
+ SUMSUB_BA w, 9, 11, 5
+ PSIGNW m9, [pw_m1] ; m9=out1, m11=t10
+ SUMSUB_BA w, 6, 4, 5 ; m6=out14, m4=t11
+
+ VP9_UNPACK_MULSUB_2W_4X 4, 11, 11585, 11585, [pd_8192], 5, 10 ; m4=out9, m11=out6
+ mova m5, [blockq+128]
+ mova [blockq+192], m11
+ PSIGNW m15, [pw_m1]
+ VP9_UNPACK_MULSUB_2W_4X 15, 0, 11585, 11585, [pd_8192], 10, 11 ; m15=out5, m0=out10
+
+ PSIGNW m3, [pw_m1]
+ VP9_UNPACK_MULSUB_2W_4X 3, 12, 11585, 11585, [pd_8192], 10, 11 ; m3=out7,m12=out8
+ VP9_UNPACK_MULSUB_2W_4X 8, 7, 11585, 11585, [pd_8192], 10, 11 ; m8=out11,m7=out4
+
+ mova m10, [blockq+ 0]
+
+ SWAP 0, 14, 6, 11, 8, 12, 10
+ SWAP 1, 9, 15, 4, 7, 3, 5
+ SWAP 5, 9, 15
+%endmacro
+
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+%macro IADST16_YMM_FN 4
+INIT_YMM avx2
+cglobal vp9_%1_%3_16x16_add, 4, 4, 16, dst, stride, block, eob
+ mova m1, [blockq+ 32]
+ mova m2, [blockq+ 64]
+ mova m3, [blockq+ 96]
+ mova m5, [blockq+160]
+ mova m6, [blockq+192]
+ mova m7, [blockq+224]
+ mova m8, [blockq+256]
+ mova m9, [blockq+288]
+ mova m10, [blockq+320]
+ mova m11, [blockq+352]
+ mova m12, [blockq+384]
+ mova m13, [blockq+416]
+ mova m14, [blockq+448]
+ mova m15, [blockq+480]
+
+ VP9_%2_YMM_1D
+ TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
+ [blockq+192], [blockq+128], 1
+ mova [blockq+ 0], m0
+ VP9_%4_YMM_1D
+
+ mova [blockq+224], m7
+
+ ; store
+ VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ mova m6, [blockq+192]
+ mova m7, [blockq+224]
+ VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+
+ ; at the end of the loop, m0 should still be zero
+ ; use that to zero out block coefficients
+ pxor m0, m0
+ ZERO_BLOCK blockq, 32, 16, m0
+ RET
+%endmacro
+
+IADST16_YMM_FN idct, IDCT16, iadst, IADST16
+IADST16_YMM_FN iadst, IADST16, idct, IDCT16
+IADST16_YMM_FN iadst, IADST16, iadst, IADST16
+%endif
+
+;---------------------------------------------------------------------------------------------
+; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+%macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc
+%if %2 == 1
+%assign %%str mmsize
+%else
+%assign %%str 64
+%endif
+
+ ; first do t0-15; this can be done identically to the idct16x16
+ VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq, 2*%%str, 1
+
+ ; store everything on stack to make space available for t16-31
+ ; we store interleaved with the output of the second half (t16-31)
+ ; so we don't need to allocate extra stack space
+ mova [tmpq+ 0*%%str], m0 ; t0
+ mova [tmpq+ 4*%%str], m1 ; t1
+ mova [tmpq+ 8*%%str], m2 ; t2
+ mova [tmpq+12*%%str], m3 ; t3
+ mova [tmpq+16*%%str], m4 ; t4
+ mova [tmpq+20*%%str], m5 ; t5
+%if ARCH_X86_64
+ mova [tmpq+22*%%str], m10 ; t10
+ mova [tmpq+18*%%str], m11 ; t11
+ mova [tmpq+14*%%str], m12 ; t12
+ mova [tmpq+10*%%str], m13 ; t13
+ mova [tmpq+ 6*%%str], m14 ; t14
+ mova [tmpq+ 2*%%str], m15 ; t15
+%endif
+
+ mova m0, [tmpq+ 30*%%str]
+ UNSCRATCH 1, 6, tmpq+26*%%str
+ UNSCRATCH 2, 8, tmpq+24*%%str
+ UNSCRATCH 3, 9, tmpq+28*%%str
+ SUMSUB_BA w, 1, 3, 4 ; t6, t9
+ SUMSUB_BA w, 0, 2, 4 ; t7, t8
+
+ mova [tmpq+24*%%str], m1 ; t6
+ mova [tmpq+28*%%str], m0 ; t7
+ mova [tmpq+30*%%str], m2 ; t8
+ mova [tmpq+26*%%str], m3 ; t9
+
+ ; then, secondly, do t16-31
+%if %3 <= 8
+ mova m4, [%1+ 1*64]
+ mova m7, [%1+ 7*64]
+
+ pmulhrsw m1, m4, [pw_16364x2] ;t31
+ pmulhrsw m4, [pw_804x2] ;t16
+
+ VP9_UNPACK_MULSUB_2W_4X 5, 0, 1, 4, 16069, 3196, [pd_8192], 6, 2 ; t17, t30
+
+ pmulhrsw m3, m7, [pw_m5520x2] ;t19
+ pmulhrsw m7, [pw_15426x2] ;t28
+
+ SCRATCH 4, 13, tmpq+ 1*%%str
+ SCRATCH 5, 12, tmpq+15*%%str
+
+ VP9_UNPACK_MULSUB_2W_4X 2, 6, 7, 3, 3196, m16069, [pd_8192], 4, 5 ; t18, t29
+%else
+ mova m0, [%1+ 1*64]
+ mova m1, [%1+15*64]
+%if %3 <= 16
+ pmulhrsw m5, m0, [pw_16364x2]
+ pmulhrsw m0, [pw_804x2]
+ pmulhrsw m4, m1, [pw_m11003x2]
+ pmulhrsw m1, [pw_12140x2]
+%else
+ mova m4, [%1+17*64]
+ mova m5, [%1+31*64]
+
+ VP9_UNPACK_MULSUB_2W_4X 0, 5, 16364, 804, [pd_8192], 2, 3 ; t16, t31
+ VP9_UNPACK_MULSUB_2W_4X 4, 1, 11003, 12140, [pd_8192], 2, 3 ; t17, t30
+%endif
+ SUMSUB_BA w, 4, 0, 2
+ SUMSUB_BA w, 1, 5, 2
+
+ VP9_UNPACK_MULSUB_2W_4X 5, 0, 16069, 3196, [pd_8192], 2, 3 ; t17, t30
+
+ SCRATCH 4, 13, tmpq+ 1*%%str
+ SCRATCH 5, 12, tmpq+15*%%str
+
+ mova m2, [%1+ 7*64]
+ mova m3, [%1+ 9*64]
+%if %3 <= 16
+ pmulhrsw m7, m3, [pw_14811x2]
+ pmulhrsw m3, [pw_7005x2]
+ pmulhrsw m6, m2, [pw_m5520x2]
+ pmulhrsw m2, [pw_15426x2]
+%else
+ mova m7, [%1+23*64]
+ mova m6, [%1+25*64]
+
+ VP9_UNPACK_MULSUB_2W_4X 3, 7, 14811, 7005, [pd_8192], 4, 5 ; t18, t29
+ VP9_UNPACK_MULSUB_2W_4X 6, 2, 5520, 15426, [pd_8192], 4, 5 ; t19, t28
+%endif
+ SUMSUB_BA w, 3, 6, 4
+ SUMSUB_BA w, 7, 2, 4
+
+ VP9_UNPACK_MULSUB_2W_4X 2, 6, 3196, m16069, [pd_8192], 4, 5 ; t18, t29
+%endif
+
+ UNSCRATCH 5, 12, tmpq+15*%%str
+ SUMSUB_BA w, 6, 0, 4
+ mova [tmpq+25*%%str], m6 ; t19
+ UNSCRATCH 4, 13, tmpq+ 1*%%str
+ SUMSUB_BA w, 7, 1, 6
+ SUMSUB_BA w, 3, 4, 6
+ mova [tmpq+23*%%str], m3 ; t16
+ SUMSUB_BA w, 2, 5, 6
+
+ VP9_UNPACK_MULSUB_2W_4X 0, 5, 15137, 6270, [pd_8192], 6, 3 ; t18, t29
+ VP9_UNPACK_MULSUB_2W_4X 1, 4, 15137, 6270, [pd_8192], 6, 3 ; t19, t28
+
+ SCRATCH 0, 10, tmpq+ 1*%%str
+ SCRATCH 1, 11, tmpq+ 7*%%str
+ SCRATCH 2, 9, tmpq+ 9*%%str
+ SCRATCH 4, 14, tmpq+15*%%str
+ SCRATCH 5, 15, tmpq+17*%%str
+ SCRATCH 7, 13, tmpq+31*%%str
+
+%if %3 <= 8
+ mova m0, [%1+ 5*64]
+ mova m3, [%1+ 3*64]
+
+ pmulhrsw m5, m0, [pw_15893x2] ;t27
+ pmulhrsw m0, [pw_3981x2] ;t20
+
+ VP9_UNPACK_MULSUB_2W_4X 1, 4, 5, 0, 9102, 13623, [pd_8192], 7, 2 ; t21, t26
+
+ pmulhrsw m6, m3, [pw_m2404x2] ;t23
+ pmulhrsw m3, [pw_16207x2] ;t24
+
+ SCRATCH 5, 8, tmpq+ 5*%%str
+ SCRATCH 4, 12, tmpq+11*%%str
+
+ VP9_UNPACK_MULSUB_2W_4X 7, 2, 3, 6, 13623, m9102, [pd_8192], 4, 5 ; t22, t25
+%else
+ mova m4, [%1+ 5*64]
+ mova m5, [%1+11*64]
+%if %3 <= 16
+ pmulhrsw m1, m4, [pw_15893x2]
+ pmulhrsw m4, [pw_3981x2]
+ pmulhrsw m0, m5, [pw_m8423x2]
+ pmulhrsw m5, [pw_14053x2]
+%else
+ mova m0, [%1+21*64]
+ mova m1, [%1+27*64]
+
+ VP9_UNPACK_MULSUB_2W_4X 4, 1, 15893, 3981, [pd_8192], 2, 3 ; t20, t27
+ VP9_UNPACK_MULSUB_2W_4X 0, 5, 8423, 14053, [pd_8192], 2, 3 ; t21, t26
+%endif
+ SUMSUB_BA w, 0, 4, 2
+ SUMSUB_BA w, 5, 1, 2
+
+ VP9_UNPACK_MULSUB_2W_4X 1, 4, 9102, 13623, [pd_8192], 2, 3 ; t21, t26
+
+ SCRATCH 5, 8, tmpq+ 5*%%str
+ SCRATCH 4, 12, tmpq+11*%%str
+
+ mova m7, [%1+ 3*64]
+ mova m6, [%1+13*64]
+%if %3 <= 16
+ pmulhrsw m3, m6, [pw_13160x2]
+ pmulhrsw m6, [pw_9760x2]
+ pmulhrsw m2, m7, [pw_m2404x2]
+ pmulhrsw m7, [pw_16207x2]
+%else
+ mova m2, [%1+29*64]
+ mova m3, [%1+19*64]
+ VP9_UNPACK_MULSUB_2W_4X 6, 3, 13160, 9760, [pd_8192], 4, 5 ; t22, t25
+ VP9_UNPACK_MULSUB_2W_4X 2, 7, 2404, 16207, [pd_8192], 4, 5 ; t23, t24
+%endif
+ SUMSUB_BA w, 6, 2, 4
+ SUMSUB_BA w, 3, 7, 4
+
+ VP9_UNPACK_MULSUB_2W_4X 7, 2, 13623, m9102, [pd_8192], 4, 5 ; t22, t25
+%endif
+
+ ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
+ ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
+
+ UNSCRATCH 4, 12, tmpq+11*%%str
+ SUMSUB_BA w, 0, 6, 5
+ SUMSUB_BA w, 4, 2, 5
+ UNSCRATCH 5, 8, tmpq+ 5*%%str
+ SCRATCH 4, 8, tmpq+11*%%str
+ SUMSUB_BA w, 1, 7, 4
+ SUMSUB_BA w, 5, 3, 4
+ SCRATCH 5, 12, tmpq+ 5*%%str
+
+ VP9_UNPACK_MULSUB_2W_4X 3, 6, 6270, m15137, [pd_8192], 4, 5 ; t20, t27
+ VP9_UNPACK_MULSUB_2W_4X 2, 7, 6270, m15137, [pd_8192], 4, 5 ; t21, t26
+
+ ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23,
+ ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31
+
+ UNSCRATCH 5, 9, tmpq+ 9*%%str
+ mova m4, [tmpq+23*%%str] ; t16
+%if ARCH_X86_64
+ SUMSUB_BA w, 1, 5, 9
+ SUMSUB_BA w, 0, 4, 9
+%else
+ SUMSUB_BADC w, 1, 5, 0, 4
+%endif
+ mova [tmpq+29*%%str], m1 ; t17
+ mova [tmpq+21*%%str], m0 ; t16
+ UNSCRATCH 0, 10, tmpq+ 1*%%str
+ UNSCRATCH 1, 11, tmpq+ 7*%%str
+%if ARCH_X86_64
+ SUMSUB_BA w, 2, 0, 9
+ SUMSUB_BA w, 3, 1, 9
+%else
+ SUMSUB_BADC w, 2, 0, 3, 1
+%endif
+ mova [tmpq+ 9*%%str], m2 ; t18
+ mova [tmpq+13*%%str], m3 ; t19
+ SCRATCH 0, 10, tmpq+23*%%str
+ SCRATCH 1, 11, tmpq+27*%%str
+
+ UNSCRATCH 2, 14, tmpq+15*%%str
+ UNSCRATCH 3, 15, tmpq+17*%%str
+ SUMSUB_BA w, 6, 2, 0
+ SUMSUB_BA w, 7, 3, 0
+ SCRATCH 6, 14, tmpq+ 3*%%str
+ SCRATCH 7, 15, tmpq+ 7*%%str
+
+ UNSCRATCH 0, 8, tmpq+11*%%str
+ mova m1, [tmpq+25*%%str] ; t19
+ UNSCRATCH 6, 12, tmpq+ 5*%%str
+ UNSCRATCH 7, 13, tmpq+31*%%str
+%if ARCH_X86_64
+ SUMSUB_BA w, 0, 1, 9
+ SUMSUB_BA w, 6, 7, 9
+%else
+ SUMSUB_BADC w, 0, 1, 6, 7
+%endif
+
+ ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23,
+ ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31
+
+%if 0; cpuflag(ssse3)
+%if ARCH_X86_64
+ SUMSUB_BA w, 4, 7, 8
+ SUMSUB_BA w, 5, 1, 8
+%else
+ SUMSUB_BADC w, 4, 7, 5, 1
+%endif
+
+ pmulhrsw m7, [pw_11585x2]
+ pmulhrsw m4, [pw_11585x2]
+ pmulhrsw m1, [pw_11585x2]
+ pmulhrsw m5, [pw_11585x2]
+
+ mova [tmpq+ 5*%%str], m7 ; t23
+ SCRATCH 1, 13, tmpq+25*%%str
+ UNSCRATCH 7, 10, tmpq+23*%%str
+ UNSCRATCH 1, 11, tmpq+27*%%str
+
+%if ARCH_X86_64
+ SUMSUB_BA w, 7, 3, 10
+ SUMSUB_BA w, 1, 2, 10
+%else
+ SUMSUB_BADC w, 7, 3, 1, 2
+%endif
+
+ pmulhrsw m3, [pw_11585x2]
+ pmulhrsw m7, [pw_11585x2]
+ pmulhrsw m2, [pw_11585x2]
+ pmulhrsw m1, [pw_11585x2]
+%else
+ SCRATCH 0, 8, tmpq+15*%%str
+ SCRATCH 6, 9, tmpq+17*%%str
+ VP9_UNPACK_MULSUB_2W_4X 7, 4, 11585, 11585, [pd_8192], 0, 6
+ mova [tmpq+ 5*%%str], m7 ; t23
+ UNSCRATCH 7, 10, tmpq+23*%%str
+ VP9_UNPACK_MULSUB_2W_4X 1, 5, 11585, 11585, [pd_8192], 0, 6
+ SCRATCH 1, 13, tmpq+25*%%str
+ UNSCRATCH 1, 11, tmpq+27*%%str
+ VP9_UNPACK_MULSUB_2W_4X 3, 7, 11585, 11585, [pd_8192], 0, 6
+ VP9_UNPACK_MULSUB_2W_4X 2, 1, 11585, 11585, [pd_8192], 0, 6
+ UNSCRATCH 0, 8, tmpq+15*%%str
+ UNSCRATCH 6, 9, tmpq+17*%%str
+%endif
+
+ ; m0=t16, m1=t17, m2=t18, m3=t19, m4=t20, m5=t21, m6=t22, m7=t23,
+ ; m8=t24, m9=t25, m10=t26, m11=t27, m12=t28, m13=t29, m14=t30, m15=t31
+
+ ; then do final pass to sumsub+store the two halves
+%if %2 == 1
+ mova [tmpq+17*%%str], m2 ; t20
+ mova [tmpq+ 1*%%str], m3 ; t21
+%if ARCH_X86_64
+ mova [tmpq+25*%%str], m13 ; t22
+
+ mova m8, [tmpq+ 0*%%str] ; t0
+ mova m9, [tmpq+ 4*%%str] ; t1
+ mova m12, [tmpq+ 8*%%str] ; t2
+ mova m11, [tmpq+12*%%str] ; t3
+ mova m2, [tmpq+16*%%str] ; t4
+ mova m3, [tmpq+20*%%str] ; t5
+ mova m13, [tmpq+24*%%str] ; t6
+
+ SUMSUB_BA w, 6, 8, 10
+ mova [tmpq+ 3*%%str], m8 ; t15
+ SUMSUB_BA w, 0, 9, 8
+ SUMSUB_BA w, 15, 12, 8
+ SUMSUB_BA w, 14, 11, 8
+ SUMSUB_BA w, 1, 2, 8
+ SUMSUB_BA w, 7, 3, 8
+ SUMSUB_BA w, 5, 13, 8
+ mova m10, [tmpq+28*%%str] ; t7
+ SUMSUB_BA w, 4, 10, 8
+%if cpuflag(avx2)
+ ; the awkward part of this idct is that the final pass does the outermost
+ ; interleave sumsubs (t0/31, t1/30, etc), but the tN for the 16x16 transpose
+ ; need to be sequential, which means we have to load/store half of the sumsub
+ ; intermediates back to/from memory to get a 16x16 transpose going...
+ ; This would be easier if we had more (e.g. 32) YMM regs here.
+ mova [tmpq+ 7*%%str], m9
+ mova [tmpq+11*%%str], m12
+ mova [tmpq+15*%%str], m11
+ mova [tmpq+19*%%str], m2
+ mova [tmpq+23*%%str], m3
+ mova [tmpq+27*%%str], m13
+ mova [tmpq+31*%%str], m10
+ mova [tmpq+12*%%str], m5
+
+ mova m13, [tmpq+30*%%str] ; t8
+ mova m12, [tmpq+26*%%str] ; t9
+ mova m11, [tmpq+22*%%str] ; t10
+ mova m10, [tmpq+18*%%str] ; t11
+ mova m9, [tmpq+17*%%str] ; t20
+ mova m8, [tmpq+ 1*%%str] ; t21
+ mova m3, [tmpq+25*%%str] ; t22
+ mova m2, [tmpq+ 5*%%str] ; t23
+
+ SUMSUB_BA w, 9, 10, 5
+ SUMSUB_BA w, 8, 11, 5
+ SUMSUB_BA w, 3, 12, 5
+ SUMSUB_BA w, 2, 13, 5
+ mova [tmpq+ 1*%%str], m10
+ mova [tmpq+ 5*%%str], m11
+ mova [tmpq+17*%%str], m12
+ mova [tmpq+25*%%str], m13
+
+ mova m13, [tmpq+14*%%str] ; t12
+ mova m12, [tmpq+10*%%str] ; t13
+ mova m11, [tmpq+ 9*%%str] ; t18
+ mova m10, [tmpq+13*%%str] ; t19
+
+ SUMSUB_BA w, 11, 12, 5
+ SUMSUB_BA w, 10, 13, 5
+ mova [tmpq+ 9*%%str], m13
+ mova [tmpq+13*%%str], m12
+ mova [tmpq+10*%%str], m10
+ mova [tmpq+14*%%str], m11
+
+ mova m13, [tmpq+ 6*%%str] ; t14
+ mova m12, [tmpq+ 2*%%str] ; t15
+ mova m11, [tmpq+21*%%str] ; t16
+ mova m10, [tmpq+29*%%str] ; t17
+ SUMSUB_BA w, 11, 12, 5
+ SUMSUB_BA w, 10, 13, 5
+ mova [tmpq+21*%%str], m12
+ mova [tmpq+29*%%str], m13
+ mova m12, [tmpq+10*%%str]
+ mova m13, [tmpq+14*%%str]
+
+ TRANSPOSE16x16W 6, 0, 15, 14, 1, 7, 5, 4, \
+ 2, 3, 8, 9, 12, 13, 10, 11, \
+ [tmpq+12*%%str], [tmpq+ 8*%%str], 1
+ mova [tmpq+ 0*%%str], m6
+ mova [tmpq+ 2*%%str], m0
+ mova [tmpq+ 4*%%str], m15
+ mova [tmpq+ 6*%%str], m14
+ mova [tmpq+10*%%str], m7
+ mova [tmpq+12*%%str], m5
+ mova [tmpq+14*%%str], m4
+ mova [tmpq+16*%%str], m2
+ mova [tmpq+18*%%str], m3
+ mova [tmpq+20*%%str], m8
+ mova [tmpq+22*%%str], m9
+ mova [tmpq+24*%%str], m12
+ mova [tmpq+26*%%str], m13
+ mova [tmpq+28*%%str], m10
+ mova [tmpq+30*%%str], m11
+
+ mova m0, [tmpq+21*%%str]
+ mova m1, [tmpq+29*%%str]
+ mova m2, [tmpq+13*%%str]
+ mova m3, [tmpq+ 9*%%str]
+ mova m4, [tmpq+ 1*%%str]
+ mova m5, [tmpq+ 5*%%str]
+ mova m7, [tmpq+25*%%str]
+ mova m8, [tmpq+31*%%str]
+ mova m9, [tmpq+27*%%str]
+ mova m10, [tmpq+23*%%str]
+ mova m11, [tmpq+19*%%str]
+ mova m12, [tmpq+15*%%str]
+ mova m13, [tmpq+11*%%str]
+ mova m14, [tmpq+ 7*%%str]
+ mova m15, [tmpq+ 3*%%str]
+ TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14, 15, \
+ [tmpq+17*%%str], [tmpq+ 9*%%str], 1
+ mova [tmpq+ 1*%%str], m0
+ mova [tmpq+ 3*%%str], m1
+ mova [tmpq+ 5*%%str], m2
+ mova [tmpq+ 7*%%str], m3
+ mova [tmpq+11*%%str], m5
+ mova [tmpq+13*%%str], m6
+ mova [tmpq+15*%%str], m7
+ mova [tmpq+17*%%str], m8
+ mova [tmpq+19*%%str], m9
+ mova [tmpq+21*%%str], m10
+ mova [tmpq+23*%%str], m11
+ mova [tmpq+25*%%str], m12
+ mova [tmpq+27*%%str], m13
+ mova [tmpq+29*%%str], m14
+ mova [tmpq+31*%%str], m15
+%else ; !avx2
+ TRANSPOSE8x8W 6, 0, 15, 14, 1, 7, 5, 4, 8
+ mova [tmpq+ 0*%%str], m6
+ mova [tmpq+ 4*%%str], m0
+ mova [tmpq+ 8*%%str], m15
+ mova [tmpq+12*%%str], m14
+ mova [tmpq+16*%%str], m1
+ mova [tmpq+20*%%str], m7
+ mova [tmpq+24*%%str], m5
+ mova [tmpq+28*%%str], m4
+
+ mova m8, [tmpq+ 3*%%str] ; t15
+ TRANSPOSE8x8W 10, 13, 3, 2, 11, 12, 9, 8, 0
+ mova [tmpq+ 3*%%str], m10
+ mova [tmpq+ 7*%%str], m13
+ mova [tmpq+11*%%str], m3
+ mova [tmpq+15*%%str], m2
+ mova [tmpq+19*%%str], m11
+ mova [tmpq+23*%%str], m12
+ mova [tmpq+27*%%str], m9
+ mova [tmpq+31*%%str], m8
+
+ mova m15, [tmpq+30*%%str] ; t8
+ mova m14, [tmpq+26*%%str] ; t9
+ mova m13, [tmpq+22*%%str] ; t10
+ mova m12, [tmpq+18*%%str] ; t11
+ mova m11, [tmpq+14*%%str] ; t12
+ mova m10, [tmpq+10*%%str] ; t13
+ mova m9, [tmpq+ 6*%%str] ; t14
+ mova m8, [tmpq+ 2*%%str] ; t15
+ mova m7, [tmpq+21*%%str] ; t16
+ mova m6, [tmpq+29*%%str] ; t17
+ mova m5, [tmpq+ 9*%%str] ; t18
+ mova m4, [tmpq+13*%%str] ; t19
+ mova m3, [tmpq+17*%%str] ; t20
+ mova m2, [tmpq+ 1*%%str] ; t21
+ mova m1, [tmpq+25*%%str] ; t22
+
+ SUMSUB_BA w, 7, 8, 0
+ mova [tmpq+ 2*%%str], m8
+ mova m0, [tmpq+ 5*%%str] ; t23
+ SUMSUB_BA w, 6, 9, 8
+ SUMSUB_BA w, 5, 10, 8
+ SUMSUB_BA w, 4, 11, 8
+ SUMSUB_BA w, 3, 12, 8
+ SUMSUB_BA w, 2, 13, 8
+ SUMSUB_BA w, 1, 14, 8
+ SUMSUB_BA w, 0, 15, 8
+
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+ mova [tmpq+ 1*%%str], m0
+ mova [tmpq+ 5*%%str], m1
+ mova [tmpq+ 9*%%str], m2
+ mova [tmpq+13*%%str], m3
+ mova [tmpq+17*%%str], m4
+ mova [tmpq+21*%%str], m5
+ mova [tmpq+25*%%str], m6
+ mova [tmpq+29*%%str], m7
+
+ mova m8, [tmpq+ 2*%%str]
+ TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0
+ mova [tmpq+ 2*%%str], m8
+ mova [tmpq+ 6*%%str], m9
+ mova [tmpq+10*%%str], m10
+ mova [tmpq+14*%%str], m11
+ mova [tmpq+18*%%str], m12
+ mova [tmpq+22*%%str], m13
+ mova [tmpq+26*%%str], m14
+ mova [tmpq+30*%%str], m15
+%endif ; avx2
+%else
+ mova m2, [tmpq+24*%%str] ; t6
+ mova m3, [tmpq+28*%%str] ; t7
+ SUMSUB_BADC w, 5, 2, 4, 3
+ mova [tmpq+24*%%str], m5
+ mova [tmpq+23*%%str], m2
+ mova [tmpq+28*%%str], m4
+ mova [tmpq+19*%%str], m3
+
+ mova m2, [tmpq+16*%%str] ; t4
+ mova m3, [tmpq+20*%%str] ; t5
+ SUMSUB_BA w, 1, 2, 5
+ SUMSUB_BA w, 7, 3, 5
+ mova [tmpq+15*%%str], m2
+ mova [tmpq+11*%%str], m3
+
+ mova m2, [tmpq+ 0*%%str] ; t0
+ mova m3, [tmpq+ 4*%%str] ; t1
+ SUMSUB_BA w, 6, 2, 5
+ SUMSUB_BA w, 0, 3, 5
+ mova [tmpq+31*%%str], m2
+ mova [tmpq+27*%%str], m3
+
+ mova m2, [tmpq+ 8*%%str] ; t2
+ mova m3, [tmpq+12*%%str] ; t3
+ mova m5, [tmpq+ 7*%%str]
+ mova m4, [tmpq+ 3*%%str]
+ SUMSUB_BADC w, 5, 2, 4, 3
+ mova [tmpq+ 7*%%str], m2
+ mova [tmpq+ 3*%%str], m3
+
+ mova m3, [tmpq+28*%%str]
+ TRANSPOSE8x8W 6, 0, 5, 4, 1, 7, 2, 3, [tmpq+24*%%str], [tmpq+16*%%str], 1
+ mova [tmpq+ 0*%%str], m6
+ mova [tmpq+ 4*%%str], m0
+ mova [tmpq+ 8*%%str], m5
+ mova [tmpq+12*%%str], m4
+ mova [tmpq+20*%%str], m7
+ mova [tmpq+24*%%str], m2
+ mova [tmpq+28*%%str], m3
+
+ mova m6, [tmpq+19*%%str]
+ mova m0, [tmpq+23*%%str]
+ mova m5, [tmpq+11*%%str]
+ mova m4, [tmpq+15*%%str]
+ mova m1, [tmpq+ 3*%%str]
+ mova m7, [tmpq+ 7*%%str]
+ mova m3, [tmpq+31*%%str]
+ TRANSPOSE8x8W 6, 0, 5, 4, 1, 7, 2, 3, [tmpq+27*%%str], [tmpq+19*%%str], 1
+ mova [tmpq+ 3*%%str], m6
+ mova [tmpq+ 7*%%str], m0
+ mova [tmpq+11*%%str], m5
+ mova [tmpq+15*%%str], m4
+ mova [tmpq+23*%%str], m7
+ mova [tmpq+27*%%str], m2
+ mova [tmpq+31*%%str], m3
+
+ mova m1, [tmpq+ 6*%%str] ; t14
+ mova m0, [tmpq+ 2*%%str] ; t15
+ mova m7, [tmpq+21*%%str] ; t16
+ mova m6, [tmpq+29*%%str] ; t17
+ SUMSUB_BA w, 7, 0, 2
+ SUMSUB_BA w, 6, 1, 2
+ mova [tmpq+29*%%str], m7
+ mova [tmpq+ 2*%%str], m0
+ mova [tmpq+21*%%str], m6
+ mova [tmpq+ 6*%%str], m1
+
+ mova m1, [tmpq+14*%%str] ; t12
+ mova m0, [tmpq+10*%%str] ; t13
+ mova m5, [tmpq+ 9*%%str] ; t18
+ mova m4, [tmpq+13*%%str] ; t19
+ SUMSUB_BA w, 5, 0, 2
+ SUMSUB_BA w, 4, 1, 2
+ mova [tmpq+10*%%str], m0
+ mova [tmpq+14*%%str], m1
+
+ mova m1, [tmpq+22*%%str] ; t10
+ mova m0, [tmpq+18*%%str] ; t11
+ mova m3, [tmpq+17*%%str] ; t20
+ mova m2, [tmpq+ 1*%%str] ; t21
+ SUMSUB_BA w, 3, 0, 6
+ SUMSUB_BA w, 2, 1, 6
+ mova [tmpq+18*%%str], m0
+ mova [tmpq+22*%%str], m1
+
+ mova m7, [tmpq+30*%%str] ; t8
+ mova m6, [tmpq+26*%%str] ; t9
+ mova m1, [tmpq+25*%%str] ; t22
+ mova m0, [tmpq+ 5*%%str] ; t23
+ SUMSUB_BADC w, 1, 6, 0, 7
+ mova [tmpq+26*%%str], m6
+ mova [tmpq+30*%%str], m7
+
+ mova m7, [tmpq+29*%%str]
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+21*%%str], [tmpq+17*%%str], 1
+ mova [tmpq+ 1*%%str], m0
+ mova [tmpq+ 5*%%str], m1
+ mova [tmpq+ 9*%%str], m2
+ mova [tmpq+13*%%str], m3
+ mova [tmpq+21*%%str], m5
+ mova [tmpq+25*%%str], m6
+ mova [tmpq+29*%%str], m7
+
+ mova m0, [tmpq+ 2*%%str]
+ mova m1, [tmpq+ 6*%%str]
+ mova m2, [tmpq+10*%%str]
+ mova m3, [tmpq+14*%%str]
+ mova m4, [tmpq+18*%%str]
+ mova m5, [tmpq+22*%%str]
+ mova m7, [tmpq+30*%%str]
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+26*%%str], [tmpq+18*%%str], 1
+ mova [tmpq+ 2*%%str], m0
+ mova [tmpq+ 6*%%str], m1
+ mova [tmpq+10*%%str], m2
+ mova [tmpq+14*%%str], m3
+ mova [tmpq+22*%%str], m5
+ mova [tmpq+26*%%str], m6
+ mova [tmpq+30*%%str], m7
+%endif
+%else
+ ; t0-7 are in [tmpq+{0,4,8,12,16,20,24,28}*%%str]
+ ; t8-15 are in [tmpq+{2,6,10,14,18,22,26,30}*%%str]
+ ; t16-19 and t23 are in [tmpq+{1,5,9,13,29}*%%str]
+ ; t20-22 are in m4-6
+ ; t24-31 are in m8-15
+
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_512]
+%else
+%define ROUND_REG [pw_32]
+%endif
+
+%macro %%STORE_2X2 7-8 1 ; src[1-4], tmp[1-2], zero, inc_dst_ptrs
+ SUMSUB_BA w, %4, %1, %5
+ SUMSUB_BA w, %3, %2, %5
+ VP9_IDCT8_WRITEx2 %4, %3, %5, %6, %7, ROUND_REG, 6
+%if %8 == 1
+ add dstq, stride2q
+%endif
+ VP9_IDCT8_WRITEx2 %2, %1, %5, %6, %7, ROUND_REG, 6, dst_endq
+%if %8 == 1
+ sub dst_endq, stride2q
+%endif
+%endmacro
+
+%if ARCH_X86_64
+ pxor m10, m10
+
+ ; store t0-1 and t30-31
+ mova m8, [tmpq+ 0*%%str]
+ mova m9, [tmpq+ 4*%%str]
+ %%STORE_2X2 8, 9, 0, 6, 12, 11, 10
+
+ ; store t2-3 and t28-29
+ mova m8, [tmpq+ 8*%%str]
+ mova m9, [tmpq+12*%%str]
+ %%STORE_2X2 8, 9, 14, 15, 12, 11, 10
+
+ ; store t4-5 and t26-27
+ mova m8, [tmpq+16*%%str]
+ mova m9, [tmpq+20*%%str]
+ %%STORE_2X2 8, 9, 7, 1, 12, 11, 10
+
+ ; store t6-7 and t24-25
+ mova m8, [tmpq+24*%%str]
+ mova m9, [tmpq+28*%%str]
+ %%STORE_2X2 8, 9, 4, 5, 12, 11, 10
+
+ ; store t8-9 and t22-23
+ mova m8, [tmpq+30*%%str]
+ mova m9, [tmpq+26*%%str]
+ mova m0, [tmpq+ 5*%%str]
+ %%STORE_2X2 8, 9, 13, 0, 12, 11, 10
+
+ ; store t10-11 and t20-21
+ mova m8, [tmpq+22*%%str]
+ mova m9, [tmpq+18*%%str]
+ %%STORE_2X2 8, 9, 2, 3, 12, 11, 10
+
+ ; store t12-13 and t18-19
+ mova m8, [tmpq+14*%%str]
+ mova m9, [tmpq+10*%%str]
+ mova m5, [tmpq+13*%%str]
+ mova m4, [tmpq+ 9*%%str]
+ %%STORE_2X2 8, 9, 4, 5, 12, 11, 10
+
+ ; store t14-17
+ mova m8, [tmpq+ 6*%%str]
+ mova m9, [tmpq+ 2*%%str]
+ mova m5, [tmpq+29*%%str]
+ mova m4, [tmpq+21*%%str]
+ %%STORE_2X2 8, 9, 4, 5, 12, 11, 10, 0
+
+ SWAP 1, 10 ; zero
+%else
+ mova [tmpq+ 1*%%str], m1
+ mova [tmpq+11*%%str], m2
+ mova [tmpq+15*%%str], m3
+ mova [tmpq+17*%%str], m4
+ mova [tmpq+19*%%str], m5
+ pxor m1, m1
+
+ ; store t0-1 and t30-31
+ mova m2, [tmpq+ 0*%%str]
+ mova m3, [tmpq+ 4*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+ ; store t2-3 and t28-29
+ mova m2, [tmpq+ 8*%%str]
+ mova m3, [tmpq+12*%%str]
+ mova m0, [tmpq+ 3*%%str]
+ mova m6, [tmpq+ 7*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+ ; store t4-5 and t26-27
+ mova m2, [tmpq+16*%%str]
+ mova m3, [tmpq+20*%%str]
+ mova m0, [tmpq+ 1*%%str]
+ %%STORE_2X2 2, 3, 7, 0, 4, 5, 1
+
+ ; store t6-7 and t24-25
+ mova m2, [tmpq+24*%%str]
+ mova m3, [tmpq+28*%%str]
+ mova m0, [tmpq+17*%%str]
+ mova m6, [tmpq+19*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+ ; store t8-9 and t22-23
+ mova m2, [tmpq+30*%%str]
+ mova m3, [tmpq+26*%%str]
+ mova m0, [tmpq+25*%%str]
+ mova m6, [tmpq+ 5*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+ ; store t10-11 and t20-21
+ mova m2, [tmpq+22*%%str]
+ mova m3, [tmpq+18*%%str]
+ mova m0, [tmpq+11*%%str]
+ mova m6, [tmpq+15*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+ ; store t12-13 and t18-19
+ mova m2, [tmpq+14*%%str]
+ mova m3, [tmpq+10*%%str]
+ mova m6, [tmpq+13*%%str]
+ mova m0, [tmpq+ 9*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+ ; store t14-17
+ mova m2, [tmpq+ 6*%%str]
+ mova m3, [tmpq+ 2*%%str]
+ mova m6, [tmpq+29*%%str]
+ mova m0, [tmpq+21*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1, 0
+%endif
+%undef ROUND_REG
+%endif
+%endmacro
+
+%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
+INIT_XMM %1
+cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride, block, eob
+ movifnidn eobd, dword eobm
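+ ; pick a reduced transform based on eob: per the 32x32 scan order, an eob
+ ; of at most 34 should keep all nonzero coefficients within the top-left
+ ; 8x8 sub-block and an eob of at most 135 within the top-left 16x16 (these
+ ; shortcuts are only taken on ssse3 and higher), so the 1-D passes only
+ ; need to read that many input rows (see the nnzc argument passed to
+ ; VP9_IDCT32_1D below); eob == 1 is handled as dc-only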
+%if cpuflag(ssse3)
+ cmp eobd, 135
+ jg .idctfull
+ cmp eobd, 34
+ jg .idct16x16
+ cmp eobd, 1
+ jg .idct8x8
+%else
+ cmp eobd, 1
+ jg .idctfull
+%endif
+
+ ; dc-only case
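+ ; the dc coefficient is passed through the 1-D transform twice, i.e. two
+ ; rounded Q14 multiplies by 11585 (~16384*cos(pi/4), the VP9 cospi_16
+ ; constant), then rounded and shifted down by 6 for the 32x32 output
+ ; scaling; the ssse3 path does this with pmulhrsw (pw_11585x2 twice, then
+ ; pw_512 at the end), while the plain integer path spells the same
+ ; arithmetic out with imul/add/sar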
+ movifnidn blockq, blockmp
+ movifnidn dstq, dstmp
+ movifnidn strideq, stridemp
+%if cpuflag(ssse3)
+ movd m0, [blockq]
+ mova m1, [pw_11585x2]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+%else
+ DEFINE_ARGS dst, stride, block, coef
+ movsx coefd, word [blockq]
+ imul coefd, 11585
+ add coefd, 8192
+ sar coefd, 14
+ imul coefd, 11585
+ add coefd, (32 << 14) + 8192
+ sar coefd, 14 + 6
+ movd m0, coefd
+%endif
+ SPLATW m0, m0, q0000
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_512]
+%endif
+ pxor m5, m5
+ movd [blockq], m5
+%rep 31
+ VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize
+ add dstq, strideq
+%endrep
+ VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize
+ RET
+
+%if ARCH_X86_64
+ DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
+%else
+%define dst_bakq r0mp
+%endif
+%if cpuflag(ssse3)
+.idct8x8:
+%if ARCH_X86_32
+ DEFINE_ARGS block, u1, u2, u3, u4, tmp
+ mov blockq, r2mp
+%endif
+ mov tmpq, rsp
+ VP9_IDCT32_1D blockq, 1, 8
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+ mov strideq, r1mp
+%define cntd dword r3m
+%endif
+ mov stride30q, strideq ; stride
+ lea stride2q, [strideq*2] ; stride*2
+ shl stride30q, 5 ; stride*32
+ mov cntd, 4
+ sub stride30q, stride2q ; stride*30
+.loop2_8x8:
+ mov dstq, dst_bakq
+ lea dst_endq, [dstq+stride30q]
+ VP9_IDCT32_1D tmpq, 2, 8
+ add dst_bakq, 8
+ add tmpq, 16
+ dec cntd
+ jg .loop2_8x8
+
+ ; at the end of the loop, m1 should still be zero
+ ; use that to zero out block coefficients
+%if ARCH_X86_32
+ DEFINE_ARGS block
+ mov blockq, r2mp
+%endif
+ ZERO_BLOCK blockq, 64, 8, m1
+ RET
+
+.idct16x16:
+%if ARCH_X86_32
+ DEFINE_ARGS block, tmp, cnt
+ mov blockq, r2mp
+%endif
+ mov cntd, 2
+ mov tmpq, rsp
+.loop1_16x16:
+ VP9_IDCT32_1D blockq, 1, 16
+ add blockq, 16
+ add tmpq, 512
+ dec cntd
+ jg .loop1_16x16
+
+%if ARCH_X86_64
+ sub blockq, 32
+%else
+ DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+ mov strideq, r1mp
+%define cntd dword r3m
+%endif
+
+ mov stride30q, strideq ; stride
+ lea stride2q, [strideq*2] ; stride*2
+ shl stride30q, 5 ; stride*32
+ mov cntd, 4
+ mov tmpq, rsp
+ sub stride30q, stride2q ; stride*30
+.loop2_16x16:
+ mov dstq, dst_bakq
+ lea dst_endq, [dstq+stride30q]
+ VP9_IDCT32_1D tmpq, 2, 16
+ add dst_bakq, 8
+ add tmpq, 16
+ dec cntd
+ jg .loop2_16x16
+
+ ; at the end of the loop, m1 should still be zero
+ ; use that to zero out block coefficients
+%if ARCH_X86_32
+ DEFINE_ARGS block
+ mov blockq, r2mp
+%endif
+ ZERO_BLOCK blockq, 64, 16, m1
+ RET
+%endif
+
+.idctfull:
+%if ARCH_X86_32
+ DEFINE_ARGS block, tmp, cnt
+ mov blockq, r2mp
+%endif
+ mov cntd, 4
+ mov tmpq, rsp
+.loop1_full:
+ VP9_IDCT32_1D blockq, 1
+ add blockq, 16
+ add tmpq, 512
+ dec cntd
+ jg .loop1_full
+
+%if ARCH_X86_64
+ sub blockq, 64
+%else
+ DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+ mov strideq, r1mp
+%define cntd dword r3m
+%endif
+
+ mov stride30q, strideq ; stride
+ lea stride2q, [strideq*2] ; stride*2
+ shl stride30q, 5 ; stride*32
+ mov cntd, 4
+ mov tmpq, rsp
+ sub stride30q, stride2q ; stride*30
+.loop2_full:
+ mov dstq, dst_bakq
+ lea dst_endq, [dstq+stride30q]
+ VP9_IDCT32_1D tmpq, 2
+ add dst_bakq, 8
+ add tmpq, 16
+ dec cntd
+ jg .loop2_full
+
+ ; at the end of the loop, m1 should still be zero
+ ; use that to zero out block coefficients
+%if ARCH_X86_32
+ DEFINE_ARGS block
+ mov blockq, r2mp
+%endif
+ ZERO_BLOCK blockq, 64, 32, m1
+ RET
+%endmacro
+
+VP9_IDCT_IDCT_32x32_ADD_XMM sse2
+VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
+VP9_IDCT_IDCT_32x32_ADD_XMM avx
+
+; this is almost identical to VP9_STORE_2X, but it does two rows
+; for slightly improved interleaving, and it omits vpermq since the
+; input is DC so all values are identical
+%macro VP9_STORE_YMM_DC_2X2 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero
+ mova m%2, [dstq]
+ mova m%4, [dstq+strideq]
+ punpckhbw m%3, m%2, m%6
+ punpcklbw m%2, m%6
+ punpckhbw m%5, m%4, m%6
+ punpcklbw m%4, m%6
+ paddw m%3, m%1
+ paddw m%2, m%1
+ paddw m%5, m%1
+ paddw m%4, m%1
+ packuswb m%2, m%3
+ packuswb m%4, m%5
+ mova [dstq+strideq*0], m%2
+ mova [dstq+strideq*1], m%4
+%endmacro
+
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_idct_idct_32x32_add, 4, 9, 16, 2048, dst, stride, block, eob
+ cmp eobd, 135
+ jg .idctfull
+ cmp eobd, 1
+ jg .idct16x16
+
+ ; dc-only case
+ mova m1, [pw_11585x2]
+ vpbroadcastw m0, [blockq]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+ pxor m5, m5
+ pmulhrsw m0, [pw_512]
+ movd [blockq], xm5
+
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 16
+.loop_dc:
+ VP9_STORE_YMM_DC_2X2 0, 1, 2, 3, 4, 5
+ lea dstq, [dstq+2*strideq]
+ dec cntd
+ jg .loop_dc
+ RET
+
+ DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
+.idct16x16:
+ mov tmpq, rsp
+ VP9_IDCT32_1D blockq, 1, 16
+
+ mov stride30q, strideq ; stride
+ lea stride2q, [strideq*2] ; stride*2
+ shl stride30q, 5 ; stride*32
+ mov cntd, 2
+ sub stride30q, stride2q ; stride*30
+.loop2_16x16:
+ mov dstq, dst_bakq
+ lea dst_endq, [dstq+stride30q]
+ VP9_IDCT32_1D tmpq, 2, 16
+ add dst_bakq, 16
+ add tmpq, 32
+ dec cntd
+ jg .loop2_16x16
+
+ ; at the end of the loop, m1 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 64, 16, m1
+ RET
+
+.idctfull:
+ mov cntd, 2
+ mov tmpq, rsp
+.loop1_full:
+ VP9_IDCT32_1D blockq, 1
+ add blockq, 32
+ add tmpq, 1024
+ dec cntd
+ jg .loop1_full
+
+ sub blockq, 64
+
+ mov stride30q, strideq ; stride
+ lea stride2q, [strideq*2] ; stride*2
+ shl stride30q, 5 ; stride*32
+ mov cntd, 2
+ mov tmpq, rsp
+ sub stride30q, stride2q ; stride*30
+.loop2_full:
+ mov dstq, dst_bakq
+ lea dst_endq, [dstq+stride30q]
+ VP9_IDCT32_1D tmpq, 2
+ add dst_bakq, 16
+ add tmpq, 32
+ dec cntd
+ jg .loop2_full
+
+ ; at the end of the loop, m1 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 64, 32, m1
+ RET
+%endif
diff --git a/libavcodec/x86/vp9itxfm_16bpp.asm b/libavcodec/x86/vp9itxfm_16bpp.asm
new file mode 100644
index 0000000000..902685edf6
--- /dev/null
+++ b/libavcodec/x86/vp9itxfm_16bpp.asm
@@ -0,0 +1,2044 @@
+;******************************************************************************
+;* VP9 inverse transform x86 SIMD optimizations
+;*
+;* Copyright (C) 2015 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+%include "vp9itxfm_template.asm"
+
+SECTION_RODATA
+
+cextern pw_8
+cextern pw_1023
+cextern pw_2048
+cextern pw_4095
+cextern pw_m1
+cextern pd_1
+cextern pd_16
+cextern pd_32
+cextern pd_8192
+
+pd_8: times 4 dd 8
+pd_3fff: times 4 dd 0x3fff
+
+cextern pw_11585x2
+
+cextern pw_5283_13377
+cextern pw_9929_13377
+cextern pw_15212_m13377
+cextern pw_15212_9929
+cextern pw_m5283_m15212
+cextern pw_13377x2
+cextern pw_m13377_13377
+cextern pw_13377_0
+
+pw_9929_m5283: times 4 dw 9929, -5283
+
+%macro COEF_PAIR 2-3
+cextern pw_m%1_%2
+cextern pw_%2_%1
+%if %0 == 3
+cextern pw_m%1_m%2
+%if %1 != %2
+cextern pw_m%2_%1
+cextern pw_%1_%2
+%endif
+%endif
+%endmacro
+
+COEF_PAIR 2404, 16207
+COEF_PAIR 3196, 16069, 1
+COEF_PAIR 4756, 15679
+COEF_PAIR 5520, 15426
+COEF_PAIR 6270, 15137, 1
+COEF_PAIR 8423, 14053
+COEF_PAIR 10394, 12665
+COEF_PAIR 11003, 12140
+COEF_PAIR 11585, 11585, 1
+COEF_PAIR 13160, 9760
+COEF_PAIR 13623, 9102, 1
+COEF_PAIR 14449, 7723
+COEF_PAIR 14811, 7005
+COEF_PAIR 15893, 3981
+COEF_PAIR 16305, 1606
+COEF_PAIR 16364, 804
+
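+; eob -> column-group count tables, one byte per possible eob value (64, 256
+; and 1024 entries for 8x8, 16x16 and 32x32 respectively). The function
+; bodies below index these with eob-1 to decide how many groups of 4 input
+; columns can contain nonzero coefficients (and thus need a first-pass
+; transform); the remaining groups are simply zero-padded. The row/col/default
+; variants appear to match the scan order of the corresponding transform-type
+; combinations (see the *_FN invocations below).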
+default_8x8:
+times 12 db 1
+times 52 db 2
+row_8x8:
+times 18 db 1
+times 46 db 2
+col_8x8:
+times 6 db 1
+times 58 db 2
+default_16x16:
+times 10 db 1
+times 28 db 2
+times 51 db 3
+times 167 db 4
+row_16x16:
+times 21 db 1
+times 45 db 2
+times 60 db 3
+times 130 db 4
+col_16x16:
+times 5 db 1
+times 12 db 2
+times 25 db 3
+times 214 db 4
+default_32x32:
+times 9 db 1
+times 25 db 2
+times 36 db 3
+times 65 db 4
+times 105 db 5
+times 96 db 6
+times 112 db 7
+times 576 db 8
+
+SECTION .text
+
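+; add two rows of residuals (in reg1/reg2) to the pixels at dst/dst+stride
+; and clamp the result to the [min, max] range of the current bit depth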
+%macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst
+ mova m%3, [%7]
+ mova m%4, [%7+strideq]
+ paddw m%3, m%1
+ paddw m%4, m%2
+ pmaxsw m%3, m%5
+ pmaxsw m%4, m%5
+ pminsw m%3, m%6
+ pminsw m%4, m%6
+ mova [%7], m%3
+ mova [%7+strideq], m%4
+%endmacro
+
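+; clear an nnzcpl x nnzcpl block of dword coefficients stored with the given
+; row stride (in bytes), using the caller-provided zero register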
+%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
+%assign %%y 0
+%rep %3
+%assign %%x 0
+%rep %3*4/mmsize
+ mova [%1+%%y+%%x], %4
+%assign %%x (%%x+mmsize)
+%endrep
+%assign %%y (%%y+%2)
+%endrep
+%endmacro
+
+; the input coefficients are scaled up by 2 bits (which we downscale again
+; immediately in the iwht), and otherwise grow by 1 bit per (orthonormal)
+; iwht_1d pass. Therefore, a diff of 10-12+sign bits will fit in 12-14+sign
+; bits after scaling, i.e. everything can be done in 15+sign-bit words. Since
+; the quant fractional bits add 2 bits, in 12bpp we need to scale before
+; converting to words (the input is 16+sign bits, which doesn't fit in 15+sign
+; words), but in 10bpp we can scale after converting to words (which is half
+; the instructions), since the input is only 14+sign bits, which fits in
+; 15+sign words directly.
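+; e.g. for 12bpp: a 12+sign-bit diff plus 2 bits of coefficient scaling plus
+; 2 quant fractional bits gives 16+sign bits, hence the psrad before the
+; packssdw below; for 10bpp the same sum is only 14+sign bits, so we can
+; packssdw first and psraw by 2 afterwards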
+
+%macro IWHT4_FN 2 ; bpp, max
+cglobal vp9_iwht_iwht_4x4_add_%1, 3, 3, 8, dst, stride, block, eob
+ mova m7, [pw_%2]
+ mova m0, [blockq+0*16+0]
+ mova m1, [blockq+1*16+0]
+%if %1 >= 12
+ mova m4, [blockq+0*16+8]
+ mova m5, [blockq+1*16+8]
+ psrad m0, 2
+ psrad m1, 2
+ psrad m4, 2
+ psrad m5, 2
+ packssdw m0, m4
+ packssdw m1, m5
+%else
+ packssdw m0, [blockq+0*16+8]
+ packssdw m1, [blockq+1*16+8]
+ psraw m0, 2
+ psraw m1, 2
+%endif
+ mova m2, [blockq+2*16+0]
+ mova m3, [blockq+3*16+0]
+%if %1 >= 12
+ mova m4, [blockq+2*16+8]
+ mova m5, [blockq+3*16+8]
+ psrad m2, 2
+ psrad m3, 2
+ psrad m4, 2
+ psrad m5, 2
+ packssdw m2, m4
+ packssdw m3, m5
+%else
+ packssdw m2, [blockq+2*16+8]
+ packssdw m3, [blockq+3*16+8]
+ psraw m2, 2
+ psraw m3, 2
+%endif
+
+ VP9_IWHT4_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_IWHT4_1D
+
+ pxor m6, m6
+ VP9_STORE_2X 0, 1, 4, 5, 6, 7
+ lea dstq, [dstq+strideq*2]
+ VP9_STORE_2X 2, 3, 4, 5, 6, 7
+ ZERO_BLOCK blockq, 16, 4, m6
+ RET
+%endmacro
+
+INIT_MMX mmxext
+IWHT4_FN 10, 1023
+INIT_MMX mmxext
+IWHT4_FN 12, 4095
+
+%macro VP9_IDCT4_WRITEOUT 0
+%if cpuflag(ssse3)
+ mova m5, [pw_2048]
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+%else
+ mova m5, [pw_8]
+ paddw m0, m5
+ paddw m1, m5
+ paddw m2, m5
+ paddw m3, m5
+ psraw m0, 4
+ psraw m1, 4
+ psraw m2, 4
+ psraw m3, 4
+%endif
+ mova m5, [pw_1023]
+ VP9_STORE_2X 0, 1, 6, 7, 4, 5
+ lea dstq, [dstq+2*strideq]
+ VP9_STORE_2X 2, 3, 6, 7, 4, 5
+%endmacro
+
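+; compute the dc-only output value: two rounded Q14 multiplies of the dc
+; coefficient by 11585 (one per 1-D pass), followed by a round and shift by
+; %1; also clears the dc coefficient in the block (replacing it with %2)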
+%macro DC_ONLY 2 ; shift, zero
+ mov coefd, dword [blockq]
+ movd [blockq], %2
+ imul coefd, 11585
+ add coefd, 8192
+ sar coefd, 14
+ imul coefd, 11585
+ add coefd, ((1 << (%1 - 1)) << 14) + 8192
+ sar coefd, 14 + %1
+%endmacro
+
+; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits
+; in 15+1 words without additional effort, since the coefficients are 15bpp.
+
+%macro IDCT4_10_FN 0
+cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob
+ cmp eobd, 1
+ jg .idctfull
+
+ ; dc-only
+ pxor m4, m4
+%if cpuflag(ssse3)
+ movd m0, [blockq]
+ movd [blockq], m4
+ mova m5, [pw_11585x2]
+ pmulhrsw m0, m5
+ pmulhrsw m0, m5
+%else
+ DEFINE_ARGS dst, stride, block, coef
+ DC_ONLY 4, m4
+ movd m0, coefd
+%endif
+ pshufw m0, m0, 0
+ mova m5, [pw_1023]
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+%endif
+ VP9_STORE_2X 0, 0, 6, 7, 4, 5
+ lea dstq, [dstq+2*strideq]
+ VP9_STORE_2X 0, 0, 6, 7, 4, 5
+ RET
+
+.idctfull:
+ mova m0, [blockq+0*16+0]
+ mova m1, [blockq+1*16+0]
+ packssdw m0, [blockq+0*16+8]
+ packssdw m1, [blockq+1*16+8]
+ mova m2, [blockq+2*16+0]
+ mova m3, [blockq+3*16+0]
+ packssdw m2, [blockq+2*16+8]
+ packssdw m3, [blockq+3*16+8]
+
+%if cpuflag(ssse3)
+ mova m6, [pw_11585x2]
+%endif
+ mova m7, [pd_8192] ; rounding
+ VP9_IDCT4_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_IDCT4_1D
+
+ pxor m4, m4
+ ZERO_BLOCK blockq, 16, 4, m4
+ VP9_IDCT4_WRITEOUT
+ RET
+%endmacro
+
+INIT_MMX mmxext
+IDCT4_10_FN
+INIT_MMX ssse3
+IDCT4_10_FN
+
+%macro IADST4_FN 4
+cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob
+%if WIN64 && notcpuflag(ssse3)
+ WIN64_SPILL_XMM 8
+%endif
+ movdqa xmm5, [pd_8192]
+ mova m0, [blockq+0*16+0]
+ mova m1, [blockq+1*16+0]
+ packssdw m0, [blockq+0*16+8]
+ packssdw m1, [blockq+1*16+8]
+ mova m2, [blockq+2*16+0]
+ mova m3, [blockq+3*16+0]
+ packssdw m2, [blockq+2*16+8]
+ packssdw m3, [blockq+3*16+8]
+
+%if cpuflag(ssse3)
+ mova m6, [pw_11585x2]
+%endif
+%ifnidn %1%3, iadstiadst
+ movdq2q m7, xmm5
+%endif
+ VP9_%2_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_%4_1D
+
+ pxor m4, m4
+ ZERO_BLOCK blockq, 16, 4, m4
+ VP9_IDCT4_WRITEOUT
+ RET
+%endmacro
+
+INIT_MMX sse2
+IADST4_FN idct, IDCT4, iadst, IADST4
+IADST4_FN iadst, IADST4, idct, IDCT4
+IADST4_FN iadst, IADST4, iadst, IADST4
+
+INIT_MMX ssse3
+IADST4_FN idct, IDCT4, iadst, IADST4
+IADST4_FN iadst, IADST4, idct, IDCT4
+IADST4_FN iadst, IADST4, iadst, IADST4
+
+; inputs and outputs are dwords, coefficients are words
+;
+; dst1 = src1 * coef1 + src2 * coef2 + rnd >> 14
+; dst2 = src1 * coef2 - src2 * coef1 + rnd >> 14
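+; to avoid a full 32-bit multiply, each dword source is split into its low
+; 14 bits (masked) and the remaining high bits (psrad by 14), both packed to
+; words and multiplied with pmaddwd against the coefficient pairs; the high
+; bits contribute an exact multiple of 2^14 to the full product, so only the
+; low-part product needs the +rnd and >>14 before the two halves are summed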
+%macro SUMSUB_MUL 6-8 [pd_8192], [pd_3fff] ; src/dst 1-2, tmp1-2, coef1-2, rnd, mask
+ pand m%3, m%1, %8
+ pand m%4, m%2, %8
+ psrad m%1, 14
+ psrad m%2, 14
+ packssdw m%4, m%2
+ packssdw m%3, m%1
+ punpckhwd m%2, m%4, m%3
+ punpcklwd m%4, m%3
+ pmaddwd m%3, m%4, [pw_%6_%5]
+ pmaddwd m%1, m%2, [pw_%6_%5]
+ pmaddwd m%4, [pw_m%5_%6]
+ pmaddwd m%2, [pw_m%5_%6]
+ paddd m%3, %7
+ paddd m%4, %7
+ psrad m%3, 14
+ psrad m%4, 14
+ paddd m%1, m%3
+ paddd m%2, m%4
+%endmacro
+
+%macro IDCT4_12BPP_1D 0-8 [pd_8192], [pd_3fff], 0, 1, 2, 3, 4, 5 ; rnd, mask, in/out0-3, tmp0-1
+ SUMSUB_MUL %3, %5, %7, %8, 11585, 11585, %1, %2
+ SUMSUB_MUL %4, %6, %7, %8, 15137, 6270, %1, %2
+ SUMSUB_BA d, %4, %3, %7
+ SUMSUB_BA d, %6, %5, %7
+ SWAP %4, %6, %3
+%endmacro
+
+%macro STORE_4x4 6 ; tmp1-2, reg1-2, min, max
+ movh m%1, [dstq+strideq*0]
+ movh m%2, [dstq+strideq*2]
+ movhps m%1, [dstq+strideq*1]
+ movhps m%2, [dstq+stride3q ]
+ paddw m%1, m%3
+ paddw m%2, m%4
+ pmaxsw m%1, %5
+ pmaxsw m%2, %5
+ pminsw m%1, %6
+ pminsw m%2, %6
+ movh [dstq+strideq*0], m%1
+ movhps [dstq+strideq*1], m%1
+ movh [dstq+strideq*2], m%2
+ movhps [dstq+stride3q ], m%2
+%endmacro
+
+%macro ROUND_AND_STORE_4x4 8 ; reg1-4, min, max, rnd, shift
+ paddd m%1, %7
+ paddd m%2, %7
+ paddd m%3, %7
+ paddd m%4, %7
+ psrad m%1, %8
+ psrad m%2, %8
+ psrad m%3, %8
+ psrad m%4, %8
+ packssdw m%1, m%2
+ packssdw m%3, m%4
+ STORE_4x4 %2, %4, %1, %3, %5, %6
+%endmacro
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_4x4_add_12, 4, 4, 8, dst, stride, block, eob
+ cmp eobd, 1
+ jg .idctfull
+
+ ; dc-only - this is special, since for 4x4 12bpp, the max coef size is
+ ; 17+sign bits. Since the multiply is with 11585, which is a 14-bit constant, the
+ ; result of each multiply is 31+sign bit, i.e. it _exactly_ fits in a
+ ; dword. After the final shift (4), the result is 13+sign bits, so we
+ ; don't need any additional processing to fit it in a word
+ DEFINE_ARGS dst, stride, block, coef
+ pxor m4, m4
+ DC_ONLY 4, m4
+ movd m0, coefd
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ mova m5, [pw_4095]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ STORE_4x4 1, 3, 0, 0, m4, m5
+ RET
+
+.idctfull:
+ DEFINE_ARGS dst, stride, block, eob
+ mova m0, [blockq+0*16]
+ mova m1, [blockq+1*16]
+ mova m2, [blockq+2*16]
+ mova m3, [blockq+3*16]
+ mova m6, [pd_8192]
+ mova m7, [pd_3fff]
+
+ IDCT4_12BPP_1D m6, m7
+ TRANSPOSE4x4D 0, 1, 2, 3, 4
+ IDCT4_12BPP_1D m6, m7
+
+ pxor m4, m4
+ ZERO_BLOCK blockq, 16, 4, m4
+
+ ; writeout
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ mova m5, [pw_4095]
+ mova m6, [pd_8]
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4
+ RET
+
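+; SCRATCH/UNSCRATCH spill/reload a register to/from one of the high xmm
+; registers on x86-64 (via SWAP) or to/from the given stack slot on x86-32;
+; PRELOAD similarly caches a memory operand in a high register on x86-64,
+; and the optional name argument defines reg_<name> so callers can refer to
+; either form (register or memory) transparently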
+%macro SCRATCH 3-4
+%if ARCH_X86_64
+ SWAP %1, %2
+%if %0 == 4
+%define reg_%4 m%2
+%endif
+%else
+ mova [%3], m%1
+%if %0 == 4
+%define reg_%4 [%3]
+%endif
+%endif
+%endmacro
+
+%macro UNSCRATCH 3-4
+%if ARCH_X86_64
+ SWAP %1, %2
+%else
+ mova m%1, [%3]
+%endif
+%if %0 == 4
+%undef reg_%4
+%endif
+%endmacro
+
+%macro PRELOAD 2-3
+%if ARCH_X86_64
+ mova m%1, [%2]
+%if %0 == 3
+%define reg_%3 m%1
+%endif
+%elif %0 == 3
+%define reg_%3 [%2]
+%endif
+%endmacro
+
+; out0 = 5283 * in0 + 13377 * in1 + 15212 * in2 + 9929 * in3 + rnd >> 14
+; out1 = 9929 * in0 + 13377 * in1 - 5283 * in2 - 15212 * in3 + rnd >> 14
+; out2 = 13377 * in0 - 13377 * in2 + 13377 * in3 + rnd >> 14
+; out3 = 15212 * in0 - 13377 * in1 + 9929 * in2 - 5283 * in3 + rnd >> 14
+%macro IADST4_12BPP_1D 0-2 [pd_8192], [pd_3fff] ; rnd, mask
+ pand m4, m0, %2
+ pand m5, m1, %2
+ psrad m0, 14
+ psrad m1, 14
+ packssdw m5, m1
+ packssdw m4, m0
+ punpckhwd m1, m4, m5
+ punpcklwd m4, m5
+ pand m5, m2, %2
+ pand m6, m3, %2
+ psrad m2, 14
+ psrad m3, 14
+ packssdw m6, m3
+ packssdw m5, m2
+ punpckhwd m3, m5, m6
+ punpcklwd m5, m6
+ SCRATCH 1, 8, rsp+0*mmsize, a
+ SCRATCH 5, 9, rsp+1*mmsize, b
+
+ ; m1/3 have the high bits of 0,1,2,3
+ ; m4/5 have the low bits of 0,1,2,3
+ ; m0/2/6/7 are free
+
+ mova m2, [pw_15212_9929]
+ mova m0, [pw_5283_13377]
+ pmaddwd m7, m2, reg_b
+ pmaddwd m6, m4, m0
+ pmaddwd m2, m3
+ pmaddwd m0, reg_a
+ paddd m6, m7
+ paddd m0, m2
+ mova m1, [pw_m13377_13377]
+ mova m5, [pw_13377_0]
+ pmaddwd m7, m1, reg_b
+ pmaddwd m2, m4, m5
+ pmaddwd m1, m3
+ pmaddwd m5, reg_a
+ paddd m2, m7
+ paddd m1, m5
+ paddd m6, %1
+ paddd m2, %1
+ psrad m6, 14
+ psrad m2, 14
+ paddd m0, m6 ; t0
+ paddd m2, m1 ; t2
+
+ mova m7, [pw_m5283_m15212]
+ mova m5, [pw_9929_13377]
+ pmaddwd m1, m7, reg_b
+ pmaddwd m6, m4, m5
+ pmaddwd m7, m3
+ pmaddwd m5, reg_a
+ paddd m6, m1
+ paddd m7, m5
+ UNSCRATCH 5, 9, rsp+1*mmsize, b
+ pmaddwd m5, [pw_9929_m5283]
+ pmaddwd m4, [pw_15212_m13377]
+ pmaddwd m3, [pw_9929_m5283]
+ UNSCRATCH 1, 8, rsp+0*mmsize, a
+ pmaddwd m1, [pw_15212_m13377]
+ paddd m4, m5
+ paddd m3, m1
+ paddd m6, %1
+ paddd m4, %1
+ psrad m6, 14
+ psrad m4, 14
+ paddd m7, m6 ; t1
+ paddd m3, m4 ; t3
+
+ SWAP 1, 7
+%endmacro
+
+%macro IADST4_12BPP_FN 4
+cglobal vp9_%1_%3_4x4_add_12, 3, 3, 12, 2 * ARCH_X86_32 * mmsize, dst, stride, block, eob
+ mova m0, [blockq+0*16]
+ mova m1, [blockq+1*16]
+ mova m2, [blockq+2*16]
+ mova m3, [blockq+3*16]
+
+ PRELOAD 10, pd_8192, rnd
+ PRELOAD 11, pd_3fff, mask
+ %2_12BPP_1D reg_rnd, reg_mask
+ TRANSPOSE4x4D 0, 1, 2, 3, 4
+ %4_12BPP_1D reg_rnd, reg_mask
+
+ pxor m4, m4
+ ZERO_BLOCK blockq, 16, 4, m4
+
+ ; writeout
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ mova m5, [pw_4095]
+ mova m6, [pd_8]
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4
+ RET
+%endmacro
+
+INIT_XMM sse2
+IADST4_12BPP_FN idct, IDCT4, iadst, IADST4
+IADST4_12BPP_FN iadst, IADST4, idct, IDCT4
+IADST4_12BPP_FN iadst, IADST4, iadst, IADST4
+
+; the following line has not been executed at the end of this macro:
+; UNSCRATCH 6, 8, rsp+(%5+0)*mmsize
+%macro IDCT8_1D 1-5 [pd_8192], [pd_3fff], 2 * mmsize, 17 ; src, rnd, mask, src_stride, stack_offset
+ mova m0, [%1+0*%4]
+ mova m2, [%1+2*%4]
+ mova m4, [%1+4*%4]
+ mova m6, [%1+6*%4]
+ IDCT4_12BPP_1D %2, %3, 0, 2, 4, 6, 1, 3 ; m0/2/4/6 have t0/1/2/3
+ SCRATCH 4, 8, rsp+(%5+0)*mmsize
+ SCRATCH 6, 9, rsp+(%5+1)*mmsize
+ mova m1, [%1+1*%4]
+ mova m3, [%1+3*%4]
+ mova m5, [%1+5*%4]
+ mova m7, [%1+7*%4]
+ SUMSUB_MUL 1, 7, 4, 6, 16069, 3196, %2, %3 ; m1=t7a, m7=t4a
+ SUMSUB_MUL 5, 3, 4, 6, 9102, 13623, %2, %3 ; m5=t6a, m3=t5a
+ SUMSUB_BA d, 3, 7, 4 ; m3=t4, m7=t5a
+ SUMSUB_BA d, 5, 1, 4 ; m5=t7, m1=t6a
+ SUMSUB_MUL 1, 7, 4, 6, 11585, 11585, %2, %3 ; m1=t6, m7=t5
+ SUMSUB_BA d, 5, 0, 4 ; m5=out0, m0=out7
+ SUMSUB_BA d, 1, 2, 4 ; m1=out1, m2=out6
+ UNSCRATCH 4, 8, rsp+(%5+0)*mmsize
+ UNSCRATCH 6, 9, rsp+(%5+1)*mmsize
+ SCRATCH 2, 8, rsp+(%5+0)*mmsize
+ SUMSUB_BA d, 7, 4, 2 ; m7=out2, m4=out5
+ SUMSUB_BA d, 3, 6, 2 ; m3=out3, m6=out4
+ SWAP 0, 5, 4, 6, 2, 7
+%endmacro
+
+%macro STORE_2x8 5-7 dstq, strideq ; tmp1-2, reg, min, max
+ mova m%1, [%6+%7*0]
+ mova m%2, [%6+%7*1]
+ paddw m%1, m%3
+ paddw m%2, m%3
+ pmaxsw m%1, %4
+ pmaxsw m%2, %4
+ pminsw m%1, %5
+ pminsw m%2, %5
+ mova [%6+%7*0], m%1
+ mova [%6+%7*1], m%2
+%endmacro
+
+; FIXME we can use the intermediate storage (rsp[0-15]) on x86-32 for temp
+; storage also instead of allocating two more stack spaces. This doesn't
+; matter much but it's something...
+INIT_XMM sse2
+cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 14, \
+ 16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_1023]
+ cmp eobd, 1
+ jg .idctfull
+
+ ; dc-only - the 10bit version can be done entirely in 32bit, since the max
+ ; coef values are 16+sign bits, and the multiplier constant is 14 bits, so 30+sign easily
+ ; fits in 32bit
+ DEFINE_ARGS dst, stride, block, coef
+ pxor m2, m2
+ DC_ONLY 5, m2
+ movd m1, coefd
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 4
+.loop_dc:
+ STORE_2x8 3, 4, 1, m2, m0
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop_dc
+ RET
+
+.idctfull:
+ SCRATCH 0, 12, rsp+16*mmsize, max
+ DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+ mov dstbakq, dstq
+ movsxd cntq, cntd
+%endif
+%ifdef PIC
+ lea ptrq, [default_8x8]
+ movzx cntd, byte [ptrq+cntq-1]
+%else
+ movzx cntd, byte [default_8x8+cntq-1]
+%endif
+ mov skipd, 2
+ sub skipd, cntd
+ mov ptrq, rsp
+ PRELOAD 10, pd_8192, rnd
+ PRELOAD 11, pd_3fff, mask
+ PRELOAD 13, pd_16, srnd
+.loop_1:
+ IDCT8_1D blockq, reg_rnd, reg_mask
+
+ TRANSPOSE4x4D 0, 1, 2, 3, 6
+ mova [ptrq+ 0*mmsize], m0
+ mova [ptrq+ 2*mmsize], m1
+ mova [ptrq+ 4*mmsize], m2
+ mova [ptrq+ 6*mmsize], m3
+ UNSCRATCH 6, 8, rsp+17*mmsize
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 1*mmsize], m4
+ mova [ptrq+ 3*mmsize], m5
+ mova [ptrq+ 5*mmsize], m6
+ mova [ptrq+ 7*mmsize], m7
+ add ptrq, 8 * mmsize
+ add blockq, mmsize
+ dec cntd
+ jg .loop_1
+
+ ; zero-pad the remainder (skipped cols)
+ test skipd, skipd
+ jz .end
+ add skipd, skipd
+ lea blockq, [blockq+skipq*(mmsize/2)]
+ pxor m0, m0
+.loop_z:
+ mova [ptrq+mmsize*0], m0
+ mova [ptrq+mmsize*1], m0
+ mova [ptrq+mmsize*2], m0
+ mova [ptrq+mmsize*3], m0
+ add ptrq, 4 * mmsize
+ dec skipd
+ jg .loop_z
+.end:
+
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea stride3q, [strideq*3]
+ mov cntd, 2
+ mov ptrq, rsp
+.loop_2:
+ IDCT8_1D ptrq, reg_rnd, reg_mask
+
+ pxor m6, m6
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5
+ lea dstq, [dstq+strideq*4]
+ UNSCRATCH 0, 8, rsp+17*mmsize
+ UNSCRATCH 1, 12, rsp+16*mmsize, max
+ UNSCRATCH 2, 13, pd_16, srnd
+ ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5
+ add ptrq, 16
+%if ARCH_X86_64
+ lea dstq, [dstbakq+8]
+%else
+ mov dstq, dstm
+ add dstq, 8
+%endif
+ dec cntd
+ jg .loop_2
+
+ ; m6 is still zero
+ ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
+ RET
+
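+; on x86-32 the coefficient is too wide to be multiplied by 11585 in 32 bits,
+; so each of the two Q14 multiplies is split: the low 14 bits and the high
+; bits of the running value are multiplied separately, the low product is
+; rounded and shifted down by 14, and the two halves are added - giving the
+; same result as (x * 11585 + 8192) >> 14 without needing a 64-bit product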
+%macro DC_ONLY_64BIT 2 ; shift, zero
+%if ARCH_X86_64
+ movsxd coefq, dword [blockq]
+ movd [blockq], %2
+ imul coefq, 11585
+ add coefq, 8192
+ sar coefq, 14
+ imul coefq, 11585
+ add coefq, ((1 << (%1 - 1)) << 14) + 8192
+ sar coefq, 14 + %1
+%else
+ mov coefd, dword [blockq]
+ movd [blockq], %2
+ DEFINE_ARGS dst, stride, cnt, coef, coefl
+ mov cntd, 2
+.loop_dc_calc:
+ mov coefld, coefd
+ sar coefd, 14
+ and coefld, 0x3fff
+ imul coefd, 11585
+ imul coefld, 11585
+ add coefld, 8192
+ sar coefld, 14
+ add coefd, coefld
+ dec cntd
+ jg .loop_dc_calc
+ add coefd, 1 << (%1 - 1)
+ sar coefd, %1
+%endif
+%endmacro
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 14, \
+ 16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_4095]
+ cmp eobd, 1
+ jg mangle(private_prefix %+ _ %+ vp9_idct_idct_8x8_add_10 %+ SUFFIX).idctfull
+
+ ; dc-only - unfortunately, this one can overflow, since coefs are 18+sign
+ ; bits, and 18+14+sign does not fit in 32 bits, so we do 2-stage multiplies
+ DEFINE_ARGS dst, stride, block, coef, coefl
+ pxor m2, m2
+ DC_ONLY_64BIT 5, m2
+ movd m1, coefd
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 4
+.loop_dc:
+ STORE_2x8 3, 4, 1, m2, m0
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop_dc
+ RET
+
+; inputs and outputs are dwords, coefficients are words
+;
+; dst1[hi]:dst3[lo] = src1 * coef1 + src2 * coef2
+; dst2[hi]:dst4[lo] = src1 * coef2 - src2 * coef1
+%macro SUMSUB_MUL_D 6-7 [pd_3fff] ; src/dst 1-2, dst3-4, coef1-2, mask
+ pand m%3, m%1, %7
+ pand m%4, m%2, %7
+ psrad m%1, 14
+ psrad m%2, 14
+ packssdw m%4, m%2
+ packssdw m%3, m%1
+ punpckhwd m%2, m%4, m%3
+ punpcklwd m%4, m%3
+ pmaddwd m%3, m%4, [pw_%6_%5]
+ pmaddwd m%1, m%2, [pw_%6_%5]
+ pmaddwd m%4, [pw_m%5_%6]
+ pmaddwd m%2, [pw_m%5_%6]
+%endmacro
+
+; dst1 = src2[hi]:src4[lo] + src1[hi]:src3[lo] + rnd >> 14
+; dst2 = src2[hi]:src4[lo] - src1[hi]:src3[lo] + rnd >> 14
+%macro SUMSUB_PACK_D 5-6 [pd_8192] ; src/dst 1-2, src3-4, tmp, rnd
+ SUMSUB_BA d, %1, %2, %5
+ SUMSUB_BA d, %3, %4, %5
+ paddd m%3, %6
+ paddd m%4, %6
+ psrad m%3, 14
+ psrad m%4, 14
+ paddd m%1, m%3
+ paddd m%2, m%4
+%endmacro
+
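+; negate a register of packed dwords: psignd by -1 where ssse3 is available,
+; otherwise two's complement via pxor with all-ones plus paddd by 1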
+%macro NEGD 1
+%if cpuflag(ssse3)
+ psignd %1, [pw_m1]
+%else
+ pxor %1, [pw_m1]
+ paddd %1, [pd_1]
+%endif
+%endmacro
+
+; the following line has not been executed at the end of this macro:
+; UNSCRATCH 6, 8, rsp+17*mmsize
+%macro IADST8_1D 1-3 [pd_8192], [pd_3fff] ; src, rnd, mask
+ mova m0, [%1+ 0*mmsize]
+ mova m3, [%1+ 6*mmsize]
+ mova m4, [%1+ 8*mmsize]
+ mova m7, [%1+14*mmsize]
+ SUMSUB_MUL_D 7, 0, 1, 2, 16305, 1606, %3 ; m7/1=t0a, m0/2=t1a
+ SUMSUB_MUL_D 3, 4, 5, 6, 10394, 12665, %3 ; m3/5=t4a, m4/6=t5a
+ SCRATCH 0, 8, rsp+17*mmsize
+ SUMSUB_PACK_D 3, 7, 5, 1, 0, %2 ; m3=t0, m7=t4
+ UNSCRATCH 0, 8, rsp+17*mmsize
+ SUMSUB_PACK_D 4, 0, 6, 2, 1, %2 ; m4=t1, m0=t5
+
+ SCRATCH 3, 8, rsp+17*mmsize
+ SCRATCH 4, 9, rsp+18*mmsize
+ SCRATCH 7, 10, rsp+19*mmsize
+ SCRATCH 0, 11, rsp+20*mmsize
+
+ mova m1, [%1+ 2*mmsize]
+ mova m2, [%1+ 4*mmsize]
+ mova m5, [%1+10*mmsize]
+ mova m6, [%1+12*mmsize]
+ SUMSUB_MUL_D 5, 2, 3, 4, 14449, 7723, %3 ; m5/8=t2a, m2/9=t3a
+ SUMSUB_MUL_D 1, 6, 7, 0, 4756, 15679, %3 ; m1/10=t6a, m6/11=t7a
+ SCRATCH 2, 12, rsp+21*mmsize
+ SUMSUB_PACK_D 1, 5, 7, 3, 2, %2 ; m1=t2, m5=t6
+ UNSCRATCH 2, 12, rsp+21*mmsize
+ SUMSUB_PACK_D 6, 2, 0, 4, 3, %2 ; m6=t3, m2=t7
+
+ UNSCRATCH 7, 10, rsp+19*mmsize
+ UNSCRATCH 0, 11, rsp+20*mmsize
+ SCRATCH 1, 10, rsp+19*mmsize
+ SCRATCH 6, 11, rsp+20*mmsize
+
+ SUMSUB_MUL_D 7, 0, 3, 4, 15137, 6270, %3 ; m7/8=t4a, m0/9=t5a
+ SUMSUB_MUL_D 2, 5, 1, 6, 6270, 15137, %3 ; m2/10=t7a, m5/11=t6a
+ SCRATCH 2, 12, rsp+21*mmsize
+ SUMSUB_PACK_D 5, 7, 6, 3, 2, %2 ; m5=-out1, m7=t6
+ UNSCRATCH 2, 12, rsp+21*mmsize
+ NEGD m5 ; m5=out1
+ SUMSUB_PACK_D 2, 0, 1, 4, 3, %2 ; m2=out6, m0=t7
+ SUMSUB_MUL 7, 0, 3, 4, 11585, 11585, %2, %3 ; m7=out2, m0=-out5
+ NEGD m0 ; m0=out5
+
+ UNSCRATCH 3, 8, rsp+17*mmsize
+ UNSCRATCH 4, 9, rsp+18*mmsize
+ UNSCRATCH 1, 10, rsp+19*mmsize
+ UNSCRATCH 6, 11, rsp+20*mmsize
+ SCRATCH 2, 8, rsp+17*mmsize
+ SCRATCH 0, 9, rsp+18*mmsize
+
+ SUMSUB_BA d, 1, 3, 2 ; m1=out0, m3=t2
+ SUMSUB_BA d, 6, 4, 2 ; m6=-out7, m4=t3
+ NEGD m6 ; m6=out7
+ SUMSUB_MUL 3, 4, 2, 0, 11585, 11585, %2, %3 ; m3=-out3, m4=out4
+ NEGD m3 ; m3=out3
+
+ UNSCRATCH 0, 9, rsp+18*mmsize
+
+ SWAP 0, 1, 5
+ SWAP 2, 7, 6
+%endmacro
+
+%macro IADST8_FN 5
+cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
+ 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_1023]
+
+.body:
+ SCRATCH 0, 13, rsp+16*mmsize, max
+ DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+ mov dstbakq, dstq
+ movsxd cntq, cntd
+%endif
+%ifdef PIC
+ lea ptrq, [%5_8x8]
+ movzx cntd, byte [ptrq+cntq-1]
+%else
+ movzx cntd, byte [%5_8x8+cntq-1]
+%endif
+ mov skipd, 2
+ sub skipd, cntd
+ mov ptrq, rsp
+ PRELOAD 14, pd_8192, rnd
+ PRELOAD 15, pd_3fff, mask
+.loop_1:
+ %2_1D blockq, reg_rnd, reg_mask
+
+ TRANSPOSE4x4D 0, 1, 2, 3, 6
+ mova [ptrq+ 0*mmsize], m0
+ mova [ptrq+ 2*mmsize], m1
+ mova [ptrq+ 4*mmsize], m2
+ mova [ptrq+ 6*mmsize], m3
+ UNSCRATCH 6, 8, rsp+17*mmsize
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 1*mmsize], m4
+ mova [ptrq+ 3*mmsize], m5
+ mova [ptrq+ 5*mmsize], m6
+ mova [ptrq+ 7*mmsize], m7
+ add ptrq, 8 * mmsize
+ add blockq, mmsize
+ dec cntd
+ jg .loop_1
+
+ ; zero-pad the remainder (skipped cols)
+ test skipd, skipd
+ jz .end
+ add skipd, skipd
+ lea blockq, [blockq+skipq*(mmsize/2)]
+ pxor m0, m0
+.loop_z:
+ mova [ptrq+mmsize*0], m0
+ mova [ptrq+mmsize*1], m0
+ mova [ptrq+mmsize*2], m0
+ mova [ptrq+mmsize*3], m0
+ add ptrq, 4 * mmsize
+ dec skipd
+ jg .loop_z
+.end:
+
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea stride3q, [strideq*3]
+ mov cntd, 2
+ mov ptrq, rsp
+.loop_2:
+ %4_1D ptrq, reg_rnd, reg_mask
+
+ pxor m6, m6
+ PRELOAD 9, pd_16, srnd
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5
+ lea dstq, [dstq+strideq*4]
+ UNSCRATCH 0, 8, rsp+17*mmsize
+ UNSCRATCH 1, 13, rsp+16*mmsize, max
+ UNSCRATCH 2, 9, pd_16, srnd
+ ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5
+ add ptrq, 16
+%if ARCH_X86_64
+ lea dstq, [dstbakq+8]
+%else
+ mov dstq, dstm
+ add dstq, 8
+%endif
+ dec cntd
+ jg .loop_2
+
+ ; m6 is still zero
+ ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
+ RET
+
+cglobal vp9_%1_%3_8x8_add_12, 4, 6 + ARCH_X86_64, 16, \
+ 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_%1_%3_8x8_add_10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+IADST8_FN idct, IDCT8, iadst, IADST8, row
+IADST8_FN iadst, IADST8, idct, IDCT8, col
+IADST8_FN iadst, IADST8, iadst, IADST8, default
+
+%macro IDCT16_1D 1-4 4 * mmsize, 65, 67 ; src, src_stride, stack_offset, mm32bit_stack_offset
+ IDCT8_1D %1, [pd_8192], [pd_3fff], %2 * 2, %4 ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7
+ ; SCRATCH 6, 8, rsp+(%4+0)*mmsize ; t6
+ SCRATCH 0, 15, rsp+(%4+7)*mmsize ; t0a
+ SCRATCH 1, 14, rsp+(%4+6)*mmsize ; t1a
+ SCRATCH 2, 13, rsp+(%4+5)*mmsize ; t2a
+ SCRATCH 3, 12, rsp+(%4+4)*mmsize ; t3a
+ SCRATCH 4, 11, rsp+(%4+3)*mmsize ; t4
+ mova [rsp+(%3+0)*mmsize], m5 ; t5
+ mova [rsp+(%3+1)*mmsize], m7 ; t7
+
+ mova m0, [%1+ 1*%2] ; in1
+ mova m3, [%1+ 7*%2] ; in7
+ mova m4, [%1+ 9*%2] ; in9
+ mova m7, [%1+15*%2] ; in15
+
+ SUMSUB_MUL 0, 7, 1, 2, 16305, 1606 ; m0=t15a, m7=t8a
+ SUMSUB_MUL 4, 3, 1, 2, 10394, 12665 ; m4=t14a, m3=t9a
+ SUMSUB_BA d, 3, 7, 1 ; m3=t8, m7=t9
+ SUMSUB_BA d, 4, 0, 1 ; m4=t15,m0=t14
+ SUMSUB_MUL 0, 7, 1, 2, 15137, 6270 ; m0=t14a, m7=t9a
+
+ mova m1, [%1+ 3*%2] ; in3
+ mova m2, [%1+ 5*%2] ; in5
+ mova m5, [%1+11*%2] ; in11
+ mova m6, [%1+13*%2] ; in13
+
+ SCRATCH 0, 9, rsp+(%4+1)*mmsize
+ SCRATCH 7, 10, rsp+(%4+2)*mmsize
+
+ SUMSUB_MUL 2, 5, 0, 7, 14449, 7723 ; m2=t13a, m5=t10a
+ SUMSUB_MUL 6, 1, 0, 7, 4756, 15679 ; m6=t12a, m1=t11a
+ SUMSUB_BA d, 5, 1, 0 ; m5=t11,m1=t10
+ SUMSUB_BA d, 2, 6, 0 ; m2=t12,m6=t13
+ NEGD m1 ; m1=-t10
+ SUMSUB_MUL 1, 6, 0, 7, 15137, 6270 ; m1=t13a, m6=t10a
+
+ UNSCRATCH 7, 10, rsp+(%4+2)*mmsize
+ SUMSUB_BA d, 5, 3, 0 ; m5=t8a, m3=t11a
+ SUMSUB_BA d, 6, 7, 0 ; m6=t9, m7=t10
+ SUMSUB_BA d, 2, 4, 0 ; m2=t15a,m4=t12a
+ SCRATCH 5, 10, rsp+(%4+2)*mmsize
+ SUMSUB_MUL 4, 3, 0, 5, 11585, 11585 ; m4=t12, m3=t11
+ UNSCRATCH 0, 9, rsp+(%4+1)*mmsize
+ SUMSUB_BA d, 1, 0, 5 ; m1=t14, m0=t13
+ SCRATCH 6, 9, rsp+(%4+1)*mmsize
+ SUMSUB_MUL 0, 7, 6, 5, 11585, 11585 ; m0=t13a,m7=t10a
+
+ ; order: 15|r74,14|r73,13|r72,12|r71,11|r70,r65,8|r67,r66,10|r69,9|r68,7,3,4,0,1,2
+ ; free: 6,5
+
+ UNSCRATCH 5, 15, rsp+(%4+7)*mmsize
+ SUMSUB_BA d, 2, 5, 6 ; m2=out0, m5=out15
+ SCRATCH 5, 15, rsp+(%4+7)*mmsize
+ UNSCRATCH 5, 14, rsp+(%4+6)*mmsize
+ SUMSUB_BA d, 1, 5, 6 ; m1=out1, m5=out14
+ SCRATCH 5, 14, rsp+(%4+6)*mmsize
+ UNSCRATCH 5, 13, rsp+(%4+5)*mmsize
+ SUMSUB_BA d, 0, 5, 6 ; m0=out2, m5=out13
+ SCRATCH 5, 13, rsp+(%4+5)*mmsize
+ UNSCRATCH 5, 12, rsp+(%4+4)*mmsize
+ SUMSUB_BA d, 4, 5, 6 ; m4=out3, m5=out12
+ SCRATCH 5, 12, rsp+(%4+4)*mmsize
+ UNSCRATCH 5, 11, rsp+(%4+3)*mmsize
+ SUMSUB_BA d, 3, 5, 6 ; m3=out4, m5=out11
+ SCRATCH 4, 11, rsp+(%4+3)*mmsize
+ mova m4, [rsp+(%3+0)*mmsize]
+ SUMSUB_BA d, 7, 4, 6 ; m7=out5, m4=out10
+ mova [rsp+(%3+0)*mmsize], m5
+ UNSCRATCH 5, 8, rsp+(%4+0)*mmsize
+ UNSCRATCH 6, 9, rsp+(%4+1)*mmsize
+ SCRATCH 2, 8, rsp+(%4+0)*mmsize
+ SCRATCH 1, 9, rsp+(%4+1)*mmsize
+ UNSCRATCH 1, 10, rsp+(%4+2)*mmsize
+ SCRATCH 0, 10, rsp+(%4+2)*mmsize
+ mova m0, [rsp+(%3+1)*mmsize]
+ SUMSUB_BA d, 6, 5, 2 ; m6=out6, m5=out9
+ SUMSUB_BA d, 1, 0, 2 ; m1=out7, m0=out8
+
+ SWAP 0, 3, 1, 7, 2, 6, 4
+
+ ; output order: 8-11|r67-70=out0-3
+ ; 0-6,r65=out4-11
+ ; 12-15|r71-74=out12-15
+%endmacro
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
+ 67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_1023]
+ cmp eobd, 1
+ jg .idctfull
+
+ ; dc-only - the 10bit version can be done entirely in 32bit, since the max
+ ; coef value is 17 bits + sign and the multiplier constant is 14 bits, so
+ ; 31+sign easily fits in 32bit
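+ ; (worst case: 2^17 * (2^14 - 1) = 2^31 - 2^17, comfortably below the
+ ; signed 32-bit limit)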
+ DEFINE_ARGS dst, stride, block, coef
+ pxor m2, m2
+ DC_ONLY 6, m2
+ movd m1, coefd
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 8
+.loop_dc:
+ STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
+ STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop_dc
+ RET
+
+.idctfull:
+ mova [rsp+64*mmsize], m0
+ DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+ mov dstbakq, dstq
+ movsxd cntq, cntd
+%endif
+%ifdef PIC
+ lea ptrq, [default_16x16]
+ movzx cntd, byte [ptrq+cntq-1]
+%else
+ movzx cntd, byte [default_16x16+cntq-1]
+%endif
+ mov skipd, 4
+ sub skipd, cntd
+ mov ptrq, rsp
+.loop_1:
+ IDCT16_1D blockq
+
+ TRANSPOSE4x4D 0, 1, 2, 3, 7
+ mova [ptrq+ 1*mmsize], m0
+ mova [ptrq+ 5*mmsize], m1
+ mova [ptrq+ 9*mmsize], m2
+ mova [ptrq+13*mmsize], m3
+ mova m7, [rsp+65*mmsize]
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 2*mmsize], m4
+ mova [ptrq+ 6*mmsize], m5
+ mova [ptrq+10*mmsize], m6
+ mova [ptrq+14*mmsize], m7
+ UNSCRATCH 0, 8, rsp+67*mmsize
+ UNSCRATCH 1, 9, rsp+68*mmsize
+ UNSCRATCH 2, 10, rsp+69*mmsize
+ UNSCRATCH 3, 11, rsp+70*mmsize
+ TRANSPOSE4x4D 0, 1, 2, 3, 7
+ mova [ptrq+ 0*mmsize], m0
+ mova [ptrq+ 4*mmsize], m1
+ mova [ptrq+ 8*mmsize], m2
+ mova [ptrq+12*mmsize], m3
+ UNSCRATCH 4, 12, rsp+71*mmsize
+ UNSCRATCH 5, 13, rsp+72*mmsize
+ UNSCRATCH 6, 14, rsp+73*mmsize
+ UNSCRATCH 7, 15, rsp+74*mmsize
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 3*mmsize], m4
+ mova [ptrq+ 7*mmsize], m5
+ mova [ptrq+11*mmsize], m6
+ mova [ptrq+15*mmsize], m7
+ add ptrq, 16 * mmsize
+ add blockq, mmsize
+ dec cntd
+ jg .loop_1
+
+ ; zero-pad the remainder (skipped cols)
+ test skipd, skipd
+ jz .end
+ add skipd, skipd
+ lea blockq, [blockq+skipq*(mmsize/2)]
+ pxor m0, m0
+.loop_z:
+ mova [ptrq+mmsize*0], m0
+ mova [ptrq+mmsize*1], m0
+ mova [ptrq+mmsize*2], m0
+ mova [ptrq+mmsize*3], m0
+ mova [ptrq+mmsize*4], m0
+ mova [ptrq+mmsize*5], m0
+ mova [ptrq+mmsize*6], m0
+ mova [ptrq+mmsize*7], m0
+ add ptrq, 8 * mmsize
+ dec skipd
+ jg .loop_z
+.end:
+
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+ mov ptrq, rsp
+.loop_2:
+ IDCT16_1D ptrq
+
+ pxor m7, m7
+ lea dstq, [dstq+strideq*4]
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
+ lea dstq, [dstq+strideq*4]
+ mova m0, [rsp+65*mmsize]
+ mova m1, [rsp+64*mmsize]
+ mova m2, [pd_32]
+ ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
+
+%if ARCH_X86_64
+ DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
+%else
+ mov dstq, dstm
+%endif
+ UNSCRATCH 0, 8, rsp+67*mmsize
+ UNSCRATCH 4, 9, rsp+68*mmsize
+ UNSCRATCH 5, 10, rsp+69*mmsize
+ UNSCRATCH 3, 11, rsp+70*mmsize
+ ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea dstq, [dstbakq+stride3q*4]
+%else
+ lea dstq, [dstq+stride3q*4]
+%endif
+ UNSCRATCH 4, 12, rsp+71*mmsize
+ UNSCRATCH 5, 13, rsp+72*mmsize
+ UNSCRATCH 6, 14, rsp+73*mmsize
+ UNSCRATCH 0, 15, rsp+74*mmsize
+ ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
+
+ add ptrq, mmsize
+%if ARCH_X86_64
+ add dstbakq, 8
+ mov dstq, dstbakq
+%else
+ add dword dstm, 8
+ mov dstq, dstm
+%endif
+ dec cntd
+ jg .loop_2
+
+ ; m7 is still zero
+ ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
+ RET
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
+ 67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_4095]
+ cmp eobd, 1
+ jg mangle(private_prefix %+ _ %+ vp9_idct_idct_16x16_add_10 %+ SUFFIX).idctfull
+
+ ; dc-only - unfortunately, this one can overflow, since 12 bpp coefs are
+ ; 19 bits + sign, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies
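+ ; (2^19 * 2^14 = 2^33, so a single 32-bit product would already overflow)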
+ DEFINE_ARGS dst, stride, block, coef, coefl
+ pxor m2, m2
+ DC_ONLY_64BIT 6, m2
+ movd m1, coefd
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 8
+.loop_dc:
+ STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
+ STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop_dc
+ RET
+
+; r65-69 are available for spills
+; r70-77 are available on x86-32 only (x86-64 should use m8-15)
+; output should be in m8-11|r70-73, m0-6,r65 and m12-15|r74-77
+%macro IADST16_1D 1 ; src
+ mova m0, [%1+ 0*4*mmsize] ; in0
+ mova m1, [%1+ 7*4*mmsize] ; in7
+ mova m2, [%1+ 8*4*mmsize] ; in8
+ mova m3, [%1+15*4*mmsize] ; in15
+ SUMSUB_MUL_D 3, 0, 4, 5, 16364, 804 ; m3/4=t0, m0/5=t1
+ SUMSUB_MUL_D 1, 2, 6, 7, 11003, 12140 ; m1/6=t8, m2/7=t9
+ SCRATCH 0, 8, rsp+70*mmsize
+ SUMSUB_PACK_D 1, 3, 6, 4, 0 ; m1=t0a, m3=t8a
+ UNSCRATCH 0, 8, rsp+70*mmsize
+ SUMSUB_PACK_D 2, 0, 7, 5, 4 ; m2=t1a, m0=t9a
+ mova [rsp+67*mmsize], m1
+ SCRATCH 2, 9, rsp+71*mmsize
+ SCRATCH 3, 12, rsp+74*mmsize
+ SCRATCH 0, 13, rsp+75*mmsize
+
+ mova m0, [%1+ 3*4*mmsize] ; in3
+ mova m1, [%1+ 4*4*mmsize] ; in4
+ mova m2, [%1+11*4*mmsize] ; in11
+ mova m3, [%1+12*4*mmsize] ; in12
+ SUMSUB_MUL_D 2, 1, 4, 5, 14811, 7005 ; m2/4=t4, m1/5=t5
+ SUMSUB_MUL_D 0, 3, 6, 7, 5520, 15426 ; m0/6=t12, m3/7=t13
+ SCRATCH 1, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 0, 2, 6, 4, 1 ; m0=t4a, m2=t12a
+ UNSCRATCH 1, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=t5a, m1=t13a
+ SCRATCH 0, 15, rsp+77*mmsize
+ SCRATCH 3, 11, rsp+73*mmsize
+
+ UNSCRATCH 0, 12, rsp+74*mmsize ; t8a
+ UNSCRATCH 3, 13, rsp+75*mmsize ; t9a
+ SUMSUB_MUL_D 0, 3, 4, 5, 16069, 3196 ; m0/4=t8, m3/5=t9
+ SUMSUB_MUL_D 1, 2, 6, 7, 3196, 16069 ; m1/6=t13, m2/7=t12
+ SCRATCH 1, 12, rsp+74*mmsize
+ SUMSUB_PACK_D 2, 0, 7, 4, 1 ; m2=t8a, m0=t12a
+ UNSCRATCH 1, 12, rsp+74*mmsize
+ SUMSUB_PACK_D 1, 3, 6, 5, 4 ; m1=t9a, m3=t13a
+ mova [rsp+65*mmsize], m2
+ mova [rsp+66*mmsize], m1
+ SCRATCH 0, 8, rsp+70*mmsize
+ SCRATCH 3, 12, rsp+74*mmsize
+
+ mova m0, [%1+ 2*4*mmsize] ; in2
+ mova m1, [%1+ 5*4*mmsize] ; in5
+ mova m2, [%1+10*4*mmsize] ; in10
+ mova m3, [%1+13*4*mmsize] ; in13
+ SUMSUB_MUL_D 3, 0, 4, 5, 15893, 3981 ; m3/4=t2, m0/5=t3
+ SUMSUB_MUL_D 1, 2, 6, 7, 8423, 14053 ; m1/6=t10, m2/7=t11
+ SCRATCH 0, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 1, 3, 6, 4, 0 ; m1=t2a, m3=t10a
+ UNSCRATCH 0, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 2, 0, 7, 5, 4 ; m2=t3a, m0=t11a
+ mova [rsp+68*mmsize], m1
+ mova [rsp+69*mmsize], m2
+ SCRATCH 3, 13, rsp+75*mmsize
+ SCRATCH 0, 14, rsp+76*mmsize
+
+ mova m0, [%1+ 1*4*mmsize] ; in1
+ mova m1, [%1+ 6*4*mmsize] ; in6
+ mova m2, [%1+ 9*4*mmsize] ; in9
+ mova m3, [%1+14*4*mmsize] ; in14
+ SUMSUB_MUL_D 2, 1, 4, 5, 13160, 9760 ; m2/4=t6, m1/5=t7
+ SUMSUB_MUL_D 0, 3, 6, 7, 2404, 16207 ; m0/6=t14, m3/7=t15
+ SCRATCH 1, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 0, 2, 6, 4, 1 ; m0=t6a, m2=t14a
+ UNSCRATCH 1, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=t7a, m1=t15a
+
+ UNSCRATCH 4, 13, rsp+75*mmsize ; t10a
+ UNSCRATCH 5, 14, rsp+76*mmsize ; t11a
+ SCRATCH 0, 13, rsp+75*mmsize
+ SCRATCH 3, 14, rsp+76*mmsize
+ SUMSUB_MUL_D 4, 5, 6, 7, 9102, 13623 ; m4/6=t10, m5/7=t11
+ SUMSUB_MUL_D 1, 2, 0, 3, 13623, 9102 ; m1/0=t15, m2/3=t14
+ SCRATCH 0, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 2, 4, 3, 6, 0 ; m2=t10a, m4=t14a
+ UNSCRATCH 0, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 1, 5, 0, 7, 6 ; m1=t11a, m5=t15a
+
+ UNSCRATCH 0, 8, rsp+70*mmsize ; t12a
+ UNSCRATCH 3, 12, rsp+74*mmsize ; t13a
+ SCRATCH 2, 8, rsp+70*mmsize
+ SCRATCH 1, 12, rsp+74*mmsize
+ SUMSUB_MUL_D 0, 3, 1, 2, 15137, 6270 ; m0/1=t12, m3/2=t13
+ SUMSUB_MUL_D 5, 4, 7, 6, 6270, 15137 ; m5/7=t15, m4/6=t14
+ SCRATCH 2, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 4, 0, 6, 1, 2 ; m4=out2, m0=t14a
+ UNSCRATCH 2, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 5, 3, 7, 2, 1 ; m5=-out13, m3=t15a
+ NEGD m5 ; m5=out13
+
+ UNSCRATCH 1, 9, rsp+71*mmsize ; t1a
+ mova m2, [rsp+68*mmsize] ; t2a
+ UNSCRATCH 6, 13, rsp+75*mmsize ; t6a
+ UNSCRATCH 7, 14, rsp+76*mmsize ; t7a
+ SCRATCH 4, 10, rsp+72*mmsize
+ SCRATCH 5, 13, rsp+75*mmsize
+ UNSCRATCH 4, 15, rsp+77*mmsize ; t4a
+ UNSCRATCH 5, 11, rsp+73*mmsize ; t5a
+ SCRATCH 0, 14, rsp+76*mmsize
+ SCRATCH 3, 15, rsp+77*mmsize
+ mova m0, [rsp+67*mmsize] ; t0a
+ SUMSUB_BA d, 4, 0, 3 ; m4=t0, m0=t4
+ SUMSUB_BA d, 5, 1, 3 ; m5=t1, m1=t5
+ SUMSUB_BA d, 6, 2, 3 ; m6=t2, m2=t6
+ SCRATCH 4, 9, rsp+71*mmsize
+ mova m3, [rsp+69*mmsize] ; t3a
+ SUMSUB_BA d, 7, 3, 4 ; m7=t3, m3=t7
+
+ mova [rsp+67*mmsize], m5
+ mova [rsp+68*mmsize], m6
+ mova [rsp+69*mmsize], m7
+ SUMSUB_MUL_D 0, 1, 4, 5, 15137, 6270 ; m0/4=t4a, m1/5=t5a
+ SUMSUB_MUL_D 3, 2, 7, 6, 6270, 15137 ; m3/7=t7a, m2/6=t6a
+ SCRATCH 1, 11, rsp+73*mmsize
+ SUMSUB_PACK_D 2, 0, 6, 4, 1 ; m2=-out3, m0=t6
+ NEGD m2 ; m2=out3
+ UNSCRATCH 1, 11, rsp+73*mmsize
+ SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=out12, m1=t7
+ SCRATCH 2, 11, rsp+73*mmsize
+ UNSCRATCH 2, 12, rsp+74*mmsize ; t11a
+ SCRATCH 3, 12, rsp+74*mmsize
+
+ UNSCRATCH 3, 8, rsp+70*mmsize ; t10a
+ mova m4, [rsp+65*mmsize] ; t8a
+ mova m5, [rsp+66*mmsize] ; t9a
+ SUMSUB_BA d, 3, 4, 6 ; m3=-out1, m4=t10
+ NEGD m3 ; m3=out1
+ SUMSUB_BA d, 2, 5, 6 ; m2=out14, m5=t11
+ UNSCRATCH 6, 9, rsp+71*mmsize ; t0
+ UNSCRATCH 7, 14, rsp+76*mmsize ; t14a
+ SCRATCH 3, 9, rsp+71*mmsize
+ SCRATCH 2, 14, rsp+76*mmsize
+
+ SUMSUB_MUL 1, 0, 2, 3, 11585, 11585 ; m1=out4, m0=out11
+ mova [rsp+65*mmsize], m0
+ SUMSUB_MUL 5, 4, 2, 3, 11585, 11585 ; m5=out6, m4=out9
+ UNSCRATCH 0, 15, rsp+77*mmsize ; t15a
+ SUMSUB_MUL 7, 0, 2, 3, 11585, m11585 ; m7=out10, m0=out5
+
+ mova m2, [rsp+68*mmsize] ; t2
+ SUMSUB_BA d, 2, 6, 3 ; m2=out0, m6=t2a
+ SCRATCH 2, 8, rsp+70*mmsize
+ mova m2, [rsp+67*mmsize] ; t1
+ mova m3, [rsp+69*mmsize] ; t3
+ mova [rsp+67*mmsize], m7
+ SUMSUB_BA d, 3, 2, 7 ; m3=-out15, m2=t3a
+ NEGD m3 ; m3=out15
+ SCRATCH 3, 15, rsp+77*mmsize
+ SUMSUB_MUL 6, 2, 7, 3, 11585, m11585 ; m6=out8, m2=out7
+ mova m7, [rsp+67*mmsize]
+
+ SWAP 0, 1
+ SWAP 2, 5, 4, 6, 7, 3
+%endmacro
+
+%macro IADST16_FN 7
+cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
+ 70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_1023]
+
+.body:
+ mova [rsp+64*mmsize], m0
+ DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+ mov dstbakq, dstq
+ movsxd cntq, cntd
+%endif
+%ifdef PIC
+ lea ptrq, [%7_16x16]
+ movzx cntd, byte [ptrq+cntq-1]
+%else
+ movzx cntd, byte [%7_16x16+cntq-1]
+%endif
+ mov skipd, 4
+ sub skipd, cntd
+ mov ptrq, rsp
+.loop_1:
+ %2_1D blockq
+
+ TRANSPOSE4x4D 0, 1, 2, 3, 7
+ mova [ptrq+ 1*mmsize], m0
+ mova [ptrq+ 5*mmsize], m1
+ mova [ptrq+ 9*mmsize], m2
+ mova [ptrq+13*mmsize], m3
+ mova m7, [rsp+65*mmsize]
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 2*mmsize], m4
+ mova [ptrq+ 6*mmsize], m5
+ mova [ptrq+10*mmsize], m6
+ mova [ptrq+14*mmsize], m7
+ UNSCRATCH 0, 8, rsp+(%3+0)*mmsize
+ UNSCRATCH 1, 9, rsp+(%3+1)*mmsize
+ UNSCRATCH 2, 10, rsp+(%3+2)*mmsize
+ UNSCRATCH 3, 11, rsp+(%3+3)*mmsize
+ TRANSPOSE4x4D 0, 1, 2, 3, 7
+ mova [ptrq+ 0*mmsize], m0
+ mova [ptrq+ 4*mmsize], m1
+ mova [ptrq+ 8*mmsize], m2
+ mova [ptrq+12*mmsize], m3
+ UNSCRATCH 4, 12, rsp+(%3+4)*mmsize
+ UNSCRATCH 5, 13, rsp+(%3+5)*mmsize
+ UNSCRATCH 6, 14, rsp+(%3+6)*mmsize
+ UNSCRATCH 7, 15, rsp+(%3+7)*mmsize
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 3*mmsize], m4
+ mova [ptrq+ 7*mmsize], m5
+ mova [ptrq+11*mmsize], m6
+ mova [ptrq+15*mmsize], m7
+ add ptrq, 16 * mmsize
+ add blockq, mmsize
+ dec cntd
+ jg .loop_1
+
+ ; zero-pad the remainder (skipped cols)
+ test skipd, skipd
+ jz .end
+ add skipd, skipd
+ lea blockq, [blockq+skipq*(mmsize/2)]
+ pxor m0, m0
+.loop_z:
+ mova [ptrq+mmsize*0], m0
+ mova [ptrq+mmsize*1], m0
+ mova [ptrq+mmsize*2], m0
+ mova [ptrq+mmsize*3], m0
+ mova [ptrq+mmsize*4], m0
+ mova [ptrq+mmsize*5], m0
+ mova [ptrq+mmsize*6], m0
+ mova [ptrq+mmsize*7], m0
+ add ptrq, 8 * mmsize
+ dec skipd
+ jg .loop_z
+.end:
+
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+ mov ptrq, rsp
+.loop_2:
+ %5_1D ptrq
+
+ pxor m7, m7
+ lea dstq, [dstq+strideq*4]
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
+ lea dstq, [dstq+strideq*4]
+ mova m0, [rsp+65*mmsize]
+ mova m1, [rsp+64*mmsize]
+ mova m2, [pd_32]
+ ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
+
+%if ARCH_X86_64
+ DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
+%else
+ mov dstq, dstm
+%endif
+ UNSCRATCH 0, 8, rsp+(%6+0)*mmsize
+ UNSCRATCH 4, 9, rsp+(%6+1)*mmsize
+ UNSCRATCH 5, 10, rsp+(%6+2)*mmsize
+ UNSCRATCH 3, 11, rsp+(%6+3)*mmsize
+ ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea dstq, [dstbakq+stride3q*4]
+%else
+ lea dstq, [dstq+stride3q*4]
+%endif
+ UNSCRATCH 4, 12, rsp+(%6+4)*mmsize
+ UNSCRATCH 5, 13, rsp+(%6+5)*mmsize
+ UNSCRATCH 6, 14, rsp+(%6+6)*mmsize
+ UNSCRATCH 0, 15, rsp+(%6+7)*mmsize
+ ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
+
+ add ptrq, mmsize
+%if ARCH_X86_64
+ add dstbakq, 8
+ mov dstq, dstbakq
+%else
+ add dword dstm, 8
+ mov dstq, dstm
+%endif
+ dec cntd
+ jg .loop_2
+
+ ; m7 is still zero
+ ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
+ RET
+
+cglobal vp9_%1_%4_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
+ 70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_%1_%4_16x16_add_10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+IADST16_FN idct, IDCT16, 67, iadst, IADST16, 70, row
+IADST16_FN iadst, IADST16, 70, idct, IDCT16, 67, col
+IADST16_FN iadst, IADST16, 70, iadst, IADST16, 70, default
+
+%macro IDCT32_1D 2-3 8 * mmsize ; pass[1/2], src, src_stride
+ IDCT16_1D %2, 2 * %3, 272, 257
+%if ARCH_X86_64
+ mova [rsp+257*mmsize], m8
+ mova [rsp+258*mmsize], m9
+ mova [rsp+259*mmsize], m10
+ mova [rsp+260*mmsize], m11
+ mova [rsp+261*mmsize], m12
+ mova [rsp+262*mmsize], m13
+ mova [rsp+263*mmsize], m14
+ mova [rsp+264*mmsize], m15
+%endif
+ mova [rsp+265*mmsize], m0
+ mova [rsp+266*mmsize], m1
+ mova [rsp+267*mmsize], m2
+ mova [rsp+268*mmsize], m3
+ mova [rsp+269*mmsize], m4
+ mova [rsp+270*mmsize], m5
+ mova [rsp+271*mmsize], m6
+
+ ; r257-260: t0-3
+ ; r265-272: t4/5a/6a/7/8/9a/10/11a
+ ; r261-264: t12a/13/14a/15
+ ; r273-274 are free as scratch space, and r275-282 mirror m8-15 on 32bit
+
+ mova m0, [%2+ 1*%3] ; in1
+ mova m1, [%2+15*%3] ; in15
+ mova m2, [%2+17*%3] ; in17
+ mova m3, [%2+31*%3] ; in31
+ SUMSUB_MUL 0, 3, 4, 5, 16364, 804 ; m0=t31a, m3=t16a
+ SUMSUB_MUL 2, 1, 4, 5, 11003, 12140 ; m2=t30a, m1=t17a
+ SUMSUB_BA d, 1, 3, 4 ; m1=t16, m3=t17
+ SUMSUB_BA d, 2, 0, 4 ; m2=t31, m0=t30
+ SUMSUB_MUL 0, 3, 4, 5, 16069, 3196 ; m0=t30a, m3=t17a
+ SCRATCH 0, 8, rsp+275*mmsize
+ SCRATCH 2, 9, rsp+276*mmsize
+
+ ; end of stage 1-3 first quart
+
+ mova m0, [%2+ 7*%3] ; in7
+ mova m2, [%2+ 9*%3] ; in9
+ mova m4, [%2+23*%3] ; in23
+ mova m5, [%2+25*%3] ; in25
+ SUMSUB_MUL 2, 4, 6, 7, 14811, 7005 ; m2=t29a, m4=t18a
+ SUMSUB_MUL 5, 0, 6, 7, 5520, 15426 ; m5=t28a, m0=t19a
+ SUMSUB_BA d, 4, 0, 6 ; m4=t19, m0=t18
+ SUMSUB_BA d, 2, 5, 6 ; m2=t28, m5=t29
+ SUMSUB_MUL 5, 0, 6, 7, 3196, m16069 ; m5=t29a, m0=t18a
+
+ ; end of stage 1-3 second quart
+
+ SUMSUB_BA d, 4, 1, 6 ; m4=t16a, m1=t19a
+ SUMSUB_BA d, 0, 3, 6 ; m0=t17, m3=t18
+ UNSCRATCH 6, 8, rsp+275*mmsize ; t30a
+ UNSCRATCH 7, 9, rsp+276*mmsize ; t31
+ mova [rsp+273*mmsize], m4
+ mova [rsp+274*mmsize], m0
+ SUMSUB_BA d, 2, 7, 0 ; m2=t31a, m7=t28a
+ SUMSUB_BA d, 5, 6, 0 ; m5=t30, m6=t29
+ SUMSUB_MUL 6, 3, 0, 4, 15137, 6270 ; m6=t29a, m3=t18a
+ SUMSUB_MUL 7, 1, 0, 4, 15137, 6270 ; m7=t28, m1=t19
+ SCRATCH 3, 10, rsp+277*mmsize
+ SCRATCH 1, 11, rsp+278*mmsize
+ SCRATCH 7, 12, rsp+279*mmsize
+ SCRATCH 6, 13, rsp+280*mmsize
+ SCRATCH 5, 14, rsp+281*mmsize
+ SCRATCH 2, 15, rsp+282*mmsize
+
+ ; end of stage 4-5 first half
+
+ mova m0, [%2+ 5*%3] ; in5
+ mova m1, [%2+11*%3] ; in11
+ mova m2, [%2+21*%3] ; in21
+ mova m3, [%2+27*%3] ; in27
+ SUMSUB_MUL 0, 3, 4, 5, 15893, 3981 ; m0=t27a, m3=t20a
+ SUMSUB_MUL 2, 1, 4, 5, 8423, 14053 ; m2=t26a, m1=t21a
+ SUMSUB_BA d, 1, 3, 4 ; m1=t20, m3=t21
+ SUMSUB_BA d, 2, 0, 4 ; m2=t27, m0=t26
+ SUMSUB_MUL 0, 3, 4, 5, 9102, 13623 ; m0=t26a, m3=t21a
+ SCRATCH 0, 8, rsp+275*mmsize
+ SCRATCH 2, 9, rsp+276*mmsize
+
+ ; end of stage 1-3 third quart
+
+ mova m0, [%2+ 3*%3] ; in3
+ mova m2, [%2+13*%3] ; in13
+ mova m4, [%2+19*%3] ; in19
+ mova m5, [%2+29*%3] ; in29
+ SUMSUB_MUL 2, 4, 6, 7, 13160, 9760 ; m2=t25a, m4=t22a
+ SUMSUB_MUL 5, 0, 6, 7, 2404, 16207 ; m5=t24a, m0=t23a
+ SUMSUB_BA d, 4, 0, 6 ; m4=t23, m0=t22
+ SUMSUB_BA d, 2, 5, 6 ; m2=t24, m5=t25
+ SUMSUB_MUL 5, 0, 6, 7, 13623, m9102 ; m5=t25a, m0=t22a
+
+ ; end of stage 1-3 fourth quart
+
+ SUMSUB_BA d, 1, 4, 6 ; m1=t23a, m4=t20a
+ SUMSUB_BA d, 3, 0, 6 ; m3=t22, m0=t21
+ UNSCRATCH 6, 8, rsp+275*mmsize ; t26a
+ UNSCRATCH 7, 9, rsp+276*mmsize ; t27
+ SCRATCH 3, 8, rsp+275*mmsize
+ SCRATCH 1, 9, rsp+276*mmsize
+ SUMSUB_BA d, 7, 2, 1 ; m7=t24a, m2=t27a
+ SUMSUB_BA d, 6, 5, 1 ; m6=t25, m5=t26
+ SUMSUB_MUL 2, 4, 1, 3, 6270, m15137 ; m2=t27, m4=t20
+ SUMSUB_MUL 5, 0, 1, 3, 6270, m15137 ; m5=t26a, m0=t21a
+
+ ; end of stage 4-5 second half
+
+ UNSCRATCH 1, 12, rsp+279*mmsize ; t28
+ UNSCRATCH 3, 13, rsp+280*mmsize ; t29a
+ SCRATCH 4, 12, rsp+279*mmsize
+ SCRATCH 0, 13, rsp+280*mmsize
+ SUMSUB_BA d, 5, 3, 0 ; m5=t29, m3=t26
+ SUMSUB_BA d, 2, 1, 0 ; m2=t28a, m1=t27a
+ UNSCRATCH 0, 14, rsp+281*mmsize ; t30
+ UNSCRATCH 4, 15, rsp+282*mmsize ; t31a
+ SCRATCH 2, 14, rsp+281*mmsize
+ SCRATCH 5, 15, rsp+282*mmsize
+ SUMSUB_BA d, 6, 0, 2 ; m6=t30a, m0=t25a
+ SUMSUB_BA d, 7, 4, 2 ; m7=t31, m4=t24
+
+ mova m2, [rsp+273*mmsize] ; t16a
+ mova m5, [rsp+274*mmsize] ; t17
+ mova [rsp+273*mmsize], m6
+ mova [rsp+274*mmsize], m7
+ UNSCRATCH 6, 10, rsp+277*mmsize ; t18a
+ UNSCRATCH 7, 11, rsp+278*mmsize ; t19
+ SCRATCH 4, 10, rsp+277*mmsize
+ SCRATCH 0, 11, rsp+278*mmsize
+ UNSCRATCH 4, 12, rsp+279*mmsize ; t20
+ UNSCRATCH 0, 13, rsp+280*mmsize ; t21a
+ SCRATCH 3, 12, rsp+279*mmsize
+ SCRATCH 1, 13, rsp+280*mmsize
+ SUMSUB_BA d, 0, 6, 1 ; m0=t18, m6=t21
+ SUMSUB_BA d, 4, 7, 1 ; m4=t19a, m7=t20a
+ UNSCRATCH 3, 8, rsp+275*mmsize ; t22
+ UNSCRATCH 1, 9, rsp+276*mmsize ; t23a
+ SCRATCH 0, 8, rsp+275*mmsize
+ SCRATCH 4, 9, rsp+276*mmsize
+ SUMSUB_BA d, 3, 5, 0 ; m3=t17a, m5=t22a
+ SUMSUB_BA d, 1, 2, 0 ; m1=t16, m2=t23
+
+ ; end of stage 6
+
+ UNSCRATCH 0, 10, rsp+277*mmsize ; t24
+ UNSCRATCH 4, 11, rsp+278*mmsize ; t25a
+ SCRATCH 1, 10, rsp+277*mmsize
+ SCRATCH 3, 11, rsp+278*mmsize
+ SUMSUB_MUL 0, 2, 1, 3, 11585, 11585 ; m0=t24a, m2=t23a
+ SUMSUB_MUL 4, 5, 1, 3, 11585, 11585 ; m4=t25, m5=t22
+ UNSCRATCH 1, 12, rsp+279*mmsize ; t26
+ UNSCRATCH 3, 13, rsp+280*mmsize ; t27a
+ SCRATCH 0, 12, rsp+279*mmsize
+ SCRATCH 4, 13, rsp+280*mmsize
+ SUMSUB_MUL 3, 7, 0, 4, 11585, 11585 ; m3=t27, m7=t20
+ SUMSUB_MUL 1, 6, 0, 4, 11585, 11585 ; m1=t26a, m6=t21a
+
+ ; end of stage 7
+
+ mova m0, [rsp+269*mmsize] ; t8
+ mova m4, [rsp+270*mmsize] ; t9a
+ mova [rsp+269*mmsize], m1 ; t26a
+ mova [rsp+270*mmsize], m3 ; t27
+ mova m3, [rsp+271*mmsize] ; t10
+ SUMSUB_BA d, 2, 0, 1 ; m2=out8, m0=out23
+ SUMSUB_BA d, 5, 4, 1 ; m5=out9, m4=out22
+ SUMSUB_BA d, 6, 3, 1 ; m6=out10, m3=out21
+ mova m1, [rsp+272*mmsize] ; t11a
+ mova [rsp+271*mmsize], m0
+ SUMSUB_BA d, 7, 1, 0 ; m7=out11, m1=out20
+
+%if %1 == 1
+ TRANSPOSE4x4D 2, 5, 6, 7, 0
+ mova [ptrq+ 2*mmsize], m2
+ mova [ptrq+10*mmsize], m5
+ mova [ptrq+18*mmsize], m6
+ mova [ptrq+26*mmsize], m7
+%else ; %1 == 2
+ pxor m0, m0
+ lea dstq, [dstq+strideq*8]
+ ROUND_AND_STORE_4x4 2, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
+%endif
+ mova m2, [rsp+271*mmsize]
+%if %1 == 1
+ TRANSPOSE4x4D 1, 3, 4, 2, 0
+ mova [ptrq+ 5*mmsize], m1
+ mova [ptrq+13*mmsize], m3
+ mova [ptrq+21*mmsize], m4
+ mova [ptrq+29*mmsize], m2
+%else ; %1 == 2
+ lea dstq, [dstq+stride3q*4]
+ ROUND_AND_STORE_4x4 1, 3, 4, 2, m0, [rsp+256*mmsize], [pd_32], 6
+%endif
+
+ ; end of last stage + store for out8-11 and out20-23
+
+ UNSCRATCH 0, 9, rsp+276*mmsize ; t19a
+ UNSCRATCH 1, 8, rsp+275*mmsize ; t18
+ UNSCRATCH 2, 11, rsp+278*mmsize ; t17a
+ UNSCRATCH 3, 10, rsp+277*mmsize ; t16
+ mova m7, [rsp+261*mmsize] ; t12a
+ mova m6, [rsp+262*mmsize] ; t13
+ mova m5, [rsp+263*mmsize] ; t14a
+ SUMSUB_BA d, 0, 7, 4 ; m0=out12, m7=out19
+ SUMSUB_BA d, 1, 6, 4 ; m1=out13, m6=out18
+ SUMSUB_BA d, 2, 5, 4 ; m2=out14, m5=out17
+ mova m4, [rsp+264*mmsize] ; t15
+ SCRATCH 7, 8, rsp+275*mmsize
+ SUMSUB_BA d, 3, 4, 7 ; m3=out15, m4=out16
+
+%if %1 == 1
+ TRANSPOSE4x4D 0, 1, 2, 3, 7
+ mova [ptrq+ 3*mmsize], m0
+ mova [ptrq+11*mmsize], m1
+ mova [ptrq+19*mmsize], m2
+ mova [ptrq+27*mmsize], m3
+%else ; %1 == 2
+%if ARCH_X86_64
+ SWAP 7, 9
+ lea dstq, [dstbakq+stride3q*4]
+%else ; x86-32
+ pxor m7, m7
+ mov dstq, dstm
+ lea dstq, [dstq+stride3q*4]
+%endif
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
+%endif
+ UNSCRATCH 0, 8, rsp+275*mmsize ; out19
+%if %1 == 1
+ TRANSPOSE4x4D 4, 5, 6, 0, 7
+ mova [ptrq+ 4*mmsize], m4
+ mova [ptrq+12*mmsize], m5
+ mova [ptrq+20*mmsize], m6
+ mova [ptrq+28*mmsize], m0
+%else ; %1 == 2
+ lea dstq, [dstq+strideq*4]
+ ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6
+%endif
+
+ ; end of last stage + store for out12-19
+
+%if ARCH_X86_64
+ SWAP 7, 8
+%endif
+ mova m7, [rsp+257*mmsize] ; t0
+ mova m6, [rsp+258*mmsize] ; t1
+ mova m5, [rsp+259*mmsize] ; t2
+ mova m4, [rsp+260*mmsize] ; t3
+ mova m0, [rsp+274*mmsize] ; t31
+ mova m1, [rsp+273*mmsize] ; t30a
+ UNSCRATCH 2, 15, rsp+282*mmsize ; t29
+ SUMSUB_BA d, 0, 7, 3 ; m0=out0, m7=out31
+ SUMSUB_BA d, 1, 6, 3 ; m1=out1, m6=out30
+ SUMSUB_BA d, 2, 5, 3 ; m2=out2, m5=out29
+ SCRATCH 0, 9, rsp+276*mmsize
+ UNSCRATCH 3, 14, rsp+281*mmsize ; t28a
+ SUMSUB_BA d, 3, 4, 0 ; m3=out3, m4=out28
+
+%if %1 == 1
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 7*mmsize], m4
+ mova [ptrq+15*mmsize], m5
+ mova [ptrq+23*mmsize], m6
+ mova [ptrq+31*mmsize], m7
+%else ; %1 == 2
+%if ARCH_X86_64
+ SWAP 0, 8
+%else ; x86-32
+ pxor m0, m0
+%endif
+ lea dstq, [dstq+stride3q*4]
+ ROUND_AND_STORE_4x4 4, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
+%endif
+ UNSCRATCH 7, 9, rsp+276*mmsize ; out0
+%if %1 == 1
+ TRANSPOSE4x4D 7, 1, 2, 3, 0
+ mova [ptrq+ 0*mmsize], m7
+ mova [ptrq+ 8*mmsize], m1
+ mova [ptrq+16*mmsize], m2
+ mova [ptrq+24*mmsize], m3
+%else ; %1 == 2
+%if ARCH_X86_64
+ DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
+%else ; x86-32
+ mov dstq, dstm
+%endif
+ ROUND_AND_STORE_4x4 7, 1, 2, 3, m0, [rsp+256*mmsize], [pd_32], 6
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+%endif
+%endif
+
+ ; end of last stage + store for out0-3 and out28-31
+
+%if ARCH_X86_64
+ SWAP 0, 8
+%endif
+ mova m7, [rsp+265*mmsize] ; t4
+ mova m6, [rsp+266*mmsize] ; t5a
+ mova m5, [rsp+267*mmsize] ; t6a
+ mova m4, [rsp+268*mmsize] ; t7
+ mova m0, [rsp+270*mmsize] ; t27
+ mova m1, [rsp+269*mmsize] ; t26a
+ UNSCRATCH 2, 13, rsp+280*mmsize ; t25
+ SUMSUB_BA d, 0, 7, 3 ; m0=out4, m7=out27
+ SUMSUB_BA d, 1, 6, 3 ; m1=out5, m6=out26
+ SUMSUB_BA d, 2, 5, 3 ; m2=out6, m5=out25
+ UNSCRATCH 3, 12, rsp+279*mmsize ; t24a
+ SCRATCH 7, 9, rsp+276*mmsize
+ SUMSUB_BA d, 3, 4, 7 ; m3=out7, m4=out24
+
+%if %1 == 1
+ TRANSPOSE4x4D 0, 1, 2, 3, 7
+ mova [ptrq+ 1*mmsize], m0
+ mova [ptrq+ 9*mmsize], m1
+ mova [ptrq+17*mmsize], m2
+ mova [ptrq+25*mmsize], m3
+%else ; %1 == 2
+%if ARCH_X86_64
+ SWAP 7, 8
+ lea dstq, [dstbakq+strideq*4]
+%else ; x86-32
+ pxor m7, m7
+ lea dstq, [dstq+strideq*4]
+%endif
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
+%endif
+ UNSCRATCH 0, 9, rsp+276*mmsize ; out27
+%if %1 == 1
+ TRANSPOSE4x4D 4, 5, 6, 0, 7
+ mova [ptrq+ 6*mmsize], m4
+ mova [ptrq+14*mmsize], m5
+ mova [ptrq+22*mmsize], m6
+ mova [ptrq+30*mmsize], m0
+%else ; %1 == 2
+%if ARCH_X86_64
+ lea dstq, [dstbakq+stride3q*8]
+%else
+ mov dstq, dstm
+ lea dstq, [dstq+stride3q*8]
+%endif
+ ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6
+%endif
+
+ ; end of last stage + store for out4-7 and out24-27
+%endmacro
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_32x32_add_10, 4, 6 + ARCH_X86_64, 16, \
+ 275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_1023]
+ cmp eobd, 1
+ jg .idctfull
+
+ ; dc-only - the 10bit version can be done entirely in 32bit, since the max
+ ; coef value is 17 bits + sign and the multiplier constant is 14 bits, so
+ ; 31+sign easily fits in 32bit
+ DEFINE_ARGS dst, stride, block, coef
+ pxor m2, m2
+ DC_ONLY 6, m2
+ movd m1, coefd
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 32
+.loop_dc:
+ STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
+ STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize
+ add dstq, strideq
+ dec cntd
+ jg .loop_dc
+ RET
+
+.idctfull:
+ mova [rsp+256*mmsize], m0
+ DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+ mov dstbakq, dstq
+ movsxd cntq, cntd
+%endif
+%ifdef PIC
+ lea ptrq, [default_32x32]
+ movzx cntd, byte [ptrq+cntq-1]
+%else
+ movzx cntd, byte [default_32x32+cntq-1]
+%endif
+ mov skipd, 8
+ sub skipd, cntd
+ mov ptrq, rsp
+.loop_1:
+ IDCT32_1D 1, blockq
+
+ add ptrq, 32 * mmsize
+ add blockq, mmsize
+ dec cntd
+ jg .loop_1
+
+ ; zero-pad the remainder (skipped cols)
+ test skipd, skipd
+ jz .end
+ shl skipd, 2
+ lea blockq, [blockq+skipq*(mmsize/4)]
+ pxor m0, m0
+.loop_z:
+ mova [ptrq+mmsize*0], m0
+ mova [ptrq+mmsize*1], m0
+ mova [ptrq+mmsize*2], m0
+ mova [ptrq+mmsize*3], m0
+ mova [ptrq+mmsize*4], m0
+ mova [ptrq+mmsize*5], m0
+ mova [ptrq+mmsize*6], m0
+ mova [ptrq+mmsize*7], m0
+ add ptrq, 8 * mmsize
+ dec skipd
+ jg .loop_z
+.end:
+
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea stride3q, [strideq*3]
+ mov cntd, 8
+ mov ptrq, rsp
+.loop_2:
+ IDCT32_1D 2, ptrq
+
+ add ptrq, mmsize
+%if ARCH_X86_64
+ add dstbakq, 8
+ mov dstq, dstbakq
+%else
+ add dword dstm, 8
+ mov dstq, dstm
+%endif
+ dec cntd
+ jg .loop_2
+
+ ; m7 is still zero
+ ZERO_BLOCK blockq-8*mmsize, 128, 32, m7
+ RET
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_32x32_add_12, 4, 6 + ARCH_X86_64, 16, \
+ 275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_4095]
+ cmp eobd, 1
+ jg mangle(private_prefix %+ _ %+ vp9_idct_idct_32x32_add_10 %+ SUFFIX).idctfull
+
+ ; dc-only - unfortunately, this one can overflow, since 12 bpp coefs are
+ ; 19 bits + sign, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies
+ DEFINE_ARGS dst, stride, block, coef, coefl
+ pxor m2, m2
+ DC_ONLY_64BIT 6, m2
+ movd m1, coefd
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 32
+.loop_dc:
+ STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
+ STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize
+ add dstq, strideq
+ dec cntd
+ jg .loop_dc
+ RET
diff --git a/libavcodec/x86/vp9itxfm_template.asm b/libavcodec/x86/vp9itxfm_template.asm
new file mode 100644
index 0000000000..d2f2257d84
--- /dev/null
+++ b/libavcodec/x86/vp9itxfm_template.asm
@@ -0,0 +1,142 @@
+;******************************************************************************
+;* VP9 IDCT SIMD optimizations
+;*
+;* Copyright (C) 2013 Clément Bœsch <u pkh me>
+;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%macro VP9_IWHT4_1D 0
+ SWAP 1, 2, 3
+ paddw m0, m2
+ psubw m3, m1
+ psubw m4, m0, m3
+ psraw m4, 1
+ psubw m5, m4, m1
+ SWAP 5, 1
+ psubw m4, m2
+ SWAP 4, 2
+ psubw m0, m1
+ paddw m3, m2
+ SWAP 3, 2, 1
+%endmacro
+
+; (a*x + b*y + round) >> shift
+%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2
+ pmaddwd m%1, m%2, %4
+ pmaddwd m%2, %5
+ paddd m%1, %3
+ paddd m%2, %3
+ psrad m%1, 14
+ psrad m%2, 14
+%endmacro
+
+%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2
+ VP9_MULSUB_2W_2X %7, %6, %5, [pw_m%3_%4], [pw_%4_%3]
+ VP9_MULSUB_2W_2X %1, %2, %5, [pw_m%3_%4], [pw_%4_%3]
+ packssdw m%1, m%7
+ packssdw m%2, m%6
+%endmacro
+
+%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
+%if %0 == 7
+ punpckhwd m%6, m%2, m%1
+ punpcklwd m%2, m%1
+ VP9_MULSUB_2W_4X %1, %2, %3, %4, %5, %6, %7
+%else
+ punpckhwd m%8, m%4, m%3
+ punpcklwd m%2, m%4, m%3
+ VP9_MULSUB_2W_4X %1, %2, %5, %6, %7, %8, %9
+%endif
+%endmacro
+
+%macro VP9_IDCT4_1D_FINALIZE 0
+ SUMSUB_BA w, 3, 2, 4 ; m3=t3+t0, m2=-t3+t0
+ SUMSUB_BA w, 1, 0, 4 ; m1=t2+t1, m0=-t2+t1
+ SWAP 0, 3, 2 ; 3102 -> 0123
+%endmacro
+
+%macro VP9_IDCT4_1D 0
+%if cpuflag(ssse3)
+ SUMSUB_BA w, 2, 0, 4 ; m2=IN(0)+IN(2) m0=IN(0)-IN(2)
+ pmulhrsw m2, m6 ; m2=t0
+ pmulhrsw m0, m6 ; m0=t1
+%else ; <= sse2
+ VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5 ; m0=t1, m2=t0
+%endif
+ VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5 ; m1=t2, m3=t3
+ VP9_IDCT4_1D_FINALIZE
+%endmacro
+
+%macro VP9_IADST4_1D 0
+ movq2dq xmm0, m0
+ movq2dq xmm1, m1
+ movq2dq xmm2, m2
+ movq2dq xmm3, m3
+%if cpuflag(ssse3)
+ paddw m3, m0
+%endif
+ punpcklwd xmm0, xmm1
+ punpcklwd xmm2, xmm3
+ pmaddwd xmm1, xmm0, [pw_5283_13377]
+ pmaddwd xmm4, xmm0, [pw_9929_13377]
+%if notcpuflag(ssse3)
+ pmaddwd xmm6, xmm0, [pw_13377_0]
+%endif
+ pmaddwd xmm0, [pw_15212_m13377]
+ pmaddwd xmm3, xmm2, [pw_15212_9929]
+%if notcpuflag(ssse3)
+ pmaddwd xmm7, xmm2, [pw_m13377_13377]
+%endif
+ pmaddwd xmm2, [pw_m5283_m15212]
+%if cpuflag(ssse3)
+ psubw m3, m2
+%else
+ paddd xmm6, xmm7
+%endif
+ paddd xmm0, xmm2
+ paddd xmm3, xmm5
+ paddd xmm2, xmm5
+%if notcpuflag(ssse3)
+ paddd xmm6, xmm5
+%endif
+ paddd xmm1, xmm3
+ paddd xmm0, xmm3
+ paddd xmm4, xmm2
+ psrad xmm1, 14
+ psrad xmm0, 14
+ psrad xmm4, 14
+%if cpuflag(ssse3)
+ pmulhrsw m3, [pw_13377x2] ; out2
+%else
+ psrad xmm6, 14
+%endif
+ packssdw xmm0, xmm0
+ packssdw xmm1, xmm1
+ packssdw xmm4, xmm4
+%if notcpuflag(ssse3)
+ packssdw xmm6, xmm6
+%endif
+ movdq2q m0, xmm0 ; out3
+ movdq2q m1, xmm1 ; out0
+ movdq2q m2, xmm4 ; out1
+%if notcpuflag(ssse3)
+ movdq2q m3, xmm6 ; out2
+%endif
+ SWAP 0, 1, 2, 3
+%endmacro
diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
index 54f20fe090..4e7ede2235 100644
--- a/libavcodec/x86/vp9lpf.asm
+++ b/libavcodec/x86/vp9lpf.asm
@@ -4,24 +4,23 @@
;* Copyright (C) 2013-2014 Clément Bœsch <u pkh me>
;* Copyright (C) 2014 Ronald S. Bultje <rsbultje@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
-
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
@@ -37,8 +36,8 @@ pb_f8: times 16 db 0xf8
pb_fe: times 16 db 0xfe
pb_ff: times 16 db 0xff
-pw_4: times 8 dw 4
-pw_8: times 8 dw 8
+cextern pw_4
+cextern pw_8
; with mix functions, two 8-bit thresholds are stored in a 16-bit storage,
; the following mask is used to splat both in the same register
@@ -53,7 +52,7 @@ mask_mix48: times 8 db 0x00
SECTION .text
%macro SCRATCH 3
-%if ARCH_X86_64
+%ifdef m8
SWAP %1, %2
%else
mova [%3], m%1
@@ -61,7 +60,7 @@ SECTION .text
%endmacro
%macro UNSCRATCH 3
-%if ARCH_X86_64
+%ifdef m8
SWAP %1, %2
%else
mova m%1, [%3]
@@ -70,7 +69,7 @@ SECTION .text
; %1 = abs(%2-%3)
%macro ABSSUB 4 ; dst, src1 (RO), src2 (RO), tmp
-%if ARCH_X86_64
+%ifdef m8
psubusb %1, %3, %2
psubusb %4, %2, %3
%else
@@ -103,7 +102,7 @@ SECTION .text
%endmacro
%macro UNPACK 4
-%if ARCH_X86_64
+%ifdef m8
punpck%1bw %2, %3, %4
%else
mova %2, %3
@@ -113,27 +112,27 @@ SECTION .text
%macro FILTER_SUBx2_ADDx2 11 ; %1=dst %2=h/l %3=cache %4=stack_off %5=sub1 %6=sub2 %7=add1
; %8=add2 %9=rshift, [unpack], [unpack_is_mem_on_x86_32]
- psubw %3, [rsp+%4+%5*32]
- psubw %3, [rsp+%4+%6*32]
- paddw %3, [rsp+%4+%7*32]
+ psubw %3, [rsp+%4+%5*mmsize*2]
+ psubw %3, [rsp+%4+%6*mmsize*2]
+ paddw %3, [rsp+%4+%7*mmsize*2]
%ifnidn %10, ""
%if %11 == 0
punpck%2bw %1, %10, m0
%else
UNPACK %2, %1, %10, m0
%endif
- mova [rsp+%4+%8*32], %1
+ mova [rsp+%4+%8*mmsize*2], %1
paddw %3, %1
%else
- paddw %3, [rsp+%4+%8*32]
+ paddw %3, [rsp+%4+%8*mmsize*2]
%endif
psraw %1, %3, %9
%endmacro
; FIXME interleave l/h better (for instruction pairing)
%macro FILTER_INIT 9 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, filterid, mask, source
- FILTER%7_INIT %1, l, %3, %6 + 0
- FILTER%7_INIT %2, h, %4, %6 + 16
+ FILTER%7_INIT %1, l, %3, %6 + 0
+ FILTER%7_INIT %2, h, %4, %6 + mmsize
packuswb %1, %2
MASK_APPLY %1, %9, %8, %2
mova %5, %1
@@ -148,8 +147,8 @@ SECTION .text
mova %14, %15
%endif
%endif
- FILTER_SUBx2_ADDx2 %1, l, %3, %6 + 0, %7, %8, %9, %10, %11, %14, %16
- FILTER_SUBx2_ADDx2 %2, h, %4, %6 + 16, %7, %8, %9, %10, %11, %14, %16
+ FILTER_SUBx2_ADDx2 %1, l, %3, %6 + 0, %7, %8, %9, %10, %11, %14, %16
+ FILTER_SUBx2_ADDx2 %2, h, %4, %6 + mmsize, %7, %8, %9, %10, %11, %14, %16
packuswb %1, %2
%ifnidn %13, ""
MASK_APPLY %1, %13, %12, %2
@@ -196,21 +195,21 @@ SECTION .text
%macro FILTER6_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off
UNPACK %2, %1, rp3, m0 ; p3: B->W
- mova [rsp+%4+0*32], %1
+ mova [rsp+%4+0*mmsize*2], %1
paddw %3, %1, %1 ; p3*2
paddw %3, %1 ; p3*3
punpck%2bw %1, m1, m0 ; p2: B->W
- mova [rsp+%4+1*32], %1
+ mova [rsp+%4+1*mmsize*2], %1
paddw %3, %1 ; p3*3 + p2
paddw %3, %1 ; p3*3 + p2*2
UNPACK %2, %1, rp1, m0 ; p1: B->W
- mova [rsp+%4+2*32], %1
+ mova [rsp+%4+2*mmsize*2], %1
paddw %3, %1 ; p3*3 + p2*2 + p1
UNPACK %2, %1, rp0, m0 ; p0: B->W
- mova [rsp+%4+3*32], %1
+ mova [rsp+%4+3*mmsize*2], %1
paddw %3, %1 ; p3*3 + p2*2 + p1 + p0
UNPACK %2, %1, rq0, m0 ; q0: B->W
- mova [rsp+%4+4*32], %1
+ mova [rsp+%4+4*mmsize*2], %1
paddw %3, %1 ; p3*3 + p2*2 + p1 + p0 + q0
paddw %3, [pw_4] ; p3*3 + p2*2 + p1 + p0 + q0 + 4
psraw %1, %3, 3 ; (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3
@@ -218,24 +217,24 @@ SECTION .text
%macro FILTER14_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off
punpck%2bw %1, m2, m0 ; p7: B->W
- mova [rsp+%4+ 8*32], %1
+ mova [rsp+%4+ 8*mmsize*2], %1
psllw %3, %1, 3 ; p7*8
psubw %3, %1 ; p7*7
punpck%2bw %1, m3, m0 ; p6: B->W
- mova [rsp+%4+ 9*32], %1
+ mova [rsp+%4+ 9*mmsize*2], %1
paddw %3, %1 ; p7*7 + p6
paddw %3, %1 ; p7*7 + p6*2
UNPACK %2, %1, rp5, m0 ; p5: B->W
- mova [rsp+%4+10*32], %1
+ mova [rsp+%4+10*mmsize*2], %1
paddw %3, %1 ; p7*7 + p6*2 + p5
UNPACK %2, %1, rp4, m0 ; p4: B->W
- mova [rsp+%4+11*32], %1
+ mova [rsp+%4+11*mmsize*2], %1
paddw %3, %1 ; p7*7 + p6*2 + p5 + p4
- paddw %3, [rsp+%4+ 0*32] ; p7*7 + p6*2 + p5 + p4 + p3
- paddw %3, [rsp+%4+ 1*32] ; p7*7 + p6*2 + p5 + .. + p2
- paddw %3, [rsp+%4+ 2*32] ; p7*7 + p6*2 + p5 + .. + p1
- paddw %3, [rsp+%4+ 3*32] ; p7*7 + p6*2 + p5 + .. + p0
- paddw %3, [rsp+%4+ 4*32] ; p7*7 + p6*2 + p5 + .. + p0 + q0
+ paddw %3, [rsp+%4+ 0*mmsize*2] ; p7*7 + p6*2 + p5 + p4 + p3
+ paddw %3, [rsp+%4+ 1*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p2
+ paddw %3, [rsp+%4+ 2*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p1
+ paddw %3, [rsp+%4+ 3*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p0
+ paddw %3, [rsp+%4+ 4*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p0 + q0
paddw %3, [pw_8] ; p7*7 + p6*2 + p5 + .. + p0 + q0 + 8
psraw %1, %3, 4 ; (p7*7 + p6*2 + p5 + .. + p0 + q0 + 8) >> 4
%endmacro
@@ -335,22 +334,24 @@ SECTION .text
%endmacro
%macro DEFINE_TRANSPOSED_P7_TO_Q7 0-1 0
-%define P3 rsp + 0 + %1
-%define P2 rsp + 16 + %1
-%define P1 rsp + 32 + %1
-%define P0 rsp + 48 + %1
-%define Q0 rsp + 64 + %1
-%define Q1 rsp + 80 + %1
-%define Q2 rsp + 96 + %1
-%define Q3 rsp + 112 + %1
-%define P7 rsp + 128 + %1
-%define P6 rsp + 144 + %1
-%define P5 rsp + 160 + %1
-%define P4 rsp + 176 + %1
-%define Q4 rsp + 192 + %1
-%define Q5 rsp + 208 + %1
-%define Q6 rsp + 224 + %1
-%define Q7 rsp + 240 + %1
+%define P3 rsp + 0*mmsize + %1
+%define P2 rsp + 1*mmsize + %1
+%define P1 rsp + 2*mmsize + %1
+%define P0 rsp + 3*mmsize + %1
+%define Q0 rsp + 4*mmsize + %1
+%define Q1 rsp + 5*mmsize + %1
+%define Q2 rsp + 6*mmsize + %1
+%define Q3 rsp + 7*mmsize + %1
+%if mmsize == 16
+%define P7 rsp + 8*mmsize + %1
+%define P6 rsp + 9*mmsize + %1
+%define P5 rsp + 10*mmsize + %1
+%define P4 rsp + 11*mmsize + %1
+%define Q4 rsp + 12*mmsize + %1
+%define Q5 rsp + 13*mmsize + %1
+%define Q6 rsp + 14*mmsize + %1
+%define Q7 rsp + 15*mmsize + %1
+%endif
%endmacro
; ..............AB -> AAAAAAAABBBBBBBB
@@ -364,14 +365,19 @@ SECTION .text
%endif
%endmacro
-%macro LOOPFILTER 5 ; %1=v/h %2=size1 %3+%4=stack, %5=32bit stack only
+%macro LOOPFILTER 5 ; %1=v/h %2=size1 %3+%4=stack, %5=mmx/32bit stack only
+%assign %%ext 0
+%if ARCH_X86_32 || mmsize == 8
+%assign %%ext %5
+%endif
+
%if UNIX64
-cglobal vp9_loop_filter_%1_%2_16, 5, 9, 16, %3 + %4, dst, stride, E, I, H, mstride, dst2, stride3, mstride3
+cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 5, 9, 16, %3 + %4 + %%ext, dst, stride, E, I, H, mstride, dst2, stride3, mstride3
%else
%if WIN64
-cglobal vp9_loop_filter_%1_%2_16, 4, 8, 16, %3 + %4, dst, stride, E, I, mstride, dst2, stride3, mstride3
+cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 4, 8, 16, %3 + %4 + %%ext, dst, stride, E, I, mstride, dst2, stride3, mstride3
%else
-cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride, dst2, stride3, mstride3
+cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 2, 6, 16, %3 + %4 + %%ext, dst, stride, mstride, dst2, stride3, mstride3
%define Ed dword r2m
%define Id dword r3m
%endif
@@ -385,18 +391,22 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
lea mstride3q, [mstrideq*3]
%ifidn %1, h
-%if %2 > 16
+%if %2 != 16
+%if mmsize == 16
%define movx movh
+%else
+%define movx mova
+%endif
lea dstq, [dstq + 4*strideq - 4]
%else
%define movx movu
lea dstq, [dstq + 4*strideq - 8] ; go from top center (h pos) to center left (v pos)
%endif
- lea dst2q, [dstq + 8*strideq]
%else
lea dstq, [dstq + 4*mstrideq]
- lea dst2q, [dstq + 8*strideq]
%endif
+ ; FIXME we shouldn't need two dst registers if mmsize == 8
+ lea dst2q, [dstq + 8*strideq]
DEFINE_REAL_P7_TO_Q7
@@ -407,11 +417,11 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
movx m3, [P4]
movx m4, [P3]
movx m5, [P2]
-%if ARCH_X86_64 || %2 != 16
+%if (ARCH_X86_64 && mmsize == 16) || %2 > 16
movx m6, [P1]
%endif
movx m7, [P0]
-%if ARCH_X86_64
+%ifdef m8
movx m8, [Q0]
movx m9, [Q1]
movx m10, [Q2]
@@ -503,7 +513,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
movhps [Q5], m6
movhps [Q7], m7
DEFINE_TRANSPOSED_P7_TO_Q7
-%else ; %2 == 44/48/84/88
+%elif %2 > 16 ; %2 == 44/48/84/88
punpcklbw m0, m1
punpcklbw m2, m3
punpcklbw m4, m5
@@ -530,12 +540,31 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
mova [Q1], m5
mova [Q2], m7
mova [Q3], m3
+%else ; %2 == 4 || %2 == 8
+ SBUTTERFLY bw, 0, 1, 6
+ SBUTTERFLY bw, 2, 3, 6
+ SBUTTERFLY bw, 4, 5, 6
+ mova [rsp+4*mmsize], m5
+ mova m6, [P1]
+ SBUTTERFLY bw, 6, 7, 5
+ DEFINE_TRANSPOSED_P7_TO_Q7
+ TRANSPOSE4x4W 0, 2, 4, 6, 5
+ mova [P3], m0
+ mova [P2], m2
+ mova [P1], m4
+ mova [P0], m6
+ mova m5, [rsp+4*mmsize]
+ TRANSPOSE4x4W 1, 3, 5, 7, 0
+ mova [Q0], m1
+ mova [Q1], m3
+ mova [Q2], m5
+ mova [Q3], m7
%endif ; %2
%endif ; x86-32/64
%endif ; %1 == h
; calc fm mask
-%if %2 == 16
+%if %2 == 16 || mmsize == 8
%if cpuflag(ssse3)
pxor m0, m0
%endif
@@ -553,7 +582,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
mova m0, [pb_80]
pxor m2, m0
pxor m3, m0
-%if ARCH_X86_64
+%ifdef m8
%ifidn %1, v
mova m8, [P3]
mova m9, [P2]
@@ -614,10 +643,10 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
; (m3: fm, m8..15: p3 p2 p1 p0 q0 q1 q2 q3)
; calc flat8in (if not 44_16) and hev masks
-%if %2 != 44
+%if %2 != 44 && %2 != 4
mova m6, [pb_81] ; [1 1 1 1 ...] ^ 0x80
ABSSUB_GT m2, rp3, rp0, m6, m5 ; abs(p3 - p0) <= 1
-%if ARCH_X86_64
+%ifdef m8
mova m8, [pb_80]
%define rb80 m8
%else
@@ -626,7 +655,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
ABSSUB_GT m1, rp2, rp0, m6, m5, rb80 ; abs(p2 - p0) <= 1
por m2, m1
ABSSUB m4, rp1, rp0, m5 ; abs(p1 - p0)
-%if %2 == 16
+%if %2 <= 16
%if cpuflag(ssse3)
pxor m0, m0
%endif
@@ -656,8 +685,15 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
%endif
%else
mova m6, [pb_80]
+%if %2 == 44
movd m7, Hd
SPLATB_MIX m7
+%else
+%if cpuflag(ssse3)
+ pxor m0, m0
+%endif
+ SPLATB_REG m7, H, m0 ; H H H H ...
+%endif
pxor m7, m6
ABSSUB m4, rp1, rp0, m1 ; abs(p1 - p0)
pxor m4, m6
@@ -671,7 +707,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
%if %2 == 16
; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3)
; calc flat8out mask
-%if ARCH_X86_64
+%ifdef m8
mova m8, [P7]
mova m9, [P6]
%define rp7 m8
@@ -683,7 +719,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
ABSSUB_GT m1, rp7, rp0, m6, m5 ; abs(p7 - p0) <= 1
ABSSUB_GT m7, rp6, rp0, m6, m5 ; abs(p6 - p0) <= 1
por m1, m7
-%if ARCH_X86_64
+%ifdef m8
mova m8, [P5]
mova m9, [P4]
%define rp5 m8
@@ -696,7 +732,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
por m1, m7
ABSSUB_GT m7, rp4, rp0, m6, m5 ; abs(p4 - p0) <= 1
por m1, m7
-%if ARCH_X86_64
+%ifdef m8
mova m14, [Q4]
mova m15, [Q5]
%define rq4 m14
@@ -709,7 +745,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
por m1, m7
ABSSUB_GT m7, rq5, rq0, m6, m5 ; abs(q5 - q0) <= 1
por m1, m7
-%if ARCH_X86_64
+%ifdef m8
mova m14, [Q6]
mova m15, [Q7]
%define rq6 m14
@@ -739,7 +775,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
; (m0: hev, [m1: flat8out], [m2: flat8in], m3: fm, m8..15: p5 p4 p1 p0 q0 q1 q6 q7)
; filter2()
-%if %2 != 44
+%if %2 != 44 && %2 != 4
mova m6, [pb_80] ; already in m6 if 44_16
SCRATCH 2, 15, rsp+%3+%4
%if %2 == 16
@@ -757,7 +793,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
paddsb m4, m2 ; 3*(q0 - p0) + (p1 - q1)
paddsb m6, m4, [pb_4] ; m6: f1 = clip(f + 4, 127)
paddsb m4, [pb_3] ; m4: f2 = clip(f + 3, 127)
-%if ARCH_X86_64
+%ifdef m8
mova m14, [pb_10] ; will be reused in filter4()
%define rb10 m14
%else
@@ -766,8 +802,8 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
SRSHIFT3B_2X m6, m4, rb10, m7 ; f1 and f2 sign byte shift by 3
SIGN_SUB m7, rq0, m6, m5 ; m7 = q0 - f1
SIGN_ADD m1, rp0, m4, m5 ; m1 = p0 + f2
-%if %2 != 44
-%if ARCH_X86_64
+%if %2 != 44 && %2 != 4
+%ifdef m8
pandn m6, m15, m3 ; ~mask(in) & mask(fm)
%else
mova m6, [rsp+%3+%4]
@@ -788,8 +824,8 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
paddsb m6, m2, [pb_4] ; m6: f1 = clip(f + 4, 127)
paddsb m2, [pb_3] ; m2: f2 = clip(f + 3, 127)
SRSHIFT3B_2X m6, m2, rb10, m4 ; f1 and f2 sign byte shift by 3
-%if %2 != 44
-%if ARCH_X86_64
+%if %2 != 44 && %2 != 4
+%ifdef m8
pandn m5, m15, m3 ; ~mask(in) & mask(fm)
%else
mova m5, [rsp+%3+%4]
@@ -816,26 +852,26 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
mova [P1], m1
mova [Q1], m4
-%if %2 != 44
+%if %2 != 44 && %2 != 4
UNSCRATCH 2, 15, rsp+%3+%4
%endif
; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1)
; filter6()
-%if %2 != 44
+%if %2 != 44 && %2 != 4
pxor m0, m0
-%if %2 > 16
+%if %2 != 16
pand m3, m2
%else
pand m2, m3 ; mask(fm) & mask(in)
-%if ARCH_X86_64
+%ifdef m8
pandn m3, m8, m2 ; ~mask(out) & (mask(fm) & mask(in))
%else
mova m3, [rsp+%3+%4+16]
pandn m3, m2
%endif
%endif
-%if ARCH_X86_64
+%ifdef m8
mova m14, [P3]
mova m9, [Q3]
%define rp3 m14
@@ -883,7 +919,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
pand m1, m2 ; mask(out) & (mask(fm) & mask(in))
mova m2, [P7]
mova m3, [P6]
-%if ARCH_X86_64
+%ifdef m8
mova m8, [P5]
mova m9, [P4]
%define rp5 m8
@@ -1009,7 +1045,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
movhps [Q5], m6
movhps [Q7], m7
%endif
-%elif %2 == 44
+%elif %2 == 44 || %2 == 4
SWAP 0, 1 ; m0 = p1
SWAP 1, 7 ; m1 = p0
SWAP 2, 5 ; m2 = q0
@@ -1019,6 +1055,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
SBUTTERFLY bw, 2, 3, 4
SBUTTERFLY wd, 0, 2, 4
SBUTTERFLY wd, 1, 3, 4
+%if mmsize == 16
movd [P7], m0
movd [P3], m2
movd [Q0], m1
@@ -1048,6 +1085,20 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
movd [Q3], m1
movd [Q7], m3
%else
+ movd [P7], m0
+ movd [P5], m2
+ movd [P3], m1
+ movd [P1], m3
+ psrlq m0, 32
+ psrlq m2, 32
+ psrlq m1, 32
+ psrlq m3, 32
+ movd [P6], m0
+ movd [P4], m2
+ movd [P2], m1
+ movd [P0], m3
+%endif
+%else
; the following code does a transpose of 8 full lines to 16 half
; lines (high part). It is inlined to avoid the need for a staging area
mova m0, [P3]
@@ -1056,12 +1107,12 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
mova m3, [P0]
mova m4, [Q0]
mova m5, [Q1]
-%if ARCH_X86_64
+%ifdef m8
mova m6, [Q2]
%endif
mova m7, [Q3]
DEFINE_REAL_P7_TO_Q7
-%if ARCH_X86_64
+%ifdef m8
SBUTTERFLY bw, 0, 1, 8
SBUTTERFLY bw, 2, 3, 8
SBUTTERFLY bw, 4, 5, 8
@@ -1076,27 +1127,32 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
SBUTTERFLY dq, 3, 7, 8
%else
SBUTTERFLY bw, 0, 1, 6
- mova [rsp+64], m1
- mova m6, [rsp+96]
+ mova [rsp+mmsize*4], m1
+ mova m6, [rsp+mmsize*6]
SBUTTERFLY bw, 2, 3, 1
SBUTTERFLY bw, 4, 5, 1
SBUTTERFLY bw, 6, 7, 1
SBUTTERFLY wd, 0, 2, 1
- mova [rsp+96], m2
- mova m1, [rsp+64]
+ mova [rsp+mmsize*6], m2
+ mova m1, [rsp+mmsize*4]
SBUTTERFLY wd, 1, 3, 2
SBUTTERFLY wd, 4, 6, 2
SBUTTERFLY wd, 5, 7, 2
SBUTTERFLY dq, 0, 4, 2
SBUTTERFLY dq, 1, 5, 2
+%if mmsize == 16
movh [Q0], m1
movhps [Q1], m1
- mova m2, [rsp+96]
+%else
+ mova [P3], m1
+%endif
+ mova m2, [rsp+mmsize*6]
SBUTTERFLY dq, 2, 6, 1
SBUTTERFLY dq, 3, 7, 1
%endif
SWAP 3, 6
SWAP 1, 4
+%if mmsize == 16
movh [P7], m0
movhps [P6], m0
movh [P5], m1
@@ -1105,7 +1161,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
movhps [P2], m2
movh [P1], m3
movhps [P0], m3
-%if ARCH_X86_64
+%ifdef m8
movh [Q0], m4
movhps [Q1], m4
%endif
@@ -1115,6 +1171,15 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
movhps [Q5], m6
movh [Q6], m7
movhps [Q7], m7
+%else
+ mova [P7], m0
+ mova [P6], m1
+ mova [P5], m2
+ mova [P4], m3
+ mova [P2], m5
+ mova [P1], m6
+ mova [P0], m7
+%endif
%endif
%endif
@@ -1138,3 +1203,9 @@ LPF_16_VH_ALL_OPTS 44, 0, 128, 0
LPF_16_VH_ALL_OPTS 48, 256, 128, 16
LPF_16_VH_ALL_OPTS 84, 256, 128, 16
LPF_16_VH_ALL_OPTS 88, 256, 128, 16
+
+INIT_MMX mmxext
+LOOPFILTER v, 4, 0, 0, 0
+LOOPFILTER h, 4, 0, 64, 0
+LOOPFILTER v, 8, 128, 0, 8
+LOOPFILTER h, 8, 128, 64, 8
diff --git a/libavcodec/x86/vp9lpf_16bpp.asm b/libavcodec/x86/vp9lpf_16bpp.asm
new file mode 100644
index 0000000000..c0888170c9
--- /dev/null
+++ b/libavcodec/x86/vp9lpf_16bpp.asm
@@ -0,0 +1,823 @@
+;******************************************************************************
+;* VP9 loop filter SIMD optimizations
+;*
+;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_511: times 16 dw 511
+pw_2047: times 16 dw 2047
+pw_16384: times 16 dw 16384
+pw_m512: times 16 dw -512
+pw_m2048: times 16 dw -2048
+
+cextern pw_1
+cextern pw_3
+cextern pw_4
+cextern pw_8
+cextern pw_16
+cextern pw_256
+cextern pw_1023
+cextern pw_4095
+cextern pw_m1
+
+SECTION .text
+
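+; SCRATCH/UNSCRATCH move a value between the spare registers m8-m15 (x86-64)
+; and the given stack slot (x86-32); PRELOAD loads a constant into a spare
+; register on x86-64 and leaves the reg_ name as a memory operand on x86-32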
+%macro SCRATCH 3-4
+%if ARCH_X86_64
+ SWAP %1, %2
+%if %0 == 4
+%define reg_%4 m%2
+%endif
+%else
+ mova [%3], m%1
+%if %0 == 4
+%define reg_%4 [%3]
+%endif
+%endif
+%endmacro
+
+%macro UNSCRATCH 3-4
+%if ARCH_X86_64
+ SWAP %1, %2
+%else
+ mova m%1, [%3]
+%endif
+%if %0 == 4
+%undef reg_%4
+%endif
+%endmacro
+
+%macro PRELOAD 2-3
+%if ARCH_X86_64
+ mova m%1, [%2]
+%if %0 == 3
+%define reg_%3 m%1
+%endif
+%elif %0 == 3
+%define reg_%3 [%2]
+%endif
+%endmacro
+
+; calculate p or q portion of flat8out
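+; (reg_F holds the flat threshold 1 << (bpp - 8): pw_4 at 10 bpp, pw_16 at 12 bpp)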
+%macro FLAT8OUT_HALF 0
+ psubw m4, m0 ; q4-q0
+ psubw m5, m0 ; q5-q0
+ psubw m6, m0 ; q6-q0
+ psubw m7, m0 ; q7-q0
+ ABS2 m4, m5, m2, m3 ; abs(q4-q0) | abs(q5-q0)
+ ABS2 m6, m7, m2, m3 ; abs(q6-q0) | abs(q7-q0)
+ pcmpgtw m4, reg_F ; abs(q4-q0) > F
+ pcmpgtw m5, reg_F ; abs(q5-q0) > F
+ pcmpgtw m6, reg_F ; abs(q6-q0) > F
+ pcmpgtw m7, reg_F ; abs(q7-q0) > F
+ por m5, m4
+ por m7, m6
+ por m7, m5 ; !flat8out, q portion
+%endmacro
+
+; calculate p or q portion of flat8in/hev/fm (excluding mb_edge condition)
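+; (reg_I and reg_H hold the I and H thresholds pre-shifted left by bpp - 8)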
+%macro FLAT8IN_HALF 1
+%if %1 > 4
+ psubw m4, m3, m0 ; q3-q0
+ psubw m5, m2, m0 ; q2-q0
+ ABS2 m4, m5, m6, m7 ; abs(q3-q0) | abs(q2-q0)
+ pcmpgtw m4, reg_F ; abs(q3-q0) > F
+ pcmpgtw m5, reg_F ; abs(q2-q0) > F
+%endif
+ psubw m3, m2 ; q3-q2
+ psubw m2, m1 ; q2-q1
+ ABS2 m3, m2, m6, m7 ; abs(q3-q2) | abs(q2-q1)
+ pcmpgtw m3, reg_I ; abs(q3-q2) > I
+ pcmpgtw m2, reg_I ; abs(q2-q1) > I
+%if %1 > 4
+ por m4, m5
+%endif
+ por m2, m3
+ psubw m3, m1, m0 ; q1-q0
+ ABS1 m3, m5 ; abs(q1-q0)
+%if %1 > 4
+ pcmpgtw m6, m3, reg_F ; abs(q1-q0) > F
+%endif
+ pcmpgtw m7, m3, reg_H ; abs(q1-q0) > H
+ pcmpgtw m3, reg_I ; abs(q1-q0) > I
+%if %1 > 4
+ por m4, m6
+%endif
+ por m2, m3
+%endmacro
+
+; one step in filter_14/filter_6
+;
+; take sum $reg, downshift, apply mask and write into dst
+;
+; if sub2/add1-2 are present, add/sub as appropriate to prepare for the next
+; step's sum $reg. This is omitted for the last row in each filter.
+;
+; if dont_store is set, don't write the result into memory, instead keep the
+; value in a register so we can write it out later
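+;
+; e.g. in filter_6 the running sum for the p2 output is p3*3 + p2*2 + p1 + p0
+; + q0 (plus the rounding term, cf. the 8-bit FILTER6_INIT); the p2 step then
+; subtracts p3 and p2 and adds p1 and q1, turning it into the sum for the p1 output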
+%macro FILTER_STEP 6-10 "", "", "", 0 ; tmp, reg, mask, shift, dst, \
+ ; src/sub1, sub2, add1, add2, dont_store
+ psrlw %1, %2, %4
+ psubw %1, %6 ; abs->delta
+%ifnidn %7, ""
+ psubw %2, %6
+ psubw %2, %7
+ paddw %2, %8
+ paddw %2, %9
+%endif
+ pand %1, reg_%3 ; apply mask
+%if %10 == 1
+ paddw %6, %1 ; delta->abs
+%else
+ paddw %1, %6 ; delta->abs
+ mova [%5], %1
+%endif
+%endmacro
+
+; FIXME avx2 versions for 16_16 and mix2_{4,8}{4,8}
+
+%macro LOOP_FILTER 3 ; dir[h/v], wd[4/8/16], bpp[10/12]
+
+%if ARCH_X86_64
+%if %2 == 16
+%assign %%num_xmm_regs 16
+%elif %2 == 8
+%assign %%num_xmm_regs 15
+%else ; %2 == 4
+%assign %%num_xmm_regs 14
+%endif ; %2
+%assign %%bak_mem 0
+%else ; ARCH_X86_32
+%assign %%num_xmm_regs 8
+%if %2 == 16
+%assign %%bak_mem 7
+%elif %2 == 8
+%assign %%bak_mem 6
+%else ; %2 == 4
+%assign %%bak_mem 5
+%endif ; %2
+%endif ; ARCH_X86_64/32
+
+%if %2 == 16
+%ifidn %1, v
+%assign %%num_gpr_regs 6
+%else ; %1 == h
+%assign %%num_gpr_regs 5
+%endif ; %1
+%assign %%wd_mem 6
+%else ; %2 == 8/4
+%assign %%num_gpr_regs 5
+%if ARCH_X86_32 && %2 == 8
+%assign %%wd_mem 2
+%else ; ARCH_X86_64 || %2 == 4
+%assign %%wd_mem 0
+%endif ; ARCH_X86_64/32 etc.
+%endif ; %2
+
+%ifidn %1, v
+%assign %%tsp_mem 0
+%elif %2 == 16 ; && %1 == h
+%assign %%tsp_mem 16
+%else ; %1 == h && %2 == 8/4
+%assign %%tsp_mem 8
+%endif ; %1/%2
+
+%assign %%off %%wd_mem
+%assign %%tspoff %%bak_mem+%%wd_mem
+%assign %%stack_mem ((%%bak_mem+%%wd_mem+%%tsp_mem)*mmsize)
+
+%if %3 == 10
+%define %%maxsgn 511
+%define %%minsgn m512
+%define %%maxusgn 1023
+%define %%maxf 4
+%else ; %3 == 12
+%define %%maxsgn 2047
+%define %%minsgn m2048
+%define %%maxusgn 4095
+%define %%maxf 16
+%endif ; %3
+
+cglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H
+ ; prepare E, I and H masks
+ shl Ed, %3-8
+ shl Id, %3-8
+ shl Hd, %3-8
+%if cpuflag(ssse3)
+ mova m0, [pw_256]
+%endif
+ movd m1, Ed
+ movd m2, Id
+ movd m3, Hd
+%if cpuflag(ssse3)
+ pshufb m1, m0 ; E << (bit_depth - 8)
+ pshufb m2, m0 ; I << (bit_depth - 8)
+ pshufb m3, m0 ; H << (bit_depth - 8)
+%else
+ punpcklwd m1, m1
+ punpcklwd m2, m2
+ punpcklwd m3, m3
+ pshufd m1, m1, q0000
+ pshufd m2, m2, q0000
+ pshufd m3, m3, q0000
+%endif
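+ ; e.g. at 10 bpp with E = 40 (an arbitrary example value), Ed is now
+ ; 40 << 2 = 160; pw_256's byte pattern 0,1,0,1,... makes pshufb act as a
+ ; low-word broadcast, so m1 ends up holding 160 in all eight words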
+ SCRATCH 1, 8, rsp+(%%off+0)*mmsize, E
+ SCRATCH 2, 9, rsp+(%%off+1)*mmsize, I
+ SCRATCH 3, 10, rsp+(%%off+2)*mmsize, H
+%if %2 > 4
+ PRELOAD 11, pw_ %+ %%maxf, F
+%endif
+
+ ; set up variables to load data
+%ifidn %1, v
+ DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12
+ lea stride3q, [strideq*3]
+ neg strideq
+%if %2 == 16
+ lea dst0q, [dst8q+strideq*8]
+%else
+ lea dst4q, [dst8q+strideq*4]
+%endif
+ neg strideq
+%if %2 == 16
+ lea dst12q, [dst8q+strideq*4]
+ lea dst4q, [dst0q+strideq*4]
+%endif
+
+%if %2 == 16
+%define %%p7 dst0q
+%define %%p6 dst0q+strideq
+%define %%p5 dst0q+strideq*2
+%define %%p4 dst0q+stride3q
+%endif
+%define %%p3 dst4q
+%define %%p2 dst4q+strideq
+%define %%p1 dst4q+strideq*2
+%define %%p0 dst4q+stride3q
+%define %%q0 dst8q
+%define %%q1 dst8q+strideq
+%define %%q2 dst8q+strideq*2
+%define %%q3 dst8q+stride3q
+%if %2 == 16
+%define %%q4 dst12q
+%define %%q5 dst12q+strideq
+%define %%q6 dst12q+strideq*2
+%define %%q7 dst12q+stride3q
+%endif
+%else ; %1 == h
+ DEFINE_ARGS dst0, stride, stride3, dst4
+ lea stride3q, [strideq*3]
+ lea dst4q, [dst0q+strideq*4]
+
+%define %%p3 rsp+(%%tspoff+0)*mmsize
+%define %%p2 rsp+(%%tspoff+1)*mmsize
+%define %%p1 rsp+(%%tspoff+2)*mmsize
+%define %%p0 rsp+(%%tspoff+3)*mmsize
+%define %%q0 rsp+(%%tspoff+4)*mmsize
+%define %%q1 rsp+(%%tspoff+5)*mmsize
+%define %%q2 rsp+(%%tspoff+6)*mmsize
+%define %%q3 rsp+(%%tspoff+7)*mmsize
+
+%if %2 < 16
+ movu m0, [dst0q+strideq*0-8]
+ movu m1, [dst0q+strideq*1-8]
+ movu m2, [dst0q+strideq*2-8]
+ movu m3, [dst0q+stride3q -8]
+ movu m4, [dst4q+strideq*0-8]
+ movu m5, [dst4q+strideq*1-8]
+ movu m6, [dst4q+strideq*2-8]
+ movu m7, [dst4q+stride3q -8]
+
+%if ARCH_X86_64
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12
+%else
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0]
+%endif
+
+ mova [%%p3], m0
+ mova [%%p2], m1
+ mova [%%p1], m2
+ mova [%%p0], m3
+%if ARCH_X86_64
+ mova [%%q0], m4
+%endif
+ mova [%%q1], m5
+ mova [%%q2], m6
+ mova [%%q3], m7
+
+ ; FIXME investigate if we can _not_ load q0-3 below if h, and adjust register
+ ; order here accordingly
+%else ; %2 == 16
+
+%define %%p7 rsp+(%%tspoff+ 8)*mmsize
+%define %%p6 rsp+(%%tspoff+ 9)*mmsize
+%define %%p5 rsp+(%%tspoff+10)*mmsize
+%define %%p4 rsp+(%%tspoff+11)*mmsize
+%define %%q4 rsp+(%%tspoff+12)*mmsize
+%define %%q5 rsp+(%%tspoff+13)*mmsize
+%define %%q6 rsp+(%%tspoff+14)*mmsize
+%define %%q7 rsp+(%%tspoff+15)*mmsize
+
+ mova m0, [dst0q+strideq*0-16]
+ mova m1, [dst0q+strideq*1-16]
+ mova m2, [dst0q+strideq*2-16]
+ mova m3, [dst0q+stride3q -16]
+ mova m4, [dst4q+strideq*0-16]
+ mova m5, [dst4q+strideq*1-16]
+%if ARCH_X86_64
+ mova m6, [dst4q+strideq*2-16]
+%endif
+ mova m7, [dst4q+stride3q -16]
+
+%if ARCH_X86_64
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12
+%else
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1
+%endif
+
+ mova [%%p7], m0
+ mova [%%p6], m1
+ mova [%%p5], m2
+ mova [%%p4], m3
+%if ARCH_X86_64
+ mova [%%p3], m4
+%endif
+ mova [%%p2], m5
+ mova [%%p1], m6
+ mova [%%p0], m7
+
+ mova m0, [dst0q+strideq*0]
+ mova m1, [dst0q+strideq*1]
+ mova m2, [dst0q+strideq*2]
+ mova m3, [dst0q+stride3q ]
+ mova m4, [dst4q+strideq*0]
+ mova m5, [dst4q+strideq*1]
+%if ARCH_X86_64
+ mova m6, [dst4q+strideq*2]
+%endif
+ mova m7, [dst4q+stride3q ]
+
+%if ARCH_X86_64
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12
+%else
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1
+%endif
+
+ mova [%%q0], m0
+ mova [%%q1], m1
+ mova [%%q2], m2
+ mova [%%q3], m3
+%if ARCH_X86_64
+ mova [%%q4], m4
+%endif
+ mova [%%q5], m5
+ mova [%%q6], m6
+ mova [%%q7], m7
+
+ ; FIXME investigate if we can _not_ load q0|q4-7 below if h, and adjust register
+ ; order here accordingly
+%endif ; %2
+%endif ; %1
+
+ ; load q0|q4-7 data
+ mova m0, [%%q0]
+%if %2 == 16
+ mova m4, [%%q4]
+ mova m5, [%%q5]
+ mova m6, [%%q6]
+ mova m7, [%%q7]
+
+ ; flat8out q portion
+ FLAT8OUT_HALF
+ SCRATCH 7, 15, rsp+(%%off+6)*mmsize, F8O
+%endif
+
+ ; load q1-3 data
+ mova m1, [%%q1]
+ mova m2, [%%q2]
+ mova m3, [%%q3]
+
+ ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+ ; r9[m15]=!flatout[q]
+ ; m12-14=free
+ ; m0-3=q0-q3
+ ; m4-7=free
+
+ ; flat8in|fm|hev q portion
+ FLAT8IN_HALF %2
+ SCRATCH 7, 13, rsp+(%%off+4)*mmsize, HEV
+%if %2 > 4
+ SCRATCH 4, 14, rsp+(%%off+5)*mmsize, F8I
+%endif
+
+ ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+ ; r9[m15]=!flat8out[q]
+ ; r10[m13]=hev[q]
+ ; r11[m14]=!flat8in[q]
+ ; m2=!fm[q]
+ ; m0,1=q0-q1
+ ; m2-7=free
+ ; m12=free
+
+ ; load p0-1
+ mova m3, [%%p0]
+ mova m4, [%%p1]
+
+ ; fm mb_edge portion
+ psubw m5, m3, m0 ; q0-p0
+ psubw m6, m4, m1 ; q1-p1
+%if ARCH_X86_64
+ ABS2 m5, m6, m7, m12 ; abs(q0-p0) | abs(q1-p1)
+%else
+ ABS1 m5, m7 ; abs(q0-p0)
+ ABS1 m6, m7 ; abs(q1-p1)
+%endif
+ paddw m5, m5
+ psraw m6, 1
+ paddw m6, m5 ; abs(q0-p0)*2+(abs(q1-p1)>>1)
+ pcmpgtw m6, reg_E
+ por m2, m6
+ SCRATCH 2, 12, rsp+(%%off+3)*mmsize, FM
+
+ ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+ ; r9[m15]=!flat8out[q]
+ ; r10[m13]=hev[q]
+ ; r11[m14]=!flat8in[q]
+ ; r12[m12]=!fm[q]
+ ; m3-4=q0-1
+ ; m0-2/5-7=free
+
+ ; load p4-7 data
+ SWAP 3, 0 ; p0
+ SWAP 4, 1 ; p1
+%if %2 == 16
+ mova m7, [%%p7]
+ mova m6, [%%p6]
+ mova m5, [%%p5]
+ mova m4, [%%p4]
+
+ ; flat8out p portion
+ FLAT8OUT_HALF
+ por m7, reg_F8O
+ SCRATCH 7, 15, rsp+(%%off+6)*mmsize, F8O
+%endif
+
+ ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+ ; r9[m15]=!flat8out
+ ; r10[m13]=hev[q]
+ ; r11[m14]=!flat8in[q]
+ ; r12[m12]=!fm[q]
+ ; m0=p0
+ ; m1-7=free
+
+ ; load p2-3 data
+ mova m2, [%%p2]
+ mova m3, [%%p3]
+
+ ; flat8in|fm|hev p portion
+ FLAT8IN_HALF %2
+ por m7, reg_HEV
+%if %2 > 4
+ por m4, reg_F8I
+%endif
+ por m2, reg_FM
+%if %2 > 4
+ por m4, m2 ; !flat8|!fm
+%if %2 == 16
+ por m5, m4, reg_F8O ; !flat16|!fm
+ pandn m2, m4 ; filter4_mask
+ pandn m4, m5 ; filter8_mask
+ pxor m5, [pw_m1] ; filter16_mask
+ SCRATCH 5, 15, rsp+(%%off+6)*mmsize, F16M
+%else
+ pandn m2, m4 ; filter4_mask
+ pxor m4, [pw_m1] ; filter8_mask
+%endif
+ SCRATCH 4, 14, rsp+(%%off+5)*mmsize, F8M
+%else
+ pxor m2, [pw_m1] ; filter4_mask
+%endif
+ SCRATCH 7, 13, rsp+(%%off+4)*mmsize, HEV
+ SCRATCH 2, 12, rsp+(%%off+3)*mmsize, F4M
+
+ ; r9[m15]=filter16_mask
+ ; r10[m13]=hev
+ ; r11[m14]=filter8_mask
+ ; r12[m12]=filter4_mask
+ ; m0,1=p0-p1
+ ; m2-7=free
+ ; m8-11=free
+
+%if %2 > 4
+%if %2 == 16
+ ; filter_14
+ mova m2, [%%p7]
+ mova m3, [%%p6]
+ mova m6, [%%p5]
+ mova m7, [%%p4]
+ PRELOAD 8, %%p3, P3
+ PRELOAD 9, %%p2, P2
+%endif
+ PRELOAD 10, %%q0, Q0
+ PRELOAD 11, %%q1, Q1
+%if %2 == 16
+ psllw m4, m2, 3
+ paddw m5, m3, m3
+ paddw m4, m6
+ paddw m5, m7
+ paddw m4, reg_P3
+ paddw m5, reg_P2
+ paddw m4, m1
+ paddw m5, m0
+ paddw m4, reg_Q0 ; q0+p1+p3+p5+p7*8
+ psubw m5, m2 ; p0+p2+p4+p6*2-p7
+ paddw m4, [pw_8]
+ paddw m5, m4 ; q0+p0+p1+p2+p3+p4+p5+p6*2+p7*7+8
+
+ ; below, we use r0-5 for storing pre-filter pixels for subsequent subtraction
+ ; at the end of the filter
+
+ mova [rsp+0*mmsize], m3
+ FILTER_STEP m4, m5, F16M, 4, %%p6, m3, m2, m6, reg_Q1
+%endif
+ mova m3, [%%q2]
+%if %2 == 16
+ mova [rsp+1*mmsize], m6
+ FILTER_STEP m4, m5, F16M, 4, %%p5, m6, m2, m7, m3
+%endif
+ mova m6, [%%q3]
+%if %2 == 16
+ mova [rsp+2*mmsize], m7
+ FILTER_STEP m4, m5, F16M, 4, %%p4, m7, m2, reg_P3, m6
+ mova m7, [%%q4]
+%if ARCH_X86_64
+ mova [rsp+3*mmsize], reg_P3
+%else
+ mova m4, reg_P3
+ mova [rsp+3*mmsize], m4
+%endif
+ FILTER_STEP m4, m5, F16M, 4, %%p3, reg_P3, m2, reg_P2, m7
+ PRELOAD 8, %%q5, Q5
+%if ARCH_X86_64
+ mova [rsp+4*mmsize], reg_P2
+%else
+ mova m4, reg_P2
+ mova [rsp+4*mmsize], m4
+%endif
+ FILTER_STEP m4, m5, F16M, 4, %%p2, reg_P2, m2, m1, reg_Q5
+ PRELOAD 9, %%q6, Q6
+ mova [rsp+5*mmsize], m1
+ FILTER_STEP m4, m5, F16M, 4, %%p1, m1, m2, m0, reg_Q6
+ mova m1, [%%q7]
+ FILTER_STEP m4, m5, F16M, 4, %%p0, m0, m2, reg_Q0, m1, 1
+ FILTER_STEP m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1, ARCH_X86_64
+ FILTER_STEP m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3, m1, ARCH_X86_64
+ FILTER_STEP m4, m5, F16M, 4, %%q2, m3, [rsp+2*mmsize], m6, m1, 1
+ FILTER_STEP m4, m5, F16M, 4, %%q3, m6, [rsp+3*mmsize], m7, m1
+ FILTER_STEP m4, m5, F16M, 4, %%q4, m7, [rsp+4*mmsize], reg_Q5, m1
+ FILTER_STEP m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1
+ FILTER_STEP m4, m5, F16M, 4, %%q6, reg_Q6
+
+ mova m7, [%%p1]
+%else
+ SWAP 1, 7
+%endif
+
+ mova m2, [%%p3]
+ mova m1, [%%p2]
+
+ ; reg_Q0-1 (m10-m11)
+ ; m0=p0
+ ; m1=p2
+ ; m2=p3
+ ; m3=q2
+ ; m4-5=free
+ ; m6=q3
+ ; m7=p1
+ ; m8-9 unused
+
+ ; filter_6
+ psllw m4, m2, 2
+ paddw m5, m1, m1
+ paddw m4, m7
+ psubw m5, m2
+ paddw m4, m0
+ paddw m5, reg_Q0
+ paddw m4, [pw_4]
+ paddw m5, m4
+
+%if ARCH_X86_64
+ mova m8, m1
+ mova m9, m7
+%else
+ mova [rsp+0*mmsize], m1
+ mova [rsp+1*mmsize], m7
+%endif
+%ifidn %1, v
+ FILTER_STEP m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1
+%else
+ FILTER_STEP m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1, 1
+%endif
+ FILTER_STEP m4, m5, F8M, 3, %%p1, m7, m2, m0, m3, 1
+ FILTER_STEP m4, m5, F8M, 3, %%p0, m0, m2, reg_Q0, m6, 1
+%if ARCH_X86_64
+ FILTER_STEP m4, m5, F8M, 3, %%q0, reg_Q0, m8, reg_Q1, m6, ARCH_X86_64
+ FILTER_STEP m4, m5, F8M, 3, %%q1, reg_Q1, m9, m3, m6, ARCH_X86_64
+%else
+ FILTER_STEP m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64
+ FILTER_STEP m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3, m6, ARCH_X86_64
+%endif
+ FILTER_STEP m4, m5, F8M, 3, %%q2, m3
+
+ UNSCRATCH 2, 10, %%q0
+ UNSCRATCH 6, 11, %%q1
+%else
+ SWAP 1, 7
+ mova m2, [%%q0]
+ mova m6, [%%q1]
+%endif
+ UNSCRATCH 3, 13, rsp+(%%off+4)*mmsize, HEV
+
+ ; m0=p0
+ ; m1=p2
+ ; m2=q0
+ ; m3=hev_mask
+ ; m4-5=free
+ ; m6=q1
+ ; m7=p1
+
+ ; filter_4
+ psubw m4, m7, m6 ; p1-q1
+ psubw m5, m2, m0 ; q0-p0
+ pand m4, m3
+ pminsw m4, [pw_ %+ %%maxsgn]
+ pmaxsw m4, [pw_ %+ %%minsgn] ; clip_intp2(p1-q1, bpp-1) -> f
+ paddw m4, m5
+ paddw m5, m5
+ paddw m4, m5 ; 3*(q0-p0)+f
+ pminsw m4, [pw_ %+ %%maxsgn]
+ pmaxsw m4, [pw_ %+ %%minsgn] ; clip_intp2(3*(q0-p0)+f, bpp-1) -> f
+ pand m4, reg_F4M
+ paddw m5, m4, [pw_4]
+ paddw m4, [pw_3]
+ pminsw m5, [pw_ %+ %%maxsgn]
+ pminsw m4, [pw_ %+ %%maxsgn]
+ psraw m5, 3 ; min_intp2(f+4, bpp-1)>>3 -> f1
+ psraw m4, 3 ; min_intp2(f+3, bpp-1)>>3 -> f2
+ psubw m2, m5 ; q0-f1
+ paddw m0, m4 ; p0+f2
+ pandn m3, m5 ; f1 & !hev (for p1/q1 adj)
+ pxor m4, m4
+ mova m5, [pw_ %+ %%maxusgn]
+ pmaxsw m2, m4
+ pmaxsw m0, m4
+ pminsw m2, m5
+ pminsw m0, m5
+%if cpuflag(ssse3)
+ pmulhrsw m3, [pw_16384] ; (f1+1)>>1
+%else
+ paddw m3, [pw_1]
+ psraw m3, 1
+%endif
+ paddw m7, m3 ; p1+f
+ psubw m6, m3 ; q1-f
+ pmaxsw m7, m4
+ pmaxsw m6, m4
+ pminsw m7, m5
+ pminsw m6, m5
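+
+ ; for reference, per pixel where filter4_mask is set this is roughly the
+ ; following C, with N = bpp - 1, MAX = (1 << bpp) - 1 (names illustrative):
+ ;
+ ;   f  = hev ? av_clip_intp2(p1 - q1, N) : 0;
+ ;   f  = av_clip_intp2(f + 3 * (q0 - p0), N);
+ ;   f1 = FFMIN(f + 4, (1 << N) - 1) >> 3;
+ ;   f2 = FFMIN(f + 3, (1 << N) - 1) >> 3;
+ ;   q0 = av_clip(q0 - f1, 0, MAX);
+ ;   p0 = av_clip(p0 + f2, 0, MAX);
+ ;   if (!hev) {
+ ;       q1 = av_clip(q1 - ((f1 + 1) >> 1), 0, MAX);
+ ;       p1 = av_clip(p1 + ((f1 + 1) >> 1), 0, MAX);
+ ;   }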
+
+ ; store
+%ifidn %1, v
+ mova [%%p1], m7
+ mova [%%p0], m0
+ mova [%%q0], m2
+ mova [%%q1], m6
+%else ; %1 == h
+%if %2 == 4
+ TRANSPOSE4x4W 7, 0, 2, 6, 1
+ movh [dst0q+strideq*0-4], m7
+ movhps [dst0q+strideq*1-4], m7
+ movh [dst0q+strideq*2-4], m0
+ movhps [dst0q+stride3q -4], m0
+ movh [dst4q+strideq*0-4], m2
+ movhps [dst4q+strideq*1-4], m2
+ movh [dst4q+strideq*2-4], m6
+ movhps [dst4q+stride3q -4], m6
+%elif %2 == 8
+ mova m3, [%%p3]
+ mova m4, [%%q2]
+ mova m5, [%%q3]
+
+%if ARCH_X86_64
+ TRANSPOSE8x8W 3, 1, 7, 0, 2, 6, 4, 5, 8
+%else
+ TRANSPOSE8x8W 3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1
+ mova m2, [%%q0]
+%endif
+
+ movu [dst0q+strideq*0-8], m3
+ movu [dst0q+strideq*1-8], m1
+ movu [dst0q+strideq*2-8], m7
+ movu [dst0q+stride3q -8], m0
+ movu [dst4q+strideq*0-8], m2
+ movu [dst4q+strideq*1-8], m6
+ movu [dst4q+strideq*2-8], m4
+ movu [dst4q+stride3q -8], m5
+%else ; %2 == 16
+ SCRATCH 2, 8, %%q0
+ SCRATCH 6, 9, %%q1
+ mova m2, [%%p7]
+ mova m3, [%%p6]
+ mova m4, [%%p5]
+ mova m5, [%%p4]
+ mova m6, [%%p3]
+
+%if ARCH_X86_64
+ TRANSPOSE8x8W 2, 3, 4, 5, 6, 1, 7, 0, 10
+%else
+ mova [%%p1], m7
+ TRANSPOSE8x8W 2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1
+%endif
+
+ mova [dst0q+strideq*0-16], m2
+ mova [dst0q+strideq*1-16], m3
+ mova [dst0q+strideq*2-16], m4
+ mova [dst0q+stride3q -16], m5
+%if ARCH_X86_64
+ mova [dst4q+strideq*0-16], m6
+%endif
+ mova [dst4q+strideq*1-16], m1
+ mova [dst4q+strideq*2-16], m7
+ mova [dst4q+stride3q -16], m0
+
+ UNSCRATCH 2, 8, %%q0
+ UNSCRATCH 6, 9, %%q1
+ mova m0, [%%q2]
+ mova m1, [%%q3]
+ mova m3, [%%q4]
+ mova m4, [%%q5]
+%if ARCH_X86_64
+ mova m5, [%%q6]
+%endif
+ mova m7, [%%q7]
+
+%if ARCH_X86_64
+ TRANSPOSE8x8W 2, 6, 0, 1, 3, 4, 5, 7, 8
+%else
+ TRANSPOSE8x8W 2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1
+%endif
+
+ mova [dst0q+strideq*0], m2
+ mova [dst0q+strideq*1], m6
+ mova [dst0q+strideq*2], m0
+ mova [dst0q+stride3q ], m1
+%if ARCH_X86_64
+ mova [dst4q+strideq*0], m3
+%endif
+ mova [dst4q+strideq*1], m4
+ mova [dst4q+strideq*2], m5
+ mova [dst4q+stride3q ], m7
+%endif ; %2
+%endif ; %1
+ RET
+%endmacro
+
+%macro LOOP_FILTER_CPUSETS 3
+INIT_XMM sse2
+LOOP_FILTER %1, %2, %3
+INIT_XMM ssse3
+LOOP_FILTER %1, %2, %3
+INIT_XMM avx
+LOOP_FILTER %1, %2, %3
+%endmacro
+
+%macro LOOP_FILTER_WDSETS 2
+LOOP_FILTER_CPUSETS %1, 4, %2
+LOOP_FILTER_CPUSETS %1, 8, %2
+LOOP_FILTER_CPUSETS %1, 16, %2
+%endmacro
+
+LOOP_FILTER_WDSETS h, 10
+LOOP_FILTER_WDSETS v, 10
+LOOP_FILTER_WDSETS h, 12
+LOOP_FILTER_WDSETS v, 12
diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
index c9701aea18..f64161b2c2 100644
--- a/libavcodec/x86/vp9mc.asm
+++ b/libavcodec/x86/vp9mc.asm
@@ -3,20 +3,20 @@
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -45,6 +45,13 @@ times 8 dw %7
times 8 dw %8
%endmacro
+%macro F8_16BPP_TAPS 8
+times 8 dw %1, %2
+times 8 dw %3, %4
+times 8 dw %5, %6
+times 8 dw %7, %8
+%endmacro
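+
+; each line interleaves two adjacent taps so that a pmaddwd on word-interleaved
+; pixel pairs yields, per dword lane, roughly:
+;
+;   partial = px[n] * tap[2*i] + px[n + 1] * tap[2*i + 1]
+;
+; which is the layout the 16 bpp MC code consumes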
+
%macro FILTER 1
const filters_%1 ; smooth
F8_TAPS -3, -1, 32, 64, 38, 1, -3, 0
@@ -102,12 +109,15 @@ FILTER ssse3
%define F8_TAPS F8_SSE2_TAPS
; int16_t ff_filters_sse2[3][15][8][8]
FILTER sse2
+%define F8_TAPS F8_16BPP_TAPS
+; int16_t ff_filters_16bpp[3][15][4][16]
+FILTER 16bpp
SECTION .text
%macro filter_sse2_h_fn 1
%assign %%px mmsize/2
-cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 15, dst, dstride, src, sstride, h, filtery
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 15, dst, dstride, src, sstride, h, filtery
pxor m5, m5
mova m6, [pw_64]
mova m7, [filteryq+ 0]
@@ -192,7 +202,7 @@ filter_sse2_h_fn avg
%macro filter_h_fn 1
%assign %%px mmsize/2
-cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, dstride, src, sstride, h, filtery
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h, filtery
mova m6, [pw_256]
mova m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
@@ -253,7 +263,7 @@ filter_h_fn avg
%if ARCH_X86_64
%macro filter_hx2_fn 1
%assign %%px mmsize
-cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, dstride, src, sstride, h, filtery
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 14, dst, dstride, src, sstride, h, filtery
mova m13, [pw_256]
mova m8, [filteryq+ 0]
mova m9, [filteryq+32]
@@ -315,9 +325,9 @@ filter_hx2_fn avg
%macro filter_sse2_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3
mov filteryq, r5mp
%define hd r4mp
%endif
@@ -413,9 +423,9 @@ filter_sse2_v_fn avg
%macro filter_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
mov filteryq, r5mp
%define hd r4mp
%endif
@@ -486,7 +496,7 @@ filter_v_fn avg
%macro filter_vx2_fn 1
%assign %%px mmsize
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
mova m13, [pw_256]
lea sstride3q, [sstrideq*3]
lea src4q, [srcq+sstrideq]
@@ -552,7 +562,7 @@ filter_vx2_fn avg
%endif ; ARCH_X86_64
-%macro fpel_fn 6
+%macro fpel_fn 6-8 0, 4
%if %2 == 4
%define %%srcfn movh
%define %%dstfn movh
@@ -561,29 +571,57 @@ filter_vx2_fn avg
%define %%dstfn mova
%endif
+%if %7 == 8
+%define %%pavg pavgb
+%define %%szsuf _8
+%elif %7 == 16
+%define %%pavg pavgw
+%define %%szsuf _16
+%else
+%define %%szsuf
+%endif
+
%if %2 <= mmsize
-cglobal vp9_%1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
+cglobal vp9_%1%2 %+ %%szsuf, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
lea sstride3q, [sstrideq*3]
lea dstride3q, [dstrideq*3]
%else
-cglobal vp9_%1%2, 5, 5, 4, dst, dstride, src, sstride, h
+cglobal vp9_%1%2 %+ %%szsuf, 5, 5, %8, dst, dstride, src, sstride, h
%endif
.loop:
%%srcfn m0, [srcq]
%%srcfn m1, [srcq+s%3]
%%srcfn m2, [srcq+s%4]
%%srcfn m3, [srcq+s%5]
+%if %2/mmsize == 8
+ %%srcfn m4, [srcq+mmsize*4]
+ %%srcfn m5, [srcq+mmsize*5]
+ %%srcfn m6, [srcq+mmsize*6]
+ %%srcfn m7, [srcq+mmsize*7]
+%endif
lea srcq, [srcq+sstrideq*%6]
%ifidn %1, avg
- pavgb m0, [dstq]
- pavgb m1, [dstq+d%3]
- pavgb m2, [dstq+d%4]
- pavgb m3, [dstq+d%5]
+ %%pavg m0, [dstq]
+ %%pavg m1, [dstq+d%3]
+ %%pavg m2, [dstq+d%4]
+ %%pavg m3, [dstq+d%5]
+%if %2/mmsize == 8
+ %%pavg m4, [dstq+mmsize*4]
+ %%pavg m5, [dstq+mmsize*5]
+ %%pavg m6, [dstq+mmsize*6]
+ %%pavg m7, [dstq+mmsize*7]
+%endif
%endif
%%dstfn [dstq], m0
%%dstfn [dstq+d%3], m1
%%dstfn [dstq+d%4], m2
%%dstfn [dstq+d%5], m3
+%if %2/mmsize == 8
+ %%dstfn [dstq+mmsize*4], m4
+ %%dstfn [dstq+mmsize*5], m5
+ %%dstfn [dstq+mmsize*6], m6
+ %%dstfn [dstq+mmsize*7], m7
+%endif
lea dstq, [dstq+dstrideq*%6]
sub hd, %6
jnz .loop
@@ -598,23 +636,38 @@ INIT_MMX mmx
fpel_fn put, 4, strideq, strideq*2, stride3q, 4
fpel_fn put, 8, strideq, strideq*2, stride3q, 4
INIT_MMX mmxext
-fpel_fn avg, 4, strideq, strideq*2, stride3q, 4
-fpel_fn avg, 8, strideq, strideq*2, stride3q, 4
+fpel_fn avg, 4, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 8, strideq, strideq*2, stride3q, 4, 8
INIT_XMM sse
fpel_fn put, 16, strideq, strideq*2, stride3q, 4
fpel_fn put, 32, mmsize, strideq, strideq+mmsize, 2
fpel_fn put, 64, mmsize, mmsize*2, mmsize*3, 1
+fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1, 0, 8
INIT_XMM sse2
-fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
-fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2
-fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1
+fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2, 8
+fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1, 8
INIT_YMM avx
fpel_fn put, 32, strideq, strideq*2, stride3q, 4
fpel_fn put, 64, mmsize, strideq, strideq+mmsize, 2
+fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2, 8
+%endif
+INIT_MMX mmxext
+fpel_fn avg, 8, strideq, strideq*2, stride3q, 4, 16
+INIT_XMM sse2
+fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 16
+fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2, 16
+fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1, 16
+fpel_fn avg, 128, mmsize, mmsize*2, mmsize*3, 1, 16, 8
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
-fpel_fn avg, 32, strideq, strideq*2, stride3q, 4
-fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2
+fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 16
+fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2, 16
+fpel_fn avg, 128, mmsize, mmsize*2, mmsize*3, 1, 16
%endif
%undef s16
%undef d16
diff --git a/libavcodec/x86/vp9mc_16bpp.asm b/libavcodec/x86/vp9mc_16bpp.asm
new file mode 100644
index 0000000000..9a462eaf80
--- /dev/null
+++ b/libavcodec/x86/vp9mc_16bpp.asm
@@ -0,0 +1,431 @@
+;******************************************************************************
+;* VP9 MC SIMD optimizations
+;*
+;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pd_64: times 8 dd 64
+
+cextern pw_1023
+cextern pw_4095
+
+SECTION .text
+
+%macro filter_h4_fn 1-2 12
+cglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+ mova m5, [pw_1023]
+.body:
+%if notcpuflag(sse4) && ARCH_X86_64
+ pxor m11, m11
+%endif
+ mova m6, [pd_64]
+ mova m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+ mova m8, [filteryq+32]
+ mova m9, [filteryq+64]
+ mova m10, [filteryq+96]
+%endif
+.loop:
+ movh m0, [srcq-6]
+ movh m1, [srcq-4]
+ movh m2, [srcq-2]
+ movh m3, [srcq+0]
+ movh m4, [srcq+2]
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ pmaddwd m0, m7
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m2, m8
+%else
+ pmaddwd m2, [filteryq+32]
+%endif
+ movu m1, [srcq+4]
+ movu m3, [srcq+6]
+ paddd m0, m2
+ movu m2, [srcq+8]
+ add srcq, sstrideq
+ punpcklwd m4, m1
+ punpcklwd m3, m2
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m4, m9
+ pmaddwd m3, m10
+%else
+ pmaddwd m4, [filteryq+64]
+ pmaddwd m3, [filteryq+96]
+%endif
+ paddd m0, m4
+ paddd m0, m3
+ paddd m0, m6
+ psrad m0, 7
+%if cpuflag(sse4)
+ packusdw m0, m0
+%else
+ packssdw m0, m0
+%endif
+%ifidn %1, avg
+ movh m1, [dstq]
+%endif
+ pminsw m0, m5
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+ pmaxsw m0, m11
+%else
+ pxor m2, m2
+ pmaxsw m0, m2
+%endif
+%endif
+%ifidn %1, avg
+ pavgw m0, m1
+%endif
+ movh [dstq], m0
+ add dstq, dstrideq
+ dec hd
+ jg .loop
+ RET
+
+cglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+ mova m5, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body
+%endmacro
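+
+; per output pixel the 4-wide horizontal filter above computes roughly the
+; following (filter[] holds the 8 taps, which sum to 128; illustrative C):
+;
+;   sum = 0;
+;   for (k = 0; k < 8; k++)
+;       sum += src[x + k - 3] * filter[k];
+;   dst[x] = av_clip((sum + 64) >> 7, 0, (1 << bpp) - 1);
+;
+; the avg variant additionally rounds towards the existing destination pixel
+; with pavgw, i.e. (filtered + dst[x] + 1) >> 1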
+
+INIT_XMM sse2
+filter_h4_fn put
+filter_h4_fn avg
+
+%macro filter_h_fn 1-2 12
+%assign %%px mmsize/2
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+ mova m5, [pw_1023]
+.body:
+%if notcpuflag(sse4) && ARCH_X86_64
+ pxor m11, m11
+%endif
+ mova m6, [pd_64]
+ mova m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+ mova m8, [filteryq+32]
+ mova m9, [filteryq+64]
+ mova m10, [filteryq+96]
+%endif
+.loop:
+ movu m0, [srcq-6]
+ movu m1, [srcq-4]
+ movu m2, [srcq-2]
+ movu m3, [srcq+0]
+ movu m4, [srcq+2]
+ pmaddwd m0, m7
+ pmaddwd m1, m7
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m2, m8
+ pmaddwd m3, m8
+ pmaddwd m4, m9
+%else
+ pmaddwd m2, [filteryq+32]
+ pmaddwd m3, [filteryq+32]
+ pmaddwd m4, [filteryq+64]
+%endif
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m4
+ movu m2, [srcq+4]
+ movu m3, [srcq+6]
+ movu m4, [srcq+8]
+ add srcq, sstrideq
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m2, m9
+ pmaddwd m3, m10
+ pmaddwd m4, m10
+%else
+ pmaddwd m2, [filteryq+64]
+ pmaddwd m3, [filteryq+96]
+ pmaddwd m4, [filteryq+96]
+%endif
+ paddd m1, m2
+ paddd m0, m3
+ paddd m1, m4
+ paddd m0, m6
+ paddd m1, m6
+ psrad m0, 7
+ psrad m1, 7
+%if cpuflag(sse4)
+ packusdw m0, m0
+ packusdw m1, m1
+%else
+ packssdw m0, m0
+ packssdw m1, m1
+%endif
+ punpcklwd m0, m1
+ pminsw m0, m5
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+ pmaxsw m0, m11
+%else
+ pxor m2, m2
+ pmaxsw m0, m2
+%endif
+%endif
+%ifidn %1, avg
+ pavgw m0, [dstq]
+%endif
+ mova [dstq], m0
+ add dstq, dstrideq
+ dec hd
+ jg .loop
+ RET
+
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+ mova m5, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+filter_h_fn put
+filter_h_fn avg
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+filter_h_fn put
+filter_h_fn avg
+%endif
+
+%macro filter_v4_fn 1-2 12
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+ mov filteryq, r5mp
+%define hd r4mp
+%endif
+ mova m5, [pw_1023]
+.body:
+%if notcpuflag(sse4) && ARCH_X86_64
+ pxor m11, m11
+%endif
+ mova m6, [pd_64]
+ lea sstride3q, [sstrideq*3]
+ lea src4q, [srcq+sstrideq]
+ sub srcq, sstride3q
+ mova m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+ mova m8, [filteryq+ 32]
+ mova m9, [filteryq+ 64]
+ mova m10, [filteryq+ 96]
+%endif
+.loop:
+ ; FIXME maybe reuse loads from previous rows, or just
+ ; more generally unroll this to prevent multiple loads of
+ ; the same data?
+ movh m0, [srcq]
+ movh m1, [srcq+sstrideq]
+ movh m2, [srcq+sstrideq*2]
+ movh m3, [srcq+sstride3q]
+ add srcq, sstrideq
+ movh m4, [src4q]
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ pmaddwd m0, m7
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m2, m8
+%else
+ pmaddwd m2, [filteryq+ 32]
+%endif
+ movh m1, [src4q+sstrideq]
+ movh m3, [src4q+sstrideq*2]
+ paddd m0, m2
+ movh m2, [src4q+sstride3q]
+ add src4q, sstrideq
+ punpcklwd m4, m1
+ punpcklwd m3, m2
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m4, m9
+ pmaddwd m3, m10
+%else
+ pmaddwd m4, [filteryq+ 64]
+ pmaddwd m3, [filteryq+ 96]
+%endif
+ paddd m0, m4
+ paddd m0, m3
+ paddd m0, m6
+ psrad m0, 7
+%if cpuflag(sse4)
+ packusdw m0, m0
+%else
+ packssdw m0, m0
+%endif
+%ifidn %1, avg
+ movh m1, [dstq]
+%endif
+ pminsw m0, m5
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+ pmaxsw m0, m11
+%else
+ pxor m2, m2
+ pmaxsw m0, m2
+%endif
+%endif
+%ifidn %1, avg
+ pavgw m0, m1
+%endif
+ movh [dstq], m0
+ add dstq, dstrideq
+ dec hd
+ jg .loop
+ RET
+
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+ mov filteryq, r5mp
+%endif
+ mova m5, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+filter_v4_fn put
+filter_v4_fn avg
+
+%macro filter_v_fn 1-2 13
+%assign %%px mmsize/2
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+ mov filteryq, r5mp
+%define hd r4mp
+%endif
+ mova m5, [pw_1023]
+.body:
+%if notcpuflag(sse4) && ARCH_X86_64
+ pxor m12, m12
+%endif
+%if ARCH_X86_64
+ mova m11, [pd_64]
+%endif
+ lea sstride3q, [sstrideq*3]
+ lea src4q, [srcq+sstrideq]
+ sub srcq, sstride3q
+ mova m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+ mova m8, [filteryq+ 32]
+ mova m9, [filteryq+ 64]
+ mova m10, [filteryq+ 96]
+%endif
+.loop:
+ ; FIXME maybe reuse loads from previous rows, or just
+ ; more generally unroll this to prevent multiple loads of
+ ; the same data?
+ movu m0, [srcq]
+ movu m1, [srcq+sstrideq]
+ movu m2, [srcq+sstrideq*2]
+ movu m3, [srcq+sstride3q]
+ add srcq, sstrideq
+ movu m4, [src4q]
+ SBUTTERFLY wd, 0, 1, 6
+ SBUTTERFLY wd, 2, 3, 6
+ pmaddwd m0, m7
+ pmaddwd m1, m7
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m2, m8
+ pmaddwd m3, m8
+%else
+ pmaddwd m2, [filteryq+ 32]
+ pmaddwd m3, [filteryq+ 32]
+%endif
+ paddd m0, m2
+ paddd m1, m3
+ movu m2, [src4q+sstrideq]
+ movu m3, [src4q+sstrideq*2]
+ SBUTTERFLY wd, 4, 2, 6
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m4, m9
+ pmaddwd m2, m9
+%else
+ pmaddwd m4, [filteryq+ 64]
+ pmaddwd m2, [filteryq+ 64]
+%endif
+ paddd m0, m4
+ paddd m1, m2
+ movu m4, [src4q+sstride3q]
+ add src4q, sstrideq
+ SBUTTERFLY wd, 3, 4, 6
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m3, m10
+ pmaddwd m4, m10
+%else
+ pmaddwd m3, [filteryq+ 96]
+ pmaddwd m4, [filteryq+ 96]
+%endif
+ paddd m0, m3
+ paddd m1, m4
+%if ARCH_X86_64
+ paddd m0, m11
+ paddd m1, m11
+%else
+ paddd m0, [pd_64]
+ paddd m1, [pd_64]
+%endif
+ psrad m0, 7
+ psrad m1, 7
+%if cpuflag(sse4)
+ packusdw m0, m1
+%else
+ packssdw m0, m1
+%endif
+ pminsw m0, m5
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+ pmaxsw m0, m12
+%else
+ pxor m2, m2
+ pmaxsw m0, m2
+%endif
+%endif
+%ifidn %1, avg
+ pavgw m0, [dstq]
+%endif
+ mova [dstq], m0
+ add dstq, dstrideq
+ dec hd
+ jg .loop
+ RET
+
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+ mov filteryq, r5mp
+%endif
+ mova m5, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+filter_v_fn put
+filter_v_fn avg
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+filter_v_fn put
+filter_v_fn avg
+%endif
diff --git a/libavcodec/x86/w64xmmtest.c b/libavcodec/x86/w64xmmtest.c
index 8925573ade..8f2b8a6299 100644
--- a/libavcodec/x86/w64xmmtest.c
+++ b/libavcodec/x86/w64xmmtest.c
@@ -2,20 +2,20 @@
* check XMM registers for clobbers on Win64
* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
new file mode 100644
index 0000000000..0220885da6
--- /dev/null
+++ b/libavcodec/x86/xvididct.asm
@@ -0,0 +1,983 @@
+; XVID MPEG-4 VIDEO CODEC
+;
+; Conversion from gcc syntax to x264asm syntax with modifications
+; by Christophe Gisquet <christophe.gisquet@gmail.com>
+;
+; =========== SSE2 inverse discrete cosine transform ===========
+;
+; Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
+;
+; Conversion to gcc syntax with modifications
+; by Alexander Strange <astrange@ithinksw.com>
+;
+; Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
+;
+; Vertical pass is an implementation of the scheme:
+; Loeffler C., Ligtenberg A., and Moschytz C.S.:
+; Practical Fast 1D DCT Algorithm with Eleven Multiplications,
+; Proc. ICASSP 1989, 988-991.
+;
+; Horizontal pass is a double 4x4 vector/matrix multiplication,
+; (see also Intel's Application Note 922:
+; http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
+; Copyright (C) 1999 Intel Corporation)
+;
+; More details at http://skal.planet-d.net/coding/dct.html
+;
+; ======= MMX and XMM forward discrete cosine transform =======
+;
+; Copyright(C) 2001 Peter Ross <pross@xvid.org>
+;
+; Originally provided by Intel at AP-922
+; http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
+; (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
+; but in a limited edition.
+; New macro implements a column part for precise iDCT
+; The routine precision now satisfies IEEE standard 1180-1990.
+;
+; Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
+; Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
+;
+; http://www.elecard.com/peter/idct.html
+; http://www.linuxvideo.org/mpeg2dec/
+;
+; These examples contain code fragments for first stage iDCT 8x8
+; (for rows) and first stage DCT 8x8 (for columns)
+;
+; conversion to gcc syntax by Michael Niedermayer
+;
+; ======================================================================
+;
+; This file is part of FFmpeg.
+;
+; FFmpeg is free software; you can redistribute it and/or
+; modify it under the terms of the GNU Lesser General Public
+; License as published by the Free Software Foundation; either
+; version 2.1 of the License, or (at your option) any later version.
+;
+; FFmpeg is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+; Lesser General Public License for more details.
+;
+; You should have received a copy of the GNU Lesser General Public License
+; along with FFmpeg; if not, write to the Free Software Foundation,
+; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+; Similar to tg_1_16 in MMX code
+tan1: times 8 dw 13036
+tan2: times 8 dw 27146
+tan3: times 8 dw 43790
+sqrt2: times 8 dw 23170
+
+; SSE2 tables
+iTab1: dw 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d
+ dw 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61
+ dw 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7
+ dw 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
+iTab2: dw 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5
+ dw 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04
+ dw 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41
+ dw 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
+iTab3: dw 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf
+ dw 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf
+ dw 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d
+ dw 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
+iTab4: dw 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746
+ dw 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac
+ dw 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df
+ dw 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
+
+%if ARCH_X86_32
+; -----------------------------------------------------------------------------
+;
+; The first stage iDCT 8x8 - inverse DCTs of rows
+;
+; -----------------------------------------------------------------------------
+; The 8-point inverse DCT direct algorithm
+; -----------------------------------------------------------------------------
+;
+; static const short w[32] = {
+; FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16),
+; FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
+; FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16),
+; FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16),
+; FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16),
+; FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
+; FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16),
+; FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) };
+;
+; #define DCT_8_INV_ROW(x, y)
+; {
+; int a0, a1, a2, a3, b0, b1, b2, b3;
+;
+; a0 = x[0] * w[0] + x[2] * w[1] + x[4] * w[2] + x[6] * w[3];
+; a1 = x[0] * w[4] + x[2] * w[5] + x[4] * w[6] + x[6] * w[7];
+; a2 = x[0] * w[8] + x[2] * w[9] + x[4] * w[10] + x[6] * w[11];
+; a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
+; b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
+; b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
+; b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
+; b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
+;
+; y[0] = SHIFT_ROUND(a0 + b0);
+; y[1] = SHIFT_ROUND(a1 + b1);
+; y[2] = SHIFT_ROUND(a2 + b2);
+; y[3] = SHIFT_ROUND(a3 + b3);
+; y[4] = SHIFT_ROUND(a3 - b3);
+; y[5] = SHIFT_ROUND(a2 - b2);
+; y[6] = SHIFT_ROUND(a1 - b1);
+; y[7] = SHIFT_ROUND(a0 - b0);
+; }
+;
+; -----------------------------------------------------------------------------
+;
+; In this implementation the outputs of the iDCT-1D are multiplied
+; for rows 0,4 - by cos_4_16,
+; for rows 1,7 - by cos_1_16,
+; for rows 2,6 - by cos_2_16,
+; for rows 3,5 - by cos_3_16
+; and are shifted to the left for better accuracy.
+;
+; For the constants used,
+; FIX(float_const) = (short) (float_const * (1 << 15) + 0.5)
+;
+; -----------------------------------------------------------------------------
+
+; -----------------------------------------------------------------------------
+; Tables for mmx processors
+; -----------------------------------------------------------------------------
+
+; Table for rows 0,4 - constants are multiplied by cos_4_16
+tab_i_04_mmx: dw 16384, 16384, 16384, -16384
+ dw 21407, 8867, 8867, -21407 ; w07 w05 w03 w01
+ dw 16384, -16384, 16384, 16384 ; w14 w12 w10 w08
+ dw -8867, 21407, -21407, -8867 ; w15 w13 w11 w09
+ dw 22725, 12873, 19266, -22725 ; w22 w20 w18 w16
+ dw 19266, 4520, -4520, -12873 ; w23 w21 w19 w17
+ dw 12873, 4520, 4520, 19266 ; w30 w28 w26 w24
+ dw -22725, 19266, -12873, -22725 ; w31 w29 w27 w25
+; Table for rows 1,7 - constants are multiplied by cos_1_16
+ dw 22725, 22725, 22725, -22725 ; movq-> w06 w04 w02 w00
+ dw 29692, 12299, 12299, -29692 ; w07 w05 w03 w01
+ dw 22725, -22725, 22725, 22725 ; w14 w12 w10 w08
+ dw -12299, 29692, -29692, -12299 ; w15 w13 w11 w09
+ dw 31521, 17855, 26722, -31521 ; w22 w20 w18 w16
+ dw 26722, 6270, -6270, -17855 ; w23 w21 w19 w17
+ dw 17855, 6270, 6270, 26722 ; w30 w28 w26 w24
+ dw -31521, 26722, -17855, -31521 ; w31 w29 w27 w25
+; Table for rows 2,6 - constants are multiplied by cos_2_16
+ dw 21407, 21407, 21407, -21407 ; movq-> w06 w04 w02 w00
+ dw 27969, 11585, 11585, -27969 ; w07 w05 w03 w01
+ dw 21407, -21407, 21407, 21407 ; w14 w12 w10 w08
+ dw -11585, 27969, -27969, -11585 ; w15 w13 w11 w09
+ dw 29692, 16819, 25172, -29692 ; w22 w20 w18 w16
+ dw 25172, 5906, -5906, -16819 ; w23 w21 w19 w17
+ dw 16819, 5906, 5906, 25172 ; w30 w28 w26 w24
+ dw -29692, 25172, -16819, -29692 ; w31 w29 w27 w25
+; Table for rows 3,5 - constants are multiplied by cos_3_16
+ dw 19266, 19266, 19266, -19266 ; movq-> w06 w04 w02 w00
+ dw 25172, 10426, 10426, -25172 ; w07 w05 w03 w01
+ dw 19266, -19266, 19266, 19266 ; w14 w12 w10 w08
+ dw -10426, 25172, -25172, -10426 ; w15 w13 w11 w09
+ dw 26722, 15137, 22654, -26722 ; w22 w20 w18 w16
+ dw 22654, 5315, -5315, -15137 ; w23 w21 w19 w17
+ dw 15137, 5315, 5315, 22654 ; w30 w28 w26 w24
+ dw -26722, 22654, -15137, -26722 ; w31 w29 w27 w25
+
+; -----------------------------------------------------------------------------
+; Tables for xmm processors
+; -----------------------------------------------------------------------------
+
+; %3 for rows 0,4 - constants are multiplied by cos_4_16
+tab_i_04_xmm: dw 16384, 21407, 16384, 8867 ; movq-> w05 w04 w01 w00
+ dw 16384, 8867, -16384, -21407 ; w07 w06 w03 w02
+ dw 16384, -8867, 16384, -21407 ; w13 w12 w09 w08
+ dw -16384, 21407, 16384, -8867 ; w15 w14 w11 w10
+ dw 22725, 19266, 19266, -4520 ; w21 w20 w17 w16
+ dw 12873, 4520, -22725, -12873 ; w23 w22 w19 w18
+ dw 12873, -22725, 4520, -12873 ; w29 w28 w25 w24
+ dw 4520, 19266, 19266, -22725 ; w31 w30 w27 w26
+; %3 for rows 1,7 - constants are multiplied by cos_1_16
+ dw 22725, 29692, 22725, 12299 ; movq-> w05 w04 w01 w00
+ dw 22725, 12299, -22725, -29692 ; w07 w06 w03 w02
+ dw 22725, -12299, 22725, -29692 ; w13 w12 w09 w08
+ dw -22725, 29692, 22725, -12299 ; w15 w14 w11 w10
+ dw 31521, 26722, 26722, -6270 ; w21 w20 w17 w16
+ dw 17855, 6270, -31521, -17855 ; w23 w22 w19 w18
+ dw 17855, -31521, 6270, -17855 ; w29 w28 w25 w24
+ dw 6270, 26722, 26722, -31521 ; w31 w30 w27 w26
+; %3 for rows 2,6 - constants are multiplied by cos_2_16
+ dw 21407, 27969, 21407, 11585 ; movq-> w05 w04 w01 w00
+ dw 21407, 11585, -21407, -27969 ; w07 w06 w03 w02
+ dw 21407, -11585, 21407, -27969 ; w13 w12 w09 w08
+ dw -21407, 27969, 21407, -11585 ; w15 w14 w11 w10
+ dw 29692, 25172, 25172, -5906 ; w21 w20 w17 w16
+ dw 16819, 5906, -29692, -16819 ; w23 w22 w19 w18
+ dw 16819, -29692, 5906, -16819 ; w29 w28 w25 w24
+ dw 5906, 25172, 25172, -29692 ; w31 w30 w27 w26
+; %3 for rows 3,5 - constants are multiplied by cos_3_16
+ dw 19266, 25172, 19266, 10426 ; movq-> w05 w04 w01 w00
+ dw 19266, 10426, -19266, -25172 ; w07 w06 w03 w02
+ dw 19266, -10426, 19266, -25172 ; w13 w12 w09 w08
+ dw -19266, 25172, 19266, -10426 ; w15 w14 w11 w10
+ dw 26722, 22654, 22654, -5315 ; w21 w20 w17 w16
+ dw 15137, 5315, -26722, -15137 ; w23 w22 w19 w18
+ dw 15137, -26722, 5315, -15137 ; w29 w28 w25 w24
+ dw 5315, 22654, 22654, -26722 ; w31 w30 w27 w26
+%endif ; ~ARCH_X86_32
+
+; Similar to rounder_0 in MMX code
+; first 4 similar, then: 4*8->6*16 5*8->4*16 6/7*8->5*16
+walkenIdctRounders: times 4 dd 65536
+ times 4 dd 3597
+ times 4 dd 2260
+ times 4 dd 1203
+ times 4 dd 120
+ times 4 dd 512
+ times 2 dd 0
+
+pb_127: times 8 db 127
+
+SECTION .text
+
+; Temporary storage before the column pass
+%define ROW1 xmm6
+%define ROW3 xmm4
+%define ROW5 xmm5
+%define ROW7 xmm7
+
+%macro CLEAR_ODD 1
+ pxor %1, %1
+%endmacro
+%macro PUT_ODD 1
+ pshufhw %1, xmm2, 0x1B
+%endmacro
+
+%macro MOV32 2
+%if ARCH_X86_32
+ movdqa %2, %1
+%endif
+%endmacro
+
+%macro CLEAR_EVEN 1
+%if ARCH_X86_64
+ CLEAR_ODD %1
+%endif
+%endmacro
+
+%macro PUT_EVEN 1
+%if ARCH_X86_64
+ PUT_ODD %1
+%else
+ pshufhw xmm2, xmm2, 0x1B
+ movdqa %1, xmm2
+%endif
+%endmacro
+
+%if ARCH_X86_64
+%define ROW0 xmm8
+%define REG0 ROW0
+%define ROW2 xmm9
+%define REG2 ROW2
+%define ROW4 xmm10
+%define REG4 ROW4
+%define ROW6 xmm11
+%define REG6 ROW6
+%define XMMS xmm12
+%define SREG2 REG2
+%define TAN3 xmm13
+%define TAN1 xmm14
+%else
+%define ROW0 [BLOCK + 0*16]
+%define REG0 xmm4
+%define ROW2 [BLOCK + 2*16]
+%define REG2 xmm4
+%define ROW4 [BLOCK + 4*16]
+%define REG4 xmm6
+%define ROW6 [BLOCK + 6*16]
+%define REG6 xmm6
+%define XMMS xmm2
+%define SREG2 xmm7
+%define TAN3 xmm0
+%define TAN1 xmm2
+%endif
+
+%macro JZ 2
+ test %1, %1
+ jz .%2
+%endmacro
+
+%macro JNZ 2
+ test %1, %1
+ jnz .%2
+%endmacro
+
+%macro TEST_ONE_ROW 4 ; src, reg, clear, arg
+ %3 %4
+ movq mm1, [%1]
+ por mm1, [%1 + 8]
+ paddusb mm1, mm0
+ pmovmskb %2, mm1
+%endmacro
+
+;row1, row2, reg1, reg2, clear1, arg1, clear2, arg2
+%macro TEST_TWO_ROWS 8
+ %5 %6
+ %7 %8
+ movq mm1, [%1 + 0]
+ por mm1, [%1 + 8]
+ movq mm2, [%2 + 0]
+ por mm2, [%2 + 8]
+ paddusb mm1, mm0
+ paddusb mm2, mm0
+ pmovmskb %3, mm1
+ pmovmskb %4, mm2
+%endmacro
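+
+; both tests rely on paddusb with pb_127 (loaded into mm0 by the caller):
+; any nonzero coefficient byte saturates to >= 128 and sets its sign bit, so
+; pmovmskb is nonzero iff the tested 16-byte row has a nonzero coefficient;
+; roughly:
+;
+;   nonzero = 0;
+;   for (i = 0; i < 8; i++)
+;       nonzero |= row[i];   /* row: the 8 int16_t coefficients */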
+
+; IDCT pass on rows.
+%macro iMTX_MULT 4-5 ; src, table, put, arg, rounder
+ movdqa xmm3, [%1]
+ movdqa xmm0, xmm3
+ pshufd xmm1, xmm3, 0x11 ; 4602
+ punpcklqdq xmm0, xmm0 ; 0246
+ pmaddwd xmm0, [%2]
+ pmaddwd xmm1, [%2+16]
+ pshufd xmm2, xmm3, 0xBB ; 5713
+ punpckhqdq xmm3, xmm3 ; 1357
+ pmaddwd xmm2, [%2+32]
+ pmaddwd xmm3, [%2+48]
+ paddd xmm0, xmm1
+ paddd xmm2, xmm3
+%if %0 == 5
+ paddd xmm0, [walkenIdctRounders+%5]
+%endif
+ movdqa xmm3, xmm2
+ paddd xmm2, xmm0
+ psubd xmm0, xmm3
+ psrad xmm2, 11
+ psrad xmm0, 11
+ packssdw xmm2, xmm0
+ %3 %4
+%endmacro
+
+%macro iLLM_HEAD 0
+ movdqa TAN3, [tan3]
+ movdqa TAN1, [tan1]
+%endmacro
+
+%macro FIRST_HALF 2 ; %1=dct %2=type(normal,add,put)
+ psraw xmm5, 6
+ psraw REG0, 6
+ psraw TAN3, 6
+ psraw xmm3, 6
+ ; dct coeffs must still be written for AC prediction
+%if %2 == 0
+ movdqa [%1+1*16], TAN3
+ movdqa [%1+2*16], xmm3
+ movdqa [%1+5*16], REG0
+ movdqa [%1+6*16], xmm5
+%else
+ ; Must now load args as gprs are no longer used for masks
+ ; DEST is set to where address of dest was loaded
+ %if ARCH_X86_32
+ %if %2 == 2 ; Not enough xmms, store
+ movdqa [%1+1*16], TAN3
+ movdqa [%1+2*16], xmm3
+ movdqa [%1+5*16], REG0
+ movdqa [%1+6*16], xmm5
+ %endif
+ %xdefine DEST r2q ; BLOCK is r0, stride r1
+ movifnidn DEST, destm
+ movifnidn strideq, stridem
+ %else
+ %xdefine DEST r0q
+ %endif
+ lea r3q, [3*strideq]
+ %if %2 == 1
+ packuswb TAN3, xmm3
+ packuswb xmm5, REG0
+ movq [DEST + strideq], TAN3
+ movhps [DEST + 2*strideq], TAN3
+ ; REG0 and TAN3 are now available (and likely used in second half)
+ %endif
+%endif
+%endmacro
+
+%macro SECOND_HALF 6 ; %1=dct %2=type(normal,add,put) 3-6: xmms
+ psraw %3, 6
+ psraw %4, 6
+ psraw %5, 6
+ psraw %6, 6
+ ; dct coeffs must still be written for AC prediction
+%if %2 == 0
+ movdqa [%1+0*16], %3
+ movdqa [%1+3*16], %5
+ movdqa [%1+4*16], %6
+ movdqa [%1+7*16], %4
+%elif %2 == 1
+ packuswb %3, %5
+ packuswb %6, %4
+ ; address of dest may have been loaded
+ movq [DEST], %3
+ movhps [DEST + r3q], %3
+ lea DEST, [DEST + 4*strideq]
+ movq [DEST], %6
+ movhps [DEST + r3q], %6
+ ; and now write remainder of first half
+ movq [DEST + 2*strideq], xmm5
+ movhps [DEST + strideq], xmm5
+%elif %2 == 2
+ pxor xmm0, xmm0
+ %if ARCH_X86_32
+ ; free: m3 REG0=m4 m5
+ ; input: m1, m7, m2, m6
+ movq xmm3, [DEST+0*strideq]
+ movq xmm4, [DEST+1*strideq]
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm4, xmm0
+ paddsw xmm3, %3
+ paddsw xmm4, [%1 + 1*16]
+ movq %3, [DEST+2*strideq]
+ movq xmm5, [DEST+ r3q]
+ punpcklbw %3, xmm0
+ punpcklbw xmm5, xmm0
+ paddsw %3, [%1 + 2*16]
+ paddsw xmm5, %5
+ packuswb xmm3, xmm4
+ packuswb %3, xmm5
+ movq [DEST+0*strideq], xmm3
+ movhps [DEST+1*strideq], xmm3
+ movq [DEST+2*strideq], %3
+ movhps [DEST+ r3q], %3
+ lea DEST, [DEST+4*strideq]
+ movq xmm3, [DEST+0*strideq]
+ movq xmm4, [DEST+1*strideq]
+ movq %3, [DEST+2*strideq]
+ movq xmm5, [DEST+ r3q]
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm4, xmm0
+ punpcklbw %3, xmm0
+ punpcklbw xmm5, xmm0
+ paddsw xmm3, %6
+ paddsw xmm4, [%1 + 5*16]
+ paddsw %3, [%1 + 6*16]
+ paddsw xmm5, %4
+ packuswb xmm3, xmm4
+ packuswb %3, xmm5
+ movq [DEST+0*strideq], xmm3
+ movhps [DEST+1*strideq], xmm3
+ movq [DEST+2*strideq], %3
+ movhps [DEST+ r3q], %3
+ %else
+ ; l1:TAN3=m13 l2:m3 l5:REG0=m8 l6=m5
+ ; input: m1, m7/SREG2=m9, TAN1=m14, REG4=m10
+ movq xmm2, [DEST+0*strideq]
+ movq xmm4, [DEST+1*strideq]
+ movq xmm12, [DEST+2*strideq]
+ movq xmm11, [DEST+ r3q]
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm4, xmm0
+ punpcklbw xmm12, xmm0
+ punpcklbw xmm11, xmm0
+ paddsw xmm2, %3
+ paddsw xmm4, TAN3
+ paddsw xmm12, xmm3
+ paddsw xmm11, %5
+ packuswb xmm2, xmm4
+ packuswb xmm12, xmm11
+ movq [DEST+0*strideq], xmm2
+ movhps [DEST+1*strideq], xmm2
+ movq [DEST+2*strideq], xmm12
+ movhps [DEST+ r3q], xmm12
+ lea DEST, [DEST+4*strideq]
+ movq xmm2, [DEST+0*strideq]
+ movq xmm4, [DEST+1*strideq]
+ movq xmm12, [DEST+2*strideq]
+ movq xmm11, [DEST+ r3q]
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm4, xmm0
+ punpcklbw xmm12, xmm0
+ punpcklbw xmm11, xmm0
+ paddsw xmm2, %6
+ paddsw xmm4, REG0
+ paddsw xmm12, xmm5
+ paddsw xmm11, %4
+ packuswb xmm2, xmm4
+ packuswb xmm12, xmm11
+ movq [DEST+0*strideq], xmm2
+ movhps [DEST+1*strideq], xmm2
+ movq [DEST+2*strideq], xmm12
+ movhps [DEST+ r3q], xmm12
+ %endif
+%endif
+%endmacro
+
+
+; IDCT pass on columns.
+%macro iLLM_PASS 2 ; %1=dct %2=type(normal,add,put)
+ movdqa xmm1, TAN3
+ movdqa xmm3, TAN1
+ pmulhw TAN3, xmm4
+ pmulhw xmm1, xmm5
+ paddsw TAN3, xmm4
+ paddsw xmm1, xmm5
+ psubsw TAN3, xmm5
+ paddsw xmm1, xmm4
+ pmulhw xmm3, xmm7
+ pmulhw TAN1, xmm6
+ paddsw xmm3, xmm6
+ psubsw TAN1, xmm7
+ movdqa xmm7, xmm3
+ movdqa xmm6, TAN1
+ psubsw xmm3, xmm1
+ psubsw TAN1, TAN3
+ paddsw xmm1, xmm7
+ paddsw TAN3, xmm6
+ movdqa xmm6, xmm3
+ psubsw xmm3, TAN3
+ paddsw TAN3, xmm6
+ movdqa xmm4, [sqrt2]
+ pmulhw xmm3, xmm4
+ pmulhw TAN3, xmm4
+ paddsw TAN3, TAN3
+ paddsw xmm3, xmm3
+ movdqa xmm7, [tan2]
+ MOV32 ROW2, REG2
+ MOV32 ROW6, REG6
+ movdqa xmm5, xmm7
+ pmulhw xmm7, REG6
+ pmulhw xmm5, REG2
+ paddsw xmm7, REG2
+ psubsw xmm5, REG6
+ MOV32 ROW0, REG0
+ MOV32 ROW4, REG4
+ MOV32 TAN1, [BLOCK]
+ movdqa XMMS, REG0
+ psubsw REG0, REG4
+ paddsw REG4, XMMS
+ movdqa XMMS, REG4
+ psubsw REG4, xmm7
+ paddsw xmm7, XMMS
+ movdqa XMMS, REG0
+ psubsw REG0, xmm5
+ paddsw xmm5, XMMS
+ movdqa XMMS, xmm5
+ psubsw xmm5, TAN3
+ paddsw TAN3, XMMS
+ movdqa XMMS, REG0
+ psubsw REG0, xmm3
+ paddsw xmm3, XMMS
+ MOV32 [BLOCK], TAN1
+
+ FIRST_HALF %1, %2
+
+ movdqa xmm0, xmm7
+ movdqa xmm4, REG4
+ psubsw xmm7, xmm1
+ psubsw REG4, TAN1
+ paddsw xmm1, xmm0
+ paddsw TAN1, xmm4
+
+ SECOND_HALF %1, %2, xmm1, xmm7, TAN1, REG4
+%endmacro
+
+; IDCT pass on columns, assuming rows 4-7 are zero
+%macro iLLM_PASS_SPARSE 2 ; %1=dct %2=type(normal,put,add)
+ pmulhw TAN3, xmm4
+ paddsw TAN3, xmm4
+ movdqa xmm3, xmm6
+ pmulhw TAN1, xmm6
+ movdqa xmm1, xmm4
+ psubsw xmm3, xmm1
+ paddsw xmm1, xmm6
+ movdqa xmm6, TAN1
+ psubsw TAN1, TAN3
+ paddsw TAN3, xmm6
+ movdqa xmm6, xmm3
+ psubsw xmm3, TAN3
+ paddsw TAN3, xmm6
+ movdqa xmm4, [sqrt2]
+ pmulhw xmm3, xmm4
+ pmulhw TAN3, xmm4
+ paddsw TAN3, TAN3
+ paddsw xmm3, xmm3
+ movdqa xmm5, [tan2]
+ MOV32 ROW2, SREG2
+ pmulhw xmm5, SREG2
+ MOV32 ROW0, REG0
+ movdqa xmm6, REG0
+ psubsw xmm6, SREG2
+ paddsw SREG2, REG0
+ MOV32 TAN1, [BLOCK]
+ movdqa XMMS, REG0
+ psubsw REG0, xmm5
+ paddsw xmm5, XMMS
+ movdqa XMMS, xmm5
+ psubsw xmm5, TAN3
+ paddsw TAN3, XMMS
+ movdqa XMMS, REG0
+ psubsw REG0, xmm3
+ paddsw xmm3, XMMS
+ MOV32 [BLOCK], TAN1
+
+ FIRST_HALF %1, %2
+
+ movdqa xmm0, SREG2
+ movdqa xmm4, xmm6
+ psubsw SREG2, xmm1
+ psubsw xmm6, TAN1
+ paddsw xmm1, xmm0
+ paddsw TAN1, xmm4
+
+ SECOND_HALF %1, %2, xmm1, SREG2, TAN1, xmm6
+%endmacro
+
+%macro IDCT_SSE2 1 ; 0=normal 1=put 2=add
+%if %1 == 0 || ARCH_X86_32
+ %define GPR0 r1d
+ %define GPR1 r2d
+ %define GPR2 r3d
+ %define GPR3 r4d
+ %define NUM_GPRS 5
+%else
+ %define GPR0 r3d
+ %define GPR1 r4d
+ %define GPR2 r5d
+ %define GPR3 r6d
+ %define NUM_GPRS 7
+%endif
+%if %1 == 0
+cglobal xvid_idct, 1, NUM_GPRS, 8+7*ARCH_X86_64, block
+%xdefine BLOCK blockq
+%else
+ %if %1 == 1
+cglobal xvid_idct_put, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
+ %else
+cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
+ %endif
+ %if ARCH_X86_64
+ %xdefine BLOCK blockq
+ %else
+ mov r0q, blockm
+ %xdefine BLOCK r0q
+ %endif
+%endif
+ movq mm0, [pb_127]
+ iMTX_MULT BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
+ iMTX_MULT BLOCK + 1*16, iTab2, PUT_ODD, ROW1, 1*16
+ iMTX_MULT BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16
+
+ TEST_TWO_ROWS BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c
+ JZ GPR0, col1
+ iMTX_MULT BLOCK + 3*16, iTab4, PUT_ODD, ROW3, 3*16
+.col1:
+ TEST_TWO_ROWS BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d
+ TEST_ONE_ROW BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7 ; esi
+
+ iLLM_HEAD
+ JNZ GPR1, 2
+ JNZ GPR0, 3
+ JNZ GPR2, 4
+ JNZ GPR3, 5
+ iLLM_PASS_SPARSE BLOCK, %1
+ jmp .6
+.2:
+ iMTX_MULT BLOCK + 4*16, iTab1, PUT_EVEN, ROW4
+.3:
+ iMTX_MULT BLOCK + 5*16, iTab4, PUT_ODD, ROW5, 4*16
+ JZ GPR2, col2
+.4:
+ iMTX_MULT BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
+.col2:
+ JZ GPR3, col3
+.5:
+ iMTX_MULT BLOCK + 7*16, iTab2, PUT_ODD, ROW7, 5*16
+.col3:
+%if ARCH_X86_32
+ iLLM_HEAD
+%endif
+ iLLM_PASS BLOCK, %1
+.6:
+ RET
+%endmacro
+
+INIT_XMM sse2
+IDCT_SSE2 0
+IDCT_SSE2 1
+IDCT_SSE2 2
+
+%if ARCH_X86_32
+
+; %1=offset %2=tab_offset
+; %3=rnd_offset where 4*8->6*16 5*8->4*16 6/7*8->5*16
+%macro DCT_8_INV_ROW 3
+ movq mm0, [r0+16*%1+0] ; 0 ; x3 x2 x1 x0
+ movq mm1, [r0+16*%1+8] ; 1 ; x7 x6 x5 x4
+ movq mm2, mm0 ; 2 ; x3 x2 x1 x0
+ movq mm3, [%2+ 0] ; 3 ; w06 w04 w02 w00
+%if cpuflag(mmxext)
+ pshufw mm0, mm0, 0x88 ; x2 x0 x2 x0
+ movq mm4, [%2+ 8] ; 4 ; w07 w06 w03 w02
+ movq mm5, mm1 ; 5 ; x7 x6 x5 x4
+ pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00
+ movq mm6, [%2+32] ; 6 ; w21 w20 w17 w16
+ pshufw mm1, mm1, 0x88 ; x6 x4 x6 x4
+ pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02
+ movq mm7, [%2+40] ; 7; w23 w22 w19 w18
+ pshufw mm2, mm2, 0xdd ; x3 x1 x3 x1
+ pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16
+ pshufw mm5, mm5, 0xdd ; x7 x5 x7 x5
+ pmaddwd mm7, mm5 ; x7*w23+x5*w22 x7*w19+x5*w18
+ paddd mm3, [walkenIdctRounders + %3] ; +%3
+ pmaddwd mm0, [%2+16] ; x2*w13+x0*w12 x2*w09+x0*w08
+ paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0)
+ pmaddwd mm1, [%2+24] ; x6*w15+x4*w14 x6*w11+x4*w10
+ movq mm4, mm3 ; 4 ; a1 a0
+ pmaddwd mm2, [%2+48] ; x3*w29+x1*w28 x3*w25+x1*w24
+ paddd mm6, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0)
+ pmaddwd mm5, [%2+56] ; x7*w31+x5*w30 x7*w27+x5*w26
+ paddd mm3, mm6 ; a1+b1 a0+b0
+ paddd mm0, [walkenIdctRounders + %3] ; +%3
+ psrad mm3, 11 ; y1=a1+b1 y0=a0+b0
+ paddd mm0, mm1 ; 1 ; a3=sum(even3) a2=sum(even2)
+ psubd mm4, mm6 ; 6 ; a1-b1 a0-b0
+ movq mm7, mm0 ; 7 ; a3 a2
+ paddd mm2, mm5 ; 5 ; b3=sum(odd3) b2=sum(odd2)
+ paddd mm0, mm2 ; a3+b3 a2+b2
+ psrad mm4, 11 ; y6=a1-b1 y7=a0-b0
+ psubd mm7, mm2 ; 2 ; a3-b3 a2-b2
+ psrad mm0, 11 ; y3=a3+b3 y2=a2+b2
+ psrad mm7, 11 ; y4=a3-b3 y5=a2-b2
+ packssdw mm3, mm0 ; 0 ; y3 y2 y1 y0
+ packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5
+ movq [r0+16*%1+0], mm3 ; 3 ; save y3 y2 y1 y0
+ pshufw mm7, mm7, 0xb1 ; y7 y6 y5 y4
+%else
+ punpcklwd mm0, mm1 ; x5 x1 x4 x0
+ movq mm5, mm0 ; 5 ; x5 x1 x4 x0
+ punpckldq mm0, mm0 ; x4 x0 x4 x0
+ movq mm4, [%2+ 8] ; 4 ; w07 w05 w03 w01
+ punpckhwd mm2, mm1 ; 1 ; x7 x3 x6 x2
+ pmaddwd mm3, mm0 ; x4*w06+x0*w04 x4*w02+x0*w00
+ movq mm6, mm2 ; 6 ; x7 x3 x6 x2
+ movq mm1, [%2+32] ; 1 ; w22 w20 w18 w16
+ punpckldq mm2, mm2 ; x6 x2 x6 x2
+ pmaddwd mm4, mm2 ; x6*w07+x2*w05 x6*w03+x2*w01
+ punpckhdq mm5, mm5 ; x5 x1 x5 x1
+ pmaddwd mm0, [%2+16] ; x4*w14+x0*w12 x4*w10+x0*w08
+ punpckhdq mm6, mm6 ; x7 x3 x7 x3
+ movq mm7, [%2+40] ; 7 ; w23 w21 w19 w17
+ pmaddwd mm1, mm5 ; x5*w22+x1*w20 x5*w18+x1*w16
+ paddd mm3, [walkenIdctRounders + %3] ; +%3
+ pmaddwd mm7, mm6 ; x7*w23+x3*w21 x7*w19+x3*w17
+ pmaddwd mm2, [%2+24] ; x6*w15+x2*w13 x6*w11+x2*w09
+ paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0)
+ pmaddwd mm5, [%2+48] ; x5*w30+x1*w28 x5*w26+x1*w24
+ movq mm4, mm3 ; 4 ; a1 a0
+ pmaddwd mm6, [%2+56] ; x7*w31+x3*w29 x7*w27+x3*w25
+ paddd mm1, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0)
+ paddd mm0, [walkenIdctRounders + %3] ; +%3
+ psubd mm3, mm1 ; a1-b1 a0-b0
+ psrad mm3, 11 ; y6=a1-b1 y7=a0-b0
+ paddd mm1, mm4 ; 4 ; a1+b1 a0+b0
+ paddd mm0, mm2 ; 2 ; a3=sum(even3) a2=sum(even2)
+ psrad mm1, 11 ; y1=a1+b1 y0=a0+b0
+ paddd mm5, mm6 ; 6 ; b3=sum(odd3) b2=sum(odd2)
+ movq mm4, mm0 ; 4 ; a3 a2
+ paddd mm0, mm5 ; a3+b3 a2+b2
+ psubd mm4, mm5 ; 5 ; a3-b3 a2-b2
+ psrad mm0, 11 ; y3=a3+b3 y2=a2+b2
+ psrad mm4, 11 ; y4=a3-b3 y5=a2-b2
+ packssdw mm1, mm0 ; 0 ; y3 y2 y1 y0
+ packssdw mm4, mm3 ; 3 ; y6 y7 y4 y5
+ movq mm7, mm4 ; 7 ; y6 y7 y4 y5
+ psrld mm4, 16 ; 0 y6 0 y4
+ pslld mm7, 16 ; y7 0 y5 0
+ movq [r0+16*%1+0], mm1 ; 1 ; save y3 y2 y1 y0
+ por mm7, mm4 ; 4 ; y7 y6 y5 y4
+%endif
+ movq [r0+16*%1+8], mm7 ; 7 ; save y7 y6 y5 y4
+%endmacro
+
+; -----------------------------------------------------------------------------
+;
+; The first stage DCT 8x8 - forward DCTs of columns
+;
+; The outputs are multiplied
+; for rows 0,4 - by cos_4_16,
+; for rows 1,7 - by cos_1_16,
+; for rows 2,6 - by cos_2_16,
+; for rows 3,5 - by cos_3_16
+; and are shifted to the left for better accuracy
+;
+; -----------------------------------------------------------------------------
+;
+; The 8-point scaled forward DCT algorithm (26a8m)
+;
+; -----------------------------------------------------------------------------
+;
+;#define DCT_8_FRW_COL(x, y)
+; {
+; short t0, t1, t2, t3, t4, t5, t6, t7;
+; short tp03, tm03, tp12, tm12, tp65, tm65;
+; short tp465, tm465, tp765, tm765;
+;
+; t0 = LEFT_SHIFT(x[0] + x[7]);
+; t1 = LEFT_SHIFT(x[1] + x[6]);
+; t2 = LEFT_SHIFT(x[2] + x[5]);
+; t3 = LEFT_SHIFT(x[3] + x[4]);
+; t4 = LEFT_SHIFT(x[3] - x[4]);
+; t5 = LEFT_SHIFT(x[2] - x[5]);
+; t6 = LEFT_SHIFT(x[1] - x[6]);
+; t7 = LEFT_SHIFT(x[0] - x[7]);
+;
+; tp03 = t0 + t3;
+; tm03 = t0 - t3;
+; tp12 = t1 + t2;
+; tm12 = t1 - t2;
+;
+; y[0] = tp03 + tp12;
+; y[4] = tp03 - tp12;
+;
+; y[2] = tm03 + tm12 * tg_2_16;
+; y[6] = tm03 * tg_2_16 - tm12;
+;
+; tp65 = (t6 + t5) * cos_4_16;
+; tm65 = (t6 - t5) * cos_4_16;
+;
+; tp765 = t7 + tp65;
+; tm765 = t7 - tp65;
+; tp465 = t4 + tm65;
+; tm465 = t4 - tm65;
+;
+; y[1] = tp765 + tp465 * tg_1_16;
+; y[7] = tp765 * tg_1_16 - tp465;
+; y[5] = tm765 * tg_3_16 + tm465;
+; y[3] = tm765 - tm465 * tg_3_16;
+; }
+;
+; -----------------------------------------------------------------------------
+
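+; Fixed-point note (a sketch, assuming the tan1/tan2/tan3/sqrt2 tables keep
+; the values of the original C constants): tan1 = 13036 and tan2 = 27146 are
+; tan(pi/16) and tan(2*pi/16) scaled by 1 << 16, tan3 = 43790 encodes
+; tan(3*pi/16) - 1 in the same scale (compensated by a paddsw in the column
+; macro below), and sqrt2 = 23170 is 0.5/sqrt(2) scaled by 1 << 16, so a
+; pmulhw against any of them is an (x * c) >> 16 multiply.
+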
+; -----------------------------------------------------------------------------
+; DCT_8_INV_COL INP (in place, four columns at a time)
+; -----------------------------------------------------------------------------
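+; A scalar sketch of what one DCT_8_INV_COL pass computes per column (names
+; follow the register comments in the macro below; MUL(c, x) denotes a
+; fixed-point multiply by the mathematical constant c, which the asm realises
+; with pmulhw plus the tan3 "-1" and sqrt2 half-value compensations, so this
+; is an outline rather than a drop-in replacement):
+;
+; tp17 = x[1] + MUL(tan1, x[7]);   tm17 = MUL(tan1, x[1]) - x[7];
+; tp35 = x[3] + MUL(tan3, x[5]);   tm35 = MUL(tan3, x[3]) - x[5];
+; b0 = tp17 + tp35;  b3 = tm17 - tm35;  t1 = tp17 - tp35;  t2 = tm17 + tm35;
+; b1 = 2 * MUL(sqrt2, t1 + t2);    b2 = 2 * MUL(sqrt2, t1 - t2);
+; tp26 = x[2] + MUL(tan2, x[6]);   tm26 = MUL(tan2, x[2]) - x[6];
+; tp04 = x[0] + x[4];              tm04 = x[0] - x[4];
+; a0 = tp04 + tp26;  a3 = tp04 - tp26;  a1 = tm04 + tm26;  a2 = tm04 - tm26;
+; y[i]     = (a[i] + b[i]) >> 6;   i = 0..3
+; y[7 - i] = (a[i] - b[i]) >> 6;
+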
+%macro DCT_8_INV_COL 1
+ movq mm0, [tan3]
+ movq mm3, [%1+16*3]
+ movq mm1, mm0 ; tg_3_16
+ movq mm5, [%1+16*5]
+ pmulhw mm0, mm3 ; x3*(tg_3_16-1)
+ movq mm4, [tan1]
+ pmulhw mm1, mm5 ; x5*(tg_3_16-1)
+ movq mm7, [%1+16*7]
+ movq mm2, mm4 ; tg_1_16
+ movq mm6, [%1+16*1]
+ pmulhw mm4, mm7 ; x7*tg_1_16
+ paddsw mm0, mm3 ; x3*tg_3_16
+ pmulhw mm2, mm6 ; x1*tg_1_16
+ paddsw mm1, mm3 ; x3+x5*(tg_3_16-1)
+ psubsw mm0, mm5 ; x3*tg_3_16-x5 = tm35
+ movq mm3, [sqrt2]
+ paddsw mm1, mm5 ; x3+x5*tg_3_16 = tp35
+ paddsw mm4, mm6 ; x1+tg_1_16*x7 = tp17
+ psubsw mm2, mm7 ; x1*tg_1_16-x7 = tm17
+ movq mm5, mm4 ; tp17
+ movq mm6, mm2 ; tm17
+ paddsw mm5, mm1 ; tp17+tp35 = b0
+ psubsw mm6, mm0 ; tm17-tm35 = b3
+ psubsw mm4, mm1 ; tp17-tp35 = t1
+ paddsw mm2, mm0 ; tm17+tm35 = t2
+ movq mm7, [tan2]
+ movq mm1, mm4 ; t1
+ movq [%1+3*16], mm5 ; save b0
+ paddsw mm1, mm2 ; t1+t2
+ movq [%1+5*16], mm6 ; save b3
+ psubsw mm4, mm2 ; t1-t2
+ movq mm5, [%1+2*16]
+ movq mm0, mm7 ; tg_2_16
+ movq mm6, [%1+6*16]
+ pmulhw mm0, mm5 ; x2*tg_2_16
+ pmulhw mm7, mm6 ; x6*tg_2_16
+ pmulhw mm1, mm3 ; ocos_4_16*(t1+t2) = b1/2
+ movq mm2, [%1+0*16]
+ pmulhw mm4, mm3 ; ocos_4_16*(t1-t2) = b2/2
+ psubsw mm0, mm6 ; t2*tg_2_16-x6 = tm26
+ movq mm3, mm2 ; x0
+ movq mm6, [%1+4*16]
+ paddsw mm7, mm5 ; x2+x6*tg_2_16 = tp26
+ paddsw mm2, mm6 ; x0+x4 = tp04
+ psubsw mm3, mm6 ; x0-x4 = tm04
+ movq mm5, mm2 ; tp04
+ movq mm6, mm3 ; tm04
+ psubsw mm2, mm7 ; tp04-tp26 = a3
+ paddsw mm3, mm0 ; tm04+tm26 = a1
+ paddsw mm1, mm1 ; b1
+ paddsw mm4, mm4 ; b2
+ paddsw mm5, mm7 ; tp04+tp26 = a0
+ psubsw mm6, mm0 ; tm04-tm26 = a2
+ movq mm7, mm3 ; a1
+ movq mm0, mm6 ; a2
+ paddsw mm3, mm1 ; a1+b1
+ paddsw mm6, mm4 ; a2+b2
+ psraw mm3, 6 ; dst1
+ psubsw mm7, mm1 ; a1-b1
+ psraw mm6, 6 ; dst2
+ psubsw mm0, mm4 ; a2-b2
+ movq mm1, [%1+3*16] ; load b0
+ psraw mm7, 6 ; dst6
+ movq mm4, mm5 ; a0
+ psraw mm0, 6 ; dst5
+ movq [%1+1*16], mm3
+ paddsw mm5, mm1 ; a0+b0
+ movq [%1+2*16], mm6
+ psubsw mm4, mm1 ; a0-b0
+ movq mm3, [%1+5*16] ; load b3
+ psraw mm5, 6 ; dst0
+ movq mm6, mm2 ; a3
+ psraw mm4, 6 ; dst7
+ movq [%1+5*16], mm0
+ paddsw mm2, mm3 ; a3+b3
+ movq [%1+6*16], mm7
+ psubsw mm6, mm3 ; a3-b3
+ movq [%1+0*16], mm5
+ psraw mm2, 6 ; dst3
+ movq [%1+7*16], mm4
+ psraw mm6, 6 ; dst4
+ movq [%1+3*16], mm2
+ movq [%1+4*16], mm6
+%endmacro
+
+%macro XVID_IDCT_MMX 0
+cglobal xvid_idct, 1, 1, 0, block
+%if cpuflag(mmxext)
+%define TAB tab_i_04_xmm
+%else
+%define TAB tab_i_04_mmx
+%endif
+ ; Process each row - beware of rounder offset
+ DCT_8_INV_ROW 0, TAB + 64 * 0, 0*16
+ DCT_8_INV_ROW 1, TAB + 64 * 1, 1*16
+ DCT_8_INV_ROW 2, TAB + 64 * 2, 2*16
+ DCT_8_INV_ROW 3, TAB + 64 * 3, 3*16
+ DCT_8_INV_ROW 4, TAB + 64 * 0, 6*16
+ DCT_8_INV_ROW 5, TAB + 64 * 3, 4*16
+ DCT_8_INV_ROW 6, TAB + 64 * 2, 5*16
+ DCT_8_INV_ROW 7, TAB + 64 * 1, 5*16
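+    ; Rows 4-7 reuse the weight tables of rows 0-3; their rounder offsets
+    ; follow the original per-row rounder table {65536, 3597, 2260, 1203, 0,
+    ; 120, 512, 512}, hence the non-sequential offsets above.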
+
+ ; Process the columns (4 at a time)
+ DCT_8_INV_COL r0+0
+ DCT_8_INV_COL r0+8
+
+ RET
+%endmacro
+
+INIT_MMX mmx
+XVID_IDCT_MMX
+INIT_MMX mmxext
+XVID_IDCT_MMX
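+
+; The two instantiations above assemble ff_xvid_idct_mmx() and
+; ff_xvid_idct_mmxext(); xvididct_init.c wraps them with the clamped put/add
+; helpers to provide idct_put and idct_add.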
+
+%endif ; ~ARCH_X86_32
diff --git a/libavcodec/x86/xvididct.h b/libavcodec/x86/xvididct.h
index 6640b6b78c..edb5ebfd31 100644
--- a/libavcodec/x86/xvididct.h
+++ b/libavcodec/x86/xvididct.h
@@ -1,20 +1,20 @@
/*
* XVID MPEG-4 VIDEO CODEC
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -38,7 +38,7 @@ void ff_xvid_idct_mmxext_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
void ff_xvid_idct_mmxext_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_xvid_idct_sse2(short *block);
-void ff_xvid_idct_sse2_put(uint8_t *dest, ptrdiff_t line_size, short *block);
-void ff_xvid_idct_sse2_add(uint8_t *dest, ptrdiff_t line_size, short *block);
+void ff_xvid_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, short *block);
+void ff_xvid_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, short *block);
#endif /* AVCODEC_X86_XVIDIDCT_H */
diff --git a/libavcodec/x86/xvididct_init.c b/libavcodec/x86/xvididct_init.c
index e4f7345795..fd10953829 100644
--- a/libavcodec/x86/xvididct_init.c
+++ b/libavcodec/x86/xvididct_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -26,9 +26,36 @@
#include "idctdsp.h"
#include "xvididct.h"
+#if ARCH_X86_32 && HAVE_YASM
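+/* Thin wrappers for the x86_32-only MMX/MMXEXT IDCT: transform the block in
+ * place, then write it out with the shared clamped put/add helpers. */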
+static void xvid_idct_mmx_put(uint8_t *dest, ptrdiff_t line_size, short *block)
+{
+ ff_xvid_idct_mmx(block);
+ ff_put_pixels_clamped(block, dest, line_size);
+}
+
+static void xvid_idct_mmx_add(uint8_t *dest, ptrdiff_t line_size, short *block)
+{
+ ff_xvid_idct_mmx(block);
+ ff_add_pixels_clamped(block, dest, line_size);
+}
+
+static void xvid_idct_mmxext_put(uint8_t *dest, ptrdiff_t line_size, short *block)
+{
+ ff_xvid_idct_mmxext(block);
+ ff_put_pixels_clamped(block, dest, line_size);
+}
+
+static void xvid_idct_mmxext_add(uint8_t *dest, ptrdiff_t line_size, short *block)
+{
+ ff_xvid_idct_mmxext(block);
+ ff_add_pixels_clamped(block, dest, line_size);
+}
+#endif
+
av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
+#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
if (high_bit_depth ||
@@ -36,24 +63,27 @@ av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
avctx->idct_algo == FF_IDCT_XVID))
return;
- if (INLINE_MMX(cpu_flags)) {
- c->idct_put = ff_xvid_idct_mmx_put;
- c->idct_add = ff_xvid_idct_mmx_add;
+#if ARCH_X86_32
+ if (EXTERNAL_MMX(cpu_flags)) {
+ c->idct_put = xvid_idct_mmx_put;
+ c->idct_add = xvid_idct_mmx_add;
c->idct = ff_xvid_idct_mmx;
c->perm_type = FF_IDCT_PERM_NONE;
}
- if (INLINE_MMXEXT(cpu_flags)) {
- c->idct_put = ff_xvid_idct_mmxext_put;
- c->idct_add = ff_xvid_idct_mmxext_add;
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ c->idct_put = xvid_idct_mmxext_put;
+ c->idct_add = xvid_idct_mmxext_add;
c->idct = ff_xvid_idct_mmxext;
c->perm_type = FF_IDCT_PERM_NONE;
}
+#endif
- if (INLINE_SSE2(cpu_flags)) {
- c->idct_put = ff_xvid_idct_sse2_put;
- c->idct_add = ff_xvid_idct_sse2_add;
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->idct_put = ff_xvid_idct_put_sse2;
+ c->idct_add = ff_xvid_idct_add_sse2;
c->idct = ff_xvid_idct_sse2;
c->perm_type = FF_IDCT_PERM_SSE2;
}
+#endif /* HAVE_YASM */
}
diff --git a/libavcodec/x86/xvididct_mmx.c b/libavcodec/x86/xvididct_mmx.c
deleted file mode 100644
index 9bb407cd70..0000000000
--- a/libavcodec/x86/xvididct_mmx.c
+++ /dev/null
@@ -1,548 +0,0 @@
-/*
- * XVID MPEG-4 VIDEO CODEC
- * - MMX and XMM forward discrete cosine transform -
- *
- * Copyright(C) 2001 Peter Ross <pross@xvid.org>
- *
- * Originally provided by Intel at AP-922
- * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
- * (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
- * but in a limited edition.
- * New macro implements a column part for precise iDCT
- * The routine precision now satisfies IEEE standard 1180-1990.
- *
- * Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
- * Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
- *
- * http://www.elecard.com/peter/idct.html
- * http://www.linuxvideo.org/mpeg2dec/
- *
- * These examples contain code fragments for first stage iDCT 8x8
- * (for rows) and first stage DCT 8x8 (for columns)
- *
- * conversion to gcc syntax by Michael Niedermayer
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with Libav; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <inttypes.h>
-
-#include "config.h"
-
-#include "libavutil/mem.h"
-
-#include "libavcodec/avcodec.h"
-
-#include "idctdsp.h"
-#include "xvididct.h"
-
-#if HAVE_MMX_INLINE
-
-// -----------------------------------------------------------------------------
-// Various memory constants (trigonometric values or rounding values)
-// -----------------------------------------------------------------------------
-
-DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4 * 4] = {
- 13036, 13036, 13036, 13036, // tg * (2 << 16) + 0.5
- 27146, 27146, 27146, 27146, // tg * (2 << 16) + 0.5
- -21746, -21746, -21746, -21746, // tg * (2 << 16) + 0.5
- 23170, 23170, 23170, 23170
-}; // cos * (2 << 15) + 0.5
-
-DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2 * 8] = {
- 65536, 65536,
- 3597, 3597,
- 2260, 2260,
- 1203, 1203,
- 0, 0,
- 120, 120,
- 512, 512,
- 512, 512
-};
-
-// -----------------------------------------------------------------------------
-//
-// The first stage iDCT 8x8 - inverse DCTs of rows
-//
-// -----------------------------------------------------------------------------
-// The 8-point inverse DCT direct algorithm
-// -----------------------------------------------------------------------------
-//
-// static const short w[32] = {
-// FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16),
-// FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
-// FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16),
-// FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16),
-// FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16),
-// FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
-// FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16),
-// FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) };
-//
-// #define DCT_8_INV_ROW(x, y)
-// {
-// int a0, a1, a2, a3, b0, b1, b2, b3;
-//
-// a0 = x[0] * w[0] + x[2] * w[1] + x[4] * w[2] + x[6] * w[3];
-// a1 = x[0] * w[4] + x[2] * w[5] + x[4] * w[6] + x[6] * w[7];
-// a2 = x[0] * w[8] + x[2] * w[9] + x[4] * w[10] + x[6] * w[11];
-// a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
-// b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
-// b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
-// b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
-// b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
-//
-// y[0] = SHIFT_ROUND(a0 + b0);
-// y[1] = SHIFT_ROUND(a1 + b1);
-// y[2] = SHIFT_ROUND(a2 + b2);
-// y[3] = SHIFT_ROUND(a3 + b3);
-// y[4] = SHIFT_ROUND(a3 - b3);
-// y[5] = SHIFT_ROUND(a2 - b2);
-// y[6] = SHIFT_ROUND(a1 - b1);
-// y[7] = SHIFT_ROUND(a0 - b0);
-// }
-//
-// -----------------------------------------------------------------------------
-//
-// In this implementation the outputs of the iDCT-1D are multiplied
-// for rows 0,4 - by cos_4_16,
-// for rows 1,7 - by cos_1_16,
-// for rows 2,6 - by cos_2_16,
-// for rows 3,5 - by cos_3_16
-// and are shifted to the left for better accuracy.
-//
-// For the constants used,
-// FIX(float_const) = (short) (float_const * (1 << 15) + 0.5)
-//
-// -----------------------------------------------------------------------------
-
-// -----------------------------------------------------------------------------
-// Tables for mmx processors
-// -----------------------------------------------------------------------------
-
-// Table for rows 0,4 - constants are multiplied by cos_4_16
-DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmx)[32 * 4] = {
- 16384, 16384, 16384, -16384, // movq-> w06 w04 w02 w00
- 21407, 8867, 8867, -21407, // w07 w05 w03 w01
- 16384, -16384, 16384, 16384, // w14 w12 w10 w08
- -8867, 21407, -21407, -8867, // w15 w13 w11 w09
- 22725, 12873, 19266, -22725, // w22 w20 w18 w16
- 19266, 4520, -4520, -12873, // w23 w21 w19 w17
- 12873, 4520, 4520, 19266, // w30 w28 w26 w24
- -22725, 19266, -12873, -22725, // w31 w29 w27 w25
-// Table for rows 1,7 - constants are multiplied by cos_1_16
- 22725, 22725, 22725, -22725, // movq-> w06 w04 w02 w00
- 29692, 12299, 12299, -29692, // w07 w05 w03 w01
- 22725, -22725, 22725, 22725, // w14 w12 w10 w08
- -12299, 29692, -29692, -12299, // w15 w13 w11 w09
- 31521, 17855, 26722, -31521, // w22 w20 w18 w16
- 26722, 6270, -6270, -17855, // w23 w21 w19 w17
- 17855, 6270, 6270, 26722, // w30 w28 w26 w24
- -31521, 26722, -17855, -31521, // w31 w29 w27 w25
-// Table for rows 2,6 - constants are multiplied by cos_2_16
- 21407, 21407, 21407, -21407, // movq-> w06 w04 w02 w00
- 27969, 11585, 11585, -27969, // w07 w05 w03 w01
- 21407, -21407, 21407, 21407, // w14 w12 w10 w08
- -11585, 27969, -27969, -11585, // w15 w13 w11 w09
- 29692, 16819, 25172, -29692, // w22 w20 w18 w16
- 25172, 5906, -5906, -16819, // w23 w21 w19 w17
- 16819, 5906, 5906, 25172, // w30 w28 w26 w24
- -29692, 25172, -16819, -29692, // w31 w29 w27 w25
-// Table for rows 3,5 - constants are multiplied by cos_3_16
- 19266, 19266, 19266, -19266, // movq-> w06 w04 w02 w00
- 25172, 10426, 10426, -25172, // w07 w05 w03 w01
- 19266, -19266, 19266, 19266, // w14 w12 w10 w08
- -10426, 25172, -25172, -10426, // w15 w13 w11 w09
- 26722, 15137, 22654, -26722, // w22 w20 w18 w16
- 22654, 5315, -5315, -15137, // w23 w21 w19 w17
- 15137, 5315, 5315, 22654, // w30 w28 w26 w24
- -26722, 22654, -15137, -26722, // w31 w29 w27 w25
-};
-// -----------------------------------------------------------------------------
-// Tables for xmm processors
-// -----------------------------------------------------------------------------
-
-// %3 for rows 0,4 - constants are multiplied by cos_4_16
-DECLARE_ALIGNED(8, static const int16_t, tab_i_04_xmm)[32 * 4] = {
- 16384, 21407, 16384, 8867, // movq-> w05 w04 w01 w00
- 16384, 8867, -16384, -21407, // w07 w06 w03 w02
- 16384, -8867, 16384, -21407, // w13 w12 w09 w08
- -16384, 21407, 16384, -8867, // w15 w14 w11 w10
- 22725, 19266, 19266, -4520, // w21 w20 w17 w16
- 12873, 4520, -22725, -12873, // w23 w22 w19 w18
- 12873, -22725, 4520, -12873, // w29 w28 w25 w24
- 4520, 19266, 19266, -22725, // w31 w30 w27 w26
-// %3 for rows 1,7 - constants are multiplied by cos_1_16
- 22725, 29692, 22725, 12299, // movq-> w05 w04 w01 w00
- 22725, 12299, -22725, -29692, // w07 w06 w03 w02
- 22725, -12299, 22725, -29692, // w13 w12 w09 w08
- -22725, 29692, 22725, -12299, // w15 w14 w11 w10
- 31521, 26722, 26722, -6270, // w21 w20 w17 w16
- 17855, 6270, -31521, -17855, // w23 w22 w19 w18
- 17855, -31521, 6270, -17855, // w29 w28 w25 w24
- 6270, 26722, 26722, -31521, // w31 w30 w27 w26
-// %3 for rows 2,6 - constants are multiplied by cos_2_16
- 21407, 27969, 21407, 11585, // movq-> w05 w04 w01 w00
- 21407, 11585, -21407, -27969, // w07 w06 w03 w02
- 21407, -11585, 21407, -27969, // w13 w12 w09 w08
- -21407, 27969, 21407, -11585, // w15 w14 w11 w10
- 29692, 25172, 25172, -5906, // w21 w20 w17 w16
- 16819, 5906, -29692, -16819, // w23 w22 w19 w18
- 16819, -29692, 5906, -16819, // w29 w28 w25 w24
- 5906, 25172, 25172, -29692, // w31 w30 w27 w26
-// %3 for rows 3,5 - constants are multiplied by cos_3_16
- 19266, 25172, 19266, 10426, // movq-> w05 w04 w01 w00
- 19266, 10426, -19266, -25172, // w07 w06 w03 w02
- 19266, -10426, 19266, -25172, // w13 w12 w09 w08
- -19266, 25172, 19266, -10426, // w15 w14 w11 w10
- 26722, 22654, 22654, -5315, // w21 w20 w17 w16
- 15137, 5315, -26722, -15137, // w23 w22 w19 w18
- 15137, -26722, 5315, -15137, // w29 w28 w25 w24
- 5315, 22654, 22654, -26722, // w31 w30 w27 w26
-};
-// =============================================================================
-// Helper macros for the code
-// =============================================================================
-
-// -----------------------------------------------------------------------------
-// DCT_8_INV_ROW_MMX( INP, OUT, TABLE, ROUNDER
-// -----------------------------------------------------------------------------
-
-#define DCT_8_INV_ROW_MMX(A1, A2, A3, A4) \
- "movq "#A1", %%mm0 \n\t" /* 0 ; x3 x2 x1 x0 */ \
- "movq 8+"#A1", %%mm1 \n\t" /* 1 ; x7 x6 x5 x4 */ \
- "movq %%mm0, %%mm2 \n\t" /* 2 ; x3 x2 x1 x0 */ \
- "movq "#A3", %%mm3 \n\t" /* 3 ; w06 w04 w02 w00 */ \
- "punpcklwd %%mm1, %%mm0 \n\t" /* x5 x1 x4 x0 */ \
- "movq %%mm0, %%mm5 \n\t" /* 5 ; x5 x1 x4 x0 */ \
- "punpckldq %%mm0, %%mm0 \n\t" /* x4 x0 x4 x0 */ \
- "movq 8+"#A3", %%mm4 \n\t" /* 4 ; w07 w05 w03 w01 */ \
- "punpckhwd %%mm1, %%mm2 \n\t" /* 1 ; x7 x3 x6 x2 */ \
- "pmaddwd %%mm0, %%mm3 \n\t" /* x4*w06+x0*w04 x4*w02+x0*w00 */ \
- "movq %%mm2, %%mm6 \n\t" /* 6 ; x7 x3 x6 x2 */ \
- "movq 32+"#A3", %%mm1 \n\t" /* 1 ; w22 w20 w18 w16 */ \
- "punpckldq %%mm2, %%mm2 \n\t" /* x6 x2 x6 x2 */ \
- "pmaddwd %%mm2, %%mm4 \n\t" /* x6*w07+x2*w05 x6*w03+x2*w01 */ \
- "punpckhdq %%mm5, %%mm5 \n\t" /* x5 x1 x5 x1 */ \
- "pmaddwd 16+"#A3", %%mm0 \n\t" /* x4*w14+x0*w12 x4*w10+x0*w08 */ \
- "punpckhdq %%mm6, %%mm6 \n\t" /* x7 x3 x7 x3 */ \
- "movq 40+ "#A3", %%mm7 \n\t" /* 7 ; w23 w21 w19 w17 */ \
- "pmaddwd %%mm5, %%mm1 \n\t" /* x5*w22+x1*w20 x5*w18+x1*w16 */ \
- "paddd "#A4", %%mm3 \n\t" /* +%4 */ \
- "pmaddwd %%mm6, %%mm7 \n\t" /* x7*w23+x3*w21 x7*w19+x3*w17 */ \
- "pmaddwd 24+"#A3", %%mm2 \n\t" /* x6*w15+x2*w13 x6*w11+x2*w09 */ \
- "paddd %%mm4, %%mm3 \n\t" /* 4 ; a1=sum(even1) a0=sum(even0) */ \
- "pmaddwd 48+"#A3", %%mm5 \n\t" /* x5*w30+x1*w28 x5*w26+x1*w24 */ \
- "movq %%mm3, %%mm4 \n\t" /* 4 ; a1 a0 */ \
- "pmaddwd 56+"#A3", %%mm6 \n\t" /* x7*w31+x3*w29 x7*w27+x3*w25 */ \
- "paddd %%mm7, %%mm1 \n\t" /* 7 ; b1=sum(odd1) b0=sum(odd0) */ \
- "paddd "#A4", %%mm0 \n\t" /* +%4 */ \
- "psubd %%mm1, %%mm3 \n\t" /* a1-b1 a0-b0 */ \
- "psrad $11, %%mm3 \n\t" /* y6=a1-b1 y7=a0-b0 */ \
- "paddd %%mm4, %%mm1 \n\t" /* 4 ; a1+b1 a0+b0 */ \
- "paddd %%mm2, %%mm0 \n\t" /* 2 ; a3=sum(even3) a2=sum(even2) */ \
- "psrad $11, %%mm1 \n\t" /* y1=a1+b1 y0=a0+b0 */ \
- "paddd %%mm6, %%mm5 \n\t" /* 6 ; b3=sum(odd3) b2=sum(odd2) */ \
- "movq %%mm0, %%mm4 \n\t" /* 4 ; a3 a2 */ \
- "paddd %%mm5, %%mm0 \n\t" /* a3+b3 a2+b2 */ \
- "psubd %%mm5, %%mm4 \n\t" /* 5 ; a3-b3 a2-b2 */ \
- "psrad $11, %%mm0 \n\t" /* y3=a3+b3 y2=a2+b2 */ \
- "psrad $11, %%mm4 \n\t" /* y4=a3-b3 y5=a2-b2 */ \
- "packssdw %%mm0, %%mm1 \n\t" /* 0 ; y3 y2 y1 y0 */ \
- "packssdw %%mm3, %%mm4 \n\t" /* 3 ; y6 y7 y4 y5 */ \
- "movq %%mm4, %%mm7 \n\t" /* 7 ; y6 y7 y4 y5 */ \
- "psrld $16, %%mm4 \n\t" /* 0 y6 0 y4 */ \
- "pslld $16, %%mm7 \n\t" /* y7 0 y5 0 */ \
- "movq %%mm1, "#A2" \n\t" /* 1 ; save y3 y2 y1 y0 */ \
- "por %%mm4, %%mm7 \n\t" /* 4 ; y7 y6 y5 y4 */ \
- "movq %%mm7, 8+"#A2" \n\t" /* 7 ; save y7 y6 y5 y4 */ \
-
-
-// -----------------------------------------------------------------------------
-// DCT_8_INV_ROW_XMM( INP, OUT, TABLE, ROUNDER
-// -----------------------------------------------------------------------------
-
-#define DCT_8_INV_ROW_XMM(A1, A2, A3, A4) \
- "movq "#A1", %%mm0 \n\t" /* 0 ; x3 x2 x1 x0 */ \
- "movq 8+"#A1", %%mm1 \n\t" /* 1 ; x7 x6 x5 x4 */ \
- "movq %%mm0, %%mm2 \n\t" /* 2 ; x3 x2 x1 x0 */ \
- "movq "#A3", %%mm3 \n\t" /* 3 ; w05 w04 w01 w00 */ \
- "pshufw $0x88, %%mm0, %%mm0 \n\t" /* x2 x0 x2 x0 */ \
- "movq 8+"#A3", %%mm4 \n\t" /* 4 ; w07 w06 w03 w02 */ \
- "movq %%mm1, %%mm5 \n\t" /* 5 ; x7 x6 x5 x4 */ \
- "pmaddwd %%mm0, %%mm3 \n\t" /* x2*w05+x0*w04 x2*w01+x0*w00 */ \
- "movq 32+"#A3", %%mm6 \n\t" /* 6 ; w21 w20 w17 w16 */ \
- "pshufw $0x88, %%mm1, %%mm1 \n\t" /* x6 x4 x6 x4 */ \
- "pmaddwd %%mm1, %%mm4 \n\t" /* x6*w07+x4*w06 x6*w03+x4*w02 */ \
- "movq 40+"#A3", %%mm7 \n\t" /* 7; w23 w22 w19 w18 */ \
- "pshufw $0xdd, %%mm2, %%mm2 \n\t" /* x3 x1 x3 x1 */ \
- "pmaddwd %%mm2, %%mm6 \n\t" /* x3*w21+x1*w20 x3*w17+x1*w16 */ \
- "pshufw $0xdd, %%mm5, %%mm5 \n\t" /* x7 x5 x7 x5 */ \
- "pmaddwd %%mm5, %%mm7 \n\t" /* x7*w23+x5*w22 x7*w19+x5*w18 */ \
- "paddd "#A4", %%mm3 \n\t" /* +%4 */ \
- "pmaddwd 16+"#A3", %%mm0 \n\t" /* x2*w13+x0*w12 x2*w09+x0*w08 */ \
- "paddd %%mm4, %%mm3 \n\t" /* 4 ; a1=sum(even1) a0=sum(even0) */ \
- "pmaddwd 24+"#A3", %%mm1 \n\t" /* x6*w15+x4*w14 x6*w11+x4*w10 */ \
- "movq %%mm3, %%mm4 \n\t" /* 4 ; a1 a0 */ \
- "pmaddwd 48+"#A3", %%mm2 \n\t" /* x3*w29+x1*w28 x3*w25+x1*w24 */ \
- "paddd %%mm7, %%mm6 \n\t" /* 7 ; b1=sum(odd1) b0=sum(odd0) */ \
- "pmaddwd 56+"#A3", %%mm5 \n\t" /* x7*w31+x5*w30 x7*w27+x5*w26 */ \
- "paddd %%mm6, %%mm3 \n\t" /* a1+b1 a0+b0 */ \
- "paddd "#A4", %%mm0 \n\t" /* +%4 */ \
- "psrad $11, %%mm3 \n\t" /* y1=a1+b1 y0=a0+b0 */ \
- "paddd %%mm1, %%mm0 \n\t" /* 1 ; a3=sum(even3) a2=sum(even2) */ \
- "psubd %%mm6, %%mm4 \n\t" /* 6 ; a1-b1 a0-b0 */ \
- "movq %%mm0, %%mm7 \n\t" /* 7 ; a3 a2 */ \
- "paddd %%mm5, %%mm2 \n\t" /* 5 ; b3=sum(odd3) b2=sum(odd2) */ \
- "paddd %%mm2, %%mm0 \n\t" /* a3+b3 a2+b2 */ \
- "psrad $11, %%mm4 \n\t" /* y6=a1-b1 y7=a0-b0 */ \
- "psubd %%mm2, %%mm7 \n\t" /* 2 ; a3-b3 a2-b2 */ \
- "psrad $11, %%mm0 \n\t" /* y3=a3+b3 y2=a2+b2 */ \
- "psrad $11, %%mm7 \n\t" /* y4=a3-b3 y5=a2-b2 */ \
- "packssdw %%mm0, %%mm3 \n\t" /* 0 ; y3 y2 y1 y0 */ \
- "packssdw %%mm4, %%mm7 \n\t" /* 4 ; y6 y7 y4 y5 */ \
- "movq %%mm3, "#A2" \n\t" /* 3 ; save y3 y2 y1 y0 */ \
- "pshufw $0xb1, %%mm7, %%mm7 \n\t" /* y7 y6 y5 y4 */ \
- "movq %%mm7, 8+"#A2" \n\t" /* 7 ; save y7 y6 y5 y4 */ \
-
-
-// -----------------------------------------------------------------------------
-//
-// The first stage DCT 8x8 - forward DCTs of columns
-//
-// The %2puts are multiplied
-// for rows 0,4 - on cos_4_16,
-// for rows 1,7 - on cos_1_16,
-// for rows 2,6 - on cos_2_16,
-// for rows 3,5 - on cos_3_16
-// and are shifted to the left for rise of accuracy
-//
-// -----------------------------------------------------------------------------
-//
-// The 8-point scaled forward DCT algorithm (26a8m)
-//
-// -----------------------------------------------------------------------------
-//
-//#define DCT_8_FRW_COL(x, y)
-// {
-// short t0, t1, t2, t3, t4, t5, t6, t7;
-// short tp03, tm03, tp12, tm12, tp65, tm65;
-// short tp465, tm465, tp765, tm765;
-//
-// t0 = LEFT_SHIFT(x[0] + x[7]);
-// t1 = LEFT_SHIFT(x[1] + x[6]);
-// t2 = LEFT_SHIFT(x[2] + x[5]);
-// t3 = LEFT_SHIFT(x[3] + x[4]);
-// t4 = LEFT_SHIFT(x[3] - x[4]);
-// t5 = LEFT_SHIFT(x[2] - x[5]);
-// t6 = LEFT_SHIFT(x[1] - x[6]);
-// t7 = LEFT_SHIFT(x[0] - x[7]);
-//
-// tp03 = t0 + t3;
-// tm03 = t0 - t3;
-// tp12 = t1 + t2;
-// tm12 = t1 - t2;
-//
-// y[0] = tp03 + tp12;
-// y[4] = tp03 - tp12;
-//
-// y[2] = tm03 + tm12 * tg_2_16;
-// y[6] = tm03 * tg_2_16 - tm12;
-//
-// tp65 = (t6 + t5) * cos_4_16;
-// tm65 = (t6 - t5) * cos_4_16;
-//
-// tp765 = t7 + tp65;
-// tm765 = t7 - tp65;
-// tp465 = t4 + tm65;
-// tm465 = t4 - tm65;
-//
-// y[1] = tp765 + tp465 * tg_1_16;
-// y[7] = tp765 * tg_1_16 - tp465;
-// y[5] = tm765 * tg_3_16 + tm465;
-// y[3] = tm765 - tm465 * tg_3_16;
-// }
-//
-// -----------------------------------------------------------------------------
-
-// -----------------------------------------------------------------------------
-// DCT_8_INV_COL_4 INP,OUT
-// -----------------------------------------------------------------------------
-
-#define DCT_8_INV_COL(A1, A2) \
- "movq 2*8(%3), %%mm0 \n\t" \
- "movq 16*3+"#A1", %%mm3 \n\t" \
- "movq %%mm0, %%mm1 \n\t" /* tg_3_16 */ \
- "movq 16*5+"#A1", %%mm5 \n\t" \
- "pmulhw %%mm3, %%mm0 \n\t" /* x3*(tg_3_16-1) */ \
- "movq (%3), %%mm4 \n\t" \
- "pmulhw %%mm5, %%mm1 \n\t" /* x5*(tg_3_16-1) */ \
- "movq 16*7+"#A1", %%mm7 \n\t" \
- "movq %%mm4, %%mm2 \n\t" /* tg_1_16 */ \
- "movq 16*1+"#A1", %%mm6 \n\t" \
- "pmulhw %%mm7, %%mm4 \n\t" /* x7*tg_1_16 */ \
- "paddsw %%mm3, %%mm0 \n\t" /* x3*tg_3_16 */ \
- "pmulhw %%mm6, %%mm2 \n\t" /* x1*tg_1_16 */ \
- "paddsw %%mm3, %%mm1 \n\t" /* x3+x5*(tg_3_16-1) */ \
- "psubsw %%mm5, %%mm0 \n\t" /* x3*tg_3_16-x5 = tm35 */ \
- "movq 3*8(%3), %%mm3 \n\t" \
- "paddsw %%mm5, %%mm1 \n\t" /* x3+x5*tg_3_16 = tp35 */ \
- "paddsw %%mm6, %%mm4 \n\t" /* x1+tg_1_16*x7 = tp17 */ \
- "psubsw %%mm7, %%mm2 \n\t" /* x1*tg_1_16-x7 = tm17 */ \
- "movq %%mm4, %%mm5 \n\t" /* tp17 */ \
- "movq %%mm2, %%mm6 \n\t" /* tm17 */ \
- "paddsw %%mm1, %%mm5 \n\t" /* tp17+tp35 = b0 */ \
- "psubsw %%mm0, %%mm6 \n\t" /* tm17-tm35 = b3 */ \
- "psubsw %%mm1, %%mm4 \n\t" /* tp17-tp35 = t1 */ \
- "paddsw %%mm0, %%mm2 \n\t" /* tm17+tm35 = t2 */ \
- "movq 1*8(%3), %%mm7 \n\t" \
- "movq %%mm4, %%mm1 \n\t" /* t1 */ \
- "movq %%mm5, 3*16+"#A2" \n\t" /* save b0 */ \
- "paddsw %%mm2, %%mm1 \n\t" /* t1+t2 */ \
- "movq %%mm6, 5*16+"#A2" \n\t" /* save b3 */ \
- "psubsw %%mm2, %%mm4 \n\t" /* t1-t2 */ \
- "movq 2*16+"#A1", %%mm5 \n\t" \
- "movq %%mm7, %%mm0 \n\t" /* tg_2_16 */ \
- "movq 6*16+"#A1", %%mm6 \n\t" \
- "pmulhw %%mm5, %%mm0 \n\t" /* x2*tg_2_16 */ \
- "pmulhw %%mm6, %%mm7 \n\t" /* x6*tg_2_16 */ \
- "pmulhw %%mm3, %%mm1 \n\t" /* ocos_4_16*(t1+t2) = b1/2 */ \
- "movq 0*16+"#A1", %%mm2 \n\t" \
- "pmulhw %%mm3, %%mm4 \n\t" /* ocos_4_16*(t1-t2) = b2/2 */ \
- "psubsw %%mm6, %%mm0 \n\t" /* t2*tg_2_16-x6 = tm26 */ \
- "movq %%mm2, %%mm3 \n\t" /* x0 */ \
- "movq 4*16+"#A1", %%mm6 \n\t" \
- "paddsw %%mm5, %%mm7 \n\t" /* x2+x6*tg_2_16 = tp26 */ \
- "paddsw %%mm6, %%mm2 \n\t" /* x0+x4 = tp04 */ \
- "psubsw %%mm6, %%mm3 \n\t" /* x0-x4 = tm04 */ \
- "movq %%mm2, %%mm5 \n\t" /* tp04 */ \
- "movq %%mm3, %%mm6 \n\t" /* tm04 */ \
- "psubsw %%mm7, %%mm2 \n\t" /* tp04-tp26 = a3 */ \
- "paddsw %%mm0, %%mm3 \n\t" /* tm04+tm26 = a1 */ \
- "paddsw %%mm1, %%mm1 \n\t" /* b1 */ \
- "paddsw %%mm4, %%mm4 \n\t" /* b2 */ \
- "paddsw %%mm7, %%mm5 \n\t" /* tp04+tp26 = a0 */ \
- "psubsw %%mm0, %%mm6 \n\t" /* tm04-tm26 = a2 */ \
- "movq %%mm3, %%mm7 \n\t" /* a1 */ \
- "movq %%mm6, %%mm0 \n\t" /* a2 */ \
- "paddsw %%mm1, %%mm3 \n\t" /* a1+b1 */ \
- "paddsw %%mm4, %%mm6 \n\t" /* a2+b2 */ \
- "psraw $6, %%mm3 \n\t" /* dst1 */ \
- "psubsw %%mm1, %%mm7 \n\t" /* a1-b1 */ \
- "psraw $6, %%mm6 \n\t" /* dst2 */ \
- "psubsw %%mm4, %%mm0 \n\t" /* a2-b2 */ \
- "movq 3*16+"#A2", %%mm1 \n\t" /* load b0 */ \
- "psraw $6, %%mm7 \n\t" /* dst6 */ \
- "movq %%mm5, %%mm4 \n\t" /* a0 */ \
- "psraw $6, %%mm0 \n\t" /* dst5 */ \
- "movq %%mm3, 1*16+"#A2" \n\t" \
- "paddsw %%mm1, %%mm5 \n\t" /* a0+b0 */ \
- "movq %%mm6, 2*16+"#A2" \n\t" \
- "psubsw %%mm1, %%mm4 \n\t" /* a0-b0 */ \
- "movq 5*16+"#A2", %%mm3 \n\t" /* load b3 */ \
- "psraw $6, %%mm5 \n\t" /* dst0 */ \
- "movq %%mm2, %%mm6 \n\t" /* a3 */ \
- "psraw $6, %%mm4 \n\t" /* dst7 */ \
- "movq %%mm0, 5*16+"#A2" \n\t" \
- "paddsw %%mm3, %%mm2 \n\t" /* a3+b3 */ \
- "movq %%mm7, 6*16+"#A2" \n\t" \
- "psubsw %%mm3, %%mm6 \n\t" /* a3-b3 */ \
- "movq %%mm5, 0*16+"#A2" \n\t" \
- "psraw $6, %%mm2 \n\t" /* dst3 */ \
- "movq %%mm4, 7*16+"#A2" \n\t" \
- "psraw $6, %%mm6 \n\t" /* dst4 */ \
- "movq %%mm2, 3*16+"#A2" \n\t" \
- "movq %%mm6, 4*16+"#A2" \n\t" \
-
-// =============================================================================
-// Code
-// =============================================================================
-
-// -----------------------------------------------------------------------------
-// void idct_mmx(uint16_t block[64]);
-// -----------------------------------------------------------------------------
-
-void ff_xvid_idct_mmx(short *block)
-{
- __asm__ volatile (
- // # Process each row
- DCT_8_INV_ROW_MMX(0 * 16(%0), 0 * 16(%0), 64 * 0(%2), 8 * 0(%1))
- DCT_8_INV_ROW_MMX(1 * 16(%0), 1 * 16(%0), 64 * 1(%2), 8 * 1(%1))
- DCT_8_INV_ROW_MMX(2 * 16(%0), 2 * 16(%0), 64 * 2(%2), 8 * 2(%1))
- DCT_8_INV_ROW_MMX(3 * 16(%0), 3 * 16(%0), 64 * 3(%2), 8 * 3(%1))
- DCT_8_INV_ROW_MMX(4 * 16(%0), 4 * 16(%0), 64 * 0(%2), 8 * 4(%1))
- DCT_8_INV_ROW_MMX(5 * 16(%0), 5 * 16(%0), 64 * 3(%2), 8 * 5(%1))
- DCT_8_INV_ROW_MMX(6 * 16(%0), 6 * 16(%0), 64 * 2(%2), 8 * 6(%1))
- DCT_8_INV_ROW_MMX(7 * 16(%0), 7 * 16(%0), 64 * 1(%2), 8 * 7(%1))
-
- // # Process the columns (4 at a time)
- DCT_8_INV_COL(0(%0), 0(%0))
- DCT_8_INV_COL(8(%0), 8(%0))
- :: "r" (block), "r" (rounder_0), "r" (tab_i_04_mmx), "r" (tg_1_16));
-}
-
-void ff_xvid_idct_mmx_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
-{
- ff_xvid_idct_mmx(block);
- ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_xvid_idct_mmx_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
-{
- ff_xvid_idct_mmx(block);
- ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_MMX_INLINE */
-
-#if HAVE_MMXEXT_INLINE
-
-// -----------------------------------------------------------------------------
-// void idct_xmm(uint16_t block[64]);
-// -----------------------------------------------------------------------------
-
-void ff_xvid_idct_mmxext(short *block)
-{
- __asm__ volatile (
- // # Process each row
- DCT_8_INV_ROW_XMM(0 * 16(%0), 0 * 16(%0), 64 * 0(%2), 8 * 0(%1))
- DCT_8_INV_ROW_XMM(1 * 16(%0), 1 * 16(%0), 64 * 1(%2), 8 * 1(%1))
- DCT_8_INV_ROW_XMM(2 * 16(%0), 2 * 16(%0), 64 * 2(%2), 8 * 2(%1))
- DCT_8_INV_ROW_XMM(3 * 16(%0), 3 * 16(%0), 64 * 3(%2), 8 * 3(%1))
- DCT_8_INV_ROW_XMM(4 * 16(%0), 4 * 16(%0), 64 * 0(%2), 8 * 4(%1))
- DCT_8_INV_ROW_XMM(5 * 16(%0), 5 * 16(%0), 64 * 3(%2), 8 * 5(%1))
- DCT_8_INV_ROW_XMM(6 * 16(%0), 6 * 16(%0), 64 * 2(%2), 8 * 6(%1))
- DCT_8_INV_ROW_XMM(7 * 16(%0), 7 * 16(%0), 64 * 1(%2), 8 * 7(%1))
-
- // # Process the columns (4 at a time)
- DCT_8_INV_COL(0(%0), 0(%0))
- DCT_8_INV_COL(8(%0), 8(%0))
- :: "r" (block), "r" (rounder_0), "r" (tab_i_04_xmm), "r" (tg_1_16));
-}
-
-void ff_xvid_idct_mmxext_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
-{
- ff_xvid_idct_mmxext(block);
- ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_xvid_idct_mmxext_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
-{
- ff_xvid_idct_mmxext(block);
- ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_MMXEXT_INLINE */
diff --git a/libavcodec/x86/xvididct_sse2.c b/libavcodec/x86/xvididct_sse2.c
deleted file mode 100644
index f318e95999..0000000000
--- a/libavcodec/x86/xvididct_sse2.c
+++ /dev/null
@@ -1,405 +0,0 @@
-/*
- * XVID MPEG-4 VIDEO CODEC
- * - SSE2 inverse discrete cosine transform -
- *
- * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
- *
- * Conversion to gcc syntax with modifications
- * by Alexander Strange <astrange@ithinksw.com>
- *
- * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
- *
- * This file is part of Libav.
- *
- * Vertical pass is an implementation of the scheme:
- * Loeffler C., Ligtenberg A., and Moschytz C.S.:
- * Practical Fast 1D DCT Algorithm with Eleven Multiplications,
- * Proc. ICASSP 1989, 988-991.
- *
- * Horizontal pass is a double 4x4 vector/matrix multiplication,
- * (see also Intel's Application Note 922:
- * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
- * Copyright (C) 1999 Intel Corporation)
- *
- * More details at http://skal.planet-d.net/coding/dct.html
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with Libav; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/internal.h"
-#include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
-
-#include "idctdsp.h"
-#include "xvididct.h"
-
-#if HAVE_SSE2_INLINE
-
-/**
- * @file
- * @brief SSE2 IDCT compatible with the Xvid IDCT
- */
-
-#define X8(x) x, x, x, x, x, x, x, x
-
-DECLARE_ASM_CONST(16, int16_t, tan1)[] = { X8(13036) }; // tan( pi/16)
-DECLARE_ASM_CONST(16, int16_t, tan2)[] = { X8(27146) }; // tan(2pi/16) = sqrt(2)-1
-DECLARE_ASM_CONST(16, int16_t, tan3)[] = { X8(43790) }; // tan(3pi/16)-1
-DECLARE_ASM_CONST(16, int16_t, sqrt2)[] = { X8(23170) }; // 0.5/sqrt(2)
-DECLARE_ASM_CONST(8, uint8_t, m127)[] = { X8(127) };
-
-DECLARE_ASM_CONST(16, int16_t, iTab1)[] = {
- 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
- 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
- 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
- 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab2)[] = {
- 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
- 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
- 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
- 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab3)[] = {
- 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
- 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
- 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
- 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab4)[] = {
- 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
- 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
- 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
- 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
-};
-
-DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
- 65536, 65536, 65536, 65536,
- 3597, 3597, 3597, 3597,
- 2260, 2260, 2260, 2260,
- 1203, 1203, 1203, 1203,
- 120, 120, 120, 120,
- 512, 512, 512, 512
-};
-
-// Temporary storage before the column pass
-#define ROW1 "%%xmm6"
-#define ROW3 "%%xmm4"
-#define ROW5 "%%xmm5"
-#define ROW7 "%%xmm7"
-
-#define CLEAR_ODD(r) "pxor "r","r" \n\t"
-#define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t"
-
-#if ARCH_X86_64
-
-# define ROW0 "%%xmm8"
-# define REG0 ROW0
-# define ROW2 "%%xmm9"
-# define REG2 ROW2
-# define ROW4 "%%xmm10"
-# define REG4 ROW4
-# define ROW6 "%%xmm11"
-# define REG6 ROW6
-# define CLEAR_EVEN(r) CLEAR_ODD(r)
-# define PUT_EVEN(dst) PUT_ODD(dst)
-# define XMMS "%%xmm12"
-# define MOV_32_ONLY "#"
-# define SREG2 REG2
-# define TAN3 "%%xmm13"
-# define TAN1 "%%xmm14"
-
-#else
-
-# define ROW0 "(%0)"
-# define REG0 "%%xmm4"
-# define ROW2 "2*16(%0)"
-# define REG2 "%%xmm4"
-# define ROW4 "4*16(%0)"
-# define REG4 "%%xmm6"
-# define ROW6 "6*16(%0)"
-# define REG6 "%%xmm6"
-# define CLEAR_EVEN(r)
-# define PUT_EVEN(dst) \
- "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \
- "movdqa %%xmm2, "dst" \n\t"
-# define XMMS "%%xmm2"
-# define MOV_32_ONLY "movdqa "
-# define SREG2 "%%xmm7"
-# define TAN3 "%%xmm0"
-# define TAN1 "%%xmm2"
-
-#endif
-
-#define ROUND(x) "paddd "MANGLE(x)
-
-#define JZ(reg, to) \
- "testl "reg","reg" \n\t" \
- "jz "to" \n\t"
-
-#define JNZ(reg, to) \
- "testl "reg","reg" \n\t" \
- "jnz "to" \n\t"
-
-#define TEST_ONE_ROW(src, reg, clear) \
- clear \
- "movq "src", %%mm1 \n\t" \
- "por 8+"src", %%mm1 \n\t" \
- "paddusb %%mm0, %%mm1 \n\t" \
- "pmovmskb %%mm1, "reg" \n\t"
-
-#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
- clear1 \
- clear2 \
- "movq "row1", %%mm1 \n\t" \
- "por 8+"row1", %%mm1 \n\t" \
- "movq "row2", %%mm2 \n\t" \
- "por 8+"row2", %%mm2 \n\t" \
- "paddusb %%mm0, %%mm1 \n\t" \
- "paddusb %%mm0, %%mm2 \n\t" \
- "pmovmskb %%mm1, "reg1" \n\t" \
- "pmovmskb %%mm2, "reg2" \n\t"
-
-/// IDCT pass on rows.
-#define iMTX_MULT(src, table, rounder, put) \
- "movdqa "src", %%xmm3 \n\t" \
- "movdqa %%xmm3, %%xmm0 \n\t" \
- "pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \
- "punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \
- "pmaddwd "table", %%xmm0 \n\t" \
- "pmaddwd 16+"table", %%xmm1 \n\t" \
- "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \
- "punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \
- "pmaddwd 32+"table", %%xmm2 \n\t" \
- "pmaddwd 48+"table", %%xmm3 \n\t" \
- "paddd %%xmm1, %%xmm0 \n\t" \
- "paddd %%xmm3, %%xmm2 \n\t" \
- rounder", %%xmm0 \n\t" \
- "movdqa %%xmm2, %%xmm3 \n\t" \
- "paddd %%xmm0, %%xmm2 \n\t" \
- "psubd %%xmm3, %%xmm0 \n\t" \
- "psrad $11, %%xmm2 \n\t" \
- "psrad $11, %%xmm0 \n\t" \
- "packssdw %%xmm0, %%xmm2 \n\t" \
- put \
- "1: \n\t"
-
-#define iLLM_HEAD \
- "movdqa "MANGLE(tan3)", "TAN3" \n\t" \
- "movdqa "MANGLE(tan1)", "TAN1" \n\t" \
-
-/// IDCT pass on columns.
-#define iLLM_PASS(dct) \
- "movdqa "TAN3", %%xmm1 \n\t" \
- "movdqa "TAN1", %%xmm3 \n\t" \
- "pmulhw %%xmm4, "TAN3" \n\t" \
- "pmulhw %%xmm5, %%xmm1 \n\t" \
- "paddsw %%xmm4, "TAN3" \n\t" \
- "paddsw %%xmm5, %%xmm1 \n\t" \
- "psubsw %%xmm5, "TAN3" \n\t" \
- "paddsw %%xmm4, %%xmm1 \n\t" \
- "pmulhw %%xmm7, %%xmm3 \n\t" \
- "pmulhw %%xmm6, "TAN1" \n\t" \
- "paddsw %%xmm6, %%xmm3 \n\t" \
- "psubsw %%xmm7, "TAN1" \n\t" \
- "movdqa %%xmm3, %%xmm7 \n\t" \
- "movdqa "TAN1", %%xmm6 \n\t" \
- "psubsw %%xmm1, %%xmm3 \n\t" \
- "psubsw "TAN3", "TAN1" \n\t" \
- "paddsw %%xmm7, %%xmm1 \n\t" \
- "paddsw %%xmm6, "TAN3" \n\t" \
- "movdqa %%xmm3, %%xmm6 \n\t" \
- "psubsw "TAN3", %%xmm3 \n\t" \
- "paddsw %%xmm6, "TAN3" \n\t" \
- "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
- "pmulhw %%xmm4, %%xmm3 \n\t" \
- "pmulhw %%xmm4, "TAN3" \n\t" \
- "paddsw "TAN3", "TAN3" \n\t" \
- "paddsw %%xmm3, %%xmm3 \n\t" \
- "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \
- MOV_32_ONLY ROW2", "REG2" \n\t" \
- MOV_32_ONLY ROW6", "REG6" \n\t" \
- "movdqa %%xmm7, %%xmm5 \n\t" \
- "pmulhw "REG6", %%xmm7 \n\t" \
- "pmulhw "REG2", %%xmm5 \n\t" \
- "paddsw "REG2", %%xmm7 \n\t" \
- "psubsw "REG6", %%xmm5 \n\t" \
- MOV_32_ONLY ROW0", "REG0" \n\t" \
- MOV_32_ONLY ROW4", "REG4" \n\t" \
- MOV_32_ONLY" "TAN1", (%0) \n\t" \
- "movdqa "REG0", "XMMS" \n\t" \
- "psubsw "REG4", "REG0" \n\t" \
- "paddsw "XMMS", "REG4" \n\t" \
- "movdqa "REG4", "XMMS" \n\t" \
- "psubsw %%xmm7, "REG4" \n\t" \
- "paddsw "XMMS", %%xmm7 \n\t" \
- "movdqa "REG0", "XMMS" \n\t" \
- "psubsw %%xmm5, "REG0" \n\t" \
- "paddsw "XMMS", %%xmm5 \n\t" \
- "movdqa %%xmm5, "XMMS" \n\t" \
- "psubsw "TAN3", %%xmm5 \n\t" \
- "paddsw "XMMS", "TAN3" \n\t" \
- "movdqa "REG0", "XMMS" \n\t" \
- "psubsw %%xmm3, "REG0" \n\t" \
- "paddsw "XMMS", %%xmm3 \n\t" \
- MOV_32_ONLY" (%0), "TAN1" \n\t" \
- "psraw $6, %%xmm5 \n\t" \
- "psraw $6, "REG0" \n\t" \
- "psraw $6, "TAN3" \n\t" \
- "psraw $6, %%xmm3 \n\t" \
- "movdqa "TAN3", 1*16("dct") \n\t" \
- "movdqa %%xmm3, 2*16("dct") \n\t" \
- "movdqa "REG0", 5*16("dct") \n\t" \
- "movdqa %%xmm5, 6*16("dct") \n\t" \
- "movdqa %%xmm7, %%xmm0 \n\t" \
- "movdqa "REG4", %%xmm4 \n\t" \
- "psubsw %%xmm1, %%xmm7 \n\t" \
- "psubsw "TAN1", "REG4" \n\t" \
- "paddsw %%xmm0, %%xmm1 \n\t" \
- "paddsw %%xmm4, "TAN1" \n\t" \
- "psraw $6, %%xmm1 \n\t" \
- "psraw $6, %%xmm7 \n\t" \
- "psraw $6, "TAN1" \n\t" \
- "psraw $6, "REG4" \n\t" \
- "movdqa %%xmm1, ("dct") \n\t" \
- "movdqa "TAN1", 3*16("dct") \n\t" \
- "movdqa "REG4", 4*16("dct") \n\t" \
- "movdqa %%xmm7, 7*16("dct") \n\t"
-
-/// IDCT pass on columns, assuming rows 4-7 are zero.
-#define iLLM_PASS_SPARSE(dct) \
- "pmulhw %%xmm4, "TAN3" \n\t" \
- "paddsw %%xmm4, "TAN3" \n\t" \
- "movdqa %%xmm6, %%xmm3 \n\t" \
- "pmulhw %%xmm6, "TAN1" \n\t" \
- "movdqa %%xmm4, %%xmm1 \n\t" \
- "psubsw %%xmm1, %%xmm3 \n\t" \
- "paddsw %%xmm6, %%xmm1 \n\t" \
- "movdqa "TAN1", %%xmm6 \n\t" \
- "psubsw "TAN3", "TAN1" \n\t" \
- "paddsw %%xmm6, "TAN3" \n\t" \
- "movdqa %%xmm3, %%xmm6 \n\t" \
- "psubsw "TAN3", %%xmm3 \n\t" \
- "paddsw %%xmm6, "TAN3" \n\t" \
- "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
- "pmulhw %%xmm4, %%xmm3 \n\t" \
- "pmulhw %%xmm4, "TAN3" \n\t" \
- "paddsw "TAN3", "TAN3" \n\t" \
- "paddsw %%xmm3, %%xmm3 \n\t" \
- "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \
- MOV_32_ONLY ROW2", "SREG2" \n\t" \
- "pmulhw "SREG2", %%xmm5 \n\t" \
- MOV_32_ONLY ROW0", "REG0" \n\t" \
- "movdqa "REG0", %%xmm6 \n\t" \
- "psubsw "SREG2", %%xmm6 \n\t" \
- "paddsw "REG0", "SREG2" \n\t" \
- MOV_32_ONLY" "TAN1", (%0) \n\t" \
- "movdqa "REG0", "XMMS" \n\t" \
- "psubsw %%xmm5, "REG0" \n\t" \
- "paddsw "XMMS", %%xmm5 \n\t" \
- "movdqa %%xmm5, "XMMS" \n\t" \
- "psubsw "TAN3", %%xmm5 \n\t" \
- "paddsw "XMMS", "TAN3" \n\t" \
- "movdqa "REG0", "XMMS" \n\t" \
- "psubsw %%xmm3, "REG0" \n\t" \
- "paddsw "XMMS", %%xmm3 \n\t" \
- MOV_32_ONLY" (%0), "TAN1" \n\t" \
- "psraw $6, %%xmm5 \n\t" \
- "psraw $6, "REG0" \n\t" \
- "psraw $6, "TAN3" \n\t" \
- "psraw $6, %%xmm3 \n\t" \
- "movdqa "TAN3", 1*16("dct") \n\t" \
- "movdqa %%xmm3, 2*16("dct") \n\t" \
- "movdqa "REG0", 5*16("dct") \n\t" \
- "movdqa %%xmm5, 6*16("dct") \n\t" \
- "movdqa "SREG2", %%xmm0 \n\t" \
- "movdqa %%xmm6, %%xmm4 \n\t" \
- "psubsw %%xmm1, "SREG2" \n\t" \
- "psubsw "TAN1", %%xmm6 \n\t" \
- "paddsw %%xmm0, %%xmm1 \n\t" \
- "paddsw %%xmm4, "TAN1" \n\t" \
- "psraw $6, %%xmm1 \n\t" \
- "psraw $6, "SREG2" \n\t" \
- "psraw $6, "TAN1" \n\t" \
- "psraw $6, %%xmm6 \n\t" \
- "movdqa %%xmm1, ("dct") \n\t" \
- "movdqa "TAN1", 3*16("dct") \n\t" \
- "movdqa %%xmm6, 4*16("dct") \n\t" \
- "movdqa "SREG2", 7*16("dct") \n\t"
-
-inline void ff_xvid_idct_sse2(short *block)
-{
- __asm__ volatile (
- "movq "MANGLE (m127) ", %%mm0 \n\t"
- iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))
- iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders + 1 * 16), PUT_ODD(ROW1))
- iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders + 2 * 16), PUT_EVEN(ROW2))
-
- TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
- JZ("%%eax", "1f")
- iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders + 3 * 16), PUT_ODD(ROW3))
-
- TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
- TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
- iLLM_HEAD
- ".p2align 4 \n\t"
- JNZ("%%ecx", "2f")
- JNZ("%%eax", "3f")
- JNZ("%%edx", "4f")
- JNZ("%%esi", "5f")
- iLLM_PASS_SPARSE("%0")
- "jmp 6f \n\t"
- "2: \n\t"
- iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
- "3: \n\t"
- iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders + 4 * 16), PUT_ODD(ROW5))
- JZ("%%edx", "1f")
- "4: \n\t"
- iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders + 5 * 16), PUT_EVEN(ROW6))
- JZ("%%esi", "1f")
- "5: \n\t"
- iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders + 5 * 16), PUT_ODD(ROW7))
-#if ARCH_X86_32
- iLLM_HEAD
-#endif
- iLLM_PASS("%0")
- "6: \n\t"
- : "+r" (block)
- :
- : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7", )
-#if ARCH_X86_64
- XMM_CLOBBERS("%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", )
-#endif
- "%eax", "%ecx", "%edx", "%esi", "memory");
-}
-
-void ff_xvid_idct_sse2_put(uint8_t *dest, ptrdiff_t line_size, short *block)
-{
- ff_xvid_idct_sse2(block);
- ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_xvid_idct_sse2_add(uint8_t *dest, ptrdiff_t line_size, short *block)
-{
- ff_xvid_idct_sse2(block);
- ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_SSE2_INLINE */