diff options
-rw-r--r-- | cmdutils.c | 1 | ||||
-rw-r--r-- | libavcodec/Makefile | 7 | ||||
-rw-r--r-- | libavcodec/dirac.c | 122 | ||||
-rw-r--r-- | libavcodec/dnxhdenc.c | 5 | ||||
-rw-r--r-- | libavcodec/dnxhdenc.h | 2 | ||||
-rw-r--r-- | libavcodec/mpegvideo.c | 4 | ||||
-rw-r--r-- | libavcodec/mpegvideo.h | 3 | ||||
-rw-r--r-- | libavcodec/mpegvideo_enc.c | 3 | ||||
-rw-r--r-- | libavcodec/svq13.c | 68 | ||||
-rw-r--r-- | libavcodec/svq1dec.c | 45 | ||||
-rw-r--r-- | libavcodec/x86/Makefile | 5 | ||||
-rw-r--r-- | libavcodec/x86/dnxhdenc.c (renamed from libavcodec/x86/dnxhd_mmx.c) | 2 | ||||
-rw-r--r-- | libavcodec/x86/mpegvideo.c (renamed from libavcodec/x86/mpegvideo_mmx.c) | 65 | ||||
-rw-r--r-- | libavcodec/x86/mpegvideoenc.c | 96 | ||||
-rw-r--r-- | libavcodec/x86/mpegvideoenc_template.c (renamed from libavcodec/x86/mpegvideo_mmx_template.c) | 0 | ||||
-rw-r--r-- | libavresample/x86/audio_convert.asm | 957 | ||||
-rw-r--r-- | libavresample/x86/audio_convert_init.c | 174 | ||||
-rw-r--r-- | libavresample/x86/util.asm | 6 | ||||
-rw-r--r-- | libavutil/x86/x86util.asm | 12 |
19 files changed, 1397 insertions, 180 deletions
diff --git a/cmdutils.c b/cmdutils.c index ed856301d7..f6f258a9d9 100644 --- a/cmdutils.c +++ b/cmdutils.c @@ -301,6 +301,7 @@ int parse_option(void *optctx, const char *opt, const char *arg, if (po->flags & OPT_STRING) { char *str; str = av_strdup(arg); +// av_freep(dst); *(char **)dst = str; } else if (po->flags & OPT_BOOL) { *(int *)dst = bool_val; diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 450086d29a..c64175d66f 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -405,14 +405,13 @@ OBJS-$(CONFIG_SUBRIP_ENCODER) += srtenc.o ass_split.o OBJS-$(CONFIG_SUBVIEWER_DECODER) += subviewerdec.o ass.o OBJS-$(CONFIG_SUNRAST_DECODER) += sunrast.o OBJS-$(CONFIG_SUNRAST_ENCODER) += sunrastenc.o -OBJS-$(CONFIG_SVQ1_DECODER) += svq1dec.o svq1.o h263.o +OBJS-$(CONFIG_SVQ1_DECODER) += svq1dec.o svq1.o svq13.o h263.o OBJS-$(CONFIG_SVQ1_ENCODER) += svq1enc.o svq1.o \ h263.o ituh263enc.o -OBJS-$(CONFIG_SVQ3_DECODER) += h264.o svq3.o \ +OBJS-$(CONFIG_SVQ3_DECODER) += svq3.o svq13.o h263.o h264.o \ h264_loopfilter.o h264_direct.o \ h264_sei.o h264_ps.o h264_refs.o \ - h264_cavlc.o h264_cabac.o cabac.o \ - svq1dec.o svq1.o h263.o + h264_cavlc.o h264_cabac.o cabac.o OBJS-$(CONFIG_TARGA_DECODER) += targa.o OBJS-$(CONFIG_TARGA_ENCODER) += targaenc.o rle.o OBJS-$(CONFIG_THEORA_DECODER) += xiph.o diff --git a/libavcodec/dirac.c b/libavcodec/dirac.c index 3aa65e3c4d..bab3980382 100644 --- a/libavcodec/dirac.c +++ b/libavcodec/dirac.c @@ -57,10 +57,7 @@ static const dirac_source_params dirac_source_parameters_defaults[] = { { 7680, 4320, 1, 0, 1, 6, 1, 3840, 2160, 0, 0, 3, 3 }, }; -/** - * Dirac Specification -> - * Table 10.4 - Available preset pixel aspect ratio values - */ +/* [DIRAC_STD] Table 10.4 - Available preset pixel aspect ratio values */ static const AVRational dirac_preset_aspect_ratios[] = { {1, 1}, {10, 11}, @@ -70,19 +67,16 @@ static const AVRational dirac_preset_aspect_ratios[] = { {4, 3}, }; -/** - * Dirac Specification -> - * Values 9,10 of 
10.3.5 Frame Rate. Table 10.3 Available preset frame rate values +/* [DIRAC_STD] Values 9,10 of 10.3.5 Frame Rate. + * Table 10.3 Available preset frame rate values */ static const AVRational dirac_frame_rate[] = { {15000, 1001}, {25, 2}, }; -/** - * Dirac Specification -> - * This should be equivalent to Table 10.5 Available signal range presets - */ +/* [DIRAC_STD] This should be equivalent to Table 10.5 Available signal + * range presets */ static const struct { uint8_t bitdepth; enum AVColorRange color_range; @@ -111,19 +105,14 @@ static const struct { { AVCOL_PRI_BT709, AVCOL_SPC_BT709, AVCOL_TRC_UNSPECIFIED /* DCinema */ }, }; -/** - * Dirac Specification -> - * Table 10.2 Supported chroma sampling formats + Luma Offset - */ +/* [DIRAC_STD] Table 10.2 Supported chroma sampling formats + luma Offset */ static const enum PixelFormat dirac_pix_fmt[2][3] = { { PIX_FMT_YUV444P, PIX_FMT_YUV422P, PIX_FMT_YUV420P }, { PIX_FMT_YUVJ444P, PIX_FMT_YUVJ422P, PIX_FMT_YUVJ420P }, }; -/** - * Dirac Specification -> - * 10.3 Parse Source Parameters. source_parameters(base_video_format) - */ +/* [DIRAC_STD] 10.3 Parse Source Parameters. + * source_parameters(base_video_format) */ static int parse_source_parameters(AVCodecContext *avctx, GetBitContext *gb, dirac_source_params *source) { @@ -132,15 +121,18 @@ static int parse_source_parameters(AVCodecContext *avctx, GetBitContext *gb, int idx; /* [DIRAC_STD] 10.3.2 Frame size. frame_size(video_params) */ - if (get_bits1(gb)) { /* [DIRAC_STD] custom_dimensions_flag */ - source->width = svq3_get_ue_golomb(gb); /* [DIRAC_STD] FRAME_WIDTH */ - source->height = svq3_get_ue_golomb(gb); /* [DIRAC_STD] FRAME_HEIGHT */ + /* [DIRAC_STD] custom_dimensions_flag */ + if (get_bits1(gb)) { + source->width = svq3_get_ue_golomb(gb); /* [DIRAC_STD] FRAME_WIDTH */ + source->height = svq3_get_ue_golomb(gb); /* [DIRAC_STD] FRAME_HEIGHT */ } /* [DIRAC_STD] 10.3.3 Chroma Sampling Format. 
- chroma_sampling_format(video_params) */ - if (get_bits1(gb)) /* [DIRAC_STD] custom_chroma_format_flag */ - source->chroma_format = svq3_get_ue_golomb(gb); /*[DIRAC_STD] CHROMA_FORMAT_INDEX */ + * chroma_sampling_format(video_params) */ + /* [DIRAC_STD] custom_chroma_format_flag */ + if (get_bits1(gb)) + /* [DIRAC_STD] CHROMA_FORMAT_INDEX */ + source->chroma_format = svq3_get_ue_golomb(gb); if (source->chroma_format > 2U) { av_log(avctx, AV_LOG_ERROR, "Unknown chroma format %d\n", source->chroma_format); @@ -148,8 +140,10 @@ static int parse_source_parameters(AVCodecContext *avctx, GetBitContext *gb, } /* [DIRAC_STD] 10.3.4 Scan Format. scan_format(video_params) */ - if (get_bits1(gb)) /* [DIRAC_STD] custom_scan_format_flag */ - source->interlaced = svq3_get_ue_golomb(gb); /* [DIRAC_STD] SOURCE_SAMPLING */ + /* [DIRAC_STD] custom_scan_format_flag */ + if (get_bits1(gb)) + /* [DIRAC_STD] SOURCE_SAMPLING */ + source->interlaced = svq3_get_ue_golomb(gb); if (source->interlaced > 1U) return AVERROR_INVALIDDATA; @@ -160,23 +154,29 @@ static int parse_source_parameters(AVCodecContext *avctx, GetBitContext *gb, if (source->frame_rate_index > 10U) return AVERROR_INVALIDDATA; - if (!source->frame_rate_index){ - frame_rate.num = svq3_get_ue_golomb(gb); /* [DIRAC_STD] FRAME_RATE_NUMER */ - frame_rate.den = svq3_get_ue_golomb(gb); /* [DIRAC_STD] FRAME_RATE_DENOM */ + if (!source->frame_rate_index) { + /* [DIRAC_STD] FRAME_RATE_NUMER */ + frame_rate.num = svq3_get_ue_golomb(gb); + /* [DIRAC_STD] FRAME_RATE_DENOM */ + frame_rate.den = svq3_get_ue_golomb(gb); } } - if (source->frame_rate_index > 0) { /* [DIRAC_STD] preset_frame_rate(video_params,index) */ + /* [DIRAC_STD] preset_frame_rate(video_params, index) */ + if (source->frame_rate_index > 0) { if (source->frame_rate_index <= 8) frame_rate = avpriv_frame_rate_tab[source->frame_rate_index]; /* [DIRAC_STD] Table 10.3 values 1-8 */ else - frame_rate = dirac_frame_rate[source->frame_rate_index-9]; /* [DIRAC_STD] Table 10.3 
values 9-10 */ + /* [DIRAC_STD] Table 10.3 values 9-10 */ + frame_rate = dirac_frame_rate[source->frame_rate_index-9]; } av_reduce(&avctx->time_base.num, &avctx->time_base.den, frame_rate.den, frame_rate.num, 1<<30); - /* [DIRAC_STD] 10.3.6 Pixel Aspect Ratio. pixel_aspect_ratio(video_params) */ + /* [DIRAC_STD] 10.3.6 Pixel Aspect Ratio. + * pixel_aspect_ratio(video_params) */ if (get_bits1(gb)) { /* [DIRAC_STD] custom_pixel_aspect_ratio_flag */ - source->aspect_ratio_index = svq3_get_ue_golomb(gb); /* [DIRAC_STD] index */ + /* [DIRAC_STD] index */ + source->aspect_ratio_index = svq3_get_ue_golomb(gb); if (source->aspect_ratio_index > 6U) return AVERROR_INVALIDDATA; @@ -186,22 +186,30 @@ static int parse_source_parameters(AVCodecContext *avctx, GetBitContext *gb, avctx->sample_aspect_ratio.den = svq3_get_ue_golomb(gb); } } - if (source->aspect_ratio_index > 0) /* [DIRAC_STD] Take value from Table 10.4 Available preset pixel aspect ratio values */ + /* [DIRAC_STD] Take value from Table 10.4 Available preset pixel + * aspect ratio values */ + if (source->aspect_ratio_index > 0) avctx->sample_aspect_ratio = dirac_preset_aspect_ratios[source->aspect_ratio_index-1]; /* [DIRAC_STD] 10.3.7 Clean area. 
clean_area(video_params) */ if (get_bits1(gb)) { /* [DIRAC_STD] custom_clean_area_flag */ - source->clean_width = svq3_get_ue_golomb(gb); /* [DIRAC_STD] CLEAN_WIDTH */ - source->clean_height = svq3_get_ue_golomb(gb); /* [DIRAC_STD] CLEAN_HEIGHT */ - source->clean_left_offset = svq3_get_ue_golomb(gb); /* [DIRAC_STD] CLEAN_LEFT_OFFSET */ - source->clean_right_offset = svq3_get_ue_golomb(gb); /* [DIRAC_STD] CLEAN_RIGHT_OFFSET */ + /* [DIRAC_STD] CLEAN_WIDTH */ + source->clean_width = svq3_get_ue_golomb(gb); + /* [DIRAC_STD] CLEAN_HEIGHT */ + source->clean_height = svq3_get_ue_golomb(gb); + /* [DIRAC_STD] CLEAN_LEFT_OFFSET */ + source->clean_left_offset = svq3_get_ue_golomb(gb); + /* [DIRAC_STD] CLEAN_RIGHT_OFFSET */ + source->clean_right_offset = svq3_get_ue_golomb(gb); } - /*[DIRAC_STD] 10.3.8 Signal range. signal_range(video_params) - WARNING: Some adaptation seemed to be done using the AVCOL_RANGE_MPEG/JPEG values */ - if (get_bits1(gb)) { /*[DIRAC_STD] custom_signal_range_flag */ - source->pixel_range_index = svq3_get_ue_golomb(gb); /*[DIRAC_STD] index */ + /* [DIRAC_STD] 10.3.8 Signal range. signal_range(video_params) + * WARNING: Some adaptation seems to be done using the + * AVCOL_RANGE_MPEG/JPEG values */ + if (get_bits1(gb)) { /* [DIRAC_STD] custom_signal_range_flag */ + /* [DIRAC_STD] index */ + source->pixel_range_index = svq3_get_ue_golomb(gb); if (source->pixel_range_index > 4U) return AVERROR_INVALIDDATA; @@ -210,13 +218,14 @@ static int parse_source_parameters(AVCodecContext *avctx, GetBitContext *gb, if (!source->pixel_range_index) { luma_offset = svq3_get_ue_golomb(gb); luma_depth = av_log2(svq3_get_ue_golomb(gb))+1; - svq3_get_ue_golomb(gb); /* chroma offset @Jordi: Why are these two ignored? */ + svq3_get_ue_golomb(gb); /* chroma offset */ svq3_get_ue_golomb(gb); /* chroma excursion */ - avctx->color_range = luma_offset ? 
AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG; } } - if (source->pixel_range_index > 0) { /*[DIRAC_STD] Take values from Table 10.5 Available signal range presets */ + /* [DIRAC_STD] Table 10.5 + * Available signal range presets <--> pixel_range_presets */ + if (source->pixel_range_index > 0) { idx = source->pixel_range_index-1; luma_depth = pixel_range_presets[idx].bitdepth; avctx->color_range = pixel_range_presets[idx].color_range; @@ -229,7 +238,8 @@ static int parse_source_parameters(AVCodecContext *avctx, GetBitContext *gb, /* [DIRAC_STD] 10.3.9 Colour specification. colour_spec(video_params) */ if (get_bits1(gb)) { /* [DIRAC_STD] custom_colour_spec_flag */ - idx = source->color_spec_index = svq3_get_ue_golomb(gb); /* [DIRAC_STD] index */ + /* [DIRAC_STD] index */ + idx = source->color_spec_index = svq3_get_ue_golomb(gb); if (source->color_spec_index > 4U) return AVERROR_INVALIDDATA; @@ -239,13 +249,13 @@ static int parse_source_parameters(AVCodecContext *avctx, GetBitContext *gb, avctx->color_trc = dirac_color_presets[idx].color_trc; if (!source->color_spec_index) { - /* [DIRAC_STD] 10.3.9.1 Color primaries */ + /* [DIRAC_STD] 10.3.9.1 Colour primaries */ if (get_bits1(gb)) { idx = svq3_get_ue_golomb(gb); if (idx < 3U) avctx->color_primaries = dirac_primaries[idx]; } - /* [DIRAC_STD] 10.3.9.2 Color matrix */ + /* [DIRAC_STD] 10.3.9.2 Colour matrix */ if (get_bits1(gb)) { idx = svq3_get_ue_golomb(gb); if (!idx) @@ -267,10 +277,7 @@ static int parse_source_parameters(AVCodecContext *avctx, GetBitContext *gb, return 0; } -/** - * Dirac Specification -> - * 10. Sequence Header. sequence_header() - */ +/* [DIRAC_STD] 10. Sequence Header. 
sequence_header() */ int avpriv_dirac_parse_sequence_header(AVCodecContext *avctx, GetBitContext *gb, dirac_source_params *source) { @@ -284,7 +291,7 @@ int avpriv_dirac_parse_sequence_header(AVCodecContext *avctx, GetBitContext *gb, avctx->profile = svq3_get_ue_golomb(gb); avctx->level = svq3_get_ue_golomb(gb); /* [DIRAC_SPEC] sequence_header() -> base_video_format as defined in - 10.2 Base Video Format, table 10.1 Dirac predefined video formats */ + * 10.2 Base Video Format, table 10.1 Dirac predefined video formats */ video_format = svq3_get_ue_golomb(gb); if (version_major < 2) @@ -298,7 +305,8 @@ int avpriv_dirac_parse_sequence_header(AVCodecContext *avctx, GetBitContext *gb, /* Fill in defaults for the source parameters. */ *source = dirac_source_parameters_defaults[video_format]; - // Override the defaults. + /* [DIRAC_STD] 10.3 Source Parameters + * Override the defaults. */ if (ret = parse_source_parameters(avctx, gb, source)) return ret; @@ -307,8 +315,8 @@ int avpriv_dirac_parse_sequence_header(AVCodecContext *avctx, GetBitContext *gb, avcodec_set_dimensions(avctx, source->width, source->height); - /*[DIRAC_STD] picture_coding_mode shall be 0 for fields and 1 for frames - currently only used to signal field coding */ + /* [DIRAC_STD] picture_coding_mode shall be 0 for fields and 1 for frames + * currently only used to signal field coding */ picture_coding_mode = svq3_get_ue_golomb(gb); if (picture_coding_mode != 0) { av_log(avctx, AV_LOG_ERROR, "Unsupported picture coding mode %d", diff --git a/libavcodec/dnxhdenc.c b/libavcodec/dnxhdenc.c index 97013f6a94..24767fd1a0 100644 --- a/libavcodec/dnxhdenc.c +++ b/libavcodec/dnxhdenc.c @@ -294,9 +294,8 @@ static int dnxhd_encode_init(AVCodecContext *avctx) ctx->block_width_l2 = 3; } -#if HAVE_MMX - ff_dnxhd_init_mmx(ctx); -#endif + if (ARCH_X86) + ff_dnxhdenc_init_x86(ctx); ctx->m.mb_height = (avctx->height + 15) / 16; ctx->m.mb_width = (avctx->width + 15) / 16; diff --git a/libavcodec/dnxhdenc.h 
b/libavcodec/dnxhdenc.h index 640bbd3995..e57047d5dc 100644 --- a/libavcodec/dnxhdenc.h +++ b/libavcodec/dnxhdenc.h @@ -93,6 +93,6 @@ typedef struct DNXHDEncContext { void (*get_pixels_8x4_sym)(DCTELEM * /*align 16*/, const uint8_t *, int); } DNXHDEncContext; -void ff_dnxhd_init_mmx(DNXHDEncContext *ctx); +void ff_dnxhdenc_init_x86(DNXHDEncContext *ctx); #endif /* AVCODEC_DNXHDENC_H */ diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c index fc05b5f1d1..77108cf834 100644 --- a/libavcodec/mpegvideo.c +++ b/libavcodec/mpegvideo.c @@ -185,8 +185,8 @@ av_cold int ff_dct_common_init(MpegEncContext *s) s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_bitexact; s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_c; -#if HAVE_MMX - ff_MPV_common_init_mmx(s); +#if ARCH_X86 + ff_MPV_common_init_x86(s); #elif ARCH_ALPHA ff_MPV_common_init_axp(s); #elif HAVE_MMI diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h index 3c6f13a2cd..dd1079b0e7 100644 --- a/libavcodec/mpegvideo.h +++ b/libavcodec/mpegvideo.h @@ -765,7 +765,8 @@ int ff_MPV_encode_init(AVCodecContext *avctx); int ff_MPV_encode_end(AVCodecContext *avctx); int ff_MPV_encode_picture(AVCodecContext *avctx, AVPacket *pkt, AVFrame *frame, int *got_packet); -void ff_MPV_common_init_mmx(MpegEncContext *s); +void ff_MPV_encode_init_x86(MpegEncContext *s); +void ff_MPV_common_init_x86(MpegEncContext *s); void ff_MPV_common_init_axp(MpegEncContext *s); void ff_MPV_common_init_mmi(MpegEncContext *s); void ff_MPV_common_init_arm(MpegEncContext *s); diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c index 05ae1b6b2d..6ec860984a 100644 --- a/libavcodec/mpegvideo_enc.c +++ b/libavcodec/mpegvideo_enc.c @@ -832,6 +832,9 @@ av_cold int ff_MPV_encode_init(AVCodecContext *avctx) if (ff_MPV_common_init(s) < 0) return -1; + if (ARCH_X86) + ff_MPV_encode_init_x86(s); + if (!s->dct_quantize) s->dct_quantize = ff_dct_quantize_c; if (!s->denoise_dct) diff --git a/libavcodec/svq13.c 
b/libavcodec/svq13.c new file mode 100644 index 0000000000..65a79025fe --- /dev/null +++ b/libavcodec/svq13.c @@ -0,0 +1,68 @@ +/* + * SVQ1/SVQ3 decoder common code + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "svq1.h" + +static const uint16_t checksum_table[256] = { + 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50A5, 0x60C6, 0x70E7, + 0x8108, 0x9129, 0xA14A, 0xB16B, 0xC18C, 0xD1AD, 0xE1CE, 0xF1EF, + 0x1231, 0x0210, 0x3273, 0x2252, 0x52B5, 0x4294, 0x72F7, 0x62D6, + 0x9339, 0x8318, 0xB37B, 0xA35A, 0xD3BD, 0xC39C, 0xF3FF, 0xE3DE, + 0x2462, 0x3443, 0x0420, 0x1401, 0x64E6, 0x74C7, 0x44A4, 0x5485, + 0xA56A, 0xB54B, 0x8528, 0x9509, 0xE5EE, 0xF5CF, 0xC5AC, 0xD58D, + 0x3653, 0x2672, 0x1611, 0x0630, 0x76D7, 0x66F6, 0x5695, 0x46B4, + 0xB75B, 0xA77A, 0x9719, 0x8738, 0xF7DF, 0xE7FE, 0xD79D, 0xC7BC, + 0x48C4, 0x58E5, 0x6886, 0x78A7, 0x0840, 0x1861, 0x2802, 0x3823, + 0xC9CC, 0xD9ED, 0xE98E, 0xF9AF, 0x8948, 0x9969, 0xA90A, 0xB92B, + 0x5AF5, 0x4AD4, 0x7AB7, 0x6A96, 0x1A71, 0x0A50, 0x3A33, 0x2A12, + 0xDBFD, 0xCBDC, 0xFBBF, 0xEB9E, 0x9B79, 0x8B58, 0xBB3B, 0xAB1A, + 0x6CA6, 0x7C87, 0x4CE4, 0x5CC5, 0x2C22, 0x3C03, 0x0C60, 0x1C41, + 0xEDAE, 0xFD8F, 0xCDEC, 0xDDCD, 0xAD2A, 0xBD0B, 0x8D68, 0x9D49, + 0x7E97, 0x6EB6, 0x5ED5, 0x4EF4, 
0x3E13, 0x2E32, 0x1E51, 0x0E70, + 0xFF9F, 0xEFBE, 0xDFDD, 0xCFFC, 0xBF1B, 0xAF3A, 0x9F59, 0x8F78, + 0x9188, 0x81A9, 0xB1CA, 0xA1EB, 0xD10C, 0xC12D, 0xF14E, 0xE16F, + 0x1080, 0x00A1, 0x30C2, 0x20E3, 0x5004, 0x4025, 0x7046, 0x6067, + 0x83B9, 0x9398, 0xA3FB, 0xB3DA, 0xC33D, 0xD31C, 0xE37F, 0xF35E, + 0x02B1, 0x1290, 0x22F3, 0x32D2, 0x4235, 0x5214, 0x6277, 0x7256, + 0xB5EA, 0xA5CB, 0x95A8, 0x8589, 0xF56E, 0xE54F, 0xD52C, 0xC50D, + 0x34E2, 0x24C3, 0x14A0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405, + 0xA7DB, 0xB7FA, 0x8799, 0x97B8, 0xE75F, 0xF77E, 0xC71D, 0xD73C, + 0x26D3, 0x36F2, 0x0691, 0x16B0, 0x6657, 0x7676, 0x4615, 0x5634, + 0xD94C, 0xC96D, 0xF90E, 0xE92F, 0x99C8, 0x89E9, 0xB98A, 0xA9AB, + 0x5844, 0x4865, 0x7806, 0x6827, 0x18C0, 0x08E1, 0x3882, 0x28A3, + 0xCB7D, 0xDB5C, 0xEB3F, 0xFB1E, 0x8BF9, 0x9BD8, 0xABBB, 0xBB9A, + 0x4A75, 0x5A54, 0x6A37, 0x7A16, 0x0AF1, 0x1AD0, 0x2AB3, 0x3A92, + 0xFD2E, 0xED0F, 0xDD6C, 0xCD4D, 0xBDAA, 0xAD8B, 0x9DE8, 0x8DC9, + 0x7C26, 0x6C07, 0x5C64, 0x4C45, 0x3CA2, 0x2C83, 0x1CE0, 0x0CC1, + 0xEF1F, 0xFF3E, 0xCF5D, 0xDF7C, 0xAF9B, 0xBFBA, 0x8FD9, 0x9FF8, + 0x6E17, 0x7E36, 0x4E55, 0x5E74, 0x2E93, 0x3EB2, 0x0ED1, 0x1EF0 +}; + +uint16_t ff_svq1_packet_checksum (const uint8_t *data, const int length, int value) { + int i; + + for (i=0; i < length; i++) { + value = checksum_table[data[i] ^ (value >> 8)] ^ ((value & 0xFF) << 8); + } + + return value; +} diff --git a/libavcodec/svq1dec.c b/libavcodec/svq1dec.c index 3b8dc1b852..ba240ac74d 100644 --- a/libavcodec/svq1dec.c +++ b/libavcodec/svq1dec.c @@ -58,41 +58,6 @@ typedef struct svq1_pmv_s { int y; } svq1_pmv; -static const uint16_t checksum_table[256] = { - 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50A5, 0x60C6, 0x70E7, - 0x8108, 0x9129, 0xA14A, 0xB16B, 0xC18C, 0xD1AD, 0xE1CE, 0xF1EF, - 0x1231, 0x0210, 0x3273, 0x2252, 0x52B5, 0x4294, 0x72F7, 0x62D6, - 0x9339, 0x8318, 0xB37B, 0xA35A, 0xD3BD, 0xC39C, 0xF3FF, 0xE3DE, - 0x2462, 0x3443, 0x0420, 0x1401, 0x64E6, 0x74C7, 0x44A4, 0x5485, - 0xA56A, 0xB54B, 
0x8528, 0x9509, 0xE5EE, 0xF5CF, 0xC5AC, 0xD58D, - 0x3653, 0x2672, 0x1611, 0x0630, 0x76D7, 0x66F6, 0x5695, 0x46B4, - 0xB75B, 0xA77A, 0x9719, 0x8738, 0xF7DF, 0xE7FE, 0xD79D, 0xC7BC, - 0x48C4, 0x58E5, 0x6886, 0x78A7, 0x0840, 0x1861, 0x2802, 0x3823, - 0xC9CC, 0xD9ED, 0xE98E, 0xF9AF, 0x8948, 0x9969, 0xA90A, 0xB92B, - 0x5AF5, 0x4AD4, 0x7AB7, 0x6A96, 0x1A71, 0x0A50, 0x3A33, 0x2A12, - 0xDBFD, 0xCBDC, 0xFBBF, 0xEB9E, 0x9B79, 0x8B58, 0xBB3B, 0xAB1A, - 0x6CA6, 0x7C87, 0x4CE4, 0x5CC5, 0x2C22, 0x3C03, 0x0C60, 0x1C41, - 0xEDAE, 0xFD8F, 0xCDEC, 0xDDCD, 0xAD2A, 0xBD0B, 0x8D68, 0x9D49, - 0x7E97, 0x6EB6, 0x5ED5, 0x4EF4, 0x3E13, 0x2E32, 0x1E51, 0x0E70, - 0xFF9F, 0xEFBE, 0xDFDD, 0xCFFC, 0xBF1B, 0xAF3A, 0x9F59, 0x8F78, - 0x9188, 0x81A9, 0xB1CA, 0xA1EB, 0xD10C, 0xC12D, 0xF14E, 0xE16F, - 0x1080, 0x00A1, 0x30C2, 0x20E3, 0x5004, 0x4025, 0x7046, 0x6067, - 0x83B9, 0x9398, 0xA3FB, 0xB3DA, 0xC33D, 0xD31C, 0xE37F, 0xF35E, - 0x02B1, 0x1290, 0x22F3, 0x32D2, 0x4235, 0x5214, 0x6277, 0x7256, - 0xB5EA, 0xA5CB, 0x95A8, 0x8589, 0xF56E, 0xE54F, 0xD52C, 0xC50D, - 0x34E2, 0x24C3, 0x14A0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405, - 0xA7DB, 0xB7FA, 0x8799, 0x97B8, 0xE75F, 0xF77E, 0xC71D, 0xD73C, - 0x26D3, 0x36F2, 0x0691, 0x16B0, 0x6657, 0x7676, 0x4615, 0x5634, - 0xD94C, 0xC96D, 0xF90E, 0xE92F, 0x99C8, 0x89E9, 0xB98A, 0xA9AB, - 0x5844, 0x4865, 0x7806, 0x6827, 0x18C0, 0x08E1, 0x3882, 0x28A3, - 0xCB7D, 0xDB5C, 0xEB3F, 0xFB1E, 0x8BF9, 0x9BD8, 0xABBB, 0xBB9A, - 0x4A75, 0x5A54, 0x6A37, 0x7A16, 0x0AF1, 0x1AD0, 0x2AB3, 0x3A92, - 0xFD2E, 0xED0F, 0xDD6C, 0xCD4D, 0xBDAA, 0xAD8B, 0x9DE8, 0x8DC9, - 0x7C26, 0x6C07, 0x5C64, 0x4C45, 0x3CA2, 0x2C83, 0x1CE0, 0x0CC1, - 0xEF1F, 0xFF3E, 0xCF5D, 0xDF7C, 0xAF9B, 0xBFBA, 0x8FD9, 0x9FF8, - 0x6E17, 0x7E36, 0x4E55, 0x5E74, 0x2E93, 0x3EB2, 0x0ED1, 0x1EF0 -}; - static const uint8_t string_table[256] = { 0x00, 0xD5, 0x7F, 0xAA, 0xFE, 0x2B, 0x81, 0x54, 0x29, 0xFC, 0x56, 0x83, 0xD7, 0x02, 0xA8, 0x7D, @@ -524,16 +489,6 @@ static int svq1_decode_delta_block (MpegEncContext *s, GetBitContext 
*bitbuf, return result; } -uint16_t ff_svq1_packet_checksum (const uint8_t *data, const int length, int value) { - int i; - - for (i=0; i < length; i++) { - value = checksum_table[data[i] ^ (value >> 8)] ^ ((value & 0xFF) << 8); - } - - return value; -} - static void svq1_parse_string (GetBitContext *bitbuf, uint8_t *out) { uint8_t seed; int i; diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 73a6a6032c..f90f12e877 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -14,7 +14,7 @@ MMX-OBJS += x86/dsputil_mmx.o \ MMX-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o MMX-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o MMX-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp_mmx.o -MMX-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhd_mmx.o +MMX-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o \ x86/dwt.o MMX-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o @@ -24,7 +24,8 @@ MMX-OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o MMX-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o MMX-OBJS-$(CONFIG_LPC) += x86/lpc_mmx.o MMX-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodec_mmx.o -MMX-OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo_mmx.o +MMX-OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o +MMX-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o MMX-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o MMX-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o MMX-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o diff --git a/libavcodec/x86/dnxhd_mmx.c b/libavcodec/x86/dnxhdenc.c index ccd1575545..8b0c2ad225 100644 --- a/libavcodec/x86/dnxhd_mmx.c +++ b/libavcodec/x86/dnxhdenc.c @@ -54,7 +54,7 @@ static void get_pixels_8x4_sym_sse2(DCTELEM *block, const uint8_t *pixels, int l #endif /* HAVE_INLINE_ASM */ -void ff_dnxhd_init_mmx(DNXHDEncContext *ctx) +void ff_dnxhdenc_init_x86(DNXHDEncContext *ctx) { #if HAVE_INLINE_ASM if (av_get_cpu_flags() & AV_CPU_FLAG_SSE2) { diff --git a/libavcodec/x86/mpegvideo_mmx.c 
b/libavcodec/x86/mpegvideo.c index f3927f0eae..2d475abe01 100644 --- a/libavcodec/x86/mpegvideo_mmx.c +++ b/libavcodec/x86/mpegvideo.c @@ -1,7 +1,4 @@ /* - * The simplest mpeg encoder (well, it was the simplest!) - * Copyright (c) 2000,2001 Fabrice Bellard - * * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru> * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> * @@ -31,9 +28,6 @@ #if HAVE_INLINE_ASM -extern uint16_t ff_inv_zigzag_direct16[64]; - - static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { @@ -588,56 +582,14 @@ static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){ ); } -#if HAVE_SSSE3 -#define HAVE_SSSE3_BAK -#endif -#undef HAVE_SSSE3 -#define HAVE_SSSE3 0 - -#undef HAVE_SSE2 -#undef HAVE_MMXEXT -#define HAVE_SSE2 0 -#define HAVE_MMXEXT 0 -#define RENAME(a) a ## _MMX -#define RENAMEl(a) a ## _mmx -#include "mpegvideo_mmx_template.c" - -#undef HAVE_MMXEXT -#define HAVE_MMXEXT 1 -#undef RENAME -#undef RENAMEl -#define RENAME(a) a ## _MMX2 -#define RENAMEl(a) a ## _mmx2 -#include "mpegvideo_mmx_template.c" - -#undef HAVE_SSE2 -#define HAVE_SSE2 1 -#undef RENAME -#undef RENAMEl -#define RENAME(a) a ## _SSE2 -#define RENAMEl(a) a ## _sse2 -#include "mpegvideo_mmx_template.c" - -#ifdef HAVE_SSSE3_BAK -#undef HAVE_SSSE3 -#define HAVE_SSSE3 1 -#undef RENAME -#undef RENAMEl -#define RENAME(a) a ## _SSSE3 -#define RENAMEl(a) a ## _sse2 -#include "mpegvideo_mmx_template.c" -#endif - #endif /* HAVE_INLINE_ASM */ -void ff_MPV_common_init_mmx(MpegEncContext *s) +void ff_MPV_common_init_x86(MpegEncContext *s) { #if HAVE_INLINE_ASM int mm_flags = av_get_cpu_flags(); if (mm_flags & AV_CPU_FLAG_MMX) { - const int dct_algo = s->avctx->dct_algo; - s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; @@ -651,21 +603,6 @@ void 
ff_MPV_common_init_mmx(MpegEncContext *s) } else { s->denoise_dct= denoise_dct_mmx; } - - if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ -#if HAVE_SSSE3 - if(mm_flags & AV_CPU_FLAG_SSSE3){ - s->dct_quantize= dct_quantize_SSSE3; - } else -#endif - if(mm_flags & AV_CPU_FLAG_SSE2){ - s->dct_quantize= dct_quantize_SSE2; - } else if (mm_flags & AV_CPU_FLAG_MMXEXT) { - s->dct_quantize= dct_quantize_MMX2; - } else { - s->dct_quantize= dct_quantize_MMX; - } - } } #endif /* HAVE_INLINE_ASM */ } diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c new file mode 100644 index 0000000000..2d190f979f --- /dev/null +++ b/libavcodec/x86/mpegvideoenc.c @@ -0,0 +1,96 @@ +/* + * The simplest mpeg encoder (well, it was the simplest!) + * Copyright (c) 2000,2001 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/dsputil.h" +#include "libavcodec/mpegvideo.h" +#include "dsputil_mmx.h" + +#if HAVE_INLINE_ASM + +extern uint16_t ff_inv_zigzag_direct16[64]; + +#if HAVE_SSSE3 +#define HAVE_SSSE3_BAK +#endif +#undef HAVE_SSSE3 +#define HAVE_SSSE3 0 + +#undef HAVE_SSE2 +#undef HAVE_MMXEXT +#define HAVE_SSE2 0 +#define HAVE_MMXEXT 0 +#define RENAME(a) a ## _MMX +#define RENAMEl(a) a ## _mmx +#include "mpegvideoenc_template.c" + +#undef HAVE_MMXEXT +#define HAVE_MMXEXT 1 +#undef RENAME +#undef RENAMEl +#define RENAME(a) a ## _MMX2 +#define RENAMEl(a) a ## _mmx2 +#include "mpegvideoenc_template.c" + +#undef HAVE_SSE2 +#define HAVE_SSE2 1 +#undef RENAME +#undef RENAMEl +#define RENAME(a) a ## _SSE2 +#define RENAMEl(a) a ## _sse2 +#include "mpegvideoenc_template.c" + +#ifdef HAVE_SSSE3_BAK +#undef HAVE_SSSE3 +#define HAVE_SSSE3 1 +#undef RENAME +#undef RENAMEl +#define RENAME(a) a ## _SSSE3 +#define RENAMEl(a) a ## _sse2 +#include "mpegvideoenc_template.c" +#endif + +#endif /* HAVE_INLINE_ASM */ + +void ff_MPV_encode_init_x86(MpegEncContext *s) +{ +#if HAVE_INLINE_ASM + int mm_flags = av_get_cpu_flags(); + const int dct_algo = s->avctx->dct_algo; + + if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) { +#if HAVE_SSSE3 + if (mm_flags & AV_CPU_FLAG_SSSE3) { + s->dct_quantize = dct_quantize_SSSE3; + } else +#endif + if (mm_flags & AV_CPU_FLAG_SSE2) { + s->dct_quantize = dct_quantize_SSE2; + } else if (mm_flags & AV_CPU_FLAG_MMXEXT) { + s->dct_quantize = dct_quantize_MMX2; + } else { + s->dct_quantize = dct_quantize_MMX; + } + } +#endif /* HAVE_INLINE_ASM */ +} diff --git a/libavcodec/x86/mpegvideo_mmx_template.c 
b/libavcodec/x86/mpegvideoenc_template.c index d538a39919..d538a39919 100644 --- a/libavcodec/x86/mpegvideo_mmx_template.c +++ b/libavcodec/x86/mpegvideoenc_template.c diff --git a/libavresample/x86/audio_convert.asm b/libavresample/x86/audio_convert.asm index 244c4d1b08..3db64d2f9b 100644 --- a/libavresample/x86/audio_convert.asm +++ b/libavresample/x86/audio_convert.asm @@ -30,6 +30,11 @@ pf_s32_inv_scale: times 8 dd 0x30000000 pf_s32_scale: times 8 dd 0x4f000000 pf_s16_inv_scale: times 4 dd 0x38000000 pf_s16_scale: times 4 dd 0x47000000 +pb_shuf_unpack_even: db -1, -1, 0, 1, -1, -1, 2, 3, -1, -1, 8, 9, -1, -1, 10, 11 +pb_shuf_unpack_odd: db -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 12, 13, -1, -1, 14, 15 +pb_interleave_words: SHUFFLE_MASK_W 0, 4, 1, 5, 2, 6, 3, 7 +pb_deinterleave_words: SHUFFLE_MASK_W 0, 2, 4, 6, 1, 3, 5, 7 +pw_zero_even: times 4 dw 0x0000, 0xffff SECTION_TEXT @@ -223,6 +228,512 @@ INIT_YMM avx CONV_FLT_TO_S32 %endif +;------------------------------------------------------------------------------ +; void ff_conv_s16p_to_s16_2ch(int16_t *dst, int16_t *const *src, int len, +; int channels); +;------------------------------------------------------------------------------ + +%macro CONV_S16P_TO_S16_2CH 0 +cglobal conv_s16p_to_s16_2ch, 3,4,5, dst, src0, len, src1 + mov src1q, [src0q+gprsize] + mov src0q, [src0q ] + lea lenq, [2*lend] + add src0q, lenq + add src1q, lenq + lea dstq, [dstq+2*lenq] + neg lenq +.loop + mova m0, [src0q+lenq ] + mova m1, [src1q+lenq ] + mova m2, [src0q+lenq+mmsize] + mova m3, [src1q+lenq+mmsize] + SBUTTERFLY2 wd, 0, 1, 4 + SBUTTERFLY2 wd, 2, 3, 4 + mova [dstq+2*lenq+0*mmsize], m0 + mova [dstq+2*lenq+1*mmsize], m1 + mova [dstq+2*lenq+2*mmsize], m2 + mova [dstq+2*lenq+3*mmsize], m3 + add lenq, 2*mmsize + jl .loop + REP_RET +%endmacro + +INIT_XMM sse2 +CONV_S16P_TO_S16_2CH +%if HAVE_AVX +INIT_XMM avx +CONV_S16P_TO_S16_2CH +%endif + +;------------------------------------------------------------------------------ +; void 
ff_conv_s16p_to_s16_6ch(int16_t *dst, int16_t *const *src, int len, +; int channels); +;------------------------------------------------------------------------------ + +;------------------------------------------------------------------------------ +; NOTE: In the 6-channel functions, len could be used as an index on x86-64 +; instead of just a counter, which would avoid incrementing the +; pointers, but the extra complexity and amount of code is not worth +; the small gain. On x86-32 there are not enough registers to use len +; as an index without keeping two of the pointers on the stack and +; loading them in each iteration. +;------------------------------------------------------------------------------ + +%macro CONV_S16P_TO_S16_6CH 0 +%if ARCH_X86_64 +cglobal conv_s16p_to_s16_6ch, 3,8,7, dst, src0, len, src1, src2, src3, src4, src5 +%else +cglobal conv_s16p_to_s16_6ch, 2,7,7, dst, src0, src1, src2, src3, src4, src5 +%define lend dword r2m +%endif + mov src1q, [src0q+1*gprsize] + mov src2q, [src0q+2*gprsize] + mov src3q, [src0q+3*gprsize] + mov src4q, [src0q+4*gprsize] + mov src5q, [src0q+5*gprsize] + mov src0q, [src0q] + sub src1q, src0q + sub src2q, src0q + sub src3q, src0q + sub src4q, src0q + sub src5q, src0q +.loop: +%if cpuflag(sse2slow) + movq m0, [src0q ] ; m0 = 0, 6, 12, 18, x, x, x, x + movq m1, [src0q+src1q] ; m1 = 1, 7, 13, 19, x, x, x, x + movq m2, [src0q+src2q] ; m2 = 2, 8, 14, 20, x, x, x, x + movq m3, [src0q+src3q] ; m3 = 3, 9, 15, 21, x, x, x, x + movq m4, [src0q+src4q] ; m4 = 4, 10, 16, 22, x, x, x, x + movq m5, [src0q+src5q] ; m5 = 5, 11, 17, 23, x, x, x, x + ; unpack words: + punpcklwd m0, m1 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19 + punpcklwd m2, m3 ; m2 = 4, 5, 10, 11, 16, 17, 22, 23 + punpcklwd m4, m5 ; m4 = 2, 3, 8, 9, 14, 15, 20, 21 + ; blend dwords + shufps m1, m0, m2, q2020 ; m1 = 0, 1, 12, 13, 2, 3, 14, 15 + shufps m0, m4, q2031 ; m0 = 6, 7, 18, 19, 4, 5, 16, 17 + shufps m2, m4, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23 + ; shuffle dwords 
+ pshufd m0, m0, q1302 ; m0 = 4, 5, 6, 7, 16, 17, 18, 19 + pshufd m1, m1, q3120 ; m1 = 0, 1, 2, 3, 12, 13, 14, 15 + pshufd m2, m2, q3120 ; m2 = 8, 9, 10, 11, 20, 21, 22, 23 + movq [dstq+0*mmsize/2], m1 + movq [dstq+1*mmsize/2], m0 + movq [dstq+2*mmsize/2], m2 + movhps [dstq+3*mmsize/2], m1 + movhps [dstq+4*mmsize/2], m0 + movhps [dstq+5*mmsize/2], m2 + add src0q, mmsize/2 + add dstq, mmsize*3 + sub lend, mmsize/4 +%else + mova m0, [src0q ] ; m0 = 0, 6, 12, 18, 24, 30, 36, 42 + mova m1, [src0q+src1q] ; m1 = 1, 7, 13, 19, 25, 31, 37, 43 + mova m2, [src0q+src2q] ; m2 = 2, 8, 14, 20, 26, 32, 38, 44 + mova m3, [src0q+src3q] ; m3 = 3, 9, 15, 21, 27, 33, 39, 45 + mova m4, [src0q+src4q] ; m4 = 4, 10, 16, 22, 28, 34, 40, 46 + mova m5, [src0q+src5q] ; m5 = 5, 11, 17, 23, 29, 35, 41, 47 + ; unpack words: + SBUTTERFLY2 wd, 0, 1, 6 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19 + ; m1 = 24, 25, 30, 31, 36, 37, 42, 43 + SBUTTERFLY2 wd, 2, 3, 6 ; m2 = 2, 3, 8, 9, 14, 15, 20, 21 + ; m3 = 26, 27, 32, 33, 38, 39, 44, 45 + SBUTTERFLY2 wd, 4, 5, 6 ; m4 = 4, 5, 10, 11, 16, 17, 22, 23 + ; m5 = 28, 29, 34, 35, 40, 41, 46, 47 + ; blend dwords + shufps m6, m0, m2, q2020 ; m6 = 0, 1, 12, 13, 2, 3, 14, 15 + shufps m0, m4, q2031 ; m0 = 6, 7, 18, 19, 4, 5, 16, 17 + shufps m2, m4, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23 + SWAP 4,6 ; m4 = 0, 1, 12, 13, 2, 3, 14, 15 + shufps m6, m1, m3, q2020 ; m6 = 24, 25, 36, 37, 26, 27, 38, 39 + shufps m1, m5, q2031 ; m1 = 30, 31, 42, 43, 28, 29, 40, 41 + shufps m3, m5, q3131 ; m3 = 32, 33, 44, 45, 34, 35, 46, 47 + SWAP 5,6 ; m5 = 24, 25, 36, 37, 26, 27, 38, 39 + ; shuffle dwords + pshufd m0, m0, q1302 ; m0 = 4, 5, 6, 7, 16, 17, 18, 19 + pshufd m2, m2, q3120 ; m2 = 8, 9, 10, 11, 20, 21, 22, 23 + pshufd m4, m4, q3120 ; m4 = 0, 1, 2, 3, 12, 13, 14, 15 + pshufd m1, m1, q1302 ; m1 = 28, 29, 30, 31, 40, 41, 42, 43 + pshufd m3, m3, q3120 ; m3 = 32, 33, 34, 35, 44, 45, 46, 47 + pshufd m5, m5, q3120 ; m5 = 24, 25, 26, 27, 36, 37, 38, 39 + ; shuffle qwords + punpcklqdq m6, m4, m0 
; m6 = 0, 1, 2, 3, 4, 5, 6, 7 + punpckhqdq m0, m2 ; m0 = 16, 17, 18, 19, 20, 21, 22, 23 + shufps m2, m4, q3210 ; m2 = 8, 9, 10, 11, 12, 13, 14, 15 + SWAP 4,6 ; m4 = 0, 1, 2, 3, 4, 5, 6, 7 + punpcklqdq m6, m5, m1 ; m6 = 24, 25, 26, 27, 28, 29, 30, 31 + punpckhqdq m1, m3 ; m1 = 40, 41, 42, 43, 44, 45, 46, 47 + shufps m3, m5, q3210 ; m3 = 32, 33, 34, 35, 36, 37, 38, 39 + SWAP 5,6 ; m5 = 24, 25, 26, 27, 28, 29, 30, 31 + mova [dstq+0*mmsize], m4 + mova [dstq+1*mmsize], m2 + mova [dstq+2*mmsize], m0 + mova [dstq+3*mmsize], m5 + mova [dstq+4*mmsize], m3 + mova [dstq+5*mmsize], m1 + add src0q, mmsize + add dstq, mmsize*6 + sub lend, mmsize/2 +%endif + jg .loop + REP_RET +%endmacro + +INIT_XMM sse2 +CONV_S16P_TO_S16_6CH +INIT_XMM sse2slow +CONV_S16P_TO_S16_6CH +%if HAVE_AVX +INIT_XMM avx +CONV_S16P_TO_S16_6CH +%endif + +;------------------------------------------------------------------------------ +; void ff_conv_s16p_to_flt_2ch(float *dst, int16_t *const *src, int len, +; int channels); +;------------------------------------------------------------------------------ + +%macro CONV_S16P_TO_FLT_2CH 0 +cglobal conv_s16p_to_flt_2ch, 3,4,6, dst, src0, len, src1 + lea lenq, [2*lend] + mov src1q, [src0q+gprsize] + mov src0q, [src0q ] + lea dstq, [dstq+4*lenq] + add src0q, lenq + add src1q, lenq + neg lenq + mova m5, [pf_s32_inv_scale] +.loop: + mova m2, [src0q+lenq] ; m2 = 0, 2, 4, 6, 8, 10, 12, 14 + mova m4, [src1q+lenq] ; m4 = 1, 3, 5, 7, 9, 11, 13, 15 + SBUTTERFLY2 wd, 2, 4, 3 ; m2 = 0, 1, 2, 3, 4, 5, 6, 7 + ; m4 = 8, 9, 10, 11, 12, 13, 14, 15 + pxor m3, m3 + punpcklwd m0, m3, m2 ; m0 = 0, 1, 2, 3 + punpckhwd m1, m3, m2 ; m1 = 4, 5, 6, 7 + punpcklwd m2, m3, m4 ; m2 = 8, 9, 10, 11 + punpckhwd m3, m4 ; m3 = 12, 13, 14, 15 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + cvtdq2ps m2, m2 + cvtdq2ps m3, m3 + mulps m0, m5 + mulps m1, m5 + mulps m2, m5 + mulps m3, m5 + mova [dstq+4*lenq ], m0 + mova [dstq+4*lenq+ mmsize], m1 + mova [dstq+4*lenq+2*mmsize], m2 + mova [dstq+4*lenq+3*mmsize], m3 + 
add lenq, mmsize + jl .loop + REP_RET +%endmacro + +INIT_XMM sse2 +CONV_S16P_TO_FLT_2CH +%if HAVE_AVX +INIT_XMM avx +CONV_S16P_TO_FLT_2CH +%endif + +;------------------------------------------------------------------------------ +; void ff_conv_s16p_to_flt_6ch(float *dst, int16_t *const *src, int len, +; int channels); +;------------------------------------------------------------------------------ + +%macro CONV_S16P_TO_FLT_6CH 0 +%if ARCH_X86_64 +cglobal conv_s16p_to_flt_6ch, 3,8,9, dst, src, len, src1, src2, src3, src4, src5 +%else +cglobal conv_s16p_to_flt_6ch, 2,7,8, dst, src, src1, src2, src3, src4, src5 +%define lend dword r2m +%endif + mov src1q, [srcq+1*gprsize] + mov src2q, [srcq+2*gprsize] + mov src3q, [srcq+3*gprsize] + mov src4q, [srcq+4*gprsize] + mov src5q, [srcq+5*gprsize] + mov srcq, [srcq] + sub src1q, srcq + sub src2q, srcq + sub src3q, srcq + sub src4q, srcq + sub src5q, srcq + mova m7, [pf_s32_inv_scale] +%if cpuflag(ssse3) + %define unpack_even m6 + mova m6, [pb_shuf_unpack_even] +%if ARCH_X86_64 + %define unpack_odd m8 + mova m8, [pb_shuf_unpack_odd] ; m8 is the 9th XMM register, hence 9 in the x86-64 cglobal +%else + %define unpack_odd [pb_shuf_unpack_odd] +%endif +%endif +.loop: + movq m0, [srcq ] ; m0 = 0, 6, 12, 18, x, x, x, x + movq m1, [srcq+src1q] ; m1 = 1, 7, 13, 19, x, x, x, x + movq m2, [srcq+src2q] ; m2 = 2, 8, 14, 20, x, x, x, x + movq m3, [srcq+src3q] ; m3 = 3, 9, 15, 21, x, x, x, x + movq m4, [srcq+src4q] ; m4 = 4, 10, 16, 22, x, x, x, x + movq m5, [srcq+src5q] ; m5 = 5, 11, 17, 23, x, x, x, x + ; unpack words: + punpcklwd m0, m1 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19 + punpcklwd m2, m3 ; m2 = 2, 3, 8, 9, 14, 15, 20, 21 + punpcklwd m4, m5 ; m4 = 4, 5, 10, 11, 16, 17, 22, 23 + ; blend dwords + shufps m1, m4, m0, q3120 ; m1 = 4, 5, 16, 17, 6, 7, 18, 19 + shufps m0, m2, q2020 ; m0 = 0, 1, 12, 13, 2, 3, 14, 15 + shufps m2, m4, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23 +%if cpuflag(ssse3) + pshufb m3, m0, unpack_odd ; m3 = 12, 13, 14, 15 + pshufb m0, unpack_even ; m0 = 0, 1, 2, 3 + pshufb m4, m1,
unpack_odd ; m4 = 16, 17, 18, 19 + pshufb m1, unpack_even ; m1 = 4, 5, 6, 7 + pshufb m5, m2, unpack_odd ; m5 = 20, 21, 22, 23 + pshufb m2, unpack_even ; m2 = 8, 9, 10, 11 +%else + ; shuffle dwords + pshufd m0, m0, q3120 ; m0 = 0, 1, 2, 3, 12, 13, 14, 15 + pshufd m1, m1, q3120 ; m1 = 4, 5, 6, 7, 16, 17, 18, 19 + pshufd m2, m2, q3120 ; m2 = 8, 9, 10, 11, 20, 21, 22, 23 + pxor m6, m6 ; convert s16 in m0-m2 to s32 in m0-m5 + punpcklwd m3, m6, m0 ; m3 = 0, 1, 2, 3 + punpckhwd m4, m6, m0 ; m4 = 12, 13, 14, 15 + punpcklwd m0, m6, m1 ; m0 = 4, 5, 6, 7 + punpckhwd m5, m6, m1 ; m5 = 16, 17, 18, 19 + punpcklwd m1, m6, m2 ; m1 = 8, 9, 10, 11 + punpckhwd m6, m2 ; m6 = 20, 21, 22, 23 + SWAP 6,2,1,0,3,4,5 ; swap registers 3,0,1,4,5,6 to 0,1,2,3,4,5 +%endif + cvtdq2ps m0, m0 ; convert s32 to float + cvtdq2ps m1, m1 + cvtdq2ps m2, m2 + cvtdq2ps m3, m3 + cvtdq2ps m4, m4 + cvtdq2ps m5, m5 + mulps m0, m7 ; scale float from s32 range to [-1.0,1.0] + mulps m1, m7 + mulps m2, m7 + mulps m3, m7 + mulps m4, m7 + mulps m5, m7 + mova [dstq ], m0 + mova [dstq+ mmsize], m1 + mova [dstq+2*mmsize], m2 + mova [dstq+3*mmsize], m3 + mova [dstq+4*mmsize], m4 + mova [dstq+5*mmsize], m5 + add srcq, mmsize/2 + add dstq, mmsize*6 + sub lend, mmsize/4 + jg .loop + REP_RET +%endmacro + +INIT_XMM sse2 +CONV_S16P_TO_FLT_6CH +INIT_XMM ssse3 +CONV_S16P_TO_FLT_6CH +%if HAVE_AVX +INIT_XMM avx +CONV_S16P_TO_FLT_6CH +%endif + +;------------------------------------------------------------------------------ +; void ff_conv_fltp_to_s16_2ch(int16_t *dst, float *const *src, int len, +; int channels); +;------------------------------------------------------------------------------ + +%macro CONV_FLTP_TO_S16_2CH 0 +cglobal conv_fltp_to_s16_2ch, 3,4,3, dst, src0, len, src1 + lea lenq, [4*lend] + mov src1q, [src0q+gprsize] + mov src0q, [src0q ] + add dstq, lenq + add src0q, lenq + add src1q, lenq + neg lenq + mova m2, [pf_s16_scale] +%if cpuflag(ssse3) + mova m3, [pb_interleave_words] +%endif +.loop: + mulps m0, m2, 
[src0q+lenq] ; m0 = 0, 2, 4, 6 + mulps m1, m2, [src1q+lenq] ; m1 = 1, 3, 5, 7 + cvtps2dq m0, m0 + cvtps2dq m1, m1 +%if cpuflag(ssse3) + packssdw m0, m1 ; m0 = 0, 2, 4, 6, 1, 3, 5, 7 + pshufb m0, m3 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7 +%else + packssdw m0, m0 ; m0 = 0, 2, 4, 6, x, x, x, x + packssdw m1, m1 ; m1 = 1, 3, 5, 7, x, x, x, x + punpcklwd m0, m1 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7 +%endif + mova [dstq+lenq], m0 + add lenq, mmsize + jl .loop + REP_RET +%endmacro + +INIT_XMM sse2 +CONV_FLTP_TO_S16_2CH +INIT_XMM ssse3 +CONV_FLTP_TO_S16_2CH + +;------------------------------------------------------------------------------ +; void ff_conv_fltp_to_s16_6ch(int16_t *dst, float *const *src, int len, +; int channels); +;------------------------------------------------------------------------------ + +%macro CONV_FLTP_TO_S16_6CH 0 +%if ARCH_X86_64 +cglobal conv_fltp_to_s16_6ch, 3,8,7, dst, src, len, src1, src2, src3, src4, src5 +%else +cglobal conv_fltp_to_s16_6ch, 2,7,7, dst, src, src1, src2, src3, src4, src5 +%define lend dword r2m +%endif + mov src1q, [srcq+1*gprsize] + mov src2q, [srcq+2*gprsize] + mov src3q, [srcq+3*gprsize] + mov src4q, [srcq+4*gprsize] + mov src5q, [srcq+5*gprsize] + mov srcq, [srcq] + sub src1q, srcq + sub src2q, srcq + sub src3q, srcq + sub src4q, srcq + sub src5q, srcq + movaps xmm6, [pf_s16_scale] +.loop: +%if cpuflag(sse2) + mulps m0, m6, [srcq ] + mulps m1, m6, [srcq+src1q] + mulps m2, m6, [srcq+src2q] + mulps m3, m6, [srcq+src3q] + mulps m4, m6, [srcq+src4q] + mulps m5, m6, [srcq+src5q] + cvtps2dq m0, m0 + cvtps2dq m1, m1 + cvtps2dq m2, m2 + cvtps2dq m3, m3 + cvtps2dq m4, m4 + cvtps2dq m5, m5 + packssdw m0, m3 ; m0 = 0, 6, 12, 18, 3, 9, 15, 21 + packssdw m1, m4 ; m1 = 1, 7, 13, 19, 4, 10, 16, 22 + packssdw m2, m5 ; m2 = 2, 8, 14, 20, 5, 11, 17, 23 + ; unpack words: + movhlps m3, m0 ; m3 = 3, 9, 15, 21, x, x, x, x + punpcklwd m0, m1 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19 + punpckhwd m1, m2 ; m1 = 4, 5, 10, 11, 16, 17, 22, 23 + punpcklwd m2, m3 ; m2 = 2, 
3, 8, 9, 14, 15, 20, 21 + ; blend dwords: + shufps m3, m0, m2, q2020 ; m3 = 0, 1, 12, 13, 2, 3, 14, 15 + shufps m0, m1, q2031 ; m0 = 6, 7, 18, 19, 4, 5, 16, 17 + shufps m2, m1, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23 + ; shuffle dwords: + shufps m1, m2, m3, q3120 ; m1 = 8, 9, 10, 11, 12, 13, 14, 15 + shufps m3, m0, q0220 ; m3 = 0, 1, 2, 3, 4, 5, 6, 7 + shufps m0, m2, q3113 ; m0 = 16, 17, 18, 19, 20, 21, 22, 23 + mova [dstq+0*mmsize], m3 + mova [dstq+1*mmsize], m1 + mova [dstq+2*mmsize], m0 +%else ; sse + movlps xmm0, [srcq ] + movlps xmm1, [srcq+src1q] + movlps xmm2, [srcq+src2q] + movlps xmm3, [srcq+src3q] + movlps xmm4, [srcq+src4q] + movlps xmm5, [srcq+src5q] + mulps xmm0, xmm6 + mulps xmm1, xmm6 + mulps xmm2, xmm6 + mulps xmm3, xmm6 + mulps xmm4, xmm6 + mulps xmm5, xmm6 + cvtps2pi mm0, xmm0 + cvtps2pi mm1, xmm1 + cvtps2pi mm2, xmm2 + cvtps2pi mm3, xmm3 + cvtps2pi mm4, xmm4 + cvtps2pi mm5, xmm5 + packssdw mm0, mm3 ; m0 = 0, 6, 3, 9 + packssdw mm1, mm4 ; m1 = 1, 7, 4, 10 + packssdw mm2, mm5 ; m2 = 2, 8, 5, 11 + ; unpack words + pshufw mm3, mm0, q1032 ; m3 = 3, 9, 0, 6 + punpcklwd mm0, mm1 ; m0 = 0, 1, 6, 7 + punpckhwd mm1, mm2 ; m1 = 4, 5, 10, 11 + punpcklwd mm2, mm3 ; m2 = 2, 3, 8, 9 + ; unpack dwords + pshufw mm3, mm0, q1032 ; m3 = 6, 7, 0, 1 + punpckldq mm0, mm2 ; m0 = 0, 1, 2, 3 (final) + punpckhdq mm2, mm1 ; m2 = 8, 9, 10, 11 (final) + punpckldq mm1, mm3 ; m1 = 4, 5, 6, 7 (final) + mova [dstq+0*mmsize], mm0 + mova [dstq+1*mmsize], mm1 + mova [dstq+2*mmsize], mm2 +%endif + add srcq, mmsize + add dstq, mmsize*3 + sub lend, mmsize/4 + jg .loop +%if mmsize == 8 + emms + RET +%else + REP_RET +%endif +%endmacro + +INIT_MMX sse +CONV_FLTP_TO_S16_6CH +INIT_XMM sse2 +CONV_FLTP_TO_S16_6CH +%if HAVE_AVX +INIT_XMM avx +CONV_FLTP_TO_S16_6CH +%endif + +;------------------------------------------------------------------------------ +; void ff_conv_fltp_to_flt_2ch(float *dst, float *const *src, int len, +; int channels); 
+;------------------------------------------------------------------------------ + +%macro CONV_FLTP_TO_FLT_2CH 0 +cglobal conv_fltp_to_flt_2ch, 3,4,5, dst, src0, len, src1 + mov src1q, [src0q+gprsize] + mov src0q, [src0q] + lea lenq, [4*lend] + add src0q, lenq + add src1q, lenq + lea dstq, [dstq+2*lenq] + neg lenq +.loop: ; each pass reads 2*mmsize bytes from each source plane and stores 4*mmsize bytes + mova m0, [src0q+lenq ] + mova m1, [src1q+lenq ] + mova m2, [src0q+lenq+mmsize] + mova m3, [src1q+lenq+mmsize] + SBUTTERFLYPS 0, 1, 4 + SBUTTERFLYPS 2, 3, 4 + mova [dstq+2*lenq+0*mmsize], m0 + mova [dstq+2*lenq+1*mmsize], m1 + mova [dstq+2*lenq+2*mmsize], m2 + mova [dstq+2*lenq+3*mmsize], m3 + add lenq, 2*mmsize + jl .loop + REP_RET +%endmacro + +INIT_XMM sse +CONV_FLTP_TO_FLT_2CH +%if HAVE_AVX +INIT_XMM avx +CONV_FLTP_TO_FLT_2CH +%endif + ;----------------------------------------------------------------------------- ; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len, ; int channels); @@ -303,3 +814,449 @@ CONV_FLTP_TO_FLT_6CH INIT_XMM avx CONV_FLTP_TO_FLT_6CH %endif + +;------------------------------------------------------------------------------ +; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len, +; int channels); +;------------------------------------------------------------------------------ + +%macro CONV_S16_TO_S16P_2CH 0 +cglobal conv_s16_to_s16p_2ch, 3,4,4, dst0, src, len, dst1 + lea lenq, [2*lend] + mov dst1q, [dst0q+gprsize] + mov dst0q, [dst0q ] + lea srcq, [srcq+2*lenq] + add dst0q, lenq + add dst1q, lenq + neg lenq +%if cpuflag(ssse3) + mova m3, [pb_deinterleave_words] +%endif +.loop: + mova m0, [srcq+2*lenq ] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7 + mova m1, [srcq+2*lenq+mmsize] ; m1 = 8, 9, 10, 11, 12, 13, 14, 15 +%if cpuflag(ssse3) + pshufb m0, m3 ; m0 = 0, 2, 4, 6, 1, 3, 5, 7 + pshufb m1, m3 ; m1 = 8, 10, 12, 14, 9, 11, 13, 15 + SBUTTERFLY2 qdq, 0, 1, 2 ; m0 = 0, 2, 4, 6, 8, 10, 12, 14 + ; m1 = 1, 3, 5, 7, 9, 11, 13, 15 +%else ; sse2 + pshuflw m0, m0, q3120 ; m0 = 0, 2, 1, 3, 4, 5, 6, 7 +
pshufhw m0, m0, q3120 ; m0 = 0, 2, 1, 3, 4, 6, 5, 7 + pshuflw m1, m1, q3120 ; m1 = 8, 10, 9, 11, 12, 13, 14, 15 + pshufhw m1, m1, q3120 ; m1 = 8, 10, 9, 11, 12, 14, 13, 15 + DEINT2_PS 0, 1, 2 ; m0 = 0, 2, 4, 6, 8, 10, 12, 14 + ; m1 = 1, 3, 5, 7, 9, 11, 13, 15 +%endif + mova [dst0q+lenq], m0 + mova [dst1q+lenq], m1 + add lenq, mmsize + jl .loop + REP_RET +%endmacro + +INIT_XMM sse2 +CONV_S16_TO_S16P_2CH +INIT_XMM ssse3 +CONV_S16_TO_S16P_2CH +%if HAVE_AVX +INIT_XMM avx +CONV_S16_TO_S16P_2CH +%endif + +;------------------------------------------------------------------------------ +; void ff_conv_s16_to_s16p_6ch(int16_t *const *dst, int16_t *src, int len, +; int channels); +;------------------------------------------------------------------------------ + +%macro CONV_S16_TO_S16P_6CH 0 +%if ARCH_X86_64 +cglobal conv_s16_to_s16p_6ch, 3,8,5, dst, src, len, dst1, dst2, dst3, dst4, dst5 +%else +cglobal conv_s16_to_s16p_6ch, 2,7,5, dst, src, dst1, dst2, dst3, dst4, dst5 +%define lend dword r2m +%endif + mov dst1q, [dstq+ gprsize] + mov dst2q, [dstq+2*gprsize] + mov dst3q, [dstq+3*gprsize] + mov dst4q, [dstq+4*gprsize] + mov dst5q, [dstq+5*gprsize] + mov dstq, [dstq ] + sub dst1q, dstq + sub dst2q, dstq + sub dst3q, dstq + sub dst4q, dstq + sub dst5q, dstq +.loop: + mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7 + mova m3, [srcq+1*mmsize] ; m3 = 8, 9, 10, 11, 12, 13, 14, 15 + mova m2, [srcq+2*mmsize] ; m2 = 16, 17, 18, 19, 20, 21, 22, 23 + PALIGNR m1, m3, m0, 12, m4 ; m1 = 6, 7, 8, 9, 10, 11, x, x + shufps m3, m2, q1032 ; m3 = 12, 13, 14, 15, 16, 17, 18, 19 + psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x + SBUTTERFLY2 wd, 0, 1, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9 + ; m1 = 4, 10, 5, 11, x, x, x, x + SBUTTERFLY2 wd, 3, 2, 4 ; m3 = 12, 18, 13, 19, 14, 20, 15, 21 + ; m2 = 16, 22, 17, 23, x, x, x, x + SBUTTERFLY2 dq, 0, 3, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19 + ; m3 = 2, 8, 14, 20, 3, 9, 15, 21 + punpckldq m1, m2 ; m1 = 4, 10, 16, 22, 5, 11, 17, 23 + movq [dstq ], m0 + 
movhps [dstq+dst1q], m0 + movq [dstq+dst2q], m3 + movhps [dstq+dst3q], m3 + movq [dstq+dst4q], m1 + movhps [dstq+dst5q], m1 + add srcq, mmsize*3 + add dstq, mmsize/2 + sub lend, mmsize/4 + jg .loop + REP_RET +%endmacro + +%define PALIGNR PALIGNR_MMX +INIT_XMM sse2 +CONV_S16_TO_S16P_6CH +%define PALIGNR PALIGNR_SSSE3 +INIT_XMM ssse3 +CONV_S16_TO_S16P_6CH +%if HAVE_AVX +INIT_XMM avx +CONV_S16_TO_S16P_6CH +%endif + +;------------------------------------------------------------------------------ +; void ff_conv_s16_to_fltp_2ch(float *const *dst, int16_t *src, int len, +; int channels); +;------------------------------------------------------------------------------ + +%macro CONV_S16_TO_FLTP_2CH 0 +cglobal conv_s16_to_fltp_2ch, 3,4,5, dst0, src, len, dst1 + lea lenq, [4*lend] + mov dst1q, [dst0q+gprsize] + mov dst0q, [dst0q ] + add srcq, lenq + add dst0q, lenq + add dst1q, lenq + neg lenq + mova m3, [pf_s32_inv_scale] + mova m4, [pw_zero_even] +.loop: + mova m1, [srcq+lenq] + pslld m0, m1, 16 + pand m1, m4 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + mulps m0, m0, m3 + mulps m1, m1, m3 + mova [dst0q+lenq], m0 + mova [dst1q+lenq], m1 + add lenq, mmsize + jl .loop + REP_RET +%endmacro + +INIT_XMM sse2 +CONV_S16_TO_FLTP_2CH +%if HAVE_AVX +INIT_XMM avx +CONV_S16_TO_FLTP_2CH +%endif + +;------------------------------------------------------------------------------ +; void ff_conv_s16_to_fltp_6ch(float *const *dst, int16_t *src, int len, +; int channels); +;------------------------------------------------------------------------------ + +%macro CONV_S16_TO_FLTP_6CH 0 +%if ARCH_X86_64 +cglobal conv_s16_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5 +%else +cglobal conv_s16_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5 +%define lend dword r2m +%endif + mov dst1q, [dstq+ gprsize] + mov dst2q, [dstq+2*gprsize] + mov dst3q, [dstq+3*gprsize] + mov dst4q, [dstq+4*gprsize] + mov dst5q, [dstq+5*gprsize] + mov dstq, [dstq ] + sub dst1q, dstq + sub dst2q, dstq + 
sub dst3q, dstq + sub dst4q, dstq + sub dst5q, dstq + mova m6, [pf_s16_inv_scale] +.loop: + mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7 + mova m3, [srcq+1*mmsize] ; m3 = 8, 9, 10, 11, 12, 13, 14, 15 + mova m2, [srcq+2*mmsize] ; m2 = 16, 17, 18, 19, 20, 21, 22, 23 + PALIGNR m1, m3, m0, 12, m4 ; m1 = 6, 7, 8, 9, 10, 11, x, x + shufps m3, m2, q1032 ; m3 = 12, 13, 14, 15, 16, 17, 18, 19 + psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x + SBUTTERFLY2 wd, 0, 1, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9 + ; m1 = 4, 10, 5, 11, x, x, x, x + SBUTTERFLY2 wd, 3, 2, 4 ; m3 = 12, 18, 13, 19, 14, 20, 15, 21 + ; m2 = 16, 22, 17, 23, x, x, x, x + SBUTTERFLY2 dq, 0, 3, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19 + ; m3 = 2, 8, 14, 20, 3, 9, 15, 21 + punpckldq m1, m2 ; m1 = 4, 10, 16, 22, 5, 11, 17, 23 + S16_TO_S32_SX 0, 2 ; m0 = 0, 6, 12, 18 + ; m2 = 1, 7, 13, 19 + S16_TO_S32_SX 3, 4 ; m3 = 2, 8, 14, 20 + ; m4 = 3, 9, 15, 21 + S16_TO_S32_SX 1, 5 ; m1 = 4, 10, 16, 22 + ; m5 = 5, 11, 17, 23 + SWAP 1,2,3,4 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + cvtdq2ps m2, m2 + cvtdq2ps m3, m3 + cvtdq2ps m4, m4 + cvtdq2ps m5, m5 + mulps m0, m6 + mulps m1, m6 + mulps m2, m6 + mulps m3, m6 + mulps m4, m6 + mulps m5, m6 + mova [dstq ], m0 + mova [dstq+dst1q], m1 + mova [dstq+dst2q], m2 + mova [dstq+dst3q], m3 + mova [dstq+dst4q], m4 + mova [dstq+dst5q], m5 + add srcq, mmsize*3 + add dstq, mmsize + sub lend, mmsize/4 + jg .loop + REP_RET +%endmacro + +%define PALIGNR PALIGNR_MMX +INIT_XMM sse2 +CONV_S16_TO_FLTP_6CH +%define PALIGNR PALIGNR_SSSE3 +INIT_XMM ssse3 +CONV_S16_TO_FLTP_6CH +INIT_XMM sse4 +CONV_S16_TO_FLTP_6CH +%if HAVE_AVX +INIT_XMM avx +CONV_S16_TO_FLTP_6CH +%endif + +;------------------------------------------------------------------------------ +; void ff_conv_flt_to_s16p_2ch(int16_t *const *dst, float *src, int len, +; int channels); +;------------------------------------------------------------------------------ + +%macro CONV_FLT_TO_S16P_2CH 0 +cglobal conv_flt_to_s16p_2ch, 3,4,6, dst0, src, len, 
dst1 + lea lenq, [2*lend] + mov dst1q, [dst0q+gprsize] + mov dst0q, [dst0q ] + lea srcq, [srcq+4*lenq] + add dst0q, lenq + add dst1q, lenq + neg lenq + mova m5, [pf_s16_scale] +.loop: + mova m0, [srcq+4*lenq ] + mova m1, [srcq+4*lenq+ mmsize] + mova m2, [srcq+4*lenq+2*mmsize] + mova m3, [srcq+4*lenq+3*mmsize] + DEINT2_PS 0, 1, 4 + DEINT2_PS 2, 3, 4 + mulps m0, m0, m5 + mulps m1, m1, m5 + mulps m2, m2, m5 + mulps m3, m3, m5 + cvtps2dq m0, m0 + cvtps2dq m1, m1 + cvtps2dq m2, m2 + cvtps2dq m3, m3 + packssdw m0, m2 + packssdw m1, m3 + mova [dst0q+lenq], m0 + mova [dst1q+lenq], m1 + add lenq, mmsize + jl .loop + REP_RET +%endmacro + +INIT_XMM sse2 +CONV_FLT_TO_S16P_2CH +%if HAVE_AVX +INIT_XMM avx +CONV_FLT_TO_S16P_2CH +%endif + +;------------------------------------------------------------------------------ +; void ff_conv_flt_to_s16p_6ch(int16_t *const *dst, float *src, int len, +; int channels); +;------------------------------------------------------------------------------ + +%macro CONV_FLT_TO_S16P_6CH 0 +%if ARCH_X86_64 +cglobal conv_flt_to_s16p_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5 +%else +cglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5 +%define lend dword r2m +%endif + mov dst1q, [dstq+ gprsize] + mov dst2q, [dstq+2*gprsize] + mov dst3q, [dstq+3*gprsize] + mov dst4q, [dstq+4*gprsize] + mov dst5q, [dstq+5*gprsize] + mov dstq, [dstq ] + sub dst1q, dstq + sub dst2q, dstq + sub dst3q, dstq + sub dst4q, dstq + sub dst5q, dstq + mova m6, [pf_s16_scale] +.loop: + mulps m0, m6, [srcq+0*mmsize] + mulps m3, m6, [srcq+1*mmsize] + mulps m1, m6, [srcq+2*mmsize] + mulps m4, m6, [srcq+3*mmsize] + mulps m2, m6, [srcq+4*mmsize] + mulps m5, m6, [srcq+5*mmsize] + cvtps2dq m0, m0 + cvtps2dq m1, m1 + cvtps2dq m2, m2 + cvtps2dq m3, m3 + cvtps2dq m4, m4 + cvtps2dq m5, m5 + packssdw m0, m3 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7 + packssdw m1, m4 ; m1 = 8, 9, 10, 11, 12, 13, 14, 15 + packssdw m2, m5 ; m2 = 16, 17, 18, 19, 20, 21, 22, 23 + PALIGNR m3, 
m1, m0, 12, m4 ; m3 = 6, 7, 8, 9, 10, 11, x, x + shufps m1, m2, q1032 ; m1 = 12, 13, 14, 15, 16, 17, 18, 19 + psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x + SBUTTERFLY2 wd, 0, 3, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9 + ; m3 = 4, 10, 5, 11, x, x, x, x + SBUTTERFLY2 wd, 1, 2, 4 ; m1 = 12, 18, 13, 19, 14, 20, 15, 21 + ; m2 = 16, 22, 17, 23, x, x, x, x + SBUTTERFLY2 dq, 0, 1, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19 + ; m1 = 2, 8, 14, 20, 3, 9, 15, 21 + punpckldq m3, m2 ; m3 = 4, 10, 16, 22, 5, 11, 17, 23 + movq [dstq ], m0 + movhps [dstq+dst1q], m0 + movq [dstq+dst2q], m1 + movhps [dstq+dst3q], m1 + movq [dstq+dst4q], m3 + movhps [dstq+dst5q], m3 + add srcq, mmsize*6 + add dstq, mmsize/2 + sub lend, mmsize/4 + jg .loop + REP_RET +%endmacro + +%define PALIGNR PALIGNR_MMX +INIT_XMM sse2 +CONV_FLT_TO_S16P_6CH +%define PALIGNR PALIGNR_SSSE3 +INIT_XMM ssse3 +CONV_FLT_TO_S16P_6CH +%if HAVE_AVX +INIT_XMM avx +CONV_FLT_TO_S16P_6CH +%endif + +;------------------------------------------------------------------------------ +; void ff_conv_flt_to_fltp_2ch(float *const *dst, float *src, int len, +; int channels); +;------------------------------------------------------------------------------ + +%macro CONV_FLT_TO_FLTP_2CH 0 +cglobal conv_flt_to_fltp_2ch, 3,4,3, dst0, src, len, dst1 + lea lenq, [4*lend] + mov dst1q, [dst0q+gprsize] + mov dst0q, [dst0q ] + lea srcq, [srcq+2*lenq] + add dst0q, lenq + add dst1q, lenq + neg lenq +.loop: + mova m0, [srcq+2*lenq ] + mova m1, [srcq+2*lenq+mmsize] + DEINT2_PS 0, 1, 2 + mova [dst0q+lenq], m0 + mova [dst1q+lenq], m1 + add lenq, mmsize + jl .loop + REP_RET +%endmacro + +INIT_XMM sse +CONV_FLT_TO_FLTP_2CH +%if HAVE_AVX +INIT_XMM avx +CONV_FLT_TO_FLTP_2CH +%endif + +;------------------------------------------------------------------------------ +; void ff_conv_flt_to_fltp_6ch(float *const *dst, float *src, int len, +; int channels); +;------------------------------------------------------------------------------ + +%macro CONV_FLT_TO_FLTP_6CH 0 +%if 
ARCH_X86_64 +cglobal conv_flt_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5 +%else +cglobal conv_flt_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5 +%define lend dword r2m +%endif + mov dst1q, [dstq+ gprsize] + mov dst2q, [dstq+2*gprsize] + mov dst3q, [dstq+3*gprsize] + mov dst4q, [dstq+4*gprsize] + mov dst5q, [dstq+5*gprsize] + mov dstq, [dstq ] + sub dst1q, dstq + sub dst2q, dstq + sub dst3q, dstq + sub dst4q, dstq + sub dst5q, dstq +.loop: + mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3 + mova m1, [srcq+1*mmsize] ; m1 = 4, 5, 6, 7 + mova m2, [srcq+2*mmsize] ; m2 = 8, 9, 10, 11 + mova m3, [srcq+3*mmsize] ; m3 = 12, 13, 14, 15 + mova m4, [srcq+4*mmsize] ; m4 = 16, 17, 18, 19 + mova m5, [srcq+5*mmsize] ; m5 = 20, 21, 22, 23 + + SBUTTERFLY2 dq, 0, 3, 6 ; m0 = 0, 12, 1, 13 + ; m3 = 2, 14, 3, 15 + SBUTTERFLY2 dq, 1, 4, 6 ; m1 = 4, 16, 5, 17 + ; m4 = 6, 18, 7, 19 + SBUTTERFLY2 dq, 2, 5, 6 ; m2 = 8, 20, 9, 21 + ; m5 = 10, 22, 11, 23 + SBUTTERFLY2 dq, 0, 4, 6 ; m0 = 0, 6, 12, 18 + ; m4 = 1, 7, 13, 19 + SBUTTERFLY2 dq, 3, 2, 6 ; m3 = 2, 8, 14, 20 + ; m2 = 3, 9, 15, 21 + SBUTTERFLY2 dq, 1, 5, 6 ; m1 = 4, 10, 16, 22 + ; m5 = 5, 11, 17, 23 + mova [dstq ], m0 + mova [dstq+dst1q], m4 + mova [dstq+dst2q], m3 + mova [dstq+dst3q], m2 + mova [dstq+dst4q], m1 + mova [dstq+dst5q], m5 + add srcq, mmsize*6 + add dstq, mmsize + sub lend, mmsize/4 + jg .loop + REP_RET +%endmacro + +INIT_XMM sse2 +CONV_FLT_TO_FLTP_6CH +%if HAVE_AVX +INIT_XMM avx +CONV_FLT_TO_FLTP_6CH +%endif diff --git a/libavresample/x86/audio_convert_init.c b/libavresample/x86/audio_convert_init.c index 2de49709c2..a3589c6957 100644 --- a/libavresample/x86/audio_convert_init.c +++ b/libavresample/x86/audio_convert_init.c @@ -22,6 +22,8 @@ #include "libavutil/cpu.h" #include "libavresample/audio_convert.h" +/* flat conversions */ + extern void ff_conv_s16_to_s32_sse2(int16_t *dst, const int32_t *src, int len); extern void ff_conv_s16_to_flt_sse2(float *dst, const int16_t *src, int len); @@ 
-38,6 +40,49 @@ extern void ff_conv_flt_to_s16_sse2(int16_t *dst, const float *src, int len); extern void ff_conv_flt_to_s32_sse2(int32_t *dst, const float *src, int len); extern void ff_conv_flt_to_s32_avx (int32_t *dst, const float *src, int len); +/* interleave conversions */ + +extern void ff_conv_s16p_to_s16_2ch_sse2(int16_t *dst, int16_t *const *src, + int len, int channels); +extern void ff_conv_s16p_to_s16_2ch_avx (int16_t *dst, int16_t *const *src, + int len, int channels); + +extern void ff_conv_s16p_to_s16_6ch_sse2(int16_t *dst, int16_t *const *src, + int len, int channels); +extern void ff_conv_s16p_to_s16_6ch_sse2slow(int16_t *dst, int16_t *const *src, + int len, int channels); +extern void ff_conv_s16p_to_s16_6ch_avx (int16_t *dst, int16_t *const *src, + int len, int channels); + +extern void ff_conv_s16p_to_flt_2ch_sse2(float *dst, int16_t *const *src, + int len, int channels); +extern void ff_conv_s16p_to_flt_2ch_avx (float *dst, int16_t *const *src, + int len, int channels); + +extern void ff_conv_s16p_to_flt_6ch_sse2 (float *dst, int16_t *const *src, + int len, int channels); +extern void ff_conv_s16p_to_flt_6ch_ssse3(float *dst, int16_t *const *src, + int len, int channels); +extern void ff_conv_s16p_to_flt_6ch_avx (float *dst, int16_t *const *src, + int len, int channels); + +extern void ff_conv_fltp_to_s16_2ch_sse2 (int16_t *dst, float *const *src, + int len, int channels); +extern void ff_conv_fltp_to_s16_2ch_ssse3(int16_t *dst, float *const *src, + int len, int channels); + +extern void ff_conv_fltp_to_s16_6ch_sse (int16_t *dst, float *const *src, + int len, int channels); +extern void ff_conv_fltp_to_s16_6ch_sse2(int16_t *dst, float *const *src, + int len, int channels); +extern void ff_conv_fltp_to_s16_6ch_avx (int16_t *dst, float *const *src, + int len, int channels); + +extern void ff_conv_fltp_to_flt_2ch_sse(float *dst, float *const *src, int len, + int channels); +extern void ff_conv_fltp_to_flt_2ch_avx(float *dst, float *const *src, 
int len, + int channels); + extern void ff_conv_fltp_to_flt_6ch_mmx (float *dst, float *const *src, int len, int channels); extern void ff_conv_fltp_to_flt_6ch_sse4(float *dst, float *const *src, int len, @@ -45,6 +90,58 @@ extern void ff_conv_fltp_to_flt_6ch_sse4(float *dst, float *const *src, int len, extern void ff_conv_fltp_to_flt_6ch_avx (float *dst, float *const *src, int len, int channels); +/* deinterleave conversions */ + +extern void ff_conv_s16_to_s16p_2ch_sse2(int16_t *const *dst, int16_t *src, + int len, int channels); +extern void ff_conv_s16_to_s16p_2ch_ssse3(int16_t *const *dst, int16_t *src, + int len, int channels); +extern void ff_conv_s16_to_s16p_2ch_avx (int16_t *const *dst, int16_t *src, + int len, int channels); + +extern void ff_conv_s16_to_s16p_6ch_sse2 (int16_t *const *dst, int16_t *src, + int len, int channels); +extern void ff_conv_s16_to_s16p_6ch_ssse3(int16_t *const *dst, int16_t *src, + int len, int channels); +extern void ff_conv_s16_to_s16p_6ch_avx (int16_t *const *dst, int16_t *src, + int len, int channels); + +extern void ff_conv_s16_to_fltp_2ch_sse2(float *const *dst, int16_t *src, + int len, int channels); +extern void ff_conv_s16_to_fltp_2ch_avx (float *const *dst, int16_t *src, + int len, int channels); + +extern void ff_conv_s16_to_fltp_6ch_sse2 (float *const *dst, int16_t *src, + int len, int channels); +extern void ff_conv_s16_to_fltp_6ch_ssse3(float *const *dst, int16_t *src, + int len, int channels); +extern void ff_conv_s16_to_fltp_6ch_sse4 (float *const *dst, int16_t *src, + int len, int channels); +extern void ff_conv_s16_to_fltp_6ch_avx (float *const *dst, int16_t *src, + int len, int channels); + +extern void ff_conv_flt_to_s16p_2ch_sse2(int16_t *const *dst, float *src, + int len, int channels); +extern void ff_conv_flt_to_s16p_2ch_avx (int16_t *const *dst, float *src, + int len, int channels); + +extern void ff_conv_flt_to_s16p_6ch_sse2 (int16_t *const *dst, float *src, + int len, int channels); +extern void 
ff_conv_flt_to_s16p_6ch_ssse3(int16_t *const *dst, float *src, + int len, int channels); +extern void ff_conv_flt_to_s16p_6ch_avx (int16_t *const *dst, float *src, + int len, int channels); + +extern void ff_conv_flt_to_fltp_2ch_sse(float *const *dst, float *src, int len, + int channels); +extern void ff_conv_flt_to_fltp_2ch_avx(float *const *dst, float *src, int len, + int channels); + +extern void ff_conv_flt_to_fltp_6ch_sse2(float *const *dst, float *src, int len, + int channels); +extern void ff_conv_flt_to_fltp_6ch_avx (float *const *dst, float *src, int len, + int channels); + av_cold void ff_audio_convert_init_x86(AudioConvert *ac) { #if HAVE_YASM @@ -56,10 +153,25 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac) ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP, 6, 1, 4, "MMX", ff_conv_fltp_to_flt_6ch_mmx); } + if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) { + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_FLTP, + 6, 1, 2, "SSE", ff_conv_fltp_to_s16_6ch_sse); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP, + 2, 16, 8, "SSE", ff_conv_fltp_to_flt_2ch_sse); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLTP, AV_SAMPLE_FMT_FLT, + 2, 16, 4, "SSE", ff_conv_flt_to_fltp_2ch_sse); + } if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) { if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) { ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S32, 0, 16, 16, "SSE2", ff_conv_s32_to_s16_sse2); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S16P, + 6, 16, 8, "SSE2", ff_conv_s16p_to_s16_6ch_sse2); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_FLTP, + 6, 16, 4, "SSE2", ff_conv_fltp_to_s16_6ch_sse2); + } else { + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S16P, + 6, 1, 4, "SSE2SLOW", ff_conv_s16p_to_s16_6ch_sse2slow); } ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S32, AV_SAMPLE_FMT_S16, 0, 16, 8, "SSE2", ff_conv_s16_to_s32_sse2); @@ -71,6 +183,42 @@ av_cold 
void ff_audio_convert_init_x86(AudioConvert *ac) 0, 16, 16, "SSE2", ff_conv_flt_to_s16_sse2); ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S32, AV_SAMPLE_FMT_FLT, 0, 16, 16, "SSE2", ff_conv_flt_to_s32_sse2); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S16P, + 2, 16, 16, "SSE2", ff_conv_s16p_to_s16_2ch_sse2); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S16P, + 2, 16, 8, "SSE2", ff_conv_s16p_to_flt_2ch_sse2); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S16P, + 6, 16, 4, "SSE2", ff_conv_s16p_to_flt_6ch_sse2); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_FLTP, + 2, 16, 4, "SSE2", ff_conv_fltp_to_s16_2ch_sse2); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_S16, + 2, 16, 8, "SSE2", ff_conv_s16_to_s16p_2ch_sse2); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_S16, + 6, 16, 4, "SSE2", ff_conv_s16_to_s16p_6ch_sse2); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLTP, AV_SAMPLE_FMT_S16, + 2, 16, 8, "SSE2", ff_conv_s16_to_fltp_2ch_sse2); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLTP, AV_SAMPLE_FMT_S16, + 6, 16, 4, "SSE2", ff_conv_s16_to_fltp_6ch_sse2); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_FLT, + 2, 16, 8, "SSE2", ff_conv_flt_to_s16p_2ch_sse2); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_FLT, + 6, 16, 4, "SSE2", ff_conv_flt_to_s16p_6ch_sse2); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLTP, AV_SAMPLE_FMT_FLT, + 6, 16, 4, "SSE2", ff_conv_flt_to_fltp_6ch_sse2); + } + if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSE) { + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S16P, + 6, 16, 4, "SSSE3", ff_conv_s16p_to_flt_6ch_ssse3); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_FLTP, + 2, 16, 4, "SSSE3", ff_conv_fltp_to_s16_2ch_ssse3); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_S16, + 2, 16, 8, "SSSE3", ff_conv_s16_to_s16p_2ch_ssse3); + 
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_S16, + 6, 16, 4, "SSSE3", ff_conv_s16_to_s16p_6ch_ssse3); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLTP, AV_SAMPLE_FMT_S16, + 6, 16, 4, "SSSE3", ff_conv_s16_to_fltp_6ch_ssse3); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_FLT, + 6, 16, 4, "SSSE3", ff_conv_flt_to_s16p_6ch_ssse3); } if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) { ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S16, @@ -83,8 +231,34 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac) 0, 32, 16, "AVX", ff_conv_s32_to_flt_avx); ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S32, AV_SAMPLE_FMT_FLT, 0, 32, 32, "AVX", ff_conv_flt_to_s32_avx); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S16P, + 2, 16, 16, "AVX", ff_conv_s16p_to_s16_2ch_avx); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S16P, + 6, 16, 8, "AVX", ff_conv_s16p_to_s16_6ch_avx); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S16P, + 2, 16, 8, "AVX", ff_conv_s16p_to_flt_2ch_avx); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S16P, + 6, 16, 4, "AVX", ff_conv_s16p_to_flt_6ch_avx); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_FLTP, + 6, 16, 4, "AVX", ff_conv_fltp_to_s16_6ch_avx); ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP, 6, 16, 4, "AVX", ff_conv_fltp_to_flt_6ch_avx); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_S16, + 2, 16, 8, "AVX", ff_conv_s16_to_s16p_2ch_avx); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_S16, + 6, 16, 4, "AVX", ff_conv_s16_to_s16p_6ch_avx); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLTP, AV_SAMPLE_FMT_S16, + 2, 16, 8, "AVX", ff_conv_s16_to_fltp_2ch_avx); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLTP, AV_SAMPLE_FMT_S16, + 6, 16, 4, "AVX", ff_conv_s16_to_fltp_6ch_avx); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_FLT, + 2, 16, 8, 
"AVX", ff_conv_flt_to_s16p_2ch_avx); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_FLT, + 6, 16, 4, "AVX", ff_conv_flt_to_s16p_6ch_avx); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLTP, AV_SAMPLE_FMT_FLT, + 2, 16, 4, "AVX", ff_conv_flt_to_fltp_2ch_avx); + ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLTP, AV_SAMPLE_FMT_FLT, + 6, 16, 4, "AVX", ff_conv_flt_to_fltp_6ch_avx); } #endif } diff --git a/libavresample/x86/util.asm b/libavresample/x86/util.asm index ca7fde513a..0ce953159c 100644 --- a/libavresample/x86/util.asm +++ b/libavresample/x86/util.asm @@ -33,3 +33,9 @@ psrad m%1, 16 %endif %endmacro + +%macro DEINT2_PS 3 ; src0/even dst, src1/odd dst, temp + shufps m%3, m%1, m%2, q3131 + shufps m%1, m%2, q2020 + SWAP %2,%3 +%endmacro diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm index 979ae41577..cf5ea363f6 100644 --- a/libavutil/x86/x86util.asm +++ b/libavutil/x86/x86util.asm @@ -637,3 +637,15 @@ %rotate 1 %endrep %endmacro + +%macro PMOVSXWD 2; dst, src +%if cpuflag(sse4) + pmovsxwd %1, %2 +%else + %ifnidn %1, %2 + mova %1, %2 + %endif + punpcklwd %1, %1 + psrad %1, 16 +%endif +%endmacro |