diff options
author | Justin Ruggles <justin.ruggles@gmail.com> | 2011-01-30 15:06:46 +0000 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2011-02-04 03:08:09 +0100 |
commit | fe2ff6d24745f0739bfde9061092c1268557310b (patch) | |
tree | 9cbcf8b2472dd7612dd84c8b6b237d9d02b4daf9 | |
parent | a35d782d28ef0497f2b65eb300c2e6a6028fc165 (diff) | |
download | ffmpeg-fe2ff6d24745f0739bfde9061092c1268557310b.tar.gz |
Separate format conversion DSP functions from DSPContext.
This will be beneficial for use with the audio conversion API without
requiring it to depend on all of dsputil.
Signed-off-by: Mans Rullgard <mans@mansr.com>
(cherry picked from commit c73d99e672329c8f2df290736ffc474c360ac4ae)
32 files changed, 1204 insertions, 882 deletions
diff --git a/libavcodec/Makefile b/libavcodec/Makefile index de1bde0737..6a0a05b870 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -12,6 +12,7 @@ OBJS = allcodecs.o \ bitstream_filter.o \ dsputil.o \ faanidct.o \ + fmtconvert.o \ imgconvert.o \ jrevdct.o \ opt.o \ diff --git a/libavcodec/aac.h b/libavcodec/aac.h index 714e314cba..cff476a6eb 100644 --- a/libavcodec/aac.h +++ b/libavcodec/aac.h @@ -35,6 +35,7 @@ #include "fft.h" #include "mpeg4audio.h" #include "sbr.h" +#include "fmtconvert.h" #include <stdint.h> @@ -268,6 +269,7 @@ typedef struct { FFTContext mdct; FFTContext mdct_small; DSPContext dsp; + FmtConvertContext fmt_conv; int random_state; /** @} */ diff --git a/libavcodec/aacdec.c b/libavcodec/aacdec.c index 0ea7dc84a5..411c1dfc1b 100644 --- a/libavcodec/aacdec.c +++ b/libavcodec/aacdec.c @@ -85,6 +85,7 @@ #include "get_bits.h" #include "dsputil.h" #include "fft.h" +#include "fmtconvert.h" #include "lpc.h" #include "aac.h" @@ -562,6 +563,7 @@ static av_cold int aac_decode_init(AVCodecContext *avctx) ff_aac_sbr_init(); dsputil_init(&ac->dsp, avctx); + ff_fmt_convert_init(&ac->fmt_conv, avctx); ac->random_state = 0x1f2e3d4c; @@ -2032,7 +2034,7 @@ static int aac_decode_frame_int(AVCodecContext *avctx, void *data, *data_size = data_size_tmp; if (samples) - ac->dsp.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels); + ac->fmt_conv.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels); if (ac->output_configured) ac->output_configured = OC_LOCKED; diff --git a/libavcodec/ac3dec.c b/libavcodec/ac3dec.c index 8e40ce1ccc..5ebee1908d 100644 --- a/libavcodec/ac3dec.c +++ b/libavcodec/ac3dec.c @@ -193,6 +193,7 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx) ff_mdct_init(&s->imdct_512, 9, 1, 1.0); ff_kbd_window_init(s->window, 5.0, 256); dsputil_init(&s->dsp, avctx); + ff_fmt_convert_init(&s->fmt_conv, avctx); av_lfg_init(&s->dith_state, 0); /* set scale value for 
float to int16 conversion */ @@ -1255,7 +1256,7 @@ static int decode_audio_block(AC3DecodeContext *s, int blk) } else { gain *= s->dynamic_range[0]; } - s->dsp.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256); + s->fmt_conv.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256); } /* apply spectral extension to high frequency bins */ @@ -1407,7 +1408,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size, av_log(avctx, AV_LOG_ERROR, "error decoding the audio block\n"); err = 1; } - s->dsp.float_to_int16_interleave(out_samples, output, 256, s->out_channels); + s->fmt_conv.float_to_int16_interleave(out_samples, output, 256, s->out_channels); out_samples += 256 * s->out_channels; } *data_size = s->num_blocks * 256 * avctx->channels * sizeof (int16_t); diff --git a/libavcodec/ac3dec.h b/libavcodec/ac3dec.h index 55520cdcee..147e5e59bc 100644 --- a/libavcodec/ac3dec.h +++ b/libavcodec/ac3dec.h @@ -55,6 +55,7 @@ #include "get_bits.h" #include "dsputil.h" #include "fft.h" +#include "fmtconvert.h" /* override ac3.h to include coupling channel */ #undef AC3_MAX_CHANNELS @@ -190,6 +191,7 @@ typedef struct { ///@defgroup opt optimization DSPContext dsp; ///< for optimization + FmtConvertContext fmt_conv; ///< optimized conversion functions float mul_bias; ///< scaling for float_to_int16 conversion ///@} diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index 4c30e0ab9f..014456ee32 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -9,6 +9,7 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o OBJS += arm/dsputil_init_arm.o \ arm/dsputil_arm.o \ arm/fft_init_arm.o \ + arm/fmtconvert_init_arm.o \ arm/jrevdct_arm.o \ arm/mpegvideo_arm.o \ arm/simple_idct_arm.o \ @@ -22,8 +23,11 @@ OBJS-$(HAVE_ARMV6) += arm/dsputil_init_armv6.o \ arm/dsputil_armv6.o \ arm/simple_idct_armv6.o \ +VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o \ + OBJS-$(HAVE_ARMVFP) += 
arm/dsputil_vfp.o \ arm/dsputil_init_vfp.o \ + $(VFP-OBJS-yes) OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \ arm/mpegvideo_iwmmxt.o \ @@ -52,6 +56,7 @@ NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_neon.o \ OBJS-$(HAVE_NEON) += arm/dsputil_init_neon.o \ arm/dsputil_neon.o \ + arm/fmtconvert_neon.o \ arm/int_neon.o \ arm/mpegvideo_neon.o \ arm/simple_idct_neon.o \ diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c index 67982048f9..76ae632273 100644 --- a/libavcodec/arm/dsputil_init_neon.c +++ b/libavcodec/arm/dsputil_init_neon.c @@ -153,8 +153,6 @@ void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul, int len); void ff_butterflies_float_neon(float *v1, float *v2, int len); float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len); -void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src, - float mul, int len); void ff_vector_fmul_reverse_neon(float *dst, const float *src0, const float *src1, int len); void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, @@ -162,8 +160,6 @@ void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, void ff_vector_clipf_neon(float *dst, const float *src, float min, float max, int len); -void ff_float_to_int16_neon(int16_t *, const float *, long); -void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize); @@ -308,7 +304,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) c->vector_fmul_scalar = ff_vector_fmul_scalar_neon; c->butterflies_float = ff_butterflies_float_neon; c->scalarproduct_float = ff_scalarproduct_float_neon; - c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon; c->vector_fmul_reverse = ff_vector_fmul_reverse_neon; c->vector_fmul_add = ff_vector_fmul_add_neon; c->vector_clipf = ff_vector_clipf_neon; @@ -319,11 +314,6 @@ void ff_dsputil_init_neon(DSPContext *c, 
AVCodecContext *avctx) c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon; c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon; - if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { - c->float_to_int16 = ff_float_to_int16_neon; - c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; - } - if (CONFIG_VORBIS_DECODER) c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon; diff --git a/libavcodec/arm/dsputil_init_vfp.c b/libavcodec/arm/dsputil_init_vfp.c index 76ef6b4171..bd52315934 100644 --- a/libavcodec/arm/dsputil_init_vfp.c +++ b/libavcodec/arm/dsputil_init_vfp.c @@ -25,13 +25,9 @@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len); void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, const float *src1, int len); -void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx) { c->vector_fmul = ff_vector_fmul_vfp; c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; -#if HAVE_ARMV6 - c->float_to_int16 = ff_float_to_int16_vfp; -#endif } diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S index 8329f6cc57..05a911502b 100644 --- a/libavcodec/arm/dsputil_neon.S +++ b/libavcodec/arm/dsputil_neon.S @@ -400,343 +400,6 @@ function ff_add_pixels_clamped_neon, export=1 bx lr endfunc -function ff_float_to_int16_neon, export=1 - subs r2, r2, #8 - vld1.64 {d0-d1}, [r1,:128]! - vcvt.s32.f32 q8, q0, #16 - vld1.64 {d2-d3}, [r1,:128]! - vcvt.s32.f32 q9, q1, #16 - beq 3f - bics ip, r2, #15 - beq 2f -1: subs ip, ip, #16 - vshrn.s32 d4, q8, #16 - vld1.64 {d0-d1}, [r1,:128]! - vcvt.s32.f32 q0, q0, #16 - vshrn.s32 d5, q9, #16 - vld1.64 {d2-d3}, [r1,:128]! - vcvt.s32.f32 q1, q1, #16 - vshrn.s32 d6, q0, #16 - vst1.64 {d4-d5}, [r0,:128]! - vshrn.s32 d7, q1, #16 - vld1.64 {d16-d17},[r1,:128]! - vcvt.s32.f32 q8, q8, #16 - vld1.64 {d18-d19},[r1,:128]! - vcvt.s32.f32 q9, q9, #16 - vst1.64 {d6-d7}, [r0,:128]! 
- bne 1b - ands r2, r2, #15 - beq 3f -2: vld1.64 {d0-d1}, [r1,:128]! - vshrn.s32 d4, q8, #16 - vcvt.s32.f32 q0, q0, #16 - vld1.64 {d2-d3}, [r1,:128]! - vshrn.s32 d5, q9, #16 - vcvt.s32.f32 q1, q1, #16 - vshrn.s32 d6, q0, #16 - vst1.64 {d4-d5}, [r0,:128]! - vshrn.s32 d7, q1, #16 - vst1.64 {d6-d7}, [r0,:128]! - bx lr -3: vshrn.s32 d4, q8, #16 - vshrn.s32 d5, q9, #16 - vst1.64 {d4-d5}, [r0,:128]! - bx lr -endfunc - -function ff_float_to_int16_interleave_neon, export=1 - cmp r3, #2 - ldrlt r1, [r1] - blt ff_float_to_int16_neon - bne 4f - - ldr r3, [r1] - ldr r1, [r1, #4] - - subs r2, r2, #8 - vld1.64 {d0-d1}, [r3,:128]! - vcvt.s32.f32 q8, q0, #16 - vld1.64 {d2-d3}, [r3,:128]! - vcvt.s32.f32 q9, q1, #16 - vld1.64 {d20-d21},[r1,:128]! - vcvt.s32.f32 q10, q10, #16 - vld1.64 {d22-d23},[r1,:128]! - vcvt.s32.f32 q11, q11, #16 - beq 3f - bics ip, r2, #15 - beq 2f -1: subs ip, ip, #16 - vld1.64 {d0-d1}, [r3,:128]! - vcvt.s32.f32 q0, q0, #16 - vsri.32 q10, q8, #16 - vld1.64 {d2-d3}, [r3,:128]! - vcvt.s32.f32 q1, q1, #16 - vld1.64 {d24-d25},[r1,:128]! - vcvt.s32.f32 q12, q12, #16 - vld1.64 {d26-d27},[r1,:128]! - vsri.32 q11, q9, #16 - vst1.64 {d20-d21},[r0,:128]! - vcvt.s32.f32 q13, q13, #16 - vst1.64 {d22-d23},[r0,:128]! - vsri.32 q12, q0, #16 - vld1.64 {d16-d17},[r3,:128]! - vsri.32 q13, q1, #16 - vst1.64 {d24-d25},[r0,:128]! - vcvt.s32.f32 q8, q8, #16 - vld1.64 {d18-d19},[r3,:128]! - vcvt.s32.f32 q9, q9, #16 - vld1.64 {d20-d21},[r1,:128]! - vcvt.s32.f32 q10, q10, #16 - vld1.64 {d22-d23},[r1,:128]! - vcvt.s32.f32 q11, q11, #16 - vst1.64 {d26-d27},[r0,:128]! - bne 1b - ands r2, r2, #15 - beq 3f -2: vsri.32 q10, q8, #16 - vld1.64 {d0-d1}, [r3,:128]! - vcvt.s32.f32 q0, q0, #16 - vld1.64 {d2-d3}, [r3,:128]! - vcvt.s32.f32 q1, q1, #16 - vld1.64 {d24-d25},[r1,:128]! - vcvt.s32.f32 q12, q12, #16 - vsri.32 q11, q9, #16 - vld1.64 {d26-d27},[r1,:128]! - vcvt.s32.f32 q13, q13, #16 - vst1.64 {d20-d21},[r0,:128]! - vsri.32 q12, q0, #16 - vst1.64 {d22-d23},[r0,:128]! 
- vsri.32 q13, q1, #16 - vst1.64 {d24-d27},[r0,:128]! - bx lr -3: vsri.32 q10, q8, #16 - vsri.32 q11, q9, #16 - vst1.64 {d20-d23},[r0,:128]! - bx lr - -4: push {r4-r8,lr} - cmp r3, #4 - lsl ip, r3, #1 - blt 4f - - @ 4 channels -5: ldmia r1!, {r4-r7} - mov lr, r2 - mov r8, r0 - vld1.64 {d16-d17},[r4,:128]! - vcvt.s32.f32 q8, q8, #16 - vld1.64 {d18-d19},[r5,:128]! - vcvt.s32.f32 q9, q9, #16 - vld1.64 {d20-d21},[r6,:128]! - vcvt.s32.f32 q10, q10, #16 - vld1.64 {d22-d23},[r7,:128]! - vcvt.s32.f32 q11, q11, #16 -6: subs lr, lr, #8 - vld1.64 {d0-d1}, [r4,:128]! - vcvt.s32.f32 q0, q0, #16 - vsri.32 q9, q8, #16 - vld1.64 {d2-d3}, [r5,:128]! - vcvt.s32.f32 q1, q1, #16 - vsri.32 q11, q10, #16 - vld1.64 {d4-d5}, [r6,:128]! - vcvt.s32.f32 q2, q2, #16 - vzip.32 d18, d22 - vld1.64 {d6-d7}, [r7,:128]! - vcvt.s32.f32 q3, q3, #16 - vzip.32 d19, d23 - vst1.64 {d18}, [r8], ip - vsri.32 q1, q0, #16 - vst1.64 {d22}, [r8], ip - vsri.32 q3, q2, #16 - vst1.64 {d19}, [r8], ip - vzip.32 d2, d6 - vst1.64 {d23}, [r8], ip - vzip.32 d3, d7 - beq 7f - vld1.64 {d16-d17},[r4,:128]! - vcvt.s32.f32 q8, q8, #16 - vst1.64 {d2}, [r8], ip - vld1.64 {d18-d19},[r5,:128]! - vcvt.s32.f32 q9, q9, #16 - vst1.64 {d6}, [r8], ip - vld1.64 {d20-d21},[r6,:128]! - vcvt.s32.f32 q10, q10, #16 - vst1.64 {d3}, [r8], ip - vld1.64 {d22-d23},[r7,:128]! - vcvt.s32.f32 q11, q11, #16 - vst1.64 {d7}, [r8], ip - b 6b -7: vst1.64 {d2}, [r8], ip - vst1.64 {d6}, [r8], ip - vst1.64 {d3}, [r8], ip - vst1.64 {d7}, [r8], ip - subs r3, r3, #4 - popeq {r4-r8,pc} - cmp r3, #4 - add r0, r0, #8 - bge 5b - - @ 2 channels -4: cmp r3, #2 - blt 4f - ldmia r1!, {r4-r5} - mov lr, r2 - mov r8, r0 - tst lr, #8 - vld1.64 {d16-d17},[r4,:128]! - vcvt.s32.f32 q8, q8, #16 - vld1.64 {d18-d19},[r5,:128]! - vcvt.s32.f32 q9, q9, #16 - vld1.64 {d20-d21},[r4,:128]! - vcvt.s32.f32 q10, q10, #16 - vld1.64 {d22-d23},[r5,:128]! 
- vcvt.s32.f32 q11, q11, #16 - beq 6f - subs lr, lr, #8 - beq 7f - vsri.32 d18, d16, #16 - vsri.32 d19, d17, #16 - vld1.64 {d16-d17},[r4,:128]! - vcvt.s32.f32 q8, q8, #16 - vst1.32 {d18[0]}, [r8], ip - vsri.32 d22, d20, #16 - vst1.32 {d18[1]}, [r8], ip - vsri.32 d23, d21, #16 - vst1.32 {d19[0]}, [r8], ip - vst1.32 {d19[1]}, [r8], ip - vld1.64 {d18-d19},[r5,:128]! - vcvt.s32.f32 q9, q9, #16 - vst1.32 {d22[0]}, [r8], ip - vst1.32 {d22[1]}, [r8], ip - vld1.64 {d20-d21},[r4,:128]! - vcvt.s32.f32 q10, q10, #16 - vst1.32 {d23[0]}, [r8], ip - vst1.32 {d23[1]}, [r8], ip - vld1.64 {d22-d23},[r5,:128]! - vcvt.s32.f32 q11, q11, #16 -6: subs lr, lr, #16 - vld1.64 {d0-d1}, [r4,:128]! - vcvt.s32.f32 q0, q0, #16 - vsri.32 d18, d16, #16 - vld1.64 {d2-d3}, [r5,:128]! - vcvt.s32.f32 q1, q1, #16 - vsri.32 d19, d17, #16 - vld1.64 {d4-d5}, [r4,:128]! - vcvt.s32.f32 q2, q2, #16 - vld1.64 {d6-d7}, [r5,:128]! - vcvt.s32.f32 q3, q3, #16 - vst1.32 {d18[0]}, [r8], ip - vsri.32 d22, d20, #16 - vst1.32 {d18[1]}, [r8], ip - vsri.32 d23, d21, #16 - vst1.32 {d19[0]}, [r8], ip - vsri.32 d2, d0, #16 - vst1.32 {d19[1]}, [r8], ip - vsri.32 d3, d1, #16 - vst1.32 {d22[0]}, [r8], ip - vsri.32 d6, d4, #16 - vst1.32 {d22[1]}, [r8], ip - vsri.32 d7, d5, #16 - vst1.32 {d23[0]}, [r8], ip - vst1.32 {d23[1]}, [r8], ip - beq 6f - vld1.64 {d16-d17},[r4,:128]! - vcvt.s32.f32 q8, q8, #16 - vst1.32 {d2[0]}, [r8], ip - vst1.32 {d2[1]}, [r8], ip - vld1.64 {d18-d19},[r5,:128]! - vcvt.s32.f32 q9, q9, #16 - vst1.32 {d3[0]}, [r8], ip - vst1.32 {d3[1]}, [r8], ip - vld1.64 {d20-d21},[r4,:128]! - vcvt.s32.f32 q10, q10, #16 - vst1.32 {d6[0]}, [r8], ip - vst1.32 {d6[1]}, [r8], ip - vld1.64 {d22-d23},[r5,:128]! 
- vcvt.s32.f32 q11, q11, #16 - vst1.32 {d7[0]}, [r8], ip - vst1.32 {d7[1]}, [r8], ip - bgt 6b -6: vst1.32 {d2[0]}, [r8], ip - vst1.32 {d2[1]}, [r8], ip - vst1.32 {d3[0]}, [r8], ip - vst1.32 {d3[1]}, [r8], ip - vst1.32 {d6[0]}, [r8], ip - vst1.32 {d6[1]}, [r8], ip - vst1.32 {d7[0]}, [r8], ip - vst1.32 {d7[1]}, [r8], ip - b 8f -7: vsri.32 d18, d16, #16 - vsri.32 d19, d17, #16 - vst1.32 {d18[0]}, [r8], ip - vsri.32 d22, d20, #16 - vst1.32 {d18[1]}, [r8], ip - vsri.32 d23, d21, #16 - vst1.32 {d19[0]}, [r8], ip - vst1.32 {d19[1]}, [r8], ip - vst1.32 {d22[0]}, [r8], ip - vst1.32 {d22[1]}, [r8], ip - vst1.32 {d23[0]}, [r8], ip - vst1.32 {d23[1]}, [r8], ip -8: subs r3, r3, #2 - add r0, r0, #4 - popeq {r4-r8,pc} - - @ 1 channel -4: ldr r4, [r1],#4 - tst r2, #8 - mov lr, r2 - mov r5, r0 - vld1.64 {d0-d1}, [r4,:128]! - vcvt.s32.f32 q0, q0, #16 - vld1.64 {d2-d3}, [r4,:128]! - vcvt.s32.f32 q1, q1, #16 - bne 8f -6: subs lr, lr, #16 - vld1.64 {d4-d5}, [r4,:128]! - vcvt.s32.f32 q2, q2, #16 - vld1.64 {d6-d7}, [r4,:128]! - vcvt.s32.f32 q3, q3, #16 - vst1.16 {d0[1]}, [r5,:16], ip - vst1.16 {d0[3]}, [r5,:16], ip - vst1.16 {d1[1]}, [r5,:16], ip - vst1.16 {d1[3]}, [r5,:16], ip - vst1.16 {d2[1]}, [r5,:16], ip - vst1.16 {d2[3]}, [r5,:16], ip - vst1.16 {d3[1]}, [r5,:16], ip - vst1.16 {d3[3]}, [r5,:16], ip - beq 7f - vld1.64 {d0-d1}, [r4,:128]! - vcvt.s32.f32 q0, q0, #16 - vld1.64 {d2-d3}, [r4,:128]! 
- vcvt.s32.f32 q1, q1, #16 -7: vst1.16 {d4[1]}, [r5,:16], ip - vst1.16 {d4[3]}, [r5,:16], ip - vst1.16 {d5[1]}, [r5,:16], ip - vst1.16 {d5[3]}, [r5,:16], ip - vst1.16 {d6[1]}, [r5,:16], ip - vst1.16 {d6[3]}, [r5,:16], ip - vst1.16 {d7[1]}, [r5,:16], ip - vst1.16 {d7[3]}, [r5,:16], ip - bgt 6b - pop {r4-r8,pc} -8: subs lr, lr, #8 - vst1.16 {d0[1]}, [r5,:16], ip - vst1.16 {d0[3]}, [r5,:16], ip - vst1.16 {d1[1]}, [r5,:16], ip - vst1.16 {d1[3]}, [r5,:16], ip - vst1.16 {d2[1]}, [r5,:16], ip - vst1.16 {d2[3]}, [r5,:16], ip - vst1.16 {d3[1]}, [r5,:16], ip - vst1.16 {d3[3]}, [r5,:16], ip - popeq {r4-r8,pc} - vld1.64 {d0-d1}, [r4,:128]! - vcvt.s32.f32 q0, q0, #16 - vld1.64 {d2-d3}, [r4,:128]! - vcvt.s32.f32 q1, q1, #16 - b 6b -endfunc - function ff_vector_fmul_neon, export=1 subs r3, r3, #8 vld1.64 {d0-d3}, [r1,:128]! @@ -1050,34 +713,6 @@ NOVFP vmov.32 r0, d0[0] bx lr endfunc -function ff_int32_to_float_fmul_scalar_neon, export=1 -VFP vdup.32 q0, d0[0] -VFP len .req r2 -NOVFP vdup.32 q0, r2 -NOVFP len .req r3 - - vld1.32 {q1},[r1,:128]! - vcvt.f32.s32 q3, q1 - vld1.32 {q2},[r1,:128]! - vcvt.f32.s32 q8, q2 -1: subs len, len, #8 - pld [r1, #16] - vmul.f32 q9, q3, q0 - vmul.f32 q10, q8, q0 - beq 2f - vld1.32 {q1},[r1,:128]! - vcvt.f32.s32 q3, q1 - vld1.32 {q2},[r1,:128]! - vcvt.f32.s32 q8, q2 - vst1.32 {q9}, [r0,:128]! - vst1.32 {q10},[r0,:128]! - b 1b -2: vst1.32 {q9}, [r0,:128]! - vst1.32 {q10},[r0,:128]! - bx lr - .unreq len -endfunc - function ff_vector_fmul_reverse_neon, export=1 add r2, r2, r3, lsl #2 sub r2, r2, #32 diff --git a/libavcodec/arm/dsputil_vfp.S b/libavcodec/arm/dsputil_vfp.S index a65b69e20a..197d500819 100644 --- a/libavcodec/arm/dsputil_vfp.S +++ b/libavcodec/arm/dsputil_vfp.S @@ -131,58 +131,3 @@ function ff_vector_fmul_reverse_vfp, export=1 vpop {d8-d15} bx lr endfunc - -#if HAVE_ARMV6 -/** - * ARM VFP optimized float to int16 conversion. 
- * Assume that len is a positive number and is multiple of 8, destination - * buffer is at least 4 bytes aligned (8 bytes alignment is better for - * performance), little endian byte sex - */ -@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len) -function ff_float_to_int16_vfp, export=1 - push {r4-r8,lr} - vpush {d8-d11} - vldmia r1!, {s16-s23} - vcvt.s32.f32 s0, s16 - vcvt.s32.f32 s1, s17 - vcvt.s32.f32 s2, s18 - vcvt.s32.f32 s3, s19 - vcvt.s32.f32 s4, s20 - vcvt.s32.f32 s5, s21 - vcvt.s32.f32 s6, s22 - vcvt.s32.f32 s7, s23 -1: - subs r2, r2, #8 - vmov r3, r4, s0, s1 - vmov r5, r6, s2, s3 - vmov r7, r8, s4, s5 - vmov ip, lr, s6, s7 - vldmiagt r1!, {s16-s23} - ssat r4, #16, r4 - ssat r3, #16, r3 - ssat r6, #16, r6 - ssat r5, #16, r5 - pkhbt r3, r3, r4, lsl #16 - pkhbt r4, r5, r6, lsl #16 - vcvtgt.s32.f32 s0, s16 - vcvtgt.s32.f32 s1, s17 - vcvtgt.s32.f32 s2, s18 - vcvtgt.s32.f32 s3, s19 - vcvtgt.s32.f32 s4, s20 - vcvtgt.s32.f32 s5, s21 - vcvtgt.s32.f32 s6, s22 - vcvtgt.s32.f32 s7, s23 - ssat r8, #16, r8 - ssat r7, #16, r7 - ssat lr, #16, lr - ssat ip, #16, ip - pkhbt r5, r7, r8, lsl #16 - pkhbt r6, ip, lr, lsl #16 - stmia r0!, {r3-r6} - bgt 1b - - vpop {d8-d11} - pop {r4-r8,pc} -endfunc -#endif diff --git a/libavcodec/arm/fmtconvert_init_arm.c b/libavcodec/arm/fmtconvert_init_arm.c new file mode 100644 index 0000000000..4b6e3939f5 --- /dev/null +++ b/libavcodec/arm/fmtconvert_init_arm.c @@ -0,0 +1,48 @@ +/* + * ARM optimized Format Conversion Utils + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavcodec/avcodec.h" +#include "libavcodec/fmtconvert.h" + +void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src, + float mul, int len); + +void ff_float_to_int16_neon(int16_t *dst, const float *src, long len); +void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); + +void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); + +void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx) +{ + if (HAVE_ARMVFP && HAVE_ARMV6) { + c->float_to_int16 = ff_float_to_int16_vfp; + } + + if (HAVE_NEON) { + c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon; + + if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { + c->float_to_int16 = ff_float_to_int16_neon; + c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; + } + } +} diff --git a/libavcodec/arm/fmtconvert_neon.S b/libavcodec/arm/fmtconvert_neon.S new file mode 100644 index 0000000000..359e57e40b --- /dev/null +++ b/libavcodec/arm/fmtconvert_neon.S @@ -0,0 +1,391 @@ +/* + * ARM NEON optimised Format Conversion Utils + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "asm.S" + + preserve8 + .text + +function ff_float_to_int16_neon, export=1 + subs r2, r2, #8 + vld1.64 {d0-d1}, [r1,:128]! + vcvt.s32.f32 q8, q0, #16 + vld1.64 {d2-d3}, [r1,:128]! + vcvt.s32.f32 q9, q1, #16 + beq 3f + bics ip, r2, #15 + beq 2f +1: subs ip, ip, #16 + vshrn.s32 d4, q8, #16 + vld1.64 {d0-d1}, [r1,:128]! + vcvt.s32.f32 q0, q0, #16 + vshrn.s32 d5, q9, #16 + vld1.64 {d2-d3}, [r1,:128]! + vcvt.s32.f32 q1, q1, #16 + vshrn.s32 d6, q0, #16 + vst1.64 {d4-d5}, [r0,:128]! + vshrn.s32 d7, q1, #16 + vld1.64 {d16-d17},[r1,:128]! + vcvt.s32.f32 q8, q8, #16 + vld1.64 {d18-d19},[r1,:128]! + vcvt.s32.f32 q9, q9, #16 + vst1.64 {d6-d7}, [r0,:128]! + bne 1b + ands r2, r2, #15 + beq 3f +2: vld1.64 {d0-d1}, [r1,:128]! + vshrn.s32 d4, q8, #16 + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r1,:128]! + vshrn.s32 d5, q9, #16 + vcvt.s32.f32 q1, q1, #16 + vshrn.s32 d6, q0, #16 + vst1.64 {d4-d5}, [r0,:128]! + vshrn.s32 d7, q1, #16 + vst1.64 {d6-d7}, [r0,:128]! + bx lr +3: vshrn.s32 d4, q8, #16 + vshrn.s32 d5, q9, #16 + vst1.64 {d4-d5}, [r0,:128]! + bx lr +endfunc + +function ff_float_to_int16_interleave_neon, export=1 + cmp r3, #2 + ldrlt r1, [r1] + blt ff_float_to_int16_neon + bne 4f + + ldr r3, [r1] + ldr r1, [r1, #4] + + subs r2, r2, #8 + vld1.64 {d0-d1}, [r3,:128]! + vcvt.s32.f32 q8, q0, #16 + vld1.64 {d2-d3}, [r3,:128]! + vcvt.s32.f32 q9, q1, #16 + vld1.64 {d20-d21},[r1,:128]! + vcvt.s32.f32 q10, q10, #16 + vld1.64 {d22-d23},[r1,:128]! + vcvt.s32.f32 q11, q11, #16 + beq 3f + bics ip, r2, #15 + beq 2f +1: subs ip, ip, #16 + vld1.64 {d0-d1}, [r3,:128]! + vcvt.s32.f32 q0, q0, #16 + vsri.32 q10, q8, #16 + vld1.64 {d2-d3}, [r3,:128]! + vcvt.s32.f32 q1, q1, #16 + vld1.64 {d24-d25},[r1,:128]! 
+ vcvt.s32.f32 q12, q12, #16 + vld1.64 {d26-d27},[r1,:128]! + vsri.32 q11, q9, #16 + vst1.64 {d20-d21},[r0,:128]! + vcvt.s32.f32 q13, q13, #16 + vst1.64 {d22-d23},[r0,:128]! + vsri.32 q12, q0, #16 + vld1.64 {d16-d17},[r3,:128]! + vsri.32 q13, q1, #16 + vst1.64 {d24-d25},[r0,:128]! + vcvt.s32.f32 q8, q8, #16 + vld1.64 {d18-d19},[r3,:128]! + vcvt.s32.f32 q9, q9, #16 + vld1.64 {d20-d21},[r1,:128]! + vcvt.s32.f32 q10, q10, #16 + vld1.64 {d22-d23},[r1,:128]! + vcvt.s32.f32 q11, q11, #16 + vst1.64 {d26-d27},[r0,:128]! + bne 1b + ands r2, r2, #15 + beq 3f +2: vsri.32 q10, q8, #16 + vld1.64 {d0-d1}, [r3,:128]! + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r3,:128]! + vcvt.s32.f32 q1, q1, #16 + vld1.64 {d24-d25},[r1,:128]! + vcvt.s32.f32 q12, q12, #16 + vsri.32 q11, q9, #16 + vld1.64 {d26-d27},[r1,:128]! + vcvt.s32.f32 q13, q13, #16 + vst1.64 {d20-d21},[r0,:128]! + vsri.32 q12, q0, #16 + vst1.64 {d22-d23},[r0,:128]! + vsri.32 q13, q1, #16 + vst1.64 {d24-d27},[r0,:128]! + bx lr +3: vsri.32 q10, q8, #16 + vsri.32 q11, q9, #16 + vst1.64 {d20-d23},[r0,:128]! + bx lr + +4: push {r4-r8,lr} + cmp r3, #4 + lsl ip, r3, #1 + blt 4f + + @ 4 channels +5: ldmia r1!, {r4-r7} + mov lr, r2 + mov r8, r0 + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vld1.64 {d20-d21},[r6,:128]! + vcvt.s32.f32 q10, q10, #16 + vld1.64 {d22-d23},[r7,:128]! + vcvt.s32.f32 q11, q11, #16 +6: subs lr, lr, #8 + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vsri.32 q9, q8, #16 + vld1.64 {d2-d3}, [r5,:128]! + vcvt.s32.f32 q1, q1, #16 + vsri.32 q11, q10, #16 + vld1.64 {d4-d5}, [r6,:128]! + vcvt.s32.f32 q2, q2, #16 + vzip.32 d18, d22 + vld1.64 {d6-d7}, [r7,:128]! + vcvt.s32.f32 q3, q3, #16 + vzip.32 d19, d23 + vst1.64 {d18}, [r8], ip + vsri.32 q1, q0, #16 + vst1.64 {d22}, [r8], ip + vsri.32 q3, q2, #16 + vst1.64 {d19}, [r8], ip + vzip.32 d2, d6 + vst1.64 {d23}, [r8], ip + vzip.32 d3, d7 + beq 7f + vld1.64 {d16-d17},[r4,:128]! 
+ vcvt.s32.f32 q8, q8, #16 + vst1.64 {d2}, [r8], ip + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vst1.64 {d6}, [r8], ip + vld1.64 {d20-d21},[r6,:128]! + vcvt.s32.f32 q10, q10, #16 + vst1.64 {d3}, [r8], ip + vld1.64 {d22-d23},[r7,:128]! + vcvt.s32.f32 q11, q11, #16 + vst1.64 {d7}, [r8], ip + b 6b +7: vst1.64 {d2}, [r8], ip + vst1.64 {d6}, [r8], ip + vst1.64 {d3}, [r8], ip + vst1.64 {d7}, [r8], ip + subs r3, r3, #4 + popeq {r4-r8,pc} + cmp r3, #4 + add r0, r0, #8 + bge 5b + + @ 2 channels +4: cmp r3, #2 + blt 4f + ldmia r1!, {r4-r5} + mov lr, r2 + mov r8, r0 + tst lr, #8 + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vld1.64 {d20-d21},[r4,:128]! + vcvt.s32.f32 q10, q10, #16 + vld1.64 {d22-d23},[r5,:128]! + vcvt.s32.f32 q11, q11, #16 + beq 6f + subs lr, lr, #8 + beq 7f + vsri.32 d18, d16, #16 + vsri.32 d19, d17, #16 + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vst1.32 {d18[0]}, [r8], ip + vsri.32 d22, d20, #16 + vst1.32 {d18[1]}, [r8], ip + vsri.32 d23, d21, #16 + vst1.32 {d19[0]}, [r8], ip + vst1.32 {d19[1]}, [r8], ip + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vst1.32 {d22[0]}, [r8], ip + vst1.32 {d22[1]}, [r8], ip + vld1.64 {d20-d21},[r4,:128]! + vcvt.s32.f32 q10, q10, #16 + vst1.32 {d23[0]}, [r8], ip + vst1.32 {d23[1]}, [r8], ip + vld1.64 {d22-d23},[r5,:128]! + vcvt.s32.f32 q11, q11, #16 +6: subs lr, lr, #16 + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vsri.32 d18, d16, #16 + vld1.64 {d2-d3}, [r5,:128]! + vcvt.s32.f32 q1, q1, #16 + vsri.32 d19, d17, #16 + vld1.64 {d4-d5}, [r4,:128]! + vcvt.s32.f32 q2, q2, #16 + vld1.64 {d6-d7}, [r5,:128]! 
+ vcvt.s32.f32 q3, q3, #16 + vst1.32 {d18[0]}, [r8], ip + vsri.32 d22, d20, #16 + vst1.32 {d18[1]}, [r8], ip + vsri.32 d23, d21, #16 + vst1.32 {d19[0]}, [r8], ip + vsri.32 d2, d0, #16 + vst1.32 {d19[1]}, [r8], ip + vsri.32 d3, d1, #16 + vst1.32 {d22[0]}, [r8], ip + vsri.32 d6, d4, #16 + vst1.32 {d22[1]}, [r8], ip + vsri.32 d7, d5, #16 + vst1.32 {d23[0]}, [r8], ip + vst1.32 {d23[1]}, [r8], ip + beq 6f + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vst1.32 {d2[0]}, [r8], ip + vst1.32 {d2[1]}, [r8], ip + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vst1.32 {d3[0]}, [r8], ip + vst1.32 {d3[1]}, [r8], ip + vld1.64 {d20-d21},[r4,:128]! + vcvt.s32.f32 q10, q10, #16 + vst1.32 {d6[0]}, [r8], ip + vst1.32 {d6[1]}, [r8], ip + vld1.64 {d22-d23},[r5,:128]! + vcvt.s32.f32 q11, q11, #16 + vst1.32 {d7[0]}, [r8], ip + vst1.32 {d7[1]}, [r8], ip + bgt 6b +6: vst1.32 {d2[0]}, [r8], ip + vst1.32 {d2[1]}, [r8], ip + vst1.32 {d3[0]}, [r8], ip + vst1.32 {d3[1]}, [r8], ip + vst1.32 {d6[0]}, [r8], ip + vst1.32 {d6[1]}, [r8], ip + vst1.32 {d7[0]}, [r8], ip + vst1.32 {d7[1]}, [r8], ip + b 8f +7: vsri.32 d18, d16, #16 + vsri.32 d19, d17, #16 + vst1.32 {d18[0]}, [r8], ip + vsri.32 d22, d20, #16 + vst1.32 {d18[1]}, [r8], ip + vsri.32 d23, d21, #16 + vst1.32 {d19[0]}, [r8], ip + vst1.32 {d19[1]}, [r8], ip + vst1.32 {d22[0]}, [r8], ip + vst1.32 {d22[1]}, [r8], ip + vst1.32 {d23[0]}, [r8], ip + vst1.32 {d23[1]}, [r8], ip +8: subs r3, r3, #2 + add r0, r0, #4 + popeq {r4-r8,pc} + + @ 1 channel +4: ldr r4, [r1],#4 + tst r2, #8 + mov lr, r2 + mov r5, r0 + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r4,:128]! + vcvt.s32.f32 q1, q1, #16 + bne 8f +6: subs lr, lr, #16 + vld1.64 {d4-d5}, [r4,:128]! + vcvt.s32.f32 q2, q2, #16 + vld1.64 {d6-d7}, [r4,:128]! 
+ vcvt.s32.f32 q3, q3, #16 + vst1.16 {d0[1]}, [r5,:16], ip + vst1.16 {d0[3]}, [r5,:16], ip + vst1.16 {d1[1]}, [r5,:16], ip + vst1.16 {d1[3]}, [r5,:16], ip + vst1.16 {d2[1]}, [r5,:16], ip + vst1.16 {d2[3]}, [r5,:16], ip + vst1.16 {d3[1]}, [r5,:16], ip + vst1.16 {d3[3]}, [r5,:16], ip + beq 7f + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r4,:128]! + vcvt.s32.f32 q1, q1, #16 +7: vst1.16 {d4[1]}, [r5,:16], ip + vst1.16 {d4[3]}, [r5,:16], ip + vst1.16 {d5[1]}, [r5,:16], ip + vst1.16 {d5[3]}, [r5,:16], ip + vst1.16 {d6[1]}, [r5,:16], ip + vst1.16 {d6[3]}, [r5,:16], ip + vst1.16 {d7[1]}, [r5,:16], ip + vst1.16 {d7[3]}, [r5,:16], ip + bgt 6b + pop {r4-r8,pc} +8: subs lr, lr, #8 + vst1.16 {d0[1]}, [r5,:16], ip + vst1.16 {d0[3]}, [r5,:16], ip + vst1.16 {d1[1]}, [r5,:16], ip + vst1.16 {d1[3]}, [r5,:16], ip + vst1.16 {d2[1]}, [r5,:16], ip + vst1.16 {d2[3]}, [r5,:16], ip + vst1.16 {d3[1]}, [r5,:16], ip + vst1.16 {d3[3]}, [r5,:16], ip + popeq {r4-r8,pc} + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r4,:128]! + vcvt.s32.f32 q1, q1, #16 + b 6b +endfunc + +function ff_int32_to_float_fmul_scalar_neon, export=1 +VFP vdup.32 q0, d0[0] +VFP len .req r2 +NOVFP vdup.32 q0, r2 +NOVFP len .req r3 + + vld1.32 {q1},[r1,:128]! + vcvt.f32.s32 q3, q1 + vld1.32 {q2},[r1,:128]! + vcvt.f32.s32 q8, q2 +1: subs len, len, #8 + pld [r1, #16] + vmul.f32 q9, q3, q0 + vmul.f32 q10, q8, q0 + beq 2f + vld1.32 {q1},[r1,:128]! + vcvt.f32.s32 q3, q1 + vld1.32 {q2},[r1,:128]! + vcvt.f32.s32 q8, q2 + vst1.32 {q9}, [r0,:128]! + vst1.32 {q10},[r0,:128]! + b 1b +2: vst1.32 {q9}, [r0,:128]! + vst1.32 {q10},[r0,:128]! + bx lr + .unreq len +endfunc diff --git a/libavcodec/arm/fmtconvert_vfp.S b/libavcodec/arm/fmtconvert_vfp.S new file mode 100644 index 0000000000..1d19e7758b --- /dev/null +++ b/libavcodec/arm/fmtconvert_vfp.S @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "asm.S" + + .syntax unified + +/** + * ARM VFP optimized float to int16 conversion. + * Assume that len is a positive number and is multiple of 8, destination + * buffer is at least 4 bytes aligned (8 bytes alignment is better for + * performance), little endian byte sex + */ +@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len) +function ff_float_to_int16_vfp, export=1 + push {r4-r8,lr} + vpush {d8-d11} + vldmia r1!, {s16-s23} + vcvt.s32.f32 s0, s16 + vcvt.s32.f32 s1, s17 + vcvt.s32.f32 s2, s18 + vcvt.s32.f32 s3, s19 + vcvt.s32.f32 s4, s20 + vcvt.s32.f32 s5, s21 + vcvt.s32.f32 s6, s22 + vcvt.s32.f32 s7, s23 +1: + subs r2, r2, #8 + vmov r3, r4, s0, s1 + vmov r5, r6, s2, s3 + vmov r7, r8, s4, s5 + vmov ip, lr, s6, s7 + vldmiagt r1!, {s16-s23} + ssat r4, #16, r4 + ssat r3, #16, r3 + ssat r6, #16, r6 + ssat r5, #16, r5 + pkhbt r3, r3, r4, lsl #16 + pkhbt r4, r5, r6, lsl #16 + vcvtgt.s32.f32 s0, s16 + vcvtgt.s32.f32 s1, s17 + vcvtgt.s32.f32 s2, s18 + vcvtgt.s32.f32 s3, s19 + vcvtgt.s32.f32 s4, s20 + vcvtgt.s32.f32 s5, s21 + vcvtgt.s32.f32 s6, s22 + vcvtgt.s32.f32 s7, s23 + ssat r8, #16, r8 + ssat r7, #16, r7 + ssat lr, #16, lr + ssat ip, #16, ip + pkhbt r5, r7, r8, lsl #16 + pkhbt r6, ip, lr, 
lsl #16 + stmia r0!, {r3-r6} + bgt 1b + + vpop {d8-d11} + pop {r4-r8,pc} +endfunc diff --git a/libavcodec/binkaudio.c b/libavcodec/binkaudio.c index ae2f6c88b0..53484654db 100644 --- a/libavcodec/binkaudio.c +++ b/libavcodec/binkaudio.c @@ -33,6 +33,7 @@ #include "get_bits.h" #include "dsputil.h" #include "fft.h" +#include "fmtconvert.h" extern const uint16_t ff_wma_critical_freqs[25]; @@ -43,6 +44,7 @@ typedef struct { AVCodecContext *avctx; GetBitContext gb; DSPContext dsp; + FmtConvertContext fmt_conv; int first; int channels; int frame_len; ///< transform size (samples) @@ -71,6 +73,7 @@ static av_cold int decode_init(AVCodecContext *avctx) s->avctx = avctx; dsputil_init(&s->dsp, avctx); + ff_fmt_convert_init(&s->fmt_conv, avctx); /* determine frame length */ if (avctx->sample_rate < 22050) { @@ -222,7 +225,8 @@ static void decode_block(BinkAudioContext *s, short *out, int use_dct) ff_rdft_calc(&s->trans.rdft, coeffs); } - s->dsp.float_to_int16_interleave(out, (const float **)s->coeffs_ptr, s->frame_len, s->channels); + s->fmt_conv.float_to_int16_interleave(out, (const float **)s->coeffs_ptr, + s->frame_len, s->channels); if (!s->first) { int count = s->overlap_len * s->channels; diff --git a/libavcodec/dca.c b/libavcodec/dca.c index 3a3eb25d0b..63ea32992e 100644 --- a/libavcodec/dca.c +++ b/libavcodec/dca.c @@ -40,6 +40,7 @@ #include "dca.h" #include "synth_filter.h" #include "dcadsp.h" +#include "fmtconvert.h" //#define TRACE @@ -347,6 +348,7 @@ typedef struct { FFTContext imdct; SynthFilterContext synth; DCADSPContext dcadsp; + FmtConvertContext fmt_conv; } DCAContext; static const uint16_t dca_vlc_offs[] = { @@ -1115,7 +1117,7 @@ static int dca_subsubframe(DCAContext * s, int base_channel, int block_index) block[m] = get_bitalloc(&s->gb, &dca_smpl_bitalloc[abits], sel); } - s->dsp.int32_to_float_fmul_scalar(subband_samples[k][l], + s->fmt_conv.int32_to_float_fmul_scalar(subband_samples[k][l], block, rscale, 8); } @@ -1802,7 +1804,7 @@ static int 
dca_decode_frame(AVCodecContext * avctx, } } - s->dsp.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels); + s->fmt_conv.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels); samples += 256 * channels; } @@ -1835,6 +1837,7 @@ static av_cold int dca_decode_init(AVCodecContext * avctx) ff_mdct_init(&s->imdct, 6, 1, 1.0); ff_synth_filter_init(&s->synth); ff_dcadsp_init(&s->dcadsp); + ff_fmt_convert_init(&s->fmt_conv, avctx); for (i = 0; i < DCA_PRIM_CHANNELS_MAX+1; i++) s->samples_chanptr[i] = s->samples + i * 256; diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c index 2d4ec72026..84714def41 100644 --- a/libavcodec/dsputil.c +++ b/libavcodec/dsputil.c @@ -3867,12 +3867,6 @@ static float scalarproduct_float_c(const float *v1, const float *v2, int len) return p; } -static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){ - int i; - for(i=0; i<len; i++) - dst[i] = src[i] * mul; -} - static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini, uint32_t maxi, uint32_t maxisign) { @@ -3918,30 +3912,6 @@ static void vector_clipf_c(float *dst, const float *src, float min, float max, i } } -static av_always_inline int float_to_int16_one(const float *src){ - return av_clip_int16(lrintf(*src)); -} - -static void ff_float_to_int16_c(int16_t *dst, const float *src, long len){ - int i; - for(i=0; i<len; i++) - dst[i] = float_to_int16_one(src+i); -} - -static void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){ - int i,j,c; - if(channels==2){ - for(i=0; i<len; i++){ - dst[2*i] = float_to_int16_one(src[0]+i); - dst[2*i+1] = float_to_int16_one(src[1]+i); - } - }else{ - for(c=0; c<channels; c++) - for(i=0, j=c; i<len; i++, j+=channels) - dst[j] = float_to_int16_one(src[c]+i); - } -} - static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift) { int res = 0; @@ -4437,10 +4407,7 @@ av_cold void dsputil_init(DSPContext* c, 
AVCodecContext *avctx) c->vector_fmul_reverse = vector_fmul_reverse_c; c->vector_fmul_add = vector_fmul_add_c; c->vector_fmul_window = vector_fmul_window_c; - c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; c->vector_clipf = vector_clipf_c; - c->float_to_int16 = ff_float_to_int16_c; - c->float_to_int16_interleave = ff_float_to_int16_interleave_c; c->scalarproduct_int16 = scalarproduct_int16_c; c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c; c->scalarproduct_float = scalarproduct_float_c; diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index b942e66a37..c8111866c2 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -392,7 +392,6 @@ typedef struct DSPContext { /* assume len is a multiple of 4, and arrays are 16-byte aligned */ void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len); /* assume len is a multiple of 8, and arrays are 16-byte aligned */ - void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len); void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */); /** * Multiply a vector of floats by a scalar float. 
Source and @@ -445,10 +444,6 @@ typedef struct DSPContext { */ void (*butterflies_float)(float *restrict v1, float *restrict v2, int len); - /* convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */ - void (*float_to_int16)(int16_t *dst, const float *src, long len); - void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels); - /* (I)DCT */ void (*fdct)(DCTELEM *block/* align 16*/); void (*fdct248)(DCTELEM *block/* align 16*/); diff --git a/libavcodec/fmtconvert.c b/libavcodec/fmtconvert.c new file mode 100644 index 0000000000..e26b8997ab --- /dev/null +++ b/libavcodec/fmtconvert.c @@ -0,0 +1,68 @@ +/* + * Format Conversion Utils + * Copyright (c) 2000, 2001 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "avcodec.h" +#include "fmtconvert.h" + +static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){ + int i; + for(i=0; i<len; i++) + dst[i] = src[i] * mul; +} + +static av_always_inline int float_to_int16_one(const float *src){ + return av_clip_int16(lrintf(*src)); +} + +static void float_to_int16_c(int16_t *dst, const float *src, long len) +{ + int i; + for(i=0; i<len; i++) + dst[i] = float_to_int16_one(src+i); +} + +static void float_to_int16_interleave_c(int16_t *dst, const float **src, + long len, int channels) +{ + int i,j,c; + if(channels==2){ + for(i=0; i<len; i++){ + dst[2*i] = float_to_int16_one(src[0]+i); + dst[2*i+1] = float_to_int16_one(src[1]+i); + } + }else{ + for(c=0; c<channels; c++) + for(i=0, j=c; i<len; i++, j+=channels) + dst[j] = float_to_int16_one(src[c]+i); + } +} + +av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx) +{ + c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; + c->float_to_int16 = float_to_int16_c; + c->float_to_int16_interleave = float_to_int16_interleave_c; + + if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx); + if (ARCH_PPC) ff_fmt_convert_init_ppc(c, avctx); + if (HAVE_MMX) ff_fmt_convert_init_x86(c, avctx); +} diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h new file mode 100644 index 0000000000..f2ee261f99 --- /dev/null +++ b/libavcodec/fmtconvert.h @@ -0,0 +1,79 @@ +/* + * Format Conversion Utils + * Copyright (c) 2000, 2001 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_FMTCONVERT_H +#define AVCODEC_FMTCONVERT_H + +#include "avcodec.h" + +typedef struct FmtConvertContext { + /** + * Convert an array of int32_t to float and multiply by a float value. + * @param dst destination array of float. + * constraints: 16-byte aligned + * @param src source array of int32_t. + * constraints: 16-byte aligned + * @param len number of elements to convert. + * constraints: multiple of 8 + */ + void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len); + + /** + * Convert an array of float to an array of int16_t. + * + * Convert floats in the range [-32768.0,32767.0] to ints + * without rescaling + * + * @param dst destination array of int16_t. + * constraints: 16-byte aligned + * @param src source array of float. + * constraints: 16-byte aligned + * @param len number of elements to convert. + * constraints: multiple of 8 + */ + void (*float_to_int16)(int16_t *dst, const float *src, long len); + + /** + * Convert multiple arrays of float to an interleaved array of int16_t. + * + * Convert floats in the range [-32768.0,32767.0] to ints + * without rescaling + * + * @param dst destination array of interleaved int16_t. 
+ * constraints: 16-byte aligned + * @param src source array of float arrays, one for each channel. + * constraints: 16-byte aligned + * @param len number of elements to convert. + * constraints: multiple of 8 + * @param channels number of channels + */ + void (*float_to_int16_interleave)(int16_t *dst, const float **src, + long len, int channels); +} FmtConvertContext; + +void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx); + +void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx); +void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx); +void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx); + +#endif /* AVCODEC_FMTCONVERT_H */ diff --git a/libavcodec/nellymoserdec.c b/libavcodec/nellymoserdec.c index 8b13a5d894..80e04ee0a2 100644 --- a/libavcodec/nellymoserdec.c +++ b/libavcodec/nellymoserdec.c @@ -38,6 +38,7 @@ #include "avcodec.h" #include "dsputil.h" #include "fft.h" +#include "fmtconvert.h" #define ALT_BITSTREAM_READER_LE #include "get_bits.h" @@ -52,6 +53,7 @@ typedef struct NellyMoserDecodeContext { float scale_bias; DSPContext dsp; FFTContext imdct_ctx; + FmtConvertContext fmt_conv; DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2]; } NellyMoserDecodeContext; @@ -134,6 +136,7 @@ static av_cold int decode_init(AVCodecContext * avctx) { ff_mdct_init(&s->imdct_ctx, 8, 1, 1.0); dsputil_init(&s->dsp, avctx); + ff_fmt_convert_init(&s->fmt_conv, avctx); s->scale_bias = 1.0/(1*8); @@ -175,7 +178,7 @@ static int decode_tag(AVCodecContext * avctx, for (i=0 ; i<blocks ; i++) { nelly_decode_block(s, &buf[i*NELLY_BLOCK_LEN], s->float_buf); - s->dsp.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES); + s->fmt_conv.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES); *data_size += NELLY_SAMPLES*sizeof(int16_t); } diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile index 9b2358d49c..35ea0c38f8 100644 --- a/libavcodec/ppc/Makefile +++ 
b/libavcodec/ppc/Makefile @@ -21,6 +21,7 @@ ALTIVEC-OBJS-$(CONFIG_FFT) += ppc/fft_altivec.o \ OBJS-$(HAVE_ALTIVEC) += ppc/dsputil_altivec.o \ ppc/fdct_altivec.o \ ppc/float_altivec.o \ + ppc/fmtconvert_altivec.o \ ppc/gmc_altivec.o \ ppc/idct_altivec.o \ ppc/int_altivec.o \ diff --git a/libavcodec/ppc/float_altivec.c b/libavcodec/ppc/float_altivec.c index 60bae9a757..ba97cbfd3b 100644 --- a/libavcodec/ppc/float_altivec.c +++ b/libavcodec/ppc/float_altivec.c @@ -122,124 +122,12 @@ static void vector_fmul_window_altivec(float *dst, const float *src0, const floa } } -static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len) -{ - union { - vector float v; - float s[4]; - } mul_u; - int i; - vector float src1, src2, dst1, dst2, mul_v, zero; - - zero = (vector float)vec_splat_u32(0); - mul_u.s[0] = mul; - mul_v = vec_splat(mul_u.v, 0); - - for(i=0; i<len; i+=8) { - src1 = vec_ctf(vec_ld(0, src+i), 0); - src2 = vec_ctf(vec_ld(16, src+i), 0); - dst1 = vec_madd(src1, mul_v, zero); - dst2 = vec_madd(src2, mul_v, zero); - vec_st(dst1, 0, dst+i); - vec_st(dst2, 16, dst+i); - } -} - - -static vector signed short -float_to_int16_one_altivec(const float *src) -{ - vector float s0 = vec_ld(0, src); - vector float s1 = vec_ld(16, src); - vector signed int t0 = vec_cts(s0, 0); - vector signed int t1 = vec_cts(s1, 0); - return vec_packs(t0,t1); -} - -static void float_to_int16_altivec(int16_t *dst, const float *src, long len) -{ - int i; - vector signed short d0, d1, d; - vector unsigned char align; - if(((long)dst)&15) //FIXME - for(i=0; i<len-7; i+=8) { - d0 = vec_ld(0, dst+i); - d = float_to_int16_one_altivec(src+i); - d1 = vec_ld(15, dst+i); - d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i)); - align = vec_lvsr(0, dst+i); - d0 = vec_perm(d1, d, align); - d1 = vec_perm(d, d1, align); - vec_st(d0, 0, dst+i); - vec_st(d1,15, dst+i); - } - else - for(i=0; i<len-7; i+=8) { - d = float_to_int16_one_altivec(src+i); - vec_st(d, 0, dst+i); - } -} - -static 
void -float_to_int16_interleave_altivec(int16_t *dst, const float **src, - long len, int channels) -{ - int i; - vector signed short d0, d1, d2, c0, c1, t0, t1; - vector unsigned char align; - if(channels == 1) - float_to_int16_altivec(dst, src[0], len); - else - if (channels == 2) { - if(((long)dst)&15) - for(i=0; i<len-7; i+=8) { - d0 = vec_ld(0, dst + i); - t0 = float_to_int16_one_altivec(src[0] + i); - d1 = vec_ld(31, dst + i); - t1 = float_to_int16_one_altivec(src[1] + i); - c0 = vec_mergeh(t0, t1); - c1 = vec_mergel(t0, t1); - d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i)); - align = vec_lvsr(0, dst + i); - d0 = vec_perm(d2, c0, align); - d1 = vec_perm(c0, c1, align); - vec_st(d0, 0, dst + i); - d0 = vec_perm(c1, d2, align); - vec_st(d1, 15, dst + i); - vec_st(d0, 31, dst + i); - dst+=8; - } - else - for(i=0; i<len-7; i+=8) { - t0 = float_to_int16_one_altivec(src[0] + i); - t1 = float_to_int16_one_altivec(src[1] + i); - d0 = vec_mergeh(t0, t1); - d1 = vec_mergel(t0, t1); - vec_st(d0, 0, dst + i); - vec_st(d1, 16, dst + i); - dst+=8; - } - } else { - DECLARE_ALIGNED(16, int16_t, tmp)[len]; - int c, j; - for (c = 0; c < channels; c++) { - float_to_int16_altivec(tmp, src[c], len); - for (i = 0, j = c; i < len; i++, j+=channels) { - dst[j] = tmp[i]; - } - } - } -} - void float_init_altivec(DSPContext* c, AVCodecContext *avctx) { c->vector_fmul = vector_fmul_altivec; c->vector_fmul_reverse = vector_fmul_reverse_altivec; c->vector_fmul_add = vector_fmul_add_altivec; - c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec; if(!(avctx->flags & CODEC_FLAG_BITEXACT)) { c->vector_fmul_window = vector_fmul_window_altivec; - c->float_to_int16 = float_to_int16_altivec; - c->float_to_int16_interleave = float_to_int16_interleave_altivec; } } diff --git a/libavcodec/ppc/fmtconvert_altivec.c b/libavcodec/ppc/fmtconvert_altivec.c new file mode 100644 index 0000000000..e5287c96c1 --- /dev/null +++ b/libavcodec/ppc/fmtconvert_altivec.c @@ -0,0 +1,142 @@ +/* + * 
Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/fmtconvert.h" + +#include "dsputil_altivec.h" +#include "util_altivec.h" + +static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len) +{ + union { + vector float v; + float s[4]; + } mul_u; + int i; + vector float src1, src2, dst1, dst2, mul_v, zero; + + zero = (vector float)vec_splat_u32(0); + mul_u.s[0] = mul; + mul_v = vec_splat(mul_u.v, 0); + + for(i=0; i<len; i+=8) { + src1 = vec_ctf(vec_ld(0, src+i), 0); + src2 = vec_ctf(vec_ld(16, src+i), 0); + dst1 = vec_madd(src1, mul_v, zero); + dst2 = vec_madd(src2, mul_v, zero); + vec_st(dst1, 0, dst+i); + vec_st(dst2, 16, dst+i); + } +} + + +static vector signed short +float_to_int16_one_altivec(const float *src) +{ + vector float s0 = vec_ld(0, src); + vector float s1 = vec_ld(16, src); + vector signed int t0 = vec_cts(s0, 0); + vector signed int t1 = vec_cts(s1, 0); + return vec_packs(t0,t1); +} + +static void float_to_int16_altivec(int16_t *dst, const float *src, long len) +{ + int i; + vector signed short d0, d1, d; + vector unsigned char align; + if(((long)dst)&15) //FIXME + for(i=0; i<len-7; i+=8) { + d0 = vec_ld(0, dst+i); + d = 
float_to_int16_one_altivec(src+i); + d1 = vec_ld(15, dst+i); + d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i)); + align = vec_lvsr(0, dst+i); + d0 = vec_perm(d1, d, align); + d1 = vec_perm(d, d1, align); + vec_st(d0, 0, dst+i); + vec_st(d1,15, dst+i); + } + else + for(i=0; i<len-7; i+=8) { + d = float_to_int16_one_altivec(src+i); + vec_st(d, 0, dst+i); + } +} + +static void +float_to_int16_interleave_altivec(int16_t *dst, const float **src, + long len, int channels) +{ + int i; + vector signed short d0, d1, d2, c0, c1, t0, t1; + vector unsigned char align; + if(channels == 1) + float_to_int16_altivec(dst, src[0], len); + else + if (channels == 2) { + if(((long)dst)&15) + for(i=0; i<len-7; i+=8) { + d0 = vec_ld(0, dst + i); + t0 = float_to_int16_one_altivec(src[0] + i); + d1 = vec_ld(31, dst + i); + t1 = float_to_int16_one_altivec(src[1] + i); + c0 = vec_mergeh(t0, t1); + c1 = vec_mergel(t0, t1); + d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i)); + align = vec_lvsr(0, dst + i); + d0 = vec_perm(d2, c0, align); + d1 = vec_perm(c0, c1, align); + vec_st(d0, 0, dst + i); + d0 = vec_perm(c1, d2, align); + vec_st(d1, 15, dst + i); + vec_st(d0, 31, dst + i); + dst+=8; + } + else + for(i=0; i<len-7; i+=8) { + t0 = float_to_int16_one_altivec(src[0] + i); + t1 = float_to_int16_one_altivec(src[1] + i); + d0 = vec_mergeh(t0, t1); + d1 = vec_mergel(t0, t1); + vec_st(d0, 0, dst + i); + vec_st(d1, 16, dst + i); + dst+=8; + } + } else { + DECLARE_ALIGNED(16, int16_t, tmp)[len]; + int c, j; + for (c = 0; c < channels; c++) { + float_to_int16_altivec(tmp, src[c], len); + for (i = 0, j = c; i < len; i++, j+=channels) { + dst[j] = tmp[i]; + } + } + } +} + +void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx) +{ + c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec; + if(!(avctx->flags & CODEC_FLAG_BITEXACT)) { + c->float_to_int16 = float_to_int16_altivec; + c->float_to_int16_interleave = float_to_int16_interleave_altivec; + } +} diff --git 
a/libavcodec/vorbis_dec.c b/libavcodec/vorbis_dec.c index 9fef5eb26f..bca56ba663 100644 --- a/libavcodec/vorbis_dec.c +++ b/libavcodec/vorbis_dec.c @@ -31,6 +31,7 @@ #include "get_bits.h" #include "dsputil.h" #include "fft.h" +#include "fmtconvert.h" #include "vorbis.h" #include "xiph.h" @@ -127,6 +128,7 @@ typedef struct vorbis_context_s { AVCodecContext *avccontext; GetBitContext gb; DSPContext dsp; + FmtConvertContext fmt_conv; FFTContext mdct[2]; uint_fast8_t first_frame; @@ -961,6 +963,7 @@ static av_cold int vorbis_decode_init(AVCodecContext *avccontext) vc->avccontext = avccontext; dsputil_init(&vc->dsp, avccontext); + ff_fmt_convert_init(&vc->fmt_conv, avccontext); vc->scale_bias = 32768.0f; @@ -1636,7 +1639,8 @@ static int vorbis_decode_frame(AVCodecContext *avccontext, len * ff_vorbis_channel_layout_offsets[vc->audio_channels - 1][i]; } - vc->dsp.float_to_int16_interleave(data, channel_ptrs, len, vc->audio_channels); + vc->fmt_conv.float_to_int16_interleave(data, channel_ptrs, len, + vc->audio_channels); *data_size = len * 2 * vc->audio_channels; return buf_size ; diff --git a/libavcodec/wma.c b/libavcodec/wma.c index e0b9b68395..a7eacb8c78 100644 --- a/libavcodec/wma.c +++ b/libavcodec/wma.c @@ -126,6 +126,7 @@ int ff_wma_init(AVCodecContext *avctx, int flags2) s->block_align = avctx->block_align; dsputil_init(&s->dsp, avctx); + ff_fmt_convert_init(&s->fmt_conv, avctx); if (avctx->codec->id == CODEC_ID_WMAV1) { s->version = 1; diff --git a/libavcodec/wma.h b/libavcodec/wma.h index 11274ad970..a51b3e83cf 100644 --- a/libavcodec/wma.h +++ b/libavcodec/wma.h @@ -26,6 +26,7 @@ #include "put_bits.h" #include "dsputil.h" #include "fft.h" +#include "fmtconvert.h" /* size of blocks */ #define BLOCK_MIN_BITS 7 @@ -134,6 +135,7 @@ typedef struct WMACodecContext { float lsp_pow_m_table1[(1 << LSP_POW_BITS)]; float lsp_pow_m_table2[(1 << LSP_POW_BITS)]; DSPContext dsp; + FmtConvertContext fmt_conv; #ifdef TRACE int frame_count; diff --git a/libavcodec/wmadec.c 
b/libavcodec/wmadec.c index d85d80d574..83f8dea8bb 100644 --- a/libavcodec/wmadec.c +++ b/libavcodec/wmadec.c @@ -791,7 +791,7 @@ static int wma_decode_frame(WMACodecContext *s, int16_t *samples) incr = s->nb_channels; for (ch = 0; ch < MAX_CHANNELS; ch++) output[ch] = s->frame_out[ch]; - s->dsp.float_to_int16_interleave(samples, output, n, incr); + s->fmt_conv.float_to_int16_interleave(samples, output, n, incr); for (ch = 0; ch < incr; ch++) { /* prepare for next block */ memmove(&s->frame_out[ch][0], &s->frame_out[ch][n], n * sizeof(float)); diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 943edcb0ec..83cec00442 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -39,6 +39,7 @@ YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o MMX-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp-init.o MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \ x86/deinterlace.o \ + x86/fmtconvert.o \ x86/h264_chromamc.o \ $(YASM-OBJS-yes) @@ -47,6 +48,7 @@ MMX-OBJS-$(CONFIG_FFT) += x86/fft.o OBJS-$(HAVE_MMX) += x86/dnxhd_mmx.o \ x86/dsputil_mmx.o \ x86/fdct_mmx.o \ + x86/fmtconvert_mmx.o \ x86/idct_mmx_xvid.o \ x86/idct_sse2_xvid.o \ x86/motion_est_mmx.o \ diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 2eb7d85f14..39bf3f2936 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -2349,50 +2349,6 @@ static void vector_fmul_window_sse(float *dst, const float *src0, const float *s } #endif /* HAVE_6REGS */ -static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) -{ - x86_reg i = -4*len; - __asm__ volatile( - "movss %3, %%xmm4 \n" - "shufps $0, %%xmm4, %%xmm4 \n" - "1: \n" - "cvtpi2ps (%2,%0), %%xmm0 \n" - "cvtpi2ps 8(%2,%0), %%xmm1 \n" - "cvtpi2ps 16(%2,%0), %%xmm2 \n" - "cvtpi2ps 24(%2,%0), %%xmm3 \n" - "movlhps %%xmm1, %%xmm0 \n" - "movlhps %%xmm3, %%xmm2 \n" - "mulps %%xmm4, %%xmm0 \n" - "mulps %%xmm4, %%xmm2 \n" - "movaps %%xmm0, (%1,%0) \n" - "movaps %%xmm2, 16(%1,%0) \n" 
- "add $32, %0 \n" - "jl 1b \n" - :"+r"(i) - :"r"(dst+len), "r"(src+len), "m"(mul) - ); -} - -static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) -{ - x86_reg i = -4*len; - __asm__ volatile( - "movss %3, %%xmm4 \n" - "shufps $0, %%xmm4, %%xmm4 \n" - "1: \n" - "cvtdq2ps (%2,%0), %%xmm0 \n" - "cvtdq2ps 16(%2,%0), %%xmm1 \n" - "mulps %%xmm4, %%xmm0 \n" - "mulps %%xmm4, %%xmm1 \n" - "movaps %%xmm0, (%1,%0) \n" - "movaps %%xmm1, 16(%1,%0) \n" - "add $32, %0 \n" - "jl 1b \n" - :"+r"(i) - :"r"(dst+len), "r"(src+len), "m"(mul) - ); -} - static void vector_clipf_sse(float *dst, const float *src, float min, float max, int len) { @@ -2427,70 +2383,6 @@ static void vector_clipf_sse(float *dst, const float *src, float min, float max, ); } -static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ - x86_reg reglen = len; - // not bit-exact: pf2id uses different rounding than C and SSE - __asm__ volatile( - "add %0 , %0 \n\t" - "lea (%2,%0,2) , %2 \n\t" - "add %0 , %1 \n\t" - "neg %0 \n\t" - "1: \n\t" - "pf2id (%2,%0,2) , %%mm0 \n\t" - "pf2id 8(%2,%0,2) , %%mm1 \n\t" - "pf2id 16(%2,%0,2) , %%mm2 \n\t" - "pf2id 24(%2,%0,2) , %%mm3 \n\t" - "packssdw %%mm1 , %%mm0 \n\t" - "packssdw %%mm3 , %%mm2 \n\t" - "movq %%mm0 , (%1,%0) \n\t" - "movq %%mm2 , 8(%1,%0) \n\t" - "add $16 , %0 \n\t" - " js 1b \n\t" - "femms \n\t" - :"+r"(reglen), "+r"(dst), "+r"(src) - ); -} -static void float_to_int16_sse(int16_t *dst, const float *src, long len){ - x86_reg reglen = len; - __asm__ volatile( - "add %0 , %0 \n\t" - "lea (%2,%0,2) , %2 \n\t" - "add %0 , %1 \n\t" - "neg %0 \n\t" - "1: \n\t" - "cvtps2pi (%2,%0,2) , %%mm0 \n\t" - "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t" - "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t" - "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t" - "packssdw %%mm1 , %%mm0 \n\t" - "packssdw %%mm3 , %%mm2 \n\t" - "movq %%mm0 , (%1,%0) \n\t" - "movq %%mm2 , 8(%1,%0) \n\t" - "add $16 , %0 \n\t" - " js 1b \n\t" - "emms \n\t" - :"+r"(reglen), "+r"(dst), 
"+r"(src) - ); -} - -static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ - x86_reg reglen = len; - __asm__ volatile( - "add %0 , %0 \n\t" - "lea (%2,%0,2) , %2 \n\t" - "add %0 , %1 \n\t" - "neg %0 \n\t" - "1: \n\t" - "cvtps2dq (%2,%0,2) , %%xmm0 \n\t" - "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t" - "packssdw %%xmm1 , %%xmm0 \n\t" - "movdqa %%xmm0 , (%1,%0) \n\t" - "add $16 , %0 \n\t" - " js 1b \n\t" - :"+r"(reglen), "+r"(dst), "+r"(src) - ); -} - void ff_vp3_idct_mmx(int16_t *input_data); void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block); void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block); @@ -2504,9 +2396,6 @@ void ff_vp3_idct_sse2(int16_t *input_data); void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block); void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block); -void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); -void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); -void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift); int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift); int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul); @@ -2516,102 +2405,6 @@ void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left); int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left); -#if !HAVE_YASM -#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6) -#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) -#define ff_float_to_int16_interleave6_3dn2(a,b,c) 
float_to_int16_interleave_misc_3dnow(a,b,c,6) -#endif -#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse - -#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \ -/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ -static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ - DECLARE_ALIGNED(16, int16_t, tmp)[len];\ - int i,j,c;\ - for(c=0; c<channels; c++){\ - float_to_int16_##cpu(tmp, src[c], len);\ - for(i=0, j=c; i<len; i++, j+=channels)\ - dst[j] = tmp[i];\ - }\ -}\ -\ -static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\ - if(channels==1)\ - float_to_int16_##cpu(dst, src[0], len);\ - else if(channels==2){\ - x86_reg reglen = len; \ - const float *src0 = src[0];\ - const float *src1 = src[1];\ - __asm__ volatile(\ - "shl $2, %0 \n"\ - "add %0, %1 \n"\ - "add %0, %2 \n"\ - "add %0, %3 \n"\ - "neg %0 \n"\ - body\ - :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\ - );\ - }else if(channels==6){\ - ff_float_to_int16_interleave6_##cpu(dst, src, len);\ - }else\ - float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\ -} - -FLOAT_TO_INT16_INTERLEAVE(3dnow, - "1: \n" - "pf2id (%2,%0), %%mm0 \n" - "pf2id 8(%2,%0), %%mm1 \n" - "pf2id (%3,%0), %%mm2 \n" - "pf2id 8(%3,%0), %%mm3 \n" - "packssdw %%mm1, %%mm0 \n" - "packssdw %%mm3, %%mm2 \n" - "movq %%mm0, %%mm1 \n" - "punpcklwd %%mm2, %%mm0 \n" - "punpckhwd %%mm2, %%mm1 \n" - "movq %%mm0, (%1,%0)\n" - "movq %%mm1, 8(%1,%0)\n" - "add $16, %0 \n" - "js 1b \n" - "femms \n" -) - -FLOAT_TO_INT16_INTERLEAVE(sse, - "1: \n" - "cvtps2pi (%2,%0), %%mm0 \n" - "cvtps2pi 8(%2,%0), %%mm1 \n" - "cvtps2pi (%3,%0), %%mm2 \n" - "cvtps2pi 8(%3,%0), %%mm3 \n" - "packssdw %%mm1, %%mm0 \n" - "packssdw %%mm3, %%mm2 \n" - "movq %%mm0, %%mm1 \n" - "punpcklwd %%mm2, %%mm0 \n" - "punpckhwd %%mm2, %%mm1 \n" - "movq %%mm0, (%1,%0)\n" - "movq %%mm1, 
8(%1,%0)\n" - "add $16, %0 \n" - "js 1b \n" - "emms \n" -) - -FLOAT_TO_INT16_INTERLEAVE(sse2, - "1: \n" - "cvtps2dq (%2,%0), %%xmm0 \n" - "cvtps2dq (%3,%0), %%xmm1 \n" - "packssdw %%xmm1, %%xmm0 \n" - "movhlps %%xmm0, %%xmm1 \n" - "punpcklwd %%xmm1, %%xmm0 \n" - "movdqa %%xmm0, (%1,%0) \n" - "add $16, %0 \n" - "js 1b \n" -) - -static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){ - if(channels==6) - ff_float_to_int16_interleave6_3dn2(dst, src, len); - else - float_to_int16_interleave_3dnow(dst, src, len, channels); -} - float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) @@ -2968,19 +2761,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) if(mm_flags & AV_CPU_FLAG_3DNOW){ c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; c->vector_fmul = vector_fmul_3dnow; - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->float_to_int16 = float_to_int16_3dnow; - c->float_to_int16_interleave = float_to_int16_interleave_3dnow; - } } if(mm_flags & AV_CPU_FLAG_3DNOWEXT){ c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; #if HAVE_6REGS c->vector_fmul_window = vector_fmul_window_3dnow2; #endif - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->float_to_int16_interleave = float_to_int16_interleave_3dn2; - } } if(mm_flags & AV_CPU_FLAG_MMX2){ #if HAVE_YASM @@ -2997,10 +2783,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) #if HAVE_6REGS c->vector_fmul_window = vector_fmul_window_sse; #endif - c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; c->vector_clipf = vector_clipf_sse; - c->float_to_int16 = float_to_int16_sse; - c->float_to_int16_interleave = float_to_int16_interleave_sse; #if HAVE_YASM c->scalarproduct_float = ff_scalarproduct_float_sse; #endif @@ -3008,9 +2791,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) if(mm_flags & AV_CPU_FLAG_3DNOW) c->vector_fmul_add = 
vector_fmul_add_3dnow; // faster than sse if(mm_flags & AV_CPU_FLAG_SSE2){ - c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; - c->float_to_int16 = float_to_int16_sse2; - c->float_to_int16_interleave = float_to_int16_interleave_sse2; #if HAVE_YASM c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm index 099f0a80df..b1b37e1fb9 100644 --- a/libavcodec/x86/dsputil_yasm.asm +++ b/libavcodec/x86/dsputil_yasm.asm @@ -30,75 +30,6 @@ pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 section .text align=16 -%macro PSWAPD_SSE 2 - pshufw %1, %2, 0x4e -%endmacro -%macro PSWAPD_3DN1 2 - movq %1, %2 - psrlq %1, 32 - punpckldq %1, %2 -%endmacro - -%macro FLOAT_TO_INT16_INTERLEAVE6 1 -; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) -cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5 -%ifdef ARCH_X86_64 - %define lend r10d - mov lend, r2d -%else - %define lend dword r2m -%endif - mov src1q, [srcq+1*gprsize] - mov src2q, [srcq+2*gprsize] - mov src3q, [srcq+3*gprsize] - mov src4q, [srcq+4*gprsize] - mov src5q, [srcq+5*gprsize] - mov srcq, [srcq] - sub src1q, srcq - sub src2q, srcq - sub src3q, srcq - sub src4q, srcq - sub src5q, srcq -.loop: - cvtps2pi mm0, [srcq] - cvtps2pi mm1, [srcq+src1q] - cvtps2pi mm2, [srcq+src2q] - cvtps2pi mm3, [srcq+src3q] - cvtps2pi mm4, [srcq+src4q] - cvtps2pi mm5, [srcq+src5q] - packssdw mm0, mm3 - packssdw mm1, mm4 - packssdw mm2, mm5 - pswapd mm3, mm0 - punpcklwd mm0, mm1 - punpckhwd mm1, mm2 - punpcklwd mm2, mm3 - pswapd mm3, mm0 - punpckldq mm0, mm2 - punpckhdq mm2, mm1 - punpckldq mm1, mm3 - movq [dstq ], mm0 - movq [dstq+16], mm2 - movq [dstq+ 8], mm1 - add srcq, 8 - add dstq, 24 - sub lend, 2 - jg .loop - emms - RET -%endmacro ; FLOAT_TO_INT16_INTERLEAVE6 - -%define pswapd PSWAPD_SSE 
-FLOAT_TO_INT16_INTERLEAVE6 sse -%define cvtps2pi pf2id -%define pswapd PSWAPD_3DN1 -FLOAT_TO_INT16_INTERLEAVE6 3dnow -%undef pswapd -FLOAT_TO_INT16_INTERLEAVE6 3dn2 -%undef cvtps2pi - - - %macro SCALARPRODUCT 1 ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift) cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm new file mode 100644 index 0000000000..6c744fc581 --- /dev/null +++ b/libavcodec/x86/fmtconvert.asm @@ -0,0 +1,91 @@ +;****************************************************************************** +;* x86 optimized Format Conversion Utils +;* Copyright (c) 2008 Loren Merritt +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" + +section .text align=16 + +%macro PSWAPD_SSE 2 + pshufw %1, %2, 0x4e +%endmacro +%macro PSWAPD_3DN1 2 + movq %1, %2 + psrlq %1, 32 + punpckldq %1, %2 +%endmacro + +%macro FLOAT_TO_INT16_INTERLEAVE6 1 +; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) +cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5 +%ifdef ARCH_X86_64 + %define lend r10d + mov lend, r2d +%else + %define lend dword r2m +%endif + mov src1q, [srcq+1*gprsize] + mov src2q, [srcq+2*gprsize] + mov src3q, [srcq+3*gprsize] + mov src4q, [srcq+4*gprsize] + mov src5q, [srcq+5*gprsize] + mov srcq, [srcq] + sub src1q, srcq + sub src2q, srcq + sub src3q, srcq + sub src4q, srcq + sub src5q, srcq +.loop: + cvtps2pi mm0, [srcq] + cvtps2pi mm1, [srcq+src1q] + cvtps2pi mm2, [srcq+src2q] + cvtps2pi mm3, [srcq+src3q] + cvtps2pi mm4, [srcq+src4q] + cvtps2pi mm5, [srcq+src5q] + packssdw mm0, mm3 + packssdw mm1, mm4 + packssdw mm2, mm5 + pswapd mm3, mm0 + punpcklwd mm0, mm1 + punpckhwd mm1, mm2 + punpcklwd mm2, mm3 + pswapd mm3, mm0 + punpckldq mm0, mm2 + punpckhdq mm2, mm1 + punpckldq mm1, mm3 + movq [dstq ], mm0 + movq [dstq+16], mm2 + movq [dstq+ 8], mm1 + add srcq, 8 + add dstq, 24 + sub lend, 2 + jg .loop + emms + RET +%endmacro ; FLOAT_TO_INT16_INTERLEAVE6 + +%define pswapd PSWAPD_SSE +FLOAT_TO_INT16_INTERLEAVE6 sse +%define cvtps2pi pf2id +%define pswapd PSWAPD_3DN1 +FLOAT_TO_INT16_INTERLEAVE6 3dnow +%undef pswapd +FLOAT_TO_INT16_INTERLEAVE6 3dn2 +%undef cvtps2pi diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c new file mode 100644 index 0000000000..ea41f730e8 --- /dev/null +++
b/libavcodec/x86/fmtconvert_mmx.c @@ -0,0 +1,266 @@ +/* + * Format Conversion Utils + * Copyright (c) 2000, 2001 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * MMX optimization by Nick Kurshev <nickols_k@mail.ru> + */ + +#include "libavutil/cpu.h" +#include "libavutil/x86_cpu.h" +#include "libavcodec/fmtconvert.h" + +static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) +{ + x86_reg i = -4*len; + __asm__ volatile( + "movss %3, %%xmm4 \n" + "shufps $0, %%xmm4, %%xmm4 \n" + "1: \n" + "cvtpi2ps (%2,%0), %%xmm0 \n" + "cvtpi2ps 8(%2,%0), %%xmm1 \n" + "cvtpi2ps 16(%2,%0), %%xmm2 \n" + "cvtpi2ps 24(%2,%0), %%xmm3 \n" + "movlhps %%xmm1, %%xmm0 \n" + "movlhps %%xmm3, %%xmm2 \n" + "mulps %%xmm4, %%xmm0 \n" + "mulps %%xmm4, %%xmm2 \n" + "movaps %%xmm0, (%1,%0) \n" + "movaps %%xmm2, 16(%1,%0) \n" + "add $32, %0 \n" + "jl 1b \n" + :"+r"(i) + :"r"(dst+len), "r"(src+len), "m"(mul) + ); +} + +static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) +{ + x86_reg i = -4*len; + __asm__ volatile( + "movss %3, %%xmm4 \n" + "shufps $0, %%xmm4, %%xmm4 \n" + "1: \n" + "cvtdq2ps (%2,%0), %%xmm0 \n" + "cvtdq2ps 
16(%2,%0), %%xmm1 \n" + "mulps %%xmm4, %%xmm0 \n" + "mulps %%xmm4, %%xmm1 \n" + "movaps %%xmm0, (%1,%0) \n" + "movaps %%xmm1, 16(%1,%0) \n" + "add $32, %0 \n" + "jl 1b \n" + :"+r"(i) + :"r"(dst+len), "r"(src+len), "m"(mul) + ); +} + +static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ + x86_reg reglen = len; + // not bit-exact: pf2id uses different rounding than C and SSE + __asm__ volatile( + "add %0 , %0 \n\t" + "lea (%2,%0,2) , %2 \n\t" + "add %0 , %1 \n\t" + "neg %0 \n\t" + "1: \n\t" + "pf2id (%2,%0,2) , %%mm0 \n\t" + "pf2id 8(%2,%0,2) , %%mm1 \n\t" + "pf2id 16(%2,%0,2) , %%mm2 \n\t" + "pf2id 24(%2,%0,2) , %%mm3 \n\t" + "packssdw %%mm1 , %%mm0 \n\t" + "packssdw %%mm3 , %%mm2 \n\t" + "movq %%mm0 , (%1,%0) \n\t" + "movq %%mm2 , 8(%1,%0) \n\t" + "add $16 , %0 \n\t" + " js 1b \n\t" + "femms \n\t" + :"+r"(reglen), "+r"(dst), "+r"(src) + ); +} + +static void float_to_int16_sse(int16_t *dst, const float *src, long len){ + x86_reg reglen = len; + __asm__ volatile( + "add %0 , %0 \n\t" + "lea (%2,%0,2) , %2 \n\t" + "add %0 , %1 \n\t" + "neg %0 \n\t" + "1: \n\t" + "cvtps2pi (%2,%0,2) , %%mm0 \n\t" + "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t" + "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t" + "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t" + "packssdw %%mm1 , %%mm0 \n\t" + "packssdw %%mm3 , %%mm2 \n\t" + "movq %%mm0 , (%1,%0) \n\t" + "movq %%mm2 , 8(%1,%0) \n\t" + "add $16 , %0 \n\t" + " js 1b \n\t" + "emms \n\t" + :"+r"(reglen), "+r"(dst), "+r"(src) + ); +} + +static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ + x86_reg reglen = len; + __asm__ volatile( + "add %0 , %0 \n\t" + "lea (%2,%0,2) , %2 \n\t" + "add %0 , %1 \n\t" + "neg %0 \n\t" + "1: \n\t" + "cvtps2dq (%2,%0,2) , %%xmm0 \n\t" + "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t" + "packssdw %%xmm1 , %%xmm0 \n\t" + "movdqa %%xmm0 , (%1,%0) \n\t" + "add $16 , %0 \n\t" + " js 1b \n\t" + :"+r"(reglen), "+r"(dst), "+r"(src) + ); +} + +void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int 
len); +void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); +void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); + +#if !HAVE_YASM +#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6) +#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) +#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) +#endif +#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse + +#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \ +/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ +static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ + DECLARE_ALIGNED(16, int16_t, tmp)[len];\ + int i,j,c;\ + for(c=0; c<channels; c++){\ + float_to_int16_##cpu(tmp, src[c], len);\ + for(i=0, j=c; i<len; i++, j+=channels)\ + dst[j] = tmp[i];\ + }\ +}\ +\ +static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\ + if(channels==1)\ + float_to_int16_##cpu(dst, src[0], len);\ + else if(channels==2){\ + x86_reg reglen = len; \ + const float *src0 = src[0];\ + const float *src1 = src[1];\ + __asm__ volatile(\ + "shl $2, %0 \n"\ + "add %0, %1 \n"\ + "add %0, %2 \n"\ + "add %0, %3 \n"\ + "neg %0 \n"\ + body\ + :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\ + );\ + }else if(channels==6){\ + ff_float_to_int16_interleave6_##cpu(dst, src, len);\ + }else\ + float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\ +} + +FLOAT_TO_INT16_INTERLEAVE(3dnow, + "1: \n" + "pf2id (%2,%0), %%mm0 \n" + "pf2id 8(%2,%0), %%mm1 \n" + "pf2id (%3,%0), %%mm2 \n" + "pf2id 8(%3,%0), %%mm3 \n" + "packssdw %%mm1, %%mm0 \n" + "packssdw %%mm3, %%mm2 \n" + "movq %%mm0, %%mm1 \n" + "punpcklwd %%mm2, %%mm0 \n" + "punpckhwd %%mm2, %%mm1 \n" + "movq %%mm0, (%1,%0)\n" + 
"movq %%mm1, 8(%1,%0)\n" + "add $16, %0 \n" + "js 1b \n" + "femms \n" +) + +FLOAT_TO_INT16_INTERLEAVE(sse, + "1: \n" + "cvtps2pi (%2,%0), %%mm0 \n" + "cvtps2pi 8(%2,%0), %%mm1 \n" + "cvtps2pi (%3,%0), %%mm2 \n" + "cvtps2pi 8(%3,%0), %%mm3 \n" + "packssdw %%mm1, %%mm0 \n" + "packssdw %%mm3, %%mm2 \n" + "movq %%mm0, %%mm1 \n" + "punpcklwd %%mm2, %%mm0 \n" + "punpckhwd %%mm2, %%mm1 \n" + "movq %%mm0, (%1,%0)\n" + "movq %%mm1, 8(%1,%0)\n" + "add $16, %0 \n" + "js 1b \n" + "emms \n" +) + +FLOAT_TO_INT16_INTERLEAVE(sse2, + "1: \n" + "cvtps2dq (%2,%0), %%xmm0 \n" + "cvtps2dq (%3,%0), %%xmm1 \n" + "packssdw %%xmm1, %%xmm0 \n" + "movhlps %%xmm0, %%xmm1 \n" + "punpcklwd %%xmm1, %%xmm0 \n" + "movdqa %%xmm0, (%1,%0) \n" + "add $16, %0 \n" + "js 1b \n" +) + +static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){ + if(channels==6) + ff_float_to_int16_interleave6_3dn2(dst, src, len); + else + float_to_int16_interleave_3dnow(dst, src, len, channels); +} + +void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx) +{ + int mm_flags = av_get_cpu_flags(); + + if (mm_flags & AV_CPU_FLAG_MMX) { + + if(mm_flags & AV_CPU_FLAG_3DNOW){ + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ + c->float_to_int16 = float_to_int16_3dnow; + c->float_to_int16_interleave = float_to_int16_interleave_3dnow; + } + } + if(mm_flags & AV_CPU_FLAG_3DNOWEXT){ + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ + c->float_to_int16_interleave = float_to_int16_interleave_3dn2; + } + } + if(mm_flags & AV_CPU_FLAG_SSE){ + c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; + c->float_to_int16 = float_to_int16_sse; + c->float_to_int16_interleave = float_to_int16_interleave_sse; + } + if(mm_flags & AV_CPU_FLAG_SSE2){ + c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; + c->float_to_int16 = float_to_int16_sse2; + c->float_to_int16_interleave = float_to_int16_interleave_sse2; + } + } +} |