diff options
author | David Schleef <ds@schleef.org> | 2005-10-02 01:59:27 +0000 |
---|---|---|
committer | David Schleef <ds@schleef.org> | 2005-10-02 01:59:27 +0000 |
commit | 1cc52aa34b431e333d22e53b81779a4efaee02f7 (patch) | |
tree | 1a30e0aee2b7d69f7bdc6333a134f5088d46c93c /liboil/dct | |
parent | 414196740a906a7fb35463418cf1f3996f8e3cc0 (diff) | |
download | liboil-1cc52aa34b431e333d22e53b81779a4efaee02f7.tar.gz |
What have I done?!? Move files around.
* liboil/colorspace/Makefile.am:
* liboil/colorspace/argb_paint.c:
* liboil/colorspace/argb_paint_i386.c:
* liboil/colorspace/ayuv2argb.c:
* liboil/colorspace/ayuv2argb_i386.c:
* liboil/colorspace/composite.c:
* liboil/colorspace/composite_i386.c:
* liboil/colorspace/resample.c:
* liboil/colorspace/resample_powerpc.c:
* liboil/colorspace/rgb2bgr.c:
* liboil/colorspace/rgb2bgr_powerpc.c:
* liboil/colorspace/rgb2rgba.c:
* liboil/colorspace/rgb2rgba_powerpc.c:
* liboil/colorspace/yuv.c:
* liboil/conv/Makefile.am:
* liboil/conv/conv_powerpc.c:
* liboil/conv/conv_ref.c:
* liboil/copy/Makefile.am:
* liboil/copy/copy.c:
* liboil/copy/copy8x8.c:
* liboil/copy/copy8x8_i386.c:
* liboil/copy/copy_i386.c:
* liboil/copy/copy_powerpc.c:
* liboil/copy/permute.c:
* liboil/copy/splat_i386.c:
* liboil/copy/splat_powerpc.c:
* liboil/copy/splat_ref.c:
* liboil/copy/tablelookup_ref.c:
* liboil/copy/trans8x8.c:
* liboil/copy/trans8x8_i386.c:
* liboil/dct/Makefile.am:
* liboil/dct/fdct8x8theora_i386.c:
* liboil/dct/idct8x8_i386.c:
* liboil/i386/Makefile.am:
* liboil/jpeg/Makefile.am:
* liboil/jpeg/zigzag8x8_powerpc.c:
* liboil/md5/Makefile.am:
* liboil/md5/md5_i386.c:
* liboil/md5/md5_powerpc.c:
* liboil/powerpc/Makefile.am:
* liboil/powerpc/abs.c: (abs_u16_s16_a16_altivec):
* liboil/powerpc/clip.c: (clip_s16_ppcasm), (clip_s16_ppcasm2),
(clip_s16_ppcasm3):
* liboil/powerpc/conv.c: (_sl_clipconv_S8_F32__powerpc_altivec),
(_sl_clipconv_S16_F32__powerpc_altivec),
(_sl_clipconvert_S32_F32__powerpc_altivec),
(convert_s16_f64__powerpc), (_sl_convert_S16_F32__powerpc),
(conv_f64_s16_altivec), (clipconv_s16_f64_ppcasm):
* liboil/powerpc/copy.c: (copy_u8_altivec), (copy_u8_altivec2):
* liboil/powerpc/md5.c: (md5_asm1), (md5_asm2), (md5_asm3):
* liboil/powerpc/mix.c: (mix_u8_a16_altivec):
* liboil/powerpc/multsum.c: (multsum_f32_ppcasm):
* liboil/powerpc/resample.c: (__attribute__),
(merge_linear_argb_powerpc):
* liboil/powerpc/rgb2bgr.c: (rgb2bgr_ppc), (rgb2bgr_ppc2),
(rgb2bgr_ppc3), (rgb2bgr_ppc4):
* liboil/powerpc/rgb2rgba.c: (rgb2rgba_powerpcasm):
* liboil/powerpc/sad8x8.c: (sad8x8_s16_a16_altivec),
(sad8x8_s16_l15_a16_altivec):
* liboil/powerpc/splat.c: (splat_u8_ns_altivec),
(splat_u8_ns_altivec2), (splat_u32_ns_altivec):
* liboil/powerpc/zigzag8x8.c: (__attribute__),
(zigzag8x8_s16_a16_altivec):
* liboil/ref/Makefile.am:
* liboil/ref/argb_paint.c: (argb_paint_u8_ref):
* liboil/ref/ayuv2argb.c: (ayuv2argb_u8_ref):
* liboil/ref/composite.c: (composite_test),
(composite_in_argb_ref), (composite_in_argb_const_src_ref),
(composite_in_argb_const_mask_ref), (composite_over_argb_ref),
(composite_over_argb_const_src_ref), (composite_add_argb_ref),
(composite_add_argb_const_src_ref), (composite_in_over_argb_ref),
(composite_in_over_argb_const_src_ref),
(composite_in_over_argb_const_mask_ref), (composite_add_u8_ref),
(composite_over_u8_ref):
* liboil/ref/conv.c:
* liboil/ref/copy.c: (copy_u8_ref):
* liboil/ref/copy8x8.c: (copy8x8_u8_ref):
* liboil/ref/permute.c: (permute_test):
* liboil/ref/resample.c: (resample_linear_u8_test),
(resample_linear_argb_test), (resample_linear_u8_ref),
(resample_linear_argb_ref), (merge_linear_argb_test),
(merge_linear_argb_ref):
* liboil/ref/rgb.c: (rgb2bgr_ref), (rgb2rgba_ref):
* liboil/ref/splat.c: (splat_u8_ref), (splat_u32_ref),
(splat_u8_ns_ref), (splat_u32_ns_ref):
* liboil/ref/tablelookup.c: (tablelookup_u8_ref):
* liboil/ref/trans8x8.c:
* liboil/ref/yuv.c: (yuyv2ayuv_ref), (yvyu2ayuv_ref),
(uyvy2ayuv_ref), (ayuv2yuyv_ref), (ayuv2yvyu_ref), (ayuv2uyvy_ref):
* liboil/simdpack/Makefile.am:
* liboil/simdpack/abs_i386.c:
* liboil/simdpack/abs_powerpc.c:
* liboil/simdpack/clip_powerpc.c:
* liboil/simdpack/mix_powerpc.c:
* liboil/simdpack/mult8x8_i386.c:
* liboil/simdpack/multsum_powerpc.c:
* liboil/simdpack/sad8x8_powerpc.c:
* liboil/simdpack/scalarmult_i386.c:
* liboil/simdpack/vectoradd_s_i386.c:
Diffstat (limited to 'liboil/dct')
-rw-r--r-- | liboil/dct/Makefile.am | 14 | ||||
-rw-r--r-- | liboil/dct/fdct8x8theora_i386.c | 358 | ||||
-rw-r--r-- | liboil/dct/idct8x8_i386.c | 744 |
3 files changed, 0 insertions, 1116 deletions
diff --git a/liboil/dct/Makefile.am b/liboil/dct/Makefile.am index 54d4374..eef1626 100644 --- a/liboil/dct/Makefile.am +++ b/liboil/dct/Makefile.am @@ -10,20 +10,6 @@ noinst_LTLIBRARIES = libdct.la $(opt_libs) noinst_HEADERS = \ dct.h -if HAVE_CPU_I386 -i386_sources = \ - idct8x8_i386.c \ - fdct8x8theora_i386.c -else -i386_sources = -endif - -if HAVE_CPU_AMD64 -amd64_sources = -else -amd64_sources = -endif - c_sources = \ dct12_f32.c \ dct36_f32.c \ diff --git a/liboil/dct/fdct8x8theora_i386.c b/liboil/dct/fdct8x8theora_i386.c deleted file mode 100644 index 7d8bce3..0000000 --- a/liboil/dct/fdct8x8theora_i386.c +++ /dev/null @@ -1,358 +0,0 @@ -/* - * LIBOIL - Library of Optimized Inner Loops - * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, - * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/*========================================================================== - * - * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY - * KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR - * PURPOSE. - * - * Copyright (c) 1999 - 2001 On2 Technologies Inc. All Rights Reserved. - * - *--------------------------------------------------------------------------*/ - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include <liboil/liboilfunction.h> -#include <liboil/liboilfuncs.h> -#include <liboil/dct/dct.h> -#include <math.h> - -/* FIXME this causes problems on old gcc */ -static const __attribute__ ((aligned(8),used)) int64_t xC1S7 = 0x0fb15fb15fb15fb15LL; -static const __attribute__ ((aligned(8),used)) int64_t xC2S6 = 0x0ec83ec83ec83ec83LL; -static const __attribute__ ((aligned(8),used)) int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL; -static const __attribute__ ((aligned(8),used)) int64_t xC4S4 = 0x0b505b505b505b505LL; -static const __attribute__ ((aligned(8),used)) int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL; -static const __attribute__ ((aligned(8),used)) int64_t xC6S2 = 0x061f861f861f861f8LL; -static const __attribute__ ((aligned(8),used)) int64_t xC7S1 = 0x031f131f131f131f1LL; - -#if defined(__MINGW32__) || defined(__CYGWIN__) || \ - defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__)) -# define M(a) "_" #a -#else -# define M(a) #a -#endif - -OIL_DECLARE_CLASS(fdct8x8theora); - -/* execute stage 1 of forward DCT */ -#define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp) \ - " movq " #ip0 ", %%mm0 \n\t" \ - " movq " #ip1 ", %%mm1 \n\t" \ - " movq " #ip3 ", %%mm2 \n\t" \ - " movq " #ip5 ", %%mm3 \n\t" \ - " movq %%mm0, %%mm4 \n\t" \ - " movq %%mm1, %%mm5 \n\t" \ - " movq %%mm2, %%mm6 \n\t" \ - " movq %%mm3, %%mm7 \n\t" \ - \ - " paddsw " #ip7 ", %%mm0 \n\t" /* mm0 = ip0 + ip7 = is07 */ \ - " paddsw " #ip2 ", %%mm1 \n\t" /* mm1 = ip1 + ip2 = is12 */ \ - " paddsw " #ip4 ", %%mm2 \n\t" /* mm2 = ip3 + ip4 = is34 */ \ - " paddsw " #ip6 ", %%mm3 \n\t" /* mm3 = ip5 + ip6 = is56 */ \ - " psubsw " #ip7 ", %%mm4 \n\t" /* mm4 = ip0 - ip7 = id07 */ \ - " psubsw " #ip2 ", %%mm5 \n\t" /* mm5 = ip1 - ip2 = id12 */ \ - \ - " psubsw %%mm2, %%mm0 \n\t" /* mm0 = is07 - is34 */ \ - \ - " paddsw %%mm2, %%mm2 \n\t" \ - \ - " psubsw " #ip4 ", %%mm6 \n\t" /* mm6 = ip3 - ip4 = id34 */ \ - \ - " paddsw %%mm0, %%mm2 \n\t" /* mm2 = is07 + is34 = is0734 */ \ - " psubsw %%mm3, %%mm1 \n\t" /* mm1 = is12 - is56 */ \ - " movq %%mm0," #temp " \n\t" /* Save is07 - is34 to free mm0; */ \ - " paddsw %%mm3, %%mm3 \n\t" \ - " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + 1s56 = is1256 */ \ - \ - " psubsw " #ip6 ", %%mm7 \n\t" /* mm7 = ip5 - ip6 = id56 */ \ - /* ------------------------------------------------------------------- */ \ - " psubsw %%mm7, %%mm5 \n\t" /* mm5 = id12 - id56 */ \ - " paddsw %%mm7, %%mm7 \n\t" \ - " paddsw %%mm5, %%mm7 \n\t" /* mm7 = id12 + id56 */ \ - /* ------------------------------------------------------------------- */ \ - " psubsw %%mm3, %%mm2 \n\t" /* mm2 = is0734 - is1256 */ \ - " paddsw %%mm3, %%mm3 \n\t" \ - \ - " movq %%mm2, %%mm0 \n\t" /* make a copy */ \ - " paddsw %%mm2, %%mm3 \n\t" /* mm3 = is0734 + is1256 */ \ - \ - " pmulhw "M(xC4S4)", %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */ \ - " paddw %%mm2, %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) */ \ - " psrlw $15, %%mm2 \n\t" \ - " paddw %%mm2, %%mm0 \n\t" /* Truncate mm0, now it is op[4] */ \ - \ - " movq %%mm3, %%mm2 \n\t" \ - " movq %%mm0," #ip4 " \n\t" /* save ip4, now mm0,mm2 are free */ \ - \ - " movq %%mm3, %%mm0 \n\t" \ - " pmulhw "M(xC4S4)", %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \ - \ - " psrlw $15, %%mm2 \n\t" \ - " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \ - " paddw %%mm2, %%mm3 \n\t" /* Truncate mm3, now it is op[0] */ \ - \ - " movq %%mm3," #ip0 " \n\t" \ - /* ------------------------------------------------------------------- */ \ - " movq " #temp ", %%mm3 \n\t" /* mm3 = irot_input_y */ \ - " pmulhw "M(xC2S6)", %%mm3 \n\t" /* mm3 = xC2S6 * irot_input_y - irot_input_y */ \ - \ - " movq " #temp ", %%mm2 \n\t" \ - " movq %%mm2, %%mm0 \n\t" \ - \ - " psrlw $15, %%mm2 \n\t" /* mm3 = xC2S6 * irot_input_y */ \ - " paddw %%mm0, %%mm3 \n\t" \ - \ - " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \ - " movq %%mm5, %%mm0 \n\t" \ - \ - " movq %%mm5, %%mm2 \n\t" \ - " pmulhw "M(xC6S2)", %%mm0 \n\t" /* mm0 = xC6S2 * irot_input_x */ \ - \ - " psrlw $15, %%mm2 \n\t" \ - " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \ - \ - " paddsw %%mm0, %%mm3 \n\t" /* ip[2] */ \ - " movq %%mm3," #ip2 " \n\t" /* Save ip2 */ \ - \ - " movq %%mm5, %%mm0 \n\t" \ - " movq %%mm5, %%mm2 \n\t" \ - \ - " pmulhw "M(xC2S6)", %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x - irot_input_x */ \ - " psrlw $15, %%mm2 \n\t" \ - \ - " movq " #temp ", %%mm3 \n\t" \ - " paddw %%mm0, %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x */ \ - \ - " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \ - " movq %%mm3, %%mm2 \n\t" \ - \ - " pmulhw "M(xC6S2)", %%mm3 \n\t" /* mm3 = xC6S2 * irot_input_y */ \ - " psrlw $15, %%mm2 \n\t" \ - \ - " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \ - " psubsw %%mm5, %%mm3 \n\t" \ - \ - " movq %%mm3," #ip6 " \n\t" \ - /* ------------------------------------------------------------------- */ \ - " movq "M(xC4S4)", %%mm0 \n\t" \ - " movq %%mm1, %%mm2 \n\t" \ - " movq %%mm1, %%mm3 \n\t" \ - \ - " pmulhw %%mm0, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \ - " psrlw $15, %%mm2 \n\t" \ - \ - " paddw %%mm3, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) */ \ - " paddw %%mm2, %%mm1 \n\t" /* Truncate mm1, now it is icommon_product1 */ \ - \ - " movq %%mm7, %%mm2 \n\t" \ - " movq %%mm7, %%mm3 \n\t" \ - \ - " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \ - " psrlw $15, %%mm2 \n\t" \ - \ - " paddw %%mm3, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \ - " paddw %%mm2, %%mm7 \n\t" /* Truncate mm7, now it is icommon_product2 */ \ - /* ------------------------------------------------------------------- */ \ - " pxor %%mm0, %%mm0 \n\t" /* Clear mm0 */ \ - " psubsw %%mm6, %%mm0 \n\t" /* mm0 = - id34 */ \ - \ - " psubsw %%mm7, %%mm0 \n\t" /* mm0 = - ( id34 + idcommon_product2 ) */ \ - " paddsw %%mm6, %%mm6 \n\t" \ - " paddsw %%mm0, %%mm6 \n\t" /* mm6 = id34 - icommon_product2 */ \ - \ - " psubsw %%mm1, %%mm4 \n\t" /* mm4 = id07 - icommon_product1 */ \ - " paddsw %%mm1, %%mm1 \n\t" \ - " paddsw %%mm4, %%mm1 \n\t" /* mm1 = id07 + icommon_product1 */ \ - /* ------------------------------------------------------------------- */ \ - " movq "M(xC1S7)", %%mm7 \n\t" \ - " movq %%mm1, %%mm2 \n\t" \ - \ - " movq %%mm1, %%mm3 \n\t" \ - " pmulhw %%mm7, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - irot_input_x */ \ - \ - " movq "M(xC7S1)", %%mm7 \n\t" \ - " psrlw $15, %%mm2 \n\t" \ - \ - " paddw %%mm3, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x */ \ - " paddw %%mm2, %%mm1 \n\t" /* Trucated */ \ - \ - " pmulhw %%mm7, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x */ \ - " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \ - \ - " movq %%mm0, %%mm5 \n\t" \ - " movq %%mm0, %%mm2 \n\t" \ - \ - " movq "M(xC1S7)", %%mm7 \n\t" \ - " pmulhw %%mm7, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y - irot_input_y */ \ - \ - " movq "M(xC7S1)", %%mm7 \n\t" \ - " psrlw $15, %%mm2 \n\t" \ - \ - " paddw %%mm5, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y */ \ - " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \ - \ - " pmulhw %%mm7, %%mm5 \n\t" /* mm5 = xC7S1 * irot_input_y */ \ - " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \ - \ - " psubsw %%mm5, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */ \ - " paddsw %%mm0, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = ip7 */ \ - \ - " movq %%mm1," #ip1 " \n\t" \ - " movq %%mm3," #ip7 " \n\t" \ - /* ------------------------------------------------------------------- */ \ - " movq "M(xC3S5)", %%mm0 \n\t" \ - " movq "M(xC5S3)", %%mm1 \n\t" \ - \ - " movq %%mm6, %%mm5 \n\t" \ - " movq %%mm6, %%mm7 \n\t" \ - \ - " movq %%mm4, %%mm2 \n\t" \ - " movq %%mm4, %%mm3 \n\t" \ - \ - " pmulhw %%mm0, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x - irot_input_x */ \ - " pmulhw %%mm1, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y - irot_input_y */ \ - \ - " psrlw $15, %%mm2 \n\t" \ - " psrlw $15, %%mm5 \n\t" \ - \ - " paddw %%mm3, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x */ \ - " paddw %%mm7, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y */ \ - \ - " paddw %%mm2, %%mm4 \n\t" /* Truncated */ \ - " paddw %%mm5, %%mm6 \n\t" /* Truncated */ \ - \ - " psubsw %%mm6, %%mm4 \n\t" /* ip3 */ \ - " movq %%mm4," #ip3 " \n\t" \ - \ - " movq %%mm3, %%mm4 \n\t" \ - " movq %%mm7, %%mm6 \n\t" \ - \ - " pmulhw %%mm1, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x - irot_input_x */ \ - " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y - irot_input_y */ \ - \ - " paddw %%mm2, %%mm4 \n\t" \ - " paddw %%mm5, %%mm6 \n\t" \ - \ - " paddw %%mm4, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x */ \ - " paddw %%mm6, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y */ \ - \ - " paddw %%mm7, %%mm3 \n\t" /* ip5 */ \ - " movq %%mm3," #ip5 " \n\t" - -#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7, \ - op0,op1,op2,op3,op4,op5,op6,op7) \ - " movq " #ip0 ", %%mm0 \n\t" /* mm0 = a0 a1 a2 a3 */ \ - " movq " #ip4 ", %%mm4 \n\t" /* mm4 = e4 e5 e6 e7 */ \ - " movq " #ip1 ", %%mm1 \n\t" /* mm1 = b0 b1 b2 b3 */ \ - " movq " #ip5 ", %%mm5 \n\t" /* mm5 = f4 f5 f6 f7 */ \ - " movq " #ip2 ", %%mm2 \n\t" /* mm2 = c0 c1 c2 c3 */ \ - " movq " #ip6 ", %%mm6 \n\t" /* mm6 = g4 g5 g6 g7 */ \ - " movq " #ip3 ", %%mm3 \n\t" /* mm3 = d0 d1 d2 d3 */ \ - " movq %%mm1," #op1 " \n\t" /* save b0 b1 b2 b3 */ \ - " movq " #ip7 ", %%mm7 \n\t" /* mm7 = h0 h1 h2 h3 */ \ - /* Transpose 2x8 block */ \ - " movq %%mm4, %%mm1 \n\t" /* mm1 = e3 e2 e1 e0 */ \ - " punpcklwd %%mm5, %%mm4 \n\t" /* mm4 = f1 e1 f0 e0 */ \ - " movq %%mm0," #op0 " \n\t" /* save a3 a2 a1 a0 */ \ - " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */ \ - " movq %%mm6, %%mm0 \n\t" /* mm0 = g3 g2 g1 g0 */ \ - " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */ \ - " movq %%mm4, %%mm5 \n\t" /* mm5 = f1 e1 f0 e0 */ \ - " punpckldq %%mm6, %%mm4 \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \ - " punpckhdq %%mm6, %%mm5 \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \ - " movq %%mm1, %%mm6 \n\t" /* mm6 = f3 e3 f2 e2 */ \ - " movq %%mm4," #op4 " \n\t" \ - " punpckhwd %%mm7, %%mm0 \n\t" /* mm0 = h3 g3 h2 g2 */ \ - " movq %%mm5," #op5 " \n\t" \ - " punpckhdq %%mm0, %%mm6 \n\t" /* mm6 = h3 g3 f3 e3 = MM7 */ \ - " movq " #op0 ", %%mm4 \n\t" /* mm4 = a3 a2 a1 a0 */ \ - " punpckldq %%mm0, %%mm1 \n\t" /* mm1 = h2 g2 f2 e2 = MM6 */ \ - " movq " #op1 ", %%mm5 \n\t" /* mm5 = b3 b2 b1 b0 */ \ - " movq %%mm4, %%mm0 \n\t" /* mm0 = a3 a2 a1 a0 */ \ - " movq %%mm6," #op7 " \n\t" \ - " punpcklwd %%mm5, %%mm0 \n\t" /* mm0 = b1 a1 b0 a0 */ \ - " movq %%mm1," #op6 " \n\t" \ - " punpckhwd %%mm5, %%mm4 \n\t" /* mm4 = b3 a3 b2 a2 */ \ - " movq %%mm2, %%mm5 \n\t" /* mm5 = c3 c2 c1 c0 */ \ - " punpcklwd %%mm3, %%mm2 \n\t" /* mm2 = d1 c1 d0 c0 */ \ - " movq %%mm0, %%mm1 \n\t" /* mm1 = b1 a1 b0 a0 */ \ - " punpckldq %%mm2, %%mm0 \n\t" /* mm0 = d0 c0 b0 a0 = MM0 */ \ - " punpckhdq %%mm2, %%mm1 \n\t" /* mm1 = d1 c1 b1 a1 = MM1 */ \ - " movq %%mm4, %%mm2 \n\t" /* mm2 = b3 a3 b2 a2 */ \ - " movq %%mm0," #op0 " \n\t" \ - " punpckhwd %%mm3, %%mm5 \n\t" /* mm5 = d3 c3 d2 c2 */ \ - " movq %%mm1," #op1 " \n\t" \ - " punpckhdq %%mm5, %%mm4 \n\t" /* mm4 = d3 c3 b3 a3 = MM3 */ \ - " punpckldq %%mm5, %%mm2 \n\t" /* mm2 = d2 c2 b2 a2 = MM2 */ \ - " movq %%mm4," #op3 " \n\t" \ - " movq %%mm2," #op2 " \n\t" - - -static void -fdct8x8theora_mmx(int16_t *src, int16_t *dest) -{ - int64_t __attribute__((aligned(8))) align_tmp[16]; - int16_t *const temp= (int16_t*)align_tmp; - - __asm__ __volatile__ ( - " .balign 16 \n\t" - /* - * Input data is an 8x8 block. To make processing of the data more efficent - * we will transpose the block of data to two 4x8 blocks??? - */ - Transpose_mmx ( (%0), 16(%0), 32(%0), 48(%0), 8(%0), 24(%0), 40(%0), 56(%0), - (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1)) - Fdct_mmx ( (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1), (%2)) - - Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0), - 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1)) - Fdct_mmx (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2)) - - Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), - 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1)) - Fdct_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2)) - - Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), - 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1)) - Fdct_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2)) - - " emms \n\t" - - : "+r" (src), - "+r" (dest) - : "r" (temp) - : "memory" - ); -} - -OIL_DEFINE_IMPL_FULL (fdct8x8theora_mmx, fdct8x8theora, OIL_IMPL_FLAG_MMX); - diff --git a/liboil/dct/idct8x8_i386.c b/liboil/dct/idct8x8_i386.c deleted file mode 100644 index e8a88c2..0000000 --- a/liboil/dct/idct8x8_i386.c +++ /dev/null @@ -1,744 +0,0 @@ -/* - * LIBOIL - Library of Optimized Inner Loops - * Copyright (c) 2004 David A. Schleef <ds@schleef.org> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, - * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include <liboil/liboil.h> -#include <liboil/dct/dct.h> -#include <math.h> - - -OIL_DECLARE_CLASS (idct8x8_s16); -OIL_DECLARE_CLASS (dct8x8_s16); - -#define CONST(x) (32768.0*(x) + 0.5) - -#define C1_0000 (32767) -#define C0_9808 CONST(0.980785280) -#define C0_9239 CONST(0.923879532) -#define C0_8315 CONST(0.831469612) -#define C0_7071 CONST(0.707106781) -#define C0_5556 CONST(0.555570233) -#define C0_3827 CONST(0.382683432) -#define C0_1951 CONST(0.195090322) - -#define FOUR(x) { x, x, x, x } -#define MMX_CONST(x) {32768.0*(x) + 0.5,32768.0*(x) + 0.5,32768.0*(x) + 0.5,32768.0*(x) + 0.5} - -static const int16_t -dct_mmx_constants [][4] = { - FOUR(0), - FOUR(C0_9808), - FOUR(C0_9239), - FOUR(C0_8315), - FOUR(C0_7071), - FOUR(C0_5556), - FOUR(C0_3827), - FOUR(C0_1951), - { 1, 1, -1, -1 }, // 64 - { 1, -1, 1, -1 }, - { C1_0000, C0_9239, C0_7071, C0_3827 }, // 80 - { C1_0000, C0_3827, C0_7071, C0_9239 }, // 88 - { C0_9808, C0_8315, C0_5556, C0_1951 }, // 96 - { C0_8315, C0_1951, C0_9808, C0_5556 }, // 104 - { 1, -1, -1, -1 }, - { C0_5556, C0_9808, C0_1951, C0_8315 }, // 120 - { 1, -1, 1, 1 }, - { C0_1951, C0_5556, C0_8315, C0_9808 }, // 136 - { 1, -1, 1, -1 }, - FOUR(CONST(0.5)), //152 - { C0_7071, C0_9239, C0_7071, C0_3827 }, // 160 - { C0_7071, C0_3827, C0_7071, C0_9239 }, // 168 -}; - -static void -idct8x8_s16_mmx (int16_t *dest, int dstr, int16_t *src, int sstr) -{ - int32_t tmp[32]; - int32_t save_ebx; - - asm volatile ( - " movl %%ebx, %6 \n" - /* left half */ - " movl %1, %%eax \n" // src - " movl %3, %%ebx \n" // sstr - " leal (%%eax,%%ebx,4),%%ecx \n" // src + sstr * 4 - - " movq (%%eax), %%mm0 \n" - " movq (%%eax), %%mm1 \n" - " paddsw (%%ecx), %%mm0 \n" // ss07s34 - " psubsw (%%ecx), %%mm1 \n" // ss16s25 - " pmulhw 32(%5), %%mm0 \n" // .7071 - " pmulhw 32(%5), %%mm1 \n" // .7071 - - " movq (%%eax,%%ebx,2), %%mm2 \n" - " movq (%%eax,%%ebx,2), %%mm3 \n" - " movq (%%ecx,%%ebx,2), %%mm4 \n" - " movq (%%ecx,%%ebx,2), %%mm5 \n" - " pmulhw 16(%5), %%mm2 \n" // .9239 - " pmulhw 48(%5), %%mm3 \n" // .3827 - " pmulhw 48(%5), %%mm4 \n" // .3827 - " pmulhw 16(%5), %%mm5 \n" // .9239 - " paddsw %%mm4, %%mm2 \n" // ds07s34 - " psubsw %%mm5, %%mm3 \n" // ds16s25 - - " movq %%mm0, %%mm4 \n" - " movq %%mm1, %%mm5 \n" - " paddsw %%mm2, %%mm0 \n" // s07 - " psubsw %%mm2, %%mm4 \n" // s34 - " paddsw %%mm3, %%mm1 \n" // s16 - " psubsw %%mm3, %%mm5 \n" // s25 - - " movq %%mm0, 0(%4) \n" - " movq %%mm1, 8(%4) \n" - " movq %%mm5, 16(%4) \n" - " movq %%mm4, 24(%4) \n" - - " addl %3, %%eax \n" - " addl %3, %%ecx \n" - - " movq (%%eax), %%mm0 \n" - " pmulhw 8(%5), %%mm0 \n" - " movq (%%eax,%%ebx,2), %%mm1 \n" - " pmulhw 24(%5), %%mm1 \n" - " paddsw %%mm1, %%mm0 \n" - " movq (%%ecx), %%mm1 \n" - " pmulhw 40(%5), %%mm1 \n" - " paddsw %%mm1, %%mm0 \n" - " movq (%%ecx,%%ebx,2), %%mm1 \n" - " pmulhw 56(%5), %%mm1 \n" - " paddsw %%mm1, %%mm0 \n" // d07 - - " movq (%%eax), %%mm2 \n" - " pmulhw 24(%5), %%mm2 \n" - " movq (%%eax,%%ebx,2), %%mm1 \n" - " pmulhw 56(%5), %%mm1 \n" - " psubsw %%mm1, %%mm2 \n" - " movq (%%ecx), %%mm1 \n" - " pmulhw 8(%5), %%mm1 \n" - " psubsw %%mm1, %%mm2 \n" - " movq (%%ecx,%%ebx,2), %%mm1 \n" - " pmulhw 40(%5), %%mm1 \n" - " psubsw %%mm1, %%mm2 \n" // d16 - - " movq (%%eax), %%mm3 \n" - " pmulhw 40(%5), %%mm3 \n" - " movq (%%eax,%%ebx,2), %%mm1 \n" - " pmulhw 8(%5), %%mm1 \n" - " psubsw %%mm1, %%mm3 \n" - " movq (%%ecx), %%mm1 \n" - " pmulhw 56(%5), %%mm1 \n" - " paddsw %%mm1, %%mm3 \n" - " movq (%%ecx,%%ebx,2), %%mm1 \n" - " pmulhw 24(%5), %%mm1 \n" - " paddsw %%mm1, %%mm3 \n" // d25 - - " movq (%%eax), %%mm4 \n" - " pmulhw 56(%5), %%mm4 \n" - " movq (%%eax,%%ebx,2), %%mm1 \n" - " pmulhw 40(%5), %%mm1 \n" - " psubsw %%mm1, %%mm4 \n" - " movq (%%ecx), %%mm1 \n" - " pmulhw 24(%5), %%mm1 \n" - " paddsw %%mm1, %%mm4 \n" - " movq (%%ecx,%%ebx,2), %%mm1 \n" - " pmulhw 8(%5), %%mm1 \n" - " psubsw %%mm1, %%mm4 \n" // d34 - - " movl %0, %%eax \n" // dest - " movl %2, %%ebx \n" // dstr - " leal (%%ebx, %%ebx, 2), %%edx \n" // dstr*3 - - " movq %%mm0, %%mm1 \n" - " paddsw 0(%4), %%mm1 \n" - " movq %%mm1, (%%eax) \n" - - " movq %%mm2, %%mm1 \n" - " paddsw 8(%4), %%mm1 \n" - " movq %%mm1, (%%eax, %%ebx, 1) \n" - - " movq %%mm3, %%mm1 \n" - " paddsw 16(%4), %%mm1 \n" - " movq %%mm1, (%%eax, %%ebx, 2) \n" // s25 + d25 - - " movq %%mm4, %%mm1 \n" - " paddsw 24(%4), %%mm1 \n" - " movq %%mm1, (%%eax, %%edx, 1) \n" - - " leal (%%eax, %%ebx, 4), %%eax \n" - " movq 24(%4), %%mm1 \n" - " psubsw %%mm4, %%mm1 \n" - " movq %%mm1, (%%eax) \n" - - " movq 16(%4), %%mm1 \n" - " psubsw %%mm3, %%mm1 \n" - " movq %%mm1, (%%eax, %%ebx, 1) \n" - - " movq 8(%4), %%mm1 \n" - " psubsw %%mm2, %%mm1 \n" - " movq %%mm1, (%%eax, %%ebx, 2) \n" - - " movq 0(%4), %%mm1 \n" - " psubsw %%mm0, %%mm1 \n" - " movq %%mm1, (%%eax, %%edx, 1) \n" - - /* right half */ - " movl %1, %%eax \n" // src - " movl %3, %%ebx \n" // sstr - " leal (%%eax,%%ebx,4),%%ecx \n" // src + sstr * 4 - - " movq 8(%%eax), %%mm0 \n" - " movq 8(%%eax), %%mm1 \n" - " paddsw 8(%%ecx), %%mm0 \n" // ss07s34 - " psubsw 8(%%ecx), %%mm1 \n" // ss16s25 - " pmulhw 32(%5), %%mm0 \n" // .7071 - " pmulhw 32(%5), %%mm1 \n" // .7071 - - " movq 8(%%eax,%%ebx,2), %%mm2 \n" - " movq 8(%%eax,%%ebx,2), %%mm3 \n" - " movq 8(%%ecx,%%ebx,2), %%mm4 \n" - " movq 8(%%ecx,%%ebx,2), %%mm5 \n" - " pmulhw 16(%5), %%mm2 \n" // .9239 - " pmulhw 48(%5), %%mm3 \n" // .3827 - " pmulhw 48(%5), %%mm4 \n" // .3827 - " pmulhw 16(%5), %%mm5 \n" // .9239 - " paddsw %%mm4, %%mm2 \n" // ds07s34 - " psubsw %%mm5, %%mm3 \n" // ds16s25 - - " movq %%mm0, %%mm4 \n" - " movq %%mm1, %%mm5 \n" - " paddsw %%mm2, %%mm0 \n" // s07 - " psubsw %%mm2, %%mm4 \n" // s34 - " paddsw %%mm3, %%mm1 \n" // s16 - " psubsw %%mm3, %%mm5 \n" // s25 - - " movq %%mm0, 0(%4) \n" - " movq %%mm1, 8(%4) \n" - " movq %%mm5, 16(%4) \n" - " movq %%mm4, 24(%4) \n" - - " addl %3, %%eax \n" - " addl %3, %%ecx \n" - - " movq 8(%%eax), %%mm0 \n" - " pmulhw 8(%5), %%mm0 \n" - " movq 8(%%eax,%%ebx,2), %%mm1 \n" - " pmulhw 24(%5), %%mm1 \n" - " paddsw %%mm1, %%mm0 \n" - " movq 8(%%ecx), %%mm1 \n" - " pmulhw 40(%5), %%mm1 \n" - " paddsw %%mm1, %%mm0 \n" - " movq 8(%%ecx,%%ebx,2), %%mm1 \n" - " pmulhw 56(%5), %%mm1 \n" - " paddsw %%mm1, %%mm0 \n" // d07 - - " movq 8(%%eax), %%mm2 \n" - " pmulhw 24(%5), %%mm2 \n" - " movq 8(%%eax,%%ebx,2), %%mm1 \n" - " pmulhw 56(%5), %%mm1 \n" - " psubsw %%mm1, %%mm2 \n" - " movq 8(%%ecx), %%mm1 \n" - " pmulhw 8(%5), %%mm1 \n" - " psubsw %%mm1, %%mm2 \n" - " movq 8(%%ecx,%%ebx,2), %%mm1 \n" - " pmulhw 40(%5), %%mm1 \n" - " psubsw %%mm1, %%mm2 \n" // d16 - - " movq 8(%%eax), %%mm3 \n" - " pmulhw 40(%5), %%mm3 \n" - " movq 8(%%eax,%%ebx,2), %%mm1 \n" - " pmulhw 8(%5), %%mm1 \n" - " psubsw %%mm1, %%mm3 \n" - " movq 8(%%ecx), %%mm1 \n" - " pmulhw 56(%5), %%mm1 \n" - " paddsw %%mm1, %%mm3 \n" - " movq 8(%%ecx,%%ebx,2), %%mm1 \n" - " pmulhw 24(%5), %%mm1 \n" - " paddsw %%mm1, %%mm3 \n" // d25 - - " movq 8(%%eax), %%mm4 \n" - " pmulhw 56(%5), %%mm4 \n" - " movq 8(%%eax,%%ebx,2), %%mm1 \n" - " pmulhw 40(%5), %%mm1 \n" - " psubsw %%mm1, %%mm4 \n" - " movq 8(%%ecx), %%mm1 \n" - " pmulhw 24(%5), %%mm1 \n" - " paddsw %%mm1, %%mm4 \n" - " movq 8(%%ecx,%%ebx,2), %%mm1 \n" - " pmulhw 8(%5), %%mm1 \n" - " psubsw %%mm1, %%mm4 \n" // d34 - - " movl %0, %%eax \n" // dest - " movl %2, %%ebx \n" // dstr - " leal (%%ebx, %%ebx, 2), %%edx \n" // dstr*3 - - " movq %%mm0, %%mm1 \n" - " paddsw 0(%4), %%mm1 \n" - " movq %%mm1, 8(%%eax) \n" - - " movq %%mm2, %%mm1 \n" - " paddsw 8(%4), %%mm1 \n" - " movq %%mm1, 8(%%eax, %%ebx, 1) \n" - - " movq %%mm3, %%mm1 \n" - " paddsw 16(%4), %%mm1 \n" - " movq %%mm1, 8(%%eax, %%ebx, 2) \n" // s25 + d25 - - " movq %%mm4, %%mm1 \n" - " paddsw 24(%4), %%mm1 \n" - " movq %%mm1, 8(%%eax, %%edx, 1) \n" - - " leal (%%eax, %%ebx, 4), %%eax \n" - " movq 24(%4), %%mm1 \n" - " psubsw %%mm4, %%mm1 \n" - " movq %%mm1, 8(%%eax) \n" - - " movq 16(%4), %%mm1 \n" - " psubsw %%mm3, %%mm1 \n" - " movq %%mm1, 8(%%eax, %%ebx, 1) \n" - - " movq 8(%4), %%mm1 \n" - " psubsw %%mm2, %%mm1 \n" - " movq %%mm1, 8(%%eax, %%ebx, 2) \n" - - " movq 0(%4), %%mm1 \n" - " psubsw %%mm0, %%mm1 \n" - " movq %%mm1, 8(%%eax, %%edx, 1) \n" - - - /* rows */ - " movl %0, %%eax \n" /* dest */ -#define LOOP \ - " pshufw $0x88, 0(%%eax), %%mm0 \n" /* x0 x2 x0 x2 */ \ - " pshufw $0x88, 8(%%eax), %%mm1 \n" /* x4 x6 x4 x6 */ \ - " pmulhw 160(%5), %%mm0 \n" /* 0.707 0.9239 0.707 0.3827 */ \ - " pmulhw 168(%5), %%mm1 \n" /* 0.707 0.3827 0.707 0.9239 */ \ - " pmullw 64(%5), %%mm1 \n" /* 1 1 -1 -1 */ \ - " paddsw %%mm1, %%mm0 \n" /* ss07s34 ds07s34 ss16s25 ds16s25 */ \ - \ - " pshufw $0xa0, %%mm0, %%mm1 \n" /* ss07s34 ss07s34 ss16s25 ss16s25 */ \ - " pshufw $0xf5, %%mm0, %%mm2 \n" /* ds07s34 ds07s34 ds16s25 ds16s25 */ \ - " pmullw 72(%5), %%mm2 \n" /* 1 -1 1 -1 */ \ - " paddsw %%mm2, %%mm1 \n" /* s07 s34 s16 s25 */ \ - " pshufw $0x78, %%mm1, %%mm2 \n" /* s07 s16 s25 s34 */ \ - \ - " pshufw $0x55, 0(%%eax), %%mm0 \n" \ - " pmulhw 96(%5), %%mm0 \n" \ - " pshufw $0xff, 0(%%eax), %%mm1 \n" \ - " pmulhw 104(%5), %%mm1 \n" \ - " pmullw 112(%5), %%mm1 \n" \ - " paddsw %%mm1, %%mm0 \n" \ - " pshufw $0x55, 8(%%eax), %%mm1 \n" \ - " pmulhw 120(%5), %%mm1 \n" \ - " pmullw 128(%5), %%mm1 \n" \ - " paddsw %%mm1, %%mm0 \n" \ - " pshufw $0xff, 8(%%eax), %%mm1 \n" \ - " pmulhw 136(%5), %%mm1 \n" \ - " pmullw 144(%5), %%mm1 \n" \ - " paddsw %%mm1, %%mm0 \n" \ - \ - " movq %%mm2, %%mm1 \n" \ - " paddsw %%mm0, %%mm1 \n" \ - " psubsw %%mm0, %%mm2 \n" \ - " pshufw $0x1b, %%mm2, %%mm2 \n" \ - \ - " movq %%mm1, 0(%%eax) \n" \ - " movq %%mm2, 8(%%eax) \n" \ - " addl %3, %%eax \n" - - LOOP - LOOP - LOOP - LOOP - LOOP - LOOP - LOOP - LOOP -#undef LOOP - - " movl %6, %%ebx \n" - " emms \n" - : - : "m" (dest), "m" (src), "m" (dstr), "m" (sstr), "r" (tmp), "r" (dct_mmx_constants), "m" (save_ebx) - : "eax", "ecx", "edx"); -} -OIL_DEFINE_IMPL_FULL (idct8x8_s16_mmx, idct8x8_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT); - -#if 0 -#define CONST(x) (32768.0*(x) + 0.5) - -#define C1_0000 (32767) -#define C0_9808 CONST(0.980785280) -#define C0_9239 CONST(0.923879532) -#define C0_8315 CONST(0.831469612) -#define C0_7071 CONST(0.707106781) -#define C0_5556 CONST(0.555570233) -#define C0_3827 CONST(0.382683432) -#define C0_1951 CONST(0.195090322) - -#define FOUR(x) { x, x, x, x } -#define MMX_CONST(x) {32768.0*(x) + 0.5,32768.0*(x) + 0.5,32768.0*(x) + 0.5,32768.0*(x) + 0.5} - -static const int16_t -dct_mmx_constants [][4] = { - FOUR(0), - FOUR(C0_9808), - FOUR(C0_9239), - FOUR(C0_8315), - FOUR(C0_7071), - FOUR(C0_5556), - FOUR(C0_3827), - FOUR(C0_1951), - { 1, 1, -1, -1 }, // 64 - { 1, -1, 1, -1 }, - { C1_0000, C0_9239, C0_7071, C0_3827 }, // 80 - { C1_0000, C0_3827, C0_7071, C0_9239 }, // 88 - { C0_9808, C0_8315, C0_5556, C0_1951 }, // 96 - { C0_8315, C0_1951, C0_9808, C0_5556 }, // 104 - { 1, -1, -1, -1 }, - { C0_5556, C0_9808, C0_1951, C0_8315 }, // 120 - { 1, -1, 1, 1 }, - { C0_1951, C0_5556, C0_8315, C0_9808 }, // 136 - { 1, -1, 1, -1 }, -}; -#endif - -/* a 3dnow version can use pmulhrw instead of pmulhw for increased - * accuracy */ -static void -fdct8x8s_s16_mmx (uint16_t *dest, int dstr, uint16_t *src, int sstr) -{ - int32_t tmp[32]; - int32_t save_ebx; - - asm volatile ( - " movl %%ebx, %6 \n" - /* first half */ - " movl %1, %%eax \n" // src - " movl %3, %%ebx \n" // sstr - " leal (%%ebx,%%ebx,2),%%edx \n" // sstr * 3 - " leal (%%eax,%%ebx,4),%%ecx \n" // src + sstr * 4 - - " movq (%%eax), %%mm0 \n" - " movq (%%eax), %%mm1 \n" - " paddsw (%%ecx,%%edx,1), %%mm0 \n" // s07 - " psubsw (%%ecx,%%edx,1), %%mm1 \n" // d07 - " movq %%mm1, (%4) \n" - - " movq (%%eax,%%ebx), %%mm2 \n" - " movq (%%eax,%%ebx), %%mm3 \n" - " paddsw (%%ecx,%%ebx,2), %%mm2 \n" // s16 - " psubsw (%%ecx,%%ebx,2), %%mm3 \n" // d16 - " movq %%mm3, 8(%4) \n" - - " movq (%%eax,%%ebx,2), %%mm1 \n" - " movq (%%eax,%%ebx,2), %%mm4 \n" - " paddsw (%%ecx,%%ebx), %%mm1 \n" // s25 - " psubsw (%%ecx,%%ebx), %%mm4 \n" // d25 - " movq %%mm4, 16(%4) \n" - - " movq (%%eax,%%edx), %%mm3 \n" - " movq (%%eax,%%edx), %%mm5 \n" - " paddsw (%%ecx), %%mm3 \n" // s34 - " psubsw (%%ecx), %%mm5 \n" // d34 - " movq %%mm5, 24(%4) \n" - - " movq %%mm0, %%mm4 \n" - " paddsw %%mm3, %%mm0 \n" // ss07s34 - " psubsw %%mm3, %%mm4 \n" // ds07s34 - - " movq %%mm2, %%mm5 \n" - " paddsw %%mm1, %%mm2 \n" // ss16s25 - " psubsw %%mm1, %%mm5 \n" // ds16s25 - - " movq %%mm0, %%mm1 \n" - " paddsw %%mm2, %%mm1 \n" - " pmulhw 32(%5), %%mm1 \n" // .7071 - " psubsw %%mm2, %%mm0 \n" - " pmulhw 32(%5), %%mm0 \n" // .7071 - - " movq %%mm4, %%mm2 \n" - " pmulhw 16(%5), %%mm2 \n" - " movq %%mm5, %%mm6 \n" - " pmulhw 48(%5), %%mm6 \n" - " paddsw %%mm6, %%mm2 \n" // - - " pmulhw 48(%5), %%mm4 \n" - " pmulhw 16(%5), %%mm5 \n" - " psubsw %%mm5, %%mm4 \n" // - - " movl %0, %%eax \n" // dest - " movl %2, %%ebx \n" // dstr - " add %%ebx, %%ebx \n" - " leal (%%ebx,%%ebx,2),%%edx \n" // dstr * 3 - " movq %%mm1, 0(%%eax) \n" - " movq %%mm2, 0(%%eax,%%ebx) \n" - " movq %%mm0, 0(%%eax,%%ebx,2) \n" - " movq %%mm4, 0(%%eax,%%edx) \n" - - " add %2, %%eax \n" - " movq 0(%4), %%mm0 \n" - " pmulhw 8(%5), %%mm0 \n" - " movq 8(%4), %%mm1 \n" - " pmulhw 24(%5), %%mm1 \n" - " paddsw %%mm1, %%mm0 \n" - " movq 16(%4), %%mm1 \n" - " pmulhw 40(%5), %%mm1 \n" - " paddsw %%mm1, %%mm0 \n" - " movq 24(%4), %%mm1 \n" - " pmulhw 56(%5), %%mm1 \n" - " paddsw %%mm1, %%mm0 \n" - " movq %%mm0, (%%eax) \n" - - " movq 0(%4), %%mm0 \n" - " pmulhw 24(%5), %%mm0 \n" - " movq 8(%4), %%mm1 \n" - " pmulhw 56(%5), %%mm1 \n" - " psubsw %%mm1, %%mm0 \n" - " movq 16(%4), %%mm1 \n" - " pmulhw 8(%5), %%mm1 \n" - " psubsw %%mm1, %%mm0 \n" - " movq 24(%4), %%mm1 \n" - " pmulhw 40(%5), %%mm1 \n" - " psubsw %%mm1, %%mm0 \n" - " movq %%mm0, (%%eax,%%ebx) \n" - - " movq 0(%4), %%mm0 \n" - " pmulhw 40(%5), %%mm0 \n" - " movq 8(%4), %%mm1 \n" - " pmulhw 8(%5), %%mm1 \n" - " psubsw %%mm1, %%mm0 \n" - " movq 16(%4), %%mm1 \n" - " pmulhw 56(%5), %%mm1 \n" - " paddsw %%mm1, %%mm0 \n" - " movq 24(%4), %%mm1 \n" - " pmulhw 24(%5), %%mm1 \n" - " paddsw %%mm1, %%mm0 \n" - " movq %%mm0, (%%eax,%%ebx,2) \n" - - " movq 0(%4), %%mm0 \n" - " pmulhw 56(%5), %%mm0 \n" - " movq 8(%4), %%mm1 \n" - " pmulhw 40(%5), %%mm1 \n" - " psubsw %%mm1, %%mm0 \n" - " movq 16(%4), %%mm1 \n" - " pmulhw 24(%5), %%mm1 \n" - " paddsw %%mm1, %%mm0 \n" - " movq 24(%4), %%mm1 \n" - " pmulhw 8(%5), %%mm1 \n" - " psubsw %%mm1, %%mm0 \n" - " movq %%mm0, (%%eax,%%edx) \n" - - /* second half */ - - " movl %1, %%eax \n" // src - " add $8, %%eax \n" - " movl %3, %%ebx \n" // sstr - " leal (%%ebx,%%ebx,2),%%edx \n" // sstr * 3 - " leal (%%eax,%%ebx,4),%%ecx \n" // src + sstr * 4 - - " movq (%%eax), %%mm0 \n" - " movq (%%eax), %%mm1 \n" - " paddsw (%%ecx,%%edx,1), %%mm0 \n" // s07 - " psubsw (%%ecx,%%edx,1), %%mm1 \n" // d07 - " movq %%mm1, (%4) \n" - - " movq (%%eax,%%ebx), %%mm2 \n" - " movq (%%eax,%%ebx), %%mm3 \n" - " paddsw (%%ecx,%%ebx,2), %%mm2 \n" // s16 - " psubsw (%%ecx,%%ebx,2), %%mm3 \n" // d16 - " movq %%mm3, 8(%4) \n" - - " movq (%%eax,%%ebx,2), %%mm1 \n" - " movq (%%eax,%%ebx,2), %%mm4 \n" - " paddsw (%%ecx,%%ebx), %%mm1 \n" // s25 - " psubsw (%%ecx,%%ebx), %%mm4 \n" // d25 - " movq %%mm4, 16(%4) \n" - - " movq (%%eax,%%edx), %%mm3 \n" - " movq (%%eax,%%edx), %%mm5 \n" - " paddsw (%%ecx), %%mm3 \n" // s34 - " psubsw (%%ecx), %%mm5 \n" // d34 - " movq %%mm5, 24(%4) \n" - - " movq %%mm0, %%mm4 \n" - " paddsw %%mm3, %%mm0 \n" // ss07s34 - " psubsw %%mm3, %%mm4 \n" // ds07s34 - - " movq %%mm2, %%mm5 \n" - " paddsw %%mm1, %%mm2 \n" // ss16s25 - " psubsw %%mm1, %%mm5 \n" // ds16s25 - - " movq %%mm0, %%mm1 \n" - " paddsw %%mm2, %%mm1 \n" - " pmulhw 32(%5), %%mm1 \n" // .7071 - " psubsw %%mm2, %%mm0 \n" - " pmulhw 32(%5), %%mm0 \n" // .7071 - - " movq %%mm4, %%mm2 \n" - " pmulhw 16(%5), %%mm2 \n" - " movq %%mm5, %%mm6 \n" - " pmulhw 48(%5), %%mm6 \n" - " paddsw %%mm6, %%mm2 \n" // - - " pmulhw 48(%5), %%mm4 \n" - " pmulhw 16(%5), %%mm5 \n" - " psubsw %%mm5, %%mm4 \n" // - - " movl %0, %%eax \n" // dest - " add $8, %%eax \n" - " movl %2, %%ebx \n" // dstr - " add %%ebx, %%ebx \n" - " leal (%%ebx,%%ebx,2),%%edx \n" // dstr * 3 - " movq %%mm1, 0(%%eax) \n" - " movq %%mm2, 0(%%eax,%%ebx) \n" - " movq %%mm0, 0(%%eax,%%ebx,2) \n" - " movq %%mm4, 0(%%eax,%%edx) \n" - - " add %2, %%eax \n" - " movq 0(%4), %%mm0 \n" - " pmulhw 8(%5), %%mm0 \n" - " movq 8(%4), %%mm1 \n" - " pmulhw 24(%5), %%mm1 \n" - " paddsw %%mm1, %%mm0 \n" - " movq 16(%4), %%mm1 \n" - " pmulhw 40(%5), %%mm1 \n" - " paddsw %%mm1, %%mm0 \n" - " movq 24(%4), %%mm1 \n" - " pmulhw 56(%5), %%mm1 \n" - " paddsw %%mm1, %%mm0 \n" - " movq %%mm0, (%%eax) \n" - - " movq 0(%4), %%mm0 \n" - " pmulhw 24(%5), %%mm0 \n" - " movq 8(%4), %%mm1 \n" - " pmulhw 56(%5), %%mm1 \n" - " psubsw %%mm1, %%mm0 \n" - " movq 16(%4), %%mm1 \n" - " pmulhw 8(%5), %%mm1 \n" - " psubsw %%mm1, %%mm0 \n" - " movq 24(%4), %%mm1 \n" - " pmulhw 40(%5), %%mm1 \n" - " psubsw %%mm1, %%mm0 \n" - " movq %%mm0, (%%eax,%%ebx) \n" - - " movq 0(%4), %%mm0 \n" - " pmulhw 40(%5), %%mm0 \n" - " movq 8(%4), %%mm1 \n" - " pmulhw 8(%5), %%mm1 \n" - " psubsw %%mm1, %%mm0 \n" - " movq 16(%4), %%mm1 \n" - " pmulhw 56(%5), %%mm1 \n" - " paddsw %%mm1, %%mm0 \n" - " movq 24(%4), %%mm1 \n" - " pmulhw 24(%5), %%mm1 \n" - " paddsw %%mm1, %%mm0 \n" - " movq %%mm0, (%%eax,%%ebx,2) \n" - - " movq 0(%4), %%mm0 \n" - " pmulhw 56(%5), %%mm0 \n" - " movq 8(%4), %%mm1 \n" - " pmulhw 40(%5), %%mm1 \n" - " psubsw %%mm1, %%mm0 \n" - " movq 16(%4), %%mm1 \n" - " pmulhw 24(%5), %%mm1 \n" - " paddsw %%mm1, %%mm0 \n" - " movq 24(%4), %%mm1 \n" - " pmulhw 8(%5), %%mm1 \n" - " psubsw %%mm1, %%mm0 \n" - " movq %%mm0, (%%eax,%%edx) \n" - - " movl %0, %%ecx \n" // dest - -#define LOOP \ - " movq (%%ecx), %%mm0 \n" \ - " pshufw $0x1b, 8(%%ecx), %%mm1 \n" \ - " movq %%mm0, %%mm2 \n" \ - " paddsw %%mm1, %%mm0 \n" /* s07 s16 s25 s34 */ \ - " psubsw %%mm1, %%mm2 \n" /* d07 d16 d25 d34 */ \ - \ - " pshufw $0xbb, %%mm0, %%mm1 \n" /* s25 s34 s25 s34 */ \ - " pshufw $0x44, %%mm0, %%mm0 \n" /* s07 s16 s07 s16 */ \ - \ - " pmullw 64(%5), %%mm1 \n" \ - " paddsw %%mm1, %%mm0 \n" /* ss07s34 ss16s25 ds07s34 ds16s25 */ \ - \ - " pshufw $0x88, %%mm0, %%mm1 \n" /* ss07s34 ds07s34 ss07s34 ds07s34 */ \ - " pshufw $0xdd, %%mm0, %%mm0 \n" /* ss16s25 ds16s25 ss16s25 ds16s25 */ \ - \ - " pmulhw 80(%5), %%mm1 \n" \ - \ - " pmullw 64(%5), %%mm0 \n" \ - " pmulhw 88(%5), %%mm0 \n" \ - \ - " paddsw %%mm1, %%mm0 \n" \ - \ - " pshufw $0x00, %%mm2, %%mm3 \n" \ - " pmulhw 96(%5), %%mm3 \n" \ - " pshufw $0x55, %%mm2, %%mm1 \n" \ - " pmulhw 104(%5), %%mm1 \n" \ - " pmullw 112(%5), %%mm1 \n" \ - " paddsw %%mm1, %%mm3 \n" \ - " pshufw $0xaa, %%mm2, %%mm1 \n" \ - " pmulhw 120(%5), %%mm1 \n" \ - " pmullw 128(%5), %%mm1 \n" \ - " paddsw %%mm1, %%mm3 \n" \ - " pshufw $0xff, %%mm2, %%mm1 \n" \ - " pmulhw 136(%5), %%mm1 \n" \ - " pmullw 144(%5), %%mm1 \n" \ - " paddsw %%mm1, %%mm3 \n" \ - \ - " movq %%mm0, %%mm1 \n" \ - " punpckhwd %%mm3, %%mm1 \n" \ - " punpcklwd %%mm3, %%mm0 \n" \ - \ - " movq %%mm0, (%%ecx) \n" \ - " movq %%mm1, 8(%%ecx) \n" \ - \ - " add %3, %%eax \n" \ - " add %2, %%ecx \n" - - LOOP - LOOP - LOOP - LOOP - LOOP - LOOP - LOOP - LOOP - - " emms \n" - " movl %6, %%ebx \n" - : - : "m" (dest), "m" (src), "m" (dstr), "m" (sstr), "r" (tmp), "r" (dct_mmx_constants), "m" (save_ebx) - : "eax", "ecx", "edx"); - -} -OIL_DEFINE_IMPL_FULL (fdct8x8s_s16_mmx, fdct8x8s_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT); - |