diff options
author | David Schleef <ds@schleef.org> | 2005-08-01 09:49:20 +0000 |
---|---|---|
committer | David Schleef <ds@schleef.org> | 2005-08-01 09:49:20 +0000 |
commit | 6221d08ea2092d1e8d8429ea4b24554f88ada1c9 (patch) | |
tree | 97138e634276d5d241dd7ba948610e02ad35114e | |
parent | 48c9a788517b68892535519456a3fb4424408890 (diff) | |
download | liboil-6221d08ea2092d1e8d8429ea4b24554f88ada1c9.tar.gz |
* Makefile.am: add 'foreign' to automake flags
* configure.ac: add some dirs
* examples/Makefile.am: same
* examples/memcpy-speed.c: (main): change back to gromit's cpu
* examples/oil-inspect.c: (oil_print_impl): Don't run non-runnable
implementations.
* examples/oil-test.c: (dump_array), (main): minor fixes
* examples/taylor/Makefile.am: new
* examples/taylor/example1.c: new
* liboil/Makefile.am: add dirs
* liboil/colorspace/composite.c: (composite_over_argb_noclamp_2):
alternate clamping version
* liboil/simdpack/scalarmult.c: add unrolled impls
* testsuite/instruction/check-instructions.pl: fixes
* liboil/fb/Makefile.am: new
* liboil/fb/fbmmx.h: new
* liboil/fb/fbmmx.c: new
-rw-r--r-- | ChangeLog | 20 | ||||
-rw-r--r-- | Makefile.am | 2 | ||||
-rw-r--r-- | configure.ac | 2 | ||||
-rw-r--r-- | examples/Makefile.am | 2 | ||||
-rw-r--r-- | examples/memcpy-speed.c | 2 | ||||
-rw-r--r-- | examples/oil-inspect.c | 4 | ||||
-rw-r--r-- | examples/oil-test.c | 7 | ||||
-rw-r--r-- | examples/taylor/Makefile.am | 8 | ||||
-rw-r--r-- | examples/taylor/example1.c | 123 | ||||
-rw-r--r-- | liboil/Makefile.am | 3 | ||||
-rw-r--r-- | liboil/colorspace/composite.c | 22 | ||||
-rw-r--r-- | liboil/fb/Makefile.am | 27 | ||||
-rw-r--r-- | liboil/fb/fbmmx.c | 2387 | ||||
-rw-r--r-- | liboil/fb/fbmmx.h | 62 | ||||
-rw-r--r-- | liboil/simdpack/scalarmult.c | 27 | ||||
-rwxr-xr-x | testsuite/instruction/check-instructions.pl | 30 |
16 files changed, 2711 insertions, 17 deletions
@@ -1,5 +1,25 @@ 2005-08-01 David Schleef <ds@schleef.org> + * Makefile.am: add 'foreign' to automake flags + * configure.ac: add some dirs + * examples/Makefile.am: same + * examples/memcpy-speed.c: (main): change back to gromit's cpu + * examples/oil-inspect.c: (oil_print_impl): Don't run non-runnable + implementations. + * examples/oil-test.c: (dump_array), (main): minor fixes + * examples/taylor/Makefile.am: new + * examples/taylor/example1.c: new + * liboil/Makefile.am: add dirs + * liboil/colorspace/composite.c: (composite_over_argb_noclamp_2): + alternate clamping version + * liboil/simdpack/scalarmult.c: add unrolled impls + * testsuite/instruction/check-instructions.pl: fixes + * liboil/fb/Makefile.am: new + * liboil/fb/fbmmx.h: new + * liboil/fb/fbmmx.c: new + +2005-08-01 David Schleef <ds@schleef.org> + * liboil/colorspace/composite_i386.c: (composite_over_argb_sse2_2), (composite_over_argb_sse2_3): hacking * liboil/liboilcpu.c: (oil_cpu_i386_getflags_cpuid): Intel's SSE2 diff --git a/Makefile.am b/Makefile.am index f2f4f64..3d35d50 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,4 +1,6 @@ +AUTOMAKE_OPTIONS = foreign + SUBDIRS = liboil testsuite examples doc EXTRA_DIST = COPYING autogen.sh gtk-doc.make diff --git a/configure.ac b/configure.ac index d72e7e3..6a0e5f1 100644 --- a/configure.ac +++ b/configure.ac @@ -205,6 +205,7 @@ liboil/colorspace/Makefile liboil/conv/Makefile liboil/copy/Makefile liboil/dct/Makefile +liboil/fb/Makefile liboil/math/Makefile liboil/md5/Makefile liboil/motovec/Makefile @@ -218,6 +219,7 @@ examples/Makefile examples/huffman/Makefile examples/jpeg/Makefile examples/md5/Makefile +examples/taylor/Makefile examples/uberopt/Makefile examples/work/Makefile liboil.pc diff --git a/examples/Makefile.am b/examples/Makefile.am index f38c12a..baefa28 100644 --- a/examples/Makefile.am +++ b/examples/Makefile.am @@ -1,5 +1,5 @@ -SUBDIRS = jpeg md5 uberopt work huffman +SUBDIRS = jpeg md5 uberopt work huffman taylor noinst_PROGRAMS = example1 oil-inspect oil-test report memcpy-speed diff --git a/examples/memcpy-speed.c b/examples/memcpy-speed.c index 0e9bba2..c74295a 100644 --- a/examples/memcpy-speed.c +++ b/examples/memcpy-speed.c @@ -26,7 +26,7 @@ main(int argc, char *argv[]) oil_init (); - cpufreq = 400e6/16; + cpufreq = 1788e6; s = malloc(1024*1024*64+1024); d = malloc(1024*1024*64+1024); diff --git a/examples/oil-inspect.c b/examples/oil-inspect.c index 95e8e21..6ab2e14 100644 --- a/examples/oil-inspect.c +++ b/examples/oil-inspect.c @@ -105,7 +105,9 @@ oil_print_impl (OilFunctionImpl *impl, OilTest *test, char* prefix) char *c; unsigned int cpu_flags = oil_cpu_get_flags(); - oil_test_check_impl (test, impl); + if (oil_impl_is_runnable (impl)) { + oil_test_check_impl (test, impl); + } printf ("%s%s\n", prefix, impl->name); c = oil_flags_to_string (impl->flags); diff --git a/examples/oil-test.c b/examples/oil-test.c index 9072da4..7fa6192 100644 --- a/examples/oil-test.c +++ b/examples/oil-test.c @@ -75,9 +75,9 @@ dump_array (void *data, void *ref_data, OilType type, int pre_n, int stride, printf(" "); \ for(j=0;j<pre_n;j++){ \ x = fabs(OIL_GET(data, i*stride + j*s2, type) - \ - OIL_GET(data, i*stride + j*s2, type)); \ + OIL_GET(ref_data, i*stride + j*s2, type)); \ if (x >= 1.0) { \ - printf("[" format "] ", OIL_GET(data, i*stride + j*s2, type)); \ + printf("*" format "* ", OIL_GET(data, i*stride + j*s2, type)); \ } else { \ printf(format " ", OIL_GET(data, i*stride + j*s2, type)); \ } \ @@ -104,6 +104,7 @@ dump_array (void *data, void *ref_data, OilType type, int pre_n, int stride, break; case OIL_TYPE_u32p: DUMP(uint32_t, "%u"); + //DUMP(uint32_t, "%08x"); break; case OIL_TYPE_f32p: DUMP(float, "%g"); @@ -173,7 +174,7 @@ int main (int argc, char *argv[]) test = oil_test_new(klass); oil_test_set_iterations(test, 1); - test->n = 100; + test->n = 10; test->m = 10; impl = klass->reference_impl; diff --git a/examples/taylor/Makefile.am b/examples/taylor/Makefile.am new file mode 100644 index 0000000..7cad1e8 --- /dev/null +++ b/examples/taylor/Makefile.am @@ -0,0 +1,8 @@ + +noinst_PROGRAMS = example1 + + +example1_SOURCES = example1.c +example1_CFLAGS = $(LIBOIL_CFLAGS) +example1_LDADD = $(LIBOIL_LIBS) + diff --git a/examples/taylor/example1.c b/examples/taylor/example1.c new file mode 100644 index 0000000..7893b4e --- /dev/null +++ b/examples/taylor/example1.c @@ -0,0 +1,123 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2004 David A. Schleef <ds@schleef.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#include <liboil/liboil.h> +#include <liboil/liboilprofile.h> +#include <stdio.h> +#include <stdlib.h> + +#define N 1000 + +static void taylor4_f32_ref (float *dest, float *src, float *a, int n); +static void taylor4_f32_oil (float *dest, float *src, float *a, int n); + +int main(int argc, char *argv[]) +{ + float *dest; + float *src; + float a[4]; + int i; + OilProfile prof; + double ave, std; + + oil_init(); + + src = malloc(N*sizeof(float)); + dest = malloc(N*sizeof(float)); + + for(i=0;i<N;i++){ + src[i] = i; + } + a[0] = 1; + a[1] = 1; + a[2] = 1; + a[3] = 1; + + oil_profile_init(&prof); + for(i=0;i<10;i++){ + oil_profile_start(&prof); + taylor4_f32_ref (dest, src, a, N); + oil_profile_stop(&prof); + } + oil_profile_get_ave_std (&prof, &ave, &std); + printf("ref: %10.4g %10.4g\n", ave, std); + for(i=0;i<10;i++){ + printf("%g\n", dest[i]); + } + + oil_profile_init(&prof); + for(i=0;i<10;i++){ + oil_profile_start(&prof); + taylor4_f32_oil (dest, src, a, N); + oil_profile_stop(&prof); + } + oil_profile_get_ave_std (&prof, &ave, &std); + printf("oil: %10.4g %10.4g\n", ave, std); + for(i=0;i<10;i++){ + printf("%g\n", dest[i]); + } + + return 0; +} + +static void +taylor4_f32_ref (float *dest, float *src, float *a, int n) +{ + int i; + float x; + for(i=0;i<n;i++){ + x = src[i]; + dest[i] = a[0]; + dest[i] += a[1] * x; + dest[i] += a[2] * x * x; + dest[i] += a[3] * x * x * x; + } + +} + + +static void +taylor4_f32_oil (float *dest, float *src, float *a, int n) +{ + float tmp1[N]; + float tmp2[N]; + float tmp3[N]; + + oil_scalarmultiply_f32_ns (tmp1, src, a+1, n); + + oil_scalaradd_f32_ns (tmp1, tmp1, a, n); + + oil_multiply_f32 (tmp2, src, src, n); + oil_scalarmultiply_f32_ns (tmp3, tmp2, a+2, n); + oil_add_f32 (tmp1, tmp1, tmp3, n); + + oil_multiply_f32 (tmp2, tmp2, src, n); + oil_scalarmultiply_f32_ns (tmp3, tmp2, a+3, n); + oil_add_f32 (dest, tmp1, tmp3, n); +} + diff --git a/liboil/Makefile.am b/liboil/Makefile.am index ebb98c2..17a9de1 100644 --- a/liboil/Makefile.am +++ b/liboil/Makefile.am @@ -1,7 +1,7 @@ pkgincludedir = $(includedir)/liboil-@LIBOIL_MAJORMINOR@/liboil -SUBDIRS = colorspace conv copy dct jpeg math md5 motovec simdpack sse utf8 +SUBDIRS = colorspace conv copy dct fb jpeg math md5 motovec simdpack sse utf8 lib_LTLIBRARIES = liboiltmp1.la liboil-@LIBOIL_MAJORMINOR@.la @@ -25,6 +25,7 @@ liboilfunctions_la_LIBADD = \ colorspace/libcolorspace.la \ copy/libcopy.la \ dct/libdct.la \ + fb/libfb.la \ jpeg/libjpeg.la \ math/libmath.la \ md5/libmd5.la \ diff --git a/liboil/colorspace/composite.c b/liboil/colorspace/composite.c index 36fb354..5c63c25 100644 --- a/liboil/colorspace/composite.c +++ b/liboil/colorspace/composite.c @@ -316,3 +316,25 @@ composite_over_argb_noclamp (uint32_t *dest, uint32_t *src, int n) } OIL_DEFINE_IMPL (composite_over_argb_noclamp, composite_over_argb); +#define oil_divide_255_2(x) ((((x)+128)*257)>>24) +#define oil_muldiv_255_2(a,b) oil_divide_255_2((a)*(b)) +#define COMPOSITE_OVER_2(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m))) + +static void +composite_over_argb_noclamp_2 (uint32_t *dest, uint32_t *src, int n) +{ + int i; + uint8_t a; + + for(i=0;i<n;i++){ + a = oil_argb_A(src[i]); + dest[i] = oil_argb_noclamp( + COMPOSITE_OVER_2(oil_argb_A(dest[i]),oil_argb_A(src[i]),a), + COMPOSITE_OVER_2(oil_argb_R(dest[i]),oil_argb_R(src[i]),a), + COMPOSITE_OVER_2(oil_argb_G(dest[i]),oil_argb_G(src[i]),a), + COMPOSITE_OVER_2(oil_argb_B(dest[i]),oil_argb_B(src[i]),a)); + } + +} +OIL_DEFINE_IMPL (composite_over_argb_noclamp_2, composite_over_argb); + diff --git a/liboil/fb/Makefile.am b/liboil/fb/Makefile.am new file mode 100644 index 0000000..0987982 --- /dev/null +++ b/liboil/fb/Makefile.am @@ -0,0 +1,27 @@ + +CLEANFILES = empty.c + +if USE_ALT_OPT +opt_libs = libfb_opt.la +else +opt_libs = +endif +noinst_LTLIBRARIES = libfb.la $(opt_libs) + +if HAVE_CPU_I386 +c_sources = \ + fbmmx.c +else +c_sources = \ + empty.c +endif + +libfb_la_SOURCES = $(c_sources) +libfb_la_CFLAGS = $(MMX_CFLAGS) $(SSE_CFLAGS) $(LIBOIL_CFLAGS) + +libfb_opt_la_SOURCES = $(c_sources) +libfb_opt_la_CFLAGS = $(LIBOIL_CFLAGS) $(LIBOIL_OPT_CFLAGS) $(MMX_CFLAGS) $(SSE_CFLAGS) + +empty.c: + echo >empty.c + diff --git a/liboil/fb/fbmmx.c b/liboil/fb/fbmmx.c new file mode 100644 index 0000000..90f8748 --- /dev/null +++ b/liboil/fb/fbmmx.c @@ -0,0 +1,2387 @@ +/* + * Copyright © 2004 Red Hat, Inc. + * Copyright © 2004 Nicholas Miell + * Copyright © 2005 Trolltech AS + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Red Hat not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. Red Hat makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author: Søren Sandmann (sandmann@redhat.com) + * Minor Improvements: Nicholas Miell (nmiell@gmail.com) + * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com) + * + * Based on work by Owen Taylor + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboil.h> +#include <liboil/liboilfunction.h> + +#include <mmintrin.h> +#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ + +#include "fbmmx.h" + +#define CHECKPOINT() + +#if 0 +/* --------------- MMX code patch for fbcompose.c --------------------- */ + +static FASTCALL void +mmxCombineMaskU (CARD32 *src, const CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + + const CARD32 *end = mask + width; + while (mask < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + a = MmxAlpha(a); + MmxMul(s, a); + *src = MmxFrom(s); + ++src; + ++mask; + } + _mm_empty(); +} +#endif + +OIL_DECLARE_CLASS(composite_over_argb); + +static void +mmxCombineOverU (uint32_t *dest, const uint32_t *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const uint32_t *end = dest + width; + + while (dest < end) { + __m64 x, y, a; + x = MmxTo(*src); + y = MmxTo(*dest); + a = MmxAlpha(x); + a = MmxNegate(a); + MmxMulAdd(y, a, x); + *dest = MmxFrom(y); + ++dest; + ++src; + } + _mm_empty(); +} +OIL_DEFINE_IMPL_FULL(mmxCombineOverU, composite_over_argb, OIL_IMPL_FLAG_MMX); + +#if 0 +static FASTCALL void +mmxCombineOverReverseU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = dest + width; + + while (dest < end) { + __m64 x, y, a; + x = MmxTo(*dest); + y = MmxTo(*src); + a = MmxAlpha(x); + a = MmxNegate(a); + MmxMulAdd(y, a, x); + *dest = MmxFrom(y); + ++dest; + ++src; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineInU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + + const CARD32 *end = dest + width; + + while (dest < end) { + __m64 x, a; + x = MmxTo(*src); + a = MmxTo(*dest); + a = MmxAlpha(a); + MmxMul(x, a); + *dest = MmxFrom(x); + ++dest; + ++src; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineInReverseU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + + const CARD32 *end = dest + width; + + while (dest < end) { + __m64 x, a; + x = MmxTo(*dest); + a = MmxTo(*src); + a = MmxAlpha(a); + MmxMul(x, a); + *dest = MmxFrom(x); + ++dest; + ++src; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineOutU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = dest + width; + + while (dest < end) { + __m64 x, a; + x = MmxTo(*src); + a = MmxTo(*dest); + a = MmxAlpha(a); + a = MmxNegate(a); + MmxMul(x, a); + *dest = MmxFrom(x); + ++dest; + ++src; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineOutReverseU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = dest + width; + + while (dest < end) { + __m64 x, a; + x = MmxTo(*dest); + a = MmxTo(*src); + a = MmxAlpha(a); + a = MmxNegate(a); + MmxMul(x, a); + *dest = MmxFrom(x); + ++dest; + ++src; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineAtopU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = dest + width; + + while (dest < end) { + __m64 s, da, d, sia; + s = MmxTo(*src); + d = MmxTo(*dest); + sia = MmxAlpha(s); + sia = MmxNegate(sia); + da = MmxAlpha(d); + MmxAddMul(s, da, d, sia); + *dest = MmxFrom(s); + ++dest; + ++src; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineAtopReverseU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end; + + end = dest + width; + + while (dest < end) { + __m64 s, dia, d, sa; + s = MmxTo(*src); + d = MmxTo(*dest); + sa = MmxAlpha(s); + dia = MmxAlpha(d); + dia = MmxNegate(dia); + MmxAddMul(s, dia, d, sa); + *dest = MmxFrom(s); + ++dest; + ++src; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineXorU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = dest + width; + + while (dest < end) { + __m64 s, dia, d, sia; + s = MmxTo(*src); + d = MmxTo(*dest); + sia = MmxAlpha(s); + dia = MmxAlpha(d); + sia = MmxNegate(sia); + dia = MmxNegate(dia); + MmxAddMul(s, dia, d, sia); + *dest = MmxFrom(s); + ++dest; + ++src; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineAddU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + + const CARD32 *end = dest + width; + while (dest < end) { + __m64 s, d; + s = MmxTo(*src); + d = MmxTo(*dest); + s = MmxAdd(s, d); + *dest = MmxFrom(s); + ++dest; + ++src; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineSaturateU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + + const CARD32 *end = dest + width; + while (dest < end) { + CARD32 s = *src; + CARD32 d = *dest; + __m64 ms = MmxTo(s); + __m64 md = MmxTo(d); + CARD32 sa = s >> 24; + CARD32 da = ~d >> 24; + + if (sa > da) { + __m64 msa = MmxTo(FbIntDiv(da, sa)); + msa = MmxAlpha(msa); + MmxMul(ms, msa); + } + MmxAdd(md, ms); + *dest = MmxFrom(md); + ++src; + ++dest; + } + _mm_empty(); +} + + +static FASTCALL void +mmxCombineSrcC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + MmxMul(s, a); + *dest = MmxFrom(s); + ++src; + ++mask; + ++dest; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineOverC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + __m64 d = MmxTo(*dest); + __m64 sa = MmxAlpha(s); + MmxMul(s, a); + MmxMul(a, sa); + a = MmxNegate(a); + MmxMulAdd(d, a, s); + *dest = MmxFrom(d); + ++src; + ++dest; + ++mask; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineOverReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + __m64 d = MmxTo(*dest); + __m64 da = MmxAlpha(d); + da = MmxNegate(da); + MmxMul(s, a); + MmxMulAdd(s, da, d); + *dest = MmxFrom(s); + ++src; + ++dest; + ++mask; + } + _mm_empty(); +} + + +static FASTCALL void +mmxCombineInC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + __m64 d = MmxTo(*dest); + __m64 da = MmxAlpha(d); + MmxMul(s, a); + MmxMul(s, da); + *dest = MmxFrom(s); + ++src; + ++dest; + ++mask; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineInReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + __m64 d = MmxTo(*dest); + __m64 sa = MmxAlpha(s); + MmxMul(a, sa); + MmxMul(d, a); + *dest = MmxFrom(d); + ++src; + ++dest; + ++mask; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineOutC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + __m64 d = MmxTo(*dest); + __m64 da = MmxAlpha(d); + da = MmxNegate(da); + MmxMul(s, a); + MmxMul(s, da); + *dest = MmxFrom(s); + ++src; + ++dest; + ++mask; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineOutReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + __m64 d = MmxTo(*dest); + __m64 sa = MmxAlpha(s); + MmxMul(a, sa); + a = MmxNegate(a); + MmxMul(d, a); + *dest = MmxFrom(d); + ++src; + ++dest; + ++mask; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineAtopC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + __m64 d = MmxTo(*dest); + __m64 da = MmxAlpha(d); + __m64 sa = MmxAlpha(s); + MmxMul(s, a); + MmxMul(a, sa); + a = MmxNegate(a); + MmxAddMul(d, a, s, da); + *dest = MmxFrom(d); + ++src; + ++dest; + ++mask; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineAtopReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + __m64 d = MmxTo(*dest); + __m64 da = MmxAlpha(d); + __m64 sa = MmxAlpha(s) + MmxMul(s, a); + MmxMul(a, sa); + da = MmxNegate(da); + MmxAddMul(d, a, s, da); + *dest = MmxFrom(d); + ++src; + ++dest; + ++mask; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineXorC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + __m64 d = MmxTo(*dest); + __m64 da = MmxAlpha(d); + __m64 sa = MmxAlpha(s); + MmxMul(s, a); + MmxMul(a, sa); + da = MmxNegate(da); + a = MmxNegate(a); + MmxAddMul(d, a, s, da); + *dest = MmxFrom(d); + ++src; + ++dest; + ++mask; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineAddC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + __m64 d = MmxTo(*dest); + MmxMul(s, a); + d = MmxAdd(s, d); + *dest = MmxFrom(d); + ++src; + ++dest; + ++mask; + } + _mm_empty(); +} + +extern FbComposeFunctions composeFunctions; + +void fbComposeSetupMMX(void) +{ + /* check if we have MMX support and initialize accordingly */ + if (fbHaveMMX()) { + composeFunctions.combineU[PictOpOver] = mmxCombineOverU; + composeFunctions.combineU[PictOpOverReverse] = mmxCombineOverReverseU; + composeFunctions.combineU[PictOpIn] = mmxCombineInU; + composeFunctions.combineU[PictOpInReverse] = mmxCombineInReverseU; + composeFunctions.combineU[PictOpOut] = mmxCombineOutU; + composeFunctions.combineU[PictOpOutReverse] = mmxCombineOutReverseU; + composeFunctions.combineU[PictOpAtop] = mmxCombineAtopU; + composeFunctions.combineU[PictOpAtopReverse] = mmxCombineAtopReverseU; + composeFunctions.combineU[PictOpXor] = mmxCombineXorU; + composeFunctions.combineU[PictOpAdd] = mmxCombineAddU; + composeFunctions.combineU[PictOpSaturate] = mmxCombineSaturateU; + + composeFunctions.combineC[PictOpSrc] = mmxCombineSrcC; + composeFunctions.combineC[PictOpOver] = mmxCombineOverC; + composeFunctions.combineC[PictOpOverReverse] = mmxCombineOverReverseC; + composeFunctions.combineC[PictOpIn] = mmxCombineInC; + composeFunctions.combineC[PictOpInReverse] = mmxCombineInReverseC; + composeFunctions.combineC[PictOpOut] = mmxCombineOutC; + composeFunctions.combineC[PictOpOutReverse] = mmxCombineOutReverseC; + composeFunctions.combineC[PictOpAtop] = mmxCombineAtopC; + composeFunctions.combineC[PictOpAtopReverse] = mmxCombineAtopReverseC; + composeFunctions.combineC[PictOpXor] = mmxCombineXorC; + composeFunctions.combineC[PictOpAdd] = mmxCombineAddC; + + composeFunctions.combineMaskU = mmxCombineMaskU; + } +} + + +/* ------------------ MMX code paths called from fbpict.c ----------------------- */ + +typedef struct +{ + ullong mmx_4x00ff; + ullong mmx_4x0080; + ullong mmx_565_rgb; + ullong mmx_565_unpack_multiplier; + ullong mmx_565_r; + ullong mmx_565_g; + ullong mmx_565_b; + ullong mmx_mask_0; + ullong mmx_mask_1; + ullong mmx_mask_2; + ullong mmx_mask_3; + ullong mmx_full_alpha; + ullong mmx_ffff0000ffff0000; + ullong mmx_0000ffff00000000; + ullong mmx_000000000000ffff; +} MMXData; + +static const MMXData c = +{ + .mmx_4x00ff = 0x00ff00ff00ff00ffULL, + .mmx_4x0080 = 0x0080008000800080ULL, + .mmx_565_rgb = 0x000001f0003f001fULL, + .mmx_565_r = 0x000000f800000000ULL, + .mmx_565_g = 0x0000000000fc0000ULL, + .mmx_565_b = 0x00000000000000f8ULL, + .mmx_mask_0 = 0xffffffffffff0000ULL, + .mmx_mask_1 = 0xffffffff0000ffffULL, + .mmx_mask_2 = 0xffff0000ffffffffULL, + .mmx_mask_3 = 0x0000ffffffffffffULL, + .mmx_full_alpha = 0x00ff000000000000ULL, + .mmx_565_unpack_multiplier = 0x0000008404100840ULL, + .mmx_ffff0000ffff0000 = 0xffff0000ffff0000ULL, + .mmx_0000ffff00000000 = 0x0000ffff00000000ULL, + .mmx_000000000000ffff = 0x000000000000ffffULL, +}; + +#define MC(x) ((__m64) c.mmx_##x) + +static __inline__ __m64 +shift (__m64 v, int s) +{ + if (s > 0) + return _mm_slli_si64 (v, s); + else if (s < 0) + return _mm_srli_si64 (v, -s); + else + return v; +} + +static __inline__ __m64 +negate (__m64 mask) +{ + return _mm_xor_si64 (mask, MC(4x00ff)); +} + +static __inline__ __m64 +pix_multiply (__m64 a, __m64 b) +{ + __m64 res; + + res = _mm_mullo_pi16 (a, b); + res = _mm_add_pi16 (res, MC(4x0080)); + res = _mm_srli_pi16 (res, 8); + + return res; +} + +static __inline__ __m64 +expand_alpha (__m64 pixel) +{ + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3)); +} + +static __inline__ __m64 +expand_alpha_rev (__m64 pixel) +{ + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0)); +} + +static __inline__ __m64 +invert_colors (__m64 pixel) +{ + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2)); +} + +/* Notes about writing mmx code + * + * give memory operands as the second operand. If you give it as the + * first, gcc will first load it into a register, then use that + * register + * + * ie. use + * + * _mm_mullo_pi16 (x, mmx_constant); + * + * not + * + * _mm_mullo_pi16 (mmx_constant, x); + * + * Also try to minimize dependencies. i.e. when you need a value, try + * to calculate it from a value that was calculated as early as + * possible. + */ + +static __inline__ __m64 +over (__m64 src, __m64 srca, __m64 dest) +{ + return _mm_adds_pu8 (src, pix_multiply(dest, negate(srca))); +} + +static __inline__ __m64 +over_rev_non_pre (__m64 src, __m64 dest) +{ + __m64 srca = expand_alpha (src); + __m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha)); + + return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest); +} + +static __inline__ __m64 +in (__m64 src, + __m64 mask) +{ + return pix_multiply (src, mask); +} + +static __inline__ __m64 +in_over (__m64 src, + __m64 srca, + __m64 mask, + __m64 dest) +{ + return over(in(src, mask), pix_multiply(srca, mask), dest); +} + +static __inline__ __m64 +load8888 (CARD32 v) +{ + return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64()); +} + +static __inline__ __m64 +pack8888 (__m64 lo, __m64 hi) +{ + __m64 r; + r = _mm_packs_pu16 (lo, hi); + return r; +} + +/* Expand 16 bits positioned at @pos (0-3) of a mmx register into + * + * 00RR00GG00BB + * + * --- Expanding 565 in the low word --- + * + * m = (m << (32 - 3)) | (m << (16 - 5)) | m; + * m = m & (01f0003f001f); + * m = m * (008404100840); + * m = m >> 8; + * + * Note the trick here - the top word is shifted by another nibble to + * avoid it bumping into the middle word + */ +static __inline__ __m64 +expand565 (__m64 pixel, int pos) +{ + __m64 p = pixel; + __m64 t1, t2; + + /* move pixel to low 16 bit and zero the rest */ + p = shift (shift (p, (3 - pos) * 16), -48); + + t1 = shift (p, 36 - 11); + t2 = shift (p, 16 - 5); + + p = _mm_or_si64 (t1, p); + p = _mm_or_si64 (t2, p); + p = _mm_and_si64 (p, MC(565_rgb)); + + pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier)); + return _mm_srli_pi16 (pixel, 8); +} + +static __inline__ __m64 +expand8888 (__m64 in, int pos) +{ + if (pos == 0) + return _mm_unpacklo_pi8 (in, _mm_setzero_si64()); + else + return _mm_unpackhi_pi8 (in, _mm_setzero_si64()); +} + +static __inline__ __m64 +pack565 (__m64 pixel, __m64 target, int pos) +{ + __m64 p = pixel; + __m64 t = target; + __m64 r, g, b; + + r = _mm_and_si64 (p, MC(565_r)); + g = _mm_and_si64 (p, MC(565_g)); + b = _mm_and_si64 (p, MC(565_b)); + + r = shift (r, - (32 - 8) + pos * 16); + g = shift (g, - (16 - 3) + pos * 16); + b = shift (b, - (0 + 3) + pos * 16); + + if (pos == 0) + t = _mm_and_si64 (t, MC(mask_0)); + else if (pos == 1) + t = _mm_and_si64 (t, MC(mask_1)); + else if (pos == 2) + t = _mm_and_si64 (t, MC(mask_2)); + else if (pos == 3) + t = _mm_and_si64 (t, MC(mask_3)); + + p = _mm_or_si64 (r, t); + p = _mm_or_si64 (g, p); + + return _mm_or_si64 (b, p); +} + +void +fbCompositeSolid_nx8888mmx (CARD8 op, + PicturePtr pSrc, + PicturePtr pMask, + PicturePtr pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height) +{ + CARD32 src; + CARD32 *dstLine, *dst; + CARD16 w; + FbStride dstStride; + __m64 vsrc, vsrca; + + CHECKPOINT(); + + fbComposeGetSolid(pSrc, src, pDst->format); + + if (src >> 24 == 0) + return; + + fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); + + vsrc = load8888 (src); + vsrca = expand_alpha (vsrc); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + w = width; + + CHECKPOINT(); + + while (w && (unsigned long)dst & 7) + { + *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), + _mm_setzero_si64()); + + w--; + dst++; + } + + while (w >= 2) + { + __m64 vdest; + __m64 dest0, dest1; + + vdest = *(__m64 *)dst; + + dest0 = over(vsrc, vsrca, expand8888(vdest, 0)); + dest1 = over(vsrc, vsrca, expand8888(vdest, 1)); + + *(__m64 *)dst = pack8888(dest0, dest1); + + dst += 2; + w -= 2; + } + + CHECKPOINT(); + + while (w) + { + *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), _mm_setzero_si64()); + + w--; + dst++; + } + } + + _mm_empty(); +} + +void +fbCompositeSolid_nx0565mmx (CARD8 op, + PicturePtr pSrc, + PicturePtr pMask, + PicturePtr pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height) +{ + CARD32 src; + CARD16 *dstLine, *dst; + CARD16 w; + FbStride dstStride; + __m64 vsrc, vsrca; + + CHECKPOINT(); + + fbComposeGetSolid(pSrc, src, pDst->format); + + if (src >> 24 == 0) + return; + + fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1); + + vsrc = load8888 (src); + vsrca = expand_alpha (vsrc); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + w = width; + + CHECKPOINT(); + + while (w && (unsigned long)dst & 7) + { + ullong d = *dst; + __m64 vdest = expand565 ((__m64)d, 0); + vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0); + *dst = (ullong)vdest; + + w--; + dst++; + } + + while (w >= 4) + { + __m64 vdest; + + vdest = *(__m64 *)dst; + + vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0); + vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1); + vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2); + vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3); + + *(__m64 *)dst = vdest; + + dst += 4; + w -= 4; + } + + CHECKPOINT(); + + while (w) + { + ullong d = *dst; + __m64 vdest = expand565 ((__m64)d, 0); + vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0); + *dst = (ullong)vdest; + + w--; + dst++; + } + } + + _mm_empty(); +} + +void +fbCompositeSolidMask_nx8888x8888Cmmx (CARD8 op, + PicturePtr pSrc, + PicturePtr pMask, + PicturePtr pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height) +{ + CARD32 src, srca; + CARD32 *dstLine; + CARD32 *maskLine; + FbStride dstStride, maskStride; + __m64 vsrc, vsrca; + + CHECKPOINT(); + + fbComposeGetSolid(pSrc, src, pDst->format); + + srca = src >> 24; + if (srca == 0) + return; + + fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); + fbComposeGetStart (pMask, xMask, yMask, CARD32, maskStride, maskLine, 1); + + vsrc = load8888(src); + vsrca = expand_alpha(vsrc); + + while (height--) + { + int twidth = width; + CARD32 *p = (CARD32 *)maskLine; + CARD32 *q = (CARD32 *)dstLine; + + while (twidth && (unsigned long)q & 7) + { + CARD32 m = *(CARD32 *)p; + + if (m) + { + __m64 vdest = load8888(*q); + vdest = in_over(vsrc, vsrca, load8888(m), vdest); + *q = (ullong)pack8888(vdest, _mm_setzero_si64()); + } + + twidth--; + p++; + q++; + } + + while (twidth >= 2) + { + CARD32 m0, m1; + m0 = *p; + m1 = *(p + 1); + + if (m0 | m1) + { + __m64 dest0, dest1; + __m64 vdest = *(__m64 *)q; + + dest0 = in_over(vsrc, vsrca, load8888(m0), + expand8888 (vdest, 0)); + dest1 = in_over(vsrc, vsrca, load8888(m1), + expand8888 (vdest, 1)); + + *(__m64 *)q = pack8888(dest0, dest1); + } + + p += 2; + q += 2; + twidth -= 2; + } + + while (twidth) + { + CARD32 m = *(CARD32 *)p; + + if (m) + { + __m64 vdest = load8888(*q); + vdest = in_over(vsrc, vsrca, load8888(m), vdest); + *q = (ullong)pack8888(vdest, _mm_setzero_si64()); + } + + twidth--; + p++; + q++; + } + + dstLine += dstStride; + maskLine += maskStride; + } + + _mm_empty(); +} + +void +fbCompositeSrc_8888x8x8888mmx (CARD8 op, + PicturePtr pSrc, + PicturePtr pMask, + PicturePtr pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height) +{ + CARD32 *dstLine, *dst; + CARD32 *srcLine, *src; + CARD8 *maskLine; + CARD32 mask; + __m64 vmask; + FbStride dstStride, srcStride, maskStride; + CARD16 w; + __m64 srca; + + CHECKPOINT(); + + fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); + fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); + fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1); + + mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine; + vmask = load8888 (mask); + srca = MC(4x00ff); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + src = srcLine; + srcLine += srcStride; + w = width; + + while (w && (unsigned long)dst & 7) + { + __m64 s = load8888 (*src); + __m64 d = load8888 (*dst); + + *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64()); + + w--; + dst++; + src++; + } + + while (w >= 16) + { + __m64 vd0 = *(__m64 *)(dst + 0); + __m64 vd1 = *(__m64 *)(dst + 2); + __m64 vd2 = *(__m64 *)(dst + 4); + __m64 vd3 = *(__m64 *)(dst + 6); + __m64 vd4 = *(__m64 *)(dst + 8); + __m64 vd5 = *(__m64 *)(dst + 10); + __m64 vd6 = *(__m64 *)(dst + 12); + __m64 vd7 = *(__m64 *)(dst + 14); + + __m64 vs0 = *(__m64 *)(src + 0); + __m64 vs1 = *(__m64 *)(src + 2); + __m64 vs2 = *(__m64 *)(src + 4); + __m64 vs3 = *(__m64 *)(src + 6); + __m64 vs4 = *(__m64 *)(src + 8); + __m64 vs5 = *(__m64 *)(src + 10); + __m64 vs6 = *(__m64 *)(src + 12); + __m64 vs7 = *(__m64 *)(src + 14); + + vd0 = (__m64)pack8888 ( + in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)), + in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1))); + + vd1 = (__m64)pack8888 ( + in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)), + in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1))); + + vd2 = (__m64)pack8888 ( + in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)), + in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1))); + + vd3 = (__m64)pack8888 ( + in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)), + in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1))); + + vd4 = (__m64)pack8888 ( + in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)), + in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1))); + + vd5 = (__m64)pack8888 ( + in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)), + in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1))); + + vd6 = (__m64)pack8888 ( + in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)), + in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1))); + + vd7 = (__m64)pack8888 ( + in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)), + in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1))); + + *(__m64 *)(dst + 0) = vd0; + *(__m64 *)(dst + 2) = vd1; + *(__m64 *)(dst + 4) = vd2; + *(__m64 *)(dst + 6) = vd3; + *(__m64 *)(dst + 8) = vd4; + *(__m64 *)(dst + 10) = vd5; + *(__m64 *)(dst + 12) = vd6; + *(__m64 *)(dst + 14) = vd7; + + w -= 16; + dst += 16; + src += 16; + } + + while (w) + { + __m64 s = load8888 (*src); + __m64 d = load8888 (*dst); + + *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64()); + + w--; + dst++; + src++; + } + } + + _mm_empty(); +} + +void +fbCompositeSrc_8888x8888mmx (CARD8 op, + PicturePtr pSrc, + PicturePtr pMask, + PicturePtr pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height) +{ + CARD32 *dstLine, *dst; + CARD32 *srcLine, *src; + FbStride dstStride, srcStride; + CARD16 w; + __m64 srca; + + CHECKPOINT(); + + fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); + fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); + + srca = MC (4x00ff); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + src = srcLine; + srcLine += srcStride; + w = width; + + while (w && (unsigned long)dst & 7) + { + __m64 s = load8888 (*src); + __m64 d = load8888 (*dst); + + *dst = (ullong)pack8888 (over (s, expand_alpha (s), d), (__m64)_mm_setzero_si64()); + + w--; + dst++; + src++; + } + + while (w >= 2) + { + __m64 vd = *(__m64 *)(dst + 0); + __m64 vs = *(__m64 *)(src + 0); + __m64 vs0 = expand8888 (vs, 0); + __m64 vs1 = expand8888 (vs, 1); + + *(__m64 *)dst = (__m64)pack8888 ( + over (vs0, expand_alpha (vs0), expand8888 (vd, 0)), + over (vs1, expand_alpha (vs1), expand8888 (vd, 1))); + + w -= 2; + dst += 2; + src += 2; + } + + while (w) + { + __m64 s = load8888 (*src); + __m64 d = load8888 (*dst); + + *dst = (ullong)pack8888 (over (s, expand_alpha (s), d), + (__m64)_mm_setzero_si64()); + + w--; + dst++; + src++; + } + } + + _mm_empty(); +} + +void +fbCompositeSolidMask_nx8x8888mmx (CARD8 op, + PicturePtr pSrc, + PicturePtr pMask, + PicturePtr pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height) +{ + CARD32 src, srca; + CARD32 *dstLine, *dst; + CARD8 *maskLine, *mask; + FbStride dstStride, maskStride; + CARD16 w; + __m64 vsrc, vsrca; + ullong srcsrc; + + CHECKPOINT(); + + fbComposeGetSolid(pSrc, src, pDst->format); + + srca = src >> 24; + if (srca == 0) + return; + + srcsrc = (unsigned long long)src << 32 | src; + + fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); + fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1); + + vsrc = load8888 (src); + vsrca = expand_alpha (vsrc); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + mask = maskLine; + maskLine += maskStride; + w = width; + + CHECKPOINT(); + + while (w && (unsigned long)dst & 7) + { + ullong m = *mask; + + if (m) + { + __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst)); + *dst = (ullong)pack8888(vdest, _mm_setzero_si64()); + } + + w--; + mask++; + dst++; + } + + CHECKPOINT(); + + while (w >= 2) + { + ullong m0, m1; + m0 = *mask; + m1 = *(mask + 1); + + if (srca == 0xff && (m0 & m1) == 0xff) + { + *(unsigned long long *)dst = srcsrc; + } + else if (m0 | m1) + { + __m64 vdest; + __m64 dest0, dest1; + + vdest = *(__m64 *)dst; + + dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0)); + dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1)); + + *(__m64 *)dst = pack8888(dest0, dest1); + } + + mask += 2; + dst += 2; + w -= 2; + } + + CHECKPOINT(); + + while (w) + { + ullong m = *mask; + + if (m) + { + __m64 vdest = load8888(*dst); + vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest); + *dst = (ullong)pack8888(vdest, _mm_setzero_si64()); + } + + w--; + mask++; + dst++; + } + } + + _mm_empty(); +} + + +void +fbCompositeSolidMask_nx8x0565mmx (CARD8 op, + PicturePtr pSrc, + PicturePtr pMask, + PicturePtr pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height) +{ + CARD32 src, srca; + CARD16 *dstLine, *dst; + CARD8 *maskLine, *mask; + FbStride dstStride, maskStride; + CARD16 w; + __m64 vsrc, vsrca; + unsigned long long srcsrcsrcsrc, src16; + + CHECKPOINT(); + + fbComposeGetSolid(pSrc, src, pDst->format); + + srca = src >> 24; + if (srca == 0) + return; + + fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1); + fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1); + + vsrc = load8888 (src); + vsrca = expand_alpha (vsrc); + + src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0); + + srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 | + (ullong)src16 << 16 | (ullong)src16; + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + mask = maskLine; + maskLine += maskStride; + w = width; + + CHECKPOINT(); + + while (w && (unsigned long)dst & 7) + { + ullong m = *mask; + + if (m) + { + ullong d = *dst; + __m64 vd = (__m64)d; + __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0)); + *dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0); + } + + w--; + mask++; + dst++; + } + + CHECKPOINT(); + + while (w >= 4) + { + ullong m0, m1, m2, m3; + m0 = *mask; + m1 = *(mask + 1); + m2 = *(mask + 2); + m3 = *(mask + 3); + + if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff) + { + *(unsigned long long *)dst = srcsrcsrcsrc; + } + else if (m0 | m1 | m2 | m3) + { + __m64 vdest; + __m64 vm0, vm1, vm2, vm3; + + vdest = *(__m64 *)dst; + + vm0 = (__m64)m0; + vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0); + vm1 = (__m64)m1; + vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1); + vm2 = (__m64)m2; + vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2); + vm3 = (__m64)m3; + vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3); + + *(__m64 *)dst = vdest; + } + + w -= 4; + mask += 4; + dst += 4; + } + + CHECKPOINT(); + + while (w) + { + ullong m = *mask; + + if (m) + { + ullong d = *dst; + __m64 vd = (__m64)d; + __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0)); + *dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0); + } + + w--; + mask++; + dst++; + } + } + + _mm_empty(); +} + +void +fbCompositeSrc_8888RevNPx0565mmx (CARD8 op, + PicturePtr pSrc, + PicturePtr pMask, + PicturePtr pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height) +{ + CARD16 *dstLine, *dst; + CARD32 *srcLine, *src; + FbStride dstStride, srcStride; + CARD16 w; + + CHECKPOINT(); + + fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1); + fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); + + assert (pSrc->pDrawable == pMask->pDrawable); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + src = srcLine; + srcLine += srcStride; + w = width; + + CHECKPOINT(); + + while (w && (unsigned long)dst & 7) + { + __m64 vsrc = load8888 (*src); + ullong d = *dst; + __m64 vdest = expand565 ((__m64)d, 0); + + vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0); + + *dst = (ullong)vdest; + + w--; + dst++; + src++; + } + + CHECKPOINT(); + + while (w >= 4) + { + CARD32 s0, s1, s2, s3; + unsigned char a0, a1, a2, a3; + + s0 = *src; + s1 = *(src + 1); + s2 = *(src + 2); + s3 = *(src + 3); + + a0 = (s0 >> 24); + a1 = (s1 >> 24); + a2 = (s2 >> 24); + a3 = (s3 >> 24); + + if ((a0 & a1 & a2 & a3) == 0xFF) + { + __m64 vdest; + vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0); + vdest = pack565(invert_colors(load8888(s1)), vdest, 1); + vdest = pack565(invert_colors(load8888(s2)), vdest, 2); + vdest = pack565(invert_colors(load8888(s3)), vdest, 3); + + *(__m64 *)dst = vdest; + } + else if (a0 | a1 | a2 | a3) + { + __m64 vdest = *(__m64 *)dst; + + vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0); + vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1); + vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2); + vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3); + + *(__m64 *)dst = vdest; + } + + w -= 4; + dst += 4; + src += 4; + } + + CHECKPOINT(); + + while (w) + { + __m64 vsrc = load8888 (*src); + ullong d = *dst; + __m64 vdest = expand565 ((__m64)d, 0); + + vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0); + + *dst = (ullong)vdest; + + w--; + dst++; + src++; + } + } + + _mm_empty(); +} + +/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */ + +void +fbCompositeSrc_8888RevNPx8888mmx (CARD8 op, + PicturePtr pSrc, + PicturePtr pMask, + PicturePtr pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height) +{ + CARD32 *dstLine, *dst; + CARD32 *srcLine, *src; + FbStride dstStride, srcStride; + CARD16 w; + + CHECKPOINT(); + + fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); + fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); + + assert (pSrc->pDrawable == pMask->pDrawable); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + src = srcLine; + srcLine += srcStride; + w = width; + + while (w && (unsigned long)dst & 7) + { + __m64 s = load8888 (*src); + __m64 d = load8888 (*dst); + + *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64()); + + w--; + dst++; + src++; + } + + while (w >= 2) + { + ullong s0, s1; + unsigned char a0, a1; + __m64 d0, d1; + + s0 = *src; + s1 = *(src + 1); + + a0 = (s0 >> 24); + a1 = (s1 >> 24); + + if ((a0 & a1) == 0xFF) + { + d0 = invert_colors(load8888(s0)); + d1 = invert_colors(load8888(s1)); + + *(__m64 *)dst = pack8888 (d0, d1); + } + else if (a0 | a1) + { + __m64 vdest = *(__m64 *)dst; + + d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0)); + d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1)); + + *(__m64 *)dst = pack8888 (d0, d1); + } + + w -= 2; + dst += 2; + src += 2; + } + + while (w) + { + __m64 s = load8888 (*src); + __m64 d = load8888 (*dst); + + *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64()); + + w--; + dst++; + src++; + } + } + + _mm_empty(); +} + +void +fbCompositeSolidMask_nx8888x0565Cmmx (CARD8 op, + PicturePtr pSrc, + PicturePtr pMask, + PicturePtr pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height) +{ + CARD32 src, srca; + CARD16 *dstLine; + CARD32 *maskLine; + FbStride dstStride, maskStride; + __m64 vsrc, vsrca; + + CHECKPOINT(); + + fbComposeGetSolid(pSrc, src, pDst->format); + + srca = src >> 24; + if (srca == 0) + return; + + fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1); + fbComposeGetStart (pMask, xMask, yMask, CARD32, maskStride, maskLine, 1); + + vsrc = load8888 (src); + vsrca = expand_alpha (vsrc); + + while (height--) + { + int twidth = width; + CARD32 *p = (CARD32 *)maskLine; + CARD16 *q = (CARD16 *)dstLine; + + while (twidth && ((unsigned long)q & 7)) + { + CARD32 m = *(CARD32 *)p; + + if (m) + { + ullong d = *q; + __m64 vdest = expand565 ((__m64)d, 0); + vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); + *q = (ullong)vdest; + } + + twidth--; + p++; + q++; + } + + while (twidth >= 4) + { + CARD32 m0, m1, m2, m3; + + m0 = *p; + m1 = *(p + 1); + m2 = *(p + 2); + m3 = *(p + 3); + + if ((m0 | m1 | m2 | m3)) + { + __m64 vdest = *(__m64 *)q; + + vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0); + vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1); + vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2); + vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3); + + *(__m64 *)q = vdest; + } + twidth -= 4; + p += 4; + q += 4; + } + + while (twidth) + { + CARD32 m; + + m = *(CARD32 *)p; + if (m) + { + ullong d = *q; + __m64 vdest = expand565((__m64)d, 0); + vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0); + *q = (ullong)vdest; + } + + twidth--; + p++; + q++; + } + + maskLine += maskStride; + dstLine += dstStride; + } + + _mm_empty (); +} + +void +fbCompositeSrcAdd_8000x8000mmx (CARD8 op, + PicturePtr pSrc, + PicturePtr pMask, + PicturePtr pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height) +{ + CARD8 *dstLine, *dst; + CARD8 *srcLine, *src; + FbStride dstStride, srcStride; + CARD16 w; + CARD8 s, d; + CARD16 t; + + CHECKPOINT(); + + fbComposeGetStart (pSrc, xSrc, ySrc, CARD8, srcStride, srcLine, 1); + fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + src = srcLine; + srcLine += srcStride; + w = width; + + while (w && (unsigned long)dst & 7) + { + s = *src; + d = *dst; + t = d + s; + s = t | (0 - (t >> 8)); + *dst = s; + + dst++; + src++; + w--; + } + + while (w >= 8) + { + *(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst); + dst += 8; + src += 8; + w -= 8; + } + + while (w) + { + s = *src; + d = *dst; + t = d + s; + s = t | (0 - (t >> 8)); + *dst = s; + + dst++; + src++; + w--; + } + } + + _mm_empty(); +} + +void +fbCompositeSrcAdd_8888x8888mmx (CARD8 op, + PicturePtr pSrc, + PicturePtr pMask, + PicturePtr pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height) +{ + CARD32 *dstLine, *dst; + CARD32 *srcLine, *src; + FbStride dstStride, srcStride; + CARD16 w; + + CHECKPOINT(); + + fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); + fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + src = srcLine; + srcLine += srcStride; + w = width; + + while (w && (unsigned long)dst & 7) + { + *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src), + _mm_cvtsi32_si64(*dst))); + dst++; + src++; + w--; + } + + while (w >= 2) + { + *(ullong*)dst = (ullong) _mm_adds_pu8(*(__m64*)src, *(__m64*)dst); + dst += 2; + src += 2; + w -= 2; + } + + if (w) + { + *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src), + _mm_cvtsi32_si64(*dst))); + + } + } + + _mm_empty(); +} + +#define GetStart(drw,x,y,type,stride,line,bpp) {\ + FbBits *__bits__; \ + FbStride __stride__; \ + int __xoff__,__yoff__; \ + \ + fbGetDrawable((drw),__bits__,__stride__,bpp,__xoff__,__yoff__); \ + (stride) = __stride__ * sizeof (FbBits) / sizeof (type); \ + (line) = ((type *) __bits__) + (stride) * ((y) - __yoff__) + ((x) - __xoff__); \ +} + +Bool +fbSolidFillmmx (DrawablePtr pDraw, + int x, + int y, + int width, + int height, + FbBits xor) +{ + FbStride stride; + int bpp; + ullong fill; + __m64 vfill; + CARD32 byte_width; + CARD8 *byte_line; + FbBits *bits; + int xoff, yoff; + + CHECKPOINT(); + + fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff); + + if (bpp == 16 && (xor >> 16 != (xor & 0xffff))) + return FALSE; + + if (bpp != 16 && bpp != 32) + return FALSE; + + if (bpp == 16) + { + stride = stride * sizeof (FbBits) / 2; + byte_line = (CARD8 *)(((CARD16 *)bits) + stride * (y - yoff) + (x - xoff)); + byte_width = 2 * width; + stride *= 2; + } + else + { + stride = stride * sizeof (FbBits) / 4; + byte_line = (CARD8 *)(((CARD32 *)bits) + stride * (y - yoff) + (x - xoff)); + byte_width = 4 * width; + stride *= 4; + } + + fill = ((ullong)xor << 32) | xor; + vfill = (__m64)fill; + + while (height--) + { + int w; + CARD8 *d = byte_line; + byte_line += stride; + w = byte_width; + + while (w >= 2 && ((unsigned long)d & 3)) + { + *(CARD16 *)d = xor; + w -= 2; + d += 2; + } + + while (w >= 4 && ((unsigned long)d & 7)) + { + *(CARD32 *)d = xor; + + w -= 4; + d += 4; + } + + while (w >= 64) + { + *(__m64*) (d + 0) = vfill; + *(__m64*) (d + 8) = vfill; + *(__m64*) (d + 16) = vfill; + *(__m64*) (d + 24) = vfill; + *(__m64*) (d + 32) = vfill; + *(__m64*) (d + 40) = vfill; + *(__m64*) (d + 48) = vfill; + *(__m64*) (d + 56) = vfill; + + w -= 64; + d += 64; + } + while (w >= 4) + { + *(CARD32 *)d = xor; + + w -= 4; + d += 4; + } + if (w >= 2) + { + *(CARD16 *)d = xor; + w -= 2; + d += 2; + } + } + + _mm_empty(); + return TRUE; +} + +Bool +fbCopyAreammx (DrawablePtr pSrc, + DrawablePtr pDst, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) +{ + FbBits * src_bits; + FbStride src_stride; + int src_bpp; + int src_xoff; + int src_yoff; + + FbBits * dst_bits; + FbStride dst_stride; + int dst_bpp; + int dst_xoff; + int dst_yoff; + + CARD8 * src_bytes; + CARD8 * dst_bytes; + int byte_width; + + fbGetDrawable(pSrc, src_bits, src_stride, src_bpp, src_xoff, src_yoff); + fbGetDrawable(pDst, dst_bits, dst_stride, dst_bpp, dst_xoff, dst_yoff); + + if (src_bpp != 16 && src_bpp != 32) + return FALSE; + + if (dst_bpp != 16 && dst_bpp != 32) + return FALSE; + + if (src_bpp != dst_bpp) + { + return FALSE; + } + + if (src_bpp == 16) + { + src_stride = src_stride * sizeof (FbBits) / 2; + dst_stride = dst_stride * sizeof (FbBits) / 2; + src_bytes = (CARD8 *)(((CARD16 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff)); + dst_bytes = (CARD8 *)(((CARD16 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff)); + byte_width = 2 * width; + src_stride *= 2; + dst_stride *= 2; + } + else + { + src_stride = src_stride * sizeof (FbBits) / 4; + dst_stride = dst_stride * sizeof (FbBits) / 4; + src_bytes = (CARD8 *)(((CARD32 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff)); + dst_bytes = (CARD8 *)(((CARD32 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff)); + byte_width = 4 * width; + src_stride *= 4; + dst_stride *= 4; + } + + while (height--) + { + int w; + CARD8 *s = src_bytes; + CARD8 *d = dst_bytes; + src_bytes += src_stride; + dst_bytes += dst_stride; + w = byte_width; + + while (w >= 2 && ((unsigned long)d & 3)) + { + *(CARD16 *)d = *(CARD16 *)s; + w -= 2; + s += 2; + d += 2; + } + + while (w >= 4 && ((unsigned long)d & 7)) + { + *(CARD32 *)d = *(CARD32 *)s; + + w -= 4; + s += 4; + d += 4; + } + + while (w >= 64) + { + *(__m64 *)(d + 0) = *(__m64 *)(s + 0); + *(__m64 *)(d + 8) = *(__m64 *)(s + 8); + *(__m64 *)(d + 16) = *(__m64 *)(s + 16); + *(__m64 *)(d + 24) = *(__m64 *)(s + 24); + *(__m64 *)(d + 32) = *(__m64 *)(s + 32); + *(__m64 *)(d + 40) = *(__m64 *)(s + 40); + *(__m64 *)(d + 48) = *(__m64 *)(s + 48); + *(__m64 *)(d + 56) = *(__m64 *)(s + 56); + w -= 64; + s += 64; + d += 64; + } + while (w >= 4) + { + *(CARD32 *)d = *(CARD32 *)s; + + w -= 4; + s += 4; + d += 4; + } + if (w >= 2) + { + *(CARD16 *)d = *(CARD16 *)s; + w -= 2; + s += 2; + d += 2; + } + } + + _mm_empty(); + return TRUE; +} + +void +fbCompositeCopyAreammx (CARD8 op, + PicturePtr pSrc, + PicturePtr pMask, + PicturePtr pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height) +{ + fbCopyAreammx (pSrc->pDrawable, + pDst->pDrawable, + xSrc, ySrc, + xDst, yDst, + width, height); +} + +#if !defined(__amd64__) && !defined(__x86_64__) + +enum CPUFeatures { + NoFeatures = 0, + MMX = 0x1, + MMX_Extensions = 0x2, + SSE = 0x6, + SSE2 = 0x8, + CMOV = 0x10 +}; + +static unsigned int detectCPUFeatures(void) { + unsigned int result; + char vendor[13]; + vendor[0] = 0; + vendor[12] = 0; + /* see p. 118 of amd64 instruction set manual Vol3 */ + __asm__ ("push %%ebx\n" + "pushf\n" + "pop %%eax\n" + "mov %%eax, %%ebx\n" + "xor $0x00200000, %%eax\n" + "push %%eax\n" + "popf\n" + "pushf\n" + "pop %%eax\n" + "mov $0x0, %%edx\n" + "xor %%ebx, %%eax\n" + "jz skip\n" + + "mov $0x00000000, %%eax\n" + "cpuid\n" + "mov %%ebx, %1\n" + "mov %%edx, %2\n" + "mov %%ecx, %3\n" + "mov $0x00000001, %%eax\n" + "cpuid\n" + "skip:\n" + "pop %%ebx\n" + "mov %%edx, %0\n" + : "=r" (result), + "=m" (vendor[0]), + "=m" (vendor[4]), + "=m" (vendor[8]) + : + : "%eax", "%ecx", "%edx" + ); + + unsigned int features = 0; + if (result) { + /* result now contains the standard feature bits */ + if (result & (1 << 15)) + features |= CMOV; + if (result & (1 << 23)) + features |= MMX; + if (result & (1 << 25)) + features |= SSE; + if (result & (1 << 26)) + features |= SSE2; + if ((result & MMX) && !(result & SSE) && (strcmp(vendor, "AuthenticAMD") == 0)) { + /* check for AMD MMX extensions */ + + unsigned int result; + __asm__("push %%ebx\n" + "mov $0x80000000, %%eax\n" + "cpuid\n" + "xor %%edx, %%edx\n" + "cmp $0x1, %%eax\n" + "jge skip2\n" + "mov $0x80000001, %%eax\n" + "cpuid\n" + "skip2:\n" + "mov %%edx, %0\n" + "pop %%ebx\n" + : "=r" (result) + : + : "%eax", "%ecx", "%edx" + ); + if (result & (1<<22)) + features |= MMX_Extensions; + } + } + return features; +} + +Bool +fbHaveMMX (void) +{ + static Bool initialized = FALSE; + static Bool mmx_present; + + if (!initialized) + { + unsigned int features = detectCPUFeatures(); + mmx_present = (features & (MMX|MMX_Extensions)) == (MMX|MMX_Extensions); + initialized = TRUE; + } + + return mmx_present; +} +#endif /* __amd64__ */ + + +#endif diff --git a/liboil/fb/fbmmx.h b/liboil/fb/fbmmx.h new file mode 100644 index 0000000..ffad351 --- /dev/null +++ b/liboil/fb/fbmmx.h @@ -0,0 +1,62 @@ +/* + * Copyright © 2004 Red Hat, Inc. + * Copyright © 2005 Trolltech AS + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Red Hat not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. Red Hat makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author: Søren Sandmann (sandmann@redhat.com) + * Lars Knoll (lars@trolltech.com) + * + * Based on work by Owen Taylor + */ + + +#define MmxNegate(x) _mm_xor_si64((x), mmx_4x00ff) +#define MmxAlpha(x) _mm_shuffle_pi16 ((x), _MM_SHUFFLE(3, 3, 3, 3)); +#define MmxTo(x) _mm_unpacklo_pi8 (_mm_cvtsi32_si64((x)), mmx_0) +#define MmxFrom(x) (uint32_t)_mm_cvtsi64_si32(_mm_packs_pu16((x), mmx_0)) +#define MmxAdd(x, y) _mm_adds_pu16 ((x), (y)) + +#define MmxMulAdd(x, a, y) do { \ + x = _mm_mullo_pi16 (x, a); \ + x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8)); \ + x = _mm_adds_pu16 (x, mmx_4x0080); \ + x = _mm_srli_pi16 (x, 8); \ + x = _mm_adds_pu16(x, y); \ + } while (0) + +#define MmxMul(x, a) do { \ + x = _mm_mullo_pi16 (x, a); \ + x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8)); \ + x = _mm_adds_pu16 (x, mmx_4x0080); \ + x = _mm_srli_pi16 (x, 8); \ + } while (0) + +#define MmxAddMul(x, a, y, b) do { \ + x = _mm_mullo_pi16 (x, a); \ + y = _mm_mullo_pi16 (y, b); \ + x = _mm_srli_pi16(x, 1); \ + y = _mm_srli_pi16(y, 1); \ + x = _mm_adds_pu16 (x, y); \ + x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8)); \ + x = _mm_adds_pu16 (x, mmx_4x0080); \ + x = _mm_srli_pi16 (x, 7); \ + } while (0) + diff --git a/liboil/simdpack/scalarmult.c b/liboil/simdpack/scalarmult.c index f4d8bb3..446f5c1 100644 --- a/liboil/simdpack/scalarmult.c +++ b/liboil/simdpack/scalarmult.c @@ -176,3 +176,30 @@ SCALARMULT_DEFINE_UNROLL4 (f64); + + + + +#define SCALARMULT_DEFINE_X(type) \ +static void scalarmult_ ## type ## _x( \ + type_ ## type *dest, int dstr, \ + type_ ## type *src, int sstr, \ + type_ ## type *val, int n) \ +{ \ + int i; \ + for(i=0;i<n;i+=2){ \ + dest[i] = src[i] * *val; \ + dest[i+1] = src[i+1] * *val; \ + } \ +} \ +OIL_DEFINE_IMPL (scalarmult_ ## type ## _x, scalarmult_ ## type); + + +SCALARMULT_DEFINE_X (s8); +SCALARMULT_DEFINE_X (u8); +SCALARMULT_DEFINE_X (s16); +SCALARMULT_DEFINE_X (u16); +SCALARMULT_DEFINE_X (s32); +SCALARMULT_DEFINE_X (u32); +SCALARMULT_DEFINE_X (f32); +SCALARMULT_DEFINE_X (f64); diff --git a/testsuite/instruction/check-instructions.pl b/testsuite/instruction/check-instructions.pl index b478f81..b8058e1 100755 --- a/testsuite/instruction/check-instructions.pl +++ b/testsuite/instruction/check-instructions.pl @@ -200,7 +200,7 @@ sub check "paddsw", "paddusb", "paddusw", - "paddw", + #"paddw", #"pand", #"pandn", "pcmpeqb", @@ -217,9 +217,9 @@ sub check "psllw", "psrad", "psraw", - "psrld", - "psrlq", - "psrlw", + #"psrld", + #"psrlq", + #"psrlw", "psubb", "psubd", "psubsb", @@ -227,12 +227,12 @@ sub check "psubusb", "psubusw", "psubw", - "punpckhbw", - "punpckhdq", - "punpckhwd", - "punpcklbw", - "punpckldq", - "punpcklwd", + #"punpckhbw", + #"punpckhdq", + #"punpckhwd", + #"punpcklbw", + #"punpckldq", + #"punpcklwd", #"pxor", ); @@ -351,12 +351,22 @@ sub check "packssdw", "packsswb", "packuswb", + "paddw", "pand", "pandn", "pmaddwd", "pmulhw", "pmullw", "por", + "psrld", + "psrlq", + "psrlw", + "punpckhbw", + "punpckhdq", + "punpckhwd", + "punpcklbw", + "punpckldq", + "punpcklwd", "pxor", ); |