summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Jennings <mej@kainx.org>2005-06-14 19:39:01 +0000
committerMichael Jennings <mej@kainx.org>2005-06-14 19:39:01 +0000
commit80e9c020d442e2c4fef233c89734580e6bc02ac5 (patch)
tree1f5d9921a8488eee7e7f60d081d07f46013a4671
parent4cd902d7a367f7dacd9de5004a8f7e95863c9b16 (diff)
downloadeterm-80e9c020d442e2c4fef233c89734580e6bc02ac5.tar.gz
Tue Jun 14 15:36:09 2005 Michael Jennings (mej)
Added SSE2 support patch thanks to Tres Melton <tres@mindspring.com> and John Ellson <ellson@research.att.com>. ---------------------------------------------------------------------- SVN revision: 15322
-rw-r--r--ChangeLog5
-rw-r--r--configure.in40
-rw-r--r--src/Makefile.am12
-rw-r--r--src/pixmap.c30
-rw-r--r--src/sse2_cmod.c566
5 files changed, 637 insertions, 16 deletions
diff --git a/ChangeLog b/ChangeLog
index 75440f1..4423b44 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -5398,3 +5398,8 @@ Mon Jun 13 19:28:19 2005 Michael Jennings (mej)
Cleanups and optimizations from Tres Melton <tres@mindspring.com>.
----------------------------------------------------------------------
+Tue Jun 14 15:36:09 2005 Michael Jennings (mej)
+
+Added SSE2 support patch thanks to Tres Melton <tres@mindspring.com>
+and John Ellson <ellson@research.att.com>.
+----------------------------------------------------------------------
diff --git a/configure.in b/configure.in
index 47e5f02..2fd217d 100644
--- a/configure.in
+++ b/configure.in
@@ -220,7 +220,13 @@ AC_CHECK_FUNCS(atexit _exit unsetenv setutent \
seteuid memmove putenv strsep setresuid setresgid \
memmem usleep snprintf strcasestr strcasechr \
strcasepbrk strrev nl_langinfo)
+
+# NOTE: The following line is NOT NOT NOT NOT NOT a typo!
+# If you are having problems with it, libast.m4 is not installed
+# or aclocal couldn't find it. Hence the problem is on YOUR end.
dps_snprintf_oflow()
+
+dnl# Check for math lib.
AC_CHECK_LIB(m, pow)
dnl# Portability checks for various functions
@@ -500,9 +506,11 @@ AC_ARG_ENABLE(trans,
AC_DEFINE(PIXMAP_OFFSET, , [Define for pseudo-transparency support.])
])
+dnl#
+dnl# MMX support
+dnl#
AC_MSG_CHECKING(for MMX support)
HAVE_MMX=""
-HAVE_MMX_64=""
AC_ARG_ENABLE(mmx, [ --enable-mmx enable MMX assembly routines], [
test "x$enableval" = "xyes" && HAVE_MMX="yes"
], [
@@ -510,25 +518,39 @@ AC_ARG_ENABLE(mmx, [ --enable-mmx enable MMX assembly routines], [
i*86)
grep mmx /proc/cpuinfo >/dev/null 2>&1 && HAVE_MMX="yes"
;;
- x86_64)
- grep mmx /proc/cpuinfo >/dev/null 2>&1 && HAVE_MMX_64="yes"
- ;;
esac
])
if test "x$HAVE_MMX" = "xyes"; then
AC_MSG_RESULT([yes (32-bit)])
AC_DEFINE(HAVE_MMX, , [Define for 32-bit MMX support.])
-elif test "x$HAVE_MMX_64" = "xyes"; then
- dnl# AC_MSG_RESULT([yes (64-bit)])
- dnl# AC_DEFINE(HAVE_MMX_64, , [Define for 64-bit MMX support.])
- AC_MSG_RESULT([no (64-bit MMX not yet supported)])
else
AC_MSG_RESULT([no (no MMX detected)])
fi
-dnl# AM_CONDITIONAL(HAVE_MMX, test "x$HAVE_MMX" = "xyes" -o "x$HAVE_MMX_64" = "xyes")
AM_CONDITIONAL(HAVE_MMX, test "x$HAVE_MMX" = "xyes")
dnl#
+dnl# SSE2 support
+dnl#
+AC_MSG_CHECKING(for SSE2 support)
+HAVE_SSE2=""
+AC_ARG_ENABLE(sse2, [ --enable-sse2 enable SSE2 assembly routines], [
+ test "x$enableval" = "xyes" && HAVE_SSE2="yes"
+ ], [
+ case $host_cpu in
+ x86_64)
+ grep sse2 /proc/cpuinfo >/dev/null 2>&1 && HAVE_SSE2="yes"
+ ;;
+ esac
+ ])
+if test "x$HAVE_SSE2" = "xyes"; then
+ AC_MSG_RESULT([yes])
+ AC_DEFINE(HAVE_SSE2, , [Define for 64-bit SSE2 support.])
+else
+ AC_MSG_RESULT([no (no SSE2 detected)])
+fi
+AM_CONDITIONAL(HAVE_SSE2, test "x$HAVE_SSE2" = "xyes")
+
+dnl#
dnl# LibAST
dnl#
LIBAST_MIN=5
diff --git a/src/Makefile.am b/src/Makefile.am
index 84353df..d05650d 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -6,6 +6,9 @@ bin_PROGRAMS = Eterm
MMX_SRCS = mmx_cmod.S
MMX_OBJS = mmx_cmod.lo
+SSE2_SRCS = sse2_cmod.c
+SSE2_OBJS = sse2_cmod.lo
+
libEterm_la_SOURCES = actions.c actions.h buttons.c buttons.h command.c \
command.h draw.c draw.h e.c e.h eterm_debug.h eterm_utmp.h \
events.c events.h feature.h font.c font.h grkelot.c \
@@ -16,22 +19,27 @@ libEterm_la_SOURCES = actions.c actions.h buttons.c buttons.h command.c
timer.c timer.h utmp.c windows.c windows.h defaultfont.c \
defaultfont.h libscream.c scream.h screamcfg.h
-EXTRA_libEterm_la_SOURCES = $(MMX_SRCS)
+EXTRA_libEterm_la_SOURCES = $(MMX_SRCS) $(SSE2_SRCS)
libEterm_la_LDFLAGS = -release $(VERSION)
+if HAVE_SSE2
+libEterm_la_DEPENDENCIES = feature.h $(SSE2_OBJS)
+libEterm_la_LIBADD = $(SSE2_OBJS)
+else
if HAVE_MMX
libEterm_la_DEPENDENCIES = feature.h $(MMX_OBJS)
libEterm_la_LIBADD = $(MMX_OBJS)
else
libEterm_la_DEPENDENCIES = feature.h
endif
+endif
Eterm_SOURCES = main.c
Eterm_DEPENDENCIES = libEterm.la
Eterm_LDFLAGS = -rpath $(libdir):$(pkglibdir)
Eterm_LDADD = libEterm.la
-EXTRA_DIST = mmx_cmod.S
+EXTRA_DIST = mmx_cmod.S sse2_cmod.c
install-exec-hook:
$(mkinstalldirs) $(DESTDIR)$(pkgdatadir)
diff --git a/src/pixmap.c b/src/pixmap.c
index f6305e7..724e33b 100644
--- a/src/pixmap.c
+++ b/src/pixmap.c
@@ -60,11 +60,16 @@ static const char cvs_ident[] = "$Id$";
/* Optimized check for rm, gm, and bm all < 256 */
#define COLORMODS_HAVE_SATURATION(rm, gm, bm) ((rm|gm|bm) >> 8)
-/* Assembler routines */
+/* Assembler routines for 32 bit cpu with mmx */
extern void shade_ximage_15_mmx(void *data, int bpl, int w, int h, int rm, int gm, int bm);
extern void shade_ximage_16_mmx(void *data, int bpl, int w, int h, int rm, int gm, int bm);
extern void shade_ximage_32_mmx(void *data, int bpl, int w, int h, int rm, int gm, int bm);
+/* Assembler routines for 64 bit cpu with sse2 */
+extern void shade_ximage_15_sse2(void *data, int bpl, int w, int h, int rm, int gm, int bm);
+extern void shade_ximage_16_sse2(void *data, int bpl, int w, int h, int rm, int gm, int bm);
+extern void shade_ximage_32_sse2(void *data, int bpl, int w, int h, int rm, int gm, int bm);
+
#ifdef PIXMAP_SUPPORT
static Imlib_Border bord_none = { 0, 0, 0, 0 };
#endif
@@ -1551,7 +1556,7 @@ need_colormod(register imlib_t *iml)
/* New optimized routines for tinting XImages written by Willem Monsuwe <willem@stack.nl> */
-#ifndef HAVE_MMX
+#if !defined HAVE_MMX && !defined HAVE_SSE2
/* RGB 15 */
static void
shade_ximage_15(void *data, int bpl, int w, int h, int rm, int gm, int bm)
@@ -1743,7 +1748,13 @@ void
colormod_trans(Pixmap p, imlib_t *iml, GC gc, unsigned short w, unsigned short h)
{
+#ifdef HAVE_SSE2
+ XImage * __attribute__ ((aligned (16))) ximg;
+#elif defined HAVE_MMX
+ XImage * __attribute__ ((aligned (8))) ximg;
+#else
XImage *ximg;
+#endif
register unsigned long i;
#if 0
@@ -1848,7 +1859,10 @@ colormod_trans(Pixmap p, imlib_t *iml, GC gc, unsigned short w, unsigned short h
/* Determine bitshift and bitmask values */
switch (real_depth) {
case 15:
-#ifdef HAVE_MMX
+#ifdef HAVE_SSE2
+ D_PIXMAP(("Using SSE2 - 15 bit\n"));
+ shade_ximage_15_sse2(ximg->data, ximg->bytes_per_line, w, h, rm, gm, bm);
+#elif defined HAVE_MMX
D_PIXMAP(("Using MMX - 15 bit\n"));
shade_ximage_15_mmx(ximg->data, ximg->bytes_per_line, w, h, rm, gm, bm);
#else
@@ -1857,7 +1871,10 @@ colormod_trans(Pixmap p, imlib_t *iml, GC gc, unsigned short w, unsigned short h
#endif
break;
case 16:
-#ifdef HAVE_MMX
+#ifdef HAVE_SSE2
+ D_PIXMAP(("Using SSE2 - 16 bit\n"));
+ shade_ximage_16_sse2(ximg->data, ximg->bytes_per_line, w, h, rm, gm, bm);
+#elif defined HAVE_MMX
D_PIXMAP(("Using MMX - 16 bit\n"));
shade_ximage_16_mmx(ximg->data, ximg->bytes_per_line, w, h, rm, gm, bm);
#else
@@ -1872,7 +1889,10 @@ colormod_trans(Pixmap p, imlib_t *iml, GC gc, unsigned short w, unsigned short h
}
/* drop */
case 32:
-#ifdef HAVE_MMX
+#ifdef HAVE_SSE2
+ D_PIXMAP(("Using SSE2 - 32 bit\n"));
+ shade_ximage_32_sse2(ximg->data, ximg->bytes_per_line, w, h, rm, gm, bm);
+#elif defined HAVE_MMX
D_PIXMAP(("Using MMX - 32 bit\n"));
shade_ximage_32_mmx(ximg->data, ximg->bytes_per_line, w, h, rm, gm, bm);
#else
diff --git a/src/sse2_cmod.c b/src/sse2_cmod.c
new file mode 100644
index 0000000..612f9eb
--- /dev/null
+++ b/src/sse2_cmod.c
@@ -0,0 +1,566 @@
+/* File: sse2_cmod.c
+ * Written and Copyright (C) 2005 by Tres Melton
+ *
+ * Permission is hereby granted to Michael Jennings to license this code as
+ * he sees fit. I'd prefer the GPL but he will choose the BSD. The debate
+ * is moot as this is to become a part of the Eterm project, for which he is
+ * the primary author. For users of this code I ask that any modifications
+ * be released back into the community but with Michael Jennings chooses the
+ * BSD license then that request has no backing in law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ *
+ * Much inspiration was drawn from the original x86 MMX port written by
+ * Willem Monsuwe <willem@stack.nl> in pure x86/MMX Assembly. The MMX
+ * instructions are taken almost verbatim but the memory and parameter
+ * accessing had to be completely reworked for the x86_64 ABI and to
+ * ensure they worked with various gcc options. Further the code was
+ * extended to take advantage of the 128 bit xmm registers in SSE2.
+ *
+ * Manuals used in this port:
+ * The Gnu Assembler
+ * http://www.gnu.org/software/binutils/manual/gas-2.9.1/html_mono/as.html
+ * AMD64 Architecture Programmer's Manual Volume 1: Application Programming
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24592.pdf
+ * AMD64 Architecture Programmer's Manual Volume 2: System Programming
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24593.pdf
+ * AMD64 Architecture Programmer's Manual Volume 3: General-Purpose and System Instructions
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24594.pdf
+ * AMD64 Architecture Programmer's Manual Volume 4: 128-Bit Media Instructions
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26568.pdf
+ * AMD64 Architecture Programmer's Manual Volume 5: 64-Bit Media and x87 Floating-Point Instructions
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26569.pdf
+ * AMD64 Application Binary Interface (v. 0.95)
+ * http://www.x86-64.org/documentation/documentation/abi-0.95.pdf
+ *
+ * The 32 bit color modification algorithm is simple but so optimized
+ * (even the C version) that it is almost unreadable.
+ * Therefore the pseudo code is:
+ *
+ * for each color of each pixel
+ * new_color = color * modifier
+ * if ( new_color > max_color_value )
+ * new_color = max_color_value
+ * end if
+ * end for
+ *
+ * The AMD64 ABI is at version 0.95 and might change in the future. Further it has changed a
+ * number of times in the past (although mostly in 2002-2003) as evidenced by the mailing
+ * list on http://www.x86-64.org. The GCC, Glibc, and Linux kernel have changed as well
+ * during this time to keep up. The standard C definition states that function parameters
+ * are to be passed on the stack but that can be very inefficient compared to passing them
+ * in registers so gcc tries to use registers. This is very different than on the register
+ * starved i386 architecture (AMD64 adds 8 general purpose registers: %r8-%r15, and
+ * MMX/SSE2/x87 adds 16 64/128/80bit registers: %xmm0-%xmm15). The x86_64 version of GCC uses
+ * registers as efficiently as possible and as a result exactly which registers are used
+ * for which parameters has evolved. Since all of these tools change simultaneously inline
+ * assembly code in C functions is the only way to ensure that this code will continue to
+ * function through a (however unlikely) change. If pure assembly were to be used as the
+ * original MMX author, Willem Monsuwe, did and the ABI changed then this code would cease
+ * to function properly. After examination of Willem's code I'm wondering if he
+ * wrote it for GNU/Linux originally. The ENTER and LEAVE macros put all of the
+ * parameters on the stack so that they can be accessed by references to the Base_Pointer
+ * the way that ANSI C is defined. If he originally wrote this for GNU/Linux then he most
+ * likely would have just used the registers instead of unwinding those optimizations
+ * manually by pushing them to the stack. And if he explicitly wanted to use the stack
+ * then there are parameters to gcc that would have performed those operations for him:
+ * -mregparm/-mmemparm. Other gcc options that can tweak with the stack and the number
+ * of registers available for function parameters are: -fcall-used/-fcall-saved,
+ * -fcaller-saves, -fstack-protector, -fPIC/-fpic, -mno-push-args, etc.. It might be
+ * advisable to check for these switches when using the original MMX code and emit a
+ * warning if any are enabled. I know that the PIC option trashes the BX register and
+ * that both Willem and I use that register. In other words, if you do manage to get it
+ * to compile & run w/ -fpic it WILL break. On the plus side, you can keep the pieces! :-)
+ * On the other hand, I could be wrong about everything.
+ *
+ * In Conclusion:
+ * Using C functions and inline assembly code should alleviate all of the concerns as the
+ * C compiler will ensure that the parameters get to the function in a guaranteed manner
+ * and the inline assembly explicitly loads them into the desired registers for the assembly
+ * code. This might seem like a lot of overhead but great care has been taken to adhere to
+ * the x86_64 ABI so that gcc/gas/ld will not perform any unneeded operations even when no
+ * optimizations have been enabled (-O[123]).
+ */
+
+#include "config.h"
+
+#ifdef HAVE_SSE2
+
+void shade_ximage_15_sse2( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
+{ /* Tints 16-bit pixels (blue in bits 0-4, green in bits 5-9, red from bit 10 up) in place: each channel becomes (channel * modifier) >> 8, for h lines of w pixels spaced bpl bytes apart. */
+ __asm__ __volatile__ ( /* 8 pixels per iteration in xmm registers, leftovers one at a time; labels 1-4 are the plain path, 5-9 the saturating path */
+ ".align 16 \n\t" /* SIMD instructions should be aligned on 16 byte (128 bit) boundaries for performance reasons.*/
+ "leaq -14(%%rsi, %%rbx, 2), %%rsi\n\t" /* Point rsi at data + ( width * 2 bytes/pixel ) - 14, so that index rcx = 7 - width addresses the first pixel of a line */
+ "negq %%rbx \n\t" /* Negate the width so that we can increment the counter (also sets the zero flag) */
+ "jz 10f \n\t" /* Jump to the end if the width is zero */
+ "movd %[red_mod], %%xmm5 \n\t" /* Load the color modifiers into xmm registers */
+ "movd %[green_mod], %%xmm6 \n\t" /* " " */
+ "movd %[blue_mod], %%xmm7 \n\t" /* " " */
+ "punpcklwd %%xmm5, %%xmm5 \n\t" /* Unpack and Interleave low words. From A64_128bit_Media_Programming (p. 380) */
+ "punpcklwd %%xmm6, %%xmm6 \n\t" /* Duplicate the bottom 16 bits into the next 16 bits (both operands are the same) */
+ "punpcklwd %%xmm7, %%xmm7 \n\t"
+ "punpckldq %%xmm5, %%xmm5 \n\t" /* Unpack and Interleave low double words. From A64_128bit_Media_Programming (p. 376) */
+ "punpckldq %%xmm6, %%xmm6 \n\t" /* Duplicate the bottom 32 bits into the next 32 bits (both operands are the same) */
+ "punpckldq %%xmm7, %%xmm7 \n\t"
+ "punpcklqdq %%xmm5, %%xmm5 \n\t" /* Unpack and Interleave low quad words. From A64_128bit_Media_Programming (p. 378) */
+ "punpcklqdq %%xmm6, %%xmm6 \n\t" /* Duplicate the bottom 64 bits into the next 64 bits (each modifier now fills all 8 words) */
+ "punpcklqdq %%xmm7, %%xmm7 \n\t"
+ "or %[red_mod], %[green_mod] \n\t" /* This and the next two instructions OR the three modifiers together and shift right by 8: */
+ "or %[blue_mod], %[green_mod] \n\t" /* any modifier >= 256 has the 9th, or a higher, bit set, */
+ "sar $8, %[green_mod] \n\t" /* so something is left over only if a modifier is >= 256 (green_mod is already copied to xmm6). */
+ "movq %%rax, %[blue_mod] \n\t" /* Use the register named blue_mod to now store bytes_per_line. */
+ "xor %[red_mod], %[red_mod] \n\t" /* zero red so we don't have to load an immediate value for the following compare. */
+ "cmp %[red_mod], %[green_mod] \n\t" /* Compare the left over bits to zero */
+ "jg 5f \n\t" /* If one of the colors might need saturating then jump to the secondary set of loops. */
+ "1: \n\t" /* Start of the outer loop (lines). */
+ "movq %%rbx, %%rcx \n\t" /* Move the negated width into the count register */
+ "addq $7, %%rcx \n\t" /* rcx = 7 - width: the index of the first pixel relative to rsi */
+ "jns 3f \n\t" /* Non-negative index means fewer than 8 pixels: use the one-at-a-time loop */
+ "2: \n\t" /* Start of the inner loop (pixels 8 at a time --> 8 * 16 = 128bits/xmm register ) */
+ "movdqu (%%rsi, %%rcx, 2), %%xmm0\n\t" /* Load 8 pixels of 16 bits each (5 bits each for blue and green, red from bit 10 up) */
+ "movdqu %%xmm0, %%xmm1 \n\t" /* Create a copy of the pixels for the green color */
+ "movdqu %%xmm0, %%xmm2 \n\t" /* Create a copy of the pixels for the blue color */
+ "psrlw $5, %%xmm1 \n\t" /* Packed Shift Right Logical Words */
+ /* From A64_128bit_Media_Programming (p. 347) */
+ /* Shifts the blue off of the green color */
+ "psrlw $10, %%xmm0 \n\t" /* Shifts the blue & green off of the red color */
+ "psllw $11, %%xmm2 \n\t" /* Packed Shift Left Logical Words */
+ /* From A64_128bit_Media_Programming (p. 330) */
+ /* Shifts the red & green off of the blue color */
+ "psllw $11, %%xmm1 \n\t" /* Shifts the red off of the green color */
+ "psllw $8, %%xmm0 \n\t" /* Shifts the red color into position (value << 8) */
+ "psrlw $3, %%xmm1 \n\t" /* Shifts the green color into position (value << 8) */
+ "psrlw $3, %%xmm2 \n\t" /* Shifts the blue color into position (value << 8) */
+ "pmulhw %%xmm5, %%xmm0 \n\t" /* color *= modifier; keeping the high word gives ( color * modifier ) >> 8 */
+ "pmulhw %%xmm6, %%xmm1 \n\t"
+ "pmulhw %%xmm7, %%xmm2 \n\t"
+ "psllw $10, %%xmm0 \n\t" /* Shift red back into its original position */
+ "psllw $5, %%xmm1 \n\t" /* Shift green back into its original position */
+ "por %%xmm2, %%xmm0 \n\t" /* Mesh the colors back together */
+ "por %%xmm1, %%xmm0 \n\t"
+ "movdqu %%xmm0, (%%rsi, %%rcx, 2)\n\t" /* Place the shaded 8 pixels back into the image map */
+ "addq $8, %%rcx \n\t" /* Advance the index by the 8 pixels just processed */
+ "js 2b \n\t" /* Index still negative: at least 8 more pixels remain on this line */
+ "jmp 4f \n\t" /* Otherwise finish any leftover pixels singly */
+ "3: \n\t" /* Deal with pixels one at a time here. */
+ "movw (%%rsi, %%rcx, 2), %%ax \n\t" /* Load a single pixel (rax is free: bytes_per_line now lives in blue_mod) */
+ "movd %%eax, %%xmm0 \n\t" /* Only the low word of xmm0 matters from here on */
+ "movq %%xmm0, %%xmm1 \n\t" /* Same channel split, multiply and merge as the 8 pixel loop above */
+ "movq %%xmm0, %%xmm2 \n\t"
+ "psrlw $5, %%xmm1 \n\t"
+ "psrlw $10, %%xmm0 \n\t"
+ "psllw $11, %%xmm2 \n\t"
+ "psllw $11, %%xmm1 \n\t"
+ "psllw $8, %%xmm0 \n\t"
+ "psrlw $3, %%xmm1 \n\t"
+ "psrlw $3, %%xmm2 \n\t"
+ "pmulhw %%xmm5, %%xmm0 \n\t"
+ "pmulhw %%xmm6, %%xmm1 \n\t"
+ "pmulhw %%xmm7, %%xmm2 \n\t"
+ "psllw $10, %%xmm0 \n\t"
+ "psllw $5, %%xmm1 \n\t"
+ "por %%xmm2, %%xmm0 \n\t"
+ "por %%xmm1, %%xmm0 \n\t"
+ "movd %%xmm0, %%eax \n\t"
+ "movw %%ax, (%%rsi, %%rcx, 2) \n\t" /* Store the shaded pixel back */
+ "incq %%rcx \n\t"
+ "4: \n\t" /* Leftover test: index 6 addresses the last pixel of the line */
+ "cmpq $6, %%rcx \n\t"
+ "jng 3b \n\t" /* Keep going while the index is <= 6 */
+ "addq %[blue_mod], %%rsi \n\t" /* Blue_mod is the name of a register that now contains bytes_per_line. */
+ "decq %%rdx \n\t" /* One line finished; count down the remaining height */
+ "jnz 1b \n\t" /* NOTE(review): h is never tested before the loop, so h == 0 would not terminate here -- confirm callers never pass 0 */
+ "jmp 10f \n\t" /* We're done! */
+
+ "5: \n\t" /* Saturation is required */
+ "pcmpeqw %%xmm3, %%xmm3 \n\t" /* Packed Compare Equal Words */
+ /* From A64_128bit_Media_Programming (p. 276) */
+ /* This sets xmm3 to 128 1's (since xmm3 == xmm3) */
+ "psllw $5, %%xmm3 \n\t" /* xmm3 = 8 copies of 1111 1111 1110 0000 */
+ "6: \n\t" /* Start of the outer loop (lines), saturating version */
+ "movq %%rbx, %%rcx \n\t" /* rcx = 7 - width, exactly as at label 1 */
+ "addq $7, %%rcx \n\t"
+ "jns 8f \n\t" /* Fewer than 8 pixels: use the one-at-a-time loop */
+ "7: \n\t" /* Inner loop, 8 pixels at a time, saturating version */
+ "movdqu (%%rsi, %%rcx, 2), %%xmm0\n\t"
+ "movdqu %%xmm0, %%xmm1 \n\t"
+ "movdqu %%xmm0, %%xmm2 \n\t"
+ "psrlw $5, %%xmm1 \n\t"
+ "psrlw $10, %%xmm0 \n\t"
+ "psllw $11, %%xmm2 \n\t"
+ "psllw $11, %%xmm1 \n\t"
+ "psllw $8, %%xmm0 \n\t"
+ "psrlw $3, %%xmm1 \n\t"
+ "psrlw $3, %%xmm2 \n\t"
+ "pmulhw %%xmm5, %%xmm0 \n\t"
+ "pmulhw %%xmm6, %%xmm1 \n\t"
+ "pmulhw %%xmm7, %%xmm2 \n\t"
+ "paddusw %%xmm3, %%xmm0 \n\t" /* Saturating add of 0xFFE0: any value above 31 sticks at 0xFFFF */
+ "paddusw %%xmm3, %%xmm1 \n\t"
+ "paddusw %%xmm3, %%xmm2 \n\t"
+ "psubw %%xmm3, %%xmm0 \n\t" /* Remove the bias again, leaving min(value, 31). FIXME: This line needs to be added to the original asm code */
+ "psubw %%xmm3, %%xmm1 \n\t"
+ "psubw %%xmm3, %%xmm2 \n\t"
+ "psllw $10, %%xmm0 \n\t"
+ "psllw $5, %%xmm1 \n\t"
+ "por %%xmm2, %%xmm0 \n\t"
+ "por %%xmm1, %%xmm0 \n\t"
+ "movdqu %%xmm0, (%%rsi, %%rcx, 2)\n\t"
+ "addq $8, %%rcx \n\t"
+ "js 7b \n\t"
+ "jmp 9f \n\t"
+ "8: \n\t" /* One pixel at a time, saturating version */
+ "movw (%%rsi, %%rcx, 2), %%ax \n\t"
+ "movd %%eax, %%xmm0 \n\t"
+ "movq %%xmm0, %%xmm1 \n\t"
+ "movq %%xmm0, %%xmm2 \n\t"
+ "psrlw $5, %%xmm1 \n\t"
+ "psrlw $10, %%xmm0 \n\t"
+ "psllw $11, %%xmm2 \n\t"
+ "psllw $11, %%xmm1 \n\t"
+ "psllw $8, %%xmm0 \n\t"
+ "psrlw $3, %%xmm1 \n\t"
+ "psrlw $3, %%xmm2 \n\t"
+ "pmulhw %%xmm5, %%xmm0 \n\t"
+ "pmulhw %%xmm6, %%xmm1 \n\t"
+ "pmulhw %%xmm7, %%xmm2 \n\t"
+ "paddusw %%xmm3, %%xmm0 \n\t"
+ "paddusw %%xmm3, %%xmm1 \n\t"
+ "paddusw %%xmm3, %%xmm2 \n\t"
+ "psubw %%xmm3, %%xmm0 \n\t" /* Remove the bias again, leaving min(value, 31). FIXME: This line needs to be added to the original asm code */
+ "psubw %%xmm3, %%xmm1 \n\t"
+ "psubw %%xmm3, %%xmm2 \n\t"
+ "psllw $10, %%xmm0 \n\t"
+ "psllw $5, %%xmm1 \n\t"
+ "por %%xmm2, %%xmm0 \n\t"
+ "por %%xmm1, %%xmm0 \n\t"
+ "movd %%xmm0, %%eax \n\t"
+ "movw %%ax, (%%rsi, %%rcx, 2) \n\t"
+ "incq %%rcx \n\t"
+ "9: \n\t" /* Leftover test, as at label 4 */
+ "cmpq $6, %%rcx \n\t"
+ "jng 8b \n\t"
+ "addq %[blue_mod], %%rsi \n\t" /* Blue_mod is the name of a register that now contains bytes_per_line. */
+ "decq %%rdx \n\t"
+ "jnz 6b \n\t"
+ "10: \n\t" /* This is the end. Jump here if the width is zero. */
+ "emms \n\t" /* exit multi-media state (last asm instruction). NOTE(review): only xmm registers are used above, so this looks unnecessary -- confirm */
+ : /* outputs: none */
+ /* inputs: (many operations cannot be performed with a mix of 32bit & 64bit operands directly) */
+ /* (however the compiler/assembler can preload 32bit values into 64bit registers) */
+ /* (that is why certain variables cannot be referenced by name -- use their register) */
+ : [data] "S" (data), /* put the pointer data into the rsi register */
+ [width] "b" (w), /* put the width in the %rbx register (cannot be referenced by name) */
+ [height] "d" (h), /* put the height in the %rdx register (cannot be referenced by name) */
+ [red_mod] "r" ((unsigned long)(rm)),/* put the red_modifier in a register (referenced by name) */
+ [green_mod] "r" ((unsigned long)(gm)),/* put the green_modifier in a register (referenced by name) */
+ [blue_mod] "r" ((unsigned long)(bm)),/* put the blue_modifier in a register (referenced by name) Later store the bytes_line here */
+ [bytes_line] "a" (bpl) /* put the bytes_per_line in the %rax register (cannot be referenced by name) */
+ : "memory" /* clobbers: memory only. NOTE(review): "memory" does not cover registers; rcx and xmm0-xmm7 are written without being listed, and every input operand register is modified -- confirm against the GCC extended asm rules */
+ ); /* End of Assembly */
+}
+
+
+void shade_ximage_16_sse2( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
+{ /* Tints 16-bit 5-6-5 pixels (blue in bits 0-4, green in bits 5-10, red in bits 11-15) in place: each channel becomes (channel * modifier) >> 8, for h lines of w pixels spaced bpl bytes apart. */
+ __asm__ __volatile__ ( /* 8 pixels per iteration in xmm registers, leftovers one at a time; labels 1-4 are the plain path, 5-9 the saturating path */
+ ".align 16 \n\t" /* SIMD instructions should be aligned on 16 byte (128 bit) boundaries for performance reasons.*/
+ "leaq -14(%%rsi, %%rbx, 2), %%rsi\n\t" /* Point rsi at data + ( width * 2 bytes/pixel ) - 14, so that index rcx = 7 - width addresses the first pixel of a line */
+ "negq %%rbx \n\t" /* Negate the width so that we can increment the counter (also sets the zero flag) */
+ "jz 10f \n\t" /* Jump to the end if the width is zero */
+ "movd %[red_mod], %%xmm5 \n\t" /* Load the color modifiers into xmm registers */
+ "movd %[green_mod], %%xmm6 \n\t" /* " " */
+ "movd %[blue_mod], %%xmm7 \n\t" /* " " */
+ "punpcklwd %%xmm5, %%xmm5 \n\t" /* Unpack and Interleave low words. From A64_128bit_Media_Programming (p. 380) */
+ "punpcklwd %%xmm6, %%xmm6 \n\t" /* Duplicate the bottom 16 bits into the next 16 bits (both operands are the same) */
+ "punpcklwd %%xmm7, %%xmm7 \n\t"
+ "punpckldq %%xmm5, %%xmm5 \n\t" /* Unpack and Interleave low double words. From A64_128bit_Media_Programming (p. 376) */
+ "punpckldq %%xmm6, %%xmm6 \n\t" /* Duplicate the bottom 32 bits into the next 32 bits (both operands are the same) */
+ "punpckldq %%xmm7, %%xmm7 \n\t"
+ "punpcklqdq %%xmm5, %%xmm5 \n\t" /* Unpack and Interleave low quad words. From A64_128bit_Media_Programming (p. 378) */
+ "punpcklqdq %%xmm6, %%xmm6 \n\t" /* Duplicate the bottom 64 bits into the next 64 bits (each modifier now fills all 8 words) */
+ "punpcklqdq %%xmm7, %%xmm7 \n\t"
+ "or %[red_mod], %[green_mod] \n\t" /* This and the next two instructions OR the three modifiers together and shift right by 8: */
+ "or %[blue_mod], %[green_mod] \n\t" /* any modifier >= 256 has the 9th, or a higher, bit set, */
+ "sar $8, %[green_mod] \n\t" /* so something is left over only if a modifier is >= 256 (green_mod is already copied to xmm6). */
+ "movq %%rax, %[blue_mod] \n\t" /* Use the register named blue_mod to now store bytes_per_line. */
+ "xor %[red_mod], %[red_mod] \n\t" /* zero red so we don't have to load an immediate value for the following compare. */
+ "cmp %[red_mod], %[green_mod] \n\t" /* Compare the left over bits to zero */
+ "jg 5f \n\t" /* If one of the colors might need saturating then jump to the secondary set of loops. */
+ "1: \n\t" /* Start of the outer loop (lines). */
+ "movq %%rbx, %%rcx \n\t" /* Move the negated width into the count register */
+ "addq $7, %%rcx \n\t" /* rcx = 7 - width: the index of the first pixel relative to rsi */
+ "jns 3f \n\t" /* Non-negative index means fewer than 8 pixels: use the one-at-a-time loop */
+ "2: \n\t" /* Start of the inner loop (pixels 8 at a time --> 8 * 16 = 128bits/xmm register ) */
+ "movdqu (%%rsi, %%rcx, 2), %%xmm0\n\t" /* Load 8 pixels of 16 bits each (5 bits for red, 6 bits for green, 5 bits for blue) */
+ "movdqu %%xmm0, %%xmm1 \n\t" /* Create a copy of the pixels for the green color */
+ "movdqu %%xmm0, %%xmm2 \n\t" /* Create a copy of the pixels for the blue color */
+ "psrlw $5, %%xmm1 \n\t" /* Packed Shift Right Logical Words */
+ /* From A64_128bit_Media_Programming (p. 347) */
+ /* Shifts the blue off of the green color */
+ "psrlw $11, %%xmm0 \n\t" /* Shifts the blue & green off of the red color */
+ "psllw $11, %%xmm2 \n\t" /* Packed Shift Left Logical Words */
+ /* From A64_128bit_Media_Programming (p. 330) */
+ /* Shifts the red & green off of the blue color */
+ "psllw $10, %%xmm1 \n\t" /* Shifts the red off of the green color */
+ "psllw $8, %%xmm0 \n\t" /* Shifts the red color into position (value << 8) */
+ "psrlw $2, %%xmm1 \n\t" /* Shifts the green color into position (value << 8) */
+ "psrlw $3, %%xmm2 \n\t" /* Shifts the blue color into position (value << 8) */
+ "pmulhw %%xmm5, %%xmm0 \n\t" /* color *= modifier; keeping the high word gives ( color * modifier ) >> 8 */
+ "pmulhw %%xmm6, %%xmm1 \n\t"
+ "pmulhw %%xmm7, %%xmm2 \n\t"
+ "psllw $11, %%xmm0 \n\t" /* Shift red back into its original position */
+ "psllw $5, %%xmm1 \n\t" /* Shift green back into its original position */
+ "por %%xmm2, %%xmm0 \n\t" /* Mesh the colors back together */
+ "por %%xmm1, %%xmm0 \n\t"
+ "movdqu %%xmm0, (%%rsi, %%rcx, 2)\n\t" /* Place the shaded 8 pixels back into the image map */
+ "addq $8, %%rcx \n\t" /* Advance the index by the 8 pixels just processed */
+ "js 2b \n\t" /* Index still negative: at least 8 more pixels remain on this line */
+ "jmp 4f \n\t" /* Otherwise finish any leftover pixels singly */
+ "3: \n\t" /* Deal with pixels one at a time here. */
+ "movw (%%rsi, %%rcx, 2), %%ax \n\t" /* Load a single pixel (rax is free: bytes_per_line now lives in blue_mod) */
+ "movd %%eax, %%xmm0 \n\t" /* Only the low word of xmm0 matters from here on */
+ "movq %%xmm0, %%xmm1 \n\t" /* Same channel split, multiply and merge as the 8 pixel loop above */
+ "movq %%xmm0, %%xmm2 \n\t"
+ "psrlw $5, %%xmm1 \n\t"
+ "psrlw $11, %%xmm0 \n\t"
+ "psllw $11, %%xmm2 \n\t"
+ "psllw $10, %%xmm1 \n\t"
+ "psllw $8, %%xmm0 \n\t"
+ "psrlw $2, %%xmm1 \n\t"
+ "psrlw $3, %%xmm2 \n\t"
+ "pmulhw %%xmm5, %%xmm0 \n\t"
+ "pmulhw %%xmm6, %%xmm1 \n\t"
+ "pmulhw %%xmm7, %%xmm2 \n\t"
+ "psllw $11, %%xmm0 \n\t"
+ "psllw $5, %%xmm1 \n\t"
+ "por %%xmm2, %%xmm0 \n\t"
+ "por %%xmm1, %%xmm0 \n\t"
+ "movd %%xmm0, %%eax \n\t"
+ "movw %%ax, (%%rsi, %%rcx, 2) \n\t" /* Store the shaded pixel back */
+ "incq %%rcx \n\t"
+ "4: \n\t" /* Leftover test: index 6 addresses the last pixel of the line */
+ "cmpq $6, %%rcx \n\t"
+ "jng 3b \n\t" /* Keep going while the index is <= 6 */
+ "addq %[blue_mod], %%rsi \n\t" /* Blue_mod is the name of a register that now contains bytes_per_line. */
+ "decq %%rdx \n\t" /* One line finished; count down the remaining height */
+ "jnz 1b \n\t" /* NOTE(review): h is never tested before the loop, so h == 0 would not terminate here -- confirm callers never pass 0 */
+ "jmp 10f \n\t" /* We're done! */
+
+ "5: \n\t" /* Saturation is required */
+ "pcmpeqw %%xmm3, %%xmm3 \n\t" /* Packed Compare Equal Words */
+ /* From A64_128bit_Media_Programming (p. 276) */
+ /* This sets xmm3 to 128 1's (since xmm3 == xmm3) */
+ "movdqu %%xmm3, %%xmm4 \n\t" /* Make copy of 128 ones */
+ "psllw $5, %%xmm3 \n\t" /* xmm3 = 8 copies of 1111 1111 1110 0000 (bias for the 5 bit channels) */
+ "psllw $6, %%xmm4 \n\t" /* xmm4 = 8 copies of 1111 1111 1100 0000 (bias for the 6 bit green channel) */
+ "6: \n\t" /* Start of the outer loop (lines), saturating version */
+ "movq %%rbx, %%rcx \n\t" /* rcx = 7 - width, exactly as at label 1 */
+ "addq $7, %%rcx \n\t"
+ "jns 8f \n\t" /* Fewer than 8 pixels: use the one-at-a-time loop */
+ "7: \n\t" /* Inner loop, 8 pixels at a time, saturating version */
+ "movdqu (%%rsi, %%rcx, 2), %%xmm0\n\t"
+ "movdqu %%xmm0, %%xmm1 \n\t"
+ "movdqu %%xmm0, %%xmm2 \n\t"
+ "psrlw $5, %%xmm1 \n\t"
+ "psrlw $11, %%xmm0 \n\t"
+ "psllw $11, %%xmm2 \n\t"
+ "psllw $10, %%xmm1 \n\t"
+ "psllw $8, %%xmm0 \n\t"
+ "psrlw $2, %%xmm1 \n\t"
+ "psrlw $3, %%xmm2 \n\t"
+ "pmulhw %%xmm5, %%xmm0 \n\t"
+ "pmulhw %%xmm6, %%xmm1 \n\t"
+ "pmulhw %%xmm7, %%xmm2 \n\t"
+ "paddusw %%xmm3, %%xmm0 \n\t" /* Saturating add of the bias: any value above the channel maximum sticks at 0xFFFF */
+ "paddusw %%xmm4, %%xmm1 \n\t"
+ "paddusw %%xmm3, %%xmm2 \n\t"
+ "psubw %%xmm4, %%xmm1 \n\t" /* Remove the bias again, leaving min(green, 63) */
+ "psubw %%xmm3, %%xmm2 \n\t" /* Remove the bias again, leaving min(blue, 31) */
+ "psllw $11, %%xmm0 \n\t" /* Red needs no psubw: this shift pushes the 0xFFE0 bias out of the word, leaving min(red, 31) << 11 */
+ "psllw $5, %%xmm1 \n\t"
+ "por %%xmm2, %%xmm0 \n\t"
+ "por %%xmm1, %%xmm0 \n\t"
+ "movdqu %%xmm0, (%%rsi, %%rcx, 2)\n\t"
+ "addq $8, %%rcx \n\t"
+ "js 7b \n\t"
+ "jmp 9f \n\t"
+ "8: \n\t" /* One pixel at a time, saturating version */
+ "movw (%%rsi, %%rcx, 2), %%ax \n\t"
+ "movd %%eax, %%xmm0 \n\t"
+ "movq %%xmm0, %%xmm1 \n\t"
+ "movq %%xmm0, %%xmm2 \n\t"
+ "psrlw $5, %%xmm1 \n\t"
+ "psrlw $11, %%xmm0 \n\t"
+ "psllw $11, %%xmm2 \n\t"
+ "psllw $10, %%xmm1 \n\t"
+ "psllw $8, %%xmm0 \n\t"
+ "psrlw $2, %%xmm1 \n\t"
+ "psrlw $3, %%xmm2 \n\t"
+ " \n\t"
+ "pmulhw %%xmm5, %%xmm0 \n\t"
+ "pmulhw %%xmm6, %%xmm1 \n\t"
+ "pmulhw %%xmm7, %%xmm2 \n\t"
+ " \n\t"
+ "paddusw %%xmm3, %%xmm0 \n\t"
+ "paddusw %%xmm4, %%xmm1 \n\t"
+ "paddusw %%xmm3, %%xmm2 \n\t"
+ " \n\t"
+ "psubw %%xmm4, %%xmm1 \n\t" /* Remove the bias again, leaving min(green, 63) */
+ "psubw %%xmm3, %%xmm2 \n\t" /* Remove the bias again, leaving min(blue, 31) */
+ " \n\t"
+ "psllw $11, %%xmm0 \n\t" /* Red needs no psubw: this shift pushes the 0xFFE0 bias out of the word */
+ "psllw $5, %%xmm1 \n\t"
+ "por %%xmm2, %%xmm0 \n\t"
+ "por %%xmm1, %%xmm0 \n\t"
+ "movd %%xmm0, %%eax \n\t"
+ "movw %%ax, (%%rsi, %%rcx, 2) \n\t"
+ "incq %%rcx \n\t"
+ "9: \n\t" /* Leftover test, as at label 4 */
+ "cmpq $6, %%rcx \n\t"
+ "jng 8b \n\t"
+ "addq %[blue_mod], %%rsi \n\t" /* Blue_mod is the name of a register that now contains bytes_per_line. */
+ "decq %%rdx \n\t"
+ "jnz 6b \n\t"
+ "10: \n\t" /* This is the end. Jump here if the width is zero. */
+ "emms \n\t" /* exit multi-media state (last asm instruction). NOTE(review): only xmm registers are used above, so this looks unnecessary -- confirm */
+ : /* outputs: none */
+ /* inputs: (many operations cannot be performed with a mix of 32bit & 64bit operands directly) */
+ /* (however the compiler/assembler can preload 32bit values into 64bit registers) */
+ /* (that is why certain variables cannot be referenced by name -- use their register) */
+ : [data] "S" (data), /* put the pointer data into the rsi register */
+ [width] "b" (w), /* put the width in the %rbx register (cannot be referenced by name) */
+ [height] "d" (h), /* put the height in the %rdx register (cannot be referenced by name) */
+ [red_mod] "r" ((unsigned long)(rm)),/* put the red_modifier in a register (referenced by name) */
+ [green_mod] "r" ((unsigned long)(gm)),/* put the green_modifier in a register (referenced by name) */
+ [blue_mod] "r" ((unsigned long)(bm)),/* put the blue_modifier in a register (referenced by name) Later store the bytes_line here */
+ [bytes_line] "a" (bpl) /* put the bytes_per_line in the %rax register (cannot be referenced by name) */
+ : "memory" /* clobbers: memory only. NOTE(review): "memory" does not cover registers; rcx and xmm0-xmm7 are written without being listed, and every input operand register is modified -- confirm against the GCC extended asm rules */
+ ); /* End of Assembly */
+}
+
+/*
+ * shade_ximage_32_sse2 -- shade a 32 bits-per-pixel image in place using SSE2 inline assembly (x86-64 only:
+ * the code addresses rsi/rbx/rcx/rdx/rax directly).
+ *
+ * For every pixel the byte at offset 0 is scaled by bm, the byte at offset 1 by gm and the byte at offset 2 by rm.
+ * Each scaled byte is approximately ( byte * modifier ) / 256, saturated to 0..255 by packuswb.
+ * The byte at offset 3 is written back as zero, because its 16 bit modifier slot is zero (as long as rm fits in 16 bits).
+ *
+ * data : address of the first pixel of the first row; pixels are modified in place
+ * bpl  : bytes added to the row pointer after each row
+ * w    : pixels per row (w == 0 returns without touching the image)
+ * h    : number of rows
+ * rm, gm, bm : red, green and blue modifiers; each gets a 16 bit slot in the packed modifier
+ *
+ * Pixels are processed two at a time; when the width is odd the last pixel of a row is handled by a one pixel loop.
+ *
+ * NOTE(review): with w == 1 the two-pixel loop still runs once and reads/writes the 4 bytes after the pixel.
+ * NOTE(review): h is never tested before the first row is processed, and negative w or h are not handled.
+ * NOTE(review): w, h and bpl are ints, yet the asm uses all 64 bits of rbx, rdx and rax -- confirm the upper halves are well defined.
+ * NOTE(review): the clobber list names only "memory", yet rcx, xmm0, xmm1, xmm4, xmm5, xmm6 and the flags are changed,
+ *               and the input registers rsi, rbx and rdx are modified.
+ */
+void shade_ximage_32_sse2( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
+{
+ __asm__ __volatile__ (
+ ".align 16 \n\t" /* Align the instructions that follow on a 16 byte (128 bit) boundary for performance reasons. */
+ "leaq -4(%%rsi, %%rbx, 4), %%rsi\n\t" /* From A64_General_Purpose_and_System_Instructions (p. 182) */
+ /* Intel syntax section:[base + index*scale + disp] (used by AMD manuals) */
+ /* AT&T syntax section:disp(base, index, scale) (used by gas/gcc) */
+ /* Load Effective Address of (rsi + (rbx * 4) - 4) into rsi: the last pixel of the first row */
+ /* 32 bits per pixel means a multiplier of 4. */
+ "negq %%rbx \n\t" /* two's complement negation of rbx (width); sets the Zero Flag based on the result */
+ /* From A64_General_Purpose_and_System_Instructions (p. 212) */
+ "jz 10f \n\t" /* Jump forward to label 10 on Zero */
+ /* Basically if width = 0, skip all the work */
+ /* NOTE(review): height is not checked; rdx is decremented only after a row was processed, so h == 0 does not give zero iterations */
+ "movd %[red_mod], %%xmm4 \n\t" /* move red modifier into the low 32 bits of xmm4 w/ zero extension to 128bits */
+ /* Each modifier gets a 16 bit slot below, so each one is assumed to fit in 16 bits */
+ "psllq $16, %%xmm4 \n\t" /* Packed Shift Left Logical Quad words (shifts each 64bit half of xmm4 left by 16bits) */
+ /* From A64_128bit_Media_Programming (p. 328) */
+ "movd %[green_mod], %%xmm5 \n\t" /* move green modifier into xmm5 w/ zero extension to 128bits */
+ "por %%xmm5, %%xmm4 \n\t" /* Merge green modifier into color modifier */
+ "psllq $16, %%xmm4 \n\t" /* Packed Shift Left Logical Quad words (shifts each 64bit half of xmm4 left by 16bits) */
+ "movd %[blue_mod], %%xmm5 \n\t" /* move blue modifier (32 bits) into xmm5 w/ zero extension to 128bits */
+ "por %%xmm5, %%xmm4 \n\t" /* Merge blue modifier into color modifier */
+ /* the low 64 bits of xmm4 (color modifier) now hold four 16 bit words, high to low: 0 : rm : gm : bm */
+ "punpcklqdq %%xmm4, %%xmm4 \n\t" /* Unpack and Interleave low quad words. From A64_128bit_Media_Programming (p. 378) */
+ /* Duplicate the bottom 64 bits into the next 64 bits (both operands are the same) */
+ "pcmpeqw %%xmm6, %%xmm6 \n\t" /* Packed Compare Equal Words */
+ /* From A64_128bit_Media_Programming (p. 276) */
+ /* This sets xmm6 to 128 1's (since xmm6 = xmm6) */
+ "psllw $15, %%xmm6 \n\t" /* Packed Shift Left Logical Words */
+ /* From A64_128bit_Media_Programming (p. 330) */
+ /* This sets 8 16 bit values of 1000 0000 0000 0000 in the 128 bit word */
+ "movdqu %%xmm6, %%xmm5 \n\t" /* Copy xmm6 to xmm5 (we need xmm6 later) */
+ "pmulhw %%xmm4, %%xmm5 \n\t" /* Packed Multiply High Signed Word */
+ /* xmm5 = ( xmm5 * xmm4 ) >> 16 (8 times, once for each 16bit value) */
+ /* For each color modifier (cm): */
+ /* the multiply is signed and 80 00 is -32768, so each word becomes ( cm * -32768 ) >> 16, about -( cm / 2 ) */
+ "1: \n\t" /* The start of the outer loop (lines) */
+ "movq %%rbx, %%rcx \n\t" /* Load the counting register (rcx) with the negated width of the window to shade */
+ "incq %%rcx \n\t" /* rcx = 1 - width, so (rsi, rcx, 4) addresses the first pixel of the row */
+ "2: \n\t" /* The start of the two-pixel inner loop (columns, two at a time) */
+ "movq (%%rsi, %%rcx, 4), %%xmm1 \n\t" /* loads 64 bits (two 32bit pixels) from the image map into xmm1, zero extended to 128 bits */
+ /* 32 bit color is 4 bytes per pixel, hence the multiplier of 4 */
+ /* NOTE(review): this body runs before rcx is tested, so with width == 1 (rcx == 0) it also touches the 4 bytes after the pixel */
+ "pxor %%xmm0, %%xmm0 \n\t" /* 128bit exclusive or (sets xmm0 to 0) */
+ "punpcklbw %%xmm1, %%xmm0 \n\t" /* Unpack and interleave low bytes */
+ /* For each color of both pixels expand to 16 bits and shift left 8 bits */
+ /* From A64_128bit_Media_Programming (p. 374) */
+ /* discard high 64 bits and expand both xmm0 and xmm1 a byte at a time into xmm0 (xmm0 first) */
+ "pxor %%xmm6, %%xmm0 \n\t" /* This flips the sign bit of each 16 bit color word. (xmm6 = 1000:0000:0000:0000 8 times) */
+ "pmulhw %%xmm4, %%xmm0 \n\t" /* Packed Multiply High Signed Word (an SSE2 instruction) 128bit xmm0=color xmm4=cm */
+ /* Each 16 bit signed int in xmm4 (8) is multiplied by the same in xmm0 */
+ /* and the high 16 bits of the result replace the 16 bits used from xmm0 */
+ /* For (( each 16 bit color * each 16 bit color modifier ) >> 16 ) */
+ "psubw %%xmm5, %%xmm0 \n\t" /* Packed Subtract Words */
+ /* From A64_128bit_Media_Programming (p. 364) */
+ /* xmm0=modified color xmm5=corrected color modifier. xmm0 = ( xmm0 - xmm5 ) */
+ /* this removes the offset that the sign flip above introduced into the product */
+ "packuswb %%xmm0, %%xmm0 \n\t" /* Pack with Saturation Signed Word to Unsigned Byte */
+ /* From A64_128bit_Media_Programming (p. 246) */
+ /* for each word: if word > 255 then 255 elsif word < 0 then 0 else unchanged */
+ /* The low 64 bits now hold the 2 packed pixels (the high 64 bits hold a copy and are not used) */
+ "movq %%xmm0, (%%rsi, %%rcx, 4) \n\t" /* puts the two new 32 bit color values (64 bits) back into the data (image map) */
+ /* two pixels are stored at once, matching the 64 bit load above */
+ "addq $2, %%rcx \n\t" /* Advance the count register by two pixels */
+ "js 2b \n\t" /* Jump backwards to label 2 (restart inner loop) on negative (more pixels left) */
+ "jmp 5f \n\t" /* Jump to single pixel section after pairs are exhausted */
+ "4: \n\t" /* The start of the single pixel loop (last pixel of an odd width row) */
+ "movd (%%rsi, %%rcx, 4), %%xmm1 \n\t" /* sets xmm1 to the 32bit color in the image map (data[ rcx ]) */
+ /* 32 bit color is still 4 bytes so leave the multiplier alone it is zero extended to 128 bits */
+ /* only move 32 bits with movd so we don't get two pixels worth of colors */
+ "pxor %%xmm0, %%xmm0 \n\t" /* 128bit exclusive or (sets xmm0 to 0) */
+ "punpcklbw %%xmm1, %%xmm0 \n\t" /* Unpack and interleave low bytes */
+ /* For each color of the pixel expand to 16 bits and shift left 8 bits */
+ /* From A64_128bit_Media_Programming (p. 374) */
+ /* discard high 64 bits and expand both xmm0 and xmm1 a byte at a time into xmm0 (xmm0 first) */
+ "pxor %%xmm6, %%xmm0 \n\t" /* This flips the sign bit of each 16 bit color word. (xmm6 = 1000:0000:0000:0000 8 times) */
+ "pmulhw %%xmm4, %%xmm0 \n\t" /* Packed Multiply High Signed Word (an SSE2 instruction) 128bit xmm0=color xmm4=cm */
+ /* Each 16 bit signed int in xmm4 (8) is multiplied by the same in xmm0 */
+ /* and the high 16 bits of the result replace the 16 bits used from xmm0 */
+ /* For (( each 16 bit color * each 16 bit color modifier ) >> 16 ) */
+ "psubw %%xmm5, %%xmm0 \n\t" /* Packed Subtract Words */
+ /* From A64_128bit_Media_Programming (p. 364) */
+ /* xmm0=modified color xmm5=corrected color modifier. xmm0 = ( xmm0 - xmm5 ) */
+ /* this removes the offset that the sign flip above introduced into the product */
+ "packuswb %%xmm0, %%xmm0 \n\t" /* Pack with Saturation Signed Word to Unsigned Byte */
+ /* From A64_128bit_Media_Programming (p. 246) */
+ /* for each word: if word > 255 then 255 elsif word < 0 then 0 else unchanged */
+ "movd %%xmm0, (%%rsi, %%rcx, 4) \n\t" /* puts the new 32 bit color value back into the data (image map) */
+ /* 32 bit color is still a double word so movd stays movd */
+ "incq %%rcx \n\t" /* Increment the count register (one pixel done) */
+ "5: \n\t" /* Jump here after all pairs of pixels are exhausted */
+ "cmpq $0, %%rcx \n\t" /* Compare the count register with zero */
+ "jng 4b \n\t" /* Jump backwards to label 4 (single pixel loop) while rcx is not greater than zero (a pixel is left) */
+
+ "addq %%rax, %%rsi \n\t" /* Add bytes per line to the data pointer (advance the pointer to the next line) */
+ "decq %%rdx \n\t" /* Decrement the dx register (row count) */
+ "jnz 1b \n\t" /* Jump backwards to label 1 (restart outer loop) if not zero (more rows left) */
+ "10: \n\t" /* End of function (jump here to clean up and return to caller) */
+ "emms \n\t" /* exit multi-media state (last asm instruction). NOTE(review): only xmm registers are used above; confirm emms is needed */
+ : /* outputs: none */
+ /* inputs: (many operations cannot be performed with a mix of 32bit & 64bit operands directly) */
+ /* (however the compiler/assembler can preload 32bit values into 64bit registers) */
+ /* (that is why certain variables cannot be referenced by name -- use their register) */
+ : [data] "S" (data), /* put the pointer data into the rsi register */
+ [width] "b" (w), /* put the width in the %rbx register (cannot be referenced by name) */
+ [height] "d" (h), /* put the height in the %rdx register (cannot be referenced by name) */
+ [red_mod] "r" (rm), /* put the red_modifier in a register (referenced by name) */
+ [green_mod] "r" (gm), /* put the green_modifier in a register (referenced by name) */
+ [blue_mod] "r" (bm), /* put the blue_modifier in a register (referenced by name) */
+ [bytes_line] "a" (bpl) /* put the bytes_per_line in the %rax register (cannot be referenced by name) */
+ : "memory" /* clobbers: only memory is declared. NOTE(review): rcx, xmm0, xmm1, xmm4, xmm5, xmm6 and the flags are also changed */
+ ); /* End of Assembly */
+}
+
+#endif