summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: David Schleef <ds@schleef.org>, 2005-08-01 08:51:48 +0000
committer: David Schleef <ds@schleef.org>, 2005-08-01 08:51:48 +0000
commit: 48c9a788517b68892535519456a3fb4424408890 (patch)
tree: 3282253228fec7a1c0eef51ff525cb0696fb1f1d
parent: dcac935729d4072732b60b1a7db40426e51584d4 (diff)
download: liboil-48c9a788517b68892535519456a3fb4424408890.tar.gz
* liboil/colorspace/composite_i386.c: (composite_over_argb_sse2_2),
  (composite_over_argb_sse2_3): hacking
* liboil/liboilcpu.c: (oil_cpu_i386_getflags_cpuid): Intel's SSE2 implies MMXEXT
-rw-r--r-- ChangeLog 7
-rw-r--r-- liboil/colorspace/composite_i386.c 268
-rw-r--r-- liboil/liboilcpu.c 1
3 files changed, 276 insertions, 0 deletions
diff --git a/ChangeLog b/ChangeLog
index 154a257..29d6b7e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2005-08-01 David Schleef <ds@schleef.org>
+
+ * liboil/colorspace/composite_i386.c: (composite_over_argb_sse2_2),
+ (composite_over_argb_sse2_3): hacking
+ * liboil/liboilcpu.c: (oil_cpu_i386_getflags_cpuid): Intel's SSE2
+ implies MMXEXT
+
2005-07-31 David Schleef <ds@schleef.org>
* liboil/colorspace/composite_i386.c: (composite_over_argb_mmx_4),
diff --git a/liboil/colorspace/composite_i386.c b/liboil/colorspace/composite_i386.c
index 0e6cd3a..415ec65 100644
--- a/liboil/colorspace/composite_i386.c
+++ b/liboil/colorspace/composite_i386.c
@@ -295,6 +295,7 @@ composite_over_argb_mmx_3 (uint32_t *dest, uint32_t *src, int n)
}
OIL_DEFINE_IMPL_FULL (composite_over_argb_mmx_3, composite_over_argb, OIL_IMPL_FLAG_MMX);
+/* written for gromit */
static void
composite_over_argb_mmx_4 (uint32_t *dest, uint32_t *src, int n)
{
@@ -440,6 +441,273 @@ composite_over_argb_sse2 (uint32_t *dest, uint32_t *src, int n)
}
OIL_DEFINE_IMPL_FULL (composite_over_argb_sse2, composite_over_argb, OIL_IMPL_FLAG_SSE2);
+/* written for shaun */
+/*
+ * SSE2 variant of composite_over_argb that composites two pixels per loop
+ * iteration.
+ *
+ * dest: n 32-bit pixels, updated in place
+ * src:  n 32-bit pixels that are composited over dest
+ * n:    number of pixels to process
+ *
+ * A source pixel whose top byte (alpha) is zero leaves the destination
+ * pixel untouched.  Otherwise every byte of the destination becomes
+ *
+ *   saturate (src + (((dest * (255 - alpha) + 128) * 257) >> 16))
+ *
+ * with alpha taken from the top byte of the source pixel.
+ *
+ * NOTE(review): xmm4..xmm7 are loaded by the first asm statement and read
+ * by the later ones without being passed as operands, so this relies on
+ * the compiler not using those registers in between.
+ */
+static void
+composite_over_argb_sse2_2 (uint32_t *dest, uint32_t *src, int n)
+{
+  int end;
+
+  /* Constants shared by all asm statements below. */
+  __asm__ __volatile__ (
+      "  pxor %%xmm7, %%xmm7\n"          // xmm7 = eight words of 0
+      "  movl $0x80808080, %%eax\n"
+      "  movd %%eax, %%xmm6\n"
+      "  punpcklbw %%xmm7, %%xmm6\n"
+      "  punpcklwd %%xmm6, %%xmm6\n"     // xmm6 = eight words of 128
+      "  movl $0xffffffff, %%eax\n"
+      "  movd %%eax, %%xmm5\n"
+      "  punpcklbw %%xmm7, %%xmm5\n"
+      "  punpcklwd %%xmm5, %%xmm5\n"     // xmm5 = eight words of 255
+      "  movl $0x02020202, %%eax\n"
+      "  movd %%eax, %%xmm4\n"
+      "  punpcklbw %%xmm7, %%xmm4\n"
+      "  paddw %%xmm5, %%xmm4\n"
+      "  punpcklwd %%xmm4, %%xmm4\n"     // xmm4 = eight words of 257
+    :
+    :
+    :"eax");
+
+  /* Odd count and dest not 16-byte aligned: composite one pixel up front. */
+  if ((n & 1) && ((unsigned long)dest & 0xf)) {
+    __asm__ __volatile__ (
+        "  movl (%1), %%eax\n"
+        "  testl $0xff000000, %%eax\n"   // alpha == 0: nothing to do
+        "  jz 1f\n"
+
+        "  movd (%1), %%xmm1\n"
+        "  punpcklbw %%xmm7, %%xmm1\n"   // source bytes -> words
+        "  pshuflw $0xff, %%xmm1, %%xmm0\n" // broadcast the alpha word
+        "  pxor %%xmm5, %%xmm0\n"        // 255 - alpha
+
+        "  movd (%0), %%xmm3\n"
+        "  punpcklbw %%xmm7, %%xmm3\n"
+        "  pmullw %%xmm0, %%xmm3\n"      // dest * (255 - alpha)
+        "  paddw %%xmm6, %%xmm3\n"       // + 128
+        "  pmulhuw %%xmm4, %%xmm3\n"     // (* 257) >> 16
+
+        "  paddw %%xmm1, %%xmm3\n"       // + source
+        "  packuswb %%xmm3, %%xmm3\n"    // words -> bytes, saturating
+        "  movd %%xmm3, (%0)\n"
+
+        "1:\n"
+        "  add $4, %0\n"
+        "  add $4, %1\n"
+      :"+r" (dest), "+r" (src)
+      :
+      :"eax", "memory", "cc");
+    /* This pixel is finished.  Without the decrement the count stays odd,
+     * the tail below runs as well, and n + 1 pixels get processed. */
+    n--;
+  }
+  end = n & 1;
+  n >>= 1;
+
+  /* Main loop: n now counts pairs of pixels. */
+  if (n > 0) {
+    __asm__ __volatile__ ("\n"
+        "3:\n"
+        "  movl (%1), %%eax\n"
+        "  orl 4(%1), %%eax\n"
+        "  testl $0xff000000, %%eax\n"   // both alphas == 0: skip the pair
+        "  jz 4f\n"
+
+        "  movq (%1), %%xmm1\n"
+        "  punpcklbw %%xmm7, %%xmm1\n"
+        "  pshuflw $0xff, %%xmm1, %%xmm0\n" // alpha of first pixel
+        "  pshufhw $0xff, %%xmm0, %%xmm0\n" // alpha of second pixel
+        "  pxor %%xmm5, %%xmm0\n"
+
+        "  movq (%0), %%xmm3\n"
+        "  punpcklbw %%xmm7, %%xmm3\n"
+        "  pmullw %%xmm0, %%xmm3\n"
+        "  paddw %%xmm6, %%xmm3\n"
+        "  pmulhuw %%xmm4, %%xmm3\n"
+        "  paddw %%xmm1, %%xmm3\n"
+        "  packuswb %%xmm3, %%xmm3\n"
+        "  movq %%xmm3, (%0)\n"
+        "4:\n"
+        "  add $8, %0\n"
+        "  add $8, %1\n"
+        "  subl $1, %2\n"
+        "  jnz 3b\n"
+      :"+r" (dest), "+r" (src), "+r" (n)
+      :
+      :"eax", "memory", "cc");
+  }
+
+  /* One pixel left over when the remaining count was odd. */
+  if (end) {
+    __asm__ __volatile__ (
+        "  movl (%1), %%eax\n"
+        "  testl $0xff000000, %%eax\n"
+        "  jz 1f\n"
+
+        "  movd (%1), %%xmm1\n"
+        "  punpcklbw %%xmm7, %%xmm1\n"
+        "  pshuflw $0xff, %%xmm1, %%xmm0\n"
+        "  pxor %%xmm5, %%xmm0\n"
+
+        "  movd (%0), %%xmm3\n"
+        "  punpcklbw %%xmm7, %%xmm3\n"
+        "  pmullw %%xmm0, %%xmm3\n"
+        "  paddw %%xmm6, %%xmm3\n"
+        "  pmulhuw %%xmm4, %%xmm3\n"
+
+        "  paddw %%xmm1, %%xmm3\n"
+        "  packuswb %%xmm3, %%xmm3\n"
+        "  movd %%xmm3, (%0)\n"
+
+        "1:\n"
+        "  add $4, %0\n"
+        "  add $4, %1\n"
+      :"+r" (dest), "+r" (src)
+      :
+      :"eax", "memory", "cc");
+  }
+
+}
+OIL_DEFINE_IMPL_FULL (composite_over_argb_sse2_2, composite_over_argb, OIL_IMPL_FLAG_SSE2);
+
+/* written for shaun */
+/*
+ * SSE2 variant of composite_over_argb that composites four pixels per
+ * iteration using aligned 16-byte accesses to dest.
+ *
+ * dest: n 32-bit pixels, updated in place
+ * src:  n 32-bit pixels that are composited over dest
+ * n:    number of pixels to process
+ *
+ * The work is split into three runs:
+ *   begin  - single pixels until dest reaches a 16-byte boundary
+ *   middle - groups of four pixels, dest accessed with movdqa
+ *   end    - the remaining single pixels
+ *
+ * A source pixel (or group of four) whose alpha bytes are all zero leaves
+ * the destination untouched.  Otherwise every byte of the destination
+ * becomes
+ *
+ *   saturate (src + (((dest * (255 - alpha) + 128) * 257) >> 16))
+ *
+ * with alpha taken from the top byte of the matching source pixel.
+ *
+ * NOTE(review): the begin computation assumes dest is at least 4-byte
+ * aligned; otherwise the movdqa accesses in the middle loop are misaligned.
+ * NOTE(review): xmm4..xmm7 are loaded by the first asm statement and read
+ * by the later ones without being passed as operands, so this relies on
+ * the compiler not using those registers in between.
+ */
+static void
+composite_over_argb_sse2_3 (uint32_t *dest, uint32_t *src, int n)
+{
+  int begin;
+  int middle;
+  int end;
+
+  /* Constants shared by all asm statements below. */
+  __asm__ __volatile__ (
+      "  pxor %%xmm7, %%xmm7\n"          // xmm7 = eight words of 0
+      "  movl $0x80808080, %%eax\n"
+      "  movd %%eax, %%xmm6\n"
+      "  punpcklbw %%xmm7, %%xmm6\n"
+      "  punpcklwd %%xmm6, %%xmm6\n"     // xmm6 = eight words of 128
+      "  movl $0xffffffff, %%eax\n"
+      "  movd %%eax, %%xmm5\n"
+      "  punpcklbw %%xmm7, %%xmm5\n"
+      "  punpcklwd %%xmm5, %%xmm5\n"     // xmm5 = eight words of 255
+      "  movl $0x02020202, %%eax\n"
+      "  movd %%eax, %%xmm4\n"
+      "  punpcklbw %%xmm7, %%xmm4\n"
+      "  paddw %%xmm5, %%xmm4\n"
+      "  punpcklwd %%xmm4, %%xmm4\n"     // xmm4 = eight words of 257
+    :
+    :
+    :"eax");
+
+  /* Pixels (0..3) needed to bring dest up to a 16-byte boundary. */
+  begin = 0x3 & (4 - (((unsigned long)dest & 0xf) >> 2));
+  if (begin > n) {
+    begin = n;
+    middle = 0;
+    end = 0;
+  } else {
+    middle = (n - begin) >> 2;
+    end = n - begin - middle * 4;
+  }
+
+  if (begin > 0) {
+    __asm__ __volatile__ ("\n"
+        "1:\n"
+        "  movl (%1), %%eax\n"
+        "  testl $0xff000000, %%eax\n"   // alpha == 0: nothing to do
+        "  jz 2f\n"
+
+        "  movd (%1), %%xmm1\n"
+        "  punpcklbw %%xmm7, %%xmm1\n"   // source bytes -> words
+        "  pshuflw $0xff, %%xmm1, %%xmm0\n" // broadcast the alpha word
+        "  pxor %%xmm5, %%xmm0\n"        // 255 - alpha
+
+        "  movd (%0), %%xmm3\n"
+        "  punpcklbw %%xmm7, %%xmm3\n"
+        "  pmullw %%xmm0, %%xmm3\n"      // dest * (255 - alpha)
+        "  paddw %%xmm6, %%xmm3\n"       // + 128
+        "  pmulhuw %%xmm4, %%xmm3\n"     // (* 257) >> 16
+
+        "  paddw %%xmm1, %%xmm3\n"       // + source
+        "  packuswb %%xmm3, %%xmm3\n"    // words -> bytes, saturating
+        "  movd %%xmm3, (%0)\n"
+
+        "2:\n"
+        "  add $4, %0\n"
+        "  add $4, %1\n"
+        "  subl $1, %2\n"
+        "  jnz 1b\n"
+      :"+r" (dest), "+r" (src), "+r" (begin)
+      :
+      :"eax", "memory", "cc");
+  }
+
+  if (middle > 0) {
+    __asm__ __volatile__ ("\n"
+        "1:\n"
+        "  movdqu (%1), %%xmm1\n"
+        "  movl (%1), %%eax\n"
+        "  orl 4(%1), %%eax\n"
+        "  orl 8(%1), %%eax\n"
+        "  orl 12(%1), %%eax\n"
+        "  test $0xff000000, %%eax\n"    // all four alphas == 0: skip
+        "  jz 2f\n"
+        "  movdqa %%xmm1, %%xmm0\n"
+        "  punpcklbw %%xmm7, %%xmm1\n"   // xmm1 = source pixels 0 and 1
+        "  punpckhbw %%xmm7, %%xmm0\n"   // xmm0 = source pixels 2 and 3
+        "  pshuflw $0xff, %%xmm1, %%xmm1\n"
+        "  pshuflw $0xff, %%xmm0, %%xmm0\n"
+        "  pshufhw $0xff, %%xmm1, %%xmm1\n"
+        "  pshufhw $0xff, %%xmm0, %%xmm0\n"
+
+        "  pxor %%xmm5, %%xmm1\n"        // 255 - alpha, pixels 0 and 1
+        "  pxor %%xmm5, %%xmm0\n"        // 255 - alpha, pixels 2 and 3
+
+        "  movdqa (%0), %%xmm3\n"
+        "  movdqa %%xmm3, %%xmm2\n"
+        "  punpcklbw %%xmm7, %%xmm3\n"   // xmm3 = dest pixels 0 and 1
+        "  punpckhbw %%xmm7, %%xmm2\n"   // xmm2 = dest pixels 2 and 3
+
+        "  pmullw %%xmm1, %%xmm3\n"
+        "  paddw %%xmm6, %%xmm3\n"
+        "  pmulhuw %%xmm4, %%xmm3\n"
+        "  pmullw %%xmm0, %%xmm2\n"
+        "  paddw %%xmm6, %%xmm2\n"
+        "  pmulhuw %%xmm4, %%xmm2\n"
+        "  packuswb %%xmm2, %%xmm3\n"
+
+        "  movdqu (%1), %%xmm1\n"
+        // Saturating byte add, so this loop produces the same result as
+        // the paddw/packuswb sequence of the single-pixel loops.
+        "  paddusb %%xmm1, %%xmm3\n"
+        "  movdqa %%xmm3, (%0)\n"
+        "2:\n"
+        "  add $16, %0\n"
+        "  add $16, %1\n"
+        "  subl $1, %2\n"
+        "  jnz 1b\n"
+      :"+r" (dest), "+r" (src), "+r" (middle)
+      :
+      :"eax", "memory", "cc");
+  }
+
+  if (end > 0) {
+    __asm__ __volatile__ ("\n"
+        "1:\n"
+        "  movl (%1), %%eax\n"
+        "  testl $0xff000000, %%eax\n"
+        "  jz 2f\n"
+
+        "  movd (%1), %%xmm1\n"
+        "  punpcklbw %%xmm7, %%xmm1\n"
+        "  pshuflw $0xff, %%xmm1, %%xmm0\n"
+        "  pxor %%xmm5, %%xmm0\n"
+
+        "  movd (%0), %%xmm3\n"
+        "  punpcklbw %%xmm7, %%xmm3\n"
+        "  pmullw %%xmm0, %%xmm3\n"
+        "  paddw %%xmm6, %%xmm3\n"
+        "  pmulhuw %%xmm4, %%xmm3\n"
+
+        "  paddw %%xmm1, %%xmm3\n"
+        "  packuswb %%xmm3, %%xmm3\n"
+        "  movd %%xmm3, (%0)\n"
+
+        "2:\n"
+        "  add $4, %0\n"
+        "  add $4, %1\n"
+        "  subl $1, %2\n"
+        "  jnz 1b\n"
+      :"+r" (dest), "+r" (src), "+r" (end)
+      :
+      :"eax", "memory", "cc");
+  }
+
+}
+OIL_DEFINE_IMPL_FULL (composite_over_argb_sse2_3, composite_over_argb, OIL_IMPL_FLAG_SSE2);
+
#if 0
static void
composite_over_argb_const_src_mmx (uint32_t *dest, uint32_t *src, int n)
diff --git a/liboil/liboilcpu.c b/liboil/liboilcpu.c
index 54968c4..1e33f68 100644
--- a/liboil/liboilcpu.c
+++ b/liboil/liboilcpu.c
@@ -180,6 +180,7 @@ oil_cpu_i386_getflags_cpuid (void)
}
if (edx & (1<<26)) {
oil_cpu_flags |= OIL_IMPL_FLAG_SSE2;
+ oil_cpu_flags |= OIL_IMPL_FLAG_MMXEXT;
}
if (ecx & (1<<0)) {
oil_cpu_flags |= OIL_IMPL_FLAG_SSE3;