summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Schleef <ds@schleef.org>2005-07-31 03:33:01 +0000
committerDavid Schleef <ds@schleef.org>2005-07-31 03:33:01 +0000
commitfeca034960b9aef6cd5bdc02238832af55ffff62 (patch)
tree1dd9a78e8b3df16ecd5c4afb609e0a1693a58557
parent70d44fea4410f36e761450088c805757bb0e68f8 (diff)
downloadliboil-feca034960b9aef6cd5bdc02238832af55ffff62.tar.gz
* liboil/colorspace/composite_i386.c: (composite_over_argb_mmx),
(composite_over_argb_mmx_2), (composite_over_argb_mmx_3), (composite_over_argb_mmx_4), (composite_over_argb_sse2): some new implementations
-rw-r--r--ChangeLog7
-rw-r--r--liboil/colorspace/composite_i386.c251
2 files changed, 233 insertions, 25 deletions
diff --git a/ChangeLog b/ChangeLog
index fb56e40..f83113a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,12 @@
2005-07-30 David Schleef <ds@schleef.org>
+ * liboil/colorspace/composite_i386.c: (composite_over_argb_mmx),
+ (composite_over_argb_mmx_2), (composite_over_argb_mmx_3),
+ (composite_over_argb_mmx_4), (composite_over_argb_sse2): some
+ new implementations
+
+2005-07-30 David Schleef <ds@schleef.org>
+
* examples/report.c: (oil_print_class): Fix printing
* liboil/colorspace/Makefile.am:
* liboil/colorspace/composite.c: use colorspace header
diff --git a/liboil/colorspace/composite_i386.c b/liboil/colorspace/composite_i386.c
index d48375e..e0a7c1b 100644
--- a/liboil/colorspace/composite_i386.c
+++ b/liboil/colorspace/composite_i386.c
@@ -105,25 +105,24 @@ composite_over_argb_mmx (uint32_t *dest, uint32_t *src, int n)
" testl $0xff000000, %%eax\n"
" jz 2f\n"
- " movd %%eax, %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " pshufw $0xff, %%mm1, %%mm0\n"
- " pxor %%mm5, %%mm0\n"
-
- " movd (%0), %%mm3\n"
- " punpcklbw %%mm7, %%mm3\n"
- " pmullw %%mm0, %%mm3\n"
- " paddw %%mm6, %%mm3\n"
- " movq %%mm3, %%mm2\n"
+ " movd %%eax, %%mm0\n"
+ " punpcklbw %%mm7, %%mm0\n"
+ " pshufw $0xff, %%mm0, %%mm1\n"
+ " pxor %%mm5, %%mm1\n"
+
+ " movd (%0), %%mm2\n"
+ " punpcklbw %%mm7, %%mm2\n"
+ " pmullw %%mm1, %%mm2\n"
+ " paddw %%mm6, %%mm2\n"
+ " movq %%mm2, %%mm1\n"
+ " psrlw $8, %%mm1\n"
+ " paddw %%mm1, %%mm2\n"
" psrlw $8, %%mm2\n"
- " paddw %%mm2, %%mm3\n"
- " psrlw $8, %%mm3\n"
- " paddw %%mm1, %%mm3\n"
- " packuswb %%mm3, %%mm3\n"
+ " paddw %%mm0, %%mm2\n"
+ " packuswb %%mm2, %%mm2\n"
- " movd %%mm3, %%eax\n"
- " movl %%eax, (%0)\n"
+ " movd %%mm2, (%0)\n"
"2:\n"
" addl $4, %0\n"
" addl $4, %1\n"
@@ -137,6 +136,212 @@ composite_over_argb_mmx (uint32_t *dest, uint32_t *src, int n)
}
OIL_DEFINE_IMPL_FULL (composite_over_argb_mmx, composite_over_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+/* unroll 2
+ *
+ * Composites n 32-bit pixels from src over dest, in place, handling two
+ * pixels per main-loop iteration.
+ *
+ * dest: pixels that are read, blended and written back.
+ * src:  pixels composited on top; only read.
+ * n:    pixel count.  If n is odd, one pixel is handled first so that the
+ *       main loop always sees whole pairs.  n == 0 takes both early exits
+ *       (labels 2 and 5) and touches no pixel.
+ *
+ * Per 8-bit channel, with a = top byte of the source pixel (presumably
+ * alpha, going by the function name) and t = dest * (255 - a) + 128:
+ *
+ *     dest = saturate_u8 (src + ((t + (t >> 8)) >> 8))
+ *
+ * A single pixel whose top byte is zero is skipped; a pair is skipped only
+ * when both top bytes are zero, otherwise both pixels go through the blend.
+ *
+ * mm7 is the zero register used for byte->word unpacking.  Inside the pair
+ * loop it is borrowed as scratch for the second pixel once all four
+ * unpacks are done, and is cleared again before label 4.
+ *
+ * NOTE(review): the clobber list names only "eax".  The asm stores through
+ * dest and modifies mm0-mm7 and the flags, yet "memory" and "cc" are not
+ * declared - confirm against the GCC extended-asm rules.
+ */
+static void
+composite_over_argb_mmx_2 (uint32_t *dest, uint32_t *src, int n)
+{
+ __asm__ __volatile__ (" pxor %%mm7, %%mm7\n" // mm7 = { 0, 0, 0, 0 }
+ " movl $0x80808080, %%eax\n"
+ " movd %%eax, %%mm6\n" // mm6 = { 128, 128, 128, 128 }
+ " punpcklbw %%mm7, %%mm6\n"
+ " movl $0xffffffff, %%eax\n" // mm5 = { 255, 255, 255, 255 }
+ " movd %%eax, %%mm5\n"
+ " punpcklbw %%mm7, %%mm5\n"
+
+ " testl $0x1, %2\n" // odd n: peel off one pixel first
+ " jz 2f\n"
+
+ " movl (%1), %%eax\n"
+ " testl $0xff000000, %%eax\n" // top byte zero: leave dest untouched
+ " jz 1f\n"
+
+ " movd %%eax, %%mm0\n"
+ " punpcklbw %%mm7, %%mm0\n" // mm0 = src channels widened to words
+ " pshufw $0xff, %%mm0, %%mm1\n" // mm1 = word 3 of src in every lane
+ " pxor %%mm5, %%mm1\n" // mm1 = 255 - a
+
+ " movd (%0), %%mm2\n"
+ " punpcklbw %%mm7, %%mm2\n" // mm2 = dest channels widened to words
+ " pmullw %%mm1, %%mm2\n"
+ " paddw %%mm6, %%mm2\n" // t = dest * (255 - a) + 128
+ " movq %%mm2, %%mm1\n"
+ " psrlw $8, %%mm1\n"
+ " paddw %%mm1, %%mm2\n"
+ " psrlw $8, %%mm2\n" // mm2 = (t + (t >> 8)) >> 8
+
+ " paddw %%mm0, %%mm2\n" // add the source channels
+ " packuswb %%mm2, %%mm2\n" // back to bytes with unsigned saturation
+
+ " movd %%mm2, (%0)\n"
+
+ "1:\n"
+ " addl $4, %0\n"
+ " addl $4, %1\n"
+
+ "2:\n"
+ " shr $1, %2\n" // n = number of pixel pairs
+ " jz 5f\n"
+ "3:\n"
+ " movl (%1), %%eax\n"
+ " orl 4(%1), %%eax\n" // combine both source pixels for one test
+ " testl $0xff000000, %%eax\n"
+ " jz 4f\n"
+
+ // Left column of instructions: first pixel (mm0, mm1, mm2).
+ // Interleaved instructions: second pixel (mm3, mm4, mm7).
+ " movd (%1), %%mm0\n"
+ " movd (%0), %%mm2\n"
+
+ " punpcklbw %%mm7, %%mm0\n"
+ " movd 4(%1), %%mm3\n"
+
+ " pshufw $0xff, %%mm0, %%mm1\n"
+ " punpcklbw %%mm7, %%mm2\n"
+
+ " pxor %%mm5, %%mm1\n"
+ " movd 4(%0), %%mm4\n"
+
+ " pmullw %%mm1, %%mm2\n"
+ " punpcklbw %%mm7, %%mm3\n"
+
+ " paddw %%mm6, %%mm2\n"
+ " punpcklbw %%mm7, %%mm4\n" // last use of mm7 as zero in this iteration
+
+ " movq %%mm2, %%mm1\n"
+ " pshufw $0xff, %%mm3, %%mm7\n" // mm7 now scratch: word 3 of second src
+
+ " psrlw $8, %%mm1\n"
+ " pxor %%mm5, %%mm7\n"
+
+ " paddw %%mm1, %%mm2\n"
+ " pmullw %%mm7, %%mm4\n"
+
+ " psrlw $8, %%mm2\n"
+ " paddw %%mm6, %%mm4\n"
+
+ " paddw %%mm0, %%mm2\n"
+ " movq %%mm4, %%mm7\n"
+
+ " packuswb %%mm2, %%mm2\n"
+ " psrlw $8, %%mm7\n"
+
+ " movd %%mm2, (%0)\n"
+ " paddw %%mm7, %%mm4\n"
+
+ " psrlw $8, %%mm4\n"
+ " paddw %%mm3, %%mm4\n"
+ " packuswb %%mm4, %%mm4\n"
+ " movd %%mm4, 4(%0)\n"
+
+ " pxor %%mm7, %%mm7\n" // restore mm7 = 0 for the next iteration
+ "4:\n"
+ " addl $8, %0\n"
+ " addl $8, %1\n"
+ " decl %2\n"
+ " jnz 3b\n"
+ "5:\n"
+ " emms\n" // leave MMX state so x87 code can run afterwards
+ :"+r" (dest), "+r" (src), "+r" (n)
+ :
+ :"eax");
+
+}
+OIL_DEFINE_IMPL_FULL (composite_over_argb_mmx_2, composite_over_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+/* replace pshufw with punpck
+ *
+ * Composites n 32-bit pixels from src over dest, in place, one pixel per
+ * iteration.  The top-byte broadcast is done with movq + punpckhwd +
+ * punpckhdq instead of pshufw, and the implementation is registered with
+ * OIL_IMPL_FLAG_MMX only.
+ *
+ * dest: pixels that are read, blended and written back.
+ * src:  pixels composited on top; only read.
+ * n:    pixel count.
+ *
+ * Per 8-bit channel, with a = top byte of the source pixel (presumably
+ * alpha, going by the function name) and t = dest * (255 - a) + 128:
+ *
+ *     dest = saturate_u8 (src + ((t + (t >> 8)) >> 8))
+ *
+ * Pixels whose source top byte is zero leave dest untouched.
+ *
+ * NOTE(review): the loop body runs before n is tested (decl/jnz at the
+ * bottom, no check at the top), so n <= 0 is not handled here - confirm
+ * that callers never pass it.
+ * NOTE(review): the clobber list names only "eax".  The asm stores through
+ * dest and modifies MMX registers and the flags, yet "memory" and "cc" are
+ * not declared - confirm against the GCC extended-asm rules.
+ */
+static void
+composite_over_argb_mmx_3 (uint32_t *dest, uint32_t *src, int n)
+{
+ __asm__ __volatile__ (" pxor %%mm7, %%mm7\n" // mm7 = { 0, 0, 0, 0 }
+ " movl $0x80808080, %%eax\n"
+ " movd %%eax, %%mm6\n" // mm6 = { 128, 128, 128, 128 }
+ " punpcklbw %%mm7, %%mm6\n"
+ " movl $0xffffffff, %%eax\n" // mm5 = { 255, 255, 255, 255 }
+ " movd %%eax, %%mm5\n"
+ " punpcklbw %%mm7, %%mm5\n"
+ "1:\n"
+ " movl (%1), %%eax\n"
+ " testl $0xff000000, %%eax\n" // top byte zero: leave dest untouched
+ " jz 2f\n"
+
+ " movd %%eax, %%mm0\n"
+ " punpcklbw %%mm7, %%mm0\n" // mm0 = { w0, w1, w2, w3 } src words
+ " movq %%mm0, %%mm1\n"
+ " punpckhwd %%mm1, %%mm1\n" // mm1 = { w2, w2, w3, w3 }
+ " punpckhdq %%mm1, %%mm1\n" // mm1 = { w3, w3, w3, w3 }
+ " pxor %%mm5, %%mm1\n" // mm1 = 255 - a
+
+ " movd (%0), %%mm2\n"
+ " punpcklbw %%mm7, %%mm2\n" // mm2 = dest channels widened to words
+ " pmullw %%mm1, %%mm2\n"
+ " paddw %%mm6, %%mm2\n" // t = dest * (255 - a) + 128
+ " movq %%mm2, %%mm1\n"
+ " psrlw $8, %%mm1\n"
+ " paddw %%mm1, %%mm2\n"
+ " psrlw $8, %%mm2\n" // mm2 = (t + (t >> 8)) >> 8
+
+ " paddw %%mm0, %%mm2\n" // add the source channels
+ " packuswb %%mm2, %%mm2\n" // back to bytes with unsigned saturation
+ " movd %%mm2, (%0)\n"
+
+ "2:\n"
+ " addl $4, %0\n"
+ " addl $4, %1\n"
+ " decl %2\n"
+ " jnz 1b\n"
+ " emms\n" // leave MMX state so x87 code can run afterwards
+ :"+r" (dest), "+r" (src), "+r" (n)
+ :
+ :"eax");
+
+}
+OIL_DEFINE_IMPL_FULL (composite_over_argb_mmx_3, composite_over_argb, OIL_IMPL_FLAG_MMX);
+/*
+ * Composites n 32-bit pixels from src over dest, in place, one pixel per
+ * iteration.  Instead of the copy/shift/add/shift sequence used by the
+ * other variants, the scaled product is reduced with a single pmulhuw by
+ * 257, i.e. the high 16 bits of t * 257.
+ *
+ * dest: pixels that are read, blended and written back.
+ * src:  pixels composited on top; only read.
+ * n:    pixel count.
+ *
+ * Per 8-bit channel, with a = top byte of the source pixel (presumably
+ * alpha, going by the function name) and t = dest * (255 - a) + 128:
+ *
+ *     dest = saturate_u8 (src + ((t * 257) >> 16))
+ *
+ * Pixels whose source top byte is zero leave dest untouched.
+ *
+ * NOTE(review): the loop body runs before n is tested (decl/jnz at the
+ * bottom, no check at the top), so n <= 0 is not handled here - confirm
+ * that callers never pass it.
+ * NOTE(review): the clobber list names only "eax".  The asm stores through
+ * dest and modifies MMX registers and the flags, yet "memory" and "cc" are
+ * not declared - confirm against the GCC extended-asm rules.
+ */
+static void
+composite_over_argb_mmx_4 (uint32_t *dest, uint32_t *src, int n)
+{
+ __asm__ __volatile__ (" pxor %%mm7, %%mm7\n" // mm7 = { 0, 0, 0, 0 }
+ " movl $0x80808080, %%eax\n"
+ " movd %%eax, %%mm6\n" // mm6 = { 128, 128, 128, 128 }
+ " punpcklbw %%mm7, %%mm6\n"
+ " movl $0xffffffff, %%eax\n" // mm5 = { 255, 255, 255, 255 }
+ " movd %%eax, %%mm5\n"
+ " punpcklbw %%mm7, %%mm5\n"
+ " movl $0x02020202, %%eax\n"
+ " movd %%eax, %%mm4\n"
+ " punpcklbw %%mm7, %%mm4\n" // mm4 = { 2, 2, 2, 2 }
+ " paddw %%mm5, %%mm4\n" // mm4 = { 257, 257, 257, 257 }
+ "1:\n"
+ " movl (%1), %%eax\n"
+ " testl $0xff000000, %%eax\n" // top byte zero: leave dest untouched
+ " jz 2f\n"
+
+ " movd %%eax, %%mm0\n"
+ " punpcklbw %%mm7, %%mm0\n" // mm0 = src channels widened to words
+ " pshufw $0xff, %%mm0, %%mm1\n" // mm1 = word 3 of src in every lane
+ " pxor %%mm5, %%mm1\n" // mm1 = 255 - a
+
+ " movd (%0), %%mm2\n"
+ " punpcklbw %%mm7, %%mm2\n" // mm2 = dest channels widened to words
+ " pmullw %%mm1, %%mm2\n"
+ " paddw %%mm6, %%mm2\n" // t = dest * (255 - a) + 128
+ " pmulhuw %%mm4, %%mm2\n" // mm2 = (t * 257) >> 16, unsigned
+
+ " paddw %%mm0, %%mm2\n" // add the source channels
+ " packuswb %%mm2, %%mm2\n" // back to bytes with unsigned saturation
+
+ " movd %%mm2, (%0)\n"
+ "2:\n"
+ " addl $4, %0\n"
+ " addl $4, %1\n"
+ " decl %2\n"
+ " jnz 1b\n"
+ " emms\n" // leave MMX state so x87 code can run afterwards
+ :"+r" (dest), "+r" (src), "+r" (n)
+ :
+ :"eax");
+
+}
+OIL_DEFINE_IMPL_FULL (composite_over_argb_mmx_4, composite_over_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
static void
composite_over_argb_sse2 (uint32_t *dest, uint32_t *src, int n)
{
@@ -147,6 +352,10 @@ composite_over_argb_sse2 (uint32_t *dest, uint32_t *src, int n)
" movl $0xffffffff, %%eax\n" // xmm5 = { 255, 255, 255, 255 }
" movd %%eax, %%xmm5\n"
" punpcklbw %%xmm7, %%xmm5\n"
+ " movl $0x02020202, %%eax\n"
+ " movd %%eax, %%xmm4\n"
+ " punpcklbw %%xmm7, %%xmm4\n"
+ " paddw %%xmm5, %%xmm4\n" // xmm4 = { 257, 257, 257, 257 }
"1:\n"
" movl (%1), %%eax\n"
" testl $0xff000000, %%eax\n"
@@ -155,25 +364,17 @@ composite_over_argb_sse2 (uint32_t *dest, uint32_t *src, int n)
" movd %%eax, %%xmm1\n"
" punpcklbw %%xmm7, %%xmm1\n"
" pshuflw $0xff, %%xmm1, %%xmm0\n"
-#if 1
" pxor %%xmm5, %%xmm0\n"
" movd (%0), %%xmm3\n"
" punpcklbw %%xmm7, %%xmm3\n"
" pmullw %%xmm0, %%xmm3\n"
" paddw %%xmm6, %%xmm3\n"
- " movq %%xmm3, %%xmm2\n"
- " psrlw $8, %%xmm2\n"
- " paddw %%xmm2, %%xmm3\n"
- " psrlw $8, %%xmm3\n"
+ " pmulhuw %%xmm4, %%xmm3\n"
" paddw %%xmm1, %%xmm3\n"
" packuswb %%xmm3, %%xmm3\n"
" movd %%xmm3, %%eax\n"
-#else
- " packuswb %%xmm1, %%xmm1\n"
- " movd %%xmm1, %%eax\n"
-#endif
" movl %%eax, (%0)\n"
"2:\n"