diff options
author | David Schleef <ds@schleef.org> | 2005-08-01 08:51:48 +0000 |
---|---|---|
committer | David Schleef <ds@schleef.org> | 2005-08-01 08:51:48 +0000 |
commit | 48c9a788517b68892535519456a3fb4424408890 (patch) | |
tree | 3282253228fec7a1c0eef51ff525cb0696fb1f1d | |
parent | dcac935729d4072732b60b1a7db40426e51584d4 (diff) | |
download | liboil-48c9a788517b68892535519456a3fb4424408890.tar.gz |
* liboil/colorspace/composite_i386.c: (composite_over_argb_sse2_2),
(composite_over_argb_sse2_3): hacking
* liboil/liboilcpu.c: (oil_cpu_i386_getflags_cpuid): Intel's SSE2
implies MMXEXT
-rw-r--r-- | ChangeLog | 7 | ||||
-rw-r--r-- | liboil/colorspace/composite_i386.c | 268 | ||||
-rw-r--r-- | liboil/liboilcpu.c | 1 |
3 files changed, 276 insertions, 0 deletions
@@ -1,3 +1,10 @@
+2005-08-01 David Schleef <ds@schleef.org>
+
+	* liboil/colorspace/composite_i386.c: (composite_over_argb_sse2_2),
+	(composite_over_argb_sse2_3): hacking
+	* liboil/liboilcpu.c: (oil_cpu_i386_getflags_cpuid): Intel's SSE2
+	implies MMXEXT
+
 2005-07-31 David Schleef <ds@schleef.org>
 
 	* liboil/colorspace/composite_i386.c: (composite_over_argb_mmx_4),
diff --git a/liboil/colorspace/composite_i386.c b/liboil/colorspace/composite_i386.c
index 0e6cd3a..415ec65 100644
--- a/liboil/colorspace/composite_i386.c
+++ b/liboil/colorspace/composite_i386.c
@@ -295,6 +295,7 @@ composite_over_argb_mmx_3 (uint32_t *dest, uint32_t *src, int n)
 }
 OIL_DEFINE_IMPL_FULL (composite_over_argb_mmx_3, composite_over_argb, OIL_IMPL_FLAG_MMX);
 
+/* written for gromit */
 static void
 composite_over_argb_mmx_4 (uint32_t *dest, uint32_t *src, int n)
 {
@@ -440,6 +441,314 @@ composite_over_argb_sse2 (uint32_t *dest, uint32_t *src, int n)
 }
 OIL_DEFINE_IMPL_FULL (composite_over_argb_sse2, composite_over_argb, OIL_IMPL_FLAG_SSE2);
 
+/* written for shaun */
+/*
+ * composite_over_argb_sse2_2:
+ * @dest: destination pixels, updated in place
+ * @src: source pixels
+ * @n: number of pixels
+ *
+ * Per byte: dest = src + dest * (255 - src_alpha) / 255 (rounded), alpha
+ * being the top byte of each 32-bit source word.  The main loop handles
+ * two pixels per iteration; a single pixel is peeled off before or
+ * after it when n is odd.  Pixels (or pairs) whose source alpha bytes are
+ * all 0 are skipped.
+ *
+ * NOTE(review): the constants in xmm4-xmm7 are assumed to survive between
+ * the separate asm statements.
+ */
+static void
+composite_over_argb_sse2_2 (uint32_t *dest, uint32_t *src, int n)
+{
+  int end;
+
+  if (n <= 0)
+    return;
+
+  __asm__ __volatile__ ("  pxor %%xmm7, %%xmm7\n"   // xmm7 = 8 x 0
+      "  movl $0x80808080, %%eax\n"
+      "  movd %%eax, %%xmm6\n"
+      "  punpcklbw %%xmm7, %%xmm6\n"
+      "  punpcklwd %%xmm6, %%xmm6\n"  // xmm6 = 8 x 128
+      "  movl $0xffffffff, %%eax\n"
+      "  movd %%eax, %%xmm5\n"
+      "  punpcklbw %%xmm7, %%xmm5\n"
+      "  punpcklwd %%xmm5, %%xmm5\n"  // xmm5 = 8 x 255
+      "  movl $0x02020202, %%eax\n"
+      "  movd %%eax, %%xmm4\n"
+      "  punpcklbw %%xmm7, %%xmm4\n"
+      "  paddw %%xmm5, %%xmm4\n"
+      "  punpcklwd %%xmm4, %%xmm4\n"  // xmm4 = 8 x 257
+      :
+      :
+      :"eax");
+
+  /* Odd count and dest not 16-byte aligned: peel one pixel off the front. */
+  if (n&1 && ((unsigned long)dest & 0xf)) {
+    __asm__ __volatile__ (
+      "  movl (%1), %%eax\n"
+      "  testl $0xff000000, %%eax\n"
+      "  jz 1f\n"
+
+      "  movd (%1), %%xmm1\n"
+      "  punpcklbw %%xmm7, %%xmm1\n"
+      "  pshuflw $0xff, %%xmm1, %%xmm0\n"
+      "  pxor %%xmm5, %%xmm0\n"
+
+      "  movd (%0), %%xmm3\n"
+      "  punpcklbw %%xmm7, %%xmm3\n"
+      "  pmullw %%xmm0, %%xmm3\n"
+      "  paddw %%xmm6, %%xmm3\n"
+      "  pmulhuw %%xmm4, %%xmm3\n"
+
+      "  paddw %%xmm1, %%xmm3\n"
+      "  packuswb %%xmm3, %%xmm3\n"
+      "  movd %%xmm3, (%0)\n"
+
+      "1:\n"
+      "  addl $4, %0\n"
+      "  addl $4, %1\n"
+      :"+r" (dest), "+r" (src), "+r" (n)
+      :
+      :"eax", "memory");
+    /* The peeled pixel is done; without this the tail below would run
+     * as well and n+1 pixels would be processed. */
+    n--;
+  }
+  end = n&1;
+  n>>=1;
+
+  if (n>0){
+    __asm__ __volatile__ ("\n"
+      "3:\n"
+      "  movl (%1), %%eax\n"
+      "  orl 4(%1), %%eax\n"
+      "  testl $0xff000000, %%eax\n"
+      "  jz 4f\n"
+
+      "  movq (%1), %%xmm1\n"
+      "  punpcklbw %%xmm7, %%xmm1\n"
+      "  pshuflw $0xff, %%xmm1, %%xmm0\n"
+      "  pshufhw $0xff, %%xmm0, %%xmm0\n"
+      "  pxor %%xmm5, %%xmm0\n"
+
+      "  movq (%0), %%xmm3\n"
+      "  punpcklbw %%xmm7, %%xmm3\n"
+      "  pmullw %%xmm0, %%xmm3\n"
+      "  paddw %%xmm6, %%xmm3\n"
+      "  pmulhuw %%xmm4, %%xmm3\n"
+      "  paddw %%xmm1, %%xmm3\n"
+      "  packuswb %%xmm3, %%xmm3\n"
+      "  movq %%xmm3, (%0)\n"
+      "4:\n"
+      "  addl $8, %0\n"
+      "  addl $8, %1\n"
+      "  subl $1, %2\n"
+      "  jnz 3b\n"
+      :"+r" (dest), "+r" (src), "+r" (n)
+      :
+      :"eax", "memory");
+  }
+  if (end) {
+    __asm__ __volatile__ (
+      "  movl (%1), %%eax\n"
+      "  testl $0xff000000, %%eax\n"
+      "  jz 1f\n"
+
+      "  movd (%1), %%xmm1\n"
+      "  punpcklbw %%xmm7, %%xmm1\n"
+      "  pshuflw $0xff, %%xmm1, %%xmm0\n"
+      "  pxor %%xmm5, %%xmm0\n"
+
+      "  movd (%0), %%xmm3\n"
+      "  punpcklbw %%xmm7, %%xmm3\n"
+      "  pmullw %%xmm0, %%xmm3\n"
+      "  paddw %%xmm6, %%xmm3\n"
+      "  pmulhuw %%xmm4, %%xmm3\n"
+
+      "  paddw %%xmm1, %%xmm3\n"
+      "  packuswb %%xmm3, %%xmm3\n"
+      "  movd %%xmm3, (%0)\n"
+
+      "1:\n"
+      "  addl $4, %0\n"
+      "  addl $4, %1\n"
+      :"+r" (dest), "+r" (src), "+r" (n)
+      :
+      :"eax", "memory");
+  }
+
+}
+OIL_DEFINE_IMPL_FULL (composite_over_argb_sse2_2, composite_over_argb, OIL_IMPL_FLAG_SSE2);
+
+/* written for shaun */
+/*
+ * composite_over_argb_sse2_3:
+ * @dest: destination pixels, updated in place
+ * @src: source pixels
+ * @n: number of pixels
+ *
+ * Per byte: dest = src + dest * (255 - src_alpha) / 255 (rounded), alpha
+ * being the top byte of each 32-bit source word.  Runs in three phases:
+ * single pixels until dest is 16-byte aligned, four pixels per iteration
+ * using aligned accesses to dest, then the leftover single pixels.
+ * Pixels (or groups of four) whose source alpha bytes are all 0 are
+ * skipped.
+ *
+ * NOTE(review): the aligned phase assumes dest is at least 4-byte aligned;
+ * the constants in xmm4-xmm7 are assumed to survive between the separate
+ * asm statements.
+ */
+static void
+composite_over_argb_sse2_3 (uint32_t *dest, uint32_t *src, int n)
+{
+  int begin;
+  int middle;
+  int end;
+
+  __asm__ __volatile__ ("  pxor %%xmm7, %%xmm7\n"   // xmm7 = 8 x 0
+      "  movl $0x80808080, %%eax\n"
+      "  movd %%eax, %%xmm6\n"
+      "  punpcklbw %%xmm7, %%xmm6\n"
+      "  punpcklwd %%xmm6, %%xmm6\n"  // xmm6 = 8 x 128
+      "  movl $0xffffffff, %%eax\n"
+      "  movd %%eax, %%xmm5\n"
+      "  punpcklbw %%xmm7, %%xmm5\n"
+      "  punpcklwd %%xmm5, %%xmm5\n"  // xmm5 = 8 x 255
+      "  movl $0x02020202, %%eax\n"
+      "  movd %%eax, %%xmm4\n"
+      "  punpcklbw %%xmm7, %%xmm4\n"
+      "  paddw %%xmm5, %%xmm4\n"
+      "  punpcklwd %%xmm4, %%xmm4\n"  // xmm4 = 8 x 257
+      :
+      :
+      :"eax");
+
+  /* number of leading pixels before dest reaches a 16-byte boundary */
+  begin = 0x3 & (4 - (((unsigned long)dest & 0xf) >> 2));
+  if (begin>n) {
+    begin = n;
+    middle = 0;
+    end = 0;
+  } else {
+    middle = (n-begin)>>2;
+    end = n - begin - middle*4;
+  }
+
+  if (begin>0) {
+    __asm__ __volatile__ ("\n"
+      "1:\n"
+      "  movl (%1), %%eax\n"
+      "  testl $0xff000000, %%eax\n"
+      "  jz 2f\n"
+
+      "  movd (%1), %%xmm1\n"
+      "  punpcklbw %%xmm7, %%xmm1\n"
+      "  pshuflw $0xff, %%xmm1, %%xmm0\n"
+      "  pxor %%xmm5, %%xmm0\n"
+
+      "  movd (%0), %%xmm3\n"
+      "  punpcklbw %%xmm7, %%xmm3\n"
+      "  pmullw %%xmm0, %%xmm3\n"
+      "  paddw %%xmm6, %%xmm3\n"
+      "  pmulhuw %%xmm4, %%xmm3\n"
+
+      "  paddw %%xmm1, %%xmm3\n"
+      "  packuswb %%xmm3, %%xmm3\n"
+      "  movd %%xmm3, (%0)\n"
+
+      "2:\n"
+      "  addl $4, %0\n"
+      "  addl $4, %1\n"
+      "  subl $1, %2\n"
+      "  jnz 1b\n"
+      :"+r" (dest), "+r" (src), "+r" (begin)
+      :
+      :"eax", "memory");
+  }
+
+  if (middle>0){
+    __asm__ __volatile__ ("\n"
+      "1:\n"
+      "  movdqu (%1), %%xmm1\n"
+      "  movl (%1), %%eax\n"
+      "  orl 4(%1), %%eax\n"
+      "  orl 8(%1), %%eax\n"
+      "  orl 12(%1), %%eax\n"
+      "  test $0xff000000, %%eax\n"
+      "  jz 2f\n"
+      "  movdqa %%xmm1, %%xmm0\n"
+      "  punpcklbw %%xmm7, %%xmm1\n"
+      "  punpckhbw %%xmm7, %%xmm0\n"
+      "  pshuflw $0xff, %%xmm1, %%xmm1\n"
+      "  pshuflw $0xff, %%xmm0, %%xmm0\n"
+      "  pshufhw $0xff, %%xmm1, %%xmm1\n"
+      "  pshufhw $0xff, %%xmm0, %%xmm0\n"
+
+      "  pxor %%xmm5, %%xmm1\n"
+      "  pxor %%xmm5, %%xmm0\n"
+
+      "  movdqa (%0), %%xmm3\n"
+      "  movdqa %%xmm3, %%xmm2\n"
+      "  punpcklbw %%xmm7, %%xmm3\n"
+      "  punpckhbw %%xmm7, %%xmm2\n"
+
+      "  pmullw %%xmm1, %%xmm3\n"
+      "  paddw %%xmm6, %%xmm3\n"
+      "  pmulhuw %%xmm4, %%xmm3\n"
+      "  pmullw %%xmm0, %%xmm2\n"
+      "  paddw %%xmm6, %%xmm2\n"
+      "  pmulhuw %%xmm4, %%xmm2\n"
+      "  packuswb %%xmm2, %%xmm3\n"
+
+      "  movdqu (%1), %%xmm1\n"
+      /* saturating add, as the single-pixel paths do via paddw + packuswb */
+      "  paddusb %%xmm1, %%xmm3\n"
+      "  movdqa %%xmm3, (%0)\n"
+      "2:\n"
+      "  addl $16, %0\n"
+      "  addl $16, %1\n"
+      "  subl $1, %2\n"
+      "  jnz 1b\n"
+      :"+r" (dest), "+r" (src), "+r" (middle)
+      :
+      :"eax", "memory");
+  }
+  if (end>0) {
+    __asm__ __volatile__ ("\n"
+      "1:\n"
+      "  movl (%1), %%eax\n"
+      "  testl $0xff000000, %%eax\n"
+      "  jz 2f\n"
+
+      "  movd (%1), %%xmm1\n"
+      "  punpcklbw %%xmm7, %%xmm1\n"
+      "  pshuflw $0xff, %%xmm1, %%xmm0\n"
+      "  pxor %%xmm5, %%xmm0\n"
+
+      "  movd (%0), %%xmm3\n"
+      "  punpcklbw %%xmm7, %%xmm3\n"
+      "  pmullw %%xmm0, %%xmm3\n"
+      "  paddw %%xmm6, %%xmm3\n"
+      "  pmulhuw %%xmm4, %%xmm3\n"
+
+      "  paddw %%xmm1, %%xmm3\n"
+      "  packuswb %%xmm3, %%xmm3\n"
+      "  movd %%xmm3, (%0)\n"
+
+      "2:\n"
+      "  addl $4, %0\n"
+      "  addl $4, %1\n"
+      "  subl $1, %2\n"
+      "  jnz 1b\n"
+      :"+r" (dest), "+r" (src), "+r" (end)
+      :
+      :"eax", "memory");
+  }
+
+}
+OIL_DEFINE_IMPL_FULL (composite_over_argb_sse2_3, composite_over_argb, OIL_IMPL_FLAG_SSE2);
+
 #if 0
 static void
 composite_over_argb_const_src_mmx (uint32_t *dest, uint32_t *src, int n)
diff --git a/liboil/liboilcpu.c b/liboil/liboilcpu.c
index 54968c4..1e33f68 100644
--- a/liboil/liboilcpu.c
+++ b/liboil/liboilcpu.c
@@ -180,6 +180,7 @@ oil_cpu_i386_getflags_cpuid (void)
   }
   if (edx & (1<<26)) {
     oil_cpu_flags |= OIL_IMPL_FLAG_SSE2;
+    oil_cpu_flags |= OIL_IMPL_FLAG_MMXEXT;
   }
   if (ecx & (1<<0)) {
     oil_cpu_flags |= OIL_IMPL_FLAG_SSE3;