summaryrefslogtreecommitdiff
path: root/patches
diff options
context:
space:
mode:
authorDavid Schleef <ds@schleef.org>2005-12-22 22:04:54 +0000
committerDavid Schleef <ds@schleef.org>2005-12-22 22:04:54 +0000
commit4e381b38abceb843eb6dfd88f734a5c8a38b0b71 (patch)
treedcd40e81158778ce3eab664375fa85dadf1e7e80 /patches
parenta5ebcb17227fa00f782d9e8a00264f90e1116906 (diff)
downloadliboil-4e381b38abceb843eb6dfd88f734a5c8a38b0b71.tar.gz
* .cvsignore:
* BUGS: * doc/.cvsignore: * liboil/mmx/Makefile.am: * liboil/mmx/fbmmx.c: * patches/divide.c: * patches/nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP.S: * patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP.S: * patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM.S: * patches/nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P.S: * patches/patch-small-lib-2: * testsuite/Makefile.am: * testsuite/list_impls.c: Clean up local source tree. Put spare files in the place where spare files go.
Diffstat (limited to 'patches')
-rw-r--r--patches/divide.c177
-rw-r--r--patches/nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP.S125
-rw-r--r--patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP.S231
-rw-r--r--patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM.S414
-rw-r--r--patches/nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P.S227
-rw-r--r--patches/patch-small-lib-278
6 files changed, 1252 insertions, 0 deletions
diff --git a/patches/divide.c b/patches/divide.c
new file mode 100644
index 0000000..bb4be16
--- /dev/null
+++ b/patches/divide.c
@@ -0,0 +1,177 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2005 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <math.h>
+
+#include <liboil/liboil.h>
+#include <liboil/liboilfunction.h>
+#include <liboil/liboiltest.h>
+#include <liboil/liboilrandom.h>
+#include <stdio.h>
+
+
+/* Test-data setup: random non-zero divisors in src2; src1 = random 32-bit quotient * src2, so every exact quotient fits in 32 bits. */
+static void
+divide_u32_u64_u32_test (OilTest *test)
+{
+ int i;
+ int n;
+ uint64_t *src1;
+ uint32_t *src2;
+
+ src1 = (uint64_t *)oil_test_get_source_data(test, OIL_ARG_SRC1);
+ src2 = (uint32_t *)oil_test_get_source_data(test, OIL_ARG_SRC2);
+ n = test->n; /* was p1->post_n, but no p1 is declared; NOTE(review): confirm OilTest.n is the element count */
+ for(i=0;i<n;i++){
+ do { src2[i] = oil_rand_u32(); } while (src2[i] == 0); /* a zero divisor makes the reference '/' undefined */
+ src1[i] = oil_rand_u32() * (uint64_t)src2[i];
+ }
+}
+OIL_DEFINE_CLASS_FULL (divide_u32_u64_u32, "uint32_t *dest, uint64_t *src1, uint32_t *src2, int n",
+ divide_u32_u64_u32_test);
+
+
+/* Reference implementation: each dest element is the low 32 bits of src1 / src2 (plain C division, so src2 must be non-zero). */
+static void
+divide_u32_u64_u32_ref (uint32_t *dest, uint64_t *src1, uint32_t *src2, int n)
+{
+ int remaining;
+
+ for (remaining = n; remaining > 0; remaining--) {
+ *dest++ = (uint32_t)(*src1++ / *src2++);
+ }
+}
+OIL_DEFINE_IMPL_REF (divide_u32_u64_u32_ref, divide_u32_u64_u32);
+
+/* Shift-and-subtract long division, one quotient bit per step from bit 31 down to bit 0. */
+/* Equals the reference only when src1[i] / src2[i] fits in 32 bits; src2[i] == 0 yields 0xffffffff instead of trapping. */
+static void
+divide_u32_u64_u32_long (uint32_t *dest, uint64_t *src1, uint32_t *src2, int n)
+{
+ int i, bit;
+ uint64_t a;
+ uint32_t b;
+ uint32_t c;
+
+ for(i=0;i<n;i++){
+ a = src1[i];
+ b = src2[i];
+ c = 0;
+ for (bit = 31; bit >= 0; bit--) {
+ if (a >= ((uint64_t)b)<<bit) {
+ c |= (uint32_t)1<<bit; /* unsigned constant: 1<<31 overflows a signed int */
+ a -= ((uint64_t)b)<<bit;
+ }
+ }
+ dest[i] = c;
+ }
+}
+OIL_DEFINE_IMPL (divide_u32_u64_u32_long, divide_u32_u64_u32);
+
+/* Returns the bitwise OR of the indices of all set bits of x: log2(x) for a power of two, an over-estimate otherwise (6 -> 3), 0 for x == 0. */
+static int
+binlog(uint32_t x)
+{
+ static const uint32_t index_bit_mask[5] = { 0xaaaaaaaa, 0xcccccccc, 0xf0f0f0f0, 0xff00ff00, 0xffff0000 };
+ int k, y = 0;
+ for (k = 0; k < 5; k++) {
+ if (x & index_bit_mask[k]) y |= 1 << k; /* some set bit has bit k in its index */
+ }
+ return y;
+}
+
+/* Long division with a head start: one 32-bit hardware divide gives a first quotient estimate, bitwise long division adds the rest. */
+/* Equals the reference when src1[i] / src2[i] fits in 32 bits; src2[i] must be non-zero (d/b below divides by it). */
+static void
+divide_u32_u64_u32_long2 (uint32_t *dest, uint64_t *src1, uint32_t *src2, int n)
+{
+ int i, bit, shift;
+ uint64_t a;
+ uint32_t b;
+ uint32_t c;
+ uint32_t d;
+
+ for(i=0;i<n;i++){
+ a = src1[i];
+ b = src2[i];
+
+ /* Estimate: scale a down by 2^shift, divide in 32 bits, scale back up. */
+ /* Truncation in d or in the shifted result only makes c smaller, so c*b never exceeds a. */
+ shift = binlog(b);
+ d = (uint32_t)(a>>shift);
+ c = (d/b) << shift;
+ a -= (uint64_t)c * b;
+
+ /* Finish the remainder bit by bit; stop early once nothing is left. */
+ if (a > 0) {
+ for (bit = 31; bit >= 0; bit--) {
+ if (a >= ((uint64_t)b)<<bit) {
+ c += (uint32_t)1<<bit; /* unsigned constant: 1<<31 overflows a signed int */
+ a -= ((uint64_t)b)<<bit;
+ }
+ if (a==0) break;
+ }
+ }
+ dest[i] = c;
+ }
+}
+OIL_DEFINE_IMPL (divide_u32_u64_u32_long2, divide_u32_u64_u32);
+/* x86-only implementation: one hardware DIV per element. DIV faults if src2[i] is 0 or the quotient does not fit in 32 bits. */
+static void
+divide_u32_u64_u32_asm (uint32_t *dest, uint64_t *src1, uint32_t *src2, int n)
+{
+ int i;
+ uint32_t a;
+ uint32_t d;
+/* a and d carry the low and high 32 bits of the dividend into the "a" and "d" register constraints. */
+ for(i=0;i<n;i++){
+ a = src1[i];
+ d = (src1[i]>>32);
+/* DIV divides the 64-bit value d:a by operand %2; the quotient comes back in a, the remainder in d. */
+ __asm__ __volatile__ ("\n"
+ " div %2\n"
+ : "+a" (a), "+d" (d)
+ : "r" (src2[i]));
+/* Only the quotient is kept; the remainder left in d is discarded. */
+ dest[i] = a;
+ }
+
+}
+OIL_DEFINE_IMPL (divide_u32_u64_u32_asm, divide_u32_u64_u32);
+
+
+
+
+
+
+
+
+
diff --git a/patches/nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP.S b/patches/nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP.S
new file mode 100644
index 0000000..db2cbec
--- /dev/null
+++ b/patches/nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP.S
@@ -0,0 +1,125 @@
+ .file "nr-compose.c"
+
+# Ensure Inkscape is execshield protected
+ .section .note.GNU-stack
+ .previous
+
+ .text
+ .align 2
+.globl nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP
+ .type nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP,@function
+
+/*
+ * This code is in public domain
+ * Writes the 4-byte colour at c, scaled by an 8-bit mask read from spx (1 byte/pixel), into px (4 bytes/pixel); pixels whose mask byte is 0 are skipped.
+ * c 32(%ebp)
+ * srs 28(%ebp)
+ * spx 24(%ebp)
+ * rs 20(%ebp)
+ * h 16(%ebp)
+ * w 12(%ebp)
+ * px 8(%ebp)
+ * r -8(%ebp)
+ * g -12(%ebp)
+ * b -16(%ebp)
+ * a -20(%ebp)
+ * s -24(%ebp) -> %esi
+ * d -28(%ebp) -> %edi
+ * x -32(%ebp) -> %ebx
+ * y -36(%ebp) -> %ecx
+ * ca -40(%ebp)
+ * Only the argument slots (positive offsets) are referenced below; the local slots are reserved by subl but never touched.
+ * mm0 Fg
+ * mm1 [m m m m] (mask byte in all four words)
+ * mm2 Fg * m / 255
+ * mm3 scratch
+ * mm4
+ * mm5
+ * mm6 128
+ * mm7 0
+ * Both loops test their counter after the body (decl/jnz), so w == 0 or h == 0 would wrap around instead of doing nothing.
+*/
+
+nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP:
+ pushl %ebp
+ movl %esp, %ebp
+ pushl %ebx
+ subl $36, %esp
+ pushl %edi
+ pushl %esi
+
+/* Load %mm7 with [0 0 0 0] */
+ movl $0, %eax
+ movd %eax, %mm7
+
+/* Load %mm6 with [128 128 128 128] (rounding bias for the divide-by-255 below) */
+ movl $0x80808080, %eax
+ movd %eax, %mm6
+ punpcklbw %mm7, %mm6
+
+/* FgC -> %mm0: the four bytes at c, each widened to a 16-bit word */
+ movl 32(%ebp), %eax
+ movd (%eax), %mm0
+ punpcklbw %mm7, %mm0
+
+/* for (y = ...): %ecx counts the h rows down to zero */
+ movl 16(%ebp), %ecx
+.fory:
+
+/* d = px */
+/* s = spx */
+ movl 8(%ebp), %edi
+ movl 24(%ebp), %esi
+
+/* for (x = ...): %ebx counts the w pixels of this row down to zero */
+ movl 12(%ebp), %ebx
+.forx:
+
+/* [m m m m] -> %mm1; a zero mask byte skips the pixel entirely */
+ movzbl (%esi), %eax
+ testb $0xff, %al
+ jz .clip
+ movd %eax, %mm1
+ punpcklwd %mm1, %mm1
+ punpckldq %mm1, %mm1
+
+/* Fg -> mm2: t = Fg * m + 128, then (t + (t >> 8)) >> 8, i.e. Fg * m / 255 with rounding */
+ movq %mm0, %mm2
+ pmullw %mm1, %mm2
+ paddw %mm6, %mm2
+ movq %mm2, %mm3
+ psrlw $8, %mm3
+ paddw %mm3, %mm2
+ psrlw $8, %mm2
+
+/* Store pixel: pack the four words back to bytes and write 4 bytes at d */
+ packuswb %mm2, %mm2
+ movd %mm2, (%edi)
+
+.clip:
+ addl $4, %edi
+ incl %esi
+
+ decl %ebx
+ jnz .forx
+/* Next row: advance the px and spx argument slots in place by rs and srs */
+ movl 20(%ebp), %eax
+ addl %eax, 8(%ebp)
+ movl 28(%ebp), %eax
+ addl %eax, 24(%ebp)
+
+ decl %ecx
+ jnz .fory
+/* emms clears the MMX state before returning */
+.exit:
+ emms
+ popl %esi
+ popl %edi
+ addl $36, %esp
+ popl %ebx
+ popl %ebp
+ ret
+
+.Lfe1:
+ .size nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP,.Lfe1-nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP
+ .ident "GCC: (GNU) 3.2"
diff --git a/patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP.S b/patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP.S
new file mode 100644
index 0000000..fe1d9be
--- /dev/null
+++ b/patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP.S
@@ -0,0 +1,231 @@
+ .file "nr-compose.c"
+
+# Ensure Inkscape is execshield protected
+ .section .note.GNU-stack
+ .previous
+
+ .text
+ .align 2
+.globl nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP
+ .type nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP,@function
+
+/*
+ * This code is in public domain
+ * Blends the 4-byte colour at c, scaled by an 8-bit mask read from spx (1 byte/pixel), over the existing pixels in px (4 bytes/pixel); mask 0 skips the pixel.
+ * c 32(%ebp)
+ * srs 28(%ebp)
+ * spx 24(%ebp)
+ * rs 20(%ebp)
+ * h 16(%ebp)
+ * w 12(%ebp)
+ * px 8(%ebp)
+ * r -8(%ebp)
+ * g -12(%ebp)
+ * b -16(%ebp)
+ * a -20(%ebp)
+ * s -24(%ebp) -> %esi
+ * d -28(%ebp) -> %edi
+ * x -32(%ebp) -> %ebx
+ * y -36(%ebp) -> %ecx
+ * ca -40(%ebp)
+ * Only the argument slots (positive offsets) are referenced below; the local slots are reserved by subl but never touched.
+ * mm0 Fg
+ * mm1 MMMM
+ * mm2 FgM
+ * mm3
+ * mm4
+ * mm5 255
+ * mm6 128
+ * mm7 0
+ * Both loops test their counter after the body (decl/jnz), so w == 0 or h == 0 would wrap around instead of doing nothing.
+*/
+
+nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP:
+ pushl %ebp
+ movl %esp, %ebp
+ pushl %ebx
+ subl $36, %esp
+ pushl %edi
+ pushl %esi
+
+/* Load %mm7 with [0 0 0 0] */
+ movl $0, %eax
+ movd %eax, %mm7
+
+/* Load %mm6 with [128 128 128 128] (rounding bias for the divide-by-255 steps) */
+ movl $0x80808080, %eax
+ movd %eax, %mm6
+ punpcklbw %mm7, %mm6
+
+/* Load %mm5 with [255 255 255 255] */
+ movl $0xffffffff, %eax
+ movd %eax, %mm5
+ punpcklbw %mm7, %mm5
+
+/* FgC -> %mm0: the four bytes at c, each widened to a 16-bit word */
+ movl 32(%ebp), %eax
+ movd (%eax), %mm0
+ punpcklbw %mm7, %mm0
+
+/* Check full opacity: test the colour's alpha byte c[3]; %eax still holds the pointer c, so %al would only be its low address byte */
+ cmpb $0xff, 3(%eax)
+ jz .opaque
+
+/* for (y = ...): %ecx counts the h rows down to zero */
+ movl 16(%ebp), %ecx
+.fory:
+
+/* d = px */
+/* s = spx */
+ movl 8(%ebp), %edi
+ movl 24(%ebp), %esi
+
+/* for (x = ...): %ebx counts the w pixels of this row down to zero */
+ movl 12(%ebp), %ebx
+.forx:
+
+/* [m m m m] -> %mm1; a zero mask byte skips the pixel entirely */
+ movzbl (%esi), %eax
+ testb $0xff, %al
+ jz .clip
+ movd %eax, %mm1
+ punpcklwd %mm1, %mm1
+ punpckldq %mm1, %mm1
+
+/* Fg -> mm2: t = Fg * m + 128, then (t + (t >> 8)) >> 8, i.e. Fg * m / 255 with rounding */
+ movq %mm0, %mm2
+ pmullw %mm1, %mm2
+ paddw %mm6, %mm2
+ movq %mm2, %mm3
+ psrlw $8, %mm3
+ paddw %mm3, %mm2
+ psrlw $8, %mm2
+
+/* [255 - FgA] -> mm1: broadcast the top word (alpha) of mm2, then xor with 255 */
+ movq %mm2, %mm1
+ punpckhwd %mm1, %mm1
+ punpckhdq %mm1, %mm1
+ pxor %mm5, %mm1
+
+/* Bg -> mm3 */
+ movd (%edi), %mm3
+ punpcklbw %mm7, %mm3
+
+/* Fg + ((255 - FgA) * Bg) / 255 */
+ pmullw %mm1, %mm3
+ paddw %mm6, %mm3
+ movq %mm3, %mm4
+ psrlw $8, %mm4
+ paddw %mm4, %mm3
+ psrlw $8, %mm3
+ paddw %mm2, %mm3
+
+/* Store pixel */
+ packuswb %mm3, %mm3
+ movd %mm3, (%edi)
+
+.clip:
+ addl $4, %edi
+ incl %esi
+
+ decl %ebx
+ jnz .forx
+/* Next row: advance the px and spx argument slots in place by rs and srs */
+ movl 20(%ebp), %eax
+ addl %eax, 8(%ebp)
+ movl 28(%ebp), %eax
+ addl %eax, 24(%ebp)
+
+ decl %ecx
+ jnz .fory
+/* emms clears the MMX state before returning */
+.exit:
+ emms
+ popl %esi
+ popl %edi
+ addl $36, %esp
+ popl %ebx
+ popl %ebp
+ ret
+/* Fast path for a fully opaque colour: a mask byte of 255 stores Fg directly without reading the background */
+.opaque:
+/* for (y = ...) */
+ movl 16(%ebp), %ecx
+.o_fory:
+
+/* d = px */
+/* s = spx */
+ movl 8(%ebp), %edi
+ movl 24(%ebp), %esi
+
+/* for (x = ...) */
+ movl 12(%ebp), %ebx
+.o_forx:
+
+/* [m m m m] -> %mm1; mask 0 skips the pixel, mask 255 takes the direct store at .o_full */
+ movzbl (%esi), %eax
+ testb $0xff, %al
+ jz .o_clip
+ cmpb $0xff, %al
+ jz .o_full
+ movd %eax, %mm1
+ punpcklwd %mm1, %mm1
+ punpckldq %mm1, %mm1
+
+/* Fg -> mm2: Fg * m / 255 with rounding */
+ movq %mm0, %mm2
+ pmullw %mm1, %mm2
+ paddw %mm6, %mm2
+ movq %mm2, %mm3
+ psrlw $8, %mm3
+ paddw %mm3, %mm2
+ psrlw $8, %mm2
+
+/* [255 - FgA] -> mm1 */
+ movq %mm2, %mm1
+ punpckhwd %mm1, %mm1
+ punpckhdq %mm1, %mm1
+ pxor %mm5, %mm1
+
+/* Bg -> mm3 */
+ movd (%edi), %mm3
+ punpcklbw %mm7, %mm3
+
+/* Fg + ((255 - FgA) * Bg) / 255 */
+ pmullw %mm1, %mm3
+ paddw %mm6, %mm3
+ movq %mm3, %mm4
+ psrlw $8, %mm4
+ paddw %mm4, %mm3
+ psrlw $8, %mm3
+ paddw %mm2, %mm3
+
+ jmp .o_store
+
+.o_full:
+ movq %mm0, %mm3
+
+.o_store:
+/* Store pixel */
+ packuswb %mm3, %mm3
+ movd %mm3, (%edi)
+
+.o_clip:
+ addl $4, %edi
+ incl %esi
+
+ decl %ebx
+ jnz .o_forx
+
+ movl 20(%ebp), %eax
+ addl %eax, 8(%ebp)
+ movl 28(%ebp), %eax
+ addl %eax, 24(%ebp)
+
+ decl %ecx
+ jnz .o_fory
+ jmp .exit
+
+.Lfe1:
+ .size nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP,.Lfe1-nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP
+ .ident "GCC: (GNU) 3.2"
diff --git a/patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM.S b/patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM.S
new file mode 100644
index 0000000..e30056a
--- /dev/null
+++ b/patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM.S
@@ -0,0 +1,414 @@
+ .file "nr-compose-transform.c"
+
+# Ensure Inkscape is execshield protected
+ .section .note.GNU-stack
+ .previous
+
+ .text
+ .align 2
+.globl nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM_0
+ .type nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM_0,@function
+
+/*
+ * This code is in public domain
+ * For every destination pixel, fetches one source pixel at a stepped fixed-point coordinate (12 fractional bits), scales it by its own alpha and by a global alpha, and blends it over the destination.
+ */
+/* Stack arguments as used below: 8 dest, 12 pixels per row, 16 rows, 20 dest row stride, 24 source base, 28/32 source x/y limits, 36 source row stride, 40 pointer to six 32-bit steps/origins, 44 alpha byte. */
+nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM_0:
+ pushl %ebp
+ movl %esp, %ebp
+ pushl %ebx
+ subl $48, %esp
+ pushl %edi
+ pushl %esi
+
+/* Load %mm7 with [0 0 0 0] */
+ movl $0, %eax
+ movd %eax, %mm7
+
+/* Load %mm6 with [128 128 128 128] (rounding bias for the divide-by-255 steps) */
+ movl $0x80808080, %eax
+ movd %eax, %mm6
+ punpcklbw %mm7, %mm6
+
+/* Load %mm5 with [255 255 255 255] */
+ movl $0xffffffff, %eax
+ movd %eax, %mm5
+ punpcklbw %mm7, %mm5
+
+/* Load %mm0 with [a a a a] (the alpha byte argument at 44(%ebp)) */
+ movzbl 44(%ebp), %eax
+ movd %eax, %mm0
+ punpcklwd %mm0, %mm0
+ punpckldq %mm0, %mm0
+/* Locals: -8 = destination row pointer, -12 / -16 = row-start source x / y (entries 4 and 5 of the array at 40(%ebp)), -24 = row counter */
+ movl 8(%ebp), %eax
+ movl %eax, -8(%ebp)
+ movl 40(%ebp), %eax
+ addl $16, %eax
+ movl (%eax), %eax
+ movl %eax, -12(%ebp)
+ movl 40(%ebp), %eax
+ addl $20, %eax
+ movl (%eax), %eax
+ movl %eax, -16(%ebp)
+ movl $0, -24(%ebp)
+.L29:
+ movl -24(%ebp), %eax
+ cmpl 16(%ebp), %eax
+ jl .L32
+ jmp .L28
+.L32:
+ movl -8(%ebp), %edi
+/* Per row: %esi = running source x, -36(%ebp) = running source y, %ebx = pixels left (tested after the body, so a count of 0 would wrap) */
+ movl -12(%ebp), %eax
+ movl %eax, %esi
+ movl -16(%ebp), %eax
+ movl %eax, -36(%ebp)
+
+ movl 12(%ebp), %ebx
+.for_x_0:
+/* Skip the pixel when x is negative or its integer part is >= 28(%ebp); otherwise %ecx = 4 * integer x */
+ movl %esi, %ecx
+ cmpl $0, %ecx
+ js .clip_0
+ sarl $12, %ecx
+ cmpl 28(%ebp), %ecx
+ jge .clip_0
+ shll $2, %ecx
+/* Same test for y against 32(%ebp); %eax = integer y * source row stride */
+ movl -36(%ebp), %eax
+ cmpl $0, %eax
+ js .clip_0
+ sarl $12, %eax
+ cmpl 32(%ebp), %eax
+ jge .clip_0
+ imull 36(%ebp), %eax
+/* Address of the source pixel */
+ addl %ecx, %eax
+ addl 24(%ebp), %eax
+
+/* Fg -> %mm1; a source pixel whose top (alpha) byte is zero is skipped */
+ movl (%eax), %eax
+ testl $0xff000000, %eax
+ jz .clip_0
+ movd %eax, %mm1
+ punpcklbw %mm7, %mm1
+
+/* [a a a 255] -> %mm3: a * 0x10101 copies the alpha byte into the three low bytes */
+ shrl $24, %eax
+ movl $0x10101, %edx
+ mull %edx
+ orl $0xff000000, %eax
+ movd %eax, %mm3
+ punpcklbw %mm7, %mm3
+
+/* [Fg * a] -> mm1: colour channels scaled by the pixel's alpha, alpha channel kept (x * 255 / 255) */
+ pmullw %mm3, %mm1
+ paddw %mm6, %mm1
+ movq %mm1, %mm4
+ psrlw $8, %mm4
+ paddw %mm4, %mm1
+ psrlw $8, %mm1
+
+/* Multiply by alpha (the global value in %mm0) */
+ pmullw %mm0, %mm1
+ paddw %mm6, %mm1
+ movq %mm1, %mm4
+ psrlw $8, %mm4
+ paddw %mm4, %mm1
+ psrlw $8, %mm1
+
+/* [255 - FgA] -> mm2 */
+ movq %mm1, %mm2
+ punpckhwd %mm2, %mm2
+ punpckhdq %mm2, %mm2
+ pxor %mm5, %mm2
+
+/* Bg -> mm3 */
+ movd (%edi), %mm3
+ punpcklbw %mm7, %mm3
+
+/* Fg + ((255 - FgA) * Bg) / 255 */
+
+ pmullw %mm2, %mm3
+ paddw %mm6, %mm3
+ movq %mm3, %mm4
+ psrlw $8, %mm4
+ paddw %mm4, %mm3
+ psrlw $8, %mm3
+ paddw %mm1, %mm3
+
+/* Store pixel */
+ packuswb %mm3, %mm3
+ movd %mm3, (%edi)
+/* Step to the next destination pixel: x += entry 0, y += entry 1 of the array at 40(%ebp) */
+.clip_0:
+.L37:
+ movl 40(%ebp), %ecx
+ movl (%ecx), %edx
+ addl %edx, %esi
+ movl 4(%ecx), %edx
+ addl %edx, -36(%ebp)
+
+ addl $4, %edi
+
+ decl %ebx
+ jnz .for_x_0
+/* Step to the next row: row-start x += entry 2, row-start y += entry 3 (%ecx still points at the array), dest += row stride */
+.L34:
+ movl 8(%ecx), %edx
+ addl %edx, -12(%ebp)
+ movl 12(%ecx), %edx
+ addl %edx, -16(%ebp)
+
+ movl 20(%ebp), %edx
+ leal -8(%ebp), %eax
+ addl %edx, (%eax)
+ leal -24(%ebp), %eax
+ incl (%eax)
+ jmp .L29
+.L28:
+ emms
+ popl %esi
+ popl %edi
+ addl $48, %esp
+ popl %ebx
+ popl %ebp
+ ret
+.Lfe2:
+ .size nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM_0,.Lfe2-nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM_0
+
+/*
+ * Supersampling variant: for every destination pixel, sums 1 << dbits source samples taken at offsets from the FF_S table, shifts the sum right by dbits, scales by alpha and blends over the destination.
+ * dbits 52(%ebp)
+ * alpha 48(%ebp)
+ * FF_S 44(%ebp)
+ * Other arguments as used below: 8 dest, 12 pixels per row, 16 rows, 20 dest row stride, 24 source base, 28/32 source x/y limits, 36 source row stride, 40 pointer to six 32-bit steps/origins.
+ * d -32(%ebp) -> %edi
+ * i -60(%ebp) -> %esi
+ * sx -64(%ebp) -> %ebx
+ * sy -68(%ebp)
+ * s -72(%ebp)
+ * Slots actually used: -8 sample count, -12 dest row, -16/-20 row-start x/y, -24 x counter, -28 y counter, -32 d, -36/-40 current x/y.
+ * %mm0 a a a a
+ * %mm1 FgA
+ * %mm2 SumFgA
+ * %mm3 a a a 255
+ * %mm4
+*/
+
+ .align 2
+.globl nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM_n
+ .type nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM_n,@function
+nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM_n:
+ pushl %ebp
+ movl %esp, %ebp
+ pushl %ebx
+ subl $72, %esp
+ pushl %edi
+ pushl %esi
+
+/* Load %mm7 with [0 0 0 0] */
+ movl $0, %eax
+ movd %eax, %mm7
+
+/* Load %mm6 with [128 128 128 128] (rounding bias for the divide-by-255 steps) */
+ movl $0x80808080, %eax
+ movd %eax, %mm6
+ punpcklbw %mm7, %mm6
+
+/* Load %mm5 with [255 255 255 255] */
+ movl $0xffffffff, %eax
+ movd %eax, %mm5
+ punpcklbw %mm7, %mm5
+
+/* Load %mm0 with [a a a a] */
+ movzbl 48(%ebp), %eax
+ movd %eax, %mm0
+ punpcklwd %mm0, %mm0
+ punpckldq %mm0, %mm0
+/* -8 = 1 << dbits (sample count); -12 = dest row pointer; -16 / -20 = row-start x / y from entries 4 and 5 of the array at 40(%ebp); -28 = row counter */
+ movl $1, %eax
+ movzbl 52(%ebp), %ecx
+ sall %cl, %eax
+ movl %eax, -8(%ebp)
+ movl 8(%ebp), %eax
+ movl %eax, -12(%ebp)
+ movl 40(%ebp), %eax
+ addl $16, %eax
+ movl (%eax), %eax
+ movl %eax, -16(%ebp)
+ movl 40(%ebp), %eax
+ addl $20, %eax
+ movl (%eax), %eax
+ movl %eax, -20(%ebp)
+ movl $0, -28(%ebp)
+.L44:
+ movl -28(%ebp), %eax
+ cmpl 16(%ebp), %eax
+ jl .L47
+ jmp .exit_n
+.L47:
+ movl -12(%ebp), %eax
+ movl %eax, -32(%ebp)
+ movl -16(%ebp), %eax
+ movl %eax, -36(%ebp)
+ movl -20(%ebp), %eax
+ movl %eax, -40(%ebp)
+ movl $0, -24(%ebp)
+.L48:
+ movl -24(%ebp), %eax
+ cmpl 12(%ebp), %eax
+ jl .L51
+ jmp .L49
+.L51:
+
+/* Zero accumulator */
+ movq %mm7, %mm2
+
+/* Set i to dptr (size - 1): byte offset of the last 8-byte entry of the FF_S table */
+ movl -8(%ebp), %esi
+ sub $1, %esi
+ shll $3, %esi
+/* %edi = FF_S table base (d is reloaded from -32(%ebp) after the sample loop), %ecx = current x */
+ movl 44(%ebp), %edi
+ movl -36(%ebp), %ecx
+
+.for_i_n:
+ movl (%edi,%esi), %ebx
+ addl %ecx, %ebx
+/* Test negative before shift */
+ cmpl $0, %ebx
+ js .next_i_n
+ sarl $12, %ebx
+ cmpl 28(%ebp), %ebx
+ jge .next_i_n
+/* We multiply sx by 4 here */
+ shll $2, %ebx
+
+ movl 4(%edi,%esi), %eax
+ addl -40(%ebp), %eax
+/* Test negative before shift */
+ cmpl $0, %eax
+ js .next_i_n
+ sarl $12, %eax
+ cmpl 32(%ebp), %eax
+ jge .next_i_n
+/* We multiply sy by srs here */
+ imull 36(%ebp), %eax
+
+ addl %ebx, %eax
+ addl 24(%ebp), %eax
+
+/* Fg -> %mm1; a sample whose top (alpha) byte is zero adds nothing */
+ movl (%eax), %eax
+ testl $0xff000000, %eax
+ jz .next_i_n
+ movd %eax, %mm1
+ punpcklbw %mm7, %mm1
+
+/* [a a a 255] -> %mm3: a * 0x10101 copies the alpha byte into the three low bytes */
+ shrl $24, %eax
+ movl $0x10101, %edx
+ mull %edx
+ orl $0xff000000, %eax
+ movd %eax, %mm3
+ punpcklbw %mm7, %mm3
+
+/* [Fg * a] -> mm1 */
+ pmullw %mm3, %mm1
+ paddw %mm6, %mm1
+ movq %mm1, %mm4
+ psrlw $8, %mm4
+ paddw %mm4, %mm1
+ psrlw $8, %mm1
+
+/* Add to accumulator (16-bit lanes via paddw; NOTE(review): the sum would wrap for more than 257 samples of 255) */
+ paddw %mm1, %mm2
+/* Walk the table backwards; the loop ends when the subtraction borrows below offset 0 */
+.next_i_n:
+ subl $8, %esi
+ jnb .for_i_n
+
+/* Divide components by sample size: shift right by dbits; skipped samples still count in the divisor */
+ movd 52(%ebp), %mm3
+ psrlw %mm3, %mm2
+
+/* Multiply by alpha */
+ pmullw %mm0, %mm2
+ paddw %mm6, %mm2
+ movq %mm2, %mm4
+ psrlw $8, %mm4
+ paddw %mm4, %mm2
+ psrlw $8, %mm2
+
+/* [255 - FgA] -> mm1 */
+ movq %mm2, %mm1
+ punpckhwd %mm1, %mm1
+ punpckhdq %mm1, %mm1
+ pxor %mm5, %mm1
+
+ movl -32(%ebp), %edi
+/* Bg -> mm3 */
+ movd (%edi), %mm3
+ punpcklbw %mm7, %mm3
+
+/* Fg + ((255 - FgA) * Bg) / 255 */
+
+ pmullw %mm1, %mm3
+ paddw %mm6, %mm3
+ movq %mm3, %mm4
+ psrlw $8, %mm4
+ paddw %mm4, %mm3
+ psrlw $8, %mm3
+ paddw %mm2, %mm3
+
+/* Store pixel */
+ packuswb %mm3, %mm3
+ movd %mm3, (%edi)
+/* Next destination pixel: x += entry 0, y += entry 1 of the array at 40(%ebp), d += 4, x counter + 1 */
+.L58:
+ movl 40(%ebp), %eax
+ movl (%eax), %edx
+ leal -36(%ebp), %eax
+ addl %edx, (%eax)
+ movl 40(%ebp), %eax
+ addl $4, %eax
+ movl (%eax), %edx
+ leal -40(%ebp), %eax
+ addl %edx, (%eax)
+ leal -32(%ebp), %eax
+ addl $4, (%eax)
+ leal -24(%ebp), %eax
+ incl (%eax)
+ jmp .L48
+.L49:
+ movl 40(%ebp), %eax
+ addl $8, %eax
+ movl (%eax), %edx
+ leal -16(%ebp), %eax
+ addl %edx, (%eax)
+ movl 40(%ebp), %eax
+ addl $12, %eax
+ movl (%eax), %edx
+ leal -20(%ebp), %eax
+ addl %edx, (%eax)
+ movl 20(%ebp), %edx
+ leal -12(%ebp), %eax
+ addl %edx, (%eax)
+ leal -28(%ebp), %eax
+ incl (%eax)
+ jmp .L44
+
+.exit_n:
+ emms
+ popl %esi
+ popl %edi
+ addl $72, %esp
+ popl %ebx
+ popl %ebp
+ ret
+.Lfe3:
+ .size nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM_n,.Lfe3-nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM_n
+ .ident "GCC: (GNU) 3.2"
diff --git a/patches/nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P.S b/patches/nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P.S
new file mode 100644
index 0000000..37261e5
--- /dev/null
+++ b/patches/nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P.S
@@ -0,0 +1,227 @@
+ .file "nr-compose.c"
+
+# Ensure Inkscape is execshield protected
+ .section .note.GNU-stack
+ .previous
+
+ .text
+ .align 2
+.globl nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P
+ .type nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P,@function
+
+/*
+ * This code is in public domain
+ * Blends 4-byte source pixels from spx, scaled by a global alpha byte, over 3-byte destination pixels in px; source pixels whose top (alpha) byte is 0 are skipped.
+ * alpha 32(%ebp)
+ * srs 28(%ebp)
+ * spx 24(%ebp)
+ * rs 20(%ebp)
+ * h 16(%ebp)
+ * w 12(%ebp)
+ * px 8(%ebp)
+ * r -8(%ebp)
+ * g -12(%ebp)
+ * b -16(%ebp)
+ * a -20(%ebp)
+ * s -24(%ebp) -> %esi
+ * d -28(%ebp) -> %edi
+ * x -32(%ebp) -> %ebx
+ * y -36(%ebp) -> %ecx
+ * ca -40(%ebp)
+ * Only the argument slots (positive offsets) are referenced below; the local slots are reserved by subl but never touched.
+ * mm0 A
+ * mm1 Fg * A / 255
+ * mm2 255 - alpha of mm1
+ * mm3
+ * mm4
+ * mm5 255
+ * mm6 128
+ * mm7 0
+ * Both loops test their counter after the body (decl/jnz), so w == 0 or h == 0 would wrap around instead of doing nothing.
+*/
+
+nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P:
+ pushl %ebp
+ movl %esp, %ebp
+ pushl %ebx
+ subl $36, %esp
+ pushl %edi
+ pushl %esi
+
+/* Load %mm7 with [0 0 0 0] */
+ movl $0, %eax
+ movd %eax, %mm7
+
+/* Load %mm6 with [128 128 128 128] (rounding bias for the divide-by-255 steps) */
+ movl $0x80808080, %eax
+ movd %eax, %mm6
+ punpcklbw %mm7, %mm6
+
+/* Load %mm5 with [255 255 255 255] */
+ movl $0xffffffff, %eax
+ movd %eax, %mm5
+ punpcklbw %mm7, %mm5
+
+/* Load %mm0 with [a a a a] */
+/* Check full opacity: alpha == 255 takes the .opaque loops, which skip the multiply by alpha */
+ movzbl 32(%ebp), %eax
+ cmpb $0xff, %al
+ jz .opaque
+ movd %eax, %mm0
+ punpcklwd %mm0, %mm0
+ punpckldq %mm0, %mm0
+
+/* for (y = ...): %ecx counts the h rows down to zero */
+ movl 16(%ebp), %ecx
+.fory:
+
+/* d = px */
+/* s = spx */
+ movl 8(%ebp), %edi
+ movl 24(%ebp), %esi
+
+/* for (x = ...): %ebx counts the w pixels of this row down to zero */
+ movl 12(%ebp), %ebx
+.forx:
+
+/* Fg -> %mm1 */
+/* fixme: Do we have to bother about alignment here? (Lauris) */
+ movl (%esi), %eax
+ testl $0xff000000, %eax
+ jz .clip
+ movd %eax, %mm1
+ punpcklbw %mm7, %mm1
+
+/* [Fg * a] -> mm1: t = Fg * a + 128, then (t + (t >> 8)) >> 8, i.e. Fg * a / 255 with rounding */
+ pmullw %mm0, %mm1
+ paddw %mm6, %mm1
+ movq %mm1, %mm2
+ psrlw $8, %mm2
+ paddw %mm2, %mm1
+ psrlw $8, %mm1
+
+/* [255 - FgA] -> mm2 */
+ movq %mm1, %mm2
+ punpckhwd %mm2, %mm2
+ punpckhdq %mm2, %mm2
+ pxor %mm5, %mm2
+
+/* Bg -> mm3; loads 4 bytes although a destination pixel is 3 bytes, so the load also touches the byte after the pixel */
+ movd (%edi), %mm3
+ punpcklbw %mm7, %mm3
+
+/* Fg + ((255 - FgA) * Bg) / 255 */
+ pmullw %mm2, %mm3
+ paddw %mm6, %mm3
+ movq %mm3, %mm4
+ psrlw $8, %mm4
+ paddw %mm4, %mm3
+ psrlw $8, %mm3
+ paddw %mm1, %mm3
+
+/* Store pixel: only the three low bytes are written, one at a time */
+ packuswb %mm3, %mm3
+ movd %mm3, %eax
+ movb %al, 0(%edi)
+ shrl $8, %eax
+ movb %al, 1(%edi)
+ shrl $8, %eax
+ movb %al, 2(%edi)
+
+.clip:
+ addl $3, %edi
+ addl $4, %esi
+
+ decl %ebx
+ jnz .forx
+/* Next row: advance the px and spx argument slots in place by rs and srs */
+ movl 20(%ebp), %eax
+ addl %eax, 8(%ebp)
+ movl 28(%ebp), %eax
+ addl %eax, 24(%ebp)
+
+ decl %ecx
+ jnz .fory
+/* emms clears the MMX state before returning */
+.exit:
+ emms
+ popl %esi
+ popl %edi
+ addl $36, %esp
+ popl %ebx
+ popl %ebp
+ ret
+/* Variant for alpha == 255: the source pixel is used without the multiply by alpha */
+.opaque:
+/* for (y = ...) */
+ movl 16(%ebp), %ecx
+.o_fory:
+
+/* d = px */
+/* s = spx */
+ movl 8(%ebp), %edi
+ movl 24(%ebp), %esi
+
+/* for (x = ...) */
+ movl 12(%ebp), %ebx
+.o_forx:
+
+/* Fg -> %mm1; top byte 0 skips the pixel, top byte 255 stores the three low source bytes directly */
+/* fixme: Do we have to bother about alignment here? (Lauris) */
+ movl (%esi), %eax
+ testl $0xff000000, %eax
+ jz .o_clip
+ cmpl $0xff000000, %eax
+ jnb .o_store
+ movd %eax, %mm1
+ punpcklbw %mm7, %mm1
+
+/* [255 - FgA] -> mm2 */
+ movq %mm1, %mm2
+ punpckhwd %mm2, %mm2
+ punpckhdq %mm2, %mm2
+ pxor %mm5, %mm2
+
+/* Bg -> mm3; same 4-byte load from a 3-byte pixel as in the loop above */
+ movd (%edi), %mm3
+ punpcklbw %mm7, %mm3
+
+/* Fg + ((255 - FgA) * Bg) / 255 */
+ pmullw %mm2, %mm3
+ paddw %mm6, %mm3
+ movq %mm3, %mm4
+ psrlw $8, %mm4
+ paddw %mm4, %mm3
+ psrlw $8, %mm3
+ paddw %mm1, %mm3
+
+/* Store pixel */
+ packuswb %mm3, %mm3
+ movd %mm3, %eax
+.o_store:
+ movb %al, 0(%edi)
+ shrl $8, %eax
+ movb %al, 1(%edi)
+ shrl $8, %eax
+ movb %al, 2(%edi)
+
+.o_clip:
+ addl $3, %edi
+ addl $4, %esi
+
+ decl %ebx
+ jnz .o_forx
+
+ movl 20(%ebp), %eax
+ addl %eax, 8(%ebp)
+ movl 28(%ebp), %eax
+ addl %eax, 24(%ebp)
+
+ decl %ecx
+ jnz .o_fory
+
+ jmp .exit
+
+.Lfe1:
+ .size nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P,.Lfe1-nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P
+ .ident "GCC: (GNU) 3.2"
diff --git a/patches/patch-small-lib-2 b/patches/patch-small-lib-2
new file mode 100644
index 0000000..938e36a
--- /dev/null
+++ b/patches/patch-small-lib-2
@@ -0,0 +1,78 @@
+Index: autogen.sh
+===================================================================
+RCS file: /cvs/liboil/liboil/autogen.sh,v
+retrieving revision 1.8
+diff -u -r1.8 autogen.sh
+--- autogen.sh 26 Jul 2005 20:32:36 -0000 1.8
++++ autogen.sh 3 Aug 2005 21:16:18 -0000
+@@ -1,4 +1,4 @@
+ #!/bin/sh
+
+ autoreconf -i -f &&
+-./configure --enable-maintainer-mode --disable-static $@
++./configure --enable-maintainer-mode --disable-static --enable-library-peeling $@
+Index: configure.ac
+===================================================================
+RCS file: /cvs/liboil/liboil/configure.ac,v
+retrieving revision 1.56
+diff -u -r1.56 configure.ac
+--- configure.ac 3 Aug 2005 03:33:47 -0000 1.56
++++ configure.ac 3 Aug 2005 21:16:18 -0000
+@@ -60,6 +60,14 @@
+ AC_DEFINE(ENABLE_BROKEN_IMPLS, 1, [Define if compiling broken implementations])
+ fi
+
++AC_ARG_ENABLE(library-peeling,
++ AC_HELP_STRING([--enable-library-peeling],[peel unused functions]),
++ enable_library_peeling=$enableval,enable_library_peeling=no)
++if test "x$enable_library_peeling" = xyes ; then
++ AC_DEFINE(ENABLE_PEELING, 1, [Define if peeling library])
++ LIBOIL_CFLAGS="$LIBOIL_CFLAGS -ffunction-sections -fdata-sections"
++fi
++
+ ##################################################
+ # Check for gtk-doc.
+ ##################################################
+Index: liboil/Makefile.am
+===================================================================
+RCS file: /cvs/liboil/liboil/liboil/Makefile.am,v
+retrieving revision 1.41
+diff -u -r1.41 Makefile.am
+--- liboil/Makefile.am 3 Aug 2005 03:33:47 -0000 1.41
++++ liboil/Makefile.am 3 Aug 2005 21:16:18 -0000
+@@ -46,7 +46,6 @@
+ -no-undefined \
+ -export-symbols-regex 'oil_'
+ liboiltmp1_la_LIBADD = \
+- liboilfunctions.la \
+ $(LIBM)
+
+ liboil_@LIBOIL_MAJORMINOR@_la_SOURCES = \
+@@ -67,12 +66,12 @@
+ liboiltest.c \
+ liboilmarshal.c
+ liboil_@LIBOIL_MAJORMINOR@_la_LIBADD = \
+- liboilfunctions.la \
+ $(LIBM)
+ liboil_@LIBOIL_MAJORMINOR@_la_CFLAGS = $(LIBOIL_CFLAGS)
+ liboil_@LIBOIL_MAJORMINOR@_la_LDFLAGS = \
+ -no-undefined \
+ -version-info $(LIBOIL_LIBVERSION) \
++ .libs/liboilfunctions.a \
+ -export-symbols-regex '^oil_'
+
+ # This is required to use 'make -j2'. Automake doesn't seem to notice
+@@ -111,11 +110,11 @@
+ echo ' NULL' >>liboilarray.c.tmp
+ echo '};' >>liboilarray.c.tmp
+ echo >>liboilarray.c.tmp
+- grep '^_oil_function_impl_' .libs/liboiltmp1.exp | \
++ grep '^_oil_function_impl_.*ref' .libs/liboiltmp1.exp | \
+ sed 's/.*/extern OilFunctionImpl &;/' >>liboilarray.c.tmp
+ echo >>liboilarray.c.tmp
+ echo 'OilFunctionImpl *_oil_function_impl_array[] = {' >>liboilarray.c.tmp
+- grep '^_oil_function_impl_' .libs/liboiltmp1.exp | \
++ grep '^_oil_function_impl_.*ref' .libs/liboiltmp1.exp | \
+ sed 's/.*/ \&&,/' >>liboilarray.c.tmp
+ echo ' NULL' >>liboilarray.c.tmp
+ echo '};' >>liboilarray.c.tmp