From 4e381b38abceb843eb6dfd88f734a5c8a38b0b71 Mon Sep 17 00:00:00 2001 From: David Schleef Date: Thu, 22 Dec 2005 22:04:54 +0000 Subject: * .cvsignore: * BUGS: * doc/.cvsignore: * liboil/mmx/Makefile.am: * liboil/mmx/fbmmx.c: * patches/divide.c: * patches/nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP.S: * patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP.S: * patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM.S: * patches/nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P.S: * patches/patch-small-lib-2: * testsuite/Makefile.am: * testsuite/list_impls.c: Clean up local source tree. Put spare files in the place where spare files go. --- patches/divide.c | 177 +++++++++ patches/nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP.S | 125 +++++++ patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP.S | 231 ++++++++++++ ...mx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM.S | 414 +++++++++++++++++++++ patches/nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P.S | 227 +++++++++++ patches/patch-small-lib-2 | 78 ++++ 6 files changed, 1252 insertions(+) create mode 100644 patches/divide.c create mode 100644 patches/nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP.S create mode 100644 patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP.S create mode 100644 patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM.S create mode 100644 patches/nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P.S create mode 100644 patches/patch-small-lib-2 (limited to 'patches') diff --git a/patches/divide.c b/patches/divide.c new file mode 100644 index 0000000..bb4be16 --- /dev/null +++ b/patches/divide.c @@ -0,0 +1,177 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2005 David A. Schleef + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include + +#include +#include +#include +#include +#include + + +static void +divide_u32_u64_u32_test (OilTest *test) +{ + int i; + int n; + uint64_t *src1; + uint32_t *src2; + + src1 = (uint64_t *)oil_test_get_source_data(test, OIL_ARG_SRC1); + src2 = (uint32_t *)oil_test_get_source_data(test, OIL_ARG_SRC2); + n = p1->post_n; + for(i=0;i= 0; bit--) { + if (a >= ((uint64_t)b)<>shift; + c = (d/b) << shift; + a -= (uint64_t)c * b; + + if (a > 0) { + for (bit = 31; bit >= 0; bit--) { + if (a >= ((uint64_t)b)<>32); + + __asm__ __volatile__ ("\n" + " div %2\n" + : "+a" (a), "+d" (d) + : "r" (src2[i])); + + dest[i] = a; + } + +} +OIL_DEFINE_IMPL (divide_u32_u64_u32_asm, divide_u32_u64_u32); + + + + + + + + + diff --git a/patches/nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP.S b/patches/nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP.S new file mode 100644 index 0000000..db2cbec --- /dev/null +++ b/patches/nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP.S @@ -0,0 +1,125 @@ + .file "nr-compose.c" + +# Ensure 
Inkscape is execshield protected + .section .note.GNU-stack + .previous + + .text + .align 2 +.globl nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP + .type nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP,@function + +/* + * This code is in public domain + * + * c 32(%ebp) + * srs 28(%ebp) + * spx 24(%ebp) + * rs 20(%ebp) + * h 16(%ebp) + * w 12(%ebp) + * px 8(%ebp) + * r -8(%ebp) + * g -12(%ebp) + * b -16(%ebp) + * a -20(%ebp) + * s -24(%ebp) -> %esi + * d -28(%ebp) -> %edi + * x -32(%ebp) -> %ebx + * y -36(%ebp) + * ca -40(%ebp) + * + * mm0 Fg + * mm1 FgA + * mm2 FgPre + * mm3 + * mm4 + * mm5 + * mm6 128 + * mm7 0 + * +*/ + +nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP: + pushl %ebp + movl %esp, %ebp + pushl %ebx + subl $36, %esp + pushl %edi + pushl %esi + +/* Load %mm7 with [0 0 0 0] */ + movl $0, %eax + movd %eax, %mm7 + +/* Load %mm6 with [128 128 128 128] */ + movl $0x80808080, %eax + movd %eax, %mm6 + punpcklbw %mm7, %mm6 + +/* FgC -> %mm0 */ + movl 32(%ebp), %eax + movd (%eax), %mm0 + punpcklbw %mm7, %mm0 + +/* for (y = ...) */ + movl 16(%ebp), %ecx +.fory: + +/* d = px */ +/* s = spx */ + movl 8(%ebp), %edi + movl 24(%ebp), %esi + +/* for (x = ...) 
*/ + movl 12(%ebp), %ebx +.forx: + +/* [m m m m] -> %mm1 */ + movzbl (%esi), %eax + testb $0xff, %al + jz .clip + movd %eax, %mm1 + punpcklwd %mm1, %mm1 + punpckldq %mm1, %mm1 + +/* Fg -> mm2 */ + movq %mm0, %mm2 + pmullw %mm1, %mm2 + paddw %mm6, %mm2 + movq %mm2, %mm3 + psrlw $8, %mm3 + paddw %mm3, %mm2 + psrlw $8, %mm2 + +/* Store pixel */ + packuswb %mm2, %mm2 + movd %mm2, (%edi) + +.clip: + addl $4, %edi + incl %esi + + decl %ebx + jnz .forx + + movl 20(%ebp), %eax + addl %eax, 8(%ebp) + movl 28(%ebp), %eax + addl %eax, 24(%ebp) + + decl %ecx + jnz .fory + +.exit: + emms + popl %esi + popl %edi + addl $36, %esp + popl %ebx + popl %ebp + ret + +.Lfe1: + .size nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP,.Lfe1-nr_mmx_R8G8B8A8_P_EMPTY_A8_RGBAP + .ident "GCC: (GNU) 3.2" diff --git a/patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP.S b/patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP.S new file mode 100644 index 0000000..fe1d9be --- /dev/null +++ b/patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP.S @@ -0,0 +1,231 @@ + .file "nr-compose.c" + +# Ensure Inkscape is execshield protected + .section .note.GNU-stack + .previous + + .text + .align 2 +.globl nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP + .type nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP,@function + +/* + * This code is in public domain + * + * c 32(%ebp) + * srs 28(%ebp) + * spx 24(%ebp) + * rs 20(%ebp) + * h 16(%ebp) + * w 12(%ebp) + * px 8(%ebp) + * r -8(%ebp) + * g -12(%ebp) + * b -16(%ebp) + * a -20(%ebp) + * s -24(%ebp) -> %esi + * d -28(%ebp) -> %edi + * x -32(%ebp) -> %ebx + * y -36(%ebp) + * ca -40(%ebp) + * + * mm0 Fg + * mm1 MMMM + * mm2 FgM + * mm3 + * mm4 + * mm5 255 + * mm6 128 + * mm7 0 + * +*/ + +nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP: + pushl %ebp + movl %esp, %ebp + pushl %ebx + subl $36, %esp + pushl %edi + pushl %esi + +/* Load %mm7 with [0 0 0 0] */ + movl $0, %eax + movd %eax, %mm7 + +/* Load %mm6 with [128 128 128 128] */ + movl $0x80808080, %eax + movd %eax, %mm6 + punpcklbw %mm7, %mm6 + +/* Load %mm5 with [255 255 255 
255] */ + movl $0xffffffff, %eax + movd %eax, %mm5 + punpcklbw %mm7, %mm5 + +/* FgC -> %mm0 */ + movl 32(%ebp), %eax + movd (%eax), %mm0 + punpcklbw %mm7, %mm0 + +/* Check full opacity: test the alpha byte c[3]; %eax holds the pointer c, so %al is only its low address byte */ + cmpb $0xff, 3(%eax) + jz .opaque + +/* for (y = ...) */ + movl 16(%ebp), %ecx +.fory: + +/* d = px */ +/* s = spx */ + movl 8(%ebp), %edi + movl 24(%ebp), %esi + +/* for (x = ...) */ + movl 12(%ebp), %ebx +.forx: + +/* [m m m m] -> %mm1 */ + movzbl (%esi), %eax + testb $0xff, %al + jz .clip + movd %eax, %mm1 + punpcklwd %mm1, %mm1 + punpckldq %mm1, %mm1 + +/* Fg -> mm2 */ + movq %mm0, %mm2 + pmullw %mm1, %mm2 + paddw %mm6, %mm2 + movq %mm2, %mm3 + psrlw $8, %mm3 + paddw %mm3, %mm2 + psrlw $8, %mm2 + +/* [255 - FgA] -> mm1 */ + movq %mm2, %mm1 + punpckhwd %mm1, %mm1 + punpckhdq %mm1, %mm1 + pxor %mm5, %mm1 + +/* Bg -> mm3 */ + movd (%edi), %mm3 + punpcklbw %mm7, %mm3 + +/* Fg + ((255 - FgA) * Bg) / 255 */ + pmullw %mm1, %mm3 + paddw %mm6, %mm3 + movq %mm3, %mm4 + psrlw $8, %mm4 + paddw %mm4, %mm3 + psrlw $8, %mm3 + paddw %mm2, %mm3 + +/* Store pixel */ + packuswb %mm3, %mm3 + movd %mm3, (%edi) + +.clip: + addl $4, %edi + incl %esi + + decl %ebx + jnz .forx + + movl 20(%ebp), %eax + addl %eax, 8(%ebp) + movl 28(%ebp), %eax + addl %eax, 24(%ebp) + + decl %ecx + jnz .fory + +.exit: + emms + popl %esi + popl %edi + addl $36, %esp + popl %ebx + popl %ebp + ret + +.opaque: +/* for (y = ...) */ + movl 16(%ebp), %ecx +.o_fory: + +/* d = px */ +/* s = spx */ + movl 8(%ebp), %edi + movl 24(%ebp), %esi + +/* for (x = ...) 
*/ + movl 12(%ebp), %ebx +.o_forx: + +/* [m m m m] -> %mm1 */ + movzbl (%esi), %eax + testb $0xff, %al + jz .o_clip + cmpb $0xff, %al + jz .o_full + movd %eax, %mm1 + punpcklwd %mm1, %mm1 + punpckldq %mm1, %mm1 + +/* Fg -> mm2 */ + movq %mm0, %mm2 + pmullw %mm1, %mm2 + paddw %mm6, %mm2 + movq %mm2, %mm3 + psrlw $8, %mm3 + paddw %mm3, %mm2 + psrlw $8, %mm2 + +/* [255 - FgA] -> mm1 */ + movq %mm2, %mm1 + punpckhwd %mm1, %mm1 + punpckhdq %mm1, %mm1 + pxor %mm5, %mm1 + +/* Bg -> mm3 */ + movd (%edi), %mm3 + punpcklbw %mm7, %mm3 + +/* Fg + ((255 - FgA) * Bg) / 255 */ + pmullw %mm1, %mm3 + paddw %mm6, %mm3 + movq %mm3, %mm4 + psrlw $8, %mm4 + paddw %mm4, %mm3 + psrlw $8, %mm3 + paddw %mm2, %mm3 + + jmp .o_store + +.o_full: + movq %mm0, %mm3 + +.o_store: +/* Store pixel */ + packuswb %mm3, %mm3 + movd %mm3, (%edi) + +.o_clip: + addl $4, %edi + incl %esi + + decl %ebx + jnz .o_forx + + movl 20(%ebp), %eax + addl %eax, 8(%ebp) + movl 28(%ebp), %eax + addl %eax, 24(%ebp) + + decl %ecx + jnz .o_fory + jmp .exit + +.Lfe1: + .size nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP,.Lfe1-nr_mmx_R8G8B8A8_P_R8G8B8A8_P_A8_RGBAP + .ident "GCC: (GNU) 3.2" diff --git a/patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM.S b/patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM.S new file mode 100644 index 0000000..e30056a --- /dev/null +++ b/patches/nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM.S @@ -0,0 +1,414 @@ + .file "nr-compose-transform.c" + +# Ensure Inkscape is execshield protected + .section .note.GNU-stack + .previous + + .text + .align 2 +.globl nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM_0 + .type nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM_0,@function + +/* + * This code is in public domain + * + */ + +nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM_0: + pushl %ebp + movl %esp, %ebp + pushl %ebx + subl $48, %esp + pushl %edi + pushl %esi + +/* Load %mm7 with [0 0 0 0] */ + movl $0, %eax + movd %eax, %mm7 + +/* Load %mm6 with [128 128 128 128] */ + movl 
$0x80808080, %eax + movd %eax, %mm6 + punpcklbw %mm7, %mm6 + +/* Load %mm5 with [255 255 255 255] */ + movl $0xffffffff, %eax + movd %eax, %mm5 + punpcklbw %mm7, %mm5 + +/* Load %mm0 with [a a a a] */ + movzbl 44(%ebp), %eax + movd %eax, %mm0 + punpcklwd %mm0, %mm0 + punpckldq %mm0, %mm0 + + movl 8(%ebp), %eax + movl %eax, -8(%ebp) + movl 40(%ebp), %eax + addl $16, %eax + movl (%eax), %eax + movl %eax, -12(%ebp) + movl 40(%ebp), %eax + addl $20, %eax + movl (%eax), %eax + movl %eax, -16(%ebp) + movl $0, -24(%ebp) +.L29: + movl -24(%ebp), %eax + cmpl 16(%ebp), %eax + jl .L32 + jmp .L28 +.L32: + movl -8(%ebp), %edi + + movl -12(%ebp), %eax + movl %eax, %esi + movl -16(%ebp), %eax + movl %eax, -36(%ebp) + + movl 12(%ebp), %ebx +.for_x_0: + + movl %esi, %ecx + cmpl $0, %ecx + js .clip_0 + sarl $12, %ecx + cmpl 28(%ebp), %ecx + jge .clip_0 + shll $2, %ecx + + movl -36(%ebp), %eax + cmpl $0, %eax + js .clip_0 + sarl $12, %eax + cmpl 32(%ebp), %eax + jge .clip_0 + imull 36(%ebp), %eax + + addl %ecx, %eax + addl 24(%ebp), %eax + +/* Fg -> %mm1 */ + movl (%eax), %eax + testl $0xff000000, %eax + jz .clip_0 + movd %eax, %mm1 + punpcklbw %mm7, %mm1 + +/* [a a a 255] -> %mm3 */ + shrl $24, %eax + movl $0x10101, %edx + mull %edx + orl $0xff000000, %eax + movd %eax, %mm3 + punpcklbw %mm7, %mm3 + +/* [Fg * a] -> mm1 */ + pmullw %mm3, %mm1 + paddw %mm6, %mm1 + movq %mm1, %mm4 + psrlw $8, %mm4 + paddw %mm4, %mm1 + psrlw $8, %mm1 + +/* Multiply by alpha */ + pmullw %mm0, %mm1 + paddw %mm6, %mm1 + movq %mm1, %mm4 + psrlw $8, %mm4 + paddw %mm4, %mm1 + psrlw $8, %mm1 + +/* [255 - FgA] -> mm2 */ + movq %mm1, %mm2 + punpckhwd %mm2, %mm2 + punpckhdq %mm2, %mm2 + pxor %mm5, %mm2 + +/* Bg -> mm3 */ + movd (%edi), %mm3 + punpcklbw %mm7, %mm3 + +/* Fg + ((255 - FgA) * Bg) / 255 */ + + pmullw %mm2, %mm3 + paddw %mm6, %mm3 + movq %mm3, %mm4 + psrlw $8, %mm4 + paddw %mm4, %mm3 + psrlw $8, %mm3 + paddw %mm1, %mm3 + +/* Store pixel */ + packuswb %mm3, %mm3 + movd %mm3, (%edi) + +.clip_0: +.L37: + 
movl 40(%ebp), %ecx + movl (%ecx), %edx + addl %edx, %esi + movl 4(%ecx), %edx + addl %edx, -36(%ebp) + + addl $4, %edi + + decl %ebx + jnz .for_x_0 + +.L34: + movl 8(%ecx), %edx + addl %edx, -12(%ebp) + movl 12(%ecx), %edx + addl %edx, -16(%ebp) + + movl 20(%ebp), %edx + leal -8(%ebp), %eax + addl %edx, (%eax) + leal -24(%ebp), %eax + incl (%eax) + jmp .L29 +.L28: + emms + popl %esi + popl %edi + addl $48, %esp + popl %ebx + popl %ebp + ret +.Lfe2: + .size nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM_0,.Lfe2-nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM_0 + +/* + * + * dbits 52(%ebp) + * alpha 48(%ebp) + * FF_S 44(%ebp) + * + * d -32(%ebp) -> %edi + * i -60(%ebp) -> %esi + * sx -64(%ebp) -> %ebx + * sy -68(%ebp) + * s -72(%ebp) + * + * %mm0 a a a a + * %mm1 FgA + * %mm2 SumFgA + * %mm3 a a a 255 + * %mm4 +*/ + + .align 2 +.globl nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM_n + .type nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM_n,@function +nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM_n: + pushl %ebp + movl %esp, %ebp + pushl %ebx + subl $72, %esp + pushl %edi + pushl %esi + +/* Load %mm7 with [0 0 0 0] */ + movl $0, %eax + movd %eax, %mm7 + +/* Load %mm6 with [128 128 128 128] */ + movl $0x80808080, %eax + movd %eax, %mm6 + punpcklbw %mm7, %mm6 + +/* Load %mm5 with [255 255 255 255] */ + movl $0xffffffff, %eax + movd %eax, %mm5 + punpcklbw %mm7, %mm5 + +/* Load %mm0 with [a a a a] */ + movzbl 48(%ebp), %eax + movd %eax, %mm0 + punpcklwd %mm0, %mm0 + punpckldq %mm0, %mm0 + + movl $1, %eax + movzbl 52(%ebp), %ecx + sall %cl, %eax + movl %eax, -8(%ebp) + movl 8(%ebp), %eax + movl %eax, -12(%ebp) + movl 40(%ebp), %eax + addl $16, %eax + movl (%eax), %eax + movl %eax, -16(%ebp) + movl 40(%ebp), %eax + addl $20, %eax + movl (%eax), %eax + movl %eax, -20(%ebp) + movl $0, -28(%ebp) +.L44: + movl -28(%ebp), %eax + cmpl 16(%ebp), %eax + jl .L47 + jmp .exit_n +.L47: + movl -12(%ebp), %eax + movl %eax, -32(%ebp) + movl -16(%ebp), %eax + movl %eax, 
-36(%ebp) + movl -20(%ebp), %eax + movl %eax, -40(%ebp) + movl $0, -24(%ebp) +.L48: + movl -24(%ebp), %eax + cmpl 12(%ebp), %eax + jl .L51 + jmp .L49 +.L51: + +/* Zero accumulator */ + movq %mm7, %mm2 + +/* Set i to dptr (size - 1) */ + movl -8(%ebp), %esi + sub $1, %esi + shll $3, %esi + + movl 44(%ebp), %edi + movl -36(%ebp), %ecx + +.for_i_n: + movl (%edi,%esi), %ebx + addl %ecx, %ebx +/* Test negative before shift */ + cmpl $0, %ebx + js .next_i_n + sarl $12, %ebx + cmpl 28(%ebp), %ebx + jge .next_i_n +/* We multiply sx by 4 here */ + shll $2, %ebx + + movl 4(%edi,%esi), %eax + addl -40(%ebp), %eax +/* Test negative before shift */ + cmpl $0, %eax + js .next_i_n + sarl $12, %eax + cmpl 32(%ebp), %eax + jge .next_i_n +/* We multiply sy by srs here */ + imull 36(%ebp), %eax + + addl %ebx, %eax + addl 24(%ebp), %eax + +/* Fg -> %mm1 */ + movl (%eax), %eax + testl $0xff000000, %eax + jz .next_i_n + movd %eax, %mm1 + punpcklbw %mm7, %mm1 + +/* [a a a 255] -> %mm3 */ + shrl $24, %eax + movl $0x10101, %edx + mull %edx + orl $0xff000000, %eax + movd %eax, %mm3 + punpcklbw %mm7, %mm3 + +/* [Fg * a] -> mm1 */ + pmullw %mm3, %mm1 + paddw %mm6, %mm1 + movq %mm1, %mm4 + psrlw $8, %mm4 + paddw %mm4, %mm1 + psrlw $8, %mm1 + +/* Add to accumulator */ + paddw %mm1, %mm2 + +.next_i_n: + subl $8, %esi + jnb .for_i_n + +/* Divide components by sample size */ + movd 52(%ebp), %mm3 + psrlw %mm3, %mm2 + +/* Multiply by alpha */ + pmullw %mm0, %mm2 + paddw %mm6, %mm2 + movq %mm2, %mm4 + psrlw $8, %mm4 + paddw %mm4, %mm2 + psrlw $8, %mm2 + +/* [255 - FgA] -> mm1 */ + movq %mm2, %mm1 + punpckhwd %mm1, %mm1 + punpckhdq %mm1, %mm1 + pxor %mm5, %mm1 + + movl -32(%ebp), %edi +/* Bg -> mm3 */ + movd (%edi), %mm3 + punpcklbw %mm7, %mm3 + +/* Fg + ((255 - FgA) * Bg) / 255 */ + + pmullw %mm1, %mm3 + paddw %mm6, %mm3 + movq %mm3, %mm4 + psrlw $8, %mm4 + paddw %mm4, %mm3 + psrlw $8, %mm3 + paddw %mm2, %mm3 + +/* Store pixel */ + packuswb %mm3, %mm3 + movd %mm3, (%edi) + +.L58: + movl 40(%ebp), 
%eax + movl (%eax), %edx + leal -36(%ebp), %eax + addl %edx, (%eax) + movl 40(%ebp), %eax + addl $4, %eax + movl (%eax), %edx + leal -40(%ebp), %eax + addl %edx, (%eax) + leal -32(%ebp), %eax + addl $4, (%eax) + leal -24(%ebp), %eax + incl (%eax) + jmp .L48 +.L49: + movl 40(%ebp), %eax + addl $8, %eax + movl (%eax), %edx + leal -16(%ebp), %eax + addl %edx, (%eax) + movl 40(%ebp), %eax + addl $12, %eax + movl (%eax), %edx + leal -20(%ebp), %eax + addl %edx, (%eax) + movl 20(%ebp), %edx + leal -12(%ebp), %eax + addl %edx, (%eax) + leal -28(%ebp), %eax + incl (%eax) + jmp .L44 + +.exit_n: + emms + popl %esi + popl %edi + addl $72, %esp + popl %ebx + popl %ebp + ret +.Lfe3: + .size nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM_n,.Lfe3-nr_mmx_R8G8B8A8_P_R8G8B8A8_P_R8G8B8A8_N_TRANSFORM_n + .ident "GCC: (GNU) 3.2" diff --git a/patches/nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P.S b/patches/nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P.S new file mode 100644 index 0000000..37261e5 --- /dev/null +++ b/patches/nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P.S @@ -0,0 +1,227 @@ + .file "nr-compose.c" + +# Ensure Inkscape is execshield protected + .section .note.GNU-stack + .previous + + .text + .align 2 +.globl nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P + .type nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P,@function + +/* + * This code is in public domain + * + * alpha 32(%ebp) + * srs 28(%ebp) + * spx 24(%ebp) + * rs 20(%ebp) + * h 16(%ebp) + * w 12(%ebp) + * px 8(%ebp) + * r -8(%ebp) + * g -12(%ebp) + * b -16(%ebp) + * a -20(%ebp) + * s -24(%ebp) -> %esi + * d -28(%ebp) -> %edi + * x -32(%ebp) -> %ebx + * y -36(%ebp) + * ca -40(%ebp) + * + * mm0 A + * mm1 FgA + * mm2 FgPre + * mm3 + * mm4 + * mm5 255 + * mm6 128 + * mm7 0 + * +*/ + +nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P: + pushl %ebp + movl %esp, %ebp + pushl %ebx + subl $36, %esp + pushl %edi + pushl %esi + +/* Load %mm7 with [0 0 0 0] */ + movl $0, %eax + movd %eax, %mm7 + +/* Load %mm6 with [128 128 128 128] */ + movl $0x80808080, %eax + movd %eax, %mm6 + punpcklbw %mm7, %mm6 + +/* Load 
%mm5 with [255 255 255 255] */ + movl $0xffffffff, %eax + movd %eax, %mm5 + punpcklbw %mm7, %mm5 + +/* Load %mm0 with [a a a a] */ +/* Check full opacity */ + movzbl 32(%ebp), %eax + cmpb $0xff, %al + jz .opaque + movd %eax, %mm0 + punpcklwd %mm0, %mm0 + punpckldq %mm0, %mm0 + +/* for (y = ...) */ + movl 16(%ebp), %ecx +.fory: + +/* d = px */ +/* s = spx */ + movl 8(%ebp), %edi + movl 24(%ebp), %esi + +/* for (x = ...) */ + movl 12(%ebp), %ebx +.forx: + +/* Fg -> %mm1 */ +/* fixme: Do we have to bother about alignment here? (Lauris) */ + movl (%esi), %eax + testl $0xff000000, %eax + jz .clip + movd %eax, %mm1 + punpcklbw %mm7, %mm1 + +/* [Fg * a] -> mm1 */ + pmullw %mm0, %mm1 + paddw %mm6, %mm1 + movq %mm1, %mm2 + psrlw $8, %mm2 + paddw %mm2, %mm1 + psrlw $8, %mm1 + +/* [255 - FgA] -> mm2 */ + movq %mm1, %mm2 + punpckhwd %mm2, %mm2 + punpckhdq %mm2, %mm2 + pxor %mm5, %mm2 + +/* Bg -> mm3 */ + movd (%edi), %mm3 + punpcklbw %mm7, %mm3 + +/* Fg + ((255 - FgA) * Bg) / 255 */ + pmullw %mm2, %mm3 + paddw %mm6, %mm3 + movq %mm3, %mm4 + psrlw $8, %mm4 + paddw %mm4, %mm3 + psrlw $8, %mm3 + paddw %mm1, %mm3 + +/* Store pixel */ + packuswb %mm3, %mm3 + movd %mm3, %eax + movb %al, 0(%edi) + shrl $8, %eax + movb %al, 1(%edi) + shrl $8, %eax + movb %al, 2(%edi) + +.clip: + addl $3, %edi + addl $4, %esi + + decl %ebx + jnz .forx + + movl 20(%ebp), %eax + addl %eax, 8(%ebp) + movl 28(%ebp), %eax + addl %eax, 24(%ebp) + + decl %ecx + jnz .fory + +.exit: + emms + popl %esi + popl %edi + addl $36, %esp + popl %ebx + popl %ebp + ret + +.opaque: +/* for (y = ...) */ + movl 16(%ebp), %ecx +.o_fory: + +/* d = px */ +/* s = spx */ + movl 8(%ebp), %edi + movl 24(%ebp), %esi + +/* for (x = ...) */ + movl 12(%ebp), %ebx +.o_forx: + +/* Fg -> %mm1 */ +/* fixme: Do we have to bother about alignment here? 
(Lauris) */ + movl (%esi), %eax + testl $0xff000000, %eax + jz .o_clip + cmpl $0xff000000, %eax + jnb .o_store + movd %eax, %mm1 + punpcklbw %mm7, %mm1 + +/* [255 - FgA] -> mm2 */ + movq %mm1, %mm2 + punpckhwd %mm2, %mm2 + punpckhdq %mm2, %mm2 + pxor %mm5, %mm2 + +/* Bg -> mm3 */ + movd (%edi), %mm3 + punpcklbw %mm7, %mm3 + +/* Fg + ((255 - FgA) * Bg) / 255 */ + pmullw %mm2, %mm3 + paddw %mm6, %mm3 + movq %mm3, %mm4 + psrlw $8, %mm4 + paddw %mm4, %mm3 + psrlw $8, %mm3 + paddw %mm1, %mm3 + +/* Store pixel */ + packuswb %mm3, %mm3 + movd %mm3, %eax +.o_store: + movb %al, 0(%edi) + shrl $8, %eax + movb %al, 1(%edi) + shrl $8, %eax + movb %al, 2(%edi) + +.o_clip: + addl $3, %edi + addl $4, %esi + + decl %ebx + jnz .o_forx + + movl 20(%ebp), %eax + addl %eax, 8(%ebp) + movl 28(%ebp), %eax + addl %eax, 24(%ebp) + + decl %ecx + jnz .o_fory + + jmp .exit + +.Lfe1: + .size nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P,.Lfe1-nr_mmx_R8G8B8_R8G8B8_R8G8B8A8_P + .ident "GCC: (GNU) 3.2" diff --git a/patches/patch-small-lib-2 b/patches/patch-small-lib-2 new file mode 100644 index 0000000..938e36a --- /dev/null +++ b/patches/patch-small-lib-2 @@ -0,0 +1,78 @@ +Index: autogen.sh +=================================================================== +RCS file: /cvs/liboil/liboil/autogen.sh,v +retrieving revision 1.8 +diff -u -r1.8 autogen.sh +--- autogen.sh 26 Jul 2005 20:32:36 -0000 1.8 ++++ autogen.sh 3 Aug 2005 21:16:18 -0000 +@@ -1,4 +1,4 @@ + #!/bin/sh + + autoreconf -i -f && +-./configure --enable-maintainer-mode --disable-static $@ ++./configure --enable-maintainer-mode --disable-static --enable-library-peeling $@ +Index: configure.ac +=================================================================== +RCS file: /cvs/liboil/liboil/configure.ac,v +retrieving revision 1.56 +diff -u -r1.56 configure.ac +--- configure.ac 3 Aug 2005 03:33:47 -0000 1.56 ++++ configure.ac 3 Aug 2005 21:16:18 -0000 +@@ -60,6 +60,14 @@ + AC_DEFINE(ENABLE_BROKEN_IMPLS, 1, [Define if compiling broken implementations]) 
+ fi + ++AC_ARG_ENABLE(library-peeling, ++ AC_HELP_STRING([--enable-library-peeling],[peel unused functions]), ++ enable_library_peeling=$enableval,enable_library_peeling=no) ++if test "x$enable_library_peeling" = xyes ; then ++ AC_DEFINE(ENABLE_PEELING, 1, [Define if peeling library]) ++ LIBOIL_CFLAGS="$LIBOIL_CFLAGS -ffunction-sections -fdata-sections" ++fi ++ + ################################################## + # Check for gtk-doc. + ################################################## +Index: liboil/Makefile.am +=================================================================== +RCS file: /cvs/liboil/liboil/liboil/Makefile.am,v +retrieving revision 1.41 +diff -u -r1.41 Makefile.am +--- liboil/Makefile.am 3 Aug 2005 03:33:47 -0000 1.41 ++++ liboil/Makefile.am 3 Aug 2005 21:16:18 -0000 +@@ -46,7 +46,6 @@ + -no-undefined \ + -export-symbols-regex 'oil_' + liboiltmp1_la_LIBADD = \ +- liboilfunctions.la \ + $(LIBM) + + liboil_@LIBOIL_MAJORMINOR@_la_SOURCES = \ +@@ -67,12 +66,12 @@ + liboiltest.c \ + liboilmarshal.c + liboil_@LIBOIL_MAJORMINOR@_la_LIBADD = \ +- liboilfunctions.la \ + $(LIBM) + liboil_@LIBOIL_MAJORMINOR@_la_CFLAGS = $(LIBOIL_CFLAGS) + liboil_@LIBOIL_MAJORMINOR@_la_LDFLAGS = \ + -no-undefined \ + -version-info $(LIBOIL_LIBVERSION) \ ++ .libs/liboilfunctions.a \ + -export-symbols-regex '^oil_' + + # This is required to use 'make -j2'. 
Automake doesn't seem to notice +@@ -111,11 +110,11 @@ + echo ' NULL' >>liboilarray.c.tmp + echo '};' >>liboilarray.c.tmp + echo >>liboilarray.c.tmp +- grep '^_oil_function_impl_' .libs/liboiltmp1.exp | \ ++ grep '^_oil_function_impl_.*ref' .libs/liboiltmp1.exp | \ + sed 's/.*/extern OilFunctionImpl &;/' >>liboilarray.c.tmp + echo >>liboilarray.c.tmp + echo 'OilFunctionImpl *_oil_function_impl_array[] = {' >>liboilarray.c.tmp +- grep '^_oil_function_impl_' .libs/liboiltmp1.exp | \ ++ grep '^_oil_function_impl_.*ref' .libs/liboiltmp1.exp | \ + sed 's/.*/ \&&,/' >>liboilarray.c.tmp + echo ' NULL' >>liboilarray.c.tmp + echo '};' >>liboilarray.c.tmp -- cgit v1.2.1