#include <config.h>
#include "asm.h"

#ifdef DO_MMX_ASM

/*\
|*| MMX assembly blending routines, with colour modding, for Imlib2
|*| Written by Willem Monsuwe
|*|
|*| Special (hairy) constructs are only commented on first use.
|*|
|*| Syntax: AT&T (GNU as), run through the C preprocessor.
|*| Target: 32-bit x86, arguments on the stack relative to %ebp.
\*/

/*\ All functions have the same calling convention:
|*|  PR_(imlib_mmx_<op>_rgba_to_rgb[A]_cmod(void *src, int sw, void *dst, int dw,
|*|				       int w, int h, ImlibColorModifier *cm)
|*|
|*| src, dst	Start of the first source / destination row.
|*| sw, dw	Distance between consecutive rows, in 4-byte pixels
|*|		(LOOP_END scales them by 4 before adding).
|*| w, h	Width and height of the area to process, in pixels.
|*| cm		Base of the colour modifier tables (see below).
\*/

#define src	8(%ebp)
#define sw	12(%ebp)
#define dst	16(%ebp)
#define dw	20(%ebp)
#define w	24(%ebp)
#define h	28(%ebp)
#define cm	32(%ebp)

/*\ Cmod tables, from %ebx
|*| Four 256-byte lookup tables at offsets 0x000 (red), 0x100 (green),
|*| 0x200 (blue) and 0x300 (alpha).  amap_ff is the alpha table entry
|*| for input value 0xff.
\*/
#define rmap(x)	(%ebx, x)
#define gmap(x)	0x100(%ebx, x)
#define bmap(x)	0x200(%ebx, x)
#define amap(x)	0x300(%ebx, x)
#define amap_ff	0x3ff(%ebx)

.text
	.align 8

FN_(imlib_mmx_blend_rgba_to_rgb_cmod)
FN_(imlib_mmx_blend_rgba_to_rgba_cmod)
FN_(imlib_mmx_blend_rgb_to_rgb_cmod)
FN_(imlib_mmx_blend_rgb_to_rgba_cmod)

FN_(imlib_mmx_copy_rgba_to_rgb_cmod)
FN_(imlib_mmx_copy_rgba_to_rgba_cmod)
FN_(imlib_mmx_copy_rgb_to_rgba_cmod)

FN_(imlib_mmx_add_blend_rgba_to_rgb_cmod)
FN_(imlib_mmx_add_blend_rgba_to_rgba_cmod)
FN_(imlib_mmx_add_blend_rgb_to_rgb_cmod)
FN_(imlib_mmx_add_blend_rgb_to_rgba_cmod)

FN_(imlib_mmx_add_copy_rgba_to_rgb_cmod)
FN_(imlib_mmx_add_copy_rgba_to_rgba_cmod)
FN_(imlib_mmx_add_copy_rgb_to_rgba_cmod)

FN_(imlib_mmx_subtract_blend_rgba_to_rgb_cmod)
FN_(imlib_mmx_subtract_blend_rgba_to_rgba_cmod)
FN_(imlib_mmx_subtract_blend_rgb_to_rgb_cmod)
FN_(imlib_mmx_subtract_blend_rgb_to_rgba_cmod)

FN_(imlib_mmx_subtract_copy_rgba_to_rgb_cmod)
FN_(imlib_mmx_subtract_copy_rgba_to_rgba_cmod)
FN_(imlib_mmx_subtract_copy_rgb_to_rgba_cmod)

FN_(imlib_mmx_reshade_blend_rgba_to_rgb_cmod)
FN_(imlib_mmx_reshade_blend_rgba_to_rgba_cmod)
FN_(imlib_mmx_reshade_blend_rgb_to_rgb_cmod)
FN_(imlib_mmx_reshade_blend_rgb_to_rgba_cmod)

FN_(imlib_mmx_reshade_copy_rgba_to_rgb_cmod)
FN_(imlib_mmx_reshade_copy_rgba_to_rgba_cmod)
FN_(imlib_mmx_reshade_copy_rgb_to_rgba_cmod)

#include "asm_loadimmq.S"

/*\ MMX register use:
|*| %mm1 = Source value
|*| %mm2 = Destination value
|*| %mm3 = Alpha value
|*| %mm4 = 0
|*| %mm5-%mm6 = masks
|*|
|*| Integer register use (set up by ENTER / LOOP_START):
|*| %eax = scratch for the table lookups
|*| %ebx = cm (base of the colour modifier tables)
|*| %ecx = negative pixel index within the row, counts up towards 0
|*| %edx = number of rows still to do, minus one
|*| %esi = source pointer, placed at the END of the current row
|*| %edi = destination pointer, placed at the END of the current row
\*/

/*\ Common code \*/

/*\ Save registers, load common parameters.
|*| Leaves through label 9 (in LEAVE) without touching any pixel when
|*| w <= 0 or h <= 0.
|*| %edx is primed with h - 1 because LOOP_END keeps going while the
|*| decremented counter is still non-negative; so the early-out has to
|*| test the sign (h < 1), not zero, or a single-row area is skipped.
\*/
#define ENTER		\
	pushl %ebp	;\
	movl  %esp, %ebp	;\
	pushl %ebx	;\
	pushl %ecx	;\
	pushl %edx	;\
	pushl %edi	;\
	pushl %esi	;\
	movl  cm, %ebx	;\
	movl  src, %esi	;\
	movl  dst, %edi	;\
	movl  w, %ecx	;\
	leal  (%esi, %ecx, 4), %esi	;\
	leal  (%edi, %ecx, 4), %edi	;\
	negl  %ecx	;\
	jns 9f		;\
	movl  h, %edx	;\
	decl  %edx	;\
	js 9f

/*\ Top of the row loop: reload %ecx with -w \*/
#define LOOP_START	\
8:			;\
	movl w, %ecx	;\
	negl %ecx

/*\ Bottom of the row loop: advance both pointers one row, next row \*/
#define LOOP_END			\
	movl sw, %ecx			;\
	leal (%esi, %ecx, 4), %esi	;\
	movl dw, %ecx			;\
	leal (%edi, %ecx, 4), %edi	;\
	decl %edx			;\
	jns 8b

/*\ Unset MMX mode, reset registers, return \*/
#define LEAVE		\
9:			;\
	emms		;\
	popl %esi	;\
	popl %edi	;\
	popl %edx	;\
	popl %ecx	;\
	popl %ebx	;\
	movl %ebp, %esp	;\
	popl %ebp	;\
	ret

/*\ Load one value, colourmod it, and put it in %mm1
|*| Source bytes 3/2/1/0 go through amap/rmap/gmap/bmap and are
|*| shifted in from the top, so the byte order is kept.
|*| Clobbers %eax and %mm0.
\*/
#define LOAD1_CMOD			\
	movzbl 3(%esi, %ecx, 4), %eax	;\
	movzbl amap(%eax), %eax		;\
	movd %eax, %mm1			;\
	movzbl 2(%esi, %ecx, 4), %eax	;\
	psllq $8, %mm1			;\
	movzbl rmap(%eax), %eax		;\
	movd %eax, %mm0			;\
	movzbl 1(%esi, %ecx, 4), %eax	;\
	por %mm0, %mm1			;\
	movzbl gmap(%eax), %eax		;\
	psllq $8, %mm1			;\
	movd %eax, %mm0			;\
	movzbl (%esi, %ecx, 4), %eax	;\
	por %mm0, %mm1			;\
	movzbl bmap(%eax), %eax		;\
	psllq $8, %mm1			;\
	movd %eax, %mm0			;\
	por %mm0, %mm1

/*\ Load two values, colourmod them, and put them in %mm1
|*| Same as LOAD1_CMOD, for the pixels at byte offsets 4..7 (high
|*| dword) and 0..3 (low dword).  Clobbers %eax and %mm0.
\*/
#define LOAD2_CMOD			\
	movzbl 7(%esi, %ecx, 4), %eax	;\
	movzbl amap(%eax), %eax		;\
	movd %eax, %mm1			;\
	movzbl 6(%esi, %ecx, 4), %eax	;\
	psllq $8, %mm1			;\
	movzbl rmap(%eax), %eax		;\
	movd %eax, %mm0			;\
	movzbl 5(%esi, %ecx, 4), %eax	;\
	por %mm0, %mm1			;\
	movzbl gmap(%eax), %eax		;\
	psllq $8, %mm1			;\
	movd %eax, %mm0			;\
	movzbl 4(%esi, %ecx, 4), %eax	;\
	por %mm0, %mm1			;\
	movzbl bmap(%eax), %eax		;\
	psllq $8, %mm1			;\
	movd %eax, %mm0			;\
	movzbl 3(%esi, %ecx, 4), %eax	;\
	por %mm0, %mm1			;\
	movzbl amap(%eax), %eax		;\
	psllq $8, %mm1			;\
	movd %eax, %mm0			;\
	movzbl 2(%esi, %ecx, 4), %eax	;\
	por %mm0, %mm1			;\
	movzbl rmap(%eax), %eax		;\
	psllq $8, %mm1			;\
	movd %eax, %mm0			;\
	movzbl 1(%esi, %ecx, 4), %eax	;\
	por %mm0, %mm1			;\
	movzbl gmap(%eax), %eax		;\
	psllq $8, %mm1			;\
	movd %eax, %mm0			;\
	movzbl (%esi, %ecx, 4), %eax	;\
	por %mm0, %mm1			;\
	movzbl bmap(%eax), %eax		;\
	psllq $8, %mm1			;\
	movd %eax, %mm0			;\
	por %mm0, %mm1

/*\ Load one value, alpha 0xff, colourmod it, and put it in %mm1
|*| The source alpha byte is not read; amap(0xff) is used instead.
\*/
#define LOAD1_CMOD_AFF			\
	movzbl amap_ff, %eax		;\
	movd %eax, %mm1			;\
	movzbl 2(%esi, %ecx, 4), %eax	;\
	psllq $8, %mm1			;\
	movzbl rmap(%eax), %eax		;\
	movd %eax, %mm0			;\
	movzbl 1(%esi, %ecx, 4), %eax	;\
	por %mm0, %mm1			;\
	movzbl gmap(%eax), %eax		;\
	psllq $8, %mm1			;\
	movd %eax, %mm0			;\
	movzbl (%esi, %ecx, 4), %eax	;\
	por %mm0, %mm1			;\
	movzbl bmap(%eax), %eax		;\
	psllq $8, %mm1			;\
	movd %eax, %mm0			;\
	por %mm0, %mm1

/*\ Load two values, alpha 0xff, colourmod them, and put them in %mm1 \*/
#define LOAD2_CMOD_AFF			\
	movzbl amap_ff, %eax		;\
	movd %eax, %mm1			;\
	movzbl 6(%esi, %ecx, 4), %eax	;\
	psllq $8, %mm1			;\
	movzbl rmap(%eax), %eax		;\
	movd %eax, %mm0			;\
	movzbl 5(%esi, %ecx, 4), %eax	;\
	por %mm0, %mm1			;\
	movzbl gmap(%eax), %eax		;\
	psllq $8, %mm1			;\
	movd %eax, %mm0			;\
	movzbl 4(%esi, %ecx, 4), %eax	;\
	por %mm0, %mm1			;\
	movzbl bmap(%eax), %eax		;\
	psllq $8, %mm1			;\
	movd %eax, %mm0			;\
	por %mm0, %mm1			;\
	movzbl amap_ff, %eax		;\
	psllq $8, %mm1			;\
	movd %eax, %mm0			;\
	movzbl 2(%esi, %ecx, 4), %eax	;\
	por %mm0, %mm1			;\
	movzbl rmap(%eax), %eax		;\
	psllq $8, %mm1			;\
	movd %eax, %mm0			;\
	movzbl 1(%esi, %ecx, 4), %eax	;\
	por %mm0, %mm1			;\
	movzbl gmap(%eax), %eax		;\
	psllq $8, %mm1			;\
	movd %eax, %mm0			;\
	movzbl (%esi, %ecx, 4), %eax	;\
	por %mm0, %mm1			;\
	movzbl bmap(%eax), %eax		;\
	psllq $8, %mm1			;\
	movd %eax, %mm0			;\
	por %mm0, %mm1

/*\ Load one value, colourmod it, alpha 0, and put it in %mm1
|*| Only the three colour bytes are looked up; byte 3 of the result
|*| stays zero.
\*/
#define LOAD1_CMOD_A00			\
	movzbl 2(%esi, %ecx, 4), %eax	;\
	movzbl rmap(%eax), %eax		;\
	movd %eax, %mm1			;\
	movzbl 1(%esi, %ecx, 4), %eax	;\
	psllq $8, %mm1			;\
	movzbl gmap(%eax), %eax		;\
	movd %eax, %mm0			;\
	movzbl (%esi, %ecx, 4), %eax	;\
	por %mm0, %mm1			;\
	movzbl bmap(%eax), %eax		;\
	psllq $8, %mm1			;\
	movd %eax, %mm0			;\
	por %mm0, %mm1

/*\ Load two values, colourmod them, alpha 0, and put them in %mm1
|*| The 16-bit shift in the middle skips over the (zero) alpha byte
|*| of the low pixel.
\*/
#define LOAD2_CMOD_A00			\
	movzbl 6(%esi, %ecx, 4), %eax	;\
	movzbl rmap(%eax), %eax		;\
	movd %eax, %mm1			;\
	movzbl 5(%esi, %ecx, 4), %eax	;\
	psllq $8, %mm1			;\
	movzbl gmap(%eax), %eax		;\
	movd %eax, %mm0			;\
	movzbl 4(%esi, %ecx, 4), %eax	;\
	por %mm0, %mm1			;\
	movzbl bmap(%eax), %eax		;\
	psllq $8, %mm1			;\
	movd %eax, %mm0			;\
	movzbl 2(%esi, %ecx, 4), %eax	;\
	por %mm0, %mm1			;\
	movzbl rmap(%eax), %eax		;\
	psllq $16, %mm1			;\
	movd %eax, %mm0			;\
	movzbl 1(%esi, %ecx, 4), %eax	;\
	por %mm0, %mm1			;\
	movzbl gmap(%eax), %eax		;\
	psllq $8, %mm1			;\
	movd %eax, %mm0			;\
	movzbl (%esi, %ecx, 4), %eax	;\
	por %mm0, %mm1			;\
	movzbl bmap(%eax), %eax		;\
	psllq $8, %mm1			;\
	movd %eax, %mm0			;\
	por %mm0, %mm1


/*\ Blend colourmodded source onto destination using the modded source
|*| alpha; the destination alpha byte is left as it was.
\*/
PR_(imlib_mmx_blend_rgba_to_rgb_cmod):
	ENTER

	pxor %mm4, %mm4
	LOAD_IMMQ(c1, %mm5)
	CLEANUP_IMMQ_LOADS(1)

	LOOP_START
1:
	/*\ Load source and destination \*/
	LOAD1_CMOD
	movd (%edi, %ecx, 4), %mm2

	/*\ Get alpha from source and unpack/copy to eight bytes
	|*|  which are treated as four words.
	|*| Result ranges from [0, 0x7fff), and is mapped to
	|*|  point value in [0.0, 1.0) by using the high word
	|*|  of the 16->32 multiplications.
	|*| (Because we want the unsigned value we shift one bit,
	|*|  and also shift the other factor to compensate.)
	|*| Magic to get the fourth byte: lhh
	\*/
	movq %mm1, %mm3
	punpcklbw %mm3, %mm3
	punpckhwd %mm3, %mm3
	punpckhdq %mm3, %mm3
	psrlw $1, %mm3

	/*\ Make the alpha value that gets multiplied to the
	|*|  alpha channels 0, so the resulting alpha value is
	|*|  the destination alpha value.
	\*/
	psrlq $16, %mm3

	/*\ Unpack source and destination, bytes to words \*/
	punpcklbw %mm4, %mm1
	punpcklbw %mm4, %mm2

	/*\ d = d + (a * ((s - d) + 0.5)) \*/
	psubw %mm2, %mm1
	psllw $1, %mm1
	paddw %mm5, %mm1	/*\ Roundoff \*/
	pmulhw %mm3, %mm1
	paddw %mm1, %mm2

	/*\ Pack into lower 4 bytes and save \*/
	packuswb %mm4, %mm2
	movd %mm2, (%edi, %ecx, 4)

	incl %ecx
	js 1b

	LOOP_END
	LEAVE

SIZE(imlib_mmx_blend_rgba_to_rgb_cmod)

/*\ Blend colourmodded source onto destination; the blend factor is
|*| derived from both the source and the destination alpha.
\*/
PR_(imlib_mmx_blend_rgba_to_rgba_cmod):
	ENTER

	pxor %mm4, %mm4
	LOAD_IMMQ(m0X000000, %mm5)
	LOAD_IMMQ(m00XXXXXX, %mm6)
	LOAD_IMMQ(c1, %mm7)
	CLEANUP_IMMQ_LOADS(3)

	LOOP_START
1:
	/*\ Load source and destination \*/
	LOAD1_CMOD
	movd (%edi, %ecx, 4), %mm2

	/*\ Get alpha from source and target, a = src + (255 - dest) \*/
	movq %mm2, %mm3
	pxor %mm6, %mm3
	paddusb %mm1, %mm3

	/*\ Unpack/copy to eight bytes \*/
	punpcklbw %mm3, %mm3
	punpckhwd %mm3, %mm3
	punpckhdq %mm3, %mm3
	psrlw $1, %mm3

	/*\ Unpack source and destination, bytes to words \*/
	punpcklbw %mm4, %mm1
	punpcklbw %mm4, %mm2

	/*\ Separate alpha channel \*/
	movq %mm1, %mm0
	pand %mm5, %mm0

	/*\ d = d + (a * ((s - d) + 0.5)) \*/
	psubw %mm2, %mm1
	psllw $1, %mm1
	paddw %mm7, %mm1	/*\ Roundoff \*/
	pmulhw %mm3, %mm1

	/*\ Replace alpha channel with separated out version in mm0 and add \*/
	pand %mm6, %mm1
	por %mm0, %mm1
	paddw %mm1, %mm2

	/*\ Pack into lower 4 bytes and save \*/
	packuswb %mm4, %mm2
	movd %mm2, (%edi, %ecx, 4)

	incl %ecx
	js 1b

	LOOP_END
	LEAVE

SIZE(imlib_mmx_blend_rgba_to_rgba_cmod)

/*\ Blend colourmodded source (no source alpha) onto destination with
|*| the constant factor amap(0xff); destination alpha is left alone.
\*/
PR_(imlib_mmx_blend_rgb_to_rgb_cmod):
	ENTER

	pxor %mm4, %mm4
	LOAD_IMMQ(c1, %mm5)
	CLEANUP_IMMQ_LOADS(1)

	/*\ Load alpha beforehand, as it's always amap(0xff) \*/
	movzbl amap_ff, %eax
	movd %eax, %mm3
	punpcklbw %mm3, %mm3
	punpcklwd %mm3, %mm3
	punpckldq %mm3, %mm3
	psrlw $1, %mm3
	psrlq $16, %mm3

	LOOP_START
1:
	/*\ Load source and destination \*/
	LOAD1_CMOD_A00
	movd (%edi, %ecx, 4), %mm2

	/*\ Unpack source and destination, bytes to words \*/
	punpcklbw %mm4, %mm1
	punpcklbw %mm4, %mm2

	/*\ d = d + (a * ((s - d) + 0.5)) \*/
	psubw %mm2, %mm1
	psllw $1, %mm1
	paddw %mm5, %mm1	/*\ Roundoff \*/
	pmulhw %mm3, %mm1
	paddw %mm1, %mm2

	/*\ Pack into lower 4 bytes and save \*/
	packuswb %mm4, %mm2
	movd %mm2, (%edi, %ecx, 4)

	incl %ecx
	js 1b

	LOOP_END
	LEAVE

SIZE(imlib_mmx_blend_rgb_to_rgb_cmod)

/*\ As blend_rgba_to_rgba_cmod, with source alpha fixed to amap(0xff) \*/
PR_(imlib_mmx_blend_rgb_to_rgba_cmod):
	ENTER

	pxor %mm4, %mm4
	LOAD_IMMQ(m0X000000, %mm5)
	LOAD_IMMQ(m00XXXXXX, %mm6)
	LOAD_IMMQ(c1, %mm7)
	CLEANUP_IMMQ_LOADS(3)

	LOOP_START
1:
	/*\ Load source and destination \*/
	LOAD1_CMOD_AFF
	movd (%edi, %ecx, 4), %mm2

	/*\ Get alpha from source and target, a = src + (255 - dest) \*/
	movq %mm2, %mm3
	pxor %mm6, %mm3
	paddusb %mm1, %mm3

	/*\ Unpack/copy to eight bytes \*/
	punpcklbw %mm3, %mm3
	punpckhwd %mm3, %mm3
	punpckhdq %mm3, %mm3
	psrlw $1, %mm3

	/*\ Unpack source and destination, bytes to words \*/
	punpcklbw %mm4, %mm1
	punpcklbw %mm4, %mm2

	/*\ Separate alpha channel \*/
	movq %mm1, %mm0
	pand %mm5, %mm0

	/*\ d = d + (a * ((s - d) + 0.5)) \*/
	psubw %mm2, %mm1
	psllw $1, %mm1
	paddw %mm7, %mm1	/*\ Roundoff \*/
	pmulhw %mm3, %mm1

	/*\ Replace alpha channel with separated out version in mm0 and add \*/
	pand %mm6, %mm1
	por %mm0, %mm1
	paddw %mm1, %mm2

	/*\ Pack into lower 4 bytes and save \*/
	packuswb %mm4, %mm2
	movd %mm2, (%edi, %ecx, 4)

	incl %ecx
	js 1b

	LOOP_END
	LEAVE

SIZE(imlib_mmx_blend_rgb_to_rgba_cmod)

/*\ Copy the three colourmodded colour bytes; destination byte 3 is
|*| not written.  Pure integer code, one byte at a time.
\*/
PR_(imlib_mmx_copy_rgba_to_rgb_cmod):
	ENTER

	LOOP_START
1:
	movzbl (%esi, %ecx, 4), %eax
	movzbl bmap(%eax), %eax
	movb %al, (%edi, %ecx, 4)
	movzbl 1(%esi, %ecx, 4), %eax
	movzbl gmap(%eax), %eax
	movb %al, 1(%edi, %ecx, 4)
	movzbl 2(%esi, %ecx, 4), %eax
	movzbl rmap(%eax), %eax
	movb %al, 2(%edi, %ecx, 4)

	incl %ecx
	js 1b

	LOOP_END
	LEAVE

SIZE(imlib_mmx_copy_rgba_to_rgb_cmod)

/*\ Copy all four colourmodded bytes, alpha included \*/
PR_(imlib_mmx_copy_rgba_to_rgba_cmod):
	ENTER

	LOOP_START
1:
	movzbl (%esi, %ecx, 4), %eax
	movzbl bmap(%eax), %eax
	movb %al, (%edi, %ecx, 4)
	movzbl 1(%esi, %ecx, 4), %eax
	movzbl gmap(%eax), %eax
	movb %al, 1(%edi, %ecx, 4)
	movzbl 2(%esi, %ecx, 4), %eax
	movzbl rmap(%eax), %eax
	movb %al, 2(%edi, %ecx, 4)
	movzbl 3(%esi, %ecx, 4), %eax
	movzbl amap(%eax), %eax
	movb %al, 3(%edi, %ecx, 4)

	incl %ecx
	js 1b

	LOOP_END
	LEAVE

SIZE(imlib_mmx_copy_rgba_to_rgba_cmod)

/*\ Copy the three colourmodded colour bytes and set destination
|*| alpha to 0xff.
|*| NOTE(review): the alpha written here is the literal 0xff, while
|*|  the other rgb-source routines in this file use amap(0xff)
|*|  (amap_ff) - confirm which one is intended.
\*/
PR_(imlib_mmx_copy_rgb_to_rgba_cmod):
	ENTER

	LOOP_START
1:
	movzbl (%esi, %ecx, 4), %eax
	movzbl bmap(%eax), %eax
	movb %al, (%edi, %ecx, 4)
	movzbl 1(%esi, %ecx, 4), %eax
	movzbl gmap(%eax), %eax
	movb %al, 1(%edi, %ecx, 4)
	movzbl 2(%esi, %ecx, 4), %eax
	movzbl rmap(%eax), %eax
	movb %al, 2(%edi, %ecx, 4)
	movb $0xff, 3(%edi, %ecx, 4)

	incl %ecx
	js 1b

	LOOP_END
	LEAVE

SIZE(imlib_mmx_copy_rgb_to_rgba_cmod)

/*\ Additive blend, d = d + a * s, destination alpha left alone \*/
PR_(imlib_mmx_add_blend_rgba_to_rgb_cmod):
	ENTER

	pxor %mm4, %mm4

	LOOP_START
1:
	/*\ Load source and destination \*/
	LOAD1_CMOD
	movd (%edi, %ecx, 4), %mm2

	/*\ Get alpha from source and unpack/copy to eight bytes \*/
	movq %mm1, %mm3
	punpcklbw %mm3, %mm3
	punpckhwd %mm3, %mm3
	punpckhdq %mm3, %mm3
	psrlw $1, %mm3

	psrlq $16, %mm3

	/*\ Unpack source and destination, bytes to words \*/
	punpcklbw %mm4, %mm1
	punpcklbw %mm4, %mm2

	/*\ d = d + (a * s) \*/
	psllw $1, %mm1
	pmulhw %mm3, %mm1
	paddw %mm1, %mm2

	/*\ Pack into lower 4 bytes and save \*/
	packuswb %mm4, %mm2
	movd %mm2, (%edi, %ecx, 4)

	incl %ecx
	js 1b

	LOOP_END
	LEAVE

SIZE(imlib_mmx_add_blend_rgba_to_rgb_cmod)

/*\ Additive blend, alpha channels are summed \*/
PR_(imlib_mmx_add_blend_rgba_to_rgba_cmod):
	ENTER

	pxor %mm4, %mm4
	LOAD_IMMQ(mVX000000, %mm5)
	LOAD_IMMQ(m00XXXXXX, %mm6)
	CLEANUP_IMMQ_LOADS(2)

	LOOP_START
1:
	/*\ Load source and destination \*/
	LOAD1_CMOD
	movd (%edi, %ecx, 4), %mm2

	/*\ Get alpha from source and target and unpack/copy to eight bytes \*/
	movq %mm2, %mm3
	pxor %mm6, %mm3
	paddusb %mm1, %mm3
	punpcklbw %mm3, %mm3
	punpckhwd %mm3, %mm3
	punpckhdq %mm3, %mm3
	psrlw $1, %mm3

	/*\ Make the alpha value that gets multiplied to the
	|*|  alpha channels 0x7fff, so the resulting alpha value is
	|*|  the sum of the source and destination alpha values.
	\*/
	por %mm5, %mm3

	/*\ Unpack source and destination, bytes to words \*/
	punpcklbw %mm4, %mm1
	punpcklbw %mm4, %mm2

	/*\ d = d + (a * s) \*/
	psllw $1, %mm1
	pmulhw %mm3, %mm1
	paddw %mm1, %mm2

	/*\ Pack into lower 4 bytes and save \*/
	packuswb %mm4, %mm2
	movd %mm2, (%edi, %ecx, 4)

	incl %ecx
	js 1b

	LOOP_END
	LEAVE

SIZE(imlib_mmx_add_blend_rgba_to_rgba_cmod)

/*\ Additive blend with the constant factor amap(0xff) \*/
PR_(imlib_mmx_add_blend_rgb_to_rgb_cmod):
	ENTER

	pxor %mm4, %mm4

	/*\ Load alpha beforehand, as it's always amap(0xff) \*/
	movzbl amap_ff, %eax
	movd %eax, %mm3
	punpcklbw %mm3, %mm3
	punpcklwd %mm3, %mm3
	punpckldq %mm3, %mm3
	psrlw $1, %mm3
	psrlq $16, %mm3

	LOOP_START
1:
	/*\ Load source and destination \*/
	LOAD1_CMOD_A00
	movd (%edi, %ecx, 4), %mm2

	/*\ Unpack source and destination, bytes to words \*/
	punpcklbw %mm4, %mm1
	punpcklbw %mm4, %mm2

	/*\ d = d + (a * s) \*/
	psllw $1, %mm1
	pmulhw %mm3, %mm1
	paddw %mm1, %mm2

	/*\ Pack into lower 4 bytes and save \*/
	packuswb %mm4, %mm2
	movd %mm2, (%edi, %ecx, 4)

	incl %ecx
	js 1b

	LOOP_END
	LEAVE

SIZE(imlib_mmx_add_blend_rgb_to_rgb_cmod)

/*\ As add_blend_rgba_to_rgba_cmod, with source alpha amap(0xff) \*/
PR_(imlib_mmx_add_blend_rgb_to_rgba_cmod):
	ENTER

	pxor %mm4, %mm4
	LOAD_IMMQ(mVX000000, %mm5)
	LOAD_IMMQ(m00XXXXXX, %mm6)
	CLEANUP_IMMQ_LOADS(2)

	LOOP_START
1:
	/*\ Load source and destination \*/
	LOAD1_CMOD_AFF
	movd (%edi, %ecx, 4), %mm2

	/*\ Get alpha from source and target and unpack/copy to eight bytes \*/
	movq %mm2, %mm3
	pxor %mm6, %mm3
	paddusb %mm1, %mm3
	punpcklbw %mm3, %mm3
	punpckhwd %mm3, %mm3
	punpckhdq %mm3, %mm3
	psrlw $1, %mm3

	/*\ Make the alpha value that gets multiplied to the
	|*|  alpha channels 0x7fff, so the resulting alpha value is
	|*|  the sum of the source and destination alpha values.
	\*/
	por %mm5, %mm3

	/*\ Unpack source and destination, bytes to words \*/
	punpcklbw %mm4, %mm1
	punpcklbw %mm4, %mm2

	/*\ d = d + (a * s) \*/
	psllw $1, %mm1
	pmulhw %mm3, %mm1
	paddw %mm1, %mm2

	/*\ Pack into lower 4 bytes and save \*/
	packuswb %mm4, %mm2
	movd %mm2, (%edi, %ecx, 4)

	incl %ecx
	js 1b

	LOOP_END
	LEAVE

SIZE(imlib_mmx_add_blend_rgb_to_rgba_cmod)

/*\ Saturating add of the colour bytes, two pixels per iteration.
|*| Both pointers are moved back one pixel so that %ecx runs from
|*| -(w - 1) upwards: the pair loop (label 1) handles pixels two at a
|*| time, and the single-pixel code (label 2) handles the last pixel
|*| of the row when %ecx lands exactly on 0.  Label 3 is the row end.
\*/
PR_(imlib_mmx_add_copy_rgba_to_rgb_cmod):
	ENTER

	LOAD_IMMQ(m0XXX0XXX, %mm5)
	CLEANUP_IMMQ_LOADS(1)

	subl $4, %esi
	subl $4, %edi

	LOOP_START
	incl %ecx
	jz 2f
1:
	/*\ Load source and destination \*/
	LOAD2_CMOD
	movq (%edi, %ecx, 4), %mm2
	/*\ Clear alpha channel of source \*/
	pand %mm5, %mm1
	/*\ d = d + s, unsigned saturation, and save \*/
	paddusb %mm1, %mm2
	movq %mm2, (%edi, %ecx, 4)
	addl $2, %ecx
	js 1b
	jnz 3f
2:
	LOAD1_CMOD
	movd (%edi), %mm2
	pand %mm5, %mm1
	paddusb %mm1, %mm2
	movd %mm2, (%edi)
3:
	LOOP_END
	LEAVE

SIZE(imlib_mmx_add_copy_rgba_to_rgb_cmod)

/*\ Saturating add of all four bytes, alpha included \*/
PR_(imlib_mmx_add_copy_rgba_to_rgba_cmod):
	ENTER

	subl $4, %esi
	subl $4, %edi

	LOOP_START
	incl %ecx
	jz 2f
1:
	/*\ Load source and destination \*/
	LOAD2_CMOD
	movq (%edi, %ecx, 4), %mm2
	/*\ d = d + s, unsigned saturation, and save \*/
	paddusb %mm1, %mm2
	movq %mm2, (%edi, %ecx, 4)
	addl $2, %ecx
	js 1b
	jnz 3f
2:
	LOAD1_CMOD
	movd (%edi), %mm2
	paddusb %mm1, %mm2
	movd %mm2, (%edi)
3:
	LOOP_END
	LEAVE

SIZE(imlib_mmx_add_copy_rgba_to_rgba_cmod)

/*\ Saturating add of all four bytes, source alpha is amap(0xff) \*/
PR_(imlib_mmx_add_copy_rgb_to_rgba_cmod):
	ENTER

	subl $4, %esi
	subl $4, %edi

	LOOP_START
	incl %ecx
	jz 2f
1:
	/*\ Load source and destination \*/
	LOAD2_CMOD_AFF
	movq (%edi, %ecx, 4), %mm2
	/*\ d = d + s, unsigned saturation, and save \*/
	paddusb %mm1, %mm2
	movq %mm2, (%edi, %ecx, 4)
	addl $2, %ecx
	js 1b
	jnz 3f
2:
	LOAD1_CMOD_AFF
	movd (%edi), %mm2
	paddusb %mm1, %mm2
	movd %mm2, (%edi)
3:
	LOOP_END
	LEAVE

SIZE(imlib_mmx_add_copy_rgb_to_rgba_cmod)

/*\ Subtractive blend, d = d - a * s, destination alpha left alone \*/
PR_(imlib_mmx_subtract_blend_rgba_to_rgb_cmod):
	ENTER

	pxor %mm4, %mm4

	LOOP_START
1:
	/*\ Load source and destination \*/
	LOAD1_CMOD
	movd (%edi, %ecx, 4), %mm2

	/*\ Get alpha from source and unpack/copy to eight bytes \*/
	movq %mm1, %mm3
	punpcklbw %mm3, %mm3
	punpckhwd %mm3, %mm3
	punpckhdq %mm3, %mm3
	psrlw $1, %mm3

	psrlq $16, %mm3

	/*\ Unpack source and destination, bytes to words \*/
	punpcklbw %mm4, %mm1
	punpcklbw %mm4, %mm2

	/*\ d = d - (a * s) \*/
	psllw $1, %mm1
	pmulhw %mm3, %mm1
	psubw %mm1, %mm2

	/*\ Pack into lower 4 bytes and save \*/
	packuswb %mm4, %mm2
	movd %mm2, (%edi, %ecx, 4)

	incl %ecx
	js 1b

	LOOP_END
	LEAVE

SIZE(imlib_mmx_subtract_blend_rgba_to_rgb_cmod)

/*\ Subtractive blend of the colours, alpha channels are summed \*/
PR_(imlib_mmx_subtract_blend_rgba_to_rgba_cmod):
	ENTER

	pxor %mm4, %mm4
	LOAD_IMMQ(mV0000000, %mm5)
	LOAD_IMMQ(m00XXXXXX, %mm6)
	CLEANUP_IMMQ_LOADS(2)

	LOOP_START
1:
	/*\ Load source and destination \*/
	LOAD1_CMOD
	movd (%edi, %ecx, 4), %mm2

	/*\ Get alpha from source and target and unpack/copy to eight bytes \*/
	movq %mm2, %mm3
	pxor %mm6, %mm3
	paddusb %mm1, %mm3
	punpcklbw %mm3, %mm3
	punpckhwd %mm3, %mm3
	punpckhdq %mm3, %mm3
	psrlw $1, %mm3

	/*\ Make alpha value that gets multiplied with alpha channel
	|*|  0x8000, (-1.0), so that the alpha result is s + d
	\*/
	psrlq $16, %mm3
	por %mm5, %mm3

	/*\ Unpack source and destination, bytes to words \*/
	punpcklbw %mm4, %mm1
	punpcklbw %mm4, %mm2

	/*\ d = d - (a * s) \*/
	psllw $1, %mm1
	pmulhw %mm3, %mm1
	psubw %mm1, %mm2

	/*\ Pack into lower 4 bytes and save \*/
	packuswb %mm4, %mm2
	movd %mm2, (%edi, %ecx, 4)

	incl %ecx
	js 1b

	LOOP_END
	LEAVE

SIZE(imlib_mmx_subtract_blend_rgba_to_rgba_cmod)

/*\ Subtractive blend with the constant factor amap(0xff) \*/
PR_(imlib_mmx_subtract_blend_rgb_to_rgb_cmod):
	ENTER

	pxor %mm4, %mm4

	/*\ Load alpha beforehand, as it's always amap(0xff) \*/
	movzbl amap_ff, %eax
	movd %eax, %mm3
	punpcklbw %mm3, %mm3
	punpcklwd %mm3, %mm3
	punpckldq %mm3, %mm3
	psrlw $1, %mm3
	psrlq $16, %mm3

	LOOP_START
1:
	/*\ Load source and destination \*/
	LOAD1_CMOD_A00
	movd (%edi, %ecx, 4), %mm2

	/*\ Unpack source and destination, bytes to words \*/
	punpcklbw %mm4, %mm1
	punpcklbw %mm4, %mm2

	/*\ d = d - (a * s) \*/
	psllw $1, %mm1
	pmulhw %mm3, %mm1
	psubw %mm1, %mm2

	/*\ Pack into lower 4 bytes and save \*/
	packuswb %mm4, %mm2
	movd %mm2, (%edi, %ecx, 4)

	incl %ecx
	js 1b

	LOOP_END
	LEAVE

SIZE(imlib_mmx_subtract_blend_rgb_to_rgb_cmod)

/*\ As subtract_blend_rgba_to_rgba_cmod, source alpha amap(0xff) \*/
PR_(imlib_mmx_subtract_blend_rgb_to_rgba_cmod):
	ENTER

	pxor %mm4, %mm4
	LOAD_IMMQ(mV0000000, %mm5)
	LOAD_IMMQ(m00XXXXXX, %mm6)
	CLEANUP_IMMQ_LOADS(2)

	LOOP_START
1:
	/*\ Load source and destination \*/
	LOAD1_CMOD_AFF
	movd (%edi, %ecx, 4), %mm2

	/*\ Get alpha from source and target and unpack/copy to eight bytes \*/
	movq %mm2, %mm3
	pxor %mm6, %mm3
	paddusb %mm1, %mm3
	punpcklbw %mm3, %mm3
	punpckhwd %mm3, %mm3
	punpckhdq %mm3, %mm3
	psrlw $1, %mm3

	/*\ Make alpha value that gets multiplied with alpha channel
	|*|  0x8000, (-1.0), so that the alpha result is s + d
	\*/
	psrlq $16, %mm3
	por %mm5, %mm3

	/*\ Unpack source and destination, bytes to words \*/
	punpcklbw %mm4, %mm1
	punpcklbw %mm4, %mm2

	/*\ d = d - (a * s) \*/
	psllw $1, %mm1
	pmulhw %mm3, %mm1
	psubw %mm1, %mm2

	/*\ Pack into lower 4 bytes and save \*/
	packuswb %mm4, %mm2
	movd %mm2, (%edi, %ecx, 4)

	incl %ecx
	js 1b

	LOOP_END
	LEAVE

SIZE(imlib_mmx_subtract_blend_rgb_to_rgba_cmod)

/*\ Saturating subtract of the colour bytes, two pixels per iteration
|*| (same pair loop / single-pixel tail layout as the add_copy code).
\*/
PR_(imlib_mmx_subtract_copy_rgba_to_rgb_cmod):
	ENTER

	LOAD_IMMQ(m0XXX0XXX, %mm5)
	CLEANUP_IMMQ_LOADS(1)

	subl $4, %esi
	subl $4, %edi

	LOOP_START
	incl %ecx
	jz 2f
1:
	/*\ Load source and destination \*/
	LOAD2_CMOD
	movq (%edi, %ecx, 4), %mm2
	/*\ Clear alpha channel of source \*/
	pand %mm5, %mm1
	/*\ d = d - s, unsigned saturation, and save \*/
	psubusb %mm1, %mm2
	movq %mm2, (%edi, %ecx, 4)
	addl $2, %ecx
	js 1b
	jnz 3f
2:
	LOAD1_CMOD
	movd (%edi), %mm2
	pand %mm5, %mm1
	psubusb %mm1, %mm2
	movd %mm2, (%edi)
3:
	LOOP_END
	LEAVE

SIZE(imlib_mmx_subtract_copy_rgba_to_rgb_cmod)

/*\ Saturating subtract of the colours; the alpha bytes are inverted
|*| around the subtraction, which turns it into a saturating add for
|*| that channel.
\*/
PR_(imlib_mmx_subtract_copy_rgba_to_rgba_cmod):
	ENTER

	LOAD_IMMQ(mX000X000, %mm5)
	CLEANUP_IMMQ_LOADS(1)

	subl $4, %esi
	subl $4, %edi

	LOOP_START
	incl %ecx
	jz 2f
1:
	/*\ Load source and destination \*/
	LOAD2_CMOD
	movq (%edi, %ecx, 4), %mm2
	/*\ Negate destination alphas \*/
	pxor %mm5, %mm2
	/*\ d = d - s, unsigned saturation, and save \*/
	psubusb %mm1, %mm2
	/*\ Negate result alphas \*/
	pxor %mm5, %mm2
	movq %mm2, (%edi, %ecx, 4)
	addl $2, %ecx
	js 1b
	jnz 3f
2:
	LOAD1_CMOD
	movd (%edi), %mm2
	pxor %mm5, %mm2
	psubusb %mm1, %mm2
	pxor %mm5, %mm2
	movd %mm2, (%edi)
3:
	LOOP_END
	LEAVE

SIZE(imlib_mmx_subtract_copy_rgba_to_rgba_cmod)

/*\ As subtract_copy_rgba_to_rgba_cmod, source alpha amap(0xff) \*/
PR_(imlib_mmx_subtract_copy_rgb_to_rgba_cmod):
	ENTER

	LOAD_IMMQ(mX000X000, %mm5)
	CLEANUP_IMMQ_LOADS(1)

	subl $4, %esi
	subl $4, %edi

	LOOP_START
	incl %ecx
	jz 2f
1:
	/*\ Load source and destination \*/
	LOAD2_CMOD_AFF
	movq (%edi, %ecx, 4), %mm2
	pxor %mm5, %mm2
	/*\ d = d - s, unsigned saturation, and save \*/
	psubusb %mm1, %mm2
	pxor %mm5, %mm2
	movq %mm2, (%edi, %ecx, 4)
	addl $2, %ecx
	js 1b
	jnz 3f
2:
	LOAD1_CMOD_AFF
	movd (%edi), %mm2
	pxor %mm5, %mm2
	psubusb %mm1, %mm2
	pxor %mm5, %mm2
	movd %mm2, (%edi)
3:
	LOOP_END
	LEAVE

SIZE(imlib_mmx_subtract_copy_rgb_to_rgba_cmod)

/*\ Reshade blend, d = d + 2 * a * (s - 127), destination alpha left
|*| alone.
\*/
PR_(imlib_mmx_reshade_blend_rgba_to_rgb_cmod):
	ENTER

	pxor %mm4, %mm4
	LOAD_IMMQ(m000V0V0V, %mm6)
	CLEANUP_IMMQ_LOADS(1)

	LOOP_START
1:
	/*\ Load source and destination \*/
	LOAD1_CMOD
	movd (%edi, %ecx, 4), %mm2

	/*\ Get alpha from source and unpack/copy to eight bytes \*/
	movq %mm1, %mm3
	punpcklbw %mm3, %mm3
	punpckhwd %mm3, %mm3
	punpckhdq %mm3, %mm3
	psrlw $1, %mm3

	psrlq $16, %mm3

	/*\ Unpack source and destination, bytes to words \*/
	punpcklbw %mm4, %mm1
	punpcklbw %mm4, %mm2

	/*\ d = d + (2 * a * (s - 127)) \*/
	psubw %mm6, %mm1
	psllw $2, %mm1
	pmulhw %mm3, %mm1
	paddw %mm1, %mm2

	/*\ Pack into lower 4 bytes and save \*/
	packuswb %mm4, %mm2
	movd %mm2, (%edi, %ecx, 4)

	incl %ecx
	js 1b

	LOOP_END
	LEAVE

SIZE(imlib_mmx_reshade_blend_rgba_to_rgb_cmod)

/*\ Reshade blend of the colours, alpha channels are summed \*/
PR_(imlib_mmx_reshade_blend_rgba_to_rgba_cmod):
	ENTER

	pxor %mm4, %mm4
	LOAD_IMMQ(mI0000000, %mm5)
	LOAD_IMMQ(m000V0V0V, %mm6)
	LOAD_IMMQ(m00XXXXXX, %mm7)
	CLEANUP_IMMQ_LOADS(3)

	LOOP_START
1:
	/*\ Load source and destination \*/
	LOAD1_CMOD
	movd (%edi, %ecx, 4), %mm2

	/*\ Get alpha from source and target and unpack/copy to eight bytes \*/
	movq %mm2, %mm3
	pxor %mm7, %mm3
	paddusb %mm1, %mm3
	punpcklbw %mm3, %mm3
	punpckhwd %mm3, %mm3
	punpckhdq %mm3, %mm3
	psrlw $1, %mm3

	/*\ Make the alpha value that gets multiplied to the
	|*|  alpha channels 0x4000 (0.5), so the resulting alpha value is
	|*|  the sum of the source and destination alpha values.
	\*/
	psrlq $16, %mm3
	por %mm5, %mm3

	/*\ Unpack source and destination, bytes to words \*/
	punpcklbw %mm4, %mm1
	punpcklbw %mm4, %mm2

	/*\ d = d + (2 * a * (s - 127)), (alpha channel: d = d + (2 * 0.5 * (s - 0)) ) \*/
	psubw %mm6, %mm1
	psllw $2, %mm1
	pmulhw %mm3, %mm1
	paddw %mm1, %mm2

	/*\ Pack into lower 4 bytes and save \*/
	packuswb %mm4, %mm2
	movd %mm2, (%edi, %ecx, 4)

	incl %ecx
	js 1b

	LOOP_END
	LEAVE

SIZE(imlib_mmx_reshade_blend_rgba_to_rgba_cmod)

/*\ Reshade blend with the constant factor amap(0xff) \*/
PR_(imlib_mmx_reshade_blend_rgb_to_rgb_cmod):
	ENTER

	pxor %mm4, %mm4
	LOAD_IMMQ(m000V0V0V, %mm6)
	CLEANUP_IMMQ_LOADS(1)

	/*\ Load alpha beforehand, as it's always amap(0xff) \*/
	movzbl amap_ff, %eax
	movd %eax, %mm3
	punpcklbw %mm3, %mm3
	punpcklwd %mm3, %mm3
	punpckldq %mm3, %mm3
	psrlw $1, %mm3
	psrlq $16, %mm3

	LOOP_START
1:
	/*\ Load source and destination \*/
	LOAD1_CMOD_A00
	movd (%edi, %ecx, 4), %mm2

	/*\ Unpack source and destination, bytes to words \*/
	punpcklbw %mm4, %mm1
	punpcklbw %mm4, %mm2

	/*\ d = d + (2 * a * (s - 127)) \*/
	psubw %mm6, %mm1
	psllw $2, %mm1
	pmulhw %mm3, %mm1
	paddw %mm1, %mm2

	/*\ Pack into lower 4 bytes and save \*/
	packuswb %mm4, %mm2
	movd %mm2, (%edi, %ecx, 4)

	incl %ecx
	js 1b

	LOOP_END
	LEAVE

SIZE(imlib_mmx_reshade_blend_rgb_to_rgb_cmod)

/*\ As reshade_blend_rgba_to_rgba_cmod, source alpha amap(0xff) \*/
PR_(imlib_mmx_reshade_blend_rgb_to_rgba_cmod):
	ENTER

	pxor %mm4, %mm4
	LOAD_IMMQ(mI0000000, %mm5)
	LOAD_IMMQ(m000V0V0V, %mm6)
	LOAD_IMMQ(m00XXXXXX, %mm7)
	CLEANUP_IMMQ_LOADS(3)

	LOOP_START
1:
	/*\ Load source and destination \*/
	LOAD1_CMOD_AFF
	movd (%edi, %ecx, 4), %mm2

	/*\ Get alpha from source and target and unpack/copy to eight bytes \*/
	movq %mm2, %mm3
	pxor %mm7, %mm3
	paddusb %mm1, %mm3
	punpcklbw %mm3, %mm3
	punpckhwd %mm3, %mm3
	punpckhdq %mm3, %mm3
	psrlw $1, %mm3

	/*\ Make the alpha value that gets multiplied to the
	|*|  alpha channels 0x4000 (0.5), so the resulting alpha value is
	|*|  the sum of the source and destination alpha values.
	\*/
	psrlq $16, %mm3
	por %mm5, %mm3

	/*\ Unpack source and destination, bytes to words \*/
	punpcklbw %mm4, %mm1
	punpcklbw %mm4, %mm2

	/*\ d = d + (2 * a * (s - 127)), (alpha channel: d = d + (2 * 0.5 * (s - 0)) ) \*/
	psubw %mm6, %mm1
	psllw $2, %mm1
	pmulhw %mm3, %mm1
	paddw %mm1, %mm2

	/*\ Pack into lower 4 bytes and save \*/
	packuswb %mm4, %mm2
	movd %mm2, (%edi, %ecx, 4)

	incl %ecx
	js 1b

	LOOP_END
	LEAVE

SIZE(imlib_mmx_reshade_blend_rgb_to_rgba_cmod)

/*\ Reshade copy of the colour bytes, two pixels per iteration (same
|*| pair loop / single-pixel tail layout as the add_copy code).
\*/
PR_(imlib_mmx_reshade_copy_rgba_to_rgb_cmod):
	ENTER

	pxor %mm4, %mm4
	LOAD_IMMQ(m0XXX0XXX, %mm5)
	LOAD_IMMQ(m0VVV0VVV, %mm6)
	CLEANUP_IMMQ_LOADS(2)

	subl $4, %esi
	subl $4, %edi

	LOOP_START
	incl %ecx
	jz 2f
1:
	/*\ Load source and destination \*/
	LOAD2_CMOD
	movq (%edi, %ecx, 4), %mm2

	/*\ To take advantage of saturation and be able to do 8 bytes
	|*|  at a time, we divide reshading into two separate steps:
	|*|  adding values above 128, and subtracting values below 128
	|*| These values go into %mm1 and %mm3 respectively
	|*| - %mm1 becomes (2 * (s - 127))
	|*| - %mm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s))
	\*/
	movq %mm1, %mm3
	psubusb %mm6, %mm1
	paddusb %mm1, %mm1
	paddusb %mm6, %mm3
	pxor %mm5, %mm3
	paddusb %mm3, %mm3

	/*\ Clear alpha channel of s1 and s2 \*/
	pand %mm5, %mm1
	pand %mm5, %mm3

	/*\ d = d + s1 - s2, unsigned saturation, and save \*/
	paddusb %mm1, %mm2
	psubusb %mm3, %mm2
	movq %mm2, (%edi, %ecx, 4)
	addl $2, %ecx
	js 1b
	jnz 3f
2:
	LOAD1_CMOD
	movd (%edi), %mm2

	movq %mm1, %mm3
	psubusb %mm6, %mm1
	paddusb %mm1, %mm1
	paddusb %mm6, %mm3
	pxor %mm5, %mm3
	paddusb %mm3, %mm3

	pand %mm5, %mm1
	pand %mm5, %mm3

	paddusb %mm1, %mm2
	psubusb %mm3, %mm2
	movd %mm2, (%edi)
3:
	LOOP_END
	LEAVE

SIZE(imlib_mmx_reshade_copy_rgba_to_rgb_cmod)

/*\ Reshade copy of the colours, alpha channels are added \*/
PR_(imlib_mmx_reshade_copy_rgba_to_rgba_cmod):
	ENTER

	pxor %mm4, %mm4
	LOAD_IMMQ(m0XXX0XXX, %mm5)
	LOAD_IMMQ(m0VVV0VVV, %mm6)
	CLEANUP_IMMQ_LOADS(2)

	subl $4, %esi
	subl $4, %edi

	LOOP_START
	incl %ecx
	jz 2f
1:
	/*\ Load source and destination \*/
	LOAD2_CMOD
	movq (%edi, %ecx, 4), %mm2

	/*\ This time, the alpha channels have to be added.
	|*| For that, the alpha channel of %mm1 should remain
	|*|  the same.  This is done by subtracting 0 from the
	|*|  alpha channel, and then doing the *2 via a separate
	|*|  register, clearing its alpha channel first.
	\*/
	movq %mm1, %mm3
	psubusb %mm6, %mm1
	movq %mm1, %mm0
	pand %mm5, %mm0
	paddusb %mm0, %mm1
	paddusb %mm6, %mm3
	pxor %mm5, %mm3
	paddusb %mm3, %mm3

	/*\ Clear alpha channel of s2 \*/
	pand %mm5, %mm3

	/*\ d = d + s1 - s2, unsigned saturation, and save \*/
	paddusb %mm1, %mm2
	psubusb %mm3, %mm2
	movq %mm2, (%edi, %ecx, 4)
	addl $2, %ecx
	js 1b
	jnz 3f
2:
	LOAD1_CMOD
	movd (%edi), %mm2

	movq %mm1, %mm3
	psubusb %mm6, %mm1
	movq %mm1, %mm0
	pand %mm5, %mm0
	paddusb %mm0, %mm1
	paddusb %mm6, %mm3
	pxor %mm5, %mm3
	paddusb %mm3, %mm3

	pand %mm5, %mm3

	paddusb %mm1, %mm2
	psubusb %mm3, %mm2
	movd %mm2, (%edi)
3:
	LOOP_END
	LEAVE

SIZE(imlib_mmx_reshade_copy_rgba_to_rgba_cmod)

/*\ As reshade_copy_rgba_to_rgba_cmod, source alpha amap(0xff) \*/
PR_(imlib_mmx_reshade_copy_rgb_to_rgba_cmod):
	ENTER

	pxor %mm4, %mm4
	LOAD_IMMQ(m0XXX0XXX, %mm5)
	LOAD_IMMQ(m0VVV0VVV, %mm6)
	CLEANUP_IMMQ_LOADS(2)

	subl $4, %esi
	subl $4, %edi

	LOOP_START
	incl %ecx
	jz 2f
1:
	/*\ Load source and destination \*/
	LOAD2_CMOD_AFF
	movq (%edi, %ecx, 4), %mm2

	movq %mm1, %mm3
	psubusb %mm6, %mm1
	movq %mm1, %mm0
	pand %mm5, %mm0
	paddusb %mm0, %mm1
	paddusb %mm6, %mm3
	pxor %mm5, %mm3
	paddusb %mm3, %mm3

	/*\ Clear alpha channel of s2 \*/
	pand %mm5, %mm3

	/*\ d = d + s1 - s2, unsigned saturation, and save \*/
	paddusb %mm1, %mm2
	psubusb %mm3, %mm2
	movq %mm2, (%edi, %ecx, 4)
	addl $2, %ecx
	js 1b
	jnz 3f
2:
	LOAD1_CMOD_AFF
	movd (%edi), %mm2

	movq %mm1, %mm3
	psubusb %mm6, %mm1
	movq %mm1, %mm0
	pand %mm5, %mm0
	paddusb %mm0, %mm1
	paddusb %mm6, %mm3
	pxor %mm5, %mm3
	paddusb %mm3, %mm3

	pand %mm5, %mm3

	paddusb %mm1, %mm2
	psubusb %mm3, %mm2
	movd %mm2, (%edi)
3:
	LOOP_END
	LEAVE

SIZE(imlib_mmx_reshade_copy_rgb_to_rgba_cmod)

#endif

#ifdef __ELF__
.section .note.GNU-stack,"",@progbits
#endif