#include #include "asm.h" #ifdef DO_AMD64_ASM /*\ |*| AMD64 SSE2 assembly blending routines for Imlib2 |*| Written by John Slaten |*| Based on MMX routines written by Willem Monsuwe \*/ /*\ Some useful masks \*/ .data .align 16 m0X000000: .byte 0, 0, 0, 0, 0, 0, 255, 0 .byte 0, 0, 0, 0, 0, 0, 255, 0 m10000000: .byte 0, 0, 0, 0, 0, 0, 0, 1 .byte 0, 0, 0, 0, 0, 0, 0, 1 m00XXXXXX: .byte 255, 255, 255, 255, 255, 255, 0, 0 .byte 255, 255, 255, 255, 255, 255, 0, 0 mVX000000: .byte 0, 0, 0, 0, 0, 0, 255, 127 .byte 0, 0, 0, 0, 0, 0, 255, 127 mV0000000: .byte 0, 0, 0, 0, 0, 0, 0, 128 .byte 0, 0, 0, 0, 0, 0, 0, 128 mX000X000: .byte 0, 0, 0, 0, 0, 0, 255, 255 .byte 0, 0, 0, 0, 0, 0, 255, 255 m0XXX0XXX0XXX0XXX: .byte 255, 255, 255, 0, 255, 255, 255, 0 .byte 255, 255, 255, 0, 255, 255, 255, 0 m0XXX0XXX00000000: .byte 255, 255, 255, 0, 255, 255, 255, 0 .byte 0, 0, 0, 0, 0, 0, 0, 0 m0XXX000000000000: .byte 255, 255, 255, 0, 0, 0, 0, 0 .byte 0, 0, 0, 0, 0, 0, 0, 0 mX000X000X000X000: .byte 0, 0, 0, 255, 0, 0, 0, 255 .byte 0, 0, 0, 255, 0, 0, 0, 255 mX000X00000000000: .byte 0, 0, 0, 255, 0, 0, 0, 255 .byte 0, 0, 0, 255, 0, 0, 0, 255 mX000000000000000: .byte 0, 0, 0, 255, 0, 0, 0, 255 .byte 0, 0, 0, 255, 0, 0, 0, 255 m1000100010001000: .byte 0, 0, 0, 1, 0, 0, 0, 1 .byte 0, 0, 0, 1, 0, 0, 0, 1 m000V0V0V000V0V0V: .byte 127, 0, 127, 0, 127, 0, 0, 0 .byte 127, 0, 127, 0, 127, 0, 0, 0 mI0000000I0000000: .byte 0, 0, 0, 0, 0, 0, 0, 64 .byte 0, 0, 0, 0, 0, 0, 0, 64 m0VVV0VVV0VVV0VVV: .byte 127, 127, 127, 0, 127, 127, 127, 0 .byte 127, 127, 127, 0, 127, 127, 127, 0 c1: .word 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1 /*\ All functions have the same calling convention: |*| __imlib_amd64__rgba_to_rgb[A](void *src, int sw, void *dst, int dw, |*| int w, int h, ImlibColorModifier *cm) |*| AMD64 GCC passes paramters by register, so no aliases exist in this version. \*/ .text .align 16 FN_(imlib_amd64_blend_rgba_to_rgb_cmod) FN_(imlib_amd64_blend_rgba_to_rgba_cmod) FN_(imlib_amd64_blend_rgb_to_rgba_cmod) FN_(imlib_amd64_blend_rgb_to_rgb_cmod) FN_(imlib_amd64_copy_rgba_to_rgb_cmod) FN_(imlib_amd64_copy_rgba_to_rgba_cmod) FN_(imlib_amd64_copy_rgb_to_rgba_cmod) FN_(imlib_amd64_add_blend_rgba_to_rgb_cmod) FN_(imlib_amd64_add_blend_rgba_to_rgba_cmod) FN_(imlib_amd64_add_blend_rgb_to_rgba_cmod) FN_(imlib_amd64_add_blend_rgb_to_rgb_cmod) FN_(imlib_amd64_add_copy_rgba_to_rgb_cmod) FN_(imlib_amd64_add_copy_rgba_to_rgba_cmod) FN_(imlib_amd64_add_copy_rgb_to_rgba_cmod) FN_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod) FN_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod) FN_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod) FN_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod) FN_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod) FN_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod) FN_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod) FN_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod) FN_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod) FN_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod) FN_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod) FN_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod) FN_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod) FN_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod) .extern pow_lut /*\ SSE register use: |*| %xmm1 = Source value |*| %xmm2 = Destination value |*| %xmm3 = Alpha value |*| %xmm4 = 0 |*| %xmm5-%xmm7 = masks \*/ /*\ Variables: |*| %rsi = src |*| %rdi = dst |*| %r8d = w |*| %r9d = h |*| %r10d = sw |*| %r11d = dw \*/ #define ENTER \ pushq %rbp ; \ movq %rsp, %rbp ; \ pushq %rbx ; \ pushq %r13 ; \ pushq %r14 ; \ movq %rsi, %r10 ; \ movq %rcx, %r11 ; \ movq %rdi, %rsi ; \ movq %rdx, %rdi ; \ movq 16(%rbp), %r14 ; \ ; \ /* param sanity check */ ; \ testq %r8, %r8 ; \ jz 9f ; \ testq %r9, %r9 ; \ jz 9f #define LEAVE \ popq %r14 ; \ popq %r13 ; \ popq %rbx ; \ movq %rbp, %rsp ; \ popq %rbp ; \ ret PR_(imlib_amd64_blend_rgba_to_rgb_cmod): ENTER pxor %xmm4, %xmm4 movdqa c1(%rip), %xmm5 movdqa m00XXXXXX(%rip), %xmm6 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax ror $24, %eax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx rol $8, %eax movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_blend_rgba_to_rgb_cmod) PR_(imlib_amd64_blend_rgba_to_rgba_cmod): ENTER pxor %xmm4, %xmm4 movdqa c1(%rip), %xmm5 xorq %rax, %rax movdqa mX000X000X000X000(%rip), %xmm6 movq pow_lut@GOTPCREL(%rip), %r13 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax ror $24, %eax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx rol $8, %eax movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ roll $16, %edx andl $0x0000ff00, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah movd %eax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* repack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_blend_rgba_to_rgba_cmod) PR_(imlib_amd64_blend_rgb_to_rgba_cmod): ENTER pxor %xmm4, %xmm4 movdqa c1(%rip), %xmm5 xorq %rax, %rax movdqa mX000X000X000X000(%rip), %xmm6 movq pow_lut@GOTPCREL(%rip), %r13 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax ror $16, %eax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ roll $16, %edx andl $0x0000ff00, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah movd %eax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* repack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_blend_rgb_to_rgba_cmod) PR_(imlib_amd64_blend_rgb_to_rgb_cmod): ENTER pxor %xmm4, %xmm4 movdqa c1(%rip), %xmm5 movdqa m00XXXXXX(%rip), %xmm6 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax ror $16, %eax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * ((s - d) + 0.5)) */ psubw %xmm2, %xmm1 psllw $1, %xmm1 paddw %xmm5, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_blend_rgb_to_rgb_cmod) PR_(imlib_amd64_copy_rgba_to_rgb_cmod): ENTER movq mX000X000X000X000(%rip), %r13 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod, with a = 0 */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movzbq %al, %rbx movzbq 0x000(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $16, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq (%rdi, %rcx, 4), %rax andq %r13, %rax orq %rax, %rdx movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = 0 */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movzbq %al, %rbx movzbq 0x000(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $16, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq (%rdi, %rcx, 4), %rax andq %r13, %rax orq %rax, %rdx movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = 0 */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movzbq %al, %rbx movzbq 0x000(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $16, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq (%rdi, %rcx, 4), %rax andq %r13, %rax orq %rax, %rdx movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = 0 */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movzbq %al, %rbx movzbq 0x000(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $16, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq (%rdi, %rcx, 4), %rax andq %r13, %rax orq %rax, %rdx movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = 0 */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movzbq %al, %rbx movzbq 0x000(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $16, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq (%rdi, %rcx, 4), %rax andq %r13, %rax orq %rax, %rdx movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = 0 */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movzbq %al, %rbx movzbq 0x000(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $16, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq (%rdi, %rcx, 4), %rax andq %r13, %rax orq %rax, %rdx movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = 0 */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movzbq %al, %rbx movzbq 0x000(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $16, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq (%rdi, %rcx, 4), %rax andq %r13, %rax orq %rax, %rdx movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = 0 */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movzbq %al, %rbx movzbq 0x000(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $16, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq (%rdi, %rcx, 4), %rax andq %r13, %rax orq %rax, %rdx movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = 0 */ movl (%rsi, %rcx, 4), %eax ror $16, %eax movzbq %al, %rbx movzbq 0x000(%r14, %rbx), %rdx shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movl (%rdi, %rcx, 4), %eax andq %r13, %rax orq %rax, %rdx movl %edx, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_copy_rgba_to_rgb_cmod) PR_(imlib_amd64_copy_rgba_to_rgba_cmod): ENTER /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax ror $24, %eax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx rol $8, %eax movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movl %edx, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_copy_rgba_to_rgba_cmod) PR_(imlib_amd64_copy_rgb_to_rgba_cmod): ENTER /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax ror $16, %eax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movl %edx, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_copy_rgb_to_rgba_cmod) PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): ENTER pxor %xmm4, %xmm4 movdqa m00XXXXXX(%rip), %xmm6 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * s) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * s) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * s) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * s) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * s) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * s) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * s) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * s) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax ror $24, %eax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx rol $8, %eax movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * s) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_add_blend_rgba_to_rgb_cmod) PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): ENTER pxor %xmm4, %xmm4 movdqa c1(%rip), %xmm5 xorq %rax, %rax movdqa mX000X000X000X000(%rip), %xmm6 movq pow_lut@GOTPCREL(%rip), %r13 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 por %xmm6, %xmm1 pand %xmm6, %xmm0 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (s * ca) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 por %xmm6, %xmm1 pand %xmm6, %xmm0 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (s * ca) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 por %xmm6, %xmm1 pand %xmm6, %xmm0 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (s * ca) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 por %xmm6, %xmm1 pand %xmm6, %xmm0 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (s * ca) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 por %xmm6, %xmm1 pand %xmm6, %xmm0 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (s * ca) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 por %xmm6, %xmm1 pand %xmm6, %xmm0 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (s * ca) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 por %xmm6, %xmm1 pand %xmm6, %xmm0 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (s * ca) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 por %xmm6, %xmm1 pand %xmm6, %xmm0 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (s * ca) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax ror $24, %eax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx rol $8, %eax movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ roll $16, %edx andl $0x0000ff00, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah movd %eax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 por %xmm6, %xmm1 pand %xmm6, %xmm0 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (s * ca) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_add_blend_rgba_to_rgba_cmod) PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): ENTER pxor %xmm4, %xmm4 movdqa c1(%rip), %xmm5 xorq %rax, %rax movdqa mX000X000X000X000(%rip), %xmm6 movq pow_lut@GOTPCREL(%rip), %r13 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 por %xmm6, %xmm1 pand %xmm6, %xmm0 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (s * ca) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 por %xmm6, %xmm1 pand %xmm6, %xmm0 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (s * ca) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 por %xmm6, %xmm1 pand %xmm6, %xmm0 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (s * ca) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 por %xmm6, %xmm1 pand %xmm6, %xmm0 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (s * ca) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 por %xmm6, %xmm1 pand %xmm6, %xmm0 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (s * ca) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 por %xmm6, %xmm1 pand %xmm6, %xmm0 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (s * ca) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 por %xmm6, %xmm1 pand %xmm6, %xmm0 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (s * ca) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 por %xmm6, %xmm1 pand %xmm6, %xmm0 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (s * ca) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax ror $16, %eax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ roll $16, %edx andl $0x0000ff00, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah movd %eax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 por %xmm6, %xmm1 pand %xmm6, %xmm0 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (s * ca) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_add_blend_rgb_to_rgba_cmod) PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): ENTER pxor %xmm4, %xmm4 movdqa m00XXXXXX(%rip), %xmm6 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * s) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * s) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * s) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * s) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * s) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * s) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * s) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * s) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax ror $16, %eax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (a * s) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_add_blend_rgb_to_rgb_cmod) PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod): ENTER movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d + (s & 0x00ffffff) */ pand %xmm5, %xmm1 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d + (s & 0x00ffffff) */ pand %xmm5, %xmm1 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d + (s & 0x00ffffff) */ pand %xmm5, %xmm1 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d + (s & 0x00ffffff) */ pand %xmm5, %xmm1 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d + (s & 0x00ffffff) */ pand %xmm5, %xmm1 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d + (s & 0x00ffffff) */ pand %xmm5, %xmm1 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d + (s & 0x00ffffff) */ pand %xmm5, %xmm1 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d + (s & 0x00ffffff) */ pand %xmm5, %xmm1 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax ror $24, %eax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx rol $8, %eax movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* d = d + (s & 0x00ffffff) */ pand %xmm5, %xmm1 paddusb %xmm1, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_add_copy_rgba_to_rgb_cmod) PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod): ENTER movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = (d & 0x00ffffff) + s */ pand %xmm5, %xmm2 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = (d & 0x00ffffff) + s */ pand %xmm5, %xmm2 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = (d & 0x00ffffff) + s */ pand %xmm5, %xmm2 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = (d & 0x00ffffff) + s */ pand %xmm5, %xmm2 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = (d & 0x00ffffff) + s */ pand %xmm5, %xmm2 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = (d & 0x00ffffff) + s */ pand %xmm5, %xmm2 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = (d & 0x00ffffff) + s */ pand %xmm5, %xmm2 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = (d & 0x00ffffff) + s */ pand %xmm5, %xmm2 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax ror $24, %eax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx rol $8, %eax movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* d = (d & 0x00ffffff) + s */ pand %xmm5, %xmm2 paddusb %xmm1, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_add_copy_rgba_to_rgba_cmod) PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod): ENTER movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 pand %xmm5, %xmm2 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 pand %xmm5, %xmm2 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 pand %xmm5, %xmm2 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 pand %xmm5, %xmm2 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 pand %xmm5, %xmm2 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 pand %xmm5, %xmm2 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 pand %xmm5, %xmm2 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 pand %xmm5, %xmm2 paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax ror $16, %eax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 pand %xmm5, %xmm2 paddusb %xmm1, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_add_copy_rgb_to_rgba_cmod) PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): ENTER pxor %xmm4, %xmm4 movdqa m00XXXXXX(%rip), %xmm6 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - (s * a) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - (s * a) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - (s * a) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - (s * a) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - (s * a) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - (s * a) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - (s * a) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - (s * a) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax ror $24, %eax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx rol $8, %eax movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - (s * a) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_subtract_blend_rgba_to_rgb_cmod) PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): ENTER movq pow_lut@GOTPCREL(%rip), %r13 pxor %xmm4, %xmm4 movdqa c1(%rip), %xmm5 movdqa mX000X000X000X000(%rip), %xmm6 movdqa mX000X000(%rip), %xmm7 xorq %rax, %rax /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - ((s * a) ^ 0xff000000) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - ((s * a) ^ 0xff000000) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - ((s * a) ^ 0xff000000) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - ((s * a) ^ 0xff000000) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - ((s * a) ^ 0xff000000) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - ((s * a) ^ 0xff000000) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - ((s * a) ^ 0xff000000) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - ((s * a) ^ 0xff000000) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax ror $24, %eax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx rol $8, %eax movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ roll $16, %edx andl $0x0000ff00, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah movd %eax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - ((s * a) ^ 0xff000000) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_subtract_blend_rgba_to_rgba_cmod) PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): ENTER movq pow_lut@GOTPCREL(%rip), %r13 pxor %xmm4, %xmm4 movdqa c1(%rip), %xmm5 movdqa mX000X000X000X000(%rip), %xmm6 movdqa mX000X000(%rip), %xmm7 xorq %rax, %rax /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - ((s * a) ^ 0xff000000) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - ((s * a) ^ 0xff000000) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - ((s * a) ^ 0xff000000) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - ((s * a) ^ 0xff000000) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - ((s * a) ^ 0xff000000) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - ((s * a) ^ 0xff000000) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - ((s * a) ^ 0xff000000) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah rolq $32, %rax movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - ((s * a) ^ 0xff000000) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax ror $16, %eax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending */ roll $16, %edx andl $0x0000ff00, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah movd %eax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 /* unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - ((s * a) ^ 0xff000000) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_subtract_blend_rgb_to_rgba_cmod) PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): ENTER pxor %xmm4, %xmm4 movdqa m00XXXXXX(%rip), %xmm6 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - (s * a) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - (s * a) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - (s * a) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - (s * a) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - (s * a) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - (s * a) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - (s * a) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - (s * a) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax ror $16, %eax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the * destination alpha unchanged. */ pand %xmm6, %xmm3 /* Unpack src and dst to words */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d - (s * a) */ psllw $1, %xmm1 pmulhw %xmm3, %xmm1 psubsw %xmm1, %xmm2 /* pack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_subtract_blend_rgb_to_rgb_cmod) PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod): ENTER movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d - (s & 0x00ffffff) */ pand %xmm5, %xmm1 psubusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d - (s & 0x00ffffff) */ pand %xmm5, %xmm1 psubusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d - (s & 0x00ffffff) */ pand %xmm5, %xmm1 psubusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d - (s & 0x00ffffff) */ pand %xmm5, %xmm1 psubusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d - (s & 0x00ffffff) */ pand %xmm5, %xmm1 psubusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d - (s & 0x00ffffff) */ pand %xmm5, %xmm1 psubusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d - (s & 0x00ffffff) */ pand %xmm5, %xmm1 psubusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d - (s & 0x00ffffff) */ pand %xmm5, %xmm1 psubusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax ror $24, %eax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx rol $8, %eax movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* d = d - (s & 0x00ffffff) */ pand %xmm5, %xmm1 psubusb %xmm1, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_subtract_copy_rgba_to_rgb_cmod) PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod): ENTER movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 movdqa mX000X000X000X000(%rip), %xmm6 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d - s, d alpha = s alpha */ psubusb %xmm1, %xmm2 pand %xmm6, %xmm1 pand %xmm5, %xmm2 por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d - s, d alpha = s alpha */ psubusb %xmm1, %xmm2 pand %xmm6, %xmm1 pand %xmm5, %xmm2 por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d - s, d alpha = s alpha */ psubusb %xmm1, %xmm2 pand %xmm6, %xmm1 pand %xmm5, %xmm2 por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d - s, d alpha = s alpha */ psubusb %xmm1, %xmm2 pand %xmm6, %xmm1 pand %xmm5, %xmm2 por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d - s, d alpha = s alpha */ psubusb %xmm1, %xmm2 pand %xmm6, %xmm1 pand %xmm5, %xmm2 por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d - s, d alpha = s alpha */ psubusb %xmm1, %xmm2 pand %xmm6, %xmm1 pand %xmm5, %xmm2 por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d - s, d alpha = s alpha */ psubusb %xmm1, %xmm2 pand %xmm6, %xmm1 pand %xmm5, %xmm2 por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = d - s, d alpha = s alpha */ psubusb %xmm1, %xmm2 pand %xmm6, %xmm1 pand %xmm5, %xmm2 por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax ror $24, %eax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx rol $8, %eax movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* d = d - s, d alpha = s alpha */ psubusb %xmm1, %xmm2 pand %xmm6, %xmm1 pand %xmm5, %xmm2 por %xmm1, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_subtract_copy_rgba_to_rgba_cmod) PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod): ENTER movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 movdqa mX000X000X000X000(%rip), %xmm6 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = (d - s) */ psubusb %xmm1, %xmm2 /* Preserve source alpha */ pand %xmm5, %xmm2 pand %xmm6, %xmm1 por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = (d - s) */ psubusb %xmm1, %xmm2 /* Preserve source alpha */ pand %xmm5, %xmm2 pand %xmm6, %xmm1 por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = (d - s) */ psubusb %xmm1, %xmm2 /* Preserve source alpha */ pand %xmm5, %xmm2 pand %xmm6, %xmm1 por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = (d - s) */ psubusb %xmm1, %xmm2 /* Preserve source alpha */ pand %xmm5, %xmm2 pand %xmm6, %xmm1 por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = (d - s) */ psubusb %xmm1, %xmm2 /* Preserve source alpha */ pand %xmm5, %xmm2 pand %xmm6, %xmm1 por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = (d - s) */ psubusb %xmm1, %xmm2 /* Preserve source alpha */ pand %xmm5, %xmm2 pand %xmm6, %xmm1 por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = (d - s) */ psubusb %xmm1, %xmm2 /* Preserve source alpha */ pand %xmm5, %xmm2 pand %xmm6, %xmm1 por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* d = (d - s) */ psubusb %xmm1, %xmm2 /* Preserve source alpha */ pand %xmm5, %xmm2 pand %xmm6, %xmm1 por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax ror $16, %eax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* d = (d - s) */ psubusb %xmm1, %xmm2 /* Preserve source alpha */ pand %xmm5, %xmm2 pand %xmm6, %xmm1 por %xmm1, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_subtract_copy_rgb_to_rgba_cmod) PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): ENTER pxor %xmm4, %xmm4 movdqa m000V0V0V000V0V0V(%rip), %xmm6 movdqa m00XXXXXX(%rip), %xmm7 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Unpack alpha */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ pand %xmm7, %xmm3 /* Unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (2 * a * (s - 127)) */ psubw %xmm6, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Unpack alpha */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ pand %xmm7, %xmm3 /* Unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (2 * a * (s - 127)) */ psubw %xmm6, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Unpack alpha */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ pand %xmm7, %xmm3 /* Unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (2 * a * (s - 127)) */ psubw %xmm6, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Unpack alpha */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ pand %xmm7, %xmm3 /* Unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (2 * a * (s - 127)) */ psubw %xmm6, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Unpack alpha */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ pand %xmm7, %xmm3 /* Unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (2 * a * (s - 127)) */ psubw %xmm6, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Unpack alpha */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ pand %xmm7, %xmm3 /* Unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (2 * a * (s - 127)) */ psubw %xmm6, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Unpack alpha */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ pand %xmm7, %xmm3 /* Unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (2 * a * (s - 127)) */ psubw %xmm6, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Unpack alpha */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ pand %xmm7, %xmm3 /* Unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (2 * a * (s - 127)) */ psubw %xmm6, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax ror $24, %eax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx rol $8, %eax movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* Unpack alpha */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ pand %xmm7, %xmm3 /* Unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (2 * a * (s - 127)) */ psubw %xmm6, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_reshade_blend_rgba_to_rgb_cmod) PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): ENTER movq pow_lut@GOTPCREL(%rip), %r13 pxor %xmm4, %xmm4 movdqa c1(%rip), %xmm5 movdqa mX000X000X000X000(%rip), %xmm6 movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm7 movdqa m000V0V0V000V0V0V(%rip), %xmm8 xorq %rax, %rax /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending, specialized for reshade by shifting the source alpha * right by one */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 psubw %xmm8, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending, specialized for reshade by shifting the source alpha * right by one */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 psubw %xmm8, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending, specialized for reshade by shifting the source alpha * right by one */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 psubw %xmm8, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending, specialized for reshade by shifting the source alpha * right by one */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 psubw %xmm8, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending, specialized for reshade by shifting the source alpha * right by one */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 psubw %xmm8, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending, specialized for reshade by shifting the source alpha * right by one */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 psubw %xmm8, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending, specialized for reshade by shifting the source alpha * right by one */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 psubw %xmm8, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending, specialized for reshade by shifting the source alpha * right by one */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 psubw %xmm8, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax ror $24, %eax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx rol $8, %eax movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending, specialized for reshade by shifting the source alpha * right by one */ roll $16, %edx andl $0x0000ff00, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah movd %eax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 psubw %xmm8, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_reshade_blend_rgba_to_rgba_cmod) PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): ENTER pxor %xmm4, %xmm4 movdqa m000V0V0V000V0V0V(%rip), %xmm6 movdqa m00XXXXXX(%rip), %xmm7 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Unpack alpha */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ pand %xmm7, %xmm3 /* Unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (2 * a * (s - 127)) */ psubw %xmm6, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Unpack alpha */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ pand %xmm7, %xmm3 /* Unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (2 * a * (s - 127)) */ psubw %xmm6, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Unpack alpha */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ pand %xmm7, %xmm3 /* Unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (2 * a * (s - 127)) */ psubw %xmm6, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Unpack alpha */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ pand %xmm7, %xmm3 /* Unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (2 * a * (s - 127)) */ psubw %xmm6, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Unpack alpha */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ pand %xmm7, %xmm3 /* Unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (2 * a * (s - 127)) */ psubw %xmm6, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Unpack alpha */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ pand %xmm7, %xmm3 /* Unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (2 * a * (s - 127)) */ psubw %xmm6, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Unpack alpha */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ pand %xmm7, %xmm3 /* Unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (2 * a * (s - 127)) */ psubw %xmm6, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Unpack alpha */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ pand %xmm7, %xmm3 /* Unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (2 * a * (s - 127)) */ psubw %xmm6, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax ror $16, %eax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* Unpack alpha */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ pand %xmm7, %xmm3 /* Unpack src and dst */ punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 /* d = d + (2 * a * (s - 127)) */ psubw %xmm6, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 /* Repack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_reshade_blend_rgb_to_rgb_cmod) PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): ENTER movq pow_lut@GOTPCREL(%rip), %r13 pxor %xmm4, %xmm4 movdqa c1(%rip), %xmm5 movdqa mX000X000X000X000(%rip), %xmm6 movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm7 movdqa m000V0V0V000V0V0V(%rip), %xmm8 xorq %rax, %rax /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending, specialized for reshade by shifting the source alpha * right by one */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 psubw %xmm8, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending, specialized for reshade by shifting the source alpha * right by one */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 psubw %xmm8, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending, specialized for reshade by shifting the source alpha * right by one */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 psubw %xmm8, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending, specialized for reshade by shifting the source alpha * right by one */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 psubw %xmm8, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending, specialized for reshade by shifting the source alpha * right by one */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 psubw %xmm8, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending, specialized for reshade by shifting the source alpha * right by one */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 psubw %xmm8, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending, specialized for reshade by shifting the source alpha * right by one */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 psubw %xmm8, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending, specialized for reshade by shifting the source alpha * right by one */ movq %rdx, %rax andl $0xff000000, %edx roll $16, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movl %eax, %edx andl $0xff000000, %edx roll $16, %edx movb 7(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah rolq $32, %rax movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 psubw %xmm8, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax ror $16, %eax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* Convert the cmod alpha to the pow_lut alpha that will be used * for blending, specialized for reshade by shifting the source alpha * right by one */ roll $16, %edx andl $0x0000ff00, %edx movb 3(%rdi, %rcx, 4), %dl movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah movd %eax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 pand %xmm6, %xmm0 por %xmm6, %xmm1 psubusb %xmm0, %xmm1 punpcklbw %xmm4, %xmm1 punpcklbw %xmm4, %xmm2 psubw %xmm8, %xmm1 psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_reshade_blend_rgb_to_rgba_cmod) PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod): ENTER movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 movdqa m0VVV0VVV0VVV0VVV(%rip), %xmm6 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* To take advantage of saturation and be able to do 8 bytes * at a time, we divide reshading into two separate steps: * adding values above 128, and subtracting values below 128 * These values go into %mm1 and %mm3 respectively * - %xmm1 becomes (2 * (s - 127)) * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s)) */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* dest alpha should not be changed in this func */ pand %xmm5, %xmm1 pand %xmm5, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* To take advantage of saturation and be able to do 8 bytes * at a time, we divide reshading into two separate steps: * adding values above 128, and subtracting values below 128 * These values go into %mm1 and %mm3 respectively * - %xmm1 becomes (2 * (s - 127)) * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s)) */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* dest alpha should not be changed in this func */ pand %xmm5, %xmm1 pand %xmm5, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* To take advantage of saturation and be able to do 8 bytes * at a time, we divide reshading into two separate steps: * adding values above 128, and subtracting values below 128 * These values go into %mm1 and %mm3 respectively * - %xmm1 becomes (2 * (s - 127)) * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s)) */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* dest alpha should not be changed in this func */ pand %xmm5, %xmm1 pand %xmm5, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* To take advantage of saturation and be able to do 8 bytes * at a time, we divide reshading into two separate steps: * adding values above 128, and subtracting values below 128 * These values go into %mm1 and %mm3 respectively * - %xmm1 becomes (2 * (s - 127)) * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s)) */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* dest alpha should not be changed in this func */ pand %xmm5, %xmm1 pand %xmm5, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* To take advantage of saturation and be able to do 8 bytes * at a time, we divide reshading into two separate steps: * adding values above 128, and subtracting values below 128 * These values go into %mm1 and %mm3 respectively * - %xmm1 becomes (2 * (s - 127)) * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s)) */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* dest alpha should not be changed in this func */ pand %xmm5, %xmm1 pand %xmm5, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* To take advantage of saturation and be able to do 8 bytes * at a time, we divide reshading into two separate steps: * adding values above 128, and subtracting values below 128 * These values go into %mm1 and %mm3 respectively * - %xmm1 becomes (2 * (s - 127)) * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s)) */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* dest alpha should not be changed in this func */ pand %xmm5, %xmm1 pand %xmm5, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* To take advantage of saturation and be able to do 8 bytes * at a time, we divide reshading into two separate steps: * adding values above 128, and subtracting values below 128 * These values go into %mm1 and %mm3 respectively * - %xmm1 becomes (2 * (s - 127)) * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s)) */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* dest alpha should not be changed in this func */ pand %xmm5, %xmm1 pand %xmm5, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* To take advantage of saturation and be able to do 8 bytes * at a time, we divide reshading into two separate steps: * adding values above 128, and subtracting values below 128 * These values go into %mm1 and %mm3 respectively * - %xmm1 becomes (2 * (s - 127)) * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s)) */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* dest alpha should not be changed in this func */ pand %xmm5, %xmm1 pand %xmm5, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax ror $24, %eax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx rol $8, %eax movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* To take advantage of saturation and be able to do 8 bytes * at a time, we divide reshading into two separate steps: * adding values above 128, and subtracting values below 128 * These values go into %mm1 and %mm3 respectively * - %xmm1 becomes (2 * (s - 127)) * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s)) */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* dest alpha should not be changed in this func */ pand %xmm5, %xmm1 pand %xmm5, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_reshade_copy_rgba_to_rgb_cmod) PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod): ENTER movdqu m0XXX0XXX0XXX0XXX(%rip), %xmm5 movdqu m0VVV0VVV0VVV0VVV(%rip), %xmm6 movdqu mX000X000X000X000(%rip), %xmm7 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 movdqa %xmm1, %xmm0 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 movdqa %xmm1, %xmm0 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 movdqa %xmm1, %xmm0 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 movdqa %xmm1, %xmm0 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 movdqa %xmm1, %xmm0 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 movdqa %xmm1, %xmm0 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 movdqa %xmm1, %xmm0 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax rorq $56, %rax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 movdqa %xmm1, %xmm0 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax ror $24, %eax movzbq %al, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx rol $8, %eax movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 movdqa %xmm1, %xmm0 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_reshade_copy_rgba_to_rgba_cmod) PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod): ENTER movdqu m0XXX0XXX0XXX0XXX(%rip), %xmm5 movdqu m0VVV0VVV0VVV0VVV(%rip), %xmm6 movdqu mX000X000X000X000(%rip), %xmm7 /* Move right to left across each line, */ /* processing in two pixel chunks */ leaq (%rsi, %r8, 4), %rsi leaq (%rdi, %r8, 4), %rdi /* Last instruction is %rcx = 0 */ subq $4, %rsi subq $4, %rdi negq %r8 0: movq %r8, %rcx incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) prefetcht0 (%rdi, %rcx, 4) prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) prefetcht0 128(%rdi, %rcx, 4) /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 movdqa %xmm1, %xmm0 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 movdqa %xmm1, %xmm0 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 movdqa %xmm1, %xmm0 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 movdqa %xmm1, %xmm0 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 movdqa %xmm1, %xmm0 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 movdqa %xmm1, %xmm0 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 movdqa %xmm1, %xmm0 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx jz 2f jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax rorq $48, %rax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shlq $8, %rdx movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl shlq $8, %rdx movl $0x000000FF, %ebx movb 0x300(%r14, %rbx), %dl shlq $8, %rdx rolq $16, %rax movb %al, %bl movb 0x000(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x100(%r14, %rbx), %dl shlq $8, %rdx rolq $8, %rax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %rdx, %xmm1 movq (%rdi, %rcx, 4), %xmm2 /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 movdqa %xmm1, %xmm0 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) incq %rcx incq %rcx js 1b jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax ror $16, %eax movq $0x000000FF, %rbx movzbq 0x300(%r14, %rbx), %rdx shl $8, %edx movb %al, %bl movb 0x000(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x100(%r14, %rbx), %dl shl $8, %edx rol $8, %eax movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 movd (%rdi, %rcx, 4), %xmm2 /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ movdqa %xmm1, %xmm3 psubusb %xmm6, %xmm1 movdqa %xmm1, %xmm0 paddusb %xmm1, %xmm1 paddusb %xmm6, %xmm3 pxor %xmm5, %xmm3 paddusb %xmm3, %xmm3 /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movd %xmm2, (%rdi, %rcx, 4) 3: leaq (%rsi, %r10, 4), %rsi leaq (%rdi, %r11, 4), %rdi decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_reshade_copy_rgb_to_rgba_cmod) #endif #ifdef __ELF__ .section .note.GNU-stack,"",@progbits #endif