diff options
Diffstat (limited to 'libswscale/x86/swscale_template.c')
-rw-r--r-- | libswscale/x86/swscale_template.c | 226 |
1 files changed, 163 insertions, 63 deletions
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c index 1e42ec5b12..1213197347 100644 --- a/libswscale/x86/swscale_template.c +++ b/libswscale/x86/swscale_template.c @@ -1,20 +1,20 @@ /* - * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> + * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -25,6 +25,7 @@ #undef REAL_MOVNTQ #undef MOVNTQ +#undef MOVNTQ2 #undef PREFETCH #if COMPILE_TEMPLATE_MMXEXT @@ -35,11 +36,95 @@ #if COMPILE_TEMPLATE_MMXEXT #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" +#define MOVNTQ2 "movntq " #else #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" +#define MOVNTQ2 "movq " #endif #define MOVNTQ(a,b) REAL_MOVNTQ(a,b) +#if !COMPILE_TEMPLATE_MMXEXT +static av_always_inline void +dither_8to16(const uint8_t *srcDither, int rot) +{ + if (rot) { + __asm__ volatile("pxor %%mm0, %%mm0\n\t" + "movq (%0), %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "psrlq $24, %%mm3\n\t" + "psllq $40, %%mm4\n\t" + "por %%mm4, %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "punpcklbw %%mm0, %%mm3\n\t" + "punpckhbw %%mm0, %%mm4\n\t" + :: "r"(srcDither) + ); + } else { + __asm__ volatile("pxor %%mm0, %%mm0\n\t" + "movq (%0), %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "punpcklbw %%mm0, %%mm3\n\t" + "punpckhbw %%mm0, %%mm4\n\t" + :: "r"(srcDither) + ); + } +} +#endif + +static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize, + const int16_t **src, uint8_t *dest, int dstW, + const uint8_t *dither, int offset) +{ + dither_8to16(dither, offset); + filterSize--; + __asm__ volatile( + "movd %0, %%mm1\n\t" + "punpcklwd %%mm1, %%mm1\n\t" + "punpckldq %%mm1, %%mm1\n\t" + "psllw $3, %%mm1\n\t" + "paddw %%mm1, %%mm3\n\t" + "paddw %%mm1, %%mm4\n\t" + "psraw $4, %%mm3\n\t" + "psraw $4, %%mm4\n\t" + ::"m"(filterSize) + ); + + __asm__ volatile(\ + "movq %%mm3, %%mm6\n\t" + "movq %%mm4, %%mm7\n\t" + "movl %3, %%ecx\n\t" + "mov %0, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + ".p2align 4 \n\t" /* FIXME Unroll? */\ + "1: \n\t"\ + "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ + "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\ + "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\ + "add $16, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "test %%"REG_S", %%"REG_S" \n\t"\ + "pmulhw %%mm0, %%mm2 \n\t"\ + "pmulhw %%mm0, %%mm5 \n\t"\ + "paddw %%mm2, %%mm3 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + " jnz 1b \n\t"\ + "psraw $3, %%mm3 \n\t"\ + "psraw $3, %%mm4 \n\t"\ + "packuswb %%mm4, %%mm3 \n\t" + MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t" + "add $8, %%"REG_c" \n\t"\ + "cmp %2, %%"REG_c" \n\t"\ + "movq %%mm6, %%mm3\n\t" + "movq %%mm7, %%mm4\n\t" + "mov %0, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "jb 1b \n\t"\ + :: "g" (filter), + "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset) + : "%"REG_d, "%"REG_S, "%"REG_c + ); +} + #define YSCALEYUV2PACKEDX_UV \ __asm__ volatile(\ "xor %%"REG_a", %%"REG_a" \n\t"\ @@ -92,6 +177,7 @@ :: "r" (&c->redDither), \ "m" (dummy), "m" (dummy), "m" (dummy),\ "r" (dest), "m" (dstW_reg), "m"(uv_off) \ + NAMED_CONSTRAINTS_ADD(bF8,bFC) \ : "%"REG_a, "%"REG_d, "%"REG_S \ ); @@ -252,7 +338,7 @@ MOVNTQ( q3, 24(dst, index, 4))\ \ "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ + "cmp "dstw", "#index" \n\t"\ " jb 1b \n\t" #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) @@ -265,7 +351,7 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { YSCALEYUV2PACKEDX_ACCURATE @@ -278,13 +364,13 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, "psraw $3, %%mm1 \n\t" "psraw $3, %%mm7 \n\t" "packuswb %%mm7, %%mm1 \n\t" - WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6) + WRITEBGR32(%4, "%5", %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6) YSCALEYUV2PACKEDX_END } else { YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) YSCALEYUV2PACKEDX_END } } @@ -298,7 +384,7 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { YSCALEYUV2PACKEDX @@ -307,13 +393,13 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, "psraw $3, %%mm1 \n\t" "psraw $3, %%mm7 \n\t" "packuswb %%mm7, %%mm1 \n\t" - WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) + WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) YSCALEYUV2PACKEDX_END } else { YSCALEYUV2PACKEDX YSCALEYUV2RGBX "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) YSCALEYUV2PACKEDX_END } } @@ -342,7 +428,7 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, MOVNTQ(%%mm1, 8(dst, index, 2))\ \ "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ + "cmp "dstw", "#index" \n\t"\ " jb 1b \n\t" #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index) @@ -355,7 +441,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX @@ -366,7 +452,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter, "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" "paddusb "RED_DITHER"(%0), %%mm5\n\t" #endif - WRITERGB16(%4, %5, %%REGa) + WRITERGB16(%4, "%5", %%REGa) YSCALEYUV2PACKEDX_END } @@ -379,7 +465,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX YSCALEYUV2RGBX @@ -390,7 +476,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" "paddusb "RED_DITHER"(%0), %%mm5 \n\t" #endif - WRITERGB16(%4, %5, %%REGa) + WRITERGB16(%4, "%5", %%REGa) YSCALEYUV2PACKEDX_END } @@ -419,7 +505,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, MOVNTQ(%%mm1, 8(dst, index, 2))\ \ "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ + "cmp "dstw", "#index" \n\t"\ " jb 1b \n\t" #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index) @@ -432,7 +518,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX @@ -443,7 +529,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter, "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" "paddusb "RED_DITHER"(%0), %%mm5\n\t" #endif - WRITERGB15(%4, %5, %%REGa) + WRITERGB15(%4, "%5", %%REGa) YSCALEYUV2PACKEDX_END } @@ -456,7 +542,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX YSCALEYUV2RGBX @@ -467,7 +553,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" "paddusb "RED_DITHER"(%0), %%mm5 \n\t" #endif - WRITERGB15(%4, %5, %%REGa) + WRITERGB15(%4, "%5", %%REGa) YSCALEYUV2PACKEDX_END } @@ -521,7 +607,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, "add $24, "#dst" \n\t"\ \ "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ + "cmp "dstw", "#index" \n\t"\ " jb 1b \n\t" #define WRITEBGR24MMXEXT(dst, dstw, index) \ @@ -569,7 +655,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, "add $24, "#dst" \n\t"\ \ "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ + "cmp "dstw", "#index" \n\t"\ " jb 1b \n\t" #if COMPILE_TEMPLATE_MMXEXT @@ -580,6 +666,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index) #endif +#if HAVE_6REGS static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, @@ -589,17 +676,18 @@ static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX "pxor %%mm7, %%mm7 \n\t" "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize "add %4, %%"REG_c" \n\t" - WRITEBGR24(%%REGc, %5, %%REGa) + WRITEBGR24(%%REGc, "%5", %%REGa) :: "r" (&c->redDither), "m" (dummy), "m" (dummy), "m" (dummy), "r" (dest), "m" (dstW_reg), "m"(uv_off) + NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S ); } @@ -613,20 +701,22 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX YSCALEYUV2RGBX "pxor %%mm7, %%mm7 \n\t" "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize "add %4, %%"REG_c" \n\t" - WRITEBGR24(%%REGc, %5, %%REGa) + WRITEBGR24(%%REGc, "%5", %%REGa) :: "r" (&c->redDither), "m" (dummy), "m" (dummy), "m" (dummy), "r" (dest), "m" (dstW_reg), "m"(uv_off) + NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S ); } +#endif /* HAVE_6REGS */ #define REAL_WRITEYUY2(dst, dstw, index) \ "packuswb %%mm3, %%mm3 \n\t"\ @@ -641,7 +731,7 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, MOVNTQ(%%mm7, 8(dst, index, 2))\ \ "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ + "cmp "dstw", "#index" \n\t"\ " jb 1b \n\t" #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index) @@ -654,7 +744,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ @@ -662,7 +752,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter, "psraw $3, %%mm4 \n\t" "psraw $3, %%mm1 \n\t" "psraw $3, %%mm7 \n\t" - WRITEYUY2(%4, %5, %%REGa) + WRITEYUY2(%4, "%5", %%REGa) YSCALEYUV2PACKEDX_END } @@ -675,7 +765,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ @@ -683,7 +773,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, "psraw $3, %%mm4 \n\t" "psraw $3, %%mm1 \n\t" "psraw $3, %%mm7 \n\t" - WRITEYUY2(%4, %5, %%REGa) + WRITEYUY2(%4, "%5", %%REGa) YSCALEYUV2PACKEDX_END } @@ -784,15 +874,15 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2], "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ "packuswb %%mm7, %%mm1 \n\t" - WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) + WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest), "a" (&c->redDither), "r" (abuf0), "r" (abuf1) : "%r8" ); #else - *(const uint16_t **)(&c->u_temp)=abuf0; - *(const uint16_t **)(&c->v_temp)=abuf1; + c->u_temp=(intptr_t)abuf0; + c->v_temp=(intptr_t)abuf1; __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" @@ -808,7 +898,7 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2], "packuswb %%mm7, %%mm1 \n\t" "pop %1 \n\t" "pop %0 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) + WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -822,7 +912,7 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2], "push %%"REG_BP" \n\t" YSCALEYUV2RGB(%%REGBP, %5) "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -839,18 +929,18 @@ static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2], const int16_t *buf0 = buf[0], *buf1 = buf[1], *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; - //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" YSCALEYUV2RGB(%%REGBP, %5) "pxor %%mm7, %%mm7 \n\t" - WRITEBGR24(%%REGb, 8280(%5), %%REGBP) + WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) ); } @@ -862,7 +952,6 @@ static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2], const int16_t *buf0 = buf[0], *buf1 = buf[1], *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; - //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" @@ -875,11 +964,12 @@ static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2], "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB15(%%REGb, 8280(%5), %%REGBP) + WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(bF8) ); } @@ -891,7 +981,6 @@ static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2], const int16_t *buf0 = buf[0], *buf1 = buf[1], *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; - //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" @@ -904,11 +993,12 @@ static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2], "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB16(%%REGb, 8280(%5), %%REGBP) + WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(bF8,bFC) ); } @@ -960,13 +1050,12 @@ static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2], const int16_t *buf0 = buf[0], *buf1 = buf[1], *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; - //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" YSCALEYUV2PACKED(%%REGBP, %5) - WRITEYUY2(%%REGb, 8280(%5), %%REGBP) + WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1109,7 +1198,7 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0, "push %%"REG_BP" \n\t" YSCALEYUV2RGB1(%%REGBP, %5) YSCALEYUV2RGB1_ALPHA(%%REGBP) - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1122,7 +1211,7 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0, "push %%"REG_BP" \n\t" YSCALEYUV2RGB1(%%REGBP, %5) "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1138,7 +1227,7 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0, "push %%"REG_BP" \n\t" YSCALEYUV2RGB1b(%%REGBP, %5) YSCALEYUV2RGB1_ALPHA(%%REGBP) - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1151,7 +1240,7 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0, "push %%"REG_BP" \n\t" YSCALEYUV2RGB1b(%%REGBP, %5) "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1177,11 +1266,12 @@ static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0, "push %%"REG_BP" \n\t" YSCALEYUV2RGB1(%%REGBP, %5) "pxor %%mm7, %%mm7 \n\t" - WRITEBGR24(%%REGb, 8280(%5), %%REGBP) + WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) ); } else { const int16_t *ubuf1 = ubuf[1]; @@ -1191,11 +1281,12 @@ static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0, "push %%"REG_BP" \n\t" YSCALEYUV2RGB1b(%%REGBP, %5) "pxor %%mm7, %%mm7 \n\t" - WRITEBGR24(%%REGb, 8280(%5), %%REGBP) + WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) ); } } @@ -1222,11 +1313,12 @@ static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0, "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB15(%%REGb, 8280(%5), %%REGBP) + WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(bF8) ); } else { const int16_t *ubuf1 = ubuf[1]; @@ -1242,11 +1334,12 @@ static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0, "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB15(%%REGb, 8280(%5), %%REGBP) + WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(bF8) ); } } @@ -1273,11 +1366,12 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0, "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB16(%%REGb, 8280(%5), %%REGBP) + WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(bF8,bFC) ); } else { const int16_t *ubuf1 = ubuf[1]; @@ -1293,11 +1387,12 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0, "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB16(%%REGb, 8280(%5), %%REGBP) + WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(bF8,bFC) ); } } @@ -1354,7 +1449,7 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0, "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" YSCALEYUV2PACKED1(%%REGBP, %5) - WRITEYUY2(%%REGb, 8280(%5), %%REGBP) + WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1367,7 +1462,7 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0, "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" YSCALEYUV2PACKED1b(%%REGBP, %5) - WRITEYUY2(%%REGb, 8280(%5), %%REGBP) + WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1564,14 +1659,16 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c) { enum AVPixelFormat dstFormat = c->dstFormat; - if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && - dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21) { - if (!(c->flags & SWS_BITEXACT)) { + c->use_mmx_vfilter= 0; + if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12 + && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) { if (c->flags & SWS_ACCURATE_RND) { if (!(c->flags & SWS_FULL_CHR_H_INT)) { switch (c->dstFormat) { case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break; +#if HAVE_6REGS case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break; +#endif case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break; case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break; case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break; @@ -1579,10 +1676,14 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c) } } } else { + c->use_mmx_vfilter= 1; + c->yuv2planeX = RENAME(yuv2yuvX ); if (!(c->flags & SWS_FULL_CHR_H_INT)) { switch (c->dstFormat) { case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break; +#if HAVE_6REGS case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break; +#endif case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break; case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break; case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break; @@ -1590,7 +1691,6 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c) } } } - } if (!(c->flags & SWS_FULL_CHR_H_INT)) { switch (c->dstFormat) { case AV_PIX_FMT_RGB32: @@ -1619,7 +1719,7 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c) } } - if (c->srcBpc == 8 && c->dstBpc <= 10) { + if (c->srcBpc == 8 && c->dstBpc <= 14) { // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one). #if COMPILE_TEMPLATE_MMXEXT if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) { |