diff options
author | Ivo van Poorten <ivop@euronet.nl> | 2007-04-16 21:41:03 +0000 |
---|---|---|
committer | Ivo van Poorten <ivop@euronet.nl> | 2007-04-16 21:41:03 +0000 |
commit | b38d487466e68bd6baf2889017d2a751831560f0 (patch) | |
tree | 63c54f05c133132d55b330ac607e7671e1a32a9f /libswscale | |
parent | 4f99f932689523dd2a77656ca1b75e00178c3559 (diff) | |
download | ffmpeg-b38d487466e68bd6baf2889017d2a751831560f0.tar.gz |
New implementation of rgb32tobgr32
The previous implementation segfaulted with MMX enabled when fed an image
smaller than the 8-byte unit (two pixels) that the MMX loop processed per iteration. The new code:
- is faster for MMX, MMX2 and plain C
- processes small images correctly
- is LGPL
Originally committed as revision 23009 to svn://svn.mplayerhq.hu/mplayer/trunk/libswscale
Diffstat (limited to 'libswscale')
-rw-r--r-- | libswscale/rgb2rgb_template.c | 99 |
1 file changed, 58 insertions, 41 deletions
diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c index 6489a4db91..7147855fed 100644 --- a/libswscale/rgb2rgb_template.c +++ b/libswscale/rgb2rgb_template.c @@ -1364,49 +1364,66 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_ static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size) { + uint8_t *d = dst, *s = (uint8_t *) src; + const uint8_t *end = s + src_size; #ifdef HAVE_MMX -/* TODO: unroll this loop */ - asm volatile ( - "xor %%"REG_a", %%"REG_a" \n\t" - ASMALIGN(4) - "1: \n\t" - PREFETCH" 32(%0, %%"REG_a") \n\t" - "movq (%0, %%"REG_a"), %%mm0 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "pslld $16, %%mm0 \n\t" - "psrld $16, %%mm1 \n\t" - "pand "MANGLE(mask32r)", %%mm0 \n\t" - "pand "MANGLE(mask32g)", %%mm2 \n\t" - "pand "MANGLE(mask32b)", %%mm1 \n\t" - "por %%mm0, %%mm2 \n\t" - "por %%mm1, %%mm2 \n\t" - MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t" - "add $8, %%"REG_a" \n\t" - "cmp %2, %%"REG_a" \n\t" - " jb 1b \n\t" - :: "r" (src), "r"(dst), "r" (src_size-7) - : "%"REG_a - ); - - __asm __volatile(SFENCE:::"memory"); - __asm __volatile(EMMS:::"memory"); -#else - unsigned i; - unsigned num_pixels = src_size >> 2; - for(i=0; i<num_pixels; i++) - { -#ifdef WORDS_BIGENDIAN - dst[4*i + 1] = src[4*i + 3]; - dst[4*i + 2] = src[4*i + 2]; - dst[4*i + 3] = src[4*i + 1]; -#else - dst[4*i + 0] = src[4*i + 2]; - dst[4*i + 1] = src[4*i + 1]; - dst[4*i + 2] = src[4*i + 0]; -#endif + __asm __volatile( + " "PREFETCH" (%1) \n" + " movq %3, %%mm7 \n" + " pxor %4, %%mm7 \n" + " movq %%mm7, %%mm6 \n" + " pxor %5, %%mm7 \n" + " jmp 2f \n" + ASMALIGN(4) + "1: \n" + " "PREFETCH" 32(%1) \n" + " movq (%1), %%mm0 \n" + " movq 8(%1), %%mm1 \n" +# ifdef HAVE_MMX2 + " pshufw $177, %%mm0, %%mm3 \n" + " pshufw $177, %%mm1, %%mm5 \n" + " pand %%mm7, %%mm0 \n" + " pand %%mm6, %%mm3 \n" + " pand %%mm7, %%mm1 \n" + " pand %%mm6, %%mm5 \n" + " por %%mm3, %%mm0 \n" + " por %%mm5, %%mm1 \n" +# else 
+ " movq %%mm0, %%mm2 \n" + " movq %%mm1, %%mm4 \n" + " pand %%mm7, %%mm0 \n" + " pand %%mm6, %%mm2 \n" + " pand %%mm7, %%mm1 \n" + " pand %%mm6, %%mm4 \n" + " movq %%mm2, %%mm3 \n" + " movq %%mm4, %%mm5 \n" + " pslld $16, %%mm2 \n" + " psrld $16, %%mm3 \n" + " pslld $16, %%mm4 \n" + " psrld $16, %%mm5 \n" + " por %%mm2, %%mm0 \n" + " por %%mm4, %%mm1 \n" + " por %%mm3, %%mm0 \n" + " por %%mm5, %%mm1 \n" +# endif + " "MOVNTQ" %%mm0, (%0) \n" + " "MOVNTQ" %%mm1, 8(%0) \n" + " add $16, %0 \n" + " add $16, %1 \n" + "2: \n" + " cmp %1, %2 \n" + " ja 1b \n" + " "SFENCE" \n" + " "EMMS" \n" + : "+r"(d), "+r"(s) + : "r" (end-15), "m" (mask32b), "m" (mask32r), "m" (mmx_one) + : "memory"); +#endif + for (; s<end; s+=4, d+=4) { + int v = *(uint32_t *)s, g = v & 0xff00; + v &= 0xff00ff; + *(uint32_t *)d = (v>>16) + g + (v<<16); } -#endif } static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size) |