diff options
author | Ilia Mirkin <imirkin@alum.mit.edu> | 2013-07-29 02:40:39 -0400 |
---|---|---|
committer | Ben Skeggs <bskeggs@redhat.com> | 2013-07-30 10:53:28 +1000 |
commit | 4492cf90c2a7fe27badf6f3f0f296755a65e1e83 (patch) | |
tree | 83b38b2ad7fdcd6f2d7b5b69eb101960013bfc5f | |
parent | 08c667b06d28756d0d32f82bd736b0bdf2b4448e (diff) | |
download | xorg-driver-xf86-video-nouveau-4492cf90c2a7fe27badf6f3f0f296755a65e1e83.tar.gz |
xv: speed up YV12 -> NV12 conversion using SSE2 if available
memcpy()'s share of total function time rises from 45% to 66% (that is,
the conversion loop around it becomes proportionally cheaper), which
translates to a 30% decrease in NVPutImage runtime.
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
-rw-r--r-- | src/nouveau_xv.c | 33 |
1 file changed, 26 insertions, 7 deletions
diff --git a/src/nouveau_xv.c b/src/nouveau_xv.c index 567e30c..5569b7c 100644 --- a/src/nouveau_xv.c +++ b/src/nouveau_xv.c @@ -25,6 +25,8 @@ #include "config.h" #endif +#include <immintrin.h> + #include "xf86xv.h" #include <X11/extensions/Xv.h> #include "exa.h" @@ -532,30 +534,47 @@ NVCopyNV12ColorPlanes(unsigned char *src1, unsigned char *src2, w >>= 1; h >>= 1; +#ifdef __SSE2__ + l = w >> 3; + e = w & 7; +#else l = w >> 1; e = w & 1; +#endif for (j = 0; j < h; j++) { unsigned char *us = src1; unsigned char *vs = src2; unsigned int *vuvud = (unsigned int *) dst; + unsigned short *vud; for (i = 0; i < l; i++) { -#if X_BYTE_ORDER == X_BIG_ENDIAN +#ifdef __SSE2__ + _mm_storeu_si128( + (void*)vuvud, + _mm_unpacklo_epi8( + _mm_loadl_epi64((void*)vs), + _mm_loadl_epi64((void*)us))); + vuvud+=4; + us+=8; + vs+=8; +#else /* __SSE2__ */ +# if X_BYTE_ORDER == X_BIG_ENDIAN *vuvud++ = (vs[0]<<24) | (us[0]<<16) | (vs[1]<<8) | us[1]; -#else +# else *vuvud++ = vs[0] | (us[0]<<8) | (vs[1]<<16) | (us[1]<<24); -#endif +# endif us+=2; vs+=2; +#endif /* __SSE2__ */ } - if (e) { - unsigned short *vud = (unsigned short *) vuvud; + vud = (unsigned short *)vuvud; + for (i = 0; i < e; i++) { #if X_BYTE_ORDER == X_BIG_ENDIAN - *vud = us[0] | (vs[0]<<8); + vud[i] = us[i] | (vs[i]<<8); #else - *vud = vs[0] | (us[0]<<8); + vud[i] = vs[i] | (us[i]<<8); #endif } |