summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Ilia Mirkin <imirkin@alum.mit.edu> 2013-07-29 02:40:39 -0400
committer: Ben Skeggs <bskeggs@redhat.com> 2013-07-30 10:53:28 +1000
commit: 4492cf90c2a7fe27badf6f3f0f296755a65e1e83 (patch)
tree: 83b38b2ad7fdcd6f2d7b5b69eb101960013bfc5f
parent: 08c667b06d28756d0d32f82bd736b0bdf2b4448e (diff)
download: xorg-driver-xf86-video-nouveau-4492cf90c2a7fe27badf6f3f0f296755a65e1e83.tar.gz
xv: speed up YV12 -> NV12 conversion using SSE2 if available
memcpy() goes from taking 45% to 66% of total function time, which translates to a 30% decrease in NVPutImage runtime.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
-rw-r--r-- src/nouveau_xv.c 33
1 files changed, 26 insertions, 7 deletions
diff --git a/src/nouveau_xv.c b/src/nouveau_xv.c
index 567e30c..5569b7c 100644
--- a/src/nouveau_xv.c
+++ b/src/nouveau_xv.c
@@ -25,6 +25,8 @@
#include "config.h"
#endif
+#include <immintrin.h>
+
#include "xf86xv.h"
#include <X11/extensions/Xv.h>
#include "exa.h"
@@ -532,30 +534,47 @@ NVCopyNV12ColorPlanes(unsigned char *src1, unsigned char *src2,
w >>= 1;
h >>= 1;
+#ifdef __SSE2__
+ l = w >> 3;
+ e = w & 7;
+#else
l = w >> 1;
e = w & 1;
+#endif
for (j = 0; j < h; j++) {
unsigned char *us = src1;
unsigned char *vs = src2;
unsigned int *vuvud = (unsigned int *) dst;
+ unsigned short *vud;
for (i = 0; i < l; i++) {
-#if X_BYTE_ORDER == X_BIG_ENDIAN
+#ifdef __SSE2__
+ _mm_storeu_si128(
+ (void*)vuvud,
+ _mm_unpacklo_epi8(
+ _mm_loadl_epi64((void*)vs),
+ _mm_loadl_epi64((void*)us)));
+ vuvud+=4;
+ us+=8;
+ vs+=8;
+#else /* __SSE2__ */
+# if X_BYTE_ORDER == X_BIG_ENDIAN
*vuvud++ = (vs[0]<<24) | (us[0]<<16) | (vs[1]<<8) | us[1];
-#else
+# else
*vuvud++ = vs[0] | (us[0]<<8) | (vs[1]<<16) | (us[1]<<24);
-#endif
+# endif
us+=2;
vs+=2;
+#endif /* __SSE2__ */
}
- if (e) {
- unsigned short *vud = (unsigned short *) vuvud;
+ vud = (unsigned short *)vuvud;
+ for (i = 0; i < e; i++) {
#if X_BYTE_ORDER == X_BIG_ENDIAN
- *vud = us[0] | (vs[0]<<8);
+ vud[i] = us[i] | (vs[i]<<8);
#else
- *vud = vs[0] | (us[0]<<8);
+ vud[i] = vs[i] | (us[i]<<8);
#endif
}