summary | refs | log | tree | commit | diff
path: root/liboil/simdpack
diff options
context:
space:
mode:
author David Schleef <ds@schleef.org> 2005-06-16 06:46:06 +0000
committer David Schleef <ds@schleef.org> 2005-06-16 06:46:06 +0000
commit 508ba3985f38081917b76fcfc4ac84e73ca2954a (patch)
tree bd56b3f8c781a82c6f018a6d4a03d2c33fdd7a5b /liboil/simdpack
parent 7ffccb74ac2ebfd0b6f8361d4016b0ceb3c9581f (diff)
download liboil-508ba3985f38081917b76fcfc4ac84e73ca2954a.tar.gz
* configure.ac: Add instruction checker
* testsuite/Makefile.am:
* testsuite/instruction/Makefile.am:
* testsuite/instruction/check-instructions.pl:
* testsuite/instruction/list-impls.c: (main):
* liboil/colorspace/argb_paint_i386.c: Fix flags based on advice of the instruction checker
* liboil/colorspace/ayuv2argb_i386.c:
* liboil/conv/conv_3dnow.c:
* liboil/conv/conv_sse.c:
* liboil/copy/trans8x8_i386.c:
* liboil/dct/idct8x8_i386.c:
* liboil/sse/conv_sse.c:
* liboil/liboilfuncs.h: update
* liboil/liboilmarshal.c: (_oil_test_marshal_function): update
* liboil/liboiltest.c: (oil_test_new), (oil_test_check_function): regenerate inplace data for every test iteration. Bump default n to 1000 to force memcpy to choose a good function. (lame hack)
* liboil/copy/copy_i386.c: (copy_u8_mmx3), (copy_u8_mmx4), (copy_u8_mmx5): new implementation, fix others
* liboil/copy/splat_i386.c: (splat_u32_ns_mmx): make faster
* liboil/copy/splat_ref.c: (splat_u8_ns_int): fix bug
* liboil/colorspace/argb_paint.c: (argb_splat_u8_ref), (rgba_splat_u8_ref): New functions
* liboil/simdpack/average2_u8.c: (average2_u8_ref), (average2_u8_trick), (average2_u8_fast), (average2_u8_unroll4): Implementations really need to follow stride rules.
* liboil/Makefile.am: Don't use SSE flags, because people on powerpc don't appreciate it.
* examples/memcpy-speed.c: (main): only go to 1<<24 bytes
Diffstat (limited to 'liboil/simdpack')
-rw-r--r-- liboil/simdpack/average2_u8.c 35
1 files changed, 21 insertions, 14 deletions
diff --git a/liboil/simdpack/average2_u8.c b/liboil/simdpack/average2_u8.c
index 3c3f546..fe07aa8 100644
--- a/liboil/simdpack/average2_u8.c
+++ b/liboil/simdpack/average2_u8.c
@@ -45,7 +45,7 @@ average2_u8_ref (uint8_t * dest, int dstr, uint8_t *src1, int sstr1,
int i;
for (i = 0; i < n; i++) {
- dest[i] = (src1[sstr1 * i] + src2[sstr2 * i]) >> 1;
+ dest[dstr * i] = (src1[sstr1 * i] + src2[sstr2 * i]) >> 1;
}
}
@@ -57,7 +57,8 @@ average2_u8_trick (uint8_t * dest, int dstr, uint8_t *src1, int sstr1,
{
unsigned int x, y, d;
- if (sstr1 == 1 && sstr2 == 1) {
+#if 0
+ if (sstr1 == 1 && sstr2 == 1 && dstr == 1) {
while (n > 0) {
x = *(unsigned int *) src1;
y = *(unsigned int *) src2;
@@ -67,8 +68,9 @@ average2_u8_trick (uint8_t * dest, int dstr, uint8_t *src1, int sstr1,
dest += 4;
n -= 4;
}
- }
- else {
+ } else
+#endif
+ {
while (n > 0) {
x = (src1[0] << 24) | (src1[sstr1] << 16) | (src1[2 *
sstr1] << 8) | (src1[3 * sstr1]);
@@ -76,12 +78,12 @@ average2_u8_trick (uint8_t * dest, int dstr, uint8_t *src1, int sstr1,
sstr2] << 8) | (src2[3 * sstr2]);
d = (((x ^ y) & 0xfefefefe) >> 1) + (x & y);
dest[0] = (d >> 24);
- dest[1] = (d >> 16);
- dest[2] = (d >> 8);
- dest[3] = (d >> 0);
+ dest[1*dstr] = (d >> 16);
+ dest[2*dstr] = (d >> 8);
+ dest[3*dstr] = (d >> 0);
src1 += 4 * sstr1;
src2 += 4 * sstr2;
- dest += 4;
+ dest += 4 * dstr;
n -= 4;
}
}
@@ -94,9 +96,10 @@ average2_u8_fast (uint8_t * dest, int dstr, uint8_t *src1, int sstr1,
uint8_t *src2, int sstr2, int n)
{
while (n > 0) {
- *dest++ = (*src1 + *src2) >> 1;
+ *dest = (*src1 + *src2) >> 1;
src1 += sstr1;
src2 += sstr2;
+ dest += dstr;
n--;
}
}
@@ -108,22 +111,26 @@ average2_u8_unroll4 (uint8_t * dest, int dstr, uint8_t *src1, int sstr1,
uint8_t *src2, int sstr2, int n)
{
while (n & 0x3) {
- *dest++ = (*src1 + *src2) >> 1;
+ *dest = (*src1 + *src2) >> 1;
src1 += sstr1;
src2 += sstr2;
n--;
}
while (n > 0) {
- *dest++ = (*src1 + *src2) >> 1;
+ *dest = (*src1 + *src2) >> 1;
+ dest += dstr;
src1 += sstr1;
src2 += sstr2;
- *dest++ = (*src1 + *src2) >> 1;
+ *dest = (*src1 + *src2) >> 1;
+ dest += dstr;
src1 += sstr1;
src2 += sstr2;
- *dest++ = (*src1 + *src2) >> 1;
+ *dest = (*src1 + *src2) >> 1;
+ dest += dstr;
src1 += sstr1;
src2 += sstr2;
- *dest++ = (*src1 + *src2) >> 1;
+ *dest = (*src1 + *src2) >> 1;
+ dest += dstr;
src1 += sstr1;
src2 += sstr2;
n -= 4;