diff options
author | David Schleef <ds@schleef.org> | 2005-06-16 06:46:06 +0000 |
---|---|---|
committer | David Schleef <ds@schleef.org> | 2005-06-16 06:46:06 +0000 |
commit | 508ba3985f38081917b76fcfc4ac84e73ca2954a (patch) | |
tree | bd56b3f8c781a82c6f018a6d4a03d2c33fdd7a5b /liboil/simdpack | |
parent | 7ffccb74ac2ebfd0b6f8361d4016b0ceb3c9581f (diff) | |
download | liboil-508ba3985f38081917b76fcfc4ac84e73ca2954a.tar.gz |
* configure.ac: Add instruction checker
* testsuite/Makefile.am:
* testsuite/instruction/Makefile.am:
* testsuite/instruction/check-instructions.pl:
* testsuite/instruction/list-impls.c: (main):
* liboil/colorspace/argb_paint_i386.c: Fix flags based on advice of
the instruction checker
* liboil/colorspace/ayuv2argb_i386.c:
* liboil/conv/conv_3dnow.c:
* liboil/conv/conv_sse.c:
* liboil/copy/trans8x8_i386.c:
* liboil/dct/idct8x8_i386.c:
* liboil/sse/conv_sse.c:
* liboil/liboilfuncs.h: update
* liboil/liboilmarshal.c: (_oil_test_marshal_function): update
* liboil/liboiltest.c: (oil_test_new), (oil_test_check_function):
Regenerate in-place data for every test iteration. Bump default
n to 1000 to force memcpy to choose a good function (lame hack).
* liboil/copy/copy_i386.c: (copy_u8_mmx3), (copy_u8_mmx4),
(copy_u8_mmx5): new implementation, fix others
* liboil/copy/splat_i386.c: (splat_u32_ns_mmx): make faster
* liboil/copy/splat_ref.c: (splat_u8_ns_int): fix bug
* liboil/colorspace/argb_paint.c: (argb_splat_u8_ref),
(rgba_splat_u8_ref): New functions
* liboil/simdpack/average2_u8.c: (average2_u8_ref),
(average2_u8_trick), (average2_u8_fast), (average2_u8_unroll4):
Implementations really need to follow stride rules.
* liboil/Makefile.am: Don't use SSE flags, because people on
powerpc don't appreciate it.
* examples/memcpy-speed.c: (main): only go to 1<<24 bytes
Diffstat (limited to 'liboil/simdpack')
-rw-r--r-- | liboil/simdpack/average2_u8.c | 35 |
1 files changed, 21 insertions, 14 deletions
diff --git a/liboil/simdpack/average2_u8.c b/liboil/simdpack/average2_u8.c index 3c3f546..fe07aa8 100644 --- a/liboil/simdpack/average2_u8.c +++ b/liboil/simdpack/average2_u8.c @@ -45,7 +45,7 @@ average2_u8_ref (uint8_t * dest, int dstr, uint8_t *src1, int sstr1, int i; for (i = 0; i < n; i++) { - dest[i] = (src1[sstr1 * i] + src2[sstr2 * i]) >> 1; + dest[dstr * i] = (src1[sstr1 * i] + src2[sstr2 * i]) >> 1; } } @@ -57,7 +57,8 @@ average2_u8_trick (uint8_t * dest, int dstr, uint8_t *src1, int sstr1, { unsigned int x, y, d; - if (sstr1 == 1 && sstr2 == 1) { +#if 0 + if (sstr1 == 1 && sstr2 == 1 && dstr == 1) { while (n > 0) { x = *(unsigned int *) src1; y = *(unsigned int *) src2; @@ -67,8 +68,9 @@ average2_u8_trick (uint8_t * dest, int dstr, uint8_t *src1, int sstr1, dest += 4; n -= 4; } - } - else { + } else +#endif + { while (n > 0) { x = (src1[0] << 24) | (src1[sstr1] << 16) | (src1[2 * sstr1] << 8) | (src1[3 * sstr1]); @@ -76,12 +78,12 @@ average2_u8_trick (uint8_t * dest, int dstr, uint8_t *src1, int sstr1, sstr2] << 8) | (src2[3 * sstr2]); d = (((x ^ y) & 0xfefefefe) >> 1) + (x & y); dest[0] = (d >> 24); - dest[1] = (d >> 16); - dest[2] = (d >> 8); - dest[3] = (d >> 0); + dest[1*dstr] = (d >> 16); + dest[2*dstr] = (d >> 8); + dest[3*dstr] = (d >> 0); src1 += 4 * sstr1; src2 += 4 * sstr2; - dest += 4; + dest += 4 * dstr; n -= 4; } } @@ -94,9 +96,10 @@ average2_u8_fast (uint8_t * dest, int dstr, uint8_t *src1, int sstr1, uint8_t *src2, int sstr2, int n) { while (n > 0) { - *dest++ = (*src1 + *src2) >> 1; + *dest = (*src1 + *src2) >> 1; src1 += sstr1; src2 += sstr2; + dest += dstr; n--; } } @@ -108,22 +111,26 @@ average2_u8_unroll4 (uint8_t * dest, int dstr, uint8_t *src1, int sstr1, uint8_t *src2, int sstr2, int n) { while (n & 0x3) { - *dest++ = (*src1 + *src2) >> 1; + *dest = (*src1 + *src2) >> 1; src1 += sstr1; src2 += sstr2; n--; } while (n > 0) { - *dest++ = (*src1 + *src2) >> 1; + *dest = (*src1 + *src2) >> 1; + dest += dstr; src1 += sstr1; src2 += sstr2; 
- *dest++ = (*src1 + *src2) >> 1; + *dest = (*src1 + *src2) >> 1; + dest += dstr; src1 += sstr1; src2 += sstr2; - *dest++ = (*src1 + *src2) >> 1; + *dest = (*src1 + *src2) >> 1; + dest += dstr; src1 += sstr1; src2 += sstr2; - *dest++ = (*src1 + *src2) >> 1; + *dest = (*src1 + *src2) >> 1; + dest += dstr; src1 += sstr1; src2 += sstr2; n -= 4; |