diff options
author | Falk Hüffner <mellum@users.sourceforge.net> | 2002-07-01 04:26:07 +0000 |
---|---|---|
committer | Falk Hüffner <mellum@users.sourceforge.net> | 2002-07-01 04:26:07 +0000 |
commit | bb7d4939bae627bc9f917ae1d2870cfd4f571cde (patch) | |
tree | 5c58ae1a8f176551b6c8ceeb603a84a11562e26c /libavcodec/alpha/dsputil_alpha.c | |
parent | e09d12f4f6cdf0828ef853c396636dcf6e4cf10f (diff) | |
download | ffmpeg-bb7d4939bae627bc9f917ae1d2870cfd4f571cde.tar.gz |
Implement put_pixels_clamped and add_pixels_clamped in assembler. This
allows better scheduling of the memory accesses and is portable across
all compilers.
Originally committed as revision 709 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/alpha/dsputil_alpha.c')
-rw-r--r-- | libavcodec/alpha/dsputil_alpha.c | 106 |
1 file changed, 64 insertions, 42 deletions
diff --git a/libavcodec/alpha/dsputil_alpha.c b/libavcodec/alpha/dsputil_alpha.c
index 06d2fdad51..5e1aa20933 100644
--- a/libavcodec/alpha/dsputil_alpha.c
+++ b/libavcodec/alpha/dsputil_alpha.c
@@ -22,64 +22,86 @@
 void simple_idct_axp(DCTELEM *block);
-static void put_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
-                                   int line_size)
+void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+                                int line_size);
+void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+                                int line_size);
+
+#if 0
+/* These functions were the base for the optimized assembler routines,
+   and remain here for documentation purposes.  */
+static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
+                                   int line_size)
 {
     int i = 8;
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
     ASM_ACCEPT_MVI;
     do {
-        UINT64 shorts;
+        uint64_t shorts0, shorts1;
-        shorts = ldq(block);
-        shorts = maxsw4(shorts, 0);
-        shorts = minsw4(shorts, WORD_VEC(0x00ff));
-        stl(pkwb(shorts), pixels);
+        shorts0 = ldq(block);
+        shorts0 = maxsw4(shorts0, 0);
+        shorts0 = minsw4(shorts0, clampmask);
+        stl(pkwb(shorts0), pixels);
-        shorts = ldq(block + 4);
-        shorts = maxsw4(shorts, 0);
-        shorts = minsw4(shorts, WORD_VEC(0x00ff));
-        stl(pkwb(shorts), pixels + 4);
+        shorts1 = ldq(block + 4);
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
+        stl(pkwb(shorts1), pixels + 4);
-        pixels += line_size;
-        block += 8;
+        pixels += line_size;
+        block += 8;
     } while (--i);
 }
-static void add_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
-                                   int line_size)
+void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
+                            int line_size)
 {
-    int i = 8;
+    int h = 8;
+    /* Keep this function a leaf function by generating the constants
+       manually (mainly for the hack value ;-).  */
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
+    uint64_t signmask = zap(-1, 0x33);
+    signmask ^= signmask >> 1; /* 0x8000800080008000 */
     ASM_ACCEPT_MVI;
     do {
-        UINT64 shorts;
-
-        shorts = ldq(block);
-        shorts &= ~WORD_VEC(0x8000); /* clear highest bit to avoid overflow */
-        shorts += unpkbw(ldl(pixels));
-        shorts &= ~WORD_VEC(0x8000); /* hibit would be set for e. g. -2 + 3 */
-        shorts = minuw4(shorts, WORD_VEC(0x4000)); /* set neg. to 0x4000 */
-        shorts &= ~WORD_VEC(0x4000); /* ...and zap them */
-        shorts = minsw4(shorts, WORD_VEC(0x00ff)); /* clamp to 255 */
-        stl(pkwb(shorts), pixels);
-
-        /* next 4 */
-        shorts = ldq(block + 4);
-        shorts &= ~WORD_VEC(0x8000);
-        shorts += unpkbw(ldl(pixels + 4));
-        shorts &= ~WORD_VEC(0x8000);
-        shorts = minuw4(shorts, WORD_VEC(0x4000));
-        shorts &= ~WORD_VEC(0x4000);
-        shorts = minsw4(shorts, WORD_VEC(0x00ff));
-        stl(pkwb(shorts), pixels + 4);
-
-        pixels += line_size;
-        block += 8;
-    } while (--i);
+        uint64_t shorts0, pix0, signs0;
+        uint64_t shorts1, pix1, signs1;
+
+        shorts0 = ldq(block);
+        shorts1 = ldq(block + 4);
+
+        pix0 = unpkbw(ldl(pixels));
+        /* Signed subword add (MMX paddw). */
+        signs0 = shorts0 & signmask;
+        shorts0 &= ~signmask;
+        shorts0 += pix0;
+        shorts0 ^= signs0;
+        /* Clamp. */
+        shorts0 = maxsw4(shorts0, 0);
+        shorts0 = minsw4(shorts0, clampmask);
+
+        /* Next 4. */
+        pix1 = unpkbw(ldl(pixels + 4));
+        signs1 = shorts1 & signmask;
+        shorts1 &= ~signmask;
+        shorts1 += pix1;
+        shorts1 ^= signs1;
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
+
+        stl(pkwb(shorts0), pixels);
+        stl(pkwb(shorts1), pixels + 4);
+
+        pixels += line_size;
+        block += 8;
+    } while (--h);
 }
+#endif
 /* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
    Since the immediate result could be greater than 255, we do the
@@ -222,7 +244,7 @@ void dsputil_init_alpha(void)
     /* amask clears all bits that correspond to present features.  */
     if (amask(AMASK_MVI) == 0) {
-        put_pixels_clamped = put_pixels_clamped_axp;
-        add_pixels_clamped = add_pixels_clamped_axp;
+        put_pixels_clamped = put_pixels_clamped_mvi_asm;
+        add_pixels_clamped = add_pixels_clamped_mvi_asm;
     }
 }