diff options
Diffstat (limited to 'cogl/cogl-bitmap-conversion.c')
-rw-r--r-- | cogl/cogl-bitmap-conversion.c | 468 |
1 files changed, 468 insertions, 0 deletions
diff --git a/cogl/cogl-bitmap-conversion.c b/cogl/cogl-bitmap-conversion.c new file mode 100644 index 00000000..b085f6eb --- /dev/null +++ b/cogl/cogl-bitmap-conversion.c @@ -0,0 +1,468 @@ +/* + * Cogl + * + * An object oriented GL/GLES Abstraction/Utility Layer + * + * Copyright (C) 2007,2008,2009 Intel Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see <http://www.gnu.org/licenses/>. + * + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "cogl-private.h" +#include "cogl-bitmap-private.h" + +#include <string.h> + +#define component_type guint8 +/* We want to specially optimise the packing when we are converting + to/from an 8-bit type so that it won't do anything. That way for + example if we are just doing a swizzle conversion then the inner + loop for the conversion will be really simple */ +#define UNPACK_BYTE(b) (b) +#define PACK_BYTE(b) (b) +#include "cogl-bitmap-packing.h" +#undef PACK_BYTE +#undef UNPACK_BYTE +#undef component_type + +#define component_type guint16 +#define UNPACK_BYTE(b) (((b) * 65535 + 127) / 255) +#define PACK_BYTE(b) (((b) * 255 + 32767) / 65535) +#include "cogl-bitmap-packing.h" +#undef PACK_BYTE +#undef UNPACK_BYTE +#undef component_type + +/* (Un)Premultiplication */ + +inline static void +_cogl_unpremult_alpha_0 (guint8 *dst) +{ + dst[0] = 0; + dst[1] = 0; + dst[2] = 0; + dst[3] = 0; +} + +inline static void +_cogl_unpremult_alpha_last (guint8 *dst) +{ + guint8 alpha = dst[3]; + + dst[0] = (dst[0] * 255) / alpha; + dst[1] = (dst[1] * 255) / alpha; + dst[2] = (dst[2] * 255) / alpha; +} + +inline static void +_cogl_unpremult_alpha_first (guint8 *dst) +{ + guint8 alpha = dst[0]; + + dst[1] = (dst[1] * 255) / alpha; + dst[2] = (dst[2] * 255) / alpha; + dst[3] = (dst[3] * 255) / alpha; +} + +/* No division form of floor((c*a + 128)/255) (I first encountered + * this in the RENDER implementation in the X server.) Being exact + * is important for a == 255 - we want to get exactly c. + */ +#define MULT(d,a,t) \ + G_STMT_START { \ + t = d * a + 128; \ + d = ((t >> 8) + t) >> 8; \ + } G_STMT_END + +inline static void +_cogl_premult_alpha_last (guint8 *dst) +{ + guint8 alpha = dst[3]; + /* Using a separate temporary per component has given slightly better + * code generation with GCC in the past; it shouldn't do any worse in + * any case. + */ + unsigned int t1, t2, t3; + MULT(dst[0], alpha, t1); + MULT(dst[1], alpha, t2); + MULT(dst[2], alpha, t3); +} + +inline static void +_cogl_premult_alpha_first (guint8 *dst) +{ + guint8 alpha = dst[0]; + unsigned int t1, t2, t3; + + MULT(dst[1], alpha, t1); + MULT(dst[2], alpha, t2); + MULT(dst[3], alpha, t3); +} + +#undef MULT + +/* Use the SSE optimized version to premult four pixels at once when + it is available. The same assembler code works for x86 and x86-64 + because it doesn't refer to any non-SSE registers directly */ +#if defined(__SSE2__) && defined(__GNUC__) \ + && (defined(__x86_64) || defined(__i386)) +#define COGL_USE_PREMULT_SSE2 +#endif + +#ifdef COGL_USE_PREMULT_SSE2 + +inline static void +_cogl_premult_alpha_last_four_pixels_sse2 (guint8 *p) +{ + /* 8 copies of 128 used below */ + static const gint16 eight_halves[8] __attribute__ ((aligned (16))) = + { 128, 128, 128, 128, 128, 128, 128, 128 }; + /* Mask of the rgb components of the four pixels */ + static const gint8 just_rgb[16] __attribute__ ((aligned (16))) = + { 0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00, + 0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00 }; + /* Each SSE register only holds two pixels because we need to work + with 16-bit intermediate values. We still do four pixels by + interleaving two registers in the hope that it will pipeline + better */ + asm (/* Load eight_halves into xmm5 for later */ + "movdqa (%1), %%xmm5\n" + /* Clear xmm3 */ + "pxor %%xmm3, %%xmm3\n" + /* Load two pixels from p into the low half of xmm0 */ + "movlps (%0), %%xmm0\n" + /* Load the next set of two pixels from p into the low half of xmm1 */ + "movlps 8(%0), %%xmm1\n" + /* Unpack 8 bytes from the low quad-words in each register to 8 + 16-bit values */ + "punpcklbw %%xmm3, %%xmm0\n" + "punpcklbw %%xmm3, %%xmm1\n" + /* Copy alpha values of the first pixel in xmm0 to all + components of the first pixel in xmm2 */ + "pshuflw $255, %%xmm0, %%xmm2\n" + /* same for xmm1 and xmm3 */ + "pshuflw $255, %%xmm1, %%xmm3\n" + /* The above also copies the second pixel directly so we now + want to replace the RGB components with copies of the alpha + components */ + "pshufhw $255, %%xmm2, %%xmm2\n" + "pshufhw $255, %%xmm3, %%xmm3\n" + /* Multiply the rgb components by the alpha */ + "pmullw %%xmm2, %%xmm0\n" + "pmullw %%xmm3, %%xmm1\n" + /* Add 128 to each component */ + "paddw %%xmm5, %%xmm0\n" + "paddw %%xmm5, %%xmm1\n" + /* Copy the results to temporary registers xmm4 and xmm5 */ + "movdqa %%xmm0, %%xmm4\n" + "movdqa %%xmm1, %%xmm5\n" + /* Divide the results by 256 */ + "psrlw $8, %%xmm0\n" + "psrlw $8, %%xmm1\n" + /* Add the temporaries back in */ + "paddw %%xmm4, %%xmm0\n" + "paddw %%xmm5, %%xmm1\n" + /* Divide again */ + "psrlw $8, %%xmm0\n" + "psrlw $8, %%xmm1\n" + /* Pack the results back as bytes */ + "packuswb %%xmm1, %%xmm0\n" + /* Load just_rgb into xmm3 for later */ + "movdqa (%2), %%xmm3\n" + /* Reload all four pixels into xmm2 */ + "movups (%0), %%xmm2\n" + /* Mask out the alpha from the results */ + "andps %%xmm3, %%xmm0\n" + /* Mask out the RGB from the original four pixels */ + "andnps %%xmm2, %%xmm3\n" + /* Combine the two to get the right alpha values */ + "orps %%xmm3, %%xmm0\n" + /* Write to memory */ + "movdqu %%xmm0, (%0)\n" + : /* no outputs */ + : "r" (p), "r" (eight_halves), "r" (just_rgb) + : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#endif /* COGL_USE_PREMULT_SSE2 */ + +static gboolean +_cogl_bitmap_can_premult (CoglPixelFormat format) +{ + switch (format & ~COGL_PREMULT_BIT) + { + case COGL_PIXEL_FORMAT_RGBA_8888: + case COGL_PIXEL_FORMAT_BGRA_8888: + case COGL_PIXEL_FORMAT_ARGB_8888: + case COGL_PIXEL_FORMAT_ABGR_8888: + return TRUE; + + default: + return FALSE; + } +} + +static gboolean +_cogl_bitmap_needs_short_temp_buffer (CoglPixelFormat format) +{ + /* If the format is using more than 8 bits per component then we'll + unpack into a 16-bit per component buffer instead of 8-bit so we + won't lose as much precision. If we ever add support for formats + with more than 16 bits for at least one of the components then we + should probably do something else here, maybe convert to + floats */ + switch (format) + { + case COGL_PIXEL_FORMAT_ANY: + case COGL_PIXEL_FORMAT_YUV: + g_assert_not_reached (); + + case COGL_PIXEL_FORMAT_A_8: + case COGL_PIXEL_FORMAT_RGB_565: + case COGL_PIXEL_FORMAT_RGBA_4444: + case COGL_PIXEL_FORMAT_RGBA_5551: + case COGL_PIXEL_FORMAT_G_8: + case COGL_PIXEL_FORMAT_RGB_888: + case COGL_PIXEL_FORMAT_BGR_888: + case COGL_PIXEL_FORMAT_RGBA_8888: + case COGL_PIXEL_FORMAT_BGRA_8888: + case COGL_PIXEL_FORMAT_ARGB_8888: + case COGL_PIXEL_FORMAT_ABGR_8888: + case COGL_PIXEL_FORMAT_RGBA_8888_PRE: + case COGL_PIXEL_FORMAT_BGRA_8888_PRE: + case COGL_PIXEL_FORMAT_ARGB_8888_PRE: + case COGL_PIXEL_FORMAT_ABGR_8888_PRE: + case COGL_PIXEL_FORMAT_RGBA_4444_PRE: + case COGL_PIXEL_FORMAT_RGBA_5551_PRE: + return FALSE; + + case COGL_PIXEL_FORMAT_RGBA_1010102: + case COGL_PIXEL_FORMAT_BGRA_1010102: + case COGL_PIXEL_FORMAT_ARGB_2101010: + case COGL_PIXEL_FORMAT_ABGR_2101010: + case COGL_PIXEL_FORMAT_RGBA_1010102_PRE: + case COGL_PIXEL_FORMAT_BGRA_1010102_PRE: + case COGL_PIXEL_FORMAT_ARGB_2101010_PRE: + case COGL_PIXEL_FORMAT_ABGR_2101010_PRE: + return TRUE; + } + + g_assert_not_reached (); +} + +CoglBitmap * +_cogl_bitmap_convert (CoglBitmap *src_bmp, + CoglPixelFormat dst_format) +{ + guint8 *src_data; + guint8 *dst_data; + guint8 *src; + guint8 *dst; + void *tmp_row; + int dst_bpp; + int src_rowstride; + int dst_rowstride; + int y; + int width, height; + CoglPixelFormat src_format; + gboolean use_16; + + src_format = _cogl_bitmap_get_format (src_bmp); + src_rowstride = _cogl_bitmap_get_rowstride (src_bmp); + width = _cogl_bitmap_get_width (src_bmp); + height = _cogl_bitmap_get_height (src_bmp); + + src_data = _cogl_bitmap_map (src_bmp, COGL_BUFFER_ACCESS_READ, 0); + if (src_data == NULL) + return NULL; + + use_16 = _cogl_bitmap_needs_short_temp_buffer (dst_format); + + dst_bpp = _cogl_pixel_format_get_bytes_per_pixel (dst_format); + + /* Initialize destination bitmap */ + dst_rowstride = (sizeof(guint8) * dst_bpp * width + 3) & ~3; + /* Copy the premult bit if the new format has an alpha channel */ + if (COGL_PIXEL_FORMAT_CAN_HAVE_PREMULT (dst_format)) + dst_format = ((src_format & COGL_PREMULT_BIT) | + (dst_format & ~COGL_PREMULT_BIT)); + + /* Allocate a new buffer to hold converted data */ + dst_data = g_malloc (height * dst_rowstride); + /* and a buffer to hold a temporary RGBA row */ + tmp_row = g_malloc (width * + (use_16 ? sizeof (guint16) : sizeof (guint8)) * 4); + + /* FIXME: Optimize */ + for (y = 0; y < height; y++) + { + src = src_data + y * src_rowstride; + dst = dst_data + y * dst_rowstride; + + if (use_16) + _cogl_unpack_guint16 (src_format, src, tmp_row, width); + else + _cogl_unpack_guint8 (src_format, src, tmp_row, width); + + if (use_16) + _cogl_pack_guint16 (dst_format, tmp_row, dst, width); + else + _cogl_pack_guint8 (dst_format, tmp_row, dst, width); + } + + _cogl_bitmap_unmap (src_bmp); + + g_free (tmp_row); + + return _cogl_bitmap_new_from_data (dst_data, + dst_format, + width, height, dst_rowstride, + (CoglBitmapDestroyNotify) g_free, + NULL); +} + +gboolean +_cogl_bitmap_unpremult (CoglBitmap *bmp) +{ + guint8 *p, *data; + int x,y; + CoglPixelFormat format; + int width, height; + int rowstride; + + format = _cogl_bitmap_get_format (bmp); + width = _cogl_bitmap_get_width (bmp); + height = _cogl_bitmap_get_height (bmp); + rowstride = _cogl_bitmap_get_rowstride (bmp); + + /* If we can premult that implies we can un-premult too... */ + if (!_cogl_bitmap_can_premult (format)) + return FALSE; + + if ((data = _cogl_bitmap_map (bmp, + COGL_BUFFER_ACCESS_READ | + COGL_BUFFER_ACCESS_WRITE, + 0)) == NULL) + return FALSE; + + for (y = 0; y < height; y++) + { + p = (guint8*) data + y * rowstride; + + if (format & COGL_AFIRST_BIT) + { + for (x = 0; x < width; x++) + { + if (p[0] == 0) + _cogl_unpremult_alpha_0 (p); + else + _cogl_unpremult_alpha_first (p); + p += 4; + } + } + else + { + for (x = 0; x < width; x++) + { + if (p[3] == 0) + _cogl_unpremult_alpha_0 (p); + else + _cogl_unpremult_alpha_last (p); + p += 4; + } + } + } + + _cogl_bitmap_unmap (bmp); + + _cogl_bitmap_set_format (bmp, format & ~COGL_PREMULT_BIT); + + return TRUE; +} + +gboolean +_cogl_bitmap_premult (CoglBitmap *bmp) +{ + guint8 *p, *data; + int x,y; + CoglPixelFormat format; + int width, height; + int rowstride; + + format = _cogl_bitmap_get_format (bmp); + width = _cogl_bitmap_get_width (bmp); + height = _cogl_bitmap_get_height (bmp); + rowstride = _cogl_bitmap_get_rowstride (bmp); + + /* Make sure format supported for un-premultiplication */ + if (!_cogl_bitmap_can_premult (format)) + return FALSE; + + if ((data = _cogl_bitmap_map (bmp, + COGL_BUFFER_ACCESS_READ | + COGL_BUFFER_ACCESS_WRITE, + 0)) == NULL) + return FALSE; + + for (y = 0; y < height; y++) + { + p = (guint8*) data + y * rowstride; + + if (format & COGL_AFIRST_BIT) + { + for (x = 0; x < width; x++) + { + _cogl_premult_alpha_first (p); + p += 4; + } + } + else + { + x = width; + +#ifdef COGL_USE_PREMULT_SSE2 + + /* Process 4 pixels at a time */ + while (x >= 4) + { + _cogl_premult_alpha_last_four_pixels_sse2 (p); + p += 4 * 4; + x -= 4; + } + + /* If there are any pixels left we will fall through and + handle them below */ + +#endif /* COGL_USE_PREMULT_SSE2 */ + + while (x-- > 0) + { + _cogl_premult_alpha_last (p); + p += 4; + } + } + } + + _cogl_bitmap_unmap (bmp); + + _cogl_bitmap_set_format (bmp, format | COGL_PREMULT_BIT); + + return TRUE; +} |