diff options
author | Matthias Clasen <mclasen@redhat.com> | 2022-03-10 19:26:25 +0000 |
---|---|---|
committer | Matthias Clasen <mclasen@redhat.com> | 2022-03-10 19:26:25 +0000 |
commit | 9f7c9ce6ad9d0ad8743fe9c5d5a66b7ae477a5c8 (patch) | |
tree | 3f55b203fb125ab8ee87c2c6d664dd8451c9cf04 | |
parent | 1f155bf39af56cb5a7d6ede72983502a9286cc9c (diff) | |
parent | 588076d16676b7231d6c61ab89a8fbc7be9a782d (diff) | |
download | gtk+-9f7c9ce6ad9d0ad8743fe9c5d5a66b7ae477a5c8.tar.gz |
Merge branch 'better-format-conversions' into 'main'
gdk: Clean up the optimised premultiply conversion function
See merge request GNOME/gtk!4550
-rw-r--r-- | gdk/gdkmemoryformat.c | 155 |
1 files changed, 84 insertions, 71 deletions
diff --git a/gdk/gdkmemoryformat.c b/gdk/gdkmemoryformat.c index 64c027eac5..5bd7b228d4 100644 --- a/gdk/gdkmemoryformat.c +++ b/gdk/gdkmemoryformat.c @@ -25,10 +25,6 @@ #include <epoxy/gl.h> -#ifdef __ARM_NEON -#include <arm_neon.h> -#endif - typedef struct _GdkMemoryFormatDescription GdkMemoryFormatDescription; #define TYPED_FUNCS(name, T, R, G, B, A, bpp, scale) \ @@ -170,75 +166,54 @@ r32g32b32a32_float_from_float (guchar *dest, memcpy (dest, src, sizeof (float) * n * 4); } -// This one conversion is quite important, it converts from RGBA with straight -// alpha (as found in PNG for instance) to BGRA with premultiplied alpha (the -// sole cairo format available). -static void -r8g8b8a8_to_b8g8r8a8_premultiplied (guchar *dest, - const guchar *src, - gsize n) -{ -#ifdef __ARM_NEON - uint16x8_t one = vdupq_n_u16 (1); - uint16x8_t half = vdupq_n_u16 (127); - - for (gsize i = n / 8; i > 0; i--) - { - // Work on “just” 8 pixels at once, since we need the full 16-bytes of - // the q registers for the multiplication. - uint8x8x4_t rgba = vld4_u8 (src); - uint8x8_t r8 = rgba.val[0]; - uint8x8_t g8 = rgba.val[1]; - uint8x8_t b8 = rgba.val[2]; - uint8x8_t a8 = rgba.val[3]; - - // This is the same algorithm as premultiply(), but on packed 16-bit - // instead of float. - uint16x8_t r16 = vmull_u8 (r8, a8); - uint16x8_t g16 = vmull_u8 (g8, a8); - uint16x8_t b16 = vmull_u8 (b8, a8); - - r16 = vaddq_u16 (r16, half); - g16 = vaddq_u16 (g16, half); - b16 = vaddq_u16 (b16, half); - - r16 = vsraq_n_u16 (r16, r16, 8); - g16 = vsraq_n_u16 (g16, g16, 8); - b16 = vsraq_n_u16 (b16, b16, 8); - - r16 = vaddq_u16 (r16, one); - g16 = vaddq_u16 (g16, one); - b16 = vaddq_u16 (b16, one); - - // Just like the other one, here we use BGRA instead of RGBA! - rgba.val[0] = vshrn_n_u16 (b16, 8); - rgba.val[1] = vshrn_n_u16 (g16, 8); - rgba.val[2] = vshrn_n_u16 (r16, 8); - - vst4_u8 (dest, rgba); - src += 32; - dest += 32; - } +#define PREMULTIPLY_FUNC(name, R1, G1, B1, A1, R2, G2, B2, A2) \ +static void \ +name (guchar *dest, \ + const guchar *src, \ + gsize n) \ +{ \ + for (; n > 0; n--) \ + { \ + guchar a = src[A1]; \ + guint16 r = (guint16)src[R1] * a + 127; \ + guint16 g = (guint16)src[G1] * a + 127; \ + guint16 b = (guint16)src[B1] * a + 127; \ + dest[R2] = (r + (r >> 8) + 1) >> 8; \ + dest[G2] = (g + (g >> 8) + 1) >> 8; \ + dest[B2] = (b + (b >> 8) + 1) >> 8; \ + dest[A2] = a; \ + dest += 4; \ + src += 4; \ + } \ +} - // We want the fallthrough here for the last (up to) seven bytes of the row. - n = n % 8; -#endif // __ARM_NEON +PREMULTIPLY_FUNC(r8g8b8a8_to_r8g8b8a8_premultiplied, 0, 1, 2, 3, 0, 1, 2, 3) +PREMULTIPLY_FUNC(r8g8b8a8_to_b8g8r8a8_premultiplied, 0, 1, 2, 3, 2, 1, 0, 3) +PREMULTIPLY_FUNC(r8g8b8a8_to_a8r8g8b8_premultiplied, 0, 1, 2, 3, 1, 2, 3, 0) +PREMULTIPLY_FUNC(r8g8b8a8_to_a8b8g8r8_premultiplied, 0, 1, 2, 3, 3, 2, 1, 0) - for (; n > 0; n--) - { - guchar a = src[3]; - guint16 r = (guint16)src[0] * a + 127; - guint16 g = (guint16)src[1] * a + 127; - guint16 b = (guint16)src[2] * a + 127; - dest[0] = (b + (b >> 8) + 1) >> 8; - dest[1] = (g + (g >> 8) + 1) >> 8; - dest[2] = (r + (r >> 8) + 1) >> 8; - dest[3] = a; - dest += 4; - src += 4; - } +#define ADD_ALPHA_FUNC(name, R1, G1, B1, R2, G2, B2, A2) \ +static void \ +name (guchar *dest, \ + const guchar *src, \ + gsize n) \ +{ \ + for (; n > 0; n--) \ + { \ + dest[R2] = src[R1]; \ + dest[G2] = src[G1]; \ + dest[B2] = src[B1]; \ + dest[A2] = 255; \ + dest += 4; \ + src += 3; \ + } \ } +ADD_ALPHA_FUNC(r8g8b8_to_r8g8b8a8, 0, 1, 2, 0, 1, 2, 3) +ADD_ALPHA_FUNC(r8g8b8_to_b8g8r8a8, 0, 1, 2, 2, 1, 0, 3) +ADD_ALPHA_FUNC(r8g8b8_to_a8r8g8b8, 0, 1, 2, 1, 2, 3, 0) +ADD_ALPHA_FUNC(r8g8b8_to_a8b8g8r8, 0, 1, 2, 3, 2, 1, 0) + struct _GdkMemoryFormatDescription { GdkMemoryAlpha alpha; @@ -548,15 +523,53 @@ gdk_memory_convert (guchar *dest_data, const GdkMemoryFormatDescription *src_desc = &memory_formats[src_format]; float *tmp; gsize y; + void (*func) (guchar *, const guchar *, gsize) = NULL; g_assert (dest_format < GDK_MEMORY_N_FORMATS); g_assert (src_format < GDK_MEMORY_N_FORMATS); - if (src_format == GDK_MEMORY_R8G8B8A8 && dest_format == GDK_MEMORY_B8G8R8A8_PREMULTIPLIED) + if (src_format == GDK_MEMORY_R8G8B8A8 && dest_format == GDK_MEMORY_R8G8B8A8_PREMULTIPLIED) + func = r8g8b8a8_to_r8g8b8a8_premultiplied; + else if (src_format == GDK_MEMORY_B8G8R8A8 && dest_format == GDK_MEMORY_R8G8B8A8_PREMULTIPLIED) + func = r8g8b8a8_to_b8g8r8a8_premultiplied; + else if (src_format == GDK_MEMORY_R8G8B8A8 && dest_format == GDK_MEMORY_B8G8R8A8_PREMULTIPLIED) + func = r8g8b8a8_to_b8g8r8a8_premultiplied; + else if (src_format == GDK_MEMORY_B8G8R8A8 && dest_format == GDK_MEMORY_B8G8R8A8_PREMULTIPLIED) + func = r8g8b8a8_to_r8g8b8a8_premultiplied; + else if (src_format == GDK_MEMORY_R8G8B8A8 && dest_format == GDK_MEMORY_A8R8G8B8_PREMULTIPLIED) + func = r8g8b8a8_to_a8r8g8b8_premultiplied; + else if (src_format == GDK_MEMORY_B8G8R8A8 && dest_format == GDK_MEMORY_A8R8G8B8_PREMULTIPLIED) + func = r8g8b8a8_to_a8b8g8r8_premultiplied; + else if (src_format == GDK_MEMORY_R8G8B8 && dest_format == GDK_MEMORY_R8G8B8A8_PREMULTIPLIED) + func = r8g8b8_to_r8g8b8a8; + else if (src_format == GDK_MEMORY_B8G8R8 && dest_format == GDK_MEMORY_R8G8B8A8_PREMULTIPLIED) + func = r8g8b8_to_b8g8r8a8; + else if (src_format == GDK_MEMORY_R8G8B8 && dest_format == GDK_MEMORY_B8G8R8A8_PREMULTIPLIED) + func = r8g8b8_to_b8g8r8a8; + else if (src_format == GDK_MEMORY_B8G8R8 && dest_format == GDK_MEMORY_B8G8R8A8_PREMULTIPLIED) + func = r8g8b8_to_r8g8b8a8; + else if (src_format == GDK_MEMORY_R8G8B8 && dest_format == GDK_MEMORY_A8R8G8B8_PREMULTIPLIED) + func = r8g8b8_to_a8r8g8b8; + else if (src_format == GDK_MEMORY_B8G8R8 && dest_format == GDK_MEMORY_A8R8G8B8_PREMULTIPLIED) + func = r8g8b8_to_a8b8g8r8; + else if (src_format == GDK_MEMORY_R8G8B8 && dest_format == GDK_MEMORY_R8G8B8A8) + func = r8g8b8_to_r8g8b8a8; + else if (src_format == GDK_MEMORY_B8G8R8 && dest_format == GDK_MEMORY_R8G8B8A8) + func = r8g8b8_to_b8g8r8a8; + else if (src_format == GDK_MEMORY_R8G8B8 && dest_format == GDK_MEMORY_B8G8R8A8) + func = r8g8b8_to_b8g8r8a8; + else if (src_format == GDK_MEMORY_B8G8R8 && dest_format == GDK_MEMORY_B8G8R8A8) + func = r8g8b8_to_r8g8b8a8; + else if (src_format == GDK_MEMORY_R8G8B8 && dest_format == GDK_MEMORY_A8R8G8B8) + func = r8g8b8_to_a8r8g8b8; + else if (src_format == GDK_MEMORY_B8G8R8 && dest_format == GDK_MEMORY_A8R8G8B8) + func = r8g8b8_to_a8b8g8r8; + + if (func != NULL) { for (y = 0; y < height; y++) { - r8g8b8a8_to_b8g8r8a8_premultiplied (dest_data, src_data, width); + func (dest_data, src_data, width); src_data += src_stride; dest_data += dest_stride; } |