From 0e3ed7a7380a377c650dfe665db741498e183933 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Tue, 15 Feb 2022 20:54:18 +0100 Subject: [PATCH] =?utf8?q?gdk:=20Specialise=20RGBA8=20=E2=86=92=20premulti?= =?utf8?q?plied=20BGRA8=20conversion?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit On x86 on a Kaby Lake CPU, this makes it go from 6.63% of the total execution time (loading some PNGs using the cairo backend) down to 3.20%. On ARM on a Cortex-A7, on the same workload, this makes it go from 57% to 8.36%. --- gdk/gdkmemoryformat.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/gdk/gdkmemoryformat.c b/gdk/gdkmemoryformat.c index 60c3317bc8..21667c1dcb 100644 --- a/gdk/gdkmemoryformat.c +++ b/gdk/gdkmemoryformat.c @@ -166,6 +166,29 @@ r32g32b32a32_float_from_float (guchar *dest, memcpy (dest, src, sizeof (float) * n * 4); } +// This one conversion is quite important, it converts from RGBA with straight +// alpha (as found in PNG for instance) to BGRA with premultiplied alpha (the +// sole cairo format available). 
+static void
+r8g8b8a8_to_b8g8r8a8_premultiplied (guchar *dest,
+                                    const guchar *src,
+                                    gsize n) /* n is a pixel count: 4 bytes are consumed and produced per step */
+{ /* Reads n straight-alpha RGBA pixels from src and writes n premultiplied BGRA pixels to dest. */
+  for (; n > 0; n--)
+    {
+      guchar a = src[3]; /* source alpha, applied to all three colour channels below */
+      guint16 r = (guint16)src[0] * a + 127; /* with 8-bit inputs this is at most 255 * 255 + 127 = 65152, so 16 bits suffice */
+      guint16 g = (guint16)src[1] * a + 127;
+      guint16 b = (guint16)src[2] * a + 127;
+      dest[0] = (b + (b >> 8) + 1) >> 8; /* shift-only stand-in for dividing the biased product by 255 (rounded) */
+      dest[1] = (g + (g >> 8) + 1) >> 8;
+      dest[2] = (r + (r >> 8) + 1) >> 8; /* red and blue trade places: src[0] ends up in dest[2] */
+      dest[3] = a; /* alpha itself is copied through unchanged */
+      dest += 4;
+      src += 4;
+    }
+}
+
 struct _GdkMemoryFormatDescription
 {
   GdkMemoryAlpha alpha;
@@ -479,6 +502,17 @@ gdk_memory_convert (guchar *dest_data,
   g_assert (dest_format < GDK_MEMORY_N_FORMATS);
   g_assert (src_format < GDK_MEMORY_N_FORMATS);
 
+  if (src_format == GDK_MEMORY_R8G8B8A8 && dest_format == GDK_MEMORY_B8G8R8A8_PREMULTIPLIED)
+    { /* dedicated path for exactly this source/destination format pair */
+      for (y = 0; y < height; y++)
+        {
+          r8g8b8a8_to_b8g8r8a8_premultiplied (dest_data, src_data, width);
+          src_data += src_stride;
+          dest_data += dest_stride;
+        }
+      return; /* leaves before the float scratch buffer below is allocated */
+    }
+
   tmp = g_new (float, width * 4);
 
   for (y = 0; y < height; y++)
-- 
2.30.2