From 08099ed17df19dd1ef8859ffa74d6ddc7fb84b40 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Thu, 3 Mar 2022 14:12:06 +0000 Subject: [PATCH] gdk: Remove pixel format conversion ARM intrinsics This was only useful when building for AArch32 without -mfpu=neon; on AArch64 or with -mfpu=neon, gcc is smart enough to do the auto-vectorisation, leading to code almost as good as what I wrote in 1fdf5b7cf8fec2d94389ecad5ec28865f8dcfd99. --- gdk/gdkmemoryformat.c | 50 ------------------------------------------- 1 file changed, 50 deletions(-) diff --git a/gdk/gdkmemoryformat.c b/gdk/gdkmemoryformat.c index 64c027eac5..21667c1dcb 100644 --- a/gdk/gdkmemoryformat.c +++ b/gdk/gdkmemoryformat.c @@ -25,10 +25,6 @@ #include -#ifdef __ARM_NEON -#include <arm_neon.h> -#endif - typedef struct _GdkMemoryFormatDescription GdkMemoryFormatDescription; #define TYPED_FUNCS(name, T, R, G, B, A, bpp, scale) \ @@ -178,52 +174,6 @@ r8g8b8a8_to_b8g8r8a8_premultiplied (guchar *dest, const guchar *src, gsize n) { -#ifdef __ARM_NEON - uint16x8_t one = vdupq_n_u16 (1); - uint16x8_t half = vdupq_n_u16 (127); - - for (gsize i = n / 8; i > 0; i--) - { - // Work on “just” 8 pixels at once, since we need the full 16-bytes of - // the q registers for the multiplication. - uint8x8x4_t rgba = vld4_u8 (src); - uint8x8_t r8 = rgba.val[0]; - uint8x8_t g8 = rgba.val[1]; - uint8x8_t b8 = rgba.val[2]; - uint8x8_t a8 = rgba.val[3]; - - // This is the same algorithm as premultiply(), but on packed 16-bit - // instead of float. - uint16x8_t r16 = vmull_u8 (r8, a8); - uint16x8_t g16 = vmull_u8 (g8, a8); - uint16x8_t b16 = vmull_u8 (b8, a8); - - r16 = vaddq_u16 (r16, half); - g16 = vaddq_u16 (g16, half); - b16 = vaddq_u16 (b16, half); - - r16 = vsraq_n_u16 (r16, r16, 8); - g16 = vsraq_n_u16 (g16, g16, 8); - b16 = vsraq_n_u16 (b16, b16, 8); - - r16 = vaddq_u16 (r16, one); - g16 = vaddq_u16 (g16, one); - b16 = vaddq_u16 (b16, one); - - // Just like the other one, here we use BGRA instead of RGBA! 
- rgba.val[0] = vshrn_n_u16 (b16, 8); - rgba.val[1] = vshrn_n_u16 (g16, 8); - rgba.val[2] = vshrn_n_u16 (r16, 8); - - vst4_u8 (dest, rgba); - src += 32; - dest += 32; - } - - // We want the fallthrough here for the last (up to) seven bytes of the row. - n = n % 8; -#endif // __ARM_NEON - for (; n > 0; n--) { guchar a = src[3]; -- 2.30.2