From b3e884edf3b5c58fb4c2cede1346bd8a9d9c4a1e Mon Sep 17 00:00:00 2001
From: =?utf8?q?=C3=98yvind=20Kol=C3=A5s?=
Date: Mon, 24 Jan 2022 07:05:12 +0100
Subject: [PATCH] make trampoline for lut processing

---
 babl/babl-fish-path.c          | 137 +++++----------------------------
 babl/babl-internal.h           |  13 ++++
 babl/babl.c                    |  41 +++++++++-
 babl/base/babl-rgb-converter.c | 153 +++++++++++++++++++++++++++++
 4 files changed, 224 insertions(+), 120 deletions(-)

diff --git a/babl/babl-fish-path.c b/babl/babl-fish-path.c
index f709c3f..7278ec2 100644
--- a/babl/babl-fish-path.c
+++ b/babl/babl-fish-path.c
@@ -76,7 +76,7 @@ get_path_instrumentation (FishPathInstrumentation *fpi,
 
 
 static inline void
-process_conversion_path (BablList *path,
+_babl_process_conversion_path (BablList *path,
                          const void *source_buffer,
                          int source_bpp,
                          void *destination_buffer,
@@ -756,118 +756,6 @@ babl_gc_fishes (void)
   // is responsibility of higher layers
 }
 
-static int babl_fish_lut_process_maybe (const Babl *babl,
-                                        const char *source,
-                                        const char *destination,
-                                        long n,
-                                        void *data)
-{
-  uint32_t *lut = (uint32_t*)babl->fish_path.u8_lut;
-  ((Babl*)babl)->fish.pixels += n;
-
-
-  if (!lut && babl->fish.pixels > 256 * 128)
-  {
-#if 0
-    fprintf (stderr, "building LUT for %s to %s\n",
-                     babl_get_name (babl->conversion.source),
-                     babl_get_name (babl->conversion.destination));
-#endif
-    lut = malloc (256 * 256 * 256 * 4);
-    if (babl->fish_path.source_bpp == 8)
-    {
-      uint64_t *lut_in = malloc (256 * 256 * 256 * 8);
-      for (int o = 0; o < 256 * 256 * 256; o++)
-      {
-        uint64_t v = o;
-        uint64_t v0 = v & 0xff;
-        uint64_t v1 = (v & 0xff00) >> 8;
-        uint64_t v2 = (v & 0xff0000) >> 16;
-
-#if 1
-        // gives same results... but purer white is better?
-        v0 = (v0 << 8) | (((v0&1)?0xff:0)<<0);
-        v1 = (v1 << 24) | (((v1&1)?(uint64_t)0xff:0)<<16);
-        v2 = (v2 << 40) | (((v2&1)?(uint64_t)0xff:0)<<32);
-#else
-        v0 = (v0 << 8);
-        v1 = (v1 << 24);
-        v2 = (v2 << 40);
-#endif
-        lut_in[o] = v;
-      }
-
-      process_conversion_path (babl->fish_path.conversion_list,
-                               lut_in,
-                               babl->fish_path.source_bpp,
-                               lut,
-                               babl->fish_path.dest_bpp,
-                               256*256*256);
-      free (lut_in);
-    }
-    else
-    {
-      for (int o = 0; o < 256 * 256 * 256; o++)
-        lut[o] = o;
-      process_conversion_path (babl->fish_path.conversion_list,
-                               lut,
-                               babl->fish_path.source_bpp,
-                               lut,
-                               babl->fish_path.dest_bpp,
-                               256*256*256);
-    }
-    // XXX : there is still a micro race, if lost we should only
-    // leak a LUT not produce wrong results.
-    if (babl->fish_path.u8_lut == NULL)
-    {
-      (((Babl*)babl)->fish_path.u8_lut) = (uint8_t*)lut;
-
-    }
-    else
-    {
-      free (lut);
-      lut = (uint32_t*)babl->fish_path.u8_lut;
-    }
-  }
-  if (lut)
-  {
-    if (babl->fish_path.source_bpp == 8) // 16 bit, not working yet
-    { // half and u16 need their
-      // own separate handling
-      uint32_t *src = (uint32_t*)source;
-      uint32_t *dst = (uint32_t*)destination;
-      lut = (uint32_t*)babl->fish_path.u8_lut;
-      while (n--)
-      {
-        uint32_t col_a = *src++;
-        uint32_t col_b = *src++;
-        uint32_t col;
-
-        uint32_t c_ar = ((col_a & 0xff000000)|
-                         ((col_a & 0x0000ff00) << 8));
-        uint32_t c_gb = ((col_b & 0xff000000)|
-                         ((col_b & 0x0000ff00) << 8))>>16;
-        col = c_ar|c_gb;
-
-        *dst++ = lut[col & 0xffffff] | (col & 0xff000000);
-      }
-    }
-    else
-    {
-      uint32_t *src = (uint32_t*)source;
-      uint32_t *dst = (uint32_t*)destination;
-      lut = (uint32_t*)babl->fish_path.u8_lut;
-      while (n--)
-      {
-        uint32_t col = *src++;
-        *dst++ = lut[col & 0xffffff] | (col & 0xff000000);
-      }
-    }
-    BABL(babl)->fish_path.last_lut_use = babl_ticks ();
-    return 1;
-  }
-  return 0;
-}
 
 static void
 babl_fish_path_process (const Babl *babl,
@@ -895,7 +783,7 @@ babl_fish_path_process (const Babl *babl,
           conv_counter = 0;
         }
     }
-  process_conversion_path (babl->fish_path.conversion_list,
+  _babl_process_conversion_path (babl->fish_path.conversion_list,
                            source,
                            babl->fish_path.source_bpp,
                            destination,
@@ -1037,7 +925,7 @@ static void inline *align_16 (unsigned char *ret)
 }
 
 static inline void
-process_conversion_path (BablList *path,
+_babl_process_conversion_path (BablList *path,
                          const void *source_buffer,
                          int source_bpp,
                          void *destination_buffer,
@@ -1109,6 +997,23 @@ process_conversion_path (BablList *path,
     }
 }
 
+void
+babl_process_conversion_path (BablList *path,
+                              const void *source_buffer,
+                              int source_bpp,
+                              void *destination_buffer,
+                              int dest_bpp,
+                              long n)
+{
+  _babl_process_conversion_path (path,
+                                 source_buffer,
+                                 source_bpp,
+                                 destination_buffer,
+                                 dest_bpp,
+                                 n);
+}
+
+
 static void
 init_path_instrumentation (FishPathInstrumentation *fpi,
                            Babl *fmt_source,
@@ -1244,7 +1149,7 @@ get_path_instrumentation (FishPathInstrumentation *fpi,
   /* calculate this path's view of what the result should be */
   ticks_start = babl_ticks ();
   for (int i = 0; i < BABL_TEST_ITER; i ++)
-    process_conversion_path (path, fpi->source, source_bpp, fpi->destination,
+    _babl_process_conversion_path (path, fpi->source, source_bpp, fpi->destination,
                              dest_bpp, fpi->num_test_pixels);
   ticks_end = babl_ticks ();
   *path_cost = (ticks_end - ticks_start);
diff --git a/babl/babl-internal.h b/babl/babl-internal.h
index ec6008b..4377ec3 100644
--- a/babl/babl-internal.h
+++ b/babl/babl-internal.h
@@ -373,6 +373,12 @@ extern const Babl *
 extern const Babl *
 (*babl_trc_lookup_by_name) (const char *name);
 
+extern int (*babl_fish_lut_process_maybe) (const Babl *babl,
+                                           const char *source,
+                                           const char *destination,
+                                           long n,
+                                           void *data);
+
 void babl_space_to_xyz (const Babl *space, const double *rgb, double *xyz);
 void babl_space_from_xyz (const Babl *space, const double *xyz, double *rgb);
 
@@ -473,5 +479,12 @@ _babl_space_for_lcms (const char *icc_data, int icc_length); // XXX pass profile
 
 void babl_trc_class_init (void);
 
+void
+babl_process_conversion_path (BablList *path,
+                              const void *source_buffer,
+                              int source_bpp,
+                              void *destination_buffer,
+                              int dest_bpp,
+                              long n);
 
 #endif
diff --git a/babl/babl.c b/babl/babl.c
index 515fa09..7bfe60f 100644
--- a/babl/babl.c
+++ b/babl/babl.c
@@ -200,6 +200,19 @@ void (*babl_base_init) (void) = babl_base_init_generic;
 const Babl *
 babl_trc_lookup_by_name_generic (const char *name);
 
+int babl_fish_lut_process_maybe_generic (const Babl *babl,
+                                         const char *source,
+                                         const char *destination,
+                                         long n,
+                                         void *data);
+
+int (*babl_fish_lut_process_maybe) (const Babl *babl,
+                                    const char *source,
+                                    const char *destination,
+                                    long n,
+                                    void *data) =
+  babl_fish_lut_process_maybe_generic;
+
 
 const Babl *
 babl_trc_new_generic (const char *name,
@@ -222,15 +235,25 @@ const Babl *
                  float *lut) = babl_trc_new_generic;
 
 #ifdef ARCH_X86_64
+
+int babl_fish_lut_process_maybe_x86_64_v2 (const Babl *babl,
+                                           const char *source,
+                                           const char *destination,
+                                           long n,
+                                           void *data);
+int babl_fish_lut_process_maybe_x86_64_v3 (const Babl *babl,
+                                           const char *source,
+                                           const char *destination,
+                                           long n,
+                                           void *data);
+
 void babl_base_init_x86_64_v2 (void);
 void babl_base_init_x86_64_v3 (void);
 void _babl_space_add_universal_rgb_x86_64_v2 (const Babl *space);
 void _babl_space_add_universal_rgb_x86_64_v3 (const Babl *space);
 
-const Babl *
-babl_trc_lookup_by_name_x86_64_v2 (const char *name);
-const Babl *
-babl_trc_lookup_by_name_x86_64_v3 (const char *name);
+const Babl * babl_trc_lookup_by_name_x86_64_v2 (const char *name);
+const Babl * babl_trc_lookup_by_name_x86_64_v3 (const char *name);
 
 const Babl *
 babl_trc_new_x86_64_v2 (const char *name,
@@ -247,6 +270,13 @@ babl_trc_new_x86_64_v3 (const char *name,
 
 #endif
 #ifdef ARCH_ARM
+
+int babl_fish_lut_process_maybe_arm_neon (const Babl *babl,
+                                          const char *source,
+                                          const char *destination,
+                                          long n,
+                                          void *data);
+
 void babl_base_init_arm_neon (void);
 void _babl_space_add_universal_rgb_arm_neon (const Babl *space);
 
@@ -268,6 +298,7 @@ static void simd_init (void)
   BablCpuAccelFlags accel = babl_cpu_accel_get_support ();
   if ((accel & BABL_CPU_ACCEL_X86_64_V3) == BABL_CPU_ACCEL_X86_64_V3)
   {
+    babl_fish_lut_process_maybe = babl_fish_lut_process_maybe_x86_64_v3;
     babl_base_init = babl_base_init_x86_64_v2; /// !!
                                                // this is correct,
                                                // it performs better
@@ -278,6 +309,7 @@ static void simd_init (void)
   }
   else if ((accel & BABL_CPU_ACCEL_X86_64_V2) == BABL_CPU_ACCEL_X86_64_V2)
   {
+    babl_fish_lut_process_maybe = babl_fish_lut_process_maybe_x86_64_v2;
     babl_base_init = babl_base_init_x86_64_v2;
     babl_trc_new = babl_trc_new_x86_64_v2;
     babl_trc_lookup_by_name = babl_trc_lookup_by_name_x86_64_v2;
@@ -288,6 +320,7 @@ static void simd_init (void)
   BablCpuAccelFlags accel = babl_cpu_accel_get_support ();
   if ((accel & BABL_CPU_ACCEL_ARM_NEON) == BABL_CPU_ACCEL_ARM_NEON)
   {
+    babl_fish_lut_process_maybe = babl_fish_lut_process_maybe_arm_neon;
     babl_base_init = babl_base_init_arm_neon;
     babl_trc_new = babl_trc_new_arm_neon;
     babl_trc_lookup_by_name = babl_trc_lookup_by_name_arm_neon;
diff --git a/babl/base/babl-rgb-converter.c b/babl/base/babl-rgb-converter.c
index 3f4da04..5c3d2ca 100644
--- a/babl/base/babl-rgb-converter.c
+++ b/babl/base/babl-rgb-converter.c
@@ -533,3 +533,156 @@ BABL_SIMD_SUFFIX(_babl_space_add_universal_rgb) (const Babl *space)
 {
   babl_space_class_for_each (add_rgb_adapter, (void*)space);
 }
+
+void
+babl_process_conversion_path (BablList *path,
+                              const void *source_buffer,
+                              int source_bpp,
+                              void *destination_buffer,
+                              int dest_bpp,
+                              long n);
+
+/* babl_fish_lut_process_maybe:
+ *
+ * Tries to convert n pixels from source to destination with a lookup table
+ * of 256 * 256 * 256 32-bit entries cached in babl->fish_path.u8_lut.
+ *
+ * Every call adds n to babl->fish.pixels.  Once that counter exceeds
+ * 256 * 128 and no table is cached, the table is built by pushing every
+ * 8-bit R,G,B index through babl_process_conversion_path ().
+ *
+ * Returns 1 when the pixels were converted with the table.  Returns 0
+ * without touching destination when no table is available (too few pixels
+ * seen so far, or an allocation failed).
+ * NOTE(review): callers presumably fall back to the regular conversion
+ * path on 0 - confirm against the call site.
+ *
+ * destination is written to although it is declared const; the signature
+ * has to match the babl_fish_lut_process_maybe function pointer.  data is
+ * not used.
+ *
+ * The bit packing assumes a little-endian host and a destination whose
+ * alpha is the top byte of each 32-bit pixel - TODO confirm.
+ */
+int BABL_SIMD_SUFFIX(babl_fish_lut_process_maybe) (const Babl *babl,
+                                                   const char *source,
+                                                   const char *destination,
+                                                   long n,
+                                                   void *data)
+{
+  enum { LUT_ENTRIES = 256 * 256 * 256 };
+  uint32_t *lut = (uint32_t*)babl->fish_path.u8_lut;
+
+  ((Babl*)babl)->fish.pixels += n;
+
+  if (!lut && babl->fish.pixels > 256 * 128)
+  {
+    lut = malloc (LUT_ENTRIES * sizeof (uint32_t));
+    if (!lut)
+      return 0;
+
+    if (babl->fish_path.source_bpp == 8)
+    {
+      uint64_t *lut_in = malloc (LUT_ENTRIES * sizeof (uint64_t));
+
+      if (!lut_in)
+      {
+        free (lut);
+        return 0;
+      }
+
+      for (int o = 0; o < LUT_ENTRIES; o++)
+      {
+        uint64_t v0 = o & 0xff;
+        uint64_t v1 = (o & 0xff00) >> 8;
+        uint64_t v2 = (o & 0xff0000) >> 16;
+
+        /* widen each component to 16 bits: the 8-bit value becomes the
+         * high byte, the low byte is 0xff for odd values and 0 otherwise
+         * (original note: "purer white is better?")
+         */
+        v0 = (v0 << 8) | (((v0&1)?0xff:0)<<0);
+        v1 = (v1 << 24) | (((v1&1)?(uint64_t)0xff:0)<<16);
+        v2 = (v2 << 40) | (((v2&1)?(uint64_t)0xff:0)<<32);
+
+        /* previously the bare index was stored and v0..v2 were dropped */
+        lut_in[o] = v0 | v1 | v2;
+      }
+
+      babl_process_conversion_path (babl->fish_path.conversion_list,
+                                    lut_in,
+                                    babl->fish_path.source_bpp,
+                                    lut,
+                                    babl->fish_path.dest_bpp,
+                                    LUT_ENTRIES);
+      free (lut_in);
+    }
+    else
+    {
+      /* fill with the index itself, then convert the table in place */
+      for (int o = 0; o < LUT_ENTRIES; o++)
+        lut[o] = o;
+      babl_process_conversion_path (babl->fish_path.conversion_list,
+                                    lut,
+                                    babl->fish_path.source_bpp,
+                                    lut,
+                                    babl->fish_path.dest_bpp,
+                                    LUT_ENTRIES);
+    }
+
+    // XXX : there is still a micro race, if lost we should only
+    //       leak a LUT not produce wrong results.
+    if (babl->fish_path.u8_lut == NULL)
+    {
+      (((Babl*)babl)->fish_path.u8_lut) = (uint8_t*)lut;
+    }
+    else
+    {
+      free (lut);
+      lut = (uint32_t*)babl->fish_path.u8_lut;
+    }
+  }
+
+  if (!lut)
+    return 0;
+
+  if (babl->fish_path.source_bpp == 8)
+  {
+    /* assumes four 16-bit components per pixel in R,G,B,A order; half
+     * and u16 need their own separate handling - TODO confirm callers
+     * only route u16 here
+     */
+    uint32_t *src = (uint32_t*)source;
+    uint32_t *dst = (uint32_t*)destination;
+
+    while (n--)
+    {
+      uint32_t col_a = *src++; /* first two components */
+      uint32_t col_b = *src++; /* last two components */
+
+      /* high bytes of components 0..2, in the byte order used as index
+       * when the table was built */
+      uint32_t idx = ((col_a >> 8) & 0x0000ff) |
+                     ((col_a >> 16) & 0x00ff00) |
+                     ((col_b << 8) & 0xff0000);
+
+      /* high byte of the fourth component passes through as alpha */
+      *dst++ = lut[idx] | (col_b & 0xff000000);
+    }
+  }
+  else
+  {
+    uint32_t *src = (uint32_t*)source;
+    uint32_t *dst = (uint32_t*)destination;
+
+    while (n--)
+    {
+      uint32_t col = *src++;
+      *dst++ = lut[col & 0xffffff] | (col & 0xff000000);
+    }
+  }
+
+  BABL(babl)->fish_path.last_lut_use = babl_ticks ();
+  return 1;
+}
+
-- 
2.30.2