From: Øyvind Kolås Date: Mon, 24 Jan 2022 06:39:06 +0000 (+0100) Subject: Revert "make trampoline for lut processing" X-Git-Tag: archive/raspbian/1%0.1.106-3+rpi1^2~15^2~4^2~34 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=22bd31c3666bfc9a07033a62ff7e94fae9b7af6d;p=babl.git Revert "make trampoline for lut processing" This reverts commit b3e884edf3b5c58fb4c2cede1346bd8a9d9c4a1e. benchmarking on x86_64 saw no effect, and as a surprise on arm it pessimises performance. --- diff --git a/babl/babl-fish-path.c b/babl/babl-fish-path.c index 7278ec2..f709c3f 100644 --- a/babl/babl-fish-path.c +++ b/babl/babl-fish-path.c @@ -76,7 +76,7 @@ get_path_instrumentation (FishPathInstrumentation *fpi, static inline void -_babl_process_conversion_path (BablList *path, +process_conversion_path (BablList *path, const void *source_buffer, int source_bpp, void *destination_buffer, @@ -756,6 +756,118 @@ babl_gc_fishes (void) // is responsibility of higher layers } +static int babl_fish_lut_process_maybe (const Babl *babl, + const char *source, + const char *destination, + long n, + void *data) +{ + uint32_t *lut = (uint32_t*)babl->fish_path.u8_lut; + ((Babl*)babl)->fish.pixels += n; + + + if (!lut && babl->fish.pixels > 256 * 128) + { +#if 0 + fprintf (stderr, "building LUT for %s to %s\n", + babl_get_name (babl->conversion.source), + babl_get_name (babl->conversion.destination)); +#endif + lut = malloc (256 * 256 * 256 * 4); + if (babl->fish_path.source_bpp == 8) + { + uint64_t *lut_in = malloc (256 * 256 * 256 * 8); + for (int o = 0; o < 256 * 256 * 256; o++) + { + uint64_t v = o; + uint64_t v0 = v & 0xff; + uint64_t v1 = (v & 0xff00) >> 8; + uint64_t v2 = (v & 0xff0000) >> 16; + +#if 1 + // gives same results... but purer white is better? + v0 = (v0 << 8) | (((v0&1)?0xff:0)<<0); + v1 = (v1 << 24) | (((v1&1)?(uint64_t)0xff:0)<<16); + v2 = (v2 << 40) | (((v2&1)?(uint64_t)0xff:0)<<32); +#else + v0 = (v0 << 8); + v1 = (v1 << 24); + v2 = (v2 << 40); +#endif + lut_in[o] = v; + } + + process_conversion_path (babl->fish_path.conversion_list, + lut_in, + babl->fish_path.source_bpp, + lut, + babl->fish_path.dest_bpp, + 256*256*256); + free (lut_in); + } + else + { + for (int o = 0; o < 256 * 256 * 256; o++) + lut[o] = o; + process_conversion_path (babl->fish_path.conversion_list, + lut, + babl->fish_path.source_bpp, + lut, + babl->fish_path.dest_bpp, + 256*256*256); + } + // XXX : there is still a micro race, if lost we should only + // leak a LUT not produce wrong results. + if (babl->fish_path.u8_lut == NULL) + { + (((Babl*)babl)->fish_path.u8_lut) = (uint8_t*)lut; + + } + else + { + free (lut); + lut = (uint32_t*)babl->fish_path.u8_lut; + } + } + if (lut) + { + if (babl->fish_path.source_bpp == 8) // 16 bit, not working yet + { // half and u16 need their + // own separate handling + uint32_t *src = (uint32_t*)source; + uint32_t *dst = (uint32_t*)destination; + lut = (uint32_t*)babl->fish_path.u8_lut; + while (n--) + { + uint32_t col_a = *src++; + uint32_t col_b = *src++; + uint32_t col; + + uint32_t c_ar = ((col_a & 0xff000000)| + ((col_a & 0x0000ff00) << 8)); + uint32_t c_gb = ((col_b & 0xff000000)| + ((col_b & 0x0000ff00) << 8))>>16; + col = c_ar|c_gb; + + *dst++ = lut[col & 0xffffff] | (col & 0xff000000); + } + } + else + { + uint32_t *src = (uint32_t*)source; + uint32_t *dst = (uint32_t*)destination; + lut = (uint32_t*)babl->fish_path.u8_lut; + while (n--) + { + uint32_t col = *src++; + *dst++ = lut[col & 0xffffff] | (col & 0xff000000); + } + } + BABL(babl)->fish_path.last_lut_use = babl_ticks (); + return 1; + } + return 0; +} static void babl_fish_path_process (const Babl *babl, @@ -783,7 +895,7 @@ babl_fish_path_process (const Babl *babl, conv_counter = 0; } } - _babl_process_conversion_path (babl->fish_path.conversion_list, + process_conversion_path (babl->fish_path.conversion_list, source, babl->fish_path.source_bpp, destination, @@ -925,7 +1037,7 @@ static void inline *align_16 (unsigned char *ret) } static inline void -_babl_process_conversion_path (BablList *path, +process_conversion_path (BablList *path, const void *source_buffer, int source_bpp, void *destination_buffer, @@ -997,23 +1109,6 @@ _babl_process_conversion_path (BablList *path, } } -void -babl_process_conversion_path (BablList *path, - const void *source_buffer, - int source_bpp, - void *destination_buffer, - int dest_bpp, - long n) -{ - _babl_process_conversion_path (path, - source_buffer, - source_bpp, - destination_buffer, - dest_bpp, - n); -} - - static void init_path_instrumentation (FishPathInstrumentation *fpi, Babl *fmt_source, @@ -1149,7 +1244,7 @@ get_path_instrumentation (FishPathInstrumentation *fpi, /* calculate this path's view of what the result should be */ ticks_start = babl_ticks (); for (int i = 0; i < BABL_TEST_ITER; i ++) - _babl_process_conversion_path (path, fpi->source, source_bpp, fpi->destination, + process_conversion_path (path, fpi->source, source_bpp, fpi->destination, dest_bpp, fpi->num_test_pixels); ticks_end = babl_ticks (); *path_cost = (ticks_end - ticks_start); diff --git a/babl/babl-internal.h b/babl/babl-internal.h index 4377ec3..ec6008b 100644 --- a/babl/babl-internal.h +++ b/babl/babl-internal.h @@ -373,12 +373,6 @@ extern const Babl * extern const Babl * (*babl_trc_lookup_by_name) (const char *name); -extern int (*babl_fish_lut_process_maybe) (const Babl *babl, - const char *source, - const char *destination, - long n, - void *data); - void babl_space_to_xyz (const Babl *space, const double *rgb, double *xyz); void babl_space_from_xyz (const Babl *space, const double *xyz, double *rgb); @@ -479,12 +473,5 @@ _babl_space_for_lcms (const char *icc_data, int icc_length); // XXX pass profile void babl_trc_class_init (void); -void -babl_process_conversion_path (BablList *path, - const void *source_buffer, - int source_bpp, - void *destination_buffer, - int dest_bpp, - long n); #endif diff --git a/babl/babl.c b/babl/babl.c index 7bfe60f..515fa09 100644 --- a/babl/babl.c +++ b/babl/babl.c @@ -200,19 +200,6 @@ void (*babl_base_init) (void) = babl_base_init_generic; const Babl * babl_trc_lookup_by_name_generic (const char *name); -int babl_fish_lut_process_maybe_generic (const Babl *babl, - const char *source, - const char *destination, - long n, - void *data); - -int (*babl_fish_lut_process_maybe) (const Babl *babl, - const char *source, - const char *destination, - long n, - void *data) = - babl_fish_lut_process_maybe_generic; - const Babl * babl_trc_new_generic (const char *name, @@ -235,25 +222,15 @@ const Babl * float *lut) = babl_trc_new_generic; #ifdef ARCH_X86_64 - -int babl_fish_lut_process_maybe_x86_64_v2 (const Babl *babl, - const char *source, - const char *destination, - long n, - void *data); -int babl_fish_lut_process_maybe_x86_64_v3 (const Babl *babl, - const char *source, - const char *destination, - long n, - void *data); - void babl_base_init_x86_64_v2 (void); void babl_base_init_x86_64_v3 (void); void _babl_space_add_universal_rgb_x86_64_v2 (const Babl *space); void _babl_space_add_universal_rgb_x86_64_v3 (const Babl *space); -const Babl * babl_trc_lookup_by_name_x86_64_v2 (const char *name); -const Babl * babl_trc_lookup_by_name_x86_64_v3 (const char *name); +const Babl * +babl_trc_lookup_by_name_x86_64_v2 (const char *name); +const Babl * +babl_trc_lookup_by_name_x86_64_v3 (const char *name); const Babl * babl_trc_new_x86_64_v2 (const char *name, @@ -270,13 +247,6 @@ babl_trc_new_x86_64_v3 (const char *name, #endif #ifdef ARCH_ARM - -int babl_fish_lut_process_maybe_arm_neon (const Babl *babl, - const char *source, - const char *destination, - long n, - void *data); - void babl_base_init_arm_neon (void); void _babl_space_add_universal_rgb_arm_neon (const Babl *space); @@ -298,7 +268,6 @@ static void simd_init (void) BablCpuAccelFlags accel = babl_cpu_accel_get_support (); if ((accel & BABL_CPU_ACCEL_X86_64_V3) == BABL_CPU_ACCEL_X86_64_V3) { - babl_fish_lut_process_maybe = babl_fish_lut_process_maybe_x86_64_v3; babl_base_init = babl_base_init_x86_64_v2; /// !! // this is correct, // it performs better @@ -309,7 +278,6 @@ static void simd_init (void) } else if ((accel & BABL_CPU_ACCEL_X86_64_V2) == BABL_CPU_ACCEL_X86_64_V2) { - babl_fish_lut_process_maybe = babl_fish_lut_process_maybe_x86_64_v2; babl_base_init = babl_base_init_x86_64_v2; babl_trc_new = babl_trc_new_x86_64_v2; babl_trc_lookup_by_name = babl_trc_lookup_by_name_x86_64_v2; @@ -320,7 +288,6 @@ static void simd_init (void) BablCpuAccelFlags accel = babl_cpu_accel_get_support (); if ((accel & BABL_CPU_ACCEL_ARM_NEON) == BABL_CPU_ACCEL_ARM_NEON) { - babl_fish_lut_process_maybe = babl_fish_lut_process_maybe_arm_neon; babl_base_init = babl_base_init_arm_neon; babl_trc_new = babl_trc_new_arm_neon; babl_trc_lookup_by_name = babl_trc_lookup_by_name_arm_neon; diff --git a/babl/base/babl-rgb-converter.c b/babl/base/babl-rgb-converter.c index 5c3d2ca..3f4da04 100644 --- a/babl/base/babl-rgb-converter.c +++ b/babl/base/babl-rgb-converter.c @@ -533,125 +533,3 @@ BABL_SIMD_SUFFIX(_babl_space_add_universal_rgb) (const Babl *space) { babl_space_class_for_each (add_rgb_adapter, (void*)space); } - -void -babl_process_conversion_path (BablList *path, - const void *source_buffer, - int source_bpp, - void *destination_buffer, - int dest_bpp, - long n); - -int BABL_SIMD_SUFFIX(babl_fish_lut_process_maybe) (const Babl *babl, - const char *source, - const char *destination, - long n, - void *data) -{ - uint32_t *lut = (uint32_t*)babl->fish_path.u8_lut; - ((Babl*)babl)->fish.pixels += n; - - - if (!lut && babl->fish.pixels > 256 * 128) - { -#if 0 - fprintf (stderr, "building LUT for %s to %s\n", - babl_get_name (babl->conversion.source), - babl_get_name (babl->conversion.destination)); -#endif - lut = malloc (256 * 256 * 256 * 4); - if (babl->fish_path.source_bpp == 8) - { - uint64_t *lut_in = malloc (256 * 256 * 256 * 8); - for (int o = 0; o < 256 * 256 * 256; o++) - { - uint64_t v = o; - uint64_t v0 = v & 0xff; - uint64_t v1 = (v & 0xff00) >> 8; - uint64_t v2 = (v & 0xff0000) >> 16; - -#if 1 - // gives same results... but purer white is better? - v0 = (v0 << 8) | (((v0&1)?0xff:0)<<0); - v1 = (v1 << 24) | (((v1&1)?(uint64_t)0xff:0)<<16); - v2 = (v2 << 40) | (((v2&1)?(uint64_t)0xff:0)<<32); -#else - v0 = (v0 << 8); - v1 = (v1 << 24); - v2 = (v2 << 40); -#endif - lut_in[o] = v; - } - - babl_process_conversion_path (babl->fish_path.conversion_list, - lut_in, - babl->fish_path.source_bpp, - lut, - babl->fish_path.dest_bpp, - 256*256*256); - free (lut_in); - } - else - { - for (int o = 0; o < 256 * 256 * 256; o++) - lut[o] = o; - babl_process_conversion_path (babl->fish_path.conversion_list, - lut, - babl->fish_path.source_bpp, - lut, - babl->fish_path.dest_bpp, - 256*256*256); - } - // XXX : there is still a micro race, if lost we should only - // leak a LUT not produce wrong results. - if (babl->fish_path.u8_lut == NULL) - { - (((Babl*)babl)->fish_path.u8_lut) = (uint8_t*)lut; - - } - else - { - free (lut); - lut = (uint32_t*)babl->fish_path.u8_lut; - } - } - if (lut) - { - if (babl->fish_path.source_bpp == 8) // 16 bit, not working yet - { // half and u16 need their - // own separate handling - uint32_t *src = (uint32_t*)source; - uint32_t *dst = (uint32_t*)destination; - lut = (uint32_t*)babl->fish_path.u8_lut; - while (n--) - { - uint32_t col_a = *src++; - uint32_t col_b = *src++; - uint32_t col; - - uint32_t c_ar = ((col_a & 0xff000000)| - ((col_a & 0x0000ff00) << 8)); - uint32_t c_gb = ((col_b & 0xff000000)| - ((col_b & 0x0000ff00) << 8))>>16; - col = c_ar|c_gb; - - *dst++ = lut[col & 0xffffff] | (col & 0xff000000); - } - } - else - { - uint32_t *src = (uint32_t*)source; - uint32_t *dst = (uint32_t*)destination; - lut = (uint32_t*)babl->fish_path.u8_lut; - while (n--) - { - uint32_t col = *src++; - *dst++ = lut[col & 0xffffff] | (col & 0xff000000); - } - } - BABL(babl)->fish_path.last_lut_use = babl_ticks (); - return 1; - } - return 0; -} -