From 57bd10c3f114a2f785ca6325204f035665d2892e Mon Sep 17 00:00:00 2001 From: Ell Date: Sun, 12 Apr 2020 17:42:16 +0300 Subject: [PATCH] avx2-int8: add gamma u8 -> linear float conversions Add AVX2 conversions from u8 Y', Y'A, R'G'B', and R'G'B'A to float Y, YA, RGB, and RGBA, respectively. The conversions use an LUT together with the AVX2 gather instructions to process 8 values at once. Depending on the formats and cache utilization, the new conversions are between 1.25x and 2.2x faster than the existing conversions. --- extensions/avx2-int8-tables.h | 517 ++++++++++++++++++++++++++++++++++ extensions/avx2-int8.c | 184 ++++++++++++ 2 files changed, 701 insertions(+) diff --git a/extensions/avx2-int8-tables.h b/extensions/avx2-int8-tables.h index 8e3c6fb..cfc5208 100644 --- a/extensions/avx2-int8-tables.h +++ b/extensions/avx2-int8-tables.h @@ -4098,3 +4098,520 @@ static const int32_t linear_to_gamma[65536] = 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }; + +static const float gamma_to_linear[512] = +{ +0x0p+0, +0x1.3e45677c176f7p-12, +0x1.3e45677c176f7p-11, +0x1.dd681b3a23272p-11, +0x1.3e45677c176f7p-10, +0x1.8dd6c15b1d4b4p-10, +0x1.dd681b3a23272p-10, +0x1.167cba8c94818p-9, +0x1.3e45677c176f7p-9, +0x1.660e146b9a5d5p-9, +0x1.8dd6c15b1d4b4p-9, +0x1.b6a31b5259c99p-9, +0x1.e1e31d70c99ddp-9, +0x1.07c38bf8583a9p-8, +0x1.1fcc2beed6421p-8, +0x1.390ffaf95e279p-8, +0x1.53936cc7bc928p-8, +0x1.6f5addb50c915p-8, +0x1.8c6a94031b561p-8, +0x1.aac6c0fb97351p-8, +0x1.ca7381f9f602bp-8, +0x1.eb74e160978dp-8, +0x1.06e76bbda92b8p-7, +0x1.18c2a5a8a8044p-7, +0x1.2b4e09b3f0ae3p-7, +0x1.3e8b7b3bde965p-7, +0x1.527cd60af8b85p-7, +0x1.6723eea8d3709p-7, +0x1.7c8292a3db6b3p-7, +0x1.929a88d67b521p-7, +0x1.a96d91a8016bdp-7, +0x1.c0fd67499fab6p-7, +0x1.d94bbdefd740ep-7, +0x1.f25a44089883fp-7, +0x1.061551372c694p-6, +0x1.135f3e4c2cce2p-6, +0x1.210bb8642b172p-6, +0x1.2f1b8c1ae46bdp-6, 
+0x1.3d8f839b79c0bp-6, +0x1.4c6866b3e9fa4p-6, +0x1.5ba6fae794313p-6, +0x1.6b4c0380d2deep-6, +0x1.7b5841a1bf3acp-6, +0x1.8bcc74542addbp-6, +0x1.9ca95898dc8b5p-6, +0x1.adefa9761c02p-6, +0x1.bfa0200597bd9p-6, +0x1.d1bb7381aec1fp-6, +0x1.e442595227bcap-6, +0x1.f73585185e1b5p-6, +0x1.054ad45d76878p-5, +0x1.0f31ba386ff26p-5, +0x1.194fcb663747bp-5, +0x1.23a55e62a662ap-5, +0x1.2e32c8e148d11p-5, +0x1.38f85fd21eacfp-5, +0x1.43f67766310ffp-5, +0x1.4f2d6313fa8dp-5, +0x1.5a9d759ba5edp-5, +0x1.6647010b254eep-5, +0x1.722a56c2239eep-5, +0x1.7e47c775d2427p-5, +0x1.8a9fa33494b07p-5, +0x1.973239698b9ccp-5, +0x1.a3ffd8e001389p-5, +0x1.b108cfc6b7fbcp-5, +0x1.be4d6bb31d522p-5, +0x1.cbcdf9a4616f2p-5, +0x1.d98ac60675833p-5, +0x1.e7841cb4f16dfp-5, +0x1.f5ba48fde2048p-5, +0x1.0216cad240765p-4, +0x1.096f2671eb815p-4, +0x1.10e65c38a5192p-4, +0x1.187c90bf8bce2p-4, +0x1.2031e85f5d6dap-4, +0x1.28068731a1952p-4, +0x1.2ffa9111cb94bp-4, +0x1.380e299e53f92p-4, +0x1.40417439ca10fp-4, +0x1.4894940bddbfbp-4, +0x1.5107ac0261e59p-4, +0x1.599aded247aacp-4, +0x1.624e4ef892ed4p-4, +0x1.6b221ebb4817ep-4, +0x1.7416702a539d1p-4, +0x1.7d2b65206b527p-4, +0x1.86611f43e9e6ap-4, +0x1.8fb7c007a4a7p-4, +0x1.992f68abbbc89p-4, +0x1.a2c83a3e6566dp-4, +0x1.ac82559cb3644p-4, +0x1.b65ddb7354604p-4, +0x1.c05aec3f4fe5ep-4, +0x1.ca79a84ebe03p-4, +0x1.d4ba2fc17a6a5p-4, +0x1.df1ca289d34b8p-4, +0x1.e9a1206d34003p-4, +0x1.f447c904cbb4ep-4, +0x1.ff10bbbe302c2p-4, +0x1.04fe0bedfe5f1p-3, +0x1.0a84fe3b36d8fp-3, +0x1.101d443dfc06fp-3, +0x1.15c6ed58eefdfp-3, +0x1.1b8208da5fefp-3, +0x1.214ea5fc9514ap-3, +0x1.272cd3e610123p-3, +0x1.2d1ca1a9d1cfbp-3, +0x1.331e1e479cdf5p-3, +0x1.393158ac3674ep-3, +0x1.3f565fb1a5fd5p-3, +0x1.458d421f735dfp-3, +0x1.4bd60eaae3e73p-3, +0x1.5230d3f736034p-3, +0x1.589da095dbaa1p-3, +0x1.5f1c8306b3a3cp-3, +0x1.65ad89b841a2bp-3, +0x1.6c50c307e53bfp-3, +0x1.73063d420fc8p-3, +0x1.79ce06a279303p-3, +0x1.80a82d5453b5dp-3, +0x1.8794bf727eb3fp-3, +0x1.8e93cb07b8679p-3, +0x1.95a55e0ecec0bp-3, +0x1.9cc98672cf47ep-3, 
+0x1.a400520f3619cp-3, +0x1.ab49ceb01c003p-3, +0x1.b2a60a1263b0ap-3, +0x1.ba1511e3e632dp-3, +0x1.c196f3c39e76fp-3, +0x1.c92bbd41d41fep-3, +0x1.d0d37be045851p-3, +0x1.d88e3d1250f68p-3, +0x1.e05c0e3d1d3ep-3, +0x1.e83cfcb7c16fp-3, +0x1.f03115cb6bfd3p-3, +0x1.f83866b38924dp-3, +0x1.00297e4ef4553p-2, +0x1.044072557177ap-2, +0x1.086115f6beb3ap-2, +0x1.0c8b6fb5c735ep-2, +0x1.10bf860ef039ap-2, +0x1.14fd5f782a5a6p-2, +0x1.1945026102997p-2, +0x1.1d967532b31b1p-2, +0x1.21f1be50339e7p-2, +0x1.2656e41649ae3p-2, +0x1.2ac5ecdb988f8p-2, +0x1.2f3edef0b0ed8p-2, +0x1.33c1c0a020438p-2, +0x1.384e982e800b1p-2, +0x1.3ce56bda84a81p-2, +0x1.418641dd0c1bcp-2, +0x1.463120692c7afp-2, +0x1.4ae60dac4229dp-2, +0x1.4fa50fcdfde15p-2, +0x1.546e2cf0727a9p-2, +0x1.59416b3022858p-2, +0x1.5e1ed0a40daabp-2, +0x1.6306635dbdd7bp-2, +0x1.67f82969543a2p-2, +0x1.6cf428cd96079p-2, +0x1.71fa678bf915dp-2, +0x1.770aeba0b042ap-2, +0x1.7c25bb02b7ac5p-2, +0x1.814adba3e0bd9p-2, +0x1.867a5370de0b1p-2, +0x1.8bb428514f067p-2, +0x1.90f86027cb84ep-2, +0x1.964700d1ef1b1p-2, +0x1.9ba0102864521p-2, +0x1.a10393feefafdp-2, +0x1.a67192247a9bep-2, +0x1.abea10631e195p-2, +0x1.b16d14802d5cap-2, +0x1.b6faa43c403bbp-2, +0x1.bc92c5533d785p-2, +0x1.c2357d7c64e5dp-2, +0x1.c7e2d26a596dep-2, +0x1.cd9ac9cb2aef2p-2, +0x1.d35d69485ffc5p-2, +0x1.d92ab686ff782p-2, +0x1.df02b7279a10dp-2, +0x1.e4e570c6539c5p-2, +0x1.ead2e8faec526p-2, +0x1.f0cb2558c9ea4p-2, +0x1.f6ce2b6f00983p-2, +0x1.fcdc00c85bec2p-2, +0x1.017a5575b3cb2p-1, +0x1.048c17ad3c04bp-1, +0x1.07a349c9d9837p-1, +0x1.0abfee888c05p-1, +0x1.0de208a4444c8p-1, +0x1.11099ad5e83ebp-1, +0x1.1436a7d456eefp-1, +0x1.176932546ca12p-1, +0x1.1aa13d0906bdap-1, +0x1.1ddecaa307b85p-1, +0x1.2121ddd15aecep-1, +0x1.246a7940f86d1p-1, +0x1.27b89f9ce8c4bp-1, +0x1.2b0c538e48b07p-1, +0x1.2e6597bc4ccap-1, +0x1.31c46ecc4528dp-1, +0x1.3528db61a0f73p-1, +0x1.3892e01df1fccp-1, +0x1.3c027fa0f01ebp-1, +0x1.3f77bc887cd3bp-1, +0x1.42f29970a68f8p-1, +0x1.467318f3ac22dp-1, +0x1.49f93daa00113p-1, +0x1.4d850a2a4bde1p-1, 
+0x1.51168109734e5p-1, +0x1.54ada4da97a1bp-1, +0x1.584a782f1ac23p-1, +0x1.5becfd96a2698p-1, +0x1.5f95379f1b3edp-1, +0x1.634328d4bbe97p-1, +0x1.66f6d3c2081cfp-1, +0x1.6ab03aefd39aap-1, +0x1.6e6f60e5452b1p-1, +0x1.72344827d98f6p-1, +0x1.75fef33b6669bp-1, +0x1.79cf64a21d1e2p-1, +0x1.7da59edc8dabp-1, +0x1.8181a469a9787p-1, +0x1.856377c6c6224p-1, +0x1.894b1b6fa0377p-1, +0x1.8d3891de5df49p-1, +0x1.912bdd8b91f45p-1, +0x1.952500ee3dda5p-1, +0x1.9923fe7bd4f67p-1, +0x1.9d28d8a83edfcp-1, +0x1.a13391e5da09fp-1, +0x1.a5442ca57e52ep-1, +0x1.a95aab567f88fp-1, +0x1.ad771066afec2p-1, +0x1.b1995e4262a69p-1, +0x1.b5c197546e3f8p-1, +0x1.b9efbe062f086p-1, +0x1.be23d4bf8981bp-1, +0x1.c25ddde6ecbbbp-1, +0x1.c69ddbe154af1p-1, +0x1.cae3d1124c90bp-1, +0x1.cf2fbfdbf11f1p-1, +0x1.d381aa9ef2e82p-1, +0x1.d7d993ba988d4p-1, +0x1.dc377d8cc0fd5p-1, +0x1.e09b6a71e5aa6p-1, +0x1.e5055cc51cbb4p-1, +0x1.e97556e01b351p-1, +0x1.edeb5b1b37216p-1, +0x1.f2676bcd69adep-1, +0x1.f6e98b4c51466p-1, +0x1.fb71bbec33ab2p-1, +0x1p+0, + +0x0p+0, +0x1.010101010101p-8, +0x1.010101010101p-7, +0x1.8181818181818p-7, +0x1.010101010101p-6, +0x1.4141414141414p-6, +0x1.8181818181818p-6, +0x1.c1c1c1c1c1c1cp-6, +0x1.010101010101p-5, +0x1.2121212121212p-5, +0x1.4141414141414p-5, +0x1.6161616161616p-5, +0x1.8181818181818p-5, +0x1.a1a1a1a1a1a1ap-5, +0x1.c1c1c1c1c1c1cp-5, +0x1.e1e1e1e1e1e1ep-5, +0x1.010101010101p-4, +0x1.1111111111111p-4, +0x1.2121212121212p-4, +0x1.3131313131313p-4, +0x1.4141414141414p-4, +0x1.5151515151515p-4, +0x1.6161616161616p-4, +0x1.7171717171717p-4, +0x1.8181818181818p-4, +0x1.9191919191919p-4, +0x1.a1a1a1a1a1a1ap-4, +0x1.b1b1b1b1b1b1bp-4, +0x1.c1c1c1c1c1c1cp-4, +0x1.d1d1d1d1d1d1dp-4, +0x1.e1e1e1e1e1e1ep-4, +0x1.f1f1f1f1f1f1fp-4, +0x1.010101010101p-3, +0x1.0909090909091p-3, +0x1.1111111111111p-3, +0x1.1919191919192p-3, +0x1.2121212121212p-3, +0x1.2929292929293p-3, +0x1.3131313131313p-3, +0x1.3939393939394p-3, +0x1.4141414141414p-3, +0x1.4949494949495p-3, +0x1.5151515151515p-3, +0x1.5959595959596p-3, 
+0x1.6161616161616p-3, +0x1.6969696969697p-3, +0x1.7171717171717p-3, +0x1.7979797979798p-3, +0x1.8181818181818p-3, +0x1.8989898989899p-3, +0x1.9191919191919p-3, +0x1.999999999999ap-3, +0x1.a1a1a1a1a1a1ap-3, +0x1.a9a9a9a9a9a9bp-3, +0x1.b1b1b1b1b1b1bp-3, +0x1.b9b9b9b9b9b9cp-3, +0x1.c1c1c1c1c1c1cp-3, +0x1.c9c9c9c9c9c9dp-3, +0x1.d1d1d1d1d1d1dp-3, +0x1.d9d9d9d9d9d9ep-3, +0x1.e1e1e1e1e1e1ep-3, +0x1.e9e9e9e9e9e9fp-3, +0x1.f1f1f1f1f1f1fp-3, +0x1.f9f9f9f9f9fap-3, +0x1.010101010101p-2, +0x1.050505050505p-2, +0x1.0909090909091p-2, +0x1.0d0d0d0d0d0d1p-2, +0x1.1111111111111p-2, +0x1.1515151515151p-2, +0x1.1919191919192p-2, +0x1.1d1d1d1d1d1d2p-2, +0x1.2121212121212p-2, +0x1.2525252525252p-2, +0x1.2929292929293p-2, +0x1.2d2d2d2d2d2d3p-2, +0x1.3131313131313p-2, +0x1.3535353535353p-2, +0x1.3939393939394p-2, +0x1.3d3d3d3d3d3d4p-2, +0x1.4141414141414p-2, +0x1.4545454545454p-2, +0x1.4949494949495p-2, +0x1.4d4d4d4d4d4d5p-2, +0x1.5151515151515p-2, +0x1.5555555555555p-2, +0x1.5959595959596p-2, +0x1.5d5d5d5d5d5d6p-2, +0x1.6161616161616p-2, +0x1.6565656565656p-2, +0x1.6969696969697p-2, +0x1.6d6d6d6d6d6d7p-2, +0x1.7171717171717p-2, +0x1.7575757575757p-2, +0x1.7979797979798p-2, +0x1.7d7d7d7d7d7d8p-2, +0x1.8181818181818p-2, +0x1.8585858585858p-2, +0x1.8989898989899p-2, +0x1.8d8d8d8d8d8d9p-2, +0x1.9191919191919p-2, +0x1.9595959595959p-2, +0x1.999999999999ap-2, +0x1.9d9d9d9d9d9dap-2, +0x1.a1a1a1a1a1a1ap-2, +0x1.a5a5a5a5a5a5ap-2, +0x1.a9a9a9a9a9a9bp-2, +0x1.adadadadadadbp-2, +0x1.b1b1b1b1b1b1bp-2, +0x1.b5b5b5b5b5b5bp-2, +0x1.b9b9b9b9b9b9cp-2, +0x1.bdbdbdbdbdbdcp-2, +0x1.c1c1c1c1c1c1cp-2, +0x1.c5c5c5c5c5c5cp-2, +0x1.c9c9c9c9c9c9dp-2, +0x1.cdcdcdcdcdcddp-2, +0x1.d1d1d1d1d1d1dp-2, +0x1.d5d5d5d5d5d5dp-2, +0x1.d9d9d9d9d9d9ep-2, +0x1.ddddddddddddep-2, +0x1.e1e1e1e1e1e1ep-2, +0x1.e5e5e5e5e5e5ep-2, +0x1.e9e9e9e9e9e9fp-2, +0x1.ededededededfp-2, +0x1.f1f1f1f1f1f1fp-2, +0x1.f5f5f5f5f5f5fp-2, +0x1.f9f9f9f9f9fap-2, +0x1.fdfdfdfdfdfep-2, +0x1.010101010101p-1, +0x1.030303030303p-1, +0x1.050505050505p-1, 
+0x1.070707070707p-1, +0x1.0909090909091p-1, +0x1.0b0b0b0b0b0b1p-1, +0x1.0d0d0d0d0d0d1p-1, +0x1.0f0f0f0f0f0f1p-1, +0x1.1111111111111p-1, +0x1.1313131313131p-1, +0x1.1515151515151p-1, +0x1.1717171717171p-1, +0x1.1919191919192p-1, +0x1.1b1b1b1b1b1b2p-1, +0x1.1d1d1d1d1d1d2p-1, +0x1.1f1f1f1f1f1f2p-1, +0x1.2121212121212p-1, +0x1.2323232323232p-1, +0x1.2525252525252p-1, +0x1.2727272727272p-1, +0x1.2929292929293p-1, +0x1.2b2b2b2b2b2b3p-1, +0x1.2d2d2d2d2d2d3p-1, +0x1.2f2f2f2f2f2f3p-1, +0x1.3131313131313p-1, +0x1.3333333333333p-1, +0x1.3535353535353p-1, +0x1.3737373737373p-1, +0x1.3939393939394p-1, +0x1.3b3b3b3b3b3b4p-1, +0x1.3d3d3d3d3d3d4p-1, +0x1.3f3f3f3f3f3f4p-1, +0x1.4141414141414p-1, +0x1.4343434343434p-1, +0x1.4545454545454p-1, +0x1.4747474747474p-1, +0x1.4949494949495p-1, +0x1.4b4b4b4b4b4b5p-1, +0x1.4d4d4d4d4d4d5p-1, +0x1.4f4f4f4f4f4f5p-1, +0x1.5151515151515p-1, +0x1.5353535353535p-1, +0x1.5555555555555p-1, +0x1.5757575757575p-1, +0x1.5959595959596p-1, +0x1.5b5b5b5b5b5b6p-1, +0x1.5d5d5d5d5d5d6p-1, +0x1.5f5f5f5f5f5f6p-1, +0x1.6161616161616p-1, +0x1.6363636363636p-1, +0x1.6565656565656p-1, +0x1.6767676767676p-1, +0x1.6969696969697p-1, +0x1.6b6b6b6b6b6b7p-1, +0x1.6d6d6d6d6d6d7p-1, +0x1.6f6f6f6f6f6f7p-1, +0x1.7171717171717p-1, +0x1.7373737373737p-1, +0x1.7575757575757p-1, +0x1.7777777777777p-1, +0x1.7979797979798p-1, +0x1.7b7b7b7b7b7b8p-1, +0x1.7d7d7d7d7d7d8p-1, +0x1.7f7f7f7f7f7f8p-1, +0x1.8181818181818p-1, +0x1.8383838383838p-1, +0x1.8585858585858p-1, +0x1.8787878787878p-1, +0x1.8989898989899p-1, +0x1.8b8b8b8b8b8b9p-1, +0x1.8d8d8d8d8d8d9p-1, +0x1.8f8f8f8f8f8f9p-1, +0x1.9191919191919p-1, +0x1.9393939393939p-1, +0x1.9595959595959p-1, +0x1.9797979797979p-1, +0x1.999999999999ap-1, +0x1.9b9b9b9b9b9bap-1, +0x1.9d9d9d9d9d9dap-1, +0x1.9f9f9f9f9f9fap-1, +0x1.a1a1a1a1a1a1ap-1, +0x1.a3a3a3a3a3a3ap-1, +0x1.a5a5a5a5a5a5ap-1, +0x1.a7a7a7a7a7a7ap-1, +0x1.a9a9a9a9a9a9bp-1, +0x1.ababababababbp-1, +0x1.adadadadadadbp-1, +0x1.afafafafafafbp-1, +0x1.b1b1b1b1b1b1bp-1, +0x1.b3b3b3b3b3b3bp-1, 
+0x1.b5b5b5b5b5b5bp-1,
+0x1.b7b7b7b7b7b7bp-1,
+0x1.b9b9b9b9b9b9cp-1,
+0x1.bbbbbbbbbbbbcp-1,
+0x1.bdbdbdbdbdbdcp-1,
+0x1.bfbfbfbfbfbfcp-1,
+0x1.c1c1c1c1c1c1cp-1,
+0x1.c3c3c3c3c3c3cp-1,
+0x1.c5c5c5c5c5c5cp-1,
+0x1.c7c7c7c7c7c7cp-1,
+0x1.c9c9c9c9c9c9dp-1,
+0x1.cbcbcbcbcbcbdp-1,
+0x1.cdcdcdcdcdcddp-1,
+0x1.cfcfcfcfcfcfdp-1,
+0x1.d1d1d1d1d1d1dp-1,
+0x1.d3d3d3d3d3d3dp-1,
+0x1.d5d5d5d5d5d5dp-1,
+0x1.d7d7d7d7d7d7dp-1,
+0x1.d9d9d9d9d9d9ep-1,
+0x1.dbdbdbdbdbdbep-1,
+0x1.ddddddddddddep-1,
+0x1.dfdfdfdfdfdfep-1,
+0x1.e1e1e1e1e1e1ep-1,
+0x1.e3e3e3e3e3e3ep-1,
+0x1.e5e5e5e5e5e5ep-1,
+0x1.e7e7e7e7e7e7ep-1,
+0x1.e9e9e9e9e9e9fp-1,
+0x1.ebebebebebebfp-1,
+0x1.ededededededfp-1,
+0x1.efefefefefeffp-1,
+0x1.f1f1f1f1f1f1fp-1,
+0x1.f3f3f3f3f3f3fp-1,
+0x1.f5f5f5f5f5f5fp-1,
+0x1.f7f7f7f7f7f7fp-1,
+0x1.f9f9f9f9f9fap-1,
+0x1.fbfbfbfbfbfcp-1,
+0x1.fdfdfdfdfdfep-1,
+0x1p+0 /* entry 511, last of the second 256-entry half of the table */
+};
diff --git a/extensions/avx2-int8.c b/extensions/avx2-int8.c
index b6d5165..a3ded4d 100644
--- a/extensions/avx2-int8.c
+++ b/extensions/avx2-int8.c
@@ -338,6 +338,184 @@ conv_rgbaF_linear_rgba8_gamma (const Babl *conversion,
 #undef CVT1
 #undef CVTA1
 
+#define CVT1(src, dst) /* table entries 0..255; post-increments src and dst */ \
+  (*dst++ = gamma_to_linear[*src++])
+/* CVTA1: alpha variant, same lookup shifted into table entries 256..511. */
+#define CVTA1(src, dst) \
+  (*dst++ = gamma_to_linear[*src++ + 256])
+/* Gamma u8 -> linear float, one gamma_to_linear lookup per sample. */
+static inline void
+conv_y8_gamma_yF_linear (const Babl *conversion, /* not referenced here */
+                         const uint8_t *src,     /* read: 1 byte per sample */
+                         float *dst,             /* written: 1 float per sample */
+                         long samples)           /* number of samples */
+{
+  const __m128i *src_vec;
+  __v8sf *dst_vec;
+  /* Scalar lead-in until dst is 32-byte aligned or the input runs out. */
+  while ((uintptr_t) dst % 32 && samples > 0)
+    {
+      CVT1 (src, dst);
+
+      samples--;
+    }
+
+  src_vec = (const __m128i *) src;
+  dst_vec = (__v8sf *) dst;
+  /* Main loop: one unaligned 16-byte load feeds two 8-float gathers. */
+  while (samples >= 16)
+    {
+      __m128i i8_01;
+      __m256i i32_0;
+
+      i8_01 = _mm_loadu_si128 (src_vec++);
+      /* Low 8 bytes widened to 8 x int32 table indices; gather scale 4 bytes. */
+      i32_0 = _mm256_cvtepu8_epi32 (i8_01);
+      *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);
+      /* Swap the 64-bit halves so the upper 8 bytes become the low ones. */
+      i8_01 = _mm_shuffle_epi32 (i8_01, _MM_SHUFFLE (1, 0, 3, 2));
+
+      i32_0 = _mm256_cvtepu8_epi32 (i8_01);
+      *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);
+
+      samples -= 16;
+    }
+
+  src = (const uint8_t *) src_vec;
+  dst = (float *) dst_vec;
+  /* Scalar tail for whatever is left (fewer than 16 samples). */
+  while (samples > 0)
+    {
+      CVT1 (src, dst);
+
+      samples--;
+    }
+}
+/* Same scheme for interleaved Y,A pairs: 8 pairs (16 bytes) per vector step. */
+static inline void
+conv_ya8_gamma_yaF_linear (const Babl *conversion, /* not referenced here */
+                           const uint8_t *src,     /* read: 2 bytes per sample */
+                           float *dst,             /* written: 2 floats per sample */
+                           long samples)           /* number of Y,A pairs */
+{
+  const __m128i *src_vec;
+  __v8sf *dst_vec;
+  const __m256i offset = _mm256_setr_epi32 (0, 256, 0, 256,
+                                            0, 256, 0, 256); /* odd lanes: alpha */
+  /* Scalar lead-in, one pair (8 bytes of dst) per step, until dst % 32 == 0. */
+  while ((uintptr_t) dst % 32 && samples > 0)
+    {
+      CVT1 (src, dst);
+      CVTA1 (src, dst);
+
+      samples--;
+    }
+  /* NOTE(review): if dst % 8 != 0 the loop above never aligns and eats all input. */
+  src_vec = (const __m128i *) src;
+  dst_vec = (__v8sf *) dst;
+  /* Main loop: 16 bytes = 8 pairs in, two vectors of 8 floats out. */
+  while (samples >= 8)
+    {
+      __m128i i8_01;
+      __m256i i32_0;
+
+      i8_01 = _mm_loadu_si128 (src_vec++);
+      /* NOTE(review): += on __m256i below relies on GNU vector extensions. */
+      i32_0 = _mm256_cvtepu8_epi32 (i8_01);
+      i32_0 += offset; /* alpha lanes now index table entries 256..511 */
+      *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);
+      /* Second half of the load: swap the 64-bit halves, then repeat. */
+      i8_01 = _mm_shuffle_epi32 (i8_01, _MM_SHUFFLE (1, 0, 3, 2));
+
+      i32_0 = _mm256_cvtepu8_epi32 (i8_01);
+      i32_0 += offset;
+      *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);
+
+      samples -= 8;
+    }
+
+  src = (const uint8_t *) src_vec;
+  dst = (float *) dst_vec;
+  /* Scalar tail for the remaining (fewer than 8) pairs. */
+  while (samples > 0)
+    {
+      CVT1 (src, dst);
+      CVTA1 (src, dst);
+
+      samples--;
+    }
+}
+/* RGB carries no alpha, so all 3 * samples components take the Y path. */
+static inline void
+conv_rgb8_gamma_rgbF_linear (const Babl *conversion,
+                             const uint8_t *src,
+                             float *dst,
+                             long samples)
+{
+  conv_y8_gamma_yF_linear (conversion, src, dst, 3 * samples);
+}
+/* R,G,B,A pixels: 4 pixels (16 bytes) per vector step, alpha via CVTA1. */
+static inline void
+conv_rgba8_gamma_rgbaF_linear (const Babl *conversion, /* not referenced here */
+                               const uint8_t *src,     /* read: 4 bytes per sample */
+                               float *dst,             /* written: 4 floats per sample */
+                               long samples)           /* number of pixels */
+{
+  const __m128i *src_vec;
+  __v8sf *dst_vec;
+  const __m256i offset = _mm256_setr_epi32 (0, 0, 0, 256,
+                                            0, 0, 0, 256); /* every 4th lane: alpha */
+  /* Scalar lead-in, one pixel (16 bytes of dst) per step, until dst % 32 == 0. */
+  while ((uintptr_t) dst % 32 && samples > 0)
+    {
+      CVT1 (src, dst);
+      CVT1 (src, dst);
+      CVT1 (src, dst);
+      CVTA1 (src, dst);
+
+      samples--;
+    }
+  /* NOTE(review): if dst % 16 != 0 the loop above never aligns and eats all input. */
+  src_vec = (const __m128i *) src;
+  dst_vec = (__v8sf *) dst;
+  /* Main loop: 16 bytes = 4 pixels in, two vectors of 8 floats out. */
+  while (samples >= 4)
+    {
+      __m128i i8_01;
+      __m256i i32_0;
+
+      i8_01 = _mm_loadu_si128 (src_vec++);
+
+      i32_0 = _mm256_cvtepu8_epi32 (i8_01);
+      i32_0 += offset; /* alpha lanes now index table entries 256..511 */
+      *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);
+      /* Second half of the load: swap the 64-bit halves, then repeat. */
+      i8_01 = _mm_shuffle_epi32 (i8_01, _MM_SHUFFLE (1, 0, 3, 2));
+
+      i32_0 = _mm256_cvtepu8_epi32 (i8_01);
+      i32_0 += offset;
+      *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);
+
+      samples -= 4;
+    }
+
+  src = (const uint8_t *) src_vec;
+  dst = (float *) dst_vec;
+  /* Scalar tail for the remaining (fewer than 4) pixels. */
+  while (samples > 0)
+    {
+      CVT1 (src, dst);
+      CVT1 (src, dst);
+      CVT1 (src, dst);
+      CVTA1 (src, dst);
+
+      samples--;
+    }
+}
+
+#undef CVT1
+#undef CVTA1
+
 #endif /* defined(USE_AVX2) */
 
 int init (void);
@@ -407,6 +585,12 @@ init (void)
                            dst ## _gamma, \
                            "linear", \
                            conv_ ## src ## _linear_ ## dst ## _gamma, \
+                           NULL); \
+ \
+      babl_conversion_new (dst ## _gamma, \
+                           src ## _linear, \
+                           "linear", \
+                           conv_ ## dst ## _gamma_ ## src ## _linear, \
                            NULL); \
     } \
   while (0)
-- 
2.30.2