From: Graham Inggs Date: Fri, 7 Jul 2017 15:18:14 +0000 (+0200) Subject: Imported Upstream version 2.1.0 X-Git-Tag: archive/raspbian/2.11.3-2+rpi1^2~10^2~12 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=6f1c1b60552164b40a7b7079f523846a244597f6;p=utf8proc.git Imported Upstream version 2.1.0 --- diff --git a/.gitignore b/.gitignore index 532fa9a..41a6cff 100644 --- a/.gitignore +++ b/.gitignore @@ -17,11 +17,12 @@ data/*.sfd bench/bench bench/icu bench/unistring -normtest -graphemetest -printproperty -charwidth -valid -iterate -case +test/normtest +test/graphemetest +test/printproperty +test/charwidth +test/valid +test/iterate +test/case +test/custom /tmp/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e9b8a1..be676ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,8 +10,8 @@ project (utf8proc C) # API version number (defined in utf8proc.h). # Be sure to also update these in Makefile and MANIFEST! set(SO_MAJOR 2) -set(SO_MINOR 0) -set(SO_PATCH 2) +set(SO_MINOR 1) +set(SO_PATCH 0) add_definitions ( -DUTF8PROC_EXPORTS diff --git a/MANIFEST b/MANIFEST index 106a4f0..b39f8a8 100644 --- a/MANIFEST +++ b/MANIFEST @@ -2,6 +2,6 @@ include/ include/utf8proc.h lib/ lib/libutf8proc.a -lib/libutf8proc.so -> libutf8proc.so.2.0.2 -lib/libutf8proc.so.2 -> libutf8proc.so.2.0.2 -lib/libutf8proc.so.2.0.2 +lib/libutf8proc.so -> libutf8proc.so.2.1.0 +lib/libutf8proc.so.2 -> libutf8proc.so.2.1.0 +lib/libutf8proc.so.2.1.0 diff --git a/Makefile b/Makefile index 2bde4b1..51995c3 100644 --- a/Makefile +++ b/Makefile @@ -21,8 +21,8 @@ UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS # The API version number is defined in utf8proc.h. # Be sure to also update these ABI versions in MANIFEST and CMakeLists.txt! MAJOR=2 -MINOR=0 -PATCH=2 +MINOR=1 +PATCH=0 OS := $(shell uname) ifeq ($(OS),Darwin) # MacOS X @@ -49,7 +49,7 @@ clean: ifneq ($(OS),Darwin) rm -f libutf8proc.so.$(MAJOR) endif - rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case + rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom rm -rf MANIFEST.new tmp $(MAKE) -C bench clean $(MAKE) -C data clean @@ -136,7 +136,10 @@ test/iterate: test/iterate.c test/tests.o utf8proc.o utf8proc.h test/tests.h test/case: test/case.c test/tests.o utf8proc.o utf8proc.h test/tests.h $(CC) $(UCFLAGS) test/case.c test/tests.o utf8proc.o -o $@ -check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o +test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h + $(CC) $(UCFLAGS) test/custom.c test/tests.o utf8proc.o -o $@ + +check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o $(MAKE) -C bench test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt @@ -144,3 +147,4 @@ check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeB test/valid test/iterate test/case + test/custom diff --git a/NEWS.md b/NEWS.md index 6f4c131..22eff0e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,23 @@ # utf8proc release history # +## Version 2.1 ## + +2016-12-16: + +- New functions `utf8proc_map_custom` and `utf8proc_decompose_custom` + to allow user-supplied transformations of codepoints, in conjunction + with other transformations ([#89]). + +- New function `utf8proc_normalize_utf32` to apply normalizations + directly to UTF-32 data (not just UTF-8) ([#88]). + +- Fixed stack overflow that could occur due to incorrect definition + of `UINT16_MAX` with some compilers ([#84]). + +- Fixed conflict with `stdbool.h` in Visual Studio ([#90]). + +- Updated font metrics to use Unifont 9.0.04. + ## Version 2.0.2 ## 2016-07-27: @@ -277,3 +295,9 @@ Release of version 1.0.1 [#70]: https://github.com/JuliaLang/utf8proc/issues/70 [#77]: https://github.com/JuliaLang/utf8proc/issues/77 [#78]: https://github.com/JuliaLang/utf8proc/issues/78 +[#79]: https://github.com/JuliaLang/utf8proc/issues/79 +[#80]: https://github.com/JuliaLang/utf8proc/issues/80 +[#84]: https://github.com/JuliaLang/utf8proc/pull/84 +[#88]: https://github.com/JuliaLang/utf8proc/pull/88 +[#89]: https://github.com/JuliaLang/utf8proc/pull/89 +[#90]: https://github.com/JuliaLang/utf8proc/issues/90 diff --git a/data/Makefile b/data/Makefile index c41f601..19d375f 100644 --- a/data/Makefile +++ b/data/Makefile @@ -20,7 +20,7 @@ utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt $(RUBY) data_generator.rb < UnicodeData.txt > $@ # GNU Unifont version for font metric calculations: -UNIFONT_VERSION=9.0.01 +UNIFONT_VERSION=9.0.04 unifont.ttf: $(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://mirrors.kernel.org/gnu/unifont/unifont-$(UNIFONT_VERSION)/unifont-$(UNIFONT_VERSION).ttf diff --git a/test/custom.c b/test/custom.c new file mode 100644 index 0000000..f85b3cc --- /dev/null +++ b/test/custom.c @@ -0,0 +1,27 @@ +#include "tests.h" + +static int thunk_test = 1; + +static utf8proc_int32_t custom(utf8proc_int32_t codepoint, void *thunk) +{ + check(((int *) thunk) == &thunk_test, "unexpected thunk passed"); + if (codepoint == 'a') + return 'b'; + if (codepoint == 'S') + return 0x00df; /* ß */ + return codepoint; +} + +int main(void) +{ + utf8proc_uint8_t input[] = {0x41,0x61,0x53,0x62,0xef,0xbd,0x81,0x00}; /* "AaSb\uff41" */ + utf8proc_uint8_t correct[] = {0x61,0x62,0x73,0x73,0x62,0x61,0x00}; /* "abssba" */ + utf8proc_uint8_t *output; + utf8proc_map_custom(input, 0, &output, UTF8PROC_CASEFOLD | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_NULLTERM, + custom, &thunk_test); + printf("mapped \"%s\" -> \"%s\"\n", (char*)input, (char*)output); + check(strlen((char*) output) == 6, "incorrect output length"); + check(!memcmp(correct, output, 7), "incorrect output data"); + free(output); + return 0; +} diff --git a/utf8proc.c b/utf8proc.c index ba5143a..c14bbe1 100644 --- a/utf8proc.c +++ b/utf8proc.c @@ -166,24 +166,24 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, ut if (uc < 0x00) { return 0; } else if (uc < 0x80) { - dst[0] = uc; + dst[0] = (utf8proc_uint8_t) uc; return 1; } else if (uc < 0x800) { - dst[0] = 0xC0 + (uc >> 6); - dst[1] = 0x80 + (uc & 0x3F); + dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6)); + dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); return 2; // Note: we allow encoding 0xd800-0xdfff here, so as not to change // the API, however, these are actually invalid in UTF-8 } else if (uc < 0x10000) { - dst[0] = 0xE0 + (uc >> 12); - dst[1] = 0x80 + ((uc >> 6) & 0x3F); - dst[2] = 0x80 + (uc & 0x3F); + dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12)); + dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); + dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); return 3; } else if (uc < 0x110000) { - dst[0] = 0xF0 + (uc >> 18); - dst[1] = 0x80 + ((uc >> 12) & 0x3F); - dst[2] = 0x80 + ((uc >> 6) & 0x3F); - dst[3] = 0x80 + (uc & 0x3F); + dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18)); + dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F)); + dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); + dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); return 4; } else return 0; } @@ -193,28 +193,28 @@ static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t if (uc < 0x00) { return 0; } else if (uc < 0x80) { - dst[0] = uc; + dst[0] = (utf8proc_uint8_t)uc; return 1; } else if (uc < 0x800) { - dst[0] = 0xC0 + (uc >> 6); - dst[1] = 0x80 + (uc & 0x3F); + dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6)); + dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); return 2; } else if (uc == 0xFFFF) { - dst[0] = 0xFF; + dst[0] = (utf8proc_uint8_t)0xFF; return 1; } else if (uc == 0xFFFE) { - dst[0] = 0xFE; + dst[0] = (utf8proc_uint8_t)0xFE; return 1; } else if (uc < 0x10000) { - dst[0] = 0xE0 + (uc >> 12); - dst[1] = 0x80 + ((uc >> 6) & 0x3F); - dst[2] = 0x80 + (uc & 0x3F); + dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12)); + dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); + dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); return 3; } else if (uc < 0x110000) { - dst[0] = 0xF0 + (uc >> 18); - dst[1] = 0x80 + ((uc >> 12) & 0x3F); - dst[2] = 0x80 + ((uc >> 6) & 0x3F); - dst[3] = 0x80 + (uc & 0x3F); + dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18)); + dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F)); + dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); + dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); return 4; } else return 0; } @@ -391,8 +391,6 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) { return s[utf8proc_category(c)]; } - - #define utf8proc_decompose_lump(replacement_uc) \ return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ options & ~UTF8PROC_LUMP, last_boundclass) @@ -485,6 +483,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options +) { + return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL); +} + +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom( + const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, + utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options, + utf8proc_custom_func custom_func, void *custom_data ) { /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */ utf8proc_ssize_t wpos = 0; @@ -511,6 +517,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc); if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; } + if (custom_func != NULL) { + uc = custom_func(uc, custom_data); /* user-specified custom mapping */ + } decomp_result = utf8proc_decompose_char( uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options, &boundclass @@ -545,9 +554,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( return wpos; } -UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) { - /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored - ASSERT: 'buffer' has one spare byte of free space at the end! */ +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) { + /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */ if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) { utf8proc_ssize_t rpos; utf8proc_ssize_t wpos = 0; @@ -655,6 +663,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, } length = wpos; } + return length; +} + +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) { + /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored + ASSERT: 'buffer' has one spare byte of free space at the end! */ + length = utf8proc_normalize_utf32(buffer, length, options); + if (length < 0) return length; { utf8proc_ssize_t rpos, wpos = 0; utf8proc_int32_t uc; @@ -676,15 +692,22 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options +) { + return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL); +} + +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( + const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options, + utf8proc_custom_func custom_func, void *custom_data ) { utf8proc_int32_t *buffer; utf8proc_ssize_t result; *dstptr = NULL; - result = utf8proc_decompose(str, strlen, NULL, 0, options); + result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data); if (result < 0) return result; buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1); if (!buffer) return UTF8PROC_ERROR_NOMEM; - result = utf8proc_decompose(str, strlen, buffer, result, options); + result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data); if (result < 0) { free(buffer); return result; @@ -730,4 +753,3 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) UTF8PROC_COMPOSE | UTF8PROC_COMPAT); return retval; } - diff --git a/utf8proc.h b/utf8proc.h index 82c0902..edf46d4 100644 --- a/utf8proc.h +++ b/utf8proc.h @@ -71,14 +71,15 @@ /** The MAJOR version number (increased when backwards API compatibility is broken). */ #define UTF8PROC_VERSION_MAJOR 2 /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */ -#define UTF8PROC_VERSION_MINOR 0 +#define UTF8PROC_VERSION_MINOR 1 /** The PATCH version (increased for fixes that do not change the API). */ -#define UTF8PROC_VERSION_PATCH 2 +#define UTF8PROC_VERSION_PATCH 0 /** @} */ #include #include -#ifdef _MSC_VER +#if defined(_MSC_VER) && _MSC_VER < 1800 +// MSVC prior to 2013 lacked stdbool.h and inttypes.h typedef signed char utf8proc_int8_t; typedef unsigned char utf8proc_uint8_t; typedef short utf8proc_int16_t; @@ -93,12 +94,18 @@ typedef int utf8proc_ssize_t; typedef unsigned int utf8proc_size_t; # endif # ifndef __cplusplus +// emulate C99 bool typedef unsigned char utf8proc_bool; -enum {false, true}; +# ifndef __bool_true_false_are_defined +# define false 0 +# define true 1 +# define __bool_true_false_are_defined 1 +# endif # else typedef bool utf8proc_bool; # endif #else +# include # include # include typedef int8_t utf8proc_int8_t; @@ -108,7 +115,7 @@ typedef uint16_t utf8proc_uint16_t; typedef int32_t utf8proc_int32_t; typedef uint32_t utf8proc_uint32_t; typedef size_t utf8proc_size_t; -typedef ssize_t utf8proc_ssize_t; +typedef ptrdiff_t utf8proc_ssize_t; typedef bool utf8proc_bool; #endif #include @@ -134,7 +141,7 @@ extern "C" { #endif #ifndef UINT16_MAX -# define UINT16_MAX ~(utf8proc_uint16_t)0 +# define UINT16_MAX 65535U #endif /** @@ -373,6 +380,13 @@ typedef enum { UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZJW */ } utf8proc_boundclass_t; +/** + * Function pointer type passed to @ref utf8proc_map_custom and + * @ref utf8proc_decompose_custom, which is used to specify a user-defined + * mapping of codepoints to be applied in conjunction with other mappings. + */ +typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t codepoint, void *data); + /** * Array containing the byte lengths of a UTF-8 encoded codepoint based * on the first byte. @@ -480,6 +494,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char( * `buffer` (which must contain at least `bufsize` entries). In case of * success, the number of codepoints written is returned; in case of an * error, a negative error code is returned (@ref utf8proc_errmsg). + * See @ref utf8proc_decompose_custom to supply additional transformations. * * If the number of written codepoints would be bigger than `bufsize`, the * required buffer size is returned, while the buffer will be overwritten with @@ -490,9 +505,47 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options ); +/** + * The same as @ref utf8proc_decompose, but also takes a `custom_func` mapping function + * that is called on each codepoint in `str` before any other transformations + * (along with a `custom_data` pointer that is passed through to `custom_func`). + * The `custom_func` argument is ignored if it is `NULL`. See also @ref utf8proc_map_custom. + */ +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom( + const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, + utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options, + utf8proc_custom_func custom_func, void *custom_data +); + +/** + * Normalizes the sequence of `length` codepoints pointed to by `buffer` + * in-place (i.e., the result is also stored in `buffer`). + * + * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode. + * @param length the length (in codepoints) of the buffer. + * @param options a bitwise or (`|`) of one or more of the following flags: + * - @ref UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS + * - @ref UTF8PROC_NLF2PS - convert LF, CRLF, CR and NEL into PS + * - @ref UTF8PROC_NLF2LF - convert LF, CRLF, CR and NEL into LF + * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters + * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite + * codepoints + * - @ref UTF8PROC_STABLE - prohibit combining characters that would violate + * the unicode versioning stability + * + * @return + * In case of success, the length (in codepoints) of the normalized UTF-32 string is + * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg). + * + * @warning The entries of the array pointed to by `str` have to be in the + * range `0x0000` to `0x10FFFF`. Otherwise, the program might crash! + */ +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options); + /** * Reencodes the sequence of `length` codepoints pointed to by `buffer` * UTF-8 data in-place (i.e., the result is also stored in `buffer`). + * Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion. * * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode. * @param length the length (in codepoints) of the buffer. @@ -505,10 +558,12 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( * codepoints * - @ref UTF8PROC_STABLE - prohibit combining characters that would violate * the unicode versioning stability + * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster * * @return - * In case of success, the length (in bytes) of the resulting UTF-8 string is - * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg). + * In case of success, the length (in bytes) of the resulting nul-terminated + * UTF-8 string is returned; otherwise, a negative error code is returned + * (@ref utf8proc_errmsg). * * @warning The amount of free space pointed to by `buffer` must * exceed the amount of the input data by one byte, and the @@ -567,7 +622,7 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c); * Given a codepoint, return a character width analogous to `wcwidth(codepoint)`, * except that a width of 0 is returned for non-printable codepoints * instead of -1 as in `wcwidth`. - * + * * @note * If you want to check for particular types of non-printable characters, * (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */ @@ -595,7 +650,8 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoi * in any case the result will be NULL terminated (though it might * contain NULL characters with the string if `str` contained NULL * characters). Other flags in the `options` field are passed to the - * functions defined above, and regarded as described. + * functions defined above, and regarded as described. See also + * @ref utfproc_map_custom to supply a custom codepoint transformation. * * In case of success the length of the new string is returned, * otherwise a negative error code is returned. @@ -607,6 +663,17 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options ); +/** + * Like @ref utf8proc_map, but also takes a `custom_func` mapping function + * that is called on each codepoint in `str` before any other transformations + * (along with a `custom_data` pointer that is passed through to `custom_func`). + * The `custom_func` argument is ignored if it is `NULL`. + */ +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( + const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options, + utf8proc_custom_func custom_func, void *custom_data +); + /** @name Unicode normalization * * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC @@ -619,9 +686,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str); /** NFC normalization (@ref UTF8PROC_COMPOSE). */ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str); -/** NFD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */ +/** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str); -/** NFD normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */ +/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str); /** @} */ @@ -630,4 +697,3 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str); #endif #endif - diff --git a/utf8proc_data.c b/utf8proc_data.c index 02541e0..d8a56bb 100644 --- a/utf8proc_data.c +++ b/utf8proc_data.c @@ -5896,7 +5896,7 @@ const utf8proc_uint16_t utf8proc_stage2table[] = { 540, 540, 540, 1180, 0, 0, 0, 0, 0, 1154, 1154, 1154, 1154, 1154, 1154, 1154, 1154, 1154, 1154, 0, 0, 0, 0, 1103, - 1158, 0, 0, 0, 0, 0, 0, 0, + 1103, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -7847,7 +7847,7 @@ const utf8proc_property_t utf8proc_properties[] = { {UTF8PROC_CATEGORY_MN, 122, UTF8PROC_BIDI_CLASS_NSM, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, false, false, false, 0, 0, UTF8PROC_BOUNDCLASS_EXTEND}, {UTF8PROC_CATEGORY_LO, 0, UTF8PROC_BIDI_CLASS_L, UTF8PROC_DECOMP_TYPE_COMPAT, 9523, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, {UTF8PROC_CATEGORY_LO, 0, UTF8PROC_BIDI_CLASS_L, UTF8PROC_DECOMP_TYPE_COMPAT, 9525, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, - {UTF8PROC_CATEGORY_PO, 0, UTF8PROC_BIDI_CLASS_L, UTF8PROC_DECOMP_TYPE_NOBREAK, 1335, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, + {UTF8PROC_CATEGORY_PO, 0, UTF8PROC_BIDI_CLASS_L, UTF8PROC_DECOMP_TYPE_NOBREAK, 1335, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, false, false, false, 2, 0, UTF8PROC_BOUNDCLASS_OTHER}, {UTF8PROC_CATEGORY_MN, 216, UTF8PROC_BIDI_CLASS_NSM, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, false, false, false, 0, 0, UTF8PROC_BOUNDCLASS_EXTEND}, {UTF8PROC_CATEGORY_PS, 0, UTF8PROC_BIDI_CLASS_ON, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, true, false, false, false, 2, 0, UTF8PROC_BOUNDCLASS_OTHER}, {UTF8PROC_CATEGORY_PE, 0, UTF8PROC_BIDI_CLASS_ON, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, true, false, false, false, 2, 0, UTF8PROC_BOUNDCLASS_OTHER}, @@ -10475,7 +10475,7 @@ const utf8proc_property_t utf8proc_properties[] = { {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 1470, UINT16_MAX, 1470, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 1478, UINT16_MAX, 1478, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 5132, UINT16_MAX, 5132, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, - {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 1480, UINT16_MAX, 1480, UINT16_MAX, UINT16_MAX, false, false, false, false, 2, 0, UTF8PROC_BOUNDCLASS_OTHER}, + {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 1480, UINT16_MAX, 1480, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 5133, UINT16_MAX, 5133, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 5134, UINT16_MAX, 5134, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 1482, UINT16_MAX, 1482, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, @@ -12165,7 +12165,7 @@ const utf8proc_property_t utf8proc_properties[] = { {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6787, UINT16_MAX, 6787, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6789, UINT16_MAX, 6789, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6791, UINT16_MAX, 6791, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, - {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6793, UINT16_MAX, 6793, UINT16_MAX, UINT16_MAX, false, false, false, false, 2, 0, UTF8PROC_BOUNDCLASS_OTHER}, + {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6793, UINT16_MAX, 6793, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6795, UINT16_MAX, 6795, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6797, UINT16_MAX, 6797, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6799, UINT16_MAX, 6799, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, @@ -12201,7 +12201,7 @@ const utf8proc_property_t utf8proc_properties[] = { {UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9104, UINT16_MAX, 9104, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, {UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9106, UINT16_MAX, 9106, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, {UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9108, UINT16_MAX, 9108, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, - {UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9110, UINT16_MAX, 9110, UINT16_MAX, false, false, false, false, 2, 0, UTF8PROC_BOUNDCLASS_OTHER}, + {UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9110, UINT16_MAX, 9110, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, {UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9112, UINT16_MAX, 9112, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, {UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9114, UINT16_MAX, 9114, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER}, {UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9116, UINT16_MAX, 9116, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},