From: Diego Frias
Date: Sat, 22 Nov 2025 18:42:18 +0000 (-0600)
Subject: Fix attempting to combine Hangul Jamo 0x11a7 (#317)
X-Git-Tag: archive/raspbian/2.11.3-2+rpi1^2~10^2^2~4
X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=0260ba56c81e5ef6f06c0804034a36284bcb8710;p=utf8proc.git

Fix attempting to combine Hangul Jamo 0x11a7 (#317)

* Fix attempting to combine Hangul Jamo 0x11a7

0x11a7 is not a valid Hangul trailing consonant (T) despite being equal to T_BASE.
This is because, per the Unicode spec:

    TCount is set to one more than the number of trailing consonants
    relevant to the decomposition algorithm: (0x11C2 - 0x11A8 + 1) + 1

So the first valid Hangul trailing consonant (T) is 0x11a8.

Also see https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G59434
for where the spec describes the usage of 0x11a8, not 0x11a7, during composition.

* document that utf8proc_map simply wraps utf8proc_decompose and utf8proc_reencode (#312)

* test code refactoring (#318)

* Write regression test for #317

---------

Co-authored-by: Steven G.
Johnson
---

diff --git a/test/misc.c b/test/misc.c
index bff793d..7ea2ebc 100644
--- a/test/misc.c
+++ b/test/misc.c
@@ -25,10 +25,30 @@ static void issue102(void) /* #102 */
     check_compare("NFKC_Casefold", input, correct, utf8proc_NFKC_Casefold(input), 1);
 }
 
+static void issue317(void) /* #317: U+11A7 (== TBASE) must not compose as a trailing consonant */
+{
+    utf8proc_uint8_t input[] = {0xec, 0xa3, 0xa0, 0xe1, 0x86, 0xa7, 0x00}; /* "\uc8e0\u11a7" */
+    utf8proc_uint8_t combined[] = {0xec, 0xa3, 0xa1, 0x00}; /* "\uc8e1"; overwritten in the loop below */
+    utf8proc_int32_t codepoint;
+
+    /* inputs that should *not* be combined: U+11A7 (TBASE itself) and U+11C3 (one past the last T) */
+    check_compare("NFC", input, input, utf8proc_NFC(input), 1);
+    utf8proc_encode_char(0x11c3, input+3);
+    check_compare("NFC", input, input, utf8proc_NFC(input), 1);
+
+    /* inputs that *should* be combined (TCOUNT-1 chars starting at TBASE+1, i.e. U+11A8..U+11C2) */
+    for (codepoint = 0x11a8; codepoint < 0x11c3; ++codepoint) {
+        utf8proc_encode_char(codepoint, input+3);
+        utf8proc_encode_char(0xc8e0 + (codepoint - 0x11a7), combined);
+        check_compare("NFC", input, combined, utf8proc_NFC(input), 1);
+    }
+}
+
 int main(void)
 {
     issue128();
     issue102();
+    issue317();
 #ifdef UNICODE_VERSION
     printf("Unicode version: Makefile has %s, has API %s\n", UNICODE_VERSION, utf8proc_unicode_version());
     check(!strcmp(UNICODE_VERSION, utf8proc_unicode_version()), "utf8proc_unicode_version mismatch");
diff --git a/utf8proc.c b/utf8proc.c
index c59bad2..b9877c0 100644
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -684,7 +684,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *b
             (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
           utf8proc_int32_t hangul_tindex;
           hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
-          if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
+          if (hangul_tindex > 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
             *starter += hangul_tindex;
             starter_property = NULL;
             continue;