From: Erik Schnetter Date: Sat, 28 Dec 2024 17:30:53 +0000 (-0500) Subject: Describe character combining table layout X-Git-Tag: archive/raspbian/2.11.3-2+rpi1^2~10^2~1^2~3^2~2 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=23ccf2bbf400fbf2ca5a1a26b72238d109e95d67;p=utf8proc.git Describe character combining table layout --- diff --git a/utf8proc.h b/utf8proc.h index 29798ca..a16f41b 100644 --- a/utf8proc.h +++ b/utf8proc.h @@ -255,6 +255,38 @@ typedef struct utf8proc_property_struct { utf8proc_uint16_t uppercase_seqindex; utf8proc_uint16_t lowercase_seqindex; utf8proc_uint16_t titlecase_seqindex; + /** + * Character combining table. + * + * The character combining table is formally indexed by two + * characters, the first and second character that might form a + * combining pair. The table entry then contains the combined + * character. Most character pairs cannot be combined. There are + * about 1,000 characters that can be the first character in a + * combining pair, and for most, there are only a handful of + * possible second characters. + * + * The combining table is stored as `utf8proc_uint32_t + * utf8proc_combinations[][2]`. That is, it contains the pairs `(second + * combining character, combined character)` for every character + * that can be a first combining character. + * + * - `comb_index`: Index into the combining table if this character + * is the first character in a combining pair, else 0x3ff + * + * - `comb_length`: Number of table entries for this first character + * + * - `comb_issecond`: As an optimization we also record whether this + * character is the second combining character in any pair. If + * not, we can skip the table lookup. + * + * A table lookup starts from a given character pair. It first + * checks whether the first character is stored in the table + * (checking that `comb_index` is not 0x3ff) and whether the second + * character is stored in the table (looking at `comb_issecond`). 
If + * so, the `comb_length` table entries will be checked sequentially + * for a match. + */ utf8proc_uint16_t comb_index:10; utf8proc_uint16_t comb_length:5; utf8proc_uint16_t comb_issecond:1;