diff options
-rw-r--r-- | ChangeLog | 11 | ||||
-rw-r--r-- | docs/tmpl/main.sgml | 13 | ||||
-rw-r--r-- | pango/break.c | 242 | ||||
-rw-r--r-- | pango/pango-break.h | 3 | ||||
-rw-r--r-- | tests/testboundaries_ucd.c | 3 |
5 files changed, 247 insertions, 25 deletions
@@ -1,6 +1,17 @@ 2008-04-24 Behdad Esfahbod <behdad@gnome.org> Part of Bug 97545 – Make pango_default_break follow Unicode TR #29 + + * docs/tmpl/main.sgml: + * pango/break.c (pango_default_break): + * pango/pango-break.h: + * tests/testboundaries_ucd.c (main): + Add new PangoLogAttr member is_word_boundary, that implements UAX#29's + Word Boundaries semantics. Test fully passes for it. + +2008-04-24 Behdad Esfahbod <behdad@gnome.org> + + Part of Bug 97545 – Make pango_default_break follow Unicode TR #29 Patch from Noah Levitt * tests/Makefile.am: diff --git a/docs/tmpl/main.sgml b/docs/tmpl/main.sgml index fe37ff91..f5cff8e0 100644 --- a/docs/tmpl/main.sgml +++ b/docs/tmpl/main.sgml @@ -393,6 +393,10 @@ about the attributes of a single character. @is_cursor_position: if set, cursor can appear in front of character. i.e. this is a grapheme boundary, or the first character in the text. + This flag implements Unicode's + <ulink + url="http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries">Grapheme + Cluster Boundaries</ulink> semantics. @is_word_start: is first character in a word @is_word_end: is first non-word char after a word Note that in degenerate cases, you could have both @is_word_start @@ -424,6 +428,15 @@ about the attributes of a single character. characters. @is_expandable_space: is a whitespace character that can possibly be expanded for justification purposes. (Since: 1.18) +@is_word_boundary: is a word boundary. + More specifically, means that this is not a position in the middle + of a word. For example, both sides of a punctuation mark are + considered word boundaries. This flag is particularly useful when + selecting text word-by-word. + This flag implements Unicode's + <ulink url="http://www.unicode.org/reports/tr29/#Word_Boundaries">Word + Boundaries</ulink> semantics. 
+ (Since: 1.22) <!-- ##### FUNCTION pango_shape ##### --> <para> diff --git a/pango/break.c b/pango/break.c index 9e63f063..72d5d541 100644 --- a/pango/break.c +++ b/pango/break.c @@ -522,6 +522,23 @@ pango_default_break (const gchar *text, } GraphemeBreakType; GraphemeBreakType prev_GB_type = GB_Other; + /* See Word_Break Property Values table of UAX#29 */ + typedef enum + { + WB_Other, + WB_NewlineCRLF, + WB_ExtendFormat, + WB_Katakana, + WB_ALetter, + WB_MidNumLet, + WB_MidLetter, + WB_MidNum, + WB_Numeric, + WB_ExtendNumLet, + } WordBreakType; + WordBreakType prev_prev_WB_type = WB_Other, prev_WB_type = WB_Other; + gint prev_WB_i = -1; + WordType current_word_type = WordNone; gunichar last_word_letter = 0; gunichar base_character = 0; @@ -567,6 +584,11 @@ pango_default_break (const gchar *text, JamoType jamo; gboolean makes_hangul_syllable; + /* UAX#29 boundaries */ + gboolean is_grapheme_boundary; + gboolean is_word_boundary; + + wc = next_wc; break_type = next_break_type; @@ -624,23 +646,24 @@ pango_default_break (const gchar *text, */ attrs[i].is_expandable_space = (0x0020 == wc || 0x00A0 == wc); - /* ---- Cursor position breaks (Grapheme breaks) ---- */ - + /* ---- UAX#29 Grapheme Boundaries ---- */ { GraphemeBreakType GB_type; - gboolean is_grapheme_boundary = FALSE; /* Find the GraphemeBreakType of wc */ GB_type = GB_Other; switch (type) { - case G_UNICODE_CONTROL: case G_UNICODE_FORMAT: + if (wc == 0x200C || wc == 0x200D) + { + GB_type = GB_Extend; /* U+200C and U+200D are Other_Grapheme_Extend */ + break; + } + /* fall through */ + case G_UNICODE_CONTROL: case G_UNICODE_LINE_SEPARATOR: case G_UNICODE_PARAGRAPH_SEPARATOR: - if (wc != 0x200C && wc != 0x200D) - GB_type = GB_ControlCRLF; - else - GB_type = GB_Extend; /* U+200C and U+200D are Other_Grapheme_Extend */ + GB_type = GB_ControlCRLF; break; case G_UNICODE_OTHER_LETTER: @@ -679,16 +702,13 @@ pango_default_break (const gchar *text, case G_UNICODE_NON_SPACING_MARK: GB_type = GB_Extend; /* 
Grapheme_Extend */ break; - - default: - break; } /* Grapheme Cluster Boundary Rules */ /* We apply Rules GB1 and GB2 at the end of the function */ if (wc == '\n' && prev_wc == '\r') is_grapheme_boundary = FALSE; /* Rule GB3 */ - else if (GB_type == GB_ControlCRLF || prev_GB_type == GB_ControlCRLF) + else if (prev_GB_type == GB_ControlCRLF || GB_type == GB_ControlCRLF) is_grapheme_boundary = TRUE; /* Rules GB4 and GB5 */ else if (GB_type == GB_InHangulSyllable) is_grapheme_boundary = FALSE; /* Rules GB6, GB7, GB8 */ @@ -704,14 +724,189 @@ pango_default_break (const gchar *text, prev_GB_type = GB_type; attrs[i].is_cursor_position = is_grapheme_boundary; + /* If this is a grapheme boundary, we have to decide if backspace + * deletes a character or the whole grapheme cluster */ + if (is_grapheme_boundary) + attrs[i].backspace_deletes_character = BACKSPACE_DELETES_CHARACTER (base_character); + else + attrs[i].backspace_deletes_character = FALSE; + } + + /* ---- UAX#29 Word Boundaries ---- */ + { + is_word_boundary = FALSE; + if (is_grapheme_boundary) /* Rules WB3 and WB4 */ + { + PangoScript script; + WordBreakType WB_type; + + script = pango_script_for_unichar (wc); + + /* Find the WordBreakType of wc */ + WB_type = WB_Other; + + if (script == PANGO_SCRIPT_KATAKANA) + WB_type = WB_Katakana; + + if (WB_type == WB_Other) + switch (wc >> 8) + { + case 0x30: + if (wc == 0x3031 || wc == 0x3032 || wc == 0x3033 || wc == 0x3034 || wc == 0x3035 || + wc == 0x309b || wc == 0x309c || wc == 0x30a0 || wc == 0x30fc) + WB_type = WB_Katakana; /* Katakana exceptions */ + break; + case 0xFF: + if (wc == 0xFF70) + WB_type = WB_Katakana; /* Katakana exceptions */ + else if (wc >= 0xFF9E && wc <= 0xFF9F) + WB_type = WB_ExtendFormat; /* U+FF9E and U+FF9F are Other_Grapheme_Extend */ + break; + case 0x05: + if (wc == 0x05F3) + WB_type = WB_ALetter; /* ALetter exceptions */ + break; + } + + if (WB_type == WB_Other) + switch (break_type) + { + case G_UNICODE_BREAK_NUMERIC: + if (wc != 0x066C) + WB_type = 
WB_Numeric; /* Numeric */ + break; + case G_UNICODE_BREAK_INFIX_SEPARATOR: + if (wc != 0x003A && wc != 0xFE13 && wc != 0x002E) + WB_type = WB_MidNum; /* MidNum */ + break; + } + + if (WB_type == WB_Other) + switch (type) + { + case G_UNICODE_CONTROL: + if (wc != 0x000D && wc != 0x000A && wc != 0x000B && wc != 0x000C && wc != 0x0085) + break; + /* fall through */ + case G_UNICODE_LINE_SEPARATOR: + case G_UNICODE_PARAGRAPH_SEPARATOR: + WB_type = WB_NewlineCRLF; /* CR, LF, Newline */ + break; + + case G_UNICODE_FORMAT: + case G_UNICODE_COMBINING_MARK: + case G_UNICODE_ENCLOSING_MARK: + case G_UNICODE_NON_SPACING_MARK: + WB_type = WB_ExtendFormat; /* Extend, Format */ + break; + + case G_UNICODE_CONNECT_PUNCTUATION: + WB_type = WB_ExtendNumLet; /* ExtendNumLet */ + break; + + case G_UNICODE_INITIAL_PUNCTUATION: + case G_UNICODE_FINAL_PUNCTUATION: + if (wc == 0x2018 || wc == 0x2019) + WB_type = WB_MidNumLet; /* MidNumLet */ + break; + case G_UNICODE_OTHER_PUNCTUATION: + if (wc == 0x0027 || wc == 0x002e || wc == 0x2024 || + wc == 0xfe52 || wc == 0xff07 || wc == 0xff0e) + WB_type = WB_MidNumLet; /* MidNumLet */ + else if (wc == 0x00b7 || wc == 0x05f4 || wc == 0x2027 || wc == 0x003a || wc == 0x0387 || + wc == 0xfe13 || wc == 0xfe55 || wc == 0xff1a) + WB_type = WB_MidLetter; /* WB_MidLetter */ + else if (wc == 0x066c || + wc == 0xfe50 || wc == 0xfe54 || wc == 0xff0c || wc == 0xff1b) + WB_type = WB_MidNum; /* MidNum */ + break; + + case G_UNICODE_OTHER_SYMBOL: + if (wc >= 0x24B6 && wc <= 0x24E9) /* Other_Alphabetic */ + goto Alphabetic; + break; + + case G_UNICODE_OTHER_LETTER: + case G_UNICODE_LETTER_NUMBER: + if (wc == 0x3006 || wc == 0x3007 || + (wc >= 0x3021 && wc <= 0x3029) || + (wc >= 0x3038 && wc <= 0x303A) || + (wc >= 0x3400 && wc <= 0x4DB5) || + (wc >= 0x4E00 && wc <= 0x9FC3) || + (wc >= 0xF900 && wc <= 0xFA2D) || + (wc >= 0xFA30 && wc <= 0xFA6A) || + (wc >= 0xFA70 && wc <= 0xFAD9) || + (wc >= 0x20000 && wc <= 0x2A6D6) || + (wc >= 0x2F800 && wc <= 0x2FA1D)) + break; 
/* ALetter exceptions: Ideographic */ + goto Alphabetic; + + case G_UNICODE_LOWERCASE_LETTER: + case G_UNICODE_MODIFIER_LETTER: + case G_UNICODE_TITLECASE_LETTER: + case G_UNICODE_UPPERCASE_LETTER: + Alphabetic: + if (break_type != G_UNICODE_BREAK_COMPLEX_CONTEXT && script != PANGO_SCRIPT_HIRAGANA) + WB_type = WB_ALetter; /* ALetter */ + break; + } + + /* Word Boundary Rules */ + + /* We apply Rules WB1 and WB2 at the end of the function */ + + if (prev_wc == 0x3031 && wc == 0x41) + g_message ("Y %d %d", prev_WB_type, WB_type); /* NOTE(review): looks like leftover debug output for U+3031 followed by 'A' - confirm and remove */ + if (prev_WB_type == WB_NewlineCRLF && prev_WB_i + 1 == i) + { + /* The extra check for prev_WB_i is to correctly handle sequences like + * Newline ÷ Extend × Extend + * since we have not skipped ExtendFormat yet. + */ + is_word_boundary = TRUE; /* Rule WB3a */ + } + else if (WB_type == WB_NewlineCRLF) + is_word_boundary = TRUE; /* Rule WB3b */ + else if (WB_type == WB_ExtendFormat) + is_word_boundary = FALSE; /* Rules WB4? */ + else if ((prev_WB_type == WB_ALetter || + prev_WB_type == WB_Numeric || + prev_WB_type == WB_ExtendNumLet) && + ( WB_type == WB_ALetter || + WB_type == WB_Numeric || + WB_type == WB_ExtendNumLet)) + is_word_boundary = FALSE; /* Rules WB5, WB8, WB9, WB10, WB13a, WB13b */ + else if ((prev_WB_type == WB_Katakana || + prev_WB_type == WB_ExtendNumLet) && + ( WB_type == WB_Katakana || + WB_type == WB_ExtendNumLet)) + is_word_boundary = FALSE; /* Rules WB13, WB13a, WB13b */ + else if ((prev_prev_WB_type == WB_ALetter && WB_type == WB_ALetter) && + (prev_WB_type == WB_MidLetter || prev_WB_type == WB_MidNumLet)) + { + attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB6 */ + is_word_boundary = FALSE; /* Rule WB7 */ + } + else if ((prev_prev_WB_type == WB_Numeric && WB_type == WB_Numeric) && + (prev_WB_type == WB_MidNum || prev_WB_type == WB_MidNumLet)) + { + is_word_boundary = FALSE; /* Rule WB11 */ + attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB12 */ + } + else + is_word_boundary = TRUE; /* Rule WB14 
*/ + + if (WB_type != WB_ExtendFormat) + { + prev_prev_WB_type = prev_WB_type; + prev_WB_type = WB_type; + prev_WB_i = i; + } + } + + attrs[i].is_word_boundary = is_word_boundary; } - /* If this is a grapheme boundary, we have to decide if backspace - * deletes a character or the whole grapheme cluster */ - if (attrs[i].is_cursor_position) - attrs[i].backspace_deletes_character = BACKSPACE_DELETES_CHARACTER (base_character); - else - attrs[i].backspace_deletes_character = FALSE; /* ---- Line breaking ---- */ @@ -1446,14 +1641,15 @@ pango_default_break (const gchar *text, } i--; + attrs[i].is_cursor_position = TRUE; /* Rule GB2 */ + attrs[0].is_cursor_position = TRUE; /* Rule GB1 */ + + attrs[i].is_word_boundary = TRUE; /* Rule WB2 */ + attrs[0].is_word_boundary = TRUE; /* Rule WB1 */ + attrs[i].is_line_break = TRUE; /* Rule LB3 */ attrs[0].is_line_break = FALSE; /* Rule LB2 */ - attrs[i].is_word_end = TRUE; /* Rule WB2 */ - attrs[0].is_word_start = TRUE; /* Rule WB1 */ - - attrs[i].is_cursor_position = TRUE; /* Rule GB2 */ - attrs[0].is_cursor_position = TRUE; /* Rule GB1 */ } static gboolean diff --git a/pango/pango-break.h b/pango/pango-break.h index fe22acec..5c326dd4 100644 --- a/pango/pango-break.h +++ b/pango/pango-break.h @@ -74,6 +74,9 @@ struct _PangoLogAttr * width during justification. */ guint is_expandable_space : 1; + + /* Word boundary as defined by UAX#29 */ + guint is_word_boundary : 1; /* is NOT in the middle of a word */ }; /* Determine information about cluster/word/line breaks in a string diff --git a/tests/testboundaries_ucd.c b/tests/testboundaries_ucd.c index 7e266f88..1e6d7d01 100644 --- a/tests/testboundaries_ucd.c +++ b/tests/testboundaries_ucd.c @@ -345,8 +345,7 @@ main (gint argc, filename = g_strdup_printf ("%s/WordBreakTest.txt", srcdir); bits.bits = 0; - bits.attr.is_word_start = 1; /* either word start or end */ - bits.attr.is_word_end = 1; /* (is this right?) 
*/ + bits.attr.is_word_boundary = 1; do_test (filename, bits, FALSE); filename = g_strdup_printf ("%s/SentenceBreakTest.txt", srcdir); |