summaryrefslogtreecommitdiff
path: root/pango/break.c
diff options
context:
space:
mode:
authorBehdad Esfahbod <behdad@gnome.org>2008-04-24 17:21:03 +0000
committerBehdad Esfahbod <behdad@src.gnome.org>2008-04-24 17:21:03 +0000
commitc09d3c29d67a00d206809988458574031b8fda8f (patch)
treef99351d50e73370b964ee0046b48605ee7043d97 /pango/break.c
parent35cfe752f8e36e6c7642e4dd0e25261f9aca8842 (diff)
downloadpango-c09d3c29d67a00d206809988458574031b8fda8f.tar.gz
Part of Bug 97545 – Make pango_default_break follow Unicode TR #29
2008-04-24 Behdad Esfahbod <behdad@gnome.org> Part of Bug 97545 – Make pango_default_break follow Unicode TR #29 * pango/break.c (pango_default_break): Make Grapheme Boundary code exactly follow UAX#29 of Unicode 5.1.0 svn path=/trunk/; revision=2615
Diffstat (limited to 'pango/break.c')
-rw-r--r--pango/break.c210
1 files changed, 87 insertions, 123 deletions
diff --git a/pango/break.c b/pango/break.c
index e6b10fcd..69c86501 100644
--- a/pango/break.c
+++ b/pango/break.c
@@ -417,26 +417,6 @@ static const CharJamoProps HangulJamoProps[] = {
#define JAMO_TYPE(btype) \
(IS_JAMO(btype) ? (btype - G_UNICODE_BREAK_HANGUL_L_JAMO) : NO_JAMO)
-
-
-
-/* "virama script" is just an optimization; it includes a bunch of
- * scripts without viramas in them
- */
-#define VIRAMA_SCRIPT(wc) ((wc) >= 0x0901 && (wc) <= 0x17FF)
-#define VIRAMA(wc) ((wc) == 0x094D || \
- (wc) == 0x09CD || \
- (wc) == 0x0A4D || \
- (wc) == 0x0ACD || \
- (wc) == 0x0B4D || \
- (wc) == 0x0BCD || \
- (wc) == 0x0C4D || \
- (wc) == 0x0CCD || \
- (wc) == 0x0D4D || \
- (wc) == 0x0DCA || \
- (wc) == 0x0F84 || \
- (wc) == 0x1039 || \
- (wc) == 0x17D2)
/* Types of Japanese characters */
#define JAPANESE(wc) ((wc) >= 0x2F00 && (wc) <= 0x30FF)
#define KANJI(wc) ((wc) >= 0x2F00 && (wc) <= 0x2FDF)
@@ -450,10 +430,6 @@ static const CharJamoProps HangulJamoProps[] = {
#define HANGUL(wc) ((wc) >= 0xAC00 && (wc) <= 0xD7A3)
#define BACKSPACE_DELETES_CHARACTER(wc) (!LATIN (wc) && !CYRILLIC (wc) && !GREEK (wc) && !KANA(wc) && !HANGUL(wc))
-#define LOGICAL_ORDER_EXCEPTION(wc) (((wc) >= 0x0E40 && (wc) <= 0x0E44) || \
- ((wc) >= 0x0EC0 && (wc) <= 0x0EC4))
-
-
/* p. 132-133 of Unicode spec table 5-6 will help understand this */
typedef enum
{
@@ -534,6 +510,18 @@ pango_default_break (const gchar *text,
GUnicodeBreakType prev_break_type; /* skips spaces */
gboolean prev_was_break_space;
+ /* See Grapheme_Cluster_Break Property Values table of UAX#29 */
+ typedef enum
+ {
+ GB_Other,
+ GB_ControlCRLF,
+ GB_Extend,
+ GB_Prepend,
+ GB_SpacingMark,
+ GB_InHangulSyllable, /* Handles all of L, V, T, LV, LVT rules */
+ } GraphemeBreakType;
+ GraphemeBreakType prev_GB_type = GB_Other;
+
WordType current_word_type = WordNone;
gunichar last_word_letter = 0;
gunichar base_character = 0;
@@ -638,105 +626,81 @@ pango_default_break (const gchar *text,
/* ---- Cursor position breaks (Grapheme breaks) ---- */
- /* TODO: This is quite close to TR29 of Unicode 5.1 now.
- * We need to implement Extend and SpacingMark support, as well
- * as checking the category exceptions.
- */
-
- if (wc == '\n')
- {
- /* Break before line feed unless prev char is a CR */
-
- if (prev_wc != '\r')
- attrs[i].is_cursor_position = TRUE;
- else
- attrs[i].is_cursor_position = FALSE;
- }
- else if (prev_type == G_UNICODE_CONTROL ||
- prev_type == G_UNICODE_FORMAT)
- {
- /* break after all format or control characters */
- attrs[i].is_cursor_position = TRUE;
- }
- else
- {
- switch (type)
- {
- case G_UNICODE_CONTROL:
- case G_UNICODE_FORMAT:
- /* Break before all format or control characters */
- attrs[i].is_cursor_position = TRUE;
- break;
-
- case G_UNICODE_COMBINING_MARK:
- case G_UNICODE_ENCLOSING_MARK:
- case G_UNICODE_NON_SPACING_MARK:
- /* Unicode spec includes "Combining marks plus Tibetan
- * subjoined characters" as joining chars, but lists the
- * Tibetan subjoined characters as combining marks, and
- * g_unichar_type() returns NON_SPACING_MARK for the Tibetan
- * subjoined characters. So who knows, beats me.
- */
-
- /* It's a joining character, break only if preceded by
- * control or format; we already handled the case where
- * it was preceded earlier, so here we know it wasn't,
- * don't break
- */
- attrs[i].is_cursor_position = FALSE;
- break;
-
- case G_UNICODE_LOWERCASE_LETTER:
- case G_UNICODE_MODIFIER_LETTER:
- case G_UNICODE_OTHER_LETTER:
- case G_UNICODE_TITLECASE_LETTER:
- case G_UNICODE_UPPERCASE_LETTER:
-
- /* Handle non-Hangul-syllable non-combining chars */
-
- /* Break before Jamo if they are in a broken sequence or
- * next to non-Jamo; break if preceded by Jamo; don't
- * break if a letter is preceded by a virama; break in
- * all other cases. No need to check whether we are or are
- * preceded by Jamo explicitly, since a Jamo is not
- * a virama, we just break in all cases where we
- * aren't a or preceded by a virama. Don't fool with
- * viramas if we aren't part of a script that uses them.
- */
-
- if (makes_hangul_syllable)
- {
- attrs[i].is_cursor_position = FALSE; /* Rules GB6, GB7, GB8 */
- break;
- }
-
- if (VIRAMA_SCRIPT (wc))
- {
- /* Check whether we're preceded by a virama; this
- * could use some optimization.
- */
- if (VIRAMA (prev_wc))
- {
- attrs[i].is_cursor_position = FALSE;
- break;
- }
- }
-
- /* fall through */
-
- default:
-
- if (LOGICAL_ORDER_EXCEPTION (prev_wc))
- {
- attrs[i].is_cursor_position = FALSE; /* Rule GB9b */
- break;
- }
-
- /* Some weirdo char, just break here, why not */
- attrs[i].is_cursor_position = TRUE; /* Rule GB10 */
- break;
- }
- }
+ {
+ /* Find the GraphemeBreakType of wc */
+ GraphemeBreakType GB_type = GB_Other;
+ switch (type)
+ {
+ case G_UNICODE_CONTROL:
+ case G_UNICODE_FORMAT:
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ if (wc != 0x200C && wc != 0x200D)
+ GB_type = GB_ControlCRLF;
+ else
+ GB_type = GB_Extend; /* U+200C and U+200D are Other_Grapheme_Extend */
+ break;
+
+ case G_UNICODE_OTHER_LETTER:
+ if (makes_hangul_syllable)
+ GB_type = GB_InHangulSyllable;
+ else if ((wc & 0x0E00) == 0x0E00)
+ {
+ /* Thai and Lao stuff hardcoded in UAX#29 */
+ if ((wc >= 0x0E40 && wc <= 0x0E44) || (wc >= 0x0EC0 && wc <= 0x0EC4))
+ GB_type = GB_Prepend; /* Prepend */
+ else if (wc == 0x0E30 || wc == 0x0E32 || wc == 0x0E33 || wc == 0x0E45 ||
+ wc == 0x0EB0 || wc == 0x0EB2 || wc == 0x0EB3)
+ GB_type = GB_Extend; /* Exceptions in the Extend definition */
+ }
+ break;
+
+ case G_UNICODE_MODIFIER_LETTER:
+ if (wc >= 0xFF9E && wc <= 0xFF9F)
+ GB_type = GB_Extend; /* Other_Grapheme_Extend */
+ break;
+
+ case G_UNICODE_COMBINING_MARK:
+ GB_type = GB_SpacingMark; /* SpacingMark */
+ if (wc >= 0x0900)
+ {
+ if (wc == 0x09BE || wc == 0x09D7 ||
+ wc == 0x0B3E || wc == 0x0B57 || wc == 0x0BBE || wc == 0x0BD7 ||
+ wc == 0x0CC2 || wc == 0x0CD5 || wc == 0x0CD6 ||
+ wc == 0x0D3E || wc == 0x0D57 || wc == 0x0DCF || wc == 0x0DDF ||
+ wc == 0x1D165 || (wc >= 0x1D16E && wc <= 0x1D172))
+ GB_type = GB_Extend; /* Other_Grapheme_Extend */
+ }
+ break;
+
+ case G_UNICODE_ENCLOSING_MARK:
+ case G_UNICODE_NON_SPACING_MARK:
+ GB_type = GB_Extend; /* Grapheme_Extend */
+ break;
+
+ default:
+ break;
+ }
+
+ /* Grapheme Cluster Boundary Rules */
+ /* We apply Rules GB1 and GB2 at the end of the function */
+ if (wc == '\n' && prev_wc == '\r')
+ attrs[i].is_cursor_position = FALSE; /* Rule GB3 */
+ else if (GB_type == GB_ControlCRLF || prev_GB_type == GB_ControlCRLF)
+ attrs[i].is_cursor_position = TRUE; /* Rules GB4 and GB5 */
+ else if (GB_type == GB_InHangulSyllable)
+ attrs[i].is_cursor_position = FALSE; /* Rules GB6, GB7, GB8 */
+ else if (GB_type == GB_Extend)
+ attrs[i].is_cursor_position = FALSE; /* Rule GB9 */
+ else if (GB_type == GB_SpacingMark)
+ attrs[i].is_cursor_position = FALSE; /* Rule GB9a */
+ else if (prev_GB_type == GB_Prepend)
+ attrs[i].is_cursor_position = FALSE; /* Rule GB9b */
+ else
+ attrs[i].is_cursor_position = TRUE; /* Rule GB10 */
+
+ prev_GB_type = GB_type;
+ }
/* If this is a grapheme boundary, we have to decide if backspace
* deletes a character or the whole grapheme cluster */