diff options
author | Peng Wu <alexepico@gmail.com> | 2017-07-05 15:05:16 +0800 |
---|---|---|
committer | Matthias Clasen <mclasen@redhat.com> | 2017-07-31 18:07:22 +0100 |
commit | 284d357e3d6e29c1437ca18bab347c1af8330908 (patch) | |
tree | 40e61928646c99a7d41b13b1d196bc2f30a57d74 | |
parent | 238ac31bd8cc1ef32f18317328381155093ded07 (diff) | |
download | pango-284d357e3d6e29c1437ca18bab347c1af8330908.tar.gz |
Update pango_default_break function for Sentence Boundary
Re-write the code for Sentence Boundary,
and use the code style like Grapheme Boundary and Word Boundary.
https://bugzilla.gnome.org/show_bug.cgi?id=782813
-rw-r--r-- | pango/break.c | 748 |
1 files changed, 305 insertions, 443 deletions
diff --git a/pango/break.c b/pango/break.c index 5b2128d2..1c36d494 100644 --- a/pango/break.c +++ b/pango/break.c @@ -432,27 +432,6 @@ static const CharJamoProps HangulJamoProps[] = { #define HANGUL(wc) ((wc) >= 0xAC00 && (wc) <= 0xD7A3) #define BACKSPACE_DELETES_CHARACTER(wc) (!LATIN (wc) && !CYRILLIC (wc) && !GREEK (wc) && !KANA(wc) && !HANGUL(wc)) -/* p. 132-133 of Unicode spec table 5-6 will help understand this */ -typedef enum -{ - STATE_SENTENCE_OUTSIDE, - STATE_SENTENCE_BODY, - STATE_SENTENCE_TERM, - STATE_SENTENCE_POST_TERM_CLOSE, - STATE_SENTENCE_POST_TERM_SPACE, - STATE_SENTENCE_POST_TERM_SEP, - STATE_SENTENCE_DOT, - STATE_SENTENCE_POST_DOT_CLOSE, - STATE_SENTENCE_POST_DOT_SPACE, - STATE_SENTENCE_POST_DOT_OPEN, - /* never include line/para separators in a sentence for now */ - /* This isn't in the spec, but I can't figure out why they'd include - * one line/para separator in lines ending with Term but not with - * period-terminated lines, so I'm doing it for the dot lines also - */ - STATE_SENTENCE_POST_DOT_SEP -} SentenceState; - /* Previously "123foo" was two words. But in UAX 29 of Unicode, * we know don't break words between consecutive letters and numbers */ @@ -508,7 +487,6 @@ pango_default_break (const gchar *text, JamoType prev_jamo; GUnicodeBreakType next_break_type; - GUnicodeType prev_type; GUnicodeBreakType prev_break_type; /* skips spaces */ gboolean prev_was_break_space; @@ -553,17 +531,34 @@ pango_default_break (const gchar *text, WordBreakType prev_prev_WB_type = WB_Other, prev_WB_type = WB_Other; gint prev_WB_i = -1; + /* See Sentence_Break Property Values table of UAX#29 */ + typedef enum + { + SB_Other, + SB_ExtendFormat, + SB_ParaSep, + SB_Sp, + SB_Lower, + SB_Upper, + SB_OLetter, + SB_Numeric, + SB_ATerm, + SB_SContinue, + SB_STerm, + SB_Close, + /* Rules SB8 and SB8a */ + SB_ATerm_Close_Sp, + SB_STerm_Close_Sp, + } SentenceBreakType; + SentenceBreakType prev_prev_SB_type = SB_Other, prev_SB_type = SB_Other; + gint prev_SB_i = -1; + WordType current_word_type = WordNone; gunichar last_word_letter = 0; gunichar base_character = 0; - SentenceState sentence_state = STATE_SENTENCE_OUTSIDE; - /* Tracks what will be the end of the sentence if a period is - * determined to actually be a sentence-ending period. - */ - gint possible_sentence_end = -1; - /* possible sentence break before Open* after a period-ended sentence */ - gint possible_sentence_boundary = -1; + gint last_sentence_start = -1; + gboolean almost_done = FALSE; gboolean done = FALSE; @@ -572,7 +567,6 @@ pango_default_break (const gchar *text, next = text; - prev_type = G_UNICODE_PARAGRAPH_SEPARATOR; prev_break_type = G_UNICODE_BREAK_UNKNOWN; prev_was_break_space = FALSE; prev_wc = 0; @@ -601,6 +595,7 @@ pango_default_break (const gchar *text, /* UAX#29 boundaries */ gboolean is_grapheme_boundary; gboolean is_word_boundary; + gboolean is_sentence_boundary; wc = next_wc; @@ -1078,6 +1073,276 @@ pango_default_break (const gchar *text, attrs[i].is_word_boundary = is_word_boundary; } + /* ---- UAX#29 Sentence Boundaries ---- */ + { + is_sentence_boundary = FALSE; + if (is_word_boundary || + wc == '\r' || wc == '\n') /* Rules SB3 and SB5 */ + { + SentenceBreakType SB_type; + + /* Find the SentenceBreakType of wc */ + SB_type = SB_Other; + + if (break_type == G_UNICODE_BREAK_NUMERIC) + SB_type = SB_Numeric; /* Numeric */ + + if (SB_type == SB_Other) + switch ((int) type) + { + case G_UNICODE_CONTROL: + if (wc == '\r' || wc == '\n') + SB_type = SB_ParaSep; + else if (wc == 0x0009 || wc == 0x000B || wc == 0x000C) + SB_type = SB_Sp; + else if (wc == 0x0085) + SB_type = SB_ParaSep; + break; + + case G_UNICODE_SPACE_SEPARATOR: + if (wc == 0x0020 || wc == 0x00A0 || wc == 0x1680 || + (wc >= 0x2000 && wc <= 0x200A) || + wc == 0x202F || wc == 0x205F || wc == 0x3000) + SB_type = SB_Sp; + break; + + case G_UNICODE_LINE_SEPARATOR: + case G_UNICODE_PARAGRAPH_SEPARATOR: + SB_type = SB_ParaSep; + break; + + case G_UNICODE_FORMAT: + case G_UNICODE_SPACING_MARK: + case G_UNICODE_ENCLOSING_MARK: + case G_UNICODE_NON_SPACING_MARK: + SB_type = SB_ExtendFormat; /* Extend, Format */ + break; + + case G_UNICODE_MODIFIER_LETTER: + if (wc >= 0xFF9E && wc <= 0xFF9F) + SB_type = SB_ExtendFormat; /* Other_Grapheme_Extend */ + break; + + case G_UNICODE_TITLECASE_LETTER: + SB_type = SB_Upper; + break; + + case G_UNICODE_DASH_PUNCTUATION: + if (wc == 0x002D || + (wc >= 0x2013 && wc <= 0x2014) || + (wc >= 0xFE31 && wc <= 0xFE32) || + wc == 0xFE58 || + wc == 0xFE63 || + wc == 0xFF0D) + SB_type = SB_SContinue; + break; + + case G_UNICODE_OTHER_PUNCTUATION: + if (wc == 0x05F3) + SB_type = SB_OLetter; + else if (wc == 0x002E || wc == 0x2024 || + wc == 0xFE52 || wc == 0xFF0E) + SB_type = SB_ATerm; + + if (wc == 0x002C || + wc == 0x003A || + wc == 0x055D || + (wc >= 0x060C && wc <= 0x060D) || + wc == 0x07F8 || + wc == 0x1802 || + wc == 0x1808 || + wc == 0x3001 || + (wc >= 0xFE10 && wc <= 0xFE11) || + wc == 0xFE13 || + (wc >= 0xFE50 && wc <= 0xFE51) || + wc == 0xFE55 || + wc == 0xFF0C || + wc == 0xFF1A || + wc == 0xFF64) + SB_type = SB_SContinue; + + if (wc == 0x0021 || + wc == 0x003F || + wc == 0x0589 || + wc == 0x061F || + wc == 0x06D4 || + (wc >= 0x0700 && wc <= 0x0702) || + wc == 0x07F9 || + (wc >= 0x0964 && wc <= 0x0965) || + (wc >= 0x104A && wc <= 0x104B) || + wc == 0x1362 || + (wc >= 0x1367 && wc <= 0x1368) || + wc == 0x166E || + (wc >= 0x1735 && wc <= 0x1736) || + wc == 0x1803 || + wc == 0x1809 || + (wc >= 0x1944 && wc <= 0x1945) || + (wc >= 0x1AA8 && wc <= 0x1AAB) || + (wc >= 0x1B5A && wc <= 0x1B5B) || + (wc >= 0x1B5E && wc <= 0x1B5F) || + (wc >= 0x1C3B && wc <= 0x1C3C) || + (wc >= 0x1C7E && wc <= 0x1C7F) || + (wc >= 0x203C && wc <= 0x203D) || + (wc >= 0x2047 && wc <= 0x2049) || + wc == 0x2E2E || + wc == 0x2E3C || + wc == 0x3002 || + wc == 0xA4FF || + (wc >= 0xA60E && wc <= 0xA60F) || + wc == 0xA6F3 || + wc == 0xA6F7 || + (wc >= 0xA876 && wc <= 0xA877) || + (wc >= 0xA8CE && wc <= 0xA8CF) || + wc == 0xA92F || + (wc >= 0xA9C8 && wc <= 0xA9C9) || + (wc >= 0xAA5D && wc <= 0xAA5F) || + (wc >= 0xAAF0 && wc <= 0xAAF1) || + wc == 0xABEB || + (wc >= 0xFE56 && wc <= 0xFE57) || + wc == 0xFF01 || + wc == 0xFF1F || + wc == 0xFF61 || + (wc >= 0x10A56 && wc <= 0x10A57) || + (wc >= 0x11047 && wc <= 0x11048) || + (wc >= 0x110BE && wc <= 0x110C1) || + (wc >= 0x11141 && wc <= 0x11143) || + (wc >= 0x111C5 && wc <= 0x111C6) || + wc == 0x111CD || + (wc >= 0x111DE && wc <= 0x111DF) || + (wc >= 0x11238 && wc <= 0x11239) || + (wc >= 0x1123B && wc <= 0x1123C) || + wc == 0x112A9 || + (wc >= 0x1144B && wc <= 0x1144C) || + (wc >= 0x115C2 && wc <= 0x115C3) || + (wc >= 0x115C9 && wc <= 0x115D7) || + (wc >= 0x11641 && wc <= 0x11642) || + (wc >= 0x1173C && wc <= 0x1173E) || + (wc >= 0x11C41 && wc <= 0x11C42) || + (wc >= 0x16A6E && wc <= 0x16A6F) || + wc == 0x16AF5 || + (wc >= 0x16B37 && wc <= 0x16B38) || + wc == 0x16B44 || + wc == 0x1BC9F || + wc == 0x1DA88) + SB_type = SB_STerm; + + break; + } + + if (SB_type == SB_Other) + { + if (g_unichar_islower(wc)) + SB_type = SB_Lower; + else if (g_unichar_isupper(wc)) + SB_type = SB_Upper; + else if (g_unichar_isalpha(wc)) + SB_type = SB_OLetter; + + if (type == G_UNICODE_OPEN_PUNCTUATION || + type == G_UNICODE_CLOSE_PUNCTUATION || + break_type == G_UNICODE_BREAK_QUOTATION) + SB_type = SB_Close; + } + + /* Sentence Boundary Rules */ + + /* We apply Rules SB1 and SB2 at the end of the function */ + +#define IS_OTHER_TERM(SB_type) \ + /* not in (OLetter | Upper | Lower | ParaSep | SATerm) */ \ + !(SB_type == SB_OLetter || \ + SB_type == SB_Upper || SB_type == SB_Lower || \ + SB_type == SB_ParaSep || \ + SB_type == SB_ATerm || SB_type == SB_STerm || \ + SB_type == SB_ATerm_Close_Sp || \ + SB_type == SB_STerm_Close_Sp) + + + if (wc == '\n' && prev_wc == '\r') + is_sentence_boundary = FALSE; /* Rule SB3 */ + else if (prev_SB_type == SB_ParaSep && prev_SB_i + 1 == i) + { + /* The extra check for prev_SB_i is to correctly handle sequences like + * ParaSep ÷ Extend × Extend + * since we have not skipped ExtendFormat yet. + */ + + is_sentence_boundary = TRUE; /* Rule SB4 */ + } + else if (SB_type == SB_ExtendFormat) + is_sentence_boundary = FALSE; /* Rule SB5? */ + else if (prev_SB_type == SB_ATerm && SB_type == SB_Numeric) + is_sentence_boundary = FALSE; /* Rule SB6 */ + else if ((prev_prev_SB_type == SB_Upper || + prev_prev_SB_type == SB_Lower) && + prev_SB_type == SB_ATerm && + SB_type == SB_Upper) + is_sentence_boundary = FALSE; /* Rule SB7 */ + else if (prev_SB_type == SB_ATerm && SB_type == SB_Close) + SB_type = SB_ATerm; + else if (prev_SB_type == SB_STerm && SB_type == SB_Close) + SB_type = SB_STerm; + else if (prev_SB_type == SB_ATerm && SB_type == SB_Sp) + SB_type = SB_ATerm_Close_Sp; + else if (prev_SB_type == SB_STerm && SB_type == SB_Sp) + SB_type = SB_STerm_Close_Sp; + /* Rule SB8 */ + else if ((prev_SB_type == SB_ATerm || + prev_SB_type == SB_ATerm_Close_Sp) && + SB_type == SB_Lower) + is_sentence_boundary = FALSE; + else if ((prev_prev_SB_type == SB_ATerm || + prev_prev_SB_type == SB_ATerm_Close_Sp) && + IS_OTHER_TERM(prev_SB_type) && + SB_type == SB_Lower) + attrs[prev_SB_i].is_sentence_boundary = FALSE; + else if ((prev_SB_type == SB_ATerm || + prev_SB_type == SB_ATerm_Close_Sp || + prev_SB_type == SB_STerm || + prev_SB_type == SB_STerm_Close_Sp) && + (SB_type == SB_SContinue || + SB_type == SB_ATerm || SB_type == SB_STerm)) + is_sentence_boundary = FALSE; /* Rule SB8a */ + else if ((prev_SB_type == SB_ATerm || + prev_SB_type == SB_STerm) && + (SB_type == SB_Close || SB_type == SB_Sp || + SB_type == SB_ParaSep)) + is_sentence_boundary = FALSE; /* Rule SB9 */ + else if ((prev_SB_type == SB_ATerm || + prev_SB_type == SB_ATerm_Close_Sp || + prev_SB_type == SB_STerm || + prev_SB_type == SB_STerm_Close_Sp) && + (SB_type == SB_Sp || SB_type == SB_ParaSep)) + is_sentence_boundary = FALSE; /* Rule SB10 */ + else if ((prev_SB_type == SB_ATerm || + prev_SB_type == SB_ATerm_Close_Sp || + prev_SB_type == SB_STerm || + prev_SB_type == SB_STerm_Close_Sp) && + SB_type != SB_ParaSep) + is_sentence_boundary = TRUE; /* Rule SB11 */ + else + is_sentence_boundary = FALSE; /* Rule SB998 */ + + if (SB_type != SB_ExtendFormat && + !((prev_prev_SB_type == SB_ATerm || + prev_prev_SB_type == SB_ATerm_Close_Sp) && + IS_OTHER_TERM(prev_SB_type) && + IS_OTHER_TERM(SB_type))) + { + prev_prev_SB_type = prev_SB_type; + prev_SB_type = SB_type; + prev_SB_i = i; + } + +#undef IS_OTHER_TERM + + } + + if (i == 0 || done) + is_sentence_boundary = TRUE; /* Rules SB1 and SB2 */ + + attrs[i].is_sentence_boundary = is_sentence_boundary; + } /* ---- Line breaking ---- */ @@ -1371,424 +1636,20 @@ pango_default_break (const gchar *text, /* ---- Sentence breaks ---- */ - /* The Unicode spec specifies sentence breakpoints, so that a piece of - * text would be partitioned into sentences, and all characters would - * be inside some sentence. This code implements that for is_sentence_boundary, - * but tries to keep leading/trailing whitespace out of sentences for - * the start/end flags - */ - - /* The Unicode spec seems to say that one trailing line/para - * separator can be tacked on to a sentence ending in ! or ?, - * but not a sentence ending in period; I think they're on crack - * so am allowing one to be tacked onto a sentence ending in period. - */ - -#define MAYBE_START_NEW_SENTENCE \ - switch ((int) type) \ - { \ - case G_UNICODE_LINE_SEPARATOR: \ - case G_UNICODE_PARAGRAPH_SEPARATOR: \ - case G_UNICODE_CONTROL: \ - case G_UNICODE_FORMAT: \ - case G_UNICODE_SPACE_SEPARATOR: \ - sentence_state = STATE_SENTENCE_OUTSIDE; \ - break; \ - \ - default: \ - sentence_state = STATE_SENTENCE_BODY; \ - attrs[i].is_sentence_start = TRUE; \ - break; \ - } - - /* No sentence break at the start of the text */ - - /* default to not a sentence breakpoint */ - attrs[i].is_sentence_boundary = FALSE; + /* default to not a sentence start/end */ attrs[i].is_sentence_start = FALSE; attrs[i].is_sentence_end = FALSE; - /* FIXME the Unicode spec lumps control/format chars with - * line/para separators in descriptive text, but not in the - * character class specs, in table 5-6, so who knows whether you - * are actually supposed to break on control/format - * characters. Seems semi-broken to break on tabs... - */ - - /* Break after line/para separators except carriage return - * followed by newline - */ - switch ((int) prev_type) - { - case G_UNICODE_LINE_SEPARATOR: - case G_UNICODE_PARAGRAPH_SEPARATOR: - case G_UNICODE_CONTROL: - case G_UNICODE_FORMAT: - if (wc == '\r') - { - if (next_wc != '\n') - attrs[i].is_sentence_boundary = TRUE; - } - else - attrs[i].is_sentence_boundary = TRUE; - break; - - default: - break; - } - - /* break before para/line separators except newline following - * carriage return - */ - switch ((int) type) - { - case G_UNICODE_LINE_SEPARATOR: - case G_UNICODE_PARAGRAPH_SEPARATOR: - case G_UNICODE_CONTROL: - case G_UNICODE_FORMAT: - if (wc == '\n') - { - if (prev_wc != '\r') - attrs[i].is_sentence_boundary = TRUE; - } - else - attrs[i].is_sentence_boundary = TRUE; - break; - - default: - break; - } - - switch (sentence_state) - { - case STATE_SENTENCE_OUTSIDE: - /* Start sentence if we have non-whitespace/format/control */ - switch ((int) type) - { - case G_UNICODE_LINE_SEPARATOR: - case G_UNICODE_PARAGRAPH_SEPARATOR: - case G_UNICODE_CONTROL: - case G_UNICODE_FORMAT: - case G_UNICODE_SPACE_SEPARATOR: - break; - - default: - attrs[i].is_sentence_start = TRUE; - sentence_state = STATE_SENTENCE_BODY; - break; - } - break; - - case STATE_SENTENCE_BODY: - /* If we already broke here due to separators, end the sentence. */ - if (attrs[i].is_sentence_boundary) - { - attrs[i].is_sentence_end = TRUE; - - MAYBE_START_NEW_SENTENCE; - } - else - { - if (wc == '.') - sentence_state = STATE_SENTENCE_DOT; - else if (wc == '?' || wc == '!') - sentence_state = STATE_SENTENCE_TERM; - } - break; - - case STATE_SENTENCE_TERM: - /* End sentence on anything but close punctuation and some - * loosely-specified OTHER_PUNCTUATION such as period, - * comma, etc.; follow Unicode rules for breaks - */ - switch ((int) type) - { - case G_UNICODE_OTHER_PUNCTUATION: - case G_UNICODE_CLOSE_PUNCTUATION: - if (type == G_UNICODE_CLOSE_PUNCTUATION || - wc == '.' || - wc == ',' || - wc == '?' || - wc == '!') - sentence_state = STATE_SENTENCE_POST_TERM_CLOSE; - else - { - attrs[i].is_sentence_end = TRUE; - attrs[i].is_sentence_boundary = TRUE; - - MAYBE_START_NEW_SENTENCE; - } - break; - - case G_UNICODE_SPACE_SEPARATOR: - attrs[i].is_sentence_end = TRUE; - sentence_state = STATE_SENTENCE_POST_TERM_SPACE; - break; - - case G_UNICODE_LINE_SEPARATOR: - case G_UNICODE_PARAGRAPH_SEPARATOR: - attrs[i].is_sentence_end = TRUE; - sentence_state = STATE_SENTENCE_POST_TERM_SEP; - break; - - default: - attrs[i].is_sentence_end = TRUE; - attrs[i].is_sentence_boundary = TRUE; - - MAYBE_START_NEW_SENTENCE; - - break; - } - break; - - case STATE_SENTENCE_POST_TERM_CLOSE: - /* End sentence on anything besides more punctuation; follow - * rules for breaks - */ - switch ((int) type) - { - case G_UNICODE_OTHER_PUNCTUATION: - case G_UNICODE_CLOSE_PUNCTUATION: - if (type == G_UNICODE_CLOSE_PUNCTUATION || - wc == '.' || - wc == ',' || - wc == '?' || - wc == '!') - /* continue in this state */ - ; - else - { - attrs[i].is_sentence_end = TRUE; - attrs[i].is_sentence_boundary = TRUE; - - MAYBE_START_NEW_SENTENCE; - } - break; - - case G_UNICODE_SPACE_SEPARATOR: - attrs[i].is_sentence_end = TRUE; - sentence_state = STATE_SENTENCE_POST_TERM_SPACE; - break; - - case G_UNICODE_LINE_SEPARATOR: - case G_UNICODE_PARAGRAPH_SEPARATOR: - attrs[i].is_sentence_end = TRUE; - /* undo the unconditional break-at-all-line/para-separators - * from above; I'm not sure this is what the Unicode spec - * intends, but it seems right - we get to include - * a single line/para separator in the sentence according - * to their rules - */ - attrs[i].is_sentence_boundary = FALSE; - sentence_state = STATE_SENTENCE_POST_TERM_SEP; - break; - - default: - attrs[i].is_sentence_end = TRUE; - attrs[i].is_sentence_boundary = TRUE; - - MAYBE_START_NEW_SENTENCE; - - break; - } - break; - - case STATE_SENTENCE_POST_TERM_SPACE: - - /* Sentence is definitely already ended; to enter this state - * we had to see a space, which ends the sentence. - */ - - switch ((int) type) - { - case G_UNICODE_SPACE_SEPARATOR: - /* continue in this state */ - break; - - case G_UNICODE_LINE_SEPARATOR: - case G_UNICODE_PARAGRAPH_SEPARATOR: - /* undo the unconditional break-at-all-line/para-separators - * from above; I'm not sure this is what the Unicode spec - * intends, but it seems right - */ - attrs[i].is_sentence_boundary = FALSE; - sentence_state = STATE_SENTENCE_POST_TERM_SEP; - break; - - default: - attrs[i].is_sentence_boundary = TRUE; - - MAYBE_START_NEW_SENTENCE; - - break; - } - break; - - case STATE_SENTENCE_POST_TERM_SEP: - /* Break is forced at this point, unless we're a newline - * after a CR, then we will break after the newline on the - * next iteration. Only a single Sep can be in the - * sentence. - */ - if (!(prev_wc == '\r' && wc == '\n')) - attrs[i].is_sentence_boundary = TRUE; - - MAYBE_START_NEW_SENTENCE; - - break; - - case STATE_SENTENCE_DOT: - switch ((int) type) - { - case G_UNICODE_CLOSE_PUNCTUATION: - sentence_state = STATE_SENTENCE_POST_DOT_CLOSE; - break; - - case G_UNICODE_SPACE_SEPARATOR: - possible_sentence_end = i; - sentence_state = STATE_SENTENCE_POST_DOT_SPACE; - break; - - default: - /* If we broke on a control/format char, end the - * sentence; else this was not a sentence end, since - * we didn't enter the POST_DOT_SPACE state. - */ - if (attrs[i].is_sentence_boundary) - { - attrs[i].is_sentence_end = TRUE; - - MAYBE_START_NEW_SENTENCE; - } - else - sentence_state = STATE_SENTENCE_BODY; - break; - } - break; - - case STATE_SENTENCE_POST_DOT_CLOSE: - switch ((int) type) - { - case G_UNICODE_SPACE_SEPARATOR: - possible_sentence_end = i; - sentence_state = STATE_SENTENCE_POST_DOT_SPACE; - break; - - default: - /* If we broke on a control/format char, end the - * sentence; else this was not a sentence end, since - * we didn't enter the POST_DOT_SPACE state. - */ - if (attrs[i].is_sentence_boundary) - { - attrs[i].is_sentence_end = TRUE; - - MAYBE_START_NEW_SENTENCE; - } - else - sentence_state = STATE_SENTENCE_BODY; - break; - } - break; - - case STATE_SENTENCE_POST_DOT_SPACE: - - possible_sentence_boundary = i; - - switch ((int) type) - { - case G_UNICODE_SPACE_SEPARATOR: - /* remain in current state */ - break; - - case G_UNICODE_OPEN_PUNCTUATION: - sentence_state = STATE_SENTENCE_POST_DOT_OPEN; - break; - - case G_UNICODE_LOWERCASE_LETTER: - /* wasn't a sentence-ending period; so re-enter the sentence - * body - */ - sentence_state = STATE_SENTENCE_BODY; - break; - - default: - /* End the sentence, break, maybe start a new one */ - - g_assert (possible_sentence_end >= 0); - g_assert (possible_sentence_boundary >= 0); - - attrs[possible_sentence_boundary].is_sentence_boundary = TRUE; - attrs[possible_sentence_end].is_sentence_end = TRUE; - - possible_sentence_end = -1; - possible_sentence_boundary = -1; - - MAYBE_START_NEW_SENTENCE; - - break; - } - break; - - case STATE_SENTENCE_POST_DOT_OPEN: - switch ((int) type) - { - case G_UNICODE_OPEN_PUNCTUATION: - /* continue in current state */ - break; - - case G_UNICODE_LOWERCASE_LETTER: - /* wasn't a sentence-ending period; so re-enter the sentence - * body - */ - sentence_state = STATE_SENTENCE_BODY; - break; - - default: - /* End the sentence, break, maybe start a new one */ - - g_assert (possible_sentence_end >= 0); - g_assert (possible_sentence_boundary >= 0); - - attrs[possible_sentence_boundary].is_sentence_boundary = TRUE; - attrs[possible_sentence_end].is_sentence_end = TRUE; - - possible_sentence_end = -1; - possible_sentence_boundary = -1; - - MAYBE_START_NEW_SENTENCE; - - break; - } - break; - - case STATE_SENTENCE_POST_DOT_SEP: - /* Break is forced at this point, unless we're a newline - * after a CR, then we will break after the newline on the - * next iteration. Only a single Sep can be in the - * sentence. - */ - if (!(prev_wc == '\r' && wc == '\n')) - attrs[i].is_sentence_boundary = TRUE; - - g_assert (possible_sentence_end >= 0); - g_assert (possible_sentence_boundary >= 0); - - attrs[possible_sentence_end].is_sentence_end = TRUE; - - possible_sentence_end = -1; - possible_sentence_boundary = -1; - - MAYBE_START_NEW_SENTENCE; - - break; + if (last_sentence_start == -1 && !is_sentence_boundary) { + last_sentence_start = i - 1; + attrs[i - 1].is_sentence_start = TRUE; + } - default: - g_assert_not_reached (); - break; - } + if (last_sentence_start != -1 && is_sentence_boundary) { + last_sentence_start = -1; + attrs[i].is_sentence_end = TRUE; + } - prev_type = type; prev_wc = wc; /* wc might not be a valid Unicode base character, but really all we @@ -1798,6 +1659,7 @@ pango_default_break (const gchar *text, type != G_UNICODE_NON_SPACING_MARK) base_character = wc; } + i--; attrs[i].is_cursor_position = TRUE; /* Rule GB2 */ |