diff options
Diffstat (limited to 'pango')
-rw-r--r-- | pango/break.c | 1373 | ||||
-rw-r--r-- | pango/pango-context.c | 13 | ||||
-rw-r--r-- | pango/pango-item.c | 42 | ||||
-rw-r--r-- | pango/pango-item.h | 9 | ||||
-rw-r--r-- | pango/pango-layout.c | 295 | ||||
-rw-r--r-- | pango/pango-layout.h | 3 | ||||
-rw-r--r-- | pango/pango.h | 51 |
7 files changed, 1622 insertions, 164 deletions
diff --git a/pango/break.c b/pango/break.c index 3dc0465b..8e63415b 100644 --- a/pango/break.c +++ b/pango/break.c @@ -22,6 +22,1221 @@ #include "pango.h" #include "pango-modules.h" +/* See http://www.unicode.org/unicode/reports/tr14/ if you hope + * to understand the line breaking code. + */ + +typedef enum +{ + BREAK_ALREADY_HANDLED, /* didn't use the table */ + BREAK_PROHIBITED, /* no break, even if spaces intervene */ + BREAK_IF_SPACES, /* "indirect break" (only if there are spaces) */ + BREAK_ALLOWED /* "direct break" (can always break here) */ +} BreakOpportunity; + +enum +{ + INDEX_OPEN_PUNCTUATION, + INDEX_CLOSE_PUNCTUATION, + INDEX_QUOTATION, + INDEX_NON_BREAKING_GLUE, + INDEX_NON_STARTER, + INDEX_EXCLAMATION, + INDEX_SYMBOL, + INDEX_INFIX_SEPARATOR, + INDEX_PREFIX, + INDEX_POSTFIX, + INDEX_NUMERIC, + INDEX_ALPHABETIC, + INDEX_IDEOGRAPHIC, + INDEX_INSEPARABLE, + INDEX_HYPHEN, + INDEX_AFTER, + INDEX_BEFORE, + INDEX_BEFORE_AND_AFTER, + INDEX_ZERO_WIDTH_SPACE, + INDEX_COMBINING_MARK, + + /* End of the table */ + INDEX_END_OF_TABLE, + + /* The following are not in the tables */ + INDEX_MANDATORY, + INDEX_CARRIAGE_RETURN, + INDEX_LINE_FEED, + INDEX_SURROGATE, + INDEX_CONTINGENT, + INDEX_SPACE, + INDEX_COMPLEX_CONTEXT, + INDEX_AMBIGUOUS, + INDEX_UNKNOWN +}; + +static BreakOpportunity row_OPEN_PUNCTUATION[INDEX_END_OF_TABLE] = { + BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_IF_SPACES +}; + +static BreakOpportunity row_CLOSE_PUNCTUATION[INDEX_END_OF_TABLE] = { + BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_ALLOWED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES +}; + +static BreakOpportunity row_QUOTATION[INDEX_END_OF_TABLE] = { + BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES +}; + +static BreakOpportunity row_NON_BREAKING_GLUE[INDEX_END_OF_TABLE] = { + BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES +}; + +static BreakOpportunity row_NON_STARTER[INDEX_END_OF_TABLE] = { + BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES +}; + +static BreakOpportunity row_EXCLAMATION[INDEX_END_OF_TABLE] = { + BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES +}; + +static BreakOpportunity row_SYMBOL[INDEX_END_OF_TABLE] = { + BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES +}; + +static BreakOpportunity row_INFIX_SEPARATOR[INDEX_END_OF_TABLE] = { + BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES +}; + +static BreakOpportunity row_PREFIX[INDEX_END_OF_TABLE] = { + BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES +}; + +static BreakOpportunity row_POSTFIX[INDEX_END_OF_TABLE] = { + BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES +}; + +static BreakOpportunity row_NUMERIC[INDEX_END_OF_TABLE] = { + BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES +}; + +static BreakOpportunity row_ALPHABETIC[INDEX_END_OF_TABLE] = { + BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES +}; + +static BreakOpportunity row_IDEOGRAPHIC[INDEX_END_OF_TABLE] = { + BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_ALLOWED, + BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES +}; + +static BreakOpportunity row_INSEPARABLE[INDEX_END_OF_TABLE] = { + BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, + BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES +}; + +static BreakOpportunity row_HYPHEN[INDEX_END_OF_TABLE] = { + BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES +}; + +static BreakOpportunity row_AFTER[INDEX_END_OF_TABLE] = { + BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES +}; + +static BreakOpportunity row_BEFORE[INDEX_END_OF_TABLE] = { + BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES +}; + +static BreakOpportunity row_BEFORE_AND_AFTER[INDEX_END_OF_TABLE] = { + BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_IF_SPACES +}; + +static BreakOpportunity row_ZERO_WIDTH_SPACE[INDEX_END_OF_TABLE] = { + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES +}; + +static BreakOpportunity row_COMBINING_MARK[INDEX_END_OF_TABLE] = { + BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, + BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES +}; + +static BreakOpportunity *line_break_rows[INDEX_END_OF_TABLE] = { + row_OPEN_PUNCTUATION, /* INDEX_OPEN_PUNCTUATION */ + row_CLOSE_PUNCTUATION, /* INDEX_CLOSE_PUNCTUATION */ + row_QUOTATION, /* INDEX_QUOTATION */ + row_NON_BREAKING_GLUE, /* INDEX_NON_BREAKING_GLUE */ + row_NON_STARTER, /* INDEX_NON_STARTER */ + row_EXCLAMATION, /* INDEX_EXCLAMATION */ + row_SYMBOL, /* INDEX_SYMBOL */ + row_INFIX_SEPARATOR, /* INDEX_INFIX_SEPARATOR */ + row_PREFIX, /* INDEX_PREFIX */ + row_POSTFIX, /* INDEX_POSTFIX */ + row_NUMERIC, /* INDEX_NUMERIC */ + row_ALPHABETIC, /* INDEX_ALPHABETIC */ + row_IDEOGRAPHIC, /* INDEX_IDEOGRAPHIC */ + row_INSEPARABLE, /* INDEX_INSEPARABLE */ + row_HYPHEN, /* INDEX_HYPHEN */ + row_AFTER, /* INDEX_AFTER */ + row_BEFORE, /* INDEX_BEFORE */ + row_BEFORE_AND_AFTER, /* INDEX_BEFORE_AND_AFTER */ + row_ZERO_WIDTH_SPACE, /* INDEX_ZERO_WIDTH_SPACE */ + row_COMBINING_MARK /* INDEX_COMBINING_MARK */ +}; + +/* Map GUnicodeBreakType to table indexes */ +static int line_break_indexes[] = { + INDEX_MANDATORY, + INDEX_CARRIAGE_RETURN, + INDEX_LINE_FEED, + INDEX_COMBINING_MARK, + INDEX_SURROGATE, + INDEX_ZERO_WIDTH_SPACE, + INDEX_INSEPARABLE, + INDEX_NON_BREAKING_GLUE, + INDEX_CONTINGENT, + INDEX_SPACE, + INDEX_AFTER, + INDEX_BEFORE, + INDEX_BEFORE_AND_AFTER, + INDEX_HYPHEN, + INDEX_NON_STARTER, + INDEX_OPEN_PUNCTUATION, + INDEX_CLOSE_PUNCTUATION, + INDEX_QUOTATION, + INDEX_EXCLAMATION, + INDEX_IDEOGRAPHIC, + INDEX_NUMERIC, + INDEX_INFIX_SEPARATOR, + INDEX_SYMBOL, + INDEX_ALPHABETIC, + INDEX_PREFIX, + INDEX_POSTFIX, + INDEX_COMPLEX_CONTEXT, + INDEX_AMBIGUOUS, + INDEX_UNKNOWN +}; + +#define BREAK_INDEX(btype) \ + (line_break_indexes[(btype)]) +#define BREAK_ROW(before_type) \ + (line_break_rows[BREAK_INDEX (before_type)]) +#define BREAK_OP(before_type, after_type) \ + (BREAK_ROW (before_type)[BREAK_INDEX (after_type)]) +#define IN_BREAK_TABLE(btype) \ + (BREAK_INDEX(btype) < INDEX_END_OF_TABLE) + +/* Keep these in sync with the same macros in the test program */ + +#define LEADING_JAMO(wc) ((wc) >= 0x1100 && (wc) <= 0x115F) +#define VOWEL_JAMO(wc) ((wc) >= 0x1160 && (wc) <= 0x11A2) +#define TRAILING_JAMO(wc) ((wc) >= 0x11A8 && (wc) <= 0x11F9) +#define JAMO(wc) ((wc) >= 0x1100 && (wc) <= 0x11FF) +/* "virama script" is just an optimization; it includes a bunch of + * scripts without viramas in them + */ +#define VIRAMA_SCRIPT(wc) ((wc) >= 0x0901 && (wc) <= 0x17FF) +#define VIRAMA(wc) ((wc) == 0x094D || \ + (wc) == 0x09CD || \ + (wc) == 0x0A4D || \ + (wc) == 0x0ACD || \ + (wc) == 0x0B4D || \ + (wc) == 0x0BCD || \ + (wc) == 0x0C4D || \ + (wc) == 0x0CCD || \ + (wc) == 0x0D4D || \ + (wc) == 0x0DCA || \ + (wc) == 0x0E3A || \ + (wc) == 0x0F84 || \ + (wc) == 0x1039 || \ + (wc) == 0x17D2) +/* Types of Japanese characters */ +#define JAPANESE(wc) ((wc) >= 0x2F00 && (wc) <= 0x30FF) +#define KANJI(wc) ((wc) >= 0x2F00 && (wc) <= 0x2FDF) +#define HIRAGANA(wc) ((wc) >= 0x3040 && (wc) <= 0x309F) +#define KATAKANA(wc) ((wc) >= 0x30A0 && (wc) <= 0x30FF) + + +/* p. 132-133 of Unicode spec table 5-6 will help understand this */ +typedef enum +{ + STATE_SENTENCE_OUTSIDE, + STATE_SENTENCE_BODY, + STATE_SENTENCE_TERM, + STATE_SENTENCE_POST_TERM_CLOSE, + STATE_SENTENCE_POST_TERM_SPACE, + STATE_SENTENCE_POST_TERM_SEP, + STATE_SENTENCE_DOT, + STATE_SENTENCE_POST_DOT_CLOSE, + STATE_SENTENCE_POST_DOT_SPACE, + STATE_SENTENCE_POST_DOT_OPEN, + /* never include line/para separators in a sentence for now */ + /* This isn't in the spec, but I can't figure out why they'd include + * one line/para separator in lines ending with Term but not with + * period-terminated lines, so I'm doing it for the dot lines also + */ + STATE_SENTENCE_POST_DOT_SEP +} SentenceState; + +/* We call "123" and "foobar" words, but "123foo" is two words; + * the Unicode spec just calls "123" a non-word + */ +typedef enum +{ + WordNone, + WordLetters, + WordNumbers +} WordType; + + +/** + * pango_default_break: + * @text: text to break + * @length: length of text in bytes + * @analysis: a #PangoAnalysis for the text + * @attrs: logical attributes to fill in + * + * This is the default break algorithm, used if no language + * engine overrides it. Normally you should use pango_break() + * instead; this function is mostly useful for chaining up + * from a language engine override. Unlike pango_break(), + * @analysis can be NULL, but only do that if you know what + * you're doing. (If you need an analysis to pass to pango_break(), + * you need to pango_itemize() or use pango_get_log_attrs().) + * + **/ +void +pango_default_break (const gchar *text, + gint length, + PangoAnalysis *analysis, + PangoLogAttr *attrs) +{ + /* The rationale for all this is in section 5.15 of the Unicode 3.0 book */ + + /* This is a default break implementation that should work for nearly all + * languages. Language engines can override it optionally. + */ + + /* FIXME one cheesy optimization here would be to memset attrs to 0 + * before we start, and then never assign FALSE to anything + */ + + const gchar *next = text; + const gchar *end = text + length; + gint i = 0; + gunichar prev_wc; + gunichar next_wc; + GUnicodeType prev_type; + GUnicodeBreakType prev_break_type; /* skips spaces */ + gboolean prev_was_break_space; + WordType current_word_type = WordNone; + gunichar last_word_letter = 0; + SentenceState sentence_state = STATE_SENTENCE_OUTSIDE; + /* Tracks what will be the end of the sentence if a period is + * determined to actually be a sentence-ending period. + */ + gint possible_sentence_end = -1; + /* possible sentence break before Open* after a period-ended sentence */ + gint possible_sentence_boundary = -1; + + g_return_if_fail (text != NULL); + g_return_if_fail (attrs != NULL); + + if (next == end) + return; + + prev_type = (GUnicodeType) -1; + prev_break_type = G_UNICODE_BREAK_UNKNOWN; + prev_was_break_space = FALSE; + prev_wc = 0; + + next_wc = g_utf8_get_char (next); + + g_assert (next_wc != 0); + + while (next_wc != 0) + { + GUnicodeType type; + gunichar wc; + GUnicodeBreakType break_type; + BreakOpportunity break_op; + + wc = next_wc; + + next = g_utf8_next_char (next); + + if (next >= end) + next_wc = 0; + else + { + next_wc = g_utf8_get_char (next); + g_assert (next_wc != 0); + } + + type = g_unichar_type (wc); + + /* Can't just use the type here since isspace() doesn't + * correspond to a Unicode character type + */ + attrs[i].is_white = g_unichar_isspace (wc); + + + /* ---- Cursor position breaks (Grapheme breaks) ---- */ + + if (wc == '\n') + { + /* Break before line feed unless prev char is a CR */ + + if (prev_wc != '\r') + attrs[i].is_cursor_position = TRUE; + else + attrs[i].is_cursor_position = FALSE; + } + else if (i == 0 || + prev_type == G_UNICODE_CONTROL || + prev_type == G_UNICODE_FORMAT) + { + /* Break at first position (must be special cased, or if the + * first char is say a combining mark there won't be a + * cursor position at the start, which seems wrong to me + * ???? - maybe it makes sense though, who knows) + */ + /* break after all format or control characters */ + attrs[i].is_cursor_position = TRUE; + } + else + { + switch (type) + { + case G_UNICODE_CONTROL: + case G_UNICODE_FORMAT: + /* Break before all format or control characters */ + attrs[i].is_cursor_position = TRUE; + break; + + case G_UNICODE_COMBINING_MARK: + case G_UNICODE_ENCLOSING_MARK: + case G_UNICODE_NON_SPACING_MARK: + /* Unicode spec includes "Combining marks plus Tibetan + * subjoined characters" as joining chars, but lists the + * Tibetan subjoined characters as combining marks, and + * g_unichar_type() returns NON_SPACING_MARK for the Tibetan + * subjoined characters. So who knows, beats me. + */ + + /* It's a joining character, break only if preceded by + * control or format; we already handled the case where + * it was preceded earlier, so here we know it wasn't, + * don't break + */ + attrs[i].is_cursor_position = FALSE; + break; + + case G_UNICODE_LOWERCASE_LETTER: + case G_UNICODE_MODIFIER_LETTER: + case G_UNICODE_OTHER_LETTER: + case G_UNICODE_TITLECASE_LETTER: + case G_UNICODE_UPPERCASE_LETTER: + if (JAMO (wc)) + { + /* Break before Jamo if they are in a broken sequence or + * next to non-Jamo, otherwise don't + */ + if (LEADING_JAMO (wc) && + !LEADING_JAMO (prev_wc)) + attrs[i].is_cursor_position = TRUE; + else if (VOWEL_JAMO (wc) && + !LEADING_JAMO (prev_wc) && + !VOWEL_JAMO (prev_wc)) + attrs[i].is_cursor_position = TRUE; + else if (TRAILING_JAMO (wc) && + !LEADING_JAMO (prev_wc) && + !VOWEL_JAMO (prev_wc) && + !TRAILING_JAMO (prev_wc)) + attrs[i].is_cursor_position = TRUE; + else + attrs[i].is_cursor_position = FALSE; + } + else + { + /* Handle non-Jamo non-combining chars */ + + /* Break if preceded by Jamo; don't break if a + * letter is preceded by a virama; break in all + * other cases. No need to check whether we're + * preceded by Jamo explicitly, since a Jamo is not + * a virama, we just break in all cases where we + * aren't preceded by a virama. Don't fool with viramas + * if we aren't part of a script that uses them. + */ + + if (VIRAMA_SCRIPT (wc)) + { + /* Check whether we're preceded by a virama; this + * could use some optimization. + */ + if (VIRAMA (prev_wc)) + attrs[i].is_cursor_position = FALSE; + else + attrs[i].is_cursor_position = TRUE; + } + else + { + attrs[i].is_cursor_position = TRUE; + } + } + break; + + default: + /* Some weirdo char, just break here, why not */ + attrs[i].is_cursor_position = TRUE; + break; + } + } + + /* ---- Line breaking ---- */ + + break_type = g_unichar_break_type (wc); + break_op = BREAK_ALREADY_HANDLED; + + g_assert (prev_break_type != G_UNICODE_BREAK_SPACE); + + attrs[i].is_break = FALSE; + attrs[i].is_mandatory_break = FALSE; + + if (attrs[i].is_cursor_position) /* If it's not a grapheme boundary, + * it's not a line break either + */ + { + switch (prev_break_type) + { + case G_UNICODE_BREAK_MANDATORY: + case G_UNICODE_BREAK_LINE_FEED: + attrs[i].is_break = TRUE; + attrs[i].is_mandatory_break = TRUE; + break; + + case G_UNICODE_BREAK_CARRIAGE_RETURN: + if (wc != '\n') + { + attrs[i].is_break = TRUE; + attrs[i].is_mandatory_break = TRUE; + } + break; + + case G_UNICODE_BREAK_CONTINGENT: + /* can break after 0xFFFC by default, though we might want + * to eventually have a PangoLayout setting or + * PangoAttribute that disables this, if for some + * application breaking after objects is not desired. + */ + break_op = BREAK_ALLOWED; + break; + + case G_UNICODE_BREAK_SURROGATE: + /* FIXME I have no clue what to do with these, + * but we should do something with them + */ + break; + + case G_UNICODE_BREAK_AMBIGUOUS: + /* FIXME we need to resolve the East Asian width + * to decide what to do here + */ + case G_UNICODE_BREAK_COMPLEX_CONTEXT: + /* FIXME language engines should handle this case... */ + case G_UNICODE_BREAK_UNKNOWN: + /* treat unknown, complex, ambiguous as if they were + * alphabetic for now. + */ + prev_break_type = G_UNICODE_BREAK_ALPHABETIC; + /* FALL THRU to use the pair table if appropriate */ + + default: + + /* Note that our table assumes that combining marks + * are only applied to alphabetic characters; + * tech report 14 explains how to remove this assumption + * from the code, if anyone ever cares, but it shouldn't + * be a problem. Also this issue sort of goes + * away since we only look for breaks on grapheme + * boundaries. + */ + + g_assert (IN_BREAK_TABLE (prev_break_type)); + + switch (break_type) + { + case G_UNICODE_BREAK_MANDATORY: + case G_UNICODE_BREAK_LINE_FEED: + case G_UNICODE_BREAK_CARRIAGE_RETURN: + case G_UNICODE_BREAK_SPACE: + /* These types all "pile up" at the end of lines and + * get elided. + */ + break_op = BREAK_PROHIBITED; + break; + + case G_UNICODE_BREAK_CONTINGENT: + /* break before 0xFFFC by default, eventually + * make this configurable? + */ + break_op = BREAK_ALLOWED; + break; + + case G_UNICODE_BREAK_AMBIGUOUS: + /* FIXME resolve East Asian width to figure out what to do */ + case G_UNICODE_BREAK_COMPLEX_CONTEXT: + /* FIXME language engine analysis */ + case G_UNICODE_BREAK_UNKNOWN: + case G_UNICODE_BREAK_ALPHABETIC: + /* treat all of the above as alphabetic for now */ + break_op = BREAK_OP (prev_break_type, G_UNICODE_BREAK_ALPHABETIC); + break; + + case G_UNICODE_BREAK_SURROGATE: + /* FIXME this case needs to be handled + */ + break_op = BREAK_IF_SPACES; /* not right at all */ + break; + + default: + g_assert (IN_BREAK_TABLE (prev_break_type)); + g_assert (IN_BREAK_TABLE (break_type)); + break_op = BREAK_OP (prev_break_type, break_type); + break; + } + break; + } + + if (break_op != BREAK_ALREADY_HANDLED) + { + switch (break_op) + { + case BREAK_PROHIBITED: + /* nothing, can't break here */ + break; + + case BREAK_IF_SPACES: + /* break if prev char was space */ + if (prev_was_break_space) + attrs[i].is_break = TRUE; + break; + + case BREAK_ALLOWED: + attrs[i].is_break = TRUE; + break; + + default: + g_assert_not_reached (); + break; + } + } + } + + if (break_type != G_UNICODE_BREAK_SPACE) + { + prev_break_type = break_type; + prev_was_break_space = FALSE; + } + else + prev_was_break_space = TRUE; + + /* ---- Word breaks ---- */ + + /* default to not a word start/end */ + attrs[i].is_word_start = FALSE; + attrs[i].is_word_end = FALSE; + + if (current_word_type != WordNone) + { + /* Check for a word end */ + switch (type) + { + case G_UNICODE_COMBINING_MARK: + case G_UNICODE_ENCLOSING_MARK: + case G_UNICODE_NON_SPACING_MARK: + /* nothing, we just eat these up as part of the word */ + break; + + case G_UNICODE_LOWERCASE_LETTER: + case G_UNICODE_MODIFIER_LETTER: + case G_UNICODE_OTHER_LETTER: + case G_UNICODE_TITLECASE_LETTER: + case G_UNICODE_UPPERCASE_LETTER: + if (current_word_type == WordLetters) + { + /* Japanese special cases for ending the word */ + if (JAPANESE (last_word_letter) || + JAPANESE (wc)) + { + if ((HIRAGANA (last_word_letter) && + !HIRAGANA (wc)) || + (KATAKANA (last_word_letter) && + !(KATAKANA (wc) || HIRAGANA (wc))) || + (KANJI (last_word_letter) && + !(HIRAGANA (wc) || KANJI (wc))) || + (JAPANESE (last_word_letter) && + !JAPANESE (wc)) || + (!JAPANESE (last_word_letter) && + JAPANESE (wc))) + attrs[i].is_word_end = TRUE; + } + } + else + { + /* end the number word, start the letter word */ + attrs[i].is_word_end = TRUE; + attrs[i].is_word_start = TRUE; + current_word_type = WordLetters; + } + + last_word_letter = wc; + break; + + case G_UNICODE_DECIMAL_NUMBER: + case G_UNICODE_LETTER_NUMBER: + case G_UNICODE_OTHER_NUMBER: + if (current_word_type != WordNumbers) + { + attrs[i].is_word_end = TRUE; + attrs[i].is_word_start = TRUE; + current_word_type = WordNumbers; + } + + last_word_letter = wc; + break; + + default: + /* Punctuation, control/format chars, etc. all end a word. */ + attrs[i].is_word_end = TRUE; + break; + } + + if (attrs[i].is_word_end) + current_word_type = WordNone; + } + else + { + /* Check for a word start */ + switch (type) + { + case G_UNICODE_LOWERCASE_LETTER: + case G_UNICODE_MODIFIER_LETTER: + case G_UNICODE_OTHER_LETTER: + case G_UNICODE_TITLECASE_LETTER: + case G_UNICODE_UPPERCASE_LETTER: + current_word_type = WordLetters; + last_word_letter = wc; + attrs[i].is_word_start = TRUE; + break; + + case G_UNICODE_DECIMAL_NUMBER: + case G_UNICODE_LETTER_NUMBER: + case G_UNICODE_OTHER_NUMBER: + current_word_type = WordNumbers; + last_word_letter = wc; + attrs[i].is_word_start = TRUE; + break; + + default: + /* No word here */ + break; + } + } + + /* ---- Sentence breaks ---- */ + + /* The Unicode spec specifies sentence breakpoints, so that a piece of + * text would be partitioned into sentences, and all characters would + * be inside some sentence. This code implements that for is_sentence_boundary, + * but tries to keep leading/trailing whitespace out of sentences for + * the start/end flags + */ + + /* The Unicode spec seems to say that one trailing line/para + * separator can be tacked on to a sentence ending in ! or ?, + * but not a sentence ending in period; I think they're on crack + * so am allowing one to be tacked onto a sentence ending in period. + */ + + /* No sentence break at the start of the text */ + + /* default to not a sentence breakpoint */ + attrs[i].is_sentence_boundary = FALSE; + attrs[i].is_sentence_start = FALSE; + attrs[i].is_sentence_end = FALSE; + + /* FIXME the Unicode spec lumps control/format chars with + * line/para separators in descriptive text, but not in the + * character class specs, in table 5-6, so who knows whether you + * are actually supposed to break on control/format + * characters. Seems semi-broken to break on tabs... + */ + + /* Break after line/para separators except carriage return + * followed by newline + */ + switch (prev_type) + { + case G_UNICODE_LINE_SEPARATOR: + case G_UNICODE_PARAGRAPH_SEPARATOR: + case G_UNICODE_CONTROL: + case G_UNICODE_FORMAT: + if (wc == '\r') + { + if (next_wc != '\n') + attrs[i].is_sentence_boundary = TRUE; + } + else + attrs[i].is_sentence_boundary = TRUE; + break; + + default: + break; + } + + /* break before para/line separators except newline following + * carriage return + */ + switch (type) + { + case G_UNICODE_LINE_SEPARATOR: + case G_UNICODE_PARAGRAPH_SEPARATOR: + case G_UNICODE_CONTROL: + case G_UNICODE_FORMAT: + if (wc == '\n') + { + if (prev_wc != '\r') + attrs[i].is_sentence_boundary = TRUE; + } + else + attrs[i].is_sentence_boundary = TRUE; + break; + + default: + break; + } + + switch (sentence_state) + { + case STATE_SENTENCE_OUTSIDE: + /* Start sentence if we have non-whitespace/format/control */ + switch (type) + { + case G_UNICODE_LINE_SEPARATOR: + case G_UNICODE_PARAGRAPH_SEPARATOR: + case G_UNICODE_CONTROL: + case G_UNICODE_FORMAT: + case G_UNICODE_SPACE_SEPARATOR: + break; + + default: + attrs[i].is_sentence_start = TRUE; + sentence_state = STATE_SENTENCE_BODY; + break; + } + break; + + case STATE_SENTENCE_BODY: + /* If we already broke here due to separators, end the sentence. */ + if (attrs[i].is_sentence_boundary) + { + attrs[i].is_sentence_end = TRUE; + sentence_state = STATE_SENTENCE_OUTSIDE; + } + else + { + if (wc == '.') + sentence_state = STATE_SENTENCE_DOT; + else if (wc == '?' || wc == '!') + sentence_state = STATE_SENTENCE_TERM; + } + break; + + case STATE_SENTENCE_TERM: + /* End sentence on anything but close punctuation and some + * loosely-specified OTHER_PUNCTUATION such as period, + * comma, etc.; follow Unicode rules for breaks + */ + switch (type) + { + case G_UNICODE_OTHER_PUNCTUATION: + case G_UNICODE_CLOSE_PUNCTUATION: + if (type == G_UNICODE_CLOSE_PUNCTUATION || + wc == '.' || + wc == ',' || + wc == '?' || + wc == '!') + sentence_state = STATE_SENTENCE_POST_TERM_CLOSE; + else + { + attrs[i].is_sentence_end = TRUE; + attrs[i].is_sentence_boundary = TRUE; + sentence_state = STATE_SENTENCE_OUTSIDE; + } + break; + + case G_UNICODE_SPACE_SEPARATOR: + attrs[i].is_sentence_end = TRUE; + sentence_state = STATE_SENTENCE_POST_TERM_SPACE; + break; + + case G_UNICODE_LINE_SEPARATOR: + case G_UNICODE_PARAGRAPH_SEPARATOR: + attrs[i].is_sentence_end = TRUE; + sentence_state = STATE_SENTENCE_POST_TERM_SEP; + break; + + default: + attrs[i].is_sentence_end = TRUE; + attrs[i].is_sentence_boundary = TRUE; + sentence_state = STATE_SENTENCE_OUTSIDE; + break; + } + break; + + case STATE_SENTENCE_POST_TERM_CLOSE: + /* End sentence on anything besides more punctuation; follow + * rules for breaks + */ + switch (type) + { + case G_UNICODE_OTHER_PUNCTUATION: + case G_UNICODE_CLOSE_PUNCTUATION: + if (type == G_UNICODE_CLOSE_PUNCTUATION || + wc == '.' || + wc == ',' || + wc == '?' || + wc == '!') + /* continue in this state */ + ; + else + { + attrs[i].is_sentence_end = TRUE; + attrs[i].is_sentence_boundary = TRUE; + sentence_state = STATE_SENTENCE_OUTSIDE; + } + break; + + case G_UNICODE_SPACE_SEPARATOR: + attrs[i].is_sentence_end = TRUE; + sentence_state = STATE_SENTENCE_POST_TERM_SPACE; + break; + + case G_UNICODE_LINE_SEPARATOR: + case G_UNICODE_PARAGRAPH_SEPARATOR: + attrs[i].is_sentence_end = TRUE; + /* undo the unconditional break-at-all-line/para-separators + * from above; I'm not sure this is what the Unicode spec + * intends, but it seems right - we get to include + * a single line/para separator in the sentence according + * to their rules + */ + attrs[i].is_sentence_boundary = FALSE; + sentence_state = STATE_SENTENCE_POST_TERM_SEP; + break; + + default: + attrs[i].is_sentence_end = TRUE; + attrs[i].is_sentence_boundary = TRUE; + sentence_state = STATE_SENTENCE_OUTSIDE; + break; + } + break; + + case STATE_SENTENCE_POST_TERM_SPACE: + + /* Sentence is definitely already ended; to enter this state + * we had to see a space, which ends the sentence. + */ + + switch (type) + { + case G_UNICODE_SPACE_SEPARATOR: + /* continue in this state */ + break; + + case G_UNICODE_LINE_SEPARATOR: + case G_UNICODE_PARAGRAPH_SEPARATOR: + /* undo the unconditional break-at-all-line/para-separators + * from above; I'm not sure this is what the Unicode spec + * intends, but it seems right + */ + attrs[i].is_sentence_boundary = FALSE; + sentence_state = STATE_SENTENCE_POST_TERM_SEP; + break; + + default: + attrs[i].is_sentence_boundary = TRUE; + sentence_state = STATE_SENTENCE_OUTSIDE; + break; + } + break; + + case STATE_SENTENCE_POST_TERM_SEP: + /* Break is forced at this point, unless we're a newline + * after a CR, then we will break after the newline on the + * next iteration. Only a single Sep can be in the + * sentence. + */ + if (!(prev_wc == '\r' && wc == '\n')) + attrs[i].is_sentence_boundary = TRUE; + sentence_state = STATE_SENTENCE_OUTSIDE; + break; + + case STATE_SENTENCE_DOT: + switch (type) + { + case G_UNICODE_CLOSE_PUNCTUATION: + sentence_state = STATE_SENTENCE_POST_DOT_CLOSE; + break; + + case G_UNICODE_SPACE_SEPARATOR: + possible_sentence_end = i; + sentence_state = STATE_SENTENCE_POST_DOT_SPACE; + break; + + default: + /* If we broke on a control/format char, end the + * sentence; else this was not a sentence end, since + * we didn't enter the POST_DOT_SPACE state. + */ + if (attrs[i].is_sentence_boundary) + { + attrs[i].is_sentence_end = TRUE; + + sentence_state = STATE_SENTENCE_OUTSIDE; + } + else + sentence_state = STATE_SENTENCE_BODY; + break; + } + break; + + case STATE_SENTENCE_POST_DOT_CLOSE: + switch (type) + { + case G_UNICODE_SPACE_SEPARATOR: + possible_sentence_end = i; + sentence_state = STATE_SENTENCE_POST_DOT_SPACE; + break; + + default: + /* If we broke on a control/format char, end the + * sentence; else this was not a sentence end, since + * we didn't enter the POST_DOT_SPACE state. + */ + if (attrs[i].is_sentence_boundary) + { + attrs[i].is_sentence_end = TRUE; + + sentence_state = STATE_SENTENCE_OUTSIDE; + } + else + sentence_state = STATE_SENTENCE_BODY; + break; + } + break; + + case STATE_SENTENCE_POST_DOT_SPACE: + + possible_sentence_boundary = i; + + switch (type) + { + case G_UNICODE_SPACE_SEPARATOR: + /* remain in current state */ + break; + + case G_UNICODE_OPEN_PUNCTUATION: + sentence_state = STATE_SENTENCE_POST_DOT_OPEN; + break; + + case G_UNICODE_LOWERCASE_LETTER: + /* wasn't a sentence-ending period; so re-enter the sentence + * body + */ + sentence_state = STATE_SENTENCE_BODY; + break; + + default: + /* End the sentence, break, maybe start a new one */ + + g_assert (possible_sentence_end >= 0); + g_assert (possible_sentence_boundary >= 0); + + attrs[possible_sentence_boundary].is_sentence_boundary = TRUE; + attrs[possible_sentence_end].is_sentence_end = TRUE; + + possible_sentence_end = -1; + possible_sentence_boundary = -1; + + switch (type) + { + case G_UNICODE_LINE_SEPARATOR: + case G_UNICODE_PARAGRAPH_SEPARATOR: + case G_UNICODE_CONTROL: + case G_UNICODE_FORMAT: + sentence_state = STATE_SENTENCE_OUTSIDE; + break; + + default: + g_assert (type != G_UNICODE_SPACE_SEPARATOR); + sentence_state = STATE_SENTENCE_BODY; + attrs[i].is_sentence_start = TRUE; + break; + } + break; + } + break; + + case STATE_SENTENCE_POST_DOT_OPEN: + switch (type) + { + case G_UNICODE_OPEN_PUNCTUATION: + /* continue in current state */ + break; + + case G_UNICODE_LOWERCASE_LETTER: + /* wasn't a sentence-ending period; so re-enter the sentence + * body + */ + sentence_state = STATE_SENTENCE_BODY; + break; + + default: + /* End the sentence, break, maybe start a new one */ + + g_assert (possible_sentence_end >= 0); + g_assert (possible_sentence_boundary >= 0); + + attrs[possible_sentence_boundary].is_sentence_boundary = TRUE; + attrs[possible_sentence_end].is_sentence_end = TRUE; + + possible_sentence_end = -1; + possible_sentence_boundary = -1; + + switch (type) + { + case G_UNICODE_LINE_SEPARATOR: + case G_UNICODE_PARAGRAPH_SEPARATOR: + case G_UNICODE_CONTROL: + case G_UNICODE_FORMAT: + sentence_state = STATE_SENTENCE_OUTSIDE; + break; + + default: + g_assert (type != G_UNICODE_SPACE_SEPARATOR); + sentence_state = STATE_SENTENCE_BODY; + attrs[i].is_sentence_start = TRUE; + break; + } + break; + } + break; + + case STATE_SENTENCE_POST_DOT_SEP: + /* Break is forced at this point, unless we're a newline + * after a CR, then we will break after the newline on the + * next iteration. Only a single Sep can be in the + * sentence. + */ + if (!(prev_wc == '\r' && wc == '\n')) + attrs[i].is_sentence_boundary = TRUE; + sentence_state = STATE_SENTENCE_OUTSIDE; + + g_assert (possible_sentence_end >= 0); + g_assert (possible_sentence_boundary >= 0); + + attrs[possible_sentence_end].is_sentence_end = TRUE; + + possible_sentence_end = -1; + possible_sentence_boundary = -1; + break; + + default: + g_assert_not_reached (); + break; + } + + prev_type = type; + prev_wc = wc; + ++i; + } +} + /** * pango_break: * @text: the text to process @@ -32,31 +1247,120 @@ * Determines possible line, word, and character breaks * for a string of Unicode text. */ -void pango_break (const gchar *text, - gint length, - PangoAnalysis *analysis, - PangoLogAttr *attrs) +void +pango_break (const gchar *text, + gint length, + PangoAnalysis *analysis, + PangoLogAttr *attrs) { - /* Pseudo-implementation */ + g_return_if_fail (text != NULL); + g_return_if_fail (analysis != NULL); + g_return_if_fail (attrs != NULL); + + if (length < 0) + length = strlen (text); - const gchar *cur = text; - gint i = 0; - gunichar wc; + if (analysis->lang_engine && + analysis->lang_engine->script_break) + (* analysis->lang_engine->script_break) (text, length, analysis, attrs); + else + pango_default_break (text, length, analysis, attrs); +} + +/** + * pango_find_paragraph_boundary: + * @text: UTF-8 text + * @length: length of @text in bytes, or -1 if nul-terminated + * @paragraph_delimiter_index: return location for index of delimiter + * @next_paragraph_start: return location for start of next paragraph + * + * Locates a paragraph boundary in @text. A boundary is caused by + * delimiter characters, such as a newline, carriage return, carriage + * return-newline pair, or Unicode paragraph separator character. The + * index of the run of delimiters is returned in + * @paragraph_delimiter_index. The index of the start of the paragraph + * (index after all delimiters) is stored in @paragraph_start. + * + * If no delimiters are found, both @paragraph_delimiter_index and + * @next_paragraph_start are filled with the length of @text (an index one + * off the end). + **/ +void +pango_find_paragraph_boundary (const gchar *text, + gint length, + gint *paragraph_delimiter_index, + gint *next_paragraph_start) +{ + const gchar *p = text; + const gchar *end; + const gchar *start = NULL; + const gchar *delimiter = NULL; + gunichar prev_wc; + + /* Only one character has type G_UNICODE_PARAGRAPH_SEPARATOR in + * Unicode 3.0; update this if that changes. + */ +#define PARAGRAPH_SEPARATOR 0x2029 + + if (length < 0) + length = strlen (text); + + end = text + length; + + if (paragraph_delimiter_index) + *paragraph_delimiter_index = length; + + if (next_paragraph_start) + *next_paragraph_start = length; + + if (length == 0) + return; + + /* FIXME there's plenty of room to optimize this; e.g. there's + * no real need to g_utf8_get_char() on every char + */ - while (*cur && cur - text < length) + prev_wc = 0; + + while (p != end) { - wc = g_utf8_get_char (cur); - if (wc == (gunichar)-1) - break; /* FIXME: ERROR */ - - attrs[i].is_white = (wc == ' ' || wc == '\t' || wc == '\n' || wc == 0x200b) ? 1 : 0; - attrs[i].is_break = i == 0 || attrs[i-1].is_white || attrs[i].is_white; - attrs[i].is_char_stop = 1; - attrs[i].is_word_stop = ((i == 0) || attrs[i-1].is_white) && !attrs[i].is_white; + gunichar wc; + + wc = g_utf8_get_char (p); + + if (prev_wc == '\n' || + prev_wc == PARAGRAPH_SEPARATOR) + { + g_assert (delimiter); + start = p; + break; + } + else if (prev_wc == '\r') + { + /* don't break between \r and \n */ + if (wc != '\n') + { + g_assert (delimiter); + start = p; + break; + } + } - i++; - cur = g_utf8_next_char (cur); + if ((wc == '\n' || + wc == '\r' || + wc == PARAGRAPH_SEPARATOR) && + delimiter == NULL) + delimiter = p; + + prev_wc = wc; + p = g_utf8_next_char (p); } + + if (delimiter && paragraph_delimiter_index) + *paragraph_delimiter_index = delimiter - text; + + if (start && next_paragraph_start) + *next_paragraph_start = start - text; } /** @@ -85,17 +1389,20 @@ pango_get_log_attrs (const char *text, const char *range_start; int chars_in_range; static guint engine_type_id = 0; - static guint render_type_id = 0; + static guint render_type_id = 0; PangoAnalysis analysis = { NULL, NULL, NULL, 0 }; analysis.level = level; - + g_return_if_fail (length == 0 || text != NULL); g_return_if_fail (log_attrs != NULL); - + + if (length < 0) + length = strlen (text); + if (length == 0) return; - + if (engine_type_id == 0) { engine_type_id = g_quark_from_static_string (PANGO_ENGINE_TYPE_LANG); @@ -105,23 +1412,27 @@ pango_get_log_attrs (const char *text, n_chars = g_utf8_strlen (text, length); lang_map = pango_find_map (language, engine_type_id, render_type_id); - + range_start = text; range_engine = (PangoEngineLang*) pango_map_get_engine (lang_map, g_utf8_get_char (text)); analysis.lang_engine = range_engine; chars_broken = 0; chars_in_range = 1; - + end = text + length; pos = g_utf8_next_char (text); - + while (pos != end) { + g_assert (chars_in_range > 0); + g_assert (range_start <= end); + g_assert (end - pos < length); + analysis.lang_engine = (PangoEngineLang*) pango_map_get_engine (lang_map, g_utf8_get_char (pos)); - + if (range_engine != analysis.lang_engine) { /* Engine has changed; do the breaking for the current range, @@ -133,7 +1444,7 @@ pango_get_log_attrs (const char *text, log_attrs + chars_broken); chars_broken += chars_in_range; - + range_start = pos; range_engine = analysis.lang_engine; chars_in_range = 1; @@ -142,15 +1453,15 @@ pango_get_log_attrs (const char *text, { chars_in_range += 1; } - + pos = g_utf8_next_char (pos); } - + g_assert (chars_in_range > 0); g_assert (range_start != end); g_assert (pos == end); g_assert (range_engine == analysis.lang_engine); - + pango_break (range_start, end - range_start, &analysis, diff --git a/pango/pango-context.c b/pango/pango-context.c index 0e9f7146..c8a7d5c1 100644 --- a/pango/pango-context.c +++ b/pango/pango-context.c @@ -510,7 +510,10 @@ pango_context_get_base_dir (PangoContext *context) * @cached_iter: Cached attribute iterator, or NULL * * Breaks a piece of text into segments with consistent - * directional level and shaping engine. + * directional level and shaping engine. Each byte of @text will + * be contained in exactly one of the items in the returned list; + * the generated list of items will be in logical order (the start + * offsets of the items are ascending). * * @cached_iter should be an iterator over @attrs currently positioned at a * range before or containing @start_index; @cached_iter will be advanced to @@ -565,7 +568,7 @@ pango_itemize (PangoContext *context, embedding_levels = g_new (guint8, n_chars); pango_log2vis_get_embedding_levels (text_ucs4, n_chars, &base_dir, - embedding_levels); + embedding_levels); /* Storing these as ranges would be a lot more efficient, * but also more complicated... we take the simple @@ -603,7 +606,11 @@ pango_itemize (PangoContext *context, fonts[i] != fonts[i-1] || extra_attr_lists[i] != extra_attr_lists[i-1]) { - item = g_new (PangoItem, 1); + /* assert that previous item got at least one char */ + g_assert (item == NULL || item->length > 0); + g_assert (item == NULL || item->num_chars > 0); + + item = pango_item_new (); item->offset = p - text; item->num_chars = 0; item->analysis.level = embedding_levels[i]; diff --git a/pango/pango-item.c b/pango/pango-item.c index d3e0dbd9..b13b60e9 100644 --- a/pango/pango-item.c +++ b/pango/pango-item.c @@ -88,3 +88,45 @@ pango_item_free (PangoItem *item) g_free (item); } +/** + * pango_item_split: + * @orig: a #PangoItem + * @split_index: byte index of position to split item, relative to the start of the item + * @split_offset: number of chars between start of @orig and @split_index + * + * Modifies @orig to cover only the text after @split_index, and + * returns a new item that covers the text before @split_index that + * used to be in @orig. You can think of @split_index as the length of + * the returned item. @split_index may not be 0, and it may not be + * greater than or equal to the length of @orig (that is, there must + * be at least one byte assigned to each item, you can't create a + * zero-length item). @split_offset is the length of the first item in + * chars, and must be provided because the text used to generate the + * item isn't available, so pango_item_split() can't count the char + * length of the split items itself. + * + * Return value: new item representing text before @split_index + **/ +PangoItem* +pango_item_split (PangoItem *orig, + int split_index, + int split_offset) +{ + PangoItem *new_item = pango_item_copy (orig); + + g_return_val_if_fail (orig != NULL, NULL); + g_return_val_if_fail (orig->length > 0, NULL); + g_return_val_if_fail (split_index > 0, NULL); + g_return_val_if_fail (split_index < orig->length, NULL); + g_return_val_if_fail (split_offset > 0, NULL); + g_return_val_if_fail (split_offset < orig->num_chars, NULL); + + new_item->length = split_index; + new_item->num_chars = split_offset; + + orig->offset += split_index; + orig->length -= split_index; + orig->num_chars -= split_offset; + + return new_item; +} diff --git a/pango/pango-item.h b/pango/pango-item.h index 407e5004..d4b067eb 100644 --- a/pango/pango-item.h +++ b/pango/pango-item.h @@ -49,9 +49,12 @@ struct _PangoItem PangoAnalysis analysis; }; -PangoItem *pango_item_new (void); -PangoItem *pango_item_copy (PangoItem *item); -void pango_item_free (PangoItem *item); +PangoItem *pango_item_new (void); +PangoItem *pango_item_copy (PangoItem *item); +void pango_item_free (PangoItem *item); +PangoItem *pango_item_split (PangoItem *orig, + int split_index, + int split_offset); #ifdef __cplusplus } diff --git a/pango/pango-layout.c b/pango/pango-layout.c index e0d921ac..3d4f6f8c 100644 --- a/pango/pango-layout.c +++ b/pango/pango-layout.c @@ -914,29 +914,49 @@ pango_layout_index_to_line_x (PangoLayout *layout, int *x_pos) { GSList *tmp_list; - int tmp_line = 0; - int bytes_seen = 0; - + int line_num = 0; + PangoLayoutLine *layout_line = NULL; + g_return_if_fail (layout != NULL); + g_return_if_fail (index >= 0); + g_return_if_fail (index <= layout->length); pango_layout_check_lines (layout); tmp_list = layout->lines; while (tmp_list) { - PangoLayoutLine *layout_line = tmp_list->data; + PangoLayoutLine *tmp_line = tmp_list->data; + + /* use end of previous layout_line if index was in the paragraph + * delimiters + */ + if (layout_line && layout_line->start_index > index) + { + if (line) + *line = line_num; + + pango_layout_line_index_to_x (layout_line, + layout_line->start_index + layout_line->length, + trailing, x_pos); + return; + + } - if (bytes_seen + layout_line->length > index) + layout_line = tmp_line; + ++line_num; + + if ((layout_line->start_index + layout_line->length) > index) { if (line) - *line = tmp_line; - - pango_layout_line_index_to_x (layout_line, index, trailing, x_pos); + *line = line_num; + + pango_layout_line_index_to_x (layout_line, index, + trailing, x_pos); return; } tmp_list = tmp_list->next; - bytes_seen += layout_line->length; } if (line) @@ -978,7 +998,6 @@ pango_layout_move_cursor_visually (PangoLayout *layout, int *new_index, int *new_trailing) { - int bytes_seen = 0; PangoDirection base_dir; PangoLayoutLine *line = NULL; PangoLayoutLine *prev_line = NULL; @@ -1005,14 +1024,18 @@ pango_layout_move_cursor_visually (PangoLayout *layout, tmp_list = layout->lines; while (tmp_list) { - line = tmp_list->data; + PangoLayoutLine *tmp_line = tmp_list->data; - if (bytes_seen + line->length > old_index || !tmp_list->next) - break; + if (line && line->start_index > old_index) + break; /* stick with the previous line */ - tmp_list = tmp_list->next; prev_line = line; - bytes_seen += line->length; + line = tmp_line; + + if (line->start_index + line->length > old_index || !tmp_list->next) + break; + + tmp_list = tmp_list->next; } if (tmp_list->next) @@ -1024,9 +1047,13 @@ pango_layout_move_cursor_visually (PangoLayout *layout, old_index = g_utf8_next_char (layout->text + old_index) - layout->text; log2vis_map = pango_layout_line_get_log2vis_map (line, TRUE); - n_vis = g_utf8_strlen (layout->text + bytes_seen, line->length); + n_vis = g_utf8_strlen (layout->text + line->start_index, line->length); - vis_pos = log2vis_map[old_index - bytes_seen]; + /* Clamp old_index to fit on the line */ + if (old_index > (line->start_index + line->length)) + old_index = line->start_index + line->length; + + vis_pos = log2vis_map[old_index - line->start_index]; g_free (log2vis_map); if (vis_pos == 0 && direction < 0) @@ -1040,7 +1067,6 @@ pango_layout_move_cursor_visually (PangoLayout *layout, return; } line = prev_line; - bytes_seen -= line->length; } else { @@ -1050,11 +1076,10 @@ pango_layout_move_cursor_visually (PangoLayout *layout, *new_trailing = 0; return; } - bytes_seen += line->length; line = next_line; } - vis_pos = g_utf8_strlen (layout->text + bytes_seen, line->length); + vis_pos = g_utf8_strlen (layout->text + line->start_index, line->length); } else if (vis_pos == n_vis && direction > 0) { @@ -1066,7 +1091,6 @@ pango_layout_move_cursor_visually (PangoLayout *layout, *new_trailing = 0; return; } - bytes_seen += line->length; line = next_line; } else @@ -1078,7 +1102,6 @@ pango_layout_move_cursor_visually (PangoLayout *layout, return; } line = prev_line; - bytes_seen -= line->length; } vis_pos = 0; @@ -1087,10 +1110,10 @@ pango_layout_move_cursor_visually (PangoLayout *layout, vis_pos += (direction > 0) ? 1 : -1; vis2log_map = pango_layout_line_get_vis2log_map (line, TRUE); - *new_index = bytes_seen + vis2log_map[vis_pos]; + *new_index = line->start_index + vis2log_map[vis_pos]; g_free (vis2log_map); - if (*new_index == bytes_seen + line->length && line->length > 0) + if (*new_index == line->start_index + line->length && line->length > 0) { *new_index = g_utf8_prev_char (layout->text + *new_index) - layout->text; *new_trailing = 1; @@ -1175,8 +1198,9 @@ pango_layout_index_to_pos (PangoLayout *layout, PangoRectangle *pos) { PangoRectangle logical_rect; - int bytes_seen = 0; PangoLayoutIter *iter; + PangoLayoutLine *layout_line = NULL; + gboolean notfound = FALSE; g_return_if_fail (layout != NULL); g_return_if_fail (index >= 0); @@ -1184,43 +1208,56 @@ pango_layout_index_to_pos (PangoLayout *layout, iter = pango_layout_get_iter (layout); - do + while (TRUE) { - PangoLayoutLine *layout_line = pango_layout_iter_get_line (iter); - - pango_layout_iter_get_line_extents (iter, NULL, &logical_rect); + PangoLayoutLine *tmp_line = pango_layout_iter_get_line (iter); - if (bytes_seen + layout_line->length > index) - { - int x_pos; - - pos->y = logical_rect.y; - pos->height = logical_rect.height; - - pango_layout_line_index_to_x (layout_line, index, FALSE, &x_pos); - pos->x = logical_rect.x + x_pos; - - pango_layout_line_index_to_x (layout_line, index, TRUE, &x_pos); - pos->width = (logical_rect.x + x_pos) - pos->x; + if (layout_line && tmp_line->start_index > index) + { + /* index is in the paragraph delimiters, move to + * end of previous line + */ + index = layout_line->start_index + layout_line->length; + break; + } - pango_layout_iter_free (iter); - - return; - } + layout_line = tmp_line; + + pango_layout_iter_get_line_extents (iter, NULL, &logical_rect); + + if (layout_line->start_index + layout_line->length > index) + break; - bytes_seen += layout_line->length; - if (bytes_seen < layout->length && layout->text[bytes_seen] == '\n') - bytes_seen++; + if (!pango_layout_iter_next_line (iter)) + { + notfound = TRUE; + break; + } } - while (pango_layout_iter_next_line (iter)); - /* Iterator should now be on the "NULL" run at the end of the last - * line, which is a zero-width rectangle. Return the extents of - * that run. - */ + if (notfound) + { + /* Iterator should now be on the "NULL" run at the end of the last + * line, which is a zero-width rectangle. Return the extents of + * that run. + */ + + pango_layout_iter_get_run_extents (iter, NULL, pos); + } + else + { + int x_pos; - pango_layout_iter_get_run_extents (iter, NULL, pos); + pos->y = logical_rect.y; + pos->height = logical_rect.height; + pango_layout_line_index_to_x (layout_line, index, FALSE, &x_pos); + pos->x = logical_rect.x + x_pos; + + pango_layout_line_index_to_x (layout_line, index, TRUE, &x_pos); + pos->width = (logical_rect.x + x_pos) - pos->x; + } + pango_layout_iter_free (iter); } @@ -1409,7 +1446,6 @@ pango_layout_get_cursor_pos (PangoLayout *layout, PangoLayoutLine *layout_line = NULL; /* Quiet GCC */ int x1_trailing; int x2; - int bytes_seen = 0; PangoLayoutIter *iter; g_return_if_fail (layout != NULL); @@ -1420,26 +1456,32 @@ pango_layout_get_cursor_pos (PangoLayout *layout, iter = pango_layout_get_iter (layout); /* Find the line */ - do + while (TRUE) { - layout_line = pango_layout_iter_get_line (iter); - - pango_layout_iter_get_line_extents (iter, NULL, &line_rect); + PangoLayoutLine *tmp_line; - if (bytes_seen + layout_line->length > index) - break; + tmp_line = pango_layout_iter_get_line (iter); - /* Want last line of layout for trailing position */ - if (!pango_layout_iter_at_last_line (iter)) - bytes_seen += layout_line->length; + if (layout_line && layout_line->start_index > index) + break; /* keep previous layout_line and line_rect */ + + layout_line = tmp_line; + pango_layout_iter_get_line_extents (iter, NULL, &line_rect); + + if ((layout_line->start_index + layout_line->length) > index) + break; + + if (!pango_layout_iter_next_line (iter)) + break; /* use end of the last line */ } - while (pango_layout_iter_next_line (iter)); pango_layout_iter_free (iter); iter = NULL; + + g_assert (index >= layout_line->start_index); /* Examine the trailing edge of the character before the cursor */ - if (index == bytes_seen) + if (index == layout_line->start_index) { dir1 = base_dir; if (base_dir == PANGO_DIRECTION_LTR) @@ -1453,9 +1495,9 @@ pango_layout_get_cursor_pos (PangoLayout *layout, dir1 = pango_layout_line_get_char_direction (layout_line, prev_index); pango_layout_line_index_to_x (layout_line, prev_index, TRUE, &x1_trailing); } - + /* Examine the leading edge of the character after the cursor */ - if (index == bytes_seen + layout_line->length) + if (index >= layout_line->start_index + layout_line->length) { dir2 = base_dir; if (base_dir == PANGO_DIRECTION_LTR) @@ -2137,20 +2179,14 @@ static inline gboolean can_break_at (PangoLayout *layout, gint offset) { - /* While a break between a letter and following whitespace is * - * legimate, we disallow it here to avoid lines starting with * - * whitespace. We probably should have a mode where we treat all - * white-space as of fungeable width - appropriate for typography - * but not for editing. + /* We probably should have a mode where we treat all white-space as + * of fungeable width - appropriate for typography but not for + * editing. */ - if (offset == 0) - return FALSE; - else if (offset == layout->n_chars) + if (offset == layout->n_chars) return TRUE; - else - return (layout->log_attrs[offset].is_break && - (layout->log_attrs[offset - 1].is_white || - !layout->log_attrs[offset].is_white)); + else + return layout->log_attrs[offset].is_break; } static inline gboolean @@ -2258,15 +2294,10 @@ process_item (PangoLayout *layout, else { PangoItem *new_item = pango_item_copy (item); - + length = g_utf8_offset_to_pointer (text + item->offset, break_num_chars) - (text + item->offset); - - new_item->length = length; - new_item->num_chars = break_num_chars; - - item->offset += length; - item->length -= length; - item->num_chars -= break_num_chars; + + new_item = pango_item_split (item, length, break_num_chars); if (shape_set) imposed_shape (item->num_chars, &shape_ink, &shape_logical, glyphs); @@ -2294,6 +2325,7 @@ struct _ParaBreakState gboolean first_line; const char *text; gint start_offset; + gint line_start_index; }; static void @@ -2310,7 +2342,8 @@ process_line (PangoLayout *layout, GSList *break_link = NULL; /* Link holding run before break */ line = pango_layout_line_new (layout); - + line->start_index = state->line_start_index; + if (state->first_line) remaining_width = (layout->indent >= 0) ? layout->width - layout->indent : layout->width; else @@ -2393,12 +2426,14 @@ process_line (PangoLayout *layout, pango_layout_line_postprocess (line); layout->lines = g_slist_prepend (layout->lines, line); state->first_line = FALSE; + state->line_start_index += line->length; } static void -get_para_log_attrs (const char *text, - GList *items, - PangoLogAttr *log_attrs) +get_items_log_attrs (const char *text, + GList *items, + PangoLogAttr *log_attrs, + int para_delimiter_len) { int offset = 0; int index = 0; @@ -2415,11 +2450,10 @@ get_para_log_attrs (const char *text, PangoItem *next_item = items->next->data; /* FIXME: Handle language tags */ - if (next_item->analysis.level != tmp_item.analysis.level || - (next_item->analysis.lang_engine != tmp_item.analysis.lang_engine && - (!next_item->analysis.lang_engine || !tmp_item.analysis.lang_engine || + if (next_item->analysis.lang_engine != tmp_item.analysis.lang_engine && + (!next_item->analysis.lang_engine || !tmp_item.analysis.lang_engine || strcmp (next_item->analysis.lang_engine->engine.id, - tmp_item.analysis.lang_engine->engine.id) != 0))) + tmp_item.analysis.lang_engine->engine.id) != 0)) break; else { @@ -2430,6 +2464,10 @@ get_para_log_attrs (const char *text, items = items->next; } + /* Break the paragraph delimiters with the last item */ + if (items->next == NULL) + tmp_item.length += para_delimiter_len; + pango_break (text + index, tmp_item.length, &tmp_item.analysis, log_attrs + offset); offset += tmp_item.num_chars; @@ -2488,21 +2526,33 @@ pango_layout_check_lines (PangoLayout *layout) start_offset = 0; start = layout->text; + do { - int para_chars = 0; - const char *end = start; + int delim_len; + const char *end; + int delimiter_index, next_para_index; ParaBreakState state; - - while (end != layout->text + layout->length && *end != '\n') - { - end = g_utf8_next_char (end); - para_chars++; - } - if (end == layout->text + layout->length) + pango_find_paragraph_boundary (start, + (layout->text + layout->length) - start, + &delimiter_index, + &next_para_index); + + g_assert (next_para_index >= delimiter_index); + + end = start + delimiter_index; + + delim_len = next_para_index - delimiter_index; + + if ((end + delim_len) == (layout->text + layout->length)) done = TRUE; + g_assert (end <= (layout->text + layout->length)); + g_assert (start <= (layout->text + layout->length)); + g_assert (delim_len < 3); + g_assert (delim_len >= 0); + state.items = pango_itemize (layout->context, layout->text, start - layout->text, @@ -2510,34 +2560,35 @@ pango_layout_check_lines (PangoLayout *layout) attrs, iter); - get_para_log_attrs (start, state.items, layout->log_attrs + start_offset); + get_items_log_attrs (start, state.items, + layout->log_attrs + start_offset, + delim_len); if (state.items) { state.first_line = TRUE; state.start_offset = start_offset; state.text = start; + state.line_start_index = state.text - layout->text; while (state.items) - process_line (layout, &state); + process_line (layout, &state); } else - layout->lines = g_slist_prepend (layout->lines, - pango_layout_line_new (layout)); - - start_offset += para_chars; + { + PangoLayoutLine *empty_line; + + empty_line = pango_layout_line_new (layout); + empty_line->start_index = start - layout->text; + + layout->lines = g_slist_prepend (layout->lines, + empty_line); + } if (!done) - { - /* Handle newline */ - layout->log_attrs[start_offset].is_break = TRUE; - layout->log_attrs[start_offset].is_white = TRUE; - layout->log_attrs[start_offset].is_char_stop = TRUE; - layout->log_attrs[start_offset].is_word_stop = TRUE; - start_offset += 1; - - start = end + 1; - } + start_offset += g_utf8_strlen (start, (end - start) + delim_len); + + start = end + delim_len; } while (!done); @@ -3140,6 +3191,8 @@ pango_layout_line_new (PangoLayout *layout) private->line.runs = 0; private->line.length = 0; + /* Note that we leave start_index uninitialized */ + return (PangoLayoutLine *) private; } diff --git a/pango/pango-layout.h b/pango/pango-layout.h index b19ea973..ef87869d 100644 --- a/pango/pango-layout.h +++ b/pango/pango-layout.h @@ -45,7 +45,8 @@ typedef enum { struct _PangoLayoutLine { PangoLayout *layout; - gint length; /* length of line in bytes*/ + gint start_index; /* start of line as byte index into layout->text */ + gint length; /* length of line in bytes */ GSList *runs; }; diff --git a/pango/pango.h b/pango/pango.h index e0652fd0..3cbc0f59 100644 --- a/pango/pango.h +++ b/pango/pango.h @@ -39,14 +39,40 @@ extern "C" { #include <pango/pango-layout.h> #include <pango/pango-types.h> -/* Logical attributes of a character +/* Logical attributes of a character. */ struct _PangoLogAttr { - guint is_break : 1; /* Break in front of character */ - guint is_white : 1; - guint is_char_stop : 1; - guint is_word_stop : 1; + guint is_break : 1; /* Can break line in front of character */ + + guint is_mandatory_break : 1; /* Must break line in front of character */ + + guint is_white : 1; /* Whitespace character */ + + /* cursor can appear in front of character (i.e. this is a grapheme + * boundary, or the first character in the text) + */ + guint is_cursor_position : 1; + + /* Note that in degenerate cases, you could have both start/end set on + * some text, most likely for sentences (e.g. no space after a period, so + * the next sentence starts right away) + */ + + guint is_word_start : 1; /* first character in a word */ + guint is_word_end : 1; /* is first non-word char after a word */ + + /* There are two ways to divide sentences. The first assigns all + * intersentence whitespace/control/format chars to some sentence, + * so all chars are in some sentence; is_sentence_boundary denotes + * the boundaries there. The second way doesn't assign + * between-sentence spaces, etc. to any sentence, so + * is_sentence_start/is_sentence_end mark the boundaries of those + * sentences. + */ + guint is_sentence_boundary : 1; + guint is_sentence_start : 1; /* first character in a sentence */ + guint is_sentence_end : 1; /* first non-sentence char after a sentence */ }; /* Determine information about cluster/word/line breaks in a string @@ -57,6 +83,11 @@ void pango_break (const gchar *text, PangoAnalysis *analysis, PangoLogAttr *attrs); +void pango_find_paragraph_boundary (const gchar *text, + gint length, + gint *paragraph_delimiter_index, + gint *next_paragraph_start); + void pango_get_log_attrs (const char *text, int length, int level, @@ -72,6 +103,16 @@ void pango_shape (const gchar *text, GList *pango_reorder_items (GList *logical_items); +/* This is the default break algorithm, used if no language + * engine overrides it. Normally you should use pango_break() + * instead; this function is mostly useful for chaining up + * from a language engine override. + */ +void pango_default_break (const gchar *text, + gint length, + PangoAnalysis *analysis, + PangoLogAttr *attrs); + #ifdef __cplusplus } #endif /* __cplusplus */ |