summary | refs | log | tree | commit | diff
path: root/pango
diff options
context:
space:
mode:
Diffstat (limited to 'pango')
-rw-r--r-- pango/break.c 1373
-rw-r--r-- pango/pango-context.c 13
-rw-r--r-- pango/pango-item.c 42
-rw-r--r-- pango/pango-item.h 9
-rw-r--r-- pango/pango-layout.c 295
-rw-r--r-- pango/pango-layout.h 3
-rw-r--r-- pango/pango.h 51
7 files changed, 1622 insertions, 164 deletions
diff --git a/pango/break.c b/pango/break.c
index 3dc0465b..8e63415b 100644
--- a/pango/break.c
+++ b/pango/break.c
@@ -22,6 +22,1221 @@
#include "pango.h"
#include "pango-modules.h"
+/* See http://www.unicode.org/unicode/reports/tr14/ if you hope
+ * to understand the line breaking code.
+ */
+
+typedef enum /* Result of classifying a potential line break between two chars */
+{
+ BREAK_ALREADY_HANDLED, /* pair table was not consulted; is_break was
+ * decided directly by pango_default_break() */
+ BREAK_PROHIBITED, /* no break, even if spaces intervene */
+ BREAK_IF_SPACES, /* "indirect break": break only if the preceding
+ * char was a break-type space */
+ BREAK_ALLOWED /* "direct break" (can always break here) */
+} BreakOpportunity;
+
+enum /* INDEX_*: compact numbering of line-break classes used by the pair table */
+{
+ INDEX_OPEN_PUNCTUATION,
+ INDEX_CLOSE_PUNCTUATION,
+ INDEX_QUOTATION,
+ INDEX_NON_BREAKING_GLUE,
+ INDEX_NON_STARTER,
+ INDEX_EXCLAMATION,
+ INDEX_SYMBOL,
+ INDEX_INFIX_SEPARATOR,
+ INDEX_PREFIX,
+ INDEX_POSTFIX,
+ INDEX_NUMERIC,
+ INDEX_ALPHABETIC,
+ INDEX_IDEOGRAPHIC,
+ INDEX_INSEPARABLE,
+ INDEX_HYPHEN,
+ INDEX_AFTER,
+ INDEX_BEFORE,
+ INDEX_BEFORE_AND_AFTER,
+ INDEX_ZERO_WIDTH_SPACE,
+ INDEX_COMBINING_MARK,
+
+ /* End of the table: the values above select the rows and columns of
+ * the pair table. This value is their count; it is used as the array
+ * dimension of every row and as the limit tested by IN_BREAK_TABLE().
+ */
+ INDEX_END_OF_TABLE,
+
+ /* The following are not in the tables. IN_BREAK_TABLE() is false for
+ * them, so they must not be used to index a row or the row array.
+ */
+ INDEX_MANDATORY,
+ INDEX_CARRIAGE_RETURN,
+ INDEX_LINE_FEED,
+ INDEX_SURROGATE,
+ INDEX_CONTINGENT,
+ INDEX_SPACE,
+ INDEX_COMPLEX_CONTEXT,
+ INDEX_AMBIGUOUS,
+ INDEX_UNKNOWN
+};
+
+/* Pair table for line breaking.
+ *
+ * There is one row per break class of the character *before* a
+ * potential break (the row_* arrays, collected in line_break_rows), and
+ * within a row one entry per break class of the character *after* it.
+ * Both rows and columns follow the order of the INDEX_* enum up to
+ * INDEX_END_OF_TABLE; BREAK_OP() performs the lookup.
+ *
+ * The tables are only ever read, so they are declared const.
+ */
+static const BreakOpportunity row_OPEN_PUNCTUATION[INDEX_END_OF_TABLE] = {
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static const BreakOpportunity row_CLOSE_PUNCTUATION[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static const BreakOpportunity row_QUOTATION[INDEX_END_OF_TABLE] = {
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static const BreakOpportunity row_NON_BREAKING_GLUE[INDEX_END_OF_TABLE] = {
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static const BreakOpportunity row_NON_STARTER[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static const BreakOpportunity row_EXCLAMATION[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static const BreakOpportunity row_SYMBOL[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static const BreakOpportunity row_INFIX_SEPARATOR[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static const BreakOpportunity row_PREFIX[INDEX_END_OF_TABLE] = {
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static const BreakOpportunity row_POSTFIX[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static const BreakOpportunity row_NUMERIC[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static const BreakOpportunity row_ALPHABETIC[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static const BreakOpportunity row_IDEOGRAPHIC[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static const BreakOpportunity row_INSEPARABLE[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static const BreakOpportunity row_HYPHEN[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static const BreakOpportunity row_AFTER[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static const BreakOpportunity row_BEFORE[INDEX_END_OF_TABLE] = {
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static const BreakOpportunity row_BEFORE_AND_AFTER[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static const BreakOpportunity row_ZERO_WIDTH_SPACE[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static const BreakOpportunity row_COMBINING_MARK[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+/* Rows of the pair table, indexed by the INDEX_* value of the character
+ * before the potential break.
+ */
+static const BreakOpportunity * const line_break_rows[INDEX_END_OF_TABLE] = {
+ row_OPEN_PUNCTUATION, /* INDEX_OPEN_PUNCTUATION */
+ row_CLOSE_PUNCTUATION, /* INDEX_CLOSE_PUNCTUATION */
+ row_QUOTATION, /* INDEX_QUOTATION */
+ row_NON_BREAKING_GLUE, /* INDEX_NON_BREAKING_GLUE */
+ row_NON_STARTER, /* INDEX_NON_STARTER */
+ row_EXCLAMATION, /* INDEX_EXCLAMATION */
+ row_SYMBOL, /* INDEX_SYMBOL */
+ row_INFIX_SEPARATOR, /* INDEX_INFIX_SEPARATOR */
+ row_PREFIX, /* INDEX_PREFIX */
+ row_POSTFIX, /* INDEX_POSTFIX */
+ row_NUMERIC, /* INDEX_NUMERIC */
+ row_ALPHABETIC, /* INDEX_ALPHABETIC */
+ row_IDEOGRAPHIC, /* INDEX_IDEOGRAPHIC */
+ row_INSEPARABLE, /* INDEX_INSEPARABLE */
+ row_HYPHEN, /* INDEX_HYPHEN */
+ row_AFTER, /* INDEX_AFTER */
+ row_BEFORE, /* INDEX_BEFORE */
+ row_BEFORE_AND_AFTER, /* INDEX_BEFORE_AND_AFTER */
+ row_ZERO_WIDTH_SPACE, /* INDEX_ZERO_WIDTH_SPACE */
+ row_COMBINING_MARK /* INDEX_COMBINING_MARK */
+};
+
+/* Map GUnicodeBreakType to table indexes.
+ *
+ * The array is indexed directly by the GUnicodeBreakType value (see
+ * BREAK_INDEX()), with no bounds check, so it needs one entry for every
+ * value g_unichar_break_type() can return, in the enum's numeric order.
+ * NOTE(review): the order below presumably mirrors GLib's enum -- verify
+ * whenever GLib adds break types.
+ *
+ * The map is read-only, hence const.
+ */
+static const int line_break_indexes[] = {
+ INDEX_MANDATORY,
+ INDEX_CARRIAGE_RETURN,
+ INDEX_LINE_FEED,
+ INDEX_COMBINING_MARK,
+ INDEX_SURROGATE,
+ INDEX_ZERO_WIDTH_SPACE,
+ INDEX_INSEPARABLE,
+ INDEX_NON_BREAKING_GLUE,
+ INDEX_CONTINGENT,
+ INDEX_SPACE,
+ INDEX_AFTER,
+ INDEX_BEFORE,
+ INDEX_BEFORE_AND_AFTER,
+ INDEX_HYPHEN,
+ INDEX_NON_STARTER,
+ INDEX_OPEN_PUNCTUATION,
+ INDEX_CLOSE_PUNCTUATION,
+ INDEX_QUOTATION,
+ INDEX_EXCLAMATION,
+ INDEX_IDEOGRAPHIC,
+ INDEX_NUMERIC,
+ INDEX_INFIX_SEPARATOR,
+ INDEX_SYMBOL,
+ INDEX_ALPHABETIC,
+ INDEX_PREFIX,
+ INDEX_POSTFIX,
+ INDEX_COMPLEX_CONTEXT,
+ INDEX_AMBIGUOUS,
+ INDEX_UNKNOWN
+};
+
+#define BREAK_INDEX(btype) \
+ (line_break_indexes[(btype)]) /* break type -> INDEX_* value; btype is not range-checked */
+#define BREAK_ROW(before_type) \
+ (line_break_rows[BREAK_INDEX (before_type)]) /* row for the char before the break; only valid if IN_BREAK_TABLE (before_type) */
+#define BREAK_OP(before_type, after_type) \
+ (BREAK_ROW (before_type)[BREAK_INDEX (after_type)]) /* BreakOpportunity for the pair; both types must satisfy IN_BREAK_TABLE() */
+#define IN_BREAK_TABLE(btype) \
+ (BREAK_INDEX(btype) < INDEX_END_OF_TABLE) /* true if the break type has a row/column in the pair table */
+
+/* Keep these in sync with the same macros in the test program.
+ *
+ * Each macro evaluates its argument more than once, so only pass
+ * expressions without side effects.
+ */
+
+#define LEADING_JAMO(wc) ((wc) >= 0x1100 && (wc) <= 0x115F)
+#define VOWEL_JAMO(wc) ((wc) >= 0x1160 && (wc) <= 0x11A2)
+#define TRAILING_JAMO(wc) ((wc) >= 0x11A8 && (wc) <= 0x11F9)
+#define JAMO(wc) ((wc) >= 0x1100 && (wc) <= 0x11FF) /* whole range, including code points that match none of the three macros above */
+/* "virama script" is just an optimization; it includes a bunch of
+ * scripts without viramas in them. It is a cheap range test applied to
+ * the current letter before VIRAMA() is evaluated on the previous
+ * character.
+ */
+#define VIRAMA_SCRIPT(wc) ((wc) >= 0x0901 && (wc) <= 0x17FF)
+#define VIRAMA(wc) ((wc) == 0x094D || \
+ (wc) == 0x09CD || \
+ (wc) == 0x0A4D || \
+ (wc) == 0x0ACD || \
+ (wc) == 0x0B4D || \
+ (wc) == 0x0BCD || \
+ (wc) == 0x0C4D || \
+ (wc) == 0x0CCD || \
+ (wc) == 0x0D4D || \
+ (wc) == 0x0DCA || \
+ (wc) == 0x0E3A || \
+ (wc) == 0x0F84 || \
+ (wc) == 0x1039 || \
+ (wc) == 0x17D2)
+/* Types of Japanese characters, used by the word-break rules in
+ * pango_default_break().
+ * NOTE(review): 0x2F00-0x2FDF is the Kangxi Radicals block, so KANJI()
+ * and JAPANESE() do not match CJK unified ideographs (0x4E00 and up) --
+ * confirm whether that is intended.
+ */
+#define JAPANESE(wc) ((wc) >= 0x2F00 && (wc) <= 0x30FF)
+#define KANJI(wc) ((wc) >= 0x2F00 && (wc) <= 0x2FDF)
+#define HIRAGANA(wc) ((wc) >= 0x3040 && (wc) <= 0x309F)
+#define KATAKANA(wc) ((wc) >= 0x30A0 && (wc) <= 0x30FF)
+
+
+/* p. 132-133 of Unicode spec table 5-6 will help understand this.
+ * States of the sentence-boundary state machine that
+ * pango_default_break() advances once per character.
+ */
+typedef enum
+{
+ STATE_SENTENCE_OUTSIDE, /* not in a sentence; the next char that is not a space, separator, control or format char starts one */
+ STATE_SENTENCE_BODY, /* inside a sentence, no terminator pending */
+ STATE_SENTENCE_TERM, /* previous char was '?' or '!' */
+ STATE_SENTENCE_POST_TERM_CLOSE, /* close punctuation (or . , ? !) following a '?'/'!' terminator */
+ STATE_SENTENCE_POST_TERM_SPACE, /* space(s) following the above; the sentence has already ended */
+ STATE_SENTENCE_POST_TERM_SEP, /* one line/para separator following the above */
+ STATE_SENTENCE_DOT, /* previous char was '.'; not yet known whether it ends the sentence */
+ STATE_SENTENCE_POST_DOT_CLOSE, /* close punctuation following the '.' */
+ STATE_SENTENCE_POST_DOT_SPACE, /* space(s) following the '.'; sentence end is still tentative */
+ STATE_SENTENCE_POST_DOT_OPEN, /* open punctuation following those spaces */
+ /* never include line/para separators in a sentence for now */
+ /* This isn't in the spec, but I can't figure out why they'd include
+ * one line/para separator in lines ending with Term but not with
+ * period-terminated lines, so I'm doing it for the dot lines also
+ *
+ * NOTE(review): pango_default_break() has a case label for the state
+ * below, but no statement in it assigns this state.
+ */
+ STATE_SENTENCE_POST_DOT_SEP
+} SentenceState;
+
+/* We call "123" and "foobar" words, but "123foo" is two words;
+ * the Unicode spec just calls "123" a non-word.
+ * This is the kind of word pango_default_break() is currently inside.
+ */
+typedef enum
+{
+ WordNone, /* not inside a word */
+ WordLetters, /* inside a word made of letters */
+ WordNumbers /* inside a word made of number characters */
+} WordType;
+
+
+/**
+ * pango_default_break:
+ * @text: text to break
+ * @length: length of text in bytes
+ * @analysis: a #PangoAnalysis for the text
+ * @attrs: logical attributes to fill in
+ *
+ * This is the default break algorithm, used if no language
+ * engine overrides it. Normally you should use pango_break()
+ * instead; this function is mostly useful for chaining up
+ * from a language engine override. Unlike pango_break(),
+ * @analysis can be NULL, but only do that if you know what
+ * you're doing. (If you need an analysis to pass to pango_break(),
+ * you need to pango_itemize() or use pango_get_log_attrs().)
+ *
+ * One #PangoLogAttr is written for each character of @text, so
+ * @attrs must have at least that many elements. @analysis is not
+ * read by this implementation. Unlike pango_break(), a negative
+ * @length is not replaced by the length of the string. If @length
+ * is 0 the function returns without touching @attrs. @text must
+ * not contain nul characters within @length bytes (this is asserted).
+ *
+ **/
+void
+pango_default_break (const gchar *text,
+ gint length,
+ PangoAnalysis *analysis,
+ PangoLogAttr *attrs)
+{
+ /* The rationale for all this is in section 5.15 of the Unicode 3.0 book */
+
+ /* This is a default break implementation that should work for nearly all
+ * languages. Language engines can override it optionally.
+ */
+
+ /* FIXME one cheesy optimization here would be to memset attrs to 0
+ * before we start, and then never assign FALSE to anything
+ */
+
+ const gchar *next; /* start of the character after the current one */
+ const gchar *end; /* one past the last byte of the text */
+ gint i = 0; /* index of the current character in attrs[] */
+ gunichar prev_wc;
+ gunichar next_wc; /* 0 once the end of the text is reached */
+ GUnicodeType prev_type;
+ GUnicodeBreakType prev_break_type; /* skips spaces */
+ gboolean prev_was_break_space;
+ WordType current_word_type = WordNone;
+ gunichar last_word_letter = 0;
+ SentenceState sentence_state = STATE_SENTENCE_OUTSIDE;
+ /* Tracks what will be the end of the sentence if a period is
+ * determined to actually be a sentence-ending period.
+ */
+ gint possible_sentence_end = -1;
+ /* possible sentence break before Open* after a period-ended sentence */
+ gint possible_sentence_boundary = -1;
+
+ g_return_if_fail (text != NULL);
+ g_return_if_fail (attrs != NULL);
+
+ /* Only do pointer arithmetic on text once it is known to be non-NULL */
+ next = text;
+ end = text + length;
+
+ if (next == end)
+ return;
+
+ prev_type = (GUnicodeType) -1;
+ prev_break_type = G_UNICODE_BREAK_UNKNOWN;
+ prev_was_break_space = FALSE;
+ prev_wc = 0;
+
+ next_wc = g_utf8_get_char (next);
+
+ g_assert (next_wc != 0);
+
+ while (next_wc != 0)
+ {
+ GUnicodeType type;
+ gunichar wc;
+ GUnicodeBreakType break_type;
+ BreakOpportunity break_op;
+
+ wc = next_wc;
+
+ next = g_utf8_next_char (next);
+
+ if (next >= end)
+ next_wc = 0;
+ else
+ {
+ next_wc = g_utf8_get_char (next);
+ g_assert (next_wc != 0);
+ }
+
+ type = g_unichar_type (wc);
+
+ /* Can't just use the type here since isspace() doesn't
+ * correspond to a Unicode character type
+ */
+ attrs[i].is_white = g_unichar_isspace (wc);
+
+
+ /* ---- Cursor position breaks (Grapheme breaks) ---- */
+
+ if (wc == '\n')
+ {
+ /* Break before line feed unless prev char is a CR */
+
+ if (prev_wc != '\r')
+ attrs[i].is_cursor_position = TRUE;
+ else
+ attrs[i].is_cursor_position = FALSE;
+ }
+ else if (i == 0 ||
+ prev_type == G_UNICODE_CONTROL ||
+ prev_type == G_UNICODE_FORMAT)
+ {
+ /* Break at first position (must be special cased, or if the
+ * first char is say a combining mark there won't be a
+ * cursor position at the start, which seems wrong to me
+ * ???? - maybe it makes sense though, who knows)
+ */
+ /* break after all format or control characters */
+ attrs[i].is_cursor_position = TRUE;
+ }
+ else
+ {
+ switch (type)
+ {
+ case G_UNICODE_CONTROL:
+ case G_UNICODE_FORMAT:
+ /* Break before all format or control characters */
+ attrs[i].is_cursor_position = TRUE;
+ break;
+
+ case G_UNICODE_COMBINING_MARK:
+ case G_UNICODE_ENCLOSING_MARK:
+ case G_UNICODE_NON_SPACING_MARK:
+ /* Unicode spec includes "Combining marks plus Tibetan
+ * subjoined characters" as joining chars, but lists the
+ * Tibetan subjoined characters as combining marks, and
+ * g_unichar_type() returns NON_SPACING_MARK for the Tibetan
+ * subjoined characters. So who knows, beats me.
+ */
+
+ /* It's a joining character, break only if preceded by
+ * control or format; we already handled the case where
+ * it was preceded earlier, so here we know it wasn't,
+ * don't break
+ */
+ attrs[i].is_cursor_position = FALSE;
+ break;
+
+ case G_UNICODE_LOWERCASE_LETTER:
+ case G_UNICODE_MODIFIER_LETTER:
+ case G_UNICODE_OTHER_LETTER:
+ case G_UNICODE_TITLECASE_LETTER:
+ case G_UNICODE_UPPERCASE_LETTER:
+ if (JAMO (wc))
+ {
+ /* Break before Jamo if they are in a broken sequence or
+ * next to non-Jamo, otherwise don't
+ */
+ if (LEADING_JAMO (wc) &&
+ !LEADING_JAMO (prev_wc))
+ attrs[i].is_cursor_position = TRUE;
+ else if (VOWEL_JAMO (wc) &&
+ !LEADING_JAMO (prev_wc) &&
+ !VOWEL_JAMO (prev_wc))
+ attrs[i].is_cursor_position = TRUE;
+ else if (TRAILING_JAMO (wc) &&
+ !LEADING_JAMO (prev_wc) &&
+ !VOWEL_JAMO (prev_wc) &&
+ !TRAILING_JAMO (prev_wc))
+ attrs[i].is_cursor_position = TRUE;
+ else
+ attrs[i].is_cursor_position = FALSE;
+ }
+ else
+ {
+ /* Handle non-Jamo non-combining chars */
+
+ /* Break if preceded by Jamo; don't break if a
+ * letter is preceded by a virama; break in all
+ * other cases. No need to check whether we're
+ * preceded by Jamo explicitly, since a Jamo is not
+ * a virama, we just break in all cases where we
+ * aren't preceded by a virama. Don't fool with viramas
+ * if we aren't part of a script that uses them.
+ */
+
+ if (VIRAMA_SCRIPT (wc))
+ {
+ /* Check whether we're preceded by a virama; this
+ * could use some optimization.
+ */
+ if (VIRAMA (prev_wc))
+ attrs[i].is_cursor_position = FALSE;
+ else
+ attrs[i].is_cursor_position = TRUE;
+ }
+ else
+ {
+ attrs[i].is_cursor_position = TRUE;
+ }
+ }
+ break;
+
+ default:
+ /* Some weirdo char, just break here, why not */
+ attrs[i].is_cursor_position = TRUE;
+ break;
+ }
+ }
+
+ /* ---- Line breaking ---- */
+
+ break_type = g_unichar_break_type (wc);
+ break_op = BREAK_ALREADY_HANDLED;
+
+ g_assert (prev_break_type != G_UNICODE_BREAK_SPACE);
+
+ attrs[i].is_break = FALSE;
+ attrs[i].is_mandatory_break = FALSE;
+
+ if (attrs[i].is_cursor_position) /* If it's not a grapheme boundary,
+ * it's not a line break either
+ */
+ {
+ switch (prev_break_type)
+ {
+ case G_UNICODE_BREAK_MANDATORY:
+ case G_UNICODE_BREAK_LINE_FEED:
+ attrs[i].is_break = TRUE;
+ attrs[i].is_mandatory_break = TRUE;
+ break;
+
+ case G_UNICODE_BREAK_CARRIAGE_RETURN:
+ if (wc != '\n')
+ {
+ attrs[i].is_break = TRUE;
+ attrs[i].is_mandatory_break = TRUE;
+ }
+ break;
+
+ case G_UNICODE_BREAK_CONTINGENT:
+ /* can break after 0xFFFC by default, though we might want
+ * to eventually have a PangoLayout setting or
+ * PangoAttribute that disables this, if for some
+ * application breaking after objects is not desired.
+ */
+ break_op = BREAK_ALLOWED;
+ break;
+
+ case G_UNICODE_BREAK_SURROGATE:
+ /* FIXME I have no clue what to do with these,
+ * but we should do something with them
+ */
+ break;
+
+ case G_UNICODE_BREAK_AMBIGUOUS:
+ /* FIXME we need to resolve the East Asian width
+ * to decide what to do here
+ */
+ case G_UNICODE_BREAK_COMPLEX_CONTEXT:
+ /* FIXME language engines should handle this case... */
+ case G_UNICODE_BREAK_UNKNOWN:
+ /* treat unknown, complex, ambiguous as if they were
+ * alphabetic for now.
+ */
+ prev_break_type = G_UNICODE_BREAK_ALPHABETIC;
+ /* FALL THRU to use the pair table if appropriate */
+
+ default:
+
+ /* Note that our table assumes that combining marks
+ * are only applied to alphabetic characters;
+ * tech report 14 explains how to remove this assumption
+ * from the code, if anyone ever cares, but it shouldn't
+ * be a problem. Also this issue sort of goes
+ * away since we only look for breaks on grapheme
+ * boundaries.
+ */
+
+ g_assert (IN_BREAK_TABLE (prev_break_type));
+
+ switch (break_type)
+ {
+ case G_UNICODE_BREAK_MANDATORY:
+ case G_UNICODE_BREAK_LINE_FEED:
+ case G_UNICODE_BREAK_CARRIAGE_RETURN:
+ case G_UNICODE_BREAK_SPACE:
+ /* These types all "pile up" at the end of lines and
+ * get elided.
+ */
+ break_op = BREAK_PROHIBITED;
+ break;
+
+ case G_UNICODE_BREAK_CONTINGENT:
+ /* break before 0xFFFC by default, eventually
+ * make this configurable?
+ */
+ break_op = BREAK_ALLOWED;
+ break;
+
+ case G_UNICODE_BREAK_AMBIGUOUS:
+ /* FIXME resolve East Asian width to figure out what to do */
+ case G_UNICODE_BREAK_COMPLEX_CONTEXT:
+ /* FIXME language engine analysis */
+ case G_UNICODE_BREAK_UNKNOWN:
+ case G_UNICODE_BREAK_ALPHABETIC:
+ /* treat all of the above as alphabetic for now */
+ break_op = BREAK_OP (prev_break_type, G_UNICODE_BREAK_ALPHABETIC);
+ break;
+
+ case G_UNICODE_BREAK_SURROGATE:
+ /* FIXME this case needs to be handled
+ */
+ break_op = BREAK_IF_SPACES; /* not right at all */
+ break;
+
+ default:
+ g_assert (IN_BREAK_TABLE (prev_break_type));
+ g_assert (IN_BREAK_TABLE (break_type));
+ break_op = BREAK_OP (prev_break_type, break_type);
+ break;
+ }
+ break;
+ }
+
+ if (break_op != BREAK_ALREADY_HANDLED)
+ {
+ switch (break_op)
+ {
+ case BREAK_PROHIBITED:
+ /* nothing, can't break here */
+ break;
+
+ case BREAK_IF_SPACES:
+ /* break if prev char was space */
+ if (prev_was_break_space)
+ attrs[i].is_break = TRUE;
+ break;
+
+ case BREAK_ALLOWED:
+ attrs[i].is_break = TRUE;
+ break;
+
+ default:
+ g_assert_not_reached ();
+ break;
+ }
+ }
+ }
+
+ /* Spaces are skipped when remembering the previous break type; only
+ * the fact that a space was seen is recorded.
+ */
+ if (break_type != G_UNICODE_BREAK_SPACE)
+ {
+ prev_break_type = break_type;
+ prev_was_break_space = FALSE;
+ }
+ else
+ prev_was_break_space = TRUE;
+
+ /* ---- Word breaks ---- */
+
+ /* default to not a word start/end */
+ attrs[i].is_word_start = FALSE;
+ attrs[i].is_word_end = FALSE;
+
+ if (current_word_type != WordNone)
+ {
+ /* Check for a word end */
+ switch (type)
+ {
+ case G_UNICODE_COMBINING_MARK:
+ case G_UNICODE_ENCLOSING_MARK:
+ case G_UNICODE_NON_SPACING_MARK:
+ /* nothing, we just eat these up as part of the word */
+ break;
+
+ case G_UNICODE_LOWERCASE_LETTER:
+ case G_UNICODE_MODIFIER_LETTER:
+ case G_UNICODE_OTHER_LETTER:
+ case G_UNICODE_TITLECASE_LETTER:
+ case G_UNICODE_UPPERCASE_LETTER:
+ if (current_word_type == WordLetters)
+ {
+ /* Japanese special cases for ending the word */
+ if (JAPANESE (last_word_letter) ||
+ JAPANESE (wc))
+ {
+ if ((HIRAGANA (last_word_letter) &&
+ !HIRAGANA (wc)) ||
+ (KATAKANA (last_word_letter) &&
+ !(KATAKANA (wc) || HIRAGANA (wc))) ||
+ (KANJI (last_word_letter) &&
+ !(HIRAGANA (wc) || KANJI (wc))) ||
+ (JAPANESE (last_word_letter) &&
+ !JAPANESE (wc)) ||
+ (!JAPANESE (last_word_letter) &&
+ JAPANESE (wc)))
+ {
+ /* This letter ends the previous word and is itself
+ * the first letter of the next one, so we stay in
+ * WordLetters, just like the number/letter
+ * transitions below.
+ */
+ attrs[i].is_word_end = TRUE;
+ attrs[i].is_word_start = TRUE;
+ }
+ }
+ }
+ else
+ {
+ /* end the number word, start the letter word */
+ attrs[i].is_word_end = TRUE;
+ attrs[i].is_word_start = TRUE;
+ current_word_type = WordLetters;
+ }
+
+ last_word_letter = wc;
+ break;
+
+ case G_UNICODE_DECIMAL_NUMBER:
+ case G_UNICODE_LETTER_NUMBER:
+ case G_UNICODE_OTHER_NUMBER:
+ if (current_word_type != WordNumbers)
+ {
+ /* end the letter word, start the number word */
+ attrs[i].is_word_end = TRUE;
+ attrs[i].is_word_start = TRUE;
+ current_word_type = WordNumbers;
+ }
+
+ last_word_letter = wc;
+ break;
+
+ default:
+ /* Punctuation, control/format chars, etc. all end a word.
+ * This is the only case that leaves the word; the cases
+ * above that set is_word_end also start a new word at the
+ * same position, so they must not reset the word type (or
+ * the next character would be flagged as another word
+ * start).
+ */
+ attrs[i].is_word_end = TRUE;
+ current_word_type = WordNone;
+ break;
+ }
+ }
+ else
+ {
+ /* Check for a word start */
+ switch (type)
+ {
+ case G_UNICODE_LOWERCASE_LETTER:
+ case G_UNICODE_MODIFIER_LETTER:
+ case G_UNICODE_OTHER_LETTER:
+ case G_UNICODE_TITLECASE_LETTER:
+ case G_UNICODE_UPPERCASE_LETTER:
+ current_word_type = WordLetters;
+ last_word_letter = wc;
+ attrs[i].is_word_start = TRUE;
+ break;
+
+ case G_UNICODE_DECIMAL_NUMBER:
+ case G_UNICODE_LETTER_NUMBER:
+ case G_UNICODE_OTHER_NUMBER:
+ current_word_type = WordNumbers;
+ last_word_letter = wc;
+ attrs[i].is_word_start = TRUE;
+ break;
+
+ default:
+ /* No word here */
+ break;
+ }
+ }
+
+ /* ---- Sentence breaks ---- */
+
+ /* The Unicode spec specifies sentence breakpoints, so that a piece of
+ * text would be partitioned into sentences, and all characters would
+ * be inside some sentence. This code implements that for is_sentence_boundary,
+ * but tries to keep leading/trailing whitespace out of sentences for
+ * the start/end flags
+ */
+
+ /* The Unicode spec seems to say that one trailing line/para
+ * separator can be tacked on to a sentence ending in ! or ?,
+ * but not a sentence ending in period; I think they're on crack
+ * so am allowing one to be tacked onto a sentence ending in period.
+ */
+
+ /* No sentence break at the start of the text */
+
+ /* default to not a sentence breakpoint */
+ attrs[i].is_sentence_boundary = FALSE;
+ attrs[i].is_sentence_start = FALSE;
+ attrs[i].is_sentence_end = FALSE;
+
+ /* FIXME the Unicode spec lumps control/format chars with
+ * line/para separators in descriptive text, but not in the
+ * character class specs, in table 5-6, so who knows whether you
+ * are actually supposed to break on control/format
+ * characters. Seems semi-broken to break on tabs...
+ */
+
+ /* Break after line/para separators except carriage return
+ * followed by newline. The test is on the previous character
+ * (the separator) and the current one, so that no boundary is
+ * placed between the CR and the LF of a CR-LF pair.
+ */
+ switch (prev_type)
+ {
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ case G_UNICODE_CONTROL:
+ case G_UNICODE_FORMAT:
+ if (prev_wc == '\r')
+ {
+ if (wc != '\n')
+ attrs[i].is_sentence_boundary = TRUE;
+ }
+ else
+ attrs[i].is_sentence_boundary = TRUE;
+ break;
+
+ default:
+ break;
+ }
+
+ /* break before para/line separators except newline following
+ * carriage return
+ */
+ switch (type)
+ {
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ case G_UNICODE_CONTROL:
+ case G_UNICODE_FORMAT:
+ if (wc == '\n')
+ {
+ if (prev_wc != '\r')
+ attrs[i].is_sentence_boundary = TRUE;
+ }
+ else
+ attrs[i].is_sentence_boundary = TRUE;
+ break;
+
+ default:
+ break;
+ }
+
+ switch (sentence_state)
+ {
+ case STATE_SENTENCE_OUTSIDE:
+ /* Start sentence if we have non-whitespace/format/control */
+ switch (type)
+ {
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ case G_UNICODE_CONTROL:
+ case G_UNICODE_FORMAT:
+ case G_UNICODE_SPACE_SEPARATOR:
+ break;
+
+ default:
+ attrs[i].is_sentence_start = TRUE;
+ sentence_state = STATE_SENTENCE_BODY;
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_BODY:
+ /* If we already broke here due to separators, end the sentence. */
+ if (attrs[i].is_sentence_boundary)
+ {
+ attrs[i].is_sentence_end = TRUE;
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ }
+ else
+ {
+ if (wc == '.')
+ sentence_state = STATE_SENTENCE_DOT;
+ else if (wc == '?' || wc == '!')
+ sentence_state = STATE_SENTENCE_TERM;
+ }
+ break;
+
+ case STATE_SENTENCE_TERM:
+ /* End sentence on anything but close punctuation and some
+ * loosely-specified OTHER_PUNCTUATION such as period,
+ * comma, etc.; follow Unicode rules for breaks
+ */
+ switch (type)
+ {
+ case G_UNICODE_OTHER_PUNCTUATION:
+ case G_UNICODE_CLOSE_PUNCTUATION:
+ if (type == G_UNICODE_CLOSE_PUNCTUATION ||
+ wc == '.' ||
+ wc == ',' ||
+ wc == '?' ||
+ wc == '!')
+ sentence_state = STATE_SENTENCE_POST_TERM_CLOSE;
+ else
+ {
+ attrs[i].is_sentence_end = TRUE;
+ attrs[i].is_sentence_boundary = TRUE;
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ }
+ break;
+
+ case G_UNICODE_SPACE_SEPARATOR:
+ attrs[i].is_sentence_end = TRUE;
+ sentence_state = STATE_SENTENCE_POST_TERM_SPACE;
+ break;
+
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ attrs[i].is_sentence_end = TRUE;
+ sentence_state = STATE_SENTENCE_POST_TERM_SEP;
+ break;
+
+ default:
+ attrs[i].is_sentence_end = TRUE;
+ attrs[i].is_sentence_boundary = TRUE;
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_TERM_CLOSE:
+ /* End sentence on anything besides more punctuation; follow
+ * rules for breaks
+ */
+ switch (type)
+ {
+ case G_UNICODE_OTHER_PUNCTUATION:
+ case G_UNICODE_CLOSE_PUNCTUATION:
+ if (type == G_UNICODE_CLOSE_PUNCTUATION ||
+ wc == '.' ||
+ wc == ',' ||
+ wc == '?' ||
+ wc == '!')
+ /* continue in this state */
+ ;
+ else
+ {
+ attrs[i].is_sentence_end = TRUE;
+ attrs[i].is_sentence_boundary = TRUE;
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ }
+ break;
+
+ case G_UNICODE_SPACE_SEPARATOR:
+ attrs[i].is_sentence_end = TRUE;
+ sentence_state = STATE_SENTENCE_POST_TERM_SPACE;
+ break;
+
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ attrs[i].is_sentence_end = TRUE;
+ /* undo the unconditional break-at-all-line/para-separators
+ * from above; I'm not sure this is what the Unicode spec
+ * intends, but it seems right - we get to include
+ * a single line/para separator in the sentence according
+ * to their rules
+ */
+ attrs[i].is_sentence_boundary = FALSE;
+ sentence_state = STATE_SENTENCE_POST_TERM_SEP;
+ break;
+
+ default:
+ attrs[i].is_sentence_end = TRUE;
+ attrs[i].is_sentence_boundary = TRUE;
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_TERM_SPACE:
+
+ /* Sentence is definitely already ended; to enter this state
+ * we had to see a space, which ends the sentence.
+ */
+
+ switch (type)
+ {
+ case G_UNICODE_SPACE_SEPARATOR:
+ /* continue in this state */
+ break;
+
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ /* undo the unconditional break-at-all-line/para-separators
+ * from above; I'm not sure this is what the Unicode spec
+ * intends, but it seems right
+ */
+ attrs[i].is_sentence_boundary = FALSE;
+ sentence_state = STATE_SENTENCE_POST_TERM_SEP;
+ break;
+
+ default:
+ attrs[i].is_sentence_boundary = TRUE;
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_TERM_SEP:
+ /* Break is forced at this point, unless we're a newline
+ * after a CR, then we will break after the newline on the
+ * next iteration. Only a single Sep can be in the
+ * sentence.
+ */
+ if (!(prev_wc == '\r' && wc == '\n'))
+ attrs[i].is_sentence_boundary = TRUE;
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ break;
+
+ case STATE_SENTENCE_DOT:
+ switch (type)
+ {
+ case G_UNICODE_CLOSE_PUNCTUATION:
+ sentence_state = STATE_SENTENCE_POST_DOT_CLOSE;
+ break;
+
+ case G_UNICODE_SPACE_SEPARATOR:
+ possible_sentence_end = i;
+ sentence_state = STATE_SENTENCE_POST_DOT_SPACE;
+ break;
+
+ default:
+ /* If we broke on a control/format char, end the
+ * sentence; else this was not a sentence end, since
+ * we didn't enter the POST_DOT_SPACE state.
+ */
+ if (attrs[i].is_sentence_boundary)
+ {
+ attrs[i].is_sentence_end = TRUE;
+
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ }
+ else
+ sentence_state = STATE_SENTENCE_BODY;
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_DOT_CLOSE:
+ switch (type)
+ {
+ case G_UNICODE_SPACE_SEPARATOR:
+ possible_sentence_end = i;
+ sentence_state = STATE_SENTENCE_POST_DOT_SPACE;
+ break;
+
+ default:
+ /* If we broke on a control/format char, end the
+ * sentence; else this was not a sentence end, since
+ * we didn't enter the POST_DOT_SPACE state.
+ */
+ if (attrs[i].is_sentence_boundary)
+ {
+ attrs[i].is_sentence_end = TRUE;
+
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ }
+ else
+ sentence_state = STATE_SENTENCE_BODY;
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_DOT_SPACE:
+
+ possible_sentence_boundary = i;
+
+ switch (type)
+ {
+ case G_UNICODE_SPACE_SEPARATOR:
+ /* remain in current state */
+ break;
+
+ case G_UNICODE_OPEN_PUNCTUATION:
+ sentence_state = STATE_SENTENCE_POST_DOT_OPEN;
+ break;
+
+ case G_UNICODE_LOWERCASE_LETTER:
+ /* wasn't a sentence-ending period; so re-enter the sentence
+ * body
+ */
+ sentence_state = STATE_SENTENCE_BODY;
+ break;
+
+ default:
+ /* End the sentence, break, maybe start a new one */
+
+ g_assert (possible_sentence_end >= 0);
+ g_assert (possible_sentence_boundary >= 0);
+
+ attrs[possible_sentence_boundary].is_sentence_boundary = TRUE;
+ attrs[possible_sentence_end].is_sentence_end = TRUE;
+
+ possible_sentence_end = -1;
+ possible_sentence_boundary = -1;
+
+ switch (type)
+ {
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ case G_UNICODE_CONTROL:
+ case G_UNICODE_FORMAT:
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ break;
+
+ default:
+ g_assert (type != G_UNICODE_SPACE_SEPARATOR);
+ sentence_state = STATE_SENTENCE_BODY;
+ attrs[i].is_sentence_start = TRUE;
+ break;
+ }
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_DOT_OPEN:
+ switch (type)
+ {
+ case G_UNICODE_OPEN_PUNCTUATION:
+ /* continue in current state */
+ break;
+
+ case G_UNICODE_LOWERCASE_LETTER:
+ /* wasn't a sentence-ending period; so re-enter the sentence
+ * body
+ */
+ sentence_state = STATE_SENTENCE_BODY;
+ break;
+
+ default:
+ /* End the sentence, break, maybe start a new one */
+
+ g_assert (possible_sentence_end >= 0);
+ g_assert (possible_sentence_boundary >= 0);
+
+ attrs[possible_sentence_boundary].is_sentence_boundary = TRUE;
+ attrs[possible_sentence_end].is_sentence_end = TRUE;
+
+ possible_sentence_end = -1;
+ possible_sentence_boundary = -1;
+
+ switch (type)
+ {
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ case G_UNICODE_CONTROL:
+ case G_UNICODE_FORMAT:
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ break;
+
+ default:
+ g_assert (type != G_UNICODE_SPACE_SEPARATOR);
+ sentence_state = STATE_SENTENCE_BODY;
+ attrs[i].is_sentence_start = TRUE;
+ break;
+ }
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_DOT_SEP:
+ /* Break is forced at this point, unless we're a newline
+ * after a CR, then we will break after the newline on the
+ * next iteration. Only a single Sep can be in the
+ * sentence.
+ */
+ if (!(prev_wc == '\r' && wc == '\n'))
+ attrs[i].is_sentence_boundary = TRUE;
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+
+ g_assert (possible_sentence_end >= 0);
+ g_assert (possible_sentence_boundary >= 0);
+
+ attrs[possible_sentence_end].is_sentence_end = TRUE;
+
+ possible_sentence_end = -1;
+ possible_sentence_boundary = -1;
+ break;
+
+ default:
+ g_assert_not_reached ();
+ break;
+ }
+
+ prev_type = type;
+ prev_wc = wc;
+ ++i;
+ }
+}
+
/**
* pango_break:
* @text: the text to process
@@ -32,31 +1247,120 @@
* Determines possible line, word, and character breaks
* for a string of Unicode text.
*/
-void pango_break (const gchar *text,
- gint length,
- PangoAnalysis *analysis,
- PangoLogAttr *attrs)
+void
+pango_break (const gchar *text,
+ gint length,
+ PangoAnalysis *analysis,
+ PangoLogAttr *attrs)
{
- /* Pseudo-implementation */
+ g_return_if_fail (text != NULL);
+ g_return_if_fail (analysis != NULL);
+ g_return_if_fail (attrs != NULL);
+
+ if (length < 0)
+ length = strlen (text);
- const gchar *cur = text;
- gint i = 0;
- gunichar wc;
+ if (analysis->lang_engine &&
+ analysis->lang_engine->script_break)
+ (* analysis->lang_engine->script_break) (text, length, analysis, attrs);
+ else
+ pango_default_break (text, length, analysis, attrs);
+}
+
+/**
+ * pango_find_paragraph_boundary:
+ * @text: UTF-8 text
+ * @length: length of @text in bytes, or -1 if nul-terminated
+ * @paragraph_delimiter_index: return location for index of delimiter
+ * @next_paragraph_start: return location for start of next paragraph
+ *
+ * Locates a paragraph boundary in @text. A boundary is caused by
+ * delimiter characters, such as a newline, carriage return, carriage
+ * return-newline pair, or Unicode paragraph separator character. The
+ * index of the run of delimiters is returned in
+ * @paragraph_delimiter_index. The index of the start of the paragraph
+ * (index after all delimiters) is stored in @next_paragraph_start.
+ *
+ * If no delimiters are found, both @paragraph_delimiter_index and
+ * @next_paragraph_start are filled with the length of @text (an index one
+ * off the end).
+ **/
+void
+pango_find_paragraph_boundary (const gchar *text,
+ gint length,
+ gint *paragraph_delimiter_index,
+ gint *next_paragraph_start)
+{
+ const gchar *p = text;
+ const gchar *end;
+ const gchar *start = NULL;
+ const gchar *delimiter = NULL;
+ gunichar prev_wc;
+
+ /* Only one character has type G_UNICODE_PARAGRAPH_SEPARATOR in
+ * Unicode 3.0; update this if that changes.
+ */
+#define PARAGRAPH_SEPARATOR 0x2029
+
+ if (length < 0)
+ length = strlen (text);
+
+ end = text + length;
+
+ if (paragraph_delimiter_index)
+ *paragraph_delimiter_index = length;
+
+ if (next_paragraph_start)
+ *next_paragraph_start = length;
+
+ if (length == 0)
+ return;
+
+ /* FIXME there's plenty of room to optimize this; e.g. there's
+ * no real need to g_utf8_get_char() on every char
+ */
- while (*cur && cur - text < length)
+ prev_wc = 0;
+
+ while (p != end)
{
- wc = g_utf8_get_char (cur);
- if (wc == (gunichar)-1)
- break; /* FIXME: ERROR */
-
- attrs[i].is_white = (wc == ' ' || wc == '\t' || wc == '\n' || wc == 0x200b) ? 1 : 0;
- attrs[i].is_break = i == 0 || attrs[i-1].is_white || attrs[i].is_white;
- attrs[i].is_char_stop = 1;
- attrs[i].is_word_stop = ((i == 0) || attrs[i-1].is_white) && !attrs[i].is_white;
+ gunichar wc;
+
+ wc = g_utf8_get_char (p);
+
+ if (prev_wc == '\n' ||
+ prev_wc == PARAGRAPH_SEPARATOR)
+ {
+ g_assert (delimiter);
+ start = p;
+ break;
+ }
+ else if (prev_wc == '\r')
+ {
+ /* don't break between \r and \n */
+ if (wc != '\n')
+ {
+ g_assert (delimiter);
+ start = p;
+ break;
+ }
+ }
- i++;
- cur = g_utf8_next_char (cur);
+ if ((wc == '\n' ||
+ wc == '\r' ||
+ wc == PARAGRAPH_SEPARATOR) &&
+ delimiter == NULL)
+ delimiter = p;
+
+ prev_wc = wc;
+ p = g_utf8_next_char (p);
}
+
+ if (delimiter && paragraph_delimiter_index)
+ *paragraph_delimiter_index = delimiter - text;
+
+ if (start && next_paragraph_start)
+ *next_paragraph_start = start - text;
}
/**
@@ -85,17 +1389,20 @@ pango_get_log_attrs (const char *text,
const char *range_start;
int chars_in_range;
static guint engine_type_id = 0;
- static guint render_type_id = 0;
+ static guint render_type_id = 0;
PangoAnalysis analysis = { NULL, NULL, NULL, 0 };
analysis.level = level;
-
+
g_return_if_fail (length == 0 || text != NULL);
g_return_if_fail (log_attrs != NULL);
-
+
+ if (length < 0)
+ length = strlen (text);
+
if (length == 0)
return;
-
+
if (engine_type_id == 0)
{
engine_type_id = g_quark_from_static_string (PANGO_ENGINE_TYPE_LANG);
@@ -105,23 +1412,27 @@ pango_get_log_attrs (const char *text,
n_chars = g_utf8_strlen (text, length);
lang_map = pango_find_map (language, engine_type_id, render_type_id);
-
+
range_start = text;
range_engine = (PangoEngineLang*) pango_map_get_engine (lang_map,
g_utf8_get_char (text));
analysis.lang_engine = range_engine;
chars_broken = 0;
chars_in_range = 1;
-
+
end = text + length;
pos = g_utf8_next_char (text);
-
+
while (pos != end)
{
+ g_assert (chars_in_range > 0);
+ g_assert (range_start <= end);
+ g_assert (end - pos < length);
+
analysis.lang_engine =
(PangoEngineLang*) pango_map_get_engine (lang_map,
g_utf8_get_char (pos));
-
+
if (range_engine != analysis.lang_engine)
{
/* Engine has changed; do the breaking for the current range,
@@ -133,7 +1444,7 @@ pango_get_log_attrs (const char *text,
log_attrs + chars_broken);
chars_broken += chars_in_range;
-
+
range_start = pos;
range_engine = analysis.lang_engine;
chars_in_range = 1;
@@ -142,15 +1453,15 @@ pango_get_log_attrs (const char *text,
{
chars_in_range += 1;
}
-
+
pos = g_utf8_next_char (pos);
}
-
+
g_assert (chars_in_range > 0);
g_assert (range_start != end);
g_assert (pos == end);
g_assert (range_engine == analysis.lang_engine);
-
+
pango_break (range_start,
end - range_start,
&analysis,
diff --git a/pango/pango-context.c b/pango/pango-context.c
index 0e9f7146..c8a7d5c1 100644
--- a/pango/pango-context.c
+++ b/pango/pango-context.c
@@ -510,7 +510,10 @@ pango_context_get_base_dir (PangoContext *context)
* @cached_iter: Cached attribute iterator, or NULL
*
* Breaks a piece of text into segments with consistent
- * directional level and shaping engine.
+ * directional level and shaping engine. Each byte of @text will
+ * be contained in exactly one of the items in the returned list;
+ * the generated list of items will be in logical order (the start
+ * offsets of the items are ascending).
*
* @cached_iter should be an iterator over @attrs currently positioned at a
* range before or containing @start_index; @cached_iter will be advanced to
@@ -565,7 +568,7 @@ pango_itemize (PangoContext *context,
embedding_levels = g_new (guint8, n_chars);
pango_log2vis_get_embedding_levels (text_ucs4, n_chars, &base_dir,
- embedding_levels);
+ embedding_levels);
/* Storing these as ranges would be a lot more efficient,
* but also more complicated... we take the simple
@@ -603,7 +606,11 @@ pango_itemize (PangoContext *context,
fonts[i] != fonts[i-1] ||
extra_attr_lists[i] != extra_attr_lists[i-1])
{
- item = g_new (PangoItem, 1);
+ /* assert that previous item got at least one char */
+ g_assert (item == NULL || item->length > 0);
+ g_assert (item == NULL || item->num_chars > 0);
+
+ item = pango_item_new ();
item->offset = p - text;
item->num_chars = 0;
item->analysis.level = embedding_levels[i];
diff --git a/pango/pango-item.c b/pango/pango-item.c
index d3e0dbd9..b13b60e9 100644
--- a/pango/pango-item.c
+++ b/pango/pango-item.c
@@ -88,3 +88,45 @@ pango_item_free (PangoItem *item)
g_free (item);
}
+/**
+ * pango_item_split:
+ * @orig: a #PangoItem
+ * @split_index: byte index of position to split item, relative to the start of the item
+ * @split_offset: number of chars between start of @orig and @split_index
+ *
+ * Modifies @orig to cover only the text after @split_index, and
+ * returns a new item that covers the text before @split_index that
+ * used to be in @orig. You can think of @split_index as the length of
+ * the returned item. @split_index may not be 0, and it may not be
+ * greater than or equal to the length of @orig (that is, there must
+ * be at least one byte assigned to each item, you can't create a
+ * zero-length item). @split_offset is the length of the first item in
+ * chars, and must be provided because the text used to generate the
+ * item isn't available, so pango_item_split() can't count the char
+ * length of the split items itself.
+ *
+ * Return value: new item representing text before @split_index
+ **/
+PangoItem*
+pango_item_split (PangoItem *orig,
+ int split_index,
+ int split_offset)
+{
+ PangoItem *new_item;
+
+ g_return_val_if_fail (orig != NULL, NULL);
+ g_return_val_if_fail (orig->length > 0, NULL);
+ g_return_val_if_fail (split_index > 0, NULL);
+ g_return_val_if_fail (split_index < orig->length, NULL);
+ g_return_val_if_fail (split_offset > 0, NULL);
+ g_return_val_if_fail (split_offset < orig->num_chars, NULL);
+
+ new_item = pango_item_copy (orig); /* copy only after validation, so a failed check cannot leak it */
+ new_item->length = split_index; /* new item covers the leading split_index bytes */
+ new_item->num_chars = split_offset;
+
+ orig->offset += split_index; /* orig now starts where the new item ends */
+ orig->length -= split_index;
+ orig->num_chars -= split_offset;
+ return new_item;
+}
diff --git a/pango/pango-item.h b/pango/pango-item.h
index 407e5004..d4b067eb 100644
--- a/pango/pango-item.h
+++ b/pango/pango-item.h
@@ -49,9 +49,12 @@ struct _PangoItem
PangoAnalysis analysis;
};
-PangoItem *pango_item_new (void);
-PangoItem *pango_item_copy (PangoItem *item);
-void pango_item_free (PangoItem *item);
+PangoItem *pango_item_new (void);
+PangoItem *pango_item_copy (PangoItem *item);
+void pango_item_free (PangoItem *item);
+PangoItem *pango_item_split (PangoItem *orig,
+ int split_index,
+ int split_offset);
#ifdef __cplusplus
}
diff --git a/pango/pango-layout.c b/pango/pango-layout.c
index e0d921ac..3d4f6f8c 100644
--- a/pango/pango-layout.c
+++ b/pango/pango-layout.c
@@ -914,29 +914,49 @@ pango_layout_index_to_line_x (PangoLayout *layout,
int *x_pos)
{
GSList *tmp_list;
- int tmp_line = 0;
- int bytes_seen = 0;
-
+ int line_num = 0;
+ PangoLayoutLine *layout_line = NULL;
+
g_return_if_fail (layout != NULL);
+ g_return_if_fail (index >= 0);
+ g_return_if_fail (index <= layout->length);
pango_layout_check_lines (layout);
tmp_list = layout->lines;
while (tmp_list)
{
- PangoLayoutLine *layout_line = tmp_list->data;
+ PangoLayoutLine *tmp_line = tmp_list->data;
+
+ /* use end of previous layout_line if index was in the paragraph
+ * delimiters
+ */
+ if (layout_line && layout_line->start_index > index)
+ {
+ if (line)
+ *line = line_num;
+
+ pango_layout_line_index_to_x (layout_line,
+ layout_line->start_index + layout_line->length,
+ trailing, x_pos);
+ return;
+
+ }
- if (bytes_seen + layout_line->length > index)
+ layout_line = tmp_line;
+ ++line_num;
+
+ if ((layout_line->start_index + layout_line->length) > index)
{
if (line)
- *line = tmp_line;
-
- pango_layout_line_index_to_x (layout_line, index, trailing, x_pos);
+ *line = line_num;
+
+ pango_layout_line_index_to_x (layout_line, index,
+ trailing, x_pos);
return;
}
tmp_list = tmp_list->next;
- bytes_seen += layout_line->length;
}
if (line)
@@ -978,7 +998,6 @@ pango_layout_move_cursor_visually (PangoLayout *layout,
int *new_index,
int *new_trailing)
{
- int bytes_seen = 0;
PangoDirection base_dir;
PangoLayoutLine *line = NULL;
PangoLayoutLine *prev_line = NULL;
@@ -1005,14 +1024,18 @@ pango_layout_move_cursor_visually (PangoLayout *layout,
tmp_list = layout->lines;
while (tmp_list)
{
- line = tmp_list->data;
+ PangoLayoutLine *tmp_line = tmp_list->data;
- if (bytes_seen + line->length > old_index || !tmp_list->next)
- break;
+ if (line && line->start_index > old_index)
+ break; /* stick with the previous line */
- tmp_list = tmp_list->next;
prev_line = line;
- bytes_seen += line->length;
+ line = tmp_line;
+
+ if (line->start_index + line->length > old_index || !tmp_list->next)
+ break;
+
+ tmp_list = tmp_list->next;
}
if (tmp_list->next)
@@ -1024,9 +1047,13 @@ pango_layout_move_cursor_visually (PangoLayout *layout,
old_index = g_utf8_next_char (layout->text + old_index) - layout->text;
log2vis_map = pango_layout_line_get_log2vis_map (line, TRUE);
- n_vis = g_utf8_strlen (layout->text + bytes_seen, line->length);
+ n_vis = g_utf8_strlen (layout->text + line->start_index, line->length);
- vis_pos = log2vis_map[old_index - bytes_seen];
+ /* Clamp old_index to fit on the line */
+ if (old_index > (line->start_index + line->length))
+ old_index = line->start_index + line->length;
+
+ vis_pos = log2vis_map[old_index - line->start_index];
g_free (log2vis_map);
if (vis_pos == 0 && direction < 0)
@@ -1040,7 +1067,6 @@ pango_layout_move_cursor_visually (PangoLayout *layout,
return;
}
line = prev_line;
- bytes_seen -= line->length;
}
else
{
@@ -1050,11 +1076,10 @@ pango_layout_move_cursor_visually (PangoLayout *layout,
*new_trailing = 0;
return;
}
- bytes_seen += line->length;
line = next_line;
}
- vis_pos = g_utf8_strlen (layout->text + bytes_seen, line->length);
+ vis_pos = g_utf8_strlen (layout->text + line->start_index, line->length);
}
else if (vis_pos == n_vis && direction > 0)
{
@@ -1066,7 +1091,6 @@ pango_layout_move_cursor_visually (PangoLayout *layout,
*new_trailing = 0;
return;
}
- bytes_seen += line->length;
line = next_line;
}
else
@@ -1078,7 +1102,6 @@ pango_layout_move_cursor_visually (PangoLayout *layout,
return;
}
line = prev_line;
- bytes_seen -= line->length;
}
vis_pos = 0;
@@ -1087,10 +1110,10 @@ pango_layout_move_cursor_visually (PangoLayout *layout,
vis_pos += (direction > 0) ? 1 : -1;
vis2log_map = pango_layout_line_get_vis2log_map (line, TRUE);
- *new_index = bytes_seen + vis2log_map[vis_pos];
+ *new_index = line->start_index + vis2log_map[vis_pos];
g_free (vis2log_map);
- if (*new_index == bytes_seen + line->length && line->length > 0)
+ if (*new_index == line->start_index + line->length && line->length > 0)
{
*new_index = g_utf8_prev_char (layout->text + *new_index) - layout->text;
*new_trailing = 1;
@@ -1175,8 +1198,9 @@ pango_layout_index_to_pos (PangoLayout *layout,
PangoRectangle *pos)
{
PangoRectangle logical_rect;
- int bytes_seen = 0;
PangoLayoutIter *iter;
+ PangoLayoutLine *layout_line = NULL;
+ gboolean notfound = FALSE;
g_return_if_fail (layout != NULL);
g_return_if_fail (index >= 0);
@@ -1184,43 +1208,56 @@ pango_layout_index_to_pos (PangoLayout *layout,
iter = pango_layout_get_iter (layout);
- do
+ while (TRUE)
{
- PangoLayoutLine *layout_line = pango_layout_iter_get_line (iter);
-
- pango_layout_iter_get_line_extents (iter, NULL, &logical_rect);
+ PangoLayoutLine *tmp_line = pango_layout_iter_get_line (iter);
- if (bytes_seen + layout_line->length > index)
- {
- int x_pos;
-
- pos->y = logical_rect.y;
- pos->height = logical_rect.height;
-
- pango_layout_line_index_to_x (layout_line, index, FALSE, &x_pos);
- pos->x = logical_rect.x + x_pos;
-
- pango_layout_line_index_to_x (layout_line, index, TRUE, &x_pos);
- pos->width = (logical_rect.x + x_pos) - pos->x;
+ if (layout_line && tmp_line->start_index > index)
+ {
+ /* index is in the paragraph delimiters, move to
+ * end of previous line
+ */
+ index = layout_line->start_index + layout_line->length;
+ break;
+ }
- pango_layout_iter_free (iter);
-
- return;
- }
+ layout_line = tmp_line;
+
+ pango_layout_iter_get_line_extents (iter, NULL, &logical_rect);
+
+ if (layout_line->start_index + layout_line->length > index)
+ break;
- bytes_seen += layout_line->length;
- if (bytes_seen < layout->length && layout->text[bytes_seen] == '\n')
- bytes_seen++;
+ if (!pango_layout_iter_next_line (iter))
+ {
+ notfound = TRUE;
+ break;
+ }
}
- while (pango_layout_iter_next_line (iter));
- /* Iterator should now be on the "NULL" run at the end of the last
- * line, which is a zero-width rectangle. Return the extents of
- * that run.
- */
+ if (notfound)
+ {
+ /* Iterator should now be on the "NULL" run at the end of the last
+ * line, which is a zero-width rectangle. Return the extents of
+ * that run.
+ */
+
+ pango_layout_iter_get_run_extents (iter, NULL, pos);
+ }
+ else
+ {
+ int x_pos;
- pango_layout_iter_get_run_extents (iter, NULL, pos);
+ pos->y = logical_rect.y;
+ pos->height = logical_rect.height;
+ pango_layout_line_index_to_x (layout_line, index, FALSE, &x_pos);
+ pos->x = logical_rect.x + x_pos;
+
+ pango_layout_line_index_to_x (layout_line, index, TRUE, &x_pos);
+ pos->width = (logical_rect.x + x_pos) - pos->x;
+ }
+
pango_layout_iter_free (iter);
}
@@ -1409,7 +1446,6 @@ pango_layout_get_cursor_pos (PangoLayout *layout,
PangoLayoutLine *layout_line = NULL; /* Quiet GCC */
int x1_trailing;
int x2;
- int bytes_seen = 0;
PangoLayoutIter *iter;
g_return_if_fail (layout != NULL);
@@ -1420,26 +1456,32 @@ pango_layout_get_cursor_pos (PangoLayout *layout,
iter = pango_layout_get_iter (layout);
/* Find the line */
- do
+ while (TRUE)
{
- layout_line = pango_layout_iter_get_line (iter);
-
- pango_layout_iter_get_line_extents (iter, NULL, &line_rect);
+ PangoLayoutLine *tmp_line;
- if (bytes_seen + layout_line->length > index)
- break;
+ tmp_line = pango_layout_iter_get_line (iter);
- /* Want last line of layout for trailing position */
- if (!pango_layout_iter_at_last_line (iter))
- bytes_seen += layout_line->length;
+ if (layout_line && layout_line->start_index > index)
+ break; /* keep previous layout_line and line_rect */
+
+ layout_line = tmp_line;
+ pango_layout_iter_get_line_extents (iter, NULL, &line_rect);
+
+ if ((layout_line->start_index + layout_line->length) > index)
+ break;
+
+ if (!pango_layout_iter_next_line (iter))
+ break; /* use end of the last line */
}
- while (pango_layout_iter_next_line (iter));
pango_layout_iter_free (iter);
iter = NULL;
+
+ g_assert (index >= layout_line->start_index);
/* Examine the trailing edge of the character before the cursor */
- if (index == bytes_seen)
+ if (index == layout_line->start_index)
{
dir1 = base_dir;
if (base_dir == PANGO_DIRECTION_LTR)
@@ -1453,9 +1495,9 @@ pango_layout_get_cursor_pos (PangoLayout *layout,
dir1 = pango_layout_line_get_char_direction (layout_line, prev_index);
pango_layout_line_index_to_x (layout_line, prev_index, TRUE, &x1_trailing);
}
-
+
/* Examine the leading edge of the character after the cursor */
- if (index == bytes_seen + layout_line->length)
+ if (index >= layout_line->start_index + layout_line->length)
{
dir2 = base_dir;
if (base_dir == PANGO_DIRECTION_LTR)
@@ -2137,20 +2179,14 @@ static inline gboolean
can_break_at (PangoLayout *layout,
gint offset)
{
- /* While a break between a letter and following whitespace is *
- * legimate, we disallow it here to avoid lines starting with *
- * whitespace. We probably should have a mode where we treat all
- * white-space as of fungeable width - appropriate for typography
- * but not for editing.
+ /* We probably should have a mode where we treat all white-space as
+ * of fungible width - appropriate for typography but not for
+ * editing.
*/
- if (offset == 0)
- return FALSE;
- else if (offset == layout->n_chars)
+ if (offset == layout->n_chars)
return TRUE;
- else
- return (layout->log_attrs[offset].is_break &&
- (layout->log_attrs[offset - 1].is_white ||
- !layout->log_attrs[offset].is_white));
+ else
+ return layout->log_attrs[offset].is_break;
}
static inline gboolean
@@ -2258,15 +2294,10 @@ process_item (PangoLayout *layout,
else
{
PangoItem *new_item = pango_item_copy (item);
-
+
length = g_utf8_offset_to_pointer (text + item->offset, break_num_chars) - (text + item->offset);
-
- new_item->length = length;
- new_item->num_chars = break_num_chars;
-
- item->offset += length;
- item->length -= length;
- item->num_chars -= break_num_chars;
+
+ new_item = pango_item_split (item, length, break_num_chars);
if (shape_set)
imposed_shape (item->num_chars, &shape_ink, &shape_logical, glyphs);
@@ -2294,6 +2325,7 @@ struct _ParaBreakState
gboolean first_line;
const char *text;
gint start_offset;
+ gint line_start_index;
};
static void
@@ -2310,7 +2342,8 @@ process_line (PangoLayout *layout,
GSList *break_link = NULL; /* Link holding run before break */
line = pango_layout_line_new (layout);
-
+ line->start_index = state->line_start_index;
+
if (state->first_line)
remaining_width = (layout->indent >= 0) ? layout->width - layout->indent : layout->width;
else
@@ -2393,12 +2426,14 @@ process_line (PangoLayout *layout,
pango_layout_line_postprocess (line);
layout->lines = g_slist_prepend (layout->lines, line);
state->first_line = FALSE;
+ state->line_start_index += line->length;
}
static void
-get_para_log_attrs (const char *text,
- GList *items,
- PangoLogAttr *log_attrs)
+get_items_log_attrs (const char *text,
+ GList *items,
+ PangoLogAttr *log_attrs,
+ int para_delimiter_len)
{
int offset = 0;
int index = 0;
@@ -2415,11 +2450,10 @@ get_para_log_attrs (const char *text,
PangoItem *next_item = items->next->data;
/* FIXME: Handle language tags */
- if (next_item->analysis.level != tmp_item.analysis.level ||
- (next_item->analysis.lang_engine != tmp_item.analysis.lang_engine &&
- (!next_item->analysis.lang_engine || !tmp_item.analysis.lang_engine ||
+ if (next_item->analysis.lang_engine != tmp_item.analysis.lang_engine &&
+ (!next_item->analysis.lang_engine || !tmp_item.analysis.lang_engine ||
strcmp (next_item->analysis.lang_engine->engine.id,
- tmp_item.analysis.lang_engine->engine.id) != 0)))
+ tmp_item.analysis.lang_engine->engine.id) != 0))
break;
else
{
@@ -2430,6 +2464,10 @@ get_para_log_attrs (const char *text,
items = items->next;
}
+ /* Break the paragraph delimiters with the last item */
+ if (items->next == NULL)
+ tmp_item.length += para_delimiter_len;
+
pango_break (text + index, tmp_item.length, &tmp_item.analysis, log_attrs + offset);
offset += tmp_item.num_chars;
@@ -2488,21 +2526,33 @@ pango_layout_check_lines (PangoLayout *layout)
start_offset = 0;
start = layout->text;
+
do
{
- int para_chars = 0;
- const char *end = start;
+ int delim_len;
+ const char *end;
+ int delimiter_index, next_para_index;
ParaBreakState state;
-
- while (end != layout->text + layout->length && *end != '\n')
- {
- end = g_utf8_next_char (end);
- para_chars++;
- }
- if (end == layout->text + layout->length)
+ pango_find_paragraph_boundary (start,
+ (layout->text + layout->length) - start,
+ &delimiter_index,
+ &next_para_index);
+
+ g_assert (next_para_index >= delimiter_index);
+
+ end = start + delimiter_index;
+
+ delim_len = next_para_index - delimiter_index;
+
+ if ((end + delim_len) == (layout->text + layout->length))
done = TRUE;
+ g_assert (end <= (layout->text + layout->length));
+ g_assert (start <= (layout->text + layout->length));
+ g_assert (delim_len < 3);
+ g_assert (delim_len >= 0);
+
state.items = pango_itemize (layout->context,
layout->text,
start - layout->text,
@@ -2510,34 +2560,35 @@ pango_layout_check_lines (PangoLayout *layout)
attrs,
iter);
- get_para_log_attrs (start, state.items, layout->log_attrs + start_offset);
+ get_items_log_attrs (start, state.items,
+ layout->log_attrs + start_offset,
+ delim_len);
if (state.items)
{
state.first_line = TRUE;
state.start_offset = start_offset;
state.text = start;
+ state.line_start_index = state.text - layout->text;
while (state.items)
- process_line (layout, &state);
+ process_line (layout, &state);
}
else
- layout->lines = g_slist_prepend (layout->lines,
- pango_layout_line_new (layout));
-
- start_offset += para_chars;
+ {
+ PangoLayoutLine *empty_line;
+
+ empty_line = pango_layout_line_new (layout);
+ empty_line->start_index = start - layout->text;
+
+ layout->lines = g_slist_prepend (layout->lines,
+ empty_line);
+ }
if (!done)
- {
- /* Handle newline */
- layout->log_attrs[start_offset].is_break = TRUE;
- layout->log_attrs[start_offset].is_white = TRUE;
- layout->log_attrs[start_offset].is_char_stop = TRUE;
- layout->log_attrs[start_offset].is_word_stop = TRUE;
- start_offset += 1;
-
- start = end + 1;
- }
+ start_offset += g_utf8_strlen (start, (end - start) + delim_len);
+
+ start = end + delim_len;
}
while (!done);
@@ -3140,6 +3191,8 @@ pango_layout_line_new (PangoLayout *layout)
private->line.runs = 0;
private->line.length = 0;
+ /* Note that we leave start_index uninitialized */
+
return (PangoLayoutLine *) private;
}
diff --git a/pango/pango-layout.h b/pango/pango-layout.h
index b19ea973..ef87869d 100644
--- a/pango/pango-layout.h
+++ b/pango/pango-layout.h
@@ -45,7 +45,8 @@ typedef enum {
struct _PangoLayoutLine
{
PangoLayout *layout;
- gint length; /* length of line in bytes*/
+ gint start_index; /* start of line as byte index into layout->text */
+ gint length; /* length of line in bytes */
GSList *runs;
};
diff --git a/pango/pango.h b/pango/pango.h
index e0652fd0..3cbc0f59 100644
--- a/pango/pango.h
+++ b/pango/pango.h
@@ -39,14 +39,40 @@ extern "C" {
#include <pango/pango-layout.h>
#include <pango/pango-types.h>
-/* Logical attributes of a character
+/* Logical attributes of a character.
*/
struct _PangoLogAttr
{
- guint is_break : 1; /* Break in front of character */
- guint is_white : 1;
- guint is_char_stop : 1;
- guint is_word_stop : 1;
+ guint is_break : 1; /* Can break line in front of character */
+
+ guint is_mandatory_break : 1; /* Must break line in front of character */
+
+ guint is_white : 1; /* Whitespace character */
+
+ /* cursor can appear in front of character (i.e. this is a grapheme
+ * boundary, or the first character in the text)
+ */
+ guint is_cursor_position : 1;
+
+ /* Note that in degenerate cases, you could have both start/end set on
+ * some text, most likely for sentences (e.g. no space after a period, so
+ * the next sentence starts right away)
+ */
+
+ guint is_word_start : 1; /* first character in a word */
+ guint is_word_end : 1; /* is first non-word char after a word */
+
+ /* There are two ways to divide sentences. The first assigns all
+ * intersentence whitespace/control/format chars to some sentence,
+ * so all chars are in some sentence; is_sentence_boundary denotes
+ * the boundaries there. The second way doesn't assign
+ * between-sentence spaces, etc. to any sentence, so
+ * is_sentence_start/is_sentence_end mark the boundaries of those
+ * sentences.
+ */
+ guint is_sentence_boundary : 1;
+ guint is_sentence_start : 1; /* first character in a sentence */
+ guint is_sentence_end : 1; /* first non-sentence char after a sentence */
};
/* Determine information about cluster/word/line breaks in a string
@@ -57,6 +83,11 @@ void pango_break (const gchar *text,
PangoAnalysis *analysis,
PangoLogAttr *attrs);
+void pango_find_paragraph_boundary (const gchar *text,
+ gint length,
+ gint *paragraph_delimiter_index,
+ gint *next_paragraph_start);
+
void pango_get_log_attrs (const char *text,
int length,
int level,
@@ -72,6 +103,16 @@ void pango_shape (const gchar *text,
GList *pango_reorder_items (GList *logical_items);
+/* This is the default break algorithm, used if no language
+ * engine overrides it. Normally you should use pango_break()
+ * instead; this function is mostly useful for chaining up
+ * from a language engine override.
+ */
+void pango_default_break (const gchar *text,
+ gint length,
+ PangoAnalysis *analysis,
+ PangoLogAttr *attrs);
+
#ifdef __cplusplus
}
#endif /* __cplusplus */