summaryrefslogtreecommitdiff
path: root/pango/break.c
diff options
context:
space:
mode:
Diffstat (limited to 'pango/break.c')
-rw-r--r--pango/break.c1373
1 files changed, 1342 insertions, 31 deletions
diff --git a/pango/break.c b/pango/break.c
index 3dc0465b..8e63415b 100644
--- a/pango/break.c
+++ b/pango/break.c
@@ -22,6 +22,1221 @@
#include "pango.h"
#include "pango-modules.h"
+/* See http://www.unicode.org/unicode/reports/tr14/ if you hope
+ * to understand the line breaking code.
+ */
+
+typedef enum
+{
+ BREAK_ALREADY_HANDLED, /* didn't use the table */
+ BREAK_PROHIBITED, /* no break, even if spaces intervene */
+ BREAK_IF_SPACES, /* "indirect break" (only if there are spaces) */
+ BREAK_ALLOWED /* "direct break" (can always break here) */
+} BreakOpportunity;
+
+enum
+{
+ INDEX_OPEN_PUNCTUATION,
+ INDEX_CLOSE_PUNCTUATION,
+ INDEX_QUOTATION,
+ INDEX_NON_BREAKING_GLUE,
+ INDEX_NON_STARTER,
+ INDEX_EXCLAMATION,
+ INDEX_SYMBOL,
+ INDEX_INFIX_SEPARATOR,
+ INDEX_PREFIX,
+ INDEX_POSTFIX,
+ INDEX_NUMERIC,
+ INDEX_ALPHABETIC,
+ INDEX_IDEOGRAPHIC,
+ INDEX_INSEPARABLE,
+ INDEX_HYPHEN,
+ INDEX_AFTER,
+ INDEX_BEFORE,
+ INDEX_BEFORE_AND_AFTER,
+ INDEX_ZERO_WIDTH_SPACE,
+ INDEX_COMBINING_MARK,
+
+ /* End of the table */
+ INDEX_END_OF_TABLE,
+
+ /* The following are not in the tables */
+ INDEX_MANDATORY,
+ INDEX_CARRIAGE_RETURN,
+ INDEX_LINE_FEED,
+ INDEX_SURROGATE,
+ INDEX_CONTINGENT,
+ INDEX_SPACE,
+ INDEX_COMPLEX_CONTEXT,
+ INDEX_AMBIGUOUS,
+ INDEX_UNKNOWN
+};
+
+static BreakOpportunity row_OPEN_PUNCTUATION[INDEX_END_OF_TABLE] = {
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static BreakOpportunity row_CLOSE_PUNCTUATION[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static BreakOpportunity row_QUOTATION[INDEX_END_OF_TABLE] = {
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static BreakOpportunity row_NON_BREAKING_GLUE[INDEX_END_OF_TABLE] = {
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static BreakOpportunity row_NON_STARTER[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static BreakOpportunity row_EXCLAMATION[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static BreakOpportunity row_SYMBOL[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static BreakOpportunity row_INFIX_SEPARATOR[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static BreakOpportunity row_PREFIX[INDEX_END_OF_TABLE] = {
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static BreakOpportunity row_POSTFIX[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static BreakOpportunity row_NUMERIC[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static BreakOpportunity row_ALPHABETIC[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static BreakOpportunity row_IDEOGRAPHIC[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static BreakOpportunity row_INSEPARABLE[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static BreakOpportunity row_HYPHEN[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static BreakOpportunity row_AFTER[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static BreakOpportunity row_BEFORE[INDEX_END_OF_TABLE] = {
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static BreakOpportunity row_BEFORE_AND_AFTER[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static BreakOpportunity row_ZERO_WIDTH_SPACE[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static BreakOpportunity row_COMBINING_MARK[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES
+};
+
+static BreakOpportunity *line_break_rows[INDEX_END_OF_TABLE] = {
+ row_OPEN_PUNCTUATION, /* INDEX_OPEN_PUNCTUATION */
+ row_CLOSE_PUNCTUATION, /* INDEX_CLOSE_PUNCTUATION */
+ row_QUOTATION, /* INDEX_QUOTATION */
+ row_NON_BREAKING_GLUE, /* INDEX_NON_BREAKING_GLUE */
+ row_NON_STARTER, /* INDEX_NON_STARTER */
+ row_EXCLAMATION, /* INDEX_EXCLAMATION */
+ row_SYMBOL, /* INDEX_SYMBOL */
+ row_INFIX_SEPARATOR, /* INDEX_INFIX_SEPARATOR */
+ row_PREFIX, /* INDEX_PREFIX */
+ row_POSTFIX, /* INDEX_POSTFIX */
+ row_NUMERIC, /* INDEX_NUMERIC */
+ row_ALPHABETIC, /* INDEX_ALPHABETIC */
+ row_IDEOGRAPHIC, /* INDEX_IDEOGRAPHIC */
+ row_INSEPARABLE, /* INDEX_INSEPARABLE */
+ row_HYPHEN, /* INDEX_HYPHEN */
+ row_AFTER, /* INDEX_AFTER */
+ row_BEFORE, /* INDEX_BEFORE */
+ row_BEFORE_AND_AFTER, /* INDEX_BEFORE_AND_AFTER */
+ row_ZERO_WIDTH_SPACE, /* INDEX_ZERO_WIDTH_SPACE */
+ row_COMBINING_MARK /* INDEX_COMBINING_MARK */
+};
+
+/* Map GUnicodeBreakType to table indexes */
+static int line_break_indexes[] = {
+ INDEX_MANDATORY,
+ INDEX_CARRIAGE_RETURN,
+ INDEX_LINE_FEED,
+ INDEX_COMBINING_MARK,
+ INDEX_SURROGATE,
+ INDEX_ZERO_WIDTH_SPACE,
+ INDEX_INSEPARABLE,
+ INDEX_NON_BREAKING_GLUE,
+ INDEX_CONTINGENT,
+ INDEX_SPACE,
+ INDEX_AFTER,
+ INDEX_BEFORE,
+ INDEX_BEFORE_AND_AFTER,
+ INDEX_HYPHEN,
+ INDEX_NON_STARTER,
+ INDEX_OPEN_PUNCTUATION,
+ INDEX_CLOSE_PUNCTUATION,
+ INDEX_QUOTATION,
+ INDEX_EXCLAMATION,
+ INDEX_IDEOGRAPHIC,
+ INDEX_NUMERIC,
+ INDEX_INFIX_SEPARATOR,
+ INDEX_SYMBOL,
+ INDEX_ALPHABETIC,
+ INDEX_PREFIX,
+ INDEX_POSTFIX,
+ INDEX_COMPLEX_CONTEXT,
+ INDEX_AMBIGUOUS,
+ INDEX_UNKNOWN
+};
+
+#define BREAK_INDEX(btype) \
+ (line_break_indexes[(btype)])
+#define BREAK_ROW(before_type) \
+ (line_break_rows[BREAK_INDEX (before_type)])
+#define BREAK_OP(before_type, after_type) \
+ (BREAK_ROW (before_type)[BREAK_INDEX (after_type)])
+#define IN_BREAK_TABLE(btype) \
+ (BREAK_INDEX(btype) < INDEX_END_OF_TABLE)
+
+/* Keep these in sync with the same macros in the test program */
+
+#define LEADING_JAMO(wc) ((wc) >= 0x1100 && (wc) <= 0x115F)
+#define VOWEL_JAMO(wc) ((wc) >= 0x1160 && (wc) <= 0x11A2)
+#define TRAILING_JAMO(wc) ((wc) >= 0x11A8 && (wc) <= 0x11F9)
+#define JAMO(wc) ((wc) >= 0x1100 && (wc) <= 0x11FF)
+/* "virama script" is just an optimization; it includes a bunch of
+ * scripts without viramas in them
+ */
+#define VIRAMA_SCRIPT(wc) ((wc) >= 0x0901 && (wc) <= 0x17FF)
+#define VIRAMA(wc) ((wc) == 0x094D || \
+ (wc) == 0x09CD || \
+ (wc) == 0x0A4D || \
+ (wc) == 0x0ACD || \
+ (wc) == 0x0B4D || \
+ (wc) == 0x0BCD || \
+ (wc) == 0x0C4D || \
+ (wc) == 0x0CCD || \
+ (wc) == 0x0D4D || \
+ (wc) == 0x0DCA || \
+ (wc) == 0x0E3A || \
+ (wc) == 0x0F84 || \
+ (wc) == 0x1039 || \
+ (wc) == 0x17D2)
+/* Types of Japanese characters */
+#define JAPANESE(wc) ((wc) >= 0x2F00 && (wc) <= 0x30FF)
+#define KANJI(wc) ((wc) >= 0x2F00 && (wc) <= 0x2FDF)
+#define HIRAGANA(wc) ((wc) >= 0x3040 && (wc) <= 0x309F)
+#define KATAKANA(wc) ((wc) >= 0x30A0 && (wc) <= 0x30FF)
+
+
+/* p. 132-133 of Unicode spec table 5-6 will help understand this */
+typedef enum
+{
+ STATE_SENTENCE_OUTSIDE,
+ STATE_SENTENCE_BODY,
+ STATE_SENTENCE_TERM,
+ STATE_SENTENCE_POST_TERM_CLOSE,
+ STATE_SENTENCE_POST_TERM_SPACE,
+ STATE_SENTENCE_POST_TERM_SEP,
+ STATE_SENTENCE_DOT,
+ STATE_SENTENCE_POST_DOT_CLOSE,
+ STATE_SENTENCE_POST_DOT_SPACE,
+ STATE_SENTENCE_POST_DOT_OPEN,
+ /* never include line/para separators in a sentence for now */
+ /* This isn't in the spec, but I can't figure out why they'd include
+ * one line/para separator in lines ending with Term but not with
+ * period-terminated lines, so I'm doing it for the dot lines also
+ */
+ STATE_SENTENCE_POST_DOT_SEP
+} SentenceState;
+
+/* We call "123" and "foobar" words, but "123foo" is two words;
+ * the Unicode spec just calls "123" a non-word
+ */
+typedef enum
+{
+ WordNone,
+ WordLetters,
+ WordNumbers
+} WordType;
+
+
+/**
+ * pango_default_break:
+ * @text: text to break
+ * @length: length of text in bytes
+ * @analysis: a #PangoAnalysis for the text
+ * @attrs: logical attributes to fill in
+ *
+ * This is the default break algorithm, used if no language
+ * engine overrides it. Normally you should use pango_break()
+ * instead; this function is mostly useful for chaining up
+ * from a language engine override. Unlike pango_break(),
+ * @analysis can be NULL, but only do that if you know what
+ * you're doing. (If you need an analysis to pass to pango_break(),
+ * you need to pango_itemize() or use pango_get_log_attrs().)
+ *
+ **/
+void
+pango_default_break (const gchar *text,
+ gint length,
+ PangoAnalysis *analysis,
+ PangoLogAttr *attrs)
+{
+ /* The rationale for all this is in section 5.15 of the Unicode 3.0 book */
+
+ /* This is a default break implementation that should work for nearly all
+ * languages. Language engines can override it optionally.
+ */
+
+ /* FIXME one cheesy optimization here would be to memset attrs to 0
+ * before we start, and then never assign FALSE to anything
+ */
+
+ const gchar *next = text;
+ const gchar *end = text + length;
+ gint i = 0;
+ gunichar prev_wc;
+ gunichar next_wc;
+ GUnicodeType prev_type;
+ GUnicodeBreakType prev_break_type; /* skips spaces */
+ gboolean prev_was_break_space;
+ WordType current_word_type = WordNone;
+ gunichar last_word_letter = 0;
+ SentenceState sentence_state = STATE_SENTENCE_OUTSIDE;
+ /* Tracks what will be the end of the sentence if a period is
+ * determined to actually be a sentence-ending period.
+ */
+ gint possible_sentence_end = -1;
+ /* possible sentence break before Open* after a period-ended sentence */
+ gint possible_sentence_boundary = -1;
+
+ g_return_if_fail (text != NULL);
+ g_return_if_fail (attrs != NULL);
+
+ if (next == end)
+ return;
+
+ prev_type = (GUnicodeType) -1;
+ prev_break_type = G_UNICODE_BREAK_UNKNOWN;
+ prev_was_break_space = FALSE;
+ prev_wc = 0;
+
+ next_wc = g_utf8_get_char (next);
+
+ g_assert (next_wc != 0);
+
+ while (next_wc != 0)
+ {
+ GUnicodeType type;
+ gunichar wc;
+ GUnicodeBreakType break_type;
+ BreakOpportunity break_op;
+
+ wc = next_wc;
+
+ next = g_utf8_next_char (next);
+
+ if (next >= end)
+ next_wc = 0;
+ else
+ {
+ next_wc = g_utf8_get_char (next);
+ g_assert (next_wc != 0);
+ }
+
+ type = g_unichar_type (wc);
+
+ /* Can't just use the type here since isspace() doesn't
+ * correspond to a Unicode character type
+ */
+ attrs[i].is_white = g_unichar_isspace (wc);
+
+
+ /* ---- Cursor position breaks (Grapheme breaks) ---- */
+
+ if (wc == '\n')
+ {
+ /* Break before line feed unless prev char is a CR */
+
+ if (prev_wc != '\r')
+ attrs[i].is_cursor_position = TRUE;
+ else
+ attrs[i].is_cursor_position = FALSE;
+ }
+ else if (i == 0 ||
+ prev_type == G_UNICODE_CONTROL ||
+ prev_type == G_UNICODE_FORMAT)
+ {
+ /* Break at first position (must be special cased, or if the
+ * first char is say a combining mark there won't be a
+ * cursor position at the start, which seems wrong to me
+ * ???? - maybe it makes sense though, who knows)
+ */
+ /* break after all format or control characters */
+ attrs[i].is_cursor_position = TRUE;
+ }
+ else
+ {
+ switch (type)
+ {
+ case G_UNICODE_CONTROL:
+ case G_UNICODE_FORMAT:
+ /* Break before all format or control characters */
+ attrs[i].is_cursor_position = TRUE;
+ break;
+
+ case G_UNICODE_COMBINING_MARK:
+ case G_UNICODE_ENCLOSING_MARK:
+ case G_UNICODE_NON_SPACING_MARK:
+ /* Unicode spec includes "Combining marks plus Tibetan
+ * subjoined characters" as joining chars, but lists the
+ * Tibetan subjoined characters as combining marks, and
+ * g_unichar_type() returns NON_SPACING_MARK for the Tibetan
+ * subjoined characters. So who knows, beats me.
+ */
+
+ /* It's a joining character, break only if preceded by
+ * control or format; we already handled the case where
+ * it was preceded earlier, so here we know it wasn't,
+ * don't break
+ */
+ attrs[i].is_cursor_position = FALSE;
+ break;
+
+ case G_UNICODE_LOWERCASE_LETTER:
+ case G_UNICODE_MODIFIER_LETTER:
+ case G_UNICODE_OTHER_LETTER:
+ case G_UNICODE_TITLECASE_LETTER:
+ case G_UNICODE_UPPERCASE_LETTER:
+ if (JAMO (wc))
+ {
+ /* Break before Jamo if they are in a broken sequence or
+ * next to non-Jamo, otherwise don't
+ */
+ if (LEADING_JAMO (wc) &&
+ !LEADING_JAMO (prev_wc))
+ attrs[i].is_cursor_position = TRUE;
+ else if (VOWEL_JAMO (wc) &&
+ !LEADING_JAMO (prev_wc) &&
+ !VOWEL_JAMO (prev_wc))
+ attrs[i].is_cursor_position = TRUE;
+ else if (TRAILING_JAMO (wc) &&
+ !LEADING_JAMO (prev_wc) &&
+ !VOWEL_JAMO (prev_wc) &&
+ !TRAILING_JAMO (prev_wc))
+ attrs[i].is_cursor_position = TRUE;
+ else
+ attrs[i].is_cursor_position = FALSE;
+ }
+ else
+ {
+ /* Handle non-Jamo non-combining chars */
+
+ /* Break if preceded by Jamo; don't break if a
+ * letter is preceded by a virama; break in all
+ * other cases. No need to check whether we're
+ * preceded by Jamo explicitly, since a Jamo is not
+ * a virama, we just break in all cases where we
+ * aren't preceded by a virama. Don't fool with viramas
+ * if we aren't part of a script that uses them.
+ */
+
+ if (VIRAMA_SCRIPT (wc))
+ {
+ /* Check whether we're preceded by a virama; this
+ * could use some optimization.
+ */
+ if (VIRAMA (prev_wc))
+ attrs[i].is_cursor_position = FALSE;
+ else
+ attrs[i].is_cursor_position = TRUE;
+ }
+ else
+ {
+ attrs[i].is_cursor_position = TRUE;
+ }
+ }
+ break;
+
+ default:
+ /* Some weirdo char, just break here, why not */
+ attrs[i].is_cursor_position = TRUE;
+ break;
+ }
+ }
+
+ /* ---- Line breaking ---- */
+
+ break_type = g_unichar_break_type (wc);
+ break_op = BREAK_ALREADY_HANDLED;
+
+ g_assert (prev_break_type != G_UNICODE_BREAK_SPACE);
+
+ attrs[i].is_break = FALSE;
+ attrs[i].is_mandatory_break = FALSE;
+
+ if (attrs[i].is_cursor_position) /* If it's not a grapheme boundary,
+ * it's not a line break either
+ */
+ {
+ switch (prev_break_type)
+ {
+ case G_UNICODE_BREAK_MANDATORY:
+ case G_UNICODE_BREAK_LINE_FEED:
+ attrs[i].is_break = TRUE;
+ attrs[i].is_mandatory_break = TRUE;
+ break;
+
+ case G_UNICODE_BREAK_CARRIAGE_RETURN:
+ if (wc != '\n')
+ {
+ attrs[i].is_break = TRUE;
+ attrs[i].is_mandatory_break = TRUE;
+ }
+ break;
+
+ case G_UNICODE_BREAK_CONTINGENT:
+ /* can break after 0xFFFC by default, though we might want
+ * to eventually have a PangoLayout setting or
+ * PangoAttribute that disables this, if for some
+ * application breaking after objects is not desired.
+ */
+ break_op = BREAK_ALLOWED;
+ break;
+
+ case G_UNICODE_BREAK_SURROGATE:
+ /* FIXME I have no clue what to do with these,
+ * but we should do something with them
+ */
+ break;
+
+ case G_UNICODE_BREAK_AMBIGUOUS:
+ /* FIXME we need to resolve the East Asian width
+ * to decide what to do here
+ */
+ case G_UNICODE_BREAK_COMPLEX_CONTEXT:
+ /* FIXME language engines should handle this case... */
+ case G_UNICODE_BREAK_UNKNOWN:
+ /* treat unknown, complex, ambiguous as if they were
+ * alphabetic for now.
+ */
+ prev_break_type = G_UNICODE_BREAK_ALPHABETIC;
+ /* FALL THRU to use the pair table if appropriate */
+
+ default:
+
+ /* Note that our table assumes that combining marks
+ * are only applied to alphabetic characters;
+ * tech report 14 explains how to remove this assumption
+ * from the code, if anyone ever cares, but it shouldn't
+ * be a problem. Also this issue sort of goes
+ * away since we only look for breaks on grapheme
+ * boundaries.
+ */
+
+ g_assert (IN_BREAK_TABLE (prev_break_type));
+
+ switch (break_type)
+ {
+ case G_UNICODE_BREAK_MANDATORY:
+ case G_UNICODE_BREAK_LINE_FEED:
+ case G_UNICODE_BREAK_CARRIAGE_RETURN:
+ case G_UNICODE_BREAK_SPACE:
+ /* These types all "pile up" at the end of lines and
+ * get elided.
+ */
+ break_op = BREAK_PROHIBITED;
+ break;
+
+ case G_UNICODE_BREAK_CONTINGENT:
+ /* break before 0xFFFC by default, eventually
+ * make this configurable?
+ */
+ break_op = BREAK_ALLOWED;
+ break;
+
+ case G_UNICODE_BREAK_AMBIGUOUS:
+ /* FIXME resolve East Asian width to figure out what to do */
+ case G_UNICODE_BREAK_COMPLEX_CONTEXT:
+ /* FIXME language engine analysis */
+ case G_UNICODE_BREAK_UNKNOWN:
+ case G_UNICODE_BREAK_ALPHABETIC:
+ /* treat all of the above as alphabetic for now */
+ break_op = BREAK_OP (prev_break_type, G_UNICODE_BREAK_ALPHABETIC);
+ break;
+
+ case G_UNICODE_BREAK_SURROGATE:
+ /* FIXME this case needs to be handled
+ */
+ break_op = BREAK_IF_SPACES; /* not right at all */
+ break;
+
+ default:
+ g_assert (IN_BREAK_TABLE (prev_break_type));
+ g_assert (IN_BREAK_TABLE (break_type));
+ break_op = BREAK_OP (prev_break_type, break_type);
+ break;
+ }
+ break;
+ }
+
+ if (break_op != BREAK_ALREADY_HANDLED)
+ {
+ switch (break_op)
+ {
+ case BREAK_PROHIBITED:
+ /* nothing, can't break here */
+ break;
+
+ case BREAK_IF_SPACES:
+ /* break if prev char was space */
+ if (prev_was_break_space)
+ attrs[i].is_break = TRUE;
+ break;
+
+ case BREAK_ALLOWED:
+ attrs[i].is_break = TRUE;
+ break;
+
+ default:
+ g_assert_not_reached ();
+ break;
+ }
+ }
+ }
+
+ if (break_type != G_UNICODE_BREAK_SPACE)
+ {
+ prev_break_type = break_type;
+ prev_was_break_space = FALSE;
+ }
+ else
+ prev_was_break_space = TRUE;
+
+ /* ---- Word breaks ---- */
+
+ /* default to not a word start/end */
+ attrs[i].is_word_start = FALSE;
+ attrs[i].is_word_end = FALSE;
+
+ if (current_word_type != WordNone)
+ {
+ /* Check for a word end */
+ switch (type)
+ {
+ case G_UNICODE_COMBINING_MARK:
+ case G_UNICODE_ENCLOSING_MARK:
+ case G_UNICODE_NON_SPACING_MARK:
+ /* nothing, we just eat these up as part of the word */
+ break;
+
+ case G_UNICODE_LOWERCASE_LETTER:
+ case G_UNICODE_MODIFIER_LETTER:
+ case G_UNICODE_OTHER_LETTER:
+ case G_UNICODE_TITLECASE_LETTER:
+ case G_UNICODE_UPPERCASE_LETTER:
+ if (current_word_type == WordLetters)
+ {
+ /* Japanese special cases for ending the word */
+ if (JAPANESE (last_word_letter) ||
+ JAPANESE (wc))
+ {
+ if ((HIRAGANA (last_word_letter) &&
+ !HIRAGANA (wc)) ||
+ (KATAKANA (last_word_letter) &&
+ !(KATAKANA (wc) || HIRAGANA (wc))) ||
+ (KANJI (last_word_letter) &&
+ !(HIRAGANA (wc) || KANJI (wc))) ||
+ (JAPANESE (last_word_letter) &&
+ !JAPANESE (wc)) ||
+ (!JAPANESE (last_word_letter) &&
+ JAPANESE (wc)))
+ attrs[i].is_word_end = TRUE;
+ }
+ }
+ else
+ {
+ /* end the number word, start the letter word */
+ attrs[i].is_word_end = TRUE;
+ attrs[i].is_word_start = TRUE;
+ current_word_type = WordLetters;
+ }
+
+ last_word_letter = wc;
+ break;
+
+ case G_UNICODE_DECIMAL_NUMBER:
+ case G_UNICODE_LETTER_NUMBER:
+ case G_UNICODE_OTHER_NUMBER:
+ if (current_word_type != WordNumbers)
+ {
+ attrs[i].is_word_end = TRUE;
+ attrs[i].is_word_start = TRUE;
+ current_word_type = WordNumbers;
+ }
+
+ last_word_letter = wc;
+ break;
+
+ default:
+ /* Punctuation, control/format chars, etc. all end a word. */
+ attrs[i].is_word_end = TRUE;
+ break;
+ }
+
+ if (attrs[i].is_word_end)
+ current_word_type = WordNone;
+ }
+ else
+ {
+ /* Check for a word start */
+ switch (type)
+ {
+ case G_UNICODE_LOWERCASE_LETTER:
+ case G_UNICODE_MODIFIER_LETTER:
+ case G_UNICODE_OTHER_LETTER:
+ case G_UNICODE_TITLECASE_LETTER:
+ case G_UNICODE_UPPERCASE_LETTER:
+ current_word_type = WordLetters;
+ last_word_letter = wc;
+ attrs[i].is_word_start = TRUE;
+ break;
+
+ case G_UNICODE_DECIMAL_NUMBER:
+ case G_UNICODE_LETTER_NUMBER:
+ case G_UNICODE_OTHER_NUMBER:
+ current_word_type = WordNumbers;
+ last_word_letter = wc;
+ attrs[i].is_word_start = TRUE;
+ break;
+
+ default:
+ /* No word here */
+ break;
+ }
+ }
+
+ /* ---- Sentence breaks ---- */
+
+ /* The Unicode spec specifies sentence breakpoints, so that a piece of
+ * text would be partitioned into sentences, and all characters would
+ * be inside some sentence. This code implements that for is_sentence_boundary,
+ * but tries to keep leading/trailing whitespace out of sentences for
+ * the start/end flags
+ */
+
+ /* The Unicode spec seems to say that one trailing line/para
+ * separator can be tacked on to a sentence ending in ! or ?,
+ * but not a sentence ending in period; I think they're on crack
+ * so am allowing one to be tacked onto a sentence ending in period.
+ */
+
+ /* No sentence break at the start of the text */
+
+ /* default to not a sentence breakpoint */
+ attrs[i].is_sentence_boundary = FALSE;
+ attrs[i].is_sentence_start = FALSE;
+ attrs[i].is_sentence_end = FALSE;
+
+ /* FIXME the Unicode spec lumps control/format chars with
+ * line/para separators in descriptive text, but not in the
+ * character class specs, in table 5-6, so who knows whether you
+ * are actually supposed to break on control/format
+ * characters. Seems semi-broken to break on tabs...
+ */
+
+ /* Break after line/para separators except carriage return
+ * followed by newline
+ */
+ switch (prev_type)
+ {
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ case G_UNICODE_CONTROL:
+ case G_UNICODE_FORMAT:
+ if (wc == '\r')
+ {
+ if (next_wc != '\n')
+ attrs[i].is_sentence_boundary = TRUE;
+ }
+ else
+ attrs[i].is_sentence_boundary = TRUE;
+ break;
+
+ default:
+ break;
+ }
+
+ /* break before para/line separators except newline following
+ * carriage return
+ */
+ switch (type)
+ {
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ case G_UNICODE_CONTROL:
+ case G_UNICODE_FORMAT:
+ if (wc == '\n')
+ {
+ if (prev_wc != '\r')
+ attrs[i].is_sentence_boundary = TRUE;
+ }
+ else
+ attrs[i].is_sentence_boundary = TRUE;
+ break;
+
+ default:
+ break;
+ }
+
+ switch (sentence_state)
+ {
+ case STATE_SENTENCE_OUTSIDE:
+ /* Start sentence if we have non-whitespace/format/control */
+ switch (type)
+ {
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ case G_UNICODE_CONTROL:
+ case G_UNICODE_FORMAT:
+ case G_UNICODE_SPACE_SEPARATOR:
+ break;
+
+ default:
+ attrs[i].is_sentence_start = TRUE;
+ sentence_state = STATE_SENTENCE_BODY;
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_BODY:
+ /* If we already broke here due to separators, end the sentence. */
+ if (attrs[i].is_sentence_boundary)
+ {
+ attrs[i].is_sentence_end = TRUE;
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ }
+ else
+ {
+ if (wc == '.')
+ sentence_state = STATE_SENTENCE_DOT;
+ else if (wc == '?' || wc == '!')
+ sentence_state = STATE_SENTENCE_TERM;
+ }
+ break;
+
+ case STATE_SENTENCE_TERM:
+ /* End sentence on anything but close punctuation and some
+ * loosely-specified OTHER_PUNCTUATION such as period,
+ * comma, etc.; follow Unicode rules for breaks
+ */
+ switch (type)
+ {
+ case G_UNICODE_OTHER_PUNCTUATION:
+ case G_UNICODE_CLOSE_PUNCTUATION:
+ if (type == G_UNICODE_CLOSE_PUNCTUATION ||
+ wc == '.' ||
+ wc == ',' ||
+ wc == '?' ||
+ wc == '!')
+ sentence_state = STATE_SENTENCE_POST_TERM_CLOSE;
+ else
+ {
+ attrs[i].is_sentence_end = TRUE;
+ attrs[i].is_sentence_boundary = TRUE;
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ }
+ break;
+
+ case G_UNICODE_SPACE_SEPARATOR:
+ attrs[i].is_sentence_end = TRUE;
+ sentence_state = STATE_SENTENCE_POST_TERM_SPACE;
+ break;
+
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ attrs[i].is_sentence_end = TRUE;
+ sentence_state = STATE_SENTENCE_POST_TERM_SEP;
+ break;
+
+ default:
+ attrs[i].is_sentence_end = TRUE;
+ attrs[i].is_sentence_boundary = TRUE;
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_TERM_CLOSE:
+ /* End sentence on anything besides more punctuation; follow
+ * rules for breaks
+ */
+ switch (type)
+ {
+ case G_UNICODE_OTHER_PUNCTUATION:
+ case G_UNICODE_CLOSE_PUNCTUATION:
+ if (type == G_UNICODE_CLOSE_PUNCTUATION ||
+ wc == '.' ||
+ wc == ',' ||
+ wc == '?' ||
+ wc == '!')
+ /* continue in this state */
+ ;
+ else
+ {
+ attrs[i].is_sentence_end = TRUE;
+ attrs[i].is_sentence_boundary = TRUE;
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ }
+ break;
+
+ case G_UNICODE_SPACE_SEPARATOR:
+ attrs[i].is_sentence_end = TRUE;
+ sentence_state = STATE_SENTENCE_POST_TERM_SPACE;
+ break;
+
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ attrs[i].is_sentence_end = TRUE;
+ /* undo the unconditional break-at-all-line/para-separators
+ * from above; I'm not sure this is what the Unicode spec
+ * intends, but it seems right - we get to include
+ * a single line/para separator in the sentence according
+ * to their rules
+ */
+ attrs[i].is_sentence_boundary = FALSE;
+ sentence_state = STATE_SENTENCE_POST_TERM_SEP;
+ break;
+
+ default:
+ attrs[i].is_sentence_end = TRUE;
+ attrs[i].is_sentence_boundary = TRUE;
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_TERM_SPACE:
+
+ /* Sentence is definitely already ended; to enter this state
+ * we had to see a space, which ends the sentence.
+ */
+
+ switch (type)
+ {
+ case G_UNICODE_SPACE_SEPARATOR:
+ /* continue in this state */
+ break;
+
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ /* undo the unconditional break-at-all-line/para-separators
+ * from above; I'm not sure this is what the Unicode spec
+ * intends, but it seems right
+ */
+ attrs[i].is_sentence_boundary = FALSE;
+ sentence_state = STATE_SENTENCE_POST_TERM_SEP;
+ break;
+
+ default:
+ attrs[i].is_sentence_boundary = TRUE;
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_TERM_SEP:
+ /* Break is forced at this point, unless we're a newline
+ * after a CR, then we will break after the newline on the
+ * next iteration. Only a single Sep can be in the
+ * sentence.
+ */
+ if (!(prev_wc == '\r' && wc == '\n'))
+ attrs[i].is_sentence_boundary = TRUE;
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ break;
+
+ case STATE_SENTENCE_DOT:
+ switch (type)
+ {
+ case G_UNICODE_CLOSE_PUNCTUATION:
+ sentence_state = STATE_SENTENCE_POST_DOT_CLOSE;
+ break;
+
+ case G_UNICODE_SPACE_SEPARATOR:
+ possible_sentence_end = i;
+ sentence_state = STATE_SENTENCE_POST_DOT_SPACE;
+ break;
+
+ default:
+ /* If we broke on a control/format char, end the
+ * sentence; else this was not a sentence end, since
+ * we didn't enter the POST_DOT_SPACE state.
+ */
+ if (attrs[i].is_sentence_boundary)
+ {
+ attrs[i].is_sentence_end = TRUE;
+
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ }
+ else
+ sentence_state = STATE_SENTENCE_BODY;
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_DOT_CLOSE:
+ switch (type)
+ {
+ case G_UNICODE_SPACE_SEPARATOR:
+ possible_sentence_end = i;
+ sentence_state = STATE_SENTENCE_POST_DOT_SPACE;
+ break;
+
+ default:
+ /* If we broke on a control/format char, end the
+ * sentence; else this was not a sentence end, since
+ * we didn't enter the POST_DOT_SPACE state.
+ */
+ if (attrs[i].is_sentence_boundary)
+ {
+ attrs[i].is_sentence_end = TRUE;
+
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ }
+ else
+ sentence_state = STATE_SENTENCE_BODY;
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_DOT_SPACE:
+
+ possible_sentence_boundary = i;
+
+ switch (type)
+ {
+ case G_UNICODE_SPACE_SEPARATOR:
+ /* remain in current state */
+ break;
+
+ case G_UNICODE_OPEN_PUNCTUATION:
+ sentence_state = STATE_SENTENCE_POST_DOT_OPEN;
+ break;
+
+ case G_UNICODE_LOWERCASE_LETTER:
+ /* wasn't a sentence-ending period; so re-enter the sentence
+ * body
+ */
+ sentence_state = STATE_SENTENCE_BODY;
+ break;
+
+ default:
+ /* End the sentence, break, maybe start a new one */
+
+ g_assert (possible_sentence_end >= 0);
+ g_assert (possible_sentence_boundary >= 0);
+
+ attrs[possible_sentence_boundary].is_sentence_boundary = TRUE;
+ attrs[possible_sentence_end].is_sentence_end = TRUE;
+
+ possible_sentence_end = -1;
+ possible_sentence_boundary = -1;
+
+ switch (type)
+ {
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ case G_UNICODE_CONTROL:
+ case G_UNICODE_FORMAT:
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ break;
+
+ default:
+ g_assert (type != G_UNICODE_SPACE_SEPARATOR);
+ sentence_state = STATE_SENTENCE_BODY;
+ attrs[i].is_sentence_start = TRUE;
+ break;
+ }
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_DOT_OPEN:
+ switch (type)
+ {
+ case G_UNICODE_OPEN_PUNCTUATION:
+ /* continue in current state */
+ break;
+
+ case G_UNICODE_LOWERCASE_LETTER:
+ /* wasn't a sentence-ending period; so re-enter the sentence
+ * body
+ */
+ sentence_state = STATE_SENTENCE_BODY;
+ break;
+
+ default:
+ /* End the sentence, break, maybe start a new one */
+
+ g_assert (possible_sentence_end >= 0);
+ g_assert (possible_sentence_boundary >= 0);
+
+ attrs[possible_sentence_boundary].is_sentence_boundary = TRUE;
+ attrs[possible_sentence_end].is_sentence_end = TRUE;
+
+ possible_sentence_end = -1;
+ possible_sentence_boundary = -1;
+
+ switch (type)
+ {
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ case G_UNICODE_CONTROL:
+ case G_UNICODE_FORMAT:
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+ break;
+
+ default:
+ g_assert (type != G_UNICODE_SPACE_SEPARATOR);
+ sentence_state = STATE_SENTENCE_BODY;
+ attrs[i].is_sentence_start = TRUE;
+ break;
+ }
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_DOT_SEP:
+ /* Break is forced at this point, unless we're a newline
+ * after a CR, then we will break after the newline on the
+ * next iteration. Only a single Sep can be in the
+ * sentence.
+ */
+ if (!(prev_wc == '\r' && wc == '\n'))
+ attrs[i].is_sentence_boundary = TRUE;
+ sentence_state = STATE_SENTENCE_OUTSIDE;
+
+ g_assert (possible_sentence_end >= 0);
+ g_assert (possible_sentence_boundary >= 0);
+
+ attrs[possible_sentence_end].is_sentence_end = TRUE;
+
+ possible_sentence_end = -1;
+ possible_sentence_boundary = -1;
+ break;
+
+ default:
+ g_assert_not_reached ();
+ break;
+ }
+
+ prev_type = type;
+ prev_wc = wc;
+ ++i;
+ }
+}
+
/**
* pango_break:
* @text: the text to process
@@ -32,31 +1247,120 @@
* Determines possible line, word, and character breaks
* for a string of Unicode text.
*/
-void pango_break (const gchar *text,
- gint length,
- PangoAnalysis *analysis,
- PangoLogAttr *attrs)
+void
+pango_break (const gchar *text,
+ gint length,
+ PangoAnalysis *analysis,
+ PangoLogAttr *attrs)
{
- /* Pseudo-implementation */
+ g_return_if_fail (text != NULL);
+ g_return_if_fail (analysis != NULL);
+ g_return_if_fail (attrs != NULL);
+
+ if (length < 0)
+ length = strlen (text);
- const gchar *cur = text;
- gint i = 0;
- gunichar wc;
+ if (analysis->lang_engine &&
+ analysis->lang_engine->script_break)
+ (* analysis->lang_engine->script_break) (text, length, analysis, attrs);
+ else
+ pango_default_break (text, length, analysis, attrs);
+}
+
+/**
+ * pango_find_paragraph_boundary:
+ * @text: UTF-8 text
+ * @length: length of @text in bytes, or -1 if nul-terminated
+ * @paragraph_delimiter_index: return location for index of delimiter
+ * @next_paragraph_start: return location for start of next paragraph
+ *
+ * Locates a paragraph boundary in @text. A boundary is caused by
+ * delimiter characters, such as a newline, carriage return, carriage
+ * return-newline pair, or Unicode paragraph separator character. The
+ * index of the run of delimiters is returned in
+ * @paragraph_delimiter_index. The index of the start of the paragraph
+ * (index after all delimiters) is stored in @paragraph_start.
+ *
+ * If no delimiters are found, both @paragraph_delimiter_index and
+ * @next_paragraph_start are filled with the length of @text (an index one
+ * off the end).
+ **/
+void
+pango_find_paragraph_boundary (const gchar *text,
+ gint length,
+ gint *paragraph_delimiter_index,
+ gint *next_paragraph_start)
+{
+ const gchar *p = text;
+ const gchar *end;
+ const gchar *start = NULL;
+ const gchar *delimiter = NULL;
+ gunichar prev_wc;
+
+ /* Only one character has type G_UNICODE_PARAGRAPH_SEPARATOR in
+ * Unicode 3.0; update this if that changes.
+ */
+#define PARAGRAPH_SEPARATOR 0x2029
+
+ if (length < 0)
+ length = strlen (text);
+
+ end = text + length;
+
+ if (paragraph_delimiter_index)
+ *paragraph_delimiter_index = length;
+
+ if (next_paragraph_start)
+ *next_paragraph_start = length;
+
+ if (length == 0)
+ return;
+
+ /* FIXME there's plenty of room to optimize this; e.g. there's
+ * no real need to g_utf8_get_char() on every char
+ */
- while (*cur && cur - text < length)
+ prev_wc = 0;
+
+ while (p != end)
{
- wc = g_utf8_get_char (cur);
- if (wc == (gunichar)-1)
- break; /* FIXME: ERROR */
-
- attrs[i].is_white = (wc == ' ' || wc == '\t' || wc == '\n' || wc == 0x200b) ? 1 : 0;
- attrs[i].is_break = i == 0 || attrs[i-1].is_white || attrs[i].is_white;
- attrs[i].is_char_stop = 1;
- attrs[i].is_word_stop = ((i == 0) || attrs[i-1].is_white) && !attrs[i].is_white;
+ gunichar wc;
+
+ wc = g_utf8_get_char (p);
+
+ if (prev_wc == '\n' ||
+ prev_wc == PARAGRAPH_SEPARATOR)
+ {
+ g_assert (delimiter);
+ start = p;
+ break;
+ }
+ else if (prev_wc == '\r')
+ {
+ /* don't break between \r and \n */
+ if (wc != '\n')
+ {
+ g_assert (delimiter);
+ start = p;
+ break;
+ }
+ }
- i++;
- cur = g_utf8_next_char (cur);
+ if ((wc == '\n' ||
+ wc == '\r' ||
+ wc == PARAGRAPH_SEPARATOR) &&
+ delimiter == NULL)
+ delimiter = p;
+
+ prev_wc = wc;
+ p = g_utf8_next_char (p);
}
+
+ if (delimiter && paragraph_delimiter_index)
+ *paragraph_delimiter_index = delimiter - text;
+
+ if (start && next_paragraph_start)
+ *next_paragraph_start = start - text;
}
/**
@@ -85,17 +1389,20 @@ pango_get_log_attrs (const char *text,
const char *range_start;
int chars_in_range;
static guint engine_type_id = 0;
- static guint render_type_id = 0;
+ static guint render_type_id = 0;
PangoAnalysis analysis = { NULL, NULL, NULL, 0 };
analysis.level = level;
-
+
g_return_if_fail (length == 0 || text != NULL);
g_return_if_fail (log_attrs != NULL);
-
+
+ if (length < 0)
+ length = strlen (text);
+
if (length == 0)
return;
-
+
if (engine_type_id == 0)
{
engine_type_id = g_quark_from_static_string (PANGO_ENGINE_TYPE_LANG);
@@ -105,23 +1412,27 @@ pango_get_log_attrs (const char *text,
n_chars = g_utf8_strlen (text, length);
lang_map = pango_find_map (language, engine_type_id, render_type_id);
-
+
range_start = text;
range_engine = (PangoEngineLang*) pango_map_get_engine (lang_map,
g_utf8_get_char (text));
analysis.lang_engine = range_engine;
chars_broken = 0;
chars_in_range = 1;
-
+
end = text + length;
pos = g_utf8_next_char (text);
-
+
while (pos != end)
{
+ g_assert (chars_in_range > 0);
+ g_assert (range_start <= end);
+ g_assert (end - pos < length);
+
analysis.lang_engine =
(PangoEngineLang*) pango_map_get_engine (lang_map,
g_utf8_get_char (pos));
-
+
if (range_engine != analysis.lang_engine)
{
/* Engine has changed; do the breaking for the current range,
@@ -133,7 +1444,7 @@ pango_get_log_attrs (const char *text,
log_attrs + chars_broken);
chars_broken += chars_in_range;
-
+
range_start = pos;
range_engine = analysis.lang_engine;
chars_in_range = 1;
@@ -142,15 +1453,15 @@ pango_get_log_attrs (const char *text,
{
chars_in_range += 1;
}
-
+
pos = g_utf8_next_char (pos);
}
-
+
g_assert (chars_in_range > 0);
g_assert (range_start != end);
g_assert (pos == end);
g_assert (range_engine == analysis.lang_engine);
-
+
pango_break (range_start,
end - range_start,
&analysis,