summaryrefslogtreecommitdiff
path: root/trunk/pango/break.c
diff options
context:
space:
mode:
Diffstat (limited to 'trunk/pango/break.c')
-rw-r--r--trunk/pango/break.c1738
1 files changed, 1738 insertions, 0 deletions
diff --git a/trunk/pango/break.c b/trunk/pango/break.c
new file mode 100644
index 00000000..091d2541
--- /dev/null
+++ b/trunk/pango/break.c
@@ -0,0 +1,1738 @@
+/* Pango
+ * break.c:
+ *
+ * Copyright (C) 1999 Red Hat Software
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <config.h>
+
+#include "pango-break.h"
+#include "pango-modules.h"
+#include <string.h>
+
+#define PARAGRAPH_SEPARATOR 0x2029
+#define PARAGRAPH_SEPARATOR_STRING "\xE2\x80\xA9"
+
+/* See http://www.unicode.org/unicode/reports/tr14/ if you hope
+ * to understand the line breaking code.
+ */
+
+typedef enum
+{
+ BREAK_ALREADY_HANDLED, /* didn't use the table */
+ BREAK_PROHIBITED, /* no break, even if spaces intervene */
+ BREAK_IF_SPACES, /* "indirect break" (only if there are spaces) */
+ BREAK_ALLOWED /* "direct break" (can always break here) */
+ /* TR 14 has one more break-opportunity class,
+ * "indirect break opportunity for combining marks following a space"
+ * but we handle that inline in the code.
+ */
+} BreakOpportunity;
+
+
+enum
+{
+ INDEX_OPEN_PUNCTUATION,
+ INDEX_CLOSE_PUNCTUATION,
+ INDEX_QUOTATION,
+ INDEX_NON_BREAKING_GLUE,
+ INDEX_NON_STARTER,
+ INDEX_EXCLAMATION,
+ INDEX_SYMBOL,
+ INDEX_INFIX_SEPARATOR,
+ INDEX_PREFIX,
+ INDEX_POSTFIX,
+ INDEX_NUMERIC,
+ INDEX_ALPHABETIC,
+ INDEX_IDEOGRAPHIC,
+ INDEX_INSEPARABLE,
+ INDEX_HYPHEN,
+ INDEX_AFTER,
+ INDEX_BEFORE,
+ INDEX_BEFORE_AND_AFTER,
+ INDEX_ZERO_WIDTH_SPACE,
+ INDEX_COMBINING_MARK,
+ INDEX_WORD_JOINER,
+
+ /* End of the table */
+
+ INDEX_END_OF_TABLE,
+
+ /* The following are not in the tables */
+ INDEX_MANDATORY,
+ INDEX_CARRIAGE_RETURN,
+ INDEX_LINE_FEED,
+ INDEX_SURROGATE,
+ INDEX_CONTINGENT,
+ INDEX_SPACE,
+ INDEX_COMPLEX_CONTEXT,
+ INDEX_AMBIGUOUS,
+ INDEX_UNKNOWN,
+ INDEX_NEXT_LINE,
+ INDEX_HANGUL_L_JAMO,
+ INDEX_HANGUL_V_JAMO,
+ INDEX_HANGUL_T_JAMO,
+ INDEX_HANGUL_LV_SYLLABLE,
+ INDEX_HANGUL_LVT_SYLLABLE,
+};
+
+static const BreakOpportunity row_OPEN_PUNCTUATION[INDEX_END_OF_TABLE] = {
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED
+};
+
+static const BreakOpportunity row_CLOSE_PUNCTUATION[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED
+};
+
+static const BreakOpportunity row_QUOTATION[INDEX_END_OF_TABLE] = {
+ BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED
+};
+
+static const BreakOpportunity row_NON_BREAKING_GLUE[INDEX_END_OF_TABLE] = {
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED
+};
+
+static const BreakOpportunity row_NON_STARTER[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED
+};
+
+static const BreakOpportunity row_EXCLAMATION[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED
+};
+
+static const BreakOpportunity row_SYMBOL[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED
+};
+
+static const BreakOpportunity row_INFIX_SEPARATOR[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED
+};
+
+static const BreakOpportunity row_PREFIX[INDEX_END_OF_TABLE] = {
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED
+};
+
+static const BreakOpportunity row_POSTFIX[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED
+};
+
+static const BreakOpportunity row_NUMERIC[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED
+};
+
+static const BreakOpportunity row_ALPHABETIC[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED
+};
+
+static const BreakOpportunity row_IDEOGRAPHIC[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED
+};
+
+static const BreakOpportunity row_INSEPARABLE[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED
+};
+
+static const BreakOpportunity row_HYPHEN[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED
+};
+
+static const BreakOpportunity row_AFTER[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED
+};
+
+static const BreakOpportunity row_BEFORE[INDEX_END_OF_TABLE] = {
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED
+};
+
+static const BreakOpportunity row_BEFORE_AND_AFTER[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED
+};
+
+static const BreakOpportunity row_ZERO_WIDTH_SPACE[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED
+};
+
+static const BreakOpportunity row_COMBINING_MARK[INDEX_END_OF_TABLE] = {
+ BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED
+};
+
+static const BreakOpportunity row_WORD_JOINER[INDEX_END_OF_TABLE] = {
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
+ BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED,
+ BREAK_PROHIBITED
+};
+
+static const BreakOpportunity *const line_break_rows[INDEX_END_OF_TABLE] = {
+ row_OPEN_PUNCTUATION, /* INDEX_OPEN_PUNCTUATION */
+ row_CLOSE_PUNCTUATION, /* INDEX_CLOSE_PUNCTUATION */
+ row_QUOTATION, /* INDEX_QUOTATION */
+ row_NON_BREAKING_GLUE, /* INDEX_NON_BREAKING_GLUE */
+ row_NON_STARTER, /* INDEX_NON_STARTER */
+ row_EXCLAMATION, /* INDEX_EXCLAMATION */
+ row_SYMBOL, /* INDEX_SYMBOL */
+ row_INFIX_SEPARATOR, /* INDEX_INFIX_SEPARATOR */
+ row_PREFIX, /* INDEX_PREFIX */
+ row_POSTFIX, /* INDEX_POSTFIX */
+ row_NUMERIC, /* INDEX_NUMERIC */
+ row_ALPHABETIC, /* INDEX_ALPHABETIC */
+ row_IDEOGRAPHIC, /* INDEX_IDEOGRAPHIC */
+ row_INSEPARABLE, /* INDEX_INSEPARABLE */
+ row_HYPHEN, /* INDEX_HYPHEN */
+ row_AFTER, /* INDEX_AFTER */
+ row_BEFORE, /* INDEX_BEFORE */
+ row_BEFORE_AND_AFTER, /* INDEX_BEFORE_AND_AFTER */
+ row_ZERO_WIDTH_SPACE, /* INDEX_ZERO_WIDTH_SPACE */
+ row_COMBINING_MARK, /* INDEX_COMBINING_MARK */
+ row_WORD_JOINER /* INDEX_WORD_JOINER */
+};
+
+/* Map GUnicodeBreakType to table indexes */
+static const int line_break_indexes[] = {
+ INDEX_MANDATORY,
+ INDEX_CARRIAGE_RETURN,
+ INDEX_LINE_FEED,
+ INDEX_COMBINING_MARK,
+ INDEX_SURROGATE,
+ INDEX_ZERO_WIDTH_SPACE,
+ INDEX_INSEPARABLE,
+ INDEX_NON_BREAKING_GLUE,
+ INDEX_CONTINGENT,
+ INDEX_SPACE,
+ INDEX_AFTER,
+ INDEX_BEFORE,
+ INDEX_BEFORE_AND_AFTER,
+ INDEX_HYPHEN,
+ INDEX_NON_STARTER,
+ INDEX_OPEN_PUNCTUATION,
+ INDEX_CLOSE_PUNCTUATION,
+ INDEX_QUOTATION,
+ INDEX_EXCLAMATION,
+ INDEX_IDEOGRAPHIC,
+ INDEX_NUMERIC,
+ INDEX_INFIX_SEPARATOR,
+ INDEX_SYMBOL,
+ INDEX_ALPHABETIC,
+ INDEX_PREFIX,
+ INDEX_POSTFIX,
+ INDEX_COMPLEX_CONTEXT,
+ INDEX_AMBIGUOUS,
+ INDEX_UNKNOWN,
+ INDEX_NEXT_LINE,
+ INDEX_WORD_JOINER,
+ INDEX_HANGUL_L_JAMO,
+ INDEX_HANGUL_V_JAMO,
+ INDEX_HANGUL_T_JAMO,
+ INDEX_HANGUL_LV_SYLLABLE,
+ INDEX_HANGUL_LVT_SYLLABLE
+};
+
+#define BREAK_TYPE_SAFE(btype) \
+ (btype < G_N_ELEMENTS(line_break_indexes) ? btype : G_UNICODE_BREAK_UNKNOWN)
+#define BREAK_INDEX(btype) \
+ (line_break_indexes[(btype)])
+#define BREAK_ROW(before_type) \
+ (line_break_rows[BREAK_INDEX (before_type)])
+#define BREAK_OP(before_type, after_type) \
+ (BREAK_ROW (before_type)[BREAK_INDEX (after_type)])
+#define IN_BREAK_TABLE(btype) \
+ (btype < G_N_ELEMENTS(line_break_indexes) && BREAK_INDEX(btype) < INDEX_END_OF_TABLE)
+
+
+
+/*
+ * Hangul Conjoining Jamo handling.
+ *
+ * The way we implement it is just a bit different from TR14,
+ * but produces the same results.
+ * The same algorithm is also used in TR29 for cluster boundaries.
+ *
+ */
+
+
+/* An enum that works as the states of the Hangul syllables system.
+ **/
+typedef enum
+{
+ JAMO_L, /* G_UNICODE_BREAK_HANGUL_L_JAMO */
+ JAMO_V, /* G_UNICODE_BREAK_HANGUL_V_JAMO */
+ JAMO_T, /* G_UNICODE_BREAK_HANGUL_T_JAMO */
+ JAMO_LV, /* G_UNICODE_BREAK_HANGUL_LV_SYLLABLE */
+ JAMO_LVT, /* G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE */
+ NO_JAMO /* Other */
+} JamoType;
+
+/* There are Hangul syllables encoded as characters, that act like a
+ * sequence of Jamos. For each character we define a JamoType
+ * that the character starts with, and one that it ends with. This
+ * decomposes JAMO_LV and JAMO_LVT to simple other JAMOs. So for
+ * example, a character with LineBreak type
+ * G_UNICODE_BREAK_HANGUL_LV_SYLLABLE has start=JAMO_L and end=JAMO_V.
+ */
+typedef struct _CharJamoProps
+{
+ JamoType start, end;
+} CharJamoProps;
+
+/* Map from JamoType to CharJamoProps that hold only simple
+ * JamoTypes (no LV or LVT) or none.
+ */
+static const CharJamoProps HangulJamoProps[] = {
+ {JAMO_L, JAMO_L}, /* JAMO_L */
+ {JAMO_V, JAMO_V}, /* JAMO_V */
+ {JAMO_T, JAMO_T}, /* JAMO_T */
+ {JAMO_L, JAMO_V}, /* JAMO_LV */
+ {JAMO_L, JAMO_T}, /* JAMO_LVT */
+ {NO_JAMO, NO_JAMO} /* NO_JAMO */
+};
+
+/* A character forms a syllable with the previous character if and only if:
+ * JamoType(this) is not NO_JAMO and:
+ *
+ * HangulJamoProps[JamoType(prev)].end and
+ * HangulJamoProps[JamoType(this)].start are equal,
+ * or the former is one less than the latter.
+ */
+
+#define IS_JAMO(btype) \
+ ((btype >= G_UNICODE_BREAK_HANGUL_L_JAMO) && \
+ (btype <= G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE))
+#define JAMO_TYPE(btype) \
+ (IS_JAMO(btype) ? (btype - G_UNICODE_BREAK_HANGUL_L_JAMO) : NO_JAMO)
+
+
+
+
+/* "virama script" is just an optimization; it includes a bunch of
+ * scripts without viramas in them
+ */
+#define VIRAMA_SCRIPT(wc) ((wc) >= 0x0901 && (wc) <= 0x17FF)
+#define VIRAMA(wc) ((wc) == 0x094D || \
+ (wc) == 0x09CD || \
+ (wc) == 0x0A4D || \
+ (wc) == 0x0ACD || \
+ (wc) == 0x0B4D || \
+ (wc) == 0x0BCD || \
+ (wc) == 0x0C4D || \
+ (wc) == 0x0CCD || \
+ (wc) == 0x0D4D || \
+ (wc) == 0x0DCA || \
+ (wc) == 0x0E3A || \
+ (wc) == 0x0F84 || \
+ (wc) == 0x1039 || \
+ (wc) == 0x17D2)
+/* Types of Japanese characters */
+#define JAPANESE(wc) ((wc) >= 0x2F00 && (wc) <= 0x30FF)
+#define KANJI(wc) ((wc) >= 0x2F00 && (wc) <= 0x2FDF)
+#define HIRAGANA(wc) ((wc) >= 0x3040 && (wc) <= 0x309F)
+#define KATAKANA(wc) ((wc) >= 0x30A0 && (wc) <= 0x30FF)
+
+#define LATIN(wc) (((wc) >= 0x0020 && (wc) <= 0x02AF) || ((wc) >= 0x1E00 && (wc) <= 0x1EFF))
+#define CYRILLIC(wc) (((wc) >= 0x0400 && (wc) <= 0x052F))
+#define GREEK(wc) (((wc) >= 0x0370 && (wc) <= 0x3FF) || ((wc) >= 0x1F00 && (wc) <= 0x1FFF))
+#define KANA(wc) ((wc) >= 0x3040 && (wc) <= 0x30FF)
+#define HANGUL(wc) ((wc) >= 0xAC00 && (wc) <= 0xD7A3)
+#define BACKSPACE_DELETES_CHARACTER(wc) (!LATIN (wc) && !CYRILLIC (wc) && !GREEK (wc) && !KANA(wc) && !HANGUL(wc))
+
+
+/* p. 132-133 of Unicode spec table 5-6 will help understand this */
+typedef enum
+{
+ STATE_SENTENCE_OUTSIDE,
+ STATE_SENTENCE_BODY,
+ STATE_SENTENCE_TERM,
+ STATE_SENTENCE_POST_TERM_CLOSE,
+ STATE_SENTENCE_POST_TERM_SPACE,
+ STATE_SENTENCE_POST_TERM_SEP,
+ STATE_SENTENCE_DOT,
+ STATE_SENTENCE_POST_DOT_CLOSE,
+ STATE_SENTENCE_POST_DOT_SPACE,
+ STATE_SENTENCE_POST_DOT_OPEN,
+ /* never include line/para separators in a sentence for now */
+ /* This isn't in the spec, but I can't figure out why they'd include
+ * one line/para separator in lines ending with Term but not with
+ * period-terminated lines, so I'm doing it for the dot lines also
+ */
+ STATE_SENTENCE_POST_DOT_SEP
+} SentenceState;
+
+/* We call "123" and "foobar" words, but "123foo" is two words;
+ * the Unicode spec just calls "123" a non-word
+ */
+typedef enum
+{
+ WordNone,
+ WordLetters,
+ WordNumbers
+} WordType;
+
+
+/**
+ * pango_default_break:
+ * @text: text to break
+ * @length: length of text in bytes (may be -1 if @text is nul-terminated)
+ * @analysis: a #PangoAnalysis for the @text
+ * @attrs: logical attributes to fill in
+ * @attrs_len: size of the array passed as @attrs
+ *
+ * This is the default break algorithm, used if no language
+ * engine overrides it. Normally you should use pango_break()
+ * instead. Unlike pango_break(),
+ * @analysis can be %NULL, but only do that if you know what
+ * you're doing. If you need an analysis to pass to pango_break(),
+ * you need to pango_itemize(). In most cases however you should
+ * simply use pango_get_log_attrs().
+ **/
+void
+pango_default_break (const gchar *text,
+ gint length,
+ PangoAnalysis *analysis,
+ PangoLogAttr *attrs,
+ int attrs_len)
+{
+ /* The rationale for all this is in section 5.15 of the Unicode 3.0 book,
+ * the line breaking stuff is also in TR14 on unicode.org
+ */
+
+ /* This is a default break implementation that should work for nearly all
+ * languages. Language engines can override it optionally.
+ */
+
+ /* FIXME one cheesy optimization here would be to memset attrs to 0
+ * before we start, and then never assign %FALSE to anything
+ */
+
+ const gchar *next;
+ gint i;
+
+ gunichar prev_wc;
+ gunichar next_wc;
+
+ JamoType prev_jamo;
+
+ GUnicodeBreakType next_break_type;
+ GUnicodeType prev_type;
+ GUnicodeBreakType prev_break_type; /* skips spaces */
+ gboolean prev_was_break_space;
+
+ WordType current_word_type = WordNone;
+ gunichar last_word_letter = 0;
+ gunichar base_character = 0;
+
+ SentenceState sentence_state = STATE_SENTENCE_OUTSIDE;
+ /* Tracks what will be the end of the sentence if a period is
+ * determined to actually be a sentence-ending period.
+ */
+ gint possible_sentence_end = -1;
+ /* possible sentence break before Open* after a period-ended sentence */
+ gint possible_sentence_boundary = -1;
+ gboolean almost_done = FALSE;
+ gboolean done = FALSE;
+
+ g_return_if_fail (length == 0 || text != NULL);
+ g_return_if_fail (attrs != NULL);
+
+ next = text;
+
+ prev_type = (GUnicodeType) -1;
+ prev_break_type = G_UNICODE_BREAK_UNKNOWN;
+ prev_was_break_space = FALSE;
+ prev_wc = 0;
+ prev_jamo = NO_JAMO;
+
+ if (length == 0 || *text == '\0')
+ next_wc = PARAGRAPH_SEPARATOR;
+ else
+ next_wc = g_utf8_get_char (next);
+
+ next_break_type = g_unichar_break_type (next_wc);
+ next_break_type = BREAK_TYPE_SAFE (next_break_type);
+
+ for (i = 0; !done ; i++)
+ {
+ GUnicodeType type;
+ gunichar wc;
+ GUnicodeBreakType break_type;
+ BreakOpportunity break_op;
+ JamoType jamo;
+ gboolean makes_hangul_syllable;
+
+ wc = next_wc;
+ break_type = next_break_type;
+
+ if (almost_done)
+ {
+ /*
+ * If we have already reached the end of @text g_utf8_next_char()
+ * may not increment next
+ */
+ next_wc = 0;
+ next_break_type = G_UNICODE_BREAK_UNKNOWN;
+ done = TRUE;
+ }
+ else
+ {
+ next = g_utf8_next_char (next);
+
+ if ((length >= 0 && next >= text + length) || *next == '\0')
+ {
+ /* This is how we fill in the last element (end position) of the
+ * attr array - assume there's a paragraph separators off the end
+ * of @text.
+ */
+ next_wc = PARAGRAPH_SEPARATOR;
+ almost_done = TRUE;
+ }
+ else
+ next_wc = g_utf8_get_char (next);
+
+ next_break_type = g_unichar_break_type (next_wc);
+ next_break_type = BREAK_TYPE_SAFE (next_break_type);
+ }
+
+ type = g_unichar_type (wc);
+ jamo = JAMO_TYPE (break_type);
+
+ /* Determine wheter this forms a Hangul syllable with prev. */
+ if (jamo == NO_JAMO)
+ makes_hangul_syllable = FALSE;
+ else
+ {
+ JamoType prev_end = HangulJamoProps[prev_jamo].end ;
+ JamoType this_start = HangulJamoProps[ jamo].start;
+
+ /* See comments before IS_JAMO */
+ makes_hangul_syllable = (prev_end == this_start) || (prev_end + 1 == this_start);
+ }
+
+ /* Can't just use the type here since isspace() doesn't
+ * correspond to a Unicode character type
+ */
+ attrs[i].is_white = g_unichar_isspace (wc);
+
+ /* Just few spaces have variable width. So explicitly mark them.
+ */
+ attrs[i].is_expandable_space = (0x0020 == wc || 0x00A0 == wc);
+
+ /* ---- Cursor position breaks (Grapheme breaks) ---- */
+
+ if (wc == '\n')
+ {
+ /* Break before line feed unless prev char is a CR */
+
+ if (prev_wc != '\r')
+ attrs[i].is_cursor_position = TRUE;
+ else
+ attrs[i].is_cursor_position = FALSE;
+ }
+ else if (i == 0 ||
+ prev_type == G_UNICODE_CONTROL ||
+ prev_type == G_UNICODE_FORMAT)
+ {
+ /* Break at first position (must be special cased, or if the
+ * first char is say a combining mark there won't be a
+ * cursor position at the start, which seems wrong to me
+ * ???? - maybe it makes sense though, who knows)
+ */
+ /* break after all format or control characters */
+ attrs[i].is_cursor_position = TRUE;
+ }
+ else
+ {
+ switch (type)
+ {
+ case G_UNICODE_CONTROL:
+ case G_UNICODE_FORMAT:
+ /* Break before all format or control characters */
+ attrs[i].is_cursor_position = TRUE;
+ break;
+
+ case G_UNICODE_COMBINING_MARK:
+ case G_UNICODE_ENCLOSING_MARK:
+ case G_UNICODE_NON_SPACING_MARK:
+ /* Unicode spec includes "Combining marks plus Tibetan
+ * subjoined characters" as joining chars, but lists the
+ * Tibetan subjoined characters as combining marks, and
+ * g_unichar_type() returns NON_SPACING_MARK for the Tibetan
+ * subjoined characters. So who knows, beats me.
+ */
+
+ /* It's a joining character, break only if preceded by
+ * control or format; we already handled the case where
+ * it was preceded earlier, so here we know it wasn't,
+ * don't break
+ */
+ attrs[i].is_cursor_position = FALSE;
+ break;
+
+ case G_UNICODE_LOWERCASE_LETTER:
+ case G_UNICODE_MODIFIER_LETTER:
+ case G_UNICODE_OTHER_LETTER:
+ case G_UNICODE_TITLECASE_LETTER:
+ case G_UNICODE_UPPERCASE_LETTER:
+
+ if (makes_hangul_syllable)
+ attrs[i].is_cursor_position = FALSE;
+ else
+ {
+ /* Handle non-Hangul-syllable non-combining chars */
+
+ /* Break before Jamo if they are in a broken sequence or
+ * next to non-Jamo; break if preceded by Jamo; don't
+ * break if a letter is preceded by a virama; break in
+ * all other cases. No need to check whether we are or are
+ * preceded by Jamo explicitly, since a Jamo is not
+ * a virama, we just break in all cases where we
+ * aren't a or preceded by a virama. Don't fool with
+ * viramas if we aren't part of a script that uses them.
+ */
+
+ if (VIRAMA_SCRIPT (wc))
+ {
+ /* Check whether we're preceded by a virama; this
+ * could use some optimization.
+ */
+ if (VIRAMA (prev_wc))
+ attrs[i].is_cursor_position = FALSE;
+ else
+ attrs[i].is_cursor_position = TRUE;
+ }
+ else
+ {
+ attrs[i].is_cursor_position = TRUE;
+ }
+ }
+ break;
+
+ default:
+ /* Some weirdo char, just break here, why not */
+ attrs[i].is_cursor_position = TRUE;
+ break;
+ }
+ }
+
+ /* If this is a grapheme boundary, we have to decide if backspace
+ * deletes a character or the whole grapheme cluster */
+ if (attrs[i].is_cursor_position)
+ attrs[i].backspace_deletes_character = BACKSPACE_DELETES_CHARACTER (base_character);
+ else
+ attrs[i].backspace_deletes_character = FALSE;
+
+ /* ---- Line breaking ---- */
+
+ break_op = BREAK_ALREADY_HANDLED;
+
+ g_assert (prev_break_type != G_UNICODE_BREAK_SPACE);
+
+ attrs[i].is_line_break = FALSE;
+ attrs[i].is_mandatory_break = FALSE;
+
+ if (attrs[i].is_cursor_position) /* If it's not a grapheme boundary,
+ * it's not a line break either
+ */
+ {
+ /* space followed by a combining mark is handled
+ * specially; (rule 7a from TR 14)
+ */
+ if (break_type == G_UNICODE_BREAK_SPACE &&
+ next_break_type == G_UNICODE_BREAK_COMBINING_MARK)
+ break_type = G_UNICODE_BREAK_IDEOGRAPHIC;
+
+ /* Unicode doesn't specify char wrap; we wrap around all chars
+ * except where a line break is prohibited, which means we
+ * effectively break everywhere except inside runs of spaces.
+ */
+ attrs[i].is_char_break = TRUE;
+
+ /* Make any necessary replacements first */
+ switch (prev_break_type)
+ {
+ case G_UNICODE_BREAK_HANGUL_L_JAMO:
+ case G_UNICODE_BREAK_HANGUL_V_JAMO:
+ case G_UNICODE_BREAK_HANGUL_T_JAMO:
+ case G_UNICODE_BREAK_HANGUL_LV_SYLLABLE:
+ case G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE:
+ /* treat Jamo as IDEOGRAPHIC from now
+ */
+ prev_break_type = G_UNICODE_BREAK_IDEOGRAPHIC;
+ break;
+
+ case G_UNICODE_BREAK_AMBIGUOUS:
+ /* FIXME
+ * we need to resolve the East Asian width
+ * to decide what to do here
+ */
+ case G_UNICODE_BREAK_COMPLEX_CONTEXT:
+ /* FIXME
+ * language engines should handle this case...
+ */
+ case G_UNICODE_BREAK_UNKNOWN:
+ /* convert unknown, complex, ambiguous to ALPHABETIC
+ */
+ prev_break_type = G_UNICODE_BREAK_ALPHABETIC;
+ break;
+
+ default:
+ ;
+ }
+
+ switch (prev_break_type)
+ {
+ case G_UNICODE_BREAK_MANDATORY:
+ case G_UNICODE_BREAK_LINE_FEED:
+ case G_UNICODE_BREAK_NEXT_LINE:
+ attrs[i].is_line_break = TRUE;
+ attrs[i].is_mandatory_break = TRUE;
+ break;
+
+ case G_UNICODE_BREAK_CARRIAGE_RETURN:
+ if (wc != '\n')
+ {
+ attrs[i].is_line_break = TRUE;
+ attrs[i].is_mandatory_break = TRUE;
+ }
+ break;
+
+ case G_UNICODE_BREAK_CONTINGENT:
+ /* can break after 0xFFFC by default, though we might want
+ * to eventually have a PangoLayout setting or
+ * PangoAttribute that disables this, if for some
+ * application breaking after objects is not desired.
+ */
+ break_op = BREAK_ALLOWED;
+ break;
+
+ case G_UNICODE_BREAK_SURROGATE:
+ g_assert_not_reached ();
+ break;
+
+ default:
+ g_assert (IN_BREAK_TABLE (prev_break_type));
+
+ /* Note that our table assumes that combining marks
+ * are only applied to alphabetic characters;
+ * tech report 14 explains how to remove this assumption
+ * from the code, if anyone ever cares, but it shouldn't
+ * be a problem. Also this issue sort of goes
+ * away since we only look for breaks on grapheme
+ * boundaries.
+ */
+
+ switch (break_type)
+ {
+ case G_UNICODE_BREAK_MANDATORY:
+ case G_UNICODE_BREAK_LINE_FEED:
+ case G_UNICODE_BREAK_CARRIAGE_RETURN:
+ case G_UNICODE_BREAK_NEXT_LINE:
+ case G_UNICODE_BREAK_SPACE:
+ /* These types all "pile up" at the end of lines and
+ * get elided.
+ */
+ break_op = BREAK_PROHIBITED;
+ break;
+
+ case G_UNICODE_BREAK_CONTINGENT:
+ /* break before 0xFFFC by default, eventually
+ * make this configurable?
+ */
+ break_op = BREAK_ALLOWED;
+ break;
+
+ case G_UNICODE_BREAK_SURROGATE:
+ g_assert_not_reached ();
+ break;
+
+ /* Hangul additions are from Unicode 4.1 UAX#14 */
+ case G_UNICODE_BREAK_HANGUL_L_JAMO:
+ case G_UNICODE_BREAK_HANGUL_V_JAMO:
+ case G_UNICODE_BREAK_HANGUL_T_JAMO:
+ case G_UNICODE_BREAK_HANGUL_LV_SYLLABLE:
+ case G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE:
+ /* treat Jamo as IDEOGRAPHIC from now
+ */
+ break_type = G_UNICODE_BREAK_IDEOGRAPHIC;
+
+ if (makes_hangul_syllable)
+ break_op = BREAK_IF_SPACES;
+ else
+ break_op = BREAK_ALLOWED;
+ break;
+
+ case G_UNICODE_BREAK_AMBIGUOUS:
+ /* FIXME:
+ * we need to resolve the East Asian width
+ * to decide what to do here
+ */
+ case G_UNICODE_BREAK_COMPLEX_CONTEXT:
+ /* FIXME:
+ * language engines should handle this case...
+ */
+ case G_UNICODE_BREAK_UNKNOWN:
+ /* treat unknown, complex, and ambiguous like ALPHABETIC
+ * for now
+ */
+ break_op = BREAK_OP (prev_break_type, G_UNICODE_BREAK_ALPHABETIC);
+ break;
+
+ default:
+
+ g_assert (IN_BREAK_TABLE (break_type));
+ break_op = BREAK_OP (prev_break_type, break_type);
+ break;
+ }
+ break;
+ }
+
+ if (break_op != BREAK_ALREADY_HANDLED)
+ {
+ switch (break_op)
+ {
+ case BREAK_PROHIBITED:
+ /* can't break here */
+ attrs[i].is_char_break = FALSE;
+ break;
+
+ case BREAK_IF_SPACES:
+ /* break if prev char was space */
+ if (prev_was_break_space)
+ attrs[i].is_line_break = TRUE;
+ break;
+
+ case BREAK_ALLOWED:
+ attrs[i].is_line_break = TRUE;
+ break;
+
+ default:
+ g_assert_not_reached ();
+ break;
+ }
+ }
+ }
+
+ if (break_type != G_UNICODE_BREAK_SPACE)
+ {
+ prev_break_type = break_type;
+ prev_was_break_space = FALSE;
+ prev_jamo = jamo;
+ }
+ else
+ prev_was_break_space = TRUE;
+
+ /* ---- Word breaks ---- */
+
+ /* default to not a word start/end */
+ attrs[i].is_word_start = FALSE;
+ attrs[i].is_word_end = FALSE;
+
+ if (current_word_type != WordNone)
+ {
+ /* Check for a word end */
+ switch (type)
+ {
+ case G_UNICODE_COMBINING_MARK:
+ case G_UNICODE_ENCLOSING_MARK:
+ case G_UNICODE_NON_SPACING_MARK:
+ case G_UNICODE_FORMAT:
+ /* nothing, we just eat these up as part of the word */
+ break;
+
+ case G_UNICODE_LOWERCASE_LETTER:
+ case G_UNICODE_MODIFIER_LETTER:
+ case G_UNICODE_OTHER_LETTER:
+ case G_UNICODE_TITLECASE_LETTER:
+ case G_UNICODE_UPPERCASE_LETTER:
+ if (current_word_type == WordLetters)
+ {
+ /* Japanese special cases for ending the word */
+ if (JAPANESE (last_word_letter) ||
+ JAPANESE (wc))
+ {
+ if ((HIRAGANA (last_word_letter) &&
+ !HIRAGANA (wc)) ||
+ (KATAKANA (last_word_letter) &&
+ !(KATAKANA (wc) || HIRAGANA (wc))) ||
+ (KANJI (last_word_letter) &&
+ !(HIRAGANA (wc) || KANJI (wc))) ||
+ (JAPANESE (last_word_letter) &&
+ !JAPANESE (wc)) ||
+ (!JAPANESE (last_word_letter) &&
+ JAPANESE (wc)))
+ attrs[i].is_word_end = TRUE;
+ }
+ }
+ else
+ {
+ /* end the number word, start the letter word */
+ attrs[i].is_word_end = TRUE;
+ attrs[i].is_word_start = TRUE;
+ current_word_type = WordLetters;
+ }
+
+ last_word_letter = wc;
+ break;
+
+ case G_UNICODE_DECIMAL_NUMBER:
+ case G_UNICODE_LETTER_NUMBER:
+ case G_UNICODE_OTHER_NUMBER:
+ if (current_word_type != WordNumbers)
+ {
+ attrs[i].is_word_end = TRUE;
+ attrs[i].is_word_start = TRUE;
+ current_word_type = WordNumbers;
+ }
+
+ last_word_letter = wc;
+ break;
+
+ default:
+ /* Punctuation, control/format chars, etc. all end a word. */
+ attrs[i].is_word_end = TRUE;
+ current_word_type = WordNone;
+ break;
+ }
+ }
+ else
+ {
+ /* Check for a word start */
+ switch (type)
+ {
+ case G_UNICODE_LOWERCASE_LETTER:
+ case G_UNICODE_MODIFIER_LETTER:
+ case G_UNICODE_OTHER_LETTER:
+ case G_UNICODE_TITLECASE_LETTER:
+ case G_UNICODE_UPPERCASE_LETTER:
+ current_word_type = WordLetters;
+ last_word_letter = wc;
+ attrs[i].is_word_start = TRUE;
+ break;
+
+ case G_UNICODE_DECIMAL_NUMBER:
+ case G_UNICODE_LETTER_NUMBER:
+ case G_UNICODE_OTHER_NUMBER:
+ current_word_type = WordNumbers;
+ last_word_letter = wc;
+ attrs[i].is_word_start = TRUE;
+ break;
+
+ default:
+ /* No word here */
+ break;
+ }
+ }
+
+ /* ---- Sentence breaks ---- */
+
+ /* The Unicode spec specifies sentence breakpoints, so that a piece of
+ * text would be partitioned into sentences, and all characters would
+ * be inside some sentence. This code implements that for is_sentence_boundary,
+ * but tries to keep leading/trailing whitespace out of sentences for
+ * the start/end flags
+ */
+
+ /* The Unicode spec seems to say that one trailing line/para
+ * separator can be tacked on to a sentence ending in ! or ?,
+ * but not a sentence ending in period; I think they're on crack
+ * so am allowing one to be tacked onto a sentence ending in period.
+ */
+
+#define MAYBE_START_NEW_SENTENCE \
+ switch (type) \
+ { \
+ case G_UNICODE_LINE_SEPARATOR: \
+ case G_UNICODE_PARAGRAPH_SEPARATOR: \
+ case G_UNICODE_CONTROL: \
+ case G_UNICODE_FORMAT: \
+ case G_UNICODE_SPACE_SEPARATOR: \
+ sentence_state = STATE_SENTENCE_OUTSIDE; \
+ break; \
+ \
+ default: \
+ sentence_state = STATE_SENTENCE_BODY; \
+ attrs[i].is_sentence_start = TRUE; \
+ break; \
+ }
+
+ /* No sentence break at the start of the text */
+
+ /* default to not a sentence breakpoint */
+ attrs[i].is_sentence_boundary = FALSE;
+ attrs[i].is_sentence_start = FALSE;
+ attrs[i].is_sentence_end = FALSE;
+
+ /* FIXME the Unicode spec lumps control/format chars with
+ * line/para separators in descriptive text, but not in the
+ * character class specs, in table 5-6, so who knows whether you
+ * are actually supposed to break on control/format
+ * characters. Seems semi-broken to break on tabs...
+ */
+
+ /* Break after line/para separators except carriage return
+ * followed by newline
+ */
+ switch (prev_type)
+ {
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ case G_UNICODE_CONTROL:
+ case G_UNICODE_FORMAT:
+ if (wc == '\r')
+ {
+ if (next_wc != '\n')
+ attrs[i].is_sentence_boundary = TRUE;
+ }
+ else
+ attrs[i].is_sentence_boundary = TRUE;
+ break;
+
+ default:
+ break;
+ }
+
+ /* break before para/line separators except newline following
+ * carriage return
+ */
+ switch (type)
+ {
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ case G_UNICODE_CONTROL:
+ case G_UNICODE_FORMAT:
+ if (wc == '\n')
+ {
+ if (prev_wc != '\r')
+ attrs[i].is_sentence_boundary = TRUE;
+ }
+ else
+ attrs[i].is_sentence_boundary = TRUE;
+ break;
+
+ default:
+ break;
+ }
+
+ switch (sentence_state)
+ {
+ case STATE_SENTENCE_OUTSIDE:
+ /* Start sentence if we have non-whitespace/format/control */
+ switch (type)
+ {
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ case G_UNICODE_CONTROL:
+ case G_UNICODE_FORMAT:
+ case G_UNICODE_SPACE_SEPARATOR:
+ break;
+
+ default:
+ attrs[i].is_sentence_start = TRUE;
+ sentence_state = STATE_SENTENCE_BODY;
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_BODY:
+ /* If we already broke here due to separators, end the sentence. */
+ if (attrs[i].is_sentence_boundary)
+ {
+ attrs[i].is_sentence_end = TRUE;
+
+ MAYBE_START_NEW_SENTENCE;
+ }
+ else
+ {
+ if (wc == '.')
+ sentence_state = STATE_SENTENCE_DOT;
+ else if (wc == '?' || wc == '!')
+ sentence_state = STATE_SENTENCE_TERM;
+ }
+ break;
+
+ case STATE_SENTENCE_TERM:
+ /* End sentence on anything but close punctuation and some
+ * loosely-specified OTHER_PUNCTUATION such as period,
+ * comma, etc.; follow Unicode rules for breaks
+ */
+ switch (type)
+ {
+ case G_UNICODE_OTHER_PUNCTUATION:
+ case G_UNICODE_CLOSE_PUNCTUATION:
+ if (type == G_UNICODE_CLOSE_PUNCTUATION ||
+ wc == '.' ||
+ wc == ',' ||
+ wc == '?' ||
+ wc == '!')
+ sentence_state = STATE_SENTENCE_POST_TERM_CLOSE;
+ else
+ {
+ attrs[i].is_sentence_end = TRUE;
+ attrs[i].is_sentence_boundary = TRUE;
+
+ MAYBE_START_NEW_SENTENCE;
+ }
+ break;
+
+ case G_UNICODE_SPACE_SEPARATOR:
+ attrs[i].is_sentence_end = TRUE;
+ sentence_state = STATE_SENTENCE_POST_TERM_SPACE;
+ break;
+
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ attrs[i].is_sentence_end = TRUE;
+ sentence_state = STATE_SENTENCE_POST_TERM_SEP;
+ break;
+
+ default:
+ attrs[i].is_sentence_end = TRUE;
+ attrs[i].is_sentence_boundary = TRUE;
+
+ MAYBE_START_NEW_SENTENCE;
+
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_TERM_CLOSE:
+ /* End sentence on anything besides more punctuation; follow
+ * rules for breaks
+ */
+ switch (type)
+ {
+ case G_UNICODE_OTHER_PUNCTUATION:
+ case G_UNICODE_CLOSE_PUNCTUATION:
+ if (type == G_UNICODE_CLOSE_PUNCTUATION ||
+ wc == '.' ||
+ wc == ',' ||
+ wc == '?' ||
+ wc == '!')
+ /* continue in this state */
+ ;
+ else
+ {
+ attrs[i].is_sentence_end = TRUE;
+ attrs[i].is_sentence_boundary = TRUE;
+
+ MAYBE_START_NEW_SENTENCE;
+ }
+ break;
+
+ case G_UNICODE_SPACE_SEPARATOR:
+ attrs[i].is_sentence_end = TRUE;
+ sentence_state = STATE_SENTENCE_POST_TERM_SPACE;
+ break;
+
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ attrs[i].is_sentence_end = TRUE;
+ /* undo the unconditional break-at-all-line/para-separators
+ * from above; I'm not sure this is what the Unicode spec
+ * intends, but it seems right - we get to include
+ * a single line/para separator in the sentence according
+ * to their rules
+ */
+ attrs[i].is_sentence_boundary = FALSE;
+ sentence_state = STATE_SENTENCE_POST_TERM_SEP;
+ break;
+
+ default:
+ attrs[i].is_sentence_end = TRUE;
+ attrs[i].is_sentence_boundary = TRUE;
+
+ MAYBE_START_NEW_SENTENCE;
+
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_TERM_SPACE:
+
+ /* Sentence is definitely already ended; to enter this state
+ * we had to see a space, which ends the sentence.
+ */
+
+ switch (type)
+ {
+ case G_UNICODE_SPACE_SEPARATOR:
+ /* continue in this state */
+ break;
+
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ /* undo the unconditional break-at-all-line/para-separators
+ * from above; I'm not sure this is what the Unicode spec
+ * intends, but it seems right
+ */
+ attrs[i].is_sentence_boundary = FALSE;
+ sentence_state = STATE_SENTENCE_POST_TERM_SEP;
+ break;
+
+ default:
+ attrs[i].is_sentence_boundary = TRUE;
+
+ MAYBE_START_NEW_SENTENCE;
+
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_TERM_SEP:
+ /* Break is forced at this point, unless we're a newline
+ * after a CR, then we will break after the newline on the
+ * next iteration. Only a single Sep can be in the
+ * sentence.
+ */
+ if (!(prev_wc == '\r' && wc == '\n'))
+ attrs[i].is_sentence_boundary = TRUE;
+
+ MAYBE_START_NEW_SENTENCE;
+
+ break;
+
+ case STATE_SENTENCE_DOT:
+ switch (type)
+ {
+ case G_UNICODE_CLOSE_PUNCTUATION:
+ sentence_state = STATE_SENTENCE_POST_DOT_CLOSE;
+ break;
+
+ case G_UNICODE_SPACE_SEPARATOR:
+ possible_sentence_end = i;
+ sentence_state = STATE_SENTENCE_POST_DOT_SPACE;
+ break;
+
+ default:
+ /* If we broke on a control/format char, end the
+ * sentence; else this was not a sentence end, since
+ * we didn't enter the POST_DOT_SPACE state.
+ */
+ if (attrs[i].is_sentence_boundary)
+ {
+ attrs[i].is_sentence_end = TRUE;
+
+ MAYBE_START_NEW_SENTENCE;
+ }
+ else
+ sentence_state = STATE_SENTENCE_BODY;
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_DOT_CLOSE:
+ switch (type)
+ {
+ case G_UNICODE_SPACE_SEPARATOR:
+ possible_sentence_end = i;
+ sentence_state = STATE_SENTENCE_POST_DOT_SPACE;
+ break;
+
+ default:
+ /* If we broke on a control/format char, end the
+ * sentence; else this was not a sentence end, since
+ * we didn't enter the POST_DOT_SPACE state.
+ */
+ if (attrs[i].is_sentence_boundary)
+ {
+ attrs[i].is_sentence_end = TRUE;
+
+ MAYBE_START_NEW_SENTENCE;
+ }
+ else
+ sentence_state = STATE_SENTENCE_BODY;
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_DOT_SPACE:
+
+ possible_sentence_boundary = i;
+
+ switch (type)
+ {
+ case G_UNICODE_SPACE_SEPARATOR:
+ /* remain in current state */
+ break;
+
+ case G_UNICODE_OPEN_PUNCTUATION:
+ sentence_state = STATE_SENTENCE_POST_DOT_OPEN;
+ break;
+
+ case G_UNICODE_LOWERCASE_LETTER:
+ /* wasn't a sentence-ending period; so re-enter the sentence
+ * body
+ */
+ sentence_state = STATE_SENTENCE_BODY;
+ break;
+
+ default:
+ /* End the sentence, break, maybe start a new one */
+
+ g_assert (possible_sentence_end >= 0);
+ g_assert (possible_sentence_boundary >= 0);
+
+ attrs[possible_sentence_boundary].is_sentence_boundary = TRUE;
+ attrs[possible_sentence_end].is_sentence_end = TRUE;
+
+ possible_sentence_end = -1;
+ possible_sentence_boundary = -1;
+
+ MAYBE_START_NEW_SENTENCE;
+
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_DOT_OPEN:
+ switch (type)
+ {
+ case G_UNICODE_OPEN_PUNCTUATION:
+ /* continue in current state */
+ break;
+
+ case G_UNICODE_LOWERCASE_LETTER:
+ /* wasn't a sentence-ending period; so re-enter the sentence
+ * body
+ */
+ sentence_state = STATE_SENTENCE_BODY;
+ break;
+
+ default:
+ /* End the sentence, break, maybe start a new one */
+
+ g_assert (possible_sentence_end >= 0);
+ g_assert (possible_sentence_boundary >= 0);
+
+ attrs[possible_sentence_boundary].is_sentence_boundary = TRUE;
+ attrs[possible_sentence_end].is_sentence_end = TRUE;
+
+ possible_sentence_end = -1;
+ possible_sentence_boundary = -1;
+
+ MAYBE_START_NEW_SENTENCE;
+
+ break;
+ }
+ break;
+
+ case STATE_SENTENCE_POST_DOT_SEP:
+ /* Break is forced at this point, unless we're a newline
+ * after a CR, then we will break after the newline on the
+ * next iteration. Only a single Sep can be in the
+ * sentence.
+ */
+ if (!(prev_wc == '\r' && wc == '\n'))
+ attrs[i].is_sentence_boundary = TRUE;
+
+ g_assert (possible_sentence_end >= 0);
+ g_assert (possible_sentence_boundary >= 0);
+
+ attrs[possible_sentence_end].is_sentence_end = TRUE;
+
+ possible_sentence_end = -1;
+ possible_sentence_boundary = -1;
+
+ MAYBE_START_NEW_SENTENCE;
+
+ break;
+
+ default:
+ g_assert_not_reached ();
+ break;
+ }
+
+ prev_type = type;
+ prev_wc = wc;
+
+ /* wc might not be a valid Unicode base character, but really all we
+ * need to know is the last non-combining character */
+ if (type != G_UNICODE_COMBINING_MARK &&
+ type != G_UNICODE_ENCLOSING_MARK &&
+ type != G_UNICODE_NON_SPACING_MARK)
+ base_character = wc;
+ }
+}
+
+static gboolean
+tailor_break (const gchar *text,
+ gint length,
+ PangoAnalysis *analysis,
+ PangoLogAttr *attrs,
+ int attrs_len)
+{
+ if (analysis->lang_engine && PANGO_ENGINE_LANG_GET_CLASS (analysis->lang_engine)->script_break)
+ {
+ if (length < 0)
+ length = strlen (text);
+ else if (text == NULL)
+ text = "";
+
+ PANGO_ENGINE_LANG_GET_CLASS (analysis->lang_engine)->script_break (analysis->lang_engine, text, length, analysis, attrs, attrs_len);
+ return TRUE;
+ }
+ return FALSE;
+}
+
+/**
+ * pango_break:
+ * @text: the text to process
+ * @length: length of @text in bytes (may be -1 if @text is nul-terminated)
+ * @analysis: #PangoAnalysis structure from pango_itemize()
+ * @attrs: an array to store character information in
+ * @attrs_len: size of the array passed as @attrs
+ *
+ * Determines possible line, word, and character breaks
+ * for a string of Unicode text with a single analysis. For most
+ * purposes you may want to use pango_get_log_attrs().
+ */
+void
+pango_break (const gchar *text,
+ gint length,
+ PangoAnalysis *analysis,
+ PangoLogAttr *attrs,
+ int attrs_len)
+{
+ g_return_if_fail (analysis != NULL);
+ g_return_if_fail (attrs != NULL);
+
+ pango_default_break (text, length, analysis, attrs, attrs_len);
+ tailor_break (text, length, analysis, attrs, attrs_len);
+}
+
+/**
+ * pango_find_paragraph_boundary:
+ * @text: UTF-8 text
+ * @length: length of @text in bytes, or -1 if nul-terminated
+ * @paragraph_delimiter_index: return location for index of delimiter
+ * @next_paragraph_start: return location for start of next paragraph
+ *
+ * Locates a paragraph boundary in @text. A boundary is caused by
+ * delimiter characters, such as a newline, carriage return, carriage
+ * return-newline pair, or Unicode paragraph separator character. The
+ * index of the run of delimiters is returned in
+ * @paragraph_delimiter_index. The index of the start of the paragraph
+ * (index after all delimiters) is stored in @next_paragraph_start.
+ *
+ * If no delimiters are found, both @paragraph_delimiter_index and
+ * @next_paragraph_start are filled with the length of @text (an index one
+ * off the end).
+ **/
+void
+pango_find_paragraph_boundary (const gchar *text,
+ gint length,
+ gint *paragraph_delimiter_index,
+ gint *next_paragraph_start)
+{
+ const gchar *p = text;
+ const gchar *end;
+ const gchar *start = NULL;
+ const gchar *delimiter = NULL;
+
+ /* Only one character has type G_UNICODE_PARAGRAPH_SEPARATOR in
+ * Unicode 5.0; update the following code if that changes.
+ */
+
+ /* prev_sep is the first byte of the previous separator. Since
+ * the valid separators are \r, \n, and PARAGRAPH_SEPARATOR, the
+ * first byte is enough to identify it.
+ */
+ gchar prev_sep;
+
+
+ if (length < 0)
+ length = strlen (text);
+
+ end = text + length;
+
+ if (paragraph_delimiter_index)
+ *paragraph_delimiter_index = length;
+
+ if (next_paragraph_start)
+ *next_paragraph_start = length;
+
+ if (length == 0)
+ return;
+
+ prev_sep = 0;
+
+ while (p != end)
+ {
+ if (prev_sep == '\n' ||
+ prev_sep == PARAGRAPH_SEPARATOR_STRING[0])
+ {
+ g_assert (delimiter);
+ start = p;
+ break;
+ }
+ else if (prev_sep == '\r')
+ {
+ /* don't break between \r and \n */
+ if (*p != '\n')
+ {
+ g_assert (delimiter);
+ start = p;
+ break;
+ }
+ }
+
+ if (*p == '\n' ||
+ *p == '\r' ||
+ !strncmp(p, PARAGRAPH_SEPARATOR_STRING,
+ strlen(PARAGRAPH_SEPARATOR_STRING)))
+ {
+ if (delimiter == NULL)
+ delimiter = p;
+ prev_sep = *p;
+ }
+ else
+ prev_sep = 0;
+
+ p = g_utf8_next_char (p);
+ }
+
+ if (delimiter && paragraph_delimiter_index)
+ *paragraph_delimiter_index = delimiter - text;
+
+ if (start && next_paragraph_start)
+ *next_paragraph_start = start - text;
+}
+
+static int
+tailor_segment (const char *range_start,
+ const char *range_end,
+ PangoEngineLang *range_engine,
+ int chars_broken,
+ PangoAnalysis *analysis,
+ PangoLogAttr *log_attrs)
+{
+ int chars_in_range;
+ PangoLogAttr attr_before = log_attrs[0];
+
+ analysis->lang_engine = range_engine;
+ chars_in_range = g_utf8_strlen (range_start, range_end - range_start);
+
+
+ if (tailor_break (range_start,
+ range_end - range_start,
+ analysis,
+ log_attrs + chars_broken,
+ chars_in_range + 1))
+ {
+ /* if tailored, we enforce some of the attrs from before tailoring at
+ * the boundary
+ */
+
+ log_attrs[0].backspace_deletes_character = attr_before.backspace_deletes_character;
+
+ log_attrs[0].is_line_break |= attr_before.is_line_break;
+ log_attrs[0].is_mandatory_break |= attr_before.is_mandatory_break;
+ log_attrs[0].is_cursor_position |= attr_before.is_cursor_position;
+ }
+
+ return chars_in_range;
+}
+
+/**
+ * pango_get_log_attrs:
+ * @text: text to process
+ * @length: length in bytes of @text
+ * @level: embedding level, or -1 if unknown
+ * @language: language tag
+ * @log_attrs: array with one #PangoLogAttr per character in @text, plus one extra, to be filled in
+ * @attrs_len: length of @log_attrs array
+ *
+ * Computes a #PangoLogAttr for each character in @text. The @log_attrs
+ * array must have one #PangoLogAttr for each position in @text; if
+ * @text contains N characters, it has N+1 positions, including the
+ * last position at the end of the text. @text should be an entire
+ * paragraph; logical attributes can't be computed without context
+ * (for example you need to see spaces on either side of a word to know
+ * the word is a word).
+ */
+void
+pango_get_log_attrs (const char *text,
+ int length,
+ int level,
+ PangoLanguage *language,
+ PangoLogAttr *log_attrs,
+ int attrs_len)
+{
+ PangoMap *lang_map;
+ int chars_broken;
+ const char *range_start, *range_end;
+ PangoScript script;
+ PangoEngineLang *range_engine;
+ static guint engine_type_id = 0;
+ static guint render_type_id = 0;
+ PangoAnalysis analysis = { 0 };
+ PangoScriptIter *iter;
+
+ g_return_if_fail (length == 0 || text != NULL);
+ g_return_if_fail (log_attrs != NULL);
+
+ analysis.level = level;
+
+ pango_default_break (text, length, &analysis, log_attrs, attrs_len);
+
+ if (engine_type_id == 0)
+ {
+ engine_type_id = g_quark_from_static_string (PANGO_ENGINE_TYPE_LANG);
+ render_type_id = g_quark_from_static_string (PANGO_RENDER_TYPE_NONE);
+ }
+
+ lang_map = pango_find_map (language, engine_type_id, render_type_id);
+
+ chars_broken = 0;
+
+ iter = pango_script_iter_new (text, length);
+ pango_script_iter_get_range (iter, &range_start, &range_end, &script);
+ range_engine = (PangoEngineLang*) pango_map_get_engine (lang_map, script);
+ g_assert (range_start == text);
+
+ while (pango_script_iter_next (iter))
+ {
+ const char *run_start, *run_end;
+ PangoEngineLang* run_engine;
+
+ pango_script_iter_get_range (iter, &run_start, &run_end, &script);
+ run_engine = (PangoEngineLang*) pango_map_get_engine (lang_map, script);
+ g_assert (range_end == run_start);
+
+ if (range_engine != run_engine)
+ {
+ /* Engine has changed; do the tailoring for the current range,
+ * then start a new range.
+ */
+ chars_broken += tailor_segment (range_start, range_end, range_engine, chars_broken, &analysis, log_attrs);
+
+ range_start = run_start;
+ range_engine = run_engine;
+ }
+ range_end = run_end;
+ }
+ pango_script_iter_free (iter);
+
+ g_assert (length < 0 || range_end == text + length);
+
+ chars_broken += tailor_segment (range_start, range_end, range_engine, chars_broken, &analysis, log_attrs);
+
+ if (chars_broken + 1 < attrs_len)
+ g_warning ("pango_get_log_attrs: attrs_len should have been at least %d, but was %d. Expect corrupted memory.",
+ chars_broken + 1,
+ attrs_len);
+}