1 files changed, 2359 insertions, 0 deletions
diff --git a/pango2/break.c b/pango2/break.c
new file mode 100644
index 00000000..404f3058
--- /dev/null
+++ b/pango2/break.c
@@ -0,0 +1,2359 @@
+/* Pango2
+ * break.c:
+ *
+ * Copyright (C) 1999 Red Hat Software
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "config.h"
+
+#include "pango-break.h"
+#include "pango-script-private.h"
+#include "pango-emoji-private.h"
+#include "pango-attributes.h"
+#include "pango-attr-private.h"
+#include "pango-attr-list-private.h"
+#include "pango-attr-iterator-private.h"
+#include "pango-break-table.h"
+#include "pango-item-private.h"
+#include "pango-impl-utils.h"
+#include <string.h>
+
+/* {{{ Unicode line breaking and segmentation */
+
+#define PARAGRAPH_SEPARATOR 0x2029
+
+/* See http://www.unicode.org/unicode/reports/tr14/ if you hope
+ * to understand the line breaking code.
+ */
+
+typedef enum
+{
+  BREAK_ALREADY_HANDLED,   /* didn't use the table */
+  BREAK_PROHIBITED, /* no break, even if spaces intervene */
+  BREAK_IF_SPACES,  /* "indirect break" (only if there are spaces) */
+  BREAK_ALLOWED     /* "direct break" (can always break here) */
+  /* TR 14 has two more break-opportunity classes,
+   * "indirect break opportunity for combining marks following a space"
+   * and "prohibited break for combining marks"
+   * but we handle that inline in the code.
+   */
+} BreakOpportunity;
+
+/* need to sync the break range to glib/gunicode.h . */
+#define BREAK_TYPE_SAFE(btype)            \
+         ((btype) <= G_UNICODE_BREAK_ZERO_WIDTH_JOINER ? (btype) : G_UNICODE_BREAK_UNKNOWN)
+
+
+/*
+ * Hangul Conjoining Jamo handling.
+ *
+ * The way we implement it is just a bit different from TR14,
+ * but produces the same results.
+ * The same algorithm is also used in TR29 for cluster boundaries.
+ *
+ */
+
+
+/* An enum that works as the states of the Hangul syllables system.
+ **/
+typedef enum
+{
+  JAMO_L,       /* G_UNICODE_BREAK_HANGUL_L_JAMO */
+  JAMO_V,       /* G_UNICODE_BREAK_HANGUL_V_JAMO */
+  JAMO_T,       /* G_UNICODE_BREAK_HANGUL_T_JAMO */
+  JAMO_LV,      /* G_UNICODE_BREAK_HANGUL_LV_SYLLABLE */
+  JAMO_LVT,     /* G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE */
+  NO_JAMO       /* Other */
+} JamoType;
+
+/* There are Hangul syllables encoded as characters, that act like a
+ * sequence of Jamos. For each character we define a JamoType
+ * that the character starts with, and one that it ends with.  This
+ * decomposes JAMO_LV and JAMO_LVT to simple other JAMOs.  So for
+ * example, a character with LineBreak type
+ * G_UNICODE_BREAK_HANGUL_LV_SYLLABLE has start=JAMO_L and end=JAMO_V.
+ */
+typedef struct _CharJamoProps
+{
+  JamoType start, end;
+} CharJamoProps;
+
+/* Map from JamoType to CharJamoProps that hold only simple
+ * JamoTypes (no LV or LVT) or none.
+ */
+static const CharJamoProps HangulJamoProps[] = {
+  {JAMO_L, JAMO_L},     /* JAMO_L */
+  {JAMO_V, JAMO_V},     /* JAMO_V */
+  {JAMO_T, JAMO_T},     /* JAMO_T */
+  {JAMO_L, JAMO_V},     /* JAMO_LV */
+  {JAMO_L, JAMO_T},     /* JAMO_LVT */
+  {NO_JAMO, NO_JAMO}    /* NO_JAMO */
+};
+
+/* A character forms a syllable with the previous character if and only if:
+ * JamoType(this) is not NO_JAMO and:
+ *
+ * HangulJamoProps[JamoType(prev)].end and
+ * HangulJamoProps[JamoType(this)].start are equal,
+ * or the former is one less than the latter.
+ */
+
+#define IS_JAMO(btype)              \
+        ((btype >= G_UNICODE_BREAK_HANGUL_L_JAMO) && \
+         (btype <= G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE))
+#define JAMO_TYPE(btype)      \
+        (IS_JAMO(btype) ? (btype - G_UNICODE_BREAK_HANGUL_L_JAMO) : NO_JAMO)
+
+/* Types of Japanese characters */
+#define JAPANESE(wc) ((wc) >= 0x2F00 && (wc) <= 0x30FF)
+#define KANJI(wc)    ((wc) >= 0x2F00 && (wc) <= 0x2FDF)
+#define HIRAGANA(wc) ((wc) >= 0x3040 && (wc) <= 0x309F)
+#define KATAKANA(wc) ((wc) >= 0x30A0 && (wc) <= 0x30FF)
+
+#define LATIN(wc) (((wc) >= 0x0020 && (wc) <= 0x02AF) || ((wc) >= 0x1E00 && (wc) <= 0x1EFF))
+#define CYRILLIC(wc) (((wc) >= 0x0400 && (wc) <= 0x052F))
+#define GREEK(wc) (((wc) >= 0x0370 && (wc) <= 0x3FF) || ((wc) >= 0x1F00 && (wc) <= 0x1FFF))
+#define KANA(wc) ((wc) >= 0x3040 && (wc) <= 0x30FF)
+#define HANGUL(wc) ((wc) >= 0xAC00 && (wc) <= 0xD7A3)
+#define EMOJI(wc) (_pango2_Is_Emoji_Base_Character (wc))
+#define BACKSPACE_DELETES_CHARACTER(wc) (!LATIN (wc) && !CYRILLIC (wc) && !GREEK (wc) && !KANA (wc) && !HANGUL (wc) && !EMOJI (wc))
+
+/* Previously "123foo" was two words. But in UAX 29 of Unicode, 
+ * we know don't break words between consecutive letters and numbers
+ */
+typedef enum
+{
+  WordNone,
+  WordLetters,
+  WordNumbers
+} WordType;
+
+static void
+default_break (const char     *text,
+               int             length,
+               Pango2LogAttr  *attrs,
+               int             attrs_len G_GNUC_UNUSED)
+{
+  /* The rationale for all this is in section 5.15 of the Unicode 3.0 book,
+   * the line breaking stuff is also in TR14 on unicode.org
+   */
+
+  /* This is a default break implementation that should work for nearly all
+   * languages. Language engines can override it optionally.
+   */
+
+  /* FIXME one cheesy optimization here would be to memset attrs to 0
+   * before we start, and then never assign %FALSE to anything
+   */
+
+  const char *next;
+  int i;
+
+  gunichar prev_wc;
+  gunichar next_wc;
+
+  JamoType prev_jamo;
+
+  GUnicodeBreakType next_break_type;
+  GUnicodeBreakType prev_break_type;
+  GUnicodeBreakType prev_prev_break_type;
+
+  GUnicodeScript prev_script;
+
+  /* See Grapheme_Cluster_Break Property Values table of UAX#29 */
+  typedef enum
+  {
+    GB_Other,
+    GB_ControlCRLF,
+    GB_Extend,
+    GB_ZWJ,
+    GB_Prepend,
+    GB_SpacingMark,
+    GB_InHangulSyllable, /* Handles all of L, V, T, LV, LVT rules */
+    /* Use state machine to handle emoji sequence */
+    /* Rule GB12 and GB13 */
+    GB_RI_Odd, /* Meets odd number of RI */
+    GB_RI_Even, /* Meets even number of RI */
+  } GraphemeBreakType;
+  GraphemeBreakType prev_GB_type = GB_Other;
+  gboolean met_Extended_Pictographic = FALSE;
+
+  /* See Word_Break Property Values table of UAX#29 */
+  typedef enum
+  {
+    WB_Other,
+    WB_NewlineCRLF,
+    WB_ExtendFormat,
+    WB_Katakana,
+    WB_Hebrew_Letter,
+    WB_ALetter,
+    WB_MidNumLet,
+    WB_MidLetter,
+    WB_MidNum,
+    WB_Numeric,
+    WB_ExtendNumLet,
+    WB_RI_Odd,
+    WB_RI_Even,
+    WB_WSegSpace,
+  } WordBreakType;
+  WordBreakType prev_prev_WB_type = WB_Other, prev_WB_type = WB_Other;
+  int prev_WB_i = -1;
+
+  /* See Sentence_Break Property Values table of UAX#29 */
+  typedef enum
+  {
+    SB_Other,
+    SB_ExtendFormat,
+    SB_ParaSep,
+    SB_Sp,
+    SB_Lower,
+    SB_Upper,
+    SB_OLetter,
+    SB_Numeric,
+    SB_ATerm,
+    SB_SContinue,
+    SB_STerm,
+    SB_Close,
+    /* Rules SB8 and SB8a */
+    SB_ATerm_Close_Sp,
+    SB_STerm_Close_Sp,
+  } SentenceBreakType;
+  SentenceBreakType prev_prev_SB_type = SB_Other, prev_SB_type = SB_Other;
+  int prev_SB_i = -1;
+
+  /* Rule LB25 with Example 7 of Customization */
+  typedef enum
+  {
+    LB_Other,
+    LB_Numeric,
+    LB_Numeric_Close,
+    LB_RI_Odd,
+    LB_RI_Even,
+  } LineBreakType;
+  LineBreakType prev_LB_type = LB_Other;
+
+  WordType current_word_type = WordNone;
+  gunichar last_word_letter = 0;
+  gunichar base_character = 0;
+
+  int last_sentence_start = -1;
+  int last_non_space = -1;
+
+  gboolean prev_space_or_hyphen;
+
+  gboolean almost_done = FALSE;
+  gboolean done = FALSE;
+
+  g_return_if_fail (length == 0 || text != NULL);
+  g_return_if_fail (attrs != NULL);
+
+  next = text;
+
+  prev_break_type = G_UNICODE_BREAK_UNKNOWN;
+  prev_prev_break_type = G_UNICODE_BREAK_UNKNOWN;
+  prev_wc = 0;
+  prev_script = G_UNICODE_SCRIPT_COMMON;
+  prev_jamo = NO_JAMO;
+  prev_space_or_hyphen = FALSE;
+
+  if (length == 0 || *text == '\0')
+    {
+      next_wc = PARAGRAPH_SEPARATOR;
+      almost_done = TRUE;
+    }
+  else
+    next_wc = g_utf8_get_char (next);
+
+  next_break_type = g_unichar_break_type (next_wc);
+  next_break_type = BREAK_TYPE_SAFE (next_break_type);
+
+  for (i = 0; !done ; i++)
+    {
+      GUnicodeType type;
+      gunichar wc;
+      GUnicodeBreakType break_type;
+      GUnicodeBreakType row_break_type;
+      BreakOpportunity break_op;
+      JamoType jamo;
+      gboolean makes_hangul_syllable;
+
+      /* UAX#29 boundaries */
+      gboolean is_grapheme_boundary;
+      gboolean is_word_boundary;
+      gboolean is_sentence_boundary;
+
+      /* Emoji extended pictographics */
+      gboolean is_Extended_Pictographic;
+
+      GUnicodeScript script;
+
+      wc = next_wc;
+      break_type = next_break_type;
+
+      if (almost_done)
+        {
+          /*
+           * If we have already reached the end of @text g_utf8_next_char()
+           * may not increment next
+           */
+          next_wc = 0;
+          next_break_type = G_UNICODE_BREAK_UNKNOWN;
+          done = TRUE;
+        }
+      else
+        {
+          next = g_utf8_next_char (next);
+
+          if ((length >= 0 && next >= text + length) || *next == '\0')
+            {
+              /* This is how we fill in the last element (end position) of the
+               * attr array - assume there's a paragraph separators off the end
+               * of @text.
+               */
+              next_wc = PARAGRAPH_SEPARATOR;
+              almost_done = TRUE;
+            }
+          else
+            next_wc = g_utf8_get_char (next);
+
+          next_break_type = g_unichar_break_type (next_wc);
+          next_break_type = BREAK_TYPE_SAFE (next_break_type);
+        }
+
+      type = g_unichar_type (wc);
+      jamo = JAMO_TYPE (break_type);
+
+      /* Determine wheter this forms a Hangul syllable with prev. */
+      if (jamo == NO_JAMO)
+        makes_hangul_syllable = FALSE;
+      else
+        {
+          JamoType prev_end   = HangulJamoProps[prev_jamo].end  ;
+          JamoType this_start = HangulJamoProps[     jamo].start;
+
+          /* See comments before IS_JAMO */
+          makes_hangul_syllable = (prev_end == this_start) || (prev_end + 1 == this_start);
+        }
+
+      switch ((int)type)
+        {
+        case G_UNICODE_SPACE_SEPARATOR:
+        case G_UNICODE_LINE_SEPARATOR:
+        case G_UNICODE_PARAGRAPH_SEPARATOR:
+          attrs[i].is_white = TRUE;
+          break;
+        case G_UNICODE_CONTROL:
+          if (wc == '\t' || wc == '\n' || wc == '\r' || wc == '\f')
+            attrs[i].is_white = TRUE;
+          else
+            attrs[i].is_white = FALSE;
+          break;
+        default:
+          attrs[i].is_white = FALSE;
+          break;
+        }
+
+      /* Just few spaces have variable width. So explicitly mark them.
+       */
+      attrs[i].is_expandable_space = (0x0020 == wc || 0x00A0 == wc);
+      is_Extended_Pictographic =
+        _pango2_Is_Emoji_Extended_Pictographic (wc);
+
+
+      /* ---- UAX#29 Grapheme Boundaries ---- */
+      {
+        GraphemeBreakType GB_type;
+
+        /* Find the GraphemeBreakType of wc */
+        GB_type = GB_Other;
+        switch ((int)type)
+          {
+          case G_UNICODE_FORMAT:
+            if (G_UNLIKELY (wc == 0x200C))
+              {
+                GB_type = GB_Extend;
+                break;
+              }
+            if (G_UNLIKELY (wc == 0x200D))
+              {
+                GB_type = GB_ZWJ;
+                break;
+              }
+            if (G_UNLIKELY((wc >= 0x600 && wc <= 0x605) ||
+                            wc == 0x6DD ||
+                            wc == 0x70F ||
+                            wc == 0x8E2 ||
+                            wc == 0x110BD ||
+                            wc == 0x110CD))
+              {
+                GB_type = GB_Prepend;
+                break;
+              }
+            /* Tag chars */
+            if (wc >= 0xE0020 && wc <= 0xE00FF)
+              {
+                GB_type = GB_Extend;
+                break;
+              }
+            G_GNUC_FALLTHROUGH;
+          case G_UNICODE_CONTROL:
+          case G_UNICODE_LINE_SEPARATOR:
+          case G_UNICODE_PARAGRAPH_SEPARATOR:
+          case G_UNICODE_SURROGATE:
+            GB_type = GB_ControlCRLF;
+            break;
+
+          case G_UNICODE_UNASSIGNED:
+            /* Unassigned default ignorables */
+            if ((wc >= 0xFFF0 && wc <= 0xFFF8) ||
+                (wc >= 0xE0000 && wc <= 0xE0FFF))
+              {
+                GB_type = GB_ControlCRLF;
+                break;
+              }
+            G_GNUC_FALLTHROUGH;
+
+          case G_UNICODE_OTHER_LETTER:
+            if (makes_hangul_syllable)
+              GB_type = GB_InHangulSyllable;
+
+            if (_pango2_is_Consonant_Preceding_Repha (wc) ||
+                _pango2_is_Consonant_Prefixed (wc))
+              GB_type = GB_Prepend;
+            break;
+
+          case G_UNICODE_MODIFIER_LETTER:
+            if (wc >= 0xFF9E && wc <= 0xFF9F)
+              GB_type = GB_Extend; /* Other_Grapheme_Extend */
+            break;
+
+          case G_UNICODE_SPACING_MARK:
+            GB_type = GB_SpacingMark; /* SpacingMark */
+            if (wc >= 0x0900)
+              {
+                if (wc == 0x09BE || wc == 0x09D7 ||
+                    wc == 0x0B3E || wc == 0x0B57 || wc == 0x0BBE || wc == 0x0BD7 ||
+                    wc == 0x0CC2 || wc == 0x0CD5 || wc == 0x0CD6 ||
+                    wc == 0x0D3E || wc == 0x0D57 || wc == 0x0DCF || wc == 0x0DDF ||
+                    wc == 0x1D165 || (wc >= 0x1D16E && wc <= 0x1D172))
+                  GB_type = GB_Extend; /* Other_Grapheme_Extend */
+              }
+            break;
+
+          case G_UNICODE_ENCLOSING_MARK:
+          case G_UNICODE_NON_SPACING_MARK:
+            GB_type = GB_Extend; /* Grapheme_Extend */
+            break;
+
+          case G_UNICODE_OTHER_SYMBOL:
+            if (G_UNLIKELY(wc >=0x1F1E6 && wc <=0x1F1FF))
+              {
+                if (prev_GB_type == GB_RI_Odd)
+                  GB_type = GB_RI_Even;
+                else
+                  GB_type = GB_RI_Odd;
+                break;
+              }
+            break;
+
+          case G_UNICODE_MODIFIER_SYMBOL:
+            /* Fitzpatrick modifiers */
+            if (wc >= 0x1F3FB && wc <= 0x1F3FF)
+              GB_type = GB_Extend;
+            break;
+
+          default:
+            break;
+          }
+
+        /* Rule GB11 */
+        if (met_Extended_Pictographic)
+          {
+            if (GB_type == GB_Extend)
+              met_Extended_Pictographic = TRUE;
+            else if (_pango2_Is_Emoji_Extended_Pictographic (prev_wc) &&
+                     GB_type == GB_ZWJ)
+              met_Extended_Pictographic = TRUE;
+            else if (prev_GB_type == GB_Extend && GB_type == GB_ZWJ)
+              met_Extended_Pictographic = TRUE;
+            else if (prev_GB_type == GB_ZWJ && is_Extended_Pictographic)
+              met_Extended_Pictographic = TRUE;
+            else
+              met_Extended_Pictographic = FALSE;
+          }
+
+        /* Grapheme Cluster Boundary Rules */
+        is_grapheme_boundary = TRUE; /* Rule GB999 */
+
+        /* We apply Rules GB1 and GB2 at the end of the function */
+        if (wc == '\n' && prev_wc == '\r')
+          is_grapheme_boundary = FALSE; /* Rule GB3 */
+        else if (prev_GB_type == GB_ControlCRLF || GB_type == GB_ControlCRLF)
+          is_grapheme_boundary = TRUE; /* Rules GB4 and GB5 */
+        else if (GB_type == GB_InHangulSyllable)
+          is_grapheme_boundary = FALSE; /* Rules GB6, GB7, GB8 */
+        else if (GB_type == GB_Extend)
+          is_grapheme_boundary = FALSE; /* Rule GB9 */
+        else if (GB_type == GB_ZWJ)
+          is_grapheme_boundary = FALSE; /* Rule GB9 */
+        else if (GB_type == GB_SpacingMark)
+          is_grapheme_boundary = FALSE; /* Rule GB9a */
+        else if (prev_GB_type == GB_Prepend)
+          is_grapheme_boundary = FALSE; /* Rule GB9b */
+        else if (is_Extended_Pictographic)
+          { /* Rule GB11 */
+            if (prev_GB_type == GB_ZWJ && met_Extended_Pictographic)
+              is_grapheme_boundary = FALSE;
+          }
+        else if (prev_GB_type == GB_RI_Odd && GB_type == GB_RI_Even)
+          is_grapheme_boundary = FALSE; /* Rule GB12 and GB13 */
+
+        if (is_Extended_Pictographic)
+          met_Extended_Pictographic = TRUE;
+
+        attrs[i].is_cursor_position = is_grapheme_boundary;
+        /* If this is a grapheme boundary, we have to decide if backspace
+         * deletes a character or the whole grapheme cluster */
+        if (is_grapheme_boundary)
+          {
+            attrs[i].backspace_deletes_character = BACKSPACE_DELETES_CHARACTER (base_character);
+
+            /* Dependent Vowels for Indic language */
+            if (_pango2_is_Virama (prev_wc) ||
+                _pango2_is_Vowel_Dependent (prev_wc))
+              attrs[i].backspace_deletes_character = TRUE;
+          }
+        else
+          attrs[i].backspace_deletes_character = FALSE;
+
+        prev_GB_type = GB_type;
+      }
+
+      script = g_unichar_get_script (wc);
+      /* ---- UAX#29 Word Boundaries ---- */
+      {
+        is_word_boundary = FALSE;
+        if (is_grapheme_boundary ||
+            G_UNLIKELY(wc >=0x1F1E6 && wc <=0x1F1FF)) /* Rules WB3 and WB4 */
+          {
+            WordBreakType WB_type;
+
+            /* Find the WordBreakType of wc */
+            WB_type = WB_Other;
+
+            if (script == G_UNICODE_SCRIPT_KATAKANA)
+              WB_type = WB_Katakana;
+
+            if (script == G_UNICODE_SCRIPT_HEBREW && type == G_UNICODE_OTHER_LETTER)
+              WB_type = WB_Hebrew_Letter;
+
+            if (WB_type == WB_Other)
+              switch (wc >> 8)
+                {
+                case 0x30:
+                  if (wc == 0x3031 || wc == 0x3032 || wc == 0x3033 || wc == 0x3034 || wc == 0x3035 ||
+                      wc == 0x309b || wc == 0x309c || wc == 0x30a0 || wc == 0x30fc)
+                    WB_type = WB_Katakana; /* Katakana exceptions */
+                  break;
+                case 0xFF:
+                  if (wc == 0xFF70)
+                    WB_type = WB_Katakana; /* Katakana exceptions */
+                  else if (wc >= 0xFF9E && wc <= 0xFF9F)
+                    WB_type = WB_ExtendFormat; /* Other_Grapheme_Extend */
+                  break;
+                case 0x05:
+                  if (wc == 0x058A)
+                    WB_type = WB_ALetter; /* ALetter exceptions */
+                  break;
+                default:
+                  break;
+                }
+
+            if (WB_type == WB_Other)
+              switch ((int) break_type)
+                {
+                case G_UNICODE_BREAK_NUMERIC:
+                  if (wc != 0x066C)
+                    WB_type = WB_Numeric; /* Numeric */
+                  break;
+                case G_UNICODE_BREAK_INFIX_SEPARATOR:
+                  if (wc != 0x003A && wc != 0xFE13 && wc != 0x002E)
+                    WB_type = WB_MidNum; /* MidNum */
+                  break;
+                default:
+                  break;
+                }
+
+            if (WB_type == WB_Other)
+              switch ((int) type)
+                {
+                case G_UNICODE_CONTROL:
+                  if (wc != 0x000D && wc != 0x000A && wc != 0x000B && wc != 0x000C && wc != 0x0085)
+                    break;
+                  G_GNUC_FALLTHROUGH;
+                case G_UNICODE_LINE_SEPARATOR:
+                case G_UNICODE_PARAGRAPH_SEPARATOR:
+                  WB_type = WB_NewlineCRLF; /* CR, LF, Newline */
+                  break;
+
+                case G_UNICODE_FORMAT:
+                case G_UNICODE_SPACING_MARK:
+                case G_UNICODE_ENCLOSING_MARK:
+                case G_UNICODE_NON_SPACING_MARK:
+                  WB_type = WB_ExtendFormat; /* Extend, Format */
+                  break;
+
+                case G_UNICODE_CONNECT_PUNCTUATION:
+                  WB_type = WB_ExtendNumLet; /* ExtendNumLet */
+                  break;
+
+                case G_UNICODE_INITIAL_PUNCTUATION:
+                case G_UNICODE_FINAL_PUNCTUATION:
+                  if (wc == 0x2018 || wc == 0x2019)
+                    WB_type = WB_MidNumLet; /* MidNumLet */
+                  break;
+                case G_UNICODE_OTHER_PUNCTUATION:
+                  if ((wc >= 0x055a && wc <= 0x055c) ||
+                      wc == 0x055e || wc == 0x05f3)
+                    WB_type = WB_ALetter; /* ALetter */
+                  else if (wc == 0x0027 || wc == 0x002e || wc == 0x2024 ||
+                      wc == 0xfe52 || wc == 0xff07 || wc == 0xff0e)
+                    WB_type = WB_MidNumLet; /* MidNumLet */
+                  else if (wc == 0x00b7 || wc == 0x05f4 || wc == 0x2027 ||
+                           wc == 0x003a || wc == 0x0387 || wc == 0x055f ||
+                           wc == 0xfe13 || wc == 0xfe55 || wc == 0xff1a)
+                    WB_type = WB_MidLetter; /* MidLetter */
+                  else if (wc == 0x066c ||
+                           wc == 0xfe50 || wc == 0xfe54 || wc == 0xff0c || wc == 0xff1b)
+                    WB_type = WB_MidNum; /* MidNum */
+                  break;
+
+                case G_UNICODE_OTHER_SYMBOL:
+                  if (wc >= 0x24B6 && wc <= 0x24E9) /* Other_Alphabetic */
+                    goto Alphabetic;
+
+                  if (G_UNLIKELY(wc >= 0x1F1E6 && wc <= 0x1F1FF))
+                    {
+                      if (prev_WB_type == WB_RI_Odd)
+                        WB_type = WB_RI_Even;
+                      else
+                        WB_type = WB_RI_Odd;
+                    }
+
+                  break;
+
+                case G_UNICODE_OTHER_LETTER:
+                case G_UNICODE_LETTER_NUMBER:
+                  if (wc == 0x3006 || wc == 0x3007 ||
+                      (wc >= 0x3021 && wc <= 0x3029) ||
+                      (wc >= 0x3038 && wc <= 0x303A) ||
+                      (wc >= 0x3400 && wc <= 0x4DB5) ||
+                      (wc >= 0x4E00 && wc <= 0x9FC3) ||
+                      (wc >= 0xF900 && wc <= 0xFA2D) ||
+                      (wc >= 0xFA30 && wc <= 0xFA6A) ||
+                      (wc >= 0xFA70 && wc <= 0xFAD9) ||
+                      (wc >= 0x20000 && wc <= 0x2A6D6) ||
+                      (wc >= 0x2F800 && wc <= 0x2FA1D))
+                    break; /* ALetter exceptions: Ideographic */
+                  goto Alphabetic;
+
+                case G_UNICODE_LOWERCASE_LETTER:
+                case G_UNICODE_MODIFIER_LETTER:
+                case G_UNICODE_TITLECASE_LETTER:
+                case G_UNICODE_UPPERCASE_LETTER:
+                Alphabetic:
+                  if (break_type != G_UNICODE_BREAK_COMPLEX_CONTEXT && script != G_UNICODE_SCRIPT_HIRAGANA)
+                    WB_type = WB_ALetter; /* ALetter */
+                  break;
+                default:
+                  break;
+                }
+
+            if (WB_type == WB_Other)
+              {
+                if (type == G_UNICODE_SPACE_SEPARATOR &&
+                    break_type != G_UNICODE_BREAK_NON_BREAKING_GLUE)
+                  WB_type = WB_WSegSpace;
+              }
+
+            /* Word Cluster Boundary Rules */
+
+            /* We apply Rules WB1 and WB2 at the end of the function */
+
+            if (prev_wc == 0x3031 && wc == 0x41)
+              g_debug ("Y %d %d", prev_WB_type, WB_type);
+            if (prev_WB_type == WB_NewlineCRLF && prev_WB_i + 1 == i)
+              {
+                /* The extra check for prev_WB_i is to correctly handle sequences like
+                 * Newline ÷ Extend × Extend
+                 * since we have not skipped ExtendFormat yet.
+                 */
+                is_word_boundary = TRUE; /* Rule WB3a */
+              }
+            else if (WB_type == WB_NewlineCRLF)
+              is_word_boundary = TRUE; /* Rule WB3b */
+            else if (prev_wc == 0x200D && is_Extended_Pictographic)
+              is_word_boundary = FALSE; /* Rule WB3c */
+            else if (prev_WB_type == WB_WSegSpace &&
+                     WB_type == WB_WSegSpace && prev_WB_i + 1 == i)
+              is_word_boundary = FALSE; /* Rule WB3d */
+            else if (WB_type == WB_ExtendFormat)
+              is_word_boundary = FALSE; /* Rules WB4? */
+            else if ((prev_WB_type == WB_ALetter  ||
+                  prev_WB_type == WB_Hebrew_Letter ||
+                  prev_WB_type == WB_Numeric) &&
+                 (WB_type == WB_ALetter  ||
+                  WB_type == WB_Hebrew_Letter ||
+                  WB_type == WB_Numeric))
+              is_word_boundary = FALSE; /* Rules WB5, WB8, WB9, WB10 */
+            else if (prev_WB_type == WB_Katakana && WB_type == WB_Katakana)
+              is_word_boundary = FALSE; /* Rule WB13 */
+            else if ((prev_WB_type == WB_ALetter ||
+                  prev_WB_type == WB_Hebrew_Letter ||
+                  prev_WB_type == WB_Numeric ||
+                  prev_WB_type == WB_Katakana ||
+                  prev_WB_type == WB_ExtendNumLet) &&
+                 WB_type == WB_ExtendNumLet)
+              is_word_boundary = FALSE; /* Rule WB13a */
+            else if (prev_WB_type == WB_ExtendNumLet &&
+                 (WB_type == WB_ALetter ||
+                  WB_type == WB_Hebrew_Letter ||
+                  WB_type == WB_Numeric ||
+                  WB_type == WB_Katakana))
+              is_word_boundary = FALSE; /* Rule WB13b */
+            else if (((prev_prev_WB_type == WB_ALetter ||
+                   prev_prev_WB_type == WB_Hebrew_Letter) &&
+                  (WB_type == WB_ALetter ||
+                   WB_type == WB_Hebrew_Letter)) &&
+                     (prev_WB_type == WB_MidLetter ||
+              prev_WB_type == WB_MidNumLet ||
+              prev_wc == 0x0027))
+              {
+                attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB6 */
+                is_word_boundary = FALSE; /* Rule WB7 */
+              }
+            else if (prev_WB_type == WB_Hebrew_Letter && wc == 0x0027)
+          is_word_boundary = FALSE; /* Rule WB7a */
+            else if (prev_prev_WB_type == WB_Hebrew_Letter && prev_wc == 0x0022 &&
+                 WB_type == WB_Hebrew_Letter) {
+          attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB7b */
+          is_word_boundary = FALSE; /* Rule WB7c */
+        }
+            else if ((prev_prev_WB_type == WB_Numeric && WB_type == WB_Numeric) &&
+                 (prev_WB_type == WB_MidNum || prev_WB_type == WB_MidNumLet ||
+                  prev_wc == 0x0027))
+              {
+                is_word_boundary = FALSE; /* Rule WB11 */
+                attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB12 */
+              }
+            else if (prev_WB_type == WB_RI_Odd && WB_type == WB_RI_Even)
+              is_word_boundary = FALSE; /* Rule WB15 and WB16 */
+            else
+              is_word_boundary = TRUE; /* Rule WB999 */
+
+            if (WB_type != WB_ExtendFormat)
+              {
+                prev_prev_WB_type = prev_WB_type;
+                prev_WB_type = WB_type;
+                prev_WB_i = i;
+              }
+          }
+
+        attrs[i].is_word_boundary = is_word_boundary;
+      }
+
+      /* ---- UAX#29 Sentence Boundaries ---- */
+      {
+        is_sentence_boundary = FALSE;
+        if (is_word_boundary ||
+            wc == '\r' || wc == '\n') /* Rules SB3 and SB5 */
+          {
+            SentenceBreakType SB_type;
+
+            /* Find the SentenceBreakType of wc */
+            SB_type = SB_Other;
+
+            if (break_type == G_UNICODE_BREAK_NUMERIC)
+              SB_type = SB_Numeric; /* Numeric */
+
+            if (SB_type == SB_Other)
+              switch ((int) type)
+                {
+                case G_UNICODE_CONTROL:
+                  if (wc == '\r' || wc == '\n')
+                    SB_type = SB_ParaSep;
+                  else if (wc == 0x0009 || wc == 0x000B || wc == 0x000C)
+                    SB_type = SB_Sp;
+                  else if (wc == 0x0085)
+                    SB_type = SB_ParaSep;
+                  break;
+
+                case G_UNICODE_SPACE_SEPARATOR:
+                  if (wc == 0x0020 || wc == 0x00A0 || wc == 0x1680 ||
+                      (wc >= 0x2000 && wc <= 0x200A) ||
+                      wc == 0x202F || wc == 0x205F || wc == 0x3000)
+                    SB_type = SB_Sp;
+                  break;
+
+                case G_UNICODE_LINE_SEPARATOR:
+                case G_UNICODE_PARAGRAPH_SEPARATOR:
+                  SB_type = SB_ParaSep;
+                  break;
+
+                case G_UNICODE_FORMAT:
+                case G_UNICODE_SPACING_MARK:
+                case G_UNICODE_ENCLOSING_MARK:
+                case G_UNICODE_NON_SPACING_MARK:
+                  SB_type = SB_ExtendFormat; /* Extend, Format */
+                  break;
+
+                case G_UNICODE_MODIFIER_LETTER:
+                  if (wc >= 0xFF9E && wc <= 0xFF9F)
+                    SB_type = SB_ExtendFormat; /* Other_Grapheme_Extend */
+                  break;
+
+                case G_UNICODE_TITLECASE_LETTER:
+                  SB_type = SB_Upper;
+                  break;
+
+                case G_UNICODE_DASH_PUNCTUATION:
+                  if (wc == 0x002D ||
+                      (wc >= 0x2013 && wc <= 0x2014) ||
+                      (wc >= 0xFE31 && wc <= 0xFE32) ||
+                      wc == 0xFE58 ||
+                      wc == 0xFE63 ||
+                      wc == 0xFF0D)
+                    SB_type = SB_SContinue;
+                  break;
+
+                case G_UNICODE_OTHER_PUNCTUATION:
+                  if (wc == 0x05F3)
+                    SB_type = SB_OLetter;
+                  else if (wc == 0x002E || wc == 0x2024 ||
+                      wc == 0xFE52 || wc == 0xFF0E)
+                    SB_type = SB_ATerm;
+
+                  if (wc == 0x002C ||
+                      wc == 0x003A ||
+                      wc == 0x055D ||
+                      (wc >= 0x060C && wc <= 0x060D) ||
+                      wc == 0x07F8 ||
+                      wc == 0x1802 ||
+                      wc == 0x1808 ||
+                      wc == 0x3001 ||
+                      (wc >= 0xFE10 && wc <= 0xFE11) ||
+                      wc == 0xFE13 ||
+                      (wc >= 0xFE50 && wc <= 0xFE51) ||
+                      wc == 0xFE55 ||
+                      wc == 0xFF0C ||
+                      wc == 0xFF1A ||
+                      wc == 0xFF64)
+                    SB_type = SB_SContinue;
+
+                  if (_pango2_is_STerm(wc))
+                    SB_type = SB_STerm;
+
+                  break;
+
+                default:
+                  break;
+                }
+
+            if (SB_type == SB_Other)
+              {
+                if (type == G_UNICODE_LOWERCASE_LETTER)
+                  SB_type = SB_Lower;
+                else if (type == G_UNICODE_UPPERCASE_LETTER)
+                  SB_type = SB_Upper;
+                else if (type == G_UNICODE_TITLECASE_LETTER ||
+                         type == G_UNICODE_MODIFIER_LETTER ||
+                         type == G_UNICODE_OTHER_LETTER)
+                  SB_type = SB_OLetter;
+
+                if (type == G_UNICODE_OPEN_PUNCTUATION ||
+                    type == G_UNICODE_CLOSE_PUNCTUATION ||
+                    break_type == G_UNICODE_BREAK_QUOTATION)
+                  SB_type = SB_Close;
+              }
+
+            /* Sentence Boundary Rules */
+
+            /* We apply Rules SB1 and SB2 at the end of the function */
+
+#define IS_OTHER_TERM(SB_type)                                          \
+            /* not in (OLetter | Upper | Lower | ParaSep | SATerm) */   \
+              !(SB_type == SB_OLetter ||                                \
+                SB_type == SB_Upper || SB_type == SB_Lower ||           \
+                SB_type == SB_ParaSep ||                                \
+                SB_type == SB_ATerm || SB_type == SB_STerm ||           \
+                SB_type == SB_ATerm_Close_Sp ||                         \
+                SB_type == SB_STerm_Close_Sp)
+
+
+            if (wc == '\n' && prev_wc == '\r')
+              is_sentence_boundary = FALSE; /* Rule SB3 */
+            else if (prev_SB_type == SB_ParaSep && prev_SB_i + 1 == i)
+              {
+                /* The extra check for prev_SB_i is to correctly handle sequences like
+                 * ParaSep ÷ Extend × Extend
+                 * since we have not skipped ExtendFormat yet.
+                 */
+
+                is_sentence_boundary = TRUE; /* Rule SB4 */
+              }
+            else if (SB_type == SB_ExtendFormat)
+              is_sentence_boundary = FALSE; /* Rule SB5? */
+            else if (prev_SB_type == SB_ATerm && SB_type == SB_Numeric)
+              is_sentence_boundary = FALSE; /* Rule SB6 */
+            else if ((prev_prev_SB_type == SB_Upper ||
+                      prev_prev_SB_type == SB_Lower) &&
+                     prev_SB_type == SB_ATerm &&
+                     SB_type == SB_Upper)
+              is_sentence_boundary = FALSE; /* Rule SB7 */
+            else if (prev_SB_type == SB_ATerm && SB_type == SB_Close)
+                SB_type = SB_ATerm;
+            else if (prev_SB_type == SB_STerm && SB_type == SB_Close)
+              SB_type = SB_STerm;
+            else if (prev_SB_type == SB_ATerm && SB_type == SB_Sp)
+              SB_type = SB_ATerm_Close_Sp;
+            else if (prev_SB_type == SB_STerm && SB_type == SB_Sp)
+              SB_type = SB_STerm_Close_Sp;
+            /* Rule SB8 */
+            else if ((prev_SB_type == SB_ATerm ||
+                      prev_SB_type == SB_ATerm_Close_Sp) &&
+                     SB_type == SB_Lower)
+              is_sentence_boundary = FALSE;
+            else if ((prev_prev_SB_type == SB_ATerm ||
+                      prev_prev_SB_type == SB_ATerm_Close_Sp) &&
+                     IS_OTHER_TERM(prev_SB_type) &&
+                     SB_type == SB_Lower)
+              {
+                attrs[prev_SB_i].is_sentence_boundary = FALSE;
+                attrs[prev_SB_i].is_sentence_end = FALSE;
+                last_sentence_start = -1;
+                for (int j = prev_SB_i - 1; j >= 0; j--)
+                  {
+                    attrs[j].is_sentence_end = FALSE;
+                    if (attrs[j].is_sentence_boundary)
+                      {
+                        last_sentence_start = j;
+                        break;
+                      }
+                  }
+              }
+            else if ((prev_SB_type == SB_ATerm ||
+                      prev_SB_type == SB_ATerm_Close_Sp ||
+                      prev_SB_type == SB_STerm ||
+                      prev_SB_type == SB_STerm_Close_Sp) &&
+                     (SB_type == SB_SContinue ||
+                      SB_type == SB_ATerm || SB_type == SB_STerm))
+              is_sentence_boundary = FALSE; /* Rule SB8a */
+            else if ((prev_SB_type == SB_ATerm ||
+                      prev_SB_type == SB_STerm) &&
+                     (SB_type == SB_Close || SB_type == SB_Sp ||
+                      SB_type == SB_ParaSep))
+              is_sentence_boundary = FALSE; /* Rule SB9 */
+            else if ((prev_SB_type == SB_ATerm ||
+                      prev_SB_type == SB_ATerm_Close_Sp ||
+                      prev_SB_type == SB_STerm ||
+                      prev_SB_type == SB_STerm_Close_Sp) &&
+                     (SB_type == SB_Sp || SB_type == SB_ParaSep))
+              is_sentence_boundary = FALSE; /* Rule SB10 */
+            else if ((prev_SB_type == SB_ATerm ||
+                      prev_SB_type == SB_ATerm_Close_Sp ||
+                      prev_SB_type == SB_STerm ||
+                      prev_SB_type == SB_STerm_Close_Sp) &&
+                     SB_type != SB_ParaSep)
+              is_sentence_boundary = TRUE; /* Rule SB11 */
+            else
+              is_sentence_boundary = FALSE; /* Rule SB998 */
+
+            if (SB_type != SB_ExtendFormat &&
+                !((prev_prev_SB_type == SB_ATerm ||
+                   prev_prev_SB_type == SB_ATerm_Close_Sp) &&
+                  IS_OTHER_TERM(prev_SB_type) &&
+                  IS_OTHER_TERM(SB_type)))
+              {
+                prev_prev_SB_type = prev_SB_type;
+                prev_SB_type = SB_type;
+                prev_SB_i = i;
+              }
+
+#undef IS_OTHER_TERM
+
+          }
+
+        if (i == 0 || done)
+          is_sentence_boundary = TRUE; /* Rules SB1 and SB2 */
+
+        attrs[i].is_sentence_boundary = is_sentence_boundary;
+      }
+
+      /* ---- Line breaking ---- */
+
+      break_op = BREAK_ALREADY_HANDLED;
+
+      row_break_type = prev_break_type == G_UNICODE_BREAK_SPACE ?
+        prev_prev_break_type : prev_break_type;
+      g_assert (row_break_type != G_UNICODE_BREAK_SPACE);
+
+      attrs[i].is_char_break = FALSE;
+      attrs[i].is_line_break = FALSE;
+      attrs[i].is_mandatory_break = FALSE;
+
+      /* Rule LB1:
+         assign a line breaking class to each code point of the input. */
+      switch ((int)break_type)
+        {
+        case G_UNICODE_BREAK_AMBIGUOUS:
+        case G_UNICODE_BREAK_SURROGATE:
+        case G_UNICODE_BREAK_UNKNOWN:
+          break_type = G_UNICODE_BREAK_ALPHABETIC;
+          break;
+
+        case G_UNICODE_BREAK_COMPLEX_CONTEXT:
+          if (type == G_UNICODE_NON_SPACING_MARK ||
+              type == G_UNICODE_SPACING_MARK)
+            break_type = G_UNICODE_BREAK_COMBINING_MARK;
+          else
+            break_type = G_UNICODE_BREAK_ALPHABETIC;
+          break;
+
+        case G_UNICODE_BREAK_CONDITIONAL_JAPANESE_STARTER:
+          break_type = G_UNICODE_BREAK_NON_STARTER;
+          break;
+
+        default:
+          break;
+        }
+
+      /* If it's not a grapheme boundary, it's not a line break either */
+      if (attrs[i].is_cursor_position ||
+          break_type == G_UNICODE_BREAK_COMBINING_MARK ||
+          break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER ||
+          break_type == G_UNICODE_BREAK_HANGUL_L_JAMO ||
+          break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
+          break_type == G_UNICODE_BREAK_HANGUL_T_JAMO ||
+          break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE ||
+          break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE ||
+          break_type == G_UNICODE_BREAK_EMOJI_MODIFIER ||
+          break_type == G_UNICODE_BREAK_REGIONAL_INDICATOR)
+        {
+          LineBreakType LB_type;
+
+          /* Find the LineBreakType of wc */
+          LB_type = LB_Other;
+
+          if (break_type == G_UNICODE_BREAK_NUMERIC)
+            LB_type = LB_Numeric;
+
+          if (break_type == G_UNICODE_BREAK_SYMBOL ||
+              break_type == G_UNICODE_BREAK_INFIX_SEPARATOR)
+            {
+              if (!(prev_LB_type == LB_Numeric))
+                LB_type = LB_Other;
+            }
+
+          if (break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION ||
+              break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS)
+            {
+              if (prev_LB_type == LB_Numeric)
+                LB_type = LB_Numeric_Close;
+              else
+                LB_type = LB_Other;
+            }
+
+          if (break_type == G_UNICODE_BREAK_REGIONAL_INDICATOR)
+            {
+              if (prev_LB_type == LB_RI_Odd)
+                LB_type = LB_RI_Even;
+              else
+                LB_type = LB_RI_Odd;
+            }
+
+          attrs[i].is_line_break = TRUE; /* Rule LB31 */
+          /* Unicode doesn't specify char wrap;
+             we wrap around all chars currently. */
+          if (attrs[i].is_cursor_position)
+            attrs[i].is_char_break = TRUE;
+
+          /* Make any necessary replacements first */
+          if (row_break_type == G_UNICODE_BREAK_UNKNOWN)
+            row_break_type = G_UNICODE_BREAK_ALPHABETIC;
+
+          /* add the line break rules in reverse order to override
+             the lower priority rules. */
+
+          /* Rule LB30 */
+          if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC ||
+               prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER ||
+               prev_break_type == G_UNICODE_BREAK_NUMERIC) &&
+              break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION &&
+              !_pango2_is_EastAsianWide (wc))
+            break_op = BREAK_PROHIBITED;
+
+          if (prev_break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS &&
+              !_pango2_is_EastAsianWide (prev_wc)&&
+              (break_type == G_UNICODE_BREAK_ALPHABETIC ||
+               break_type == G_UNICODE_BREAK_HEBREW_LETTER ||
+               break_type == G_UNICODE_BREAK_NUMERIC))
+            break_op = BREAK_PROHIBITED;
+
+          /* Rule LB30a */
+          if (prev_LB_type == LB_RI_Odd && LB_type == LB_RI_Even)
+            break_op = BREAK_PROHIBITED;
+
+          /* Rule LB30b */
+          if (prev_break_type == G_UNICODE_BREAK_EMOJI_BASE &&
+              break_type == G_UNICODE_BREAK_EMOJI_MODIFIER)
+            break_op = BREAK_PROHIBITED;
+
+          if ((_pango2_Is_Emoji_Extended_Pictographic (prev_wc) &&
+               g_unichar_type (prev_wc) == G_UNICODE_UNASSIGNED) &&
+              break_type == G_UNICODE_BREAK_EMOJI_MODIFIER)
+            break_op = BREAK_PROHIBITED;
+
+          /* Rule LB29 */
+          if (prev_break_type == G_UNICODE_BREAK_INFIX_SEPARATOR &&
+              (break_type == G_UNICODE_BREAK_ALPHABETIC ||
+               break_type == G_UNICODE_BREAK_HEBREW_LETTER))
+            break_op = BREAK_PROHIBITED;
+
+          /* Rule LB28 */
+          if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC ||
+               prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER) &&
+              (break_type == G_UNICODE_BREAK_ALPHABETIC ||
+               break_type == G_UNICODE_BREAK_HEBREW_LETTER))
+            break_op = BREAK_PROHIBITED;
+
+          /* Rule LB27 */
+          if ((prev_break_type == G_UNICODE_BREAK_HANGUL_L_JAMO ||
+               prev_break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
+               prev_break_type == G_UNICODE_BREAK_HANGUL_T_JAMO ||
+               prev_break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE ||
+               prev_break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE) &&
+              break_type == G_UNICODE_BREAK_POSTFIX)
+            break_op = BREAK_PROHIBITED;
+
+          if (prev_break_type == G_UNICODE_BREAK_PREFIX &&
+              (break_type == G_UNICODE_BREAK_HANGUL_L_JAMO ||
+               break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
+               break_type == G_UNICODE_BREAK_HANGUL_T_JAMO ||
+               break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE ||
+               break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE))
+            break_op = BREAK_PROHIBITED;
+
+          /* Rule LB26 */
+          if (prev_break_type == G_UNICODE_BREAK_HANGUL_L_JAMO &&
+              (break_type == G_UNICODE_BREAK_HANGUL_L_JAMO ||
+               break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
+               break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE ||
+               break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE))
+            break_op = BREAK_PROHIBITED;
+
+          if ((prev_break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
+               prev_break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE) &&
+              (break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
+               break_type == G_UNICODE_BREAK_HANGUL_T_JAMO))
+            break_op = BREAK_PROHIBITED;
+
+          if ((prev_break_type == G_UNICODE_BREAK_HANGUL_T_JAMO ||
+               prev_break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE) &&
+              break_type == G_UNICODE_BREAK_HANGUL_T_JAMO)
+            break_op = BREAK_PROHIBITED;
+
+          /* Rule LB25 with Example 7 of Customization */
+          if ((prev_break_type == G_UNICODE_BREAK_PREFIX ||
+               prev_break_type == G_UNICODE_BREAK_POSTFIX) &&
+              break_type == G_UNICODE_BREAK_NUMERIC)
+            break_op = BREAK_PROHIBITED;
+
+          if ((prev_break_type == G_UNICODE_BREAK_PREFIX ||
+               prev_break_type == G_UNICODE_BREAK_POSTFIX) &&
+              (break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION ||
+               break_type == G_UNICODE_BREAK_HYPHEN) &&
+              next_break_type == G_UNICODE_BREAK_NUMERIC)
+            break_op = BREAK_PROHIBITED;
+
+          if ((prev_break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION ||
+               prev_break_type == G_UNICODE_BREAK_HYPHEN) &&
+              break_type == G_UNICODE_BREAK_NUMERIC)
+            break_op = BREAK_PROHIBITED;
+
+          if (prev_break_type == G_UNICODE_BREAK_NUMERIC &&
+              (break_type == G_UNICODE_BREAK_NUMERIC ||
+               break_type == G_UNICODE_BREAK_SYMBOL ||
+               break_type == G_UNICODE_BREAK_INFIX_SEPARATOR))
+            break_op = BREAK_PROHIBITED;
+
+          if (prev_LB_type == LB_Numeric &&
+              (break_type == G_UNICODE_BREAK_NUMERIC ||
+               break_type == G_UNICODE_BREAK_SYMBOL ||
+               break_type == G_UNICODE_BREAK_INFIX_SEPARATOR ||
+               break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION ||
+               break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS))
+            break_op = BREAK_PROHIBITED;
+
+          if ((prev_LB_type == LB_Numeric ||
+               prev_LB_type == LB_Numeric_Close) &&
+              (break_type == G_UNICODE_BREAK_POSTFIX ||
+               break_type == G_UNICODE_BREAK_PREFIX))
+            break_op = BREAK_PROHIBITED;
+
+          /* Rule LB24 */
+          if ((prev_break_type == G_UNICODE_BREAK_PREFIX ||
+               prev_break_type == G_UNICODE_BREAK_POSTFIX) &&
+              (break_type == G_UNICODE_BREAK_ALPHABETIC ||
+               break_type == G_UNICODE_BREAK_HEBREW_LETTER))
+            break_op = BREAK_PROHIBITED;
+
+          if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC ||
+               prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER) &&
+              (break_type == G_UNICODE_BREAK_PREFIX ||
+               break_type == G_UNICODE_BREAK_POSTFIX))
+            break_op = BREAK_PROHIBITED;
+
+          /* Rule LB23 */
+          if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC ||
+               prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER) &&
+              break_type == G_UNICODE_BREAK_NUMERIC)
+            break_op = BREAK_PROHIBITED;
+
+          if (prev_break_type == G_UNICODE_BREAK_NUMERIC &&
+              (break_type == G_UNICODE_BREAK_ALPHABETIC ||
+               break_type == G_UNICODE_BREAK_HEBREW_LETTER))
+            break_op = BREAK_PROHIBITED;
+
+          /* Rule LB23a */
+          if (prev_break_type == G_UNICODE_BREAK_PREFIX &&
+              (break_type == G_UNICODE_BREAK_IDEOGRAPHIC ||
+               break_type == G_UNICODE_BREAK_EMOJI_BASE ||
+               break_type == G_UNICODE_BREAK_EMOJI_MODIFIER))
+            break_op = BREAK_PROHIBITED;
+
+          if ((prev_break_type == G_UNICODE_BREAK_IDEOGRAPHIC ||
+               prev_break_type == G_UNICODE_BREAK_EMOJI_BASE ||
+               prev_break_type == G_UNICODE_BREAK_EMOJI_MODIFIER) &&
+              break_type == G_UNICODE_BREAK_POSTFIX)
+            break_op = BREAK_PROHIBITED;
+
+          /* Rule LB22 */
+          if (break_type == G_UNICODE_BREAK_INSEPARABLE)
+            break_op = BREAK_PROHIBITED;
+
+          if (break_type == G_UNICODE_BREAK_AFTER ||
+              break_type == G_UNICODE_BREAK_HYPHEN ||
+              break_type == G_UNICODE_BREAK_NON_STARTER ||
+              prev_break_type == G_UNICODE_BREAK_BEFORE)
+            break_op = BREAK_PROHIBITED; /* Rule LB21 */
+
+          if (prev_prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER &&
+              (prev_break_type == G_UNICODE_BREAK_HYPHEN ||
+               prev_break_type == G_UNICODE_BREAK_AFTER))
+            break_op = BREAK_PROHIBITED; /* Rule LB21a */
+
+          if (prev_break_type == G_UNICODE_BREAK_SYMBOL &&
+              break_type == G_UNICODE_BREAK_HEBREW_LETTER)
+            break_op = BREAK_PROHIBITED; /* Rule LB21b */
+
+          if (prev_break_type == G_UNICODE_BREAK_CONTINGENT ||
+              break_type == G_UNICODE_BREAK_CONTINGENT)
+            break_op = BREAK_ALLOWED; /* Rule LB20 */
+
+          if (prev_break_type == G_UNICODE_BREAK_QUOTATION ||
+              break_type == G_UNICODE_BREAK_QUOTATION)
+            break_op = BREAK_PROHIBITED; /* Rule LB19 */
+
+          /* handle related rules for Space as state machine here,
+             and override the pair table result. */
+          if (prev_break_type == G_UNICODE_BREAK_SPACE) /* Rule LB18 */
+            break_op = BREAK_ALLOWED;
+
+          if (row_break_type == G_UNICODE_BREAK_BEFORE_AND_AFTER &&
+              break_type == G_UNICODE_BREAK_BEFORE_AND_AFTER)
+            break_op = BREAK_PROHIBITED; /* Rule LB17 */
+
+          if ((row_break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION ||
+               row_break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS) &&
+              break_type == G_UNICODE_BREAK_NON_STARTER)
+            break_op = BREAK_PROHIBITED; /* Rule LB16 */
+
+          if (row_break_type == G_UNICODE_BREAK_QUOTATION &&
+              break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION)
+            break_op = BREAK_PROHIBITED; /* Rule LB15 */
+
+          if (row_break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION)
+            break_op = BREAK_PROHIBITED; /* Rule LB14 */
+
+          /* Rule LB13 with Example 7 of Customization */
+          if (break_type == G_UNICODE_BREAK_EXCLAMATION)
+            break_op = BREAK_PROHIBITED;
+
+          if (prev_break_type != G_UNICODE_BREAK_NUMERIC &&
+              (break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION ||
+               break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS ||
+               break_type == G_UNICODE_BREAK_INFIX_SEPARATOR ||
+               break_type == G_UNICODE_BREAK_SYMBOL))
+            break_op = BREAK_PROHIBITED;
+
+          if (prev_break_type == G_UNICODE_BREAK_NON_BREAKING_GLUE)
+            break_op = BREAK_PROHIBITED; /* Rule LB12 */
+
+          if (break_type == G_UNICODE_BREAK_NON_BREAKING_GLUE &&
+              (prev_break_type != G_UNICODE_BREAK_SPACE &&
+               prev_break_type != G_UNICODE_BREAK_AFTER &&
+               prev_break_type != G_UNICODE_BREAK_HYPHEN))
+            break_op = BREAK_PROHIBITED; /* Rule LB12a */
+
+          if (prev_break_type == G_UNICODE_BREAK_WORD_JOINER ||
+              break_type == G_UNICODE_BREAK_WORD_JOINER)
+            break_op = BREAK_PROHIBITED; /* Rule LB11 */
+
+
+          /* Rule LB9 */
+          if (break_type == G_UNICODE_BREAK_COMBINING_MARK ||
+              break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER)
+            {
+              if (!(prev_break_type == G_UNICODE_BREAK_MANDATORY ||
+                    prev_break_type == G_UNICODE_BREAK_CARRIAGE_RETURN ||
+                    prev_break_type == G_UNICODE_BREAK_LINE_FEED ||
+                    prev_break_type == G_UNICODE_BREAK_NEXT_LINE ||
+                    prev_break_type == G_UNICODE_BREAK_SPACE ||
+                    prev_break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE))
+                break_op = BREAK_PROHIBITED;
+            }
+
+          if (row_break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE)
+            break_op = BREAK_ALLOWED; /* Rule LB8 */
+
+          if (prev_wc == 0x200D)
+            break_op = BREAK_PROHIBITED; /* Rule LB8a */
+
+          if (break_type == G_UNICODE_BREAK_SPACE ||
+              break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE)
+            break_op = BREAK_PROHIBITED; /* Rule LB7 */
+
+          /* Rule LB6 */
+          if (break_type == G_UNICODE_BREAK_MANDATORY ||
+              break_type == G_UNICODE_BREAK_CARRIAGE_RETURN ||
+              break_type == G_UNICODE_BREAK_LINE_FEED ||
+              break_type == G_UNICODE_BREAK_NEXT_LINE)
+            break_op = BREAK_PROHIBITED;
+
+          /* Rules LB4 and LB5 */
+          if (prev_break_type == G_UNICODE_BREAK_MANDATORY ||
+              (prev_break_type == G_UNICODE_BREAK_CARRIAGE_RETURN &&
+               wc != '\n') ||
+              prev_break_type == G_UNICODE_BREAK_LINE_FEED ||
+              prev_break_type == G_UNICODE_BREAK_NEXT_LINE)
+            {
+              attrs[i].is_mandatory_break = TRUE;
+              break_op = BREAK_ALLOWED;
+            }
+
+          switch (break_op)
+            {
+            case BREAK_PROHIBITED:
+              /* can't break here */
+              attrs[i].is_line_break = FALSE;
+              break;
+
+            case BREAK_IF_SPACES:
+              /* break if prev char was space */
+              if (prev_break_type != G_UNICODE_BREAK_SPACE)
+                attrs[i].is_line_break = FALSE;
+              break;
+
+            case BREAK_ALLOWED:
+              attrs[i].is_line_break = TRUE;
+              break;
+
+            case BREAK_ALREADY_HANDLED:
+              break;
+
+            default:
+              g_assert_not_reached ();
+              break;
+            }
+
+          /* Rule LB9 */
+          if (!(break_type == G_UNICODE_BREAK_COMBINING_MARK ||
+                break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER))
+            {
+              /* Rule LB25 with Example 7 of Customization */
+              if (break_type == G_UNICODE_BREAK_NUMERIC ||
+                  break_type == G_UNICODE_BREAK_SYMBOL ||
+                  break_type == G_UNICODE_BREAK_INFIX_SEPARATOR)
+                {
+                  if (prev_LB_type != LB_Numeric)
+                    prev_LB_type = LB_type;
+                  /* else don't change the prev_LB_type */
+                }
+              else
+                {
+                  prev_LB_type = LB_type;
+                }
+            }
+          /* else don't change the prev_LB_type for Rule LB9 */
+        }
+
+      if (break_type != G_UNICODE_BREAK_SPACE)
+        {
+          /* Rule LB9 */
+          if (break_type == G_UNICODE_BREAK_COMBINING_MARK ||
+              break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER)
+            {
+              if (i == 0 /* start of text */ ||
+                  prev_break_type == G_UNICODE_BREAK_MANDATORY ||
+                  prev_break_type == G_UNICODE_BREAK_CARRIAGE_RETURN ||
+                  prev_break_type == G_UNICODE_BREAK_LINE_FEED ||
+                  prev_break_type == G_UNICODE_BREAK_NEXT_LINE ||
+                  prev_break_type == G_UNICODE_BREAK_SPACE ||
+                  prev_break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE)
+                prev_break_type = G_UNICODE_BREAK_ALPHABETIC; /* Rule LB10 */
+              /* else don't change the prev_break_type for Rule LB9 */
+            }
+          else
+            {
+              prev_prev_break_type = prev_break_type;
+              prev_break_type = break_type;
+            }
+
+          prev_jamo = jamo;
+        }
+      else
+        {
+          if (prev_break_type != G_UNICODE_BREAK_SPACE)
+            {
+              prev_prev_break_type = prev_break_type;
+              prev_break_type = break_type;
+            }
+          /* else don't change the prev_break_type */
+        }
+
+      /* ---- Word breaks ---- */
+
+      /* default to not a word start/end */
+      attrs[i].is_word_start = FALSE;
+      attrs[i].is_word_end = FALSE;
+
+      if (current_word_type != WordNone)
+        {
+          /* Check for a word end */
+          switch ((int) type)
+            {
+            case G_UNICODE_SPACING_MARK:
+            case G_UNICODE_ENCLOSING_MARK:
+            case G_UNICODE_NON_SPACING_MARK:
+            case G_UNICODE_FORMAT:
+              /* nothing, we just eat these up as part of the word */
+              break;
+
+            case G_UNICODE_LOWERCASE_LETTER:
+            case G_UNICODE_MODIFIER_LETTER:
+            case G_UNICODE_OTHER_LETTER:
+            case G_UNICODE_TITLECASE_LETTER:
+            case G_UNICODE_UPPERCASE_LETTER:
+              if (current_word_type == WordLetters)
+                {
+                  /* Japanese special cases for ending the word */
+                  if (JAPANESE (last_word_letter) ||
+                      JAPANESE (wc))
+                    {
+                      if ((HIRAGANA (last_word_letter) &&
+                           !HIRAGANA (wc)) ||
+                          (KATAKANA (last_word_letter) &&
+                           !(KATAKANA (wc) || HIRAGANA (wc))) ||
+                          (KANJI (last_word_letter) &&
+                           !(HIRAGANA (wc) || KANJI (wc))) ||
+                          (JAPANESE (last_word_letter) &&
+                           !JAPANESE (wc)) ||
+                          (!JAPANESE (last_word_letter) &&
+                           JAPANESE (wc)))
+                        attrs[i].is_word_end = TRUE;
+                    }
+                }
+              last_word_letter = wc;
+              break;
+
+            case G_UNICODE_DECIMAL_NUMBER:
+            case G_UNICODE_LETTER_NUMBER:
+            case G_UNICODE_OTHER_NUMBER:
+              last_word_letter = wc;
+              break;
+
+            default:
+              /* Punctuation, control/format chars, etc. all end a word. */
+              attrs[i].is_word_end = TRUE;
+              current_word_type = WordNone;
+              break;
+            }
+        }
+      else
+        {
+          /* Check for a word start */
+          switch ((int) type)
+            {
+            case G_UNICODE_LOWERCASE_LETTER:
+            case G_UNICODE_MODIFIER_LETTER:
+            case G_UNICODE_OTHER_LETTER:
+            case G_UNICODE_TITLECASE_LETTER:
+            case G_UNICODE_UPPERCASE_LETTER:
+              current_word_type = WordLetters;
+              last_word_letter = wc;
+              attrs[i].is_word_start = TRUE;
+              break;
+
+            case G_UNICODE_DECIMAL_NUMBER:
+            case G_UNICODE_LETTER_NUMBER:
+            case G_UNICODE_OTHER_NUMBER:
+              current_word_type = WordNumbers;
+              last_word_letter = wc;
+              attrs[i].is_word_start = TRUE;
+              break;
+
+            default:
+              /* No word here */
+              break;
+            }
+        }
+
+      /* ---- Sentence breaks ---- */
+      {
+
+        /* default to not a sentence start/end */
+        attrs[i].is_sentence_start = FALSE;
+        attrs[i].is_sentence_end = FALSE;
+
+        /* maybe start sentence */
+        if (last_sentence_start == -1 && !is_sentence_boundary)
+          last_sentence_start = i - 1;
+
+        /* remember last non space character position */
+        if (i > 0 && !attrs[i - 1].is_white)
+          last_non_space = i;
+
+        /* meets sentence end, mark both sentence start and end */
+        if (last_sentence_start != -1 && is_sentence_boundary) {
+          if (last_non_space >= last_sentence_start) {
+            attrs[last_sentence_start].is_sentence_start = TRUE;
+            attrs[last_non_space].is_sentence_end = TRUE;
+          }
+
+          last_sentence_start = -1;
+          last_non_space = -1;
+        }
+
+        /* meets space character, move sentence start */
+        if (last_sentence_start != -1 &&
+            last_sentence_start == i - 1 &&
+            attrs[i - 1].is_white) {
+            last_sentence_start++;
+          }
+      }
+
+      /* --- Hyphens --- */
+
+      {
+        gboolean insert_hyphens;
+        gboolean space_or_hyphen = FALSE;
+
+        attrs[i].break_inserts_hyphen = FALSE;
+        attrs[i].break_removes_preceding = FALSE;
+
+        switch ((int)prev_script)
+          {
+          case G_UNICODE_SCRIPT_COMMON:
+            insert_hyphens = prev_wc == 0x00ad;
+            break;
+          case G_UNICODE_SCRIPT_HAN:
+          case G_UNICODE_SCRIPT_HANGUL:
+          case G_UNICODE_SCRIPT_HIRAGANA:
+          case G_UNICODE_SCRIPT_KATAKANA:
+            insert_hyphens = FALSE;
+            break;
+          default:
+            insert_hyphens = TRUE;
+            break;
+          }
+
+        switch ((int)type)
+          {
+          case G_UNICODE_SPACE_SEPARATOR:
+          case G_UNICODE_LINE_SEPARATOR:
+          case G_UNICODE_PARAGRAPH_SEPARATOR:
+            space_or_hyphen = TRUE;
+            break;
+          case G_UNICODE_CONTROL:
+            if (wc == '\t' || wc == '\n' || wc == '\r' || wc == '\f')
+              space_or_hyphen = TRUE;
+            break;
+          default:
+            if (wc == '-'    || /* Hyphen-minus */
+                wc == 0x058a || /* Armenian hyphen */
+                wc == 0x1400 || /* Canadian syllabics hyphen */
+                wc == 0x1806 || /* Mongolian todo hyphen */
+                wc == 0x2010 || /* Hyphen */
+                wc == 0x2e17 || /* Double oblique hyphen */
+                wc == 0x2e40 || /* Double hyphen */
+                wc == 0x30a0 || /* Katakana-Hiragana double hyphen */
+                wc == 0xfe63 || /* Small hyphen-minus */
+                wc == 0xff0d)   /* Fullwidth hyphen-minus */
+              space_or_hyphen = TRUE;
+            break;
+          }
+
+        if (prev_wc == 0x2027)     /* Hyphenation point */
+          {
+            attrs[i].break_inserts_hyphen = TRUE;
+            attrs[i].break_removes_preceding = TRUE;
+          }
+        else if (attrs[i].is_word_boundary)
+          attrs[i].break_inserts_hyphen = FALSE;
+        else if (prev_space_or_hyphen)
+          attrs[i].break_inserts_hyphen = FALSE;
+        else if (space_or_hyphen)
+          attrs[i].break_inserts_hyphen = FALSE;
+        else
+          attrs[i].break_inserts_hyphen = insert_hyphens;
+
+        prev_space_or_hyphen = space_or_hyphen;
+      }
+
+      prev_wc = wc;
+      prev_script = script;
+
+      /* wc might not be a valid Unicode base character, but really all we
+       * need to know is the last non-combining character */
+      if (type != G_UNICODE_SPACING_MARK &&
+          type != G_UNICODE_ENCLOSING_MARK &&
+          type != G_UNICODE_NON_SPACING_MARK)
+        base_character = wc;
+    }
+
+  i--;
+
+  attrs[0].is_cursor_position = TRUE;  /* Rule GB1 */
+  attrs[i].is_cursor_position = TRUE;  /* Rule GB2 */
+
+  attrs[0].is_word_boundary = TRUE;  /* Rule WB1 */
+  attrs[i].is_word_boundary = TRUE;  /* Rule WB2 */
+
+  attrs[0].is_line_break = FALSE; /* Rule LB2 */
+  attrs[i].is_line_break = TRUE;  /* Rule LB3 */
+  attrs[i].is_mandatory_break = TRUE;  /* Rule LB3 */
+}
+
+/* }}} */
+/* {{{ Tailoring */
+/* {{{ Script-specific tailoring */
+
+#include "break-arabic.c"
+#include "break-indic.c"
+#include "break-thai.c"
+#include "break-latin.c"
+
+static gboolean
+break_script (const char          *item_text,
+              unsigned int         item_length,
+              const Pango2Analysis *analysis,
+              Pango2LogAttr        *attrs,
+              int                  attrs_len)
+{
+  switch (analysis->script)
+    {
+    case G_UNICODE_SCRIPT_ARABIC:
+      break_arabic (item_text, item_length, analysis, attrs, attrs_len);
+      break;
+
+    case G_UNICODE_SCRIPT_DEVANAGARI:
+    case G_UNICODE_SCRIPT_BENGALI:
+    case G_UNICODE_SCRIPT_GURMUKHI:
+    case G_UNICODE_SCRIPT_GUJARATI:
+    case G_UNICODE_SCRIPT_ORIYA:
+    case G_UNICODE_SCRIPT_TAMIL:
+    case G_UNICODE_SCRIPT_TELUGU:
+    case G_UNICODE_SCRIPT_KANNADA:
+    case G_UNICODE_SCRIPT_MALAYALAM:
+    case G_UNICODE_SCRIPT_SINHALA:
+      break_indic (item_text, item_length, analysis, attrs, attrs_len);
+      break;
+
+    case G_UNICODE_SCRIPT_THAI:
+      break_thai (item_text, item_length, analysis, attrs, attrs_len);
+      break;
+
+    case G_UNICODE_SCRIPT_LATIN:
+      break_latin (item_text, item_length, analysis, attrs, attrs_len);
+      break;
+
+    default:
+      return FALSE;
+    }
+
+  return TRUE;
+}
+
+/* }}} */ 
+/* {{{ Attribute-based customization */
+
+/* We allow customizing log attrs in two ways:
+ *
+ * - You can directly remove breaks from a range, using allow_breaks=false.
+ *   We preserve the non-tailorable rules from UAX #14, so mandatory breaks
+ *   and breaks after ZWS remain. We also preserve break opportunities after
+ *   hyphens and visible word dividers.
+ *
+ * - You can tweak the segmentation by marking ranges as word or sentence.
+ *   When doing so, we split adjacent segments to preserve alternating
+ *   starts and ends. We add a line break opportunity before each word that
+ *   is created in this way, and we remove line break opportunities inside
+ *   the word in the same way as for a range marked as allow_breaks=false,
+ *   except that we don't remove char break opportunities.
+ *
+ *   Note that UAX #14 does not guarantee that words fall neatly into
+ *   sentences, so we don't do extra work to enforce that.
+ */
+
+static void
+remove_breaks_from_range (const char   *text,
+                          int           start,
+                          Pango2LogAttr *log_attrs,
+                          int           start_pos,
+                          int           end_pos)
+{
+  int pos;
+  const char *p;
+  gunichar ch;
+  int bt;
+  gboolean after_zws;
+  gboolean after_hyphen;
+
+  /* Assume our range doesn't start after a hyphen or in a zws sequence */
+  after_zws = FALSE;
+  after_hyphen = FALSE;
+  for (pos = start_pos + 1, p = g_utf8_next_char (text + start);
+       pos < end_pos;
+       pos++, p = g_utf8_next_char (p))
+    {
+      /* Mandatory breaks aren't tailorable */
+      if (!log_attrs[pos].is_mandatory_break)
+        log_attrs[pos].is_line_break = FALSE;
+
+      ch = g_utf8_get_char (p);
+      bt = g_unichar_break_type (ch);
+
+      /* Hyphens and visible word dividers */
+      if (after_hyphen)
+        log_attrs[pos].is_line_break = TRUE;
+
+      after_hyphen = ch == 0x00ad || /* Soft Hyphen */
+         ch == 0x05A0 || ch == 0x2010 || /* Breaking Hyphens */
+         ch == 0x2012 || ch == 0x2013 ||
+         ch == 0x05BE || ch == 0x0F0B || /* Visible word dividers */
+         ch == 0x1361 || ch == 0x17D8 ||
+         ch == 0x17DA || ch == 0x2027 ||
+         ch == 0x007C;
+
+      /* ZWS sequence */
+      if (after_zws && bt != G_UNICODE_BREAK_SPACE)
+        log_attrs[pos].is_line_break = TRUE;
+
+      after_zws = bt == G_UNICODE_BREAK_ZERO_WIDTH_SPACE ||
+                  (bt == G_UNICODE_BREAK_SPACE && after_zws);
+    }
+}
+
+static gboolean
+handle_allow_breaks (const char    *text,
+                     int            length,
+                     Pango2AttrList *attrs,
+                     int            offset,
+                     Pango2LogAttr  *log_attrs,
+                     int            log_attrs_len)
+{
+  Pango2AttrIterator iter;
+  gboolean tailored = FALSE;
+
+  pango2_attr_list_init_iterator (attrs, &iter);
+
+  do
+    {
+      const Pango2Attribute *attr = pango2_attr_iterator_get (&iter, PANGO2_ATTR_ALLOW_BREAKS);
+
+      if (!attr)
+        continue;
+
+      if (!attr->int_value)
+        {
+          int start, end;
+          int start_pos, end_pos;
+          int pos;
+
+          start = attr->start_index;
+          end = attr->end_index;
+          if (start < offset)
+            start_pos = 0;
+          else
+            start_pos = g_utf8_pointer_to_offset (text, text + start - offset);
+          if (end >= offset + length)
+            end_pos = log_attrs_len;
+          else
+            end_pos = g_utf8_pointer_to_offset (text, text + end - offset);
+
+          for (pos = start_pos + 1; pos < end_pos; pos++)
+            log_attrs[pos].is_char_break = FALSE;
+
+          remove_breaks_from_range (text, MAX (start - offset, 0), log_attrs, start_pos, end_pos);
+
+          tailored = TRUE;
+        }
+    }
+  while (pango2_attr_iterator_next (&iter));
+
+  pango2_attr_iterator_clear (&iter);
+
+  return tailored;
+}
+
+
+static gboolean
+handle_words (const char    *text,
+              int            length,
+              Pango2AttrList *attrs,
+              int            offset,
+              Pango2LogAttr  *log_attrs,
+              int            log_attrs_len)
+{
+  Pango2AttrIterator iter;
+  gboolean tailored = FALSE;
+
+  pango2_attr_list_init_iterator (attrs, &iter);
+
+  do
+    {
+      const Pango2Attribute *attr = pango2_attr_iterator_get (&iter, PANGO2_ATTR_WORD);
+      int start, end;
+      int start_pos, end_pos;
+      int pos;
+
+      if (!attr)
+        continue;
+
+      start = attr->start_index;
+      end = attr->end_index;
+      if (start < offset)
+        start_pos = 0;
+      else
+        start_pos = g_utf8_pointer_to_offset (text, text + start - offset);
+      if (end >= offset + length)
+        end_pos = log_attrs_len;
+      else
+        end_pos = g_utf8_pointer_to_offset (text, text + end - offset);
+
+      for (pos = start_pos + 1; pos < end_pos; pos++)
+        {
+          log_attrs[pos].is_word_start = FALSE;
+          log_attrs[pos].is_word_end = FALSE;
+          log_attrs[pos].is_word_boundary = FALSE;
+        }
+
+      remove_breaks_from_range (text, MAX (start - offset, 0), log_attrs,
+                                start_pos, end_pos);
+
+      if (start >= offset)
+        {
+          gboolean in_word = FALSE;
+          for (pos = start_pos; pos >= 0; pos--)
+            {
+              if (log_attrs[pos].is_word_end)
+                {
+                  in_word = pos == start_pos;
+                  break;
+                }
+              if (pos < start_pos && log_attrs[pos].is_word_start)
+                {
+                  in_word = TRUE;
+                  break;
+                }
+            }
+          log_attrs[start_pos].is_word_start = TRUE;
+          log_attrs[start_pos].is_word_end = in_word;
+          log_attrs[start_pos].is_word_boundary = TRUE;
+
+          /* Allow line breaks before words */
+          if (start_pos > 0)
+            log_attrs[start_pos].is_line_break = TRUE;
+
+          tailored = TRUE;
+        }
+
+      if (end < offset + length)
+        {
+          gboolean in_word = FALSE;
+          for (pos = end_pos; pos < log_attrs_len; pos++)
+            {
+              if (log_attrs[pos].is_word_start)
+                {
+                  in_word = pos == end_pos;
+                  break;
+                }
+              if (pos > end_pos && log_attrs[pos].is_word_end)
+                {
+                  in_word = TRUE;
+                  break;
+                }
+            }
+          log_attrs[end_pos].is_word_start = in_word;
+          log_attrs[end_pos].is_word_end = TRUE;
+          log_attrs[end_pos].is_word_boundary = TRUE;
+
+          /* Allow line breaks before words */
+          if (in_word)
+            log_attrs[end_pos].is_line_break = TRUE;
+
+          tailored = TRUE;
+        }
+    }
+  while (pango2_attr_iterator_next (&iter));
+
+  pango2_attr_iterator_clear (&iter);
+
+  return tailored;
+}
+
+static gboolean
+handle_sentences (const char    *text,
+                  int            length,
+                  Pango2AttrList *attrs,
+                  int            offset,
+                  Pango2LogAttr  *log_attrs,
+                  int            log_attrs_len)
+{
+  Pango2AttrIterator iter;
+  gboolean tailored = FALSE;
+
+  pango2_attr_list_init_iterator (attrs, &iter);
+
+  do
+    {
+      const Pango2Attribute *attr = pango2_attr_iterator_get (&iter, PANGO2_ATTR_SENTENCE);
+      int start, end;
+      int start_pos, end_pos;
+      int pos;
+
+      if (!attr)
+        continue;
+
+      start = attr->start_index;
+      end = attr->end_index;
+      if (start < offset)
+        start_pos = 0;
+      else
+        start_pos = g_utf8_pointer_to_offset (text, text + start - offset);
+      if (end >= offset + length)
+        end_pos = log_attrs_len;
+      else
+        end_pos = g_utf8_pointer_to_offset (text, text + end - offset);
+
+      for (pos = start_pos + 1; pos < end_pos; pos++)
+        {
+          log_attrs[pos].is_sentence_start = FALSE;
+          log_attrs[pos].is_sentence_end = FALSE;
+          log_attrs[pos].is_sentence_boundary = FALSE;
+
+          tailored = TRUE;
+        }
+      if (start >= offset)
+        {
+          gboolean in_sentence = FALSE;
+          for (pos = start_pos - 1; pos >= 0; pos--)
+            {
+              if (log_attrs[pos].is_sentence_end)
+                break;
+              if (log_attrs[pos].is_sentence_start)
+                {
+                  in_sentence = TRUE;
+                  break;
+                }
+            }
+          log_attrs[start_pos].is_sentence_start = TRUE;
+          log_attrs[start_pos].is_sentence_end = in_sentence;
+          log_attrs[start_pos].is_sentence_boundary = TRUE;
+
+          tailored = TRUE;
+        }
+      if (end < offset + length)
+        {
+          gboolean in_sentence = FALSE;
+          for (pos = end_pos + 1; end_pos < log_attrs_len; pos++)
+            {
+              if (log_attrs[pos].is_sentence_start)
+                break;
+              if (log_attrs[pos].is_sentence_end)
+                {
+                  in_sentence = TRUE;
+                  break;
+                }
+            }
+          log_attrs[end_pos].is_sentence_start = in_sentence;
+          log_attrs[end_pos].is_sentence_end = TRUE;
+          log_attrs[end_pos].is_sentence_boundary = TRUE;
+
+          tailored = TRUE;
+        }
+    }
+  while (pango2_attr_iterator_next (&iter));
+
+  pango2_attr_iterator_clear (&iter);
+
+  return tailored;
+}
+
+static gboolean
+handle_hyphens (const char    *text,
+                int            length,
+                Pango2AttrList *attrs,
+                int            offset,
+                Pango2LogAttr  *log_attrs,
+                int            log_attrs_len)
+{
+  Pango2AttrIterator iter;
+  gboolean tailored = FALSE;
+
+  pango2_attr_list_init_iterator (attrs, &iter);
+
+  do {
+    const Pango2Attribute *attr = pango2_attr_iterator_get (&iter, PANGO2_ATTR_INSERT_HYPHENS);
+
+    if (attr && attr->int_value == 0)
+      {
+        int start, end;
+        int start_pos, end_pos;
+        int pos;
+
+        pango2_attr_iterator_range (&iter, &start, &end);
+        if (start < offset)
+          start_pos = 0;
+        else
+          start_pos = g_utf8_pointer_to_offset (text, text + start - offset);
+        if (end >= offset + length)
+          end_pos = log_attrs_len;
+        else
+          end_pos = g_utf8_pointer_to_offset (text, text + end - offset);
+
+        for (pos = start_pos + 1; pos < end_pos; pos++)
+          {
+            if (!log_attrs[pos].break_removes_preceding)
+              {
+                log_attrs[pos].break_inserts_hyphen = FALSE;
+
+                tailored = TRUE;
+              }
+          }
+      }
+  } while (pango2_attr_iterator_next (&iter));
+
+  pango2_attr_iterator_clear (&iter);
+
+  return tailored;
+}
+
+static gboolean
+break_attrs (const char   *text,
+             int           length,
+             GSList       *attributes,
+             int           offset,
+             Pango2LogAttr *log_attrs,
+             int           log_attrs_len)
+{
+  Pango2AttrList allow_breaks;
+  Pango2AttrList words;
+  Pango2AttrList sentences;
+  Pango2AttrList hyphens;
+  GSList *l;
+  gboolean tailored = FALSE;
+
+  pango2_attr_list_init (&allow_breaks);
+  pango2_attr_list_init (&words);
+  pango2_attr_list_init (&sentences);
+  pango2_attr_list_init (&hyphens);
+
+  for (l = attributes; l; l = l->next)
+    {
+      Pango2Attribute *attr = l->data;
+
+      if (attr->type == PANGO2_ATTR_ALLOW_BREAKS)
+        pango2_attr_list_insert (&allow_breaks, pango2_attribute_copy (attr));
+      else if (attr->type == PANGO2_ATTR_WORD)
+        pango2_attr_list_insert (&words, pango2_attribute_copy (attr));
+      else if (attr->type == PANGO2_ATTR_SENTENCE)
+        pango2_attr_list_insert (&sentences, pango2_attribute_copy (attr));
+      else if (attr->type == PANGO2_ATTR_INSERT_HYPHENS)
+        pango2_attr_list_insert (&hyphens, pango2_attribute_copy (attr));
+    }
+
+  tailored |= handle_words (text, length, &words, offset,
+                            log_attrs, log_attrs_len);
+
+  tailored |= handle_sentences (text, length, &words, offset,
+                                log_attrs, log_attrs_len);
+
+  tailored |= handle_hyphens (text, length, &hyphens, offset,
+                              log_attrs, log_attrs_len);
+
+  tailored |= handle_allow_breaks (text, length, &allow_breaks, offset,
+                                   log_attrs, log_attrs_len);
+
+  pango2_attr_list_destroy (&allow_breaks);
+  pango2_attr_list_destroy (&words);
+  pango2_attr_list_destroy (&sentences);
+  pango2_attr_list_destroy (&hyphens);
+
+  return tailored;
+}
+
+/* }}} */
+
+static gboolean
+tailor_break (const char     *text,
+              int             length,
+              Pango2Analysis *analysis,
+              int             item_offset,
+              Pango2LogAttr  *attrs,
+              int             attrs_len)
+{
+  gboolean res;
+
+  if (length < 0)
+    length = strlen (text);
+  else if (text == NULL)
+    text = "";
+
+  res = break_script (text, length, analysis, attrs, attrs_len);
+
+  if (item_offset >= 0 && analysis->extra_attrs)
+    res |= break_attrs (text, length, analysis->extra_attrs, item_offset, attrs, attrs_len);
+
+  return res;
+}
+
+/* }}} */
+/* {{{ Public API */
+
+/**
+ * pango2_default_break:
+ * @text: text to break. Must be valid UTF-8
+ * @length: length of text in bytes (may be -1 if @text is nul-terminated)
+ * @attrs: logical attributes to fill in
+ * @attrs_len: size of the array passed as @attrs
+ *
+ * This is the default break algorithm.
+ *
+ * It applies rules from the [Unicode Line Breaking Algorithm](http://www.unicode.org/unicode/reports/tr14/)
+ * without language-specific tailoring.
+ *
+ * See [func@Pango2.tailor_break] for language-specific breaks.
+ *
+ * See [func@Pango2.attr_break] for attribute-based customization.
+ */
+void
+pango2_default_break (const char   *text,
+                     int           length,
+                     Pango2LogAttr *attrs,
+                     int           attrs_len G_GNUC_UNUSED)
+{
+  Pango2LogAttr before = *attrs;
+
+  default_break (text, length, attrs, attrs_len);
+
+  attrs->is_line_break      |= before.is_line_break;
+  attrs->is_mandatory_break |= before.is_mandatory_break;
+  attrs->is_cursor_position |= before.is_cursor_position;
+}
+
+/**
+ * pango2_tailor_break:
+ * @text: text to process. Must be valid UTF-8
+ * @length: length in bytes of @text
+ * @analysis: `Pango2Analysis` for @text
+ * @offset: Byte offset of @text from the beginning of the
+ *   paragraph, or -1 to ignore attributes from @analysis
+ * @attrs: (array length=attrs_len): array with one `Pango2LogAttr`
+ *   per character in @text, plus one extra, to be filled in
+ * @attrs_len: length of @attrs array
+ *
+ * Apply language-specific tailoring to the breaks in @attrs.
+ *
+ * The line breaks are assumed to have been produced by [func@Pango2.default_break].
+ *
+ * If @offset is not -1, it is used to apply attributes from @analysis that are
+ * relevant to line breaking.
+ *
+ * Note that it is better to pass -1 for @offset and use [func@Pango2.attr_break]
+ * to apply attributes to the whole paragraph.
+ */
+void
+pango2_tailor_break (const char    *text,
+                    int            length,
+                    Pango2Analysis *analysis,
+                    int            offset,
+                    Pango2LogAttr  *attrs,
+                    int            attrs_len)
+{
+  Pango2LogAttr *start = attrs;
+  Pango2LogAttr attr_before = *start;
+
+  if (tailor_break (text, length, analysis, offset, attrs, attrs_len))
+    {
+      /* if tailored, we enforce some of the attrs from before
+       * tailoring at the boundary
+       */
+
+     start->backspace_deletes_character  = attr_before.backspace_deletes_character;
+
+     start->is_line_break      |= attr_before.is_line_break;
+     start->is_mandatory_break |= attr_before.is_mandatory_break;
+     start->is_cursor_position |= attr_before.is_cursor_position;
+    }
+}
+
+/**
+ * pango2_attr_break:
+ * @text: text to break. Must be valid UTF-8
+ * @length: length of text in bytes (may be -1 if @text is nul-terminated)
+ * @attr_list: `Pango2AttrList` to apply
+ * @offset: Byte offset of @text from the beginning of the paragraph
+ * @attrs: (array length=attrs_len): array with one `Pango2LogAttr`
+ *   per character in @text, plus one extra, to be filled in
+ * @attrs_len: length of @attrs array
+ *
+ * Apply customization from attributes to the breaks in @attrs.
+ *
+ * The line breaks are assumed to have been produced
+ * by [func@Pango2.default_break] and [func@Pango2.tailor_break].
+ */
+void
+pango2_attr_break (const char    *text,
+                  int            length,
+                  Pango2AttrList *attr_list,
+                  int            offset,
+                  Pango2LogAttr  *attrs,
+                  int            attrs_len)
+{
+  Pango2LogAttr *start = attrs;
+  Pango2LogAttr attr_before = *start;
+  GSList *attributes;
+
+  attributes = pango2_attr_list_get_attributes (attr_list);
+  if (break_attrs (text, length, attributes, offset, attrs, attrs_len))
+    {
+      /* if tailored, we enforce some of the attrs from before
+       * tailoring at the boundary
+       */
+
+      start->backspace_deletes_character  = attr_before.backspace_deletes_character;
+
+      start->is_line_break      |= attr_before.is_line_break;
+      start->is_mandatory_break |= attr_before.is_mandatory_break;
+      start->is_cursor_position |= attr_before.is_cursor_position;
+    }
+
+  g_slist_free_full (attributes, (GDestroyNotify)pango2_attribute_destroy);
+}
+
+/**
+ * pango2_get_log_attrs:
+ * @text: text to process. Must be valid UTF-8
+ * @length: length in bytes of @text
+ * @attr_list: (nullable): `Pango2AttrList` to apply
+ * @level: embedding level, or -1 if unknown
+ * @language: language tag
+ * @attrs: (array length=attrs_len): array with one `Pango2LogAttr`
+ *   per character in @text, plus one extra, to be filled in
+ * @attrs_len: length of @attrs array
+ *
+ * Computes a `Pango2LogAttr` for each character in @text.
+ *
+ * The @attrs array must have one `Pango2LogAttr` for
+ * each position in @text; if @text contains N characters,
+ * it has N+1 positions, including the last position at the
+ * end of the text. @text should be an entire paragraph;
+ * logical attributes can't be computed without context
+ * (for example you need to see spaces on either side of
+ * a word to know the word is a word).
+ */
+void
+pango2_get_log_attrs (const char    *text,
+                     int             length,
+                     Pango2AttrList *attr_list,
+                     int             level,
+                     Pango2Language *language,
+                     Pango2LogAttr  *attrs,
+                     int             attrs_len)
+{
+  int chars_broken;
+  Pango2Analysis analysis = { NULL };
+  Pango2ScriptIter iter;
+
+  g_return_if_fail (length == 0 || text != NULL);
+  g_return_if_fail (attrs != NULL);
+
+  analysis.level = level;
+  analysis.language = language;
+
+  pango2_default_break (text, length, attrs, attrs_len);
+
+  chars_broken = 0;
+
+  _pango2_script_iter_init (&iter, text, length);
+  do
+    {
+      const char *run_start, *run_end;
+      GUnicodeScript script;
+      int chars_in_range;
+
+      pango2_script_iter_get_range (&iter, &run_start, &run_end, &script);
+      analysis.script = script;
+
+      chars_in_range = pango2_utf8_strlen (run_start, run_end - run_start);
+
+      pango2_tailor_break (run_start,
+                          run_end - run_start,
+                          &analysis,
+                          -1,
+                          attrs + chars_broken,
+                          chars_in_range + 1);
+
+      chars_broken += chars_in_range;
+    }
+  while (pango2_script_iter_next (&iter));
+  _pango2_script_iter_fini (&iter);
+
+  if (attr_list)
+    pango2_attr_break (text, length, attr_list, 0, attrs, attrs_len);
+
+  if (chars_broken + 1 > attrs_len)
+    g_warning ("pango2_get_log_attrs: attrs_len should have been at least %d, but was %d.  Expect corrupted memory.",
+               chars_broken + 1,
+               attrs_len);
+}
+
+/* }}} */
+
+/* vim:set foldmethod=marker expandtab: */