Split pango_default_break into functions

This makes it easier to see what part is taking time.
author: Matthias Clasen <mclasen@redhat.com> 2021-07-28 14:09:23 -0400
committer: Matthias Clasen <mclasen@redhat.com> 2021-07-30 07:58:16 -0400
commit: 73cd563d7bd19b31b57e94e3bb0969970990e589 (patch)
tree: 44c5627509af2ca627b8b0c7ae8f298587cfe0bb
parent: 730a7eca52118d8a5d1f546f34e5de91ee75be4c (diff)
download: pango-73cd563d7bd19b31b57e94e3bb0969970990e589.tar.gz
1 files changed, 472 insertions, 342 deletions
diff --git a/pango/break.c b/pango/break.c
index b9cf3cae..00290a59 100644
--- a/pango/break.c
+++ b/pango/break.c
@@ -138,225 +138,13 @@ typedef enum
   WordNumbers
 } WordType;
 
-
-/**
- * pango_default_break:
- * @text: text to break. Must be valid UTF-8
- * @length: length of text in bytes (may be -1 if @text is nul-terminated)
- * @analysis: (nullable): a `PangoAnalysis` structure for the @text
- * @attrs: logical attributes to fill in
- * @attrs_len: size of the array passed as @attrs
- *
- * This is the default break algorithm.
- *
- * It applies Unicode rules without language-specific
- * tailoring, therefore the @analyis argument is unused
- * and can be %NULL.
- *
- * See [func@Pango.tailor_break] for language-specific breaks.
- */
-void
-pango_default_break (const gchar   *text,
-		     gint           length,
-		     PangoAnalysis *analysis G_GNUC_UNUSED,
-		     PangoLogAttr  *attrs,
-		     int            attrs_len G_GNUC_UNUSED)
+ __attribute__((noinline))
+static void
+determine_whitespace (gunichar wc,
+                      GUnicodeType type,
+		      PangoLogAttr  *attrs,
+                      int i)
 {
-  /* The rationale for all this is in section 5.15 of the Unicode 3.0 book,
-   * the line breaking stuff is also in TR14 on unicode.org
-   */
-
-  /* This is a default break implementation that should work for nearly all
-   * languages. Language engines can override it optionally.
-   */
-
-  /* FIXME one cheesy optimization here would be to memset attrs to 0
-   * before we start, and then never assign %FALSE to anything
-   */
-
-  const gchar *next;
-  gint i;
-
-  gunichar prev_wc;
-  gunichar next_wc;
-
-  JamoType prev_jamo;
-
-  GUnicodeBreakType next_break_type;
-  GUnicodeBreakType prev_break_type;
-  GUnicodeBreakType prev_prev_break_type;
-
-  /* See Grapheme_Cluster_Break Property Values table of UAX#29 */
-  typedef enum
-  {
-    GB_Other,
-    GB_ControlCRLF,
-    GB_Extend,
-    GB_ZWJ,
-    GB_Prepend,
-    GB_SpacingMark,
-    GB_InHangulSyllable, /* Handles all of L, V, T, LV, LVT rules */
-    /* Use state machine to handle emoji sequence */
-    /* Rule GB12 and GB13 */
-    GB_RI_Odd, /* Meets odd number of RI */
-    GB_RI_Even, /* Meets even number of RI */
-  } GraphemeBreakType;
-  GraphemeBreakType prev_GB_type = GB_Other;
-  gboolean met_Extended_Pictographic = FALSE;
-
-  /* See Word_Break Property Values table of UAX#29 */
-  typedef enum
-  {
-    WB_Other,
-    WB_NewlineCRLF,
-    WB_ExtendFormat,
-    WB_Katakana,
-    WB_Hebrew_Letter,
-    WB_ALetter,
-    WB_MidNumLet,
-    WB_MidLetter,
-    WB_MidNum,
-    WB_Numeric,
-    WB_ExtendNumLet,
-    WB_RI_Odd,
-    WB_RI_Even,
-    WB_WSegSpace,
-  } WordBreakType;
-  WordBreakType prev_prev_WB_type = WB_Other, prev_WB_type = WB_Other;
-  gint prev_WB_i = -1;
-
-  /* See Sentence_Break Property Values table of UAX#29 */
-  typedef enum
-  {
-    SB_Other,
-    SB_ExtendFormat,
-    SB_ParaSep,
-    SB_Sp,
-    SB_Lower,
-    SB_Upper,
-    SB_OLetter,
-    SB_Numeric,
-    SB_ATerm,
-    SB_SContinue,
-    SB_STerm,
-    SB_Close,
-    /* Rules SB8 and SB8a */
-    SB_ATerm_Close_Sp,
-    SB_STerm_Close_Sp,
-  } SentenceBreakType;
-  SentenceBreakType prev_prev_SB_type = SB_Other, prev_SB_type = SB_Other;
-  gint prev_SB_i = -1;
-
-  /* Rule LB25 with Example 7 of Customization */
-  typedef enum
-  {
-    LB_Other,
-    LB_Numeric,
-    LB_Numeric_Close,
-    LB_RI_Odd,
-    LB_RI_Even,
-  } LineBreakType;
-  LineBreakType prev_LB_type = LB_Other;
-
-  WordType current_word_type = WordNone;
-  gunichar last_word_letter = 0;
-  gunichar base_character = 0;
-
-  gint last_sentence_start = -1;
-  gint last_non_space = -1;
-
-  gboolean almost_done = FALSE;
-  gboolean done = FALSE;
-
-  g_return_if_fail (length == 0 || text != NULL);
-  g_return_if_fail (attrs != NULL);
-
-  next = text;
-
-  prev_break_type = G_UNICODE_BREAK_UNKNOWN;
-  prev_prev_break_type = G_UNICODE_BREAK_UNKNOWN;
-  prev_wc = 0;
-  prev_jamo = NO_JAMO;
-
-  if (length == 0 || *text == '\0')
-    {
-      next_wc = PARAGRAPH_SEPARATOR;
-      almost_done = TRUE;
-    }
-  else
-    next_wc = g_utf8_get_char (next);
-
-  next_break_type = g_unichar_break_type (next_wc);
-  next_break_type = BREAK_TYPE_SAFE (next_break_type);
-
-  for (i = 0; !done ; i++)
-    {
-      GUnicodeType type;
-      gunichar wc;
-      GUnicodeBreakType break_type;
-      GUnicodeBreakType row_break_type;
-      BreakOpportunity break_op;
-      JamoType jamo;
-      gboolean makes_hangul_syllable;
-
-      /* UAX#29 boundaries */
-      gboolean is_grapheme_boundary;
-      gboolean is_word_boundary;
-      gboolean is_sentence_boundary;
-
-      /* Emoji extended pictographics */
-      gboolean is_Extended_Pictographic;
-
-      gboolean can_break;
-
-      wc = next_wc;
-      break_type = next_break_type;
-
-      if (almost_done)
-	{
-	  /*
-	   * If we have already reached the end of @text g_utf8_next_char()
-	   * may not increment next
-	   */
-	  next_wc = 0;
-	  next_break_type = G_UNICODE_BREAK_UNKNOWN;
-	  done = TRUE;
-	}
-      else
-	{
-	  next = g_utf8_next_char (next);
-
-	  if ((length >= 0 && next >= text + length) || *next == '\0')
-	    {
-	      /* This is how we fill in the last element (end position) of the
-	       * attr array - assume there's a paragraph separators off the end
-	       * of @text.
-	       */
-	      next_wc = PARAGRAPH_SEPARATOR;
-	      almost_done = TRUE;
-	    }
-	  else
-	    next_wc = g_utf8_get_char (next);
-
-	  next_break_type = g_unichar_break_type (next_wc);
-	  next_break_type = BREAK_TYPE_SAFE (next_break_type);
-	}
-
-      type = g_unichar_type (wc);
-      jamo = JAMO_TYPE (break_type);
-
-      /* Determine wheter this forms a Hangul syllable with prev. */
-      if (jamo == NO_JAMO)
-	makes_hangul_syllable = FALSE;
-      else
-	{
-	  JamoType prev_end   = HangulJamoProps[prev_jamo].end  ;
-	  JamoType this_start = HangulJamoProps[     jamo].start;
-
-	  /* See comments before IS_JAMO */
-	  makes_hangul_syllable = (prev_end == this_start) || (prev_end + 1 == this_start);
-	}
-
       switch (type)
         {
         case G_UNICODE_SPACE_SEPARATOR:
@@ -379,14 +167,39 @@ pango_default_break (const gchar   *text,
           attrs[i].is_expandable_space = FALSE;
           break;
         }
+}
 
-      is_Extended_Pictographic =
-	_pango_Is_Emoji_Extended_Pictographic (wc);
-
+  /* See Grapheme_Cluster_Break Property Values table of UAX#29 */
+  typedef enum
+  {
+    GB_Other,
+    GB_ControlCRLF,
+    GB_Extend,
+    GB_ZWJ,
+    GB_Prepend,
+    GB_SpacingMark,
+    GB_InHangulSyllable, /* Handles all of L, V, T, LV, LVT rules */
+    /* Use state machine to handle emoji sequence */
+    /* Rule GB12 and GB13 */
+    GB_RI_Odd, /* Meets odd number of RI */
+    GB_RI_Even, /* Meets even number of RI */
+  } GraphemeBreakType;
 
       /* ---- UAX#29 Grapheme Boundaries ---- */
+ __attribute__((noinline))
+static void
+determine_grapheme_boundaries (gunichar wc,
+                               GUnicodeType type,
+                               gunichar prev_wc,
+                               gboolean makes_hangul_syllable,
+                               gboolean is_Extended_Pictographic,
+                               gboolean *met_Extended_Pictographic,
+                               GraphemeBreakType *prev_GB_type,
+                               PangoLogAttr *attrs,
+                               int i)
       {
 	GraphemeBreakType GB_type;
+        gboolean is_grapheme_boundary;
 
         /* Find the GraphemeBreakType of wc */
 	GB_type = GB_Other;
@@ -472,9 +285,9 @@ pango_default_break (const gchar   *text,
           case G_UNICODE_OTHER_SYMBOL:
             if (G_UNLIKELY(wc >=0x1F1E6 && wc <=0x1F1FF))
               {
-                if (prev_GB_type == GB_RI_Odd)
+                if (*prev_GB_type == GB_RI_Odd)
                   GB_type = GB_RI_Even;
-                else if (prev_GB_type == GB_RI_Even)
+                else if (*prev_GB_type == GB_RI_Even)
                   GB_type = GB_RI_Odd;
                 else
                   GB_type = GB_RI_Odd;
@@ -489,20 +302,20 @@ pango_default_break (const gchar   *text,
             break;
 	  }
 
-	/* Rule GB11 */
-	if (met_Extended_Pictographic)
+        /* Rule GB11 */
+	if (*met_Extended_Pictographic)
 	  {
 	    if (GB_type == GB_Extend)
-	      met_Extended_Pictographic = TRUE;
+	      *met_Extended_Pictographic = TRUE;
 	    else if (_pango_Is_Emoji_Extended_Pictographic (prev_wc) &&
 		     GB_type == GB_ZWJ)
-	      met_Extended_Pictographic = TRUE;
-	    else if (prev_GB_type == GB_Extend && GB_type == GB_ZWJ)
-	      met_Extended_Pictographic = TRUE;
-	    else if (prev_GB_type == GB_ZWJ && is_Extended_Pictographic)
-	      met_Extended_Pictographic = TRUE;
+	      *met_Extended_Pictographic = TRUE;
+	    else if (*prev_GB_type == GB_Extend && GB_type == GB_ZWJ)
+	      *met_Extended_Pictographic = TRUE;
+	    else if (*prev_GB_type == GB_ZWJ && is_Extended_Pictographic)
+	      *met_Extended_Pictographic = TRUE;
 	    else
-	      met_Extended_Pictographic = FALSE;
+	      *met_Extended_Pictographic = FALSE;
 	  }
 
 	/* Grapheme Cluster Boundary Rules */
@@ -511,7 +324,7 @@ pango_default_break (const gchar   *text,
 	/* We apply Rules GB1 and GB2 at the end of the function */
 	if (wc == '\n' && prev_wc == '\r')
           is_grapheme_boundary = FALSE; /* Rule GB3 */
-	else if (prev_GB_type == GB_ControlCRLF || GB_type == GB_ControlCRLF)
+	else if (*prev_GB_type == GB_ControlCRLF || GB_type == GB_ControlCRLF)
 	  is_grapheme_boundary = TRUE; /* Rules GB4 and GB5 */
 	else if (GB_type == GB_InHangulSyllable)
 	  is_grapheme_boundary = FALSE; /* Rules GB6, GB7, GB8 */
@@ -521,23 +334,34 @@ pango_default_break (const gchar   *text,
 	  is_grapheme_boundary = FALSE; /* Rule GB9 */
 	else if (GB_type == GB_SpacingMark)
 	  is_grapheme_boundary = FALSE; /* Rule GB9a */
-	else if (prev_GB_type == GB_Prepend)
+	else if (*prev_GB_type == GB_Prepend)
 	  is_grapheme_boundary = FALSE; /* Rule GB9b */
 	else if (is_Extended_Pictographic)
 	  { /* Rule GB11 */
-	    if (prev_GB_type == GB_ZWJ && met_Extended_Pictographic)
+	    if (*prev_GB_type == GB_ZWJ && *met_Extended_Pictographic)
 	      is_grapheme_boundary = FALSE;
 	  }
-	else if (prev_GB_type == GB_RI_Odd && GB_type == GB_RI_Even)
+	else if (*prev_GB_type == GB_RI_Odd && GB_type == GB_RI_Even)
 	  is_grapheme_boundary = FALSE; /* Rule GB12 and GB13 */
 
 	if (is_Extended_Pictographic)
-	  met_Extended_Pictographic = TRUE;
+	  *met_Extended_Pictographic = TRUE;
 
 	attrs[i].is_cursor_position = is_grapheme_boundary;
+
+	*prev_GB_type = GB_type;
+      }
+
+ __attribute__((noinline))
+static void
+determine_backspace (gunichar prev_wc,
+                     gunichar base_character,
+                     PangoLogAttr *attrs,
+                     int i)
+{
 	/* If this is a grapheme boundary, we have to decide if backspace
 	 * deletes a character or the whole grapheme cluster */
-	if (is_grapheme_boundary)
+	if (attrs[i].is_cursor_position)
           {
 	    attrs[i].backspace_deletes_character = BACKSPACE_DELETES_CHARACTER (base_character);
 
@@ -548,14 +372,46 @@ pango_default_break (const gchar   *text,
           }
 	else
 	  attrs[i].backspace_deletes_character = FALSE;
+}
 
-	prev_GB_type = GB_type;
-      }
+  /* See Word_Break Property Values table of UAX#29 */
+  typedef enum
+  {
+    WB_Other,
+    WB_NewlineCRLF,
+    WB_ExtendFormat,
+    WB_Katakana,
+    WB_Hebrew_Letter,
+    WB_ALetter,
+    WB_MidNumLet,
+    WB_MidLetter,
+    WB_MidNum,
+    WB_Numeric,
+    WB_ExtendNumLet,
+    WB_RI_Odd,
+    WB_RI_Even,
+    WB_WSegSpace,
+  } WordBreakType;
 
       /* ---- UAX#29 Word Boundaries ---- */
+ __attribute__((noinline))
+static void
+determine_word_boundaries (gunichar wc,
+                           GUnicodeType type,
+                           GUnicodeBreakType break_type,
+                           gunichar prev_wc,
+                           gboolean is_Extended_Pictographic,
+                           WordBreakType *prev_WB_type,
+                           int *prev_WB_i,
+                           WordBreakType *prev_prev_WB_type,
+                           PangoLogAttr *attrs,
+                           int i)
       {
+        gboolean is_word_boundary;
+
 	is_word_boundary = FALSE;
-	if (is_grapheme_boundary ||
+
+	if (attrs[i].is_cursor_position ||
 	    G_UNLIKELY(wc >=0x1F1E6 && wc <=0x1F1FF)) /* Rules WB3 and WB4 */
 	  {
 	    PangoScript script;
@@ -655,7 +511,7 @@ pango_default_break (const gchar   *text,
 
 		  if (G_UNLIKELY(wc >= 0x1F1E6 && wc <= 0x1F1FF))
 		    {
-                      if (prev_WB_type == WB_RI_Odd)
+                      if (*prev_WB_type == WB_RI_Odd)
                         WB_type = WB_RI_Even;
                       else
                         WB_type = WB_RI_Odd;
@@ -700,8 +556,8 @@ pango_default_break (const gchar   *text,
 	    /* We apply Rules WB1 and WB2 at the end of the function */
 
 	    if (prev_wc == 0x3031 && wc == 0x41)
-	      g_debug ("Y %d %d", prev_WB_type, WB_type);
-	    if (prev_WB_type == WB_NewlineCRLF && prev_WB_i + 1 == i)
+	      g_debug ("Y %d %d", *prev_WB_type, WB_type);
+	    if (*prev_WB_type == WB_NewlineCRLF && *prev_WB_i + 1 == i)
 	      {
 	        /* The extra check for prev_WB_i is to correctly handle sequences like
 		 * Newline ÷ Extend × Extend
@@ -713,80 +569,116 @@ pango_default_break (const gchar   *text,
 	      is_word_boundary = TRUE; /* Rule WB3b */
 	    else if (prev_wc == 0x200D && is_Extended_Pictographic)
 	      is_word_boundary = FALSE; /* Rule WB3c */
-	    else if (prev_WB_type == WB_WSegSpace &&
-		     WB_type == WB_WSegSpace && prev_WB_i + 1 == i)
+	    else if (*prev_WB_type == WB_WSegSpace &&
+		     WB_type == WB_WSegSpace && *prev_WB_i + 1 == i)
 	      is_word_boundary = FALSE; /* Rule WB3d */
 	    else if (WB_type == WB_ExtendFormat)
 	      is_word_boundary = FALSE; /* Rules WB4? */
-	    else if ((prev_WB_type == WB_ALetter  ||
-                      prev_WB_type == WB_Hebrew_Letter ||
-                      prev_WB_type == WB_Numeric) &&
+	    else if ((*prev_WB_type == WB_ALetter  ||
+                      *prev_WB_type == WB_Hebrew_Letter ||
+                      *prev_WB_type == WB_Numeric) &&
                      (WB_type == WB_ALetter  ||
                       WB_type == WB_Hebrew_Letter ||
                       WB_type == WB_Numeric))
 	      is_word_boundary = FALSE; /* Rules WB5, WB8, WB9, WB10 */
-	    else if (prev_WB_type == WB_Katakana && WB_type == WB_Katakana)
+	    else if (*prev_WB_type == WB_Katakana && WB_type == WB_Katakana)
 	      is_word_boundary = FALSE; /* Rule WB13 */
-	    else if ((prev_WB_type == WB_ALetter ||
-                      prev_WB_type == WB_Hebrew_Letter ||
-                      prev_WB_type == WB_Numeric ||
-                      prev_WB_type == WB_Katakana ||
-                      prev_WB_type == WB_ExtendNumLet) &&
+	    else if ((*prev_WB_type == WB_ALetter ||
+                      *prev_WB_type == WB_Hebrew_Letter ||
+                      *prev_WB_type == WB_Numeric ||
+                      *prev_WB_type == WB_Katakana ||
+                      *prev_WB_type == WB_ExtendNumLet) &&
                      WB_type == WB_ExtendNumLet)
 	      is_word_boundary = FALSE; /* Rule WB13a */
-	    else if (prev_WB_type == WB_ExtendNumLet &&
+	    else if (*prev_WB_type == WB_ExtendNumLet &&
                      (WB_type == WB_ALetter ||
                       WB_type == WB_Hebrew_Letter ||
                       WB_type == WB_Numeric ||
                       WB_type == WB_Katakana))
 	      is_word_boundary = FALSE; /* Rule WB13b */
-	    else if (((prev_prev_WB_type == WB_ALetter ||
-                       prev_prev_WB_type == WB_Hebrew_Letter) &&
+	    else if (((*prev_prev_WB_type == WB_ALetter ||
+                       *prev_prev_WB_type == WB_Hebrew_Letter) &&
                       (WB_type == WB_ALetter ||
                        WB_type == WB_Hebrew_Letter)) &&
-		     (prev_WB_type == WB_MidLetter ||
-                      prev_WB_type == WB_MidNumLet ||
+		     (*prev_WB_type == WB_MidLetter ||
+                      *prev_WB_type == WB_MidNumLet ||
                       prev_wc == 0x0027))
 	      {
-		attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB6 */
+		attrs[*prev_WB_i].is_word_boundary = FALSE; /* Rule WB6 */
 		is_word_boundary = FALSE; /* Rule WB7 */
 	      }
-	    else if (prev_WB_type == WB_Hebrew_Letter && wc == 0x0027)
+	    else if (*prev_WB_type == WB_Hebrew_Letter && wc == 0x0027)
               is_word_boundary = FALSE; /* Rule WB7a */
-	    else if (prev_prev_WB_type == WB_Hebrew_Letter && prev_wc == 0x0022 &&
+	    else if (*prev_prev_WB_type == WB_Hebrew_Letter && prev_wc == 0x0022 &&
                      WB_type == WB_Hebrew_Letter)
               {
-                attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB7b */
+                attrs[*prev_WB_i].is_word_boundary = FALSE; /* Rule WB7b */
                 is_word_boundary = FALSE; /* Rule WB7c */
               }
-	    else if ((prev_prev_WB_type == WB_Numeric && WB_type == WB_Numeric) &&
-                     (prev_WB_type == WB_MidNum ||
-                      prev_WB_type == WB_MidNumLet ||
+	    else if ((*prev_prev_WB_type == WB_Numeric && WB_type == WB_Numeric) &&
+                     (*prev_WB_type == WB_MidNum ||
+                      *prev_WB_type == WB_MidNumLet ||
                       prev_wc == 0x0027))
 	      {
 		is_word_boundary = FALSE; /* Rule WB11 */
-		attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB12 */
+		attrs[*prev_WB_i].is_word_boundary = FALSE; /* Rule WB12 */
 	      }
-	    else if (prev_WB_type == WB_RI_Odd && WB_type == WB_RI_Even)
+	    else if (*prev_WB_type == WB_RI_Odd && WB_type == WB_RI_Even)
 	      is_word_boundary = FALSE; /* Rule WB15 and WB16 */
 	    else
 	      is_word_boundary = TRUE; /* Rule WB999 */
 
 	    if (WB_type != WB_ExtendFormat)
 	      {
-		prev_prev_WB_type = prev_WB_type;
-		prev_WB_type = WB_type;
-		prev_WB_i = i;
+		*prev_prev_WB_type = *prev_WB_type;
+		*prev_WB_type = WB_type;
+		*prev_WB_i = i;
 	      }
 	  }
 
 	attrs[i].is_word_boundary = is_word_boundary;
       }
 
+
+  /* See Sentence_Break Property Values table of UAX#29 */
+  typedef enum
+  {
+    SB_Other,
+    SB_ExtendFormat,
+    SB_ParaSep,
+    SB_Sp,
+    SB_Lower,
+    SB_Upper,
+    SB_OLetter,
+    SB_Numeric,
+    SB_ATerm,
+    SB_SContinue,
+    SB_STerm,
+    SB_Close,
+    /* Rules SB8 and SB8a */
+    SB_ATerm_Close_Sp,
+    SB_STerm_Close_Sp,
+  } SentenceBreakType;
       /* ---- UAX#29 Sentence Boundaries ---- */
+
+ __attribute__((noinline))
+static void
+determine_sentence_boundaries (gunichar wc,
+                               GUnicodeType type,
+                               GUnicodeBreakType break_type,
+                               gunichar prev_wc,
+                               SentenceBreakType *prev_SB_type,
+                               int *prev_SB_i,
+                               SentenceBreakType *prev_prev_SB_type,
+                               int *last_sentence_start,
+                               PangoLogAttr *attrs,
+                               int i,
+                               gboolean done)
       {
+        gboolean is_sentence_boundary;
+
 	is_sentence_boundary = FALSE;
-	if (is_word_boundary ||
+	if (attrs[i].is_word_boundary ||
 	    wc == '\r' || wc == '\n') /* Rules SB3 and SB5 */
 	  {
 	    SentenceBreakType SB_type;
@@ -910,7 +802,7 @@ pango_default_break (const gchar   *text,
 
 	    if (wc == '\n' && prev_wc == '\r')
 	      is_sentence_boundary = FALSE; /* Rule SB3 */
-	    else if (prev_SB_type == SB_ParaSep && prev_SB_i + 1 == i)
+	    else if (*prev_SB_type == SB_ParaSep && *prev_SB_i + 1 == i)
 	      {
 		/* The extra check for prev_SB_i is to correctly handle sequences like
 		 * ParaSep ÷ Extend × Extend
@@ -921,80 +813,80 @@ pango_default_break (const gchar   *text,
 	      }
 	    else if (SB_type == SB_ExtendFormat)
 	      is_sentence_boundary = FALSE; /* Rule SB5? */
-	    else if (prev_SB_type == SB_ATerm && SB_type == SB_Numeric)
+	    else if (*prev_SB_type == SB_ATerm && SB_type == SB_Numeric)
 	      is_sentence_boundary = FALSE; /* Rule SB6 */
-	    else if ((prev_prev_SB_type == SB_Upper ||
-		      prev_prev_SB_type == SB_Lower) &&
-		     prev_SB_type == SB_ATerm &&
+	    else if ((*prev_prev_SB_type == SB_Upper ||
+		      *prev_prev_SB_type == SB_Lower) &&
+		     *prev_SB_type == SB_ATerm &&
 		     SB_type == SB_Upper)
 	      is_sentence_boundary = FALSE; /* Rule SB7 */
-	    else if (prev_SB_type == SB_ATerm && SB_type == SB_Close)
+	    else if (*prev_SB_type == SB_ATerm && SB_type == SB_Close)
 		SB_type = SB_ATerm;
-	    else if (prev_SB_type == SB_STerm && SB_type == SB_Close)
+	    else if (*prev_SB_type == SB_STerm && SB_type == SB_Close)
 	      SB_type = SB_STerm;
-	    else if (prev_SB_type == SB_ATerm && SB_type == SB_Sp)
+	    else if (*prev_SB_type == SB_ATerm && SB_type == SB_Sp)
 	      SB_type = SB_ATerm_Close_Sp;
-	    else if (prev_SB_type == SB_STerm && SB_type == SB_Sp)
+	    else if (*prev_SB_type == SB_STerm && SB_type == SB_Sp)
 	      SB_type = SB_STerm_Close_Sp;
 	    /* Rule SB8 */
-	    else if ((prev_SB_type == SB_ATerm ||
-		      prev_SB_type == SB_ATerm_Close_Sp) &&
+	    else if ((*prev_SB_type == SB_ATerm ||
+		      *prev_SB_type == SB_ATerm_Close_Sp) &&
 		     SB_type == SB_Lower)
 	      is_sentence_boundary = FALSE;
-	    else if ((prev_prev_SB_type == SB_ATerm ||
-		      prev_prev_SB_type == SB_ATerm_Close_Sp) &&
-		     IS_OTHER_TERM(prev_SB_type) &&
+	    else if ((*prev_prev_SB_type == SB_ATerm ||
+		      *prev_prev_SB_type == SB_ATerm_Close_Sp) &&
+		     IS_OTHER_TERM(*prev_SB_type) &&
 		     SB_type == SB_Lower)
               {
-	        attrs[prev_SB_i].is_sentence_boundary = FALSE;
-	        attrs[prev_SB_i].is_sentence_start = FALSE;
-	        attrs[prev_SB_i].is_sentence_end = FALSE;
-                last_sentence_start = -1;
-                for (int j = prev_SB_i - 1; j >= 0; j--)
+	        attrs[*prev_SB_i].is_sentence_boundary = FALSE;
+	        attrs[*prev_SB_i].is_sentence_start = FALSE;
+	        attrs[*prev_SB_i].is_sentence_end = FALSE;
+                *last_sentence_start = -1;
+                for (int j = *prev_SB_i - 1; j >= 0; j--)
                   {
                     if (attrs[j].is_sentence_boundary)
                       {
-                        last_sentence_start = j;
+                        *last_sentence_start = j;
                         break;
                       }
                   }
               }
-	    else if ((prev_SB_type == SB_ATerm ||
-		      prev_SB_type == SB_ATerm_Close_Sp ||
-		      prev_SB_type == SB_STerm ||
-		      prev_SB_type == SB_STerm_Close_Sp) &&
+	    else if ((*prev_SB_type == SB_ATerm ||
+		      *prev_SB_type == SB_ATerm_Close_Sp ||
+		      *prev_SB_type == SB_STerm ||
+		      *prev_SB_type == SB_STerm_Close_Sp) &&
 		     (SB_type == SB_SContinue ||
 		      SB_type == SB_ATerm || SB_type == SB_STerm))
 	      is_sentence_boundary = FALSE; /* Rule SB8a */
-	    else if ((prev_SB_type == SB_ATerm ||
-		      prev_SB_type == SB_STerm) &&
+	    else if ((*prev_SB_type == SB_ATerm ||
+		      *prev_SB_type == SB_STerm) &&
 		     (SB_type == SB_Close || SB_type == SB_Sp ||
 		      SB_type == SB_ParaSep))
 	      is_sentence_boundary = FALSE; /* Rule SB9 */
-	    else if ((prev_SB_type == SB_ATerm ||
-		      prev_SB_type == SB_ATerm_Close_Sp ||
-		      prev_SB_type == SB_STerm ||
-		      prev_SB_type == SB_STerm_Close_Sp) &&
+	    else if ((*prev_SB_type == SB_ATerm ||
+		      *prev_SB_type == SB_ATerm_Close_Sp ||
+		      *prev_SB_type == SB_STerm ||
+		      *prev_SB_type == SB_STerm_Close_Sp) &&
 		     (SB_type == SB_Sp || SB_type == SB_ParaSep))
 	      is_sentence_boundary = FALSE; /* Rule SB10 */
-	    else if ((prev_SB_type == SB_ATerm ||
-		      prev_SB_type == SB_ATerm_Close_Sp ||
-		      prev_SB_type == SB_STerm ||
-		      prev_SB_type == SB_STerm_Close_Sp) &&
+	    else if ((*prev_SB_type == SB_ATerm ||
+		      *prev_SB_type == SB_ATerm_Close_Sp ||
+		      *prev_SB_type == SB_STerm ||
+		      *prev_SB_type == SB_STerm_Close_Sp) &&
 		     SB_type != SB_ParaSep)
 	      is_sentence_boundary = TRUE; /* Rule SB11 */
 	    else
 	      is_sentence_boundary = FALSE; /* Rule SB998 */
 
 	    if (SB_type != SB_ExtendFormat &&
-		!((prev_prev_SB_type == SB_ATerm ||
-		   prev_prev_SB_type == SB_ATerm_Close_Sp) &&
-		  IS_OTHER_TERM(prev_SB_type) &&
+		!((*prev_prev_SB_type == SB_ATerm ||
+		   *prev_prev_SB_type == SB_ATerm_Close_Sp) &&
+		  IS_OTHER_TERM(*prev_SB_type) &&
 		  IS_OTHER_TERM(SB_type)))
               {
-                prev_prev_SB_type = prev_SB_type;
-                prev_SB_type = SB_type;
-                prev_SB_i = i;
+                *prev_prev_SB_type = *prev_SB_type;
+                *prev_SB_type = SB_type;
+                *prev_SB_i = i;
               }
 
 #undef IS_OTHER_TERM
@@ -1007,7 +899,38 @@ pango_default_break (const gchar   *text,
 	attrs[i].is_sentence_boundary = is_sentence_boundary;
       }
 
+
       /* ---- Line breaking ---- */
+  typedef enum
+  {
+    LB_Other, 
+    LB_Numeric,
+    LB_Numeric_Close,
+    LB_RI_Odd,
+    LB_RI_Even, 
+  } LineBreakType;
+
+ __attribute__((noinline))
+static void
+determine_line_breaks (gunichar wc,
+                       GUnicodeType type,
+                       GUnicodeBreakType break_type,
+                       gunichar prev_wc,
+                       GUnicodeBreakType *pprev_break_type,
+                       GUnicodeBreakType *pprev_prev_break_type,
+                       GUnicodeBreakType next_break_type,
+                       LineBreakType *pprev_LB_type,
+                       JamoType jamo,
+                       JamoType *pprev_jamo,
+                       PangoLogAttr *attrs,
+                       int i)
+{
+      gboolean can_break;
+      BreakOpportunity break_op;
+      GUnicodeBreakType row_break_type;
+      GUnicodeBreakType prev_break_type = *pprev_break_type;
+      GUnicodeBreakType prev_prev_break_type = *pprev_prev_break_type;
+      LineBreakType prev_LB_type = *pprev_LB_type;
 
       break_op = BREAK_ALREADY_HANDLED;
 
@@ -1373,12 +1296,12 @@ pango_default_break (const gchar   *text,
 		  break_type == G_UNICODE_BREAK_INFIX_SEPARATOR)
 		{
 		  if (prev_LB_type != LB_Numeric)
-		    prev_LB_type = LB_type;
+		    *pprev_LB_type = LB_type;
 		  /* else don't change the prev_LB_type */
 		}
 	      else
 		{
-		  prev_LB_type = LB_type;
+		  *pprev_LB_type = LB_type;
 		}
 	    }
 	  /* else don't change the prev_LB_type for Rule LB9 */
@@ -1388,8 +1311,8 @@ pango_default_break (const gchar   *text,
         {
           if (prev_break_type != G_UNICODE_BREAK_SPACE)
             {
-              prev_prev_break_type = prev_break_type;
-              prev_break_type = break_type;
+              *pprev_prev_break_type = prev_break_type;
+              *pprev_break_type = break_type;
             }
           /* else don't change the prev_break_type */
         }
@@ -1403,20 +1326,33 @@ pango_default_break (const gchar   *text,
               prev_break_type == G_UNICODE_BREAK_NEXT_LINE ||
               prev_break_type == G_UNICODE_BREAK_SPACE ||
               prev_break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE)
-            prev_break_type = G_UNICODE_BREAK_ALPHABETIC; /* Rule LB10 */
+            *pprev_break_type = G_UNICODE_BREAK_ALPHABETIC; /* Rule LB10 */
               /* else don't change the prev_break_type for Rule LB9 */
 
-          prev_jamo = jamo;
+          *pprev_jamo = jamo;
         }
       else
         {
-          prev_prev_break_type = prev_break_type;
-          prev_break_type = break_type;
-          prev_jamo = jamo;
+          *pprev_prev_break_type = prev_break_type;
+          *pprev_break_type = break_type;
+          *pprev_jamo = jamo;
         }
+}
 
       /* ---- Word breaks ---- */
 
+ __attribute__((noinline))
+static void
+determine_word_breaks (gunichar wc,
+                       GUnicodeType type,
+                       WordType *pcurrent_word_type,
+                       gunichar *plast_word_letter,
+                       PangoLogAttr *attrs,
+                       int i)
+{
+  WordType current_word_type = *pcurrent_word_type;
+  gunichar last_word_letter = *plast_word_letter;
+
       /* default to not a word start/end */
       attrs[i].is_word_start = FALSE;
       attrs[i].is_word_end = FALSE;
@@ -1457,19 +1393,19 @@ pango_default_break (const gchar   *text,
 			attrs[i].is_word_end = TRUE;
 		    }
 		}
-	      last_word_letter = wc;
+	      *plast_word_letter = wc;
 	      break;
 
 	    case G_UNICODE_DECIMAL_NUMBER:
 	    case G_UNICODE_LETTER_NUMBER:
 	    case G_UNICODE_OTHER_NUMBER:
-	      last_word_letter = wc;
+	      *plast_word_letter = wc;
 	      break;
 
 	    default:
 	      /* Punctuation, control/format chars, etc. all end a word. */
 	      attrs[i].is_word_end = TRUE;
-	      current_word_type = WordNone;
+	      *pcurrent_word_type = WordNone;
 	      break;
 	    }
 	}
@@ -1483,16 +1419,16 @@ pango_default_break (const gchar   *text,
 	    case G_UNICODE_OTHER_LETTER:
 	    case G_UNICODE_TITLECASE_LETTER:
 	    case G_UNICODE_UPPERCASE_LETTER:
-	      current_word_type = WordLetters;
-	      last_word_letter = wc;
+	      *pcurrent_word_type = WordLetters;
+	      *plast_word_letter = wc;
 	      attrs[i].is_word_start = TRUE;
 	      break;
 
 	    case G_UNICODE_DECIMAL_NUMBER:
 	    case G_UNICODE_LETTER_NUMBER:
 	    case G_UNICODE_OTHER_NUMBER:
-	      current_word_type = WordNumbers;
-	      last_word_letter = wc;
+	      *pcurrent_word_type = WordNumbers;
+	      *plast_word_letter = wc;
 	      attrs[i].is_word_start = TRUE;
 	      break;
 
@@ -1501,42 +1437,236 @@ pango_default_break (const gchar   *text,
 	      break;
 	    }
 	}
+}
 
       /* ---- Sentence breaks ---- */
+static void
+determine_sentence_breaks (gunichar wc,
+                           GUnicodeType type,
+                           int *last_sentence_start,
+                           int *last_non_space,
+                           PangoLogAttr *attrs,
+                           int i)
       {
-
 	/* default to not a sentence start/end */
 	attrs[i].is_sentence_start = FALSE;
 	attrs[i].is_sentence_end = FALSE;
 
 	/* maybe start sentence */
-	if (last_sentence_start == -1 && !is_sentence_boundary)
-	  last_sentence_start = i - 1;
+	if (*last_sentence_start == -1 && !attrs[i].is_sentence_boundary)
+	  *last_sentence_start = i - 1;
 
 	/* remember last non space character position */
 	if (i > 0 && !attrs[i - 1].is_white)
-	  last_non_space = i;
+	  *last_non_space = i;
 
 	/* meets sentence end, mark both sentence start and end */
-	if (last_sentence_start != -1 && is_sentence_boundary) {
-	  if (last_non_space != -1) {
-	    attrs[last_sentence_start].is_sentence_start = TRUE;
-	    attrs[last_non_space].is_sentence_end = TRUE;
+
+	if (*last_sentence_start != -1 && attrs[i].is_sentence_boundary) {
+	  if (*last_non_space != -1) {
+	    attrs[*last_sentence_start].is_sentence_start = TRUE;
+	    attrs[*last_non_space].is_sentence_end = TRUE;
 	  }
 
-	  last_sentence_start = -1;
-	  last_non_space = -1;
+	  *last_sentence_start = -1;
+	  *last_non_space = -1;
 	}
 
 	/* meets space character, move sentence start */
-	if (last_sentence_start != -1 &&
-	    last_sentence_start == i - 1 &&
+	if (*last_sentence_start != -1 &&
+	    *last_sentence_start == i - 1 &&
 	    attrs[i - 1].is_white) {
-	    last_sentence_start++;
+	    (*last_sentence_start)++;
           }
 
       }
 
+/**
+ * pango_default_break:
+ * @text: text to break. Must be valid UTF-8
+ * @length: length of text in bytes (may be -1 if @text is nul-terminated)
+ * @analysis: (nullable): a `PangoAnalysis` structure for the @text
+ * @attrs: logical attributes to fill in
+ * @attrs_len: size of the array passed as @attrs
+ *
+ * This is the default break algorithm.
+ *
+ * It applies Unicode rules without language-specific
+ * tailoring, therefore the @analyis argument is unused
+ * and can be %NULL.
+ *
+ * See [func@Pango.tailor_break] for language-specific breaks.
+ */
+void
+pango_default_break (const gchar   *text,
+		     gint           length,
+		     PangoAnalysis *analysis G_GNUC_UNUSED,
+		     PangoLogAttr  *attrs,
+		     int            attrs_len G_GNUC_UNUSED)
+{
+  /* The rationale for all this is in section 5.15 of the Unicode 3.0 book,
+   * the line breaking stuff is also in TR14 on unicode.org
+   */
+
+  /* This is a default break implementation that should work for nearly all
+   * languages. Language engines can override it optionally.
+   */
+
+  /* FIXME one cheesy optimization here would be to memset attrs to 0
+   * before we start, and then never assign %FALSE to anything
+   */
+
+  const gchar *next;
+  gint i;
+
+  gunichar prev_wc;
+  gunichar next_wc;
+
+  JamoType prev_jamo;
+
+  GUnicodeBreakType next_break_type;
+  GUnicodeBreakType prev_break_type;
+  GUnicodeBreakType prev_prev_break_type;
+
+  GraphemeBreakType prev_GB_type = GB_Other;
+  gboolean met_Extended_Pictographic = FALSE;
+
+  WordBreakType prev_prev_WB_type = WB_Other, prev_WB_type = WB_Other;
+  gint prev_WB_i = -1;
+
+  SentenceBreakType prev_prev_SB_type = SB_Other, prev_SB_type = SB_Other;
+  gint prev_SB_i = -1;
+
+  LineBreakType prev_LB_type = LB_Other;
+
+  WordType current_word_type = WordNone;
+  gunichar last_word_letter = 0;
+  gunichar base_character = 0;
+
+  gint last_sentence_start = -1;
+  gint last_non_space = -1;
+
+  gboolean almost_done = FALSE;
+  gboolean done = FALSE;
+
+  g_return_if_fail (length == 0 || text != NULL);
+  g_return_if_fail (attrs != NULL);
+
+  next = text;
+
+  prev_break_type = G_UNICODE_BREAK_UNKNOWN;
+  prev_prev_break_type = G_UNICODE_BREAK_UNKNOWN;
+  prev_wc = 0;
+  prev_jamo = NO_JAMO;
+
+  if (length == 0 || *text == '\0')
+    {
+      next_wc = PARAGRAPH_SEPARATOR;
+      almost_done = TRUE;
+    }
+  else
+    next_wc = g_utf8_get_char (next);
+
+  next_break_type = g_unichar_break_type (next_wc);
+  next_break_type = BREAK_TYPE_SAFE (next_break_type);
+
+  for (i = 0; !done ; i++)
+    {
+      GUnicodeType type;
+      gunichar wc;
+      GUnicodeBreakType break_type;
+      JamoType jamo;
+      gboolean makes_hangul_syllable;
+
+      /* Emoji extended pictographics */
+      gboolean is_Extended_Pictographic;
+
+      wc = next_wc;
+      break_type = next_break_type;
+
+      if (almost_done)
+	{
+	  /*
+	   * If we have already reached the end of @text g_utf8_next_char()
+	   * may not increment next
+	   */
+	  next_wc = 0;
+	  next_break_type = G_UNICODE_BREAK_UNKNOWN;
+	  done = TRUE;
+	}
+      else
+	{
+	  next = g_utf8_next_char (next);
+
+	  if ((length >= 0 && next >= text + length) || *next == '\0')
+	    {
+	      /* This is how we fill in the last element (end position) of the
+	       * attr array - assume there's a paragraph separators off the end
+	       * of @text.
+	       */
+	      next_wc = PARAGRAPH_SEPARATOR;
+	      almost_done = TRUE;
+	    }
+	  else
+	    next_wc = g_utf8_get_char (next);
+
+	  next_break_type = g_unichar_break_type (next_wc);
+	  next_break_type = BREAK_TYPE_SAFE (next_break_type);
+	}
+
+      type = g_unichar_type (wc);
+      jamo = JAMO_TYPE (break_type);
+
+      /* Determine wheter this forms a Hangul syllable with prev. */
+      if (jamo == NO_JAMO)
+	makes_hangul_syllable = FALSE;
+      else
+	{
+	  JamoType prev_end   = HangulJamoProps[prev_jamo].end  ;
+	  JamoType this_start = HangulJamoProps[     jamo].start;
+
+	  /* See comments before IS_JAMO */
+	  makes_hangul_syllable = (prev_end == this_start) || (prev_end + 1 == this_start);
+	}
+
+      is_Extended_Pictographic =
+        _pango_Is_Emoji_Extended_Pictographic (wc);
+
+      determine_whitespace (wc, type, attrs, i);
+
+      determine_grapheme_boundaries (wc, type, prev_wc,
+                                     makes_hangul_syllable,
+                                     is_Extended_Pictographic,
+                                     &met_Extended_Pictographic,
+                                     &prev_GB_type,
+                                     attrs, i);
+
+      determine_backspace (prev_wc, base_character, attrs, i);
+
+      determine_word_boundaries (wc, type, break_type, prev_wc,
+                                 is_Extended_Pictographic,
+                                 &prev_WB_type, &prev_WB_i,
+                                 &prev_prev_WB_type,
+                                 attrs, i);
+
+      determine_sentence_boundaries (wc, type, break_type, prev_wc,
+                                     &prev_SB_type, &prev_SB_i,
+                                     &prev_prev_SB_type,
+                                     &last_sentence_start,
+                                     attrs, i, done);
+
+      determine_line_breaks (wc, type, break_type, prev_wc,
+                             &prev_break_type, &prev_prev_break_type,
+                             next_break_type,
+                             &prev_LB_type, jamo, &prev_jamo,
+                             attrs, i);
+
+      determine_word_breaks (wc, type, &current_word_type, &last_word_letter,
+                             attrs, i);
+
+      determine_sentence_breaks (wc, type, &last_sentence_start, &last_non_space,
+                                 attrs, i);
+
       prev_wc = wc;
 
       /* wc might not be a valid Unicode base character, but really all we
author	Matthias Clasen <mclasen@redhat.com>	2021-07-28 14:09:23 -0400
committer	Matthias Clasen <mclasen@redhat.com>	2021-07-30 07:58:16 -0400
commit	73cd563d7bd19b31b57e94e3bb0969970990e589 (patch)
tree	44c5627509af2ca627b8b0c7ae8f298587cfe0bb
parent	730a7eca52118d8a5d1f546f34e5de91ee75be4c (diff)
download	pango-73cd563d7bd19b31b57e94e3bb0969970990e589.tar.gz