break: Cache per-character data

author: Matthias Clasen <mclasen@redhat.com> 2021-07-28 16:10:26 -0400
committer: Matthias Clasen <mclasen@redhat.com> 2021-07-30 07:58:16 -0400
commit: 43cdd9fbab3ababaae7308feec9e4dabdf8dc3a0 (patch)
tree: ebfe5d942562148551e864a29f2857e72c854da1
parent: d16db0d6730ac4f93f6557bd01f3ba5fa75264a3 (diff)
download: pango-43cdd9fbab3ababaae7308feec9e4dabdf8dc3a0.tar.gz
1 files changed, 66 insertions, 48 deletions
diff --git a/pango/break.c b/pango/break.c
index b9cf3cae..aedf54c4 100644
--- a/pango/break.c
+++ b/pango/break.c
@@ -138,6 +138,37 @@ typedef enum
   WordNumbers
 } WordType;
 
+static inline void /* Fetch the per-character Unicode data for wc via a small static cache. */
+get_unichar_data (gunichar           wc,
+                  GUnicodeType      *type,                  /* out: g_unichar_type (wc) */
+                  GUnicodeBreakType *break_type,            /* out: g_unichar_break_type (wc) */
+                  PangoScript       *script,                /* out: g_unichar_get_script (wc) */
+                  gboolean          *extended_pictographic) /* out: _pango_Is_Emoji_Extended_Pictographic (wc) */
+{
+  static struct { /* NOTE(review): shared mutable state, no locking visible here - confirm thread-safety */
+    gunichar wc; /* code point whose data this slot currently holds */
+    GUnicodeType type;
+    GUnicodeBreakType break_type;
+    PangoScript script;
+    gboolean extended_pictographic;
+  } cache[256] = { 0, }, *p; /* NOTE(review): zeroed slot 0 already "hits" for wc == 0 - confirm all-zero data is right */
+  /* Direct-mapped lookup: the low 8 bits of wc select one of the 256 slots. */
+  p = &cache[wc & 0xff];
+  /* Miss: the slot holds a different code point, so recompute and overwrite it. */
+  if (G_UNLIKELY (p->wc != wc))
+    {
+      p->wc = wc;
+      p->type = g_unichar_type (wc);
+      p->break_type = g_unichar_break_type (wc); /* NOTE(review): BREAK_TYPE_SAFE is not applied here - confirm intended */
+      p->script = (PangoScript)g_unichar_get_script (wc);
+      p->extended_pictographic = _pango_Is_Emoji_Extended_Pictographic (wc);
+    }
+  /* Hit or freshly filled: copy the slot's values to the out parameters. */
+  *type = p->type;
+  *break_type = p->break_type;
+  *script = p->script;
+  *extended_pictographic = p->extended_pictographic;
+}
 
 /**
  * pango_default_break:
@@ -182,10 +213,14 @@ pango_default_break (const gchar   *text,
 
   JamoType prev_jamo;
 
+  GUnicodeType next_type;
   GUnicodeBreakType next_break_type;
   GUnicodeBreakType prev_break_type;
   GUnicodeBreakType prev_prev_break_type;
 
+  PangoScript next_script;
+  gboolean next_Extended_Pictographic;
+
   /* See Grapheme_Cluster_Break Property Values table of UAX#29 */
   typedef enum
   {
@@ -256,6 +291,7 @@ pango_default_break (const gchar   *text,
     LB_RI_Odd,
     LB_RI_Even,
   } LineBreakType;
+  LineBreakType LB_type;
   LineBreakType prev_LB_type = LB_Other;
 
   WordType current_word_type = WordNone;
@@ -286,8 +322,7 @@ pango_default_break (const gchar   *text,
   else
     next_wc = g_utf8_get_char (next);
 
-  next_break_type = g_unichar_break_type (next_wc);
-  next_break_type = BREAK_TYPE_SAFE (next_break_type);
+  get_unichar_data (next_wc, &next_type, &next_break_type, &next_script, &next_Extended_Pictographic);
 
   for (i = 0; !done ; i++)
     {
@@ -299,6 +334,8 @@ pango_default_break (const gchar   *text,
       JamoType jamo;
       gboolean makes_hangul_syllable;
 
+      PangoScript script;
+
       /* UAX#29 boundaries */
       gboolean is_grapheme_boundary;
       gboolean is_word_boundary;
@@ -310,7 +347,10 @@ pango_default_break (const gchar   *text,
       gboolean can_break;
 
       wc = next_wc;
+      type = next_type;
       break_type = next_break_type;
+      script = next_script;
+      is_Extended_Pictographic = next_Extended_Pictographic;
 
       if (almost_done)
 	{
@@ -319,6 +359,7 @@ pango_default_break (const gchar   *text,
 	   * may not increment next
 	   */
 	  next_wc = 0;
+          next_type = 0;
 	  next_break_type = G_UNICODE_BREAK_UNKNOWN;
 	  done = TRUE;
 	}
@@ -338,11 +379,9 @@ pango_default_break (const gchar   *text,
 	  else
 	    next_wc = g_utf8_get_char (next);
 
-	  next_break_type = g_unichar_break_type (next_wc);
-	  next_break_type = BREAK_TYPE_SAFE (next_break_type);
+          get_unichar_data (next_wc, &next_type, &next_break_type, &next_script, &next_Extended_Pictographic);
 	}
 
-      type = g_unichar_type (wc);
       jamo = JAMO_TYPE (break_type);
 
       /* Determine wheter this forms a Hangul syllable with prev. */
@@ -380,9 +419,6 @@ pango_default_break (const gchar   *text,
           break;
         }
 
-      is_Extended_Pictographic =
-	_pango_Is_Emoji_Extended_Pictographic (wc);
-
 
       /* ---- UAX#29 Grapheme Boundaries ---- */
       {
@@ -558,11 +594,8 @@ pango_default_break (const gchar   *text,
 	if (is_grapheme_boundary ||
 	    G_UNLIKELY(wc >=0x1F1E6 && wc <=0x1F1FF)) /* Rules WB3 and WB4 */
 	  {
-	    PangoScript script;
 	    WordBreakType WB_type;
 
-	    script = (PangoScript)g_unichar_get_script (wc);
-
 	    /* Find the WordBreakType of wc */
 	    WB_type = WB_Other;
 
@@ -1025,8 +1058,10 @@ pango_default_break (const gchar   *text,
        */
       can_break = attrs[i].is_cursor_position;
 
-      /* Rule LB1:
-	 assign a line breaking class to each code point of the input. */
+      LB_type = LB_Other;
+
+      /* Rule LB1: assign a line breaking class to each code point of the input. */
+      /* Also determine if we can break */
       switch (break_type)
 	{
 	case G_UNICODE_BREAK_AMBIGUOUS:
@@ -1058,50 +1093,33 @@ pango_default_break (const gchar   *text,
 	case G_UNICODE_BREAK_HANGUL_LV_SYLLABLE:
 	case G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE:
 	case G_UNICODE_BREAK_EMOJI_MODIFIER:
-	case G_UNICODE_BREAK_REGIONAL_INDICATOR:
           can_break = TRUE;
           break;
 
+        case G_UNICODE_BREAK_REGIONAL_INDICATOR:
+          can_break = TRUE;
+          if (prev_LB_type == LB_RI_Odd)
+            LB_type = LB_RI_Even;
+          else
+            LB_type = LB_RI_Odd;
+          break;
+
+        case G_UNICODE_BREAK_NUMERIC:
+          LB_type = LB_Numeric;
+          break;
+
+        case G_UNICODE_BREAK_CLOSE_PUNCTUATION:
+        case G_UNICODE_BREAK_CLOSE_PARANTHESIS:
+          if (prev_LB_type == LB_Numeric)
+            LB_type = LB_Numeric_Close;
+          break;
+
 	default:
 	  ;
 	}
 
       if (can_break)
 	{
-	  LineBreakType LB_type;
-
-	  /* Find the LineBreakType of wc */
-	  LB_type = LB_Other;
-
-	  if (break_type == G_UNICODE_BREAK_NUMERIC)
-	    LB_type = LB_Numeric;
-
-	  if (break_type == G_UNICODE_BREAK_SYMBOL ||
-	      break_type == G_UNICODE_BREAK_INFIX_SEPARATOR)
-	    {
-	      if (!(prev_LB_type == LB_Numeric))
-		LB_type = LB_Other;
-	    }
-
-	  if (break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION ||
-	      break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS)
-	    {
-	      if (prev_LB_type == LB_Numeric)
-		LB_type = LB_Numeric_Close;
-	      else
-		LB_type = LB_Other;
-	    }
-
-	  if (break_type == G_UNICODE_BREAK_REGIONAL_INDICATOR)
-	    {
-	      if (prev_LB_type == LB_RI_Odd)
-		LB_type = LB_RI_Even;
-	      else if (prev_LB_type == LB_RI_Even)
-		LB_type = LB_RI_Odd;
-	      else
-		LB_type = LB_RI_Odd;
-	    }
-
 	  attrs[i].is_line_break = TRUE; /* Rule LB31 */
 	  /* Unicode doesn't specify char wrap;
 	     we wrap around all chars currently. */
author	Matthias Clasen <mclasen@redhat.com>	2021-07-28 16:10:26 -0400
committer	Matthias Clasen <mclasen@redhat.com>	2021-07-30 07:58:16 -0400
commit	43cdd9fbab3ababaae7308feec9e4dabdf8dc3a0 (patch)
tree	ebfe5d942562148551e864a29f2857e72c854da1
parent	d16db0d6730ac4f93f6557bd01f3ba5fa75264a3 (diff)
download	pango-43cdd9fbab3ababaae7308feec9e4dabdf8dc3a0.tar.gz