Add segmentation attributes (ref: break-tailoring)

Add attributes that let us override word and sentence boundaries (and, indirectly, line breaks). Tests included.
author: Matthias Clasen <mclasen@redhat.com> 2021-08-21 23:54:03 -0400
committer: Matthias Clasen <mclasen@redhat.com> 2021-08-25 01:08:02 -0400
commit: b614ea2b06b3c9defaceb92b6904fa8a92249abe (patch)
tree: c20fdc8a2dc8a85561e3a8df5bf147d3c2888cd5 /pango
parent: 3aee7615e9a123ae750e49e9864bdaa4b267cdbb (diff)
download: pango-b614ea2b06b3c9defaceb92b6904fa8a92249abe.tar.gz
5 files changed, 453 insertions, 48 deletions
diff --git a/pango/break.c b/pango/break.c
index d348f9b8..864ac339 100644
--- a/pango/break.c
+++ b/pango/break.c
@@ -1699,64 +1699,325 @@ break_script (const char          *item_text,
 /* }}} */
 /* {{{ Attribute-based customization */
 
+/* We allow customizing log attrs in two ways:
+ *
+ * - You can directly remove breaks from a range, using allow_breaks=false.
+ *   We preserve the non-tailorable rules from UAX #14, so mandatory breaks
+ *   and breaks after ZWS remain. We also preserve break opportunities after
+ *   hyphens and visible word dividers.
+ *
+ * - You can tweak the segmentation by marking ranges as word or sentence.
+ *   When doing so, we split adjacent segments to preserve alternating
+ *   starts and ends. We add a line break opportunity before each word that
+ *   is created in this way, and we remove line break opportunities inside
+ *   the word in the same way as for a range marked as allow_breaks=false,
+ *   except that we don't remove char break opportunities.
+ *
+ *   Note that UAX #29 does not guarantee that words fall neatly into
+ *   sentences, so we don't do extra work to enforce that.
+ */
+
+static void
+remove_breaks_from_range (const char   *text,
+                          int           start,
+                          PangoLogAttr *log_attrs,
+                          int           start_pos,
+                          int           end_pos)
+{
+  int pos;
+  const char *p;
+  gunichar ch;
+  int bt;
+  gboolean after_zws;
+  gboolean after_hyphen;
+
+  /* Assume our range doesn't start after a hyphen or in a zws sequence */
+  after_zws = FALSE;
+  after_hyphen = FALSE;
+  for (pos = start_pos + 1, p = g_utf8_next_char (text + start);
+       pos < end_pos;
+       pos++, p = g_utf8_next_char (p))
+    {
+      /* Mandatory breaks aren't tailorable */
+      if (!log_attrs[pos].is_mandatory_break)
+        log_attrs[pos].is_line_break = FALSE;
+
+      ch = g_utf8_get_char (p);
+      bt = g_unichar_break_type (ch);
+
+      /* Hyphens and visible word dividers */
+      if (after_hyphen)
+        log_attrs[pos].is_line_break = TRUE;
+
+      after_hyphen = ch == 0x00ad || /* Soft Hyphen */
+         ch == 0x05A0 || ch == 0x2010 || /* Breaking Hyphens */
+         ch == 0x2012 || ch == 0x2013 ||
+         ch == 0x05BE || ch == 0x0F0B || /* Visible word dividers */
+         ch == 0x1361 || ch == 0x17D8 ||
+         ch == 0x17DA || ch == 0x2027 ||
+         ch == 0x007C;
+
+      /* ZWS sequence */
+      if (after_zws && bt != G_UNICODE_BREAK_SPACE)
+        log_attrs[pos].is_line_break = TRUE;
+
+      after_zws = bt == G_UNICODE_BREAK_ZERO_WIDTH_SPACE ||
+                  (bt == G_UNICODE_BREAK_SPACE && after_zws);
+    }
+}
+
 static gboolean
-break_attrs (const char   *text,
-             int           length,
-             GSList       *attributes,
-             int           offset,
-             PangoLogAttr *log_attrs,
-             int           log_attrs_len)
+handle_allow_breaks (const char    *text,
+                     int            length,
+                     PangoAttrList *attrs,
+                     int            offset,
+                     PangoLogAttr  *log_attrs,
+                     int            log_attrs_len)
 {
-  PangoAttrList list;
-  PangoAttrList hyphens;
   PangoAttrIterator iter;
-  GSList *l;
+  gboolean tailored = FALSE;
 
-  _pango_attr_list_init (&list);
-  _pango_attr_list_init (&hyphens);
+  _pango_attr_list_get_iterator (attrs, &iter);
 
-  for (l = attributes; l; l = l->next)
+  do
     {
-      PangoAttribute *attr = l->data;
+      const PangoAttribute *attr = pango_attr_iterator_get (&iter, PANGO_ATTR_ALLOW_BREAKS);
 
-      if (attr->klass->type == PANGO_ATTR_ALLOW_BREAKS)
-        pango_attr_list_insert (&list, pango_attribute_copy (attr));
-      else if (attr->klass->type == PANGO_ATTR_INSERT_HYPHENS)
-        pango_attr_list_insert (&hyphens, pango_attribute_copy (attr));
+      if (!attr)
+        continue;
+
+      if (!((PangoAttrInt*)attr)->value)
+        {
+          int start, end;
+          int start_pos, end_pos;
+          int pos;
+
+          start = attr->start_index;
+          end = attr->end_index;
+          if (start < offset)
+            start_pos = 0;
+          else
+            start_pos = g_utf8_pointer_to_offset (text, text + start - offset);
+          if (end >= offset + length)
+            end_pos = log_attrs_len;
+          else
+            end_pos = g_utf8_pointer_to_offset (text, text + end - offset);
+
+          for (pos = start_pos + 1; pos < end_pos; pos++)
+            log_attrs[pos].is_char_break = FALSE;
+
+          remove_breaks_from_range (text, MAX (start - offset, 0), log_attrs, start_pos, end_pos);
+
+          tailored = TRUE;
+        }
     }
+  while (pango_attr_iterator_next (&iter));
 
-  _pango_attr_list_get_iterator (&list, &iter);
-  do {
-    const PangoAttribute *attr = pango_attr_iterator_get (&iter, PANGO_ATTR_ALLOW_BREAKS);
+  _pango_attr_iterator_destroy (&iter);
 
-    if (attr && ((PangoAttrInt*)attr)->value == 0)
-      {
-        int start, end;
-        int start_pos, end_pos;
-        int pos;
+  return tailored;
+}
 
-        pango_attr_iterator_range (&iter, &start, &end);
-        if (start < offset)
-          start_pos = 0;
-        else
-          start_pos = g_utf8_pointer_to_offset (text, text + start - offset);
-        if (end >= offset + length)
-          end_pos = log_attrs_len;
-        else
-          end_pos = g_utf8_pointer_to_offset (text, text + end - offset);
 
-        for (pos = start_pos + 1; pos < end_pos; pos++)
-          {
-            log_attrs[pos].is_mandatory_break = FALSE;
-            log_attrs[pos].is_line_break = FALSE;
-            log_attrs[pos].is_char_break = FALSE;
-          }
-      }
-  } while (pango_attr_iterator_next (&iter));
+static gboolean
+handle_words (const char    *text,
+              int            length,
+              PangoAttrList *attrs,
+              int            offset,
+              PangoLogAttr  *log_attrs,
+              int            log_attrs_len)
+{
+  PangoAttrIterator iter;
+  gboolean tailored = FALSE;
+
+  _pango_attr_list_get_iterator (attrs, &iter);
+
+  do
+    {
+      const PangoAttribute *attr = pango_attr_iterator_get (&iter, PANGO_ATTR_WORD);
+      int start, end;
+      int start_pos, end_pos;
+      int pos;
+
+      if (!attr)
+        continue;
+
+      start = attr->start_index;
+      end = attr->end_index;
+      if (start < offset)
+        start_pos = 0;
+      else
+        start_pos = g_utf8_pointer_to_offset (text, text + start - offset);
+      if (end >= offset + length)
+        end_pos = log_attrs_len;
+      else
+        end_pos = g_utf8_pointer_to_offset (text, text + end - offset);
+
+      for (pos = start_pos + 1; pos < end_pos; pos++)
+        {
+          log_attrs[pos].is_word_start = FALSE;
+          log_attrs[pos].is_word_end = FALSE;
+          log_attrs[pos].is_word_boundary = FALSE;
+        }
+
+      remove_breaks_from_range (text, MAX (start - offset, 0), log_attrs,
+                                start_pos, end_pos);
+
+      if (start >= offset)
+        {
+          gboolean in_word = FALSE;
+          for (pos = start_pos - 1; pos >= 0; pos--)
+            {
+              if (log_attrs[pos].is_word_end)
+                break;
+              if (log_attrs[pos].is_word_start)
+                {
+                  in_word = TRUE;
+                  break;
+                }
+            }
+          log_attrs[start_pos].is_word_start = TRUE;
+          log_attrs[start_pos].is_word_end = in_word;
+          log_attrs[start_pos].is_word_boundary = TRUE;
+
+          /* Allow line breaks before words */
+          log_attrs[start_pos].is_line_break = TRUE;
+
+          tailored = TRUE;
+        }
+
+      if (end < offset + length)
+        {
+          gboolean in_word = FALSE;
+          for (pos = end_pos + 1; pos < log_attrs_len; pos++)
+            {
+              if (log_attrs[pos].is_word_start)
+                break;
+              if (log_attrs[pos].is_word_end)
+                {
+                  in_word = TRUE;
+                  break;
+                }
+            }
+          log_attrs[end_pos].is_word_start = in_word;
+          log_attrs[end_pos].is_word_end = TRUE;
+          log_attrs[end_pos].is_word_boundary = TRUE;
+
+          /* Allow line breaks before words */
+          if (in_word)
+            log_attrs[end_pos].is_line_break = TRUE;
+
+          tailored = TRUE;
+        }
+    }
+  while (pango_attr_iterator_next (&iter));
+
+  _pango_attr_iterator_destroy (&iter);
+
+  return tailored;
+}
+
+static gboolean
+handle_sentences (const char    *text,
+                  int            length,
+                  PangoAttrList *attrs,
+                  int            offset,
+                  PangoLogAttr  *log_attrs,
+                  int            log_attrs_len)
+{
+  PangoAttrIterator iter;
+  gboolean tailored = FALSE;
+
+  _pango_attr_list_get_iterator (attrs, &iter);
+
+  do
+    {
+      const PangoAttribute *attr = pango_attr_iterator_get (&iter, PANGO_ATTR_SENTENCE);
+      int start, end;
+      int start_pos, end_pos;
+      int pos;
+
+      if (!attr)
+        continue;
+
+      start = attr->start_index;
+      end = attr->end_index;
+      if (start < offset)
+        start_pos = 0;
+      else
+        start_pos = g_utf8_pointer_to_offset (text, text + start - offset);
+      if (end >= offset + length)
+        end_pos = log_attrs_len;
+      else
+        end_pos = g_utf8_pointer_to_offset (text, text + end - offset);
+
+      for (pos = start_pos + 1; pos < end_pos; pos++)
+        {
+          log_attrs[pos].is_sentence_start = FALSE;
+          log_attrs[pos].is_sentence_end = FALSE;
+          log_attrs[pos].is_sentence_boundary = FALSE;
+
+          tailored = TRUE;
+        }
+      if (start >= offset)
+        {
+          gboolean in_sentence = FALSE;
+          for (pos = start_pos - 1; pos >= 0; pos--)
+            {
+              if (log_attrs[pos].is_sentence_end)
+                break;
+              if (log_attrs[pos].is_sentence_start)
+                {
+                  in_sentence = TRUE;
+                  break;
+                }
+            }
+          log_attrs[start_pos].is_sentence_start = TRUE;
+          log_attrs[start_pos].is_sentence_end = in_sentence;
+          log_attrs[start_pos].is_sentence_boundary = TRUE;
+
+          tailored = TRUE;
+        }
+      if (end < offset + length)
+        {
+          gboolean in_sentence = FALSE;
+          for (pos = end_pos + 1; pos < log_attrs_len; pos++)
+            {
+              if (log_attrs[pos].is_sentence_start)
+                break;
+              if (log_attrs[pos].is_sentence_end)
+                {
+                  in_sentence = TRUE;
+                  break;
+                }
+            }
+          log_attrs[end_pos].is_sentence_start = in_sentence;
+          log_attrs[end_pos].is_sentence_end = TRUE;
+          log_attrs[end_pos].is_sentence_boundary = TRUE;
+
+          tailored = TRUE;
+        }
+    }
+  while (pango_attr_iterator_next (&iter));
 
   _pango_attr_iterator_destroy (&iter);
 
-  _pango_attr_list_get_iterator (&hyphens, &iter);
+  return tailored;
+}
+
+static gboolean
+handle_hyphens (const char    *text,
+                int            length,
+                PangoAttrList *attrs,
+                int            offset,
+                PangoLogAttr  *log_attrs,
+                int            log_attrs_len)
+{
+  PangoAttrIterator iter;
+  gboolean tailored = FALSE;
+
+  _pango_attr_list_get_iterator (attrs, &iter);
+
   do {
     const PangoAttribute *attr = pango_attr_iterator_get (&iter, PANGO_ATTR_INSERT_HYPHENS);
 
@@ -1779,17 +2040,72 @@ break_attrs (const char   *text,
         for (pos = start_pos + 1; pos < end_pos; pos++)
           {
             if (!log_attrs[pos].break_removes_preceding)
-              log_attrs[pos].break_inserts_hyphen = FALSE;
+              {
+                log_attrs[pos].break_inserts_hyphen = FALSE;
+
+                tailored = TRUE;
+              }
           }
       }
   } while (pango_attr_iterator_next (&iter));
 
   _pango_attr_iterator_destroy (&iter);
 
-  _pango_attr_list_destroy (&list);
+  return tailored;
+}
+
+static gboolean
+break_attrs (const char   *text,
+             int           length,
+             GSList       *attributes,
+             int           offset,
+             PangoLogAttr *log_attrs,
+             int           log_attrs_len)
+{
+  PangoAttrList allow_breaks;
+  PangoAttrList words;
+  PangoAttrList sentences;
+  PangoAttrList hyphens;
+  GSList *l;
+  gboolean tailored = FALSE;
+
+  _pango_attr_list_init (&allow_breaks);
+  _pango_attr_list_init (&words);
+  _pango_attr_list_init (&sentences);
+  _pango_attr_list_init (&hyphens);
+
+  for (l = attributes; l; l = l->next)
+    {
+      PangoAttribute *attr = l->data;
+
+      if (attr->klass->type == PANGO_ATTR_ALLOW_BREAKS)
+        pango_attr_list_insert (&allow_breaks, pango_attribute_copy (attr));
+      else if (attr->klass->type == PANGO_ATTR_WORD)
+        pango_attr_list_insert (&words, pango_attribute_copy (attr));
+      else if (attr->klass->type == PANGO_ATTR_SENTENCE)
+        pango_attr_list_insert (&sentences, pango_attribute_copy (attr));
+      else if (attr->klass->type == PANGO_ATTR_INSERT_HYPHENS)
+        pango_attr_list_insert (&hyphens, pango_attribute_copy (attr));
+    }
+
+  tailored |= handle_allow_breaks (text, length, &allow_breaks, offset,
+                                   log_attrs, log_attrs_len);
+
+  tailored |= handle_words (text, length, &words, offset,
+                            log_attrs, log_attrs_len);
+
+  tailored |= handle_sentences (text, length, &sentences, offset,
+                                log_attrs, log_attrs_len);
+
+  tailored |= handle_hyphens (text, length, &hyphens, offset,
+                              log_attrs, log_attrs_len);
+
+  _pango_attr_list_destroy (&allow_breaks);
+  _pango_attr_list_destroy (&words);
+  _pango_attr_list_destroy (&sentences);
   _pango_attr_list_destroy (&hyphens);
 
-  return TRUE;
+  return tailored;
 }
 
 /* }}} */
@@ -2033,6 +2349,6 @@ pango_get_log_attrs (const char    *text,
                attrs_len);
 }
 
- /* }}} */
+/* }}} */
 
 /* vim:set foldmethod=marker expandtab: */
diff --git a/pango/pango-attributes.c b/pango/pango-attributes.c
index 28dc4105..326234d2 100644
--- a/pango/pango-attributes.c
+++ b/pango/pango-attributes.c
@@ -1303,6 +1303,60 @@ pango_attr_show_new (PangoShowFlags flags)
 }
 
 /**
+ * pango_attr_word_new:
+ *
+ * Marks the range of the attribute as a single word.
+ *
+ * Note that this may require adjustments to word and
+ * sentence classification around the range.
+ *
+ * Return value: (transfer full): the newly allocated
+ *   `PangoAttribute`, which should be freed with
+ *   [method@Pango.Attribute.destroy]
+ *
+ * Since: 1.50
+ */
+PangoAttribute *
+pango_attr_word_new (void)
+{
+  static const PangoAttrClass klass = {
+    PANGO_ATTR_WORD,
+    pango_attr_int_copy,
+    pango_attr_int_destroy,
+    pango_attr_int_equal,
+  };
+
+  return pango_attr_int_new (&klass, 0);
+}
+
+/**
+ * pango_attr_sentence_new:
+ *
+ * Marks the range of the attribute as a single sentence.
+ *
+ * Note that this may require adjustments to word and
+ * sentence classification around the range.
+ *
+ * Return value: (transfer full): the newly allocated
+ *   `PangoAttribute`, which should be freed with
+ *   [method@Pango.Attribute.destroy]
+ *
+ * Since: 1.50
+ */
+PangoAttribute *
+pango_attr_sentence_new (void)
+{
+  static const PangoAttrClass klass = {
+    PANGO_ATTR_SENTENCE,
+    pango_attr_int_copy,
+    pango_attr_int_destroy,
+    pango_attr_int_equal,
+  };
+
+  return pango_attr_int_new (&klass, 0);
+}
+
+/**
  * pango_attr_overline_new:
  * @overline: the overline style
  *
@@ -1477,6 +1531,8 @@ pango_attribute_as_int (PangoAttribute *attr)
     case PANGO_ATTR_OVERLINE:
     case PANGO_ATTR_ABSOLUTE_LINE_HEIGHT:
     case PANGO_ATTR_TEXT_TRANSFORM:
+    case PANGO_ATTR_WORD:
+    case PANGO_ATTR_SENTENCE:
       return (PangoAttrInt *)attr;
 
     default:
diff --git a/pango/pango-attributes.h b/pango/pango-attributes.h
index 86826b62..613aa021 100644
--- a/pango/pango-attributes.h
+++ b/pango/pango-attributes.h
@@ -77,6 +77,8 @@ typedef struct _PangoAttrFontFeatures PangoAttrFontFeatures;
  * @PANGO_ATTR_OVERLINE_COLOR: overline color ([struct@Pango.AttrColor]). Since 1.46
  * @PANGO_ATTR_LINE_HEIGHT: line height factor ([struct@Pango.AttrFloat]). Since: 1.50
  * @PANGO_ATTR_ABSOLUTE_LINE_HEIGHT: line height ([struct@Pango.AttrInt]). Since: 1.50
+ * @PANGO_ATTR_WORD: override segmentation to classify the range of the attribute as a single word ([struct@Pango.AttrInt]). Since 1.50
+ * @PANGO_ATTR_SENTENCE: override segmentation to classify the range of the attribute as a single sentence ([struct@Pango.AttrInt]). Since 1.50
  *
  * The `PangoAttrType` distinguishes between different types of attributes.
  *
@@ -121,6 +123,8 @@ typedef enum
   PANGO_ATTR_LINE_HEIGHT,       /* PangoAttrFloat */
   PANGO_ATTR_ABSOLUTE_LINE_HEIGHT, /* PangoAttrInt */
   PANGO_ATTR_TEXT_TRANSFORM,    /* PangoAttrInt */
+  PANGO_ATTR_WORD,              /* PangoAttrInt */
+  PANGO_ATTR_SENTENCE,          /* PangoAttrInt */
 } PangoAttrType;
 
 /**
@@ -538,6 +542,12 @@ PANGO_AVAILABLE_IN_1_38
 PangoAttribute *        pango_attr_background_alpha_new         (guint16                      alpha);
 PANGO_AVAILABLE_IN_1_44
 PangoAttribute *        pango_attr_allow_breaks_new             (gboolean                     allow_breaks);
+
+PANGO_AVAILABLE_IN_1_50
+PangoAttribute *        pango_attr_word_new                     (void);
+PANGO_AVAILABLE_IN_1_50
+PangoAttribute *        pango_attr_sentence_new                 (void);
+
 PANGO_AVAILABLE_IN_1_44
 PangoAttribute *        pango_attr_insert_hyphens_new           (gboolean                     insert_hyphens);
 PANGO_AVAILABLE_IN_1_46
diff --git a/pango/pango-layout.c b/pango/pango-layout.c
index 85f12f0a..ad56e8f9 100644
--- a/pango/pango-layout.c
+++ b/pango/pango-layout.c
@@ -4323,6 +4323,8 @@ affects_break_or_shape (PangoAttribute *attr,
     {
     /* Affects breaks */
     case PANGO_ATTR_ALLOW_BREAKS:
+    case PANGO_ATTR_WORD:
+    case PANGO_ATTR_SENTENCE:
     /* Affects shaping */
     case PANGO_ATTR_INSERT_HYPHENS:
     case PANGO_ATTR_FONT_FEATURES:
diff --git a/pango/pango-markup.c b/pango/pango-markup.c
index 22064103..a9df8ed0 100644
--- a/pango/pango-markup.c
+++ b/pango/pango-markup.c
@@ -1230,6 +1230,7 @@ span_parse_func     (MarkupData            *md G_GNUC_UNUSED,
   const char *show = NULL;
   const char *line_height = NULL;
   const char *text_transform = NULL;
+  const char *segment = NULL;
 
   g_markup_parse_context_get_position (context,
 				       &line_number, &char_number);
@@ -1297,6 +1298,7 @@ span_parse_func     (MarkupData            *md G_GNUC_UNUSED,
 	CHECK_ATTRIBUTE (strikethrough);
 	CHECK_ATTRIBUTE (strikethrough_color);
 	CHECK_ATTRIBUTE (style);
+	CHECK_ATTRIBUTE (segment);
 	break;
       case 't':
         CHECK_ATTRIBUTE (text_transform);
@@ -1712,7 +1714,7 @@ span_parse_func     (MarkupData            *md G_GNUC_UNUSED,
       gboolean b = FALSE;
 
       if (!span_parse_boolean ("allow_breaks", allow_breaks, &b, line_number, error))
-	goto error;
+        goto error;
 
       add_attribute (tag, pango_attr_allow_breaks_new (b));
     }
@@ -1727,6 +1729,25 @@ span_parse_func     (MarkupData            *md G_GNUC_UNUSED,
       add_attribute (tag, pango_attr_insert_hyphens_new (b));
     }
 
+  if (G_UNLIKELY (segment))
+    {
+      if (strcmp (segment, "word") == 0)
+        add_attribute (tag, pango_attr_word_new ());
+      else if (strcmp (segment, "sentence") == 0)
+        add_attribute (tag, pango_attr_sentence_new ());
+      else
+        {
+          g_set_error (error,
+                       G_MARKUP_ERROR,
+                       G_MARKUP_ERROR_INVALID_CONTENT,
+                       _("Value of 'segment' attribute on <span> tag on line %d "
+                         "could not be parsed; should be one of 'word' or "
+                         "'sentence', not '%s'"),
+                       line_number, segment);
+          goto error;
+        }
+    }
+
   return TRUE;
 
  error:
author	Matthias Clasen <mclasen@redhat.com>	2021-08-21 23:54:03 -0400
committer	Matthias Clasen <mclasen@redhat.com>	2021-08-25 01:08:02 -0400
commit	b614ea2b06b3c9defaceb92b6904fa8a92249abe (patch)
tree	c20fdc8a2dc8a85561e3a8df5bf147d3c2888cd5 /pango
parent	3aee7615e9a123ae750e49e9864bdaa4b267cdbb (diff)
download	pango-b614ea2b06b3c9defaceb92b6904fa8a92249abe.tar.gz