summaryrefslogtreecommitdiff
path: root/pango
diff options
context:
space:
mode:
authorMatthias Clasen <mclasen@redhat.com>2021-08-21 23:54:03 -0400
committerMatthias Clasen <mclasen@redhat.com>2021-08-25 01:08:02 -0400
commitb614ea2b06b3c9defaceb92b6904fa8a92249abe (patch)
treec20fdc8a2dc8a85561e3a8df5bf147d3c2888cd5 /pango
parent3aee7615e9a123ae750e49e9864bdaa4b267cdbb (diff)
downloadpango-b614ea2b06b3c9defaceb92b6904fa8a92249abe.tar.gz
Add segmentation attributesbreak-tailoring
Add attributes that let us override word and sentence boundaries (and, indirectly, line breaks). Tests included.
Diffstat (limited to 'pango')
-rw-r--r--pango/break.c410
-rw-r--r--pango/pango-attributes.c56
-rw-r--r--pango/pango-attributes.h10
-rw-r--r--pango/pango-layout.c2
-rw-r--r--pango/pango-markup.c23
5 files changed, 453 insertions, 48 deletions
diff --git a/pango/break.c b/pango/break.c
index d348f9b8..864ac339 100644
--- a/pango/break.c
+++ b/pango/break.c
@@ -1699,64 +1699,325 @@ break_script (const char *item_text,
/* }}} */
/* {{{ Attribute-based customization */
+/* We allow customizing log attrs in two ways:
+ *
+ * - You can directly remove breaks from a range, using allow_breaks=false.
+ * We preserve the non-tailorable rules from UAX #14, so mandatory breaks
+ * and breaks after ZWS remain. We also preserve break opportunities after
+ * hyphens and visible word dividers.
+ *
+ * - You can tweak the segmentation by marking ranges as word or sentence.
+ * When doing so, we split adjacent segments to preserve alternating
+ * starts and ends. We add a line break opportunity before each word that
+ * is created in this way, and we remove line break opportunities inside
+ * the word in the same way as for a range marked as allow_breaks=false,
+ * except that we don't remove char break opportunities.
+ *
+ * Note that UAX #14 does not guarantee that words fall neatly into
+ * sentences, so we don't do extra work to enforce that.
+ */
+
+static void
+remove_breaks_from_range (const char *text,
+ int start,
+ PangoLogAttr *log_attrs,
+ int start_pos,
+ int end_pos)
+{
+ int pos;
+ const char *p;
+ gunichar ch;
+ int bt;
+ gboolean after_zws;
+ gboolean after_hyphen;
+
+ /* Assume our range doesn't start after a hyphen or in a zws sequence */
+ after_zws = FALSE;
+ after_hyphen = FALSE;
+ for (pos = start_pos + 1, p = g_utf8_next_char (text + start);
+ pos < end_pos;
+ pos++, p = g_utf8_next_char (p))
+ {
+ /* Mandatory breaks aren't tailorable */
+ if (!log_attrs[pos].is_mandatory_break)
+ log_attrs[pos].is_line_break = FALSE;
+
+ ch = g_utf8_get_char (p);
+ bt = g_unichar_break_type (ch);
+
+ /* Hyphens and visible word dividers */
+ if (after_hyphen)
+ log_attrs[pos].is_line_break = TRUE;
+
+ after_hyphen = ch == 0x00ad || /* Soft Hyphen */
+ ch == 0x05A0 || ch == 0x2010 || /* Breaking Hyphens */
+ ch == 0x2012 || ch == 0x2013 ||
+ ch == 0x05BE || ch == 0x0F0B || /* Visible word dividers */
+ ch == 0x1361 || ch == 0x17D8 ||
+ ch == 0x17DA || ch == 0x2027 ||
+ ch == 0x007C;
+
+ /* ZWS sequence */
+ if (after_zws && bt != G_UNICODE_BREAK_SPACE)
+ log_attrs[pos].is_line_break = TRUE;
+
+ after_zws = bt == G_UNICODE_BREAK_ZERO_WIDTH_SPACE ||
+ (bt == G_UNICODE_BREAK_SPACE && after_zws);
+ }
+}
+
static gboolean
-break_attrs (const char *text,
- int length,
- GSList *attributes,
- int offset,
- PangoLogAttr *log_attrs,
- int log_attrs_len)
+handle_allow_breaks (const char *text,
+ int length,
+ PangoAttrList *attrs,
+ int offset,
+ PangoLogAttr *log_attrs,
+ int log_attrs_len)
{
- PangoAttrList list;
- PangoAttrList hyphens;
PangoAttrIterator iter;
- GSList *l;
+ gboolean tailored = FALSE;
- _pango_attr_list_init (&list);
- _pango_attr_list_init (&hyphens);
+ _pango_attr_list_get_iterator (attrs, &iter);
- for (l = attributes; l; l = l->next)
+ do
{
- PangoAttribute *attr = l->data;
+ const PangoAttribute *attr = pango_attr_iterator_get (&iter, PANGO_ATTR_ALLOW_BREAKS);
- if (attr->klass->type == PANGO_ATTR_ALLOW_BREAKS)
- pango_attr_list_insert (&list, pango_attribute_copy (attr));
- else if (attr->klass->type == PANGO_ATTR_INSERT_HYPHENS)
- pango_attr_list_insert (&hyphens, pango_attribute_copy (attr));
+ if (!attr)
+ continue;
+
+ if (!((PangoAttrInt*)attr)->value)
+ {
+ int start, end;
+ int start_pos, end_pos;
+ int pos;
+
+ start = attr->start_index;
+ end = attr->end_index;
+ if (start < offset)
+ start_pos = 0;
+ else
+ start_pos = g_utf8_pointer_to_offset (text, text + start - offset);
+ if (end >= offset + length)
+ end_pos = log_attrs_len;
+ else
+ end_pos = g_utf8_pointer_to_offset (text, text + end - offset);
+
+ for (pos = start_pos + 1; pos < end_pos; pos++)
+ log_attrs[pos].is_char_break = FALSE;
+
+ remove_breaks_from_range (text, MAX (start - offset, 0), log_attrs, start_pos, end_pos);
+
+ tailored = TRUE;
+ }
}
+ while (pango_attr_iterator_next (&iter));
- _pango_attr_list_get_iterator (&list, &iter);
- do {
- const PangoAttribute *attr = pango_attr_iterator_get (&iter, PANGO_ATTR_ALLOW_BREAKS);
+ _pango_attr_iterator_destroy (&iter);
- if (attr && ((PangoAttrInt*)attr)->value == 0)
- {
- int start, end;
- int start_pos, end_pos;
- int pos;
+ return tailored;
+}
- pango_attr_iterator_range (&iter, &start, &end);
- if (start < offset)
- start_pos = 0;
- else
- start_pos = g_utf8_pointer_to_offset (text, text + start - offset);
- if (end >= offset + length)
- end_pos = log_attrs_len;
- else
- end_pos = g_utf8_pointer_to_offset (text, text + end - offset);
- for (pos = start_pos + 1; pos < end_pos; pos++)
- {
- log_attrs[pos].is_mandatory_break = FALSE;
- log_attrs[pos].is_line_break = FALSE;
- log_attrs[pos].is_char_break = FALSE;
- }
- }
- } while (pango_attr_iterator_next (&iter));
+static gboolean
+handle_words (const char *text,
+ int length,
+ PangoAttrList *attrs,
+ int offset,
+ PangoLogAttr *log_attrs,
+ int log_attrs_len)
+{
+ PangoAttrIterator iter;
+ gboolean tailored = FALSE;
+
+ _pango_attr_list_get_iterator (attrs, &iter);
+
+ do
+ {
+ const PangoAttribute *attr = pango_attr_iterator_get (&iter, PANGO_ATTR_WORD);
+ int start, end;
+ int start_pos, end_pos;
+ int pos;
+
+ if (!attr)
+ continue;
+
+ start = attr->start_index;
+ end = attr->end_index;
+ if (start < offset)
+ start_pos = 0;
+ else
+ start_pos = g_utf8_pointer_to_offset (text, text + start - offset);
+ if (end >= offset + length)
+ end_pos = log_attrs_len;
+ else
+ end_pos = g_utf8_pointer_to_offset (text, text + end - offset);
+
+ for (pos = start_pos + 1; pos < end_pos; pos++)
+ {
+ log_attrs[pos].is_word_start = FALSE;
+ log_attrs[pos].is_word_end = FALSE;
+ log_attrs[pos].is_word_boundary = FALSE;
+ }
+
+ remove_breaks_from_range (text, MAX (start - offset, 0), log_attrs,
+ start_pos, end_pos);
+
+ if (start >= offset)
+ {
+ gboolean in_word = FALSE;
+ for (pos = start_pos - 1; pos >= 0; pos--)
+ {
+ if (log_attrs[pos].is_word_end)
+ break;
+ if (log_attrs[pos].is_word_start)
+ {
+ in_word = TRUE;
+ break;
+ }
+ }
+ log_attrs[start_pos].is_word_start = TRUE;
+ log_attrs[start_pos].is_word_end = in_word;
+ log_attrs[start_pos].is_word_boundary = TRUE;
+
+ /* Allow line breaks before words */
+ log_attrs[start_pos].is_line_break = TRUE;
+
+ tailored = TRUE;
+ }
+
+ if (end < offset + length)
+ {
+ gboolean in_word = FALSE;
+ for (pos = end_pos + 1; pos < log_attrs_len; pos++)
+ {
+ if (log_attrs[pos].is_word_start)
+ break;
+ if (log_attrs[pos].is_word_end)
+ {
+ in_word = TRUE;
+ break;
+ }
+ }
+ log_attrs[end_pos].is_word_start = in_word;
+ log_attrs[end_pos].is_word_end = TRUE;
+ log_attrs[end_pos].is_word_boundary = TRUE;
+
+ /* Allow line breaks before words */
+ if (in_word)
+ log_attrs[end_pos].is_line_break = TRUE;
+
+ tailored = TRUE;
+ }
+ }
+ while (pango_attr_iterator_next (&iter));
+
+ _pango_attr_iterator_destroy (&iter);
+
+ return tailored;
+}
+
+static gboolean
+handle_sentences (const char *text,
+ int length,
+ PangoAttrList *attrs,
+ int offset,
+ PangoLogAttr *log_attrs,
+ int log_attrs_len)
+{
+ PangoAttrIterator iter;
+ gboolean tailored = FALSE;
+
+ _pango_attr_list_get_iterator (attrs, &iter);
+
+ do
+ {
+ const PangoAttribute *attr = pango_attr_iterator_get (&iter, PANGO_ATTR_SENTENCE);
+ int start, end;
+ int start_pos, end_pos;
+ int pos;
+
+ if (!attr)
+ continue;
+
+ start = attr->start_index;
+ end = attr->end_index;
+ if (start < offset)
+ start_pos = 0;
+ else
+ start_pos = g_utf8_pointer_to_offset (text, text + start - offset);
+ if (end >= offset + length)
+ end_pos = log_attrs_len;
+ else
+ end_pos = g_utf8_pointer_to_offset (text, text + end - offset);
+
+ for (pos = start_pos + 1; pos < end_pos; pos++)
+ {
+ log_attrs[pos].is_sentence_start = FALSE;
+ log_attrs[pos].is_sentence_end = FALSE;
+ log_attrs[pos].is_sentence_boundary = FALSE;
+
+ tailored = TRUE;
+ }
+ if (start >= offset)
+ {
+ gboolean in_sentence = FALSE;
+ for (pos = start_pos - 1; pos >= 0; pos--)
+ {
+ if (log_attrs[pos].is_sentence_end)
+ break;
+ if (log_attrs[pos].is_sentence_start)
+ {
+ in_sentence = TRUE;
+ break;
+ }
+ }
+ log_attrs[start_pos].is_sentence_start = TRUE;
+ log_attrs[start_pos].is_sentence_end = in_sentence;
+ log_attrs[start_pos].is_sentence_boundary = TRUE;
+
+ tailored = TRUE;
+ }
+ if (end < offset + length)
+ {
+ gboolean in_sentence = FALSE;
+ for (pos = end_pos + 1; end_pos < log_attrs_len; pos++)
+ {
+ if (log_attrs[pos].is_sentence_start)
+ break;
+ if (log_attrs[pos].is_sentence_end)
+ {
+ in_sentence = TRUE;
+ break;
+ }
+ }
+ log_attrs[end_pos].is_sentence_start = in_sentence;
+ log_attrs[end_pos].is_sentence_end = TRUE;
+ log_attrs[end_pos].is_sentence_boundary = TRUE;
+
+ tailored = TRUE;
+ }
+ }
+ while (pango_attr_iterator_next (&iter));
_pango_attr_iterator_destroy (&iter);
- _pango_attr_list_get_iterator (&hyphens, &iter);
+ return tailored;
+}
+
+static gboolean
+handle_hyphens (const char *text,
+ int length,
+ PangoAttrList *attrs,
+ int offset,
+ PangoLogAttr *log_attrs,
+ int log_attrs_len)
+{
+ PangoAttrIterator iter;
+ gboolean tailored = FALSE;
+
+ _pango_attr_list_get_iterator (attrs, &iter);
+
do {
const PangoAttribute *attr = pango_attr_iterator_get (&iter, PANGO_ATTR_INSERT_HYPHENS);
@@ -1779,17 +2040,72 @@ break_attrs (const char *text,
for (pos = start_pos + 1; pos < end_pos; pos++)
{
if (!log_attrs[pos].break_removes_preceding)
- log_attrs[pos].break_inserts_hyphen = FALSE;
+ {
+ log_attrs[pos].break_inserts_hyphen = FALSE;
+
+ tailored = TRUE;
+ }
}
}
} while (pango_attr_iterator_next (&iter));
_pango_attr_iterator_destroy (&iter);
- _pango_attr_list_destroy (&list);
+ return tailored;
+}
+
+static gboolean
+break_attrs (const char *text,
+ int length,
+ GSList *attributes,
+ int offset,
+ PangoLogAttr *log_attrs,
+ int log_attrs_len)
+{
+ PangoAttrList allow_breaks;
+ PangoAttrList words;
+ PangoAttrList sentences;
+ PangoAttrList hyphens;
+ GSList *l;
+ gboolean tailored = FALSE;
+
+ _pango_attr_list_init (&allow_breaks);
+ _pango_attr_list_init (&words);
+ _pango_attr_list_init (&sentences);
+ _pango_attr_list_init (&hyphens);
+
+ for (l = attributes; l; l = l->next)
+ {
+ PangoAttribute *attr = l->data;
+
+ if (attr->klass->type == PANGO_ATTR_ALLOW_BREAKS)
+ pango_attr_list_insert (&allow_breaks, pango_attribute_copy (attr));
+ else if (attr->klass->type == PANGO_ATTR_WORD)
+ pango_attr_list_insert (&words, pango_attribute_copy (attr));
+ else if (attr->klass->type == PANGO_ATTR_SENTENCE)
+ pango_attr_list_insert (&sentences, pango_attribute_copy (attr));
+ else if (attr->klass->type == PANGO_ATTR_INSERT_HYPHENS)
+ pango_attr_list_insert (&hyphens, pango_attribute_copy (attr));
+ }
+
+ tailored |= handle_allow_breaks (text, length, &allow_breaks, offset,
+ log_attrs, log_attrs_len);
+
+ tailored |= handle_words (text, length, &words, offset,
+ log_attrs, log_attrs_len);
+
+ tailored |= handle_sentences (text, length, &words, offset,
+ log_attrs, log_attrs_len);
+
+ tailored |= handle_hyphens (text, length, &hyphens, offset,
+ log_attrs, log_attrs_len);
+
+ _pango_attr_list_destroy (&allow_breaks);
+ _pango_attr_list_destroy (&words);
+ _pango_attr_list_destroy (&sentences);
_pango_attr_list_destroy (&hyphens);
- return TRUE;
+ return tailored;
}
/* }}} */
@@ -2033,6 +2349,6 @@ pango_get_log_attrs (const char *text,
attrs_len);
}
- /* }}} */
+/* }}} */
/* vim:set foldmethod=marker expandtab: */
diff --git a/pango/pango-attributes.c b/pango/pango-attributes.c
index 28dc4105..326234d2 100644
--- a/pango/pango-attributes.c
+++ b/pango/pango-attributes.c
@@ -1303,6 +1303,60 @@ pango_attr_show_new (PangoShowFlags flags)
}
/**
+ * pango_attr_word_new:
+ *
+ * Marks the range of the attribute as a single word.
+ *
+ * Note that this may require adjustments to word and
+ * sentence classification around the range.
+ *
+ * Return value: (transfer full): the newly allocated
+ * `PangoAttribute`, which should be freed with
+ * [method@Pango.Attribute.destroy]
+ *
+ * Since: 1.50
+ */
+PangoAttribute *
+pango_attr_word_new (void)
+{
+ static const PangoAttrClass klass = {
+ PANGO_ATTR_WORD,
+ pango_attr_int_copy,
+ pango_attr_int_destroy,
+ pango_attr_int_equal,
+ };
+
+ return pango_attr_int_new (&klass, 0);
+}
+
+/**
+ * pango_attr_sentence_new:
+ *
+ * Marks the range of the attribute as a single sentence.
+ *
+ * Note that this may require adjustments to word and
+ * sentence classification around the range.
+ *
+ * Return value: (transfer full): the newly allocated
+ * `PangoAttribute`, which should be freed with
+ * [method@Pango.Attribute.destroy]
+ *
+ * Since: 1.50
+ */
+PangoAttribute *
+pango_attr_sentence_new (void)
+{
+ static const PangoAttrClass klass = {
+ PANGO_ATTR_SENTENCE,
+ pango_attr_int_copy,
+ pango_attr_int_destroy,
+ pango_attr_int_equal,
+ };
+
+ return pango_attr_int_new (&klass, 0);
+}
+
+/**
* pango_attr_overline_new:
* @overline: the overline style
*
@@ -1477,6 +1531,8 @@ pango_attribute_as_int (PangoAttribute *attr)
case PANGO_ATTR_OVERLINE:
case PANGO_ATTR_ABSOLUTE_LINE_HEIGHT:
case PANGO_ATTR_TEXT_TRANSFORM:
+ case PANGO_ATTR_WORD:
+ case PANGO_ATTR_SENTENCE:
return (PangoAttrInt *)attr;
default:
diff --git a/pango/pango-attributes.h b/pango/pango-attributes.h
index 86826b62..613aa021 100644
--- a/pango/pango-attributes.h
+++ b/pango/pango-attributes.h
@@ -77,6 +77,8 @@ typedef struct _PangoAttrFontFeatures PangoAttrFontFeatures;
* @PANGO_ATTR_OVERLINE_COLOR: overline color ([struct@Pango.AttrColor]). Since 1.46
* @PANGO_ATTR_LINE_HEIGHT: line height factor ([struct@Pango.AttrFloat]). Since: 1.50
* @PANGO_ATTR_ABSOLUTE_LINE_HEIGHT: line height ([struct@Pango.AttrInt]). Since: 1.50
+ * @PANGO_ATTR_WORD: override segmentation to classify the range of the attribute as a single word ([struct@Pango.AttrInt]). Since 1.50
+ * @PANGO_ATTR_SENTENCE: override segmentation to classify the range of the attribute as a single sentence ([struct@Pango.AttrInt]). Since 1.50
*
* The `PangoAttrType` distinguishes between different types of attributes.
*
@@ -121,6 +123,8 @@ typedef enum
PANGO_ATTR_LINE_HEIGHT, /* PangoAttrFloat */
PANGO_ATTR_ABSOLUTE_LINE_HEIGHT, /* PangoAttrInt */
PANGO_ATTR_TEXT_TRANSFORM, /* PangoAttrInt */
+ PANGO_ATTR_WORD, /* PangoAttrInt */
+ PANGO_ATTR_SENTENCE, /* PangoAttrInt */
} PangoAttrType;
/**
@@ -538,6 +542,12 @@ PANGO_AVAILABLE_IN_1_38
PangoAttribute * pango_attr_background_alpha_new (guint16 alpha);
PANGO_AVAILABLE_IN_1_44
PangoAttribute * pango_attr_allow_breaks_new (gboolean allow_breaks);
+
+PANGO_AVAILABLE_IN_1_50
+PangoAttribute * pango_attr_word_new (void);
+PANGO_AVAILABLE_IN_1_50
+PangoAttribute * pango_attr_sentence_new (void);
+
PANGO_AVAILABLE_IN_1_44
PangoAttribute * pango_attr_insert_hyphens_new (gboolean insert_hyphens);
PANGO_AVAILABLE_IN_1_46
diff --git a/pango/pango-layout.c b/pango/pango-layout.c
index 85f12f0a..ad56e8f9 100644
--- a/pango/pango-layout.c
+++ b/pango/pango-layout.c
@@ -4323,6 +4323,8 @@ affects_break_or_shape (PangoAttribute *attr,
{
/* Affects breaks */
case PANGO_ATTR_ALLOW_BREAKS:
+ case PANGO_ATTR_WORD:
+ case PANGO_ATTR_SENTENCE:
/* Affects shaping */
case PANGO_ATTR_INSERT_HYPHENS:
case PANGO_ATTR_FONT_FEATURES:
diff --git a/pango/pango-markup.c b/pango/pango-markup.c
index 22064103..a9df8ed0 100644
--- a/pango/pango-markup.c
+++ b/pango/pango-markup.c
@@ -1230,6 +1230,7 @@ span_parse_func (MarkupData *md G_GNUC_UNUSED,
const char *show = NULL;
const char *line_height = NULL;
const char *text_transform = NULL;
+ const char *segment = NULL;
g_markup_parse_context_get_position (context,
&line_number, &char_number);
@@ -1297,6 +1298,7 @@ span_parse_func (MarkupData *md G_GNUC_UNUSED,
CHECK_ATTRIBUTE (strikethrough);
CHECK_ATTRIBUTE (strikethrough_color);
CHECK_ATTRIBUTE (style);
+ CHECK_ATTRIBUTE (segment);
break;
case 't':
CHECK_ATTRIBUTE (text_transform);
@@ -1712,7 +1714,7 @@ span_parse_func (MarkupData *md G_GNUC_UNUSED,
gboolean b = FALSE;
if (!span_parse_boolean ("allow_breaks", allow_breaks, &b, line_number, error))
- goto error;
+ goto error;
add_attribute (tag, pango_attr_allow_breaks_new (b));
}
@@ -1727,6 +1729,25 @@ span_parse_func (MarkupData *md G_GNUC_UNUSED,
add_attribute (tag, pango_attr_insert_hyphens_new (b));
}
+ if (G_UNLIKELY (segment))
+ {
+ if (strcmp (segment, "word") == 0)
+ add_attribute (tag, pango_attr_word_new ());
+ else if (strcmp (segment, "sentence") == 0)
+ add_attribute (tag, pango_attr_sentence_new ());
+ else
+ {
+ g_set_error (error,
+ G_MARKUP_ERROR,
+ G_MARKUP_ERROR_INVALID_CONTENT,
+ _("Value of 'segment' attribute on <span> tag on line %d "
+ "could not be parsed; should be one of 'word' or "
+ "'sentence', not '%s'"),
+ line_number, segment);
+ goto error;
+ }
+ }
+
return TRUE;
error: