diff options
author | Matthias Clasen <mclasen@redhat.com> | 2021-08-21 23:54:03 -0400 |
---|---|---|
committer | Matthias Clasen <mclasen@redhat.com> | 2021-08-25 01:08:02 -0400 |
commit | b614ea2b06b3c9defaceb92b6904fa8a92249abe (patch) | |
tree | c20fdc8a2dc8a85561e3a8df5bf147d3c2888cd5 /pango | |
parent | 3aee7615e9a123ae750e49e9864bdaa4b267cdbb (diff) | |
download | pango-b614ea2b06b3c9defaceb92b6904fa8a92249abe.tar.gz |
Add segmentation attributes (branch: break-tailoring)
Add attributes that let us override word and
sentence boundaries (and, indirectly, line breaks).
Tests included.
Diffstat (limited to 'pango')
-rw-r--r-- | pango/break.c | 410 |
-rw-r--r-- | pango/pango-attributes.c | 56 |
-rw-r--r-- | pango/pango-attributes.h | 10 |
-rw-r--r-- | pango/pango-layout.c | 2 |
-rw-r--r-- | pango/pango-markup.c | 23 |
5 files changed, 453 insertions, 48 deletions
diff --git a/pango/break.c b/pango/break.c
index d348f9b8..864ac339 100644
--- a/pango/break.c
+++ b/pango/break.c
@@ -1699,64 +1699,325 @@ break_script (const char *item_text,
 /* }}} */
 /* {{{ Attribute-based customization */
 
+/* We allow customizing log attrs in two ways:
+ *
+ * - You can directly remove breaks from a range, using allow_breaks=false.
+ *   We preserve the non-tailorable rules from UAX #14, so mandatory breaks
+ *   and breaks after ZWS remain. We also preserve break opportunities after
+ *   hyphens and visible word dividers.
+ *
+ * - You can tweak the segmentation by marking ranges as word or sentence.
+ *   When doing so, we split adjacent segments to preserve alternating
+ *   starts and ends. We add a line break opportunity before each word that
+ *   is created in this way, and we remove line break opportunities inside
+ *   the word in the same way as for a range marked as allow_breaks=false,
+ *   except that we don't remove char break opportunities.
+ *
+ *   Note that UAX #14 does not guarantee that words fall neatly into
+ *   sentences, so we don't do extra work to enforce that.
+ */
+
+static void
+remove_breaks_from_range (const char   *text,
+                          int           start,
+                          PangoLogAttr *log_attrs,
+                          int           start_pos,
+                          int           end_pos)
+{
+  int pos;
+  const char *p;
+  gunichar ch;
+  int bt;
+  gboolean after_zws;
+  gboolean after_hyphen;
+
+  /* Assume our range doesn't start after a hyphen or in a zws sequence */
+  after_zws = FALSE;
+  after_hyphen = FALSE;
+  for (pos = start_pos + 1, p = g_utf8_next_char (text + start);
+       pos < end_pos;
+       pos++, p = g_utf8_next_char (p))
+    {
+      /* Mandatory breaks aren't tailorable */
+      if (!log_attrs[pos].is_mandatory_break)
+        log_attrs[pos].is_line_break = FALSE;
+
+      ch = g_utf8_get_char (p);
+      bt = g_unichar_break_type (ch);
+
+      /* Hyphens and visible word dividers */
+      if (after_hyphen)
+        log_attrs[pos].is_line_break = TRUE;
+
+      after_hyphen = ch == 0x00ad || /* Soft Hyphen */
+         ch == 0x058A || ch == 0x2010 || /* Breaking Hyphens */
+         ch == 0x2012 || ch == 0x2013 ||
+         ch == 0x05BE || ch == 0x0F0B || /* Visible word dividers */
+         ch == 0x1361 || ch == 0x17D8 ||
+         ch == 0x17DA || ch == 0x2027 ||
+         ch == 0x007C;
+
+      /* ZWS sequence */
+      if (after_zws && bt != G_UNICODE_BREAK_SPACE)
+        log_attrs[pos].is_line_break = TRUE;
+
+      after_zws = bt == G_UNICODE_BREAK_ZERO_WIDTH_SPACE ||
+                  (bt == G_UNICODE_BREAK_SPACE && after_zws);
+    }
+}
+
 static gboolean
-break_attrs (const char   *text,
-             int           length,
-             GSList       *attributes,
-             int           offset,
-             PangoLogAttr *log_attrs,
-             int           log_attrs_len)
+handle_allow_breaks (const char    *text,
+                     int            length,
+                     PangoAttrList *attrs,
+                     int            offset,
+                     PangoLogAttr  *log_attrs,
+                     int            log_attrs_len)
 {
-  PangoAttrList list;
-  PangoAttrList hyphens;
   PangoAttrIterator iter;
-  GSList *l;
+  gboolean tailored = FALSE;
 
-  _pango_attr_list_init (&list);
-  _pango_attr_list_init (&hyphens);
+  _pango_attr_list_get_iterator (attrs, &iter);
 
-  for (l = attributes; l; l = l->next)
+  do
     {
-      PangoAttribute *attr = l->data;
+      const PangoAttribute *attr = pango_attr_iterator_get (&iter, PANGO_ATTR_ALLOW_BREAKS);
 
-      if (attr->klass->type == PANGO_ATTR_ALLOW_BREAKS)
-        pango_attr_list_insert (&list, pango_attribute_copy (attr));
-      else if (attr->klass->type == PANGO_ATTR_INSERT_HYPHENS)
-        pango_attr_list_insert (&hyphens, pango_attribute_copy (attr));
+      if (!attr)
+        continue;
+
+      if (!((PangoAttrInt*)attr)->value)
+        {
+          int start, end;
+          int start_pos, end_pos;
+          int pos;
+
+          start = attr->start_index;
+          end = attr->end_index;
+          if (start < offset)
+            start_pos = 0;
+          else
+            start_pos = g_utf8_pointer_to_offset (text, text + start - offset);
+          if (end >= offset + length)
+            end_pos = log_attrs_len;
+          else
+            end_pos = g_utf8_pointer_to_offset (text, text + end - offset);
+
+          for (pos = start_pos + 1; pos < end_pos; pos++)
+            log_attrs[pos].is_char_break = FALSE;
+
+          remove_breaks_from_range (text, MAX (start - offset, 0), log_attrs, start_pos, end_pos);
+
+          tailored = TRUE;
+        }
     }
+  while (pango_attr_iterator_next (&iter));
 
-  _pango_attr_list_get_iterator (&list, &iter);
-  do {
-    const PangoAttribute *attr = pango_attr_iterator_get (&iter, PANGO_ATTR_ALLOW_BREAKS);
+  _pango_attr_iterator_destroy (&iter);
 
-    if (attr && ((PangoAttrInt*)attr)->value == 0)
-      {
-        int start, end;
-        int start_pos, end_pos;
-        int pos;
+  return tailored;
+}
 
-        pango_attr_iterator_range (&iter, &start, &end);
-        if (start < offset)
-          start_pos = 0;
-        else
-          start_pos = g_utf8_pointer_to_offset (text, text + start - offset);
-        if (end >= offset + length)
-          end_pos = log_attrs_len;
-        else
-          end_pos = g_utf8_pointer_to_offset (text, text + end - offset);
 
-        for (pos = start_pos + 1; pos < end_pos; pos++)
-          {
-            log_attrs[pos].is_mandatory_break = FALSE;
-            log_attrs[pos].is_line_break = FALSE;
-            log_attrs[pos].is_char_break = FALSE;
-          }
-      }
-  } while (pango_attr_iterator_next (&iter));
+static gboolean
+handle_words (const char    *text,
+              int            length,
+              PangoAttrList *attrs,
+              int            offset,
+              PangoLogAttr  *log_attrs,
+              int            log_attrs_len)
+{
+  PangoAttrIterator iter;
+  gboolean tailored = FALSE;
+
+  _pango_attr_list_get_iterator (attrs, &iter);
+
+  do
+    {
+      const PangoAttribute *attr = pango_attr_iterator_get (&iter, PANGO_ATTR_WORD);
+      int start, end;
+      int start_pos, end_pos;
+      int pos;
+
+      if (!attr)
+        continue;
+
+      start = attr->start_index;
+      end = attr->end_index;
+      if (start < offset)
+        start_pos = 0;
+      else
+        start_pos = g_utf8_pointer_to_offset (text, text + start - offset);
+      if (end >= offset + length)
+        end_pos = log_attrs_len;
+      else
+        end_pos = g_utf8_pointer_to_offset (text, text + end - offset);
+
+      for (pos = start_pos + 1; pos < end_pos; pos++)
+        {
+          log_attrs[pos].is_word_start = FALSE;
+          log_attrs[pos].is_word_end = FALSE;
+          log_attrs[pos].is_word_boundary = FALSE;
+        }
+
+      remove_breaks_from_range (text, MAX (start - offset, 0), log_attrs,
+                                start_pos, end_pos);
+
+      if (start >= offset)
+        {
+          gboolean in_word = FALSE;
+          for (pos = start_pos - 1; pos >= 0; pos--)
+            {
+              if (log_attrs[pos].is_word_end)
+                break;
+              if (log_attrs[pos].is_word_start)
+                {
+                  in_word = TRUE;
+                  break;
+                }
+            }
+          log_attrs[start_pos].is_word_start = TRUE;
+          log_attrs[start_pos].is_word_end = in_word;
+          log_attrs[start_pos].is_word_boundary = TRUE;
+
+          /* Allow line breaks before words */
+          log_attrs[start_pos].is_line_break = TRUE;
+
+          tailored = TRUE;
+        }
+
+      if (end < offset + length)
+        {
+          gboolean in_word = FALSE;
+          for (pos = end_pos + 1; pos < log_attrs_len; pos++)
+            {
+              if (log_attrs[pos].is_word_start)
+                break;
+              if (log_attrs[pos].is_word_end)
+                {
+                  in_word = TRUE;
+                  break;
+                }
+            }
+          log_attrs[end_pos].is_word_start = in_word;
+          log_attrs[end_pos].is_word_end = TRUE;
+          log_attrs[end_pos].is_word_boundary = TRUE;
+
+          /* Allow line breaks before words */
+          if (in_word)
+            log_attrs[end_pos].is_line_break = TRUE;
+
+          tailored = TRUE;
+        }
+    }
+  while (pango_attr_iterator_next (&iter));
+
+  _pango_attr_iterator_destroy (&iter);
+
+  return tailored;
+}
+
+static gboolean
+handle_sentences (const char    *text,
+                  int            length,
+                  PangoAttrList *attrs,
+                  int            offset,
+                  PangoLogAttr  *log_attrs,
+                  int            log_attrs_len)
+{
+  PangoAttrIterator iter;
+  gboolean tailored = FALSE;
+
+  _pango_attr_list_get_iterator (attrs, &iter);
+
+  do
+    {
+      const PangoAttribute *attr = pango_attr_iterator_get (&iter, PANGO_ATTR_SENTENCE);
+      int start, end;
+      int start_pos, end_pos;
+      int pos;
+
+      if (!attr)
+        continue;
+
+      start = attr->start_index;
+      end = attr->end_index;
+      if (start < offset)
+        start_pos = 0;
+      else
+        start_pos = g_utf8_pointer_to_offset (text, text + start - offset);
+      if (end >= offset + length)
+        end_pos = log_attrs_len;
+      else
+        end_pos = g_utf8_pointer_to_offset (text, text + end - offset);
+
+      for (pos = start_pos + 1; pos < end_pos; pos++)
+        {
+          log_attrs[pos].is_sentence_start = FALSE;
+          log_attrs[pos].is_sentence_end = FALSE;
+          log_attrs[pos].is_sentence_boundary = FALSE;
+
+          tailored = TRUE;
+        }
+      if (start >= offset)
+        {
+          gboolean in_sentence = FALSE;
+          for (pos = start_pos - 1; pos >= 0; pos--)
+            {
+              if (log_attrs[pos].is_sentence_end)
+                break;
+              if (log_attrs[pos].is_sentence_start)
+                {
+                  in_sentence = TRUE;
+                  break;
+                }
+            }
+          log_attrs[start_pos].is_sentence_start = TRUE;
+          log_attrs[start_pos].is_sentence_end = in_sentence;
+          log_attrs[start_pos].is_sentence_boundary = TRUE;
+
+          tailored = TRUE;
+        }
+      if (end < offset + length)
+        {
+          gboolean in_sentence = FALSE;
+          for (pos = end_pos + 1; pos < log_attrs_len; pos++)
+            {
+              if (log_attrs[pos].is_sentence_start)
+                break;
+              if (log_attrs[pos].is_sentence_end)
+                {
+                  in_sentence = TRUE;
+                  break;
+                }
+            }
+          log_attrs[end_pos].is_sentence_start = in_sentence;
+          log_attrs[end_pos].is_sentence_end = TRUE;
+          log_attrs[end_pos].is_sentence_boundary = TRUE;
+
+          tailored = TRUE;
+        }
+    }
+  while (pango_attr_iterator_next (&iter));
 
   _pango_attr_iterator_destroy (&iter);
 
-  _pango_attr_list_get_iterator (&hyphens, &iter);
+  return tailored;
+}
+
+static gboolean
+handle_hyphens (const char    *text,
+                int            length,
+                PangoAttrList *attrs,
+                int            offset,
+                PangoLogAttr  *log_attrs,
+                int            log_attrs_len)
+{
+  PangoAttrIterator iter;
+  gboolean tailored = FALSE;
+
+  _pango_attr_list_get_iterator (attrs, &iter);
+
   do {
     const PangoAttribute *attr = pango_attr_iterator_get (&iter, PANGO_ATTR_INSERT_HYPHENS);
 
@@ -1779,17 +2040,72 @@ break_attrs (const char *text,
         for (pos = start_pos + 1; pos < end_pos; pos++)
           {
             if (!log_attrs[pos].break_removes_preceding)
-              log_attrs[pos].break_inserts_hyphen = FALSE;
+              {
+                log_attrs[pos].break_inserts_hyphen = FALSE;
+
+                tailored = TRUE;
+              }
           }
       }
   } while (pango_attr_iterator_next (&iter));
 
   _pango_attr_iterator_destroy (&iter);
 
-  _pango_attr_list_destroy (&list);
+  return tailored;
+}
+
+static gboolean
+break_attrs (const char   *text,
+             int           length,
+             GSList       *attributes,
+             int           offset,
+             PangoLogAttr *log_attrs,
+             int           log_attrs_len)
+{
+  PangoAttrList allow_breaks;
+  PangoAttrList words;
+  PangoAttrList sentences;
+  PangoAttrList hyphens;
+  GSList *l;
+  gboolean tailored = FALSE;
+
+  _pango_attr_list_init (&allow_breaks);
+  _pango_attr_list_init (&words);
+  _pango_attr_list_init (&sentences);
+  _pango_attr_list_init (&hyphens);
+
+  for (l = attributes; l; l = l->next)
+    {
+      PangoAttribute *attr = l->data;
+
+      if (attr->klass->type == PANGO_ATTR_ALLOW_BREAKS)
+        pango_attr_list_insert (&allow_breaks, pango_attribute_copy (attr));
+      else if (attr->klass->type == PANGO_ATTR_WORD)
+        pango_attr_list_insert (&words, pango_attribute_copy (attr));
+      else if (attr->klass->type == PANGO_ATTR_SENTENCE)
+        pango_attr_list_insert (&sentences, pango_attribute_copy (attr));
+      else if (attr->klass->type == PANGO_ATTR_INSERT_HYPHENS)
+        pango_attr_list_insert (&hyphens, pango_attribute_copy (attr));
+    }
+
+  tailored |= handle_allow_breaks (text, length, &allow_breaks, offset,
+                                   log_attrs, log_attrs_len);
+
+  tailored |= handle_words (text, length, &words, offset,
+                            log_attrs, log_attrs_len);
+
+  tailored |= handle_sentences (text, length, &sentences, offset,
+                                log_attrs, log_attrs_len);
+
+  tailored |= handle_hyphens (text, length, &hyphens, offset,
+                              log_attrs, log_attrs_len);
+
+  _pango_attr_list_destroy (&allow_breaks);
+  _pango_attr_list_destroy (&words);
+  _pango_attr_list_destroy (&sentences);
   _pango_attr_list_destroy (&hyphens);
 
-  return TRUE;
+  return tailored;
 }
 
 /* }}} */
@@ -2033,6 +2349,6 @@ pango_get_log_attrs (const char *text,
                     attrs_len);
 }
 
- /* }}} */
+/* }}} */
 
 /* vim:set foldmethod=marker expandtab: */
diff --git a/pango/pango-attributes.c b/pango/pango-attributes.c
index 28dc4105..326234d2 100644
--- a/pango/pango-attributes.c
+++ b/pango/pango-attributes.c
@@ -1303,6 +1303,60 @@ pango_attr_show_new (PangoShowFlags flags)
 }
 
 /**
+ * pango_attr_word_new:
+ *
+ * Marks the range of the attribute as a single word.
+ *
+ * Note that this may require adjustments to word and
+ * sentence classification around the range.
+ *
+ * Return value: (transfer full): the newly allocated
+ *   `PangoAttribute`, which should be freed with
+ *   [method@Pango.Attribute.destroy]
+ *
+ * Since: 1.50
+ */
+PangoAttribute *
+pango_attr_word_new (void)
+{
+  static const PangoAttrClass klass = {
+    PANGO_ATTR_WORD,
+    pango_attr_int_copy,
+    pango_attr_int_destroy,
+    pango_attr_int_equal,
+  };
+
+  return pango_attr_int_new (&klass, 0);
+}
+
+/**
+ * pango_attr_sentence_new:
+ *
+ * Marks the range of the attribute as a single sentence.
+ *
+ * Note that this may require adjustments to word and
+ * sentence classification around the range.
+ *
+ * Return value: (transfer full): the newly allocated
+ *   `PangoAttribute`, which should be freed with
+ *   [method@Pango.Attribute.destroy]
+ *
+ * Since: 1.50
+ */
+PangoAttribute *
+pango_attr_sentence_new (void)
+{
+  static const PangoAttrClass klass = {
+    PANGO_ATTR_SENTENCE,
+    pango_attr_int_copy,
+    pango_attr_int_destroy,
+    pango_attr_int_equal,
+  };
+
+  return pango_attr_int_new (&klass, 0);
+}
+
+/**
  * pango_attr_overline_new:
  * @overline: the overline style
  *
@@ -1477,6 +1531,8 @@ pango_attribute_as_int (PangoAttribute *attr)
     case PANGO_ATTR_OVERLINE:
     case PANGO_ATTR_ABSOLUTE_LINE_HEIGHT:
     case PANGO_ATTR_TEXT_TRANSFORM:
+    case PANGO_ATTR_WORD:
+    case PANGO_ATTR_SENTENCE:
       return (PangoAttrInt *)attr;
 
     default:
diff --git a/pango/pango-attributes.h b/pango/pango-attributes.h
index 86826b62..613aa021 100644
--- a/pango/pango-attributes.h
+++ b/pango/pango-attributes.h
@@ -77,6 +77,8 @@ typedef struct _PangoAttrFontFeatures PangoAttrFontFeatures;
  * @PANGO_ATTR_OVERLINE_COLOR: overline color ([struct@Pango.AttrColor]). Since 1.46
  * @PANGO_ATTR_LINE_HEIGHT: line height factor ([struct@Pango.AttrFloat]). Since: 1.50
  * @PANGO_ATTR_ABSOLUTE_LINE_HEIGHT: line height ([struct@Pango.AttrInt]). Since: 1.50
+ * @PANGO_ATTR_WORD: override segmentation to classify the range of the attribute as a single word ([struct@Pango.AttrInt]). Since 1.50
+ * @PANGO_ATTR_SENTENCE: override segmentation to classify the range of the attribute as a single sentence ([struct@Pango.AttrInt]). Since 1.50
  *
  * The `PangoAttrType` distinguishes between different types of attributes.
  *
@@ -121,6 +123,8 @@ typedef enum
   PANGO_ATTR_LINE_HEIGHT,       /* PangoAttrFloat */
   PANGO_ATTR_ABSOLUTE_LINE_HEIGHT, /* PangoAttrInt */
   PANGO_ATTR_TEXT_TRANSFORM,    /* PangoAttrInt */
+  PANGO_ATTR_WORD,              /* PangoAttrInt */
+  PANGO_ATTR_SENTENCE,          /* PangoAttrInt */
 } PangoAttrType;
 
 /**
@@ -538,6 +542,12 @@ PANGO_AVAILABLE_IN_1_38
 PangoAttribute *        pango_attr_background_alpha_new         (guint16                     alpha);
 PANGO_AVAILABLE_IN_1_44
 PangoAttribute *        pango_attr_allow_breaks_new             (gboolean                    allow_breaks);
+
+PANGO_AVAILABLE_IN_1_50
+PangoAttribute *        pango_attr_word_new                     (void);
+PANGO_AVAILABLE_IN_1_50
+PangoAttribute *        pango_attr_sentence_new                 (void);
+
 PANGO_AVAILABLE_IN_1_44
 PangoAttribute *        pango_attr_insert_hyphens_new           (gboolean                    insert_hyphens);
 PANGO_AVAILABLE_IN_1_46
diff --git a/pango/pango-layout.c b/pango/pango-layout.c
index 85f12f0a..ad56e8f9 100644
--- a/pango/pango-layout.c
+++ b/pango/pango-layout.c
@@ -4323,6 +4323,8 @@ affects_break_or_shape (PangoAttribute *attr,
     {
     /* Affects breaks */
     case PANGO_ATTR_ALLOW_BREAKS:
+    case PANGO_ATTR_WORD:
+    case PANGO_ATTR_SENTENCE:
     /* Affects shaping */
     case PANGO_ATTR_INSERT_HYPHENS:
     case PANGO_ATTR_FONT_FEATURES:
diff --git a/pango/pango-markup.c b/pango/pango-markup.c
index 22064103..a9df8ed0 100644
--- a/pango/pango-markup.c
+++ b/pango/pango-markup.c
@@ -1230,6 +1230,7 @@ span_parse_func (MarkupData *md G_GNUC_UNUSED,
   const char *show = NULL;
   const char *line_height = NULL;
   const char *text_transform = NULL;
+  const char *segment = NULL;
 
   g_markup_parse_context_get_position (context,
                                        &line_number, &char_number);
@@ -1297,6 +1298,7 @@ span_parse_func (MarkupData *md G_GNUC_UNUSED,
             CHECK_ATTRIBUTE (strikethrough);
             CHECK_ATTRIBUTE (strikethrough_color);
             CHECK_ATTRIBUTE (style);
+            CHECK_ATTRIBUTE (segment);
             break;
           case 't':
             CHECK_ATTRIBUTE (text_transform);
@@ -1712,7 +1714,7 @@ span_parse_func (MarkupData *md G_GNUC_UNUSED,
       gboolean b = FALSE;
 
       if (!span_parse_boolean ("allow_breaks", allow_breaks, &b, line_number, error))
-	goto error;
+        goto error;
 
       add_attribute (tag, pango_attr_allow_breaks_new (b));
     }
@@ -1727,6 +1729,25 @@ span_parse_func (MarkupData *md G_GNUC_UNUSED,
       add_attribute (tag, pango_attr_insert_hyphens_new (b));
     }
 
+  if (G_UNLIKELY (segment))
+    {
+      if (strcmp (segment, "word") == 0)
+        add_attribute (tag, pango_attr_word_new ());
+      else if (strcmp (segment, "sentence") == 0)
+        add_attribute (tag, pango_attr_sentence_new ());
+      else
+        {
+          g_set_error (error,
+                       G_MARKUP_ERROR,
+                       G_MARKUP_ERROR_INVALID_CONTENT,
+                       _("Value of 'segment' attribute on <span> tag on line %d "
+                         "could not be parsed; should be one of 'word' or "
+                         "'sentence', not '%s'"),
+                       line_number, segment);
+          goto error;
+        }
+    }
+
   return TRUE;
 
  error: