From 70933101e4eb4dee9a29fe0992624ae9c58adf00 Mon Sep 17 00:00:00 2001 From: Matthias Clasen Date: Tue, 24 Aug 2021 22:41:15 -0400 Subject: Add hyphens to log attrs The code computing this is much better off in break.c, so move it there, and keep the information in the log attr array. --- pango/break.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 102 insertions(+), 3 deletions(-) (limited to 'pango/break.c') diff --git a/pango/break.c b/pango/break.c index c55d5f22..8e1aeb56 100644 --- a/pango/break.c +++ b/pango/break.c @@ -249,6 +249,8 @@ default_break (const char *text, gint last_sentence_start = -1; gint last_non_space = -1; + gboolean prev_space_or_hyphen; + gboolean almost_done = FALSE; gboolean done = FALSE; @@ -261,6 +263,7 @@ default_break (const char *text, prev_prev_break_type = G_UNICODE_BREAK_UNKNOWN; prev_wc = 0; prev_jamo = NO_JAMO; + prev_space_or_hyphen = FALSE; if (length == 0 || *text == '\0') { @@ -291,6 +294,8 @@ default_break (const char *text, /* Emoji extended pictographics */ gboolean is_Extended_Pictographic; + PangoScript script; + wc = next_wc; break_type = next_break_type; @@ -533,17 +538,16 @@ default_break (const char *text, prev_GB_type = GB_type; } + script = (PangoScript)g_unichar_get_script (wc); + /* ---- UAX#29 Word Boundaries ---- */ { is_word_boundary = FALSE; if (is_grapheme_boundary || G_UNLIKELY(wc >=0x1F1E6 && wc <=0x1F1FF)) /* Rules WB3 and WB4 */ { - PangoScript script; WordBreakType WB_type; - script = (PangoScript)g_unichar_get_script (wc); - /* Find the WordBreakType of wc */ WB_type = WB_Other; @@ -1552,7 +1556,68 @@ default_break (const char *text, attrs[i - 1].is_white) { last_sentence_start++; } + } + + /* --- Hyphens --- */ + { + gboolean insert_hyphens; + gboolean space_or_hyphen = FALSE; + + switch ((int)script) + { + case PANGO_SCRIPT_COMMON: + case PANGO_SCRIPT_HAN: + case PANGO_SCRIPT_HANGUL: + case PANGO_SCRIPT_HIRAGANA: + case PANGO_SCRIPT_KATAKANA: + insert_hyphens = FALSE; + break; + default: + insert_hyphens = TRUE; + break; + } + + switch ((int)type) + { + case G_UNICODE_SPACE_SEPARATOR: + case G_UNICODE_LINE_SEPARATOR: + case G_UNICODE_PARAGRAPH_SEPARATOR: + space_or_hyphen = TRUE; + break; + case G_UNICODE_CONTROL: + if (wc == '\t' || wc == '\n' || wc == '\r' || wc == '\f') + space_or_hyphen = TRUE; + break; + default: + break; + } + + if (!space_or_hyphen) + { + if (wc == '-' || /* Hyphen-minus */ + wc == 0x058a || /* Armenian hyphen */ + wc == 0x1400 || /* Canadian syllabics hyphen */ + wc == 0x1806 || /* Mongolian todo hyphen */ + wc == 0x2010 || /* Hyphen */ + wc == 0x2027 || /* Hyphenation point */ + wc == 0x2e17 || /* Double oblique hyphen */ + wc == 0x2e40 || /* Double hyphen */ + wc == 0x30a0 || /* Katakana-Hiragana double hyphen */ + wc == 0xfe63 || /* Small hyphen-minus */ + wc == 0xff0d) /* Fullwidth hyphen-minus */ + space_or_hyphen = TRUE; + } + + if (attrs[i].is_word_boundary) + attrs[i].break_inserts_hyphen = FALSE; + else if (prev_space_or_hyphen) + attrs[i].break_inserts_hyphen = FALSE; + else if (space_or_hyphen) + attrs[i].break_inserts_hyphen = FALSE; + else + attrs[i].break_inserts_hyphen = insert_hyphens; + prev_space_or_hyphen = space_or_hyphen; } prev_wc = wc; @@ -1633,16 +1698,21 @@ break_attrs (const char *text, int log_attrs_len) { PangoAttrList list; + PangoAttrList hyphens; PangoAttrIterator iter; GSList *l; _pango_attr_list_init (&list); + _pango_attr_list_init (&hyphens); + for (l = attributes; l; l = l->next) { PangoAttribute *attr = l->data; if (attr->klass->type == PANGO_ATTR_ALLOW_BREAKS) pango_attr_list_insert (&list, pango_attribute_copy (attr)); + if (attr->klass->type == PANGO_ATTR_INSERT_HYPHENS) + pango_attr_list_insert (&hyphens, pango_attribute_copy (attr)); } if (!_pango_attr_list_has_attributes (&list)) @@ -1681,7 +1751,36 @@ break_attrs (const char *text, } while (pango_attr_iterator_next (&iter)); _pango_attr_iterator_destroy (&iter); + + _pango_attr_list_get_iterator (&hyphens, &iter); + do { + const PangoAttribute *attr = pango_attr_iterator_get (&iter, PANGO_ATTR_INSERT_HYPHENS); + + if (attr && ((PangoAttrInt*)attr)->value == 0) + { + int start, end; + int start_pos, end_pos; + int pos; + + pango_attr_iterator_range (&iter, &start, &end); + if (start < offset) + start_pos = 0; + else + start_pos = g_utf8_pointer_to_offset (text, text + start - offset); + if (end >= offset + length) + end_pos = log_attrs_len; + else + end_pos = g_utf8_pointer_to_offset (text, text + end - offset); + + for (pos = start_pos + 1; pos < end_pos; pos++) + { + log_attrs[pos].break_inserts_hyphen = FALSE; + } + } + } while (pango_attr_iterator_next (&iter)); + _pango_attr_list_destroy (&list); + _pango_attr_list_destroy (&hyphens); return TRUE; } -- cgit v1.2.1 From 2c9792d4b435e87e8616c22e1e5516d7302b06dc Mon Sep 17 00:00:00 2001 From: Matthias Clasen Date: Wed, 25 Aug 2021 00:09:37 -0400 Subject: Refine hyphenation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace ‧ and | with a - when we break there. Update affected test output. Fixes: #603 --- pango/break.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'pango/break.c') diff --git a/pango/break.c b/pango/break.c index 8e1aeb56..043ac0cc 100644 --- a/pango/break.c +++ b/pango/break.c @@ -1559,10 +1559,14 @@ default_break (const char *text, } /* --- Hyphens --- */ + { gboolean insert_hyphens; gboolean space_or_hyphen = FALSE; + attrs[i].break_inserts_hyphen = FALSE; + attrs[i].break_removes_preceding = FALSE; + switch ((int)script) { case PANGO_SCRIPT_COMMON: @@ -1599,7 +1603,6 @@ default_break (const char *text, wc == 0x1400 || /* Canadian syllabics hyphen */ wc == 0x1806 || /* Mongolian todo hyphen */ wc == 0x2010 || /* Hyphen */ - wc == 0x2027 || /* Hyphenation point */ wc == 0x2e17 || /* Double oblique hyphen */ wc == 0x2e40 || /* Double hyphen */ wc == 0x30a0 || /* Katakana-Hiragana double hyphen */ @@ -1617,6 +1620,13 @@ default_break (const char *text, else attrs[i].break_inserts_hyphen = insert_hyphens; + if (prev_wc == 0x007C || /* Vertical Line */ + prev_wc == 0x2027) /* Hyphenation point */ + { + attrs[i].break_inserts_hyphen = TRUE; + attrs[i].break_removes_preceding = TRUE; + } + prev_space_or_hyphen = space_or_hyphen; } @@ -1774,7 +1784,8 @@ break_attrs (const char *text, for (pos = start_pos + 1; pos < end_pos; pos++) { - log_attrs[pos].break_inserts_hyphen = FALSE; + if (!log_attrs[pos].break_removes_preceding) + log_attrs[pos].break_inserts_hyphen = FALSE; } } } while (pango_attr_iterator_next (&iter)); -- cgit v1.2.1 From 6058254f0106c624dba2b555959554e1d3b8f2a1 Mon Sep 17 00:00:00 2001 From: Matthias Clasen Date: Tue, 24 Aug 2021 22:44:14 -0400 Subject: test-break: Print out hyphens Print out all the log attrs we have. Update expected test outputs. --- pango/break.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'pango/break.c') diff --git a/pango/break.c b/pango/break.c index 043ac0cc..d348f9b8 100644 --- a/pango/break.c +++ b/pango/break.c @@ -1721,16 +1721,10 @@ break_attrs (const char *text, if (attr->klass->type == PANGO_ATTR_ALLOW_BREAKS) pango_attr_list_insert (&list, pango_attribute_copy (attr)); - if (attr->klass->type == PANGO_ATTR_INSERT_HYPHENS) + else if (attr->klass->type == PANGO_ATTR_INSERT_HYPHENS) pango_attr_list_insert (&hyphens, pango_attribute_copy (attr)); } - if (!_pango_attr_list_has_attributes (&list)) - { - _pango_attr_list_destroy (&list); - return FALSE; - } - _pango_attr_list_get_iterator (&list, &iter); do { const PangoAttribute *attr = pango_attr_iterator_get (&iter, PANGO_ATTR_ALLOW_BREAKS); @@ -1790,6 +1784,8 @@ break_attrs (const char *text, } } while (pango_attr_iterator_next (&iter)); + _pango_attr_iterator_destroy (&iter); + _pango_attr_list_destroy (&list); _pango_attr_list_destroy (&hyphens); -- cgit v1.2.1