diff options
author | Behdad Esfahbod <behdad@behdad.org> | 2018-10-15 14:53:31 -0700 |
---|---|---|
committer | Behdad Esfahbod <behdad@behdad.org> | 2018-10-15 15:36:06 -0700 |
commit | f8ca9ca5ed3198e7fbab7381e232f9f20ecd93da (patch) | |
tree | 6093dc34732e5c0cbd258e24ac3986f8b321d2c7 /pango/pango-emoji.c | |
parent | 15eb263d75496119d4bcd14069199bf8367926e9 (diff) | |
download | pango-f8ca9ca5ed3198e7fbab7381e232f9f20ecd93da.tar.gz |
[emoji] Port to new Ragel-based iterator, based on Chromium again
There's a couple of regressions in this apparently. I'm working with Dominik
to fix on Chrome side and will push here after.
Diffstat (limited to 'pango/pango-emoji.c')
-rw-r--r-- | pango/pango-emoji.c | 256 |
1 files changed, 140 insertions, 116 deletions
diff --git a/pango/pango-emoji.c b/pango/pango-emoji.c index 29472452..46ab5b3f 100644 --- a/pango/pango-emoji.c +++ b/pango/pango-emoji.c @@ -18,11 +18,27 @@ * Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 02111-1307, USA. * - * Implementation of pango_emoji_iter is derived from Chromium: + * Implementation of pango_emoji_iter is based on Chromium's Ragel-based + * parser: * - * https://cs.chromium.org/chromium/src/third_party/WebKit/Source/platform/fonts/FontFallbackPriority.h - * https://cs.chromium.org/chromium/src/third_party/WebKit/Source/platform/text/CharacterEmoji.cpp - * https://cs.chromium.org/chromium/src/third_party/WebKit/Source/platform/fonts/SymbolsIterator.cpp + * https://chromium-review.googlesource.com/c/chromium/src/+/1264577 + * + * The grammar file emoji_presentation_scanner.rl was just modified to + * adapt the function signature and variables to our usecase. The + * grammar itself was NOT modified: + * + * https://chromium-review.googlesource.com/c/chromium/src/+/1264577/3/third_party/blink/renderer/platform/fonts/emoji_presentation_scanner.rl + * + * The emoji_presentation_scanner.c is generated from .rl file by + * running ragel on it. + * + * The categorization is also based on: + * + * https://chromium-review.googlesource.com/c/chromium/src/+/1264577/3/third_party/blink/renderer/platform/fonts/utf16_ragel_iterator.h + * + * The iterator next() is based on: + * + * https://chromium-review.googlesource.com/c/chromium/src/+/1264577/3/third_party/blink/renderer/platform/fonts/symbols_iterator.cc * * // Copyright 2015 The Chromium Authors. All rights reserved. * // Use of this source code is governed by a BSD-style license that can be @@ -105,62 +121,110 @@ _pango_Is_Regional_Indicator (gunichar ch) const gunichar kCombiningEnclosingCircleBackslashCharacter = 0x20E0; const gunichar kCombiningEnclosingKeycapCharacter = 0x20E3; -const gunichar kEyeCharacter = 0x1F441; -const gunichar kFemaleSignCharacter = 0x2640; -const gunichar kLeftSpeechBubbleCharacter = 0x1F5E8; -const gunichar kMaleSignCharacter = 0x2642; -const gunichar kRainbowCharacter = 0x1F308; -const gunichar kStaffOfAesculapiusCharacter = 0x2695; const gunichar kVariationSelector15Character = 0xFE0E; const gunichar kVariationSelector16Character = 0xFE0F; -const gunichar kWavingWhiteFlagCharacter = 0x1F3F3; const gunichar kZeroWidthJoinerCharacter = 0x200D; - -typedef enum { - PANGO_EMOJI_TYPE_INVALID, - PANGO_EMOJI_TYPE_TEXT, /* For regular non-symbols text */ - PANGO_EMOJI_TYPE_EMOJI_TEXT, /* For emoji in text presentaiton */ - PANGO_EMOJI_TYPE_EMOJI_EMOJI /* For emoji in emoji presentation */ -} PangoEmojiType; - -static PangoEmojiType -_pango_get_emoji_type (gunichar codepoint) +enum PangoEmojiScannerCategory { + EMOJI = 0, + EMOJI_TEXT_PRESENTATION = 1, + EMOJI_EMOJI_PRESENTATION = 2, + EMOJI_MODIFIER_BASE = 3, + EMOJI_MODIFIER = 4, + EMOJI_VS_BASE = 5, + REGIONAL_INDICATOR = 6, + KEYCAP_BASE = 7, + COMBINING_ENCLOSING_KEYCAP = 8, + COMBINING_ENCLOSING_CIRCLE_BACKSLASH = 9, + ZWJ = 10, + VS15 = 11, + VS16 = 12, + TAG_BASE = 13, + TAG_SEQUENCE = 14, + TAG_TERM = 15, + kMaxEmojiScannerCategory = 16 +}; + +static unsigned char +_pango_EmojiSegmentationCategory (gunichar codepoint) { - /* Those should only be Emoji presentation as combinations of two. */ - if (_pango_Is_Emoji_Keycap_Base (codepoint) || - _pango_Is_Regional_Indicator (codepoint)) - return PANGO_EMOJI_TYPE_TEXT; - + /* Specific ones first. */ if (codepoint == kCombiningEnclosingKeycapCharacter) - return PANGO_EMOJI_TYPE_EMOJI_EMOJI; - - if (_pango_Is_Emoji_Emoji_Default (codepoint) || - _pango_Is_Emoji_Modifier_Base (codepoint) || - _pango_Is_Emoji_Modifier (codepoint)) - return PANGO_EMOJI_TYPE_EMOJI_EMOJI; - + return COMBINING_ENCLOSING_KEYCAP; + if (codepoint == kCombiningEnclosingCircleBackslashCharacter) + return COMBINING_ENCLOSING_CIRCLE_BACKSLASH; + if (codepoint == kZeroWidthJoinerCharacter) + return ZWJ; + if (codepoint == kVariationSelector15Character) + return VS15; + if (codepoint == kVariationSelector16Character) + return VS16; + if (codepoint == 0x1F3F4) + return TAG_BASE; + if ((codepoint >= 0xE0030 && codepoint <= 0xE0039) || + (codepoint >= 0xE0061 && codepoint <= 0xE007A)) + return TAG_SEQUENCE; + if (codepoint == 0xE007F) + return TAG_TERM; + if (_pango_Is_Emoji_Modifier_Base (codepoint)) + return EMOJI_MODIFIER_BASE; + if (_pango_Is_Emoji_Modifier (codepoint)) + return EMOJI_MODIFIER; + if (_pango_Is_Regional_Indicator (codepoint)) + return REGIONAL_INDICATOR; + if (_pango_Is_Emoji_Keycap_Base (codepoint)) + return KEYCAP_BASE; + + if (_pango_Is_Emoji_Emoji_Default (codepoint)) + return EMOJI_EMOJI_PRESENTATION; if (_pango_Is_Emoji_Text_Default (codepoint)) - return PANGO_EMOJI_TYPE_EMOJI_TEXT; + return EMOJI_TEXT_PRESENTATION; + if (_pango_Is_Emoji (codepoint)) + return EMOJI; - return PANGO_EMOJI_TYPE_TEXT; + /* Ragel state machine will interpret unknown category as "any". */ + return kMaxEmojiScannerCategory; } +#define found_text_presentation_sequence +#define found_emoji_presentation_sequence \ + { \ + if (0) g_print ("emoji %ld..%ld\n", ts - buffer, te - buffer); \ + *last = ts - buffer; \ + *end = te - buffer; \ + return TRUE; \ + } + +#include "emoji_presentation_scanner.c" + PangoEmojiIter * _pango_emoji_iter_init (PangoEmojiIter *iter, const char *text, int length) { - iter->text_start = text; + unsigned int n_chars = g_utf8_strlen (text, length); + unsigned char *types = g_malloc (n_chars); + unsigned int i; + const char *p; + + p = text; + for (i = 0; i < n_chars; i++) + { + types[i] = _pango_EmojiSegmentationCategory (g_utf8_get_char (p)); + p = g_utf8_next_char (p); + } + + iter->text_start = iter->start = iter->end = iter->token_start = iter->token_end = text; if (length >= 0) iter->text_end = text + length; else iter->text_end = text + strlen (text); + iter->is_emoji = FALSE; - iter->start = text; - iter->end = text; - iter->is_emoji = (gboolean) 2; /* HACK */ + iter->types = types; + iter->n_chars = n_chars; + iter->cursor = 0; _pango_emoji_iter_next (iter); @@ -170,102 +234,62 @@ _pango_emoji_iter_init (PangoEmojiIter *iter, void _pango_emoji_iter_fini (PangoEmojiIter *iter) { + g_free (iter->types); } -#define PANGO_EMOJI_TYPE_IS_EMOJI(typ) ((typ) == PANGO_EMOJI_TYPE_EMOJI_EMOJI) - gboolean _pango_emoji_iter_next (PangoEmojiIter *iter) { - PangoEmojiType current_emoji_type = PANGO_EMOJI_TYPE_INVALID; - - if (iter->end == iter->text_end) + if (iter->end >= iter->text_end) return FALSE; iter->start = iter->end; - for (; iter->end < iter->text_end; iter->end = g_utf8_next_char (iter->end)) + /* The scan_emoji_presentation scanner function returns false when it reaches + * the end of the buffer and has not discovered any emoji runs in between. For + * Emoji runs, it returns true, and token_start_ and token_end_ are set to the + * start and end of the emoji sequence. This means, it may skip over text runs + * in between, see below. */ + if (iter->start >= iter->token_end) { - gunichar ch = g_utf8_get_char (iter->end); - - /* Except at the beginning, ZWJ just carries over the emoji or neutral - * text type, VS15 & VS16 we just carry over as well, since we already - * resolved those through lookahead. Also, don't downgrade to text - * presentation for emoji that are part of a ZWJ sequence, example - * U+1F441 U+200D U+1F5E8, eye (text presentation) + ZWJ + left speech - * bubble, see below. */ - if ((!(ch == kZeroWidthJoinerCharacter && !iter->is_emoji) && - ch != kVariationSelector15Character && - ch != kVariationSelector16Character && - ch != kCombiningEnclosingCircleBackslashCharacter && - !_pango_Is_Regional_Indicator(ch) && - !((ch == kLeftSpeechBubbleCharacter || - ch == kRainbowCharacter || - ch == kMaleSignCharacter || - ch == kFemaleSignCharacter || - ch == kStaffOfAesculapiusCharacter) && - !iter->is_emoji)) || - current_emoji_type == PANGO_EMOJI_TYPE_INVALID) { - current_emoji_type = _pango_get_emoji_type (ch); + /* We need to scan furhter. */ + unsigned int token_start, token_end; + if (!scan_emoji_presentation (iter->types, iter->n_chars, iter->cursor, + &token_start, &token_end)) + { + /* The scanner returned false, which means it has reached the end of the + * buffer without discovering any emoji segments in between. */ + iter->end = iter->text_end; + iter->is_emoji = FALSE; + + return TRUE; + }; + /* Ugly... */ + g_assert (iter->cursor <= token_start && token_start < token_end && token_end <= iter->n_chars); + iter->token_start = g_utf8_offset_to_pointer (iter->token_end, token_start - iter->cursor); + iter->token_end = g_utf8_offset_to_pointer (iter->token_end, token_end - iter->cursor); + iter->cursor = token_end; } - if (g_utf8_next_char (iter->end) < iter->text_end) /* Optimize. */ + if (iter->start < iter->token_start) { - gunichar peek_char = g_utf8_get_char (g_utf8_next_char (iter->end)); - - /* Variation Selectors */ - if (current_emoji_type == - PANGO_EMOJI_TYPE_EMOJI_EMOJI && - peek_char == kVariationSelector15Character) { - current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_TEXT; - } - - if ((current_emoji_type == - PANGO_EMOJI_TYPE_EMOJI_TEXT || - _pango_Is_Emoji_Keycap_Base(ch)) && - peek_char == kVariationSelector16Character) { - current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI; - } - - /* Combining characters Keycap... */ - if (_pango_Is_Emoji_Keycap_Base(ch) && - peek_char == kCombiningEnclosingKeycapCharacter) { - current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI; - }; - - /* Regional indicators */ - if (_pango_Is_Regional_Indicator(ch) && - _pango_Is_Regional_Indicator(peek_char)) { - current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI; - } - - /* Upgrade text presentation emoji to emoji presentation when followed by - * ZWJ, Example U+1F441 U+200D U+1F5E8, eye + ZWJ + left speech bubble. */ - if ((ch == kEyeCharacter || - ch == kWavingWhiteFlagCharacter) && - peek_char == kZeroWidthJoinerCharacter) { - current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI; - } + /* The scanner function has progressed to the next emoji segment, but we + * need to return the text segment over which it had skipped. */ + iter->end = iter->token_start; + iter->is_emoji = FALSE; + return TRUE; } - if (iter->is_emoji == (gboolean) 2) - iter->is_emoji = !PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type); - if (iter->is_emoji == PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type)) + if (iter->start >= iter->token_start && iter->start < iter->token_end) { - iter->is_emoji = !PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type); - - /* Make sure we make progress. Weird sequences, like a VC15 followed - * by VC16, can trick us into stalling otherwise. */ - if (iter->start == iter->end) - iter->end = g_utf8_next_char (iter->end); - + /* Now our cursor has reached the emoji segment, and we can return it. */ + iter->end = iter->token_end; + iter->is_emoji = TRUE; return TRUE; } - } - - iter->is_emoji = PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type); - return TRUE; + g_assert_not_reached (); + return FALSE; } |