summaryrefslogtreecommitdiff
path: root/pango/pango-emoji.c
diff options
context:
space:
mode:
authorBehdad Esfahbod <behdad@behdad.org>2018-10-15 14:53:31 -0700
committerBehdad Esfahbod <behdad@behdad.org>2018-10-15 15:36:06 -0700
commitf8ca9ca5ed3198e7fbab7381e232f9f20ecd93da (patch)
tree6093dc34732e5c0cbd258e24ac3986f8b321d2c7 /pango/pango-emoji.c
parent15eb263d75496119d4bcd14069199bf8367926e9 (diff)
downloadpango-f8ca9ca5ed3198e7fbab7381e232f9f20ecd93da.tar.gz
[emoji] Port to new Ragel-based iterator, based on Chromium again
There's a couple of regressions in this apparently. I'm working with Dominik to fix on Chrome side and will push here after.
Diffstat (limited to 'pango/pango-emoji.c')
-rw-r--r--pango/pango-emoji.c256
1 files changed, 140 insertions, 116 deletions
diff --git a/pango/pango-emoji.c b/pango/pango-emoji.c
index 29472452..46ab5b3f 100644
--- a/pango/pango-emoji.c
+++ b/pango/pango-emoji.c
@@ -18,11 +18,27 @@
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*
- * Implementation of pango_emoji_iter is derived from Chromium:
+ * Implementation of pango_emoji_iter is based on Chromium's Ragel-based
+ * parser:
*
- * https://cs.chromium.org/chromium/src/third_party/WebKit/Source/platform/fonts/FontFallbackPriority.h
- * https://cs.chromium.org/chromium/src/third_party/WebKit/Source/platform/text/CharacterEmoji.cpp
- * https://cs.chromium.org/chromium/src/third_party/WebKit/Source/platform/fonts/SymbolsIterator.cpp
+ * https://chromium-review.googlesource.com/c/chromium/src/+/1264577
+ *
+ * The grammar file emoji_presentation_scanner.rl was just modified to
+ * adapt the function signature and variables to our usecase. The
+ * grammar itself was NOT modified:
+ *
+ * https://chromium-review.googlesource.com/c/chromium/src/+/1264577/3/third_party/blink/renderer/platform/fonts/emoji_presentation_scanner.rl
+ *
+ * The emoji_presentation_scanner.c is generated from .rl file by
+ * running ragel on it.
+ *
+ * The categorization is also based on:
+ *
+ * https://chromium-review.googlesource.com/c/chromium/src/+/1264577/3/third_party/blink/renderer/platform/fonts/utf16_ragel_iterator.h
+ *
+ * The iterator next() is based on:
+ *
+ * https://chromium-review.googlesource.com/c/chromium/src/+/1264577/3/third_party/blink/renderer/platform/fonts/symbols_iterator.cc
*
* // Copyright 2015 The Chromium Authors. All rights reserved.
* // Use of this source code is governed by a BSD-style license that can be
@@ -105,62 +121,110 @@ _pango_Is_Regional_Indicator (gunichar ch)
const gunichar kCombiningEnclosingCircleBackslashCharacter = 0x20E0;
const gunichar kCombiningEnclosingKeycapCharacter = 0x20E3;
-const gunichar kEyeCharacter = 0x1F441;
-const gunichar kFemaleSignCharacter = 0x2640;
-const gunichar kLeftSpeechBubbleCharacter = 0x1F5E8;
-const gunichar kMaleSignCharacter = 0x2642;
-const gunichar kRainbowCharacter = 0x1F308;
-const gunichar kStaffOfAesculapiusCharacter = 0x2695;
const gunichar kVariationSelector15Character = 0xFE0E;
const gunichar kVariationSelector16Character = 0xFE0F;
-const gunichar kWavingWhiteFlagCharacter = 0x1F3F3;
const gunichar kZeroWidthJoinerCharacter = 0x200D;
-
-typedef enum {
- PANGO_EMOJI_TYPE_INVALID,
- PANGO_EMOJI_TYPE_TEXT, /* For regular non-symbols text */
- PANGO_EMOJI_TYPE_EMOJI_TEXT, /* For emoji in text presentaiton */
- PANGO_EMOJI_TYPE_EMOJI_EMOJI /* For emoji in emoji presentation */
-} PangoEmojiType;
-
-static PangoEmojiType
-_pango_get_emoji_type (gunichar codepoint)
+enum PangoEmojiScannerCategory {
+ EMOJI = 0,
+ EMOJI_TEXT_PRESENTATION = 1,
+ EMOJI_EMOJI_PRESENTATION = 2,
+ EMOJI_MODIFIER_BASE = 3,
+ EMOJI_MODIFIER = 4,
+ EMOJI_VS_BASE = 5,
+ REGIONAL_INDICATOR = 6,
+ KEYCAP_BASE = 7,
+ COMBINING_ENCLOSING_KEYCAP = 8,
+ COMBINING_ENCLOSING_CIRCLE_BACKSLASH = 9,
+ ZWJ = 10,
+ VS15 = 11,
+ VS16 = 12,
+ TAG_BASE = 13,
+ TAG_SEQUENCE = 14,
+ TAG_TERM = 15,
+ kMaxEmojiScannerCategory = 16
+};
+
+static unsigned char
+_pango_EmojiSegmentationCategory (gunichar codepoint)
{
- /* Those should only be Emoji presentation as combinations of two. */
- if (_pango_Is_Emoji_Keycap_Base (codepoint) ||
- _pango_Is_Regional_Indicator (codepoint))
- return PANGO_EMOJI_TYPE_TEXT;
-
+ /* Specific ones first. */
if (codepoint == kCombiningEnclosingKeycapCharacter)
- return PANGO_EMOJI_TYPE_EMOJI_EMOJI;
-
- if (_pango_Is_Emoji_Emoji_Default (codepoint) ||
- _pango_Is_Emoji_Modifier_Base (codepoint) ||
- _pango_Is_Emoji_Modifier (codepoint))
- return PANGO_EMOJI_TYPE_EMOJI_EMOJI;
-
+ return COMBINING_ENCLOSING_KEYCAP;
+ if (codepoint == kCombiningEnclosingCircleBackslashCharacter)
+ return COMBINING_ENCLOSING_CIRCLE_BACKSLASH;
+ if (codepoint == kZeroWidthJoinerCharacter)
+ return ZWJ;
+ if (codepoint == kVariationSelector15Character)
+ return VS15;
+ if (codepoint == kVariationSelector16Character)
+ return VS16;
+ if (codepoint == 0x1F3F4)
+ return TAG_BASE;
+ if ((codepoint >= 0xE0030 && codepoint <= 0xE0039) ||
+ (codepoint >= 0xE0061 && codepoint <= 0xE007A))
+ return TAG_SEQUENCE;
+ if (codepoint == 0xE007F)
+ return TAG_TERM;
+ if (_pango_Is_Emoji_Modifier_Base (codepoint))
+ return EMOJI_MODIFIER_BASE;
+ if (_pango_Is_Emoji_Modifier (codepoint))
+ return EMOJI_MODIFIER;
+ if (_pango_Is_Regional_Indicator (codepoint))
+ return REGIONAL_INDICATOR;
+ if (_pango_Is_Emoji_Keycap_Base (codepoint))
+ return KEYCAP_BASE;
+
+ if (_pango_Is_Emoji_Emoji_Default (codepoint))
+ return EMOJI_EMOJI_PRESENTATION;
if (_pango_Is_Emoji_Text_Default (codepoint))
- return PANGO_EMOJI_TYPE_EMOJI_TEXT;
+ return EMOJI_TEXT_PRESENTATION;
+ if (_pango_Is_Emoji (codepoint))
+ return EMOJI;
- return PANGO_EMOJI_TYPE_TEXT;
+ /* Ragel state machine will interpret unknown category as "any". */
+ return kMaxEmojiScannerCategory;
}
+#define found_text_presentation_sequence
+#define found_emoji_presentation_sequence \
+ { \
+ if (0) g_print ("emoji %ld..%ld\n", ts - buffer, te - buffer); \
+ *last = ts - buffer; \
+ *end = te - buffer; \
+ return TRUE; \
+ }
+
+#include "emoji_presentation_scanner.c"
+
PangoEmojiIter *
_pango_emoji_iter_init (PangoEmojiIter *iter,
const char *text,
int length)
{
- iter->text_start = text;
+ unsigned int n_chars = g_utf8_strlen (text, length);
+ unsigned char *types = g_malloc (n_chars);
+ unsigned int i;
+ const char *p;
+
+ p = text;
+ for (i = 0; i < n_chars; i++)
+ {
+ types[i] = _pango_EmojiSegmentationCategory (g_utf8_get_char (p));
+ p = g_utf8_next_char (p);
+ }
+
+ iter->text_start = iter->start = iter->end = iter->token_start = iter->token_end = text;
if (length >= 0)
iter->text_end = text + length;
else
iter->text_end = text + strlen (text);
+ iter->is_emoji = FALSE;
- iter->start = text;
- iter->end = text;
- iter->is_emoji = (gboolean) 2; /* HACK */
+ iter->types = types;
+ iter->n_chars = n_chars;
+ iter->cursor = 0;
_pango_emoji_iter_next (iter);
@@ -170,102 +234,62 @@ _pango_emoji_iter_init (PangoEmojiIter *iter,
void
_pango_emoji_iter_fini (PangoEmojiIter *iter)
{
+ g_free (iter->types);
}
-#define PANGO_EMOJI_TYPE_IS_EMOJI(typ) ((typ) == PANGO_EMOJI_TYPE_EMOJI_EMOJI)
-
gboolean
_pango_emoji_iter_next (PangoEmojiIter *iter)
{
- PangoEmojiType current_emoji_type = PANGO_EMOJI_TYPE_INVALID;
-
- if (iter->end == iter->text_end)
+ if (iter->end >= iter->text_end)
return FALSE;
iter->start = iter->end;
- for (; iter->end < iter->text_end; iter->end = g_utf8_next_char (iter->end))
+ /* The scan_emoji_presentation scanner function returns false when it reaches
+ * the end of the buffer and has not discovered any emoji runs in between. For
+ * Emoji runs, it returns true, and token_start_ and token_end_ are set to the
+ * start and end of the emoji sequence. This means, it may skip over text runs
+ * in between, see below. */
+ if (iter->start >= iter->token_end)
{
- gunichar ch = g_utf8_get_char (iter->end);
-
- /* Except at the beginning, ZWJ just carries over the emoji or neutral
- * text type, VS15 & VS16 we just carry over as well, since we already
- * resolved those through lookahead. Also, don't downgrade to text
- * presentation for emoji that are part of a ZWJ sequence, example
- * U+1F441 U+200D U+1F5E8, eye (text presentation) + ZWJ + left speech
- * bubble, see below. */
- if ((!(ch == kZeroWidthJoinerCharacter && !iter->is_emoji) &&
- ch != kVariationSelector15Character &&
- ch != kVariationSelector16Character &&
- ch != kCombiningEnclosingCircleBackslashCharacter &&
- !_pango_Is_Regional_Indicator(ch) &&
- !((ch == kLeftSpeechBubbleCharacter ||
- ch == kRainbowCharacter ||
- ch == kMaleSignCharacter ||
- ch == kFemaleSignCharacter ||
- ch == kStaffOfAesculapiusCharacter) &&
- !iter->is_emoji)) ||
- current_emoji_type == PANGO_EMOJI_TYPE_INVALID) {
- current_emoji_type = _pango_get_emoji_type (ch);
+ /* We need to scan furhter. */
+ unsigned int token_start, token_end;
+ if (!scan_emoji_presentation (iter->types, iter->n_chars, iter->cursor,
+ &token_start, &token_end))
+ {
+ /* The scanner returned false, which means it has reached the end of the
+ * buffer without discovering any emoji segments in between. */
+ iter->end = iter->text_end;
+ iter->is_emoji = FALSE;
+
+ return TRUE;
+ };
+ /* Ugly... */
+ g_assert (iter->cursor <= token_start && token_start < token_end && token_end <= iter->n_chars);
+ iter->token_start = g_utf8_offset_to_pointer (iter->token_end, token_start - iter->cursor);
+ iter->token_end = g_utf8_offset_to_pointer (iter->token_end, token_end - iter->cursor);
+ iter->cursor = token_end;
}
- if (g_utf8_next_char (iter->end) < iter->text_end) /* Optimize. */
+ if (iter->start < iter->token_start)
{
- gunichar peek_char = g_utf8_get_char (g_utf8_next_char (iter->end));
-
- /* Variation Selectors */
- if (current_emoji_type ==
- PANGO_EMOJI_TYPE_EMOJI_EMOJI &&
- peek_char == kVariationSelector15Character) {
- current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_TEXT;
- }
-
- if ((current_emoji_type ==
- PANGO_EMOJI_TYPE_EMOJI_TEXT ||
- _pango_Is_Emoji_Keycap_Base(ch)) &&
- peek_char == kVariationSelector16Character) {
- current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI;
- }
-
- /* Combining characters Keycap... */
- if (_pango_Is_Emoji_Keycap_Base(ch) &&
- peek_char == kCombiningEnclosingKeycapCharacter) {
- current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI;
- };
-
- /* Regional indicators */
- if (_pango_Is_Regional_Indicator(ch) &&
- _pango_Is_Regional_Indicator(peek_char)) {
- current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI;
- }
-
- /* Upgrade text presentation emoji to emoji presentation when followed by
- * ZWJ, Example U+1F441 U+200D U+1F5E8, eye + ZWJ + left speech bubble. */
- if ((ch == kEyeCharacter ||
- ch == kWavingWhiteFlagCharacter) &&
- peek_char == kZeroWidthJoinerCharacter) {
- current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI;
- }
+ /* The scanner function has progressed to the next emoji segment, but we
+ * need to return the text segment over which it had skipped. */
+ iter->end = iter->token_start;
+ iter->is_emoji = FALSE;
+ return TRUE;
}
- if (iter->is_emoji == (gboolean) 2)
- iter->is_emoji = !PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type);
- if (iter->is_emoji == PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type))
+ if (iter->start >= iter->token_start && iter->start < iter->token_end)
{
- iter->is_emoji = !PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type);
-
- /* Make sure we make progress. Weird sequences, like a VC15 followed
- * by VC16, can trick us into stalling otherwise. */
- if (iter->start == iter->end)
- iter->end = g_utf8_next_char (iter->end);
-
+ /* Now our cursor has reached the emoji segment, and we can return it. */
+ iter->end = iter->token_end;
+ iter->is_emoji = TRUE;
return TRUE;
}
- }
-
- iter->is_emoji = PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type);
- return TRUE;
+ g_assert_not_reached ();
+ return FALSE;
}