summary refs log tree commit diff
path: root/pango/emoji_presentation_scanner.rl
diff options
context:
space:
mode:
author Behdad Esfahbod <behdad@behdad.org> 2018-10-15 14:53:31 -0700
committer Behdad Esfahbod <behdad@behdad.org> 2018-10-15 15:36:06 -0700
commit f8ca9ca5ed3198e7fbab7381e232f9f20ecd93da (patch)
tree 6093dc34732e5c0cbd258e24ac3986f8b321d2c7 /pango/emoji_presentation_scanner.rl
parent 15eb263d75496119d4bcd14069199bf8367926e9 (diff)
download pango-f8ca9ca5ed3198e7fbab7381e232f9f20ecd93da.tar.gz
[emoji] Port to new Ragel-based iterator, based on Chromium again
There's a couple of regressions in this apparently. I'm working with Dominik to fix on Chrome side and will push here after.
Diffstat (limited to 'pango/emoji_presentation_scanner.rl')
-rw-r--r-- pango/emoji_presentation_scanner.rl 96
1 files changed, 96 insertions, 0 deletions
diff --git a/pango/emoji_presentation_scanner.rl b/pango/emoji_presentation_scanner.rl
new file mode 100644
index 00000000..5eea495a
--- /dev/null
+++ b/pango/emoji_presentation_scanner.rl
@@ -0,0 +1,96 @@
+// Copyright 2018 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Ragel machine header: names the state machine, declares its alphabet type,
+// and emits the generated static data tables at this point in the output.
+%%{
+ machine emoji_presentation;
+ alphtype unsigned char;
+ # noerror / nofinal / noentry ask Ragel not to emit the error-state,
+ # first-final-state and entry-point constants alongside the data tables.
+ write data noerror nofinal noentry;
+}%%
+
+%%{
+
+# Grammar that splits the input into emoji-presentation runs and
+# text-presentation runs.
+#
+# Each name in the list below is a one-symbol machine matching a single
+# alphabet value (0..15).
+# NOTE(review): presumably the caller maps every character to one of these
+# category codes before scanning - confirm against the code that fills the
+# buffer.
+EMOJI = 0;
+EMOJI_TEXT_PRESENTATION = 1;
+EMOJI_EMOJI_PRESENTATION = 2;
+EMOJI_MODIFIER_BASE = 3;
+EMOJI_MODIFIER = 4;
+EMOJI_VS_BASE = 5;
+REGIONAL_INDICATOR = 6;
+KEYCAP_BASE = 7;
+COMBINING_ENCLOSING_KEYCAP = 8;
+COMBINING_ENCLOSING_CIRCLE_BACKSLASH = 9;
+ZWJ = 10;
+VS15 = 11;
+VS16 = 12;
+TAG_BASE = 13;
+TAG_SEQUENCE = 14;
+TAG_TERM = 15;
+
+# Union of the categories that the rules below accept as a single emoji symbol.
+any_emoji = EMOJI_TEXT_PRESENTATION | EMOJI_EMOJI_PRESENTATION | KEYCAP_BASE |
+ EMOJI_MODIFIER_BASE | TAG_BASE | EMOJI;
+
+# An emoji symbol followed by COMBINING_ENCLOSING_CIRCLE_BACKSLASH.
+# NOTE(review): "encloding" in the name looks like a typo for "enclosing". It
+# is spelled the same way where emoji_presentation refers to it, so a rename
+# has to touch both places.
+emoji_combining_encloding_circle_backslash_sequence = any_emoji
+ COMBINING_ENCLOSING_CIRCLE_BACKSLASH;
+
+# This could be sharper than any_emoji by restricting this only to valid
+# variation sequences:
+# https://www.unicode.org/Public/emoji/11.0/emoji-variation-sequences.txt
+# However, implementing
+# https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence is
+# sufficient for our purposes here.
+emoji_presentation_sequence = any_emoji VS16;
+
+# A modifier base immediately followed by a modifier.
+emoji_modifier_sequence = EMOJI_MODIFIER_BASE EMOJI_MODIFIER;
+
+# Exactly two consecutive regional indicators.
+emoji_flag_sequence = REGIONAL_INDICATOR REGIONAL_INDICATOR;
+
+# Here we only allow the valid tag sequences
+# https://www.unicode.org/reports/tr51/#valid-emoji-tag-sequences, instead of
+# all well-formed ones defined in
+# https://www.unicode.org/reports/tr51/#def_emoji_tag_sequence
+emoji_tag_sequence = TAG_BASE TAG_SEQUENCE+ TAG_TERM;
+
+# A keycap base immediately followed by the combining enclosing keycap.
+emoji_keycap_sequence = KEYCAP_BASE COMBINING_ENCLOSING_KEYCAP;
+
+# One element of a ZWJ sequence: an emoji symbol, optionally followed by VS16,
+# or a modifier base followed by a modifier.
+emoji_zwj_element = emoji_presentation_sequence | emoji_modifier_sequence | any_emoji;
+
+# Two or more ZWJ elements, each pair separated by a single ZWJ.
+emoji_zwj_sequence = emoji_zwj_element ( ZWJ emoji_zwj_element )+;
+
+# Everything treated as one emoji-presentation unit: three bare categories
+# (EMOJI_EMOJI_PRESENTATION, TAG_BASE, EMOJI_MODIFIER_BASE) plus every sequence
+# defined above. Bare EMOJI, EMOJI_TEXT_PRESENTATION and KEYCAP_BASE are not
+# listed, so on their own they only match text_run.
+emoji_presentation = EMOJI_EMOJI_PRESENTATION | TAG_BASE | EMOJI_MODIFIER_BASE |
+ emoji_presentation_sequence | emoji_modifier_sequence | emoji_flag_sequence |
+ emoji_tag_sequence | emoji_keycap_sequence | emoji_zwj_sequence |
+ emoji_combining_encloding_circle_backslash_sequence;
+
+# One or more consecutive emoji-presentation units.
+emoji_run = emoji_presentation+;
+
+# An emoji symbol followed by VS15.
+text_presentation_emoji = any_emoji VS15;
+# Either of the above, or any single symbol. The "any" alternative means some
+# pattern matches at every input position.
+text_run = text_presentation_emoji | any;
+
+# Scanner entry point. Ragel scanners take the longest match and, on equal
+# length, the pattern listed first, so emoji_run has to stay ahead of text_run.
+# The two action bodies are bare identifiers that are not defined in this file.
+# NOTE(review): presumably macros supplied by the C source that includes the
+# generated code - confirm.
+text_and_emoji_run := |*
+emoji_run => { found_emoji_presentation_sequence };
+text_run => { found_text_presentation_sequence };
+*|;
+
+}%%
+
+/*
+ * Runs the emoji_presentation scanner over buffer[cursor .. buffer_size).
+ *
+ * buffer:      input symbols for the machine (its alphabet type is
+ *              unsigned char).
+ * buffer_size: number of symbols in buffer; used as both the end of the data
+ *              block and the end of input.
+ * cursor:      index in buffer at which scanning starts.
+ * last, end:   not referenced in the code visible here.
+ *              NOTE(review): presumably written by the
+ *              found_*_presentation_sequence actions, which are defined
+ *              outside this file - confirm.
+ *
+ * Returns FALSE when control falls out of the generated scanner code.
+ * NOTE(review): the actions presumably return from inside the generated code
+ * on a match - confirm against their definitions.
+ */
+static gboolean
+scan_emoji_presentation (const unsigned char* buffer,
+ unsigned buffer_size,
+ unsigned cursor,
+ unsigned* last,
+ unsigned* end)
+{
+ /* The names below are the ones Ragel-generated code expects:
+  * p = current position, pe = end of the data block, eof = end of input,
+  * ts/te = start/end of the current scanner token, act = identity of the
+  * last matched pattern, cs = current state. */
+ const unsigned char *p = buffer + cursor;
+ const unsigned char *pe, *eof, *ts, *te;
+ unsigned act;
+ int cs;
+ /* The whole input is in one block, so block end and end of input coincide. */
+ pe = eof = buffer + buffer_size;
+
+ %%{
+ # "write init" emits the code that initialises the machine variables;
+ # "write exec" emits the code that runs the machine over p..pe.
+ write init;
+ write exec;
+ }%%
+ return FALSE;
+}
+