diff options
author | Behdad Esfahbod <behdad@behdad.org> | 2018-10-15 14:53:31 -0700 |
---|---|---|
committer | Behdad Esfahbod <behdad@behdad.org> | 2018-10-15 15:36:06 -0700 |
commit | f8ca9ca5ed3198e7fbab7381e232f9f20ecd93da (patch) | |
tree | 6093dc34732e5c0cbd258e24ac3986f8b321d2c7 /pango/emoji_presentation_scanner.rl | |
parent | 15eb263d75496119d4bcd14069199bf8367926e9 (diff) | |
download | pango-f8ca9ca5ed3198e7fbab7381e232f9f20ecd93da.tar.gz |
[emoji] Port to new Ragel-based iterator, based on Chromium again
There are apparently a couple of regressions in this. I'm working with Dominik
to fix them on the Chrome side and will push the fixes here afterwards.
Diffstat (limited to 'pango/emoji_presentation_scanner.rl')
-rw-r--r-- | pango/emoji_presentation_scanner.rl | 96 |
1 files changed, 96 insertions, 0 deletions
diff --git a/pango/emoji_presentation_scanner.rl b/pango/emoji_presentation_scanner.rl
new file mode 100644
index 00000000..5eea495a
--- /dev/null
+++ b/pango/emoji_presentation_scanner.rl
@@ -0,0 +1,96 @@
+// Copyright 2018 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Ragel specification of a scanner that tokenizes its input into emoji runs
+// and text runs. Ragel turns the %%{ ... }%% sections into C; everything
+// outside them is passed through to the generated file unchanged.
+
+%%{
+  machine emoji_presentation;
+  # Input symbols are unsigned bytes.
+  alphtype unsigned char;
+  # Emit the state tables, but leave out the error-state, first-final-state and
+  # entry-point constants.
+  write data noerror nofinal noentry;
+}%%
+
+%%{
+
+# Alphabet of the machine: each name below is a one-symbol machine matching the
+# given byte value. NOTE(review): the names suggest one category code per input
+# character, but the code that fills the buffer is not part of this file --
+# confirm against the caller.
+# EMOJI_VS_BASE is defined but not referenced by any rule below.
+EMOJI = 0;
+EMOJI_TEXT_PRESENTATION = 1;
+EMOJI_EMOJI_PRESENTATION = 2;
+EMOJI_MODIFIER_BASE = 3;
+EMOJI_MODIFIER = 4;
+EMOJI_VS_BASE = 5;
+REGIONAL_INDICATOR = 6;
+KEYCAP_BASE = 7;
+COMBINING_ENCLOSING_KEYCAP = 8;
+COMBINING_ENCLOSING_CIRCLE_BACKSLASH = 9;
+ZWJ = 10;
+VS15 = 11;
+VS16 = 12;
+TAG_BASE = 13;
+TAG_SEQUENCE = 14;
+TAG_TERM = 15;
+
+# Any single symbol that may act as the base of the sequences defined below.
+any_emoji = EMOJI_TEXT_PRESENTATION | EMOJI_EMOJI_PRESENTATION | KEYCAP_BASE |
+  EMOJI_MODIFIER_BASE | TAG_BASE | EMOJI;
+
+# A base symbol followed by COMBINING_ENCLOSING_CIRCLE_BACKSLASH.
+# NOTE(review): "encloding" looks like a typo for "enclosing"; the name is
+# spelled the same way at its only use below, so it is harmless.
+emoji_combining_encloding_circle_backslash_sequence = any_emoji
+  COMBINING_ENCLOSING_CIRCLE_BACKSLASH;
+
+# This could be sharper than any_emoji by restricting this only to valid
+# variation sequences:
+# https://www.unicode.org/Public/emoji/11.0/emoji-variation-sequences.txt
+# However, implementing
+# https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence is
+# sufficient for our purposes here.
+emoji_presentation_sequence = any_emoji VS16;
+
+# A modifier base immediately followed by a modifier.
+emoji_modifier_sequence = EMOJI_MODIFIER_BASE EMOJI_MODIFIER;
+
+# Exactly two consecutive regional indicators.
+emoji_flag_sequence = REGIONAL_INDICATOR REGIONAL_INDICATOR;
+
+# Here we only allow the valid tag sequences
+# https://www.unicode.org/reports/tr51/#valid-emoji-tag-sequences, instead of
+# all well-formed ones defined in
+# https://www.unicode.org/reports/tr51/#def_emoji_tag_sequence
+emoji_tag_sequence = TAG_BASE TAG_SEQUENCE+ TAG_TERM;
+
+# A keycap base immediately followed by the combining enclosing keycap.
+emoji_keycap_sequence = KEYCAP_BASE COMBINING_ENCLOSING_KEYCAP;
+
+# One element of a ZWJ sequence.
+emoji_zwj_element = emoji_presentation_sequence | emoji_modifier_sequence | any_emoji;
+
+# Two or more elements, each joined to the next by a ZWJ.
+emoji_zwj_sequence = emoji_zwj_element ( ZWJ emoji_zwj_element )+;
+
+# Everything that counts as emoji presentation. A lone EMOJI,
+# EMOJI_TEXT_PRESENTATION or KEYCAP_BASE symbol is not listed here; those only
+# qualify as part of one of the sequences.
+emoji_presentation = EMOJI_EMOJI_PRESENTATION | TAG_BASE | EMOJI_MODIFIER_BASE |
+  emoji_presentation_sequence | emoji_modifier_sequence | emoji_flag_sequence |
+  emoji_tag_sequence | emoji_keycap_sequence | emoji_zwj_sequence |
+  emoji_combining_encloding_circle_backslash_sequence;
+
+# One or more consecutive emoji-presentation items.
+emoji_run = emoji_presentation+;
+
+# A base symbol followed by VS15, or otherwise any single symbol.
+text_presentation_emoji = any_emoji VS15;
+text_run = text_presentation_emoji | any;
+
+# Scanner: at each position the longest matching pattern wins, and a tie goes
+# to the pattern listed first, so the order of the two lines below matters.
+# The two action bodies are bare identifiers that are not defined in this file;
+# presumably the including C file provides them as macros -- TODO confirm.
+text_and_emoji_run := |*
+emoji_run => { found_emoji_presentation_sequence };
+text_run => { found_text_presentation_sequence };
+*|;
+
+}%%
+
+/* Runs the emoji_presentation scanner over buffer, starting at offset cursor
+ * and stopping at offset buffer_size.
+ *
+ * last and end are not referenced in the text of this function; presumably
+ * the found_*_sequence actions use them -- TODO confirm.
+ *
+ * Returns FALSE when control reaches the end of the generated code. Whether
+ * the actions leave the function earlier cannot be told from this file.
+ */
+static gboolean
+scan_emoji_presentation (const unsigned char* buffer,
+                         unsigned buffer_size,
+                         unsigned cursor,
+                         unsigned* last,
+                         unsigned* end)
+{
+  /* p, pe, eof, ts, te, act and cs are the names the Ragel-generated code
+   * refers to, so they must keep exactly these names. */
+  const unsigned char *p = buffer + cursor;
+  const unsigned char *pe, *eof, *ts, *te;
+  unsigned act;
+  int cs;
+  /* The end of the buffer is also the end of the input. */
+  pe = eof = buffer + buffer_size;
+
+  /* "write init" emits the state initialization and "write exec" emits the
+   * code that runs the machine. */
+  %%{
+    write init;
+    write exec;
+  }%%
+  return FALSE;
+}
+ |