summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthias Clasen <mclasen@redhat.com>2018-11-19 20:33:18 +0000
committerMatthias Clasen <mclasen@redhat.com>2018-11-19 20:33:18 +0000
commit0d8b48284b8d90b19b3f9ffdd75335bece3c3818 (patch)
tree71eebd0c3ea126b95b0a43841bab2a77e1abdea4
parentf3190a5c616bf7d11a0d1e13c20fdedeeece5fe1 (diff)
parentd4da231dfa53924ae074dece0cca93d49f39d30a (diff)
downloadpango-0d8b48284b8d90b19b3f9ffdd75335bece3c3818.tar.gz
Merge branch 'emoji-ragel' into 'master'
Emoji ragel See merge request GNOME/pango!31
-rw-r--r--pango/emoji_presentation_scanner.c497
-rw-r--r--pango/emoji_presentation_scanner.rl96
-rw-r--r--pango/pango-emoji-private.h7
-rw-r--r--pango/pango-emoji-table.h43
-rw-r--r--pango/pango-emoji.c256
-rwxr-xr-xtools/gen-emoji-table.py4
6 files changed, 772 insertions, 131 deletions
diff --git a/pango/emoji_presentation_scanner.c b/pango/emoji_presentation_scanner.c
new file mode 100644
index 00000000..43872abb
--- /dev/null
+++ b/pango/emoji_presentation_scanner.c
@@ -0,0 +1,497 @@
+// Copyright 2018 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+
+static const char _emoji_presentation_actions[] = {
+ 0, 1, 0, 1, 1, 1, 5, 1,
+ 6, 1, 7, 1, 8, 1, 9, 2,
+ 2, 3, 2, 2, 4, 0
+};
+
+static const char _emoji_presentation_key_offsets[] = {
+ 0, 3, 8, 9, 13, 15, 22, 26,
+ 33, 42, 52, 63, 71, 82, 92, 103,
+ 115, 116, 121, 0
+};
+
+static const unsigned char _emoji_presentation_trans_keys[] = {
+ 9u, 10u, 12u, 3u, 7u, 13u, 0u, 2u,
+ 6u, 10u, 12u, 8u, 9u, 14u, 15u, 2u,
+ 3u, 6u, 7u, 13u, 0u, 1u, 9u, 10u,
+ 11u, 12u, 2u, 3u, 6u, 7u, 13u, 0u,
+ 1u, 2u, 3u, 6u, 7u, 10u, 12u, 13u,
+ 0u, 1u, 2u, 3u, 6u, 7u, 9u, 10u,
+ 12u, 13u, 0u, 1u, 2u, 3u, 4u, 6u,
+ 7u, 9u, 10u, 12u, 13u, 0u, 1u, 2u,
+ 3u, 6u, 7u, 10u, 13u, 0u, 1u, 2u,
+ 3u, 6u, 7u, 9u, 10u, 12u, 13u, 14u,
+ 0u, 1u, 2u, 3u, 4u, 6u, 7u, 10u,
+ 12u, 13u, 0u, 1u, 2u, 3u, 6u, 7u,
+ 9u, 10u, 11u, 12u, 13u, 0u, 1u, 2u,
+ 3u, 4u, 6u, 7u, 9u, 10u, 11u, 12u,
+ 13u, 0u, 1u, 6u, 10u, 11u, 12u, 8u,
+ 9u, 2u, 3u, 6u, 7u, 9u, 10u, 11u,
+ 12u, 13u, 14u, 0u, 1u, 0u
+};
+
+static const char _emoji_presentation_single_lengths[] = {
+ 3, 3, 1, 2, 2, 5, 4, 5,
+ 7, 8, 9, 6, 9, 8, 9, 10,
+ 1, 3, 10, 0
+};
+
+static const char _emoji_presentation_range_lengths[] = {
+ 0, 1, 0, 1, 0, 1, 0, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 1, 1, 0
+};
+
+static const char _emoji_presentation_index_offsets[] = {
+ 0, 4, 9, 11, 15, 18, 25, 30,
+ 37, 46, 56, 67, 75, 86, 96, 107,
+ 119, 121, 126, 0
+};
+
+static const char _emoji_presentation_trans_cond_spaces[] = {
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, 0
+};
+
+static const short _emoji_presentation_trans_offsets[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+ 128, 129, 130, 131, 132, 133, 134, 135,
+ 136, 137, 138, 139, 140, 141, 142, 143,
+ 144, 145, 146, 147, 148, 149, 150, 151,
+ 152, 153, 154, 155, 0
+};
+
+static const char _emoji_presentation_trans_lengths[] = {
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 0
+};
+
+static const char _emoji_presentation_cond_keys[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0
+};
+
+static const char _emoji_presentation_cond_targs[] = {
+ 7, 1, 11, 5, 13, 8, 8, 8,
+ 5, 7, 5, 1, 11, 7, 5, 4,
+ 7, 5, 14, 15, 16, 17, 18, 6,
+ 5, 7, 1, 5, 11, 5, 9, 10,
+ 2, 3, 12, 0, 5, 9, 10, 2,
+ 3, 1, 11, 12, 0, 5, 9, 10,
+ 2, 3, 7, 1, 11, 12, 0, 5,
+ 9, 10, 11, 2, 3, 7, 1, 11,
+ 12, 0, 5, 9, 10, 2, 3, 1,
+ 12, 0, 5, 9, 10, 2, 3, 7,
+ 1, 11, 12, 4, 0, 5, 9, 10,
+ 11, 2, 3, 1, 11, 12, 0, 5,
+ 9, 10, 2, 3, 7, 1, 5, 11,
+ 12, 0, 5, 9, 10, 11, 2, 3,
+ 7, 1, 5, 11, 12, 0, 5, 7,
+ 5, 1, 5, 11, 7, 5, 9, 10,
+ 2, 3, 7, 1, 5, 11, 12, 4,
+ 0, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 0
+};
+
+static const char _emoji_presentation_cond_actions[] = {
+ 15, 0, 15, 11, 15, 15, 15, 15,
+ 13, 15, 11, 0, 15, 15, 11, 0,
+ 15, 11, 15, 15, 0, 18, 15, 18,
+ 5, 15, 0, 5, 15, 9, 15, 15,
+ 0, 0, 15, 0, 7, 15, 15, 0,
+ 0, 0, 15, 15, 0, 7, 15, 15,
+ 0, 0, 15, 0, 15, 15, 0, 7,
+ 15, 15, 15, 0, 0, 15, 0, 15,
+ 15, 0, 7, 15, 15, 0, 0, 0,
+ 15, 0, 7, 15, 15, 0, 0, 15,
+ 0, 15, 15, 0, 0, 7, 15, 15,
+ 15, 0, 0, 0, 15, 15, 0, 7,
+ 15, 15, 0, 0, 15, 0, 5, 15,
+ 15, 0, 7, 15, 15, 15, 0, 0,
+ 15, 0, 5, 15, 15, 0, 7, 15,
+ 9, 0, 5, 15, 15, 9, 15, 15,
+ 0, 0, 15, 0, 5, 15, 15, 0,
+ 0, 7, 11, 13, 11, 11, 11, 9,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 9, 9, 7, 0
+};
+
+static const char _emoji_presentation_to_state_actions[] = {
+ 0, 0, 0, 0, 0, 1, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0
+};
+
+static const char _emoji_presentation_from_state_actions[] = {
+ 0, 0, 0, 0, 0, 3, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0
+};
+
+static const char _emoji_presentation_eof_cond_spaces[] = {
+ -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, 0
+};
+
+static const char _emoji_presentation_eof_cond_key_offs[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0
+};
+
+static const char _emoji_presentation_eof_cond_key_lens[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0
+};
+
+static const char _emoji_presentation_eof_cond_keys[] = {
+ 0
+};
+
+static const short _emoji_presentation_eof_trans[] = {
+ 139, 140, 141, 142, 143, 0, 144, 145,
+ 146, 147, 148, 149, 150, 151, 152, 153,
+ 154, 155, 156, 0
+};
+
+static const char _emoji_presentation_nfa_targs[] = {
+ 0, 0
+};
+
+static const char _emoji_presentation_nfa_offsets[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0
+};
+
+static const char _emoji_presentation_nfa_push_actions[] = {
+ 0, 0
+};
+
+static const char _emoji_presentation_nfa_pop_trans[] = {
+ 0, 0
+};
+
+static const int emoji_presentation_start = 5;
+
+static const int emoji_presentation_en_text_and_emoji_run = 5;
+
+
+
+
+
+static gboolean
+scan_emoji_presentation (const unsigned char* buffer,
+unsigned buffer_size,
+unsigned cursor,
+unsigned* last,
+unsigned* end)
+{
+ const unsigned char *p = buffer + cursor;
+ const unsigned char *pe, *eof, *ts, *te;
+ unsigned act;
+ int cs;
+ pe = eof = buffer + buffer_size;
+
+
+ {
+ cs = (int)emoji_presentation_start;
+ ts = 0;
+ te = 0;
+ act = 0;
+ }
+
+ {
+ int _cpc;
+ int _klen;const char * _cekeys;unsigned int _trans = 0;const unsigned char * _keys;const char * _acts;unsigned int _nacts; {
+ if ( p == pe )
+ goto _test_eof;
+ _resume: {
+ _acts = ( _emoji_presentation_actions + (_emoji_presentation_from_state_actions[cs]));
+ _nacts = (unsigned int)(*( _acts));
+ _acts += 1;
+ while ( _nacts > 0 ) {
+ switch ( (*( _acts)) ) {
+ case 1: {
+ {
+ #line 1 "NONE"
+ {ts = p;}}
+ break; }
+ }
+ _nacts -= 1;
+ _acts += 1;
+ }
+
+ _keys = ( _emoji_presentation_trans_keys + (_emoji_presentation_key_offsets[cs]));
+ _trans = (unsigned int)_emoji_presentation_index_offsets[cs];
+
+ _klen = (int)_emoji_presentation_single_lengths[cs];
+ if ( _klen > 0 ) {
+ const unsigned char *_lower = _keys;
+ const unsigned char *_upper = _keys + _klen - 1;
+ const unsigned char *_mid;
+ while ( 1 ) {
+ if ( _upper < _lower )
+ break;
+
+ _mid = _lower + ((_upper-_lower) >> 1);
+ if ( ( (*( p))) < (*( _mid)) )
+ _upper = _mid - 1;
+ else if ( ( (*( p))) > (*( _mid)) )
+ _lower = _mid + 1;
+ else {
+ _trans += (unsigned int)(_mid - _keys);
+ goto _match;
+ }
+ }
+ _keys += _klen;
+ _trans += (unsigned int)_klen;
+ }
+
+ _klen = (int)_emoji_presentation_range_lengths[cs];
+ if ( _klen > 0 ) {
+ const unsigned char *_lower = _keys;
+ const unsigned char *_upper = _keys + (_klen<<1) - 2;
+ const unsigned char *_mid;
+ while ( 1 ) {
+ if ( _upper < _lower )
+ break;
+
+ _mid = _lower + (((_upper-_lower) >> 1) & ~1);
+ if ( ( (*( p))) < (*( _mid)) )
+ _upper = _mid - 2;
+ else if ( ( (*( p))) > (*( _mid + 1)) )
+ _lower = _mid + 2;
+ else {
+ _trans += (unsigned int)((_mid - _keys)>>1);
+ goto _match;
+ }
+ }
+ _trans += (unsigned int)_klen;
+ }
+
+ _match: {
+ goto _match_cond;
+ }
+ }
+ _match_cond: {
+ cs = (int)_emoji_presentation_cond_targs[_trans];
+
+ if ( _emoji_presentation_cond_actions[_trans] == 0 )
+ goto _again;
+
+ _acts = ( _emoji_presentation_actions + (_emoji_presentation_cond_actions[_trans]));
+ _nacts = (unsigned int)(*( _acts));
+ _acts += 1;
+ while ( _nacts > 0 ) {
+ switch ( (*( _acts)) )
+ {
+ case 2: {
+ {
+ #line 1 "NONE"
+ {te = p+1;}}
+ break; }
+ case 3: {
+ {
+ #line 71 "emoji_presentation_scanner.rl"
+ {act = 1;}}
+ break; }
+ case 4: {
+ {
+ #line 72 "emoji_presentation_scanner.rl"
+ {act = 2;}}
+ break; }
+ case 5: {
+ {
+ #line 72 "emoji_presentation_scanner.rl"
+ {te = p+1;{
+ #line 72 "emoji_presentation_scanner.rl"
+ found_text_presentation_sequence }}}
+ break; }
+ case 6: {
+ {
+ #line 71 "emoji_presentation_scanner.rl"
+ {te = p;p = p - 1;{
+ #line 71 "emoji_presentation_scanner.rl"
+ found_emoji_presentation_sequence }}}
+ break; }
+ case 7: {
+ {
+ #line 72 "emoji_presentation_scanner.rl"
+ {te = p;p = p - 1;{
+ #line 72 "emoji_presentation_scanner.rl"
+ found_text_presentation_sequence }}}
+ break; }
+ case 8: {
+ {
+ #line 71 "emoji_presentation_scanner.rl"
+ {p = ((te))-1;
+ {
+ #line 71 "emoji_presentation_scanner.rl"
+ found_emoji_presentation_sequence }}}
+ break; }
+ case 9: {
+ {
+ #line 1 "NONE"
+ {switch( act ) {
+ case 1: {
+ p = ((te))-1;
+ {
+ #line 71 "emoji_presentation_scanner.rl"
+ found_emoji_presentation_sequence } break; }
+ case 2: {
+ p = ((te))-1;
+ {
+ #line 72 "emoji_presentation_scanner.rl"
+ found_text_presentation_sequence } break; }
+ }}
+ }
+ break; }
+ }
+ _nacts -= 1;
+ _acts += 1;
+ }
+
+
+ }
+ _again: {
+ _acts = ( _emoji_presentation_actions + (_emoji_presentation_to_state_actions[cs]));
+ _nacts = (unsigned int)(*( _acts));
+ _acts += 1;
+ while ( _nacts > 0 ) {
+ switch ( (*( _acts)) ) {
+ case 0: {
+ {
+ #line 1 "NONE"
+ {ts = 0;}}
+ break; }
+ }
+ _nacts -= 1;
+ _acts += 1;
+ }
+
+ p += 1;
+ if ( p != pe )
+ goto _resume;
+ }
+ _test_eof: { {}
+ if ( p == eof )
+ {
+ if ( _emoji_presentation_eof_cond_spaces[cs] != -1 ) {
+ _cekeys = ( _emoji_presentation_eof_cond_keys + (_emoji_presentation_eof_cond_key_offs[cs]));
+ _klen = (int)_emoji_presentation_eof_cond_key_lens[cs];
+ _cpc = 0;
+ {
+ const char *_lower = _cekeys;
+ const char *_upper = _cekeys + _klen - 1;
+ const char *_mid;
+ while ( 1 ) {
+ if ( _upper < _lower )
+ break;
+
+ _mid = _lower + ((_upper-_lower) >> 1);
+ if ( _cpc < (int)(*( _mid)) )
+ _upper = _mid - 1;
+ else if ( _cpc > (int)(*( _mid)) )
+ _lower = _mid + 1;
+ else {
+ goto _ok;
+ }
+ }
+ cs = -1;
+ goto _out;
+ }
+ _ok: {}
+ }
+ if ( _emoji_presentation_eof_trans[cs] > 0 ) {
+ _trans = (unsigned int)_emoji_presentation_eof_trans[cs] - 1;
+ goto _match_cond;
+ }
+ }
+
+ }
+ _out: { {}
+ }
+ }
+ }
+
+ return FALSE;
+}
+
diff --git a/pango/emoji_presentation_scanner.rl b/pango/emoji_presentation_scanner.rl
new file mode 100644
index 00000000..c13ae279
--- /dev/null
+++ b/pango/emoji_presentation_scanner.rl
@@ -0,0 +1,96 @@
+// Copyright 2018 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+%%{
+ machine emoji_presentation;
+ alphtype unsigned char;
+ write data noerror nofinal noentry;
+}%%
+
+%%{
+
+EMOJI = 0;
+EMOJI_TEXT_PRESENTATION = 1;
+EMOJI_EMOJI_PRESENTATION = 2;
+EMOJI_MODIFIER_BASE = 3;
+EMOJI_MODIFIER = 4;
+EMOJI_VS_BASE = 5;
+REGIONAL_INDICATOR = 6;
+KEYCAP_BASE = 7;
+COMBINING_ENCLOSING_KEYCAP = 8;
+COMBINING_ENCLOSING_CIRCLE_BACKSLASH = 9;
+ZWJ = 10;
+VS15 = 11;
+VS16 = 12;
+TAG_BASE = 13;
+TAG_SEQUENCE = 14;
+TAG_TERM = 15;
+
+any_emoji = EMOJI_TEXT_PRESENTATION | EMOJI_EMOJI_PRESENTATION | KEYCAP_BASE |
+ EMOJI_MODIFIER_BASE | TAG_BASE | EMOJI;
+
+emoji_combining_encloding_circle_backslash_sequence = any_emoji
+ COMBINING_ENCLOSING_CIRCLE_BACKSLASH;
+
+# This could be sharper than any_emoji by restricting this only to valid
+# variation sequences:
+# https://www.unicode.org/Public/emoji/11.0/emoji-variation-sequences.txt
+# However, implementing
+# https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence is
+# sufficient for our purposes here.
+emoji_presentation_sequence = any_emoji VS16;
+
+emoji_modifier_sequence = EMOJI_MODIFIER_BASE EMOJI_MODIFIER;
+
+emoji_flag_sequence = REGIONAL_INDICATOR REGIONAL_INDICATOR;
+
+# Here we only allow the valid tag sequences
+# https://www.unicode.org/reports/tr51/#valid-emoji-tag-sequences, instead of
+# all well-formed ones defined in
+# https://www.unicode.org/reports/tr51/#def_emoji_tag_sequence
+emoji_tag_sequence = TAG_BASE TAG_SEQUENCE+ TAG_TERM;
+
+emoji_keycap_sequence = KEYCAP_BASE VS16 COMBINING_ENCLOSING_KEYCAP;
+
+emoji_zwj_element = emoji_presentation_sequence | emoji_modifier_sequence | any_emoji;
+
+emoji_zwj_sequence = emoji_zwj_element ( ZWJ emoji_zwj_element )+;
+
+emoji_presentation = EMOJI_EMOJI_PRESENTATION | TAG_BASE | EMOJI_MODIFIER_BASE |
+ emoji_presentation_sequence | emoji_modifier_sequence | emoji_flag_sequence |
+ emoji_tag_sequence | emoji_keycap_sequence | emoji_zwj_sequence |
+ emoji_combining_encloding_circle_backslash_sequence;
+
+emoji_run = emoji_presentation+;
+
+text_presentation_emoji = any_emoji VS15;
+text_run = text_presentation_emoji | any;
+
+text_and_emoji_run := |*
+emoji_run => { found_emoji_presentation_sequence };
+text_run => { found_text_presentation_sequence };
+*|;
+
+}%%
+
+static gboolean
+scan_emoji_presentation (const unsigned char* buffer,
+ unsigned buffer_size,
+ unsigned cursor,
+ unsigned* last,
+ unsigned* end)
+{
+ const unsigned char *p = buffer + cursor;
+ const unsigned char *pe, *eof, *ts, *te;
+ unsigned act;
+ int cs;
+ pe = eof = buffer + buffer_size;
+
+ %%{
+ write init;
+ write exec;
+ }%%
+ return FALSE;
+}
+
diff --git a/pango/pango-emoji-private.h b/pango/pango-emoji-private.h
index eb8a52a7..a360b37a 100644
--- a/pango/pango-emoji-private.h
+++ b/pango/pango-emoji-private.h
@@ -33,6 +33,13 @@ struct _PangoEmojiIter
const gchar *start;
const gchar *end;
gboolean is_emoji;
+
+ const gchar *token_start;
+ const gchar *token_end;
+
+ const unsigned char *types;
+ unsigned int n_chars;
+ unsigned int cursor;
};
PangoEmojiIter *
diff --git a/pango/pango-emoji-table.h b/pango/pango-emoji-table.h
index 0f58f21d..da9ff4fc 100644
--- a/pango/pango-emoji-table.h
+++ b/pango/pango-emoji-table.h
@@ -7,13 +7,13 @@
* on file with this header:
*
* # emoji-data.txt
- * # Date: 2017-06-19, 11:13:24 GMT
- * # © 2017 Unicode®, Inc.
+ * # Date: 2018-02-07, 07:55:18 GMT
+ * # © 2018 Unicode®, Inc.
* # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
* # For terms of use, see http://www.unicode.org/terms_of_use.html
* #
- * # Emoji Data for UTR #51
- * # Version: 5.0
+ * # Emoji Data for UTS #51
+ * # Version: 11.0
* #
* # For documentation and usage, see http://www.unicode.org/reports/tr51
*/
@@ -65,12 +65,12 @@ static const struct Interval _pango_Emoji_table[] =
{0x2640, 0x2640},
{0x2642, 0x2642},
{0x2648, 0x2653},
- {0x2660, 0x2660},
+ {0x265F, 0x2660},
{0x2663, 0x2663},
{0x2665, 0x2666},
{0x2668, 0x2668},
{0x267B, 0x267B},
- {0x267F, 0x267F},
+ {0x267E, 0x267F},
{0x2692, 0x2697},
{0x2699, 0x2699},
{0x269B, 0x269C},
@@ -228,6 +228,7 @@ static const struct Interval _pango_Emoji_table[] =
{0x1F6F3, 0x1F6F3},
{0x1F6F4, 0x1F6F6},
{0x1F6F7, 0x1F6F8},
+ {0x1F6F9, 0x1F6F9},
{0x1F910, 0x1F918},
{0x1F919, 0x1F91E},
{0x1F91F, 0x1F91F},
@@ -240,13 +241,22 @@ static const struct Interval _pango_Emoji_table[] =
{0x1F940, 0x1F945},
{0x1F947, 0x1F94B},
{0x1F94C, 0x1F94C},
+ {0x1F94D, 0x1F94F},
{0x1F950, 0x1F95E},
{0x1F95F, 0x1F96B},
+ {0x1F96C, 0x1F970},
+ {0x1F973, 0x1F976},
+ {0x1F97A, 0x1F97A},
+ {0x1F97C, 0x1F97F},
{0x1F980, 0x1F984},
{0x1F985, 0x1F991},
{0x1F992, 0x1F997},
+ {0x1F998, 0x1F9A2},
+ {0x1F9B0, 0x1F9B9},
{0x1F9C0, 0x1F9C0},
+ {0x1F9C1, 0x1F9C2},
{0x1F9D0, 0x1F9E6},
+ {0x1F9E7, 0x1F9FF},
};
static const struct Interval _pango_Emoji_Presentation_table[] =
@@ -353,6 +363,7 @@ static const struct Interval _pango_Emoji_Presentation_table[] =
{0x1F6EB, 0x1F6EC},
{0x1F6F4, 0x1F6F6},
{0x1F6F7, 0x1F6F8},
+ {0x1F6F9, 0x1F6F9},
{0x1F910, 0x1F918},
{0x1F919, 0x1F91E},
{0x1F91F, 0x1F91F},
@@ -365,13 +376,22 @@ static const struct Interval _pango_Emoji_Presentation_table[] =
{0x1F940, 0x1F945},
{0x1F947, 0x1F94B},
{0x1F94C, 0x1F94C},
+ {0x1F94D, 0x1F94F},
{0x1F950, 0x1F95E},
{0x1F95F, 0x1F96B},
+ {0x1F96C, 0x1F970},
+ {0x1F973, 0x1F976},
+ {0x1F97A, 0x1F97A},
+ {0x1F97C, 0x1F97F},
{0x1F980, 0x1F984},
{0x1F985, 0x1F991},
{0x1F992, 0x1F997},
+ {0x1F998, 0x1F9A2},
+ {0x1F9B0, 0x1F9B9},
{0x1F9C0, 0x1F9C0},
+ {0x1F9C1, 0x1F9C2},
{0x1F9D0, 0x1F9E6},
+ {0x1F9E7, 0x1F9FF},
};
static const struct Interval _pango_Emoji_Modifier_table[] =
@@ -418,18 +438,11 @@ static const struct Interval _pango_Emoji_Modifier_Base_table[] =
{0x1F931, 0x1F932},
{0x1F933, 0x1F939},
{0x1F93D, 0x1F93E},
+ {0x1F9B5, 0x1F9B6},
+ {0x1F9B8, 0x1F9B9},
{0x1F9D1, 0x1F9DD},
};
-static const struct Interval _pango_Emoji_Component_table[] =
-{
- {0x0023, 0x0023},
- {0x002A, 0x002A},
- {0x0030, 0x0039},
- {0x1F1E6, 0x1F1FF},
- {0x1F3FB, 0x1F3FF},
-};
-
#endif /* PANGO_EMOJI_TABLE_H */
/* == End of generated table == */
diff --git a/pango/pango-emoji.c b/pango/pango-emoji.c
index 29472452..46ab5b3f 100644
--- a/pango/pango-emoji.c
+++ b/pango/pango-emoji.c
@@ -18,11 +18,27 @@
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*
- * Implementation of pango_emoji_iter is derived from Chromium:
+ * Implementation of pango_emoji_iter is based on Chromium's Ragel-based
+ * parser:
*
- * https://cs.chromium.org/chromium/src/third_party/WebKit/Source/platform/fonts/FontFallbackPriority.h
- * https://cs.chromium.org/chromium/src/third_party/WebKit/Source/platform/text/CharacterEmoji.cpp
- * https://cs.chromium.org/chromium/src/third_party/WebKit/Source/platform/fonts/SymbolsIterator.cpp
+ * https://chromium-review.googlesource.com/c/chromium/src/+/1264577
+ *
+ * The grammar file emoji_presentation_scanner.rl was only modified to
+ * adapt the function signature and variables to our use case. The
+ * grammar itself was NOT modified:
+ *
+ * https://chromium-review.googlesource.com/c/chromium/src/+/1264577/3/third_party/blink/renderer/platform/fonts/emoji_presentation_scanner.rl
+ *
+ * The emoji_presentation_scanner.c file is generated from the .rl file by
+ * running ragel on it.
+ *
+ * The categorization is also based on:
+ *
+ * https://chromium-review.googlesource.com/c/chromium/src/+/1264577/3/third_party/blink/renderer/platform/fonts/utf16_ragel_iterator.h
+ *
+ * The iterator next() is based on:
+ *
+ * https://chromium-review.googlesource.com/c/chromium/src/+/1264577/3/third_party/blink/renderer/platform/fonts/symbols_iterator.cc
*
* // Copyright 2015 The Chromium Authors. All rights reserved.
* // Use of this source code is governed by a BSD-style license that can be
@@ -105,62 +121,110 @@ _pango_Is_Regional_Indicator (gunichar ch)
const gunichar kCombiningEnclosingCircleBackslashCharacter = 0x20E0;
const gunichar kCombiningEnclosingKeycapCharacter = 0x20E3;
-const gunichar kEyeCharacter = 0x1F441;
-const gunichar kFemaleSignCharacter = 0x2640;
-const gunichar kLeftSpeechBubbleCharacter = 0x1F5E8;
-const gunichar kMaleSignCharacter = 0x2642;
-const gunichar kRainbowCharacter = 0x1F308;
-const gunichar kStaffOfAesculapiusCharacter = 0x2695;
const gunichar kVariationSelector15Character = 0xFE0E;
const gunichar kVariationSelector16Character = 0xFE0F;
-const gunichar kWavingWhiteFlagCharacter = 0x1F3F3;
const gunichar kZeroWidthJoinerCharacter = 0x200D;
-
-typedef enum {
- PANGO_EMOJI_TYPE_INVALID,
- PANGO_EMOJI_TYPE_TEXT, /* For regular non-symbols text */
- PANGO_EMOJI_TYPE_EMOJI_TEXT, /* For emoji in text presentaiton */
- PANGO_EMOJI_TYPE_EMOJI_EMOJI /* For emoji in emoji presentation */
-} PangoEmojiType;
-
-static PangoEmojiType
-_pango_get_emoji_type (gunichar codepoint)
+enum PangoEmojiScannerCategory {
+ EMOJI = 0,
+ EMOJI_TEXT_PRESENTATION = 1,
+ EMOJI_EMOJI_PRESENTATION = 2,
+ EMOJI_MODIFIER_BASE = 3,
+ EMOJI_MODIFIER = 4,
+ EMOJI_VS_BASE = 5,
+ REGIONAL_INDICATOR = 6,
+ KEYCAP_BASE = 7,
+ COMBINING_ENCLOSING_KEYCAP = 8,
+ COMBINING_ENCLOSING_CIRCLE_BACKSLASH = 9,
+ ZWJ = 10,
+ VS15 = 11,
+ VS16 = 12,
+ TAG_BASE = 13,
+ TAG_SEQUENCE = 14,
+ TAG_TERM = 15,
+ kMaxEmojiScannerCategory = 16
+};
+
+static unsigned char
+_pango_EmojiSegmentationCategory (gunichar codepoint)
{
- /* Those should only be Emoji presentation as combinations of two. */
- if (_pango_Is_Emoji_Keycap_Base (codepoint) ||
- _pango_Is_Regional_Indicator (codepoint))
- return PANGO_EMOJI_TYPE_TEXT;
-
+ /* Specific ones first. */
if (codepoint == kCombiningEnclosingKeycapCharacter)
- return PANGO_EMOJI_TYPE_EMOJI_EMOJI;
-
- if (_pango_Is_Emoji_Emoji_Default (codepoint) ||
- _pango_Is_Emoji_Modifier_Base (codepoint) ||
- _pango_Is_Emoji_Modifier (codepoint))
- return PANGO_EMOJI_TYPE_EMOJI_EMOJI;
-
+ return COMBINING_ENCLOSING_KEYCAP;
+ if (codepoint == kCombiningEnclosingCircleBackslashCharacter)
+ return COMBINING_ENCLOSING_CIRCLE_BACKSLASH;
+ if (codepoint == kZeroWidthJoinerCharacter)
+ return ZWJ;
+ if (codepoint == kVariationSelector15Character)
+ return VS15;
+ if (codepoint == kVariationSelector16Character)
+ return VS16;
+ if (codepoint == 0x1F3F4)
+ return TAG_BASE;
+ if ((codepoint >= 0xE0030 && codepoint <= 0xE0039) ||
+ (codepoint >= 0xE0061 && codepoint <= 0xE007A))
+ return TAG_SEQUENCE;
+ if (codepoint == 0xE007F)
+ return TAG_TERM;
+ if (_pango_Is_Emoji_Modifier_Base (codepoint))
+ return EMOJI_MODIFIER_BASE;
+ if (_pango_Is_Emoji_Modifier (codepoint))
+ return EMOJI_MODIFIER;
+ if (_pango_Is_Regional_Indicator (codepoint))
+ return REGIONAL_INDICATOR;
+ if (_pango_Is_Emoji_Keycap_Base (codepoint))
+ return KEYCAP_BASE;
+
+ if (_pango_Is_Emoji_Emoji_Default (codepoint))
+ return EMOJI_EMOJI_PRESENTATION;
if (_pango_Is_Emoji_Text_Default (codepoint))
- return PANGO_EMOJI_TYPE_EMOJI_TEXT;
+ return EMOJI_TEXT_PRESENTATION;
+ if (_pango_Is_Emoji (codepoint))
+ return EMOJI;
- return PANGO_EMOJI_TYPE_TEXT;
+ /* Ragel state machine will interpret unknown category as "any". */
+ return kMaxEmojiScannerCategory;
}
+#define found_text_presentation_sequence
+#define found_emoji_presentation_sequence \
+ { \
+ if (0) g_print ("emoji %ld..%ld\n", ts - buffer, te - buffer); \
+ *last = ts - buffer; \
+ *end = te - buffer; \
+ return TRUE; \
+ }
+
+#include "emoji_presentation_scanner.c"
+
PangoEmojiIter *
_pango_emoji_iter_init (PangoEmojiIter *iter,
const char *text,
int length)
{
- iter->text_start = text;
+ unsigned int n_chars = g_utf8_strlen (text, length);
+ unsigned char *types = g_malloc (n_chars);
+ unsigned int i;
+ const char *p;
+
+ p = text;
+ for (i = 0; i < n_chars; i++)
+ {
+ types[i] = _pango_EmojiSegmentationCategory (g_utf8_get_char (p));
+ p = g_utf8_next_char (p);
+ }
+
+ iter->text_start = iter->start = iter->end = iter->token_start = iter->token_end = text;
if (length >= 0)
iter->text_end = text + length;
else
iter->text_end = text + strlen (text);
+ iter->is_emoji = FALSE;
- iter->start = text;
- iter->end = text;
- iter->is_emoji = (gboolean) 2; /* HACK */
+ iter->types = types;
+ iter->n_chars = n_chars;
+ iter->cursor = 0;
_pango_emoji_iter_next (iter);
@@ -170,102 +234,62 @@ _pango_emoji_iter_init (PangoEmojiIter *iter,
void
_pango_emoji_iter_fini (PangoEmojiIter *iter)
{
+ g_free (iter->types);
}
-#define PANGO_EMOJI_TYPE_IS_EMOJI(typ) ((typ) == PANGO_EMOJI_TYPE_EMOJI_EMOJI)
-
gboolean
_pango_emoji_iter_next (PangoEmojiIter *iter)
{
- PangoEmojiType current_emoji_type = PANGO_EMOJI_TYPE_INVALID;
-
- if (iter->end == iter->text_end)
+ if (iter->end >= iter->text_end)
return FALSE;
iter->start = iter->end;
- for (; iter->end < iter->text_end; iter->end = g_utf8_next_char (iter->end))
+ /* The scan_emoji_presentation scanner function returns false when it reaches
+ * the end of the buffer and has not discovered any emoji runs in between. For
+ * emoji runs, it returns true, and token_start and token_end are set to the
+ * start and end of the emoji sequence. This means it may skip over text runs
+ * in between, see below. */
+ if (iter->start >= iter->token_end)
{
- gunichar ch = g_utf8_get_char (iter->end);
-
- /* Except at the beginning, ZWJ just carries over the emoji or neutral
- * text type, VS15 & VS16 we just carry over as well, since we already
- * resolved those through lookahead. Also, don't downgrade to text
- * presentation for emoji that are part of a ZWJ sequence, example
- * U+1F441 U+200D U+1F5E8, eye (text presentation) + ZWJ + left speech
- * bubble, see below. */
- if ((!(ch == kZeroWidthJoinerCharacter && !iter->is_emoji) &&
- ch != kVariationSelector15Character &&
- ch != kVariationSelector16Character &&
- ch != kCombiningEnclosingCircleBackslashCharacter &&
- !_pango_Is_Regional_Indicator(ch) &&
- !((ch == kLeftSpeechBubbleCharacter ||
- ch == kRainbowCharacter ||
- ch == kMaleSignCharacter ||
- ch == kFemaleSignCharacter ||
- ch == kStaffOfAesculapiusCharacter) &&
- !iter->is_emoji)) ||
- current_emoji_type == PANGO_EMOJI_TYPE_INVALID) {
- current_emoji_type = _pango_get_emoji_type (ch);
+ /* We need to scan further. */
+ unsigned int token_start, token_end;
+ if (!scan_emoji_presentation (iter->types, iter->n_chars, iter->cursor,
+ &token_start, &token_end))
+ {
+ /* The scanner returned false, which means it has reached the end of the
+ * buffer without discovering any emoji segments in between. */
+ iter->end = iter->text_end;
+ iter->is_emoji = FALSE;
+
+ return TRUE;
+ };
+ /* Ugly... */
+ g_assert (iter->cursor <= token_start && token_start < token_end && token_end <= iter->n_chars);
+ iter->token_start = g_utf8_offset_to_pointer (iter->token_end, token_start - iter->cursor);
+ iter->token_end = g_utf8_offset_to_pointer (iter->token_end, token_end - iter->cursor);
+ iter->cursor = token_end;
}
- if (g_utf8_next_char (iter->end) < iter->text_end) /* Optimize. */
+ if (iter->start < iter->token_start)
{
- gunichar peek_char = g_utf8_get_char (g_utf8_next_char (iter->end));
-
- /* Variation Selectors */
- if (current_emoji_type ==
- PANGO_EMOJI_TYPE_EMOJI_EMOJI &&
- peek_char == kVariationSelector15Character) {
- current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_TEXT;
- }
-
- if ((current_emoji_type ==
- PANGO_EMOJI_TYPE_EMOJI_TEXT ||
- _pango_Is_Emoji_Keycap_Base(ch)) &&
- peek_char == kVariationSelector16Character) {
- current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI;
- }
-
- /* Combining characters Keycap... */
- if (_pango_Is_Emoji_Keycap_Base(ch) &&
- peek_char == kCombiningEnclosingKeycapCharacter) {
- current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI;
- };
-
- /* Regional indicators */
- if (_pango_Is_Regional_Indicator(ch) &&
- _pango_Is_Regional_Indicator(peek_char)) {
- current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI;
- }
-
- /* Upgrade text presentation emoji to emoji presentation when followed by
- * ZWJ, Example U+1F441 U+200D U+1F5E8, eye + ZWJ + left speech bubble. */
- if ((ch == kEyeCharacter ||
- ch == kWavingWhiteFlagCharacter) &&
- peek_char == kZeroWidthJoinerCharacter) {
- current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI;
- }
+ /* The scanner function has progressed to the next emoji segment, but we
+ * need to return the text segment over which it had skipped. */
+ iter->end = iter->token_start;
+ iter->is_emoji = FALSE;
+ return TRUE;
}
- if (iter->is_emoji == (gboolean) 2)
- iter->is_emoji = !PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type);
- if (iter->is_emoji == PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type))
+ if (iter->start >= iter->token_start && iter->start < iter->token_end)
{
- iter->is_emoji = !PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type);
-
- /* Make sure we make progress. Weird sequences, like a VC15 followed
- * by VC16, can trick us into stalling otherwise. */
- if (iter->start == iter->end)
- iter->end = g_utf8_next_char (iter->end);
-
+ /* Now our cursor has reached the emoji segment, and we can return it. */
+ iter->end = iter->token_end;
+ iter->is_emoji = TRUE;
return TRUE;
}
- }
-
- iter->is_emoji = PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type);
- return TRUE;
+ g_assert_not_reached ();
+ return FALSE;
}
diff --git a/tools/gen-emoji-table.py b/tools/gen-emoji-table.py
index 600fe80f..b8018eee 100755
--- a/tools/gen-emoji-table.py
+++ b/tools/gen-emoji-table.py
@@ -51,6 +51,10 @@ print()
print("struct Interval {\n gunichar start, end;\n};")
for typ,s in sets.items():
+ if typ not in ['Emoji',
+ 'Emoji_Presentation',
+ 'Emoji_Modifier',
+ 'Emoji_Modifier_Base']: continue
print()
print("static const struct Interval _pango_%s_table[] =" % typ)
print("{")