diff options
author | Matthias Clasen <mclasen@redhat.com> | 2018-11-19 20:33:18 +0000 |
---|---|---|
committer | Matthias Clasen <mclasen@redhat.com> | 2018-11-19 20:33:18 +0000 |
commit | 0d8b48284b8d90b19b3f9ffdd75335bece3c3818 (patch) | |
tree | 71eebd0c3ea126b95b0a43841bab2a77e1abdea4 | |
parent | f3190a5c616bf7d11a0d1e13c20fdedeeece5fe1 (diff) | |
parent | d4da231dfa53924ae074dece0cca93d49f39d30a (diff) | |
download | pango-0d8b48284b8d90b19b3f9ffdd75335bece3c3818.tar.gz |
Merge branch 'emoji-ragel' into 'master'
Emoji ragel
See merge request GNOME/pango!31
-rw-r--r-- | pango/emoji_presentation_scanner.c | 497 | ||||
-rw-r--r-- | pango/emoji_presentation_scanner.rl | 96 | ||||
-rw-r--r-- | pango/pango-emoji-private.h | 7 | ||||
-rw-r--r-- | pango/pango-emoji-table.h | 43 | ||||
-rw-r--r-- | pango/pango-emoji.c | 256 | ||||
-rwxr-xr-x | tools/gen-emoji-table.py | 4 |
6 files changed, 772 insertions, 131 deletions
diff --git a/pango/emoji_presentation_scanner.c b/pango/emoji_presentation_scanner.c new file mode 100644 index 00000000..43872abb --- /dev/null +++ b/pango/emoji_presentation_scanner.c @@ -0,0 +1,497 @@ +// Copyright 2018 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + + +static const char _emoji_presentation_actions[] = { + 0, 1, 0, 1, 1, 1, 5, 1, + 6, 1, 7, 1, 8, 1, 9, 2, + 2, 3, 2, 2, 4, 0 +}; + +static const char _emoji_presentation_key_offsets[] = { + 0, 3, 8, 9, 13, 15, 22, 26, + 33, 42, 52, 63, 71, 82, 92, 103, + 115, 116, 121, 0 +}; + +static const unsigned char _emoji_presentation_trans_keys[] = { + 9u, 10u, 12u, 3u, 7u, 13u, 0u, 2u, + 6u, 10u, 12u, 8u, 9u, 14u, 15u, 2u, + 3u, 6u, 7u, 13u, 0u, 1u, 9u, 10u, + 11u, 12u, 2u, 3u, 6u, 7u, 13u, 0u, + 1u, 2u, 3u, 6u, 7u, 10u, 12u, 13u, + 0u, 1u, 2u, 3u, 6u, 7u, 9u, 10u, + 12u, 13u, 0u, 1u, 2u, 3u, 4u, 6u, + 7u, 9u, 10u, 12u, 13u, 0u, 1u, 2u, + 3u, 6u, 7u, 10u, 13u, 0u, 1u, 2u, + 3u, 6u, 7u, 9u, 10u, 12u, 13u, 14u, + 0u, 1u, 2u, 3u, 4u, 6u, 7u, 10u, + 12u, 13u, 0u, 1u, 2u, 3u, 6u, 7u, + 9u, 10u, 11u, 12u, 13u, 0u, 1u, 2u, + 3u, 4u, 6u, 7u, 9u, 10u, 11u, 12u, + 13u, 0u, 1u, 6u, 10u, 11u, 12u, 8u, + 9u, 2u, 3u, 6u, 7u, 9u, 10u, 11u, + 12u, 13u, 14u, 0u, 1u, 0u +}; + +static const char _emoji_presentation_single_lengths[] = { + 3, 3, 1, 2, 2, 5, 4, 5, + 7, 8, 9, 6, 9, 8, 9, 10, + 1, 3, 10, 0 +}; + +static const char _emoji_presentation_range_lengths[] = { + 0, 1, 0, 1, 0, 1, 0, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 0, 1, 1, 0 +}; + +static const char _emoji_presentation_index_offsets[] = { + 0, 4, 9, 11, 15, 18, 25, 30, + 37, 46, 56, 67, 75, 86, 96, 107, + 119, 121, 126, 0 +}; + +static const char _emoji_presentation_trans_cond_spaces[] = { + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, 0 +}; + +static const short _emoji_presentation_trans_offsets[] = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 0 +}; + +static const char _emoji_presentation_trans_lengths[] = { + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 0 +}; + +static const char _emoji_presentation_cond_keys[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0 +}; + +static const char _emoji_presentation_cond_targs[] = { + 7, 1, 11, 5, 13, 8, 8, 8, + 5, 7, 5, 1, 11, 7, 5, 4, + 7, 5, 14, 15, 16, 17, 18, 6, + 5, 7, 1, 5, 11, 5, 9, 10, + 2, 3, 12, 0, 5, 9, 10, 2, + 3, 1, 11, 12, 0, 5, 9, 10, + 2, 3, 7, 1, 11, 12, 0, 5, + 9, 10, 11, 2, 3, 7, 1, 11, + 12, 0, 5, 9, 10, 2, 3, 1, + 12, 0, 5, 9, 10, 2, 3, 7, + 1, 11, 12, 4, 0, 5, 9, 10, + 11, 2, 3, 1, 11, 12, 0, 5, + 9, 10, 2, 3, 7, 1, 5, 11, + 12, 0, 5, 9, 10, 11, 2, 3, + 7, 1, 5, 11, 12, 0, 5, 7, + 5, 1, 5, 11, 7, 5, 9, 10, + 2, 3, 7, 1, 5, 11, 12, 4, + 0, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 0 +}; + +static const char _emoji_presentation_cond_actions[] = { + 15, 0, 15, 11, 15, 15, 15, 15, + 13, 15, 11, 0, 15, 15, 11, 0, + 15, 11, 15, 15, 0, 18, 15, 18, + 5, 15, 0, 5, 15, 9, 15, 15, + 0, 0, 15, 0, 7, 15, 15, 0, + 0, 0, 15, 15, 0, 7, 15, 15, + 0, 0, 15, 0, 15, 15, 0, 7, + 15, 15, 15, 0, 0, 15, 0, 15, + 15, 0, 7, 15, 15, 0, 0, 0, + 15, 0, 7, 15, 15, 0, 0, 15, + 0, 15, 15, 0, 0, 7, 15, 15, + 15, 0, 0, 0, 15, 15, 0, 7, + 15, 15, 0, 0, 15, 0, 5, 15, + 15, 0, 7, 15, 15, 15, 0, 0, + 15, 0, 5, 15, 15, 0, 7, 15, + 9, 0, 5, 15, 15, 9, 15, 15, + 0, 0, 15, 0, 5, 15, 15, 0, + 0, 7, 11, 13, 11, 11, 11, 9, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 9, 9, 7, 0 +}; + +static const char _emoji_presentation_to_state_actions[] = { + 0, 0, 0, 0, 0, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0 +}; + +static const char _emoji_presentation_from_state_actions[] = { + 0, 0, 0, 0, 0, 3, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0 +}; + +static const char _emoji_presentation_eof_cond_spaces[] = { + -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, 0 +}; + +static const char _emoji_presentation_eof_cond_key_offs[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0 +}; + +static const char _emoji_presentation_eof_cond_key_lens[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0 +}; + +static const char _emoji_presentation_eof_cond_keys[] = { + 0 +}; + +static const short _emoji_presentation_eof_trans[] = { + 139, 140, 141, 142, 143, 0, 144, 145, + 146, 147, 148, 149, 150, 151, 152, 153, + 154, 155, 156, 0 +}; + +static const char _emoji_presentation_nfa_targs[] = { + 0, 0 +}; + +static const char _emoji_presentation_nfa_offsets[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0 +}; + +static const char _emoji_presentation_nfa_push_actions[] = { + 0, 0 +}; + +static const char _emoji_presentation_nfa_pop_trans[] = { + 0, 0 +}; + +static const int emoji_presentation_start = 5; + +static const int emoji_presentation_en_text_and_emoji_run = 5; + + + + + +static gboolean +scan_emoji_presentation (const unsigned char* buffer, +unsigned buffer_size, +unsigned cursor, +unsigned* last, +unsigned* end) +{ + const unsigned char *p = buffer + cursor; + const unsigned char *pe, *eof, *ts, *te; + unsigned act; + int cs; + pe = eof = buffer + buffer_size; + + + { + cs = (int)emoji_presentation_start; + ts = 0; + te = 0; + act = 0; + } + + { + int _cpc; + int _klen;const char * _cekeys;unsigned int _trans = 0;const unsigned char * _keys;const char * _acts;unsigned int _nacts; { + if ( p == pe ) + goto _test_eof; + _resume: { + _acts = ( _emoji_presentation_actions + (_emoji_presentation_from_state_actions[cs])); + _nacts = (unsigned int)(*( _acts)); + _acts += 1; + while ( _nacts > 0 ) { + switch ( (*( _acts)) ) { + case 1: { + { + #line 1 "NONE" + {ts = p;}} + break; } + } + _nacts -= 1; + _acts += 1; + } + + _keys = ( _emoji_presentation_trans_keys + (_emoji_presentation_key_offsets[cs])); + _trans = (unsigned int)_emoji_presentation_index_offsets[cs]; + + _klen = (int)_emoji_presentation_single_lengths[cs]; + if ( _klen > 0 ) { + const unsigned char *_lower = _keys; + const unsigned char *_upper = _keys + _klen - 1; + const unsigned char *_mid; + while ( 1 ) { + if ( _upper < _lower ) + break; + + _mid = _lower + ((_upper-_lower) >> 1); + if ( ( (*( p))) < (*( _mid)) ) + _upper = _mid - 1; + else if ( ( (*( p))) > (*( _mid)) ) + _lower = _mid + 1; + else { + _trans += (unsigned int)(_mid - _keys); + goto _match; + } + } + _keys += _klen; + _trans += (unsigned int)_klen; + } + + _klen = (int)_emoji_presentation_range_lengths[cs]; + if ( _klen > 0 ) { + const unsigned char *_lower = _keys; + const unsigned char *_upper = _keys + (_klen<<1) - 2; + const unsigned char *_mid; + while ( 1 ) { + if ( _upper < _lower ) + break; + + _mid = _lower + (((_upper-_lower) >> 1) & ~1); + if ( ( (*( p))) < (*( _mid)) ) + _upper = _mid - 2; + else if ( ( (*( p))) > (*( _mid + 1)) ) + _lower = _mid + 2; + else { + _trans += (unsigned int)((_mid - _keys)>>1); + goto _match; + } + } + _trans += (unsigned int)_klen; + } + + _match: { + goto _match_cond; + } + } + _match_cond: { + cs = (int)_emoji_presentation_cond_targs[_trans]; + + if ( _emoji_presentation_cond_actions[_trans] == 0 ) + goto _again; + + _acts = ( _emoji_presentation_actions + (_emoji_presentation_cond_actions[_trans])); + _nacts = (unsigned int)(*( _acts)); + _acts += 1; + while ( _nacts > 0 ) { + switch ( (*( _acts)) ) + { + case 2: { + { + #line 1 "NONE" + {te = p+1;}} + break; } + case 3: { + { + #line 71 "emoji_presentation_scanner.rl" + {act = 1;}} + break; } + case 4: { + { + #line 72 "emoji_presentation_scanner.rl" + {act = 2;}} + break; } + case 5: { + { + #line 72 "emoji_presentation_scanner.rl" + {te = p+1;{ + #line 72 "emoji_presentation_scanner.rl" + found_text_presentation_sequence }}} + break; } + case 6: { + { + #line 71 "emoji_presentation_scanner.rl" + {te = p;p = p - 1;{ + #line 71 "emoji_presentation_scanner.rl" + found_emoji_presentation_sequence }}} + break; } + case 7: { + { + #line 72 "emoji_presentation_scanner.rl" + {te = p;p = p - 1;{ + #line 72 "emoji_presentation_scanner.rl" + found_text_presentation_sequence }}} + break; } + case 8: { + { + #line 71 "emoji_presentation_scanner.rl" + {p = ((te))-1; + { + #line 71 "emoji_presentation_scanner.rl" + found_emoji_presentation_sequence }}} + break; } + case 9: { + { + #line 1 "NONE" + {switch( act ) { + case 1: { + p = ((te))-1; + { + #line 71 "emoji_presentation_scanner.rl" + found_emoji_presentation_sequence } break; } + case 2: { + p = ((te))-1; + { + #line 72 "emoji_presentation_scanner.rl" + found_text_presentation_sequence } break; } + }} + } + break; } + } + _nacts -= 1; + _acts += 1; + } + + + } + _again: { + _acts = ( _emoji_presentation_actions + (_emoji_presentation_to_state_actions[cs])); + _nacts = (unsigned int)(*( _acts)); + _acts += 1; + while ( _nacts > 0 ) { + switch ( (*( _acts)) ) { + case 0: { + { + #line 1 "NONE" + {ts = 0;}} + break; } + } + _nacts -= 1; + _acts += 1; + } + + p += 1; + if ( p != pe ) + goto _resume; + } + _test_eof: { {} + if ( p == eof ) + { + if ( _emoji_presentation_eof_cond_spaces[cs] != -1 ) { + _cekeys = ( _emoji_presentation_eof_cond_keys + (_emoji_presentation_eof_cond_key_offs[cs])); + _klen = (int)_emoji_presentation_eof_cond_key_lens[cs]; + _cpc = 0; + { + const char *_lower = _cekeys; + const char *_upper = _cekeys + _klen - 1; + const char *_mid; + while ( 1 ) { + if ( _upper < _lower ) + break; + + _mid = _lower + ((_upper-_lower) >> 1); + if ( _cpc < (int)(*( _mid)) ) + _upper = _mid - 1; + else if ( _cpc > (int)(*( _mid)) ) + _lower = _mid + 1; + else { + goto _ok; + } + } + cs = -1; + goto _out; + } + _ok: {} + } + if ( _emoji_presentation_eof_trans[cs] > 0 ) { + _trans = (unsigned int)_emoji_presentation_eof_trans[cs] - 1; + goto _match_cond; + } + } + + } + _out: { {} + } + } + } + + return FALSE; +} + diff --git a/pango/emoji_presentation_scanner.rl b/pango/emoji_presentation_scanner.rl new file mode 100644 index 00000000..c13ae279 --- /dev/null +++ b/pango/emoji_presentation_scanner.rl @@ -0,0 +1,96 @@ +// Copyright 2018 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +%%{ + machine emoji_presentation; + alphtype unsigned char; + write data noerror nofinal noentry; +}%% + +%%{ + +EMOJI = 0; +EMOJI_TEXT_PRESENTATION = 1; +EMOJI_EMOJI_PRESENTATION = 2; +EMOJI_MODIFIER_BASE = 3; +EMOJI_MODIFIER = 4; +EMOJI_VS_BASE = 5; +REGIONAL_INDICATOR = 6; +KEYCAP_BASE = 7; +COMBINING_ENCLOSING_KEYCAP = 8; +COMBINING_ENCLOSING_CIRCLE_BACKSLASH = 9; +ZWJ = 10; +VS15 = 11; +VS16 = 12; +TAG_BASE = 13; +TAG_SEQUENCE = 14; +TAG_TERM = 15; + +any_emoji = EMOJI_TEXT_PRESENTATION | EMOJI_EMOJI_PRESENTATION | KEYCAP_BASE | + EMOJI_MODIFIER_BASE | TAG_BASE | EMOJI; + +emoji_combining_encloding_circle_backslash_sequence = any_emoji + COMBINING_ENCLOSING_CIRCLE_BACKSLASH; + +# This could be sharper than any_emoji by restricting this only to valid +# variation sequences: +# https://www.unicode.org/Public/emoji/11.0/emoji-variation-sequences.txt +# However, implementing +# https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence is +# sufficient for our purposes here. +emoji_presentation_sequence = any_emoji VS16; + +emoji_modifier_sequence = EMOJI_MODIFIER_BASE EMOJI_MODIFIER; + +emoji_flag_sequence = REGIONAL_INDICATOR REGIONAL_INDICATOR; + +# Here we only allow the valid tag sequences +# https://www.unicode.org/reports/tr51/#valid-emoji-tag-sequences, instead of +# all well-formed ones defined in +# https://www.unicode.org/reports/tr51/#def_emoji_tag_sequence +emoji_tag_sequence = TAG_BASE TAG_SEQUENCE+ TAG_TERM; + +emoji_keycap_sequence = KEYCAP_BASE VS16 COMBINING_ENCLOSING_KEYCAP; + +emoji_zwj_element = emoji_presentation_sequence | emoji_modifier_sequence | any_emoji; + +emoji_zwj_sequence = emoji_zwj_element ( ZWJ emoji_zwj_element )+; + +emoji_presentation = EMOJI_EMOJI_PRESENTATION | TAG_BASE | EMOJI_MODIFIER_BASE | + emoji_presentation_sequence | emoji_modifier_sequence | emoji_flag_sequence | + emoji_tag_sequence | emoji_keycap_sequence | emoji_zwj_sequence | + emoji_combining_encloding_circle_backslash_sequence; + +emoji_run = emoji_presentation+; + +text_presentation_emoji = any_emoji VS15; +text_run = text_presentation_emoji | any; + +text_and_emoji_run := |* +emoji_run => { found_emoji_presentation_sequence }; +text_run => { found_text_presentation_sequence }; +*|; + +}%% + +static gboolean +scan_emoji_presentation (const unsigned char* buffer, + unsigned buffer_size, + unsigned cursor, + unsigned* last, + unsigned* end) +{ + const unsigned char *p = buffer + cursor; + const unsigned char *pe, *eof, *ts, *te; + unsigned act; + int cs; + pe = eof = buffer + buffer_size; + + %%{ + write init; + write exec; + }%% + return FALSE; +} + diff --git a/pango/pango-emoji-private.h b/pango/pango-emoji-private.h index eb8a52a7..a360b37a 100644 --- a/pango/pango-emoji-private.h +++ b/pango/pango-emoji-private.h @@ -33,6 +33,13 @@ struct _PangoEmojiIter const gchar *start; const gchar *end; gboolean is_emoji; + + const gchar *token_start; + const gchar *token_end; + + const unsigned char *types; + unsigned int n_chars; + unsigned int cursor; }; PangoEmojiIter * diff --git a/pango/pango-emoji-table.h b/pango/pango-emoji-table.h index 0f58f21d..da9ff4fc 100644 --- a/pango/pango-emoji-table.h +++ b/pango/pango-emoji-table.h @@ -7,13 +7,13 @@ * on file with this header: * * # emoji-data.txt - * # Date: 2017-06-19, 11:13:24 GMT - * # © 2017 Unicode®, Inc. + * # Date: 2018-02-07, 07:55:18 GMT + * # © 2018 Unicode®, Inc. * # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. * # For terms of use, see http://www.unicode.org/terms_of_use.html * # - * # Emoji Data for UTR #51 - * # Version: 5.0 + * # Emoji Data for UTS #51 + * # Version: 11.0 * # * # For documentation and usage, see http://www.unicode.org/reports/tr51 */ @@ -65,12 +65,12 @@ static const struct Interval _pango_Emoji_table[] = {0x2640, 0x2640}, {0x2642, 0x2642}, {0x2648, 0x2653}, - {0x2660, 0x2660}, + {0x265F, 0x2660}, {0x2663, 0x2663}, {0x2665, 0x2666}, {0x2668, 0x2668}, {0x267B, 0x267B}, - {0x267F, 0x267F}, + {0x267E, 0x267F}, {0x2692, 0x2697}, {0x2699, 0x2699}, {0x269B, 0x269C}, @@ -228,6 +228,7 @@ static const struct Interval _pango_Emoji_table[] = {0x1F6F3, 0x1F6F3}, {0x1F6F4, 0x1F6F6}, {0x1F6F7, 0x1F6F8}, + {0x1F6F9, 0x1F6F9}, {0x1F910, 0x1F918}, {0x1F919, 0x1F91E}, {0x1F91F, 0x1F91F}, @@ -240,13 +241,22 @@ static const struct Interval _pango_Emoji_table[] = {0x1F940, 0x1F945}, {0x1F947, 0x1F94B}, {0x1F94C, 0x1F94C}, + {0x1F94D, 0x1F94F}, {0x1F950, 0x1F95E}, {0x1F95F, 0x1F96B}, + {0x1F96C, 0x1F970}, + {0x1F973, 0x1F976}, + {0x1F97A, 0x1F97A}, + {0x1F97C, 0x1F97F}, {0x1F980, 0x1F984}, {0x1F985, 0x1F991}, {0x1F992, 0x1F997}, + {0x1F998, 0x1F9A2}, + {0x1F9B0, 0x1F9B9}, {0x1F9C0, 0x1F9C0}, + {0x1F9C1, 0x1F9C2}, {0x1F9D0, 0x1F9E6}, + {0x1F9E7, 0x1F9FF}, }; static const struct Interval _pango_Emoji_Presentation_table[] = @@ -353,6 +363,7 @@ static const struct Interval _pango_Emoji_Presentation_table[] = {0x1F6EB, 0x1F6EC}, {0x1F6F4, 0x1F6F6}, {0x1F6F7, 0x1F6F8}, + {0x1F6F9, 0x1F6F9}, {0x1F910, 0x1F918}, {0x1F919, 0x1F91E}, {0x1F91F, 0x1F91F}, @@ -365,13 +376,22 @@ static const struct Interval _pango_Emoji_Presentation_table[] = {0x1F940, 0x1F945}, {0x1F947, 0x1F94B}, {0x1F94C, 0x1F94C}, + {0x1F94D, 0x1F94F}, {0x1F950, 0x1F95E}, {0x1F95F, 0x1F96B}, + {0x1F96C, 0x1F970}, + {0x1F973, 0x1F976}, + {0x1F97A, 0x1F97A}, + {0x1F97C, 0x1F97F}, {0x1F980, 0x1F984}, {0x1F985, 0x1F991}, {0x1F992, 0x1F997}, + {0x1F998, 0x1F9A2}, + {0x1F9B0, 0x1F9B9}, {0x1F9C0, 0x1F9C0}, + {0x1F9C1, 0x1F9C2}, {0x1F9D0, 0x1F9E6}, + {0x1F9E7, 0x1F9FF}, }; static const struct Interval _pango_Emoji_Modifier_table[] = @@ -418,18 +438,11 @@ static const struct Interval _pango_Emoji_Modifier_Base_table[] = {0x1F931, 0x1F932}, {0x1F933, 0x1F939}, {0x1F93D, 0x1F93E}, + {0x1F9B5, 0x1F9B6}, + {0x1F9B8, 0x1F9B9}, {0x1F9D1, 0x1F9DD}, }; -static const struct Interval _pango_Emoji_Component_table[] = -{ - {0x0023, 0x0023}, - {0x002A, 0x002A}, - {0x0030, 0x0039}, - {0x1F1E6, 0x1F1FF}, - {0x1F3FB, 0x1F3FF}, -}; - #endif /* PANGO_EMOJI_TABLE_H */ /* == End of generated table == */ diff --git a/pango/pango-emoji.c b/pango/pango-emoji.c index 29472452..46ab5b3f 100644 --- a/pango/pango-emoji.c +++ b/pango/pango-emoji.c @@ -18,11 +18,27 @@ * Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 02111-1307, USA. * - * Implementation of pango_emoji_iter is derived from Chromium: + * Implementation of pango_emoji_iter is based on Chromium's Ragel-based + * parser: * - * https://cs.chromium.org/chromium/src/third_party/WebKit/Source/platform/fonts/FontFallbackPriority.h - * https://cs.chromium.org/chromium/src/third_party/WebKit/Source/platform/text/CharacterEmoji.cpp - * https://cs.chromium.org/chromium/src/third_party/WebKit/Source/platform/fonts/SymbolsIterator.cpp + * https://chromium-review.googlesource.com/c/chromium/src/+/1264577 + * + * The grammar file emoji_presentation_scanner.rl was just modified to + * adapt the function signature and variables to our usecase. The + * grammar itself was NOT modified: + * + * https://chromium-review.googlesource.com/c/chromium/src/+/1264577/3/third_party/blink/renderer/platform/fonts/emoji_presentation_scanner.rl + * + * The emoji_presentation_scanner.c is generated from .rl file by + * running ragel on it. + * + * The categorization is also based on: + * + * https://chromium-review.googlesource.com/c/chromium/src/+/1264577/3/third_party/blink/renderer/platform/fonts/utf16_ragel_iterator.h + * + * The iterator next() is based on: + * + * https://chromium-review.googlesource.com/c/chromium/src/+/1264577/3/third_party/blink/renderer/platform/fonts/symbols_iterator.cc * * // Copyright 2015 The Chromium Authors. All rights reserved. * // Use of this source code is governed by a BSD-style license that can be @@ -105,62 +121,110 @@ _pango_Is_Regional_Indicator (gunichar ch) const gunichar kCombiningEnclosingCircleBackslashCharacter = 0x20E0; const gunichar kCombiningEnclosingKeycapCharacter = 0x20E3; -const gunichar kEyeCharacter = 0x1F441; -const gunichar kFemaleSignCharacter = 0x2640; -const gunichar kLeftSpeechBubbleCharacter = 0x1F5E8; -const gunichar kMaleSignCharacter = 0x2642; -const gunichar kRainbowCharacter = 0x1F308; -const gunichar kStaffOfAesculapiusCharacter = 0x2695; const gunichar kVariationSelector15Character = 0xFE0E; const gunichar kVariationSelector16Character = 0xFE0F; -const gunichar kWavingWhiteFlagCharacter = 0x1F3F3; const gunichar kZeroWidthJoinerCharacter = 0x200D; - -typedef enum { - PANGO_EMOJI_TYPE_INVALID, - PANGO_EMOJI_TYPE_TEXT, /* For regular non-symbols text */ - PANGO_EMOJI_TYPE_EMOJI_TEXT, /* For emoji in text presentaiton */ - PANGO_EMOJI_TYPE_EMOJI_EMOJI /* For emoji in emoji presentation */ -} PangoEmojiType; - -static PangoEmojiType -_pango_get_emoji_type (gunichar codepoint) +enum PangoEmojiScannerCategory { + EMOJI = 0, + EMOJI_TEXT_PRESENTATION = 1, + EMOJI_EMOJI_PRESENTATION = 2, + EMOJI_MODIFIER_BASE = 3, + EMOJI_MODIFIER = 4, + EMOJI_VS_BASE = 5, + REGIONAL_INDICATOR = 6, + KEYCAP_BASE = 7, + COMBINING_ENCLOSING_KEYCAP = 8, + COMBINING_ENCLOSING_CIRCLE_BACKSLASH = 9, + ZWJ = 10, + VS15 = 11, + VS16 = 12, + TAG_BASE = 13, + TAG_SEQUENCE = 14, + TAG_TERM = 15, + kMaxEmojiScannerCategory = 16 +}; + +static unsigned char +_pango_EmojiSegmentationCategory (gunichar codepoint) { - /* Those should only be Emoji presentation as combinations of two. */ - if (_pango_Is_Emoji_Keycap_Base (codepoint) || - _pango_Is_Regional_Indicator (codepoint)) - return PANGO_EMOJI_TYPE_TEXT; - + /* Specific ones first. */ if (codepoint == kCombiningEnclosingKeycapCharacter) - return PANGO_EMOJI_TYPE_EMOJI_EMOJI; - - if (_pango_Is_Emoji_Emoji_Default (codepoint) || - _pango_Is_Emoji_Modifier_Base (codepoint) || - _pango_Is_Emoji_Modifier (codepoint)) - return PANGO_EMOJI_TYPE_EMOJI_EMOJI; - + return COMBINING_ENCLOSING_KEYCAP; + if (codepoint == kCombiningEnclosingCircleBackslashCharacter) + return COMBINING_ENCLOSING_CIRCLE_BACKSLASH; + if (codepoint == kZeroWidthJoinerCharacter) + return ZWJ; + if (codepoint == kVariationSelector15Character) + return VS15; + if (codepoint == kVariationSelector16Character) + return VS16; + if (codepoint == 0x1F3F4) + return TAG_BASE; + if ((codepoint >= 0xE0030 && codepoint <= 0xE0039) || + (codepoint >= 0xE0061 && codepoint <= 0xE007A)) + return TAG_SEQUENCE; + if (codepoint == 0xE007F) + return TAG_TERM; + if (_pango_Is_Emoji_Modifier_Base (codepoint)) + return EMOJI_MODIFIER_BASE; + if (_pango_Is_Emoji_Modifier (codepoint)) + return EMOJI_MODIFIER; + if (_pango_Is_Regional_Indicator (codepoint)) + return REGIONAL_INDICATOR; + if (_pango_Is_Emoji_Keycap_Base (codepoint)) + return KEYCAP_BASE; + + if (_pango_Is_Emoji_Emoji_Default (codepoint)) + return EMOJI_EMOJI_PRESENTATION; if (_pango_Is_Emoji_Text_Default (codepoint)) - return PANGO_EMOJI_TYPE_EMOJI_TEXT; + return EMOJI_TEXT_PRESENTATION; + if (_pango_Is_Emoji (codepoint)) + return EMOJI; - return PANGO_EMOJI_TYPE_TEXT; + /* Ragel state machine will interpret unknown category as "any". */ + return kMaxEmojiScannerCategory; } +#define found_text_presentation_sequence +#define found_emoji_presentation_sequence \ + { \ + if (0) g_print ("emoji %ld..%ld\n", ts - buffer, te - buffer); \ + *last = ts - buffer; \ + *end = te - buffer; \ + return TRUE; \ + } + +#include "emoji_presentation_scanner.c" + PangoEmojiIter * _pango_emoji_iter_init (PangoEmojiIter *iter, const char *text, int length) { - iter->text_start = text; + unsigned int n_chars = g_utf8_strlen (text, length); + unsigned char *types = g_malloc (n_chars); + unsigned int i; + const char *p; + + p = text; + for (i = 0; i < n_chars; i++) + { + types[i] = _pango_EmojiSegmentationCategory (g_utf8_get_char (p)); + p = g_utf8_next_char (p); + } + + iter->text_start = iter->start = iter->end = iter->token_start = iter->token_end = text; if (length >= 0) iter->text_end = text + length; else iter->text_end = text + strlen (text); + iter->is_emoji = FALSE; - iter->start = text; - iter->end = text; - iter->is_emoji = (gboolean) 2; /* HACK */ + iter->types = types; + iter->n_chars = n_chars; + iter->cursor = 0; _pango_emoji_iter_next (iter); @@ -170,102 +234,62 @@ _pango_emoji_iter_init (PangoEmojiIter *iter, void _pango_emoji_iter_fini (PangoEmojiIter *iter) { + g_free (iter->types); } -#define PANGO_EMOJI_TYPE_IS_EMOJI(typ) ((typ) == PANGO_EMOJI_TYPE_EMOJI_EMOJI) - gboolean _pango_emoji_iter_next (PangoEmojiIter *iter) { - PangoEmojiType current_emoji_type = PANGO_EMOJI_TYPE_INVALID; - - if (iter->end == iter->text_end) + if (iter->end >= iter->text_end) return FALSE; iter->start = iter->end; - for (; iter->end < iter->text_end; iter->end = g_utf8_next_char (iter->end)) + /* The scan_emoji_presentation scanner function returns false when it reaches + * the end of the buffer and has not discovered any emoji runs in between. For + * Emoji runs, it returns true, and token_start_ and token_end_ are set to the + * start and end of the emoji sequence. This means, it may skip over text runs + * in between, see below. */ + if (iter->start >= iter->token_end) { - gunichar ch = g_utf8_get_char (iter->end); - - /* Except at the beginning, ZWJ just carries over the emoji or neutral - * text type, VS15 & VS16 we just carry over as well, since we already - * resolved those through lookahead. Also, don't downgrade to text - * presentation for emoji that are part of a ZWJ sequence, example - * U+1F441 U+200D U+1F5E8, eye (text presentation) + ZWJ + left speech - * bubble, see below. */ - if ((!(ch == kZeroWidthJoinerCharacter && !iter->is_emoji) && - ch != kVariationSelector15Character && - ch != kVariationSelector16Character && - ch != kCombiningEnclosingCircleBackslashCharacter && - !_pango_Is_Regional_Indicator(ch) && - !((ch == kLeftSpeechBubbleCharacter || - ch == kRainbowCharacter || - ch == kMaleSignCharacter || - ch == kFemaleSignCharacter || - ch == kStaffOfAesculapiusCharacter) && - !iter->is_emoji)) || - current_emoji_type == PANGO_EMOJI_TYPE_INVALID) { - current_emoji_type = _pango_get_emoji_type (ch); + /* We need to scan furhter. */ + unsigned int token_start, token_end; + if (!scan_emoji_presentation (iter->types, iter->n_chars, iter->cursor, + &token_start, &token_end)) + { + /* The scanner returned false, which means it has reached the end of the + * buffer without discovering any emoji segments in between. */ + iter->end = iter->text_end; + iter->is_emoji = FALSE; + + return TRUE; + }; + /* Ugly... */ + g_assert (iter->cursor <= token_start && token_start < token_end && token_end <= iter->n_chars); + iter->token_start = g_utf8_offset_to_pointer (iter->token_end, token_start - iter->cursor); + iter->token_end = g_utf8_offset_to_pointer (iter->token_end, token_end - iter->cursor); + iter->cursor = token_end; } - if (g_utf8_next_char (iter->end) < iter->text_end) /* Optimize. */ + if (iter->start < iter->token_start) { - gunichar peek_char = g_utf8_get_char (g_utf8_next_char (iter->end)); - - /* Variation Selectors */ - if (current_emoji_type == - PANGO_EMOJI_TYPE_EMOJI_EMOJI && - peek_char == kVariationSelector15Character) { - current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_TEXT; - } - - if ((current_emoji_type == - PANGO_EMOJI_TYPE_EMOJI_TEXT || - _pango_Is_Emoji_Keycap_Base(ch)) && - peek_char == kVariationSelector16Character) { - current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI; - } - - /* Combining characters Keycap... */ - if (_pango_Is_Emoji_Keycap_Base(ch) && - peek_char == kCombiningEnclosingKeycapCharacter) { - current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI; - }; - - /* Regional indicators */ - if (_pango_Is_Regional_Indicator(ch) && - _pango_Is_Regional_Indicator(peek_char)) { - current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI; - } - - /* Upgrade text presentation emoji to emoji presentation when followed by - * ZWJ, Example U+1F441 U+200D U+1F5E8, eye + ZWJ + left speech bubble. */ - if ((ch == kEyeCharacter || - ch == kWavingWhiteFlagCharacter) && - peek_char == kZeroWidthJoinerCharacter) { - current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI; - } + /* The scanner function has progressed to the next emoji segment, but we + * need to return the text segment over which it had skipped. */ + iter->end = iter->token_start; + iter->is_emoji = FALSE; + return TRUE; } - if (iter->is_emoji == (gboolean) 2) - iter->is_emoji = !PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type); - if (iter->is_emoji == PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type)) + if (iter->start >= iter->token_start && iter->start < iter->token_end) { - iter->is_emoji = !PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type); - - /* Make sure we make progress. Weird sequences, like a VC15 followed - * by VC16, can trick us into stalling otherwise. */ - if (iter->start == iter->end) - iter->end = g_utf8_next_char (iter->end); - + /* Now our cursor has reached the emoji segment, and we can return it. */ + iter->end = iter->token_end; + iter->is_emoji = TRUE; return TRUE; } - } - - iter->is_emoji = PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type); - return TRUE; + g_assert_not_reached (); + return FALSE; } diff --git a/tools/gen-emoji-table.py b/tools/gen-emoji-table.py index 600fe80f..b8018eee 100755 --- a/tools/gen-emoji-table.py +++ b/tools/gen-emoji-table.py @@ -51,6 +51,10 @@ print() print("struct Interval {\n gunichar start, end;\n};") for typ,s in sets.items(): + if typ not in ['Emoji', + 'Emoji_Presentation', + 'Emoji_Modifier', + 'Emoji_Modifier_Base']: continue print() print("static const struct Interval _pango_%s_table[] =" % typ) print("{") |