diff options
author | Alexander Barkov <bar@mariadb.org> | 2016-06-21 21:36:23 +0400 |
---|---|---|
committer | Alexander Barkov <bar@mariadb.org> | 2016-06-21 21:36:23 +0400 |
commit | 63120090f994cc78876944e9f7a76f53337fa46e (patch) | |
tree | 2e84fe105bb38d1b3ab608495f01664b1d949e38 /strings | |
parent | 61492ea5ddd14efd2ede257700ac4d8fba2a04e6 (diff) | |
download | mariadb-git-63120090f994cc78876944e9f7a76f53337fa46e.tar.gz |
MDEV-10262 ucs2_thai_520_w2: wrong implicit weights on the secondary level
Diffstat (limited to 'strings')
-rw-r--r-- | strings/ctype-uca.c | 98 |
1 files changed, 77 insertions, 21 deletions
diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index 8a092e8f085..cebb723cebb 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -6539,7 +6539,8 @@ MY_UCA_INFO my_uca_v400= 0, /* nitems */ NULL, /* item */ NULL /* flags */ - } + }, + 0 /* levelno */ }, }, @@ -30084,7 +30085,8 @@ MY_UCA_INFO my_uca_v520_th= THAI_CONTRACTIONS, /* nitems */ thai_contractions, /* item */ NULL /* flags */ - } + }, + 0 /* levelno */ }, { 0x10FFFF, /* maxchar */ @@ -30094,7 +30096,8 @@ MY_UCA_INFO my_uca_v520_th= THAI_CONTRACTIONS_W2, /* nitems */ thai_contractions_w2, /* item */ NULL /* flags */ - } + }, + 1 /* levelno */ }, }, @@ -30127,8 +30130,9 @@ MY_UCA_INFO my_uca_v520= { /* Contractions: */ 0, /* nitems */ NULL, /* item */ - NULL /* flags */ - } + NULL /* flags */ + }, + 0 /* levelno */ }, }, @@ -31529,37 +31533,88 @@ my_uca_previous_context_find(my_uca_scanner *scanner, /****************************************************************/ +/** + Implicit weights for a code CP are constructed as follows: + [.AAAA.0020.0002][.BBBB.0000.0000] + + where: + AAAA= BASE + (CP >> 15); + BBBB= (CP & 0x7FFF) | 0x8000; + + There are two weights in the primary level (AAAA followed by BBBB). + There is one weight on other levels: + - 0020 on the secondary level + - 0002 on the tertiary level +*/ + + +/** + Return BASE for an implicit weight on the primary level + + According to UCA, BASE is calculated as follows: + - FB40 for Unified_Ideograph=True AND + ((Block=CJK_Unified_Ideograph) OR + (Block=CJK_Compatibility_Ideographs)) + - FB80 for Unified_Ideograph=True AND NOT + ((Block=CJK_Unified_Ideograph) OR + (Block=CJK_Compatibility_Ideographs)) + - FBC0 for any other code point + TODO: it seems we're not handling BASE correctly: + - check what are those blocks + - there are more Unified Ideograph blocks in the latest Unicode versions +*/ +static inline uint16 +my_uca_implicit_weight_base(my_wc_t code) +{ + if (code >= 0x3400 && code <= 0x4DB5) + return 0xFB80; + if (code >= 0x4E00 && code <= 0x9FA5) + return 0xFB40; + return 0xFBC0; +} + /** - Return implicit UCA weight + Return an implicit UCA weight for the primary level. Used for characters that do not have assigned UCA weights. @param scanner UCA weight scanner @return The leading implicit weight. + + The second weight is stored in scanner->implicit[0] + and is later returned on the next my_uca_scanner_next_any() call. */ static inline int -my_uca_scanner_next_implicit(my_uca_scanner *scanner) +my_uca_scanner_next_implicit_primary(my_uca_scanner *scanner) { - scanner->code= (scanner->page << 8) + scanner->code; - scanner->implicit[0]= (scanner->code & 0x7FFF) | 0x8000; - scanner->implicit[1]= 0; + my_wc_t wc= (scanner->page << 8) + scanner->code; + scanner->implicit[0]= (wc & 0x7FFF) | 0x8000; /* The second weight */ + scanner->implicit[1]= 0; /* 0 terminator */ scanner->wbeg= scanner->implicit; - - scanner->page= scanner->page >> 7; - - if (scanner->code >= 0x3400 && scanner->code <= 0x4DB5) - scanner->page+= 0xFB80; - else if (scanner->code >= 0x4E00 && scanner->code <= 0x9FA5) - scanner->page+= 0xFB40; - else - scanner->page+= 0xFBC0; - - return scanner->page; + return my_uca_implicit_weight_base(wc) + (wc >> 15); } +/** + Return an implicit weight for the current level + (according to scanner->level->levelno). + +*/ +static inline int +my_uca_scanner_next_implicit(my_uca_scanner *scanner) +{ + switch (scanner->level->levelno) { + case 0: return my_uca_scanner_next_implicit_primary(scanner);/* Primary level*/ + case 1: scanner->wbeg= nochar; return 0x0020; /* Secondary level */ + case 2: scanner->wbeg= nochar; return 0x0002; /* Tertiary level */ + default: scanner->wbeg= nochar; break; + } + DBUG_ASSERT(0); + return 0; +} + /* The same two functions for any character set */ @@ -33829,6 +33884,7 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, int level, size_t i, npages= (src->maxchar + 1) / 256; dst->maxchar= src->maxchar; + dst->levelno= src->levelno; if (check_rules(loader, rules, dst, src)) return TRUE; |