diff options
author | Alexander Barkov <bar@mariadb.org> | 2016-06-21 21:36:23 +0400 |
---|---|---|
committer | Alexander Barkov <bar@mariadb.org> | 2016-06-21 21:36:23 +0400 |
commit | 63120090f994cc78876944e9f7a76f53337fa46e (patch) | |
tree | 2e84fe105bb38d1b3ab608495f01664b1d949e38 | |
parent | 61492ea5ddd14efd2ede257700ac4d8fba2a04e6 (diff) | |
download | mariadb-git-63120090f994cc78876944e9f7a76f53337fa46e.tar.gz |
MDEV-10262 ucs2_thai_520_w2: wrong implicit weights on the secondary level
-rw-r--r-- | include/m_ctype.h | 1 | ||||
-rw-r--r-- | mysql-test/include/ctype_uca_w2.inc | 7 | ||||
-rw-r--r-- | mysql-test/r/ctype_uca.result | 26 | ||||
-rw-r--r-- | mysql-test/r/ctype_utf16_uca.result | 13 | ||||
-rw-r--r-- | mysql-test/r/ctype_utf32_uca.result | 13 | ||||
-rw-r--r-- | mysql-test/r/ctype_utf8mb4_uca.result | 13 | ||||
-rw-r--r-- | strings/ctype-uca.c | 98 |
7 files changed, 150 insertions, 21 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h index 497ba6de927..dcbfb604f03 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -131,6 +131,7 @@ typedef struct my_uca_level_info_st uchar *lengths; uint16 **weights; MY_CONTRACTIONS contractions; + uint levelno; } MY_UCA_WEIGHT_LEVEL; diff --git a/mysql-test/include/ctype_uca_w2.inc b/mysql-test/include/ctype_uca_w2.inc index 86ed7ddd134..b59f73b8a63 100644 --- a/mysql-test/include/ctype_uca_w2.inc +++ b/mysql-test/include/ctype_uca_w2.inc @@ -24,6 +24,13 @@ DROP TABLE t1; CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0; SHOW CREATE TABLE t1; +INSERT INTO t1 VALUES (_ucs2 0x3400); +INSERT INTO t1 VALUES (_ucs2 0xF001); +SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1; +DROP TABLE t1; + +CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0; +SHOW CREATE TABLE t1; INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å'); SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a; SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a DESC; diff --git a/mysql-test/r/ctype_uca.result b/mysql-test/r/ctype_uca.result index 1609e7fc320..d9cba536814 100644 --- a/mysql-test/r/ctype_uca.result +++ b/mysql-test/r/ctype_uca.result @@ -14033,6 +14033,19 @@ Table Create Table t1 CREATE TABLE `t1` ( `a` varchar(10) CHARACTER SET utf8 COLLATE utf8_thai_520_w2 NOT NULL ) ENGINE=MyISAM DEFAULT CHARSET=latin1 +INSERT INTO t1 VALUES (_ucs2 0x3400); +INSERT INTO t1 VALUES (_ucs2 0xF001); +SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1; +ucs2 HEX(a) HEX(WEIGHT_STRING(a)) +3400 E39080 FB80B4000020 +F001 EF8081 FBC1F0010020 +DROP TABLE t1; +CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` varchar(10) CHARACTER SET utf8 COLLATE utf8_thai_520_w2 NOT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å'); SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a; a HEX(WEIGHT_STRING(a LEVEL 2)) @@ -14714,6 +14727,19 @@ Table Create Table t1 CREATE TABLE `t1` ( `a` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_thai_520_w2 NOT NULL ) ENGINE=MyISAM DEFAULT CHARSET=latin1 +INSERT INTO t1 VALUES (_ucs2 0x3400); +INSERT INTO t1 VALUES (_ucs2 0xF001); +SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1; +ucs2 HEX(a) HEX(WEIGHT_STRING(a)) +3400 3400 FB80B4000020 +F001 F001 FBC1F0010020 +DROP TABLE t1; +CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_thai_520_w2 NOT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å'); SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a; a HEX(WEIGHT_STRING(a LEVEL 2)) diff --git a/mysql-test/r/ctype_utf16_uca.result b/mysql-test/r/ctype_utf16_uca.result index 1ee06062fb6..1e4c77ea83d 100644 --- a/mysql-test/r/ctype_utf16_uca.result +++ b/mysql-test/r/ctype_utf16_uca.result @@ -6663,6 +6663,19 @@ Table Create Table t1 CREATE TABLE `t1` ( `a` varchar(10) CHARACTER SET utf16 COLLATE utf16_thai_520_w2 NOT NULL ) ENGINE=MyISAM DEFAULT CHARSET=latin1 +INSERT INTO t1 VALUES (_ucs2 0x3400); +INSERT INTO t1 VALUES (_ucs2 0xF001); +SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1; +ucs2 HEX(a) HEX(WEIGHT_STRING(a)) +3400 3400 FB80B4000020 +F001 F001 FBC1F0010020 +DROP TABLE t1; +CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` varchar(10) CHARACTER SET utf16 COLLATE utf16_thai_520_w2 NOT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å'); SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a; a HEX(WEIGHT_STRING(a LEVEL 2)) diff --git a/mysql-test/r/ctype_utf32_uca.result b/mysql-test/r/ctype_utf32_uca.result index dab23a80a43..234a01bb108 100644 --- a/mysql-test/r/ctype_utf32_uca.result +++ b/mysql-test/r/ctype_utf32_uca.result @@ -6683,6 +6683,19 @@ Table Create Table t1 CREATE TABLE `t1` ( `a` varchar(10) CHARACTER SET utf32 COLLATE utf32_thai_520_w2 NOT NULL ) ENGINE=MyISAM DEFAULT CHARSET=latin1 +INSERT INTO t1 VALUES (_ucs2 0x3400); +INSERT INTO t1 VALUES (_ucs2 0xF001); +SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1; +ucs2 HEX(a) HEX(WEIGHT_STRING(a)) +3400 00003400 FB80B4000020 +F001 0000F001 FBC1F0010020 +DROP TABLE t1; +CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` varchar(10) CHARACTER SET utf32 COLLATE utf32_thai_520_w2 NOT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å'); SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a; a HEX(WEIGHT_STRING(a LEVEL 2)) diff --git a/mysql-test/r/ctype_utf8mb4_uca.result b/mysql-test/r/ctype_utf8mb4_uca.result index b711dfb65c8..4792d746709 100644 --- a/mysql-test/r/ctype_utf8mb4_uca.result +++ b/mysql-test/r/ctype_utf8mb4_uca.result @@ -5373,6 +5373,19 @@ Table Create Table t1 CREATE TABLE `t1` ( `a` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_thai_520_w2 NOT NULL ) ENGINE=MyISAM DEFAULT CHARSET=latin1 +INSERT INTO t1 VALUES (_ucs2 0x3400); +INSERT INTO t1 VALUES (_ucs2 0xF001); +SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1; +ucs2 HEX(a) HEX(WEIGHT_STRING(a)) +3400 E39080 FB80B4000020 +F001 EF8081 FBC1F0010020 +DROP TABLE t1; +CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_thai_520_w2 NOT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å'); SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a; a HEX(WEIGHT_STRING(a LEVEL 2)) diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index 8a092e8f085..cebb723cebb 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -6539,7 +6539,8 @@ MY_UCA_INFO my_uca_v400= 0, /* nitems */ NULL, /* item */ NULL /* flags */ - } + }, + 0 /* levelno */ }, }, @@ -30084,7 +30085,8 @@ MY_UCA_INFO my_uca_v520_th= THAI_CONTRACTIONS, /* nitems */ thai_contractions, /* item */ NULL /* flags */ - } + }, + 0 /* levelno */ }, { 0x10FFFF, /* maxchar */ @@ -30094,7 +30096,8 @@ MY_UCA_INFO my_uca_v520_th= THAI_CONTRACTIONS_W2, /* nitems */ thai_contractions_w2, /* item */ NULL /* flags */ - } + }, + 1 /* levelno */ }, }, @@ -30127,8 +30130,9 @@ MY_UCA_INFO my_uca_v520= { /* Contractions: */ 0, /* nitems */ NULL, /* item */ - NULL /* flags */ - } + NULL /* flags */ + }, + 0 /* levelno */ }, }, @@ -31529,37 +31533,88 @@ my_uca_previous_context_find(my_uca_scanner *scanner, /****************************************************************/ +/** + Implicit weights for a code CP are constructed as follows: + [.AAAA.0020.0002][.BBBB.0000.0000] + + where: + AAAA= BASE + (CP >> 15); + BBBB= (CP & 0x7FFF) | 0x8000; + + There are two weights in the primary level (AAAA followed by BBBB). + There is one weight on other levels: + - 0020 on the secondary level + - 0002 on the tertiary level +*/ + + +/** + Return BASE for an implicit weight on the primary level + + According to UCA, BASE is calculated as follows: + - FB40 for Unified_Ideograph=True AND + ((Block=CJK_Unified_Ideograph) OR + (Block=CJK_Compatibility_Ideographs)) + - FB80 for Unified_Ideograph=True AND NOT + ((Block=CJK_Unified_Ideograph) OR + (Block=CJK_Compatibility_Ideographs)) + - FBC0 for any other code point + TODO: it seems we're not handling BASE correctly: + - check what are those blocks + - there are more Unified Ideograph blocks in the latest Unicode versions +*/ +static inline uint16 +my_uca_implicit_weight_base(my_wc_t code) +{ + if (code >= 0x3400 && code <= 0x4DB5) + return 0xFB80; + if (code >= 0x4E00 && code <= 0x9FA5) + return 0xFB40; + return 0xFBC0; +} + /** - Return implicit UCA weight + Return an implicit UCA weight for the primary level. Used for characters that do not have assigned UCA weights. @param scanner UCA weight scanner @return The leading implicit weight. + + The second weight is stored in scanner->implicit[0] + and is later returned on the next my_uca_scanner_next_any() call. */ static inline int -my_uca_scanner_next_implicit(my_uca_scanner *scanner) +my_uca_scanner_next_implicit_primary(my_uca_scanner *scanner) { - scanner->code= (scanner->page << 8) + scanner->code; - scanner->implicit[0]= (scanner->code & 0x7FFF) | 0x8000; - scanner->implicit[1]= 0; + my_wc_t wc= (scanner->page << 8) + scanner->code; + scanner->implicit[0]= (wc & 0x7FFF) | 0x8000; /* The second weight */ + scanner->implicit[1]= 0; /* 0 terminator */ scanner->wbeg= scanner->implicit; - - scanner->page= scanner->page >> 7; - - if (scanner->code >= 0x3400 && scanner->code <= 0x4DB5) - scanner->page+= 0xFB80; - else if (scanner->code >= 0x4E00 && scanner->code <= 0x9FA5) - scanner->page+= 0xFB40; - else - scanner->page+= 0xFBC0; - - return scanner->page; + return my_uca_implicit_weight_base(wc) + (wc >> 15); } +/** + Return an implicit weight for the current level + (according to scanner->level->levelno). + +*/ +static inline int +my_uca_scanner_next_implicit(my_uca_scanner *scanner) +{ + switch (scanner->level->levelno) { + case 0: return my_uca_scanner_next_implicit_primary(scanner);/* Primary level*/ + case 1: scanner->wbeg= nochar; return 0x0020; /* Secondary level */ + case 2: scanner->wbeg= nochar; return 0x0002; /* Tertiary level */ + default: scanner->wbeg= nochar; break; + } + DBUG_ASSERT(0); + return 0; +} + /* The same two functions for any character set */ @@ -33829,6 +33884,7 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, int level, size_t i, npages= (src->maxchar + 1) / 256; dst->maxchar= src->maxchar; + dst->levelno= src->levelno; if (check_rules(loader, rules, dst, src)) return TRUE; |