summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Alexander Barkov <bar@mariadb.org> 2016-06-21 21:36:23 +0400
committer Alexander Barkov <bar@mariadb.org> 2016-06-21 21:36:23 +0400
commit 63120090f994cc78876944e9f7a76f53337fa46e (patch)
tree 2e84fe105bb38d1b3ab608495f01664b1d949e38
parent 61492ea5ddd14efd2ede257700ac4d8fba2a04e6 (diff)
download mariadb-git-63120090f994cc78876944e9f7a76f53337fa46e.tar.gz
MDEV-10262 ucs2_thai_520_w2: wrong implicit weights on the secondary level
-rw-r--r-- include/m_ctype.h 1
-rw-r--r-- mysql-test/include/ctype_uca_w2.inc 7
-rw-r--r-- mysql-test/r/ctype_uca.result 26
-rw-r--r-- mysql-test/r/ctype_utf16_uca.result 13
-rw-r--r-- mysql-test/r/ctype_utf32_uca.result 13
-rw-r--r-- mysql-test/r/ctype_utf8mb4_uca.result 13
-rw-r--r-- strings/ctype-uca.c 98
7 files changed, 150 insertions, 21 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h
index 497ba6de927..dcbfb604f03 100644
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -131,6 +131,7 @@ typedef struct my_uca_level_info_st
uchar *lengths;
uint16 **weights;
MY_CONTRACTIONS contractions;
+ uint levelno;
} MY_UCA_WEIGHT_LEVEL;
diff --git a/mysql-test/include/ctype_uca_w2.inc b/mysql-test/include/ctype_uca_w2.inc
index 86ed7ddd134..b59f73b8a63 100644
--- a/mysql-test/include/ctype_uca_w2.inc
+++ b/mysql-test/include/ctype_uca_w2.inc
@@ -24,6 +24,13 @@ DROP TABLE t1;
CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
SHOW CREATE TABLE t1;
+INSERT INTO t1 VALUES (_ucs2 0x3400);
+INSERT INTO t1 VALUES (_ucs2 0xF001);
+SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1;
+DROP TABLE t1;
+
+CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
+SHOW CREATE TABLE t1;
INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å');
SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a;
SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a DESC;
diff --git a/mysql-test/r/ctype_uca.result b/mysql-test/r/ctype_uca.result
index 1609e7fc320..d9cba536814 100644
--- a/mysql-test/r/ctype_uca.result
+++ b/mysql-test/r/ctype_uca.result
@@ -14033,6 +14033,19 @@ Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET utf8 COLLATE utf8_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES (_ucs2 0x3400);
+INSERT INTO t1 VALUES (_ucs2 0xF001);
+SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1;
+ucs2 HEX(a) HEX(WEIGHT_STRING(a))
+3400 E39080 FB80B4000020
+F001 EF8081 FBC1F0010020
+DROP TABLE t1;
+CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `a` varchar(10) CHARACTER SET utf8 COLLATE utf8_thai_520_w2 NOT NULL
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å');
SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a;
a HEX(WEIGHT_STRING(a LEVEL 2))
@@ -14714,6 +14727,19 @@ Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES (_ucs2 0x3400);
+INSERT INTO t1 VALUES (_ucs2 0xF001);
+SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1;
+ucs2 HEX(a) HEX(WEIGHT_STRING(a))
+3400 3400 FB80B4000020
+F001 F001 FBC1F0010020
+DROP TABLE t1;
+CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `a` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_thai_520_w2 NOT NULL
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å');
SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a;
a HEX(WEIGHT_STRING(a LEVEL 2))
diff --git a/mysql-test/r/ctype_utf16_uca.result b/mysql-test/r/ctype_utf16_uca.result
index 1ee06062fb6..1e4c77ea83d 100644
--- a/mysql-test/r/ctype_utf16_uca.result
+++ b/mysql-test/r/ctype_utf16_uca.result
@@ -6663,6 +6663,19 @@ Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET utf16 COLLATE utf16_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES (_ucs2 0x3400);
+INSERT INTO t1 VALUES (_ucs2 0xF001);
+SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1;
+ucs2 HEX(a) HEX(WEIGHT_STRING(a))
+3400 3400 FB80B4000020
+F001 F001 FBC1F0010020
+DROP TABLE t1;
+CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `a` varchar(10) CHARACTER SET utf16 COLLATE utf16_thai_520_w2 NOT NULL
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å');
SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a;
a HEX(WEIGHT_STRING(a LEVEL 2))
diff --git a/mysql-test/r/ctype_utf32_uca.result b/mysql-test/r/ctype_utf32_uca.result
index dab23a80a43..234a01bb108 100644
--- a/mysql-test/r/ctype_utf32_uca.result
+++ b/mysql-test/r/ctype_utf32_uca.result
@@ -6683,6 +6683,19 @@ Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET utf32 COLLATE utf32_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES (_ucs2 0x3400);
+INSERT INTO t1 VALUES (_ucs2 0xF001);
+SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1;
+ucs2 HEX(a) HEX(WEIGHT_STRING(a))
+3400 00003400 FB80B4000020
+F001 0000F001 FBC1F0010020
+DROP TABLE t1;
+CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `a` varchar(10) CHARACTER SET utf32 COLLATE utf32_thai_520_w2 NOT NULL
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å');
SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a;
a HEX(WEIGHT_STRING(a LEVEL 2))
diff --git a/mysql-test/r/ctype_utf8mb4_uca.result b/mysql-test/r/ctype_utf8mb4_uca.result
index b711dfb65c8..4792d746709 100644
--- a/mysql-test/r/ctype_utf8mb4_uca.result
+++ b/mysql-test/r/ctype_utf8mb4_uca.result
@@ -5373,6 +5373,19 @@ Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES (_ucs2 0x3400);
+INSERT INTO t1 VALUES (_ucs2 0xF001);
+SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1;
+ucs2 HEX(a) HEX(WEIGHT_STRING(a))
+3400 E39080 FB80B4000020
+F001 EF8081 FBC1F0010020
+DROP TABLE t1;
+CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `a` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_thai_520_w2 NOT NULL
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å');
SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a;
a HEX(WEIGHT_STRING(a LEVEL 2))
diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c
index 8a092e8f085..cebb723cebb 100644
--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
@@ -6539,7 +6539,8 @@ MY_UCA_INFO my_uca_v400=
0, /* nitems */
NULL, /* item */
NULL /* flags */
- }
+ },
+ 0 /* levelno */
},
},
@@ -30084,7 +30085,8 @@ MY_UCA_INFO my_uca_v520_th=
THAI_CONTRACTIONS, /* nitems */
thai_contractions, /* item */
NULL /* flags */
- }
+ },
+ 0 /* levelno */
},
{
0x10FFFF, /* maxchar */
@@ -30094,7 +30096,8 @@ MY_UCA_INFO my_uca_v520_th=
THAI_CONTRACTIONS_W2, /* nitems */
thai_contractions_w2, /* item */
NULL /* flags */
- }
+ },
+ 1 /* levelno */
},
},
@@ -30127,8 +30130,9 @@ MY_UCA_INFO my_uca_v520=
{ /* Contractions: */
0, /* nitems */
NULL, /* item */
- NULL /* flags */
- }
+ NULL /* flags */
+ },
+ 0 /* levelno */
},
},
@@ -31529,37 +31533,88 @@ my_uca_previous_context_find(my_uca_scanner *scanner,
/****************************************************************/
+/**
+ Implicit weights for a code CP are constructed as follows:
+ [.AAAA.0020.0002][.BBBB.0000.0000]
+
+ where:
+ AAAA= BASE + (CP >> 15);
+ BBBB= (CP & 0x7FFF) | 0x8000;
+
+ There are two weights on the primary level (AAAA followed by BBBB).
+ There is one weight on each of the other levels:
+ - 0020 on the secondary level
+ - 0002 on the tertiary level
+*/
+
+
+/**
+ Return BASE for an implicit weight on the primary level
+
+ According to UCA, BASE is calculated as follows:
+ - FB40 for Unified_Ideograph=True AND
+ ((Block=CJK_Unified_Ideograph) OR
+ (Block=CJK_Compatibility_Ideographs))
+ - FB80 for Unified_Ideograph=True AND NOT
+ ((Block=CJK_Unified_Ideograph) OR
+ (Block=CJK_Compatibility_Ideographs))
+ - FBC0 for any other code point
+ TODO: it seems we're not handling BASE correctly:
+ - check which code point ranges those blocks actually cover
+ (the code only tests U+3400..U+4DB5 and U+4E00..U+9FA5)
+ - there are more Unified Ideograph blocks in the latest Unicode versions
+*/
+static inline uint16
+my_uca_implicit_weight_base(my_wc_t code)
+{
+ if (code >= 0x3400 && code <= 0x4DB5)
+ return 0xFB80;
+ if (code >= 0x4E00 && code <= 0x9FA5)
+ return 0xFB40;
+ return 0xFBC0;
+}
+
/**
- Return implicit UCA weight
+ Return an implicit UCA weight for the primary level.
Used for characters that do not have assigned UCA weights.
@param scanner UCA weight scanner
@return The leading implicit weight.
+
+ The second weight is stored in scanner->implicit[0]
+ and is later returned on the next my_uca_scanner_next_any() call.
*/
static inline int
-my_uca_scanner_next_implicit(my_uca_scanner *scanner)
+my_uca_scanner_next_implicit_primary(my_uca_scanner *scanner)
{
- scanner->code= (scanner->page << 8) + scanner->code;
- scanner->implicit[0]= (scanner->code & 0x7FFF) | 0x8000;
- scanner->implicit[1]= 0;
+ my_wc_t wc= (scanner->page << 8) + scanner->code;
+ scanner->implicit[0]= (wc & 0x7FFF) | 0x8000; /* The second weight */
+ scanner->implicit[1]= 0; /* 0 terminator */
scanner->wbeg= scanner->implicit;
-
- scanner->page= scanner->page >> 7;
-
- if (scanner->code >= 0x3400 && scanner->code <= 0x4DB5)
- scanner->page+= 0xFB80;
- else if (scanner->code >= 0x4E00 && scanner->code <= 0x9FA5)
- scanner->page+= 0xFB40;
- else
- scanner->page+= 0xFBC0;
-
- return scanner->page;
+ return my_uca_implicit_weight_base(wc) + (wc >> 15);
}
+/**
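+ Return an implicit weight for the current level
+ (according to scanner->level->levelno):
+ - level 0 (primary): the leading weight returned by
+ my_uca_scanner_next_implicit_primary()
+ - level 1 (secondary): 0x0020
+ - level 2 (tertiary): 0x0002
+ Any other level is unexpected: it hits DBUG_ASSERT(0) and returns 0.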
+*/
+static inline int
+my_uca_scanner_next_implicit(my_uca_scanner *scanner)
+{
+ switch (scanner->level->levelno) {
+ case 0: return my_uca_scanner_next_implicit_primary(scanner);/* Primary level*/
+ case 1: scanner->wbeg= nochar; return 0x0020; /* Secondary level */
+ case 2: scanner->wbeg= nochar; return 0x0002; /* Tertiary level */
+ default: scanner->wbeg= nochar; break;
+ }
+ DBUG_ASSERT(0);
+ return 0;
+}
+
/*
The same two functions for any character set
*/
@@ -33829,6 +33884,7 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, int level,
size_t i, npages= (src->maxchar + 1) / 256;
dst->maxchar= src->maxchar;
+ dst->levelno= src->levelno;
if (check_rules(loader, rules, dst, src))
return TRUE;