diff options
author | Alexander Barkov <bar@mariadb.com> | 2021-11-14 07:09:08 +0400 |
---|---|---|
committer | Alexander Barkov <bar@mariadb.com> | 2021-11-24 13:45:35 +0400 |
commit | f9ad8072cdb6376a3cf5384c76a85beb905f5dd8 (patch) | |
tree | 64c04f66a12781e5a30c07aa220208426483f357 | |
parent | 0a3d1d106ae6bbbac3f169080f06c1b1b1f606ac (diff) | |
download | mariadb-git-bb-bar-10.8.tar.gz |
MDEV-27042 UCA: Resetting contractions to ignorable does not work well [branch: bb-bar-10.8]
The weight scanner routine scanner_next() did not properly handle the cases
when a contraction produces no weights (is ignorable).
Adding a helper routine my_uca_scanner_set_weight() and using
it in all cases:
- A single ASCII character
- A contraction starting with an ASCII character
- A multi-byte character
- A contraction starting with a multi-byte character
Also adding two other helper routines:
- my_uca_scanner_next_expansion_weight()
- my_uca_scanner_set_weight_outside_maxchar()
to avoid using scanner->wbeg directly inside scanner_next().
This reduces the probability of similar future bugs.
-rw-r--r-- | mysql-test/main/ctype_ldml.result | 37 |
-rw-r--r-- | mysql-test/main/ctype_ldml.test | 21 |
-rw-r--r-- | mysql-test/std_data/ldml/Index.xml | 3 |
-rw-r--r-- | strings/ctype-uca.c | 27 |
-rw-r--r-- | strings/ctype-uca.ic | 62 |
5 files changed, 114 insertions, 36 deletions
diff --git a/mysql-test/main/ctype_ldml.result b/mysql-test/main/ctype_ldml.result index 05b31a4ea70..d4b24ad9af3 100644 --- a/mysql-test/main/ctype_ldml.result +++ b/mysql-test/main/ctype_ldml.result @@ -34,6 +34,9 @@ Bar +7-912-800-80-01 SELECT * FROM t1 WHERE phone='7 9 1 2 8 0 0 8 0 0 1'; name phone Bar +7-912-800-80-01 +SELECT * FROM t1 WHERE phone='tel.79128008001'; +name phone +Bar +7-912-800-80-01 DROP TABLE t1; show collation like 'utf8mb3_test_ci'; Collation Charset Id Default Compiled Sortlen @@ -3042,3 +3045,37 @@ SHOW COLLATION LIKE 'latin1_test_replace'; Collation Charset Id Default Compiled Sortlen SELECT 'foo' = 'foo ' COLLATE latin1_test_replace; ERROR HY000: Unknown collation: 'latin1_test_replace' +# +# MDEV-27042 UCA: Resetting contractions to ignorable does not work well +# +CREATE TABLE t1 ( +phone VARCHAR(64) CHARACTER SET utf8 COLLATE utf8_phone_ci +); +INSERT INTO t1 VALUES ('123'); +INSERT INTO t1 VALUES ('tel.123'); +INSERT INTO t1 VALUES ('tél.123'); +INSERT INTO t1 VALUES ('tèl.123'); +INSERT INTO t1 VALUES ('ťel.123'); +INSERT INTO t1 VALUES ('ťèl.123'); +INSERT INTO t1 VALUES ('tex.123'); +SELECT * FROM t1 WHERE phone='123' ORDER BY BINARY phone; +phone +123 +tel.123 +tél.123 +ťel.123 +SELECT * FROM t1 WHERE phone<>'123' ORDER BY BINARY phone; +phone +tex.123 +tèl.123 +ťèl.123 +SELECT phone, HEX(WEIGHT_STRING(phone)) FROM t1 ORDER BY phone, BINARY phone; +phone HEX(WEIGHT_STRING(phone)) +123 0E2A0E2B0E2C +tel.123 0E2A0E2B0E2C +tél.123 0E2A0E2B0E2C +ťel.123 0E2A0E2B0E2C +tèl.123 10020E8B0F2E025D0E2A0E2B0E2C +ťèl.123 10020E8B0F2E025D0E2A0E2B0E2C +tex.123 10020E8B105A025D0E2A0E2B0E2C +DROP TABLE t1; diff --git a/mysql-test/main/ctype_ldml.test b/mysql-test/main/ctype_ldml.test index 155e584935b..0fda35d278d 100644 --- a/mysql-test/main/ctype_ldml.test +++ b/mysql-test/main/ctype_ldml.test @@ -33,6 +33,7 @@ SELECT * FROM t1 ORDER BY phone; SELECT * FROM t1 WHERE phone='+7(912)800-80-01'; SELECT * FROM t1 WHERE phone='79128008001'; SELECT 
* FROM t1 WHERE phone='7 9 1 2 8 0 0 8 0 0 1'; +SELECT * FROM t1 WHERE phone='tel.79128008001'; DROP TABLE t1; show collation like 'utf8mb3_test_ci'; @@ -615,3 +616,23 @@ SELECT 'a' COLLATE utf8_czech_test_bad_w2; SHOW COLLATION LIKE 'latin1_test_replace'; --error ER_UNKNOWN_COLLATION SELECT 'foo' = 'foo ' COLLATE latin1_test_replace; + + +--echo # +--echo # MDEV-27042 UCA: Resetting contractions to ignorable does not work well +--echo # + +CREATE TABLE t1 ( + phone VARCHAR(64) CHARACTER SET utf8 COLLATE utf8_phone_ci +); +INSERT INTO t1 VALUES ('123'); +INSERT INTO t1 VALUES ('tel.123'); +INSERT INTO t1 VALUES ('tél.123'); +INSERT INTO t1 VALUES ('tèl.123'); +INSERT INTO t1 VALUES ('ťel.123'); +INSERT INTO t1 VALUES ('ťèl.123'); +INSERT INTO t1 VALUES ('tex.123'); +SELECT * FROM t1 WHERE phone='123' ORDER BY BINARY phone; +SELECT * FROM t1 WHERE phone<>'123' ORDER BY BINARY phone; +SELECT phone, HEX(WEIGHT_STRING(phone)) FROM t1 ORDER BY phone, BINARY phone; +DROP TABLE t1; diff --git a/mysql-test/std_data/ldml/Index.xml b/mysql-test/std_data/ldml/Index.xml index cd4ddde3d72..a6fee091d02 100644 --- a/mysql-test/std_data/ldml/Index.xml +++ b/mysql-test/std_data/ldml/Index.xml @@ -9,6 +9,9 @@ <i>\u0029</i> <!-- right parenthesis --> <i>\u002B</i> <!-- plus --> <i>\u002D</i> <!-- hyphen --> + <i>tel.</i> + <i>tél.</i> + <i>ťel.</i> </rules> </collation> <collation name="utf8mb3_test_ci" id="353"> diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index 0cc7052c230..bdb2b4e3e75 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -31175,6 +31175,33 @@ static const uint16 nochar[]= {0,0}; #define MY_UCA_PREVIOUS_CONTEXT_HEAD 64 #define MY_UCA_PREVIOUS_CONTEXT_TAIL 128 + +static inline uint16 +my_uca_scanner_next_expansion_weight(my_uca_scanner *scanner) +{ + if (scanner->wbeg[0]) + return *scanner->wbeg++; + return 0; +} + + +static inline uint16 +my_uca_scanner_set_weight(my_uca_scanner *scanner, const uint16 *weight) +{ + scanner->wbeg= weight + 1; + 
return *weight; +} + + +static inline uint16 +my_uca_scanner_set_weight_outside_maxchar(my_uca_scanner *scanner) +{ + /* Return 0xFFFD as weight for all characters outside BMP */ + scanner->wbeg= nochar; + return 0xFFFD; +} + + /********** Helper functions to handle contraction ************/ diff --git a/strings/ctype-uca.ic b/strings/ctype-uca.ic index 7adf802b25c..186b6436b76 100644 --- a/strings/ctype-uca.ic +++ b/strings/ctype-uca.ic @@ -40,20 +40,16 @@ static inline int MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner) { - /* - Check if the weights for the previous character have been - already fully scanned. If yes, then get the next character and - initialize wbeg and wlength to its weight string. - */ - - if (scanner->wbeg[0]) /* More weights left from the previous step: */ - return *scanner->wbeg++; /* return the next weight from expansion */ + uint16 weight= my_uca_scanner_next_expansion_weight(scanner); + if (weight) + return weight; /* Next expansion weight found */ - do + for ( ; ; ) { const uint16 *wpage; int mblen; my_wc_t currwc; + const uint16 *cweight; /* Get next character */ #if MY_UCA_ASCII_OPTIMIZE @@ -64,23 +60,21 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner) scanner->sbeg+= 1; #if MY_UCA_COMPILE_CONTRACTIONS - if (my_uca_needs_context_handling(scanner->level, currwc)) + if (my_uca_needs_context_handling(scanner->level, currwc) && + (cweight= my_uca_context_weight_find(scanner, currwc))) { - const uint16 *cweight= my_uca_context_weight_find(scanner, currwc); - if (cweight) - { - scanner->wbeg= cweight + 1; - return *cweight; - } + if ((weight= my_uca_scanner_set_weight(scanner, cweight))) + return weight; + continue; /* Ignorable contraction */ } #endif scanner->page= 0; scanner->code= (int) currwc; - scanner->wbeg= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0]; - if (scanner->wbeg[0]) - return *scanner->wbeg++; - continue; + cweight= scanner->level->weights[0] + scanner->code * 
scanner->level->lengths[0]; + if ((weight= my_uca_scanner_set_weight(scanner, cweight))) + return weight; + continue; /* Ignorable character */ } else #endif @@ -109,21 +103,15 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner) scanner->sbeg+= mblen; if (currwc > scanner->level->maxchar) - { - /* Return 0xFFFD as weight for all characters outside BMP */ - scanner->wbeg= nochar; - return 0xFFFD; - } + return my_uca_scanner_set_weight_outside_maxchar(scanner); #if MY_UCA_COMPILE_CONTRACTIONS - if (my_uca_needs_context_handling(scanner->level, currwc)) + if (my_uca_needs_context_handling(scanner->level, currwc) && + (cweight= my_uca_context_weight_find(scanner, currwc))) { - const uint16 *cweight= my_uca_context_weight_find(scanner, currwc); - if (cweight) - { - scanner->wbeg= cweight + 1; - return *cweight; - } + if ((weight= my_uca_scanner_set_weight(scanner, cweight))) + return weight; + continue; /* Ignorable contraction */ } #endif @@ -136,11 +124,13 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner) return my_uca_scanner_next_implicit(scanner); /* Calculate pointer to w[0]'s weight, using page and offset */ - scanner->wbeg= wpage + - scanner->code * scanner->level->lengths[scanner->page]; - } while (!scanner->wbeg[0]); /* Skip ignorable characters */ + cweight= wpage + scanner->code * scanner->level->lengths[scanner->page]; + if ((weight= my_uca_scanner_set_weight(scanner, cweight))) + return weight; + continue; /* Ignorable character */ + } - return *scanner->wbeg++; + return 0; } |