summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Alexander Barkov <bar@mariadb.com> 2021-11-14 07:09:08 +0400
committer: Alexander Barkov <bar@mariadb.com> 2021-11-24 13:45:35 +0400
commit: f9ad8072cdb6376a3cf5384c76a85beb905f5dd8 (patch)
tree: 64c04f66a12781e5a30c07aa220208426483f357
parent: 0a3d1d106ae6bbbac3f169080f06c1b1b1f606ac (diff)
download: mariadb-git-bb-bar-10.8.tar.gz
MDEV-27042 UCA: Resetting contractions to ignorable does not work well [branch: bb-bar-10.8]
The weight scanner routine scanner_next() did not properly handle the cases when a contraction produces no weights (is ignorable).

Adding a helper routine my_uca_scanner_set_weight() and using it in all cases:
- A single ASCII character
- A contraction starting with an ASCII character
- A multi-byte character
- A contraction starting with a multi-byte character

Also adding two other helper routines:
- my_uca_scanner_next_expansion_weight()
- my_uca_scanner_set_weight_outside_maxchar()
to avoid using scanner->wbeg directly inside scanner_next(). This reduces the probability of similar future bugs.
-rw-r--r-- mysql-test/main/ctype_ldml.result | 37
-rw-r--r-- mysql-test/main/ctype_ldml.test | 21
-rw-r--r-- mysql-test/std_data/ldml/Index.xml | 3
-rw-r--r-- strings/ctype-uca.c | 27
-rw-r--r-- strings/ctype-uca.ic | 62
5 files changed, 114 insertions, 36 deletions
diff --git a/mysql-test/main/ctype_ldml.result b/mysql-test/main/ctype_ldml.result
index 05b31a4ea70..d4b24ad9af3 100644
--- a/mysql-test/main/ctype_ldml.result
+++ b/mysql-test/main/ctype_ldml.result
@@ -34,6 +34,9 @@ Bar +7-912-800-80-01
SELECT * FROM t1 WHERE phone='7 9 1 2 8 0 0 8 0 0 1';
name phone
Bar +7-912-800-80-01
+SELECT * FROM t1 WHERE phone='tel.79128008001';
+name phone
+Bar +7-912-800-80-01
DROP TABLE t1;
show collation like 'utf8mb3_test_ci';
Collation Charset Id Default Compiled Sortlen
@@ -3042,3 +3045,37 @@ SHOW COLLATION LIKE 'latin1_test_replace';
Collation Charset Id Default Compiled Sortlen
SELECT 'foo' = 'foo ' COLLATE latin1_test_replace;
ERROR HY000: Unknown collation: 'latin1_test_replace'
+#
+# MDEV-27042 UCA: Resetting contractions to ignorable does not work well
+#
+CREATE TABLE t1 (
+phone VARCHAR(64) CHARACTER SET utf8 COLLATE utf8_phone_ci
+);
+INSERT INTO t1 VALUES ('123');
+INSERT INTO t1 VALUES ('tel.123');
+INSERT INTO t1 VALUES ('tél.123');
+INSERT INTO t1 VALUES ('tèl.123');
+INSERT INTO t1 VALUES ('ťel.123');
+INSERT INTO t1 VALUES ('ťèl.123');
+INSERT INTO t1 VALUES ('tex.123');
+SELECT * FROM t1 WHERE phone='123' ORDER BY BINARY phone;
+phone
+123
+tel.123
+tél.123
+ťel.123
+SELECT * FROM t1 WHERE phone<>'123' ORDER BY BINARY phone;
+phone
+tex.123
+tèl.123
+ťèl.123
+SELECT phone, HEX(WEIGHT_STRING(phone)) FROM t1 ORDER BY phone, BINARY phone;
+phone HEX(WEIGHT_STRING(phone))
+123 0E2A0E2B0E2C
+tel.123 0E2A0E2B0E2C
+tél.123 0E2A0E2B0E2C
+ťel.123 0E2A0E2B0E2C
+tèl.123 10020E8B0F2E025D0E2A0E2B0E2C
+ťèl.123 10020E8B0F2E025D0E2A0E2B0E2C
+tex.123 10020E8B105A025D0E2A0E2B0E2C
+DROP TABLE t1;
diff --git a/mysql-test/main/ctype_ldml.test b/mysql-test/main/ctype_ldml.test
index 155e584935b..0fda35d278d 100644
--- a/mysql-test/main/ctype_ldml.test
+++ b/mysql-test/main/ctype_ldml.test
@@ -33,6 +33,7 @@ SELECT * FROM t1 ORDER BY phone;
SELECT * FROM t1 WHERE phone='+7(912)800-80-01';
SELECT * FROM t1 WHERE phone='79128008001';
SELECT * FROM t1 WHERE phone='7 9 1 2 8 0 0 8 0 0 1';
+SELECT * FROM t1 WHERE phone='tel.79128008001';
DROP TABLE t1;
show collation like 'utf8mb3_test_ci';
@@ -615,3 +616,23 @@ SELECT 'a' COLLATE utf8_czech_test_bad_w2;
SHOW COLLATION LIKE 'latin1_test_replace';
--error ER_UNKNOWN_COLLATION
SELECT 'foo' = 'foo ' COLLATE latin1_test_replace;
+
+
+--echo #
+--echo # MDEV-27042 UCA: Resetting contractions to ignorable does not work well
+--echo #
+
+CREATE TABLE t1 (
+ phone VARCHAR(64) CHARACTER SET utf8 COLLATE utf8_phone_ci
+);
+INSERT INTO t1 VALUES ('123');
+INSERT INTO t1 VALUES ('tel.123');
+INSERT INTO t1 VALUES ('tél.123');
+INSERT INTO t1 VALUES ('tèl.123');
+INSERT INTO t1 VALUES ('ťel.123');
+INSERT INTO t1 VALUES ('ťèl.123');
+INSERT INTO t1 VALUES ('tex.123');
+SELECT * FROM t1 WHERE phone='123' ORDER BY BINARY phone;
+SELECT * FROM t1 WHERE phone<>'123' ORDER BY BINARY phone;
+SELECT phone, HEX(WEIGHT_STRING(phone)) FROM t1 ORDER BY phone, BINARY phone;
+DROP TABLE t1;
diff --git a/mysql-test/std_data/ldml/Index.xml b/mysql-test/std_data/ldml/Index.xml
index cd4ddde3d72..a6fee091d02 100644
--- a/mysql-test/std_data/ldml/Index.xml
+++ b/mysql-test/std_data/ldml/Index.xml
@@ -9,6 +9,9 @@
<i>\u0029</i> <!-- right parenthesis -->
<i>\u002B</i> <!-- plus -->
<i>\u002D</i> <!-- hyphen -->
+ <i>tel.</i>
+ <i>tél.</i>
+ <i>ťel.</i>
</rules>
</collation>
<collation name="utf8mb3_test_ci" id="353">
diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c
index 0cc7052c230..bdb2b4e3e75 100644
--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
@@ -31175,6 +31175,33 @@ static const uint16 nochar[]= {0,0};
#define MY_UCA_PREVIOUS_CONTEXT_HEAD 64
#define MY_UCA_PREVIOUS_CONTEXT_TAIL 128
+
+static inline uint16 /* Returns the pending weight at scanner->wbeg, or 0 when none is pending */
+my_uca_scanner_next_expansion_weight(my_uca_scanner *scanner)
+{
+ if (scanner->wbeg[0]) /* A non-zero weight is still queued at wbeg */
+ return *scanner->wbeg++; /* Hand it out and step wbeg to the following weight */
+ return 0; /* wbeg points at 0: nothing left; wbeg is not advanced */
+}
+
+
+static inline uint16 /* Points the scanner at a weight string and returns that string's first weight */
+my_uca_scanner_set_weight(my_uca_scanner *scanner, const uint16 *weight)
+{
+ scanner->wbeg= weight + 1; /* Remaining weights, if any, start right after the first one */
+ return *weight; /* May be 0 (no weight); wbeg is then already one past that 0. NOTE(review): the visible callers in scanner_next() "continue" and set wbeg again before it is read - confirm for the end-of-input path */
+}
+
+
+static inline uint16 /* Returns the fixed weight 0xFFFD and leaves no further weights pending */
+my_uca_scanner_set_weight_outside_maxchar(my_uca_scanner *scanner)
+{
+ /* Return 0xFFFD as weight; scanner_next() calls this when currwc > scanner->level->maxchar (originally described as "all characters outside BMP") */
+ scanner->wbeg= nochar; /* nochar is {0,0}: the next expansion lookup sees 0, i.e. nothing pending */
+ return 0xFFFD;
+}
+
+
/********** Helper functions to handle contraction ************/
diff --git a/strings/ctype-uca.ic b/strings/ctype-uca.ic
index 7adf802b25c..186b6436b76 100644
--- a/strings/ctype-uca.ic
+++ b/strings/ctype-uca.ic
@@ -40,20 +40,16 @@
static inline int
MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
{
- /*
- Check if the weights for the previous character have been
- already fully scanned. If yes, then get the next character and
- initialize wbeg and wlength to its weight string.
- */
-
- if (scanner->wbeg[0]) /* More weights left from the previous step: */
- return *scanner->wbeg++; /* return the next weight from expansion */
+ uint16 weight= my_uca_scanner_next_expansion_weight(scanner);
+ if (weight)
+ return weight; /* Next expansion weight found */
- do
+ for ( ; ; )
{
const uint16 *wpage;
int mblen;
my_wc_t currwc;
+ const uint16 *cweight;
/* Get next character */
#if MY_UCA_ASCII_OPTIMIZE
@@ -64,23 +60,21 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
scanner->sbeg+= 1;
#if MY_UCA_COMPILE_CONTRACTIONS
- if (my_uca_needs_context_handling(scanner->level, currwc))
+ if (my_uca_needs_context_handling(scanner->level, currwc) &&
+ (cweight= my_uca_context_weight_find(scanner, currwc)))
{
- const uint16 *cweight= my_uca_context_weight_find(scanner, currwc);
- if (cweight)
- {
- scanner->wbeg= cweight + 1;
- return *cweight;
- }
+ if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
+ return weight;
+ continue; /* Ignorable contraction */
}
#endif
scanner->page= 0;
scanner->code= (int) currwc;
- scanner->wbeg= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0];
- if (scanner->wbeg[0])
- return *scanner->wbeg++;
- continue;
+ cweight= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0];
+ if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
+ return weight;
+ continue; /* Ignorable character */
}
else
#endif
@@ -109,21 +103,15 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
scanner->sbeg+= mblen;
if (currwc > scanner->level->maxchar)
- {
- /* Return 0xFFFD as weight for all characters outside BMP */
- scanner->wbeg= nochar;
- return 0xFFFD;
- }
+ return my_uca_scanner_set_weight_outside_maxchar(scanner);
#if MY_UCA_COMPILE_CONTRACTIONS
- if (my_uca_needs_context_handling(scanner->level, currwc))
+ if (my_uca_needs_context_handling(scanner->level, currwc) &&
+ (cweight= my_uca_context_weight_find(scanner, currwc)))
{
- const uint16 *cweight= my_uca_context_weight_find(scanner, currwc);
- if (cweight)
- {
- scanner->wbeg= cweight + 1;
- return *cweight;
- }
+ if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
+ return weight;
+ continue; /* Ignorable contraction */
}
#endif
@@ -136,11 +124,13 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
return my_uca_scanner_next_implicit(scanner);
/* Calculate pointer to w[0]'s weight, using page and offset */
- scanner->wbeg= wpage +
- scanner->code * scanner->level->lengths[scanner->page];
- } while (!scanner->wbeg[0]); /* Skip ignorable characters */
+ cweight= wpage + scanner->code * scanner->level->lengths[scanner->page];
+ if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
+ return weight;
+ continue; /* Ignorable character */
+ }
- return *scanner->wbeg++;
+ return 0;
}