diff options
author | Alexander Barkov <bar@mnogosearch.org> | 2013-10-31 14:24:24 +0400 |
---|---|---|
committer | Alexander Barkov <bar@mnogosearch.org> | 2013-10-31 14:24:24 +0400 |
commit | bd3dc54261f10f387a03ad99ce74c3824c42e462 (patch) | |
tree | 2eb1a284095b7d7bd28368bab9e229880a56fc95 | |
parent | eea91f633f903b8c223b7d470e4be7366cbf57c8 (diff) | |
download | mariadb-git-bd3dc54261f10f387a03ad99ce74c3824c42e462.tar.gz |
A few minor Unicode collation customization improvements were made,
which make it possible to add more world language collations
with very complex collation rules (e.g. Myanmar):
- The weight string for a single character in a user-defined collation
was erroneously limited to 7 weights (instead of 8 weights).
Added an extra element to the user-defined weight arrays,
to fit 8 non-zero weights.
- The weight string limit for contractions was doubled (to 16 weights),
which allows longer contractions without affecting the performance
of filesort.
- A user-defined collation now refuses to initialize and reports an error
if a weight string gets longer than 8 weights for a single character,
or longer than 16 weights for a contraction. Previously, weight strings
for such characters (and contractions) were silently truncated, so a
collation could start with wrong rules.
- Fixed a bug in handling rules like "&a << b" in combination with
shift-after-method="expand". The primary weight for "b" was not
correctly calculated, which erroneously made "b" primary greater than "a"
instead of primary equal to "a".
-rw-r--r-- | include/m_ctype.h | 16 | ||||
-rw-r--r-- | mysql-test/r/ctype_ldml.result | 10 | ||||
-rw-r--r-- | mysql-test/std_data/Index.xml | 14 | ||||
-rw-r--r-- | mysql-test/t/ctype_ldml.test | 4 | ||||
-rw-r--r-- | strings/ctype-uca.c | 140 |
5 files changed, 136 insertions, 48 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h index b9682df12bf..1b60f2091b1 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -88,13 +88,25 @@ extern MY_UNICASE_INFO my_unicase_mysql500; extern MY_UNICASE_INFO my_unicase_unicode520; #define MY_UCA_MAX_CONTRACTION 6 -#define MY_UCA_MAX_WEIGHT_SIZE 8 +/* + The DUCET tables in ctype-uca.c are dumped with a limit of 8 weights + per character. cs->strxfrm_multiply is set to 8 for all UCA based collations. + + In language-specific UCA collations (with tailorings) we also do not allow + a single character to have more than 8 weights to stay with the same + strxfrm_multiply limit. Note, contractions are allowed to have twice longer + weight strings (up to 16 weights). As a contraction consists of at + least 2 characters, this makes sure that strxfrm_multiply ratio of 8 + is respected. +*/ +#define MY_UCA_MAX_WEIGHT_SIZE (8+1) /* Including 0 terminator */ +#define MY_UCA_CONTRACTION_MAX_WEIGHT_SIZE (2*8+1) /* Including 0 terminator */ #define MY_UCA_WEIGHT_LEVELS 1 typedef struct my_contraction_t { my_wc_t ch[MY_UCA_MAX_CONTRACTION]; /* Character sequence */ - uint16 weight[MY_UCA_MAX_WEIGHT_SIZE];/* Its weight string, 0-terminated */ + uint16 weight[MY_UCA_CONTRACTION_MAX_WEIGHT_SIZE];/* Its weight string, 0-terminated */ my_bool with_context; } MY_CONTRACTION; diff --git a/mysql-test/r/ctype_ldml.result b/mysql-test/r/ctype_ldml.result index 0373d74afb4..bd96a1f6255 100644 --- a/mysql-test/r/ctype_ldml.result +++ b/mysql-test/r/ctype_ldml.result @@ -425,6 +425,7 @@ ucs2_test_ci ucs2 358 8 ucs2_vn_ci ucs2 359 8 ucs2_5624_1 ucs2 360 8 utf8_5624_5 utf8 368 8 +utf8_5624_5_bad utf8 369 8 utf32_test_ci utf32 391 8 utf8_maxuserid_ci utf8 2047 8 show collation like '%test%'; @@ -1030,9 +1031,12 @@ INSERT INTO t1 VALUES ('A'),('B'),('C'),('D'),('E'),('F'),('G'),('H'),('I'); INSERT INTO t1 VALUES ('J'),('K'),('L'),('M'),('N'),('O'),('P'),('Q'),('R'); INSERT INTO t1 VALUES 
('S'),('T'),('U'),('V'),('W'),('X'),('Y'),('Z'); INSERT INTO t1 VALUES ('AA'),('AAA'); +INSERT INTO t1 VALUES ('001'),('002'); SELECT a, HEX(WEIGHT_STRING(a)) FROM t1 ORDER BY a, LENGTH(a), BINARY(a); a HEX(WEIGHT_STRING(a)) 0 0E29 +001 0E29 +002 0E29 0z 0E290E292357 0ン 0E291E81 a 0E29233E @@ -1093,6 +1097,12 @@ AA 0E293358 AAA 0E293359 1 0E2A DROP TABLE t1; +SET NAMES utf8 COLLATE utf8_5624_5_bad; +ERROR HY000: Unknown collation: 'utf8_5624_5_bad' +SHOW WARNINGS; +Level Code Message +Error 1273 Unknown collation: 'utf8_5624_5_bad' +Warning 1273 Expansion too long: 'a\u002Daaaaaa10' # # End of WL#5624 # diff --git a/mysql-test/std_data/Index.xml b/mysql-test/std_data/Index.xml index 25c32099203..0aa415d0e48 100644 --- a/mysql-test/std_data/Index.xml +++ b/mysql-test/std_data/Index.xml @@ -114,13 +114,25 @@ weight space between 0 and 1 in DUCET. Also, to test it works with contractions, put some after 'z'. --> - <reset>0</reset> + <reset>0</reset><s>001</s><s>002</s> <pc>abcdefghijklmnopqrstuvwxyz</pc><p>aa</p><p>aaa</p> <reset before="primary">1</reset> <pc>ABCDEFGHIJKLMNOPQRSTUVWXYZ</pc><p>AA</p><p>AAA</p> </rules> </collation> + <collation name="utf8_5624_5_bad" id="369" shift-after-method="expand"> + <rules> + <reset>a-a4</reset><p>xxx04</a> + <reset>a-aa5</reset><p>xxx05</a> + <reset>a-aaa6</reset><p>xxx06</a> + <reset>a-aaaa7</reset><p>xxx07</a> + <reset>a-aaaaa8</reset><p>xxx08</a> + <reset>a-aaaaaa9</reset><p>xxx09</a> + <reset>a-aaaaaa10</reset><p>xxx10</a> + </rules> + </collation> + <collation name="utf8_hugeid_ci" id="2047000000"> <rules> <reset>a</reset> diff --git a/mysql-test/t/ctype_ldml.test b/mysql-test/t/ctype_ldml.test index 4442cf9b35a..82dfd1c8591 100644 --- a/mysql-test/t/ctype_ldml.test +++ b/mysql-test/t/ctype_ldml.test @@ -342,10 +342,14 @@ INSERT INTO t1 VALUES ('A'),('B'),('C'),('D'),('E'),('F'),('G'),('H'),('I'); INSERT INTO t1 VALUES ('J'),('K'),('L'),('M'),('N'),('O'),('P'),('Q'),('R'); INSERT INTO t1 VALUES 
('S'),('T'),('U'),('V'),('W'),('X'),('Y'),('Z'); INSERT INTO t1 VALUES ('AA'),('AAA'); +INSERT INTO t1 VALUES ('001'),('002'); SELECT a, HEX(WEIGHT_STRING(a)) FROM t1 ORDER BY a, LENGTH(a), BINARY(a); DROP TABLE t1; +--error ER_UNKNOWN_COLLATION +SET NAMES utf8 COLLATE utf8_5624_5_bad; +SHOW WARNINGS; --echo # --echo # End of WL#5624 diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index 7ed88da1ffa..e3138f7f310 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -8211,7 +8211,7 @@ ex: Collation rule item */ -#define MY_UCA_MAX_EXPANSION 6 /* Maximum expansion length */ +#define MY_UCA_MAX_EXPANSION 10 /* Maximum expansion length */ typedef struct my_coll_rule_item_st { @@ -8821,42 +8821,6 @@ my_coll_parser_scan_reset_sequence(MY_COLL_RULE_PARSER *p) MY_UCA_MAX_EXPANSION, "Expansion")) return 0; } - - if (p->rules->shift_after_method == my_shift_method_expand || - p->rule.before_level == 1) /* Apply "before primary" option */ - { - /* - Suppose we have this rule: &B[before primary] < C - i.e. we need to put C before B, but after A, so - the result order is: A < C < B. - - Let primary weight of B be [BBBB]. - - We cannot just use [BBBB-1] as weight for C: - DUCET does not have enough unused weights between any two characters, - so using [BBBB-1] will likely make C equal to the previous character, - which is A, so we'll get this order instead of the desired: A = C < B. - - To guarantee that that C is sorted after A, we'll use expansion - with a kind of "biggest possible character". - As "biggest possible character" we'll use "last_non_ignorable": - - We'll compose weight for C as: [BBBB-1][MMMM+1] - where [MMMM] is weight for "last_non_ignorable". - - We also do the same trick for "reset after" if the collation - option says so. E.g. for the rules "&B < C", weight for - C will be calculated as: [BBBB][MMMM+1] - - At this point we only need to store codepoints - 'B' and 'last_non_ignorable'. 
Actual weights for 'C' - will be calculated according to the above formula later, - in create_tailoring(). - */ - if (!my_coll_rule_expand(p->rule.base, MY_UCA_MAX_EXPANSION, - p->rules->uca->last_non_ignorable)) - return my_coll_parser_too_long_error(p, "Expansion"); - } return 1; } @@ -9056,20 +9020,25 @@ my_coll_rule_parse(MY_COLL_RULES *rules, @dst_uca destination UCA weight data @to destination address @to_length size of destination + @nweights OUT number of weights put to "to" @str qide string @len string length - @return number of weights put + @return FALSE on success, TRUE if the weights did not fit. */ -static size_t +static my_bool my_char_weight_put(MY_UCA_WEIGHT_LEVEL *dst, - uint16 *to, size_t to_length, + uint16 *to, size_t to_length, size_t *nweights, my_wc_t *str, size_t len) { size_t count; + int rc= FALSE; if (!to_length) - return 0; + { + *nweights= 0; + return len > 0; + } to_length--; /* Without trailing zero */ for (count= 0; len; ) @@ -9099,10 +9068,13 @@ my_char_weight_put(MY_UCA_WEIGHT_LEVEL *dst, *to++= *from++; count++; } + if (count == to_length && from && * from) + rc= TRUE; /* All weights did not fit */ } *to= 0; - return count; + *nweights= count; + return rc; } @@ -9191,6 +9163,37 @@ apply_shift(MY_CHARSET_LOADER *loader, } +static void +wstr_to_str(char *str, size_t length, my_wc_t *wc, size_t wlength) +{ + const char *end= str + length; + char *s; + size_t i, rem; + for (s= str, i= 0; (rem= (end - s)) > 0 && i < wlength; i++) + { + if ((wc[i] >= '0' && wc[i] <= '9') || + (wc[i] >= 'a' && wc[i] <= 'z') || + (wc[i] >= 'A' && wc[i] <= 'Z')) + s+= my_snprintf(s, rem, "%c", (int) wc[i]); + else + s+= my_snprintf(s, rem, "\\u%04X", (int) wc[i]); + } +} + + +static void +my_charset_loader_error_for_rule(MY_CHARSET_LOADER *loader, + const MY_COLL_RULE *r, + const char *name, + my_wc_t *wc, size_t wlength) +{ + char tmp[128]; + wstr_to_str(tmp, sizeof(tmp), wc, wlength); + my_snprintf(loader->error, sizeof(loader->error), + "%s too long: 
'%s'", name, tmp); +} + + static my_bool apply_one_rule(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, MY_COLL_RULE *r, int level, @@ -9200,6 +9203,47 @@ apply_one_rule(MY_CHARSET_LOADER *loader, size_t nreset= my_coll_rule_reset_length(r); /* Length of reset sequence */ size_t nshift= my_coll_rule_shift_length(r); /* Length of shift sequence */ uint16 *to; + my_bool rc; + + if ((rules->shift_after_method == my_shift_method_expand && r->diff[0]) || + r->before_level == 1) + { + /* + Suppose we have this rule: &B[before primary] < C + i.e. we need to put C before B, but after A, so + the result order is: A < C < B. + + Let primary weight of B be [BBBB]. + + We cannot just use [BBBB-1] as weight for C: + DUCET does not have enough unused weights between any two characters, + so using [BBBB-1] will likely make C equal to the previous character, + which is A, so we'll get this order instead of the desired: A = C < B. + + To guarantee that that C is sorted after A, we'll use expansion + with a kind of "biggest possible character". + As "biggest possible character" we'll use "last_non_ignorable": + + We'll compose weight for C as: [BBBB-1][MMMM+1] + where [MMMM] is weight for "last_non_ignorable". + + We also do the same trick for "reset after" if the collation + option says so. E.g. for the rules "&B < C", weight for + C will be calculated as: [BBBB][MMMM+1] + + At this point we only need to store codepoints + 'B' and 'last_non_ignorable'. Actual weights for 'C' + will be calculated according to the above formula later, + in create_tailoring(). 
+ */ + if (!my_coll_rule_expand(r->base, MY_UCA_MAX_EXPANSION, + rules->uca->last_non_ignorable)) + { + my_charset_loader_error_for_rule(loader, r, "Expansion", r->base, nreset); + return TRUE; + } + nreset= my_coll_rule_reset_length(r); + } if (nshift >= 2) /* Contraction */ { @@ -9222,8 +9266,9 @@ apply_one_rule(MY_CHARSET_LOADER *loader, r->with_context)->weight; /* Store weights of the "reset to" character */ dst->contractions.nitems--; /* Temporarily hide - it's incomplete */ - nweights= my_char_weight_put(dst, to, MY_UCA_MAX_WEIGHT_SIZE, - r->base, nreset); + rc= my_char_weight_put(dst, + to, MY_UCA_CONTRACTION_MAX_WEIGHT_SIZE, &nweights, + r->base, nreset); dst->contractions.nitems++; /* Activate, now it's complete */ } else @@ -9232,7 +9277,12 @@ apply_one_rule(MY_CHARSET_LOADER *loader, DBUG_ASSERT(dst->weights[pagec]); to= my_char_weight_addr(dst, r->curr[0]); /* Store weights of the "reset to" character */ - nweights= my_char_weight_put(dst, to, dst->lengths[pagec], r->base, nreset); + rc= my_char_weight_put(dst, to, dst->lengths[pagec], &nweights, r->base, nreset); + } + if (rc) + { + my_charset_loader_error_for_rule(loader, r, "Expansion", r->base, nreset); + return rc; } /* Apply level difference. */ |