summaryrefslogtreecommitdiff
path: root/strings
diff options
context:
space:
mode:
authorAlexander Barkov <bar@mnogosearch.org>2013-10-31 14:24:24 +0400
committerAlexander Barkov <bar@mnogosearch.org>2013-10-31 14:24:24 +0400
commitbd3dc54261f10f387a03ad99ce74c3824c42e462 (patch)
tree2eb1a284095b7d7bd28368bab9e229880a56fc95 /strings
parenteea91f633f903b8c223b7d470e4be7366cbf57c8 (diff)
downloadmariadb-git-bd3dc54261f10f387a03ad99ce74c3824c42e462.tar.gz
A few minor Unicode collation customization improvements were made,
which makes it possible to add more world language collations with very complex collation rules (e.g. Myanmar): - Weight string for a single character in a user defined collation was erroneously limited to 7 weights (instead of 8 weights). Added an extra element in the user-defined weight arrays, to fit 8 non-zero weights. - Weight string limit for contractions was made two times longer (16 weights), which allows longer contractions without affecting the performance of filesort. - A user-defined collation now refuses to initialize and reports an error in case if a weight string gets longer than 8 weights for a single character, or longer than 16 weights for a contraction. Previously weight strings for such characters (and contractions) were cut, so a collation could silently start with wrong rules. - Fixed a bug in handling rules like "&a << b" in combination with shift-after-method="expand". The primary weight for "b" was not correctly calculated, which erroneously made "b" primary greater than "a" instead of primary equal to "a".
Diffstat (limited to 'strings')
-rw-r--r--strings/ctype-uca.c140
1 files changed, 95 insertions, 45 deletions
diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c
index 7ed88da1ffa..e3138f7f310 100644
--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
@@ -8211,7 +8211,7 @@ ex:
Collation rule item
*/
-#define MY_UCA_MAX_EXPANSION 6 /* Maximum expansion length */
+#define MY_UCA_MAX_EXPANSION 10 /* Maximum expansion length */
typedef struct my_coll_rule_item_st
{
@@ -8821,42 +8821,6 @@ my_coll_parser_scan_reset_sequence(MY_COLL_RULE_PARSER *p)
MY_UCA_MAX_EXPANSION, "Expansion"))
return 0;
}
-
- if (p->rules->shift_after_method == my_shift_method_expand ||
- p->rule.before_level == 1) /* Apply "before primary" option */
- {
- /*
- Suppose we have this rule: &B[before primary] < C
- i.e. we need to put C before B, but after A, so
- the result order is: A < C < B.
-
- Let primary weight of B be [BBBB].
-
- We cannot just use [BBBB-1] as weight for C:
- DUCET does not have enough unused weights between any two characters,
- so using [BBBB-1] will likely make C equal to the previous character,
- which is A, so we'll get this order instead of the desired: A = C < B.
-
- To guarantee that that C is sorted after A, we'll use expansion
- with a kind of "biggest possible character".
- As "biggest possible character" we'll use "last_non_ignorable":
-
- We'll compose weight for C as: [BBBB-1][MMMM+1]
- where [MMMM] is weight for "last_non_ignorable".
-
- We also do the same trick for "reset after" if the collation
- option says so. E.g. for the rules "&B < C", weight for
- C will be calculated as: [BBBB][MMMM+1]
-
- At this point we only need to store codepoints
- 'B' and 'last_non_ignorable'. Actual weights for 'C'
- will be calculated according to the above formula later,
- in create_tailoring().
- */
- if (!my_coll_rule_expand(p->rule.base, MY_UCA_MAX_EXPANSION,
- p->rules->uca->last_non_ignorable))
- return my_coll_parser_too_long_error(p, "Expansion");
- }
return 1;
}
@@ -9056,20 +9020,25 @@ my_coll_rule_parse(MY_COLL_RULES *rules,
@dst_uca destination UCA weight data
@to destination address
@to_length size of destination
+ @nweights OUT number of weights put to "to"
@str qide string
@len string length
- @return number of weights put
+ @return FALSE on success, TRUE if the weights did not fit.
*/
-static size_t
+static my_bool
my_char_weight_put(MY_UCA_WEIGHT_LEVEL *dst,
- uint16 *to, size_t to_length,
+ uint16 *to, size_t to_length, size_t *nweights,
my_wc_t *str, size_t len)
{
size_t count;
+ int rc= FALSE;
if (!to_length)
- return 0;
+ {
+ *nweights= 0;
+ return len > 0;
+ }
to_length--; /* Without trailing zero */
for (count= 0; len; )
@@ -9099,10 +9068,13 @@ my_char_weight_put(MY_UCA_WEIGHT_LEVEL *dst,
*to++= *from++;
count++;
}
+ if (count == to_length && from && * from)
+ rc= TRUE; /* All weights did not fit */
}
*to= 0;
- return count;
+ *nweights= count;
+ return rc;
}
@@ -9191,6 +9163,37 @@ apply_shift(MY_CHARSET_LOADER *loader,
}
+static void
+wstr_to_str(char *str, size_t length, my_wc_t *wc, size_t wlength)
+{
+ const char *end= str + length;
+ char *s;
+ size_t i, rem;
+ for (s= str, i= 0; (rem= (end - s)) > 0 && i < wlength; i++)
+ {
+ if ((wc[i] >= '0' && wc[i] <= '9') ||
+ (wc[i] >= 'a' && wc[i] <= 'z') ||
+ (wc[i] >= 'A' && wc[i] <= 'Z'))
+ s+= my_snprintf(s, rem, "%c", (int) wc[i]);
+ else
+ s+= my_snprintf(s, rem, "\\u%04X", (int) wc[i]);
+ }
+}
+
+
+static void
+my_charset_loader_error_for_rule(MY_CHARSET_LOADER *loader,
+ const MY_COLL_RULE *r,
+ const char *name,
+ my_wc_t *wc, size_t wlength)
+{
+ char tmp[128];
+ wstr_to_str(tmp, sizeof(tmp), wc, wlength);
+ my_snprintf(loader->error, sizeof(loader->error),
+ "%s too long: '%s'", name, tmp);
+}
+
+
static my_bool
apply_one_rule(MY_CHARSET_LOADER *loader,
MY_COLL_RULES *rules, MY_COLL_RULE *r, int level,
@@ -9200,6 +9203,47 @@ apply_one_rule(MY_CHARSET_LOADER *loader,
size_t nreset= my_coll_rule_reset_length(r); /* Length of reset sequence */
size_t nshift= my_coll_rule_shift_length(r); /* Length of shift sequence */
uint16 *to;
+ my_bool rc;
+
+ if ((rules->shift_after_method == my_shift_method_expand && r->diff[0]) ||
+ r->before_level == 1)
+ {
+ /*
+ Suppose we have this rule: &B[before primary] < C
+ i.e. we need to put C before B, but after A, so
+ the result order is: A < C < B.
+
+ Let primary weight of B be [BBBB].
+
+ We cannot just use [BBBB-1] as weight for C:
+ DUCET does not have enough unused weights between any two characters,
+ so using [BBBB-1] will likely make C equal to the previous character,
+ which is A, so we'll get this order instead of the desired: A = C < B.
+
+ To guarantee that that C is sorted after A, we'll use expansion
+ with a kind of "biggest possible character".
+ As "biggest possible character" we'll use "last_non_ignorable":
+
+ We'll compose weight for C as: [BBBB-1][MMMM+1]
+ where [MMMM] is weight for "last_non_ignorable".
+
+ We also do the same trick for "reset after" if the collation
+ option says so. E.g. for the rules "&B < C", weight for
+ C will be calculated as: [BBBB][MMMM+1]
+
+ At this point we only need to store codepoints
+ 'B' and 'last_non_ignorable'. Actual weights for 'C'
+ will be calculated according to the above formula later,
+ in create_tailoring().
+ */
+ if (!my_coll_rule_expand(r->base, MY_UCA_MAX_EXPANSION,
+ rules->uca->last_non_ignorable))
+ {
+ my_charset_loader_error_for_rule(loader, r, "Expansion", r->base, nreset);
+ return TRUE;
+ }
+ nreset= my_coll_rule_reset_length(r);
+ }
if (nshift >= 2) /* Contraction */
{
@@ -9222,8 +9266,9 @@ apply_one_rule(MY_CHARSET_LOADER *loader,
r->with_context)->weight;
/* Store weights of the "reset to" character */
dst->contractions.nitems--; /* Temporarily hide - it's incomplete */
- nweights= my_char_weight_put(dst, to, MY_UCA_MAX_WEIGHT_SIZE,
- r->base, nreset);
+ rc= my_char_weight_put(dst,
+ to, MY_UCA_CONTRACTION_MAX_WEIGHT_SIZE, &nweights,
+ r->base, nreset);
dst->contractions.nitems++; /* Activate, now it's complete */
}
else
@@ -9232,7 +9277,12 @@ apply_one_rule(MY_CHARSET_LOADER *loader,
DBUG_ASSERT(dst->weights[pagec]);
to= my_char_weight_addr(dst, r->curr[0]);
/* Store weights of the "reset to" character */
- nweights= my_char_weight_put(dst, to, dst->lengths[pagec], r->base, nreset);
+ rc= my_char_weight_put(dst, to, dst->lengths[pagec], &nweights, r->base, nreset);
+ }
+ if (rc)
+ {
+ my_charset_loader_error_for_rule(loader, r, "Expansion", r->base, nreset);
+ return rc;
}
/* Apply level difference. */