summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/m_ctype.h53
-rw-r--r--strings/ctype-uca-scanner_next.inl39
-rw-r--r--strings/ctype-uca.c548
-rw-r--r--strings/ctype-uca.ic30
-rw-r--r--unittest/strings/strings-t.c2
5 files changed, 660 insertions, 12 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h
index 1e7f06dce98..811b3b71a17 100644
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -141,6 +141,58 @@ const uint16 *my_uca_contraction2_weight(const MY_CONTRACTIONS *c,
my_wc_t wc1, my_wc_t wc2);
+/*
+ A fixed pair of weights produced by one byte pair.
+ Unlike MY_UCA_2BYTES_ITEM there is no room for a separate terminator:
+ the pair is either {X,0} (one weight), {X,Y} (two weights),
+ or {0,0} when the byte pair is not handled by the simple prefix loop.
+ Used as the element type of
+ MY_UCA_LEVEL_BOOSTER::weight_strings_2bytes_to_1_or_2_weights.
+*/
+typedef struct my_uca_weight2_t
+{
+ uint16 weight[2];
+} MY_UCA_WEIGHT2;
+
+
+/*
+ In DUCET as of Unicode-14.0.0:
+ - All characters in the range U+0000..U+007F (i.e. using one byte in utf8)
+ have not more than two weights on all weight levels.
+ - All characters in the range U+0080..U+07FF (i.e. using two bytes in utf8)
+ have not more than four weights on all weight levels.
+ Therefore the limit of 4 weights should cover all byte pairs
+ (i.e. two ASCII characters or one 2-byte character)
+ that are a subject for the "process 2 bytes at a time" optimization.
+ If some collation reorders any character from the mentioned ranges
+ in the way that it produces more weights, such character will not
+ be optimized, but will be correctly processed by the slower mb_wc-based
+ method (1 character at a time).
+*/
+#define MY_UCA_2BYTES_MAX_WEIGHT_SIZE (4+1) /* Including 0 terminator */
+
+/*
+ A 0-terminated weight string for one byte pair:
+ up to MY_UCA_2BYTES_MAX_WEIGHT_SIZE-1 weights followed by a 0 terminator.
+ A byte pair that cannot be optimized is marked by storing
+ MY_UCA_2BYTES_NOT_APPLICABLE in weight[1] (with weight[0] set to 0).
+*/
+typedef struct my_uca_2bytes_item_t
+{
+ uint16 weight[MY_UCA_2BYTES_MAX_WEIGHT_SIZE];
+} MY_UCA_2BYTES_ITEM;
+
+
+typedef struct my_uca_level_booster_t
+{
+ /*
+ A helper array to process 2 bytes at a time during string comparison.
+ It maps all 2-byte sequences that make:
+ - two ASCII characters or
+ - one 2-byte character
+ to their weights. The weight length is limited to
+ MY_UCA_2BYTES_MAX_WEIGHT_SIZE-1 weights.
+ The array is indexed by (first_byte * 256 + second_byte).
+ This array is used in the main loop optimization.
+ */
+ MY_UCA_2BYTES_ITEM weight_strings_2bytes[0x10000];
+ /*
+ A helper array to process 2 bytes at a time during string comparison,
+ in an even more efficient way than the above one.
+ The weight size is limited to 2 weights, so it's used for the cases
+ when 2 input bytes produce 1 or 2 weights.
+ This limit makes the code using this array even simpler and faster.
+ The array is indexed by (first_byte * 256 + second_byte).
+ This array is used for prefix optimization.
+ */
+ MY_UCA_WEIGHT2 weight_strings_2bytes_to_1_or_2_weights[0x10000];
+} MY_UCA_LEVEL_BOOSTER;
+
+
typedef struct my_uca_contraction_hash_t
{
size_t nitems_alloced;
@@ -157,6 +209,7 @@ typedef struct my_uca_level_info_st
MY_CONTRACTIONS contractions;
uint levelno;
MY_UCA_CONTRACTION_HASH contraction_hash;
+ MY_UCA_LEVEL_BOOSTER *booster;
} MY_UCA_WEIGHT_LEVEL;
diff --git a/strings/ctype-uca-scanner_next.inl b/strings/ctype-uca-scanner_next.inl
index acab31f21ef..b79e0deff1a 100644
--- a/strings/ctype-uca-scanner_next.inl
+++ b/strings/ctype-uca-scanner_next.inl
@@ -78,6 +78,45 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
my_wc_t currwc= 0;
const uint16 *cweight;
+#if MY_UCA_ASCII_OPTIMIZE && !defined(SCANNER_NEXT_NCHARS)
+ if (scanner->sbeg + 1 < scanner->send)
+ {
+ const MY_UCA_2BYTES_ITEM *ww;
+ ww= my_uca_level_booster_2bytes_item_addr_const(scanner->level->booster,
+ scanner->sbeg[0],
+ scanner->sbeg[1]);
+ if (my_uca_2bytes_item_is_applicable(ww))
+ {
+ /*
+ Byte pairs that make 2-byte head characters in previous
+ context pairs are marked as not applicable for optimization
+ during the collation initialization. So when we come here
+ sbeg[0] and sbeg[1] are:
+ - either two ASCII characters
+ - or one 2-byte character which IS NOT a previous context head
+ Just remember sbeg[1] as the previous character for simplicity.
+ This may erroneously interpret bytes 0x80..0x9F as previous context
+ head characters U+0080..U+009F. However, CLDR does not have any real
+ collations that use these characters as previous context heads.
+ */
+ scanner->page= 0;
+ scanner->code= (int) scanner->sbeg[1];
+ scanner->sbeg+= 2;
+ if ((weight= my_uca_scanner_set_weight(scanner, ww->weight)))
+ {
+ /*
+ TODO: add support for scanner_next_with_nchars and do this:
+ SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1);
+ */
+ return weight;
+ }
+ continue; /* Ignorable character */
+ }
+ /* 2 byte optimization is not applicable, go the slow path */
+ }
+#endif
+
+
/* Get next character */
#if MY_UCA_ASCII_OPTIMIZE
/* Get next ASCII character */
diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c
index 8d8f70903ad..38d81910053 100644
--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
@@ -6549,7 +6549,8 @@ MY_UCA_INFO my_uca_v400=
NULL /* flags */
},
0, /* levelno */
- {0} /* contraction_hash */
+ {0}, /* contraction_hash */
+ NULL /* booster */
},
{
0,
@@ -6561,7 +6562,8 @@ MY_UCA_INFO my_uca_v400=
NULL
},
1, /* levelno */
- {0} /* contraction_hash */
+ {0}, /* contraction_hash */
+ NULL /* booster */
},
{0}
},
@@ -30112,7 +30114,8 @@ MY_UCA_INFO my_uca_v520_th=
NULL /* flags */
},
0, /* levelno */
- {0} /* contraction_hash */
+ {0}, /* contraction_hash */
+ NULL /* booster */
},
{
0x10FFFF, /* maxchar */
@@ -30124,7 +30127,8 @@ MY_UCA_INFO my_uca_v520_th=
NULL /* flags */
},
1, /* levelno */
- {0} /* contraction_hash */
+ {0}, /* contraction_hash */
+ NULL /* booster */
},
{0}
},
@@ -30164,7 +30168,8 @@ MY_UCA_INFO my_uca_v520=
NULL /* flags */
},
0, /* levelno */
- {0} /* contraction_hash */
+ {0}, /* contraction_hash */
+ NULL /* booster */
},
{
@@ -30177,7 +30182,8 @@ MY_UCA_INFO my_uca_v520=
NULL /* flags */
},
1, /* levelno */
- {0} /* contraction_hash */
+ {0}, /* contraction_hash */
+ NULL /* booster */
},
{0}
@@ -30221,7 +30227,8 @@ static MY_UCA_INFO my_uca_v1400=
NULL /* flags */
},
0, /* levelno */
- {0} /* contraction_hash */
+ {0}, /* contraction_hash */
+ NULL /* booster */
},
{
@@ -30234,7 +30241,8 @@ static MY_UCA_INFO my_uca_v1400=
NULL /* flags */
},
1, /* levelno */
- {0} /* contraction_hash */
+ {0}, /* contraction_hash */
+ NULL /* booster */
},
{
@@ -30247,7 +30255,8 @@ static MY_UCA_INFO my_uca_v1400=
NULL /* flags */
},
2, /* levelno */
- {0} /* contraction_hash */
+ {0}, /* contraction_hash */
+ NULL /* booster */
}
},
@@ -33947,8 +33956,522 @@ my_uca_generate_pages(MY_CHARSET_LOADER *loader,
}
+/*
+ Copy a 0-terminated weight string from src to dst,
+ including the terminating 0.
+ No bound is checked here: the caller must make sure
+ that dst has enough room for the whole string.
+ @return The number of weights copied, not counting the terminator.
+*/
+static size_t
+my_uca_weight_cpy(uint16 *dst, const uint16 *src)
+{
+ const uint16 *src0= src;
+ for ( ; ; dst++, src++ )
+ {
+ *dst= *src;
+ if (!dst[0])
+ break;
+ }
+ return src - src0;
+}
+
+
+/*
+ The value 0xFFFF does not exist in UCA weights.
+ Let's use it to mark byte pairs that have complex
+ mapping.
+*/
+#define MY_UCA_2BYTES_NOT_APPLICABLE 0xFFFF
+
+
+/*
+ Tell whether a byte pair item can be used by the 2-byte optimization.
+ Items marked by my_uca_2bytes_item_set_not_applicable() carry
+ MY_UCA_2BYTES_NOT_APPLICABLE in weight[1]. Any other item is
+ considered applicable, including an item with an empty weight string.
+*/
+static inline my_bool
+my_uca_2bytes_item_is_applicable(const MY_UCA_2BYTES_ITEM *w2)
+{
+ return w2->weight[1] != MY_UCA_2BYTES_NOT_APPLICABLE;
+}
+
+
+/*
+ Mark a byte pair item as not applicable for optimization:
+ weight[0] is set to 0, so the item reads as an empty weight string,
+ and weight[1] gets the marker value MY_UCA_2BYTES_NOT_APPLICABLE
+ that my_uca_2bytes_item_is_applicable() tests for.
+*/
+static void
+my_uca_2bytes_item_set_not_applicable(MY_UCA_2BYTES_ITEM *dst)
+{
+ dst->weight[0]= 0;
+ dst->weight[1]= MY_UCA_2BYTES_NOT_APPLICABLE;
+}
+
+
+/*
+ Calculate the length of a 0-terminated weight string,
+ i.e. the number of weights before the terminating 0.
+ NOTE(review): the counter is declared as uint while the
+ function returns size_t.
+*/
+static inline size_t
+my_uca_weight_length(const uint16 *str)
+{
+ uint res;
+ for (res= 0; str[res] ; res++)
+ { }
+ return res;
+}
+
+
+/*
+ Copy a 0-terminated weight string if it fits,
+ otherwise mark the byte pair as not applicable for optimization.
+ "Fits" means that the weights plus the 0 terminator do not exceed
+ the number of elements in dst->weight.
+*/
+static void
+my_uca_2bytes_item_weight_cpy(MY_UCA_2BYTES_ITEM *dst, const uint16 *src)
+{
+ size_t wlen= my_uca_weight_length(src);
+ if (wlen + 1 > array_elements(dst->weight))
+ my_uca_2bytes_item_set_not_applicable(dst);
+ else
+ my_uca_weight_cpy(dst->weight, src);
+}
+
+
+/*
+ Concatenate two 0-terminated weight strings if they fit together,
+ otherwise mark the byte pair as not applicable for optimization.
+ "Fit together" means that the weights of wa, the weights of wb and
+ one 0 terminator do not exceed the number of elements in dst->weight.
+ The second copy starts at the position of the terminator written by
+ the first copy, so the result is a single 0-terminated string.
+*/
+static void
+my_uca_2bytes_item_weight_cpy2(MY_UCA_2BYTES_ITEM *dst,
+ const uint16 *wa,
+ const uint16 *wb)
+{
+ size_t la= my_uca_weight_length(wa);
+ size_t lb= my_uca_weight_length(wb);
+ if (la + lb + 1 > array_elements(dst->weight))
+ {
+ my_uca_2bytes_item_set_not_applicable(dst);
+ }
+ else
+ {
+ my_uca_weight_cpy(dst->weight, wa);
+ my_uca_weight_cpy(dst->weight + la, wb);
+ }
+}
+
+
+/*
+ Concatenate weights of two ASCII characters if they fit together,
+ otherwise mark the byte pair as not applicable for optimization.
+ Both weight strings are looked up in page 0 of the level, at the
+ offset (character code * level->lengths[0]).
+*/
+static void
+my_uca_2bytes_item_set_ascii2(MY_UCA_2BYTES_ITEM *dst,
+ const MY_UCA_WEIGHT_LEVEL *level,
+ uchar a, uchar b)
+{
+ const uint16 *wa= level->weights[0] + (uint) a * level->lengths[0];
+ const uint16 *wb= level->weights[0] + (uint) b * level->lengths[0];
+ my_uca_2bytes_item_weight_cpy2(dst, wa, wb);
+}
+
+
+/*
+ Check if two bytes make a well-formed 2-byte character.
+ Copy its weight if it fits.
+ If the two bytes do not make a well-formed 2-byte character,
+ or the weight of a valid 2-byte character is too long, then
+ mark this byte pair as not applicable for optimization.
+ The pair is decoded with the character set's own mb_wc routine;
+ only a result of exactly 2 is treated as a valid 2-byte character.
+*/
+static void
+my_uca_2bytes_item_set_non_ascii2(MY_UCA_2BYTES_ITEM *dst,
+ const MY_UCA_WEIGHT_LEVEL *level,
+ CHARSET_INFO *cs,
+ uchar a, uchar b)
+{
+ uchar ch[2]= {a, b};
+ my_wc_t wc;
+ int rc= my_ci_mb_wc(cs, &wc, &ch[0], &ch[2]);
+ if (rc == 2)
+ {
+ /* Byte sequence 'ab' makes one valid 2-byte character */
+ uint pageno= wc>>8;
+ const uint16 *w= level->weights[pageno] + (wc & 0xFF) * level->lengths[pageno];
+ my_uca_2bytes_item_weight_cpy(dst, w);
+ }
+ else
+ {
+ my_uca_2bytes_item_set_not_applicable(dst);
+ }
+}
+
+
+/*
+ Return the writable address of the weight string item
+ for the byte pair {a,b}. The item index is a * 256 + b.
+*/
+static inline MY_UCA_2BYTES_ITEM *
+my_uca_level_booster_2bytes_item_addr(MY_UCA_LEVEL_BOOSTER *booster,
+ uchar a, uchar b)
+{
+ size_t w2offs= a * 256 + b;
+ return &booster->weight_strings_2bytes[w2offs];
+}
+
+
+/*
+ Return the read-only address of the weight string item
+ for the byte pair {a,b}. The item index is a * 256 + b.
+ This is the const counterpart of
+ my_uca_level_booster_2bytes_item_addr().
+*/
+static inline const MY_UCA_2BYTES_ITEM *
+my_uca_level_booster_2bytes_item_addr_const(const MY_UCA_LEVEL_BOOSTER *booster,
+ uchar a, uchar b)
+{
+ size_t w2offs= a * 256 + b;
+ return &booster->weight_strings_2bytes[w2offs];
+}
+
+
+/*
+ Return the read-only address of the MY_UCA_WEIGHT2 element
+ for the byte pair {a,b} in the array used by the prefix optimization.
+ The element index is a * 256 + b.
+*/
+static inline const MY_UCA_WEIGHT2 *
+my_uca_level_booster_simple_weight2_addr_const(
+ const MY_UCA_LEVEL_BOOSTER *booster,
+ uchar a, uchar b)
+{
+ uint offs= (uint) a * 256 + b;
+ return &booster->weight_strings_2bytes_to_1_or_2_weights[offs];
+}
+
+
+/*
+ Mark the byte pair {a,b} as not applicable for optimization,
+ discarding whatever weight string was stored for it before.
+*/
+static void
+my_uca_level_booster_2bytes_disable2(MY_UCA_LEVEL_BOOSTER *booster,
+ uchar a, uchar b)
+{
+ MY_UCA_2BYTES_ITEM *dst= my_uca_level_booster_2bytes_item_addr(booster, a, b);
+ my_uca_2bytes_item_set_not_applicable(dst);
+}
+
+
+/*
+ Encode the character wc using the character set's wc_mb routine.
+ If the encoded form is exactly 2 bytes long, mark that byte pair
+ as not applicable for optimization. Otherwise do nothing.
+*/
+static void
+my_uca_level_booster_2bytes_disable_if_2byte_mb(MY_UCA_LEVEL_BOOSTER *booster,
+ CHARSET_INFO *cs,
+ my_wc_t wc)
+{
+ uchar tmp[MY_CS_MBMAXLEN];
+ int rc= my_ci_wc_mb(cs, wc, tmp, tmp + sizeof(tmp));
+ if (rc == 2)
+ my_uca_level_booster_2bytes_disable2(booster, tmp[0], tmp[1]);
+}
+
+
+/*
+ Mark all 256 byte pairs {x,tail} (for x=0x00..0xFF), i.e. all pairs
+ that end with the given byte, as not applicable for optimization.
+*/
+static inline void
+my_uca_level_booster_2bytes_set_not_applicable_by_tail(
+ MY_UCA_LEVEL_BOOSTER *booster,
+ uchar tail)
+{
+ uint head;
+ for (head= 0; head < 256; head++)
+ my_uca_level_booster_2bytes_disable2(booster, (uchar) head, tail);
+}
+
+
+/*
+ Mark all byte pairs whose weight depends on the surrounding context
+ because of the given true contraction.
+ Two cases are distinguished by the first character of the contraction:
+ - ASCII head: every byte pair ending with that head is disabled, and,
+ for a contraction of 3+ characters starting with two ASCII
+ characters, the pair made of these two characters is disabled too.
+ - Non-ASCII head: the head itself is disabled if it is encoded
+ in 2 bytes in the given character set.
+*/
+static void
+my_uca_level_booster_2bytes_disable_contraction(MY_UCA_LEVEL_BOOSTER *booster,
+ const MY_CONTRACTION *c,
+ CHARSET_INFO *cs)
+{
+ /* Previous context sequences are handled by a separate routine */
+ DBUG_ASSERT(!c->with_context);
+
+ if (c->ch[0] < 0x80)
+ {
+ /*
+ 2-byte pairs that end with an ASCII contraction head.
+ ...xAB...
+ Suppose AB is a contraction where A is an ASCII character.
+ Disable byte pairs xA (for all x=0x00..0xFF).
+ */
+ my_uca_level_booster_2bytes_set_not_applicable_by_tail(booster,
+ (uchar) c->ch[0]);
+
+ /*
+ Disable 2-byte ASCII combinations that start
+ 3-character (or longer) contractions.
+ */
+ if (c->ch[1] < 0x80 && c->ch[2] != 0)
+ {
+ /*
+ A 3+ character contraction that starts with two ASCII characters:
+ ...ABx...
+ */
+ my_uca_level_booster_2bytes_disable2(booster,
+ (uchar) c->ch[0],
+ (uchar) c->ch[1]);
+ }
+ }
+ else
+ {
+ /*
+ Disable 2-byte characters that start contractions:
+ ...[Aa][B]... MB + ASCII
+ ...[Aa][Bb].. MB + MB2
+ ...[Aa][Bbb].. MB + MB3
+ ...[Aa][Bbbb].. MB + MB4
+ The weight of the character [Aa] depends on what goes after it.
+ */
+ my_uca_level_booster_2bytes_disable_if_2byte_mb(booster, cs, c->ch[0]);
+ }
+}
+
+
+/*
+ Mark all byte pairs whose weight depends on the surrounding context
+ because of the given previous context sequence.
+ Two cases are distinguished:
+ - Both characters are ASCII: the byte pair made of them is disabled.
+ - Otherwise: the first character is disabled if it is encoded
+ in 2 bytes in the given character set.
+*/
+static void
+my_uca_level_booster_2bytes_disable_previous_context(
+ MY_UCA_LEVEL_BOOSTER *booster,
+ const MY_CONTRACTION *c,
+ CHARSET_INFO *cs)
+{
+ /* True contractions are handled by a separate routine */
+ DBUG_ASSERT(c->with_context);
+
+ if (c->ch[0] < 0x80 && c->ch[1] < 0x80)
+ {
+ DBUG_ASSERT(c->ch[2] == 0);
+ if (c->ch[2] == 0)
+ {
+ /*
+ A previous context pair with exactly two ASCII characters:
+ ...AB...
+ "A" is a look-behind character (the context).
+ "B" is a character that we need to generate a weight for.
+ The underlying code does not support handling these characters
+ in a single shot yet. It works as follows at the moment:
+ - A is scanned separately from B and generates its independent weight.
+ - B is scanned separately on the next step and generates its
+ context dependent weight (by looking behind).
+ */
+ my_uca_level_booster_2bytes_disable2(booster,
+ (uchar) c->ch[0],
+ (uchar) c->ch[1]);
+ }
+ }
+ else
+ {
+ /*
+ Disable 2-byte characters that start pairs with a previous context:
+ ...[Aa][B]... MB + ASCII
+ ...[Aa][Bb].. MB + MB
+ These characters can be actually scanned in a single shot,
+ but the relevant code in scanner_next() assumes previous context
+ head characters are ASCII only, so it sets the previous
+ character simply as sbeg[1].
+ */
+ my_uca_level_booster_2bytes_disable_if_2byte_mb(booster, cs, c->ch[0]);
+ }
+}
+
+
+/*
+ Set the weight of a 2-byte sequence,
+ or mark the sequence as not applicable for optimization.
+ If both bytes are below 0x80, they are handled as two ASCII characters.
+ Otherwise the pair is handled as a candidate for one 2-byte character.
+*/
+static void
+my_uca_2bytes_item_set_pair(MY_UCA_2BYTES_ITEM *dst,
+ const MY_UCA_WEIGHT_LEVEL *level,
+ CHARSET_INFO *cs,
+ uchar a, uchar b)
+{
+ if (a < 0x80 && b < 0x80)
+ my_uca_2bytes_item_set_ascii2(dst, level, a, b);
+ else
+ my_uca_2bytes_item_set_non_ascii2(dst, level, cs, a, b);
+}
+
+
+/*
+ For every byte pair [00..FF][00..FF] set its weight,
+ or mark it as not applicable for optimization.
+ Contractions and previous context sequences are not taken
+ into account here: only per-character weights of the level are used.
+*/
+static void
+my_uca_level_booster_2bytes_populate_pairs(MY_UCA_LEVEL_BOOSTER *booster,
+ const MY_UCA_WEIGHT_LEVEL *level,
+ CHARSET_INFO *cs)
+{
+ uint a, b;
+ for (a= 0; a < 256; a++)
+ {
+ for (b= 0; b < 256; b++)
+ {
+ MY_UCA_2BYTES_ITEM *dst;
+ dst= my_uca_level_booster_2bytes_item_addr(booster, (uchar) a, (uchar) b);
+ my_uca_2bytes_item_set_pair(dst, level, cs, (uchar) a, (uchar) b);
+ }
+ }
+}
+
+
+/*
+ Populate contractions consisting of two ASCII letters.
+ Only true contractions are handled here so far.
+ Previous context pairs are handled separately.
+ For every matching contraction the item of its byte pair is
+ overwritten with the contraction weight, or marked as not applicable
+ if the contraction weight is too long.
+ NOTE(review): "pupulate" in the function name looks like a typo for
+ "populate"; renaming it requires changing the caller as well.
+*/
+static void
+my_uca_level_booster_2bytes_pupulate_ascii2_contractions(
+ MY_UCA_LEVEL_BOOSTER *booster,
+ const MY_CONTRACTIONS *list)
+{
+ size_t i;
+ for (i= 0; i < list->nitems; i++)
+ {
+ const MY_CONTRACTION *c= &list->item[i];
+ if (c->ch[0] < 0x80 && c->ch[1] < 0x80 && c->ch[2] == 0 &&
+ !c->with_context)
+ {
+ MY_UCA_2BYTES_ITEM *dst;
+ dst= my_uca_level_booster_2bytes_item_addr(booster,
+ (uchar) c->ch[0],
+ (uchar) c->ch[1]);
+ my_uca_2bytes_item_weight_cpy(dst, c->weight);
+ }
+ }
+}
+
+
+/*
+ Mark all byte pairs whose weight depends on the context
+ (because of contractions and previous context sequences)
+ as not applicable for optimization.
+ Every item of the list is dispatched by its with_context flag
+ either to the previous context routine or to the true contraction
+ routine.
+*/
+static void
+my_uca_level_booster_2bytes_disable_context_dependent(
+ MY_UCA_LEVEL_BOOSTER *booster,
+ const MY_CONTRACTIONS *list,
+ CHARSET_INFO *cs)
+{
+ size_t i;
+ for (i= 0; i < list->nitems; i++)
+ {
+ const MY_CONTRACTION *c= &list->item[i];
+ if (c->with_context)
+ my_uca_level_booster_2bytes_disable_previous_context(booster, c, cs);
+ else
+ my_uca_level_booster_2bytes_disable_contraction(booster, c, cs);
+ }
+}
+
+
+/*
+ Populate the array of MY_UCA_WEIGHT2 for all possible byte pairs {a,b}
+ as follows:
+
+ Number of characters Number of weights WEIGHT2
+ -------------------- ----------------- ------
+ 2 (two ASCII chars) 0 (both ignorable) {0,0} [IGN]
+ 2 (two ASCII chars) 1 (e.g. Czech "ch") {X,0}
+ 2 (two ASCII chars) 1 (e.g. ignorable + non-ignorable) {X,0}
+ 2 (two ASCII chars) 2 (two ASCII chars, one weight each) {X,Y}
+ 2 (two ASCII chars) 3+ (contraction with a long expansion) {0,0} [E3]
+ 1 (one 2-byte char) 0 (ignorable) {0,0} [IGN]
+ 1 (one 2-byte char) 1 {X,0}
+ 1 (one 2-byte char) 2 (short expansion, e.g. German SZ) {X,Y}
+ 1 (one 2-byte char) 3+ (long expansion) {0,0} [E3]
+ 0 (incomplete 3/4-byte char) {0,0} [INC]
+
+ All byte pairs that depend on the context (e.g. contraction parts)
+ and that were previously marked as such by
+ my_uca_level_booster_2bytes_disable_context_dependent()
+ set WEIGHT2 to {0,0} [CTX].
+
+ After the initialization, the array contains non-zero weights for
+ the most typical simple cases of mapping from 2-bytes to weights,
+ so inside strnncoll*() we can skip equal string prefixes much faster,
+ using a cheaper simpler code.
+
+ The source of the data is the weight_strings_2bytes array of the same
+ booster, so that array must be completely initialized before the call.
+*/
+static void
+my_uca_level_booster_weight2_populate(MY_UCA_LEVEL_BOOSTER *booster)
+{
+ size_t i;
+ for (i= 0; i < 0x10000; i++)
+ {
+ MY_UCA_WEIGHT2 *dst= &booster->weight_strings_2bytes_to_1_or_2_weights[i];
+ MY_UCA_2BYTES_ITEM *src= &booster->weight_strings_2bytes[i];
+ if (src->weight[0] && (!src->weight[1] || !src->weight[2]))
+ {
+ /*
+ Simplest mapping:
+ - Two ASCII characters make one or two weights
+ - One 2-byte character makes one or two weights
+ Handled by the simpler loop at the comparison time.
+ */
+ dst->weight[0]= src->weight[0];
+ dst->weight[1]= src->weight[1];
+ }
+ else
+ {
+ /*
+ More complex mapping:
+ - Ignorable - see [IGN] above
+ - More than two weights - see [E3] above
+ - Incomplete (a 3-byte or 4-byte char head) - see [INC] above
+ - Not applicable (context dependent) - see [CTX] above
+ Handled by the full-featured slower loop at the comparison time.
+ */
+ dst->weight[0]= 0;
+ dst->weight[1]= 0;
+ }
+ }
+}
+
+
+/*
+ Fill both arrays of a booster from the given weight level.
+ The order of the steps matters, because every step overwrites
+ some results of the previous ones:
+ 1. Set per-character weights for all byte pairs.
+ 2. Overwrite pairs that make two-ASCII-character true contractions.
+ 3. Disable pairs whose weight depends on the context.
+ 4. Derive the simplified WEIGHT2 array from the final state of
+ the array populated by steps 1..3.
+*/
+static void
+my_uca_level_booster_populate(MY_UCA_LEVEL_BOOSTER *dst,
+ const MY_UCA_WEIGHT_LEVEL *src,
+ CHARSET_INFO *cs)
+{
+ my_uca_level_booster_2bytes_populate_pairs(dst, src, cs);
+ my_uca_level_booster_2bytes_pupulate_ascii2_contractions(dst,
+ &src->contractions);
+ my_uca_level_booster_2bytes_disable_context_dependent(dst,
+ &src->contractions,
+ cs);
+ my_uca_level_booster_weight2_populate(dst);
+}
+
+
+/*
+ Allocate a zero-filled booster using the loader's once_alloc routine.
+ @return The new booster, or NULL if the allocation failed.
+*/
+static MY_UCA_LEVEL_BOOSTER *
+my_uca_level_booster_alloc(MY_CHARSET_LOADER *loader)
+{
+ size_t nbytes= sizeof(MY_UCA_LEVEL_BOOSTER);
+ MY_UCA_LEVEL_BOOSTER *res;
+ if (!(res= (MY_UCA_LEVEL_BOOSTER *) (loader->once_alloc)(nbytes)))
+ return NULL;
+ bzero(res, nbytes);
+ return res;
+}
+
+
+/*
+ Allocate a booster and populate it from the given weight level.
+ @return The new booster, or NULL if the allocation failed.
+*/
+static MY_UCA_LEVEL_BOOSTER *
+my_uca_level_booster_new(MY_CHARSET_LOADER *loader,
+ CHARSET_INFO *cs,
+ MY_UCA_WEIGHT_LEVEL *level)
+{
+ MY_UCA_LEVEL_BOOSTER *res;
+ if (!(res= my_uca_level_booster_alloc(loader)))
+ return NULL;
+ my_uca_level_booster_populate(res, level, cs);
+ return res;
+}
+
+
+/*
+ Skip the simple equal prefix of two strings using the
+ "two bytes produce one or two weights" optimization.
+ Both strings are walked two bytes at a time. The walk stops at the
+ first position where the byte pair of the first string has no simple
+ weights (its first weight is 0), or where the weights of the two byte
+ pairs differ.
+ Weights are compared rather than bytes, so the skipped parts of
+ the two strings are not necessarily byte-wise equal.
+ Return the prefix length in bytes. It is always even, it is the same
+ for both strings, and it never exceeds the shorter string length.
+ NOTE(review): booster is dereferenced unconditionally, while
+ init_weight_level() stores the result of my_uca_level_booster_new()
+ without checking it for NULL - confirm that this function cannot
+ be reached with a NULL booster.
+*/
+static size_t
+my_uca_level_booster_equal_prefix_length(const MY_UCA_LEVEL_BOOSTER *booster,
+ const uchar *s, size_t slen,
+ const uchar *t, size_t tlen)
+{
+ const uchar *s0= s;
+ size_t simple_count= MY_MIN(slen, tlen) >> 1;
+ for ( ; simple_count; s+= 2, t+= 2, simple_count--)
+ {
+ const MY_UCA_WEIGHT2 *ws, *wt;
+ ws= my_uca_level_booster_simple_weight2_addr_const(booster, s[0], s[1]);
+ wt= my_uca_level_booster_simple_weight2_addr_const(booster, t[0], t[1]);
+ if (ws->weight[0] &&
+ ws->weight[0] == wt->weight[0] &&
+ ws->weight[1] == wt->weight[1])
+ continue;
+ break;
+ }
+ return s - s0;
+}
+
+
static my_bool
-init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules,
+init_weight_level(MY_CHARSET_LOADER *loader, CHARSET_INFO *cs,
+ MY_COLL_RULES *rules,
MY_UCA_WEIGHT_LEVEL *dst, const MY_UCA_WEIGHT_LEVEL *src)
{
MY_COLL_RULE *r, *rlast;
@@ -34055,6 +34578,9 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules,
}
}
+ if (cs->mbminlen == 1)
+ dst->booster= my_uca_level_booster_new(loader, cs, dst);
+
return FALSE;
}
@@ -34151,7 +34677,7 @@ my_uca_init_levels(MY_CHARSET_LOADER *loader, MY_UCA_INFO *dst,
cs->coll_name.str, i + 1);
return TRUE;
}
- if (init_weight_level(loader, rules,
+ if (init_weight_level(loader, cs, rules,
&dst->level[i], &src->level[i]))
return TRUE;
}
diff --git a/strings/ctype-uca.ic b/strings/ctype-uca.ic
index f0855355a92..f3d543be1b1 100644
--- a/strings/ctype-uca.ic
+++ b/strings/ctype-uca.ic
@@ -95,6 +95,15 @@ MY_FUNCTION_NAME(strnncoll_onelevel)(CHARSET_INFO *cs,
my_uca_scanner tscanner;
int s_res;
int t_res;
+
+#if MY_UCA_ASCII_OPTIMIZE
+{
+ size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster,
+ s, slen, t, tlen);
+ s+= prefix, slen-= prefix;
+ t+= prefix, tlen-= prefix;
+}
+#endif
my_uca_scanner_init_any(&sscanner, cs, level, s, slen);
my_uca_scanner_init_any(&tscanner, cs, level, t, tlen);
@@ -204,6 +213,15 @@ MY_FUNCTION_NAME(strnncollsp_onelevel)(CHARSET_INFO *cs,
my_uca_scanner sscanner, tscanner;
int s_res, t_res;
+#if MY_UCA_ASCII_OPTIMIZE
+{
+ size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster,
+ s, slen, t, tlen);
+ s+= prefix, slen-= prefix;
+ t+= prefix, tlen-= prefix;
+}
+#endif
+
my_uca_scanner_init_any(&sscanner, cs, level, s, slen);
my_uca_scanner_init_any(&tscanner, cs, level, t, tlen);
@@ -432,6 +450,18 @@ MY_FUNCTION_NAME(strnncollsp_nchars_onelevel)(CHARSET_INFO *cs,
size_t s_nchars_left= nchars;
size_t t_nchars_left= nchars;
+/*
+TODO: strnncollsp_nchars_onelevel
+#if MY_UCA_ASCII_OPTIMIZE
+{
+ size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster,
+ s, slen, t, tlen);
+ s+= prefix, slen-= prefix;
+ t+= prefix, tlen-= prefix;
+}
+#endif
+*/
+
my_uca_scanner_init_any(&sscanner, cs, level, s, slen);
my_uca_scanner_init_any(&tscanner, cs, level, t, tlen);
diff --git a/unittest/strings/strings-t.c b/unittest/strings/strings-t.c
index 7532244b0a2..9636634fb8e 100644
--- a/unittest/strings/strings-t.c
+++ b/unittest/strings/strings-t.c
@@ -1341,7 +1341,7 @@ strnncollsp_char_one(CHARSET_INFO *cs, const STRNNCOLLSP_CHAR_PARAM *p)
str2hex(ahex, sizeof(ahex), p->a.str, p->a.length);
str2hex(bhex, sizeof(bhex), p->b.str, p->b.length);
diag("%-25s %-12s %-12s %3d %7d %7d%s",
- cs->cs_name.str, ahex, bhex, (int) p->nchars, p->res, res,
+ cs->coll_name.str, ahex, bhex, (int) p->nchars, p->res, res,
eqres(res, p->res) ? "" : " FAILED");
if (!eqres(res, p->res))
{