diff options
author | Alexander Barkov <bar@mariadb.com> | 2021-11-18 19:18:44 +0400 |
---|---|---|
committer | Alexander Barkov <bar@mariadb.com> | 2021-11-23 11:20:54 +0400 |
commit | e98d423ced656e3cb8f3082f6ba1b14d727ae92a (patch) | |
tree | eaaad85626411e8e75ee2a4413de277c2a5f408e | |
parent | 66ea4a628cf7612099b890d5ca8bb60a42ba7f87 (diff) | |
download | mariadb-git-e98d423ced656e3cb8f3082f6ba1b14d727ae92a.tar.gz |
Performance improvement
-rw-r--r-- | include/m_ctype.h | 52 | ||||
-rw-r--r-- | strings/ctype-uca.c | 486 | ||||
-rw-r--r-- | strings/ctype-uca.ic | 50 |
3 files changed, 580 insertions, 8 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h index 41523913c10..b3b3a8093ea 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -138,6 +138,57 @@ my_bool my_uca_can_be_contraction_tail(const MY_CONTRACTIONS *c, my_wc_t wc); uint16 *my_uca_contraction2_weight(const MY_CONTRACTIONS *c, my_wc_t wc1, my_wc_t wc2); +typedef struct my_uca_weight2_t +{ + uint16 weight[2]; +} MY_UCA_WEIGHT2; + + +/* + In DUCET as of Unicode-14.0.0: + - All characters in the range U+0000..U+007F (i.e. using one byte in utf8) + have not more than two weights on all weight levels. + - All characters in the range U+0080..U+07FF (i.e. using two bytes in utf8) + have not more than four weights on all weight levels. + Therefore the limit of 4 weights should cover all byte pairs + (i.e. two ASCII characters or one 2-byte character) + that are a subject for the "process 2 bytes at a time" optimization. + If some collation reorders any character from the mentioned ranges + in the way that it produces more weights, such character will not + be optimized, but will be correctly processed the slower mb_wc-based + method (1 character at a time). +*/ +#define MY_UCA_2BYTES_MAX_WEIGHT_SIZE (4+1) /* Including 0 terminator */ + +typedef struct my_uca_2bytes_item_t +{ + uint16 weight[MY_UCA_2BYTES_MAX_WEIGHT_SIZE]; +} MY_UCA_2BYTES_ITEM; + + +typedef struct my_uca_level_booster_t +{ + /* + A helper array to process 2 bytes at a time during string comparison. + It maps all 2-bytes sequences that make: + - two ASCII characters or + - one 2-byte character + to their weights. The weight length is limited to + MY_UCA_2BYTES_MAX_WEIGHT_SIZE-1 weights. + This array is used in the main loop optimization. + */ + MY_UCA_2BYTES_ITEM weight_strings_2bytes[0x10000]; + /* + A helper array to process 2bytes at a time during string comparison, + with an even more efficient way than the above one. + The weight size is limited to 2 weights, so it's used for the cases + when 2 input bytes produce 1 or 2 weights. + This limit makes the code using this array even simpler and faster. + This array is used for prefix optimization. + */ + MY_UCA_WEIGHT2 weight_strings_2bytes_to_1_or_2_weights[0x10000]; +} MY_UCA_LEVEL_BOOSTER; + /* Collation weights on a single level (e.g. primary, secondary, tertiarty) */ typedef struct my_uca_level_info_st @@ -147,6 +198,7 @@ typedef struct my_uca_level_info_st uint16 **weights; MY_CONTRACTIONS contractions; uint levelno; + MY_UCA_LEVEL_BOOSTER *booster; } MY_UCA_WEIGHT_LEVEL; diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index 45b6869c6f5..b8bc48eb6fc 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -6540,7 +6540,8 @@ MY_UCA_INFO my_uca_v400= NULL, /* item */ NULL /* flags */ }, - 0 /* levelno */ + 0, /* levelno */ + NULL /* weights_2bytes */ }, { 0, @@ -6551,7 +6552,8 @@ MY_UCA_INFO my_uca_v400= NULL, NULL }, - 1 /* levelno */ + 1, /* levelno */ + NULL /* weights_2bytes */ }, }, @@ -30097,7 +30099,8 @@ MY_UCA_INFO my_uca_v520_th= thai_contractions, /* item */ NULL /* flags */ }, - 0 /* levelno */ + 0, /* levelno */ + NULL /* weights_2bytes */ }, { 0x10FFFF, /* maxchar */ @@ -30108,7 +30111,8 @@ MY_UCA_INFO my_uca_v520_th= thai_contractions_w2, /* item */ NULL /* flags */ }, - 1 /* levelno */ + 1, /* levelno */ + NULL /* weights_2bytes */ }, }, @@ -30143,7 +30147,8 @@ MY_UCA_INFO my_uca_v520= NULL, /* item */ NULL /* flags */ }, - 0 /* levelno */ + 0, /* levelno */ + NULL /* weights_2bytes */ }, { @@ -30155,7 +30160,8 @@ MY_UCA_INFO my_uca_v520= NULL, /* item */ NULL /* flags */ }, - 1 /* levelno */ + 1, /* levelno */ + NULL /* weights_2bytes */ }, }, @@ -33637,8 +33643,468 @@ my_uca_generate_pages(MY_CHARSET_LOADER *loader, } +static size_t +my_uca_weight_cpy(uint16 *dst, const uint16 *src) +{ + const uint16 *src0= src; + for ( ; ; dst++, src++ ) + { + *dst= *src; + if (!dst[0]) + break; + } + return src - src0; +} + + +#define MY_UCA_NOT_APPLICABLE_MARKER 0xFFFF + +static inline my_bool +my_uca_2bytes_item_is_applicable(const MY_UCA_2BYTES_ITEM *w2) +{ + return w2->weight[1] != MY_UCA_NOT_APPLICABLE_MARKER; +} + + +static void +my_uca_2bytes_item_set_not_applicable(MY_UCA_2BYTES_ITEM *dst) +{ + dst->weight[0]= 0; + dst->weight[1]= MY_UCA_NOT_APPLICABLE_MARKER; +} + + +static inline size_t +my_uca_weight_length(const uint16 *str) +{ + uint res; + for (res= 0; str[res] ; res++) + { } + return res; +} + + +static void +my_uca_2bytes_item_weight_cpy(MY_UCA_2BYTES_ITEM *dst, const uint16 *src) +{ + size_t wlen= my_uca_weight_length(src); + if (wlen + 1 > array_elements(dst->weight)) + my_uca_2bytes_item_set_not_applicable(dst); + else + my_uca_weight_cpy(dst->weight, src); +} + + +static void +my_uca_2bytes_item_weight_cpy2(MY_UCA_2BYTES_ITEM *dst, + const uint16 *wa, + const uint16 *wb) +{ + size_t la= my_uca_weight_length(wa); + size_t lb= my_uca_weight_length(wb); + if (la + lb + 1 > array_elements(dst->weight)) + { + my_uca_2bytes_item_set_not_applicable(dst); + } + else + { + my_uca_weight_cpy(dst->weight, wa); + my_uca_weight_cpy(dst->weight + la, wb); + } +} + + +static void +my_uca_2bytes_item_set_ascii2(MY_UCA_2BYTES_ITEM *dst, + const MY_UCA_WEIGHT_LEVEL *level, + uint a, uint b) +{ + const uint16 *wa= level->weights[0] + a * level->lengths[0]; + const uint16 *wb= level->weights[0] + b * level->lengths[0]; + my_uca_2bytes_item_weight_cpy2(dst, wa, wb); +} + + +static void +my_uca_2bytes_item_set_non_ascii(MY_UCA_2BYTES_ITEM *dst, + const MY_UCA_WEIGHT_LEVEL *level, + CHARSET_INFO *cs, + uint a, uint b) +{ + uchar ch[2]= {(uchar) a, (uchar) b}; + my_wc_t wc; + int rc= my_ci_mb_wc(cs, &wc, &ch[0], &ch[2]); + if (rc == 2) + { + /* Byte sequence 'ab' make one valid 2-byte character */ + uint pageno= wc>>8; + const uint16 *w= level->weights[pageno] + (wc & 0xFF) * level->lengths[pageno]; + my_uca_2bytes_item_weight_cpy(dst, w); + } + else /* TODO: ILSEQ vs TOO_SMALL */ + { + my_uca_2bytes_item_set_not_applicable(dst); + } +} + + +static inline MY_UCA_2BYTES_ITEM * +my_uca_level_booster_2bytes_item_addr(MY_UCA_LEVEL_BOOSTER *booster, + uint a, uint b) +{ + size_t w2offs= a * 256 + b; + DBUG_ASSERT(a < 256 && b < 256); + return &booster->weight_strings_2bytes[w2offs]; +} + + +static inline const MY_UCA_2BYTES_ITEM * +my_uca_level_booster_2bytes_item_addr_const(const MY_UCA_LEVEL_BOOSTER *booster, + uint a, uint b) +{ + size_t w2offs= a * 256 + b; + DBUG_ASSERT(a < 256 && b < 256); + return &booster->weight_strings_2bytes[w2offs]; +} + + +static inline const MY_UCA_WEIGHT2 * +my_uca_level_booster_simple_weight2_addr_const( + const MY_UCA_LEVEL_BOOSTER *booster, + uchar a, uchar b) +{ + uint offs= (uint) a * 256 + b; + return &booster->weight_strings_2bytes_to_1_or_2_weights[offs]; +} + + +static void +my_uca_level_booster_2bytes_disable_if_2byte_char(MY_UCA_LEVEL_BOOSTER *booster, + CHARSET_INFO *cs, + my_wc_t ch) +{ + uchar tmp[MY_CS_MBMAXLEN]; + int rc= my_ci_wc_mb(cs, ch, tmp, tmp + sizeof(tmp)); + if (rc == 2) + { + MY_UCA_2BYTES_ITEM *dst= my_uca_level_booster_2bytes_item_addr(booster, + tmp[0], + tmp[1]); + my_uca_2bytes_item_set_not_applicable(dst); + } +} + + +static inline void +my_uca_level_booster_2bytes_set_not_applicable_by_tail( + MY_UCA_LEVEL_BOOSTER *booster, + uint tail) +{ + uint head; + for (head= 0; head < 256; head++) + { + MY_UCA_2BYTES_ITEM *dst= my_uca_level_booster_2bytes_item_addr(booster, + head, + tail); + my_uca_2bytes_item_set_not_applicable(dst); + } +} + + +static void +my_uca_level_booster_2bytes_disable_contraction(MY_UCA_LEVEL_BOOSTER *booster, + const MY_CONTRACTION *c, + CHARSET_INFO *cs) +{ + DBUG_ASSERT(!c->with_context); + + if (c->ch[0] < 0x80) + { + MY_UCA_2BYTES_ITEM *dst; + /* + 2-byte pairs that end with an ASCII contraction head. + ...xAB... + Suppose AB is a contraction where A is an ASCII character. + Disable byte pairs xA (for all x=0x00..0xFF). + */ + my_uca_level_booster_2bytes_set_not_applicable_by_tail(booster, c->ch[0]); + + /* + Disable 2-byte ASCII combinations that start + 3-byte (or longer) contractions. + */ + if (c->ch[1] < 0x80 && c->ch[2] != 0) + { + /* + A 3+ character contraction that starts from two ASCII characters: + ...ABC... + */ + dst= my_uca_level_booster_2bytes_item_addr(booster, c->ch[0], c->ch[1]); + my_uca_2bytes_item_set_not_applicable(dst); + } + } + else + { + /* + Disable 2-byte characters that start contractions: + ...[Aa][B]... MB + ASCII + ...[Aa][Bb].. MB + MB + The next weight depends on the following characters. + */ + my_uca_level_booster_2bytes_disable_if_2byte_char(booster, cs, c->ch[0]); + } +} + + +static void +my_uca_level_booster_2bytes_disable_previous_context( + MY_UCA_LEVEL_BOOSTER *booster, + const MY_CONTRACTION *c, + CHARSET_INFO *cs) +{ + DBUG_ASSERT(c->with_context); + + if (c->ch[0] < 0x80 && c->ch[1] < 0x80) + { + DBUG_ASSERT(c->ch[2] == 0); + if (c->ch[2] == 0) + { + /* + A previous context pair with exactly two ASCII characters: + ...AB... + "A" is a look-behind character (the context). + "B" is a character that we need to generate a weight for. + The underlying code does not support handling these character + in a single shot yet. It works as follows at the moment: + - A is scanned separately from B and generates its independent weight. + - B is scanned separately on the next step and and generates its + context dependent weight (by looking behind). + */ + MY_UCA_2BYTES_ITEM *dst= my_uca_level_booster_2bytes_item_addr(booster, + c->ch[0], + c->ch[1]); + my_uca_2bytes_item_set_not_applicable(dst); + } + } + else + { + /* + Disable 2-byte characters that start pairs with a previous context: + ...[Aa][B]... MB + ASCII + ...[Aa][Bb].. MB + MB + These characters can be actually scanned in a single shot, + but the relevant code in scanner_next() assumes previous context + head characters are ASCII only sets and sets the previous + character simply as sbeg[1]. + */ + my_uca_level_booster_2bytes_disable_if_2byte_char(booster, cs, c->ch[0]); + } +} + + +static void +my_uca_2bytes_item_set_pair(MY_UCA_2BYTES_ITEM *dst, + const MY_UCA_WEIGHT_LEVEL *level, + CHARSET_INFO *cs, + uint a, uint b) +{ + if (a < 0x80 && b < 0x80) + my_uca_2bytes_item_set_ascii2(dst, level, a, b); + else + my_uca_2bytes_item_set_non_ascii(dst, level, cs, a, b); +} + + +static void +my_uca_level_booster_2bytes_populate_pairs(MY_UCA_LEVEL_BOOSTER *booster, + const MY_UCA_WEIGHT_LEVEL *level, + CHARSET_INFO *cs) +{ + uint a, b; + for (a= 0; a < 256; a++) + { + for (b= 0; b < 256; b++) + { + MY_UCA_2BYTES_ITEM *dst; + dst= my_uca_level_booster_2bytes_item_addr(booster, a, b); + my_uca_2bytes_item_set_pair(dst, level, cs, a, b); + } + } +} + + +/* + Populate contractions consisting of two ASCII letters. + Only true contractions are handled here so far. + Previous context pairs are handled separately. +*/ +static void +my_uca_level_booster_2bytes_pupulate_ascii2_contractions( + MY_UCA_LEVEL_BOOSTER *booster, + const MY_CONTRACTIONS *list) +{ + size_t i; + for (i= 0; i < list->nitems; i++) + { + const MY_CONTRACTION *c= &list->item[i]; + if (c->ch[0] < 0x80 && c->ch[1] < 0x80 && c->ch[2] == 0 && + !c->with_context) + { + MY_UCA_2BYTES_ITEM *dst= my_uca_level_booster_2bytes_item_addr(booster, + c->ch[0], + c->ch[1]); + my_uca_2bytes_item_weight_cpy(dst, c->weight); + } + } +} + + +static void +my_uca_level_booster_2bytes_disable_context_dependent( + MY_UCA_LEVEL_BOOSTER *booster, + const MY_CONTRACTIONS *list, + CHARSET_INFO *cs) +{ + size_t i; + for (i= 0; i < list->nitems; i++) + { + const MY_CONTRACTION *c= &list->item[i]; + if (c->with_context) + my_uca_level_booster_2bytes_disable_previous_context(booster, c, cs); + else + my_uca_level_booster_2bytes_disable_contraction(booster, c, cs); + } +} + + +/* + Populate the array of MY_UCA_WEIGHT2 for all possible byte pairs {a,b} + as follows: + + Number of characters Number of weights WEIGHT2 + -------------------- ----------------- ------ + 2 (two ASCII chars) 0 (both ignorable) {0,0} [IGN] + 2 (two ASCII chars) 1 (e.g. Czech "ch") {X,0} + 2 (two ASCII chars) 1 (e.g. ignorable + non-ignorable) {X,0} + 2 (two ASCII chars) 2 (two ASCII chars, one weigth each) {X,0} + 2 (two ASCII chars) 3+ (contraction with a long expansion) {0,0} [E3] + 1 (one 2-byte char) 0 (ignorable) {0,0} [IGN] + 1 (one 2-byte char) 1 {X,0} + 1 (one 2-byte char) 2 (short expansion, e.g. German SZ) {X,Y} + 1 (one 2-byte char) 3+ (long expansion) {0,0} [E3] + 0 (incomplete 3/4-byte char) {0,0} [INC] + + All byte pairs that depend on the context (e.g. contraction parts) + and that were previously marked as such by + my_uca_level_booster_2bytes_disable_context_dependent() + set WEIGHT2 to {0,0} [CTX]. + + After the initialization, the array contains non-zero weights for + the most typical simple cases of mapping from 2-bytes to weights, + so inside strnncoll*() we can skip equal string prefixes much faster, + using a cheaper simpler code. +*/ +static void +my_uca_level_booster_weight2_populate(MY_UCA_LEVEL_BOOSTER *booster) +{ + size_t i; + for (i= 0; i < 0x10000; i++) + { + MY_UCA_WEIGHT2 *dst= &booster->weight_strings_2bytes_to_1_or_2_weights[i]; + MY_UCA_2BYTES_ITEM *src= &booster->weight_strings_2bytes[i]; + if (src->weight[0] && (!src->weight[1] || !src->weight[2])) + { + /* + Simplest mapping: + - Two ASCII characters make one or two weights + - One 2-byte character makes one or two weights + Handled by the simpler loop at the comparison time. + */ + dst->weight[0]= src->weight[0]; + dst->weight[1]= src->weight[1]; + } + else + { + /* + More complex mapping: + - Ignorable - see [IGN] above + - More than two weights - see [E3] above + - Incomplete (a 3-byte or 4-byte char head) - see [INC] above + - Not applicable (context dependent) - see [CTX] above + Handled by the full-featured slower loop at the comparison time. + */ + dst->weight[0]= 0; + dst->weight[1]= 0; + } + } +} + + +static void +my_uca_level_booster_populate(MY_UCA_LEVEL_BOOSTER *dst, + const MY_UCA_WEIGHT_LEVEL *src, + CHARSET_INFO *cs) +{ + my_uca_level_booster_2bytes_populate_pairs(dst, src, cs); + my_uca_level_booster_2bytes_pupulate_ascii2_contractions(dst, + &src->contractions); + my_uca_level_booster_2bytes_disable_context_dependent(dst, + &src->contractions, + cs); + my_uca_level_booster_weight2_populate(dst); +} + + +static MY_UCA_LEVEL_BOOSTER * +my_uca_level_booster_alloc(MY_CHARSET_LOADER *loader) +{ + size_t nbytes= sizeof(MY_UCA_LEVEL_BOOSTER); + MY_UCA_LEVEL_BOOSTER *res; + if (!(res= (MY_UCA_LEVEL_BOOSTER *) (loader->once_alloc)(nbytes))) + return NULL; + bzero(res, nbytes); + return res; +} + + +static MY_UCA_LEVEL_BOOSTER * +my_uca_level_booster_new(MY_CHARSET_LOADER *loader, + CHARSET_INFO *cs, + MY_UCA_WEIGHT_LEVEL *level) +{ + MY_UCA_LEVEL_BOOSTER *res; + if (!(res= my_uca_level_booster_alloc(loader))) + return NULL; + my_uca_level_booster_populate(res, level, cs); + return res; +} + + +static size_t +my_uca_level_booster_equal_prefix_length(const MY_UCA_LEVEL_BOOSTER *booster, + const uchar *s, size_t slen, + const uchar *t, size_t tlen) +{ + const uchar *s0= s; + size_t simple_count= MY_MIN(slen, tlen) >> 1; + for ( ; simple_count; s+= 2, t+= 2, simple_count--) + { + const MY_UCA_WEIGHT2 *ws, *wt; + ws= my_uca_level_booster_simple_weight2_addr_const(booster, s[0], s[1]); + wt= my_uca_level_booster_simple_weight2_addr_const(booster, t[0], t[1]); + if (ws->weight[0] && + ws->weight[0] == wt->weight[0] && + ws->weight[1] == wt->weight[1]) + continue; + break; + } + return s - s0; +} + + static my_bool -init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, +init_weight_level(MY_CHARSET_LOADER *loader, CHARSET_INFO *cs, + MY_COLL_RULES *rules, MY_UCA_WEIGHT_LEVEL *dst, const MY_UCA_WEIGHT_LEVEL *src) { MY_COLL_RULE *r, *rlast; @@ -33736,6 +34202,10 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, memcpy(weights, item->weight, length * sizeof(uint16)); weights[length]= 0; } + + if (cs->mbminlen == 1) + dst->booster= my_uca_level_booster_new(loader, cs, dst); + return FALSE; } @@ -33879,7 +34349,7 @@ create_tailoring(struct charset_info_st *cs, cs->coll_name.str, i + 1); goto ex; } - if ((rc= init_weight_level(loader, &rules, + if ((rc= init_weight_level(loader, cs, &rules, &new_uca.level[i], &src_uca->level[i]))) goto ex; } diff --git a/strings/ctype-uca.ic b/strings/ctype-uca.ic index 32da4c87fdc..5ae1696677d 100644 --- a/strings/ctype-uca.ic +++ b/strings/ctype-uca.ic @@ -51,6 +51,38 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner) my_wc_t currwc; const uint16 *cweight; +#if MY_UCA_ASCII_OPTIMIZE + if (scanner->sbeg + 1 < scanner->send) + { + const MY_UCA_2BYTES_ITEM *ww; + ww= my_uca_level_booster_2bytes_item_addr_const(scanner->level->booster, + scanner->sbeg[0], + scanner->sbeg[1]); + if (my_uca_2bytes_item_is_applicable(ww)) + { + /* + Byte pairs that make 2-byte head characters in previous + context pairs are marked as not applicable for optimization + during the collation initialization. So when we come here + sbeg[0] and sbeg[1] are: + - either two ASCII characters + - or one 2-byte character which IS NOT a previous context head + Just remember sbeg[1] as the previous character for simplicity. + This may erroneously interpret bytes 0x80..0x9F as previous context + head characters U+0080..U+009F. However, CLDR does not have any real + collations that use these characters as previous context heads. + */ + scanner->page= 0; + scanner->code= (int) scanner->sbeg[1]; + scanner->sbeg+= 2; + if ((weight= my_uca_scanner_set_weight(scanner, ww->weight))) + return weight; + continue; /* Ignorable character */ + } + /* 2 byte optimization is not applicable, go the slow path */ + } +#endif + /* Get next character */ #if MY_UCA_ASCII_OPTIMIZE /* Get next ASCII character */ @@ -195,6 +227,15 @@ MY_FUNCTION_NAME(strnncoll_onelevel)(CHARSET_INFO *cs, my_uca_scanner tscanner; int s_res; int t_res; + +#if MY_UCA_ASCII_OPTIMIZE +{ + size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster, + s, slen, t, tlen); + s+= prefix, slen-= prefix; + t+= prefix, tlen-= prefix; +} +#endif my_uca_scanner_init_any(&sscanner, cs, level, s, slen); my_uca_scanner_init_any(&tscanner, cs, level, t, tlen); @@ -301,6 +342,15 @@ MY_FUNCTION_NAME(strnncollsp_onelevel)(CHARSET_INFO *cs, my_uca_scanner sscanner, tscanner; int s_res, t_res; +#if MY_UCA_ASCII_OPTIMIZE +{ + size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster, + s, slen, t, tlen); + s+= prefix, slen-= prefix; + t+= prefix, tlen-= prefix; +} +#endif + my_uca_scanner_init_any(&sscanner, cs, level, s, slen); my_uca_scanner_init_any(&tscanner, cs, level, t, tlen); |