diff options
author | Alexander Barkov <bar@mariadb.com> | 2018-10-16 19:10:57 +0400 |
---|---|---|
committer | Alexander Barkov <bar@mariadb.com> | 2018-10-18 07:49:58 +0400 |
commit | 475c6ec551fa8847f8993e7cb2d3ff1119f29f5a (patch) | |
tree | 94cd3cd9fa24d54c69601fc27e35abd746fe9d95 /strings | |
parent | d88c136b9fb409cbf6421635a6175329e7182cd7 (diff) | |
download | mariadb-git-475c6ec551fa8847f8993e7cb2d3ff1119f29f5a.tar.gz |
MDEV-17474 Change Unicode collation implementation from "handler" to "inline" style (part#2)
Additional changes:
1. Adding a fast path for ASCII characters
2. Adding dedicated MY_COLLATION_HANDLERs for collations with no contractions
(for utf8 and for utf8mb4 character sets). The choice between
the full-featured handler and the "no contraction" handler is
made at the collation initialization time.
Diffstat (limited to 'strings')
-rw-r--r-- | strings/ctype-uca.c | 183 | ||||
-rw-r--r-- | strings/ctype-uca.ic | 88 |
2 files changed, 239 insertions, 32 deletions
diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index 4c670861a9f..320dc1ee225 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -31410,6 +31410,28 @@ my_uca_can_be_previous_context_tail(const MY_CONTRACTIONS *list, my_wc_t wc) /** + Check if a character needs previous/next context handling: + - can be a previous context tail + - can be a contraction start + + @param level Pointer to an UCA weight level data + @param wc Code point + + @return + @retval FALSE - does not need context handling + @retval TRUE - needs context handling +*/ + +static inline my_bool +my_uca_needs_context_handling(const MY_UCA_WEIGHT_LEVEL *level, my_wc_t wc) +{ + return level->contractions.nitems > 0 && + level->contractions.flags[wc & MY_UCA_CNT_FLAG_MASK] & + (MY_UCA_PREVIOUS_CONTEXT_TAIL | MY_UCA_CNT_HEAD); +} + + +/** Compare two wide character strings, wide analog to strncmp(). @param a Pointer to the first string @@ -31543,6 +31565,60 @@ my_uca_previous_context_find(my_uca_scanner *scanner, return NULL; } + +/* + Find a context dependent weight of a character. + @param scanner - UCA weight scanner. The caller should set + its members "page" and "code" to the previous character + (or to zeros if there is no previous character). + @param wc - an array of wide characters which has at least + MY_UCA_MAX_CONTRACTION elements, where wc[0] is set + to the current character (whose weight is being resolved). + The values of wc[i>0] are not important, but if wc[0] + appears to be a known contraction head, the function + will collect further contraction parts into wc[i>0]. + If wc[0] and the previous character make a previous context + pair, then wc[1] is set to the previous character. 
+ + @retval NULL if could not find any contextual weights for wc[0] + @retval non null pointer to a zero-terminated weight string otherwise +*/ +static inline uint16 * +my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t *wc) +{ + uint16 *cweight; + DBUG_ASSERT(scanner->level->contractions.nitems); + /* + If we have scanned a character which can have previous context, + and there were some more characters already before, + then reconstruct codepoint of the previous character + from "page" and "code" into wc[1], and verify that {wc[1], wc[0]} + together form a real previous context pair. + Note, we support only 2-character long sequences with previous + context at the moment. CLDR does not have longer sequences. + */ + if (my_uca_can_be_previous_context_tail(&scanner->level->contractions, + wc[0]) && + scanner->wbeg != nochar && /* if not the very first character */ + my_uca_can_be_previous_context_head(&scanner->level->contractions, + (wc[1]= ((scanner->page << 8) + + scanner->code))) && + (cweight= my_uca_previous_context_find(scanner, wc[1], wc[0]))) + { + scanner->page= scanner->code= 0; /* Clear for the next character */ + return cweight; + } + else if (my_uca_can_be_contraction_head(&scanner->level->contractions, + wc[0])) + { + /* Check if wc[0] starts a contraction */ + if ((cweight= my_uca_scanner_contraction_find(scanner, wc))) + return cweight; + } + return NULL; +} + + /****************************************************************/ /** @@ -31935,6 +32011,23 @@ int my_wildcmp_uca(CHARSET_INFO *cs, /* + Tests if an optimized "no contraction" handler can be used for + the given collation. 
+*/ +static my_bool +my_uca_collation_can_optimize_no_contractions(CHARSET_INFO *cs) +{ + uint i; + for (i= 0; i < cs->levels_for_order ; i++) + { + if (my_uca_have_contractions_quick(&cs->uca->level[i])) + return FALSE; + } + return TRUE; +} + + +/* Collation language is implemented according to subset of ICU Collation Customization (tailorings): http://icu.sourceforge.net/userguide/Collate_Customization.html @@ -33645,6 +33738,31 @@ static size_t my_strnxfrmlen_any_uca_multilevel(CHARSET_INFO *cs, size_t len) /* + This structure is used at the collation initialization time, to switch + from a full-featured collation handler to a "no contraction" collation + handler if the collation is known not to have any contractions. +*/ +typedef struct +{ + MY_COLLATION_HANDLER *pad; + MY_COLLATION_HANDLER *nopad; + MY_COLLATION_HANDLER *multilevel_pad; + MY_COLLATION_HANDLER *multilevel_nopad; +} MY_COLLATION_HANDLER_PACKAGE; + + +static void my_uca_handler_map(struct charset_info_st *cs, + const MY_COLLATION_HANDLER_PACKAGE *from, + const MY_COLLATION_HANDLER_PACKAGE *to) +{ + if (cs->coll == from->pad) cs->coll= to->pad; + else if (cs->coll == from->nopad) cs->coll= to->nopad; + else if (cs->coll == from->multilevel_pad) cs->coll= to->multilevel_pad; + else if (cs->coll == from->multilevel_nopad) cs->coll= to->multilevel_nopad; +} + + +/* Define generic collation handlers for multi-level collations with tailoring: my_uca_collation_handler_nopad_multilevel_generic @@ -33656,6 +33774,9 @@ static size_t my_strnxfrmlen_any_uca_multilevel(CHARSET_INFO *cs, size_t len) #define MY_FUNCTION_NAME(x) my_uca_ ## x ## _generic #define MY_MB_WC(scanner, wc, beg, end) (scanner->cs->cset->mb_wc(scanner->cs, wc, beg, end)) #define MY_LIKE_RANGE my_like_range_generic +#define MY_UCA_ASCII_OPTIMIZE 0 +#define MY_UCA_COMPILE_CONTRACTIONS 1 +#define MY_UCA_COLL_INIT my_coll_init_uca #include "ctype-uca.ic" @@ -33758,6 +33879,9 @@ ex: #define MY_FUNCTION_NAME(x) my_uca_ ## x ## _ucs2 #define 
MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_ucs2_quick(wc, beg, end)) #define MY_LIKE_RANGE my_like_range_generic +#define MY_UCA_ASCII_OPTIMIZE 0 +#define MY_UCA_COMPILE_CONTRACTIONS 1 +#define MY_UCA_COLL_INIT my_coll_init_uca #include "ctype-uca.ic" @@ -34711,12 +34835,38 @@ struct charset_info_st my_charset_ucs2_unicode_520_nopad_ci= #ifdef HAVE_CHARSET_utf8 +static my_bool +my_uca_coll_init_utf8mb3(struct charset_info_st *cs, MY_CHARSET_LOADER *loader); + #include "ctype-utf8.h" #define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf8mb3 #define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb3_quick(wc, beg, end)) #define MY_LIKE_RANGE my_like_range_mb +#define MY_UCA_ASCII_OPTIMIZE 1 +#define MY_UCA_COMPILE_CONTRACTIONS 1 +#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb3 #include "ctype-uca.ic" +#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _no_contractions_utf8mb3 +#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb3_quick(wc, beg, end)) +#define MY_LIKE_RANGE my_like_range_mb +#define MY_UCA_ASCII_OPTIMIZE 1 +#define MY_UCA_COMPILE_CONTRACTIONS 0 +#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb3 +#include "ctype-uca.ic" + + +static my_bool +my_uca_coll_init_utf8mb3(struct charset_info_st *cs, MY_CHARSET_LOADER *loader) +{ + if (my_coll_init_uca(cs, loader)) + return TRUE; + if (my_uca_collation_can_optimize_no_contractions(cs)) + my_uca_handler_map(cs, &my_uca_package_utf8mb3, + &my_uca_package_no_contractions_utf8mb3); + return FALSE; +} + /* We consider bytes with code more than 127 as a letter. 
@@ -35690,12 +35840,39 @@ struct charset_info_st my_charset_utf8_unicode_520_nopad_ci= #ifdef HAVE_CHARSET_utf8mb4 +static my_bool +my_uca_coll_init_utf8mb4(struct charset_info_st *cs, MY_CHARSET_LOADER *loader); + + #define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf8mb4 #define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb4_quick(wc, beg, end)) #define MY_LIKE_RANGE my_like_range_mb +#define MY_UCA_ASCII_OPTIMIZE 1 +#define MY_UCA_COMPILE_CONTRACTIONS 1 +#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb4 +#include "ctype-uca.ic" + +#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _no_contractions_utf8mb4 +#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb4_quick(wc, beg, end)) +#define MY_LIKE_RANGE my_like_range_mb +#define MY_UCA_ASCII_OPTIMIZE 1 +#define MY_UCA_COMPILE_CONTRACTIONS 0 +#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb4 #include "ctype-uca.ic" +static my_bool +my_uca_coll_init_utf8mb4(struct charset_info_st *cs, MY_CHARSET_LOADER *loader) +{ + if (my_coll_init_uca(cs, loader)) + return TRUE; + if (my_uca_collation_can_optimize_no_contractions(cs)) + my_uca_handler_map(cs, &my_uca_package_utf8mb4, + &my_uca_package_no_contractions_utf8mb4); + return FALSE; +} + + extern MY_CHARSET_HANDLER my_charset_utf8mb4_handler; #define MY_CS_UTF8MB4_UCA_FLAGS (MY_CS_COMMON_UCA_FLAGS|MY_CS_UNICODE_SUPPLEMENT) @@ -36646,6 +36823,9 @@ struct charset_info_st my_charset_utf8mb4_unicode_520_nopad_ci= #define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf32 #define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf32_quick(wc, beg, end)) #define MY_LIKE_RANGE my_like_range_generic +#define MY_UCA_ASCII_OPTIMIZE 0 +#define MY_UCA_COMPILE_CONTRACTIONS 1 +#define MY_UCA_COLL_INIT my_coll_init_uca #include "ctype-uca.ic" @@ -37601,6 +37781,9 @@ struct charset_info_st my_charset_utf32_unicode_520_nopad_ci= #define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf16 #define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf16_quick(wc, beg, end)) #define MY_LIKE_RANGE my_like_range_generic 
+#define MY_UCA_ASCII_OPTIMIZE 0 +#define MY_UCA_COMPILE_CONTRACTIONS 1 +#define MY_UCA_COLL_INIT my_coll_init_uca #include "ctype-uca.ic" diff --git a/strings/ctype-uca.ic b/strings/ctype-uca.ic index 7b2ca3447dd..de002efab1b 100644 --- a/strings/ctype-uca.ic +++ b/strings/ctype-uca.ic @@ -25,6 +25,15 @@ #ifndef MY_LIKE_RANGE #error MY_LIKE_RANGE is not defined #endif +#ifndef MY_UCA_ASCII_OPTIMIZE +#error MY_ASCII_OPTIMIZE is not defined +#endif +#ifndef MY_UCA_COMPILE_CONTRACTIONS +#error MY_UCA_COMPILE_CONTRACTIONS is not defined +#endif +#ifndef MY_UCA_COLL_INIT +#error MY_UCA_COLL_INIT is not defined +#endif static inline int @@ -46,6 +55,32 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner) int mblen; /* Get next character */ +#if MY_UCA_ASCII_OPTIMIZE + /* Get next ASCII character */ + if (scanner->sbeg < scanner->send && scanner->sbeg[0] < 0x80) + { + wc[0]= scanner->sbeg[0]; + scanner->sbeg+= 1; + +#if MY_UCA_COMPILE_CONTRACTIONS + if (my_uca_needs_context_handling(scanner->level, wc[0])) + { + uint16 *cweight= my_uca_context_weight_find(scanner, wc); + if (cweight) + return *cweight; + } +#endif + + scanner->page= 0; + scanner->code= (int) wc[0]; + scanner->wbeg= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0]; + if (scanner->wbeg[0]) + return *scanner->wbeg++; + continue; + } + else +#endif + /* Get next MB character */ if (((mblen= MY_MB_WC(scanner, wc, scanner->sbeg, scanner->send)) <= 0)) { @@ -76,37 +111,14 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner) return 0xFFFD; } - if (my_uca_have_contractions_quick(scanner->level)) +#if MY_UCA_COMPILE_CONTRACTIONS + if (my_uca_needs_context_handling(scanner->level, wc[0])) { - uint16 *cweight; - /* - If we have scanned a character which can have previous context, - and there were some more characters already before, - then reconstruct codepoint of the previous character - from "page" and "code" into w[1], and verify that {wc[1], wc[0]} - together form a real 
previous context pair. - Note, we support only 2-character long sequences with previous - context at the moment. CLDR does not have longer sequences. - */ - if (my_uca_can_be_previous_context_tail(&scanner->level->contractions, - wc[0]) && - scanner->wbeg != nochar && /* if not the very first character */ - my_uca_can_be_previous_context_head(&scanner->level->contractions, - (wc[1]= ((scanner->page << 8) + - scanner->code))) && - (cweight= my_uca_previous_context_find(scanner, wc[1], wc[0]))) - { - scanner->page= scanner->code= 0; /* Clear for the next character */ + uint16 *cweight= my_uca_context_weight_find(scanner, wc); + if (cweight) return *cweight; - } - else if (my_uca_can_be_contraction_head(&scanner->level->contractions, - wc[0])) - { - /* Check if w[0] starts a contraction */ - if ((cweight= my_uca_scanner_contraction_find(scanner, wc))) - return *cweight; - } } +#endif /* Process single character */ scanner->page= wc[0] >> 8; @@ -685,7 +697,7 @@ MY_FUNCTION_NAME(strnxfrm_multilevel)(CHARSET_INFO *cs, */ MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler)= { - my_coll_init_uca, + MY_UCA_COLL_INIT, MY_FUNCTION_NAME(strnncoll), MY_FUNCTION_NAME(strnncollsp), MY_FUNCTION_NAME(strnxfrm), @@ -706,7 +718,7 @@ MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler)= */ MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad)= { - my_coll_init_uca, + MY_UCA_COLL_INIT, MY_FUNCTION_NAME(strnncoll), MY_FUNCTION_NAME(strnncollsp_nopad), MY_FUNCTION_NAME(strnxfrm_nopad), @@ -725,7 +737,7 @@ MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad)= */ MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_multilevel)= { - my_coll_init_uca, + MY_UCA_COLL_INIT, MY_FUNCTION_NAME(strnncoll_multilevel), MY_FUNCTION_NAME(strnncollsp_multilevel), MY_FUNCTION_NAME(strnxfrm_multilevel), @@ -744,7 +756,7 @@ MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_multilevel)= */ MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad_multilevel)= { - 
my_coll_init_uca, + MY_UCA_COLL_INIT, MY_FUNCTION_NAME(strnncoll_multilevel), MY_FUNCTION_NAME(strnncollsp_nopad_multilevel), MY_FUNCTION_NAME(strnxfrm_multilevel), @@ -758,6 +770,18 @@ MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad_multilevel)= }; +MY_COLLATION_HANDLER_PACKAGE MY_FUNCTION_NAME(package)= +{ + &MY_FUNCTION_NAME(collation_handler), + &MY_FUNCTION_NAME(collation_handler_nopad), + &MY_FUNCTION_NAME(collation_handler_multilevel), + &MY_FUNCTION_NAME(collation_handler_nopad_multilevel) +}; + + #undef MY_FUNCTION_NAME #undef MY_MB_WC #undef MY_LIKE_RANGE +#undef MY_UCA_ASCII_OPTIMIZE +#undef MY_UCA_COMPILE_CONTRACTIONS +#undef MY_UCA_COLL_INIT |