diff options
Diffstat (limited to 'strings')
-rw-r--r-- | strings/ctype-uca.c | 1444 | ||||
-rw-r--r-- | strings/ctype-uca.ic | 839 | ||||
-rw-r--r-- | strings/ctype-ucs2.c | 145 | ||||
-rw-r--r-- | strings/ctype-ucs2.h | 32 | ||||
-rw-r--r-- | strings/ctype-unidata.h | 31 | ||||
-rw-r--r-- | strings/ctype-utf16.h | 80 | ||||
-rw-r--r-- | strings/ctype-utf32.h | 33 | ||||
-rw-r--r-- | strings/ctype-utf8.c | 364 | ||||
-rw-r--r-- | strings/ctype-utf8.h | 190 | ||||
-rw-r--r-- | strings/json_lib.c | 249 | ||||
-rw-r--r-- | strings/strcoll.ic | 267 |
11 files changed, 2260 insertions, 1414 deletions
diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index 9efd7242118..8368e33cc1d 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -31158,17 +31158,6 @@ typedef struct my_uca_scanner_st CHARSET_INFO *cs; } my_uca_scanner; -/* - Charset dependent scanner part, to optimize - some character sets. -*/ -typedef struct my_uca_scanner_handler_st -{ - void (*init)(my_uca_scanner *scanner, CHARSET_INFO *cs, - const MY_UCA_WEIGHT_LEVEL *level, - const uchar *str, size_t length); - int (*next)(my_uca_scanner *scanner); -} my_uca_scanner_handler; static const uint16 nochar[]= {0,0}; @@ -31421,6 +31410,28 @@ my_uca_can_be_previous_context_tail(const MY_CONTRACTIONS *list, my_wc_t wc) /** + Check if a character needs previous/next context handling: + - can be a previous context tail + - can be a contraction start + + @param level Pointer to a UCA weight level data + @param wc Code point + + @return + @retval FALSE - does not need context handling + @retval TRUE - needs context handling +*/ + +static inline my_bool +my_uca_needs_context_handling(const MY_UCA_WEIGHT_LEVEL *level, my_wc_t wc) +{ + return level->contractions.nitems > 0 && + level->contractions.flags[wc & MY_UCA_CNT_FLAG_MASK] & + (MY_UCA_PREVIOUS_CONTEXT_TAIL | MY_UCA_CNT_HEAD); +} + + +/** Compare two wide character strings, wide analog to strncmp(). @param a Pointer to the first string @@ -31554,6 +31565,60 @@ my_uca_previous_context_find(my_uca_scanner *scanner, return NULL; } + +/* + Find a context dependent weight of a character. + @param scanner - UCA weight scanner. The caller should set + its members "page" and "code" to the previous character + (or to zeros if there is no previous character). + @param wc - an array of wide characters which has at least + MY_UCA_MAX_CONTRACTION elements, where wc[0] is set + to the current character (whose weight is being resolved). 
+ The values of wc[i>0] are not important, but if wc[0] + appears to be a known contraction head, the function + will collect further contraction parts into wc[i>0]. + If wc[0] and the previous character make a previous context + pair, then wc[1] is set to the previous character. + + @retval NULL if could not find any contextual weights for wc[0] + @retval non null pointer to a zero-terminated weight string otherwise +*/ +static inline uint16 * +my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t *wc) +{ + uint16 *cweight; + DBUG_ASSERT(scanner->level->contractions.nitems); + /* + If we have scanned a character which can have previous context, + and there were some more characters already before, + then reconstruct codepoint of the previous character + from "page" and "code" into wc[1], and verify that {wc[1], wc[0]} + together form a real previous context pair. + Note, we support only 2-character long sequences with previous + context at the moment. CLDR does not have longer sequences. 
+ */ + if (my_uca_can_be_previous_context_tail(&scanner->level->contractions, + wc[0]) && + scanner->wbeg != nochar && /* if not the very first character */ + my_uca_can_be_previous_context_head(&scanner->level->contractions, + (wc[1]= ((scanner->page << 8) + + scanner->code))) && + (cweight= my_uca_previous_context_find(scanner, wc[1], wc[0]))) + { + scanner->page= scanner->code= 0; /* Clear for the next character */ + return cweight; + } + else if (my_uca_can_be_contraction_head(&scanner->level->contractions, + wc[0])) + { + /* Check if w[0] starts a contraction */ + if ((cweight= my_uca_scanner_contraction_find(scanner, wc))) + return cweight; + } + return NULL; +} + + /****************************************************************/ /** @@ -31675,223 +31740,6 @@ my_uca_scanner_init_any(my_uca_scanner *scanner, scanner->cs= cs; } -static int my_uca_scanner_next_any(my_uca_scanner *scanner) -{ - /* - Check if the weights for the previous character have been - already fully scanned. If yes, then get the next character and - initialize wbeg and wlength to its weight string. - */ - - if (scanner->wbeg[0]) /* More weights left from the previous step: */ - return *scanner->wbeg++; /* return the next weight from expansion */ - - do - { - const uint16 *wpage; - my_wc_t wc[MY_UCA_MAX_CONTRACTION]; - int mblen; - - /* Get next character */ - if (((mblen= scanner->cs->cset->mb_wc(scanner->cs, wc, - scanner->sbeg, - scanner->send)) <= 0)) - { - if (scanner->sbeg >= scanner->send) - return -1; /* No more bytes, end of line reached */ - /* - There are some more bytes left. Non-positive mb_len means that - we got an incomplete or a bad byte sequence. Consume mbminlen bytes. - */ - if ((scanner->sbeg+= scanner->cs->mbminlen) > scanner->send) - { - /* For safety purposes don't go beyond the string range. */ - scanner->sbeg= scanner->send; - } - /* - Treat every complete or incomplete mbminlen unit as a weight which is - greater than weight for any possible normal character. 
- 0xFFFF is greater than any possible weight in the UCA weight table. - */ - return 0xFFFF; - } - - scanner->sbeg+= mblen; - if (wc[0] > scanner->level->maxchar) - { - /* Return 0xFFFD as weight for all characters outside BMP */ - scanner->wbeg= nochar; - return 0xFFFD; - } - - if (my_uca_have_contractions_quick(scanner->level)) - { - uint16 *cweight; - /* - If we have scanned a character which can have previous context, - and there were some more characters already before, - then reconstruct codepoint of the previous character - from "page" and "code" into w[1], and verify that {wc[1], wc[0]} - together form a real previous context pair. - Note, we support only 2-character long sequences with previous - context at the moment. CLDR does not have longer sequences. - */ - if (my_uca_can_be_previous_context_tail(&scanner->level->contractions, - wc[0]) && - scanner->wbeg != nochar && /* if not the very first character */ - my_uca_can_be_previous_context_head(&scanner->level->contractions, - (wc[1]= ((scanner->page << 8) + - scanner->code))) && - (cweight= my_uca_previous_context_find(scanner, wc[1], wc[0]))) - { - scanner->page= scanner->code= 0; /* Clear for the next character */ - return *cweight; - } - else if (my_uca_can_be_contraction_head(&scanner->level->contractions, - wc[0])) - { - /* Check if w[0] starts a contraction */ - if ((cweight= my_uca_scanner_contraction_find(scanner, wc))) - return *cweight; - } - } - - /* Process single character */ - scanner->page= wc[0] >> 8; - scanner->code= wc[0] & 0xFF; - - /* If weight page for w[0] does not exist, then calculate algoritmically */ - if (!(wpage= scanner->level->weights[scanner->page])) - return my_uca_scanner_next_implicit(scanner); - - /* Calculate pointer to w[0]'s weight, using page and offset */ - scanner->wbeg= wpage + - scanner->code * scanner->level->lengths[scanner->page]; - } while (!scanner->wbeg[0]); /* Skip ignorable characters */ - - return *scanner->wbeg++; -} - - -static my_uca_scanner_handler 
my_any_uca_scanner_handler= -{ - my_uca_scanner_init_any, - my_uca_scanner_next_any -}; - -/* - Compares two strings according to the collation - - SYNOPSIS: - my_strnncoll_uca() - cs Character set information - s First string - slen First string length - t Second string - tlen Seconf string length - level DUCETweight level - - NOTES: - Initializes two weight scanners and gets weights - corresponding to two strings in a loop. If weights are not - the same at some step then returns their difference. - - In the while() comparison these situations are possible: - 1. (s_res>0) and (t_res>0) and (s_res == t_res) - Weights are the same so far, continue comparison - 2. (s_res>0) and (t_res>0) and (s_res!=t_res) - A difference has been found, return. - 3. (s_res>0) and (t_res<0) - We have reached the end of the second string, or found - an illegal multibyte sequence in the second string. - Return a positive number, i.e. the first string is bigger. - 4. (s_res<0) and (t_res>0) - We have reached the end of the first string, or found - an illegal multibyte sequence in the first string. - Return a negative number, i.e. the second string is bigger. - 5. (s_res<0) and (t_res<0) - Both scanners returned -1. It means we have riched - the end-of-string of illegal-sequence in both strings - at the same time. Return 0, strings are equal. 
- - RETURN - Difference between two strings, according to the collation: - 0 - means strings are equal - negative number - means the first string is smaller - positive number - means the first string is bigger -*/ - -static int my_strnncoll_uca_onelevel(CHARSET_INFO *cs, - my_uca_scanner_handler *scanner_handler, - const MY_UCA_WEIGHT_LEVEL *level, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool t_is_prefix) -{ - my_uca_scanner sscanner; - my_uca_scanner tscanner; - int s_res; - int t_res; - - scanner_handler->init(&sscanner, cs, level, s, slen); - scanner_handler->init(&tscanner, cs, level, t, tlen); - - do - { - s_res= scanner_handler->next(&sscanner); - t_res= scanner_handler->next(&tscanner); - } while ( s_res == t_res && s_res >0); - - return (t_is_prefix && t_res < 0) ? 0 : (s_res - t_res); -} - -static int my_strnncoll_uca(CHARSET_INFO *cs, - my_uca_scanner_handler *scanner_handler, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool t_is_prefix) -{ - return my_strnncoll_uca_onelevel(cs, scanner_handler, &cs->uca->level[0], - s, slen, t, tlen, t_is_prefix); -} - -static int my_strnncoll_uca_multilevel(CHARSET_INFO *cs, - my_uca_scanner_handler *scanner_handler, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool t_is_prefix) -{ - uint num_level= cs->levels_for_order; - uint i; - for (i= 0; i != num_level; i++) - { - int ret= my_strnncoll_uca_onelevel(cs, scanner_handler, &cs->uca->level[i], - s, slen, t, tlen, t_is_prefix); - if (ret) - return ret; - } - return 0; -} - - -static int -my_strnncollsp_generic_uca_nopad_multilevel(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen) -{ - uint num_level= cs->levels_for_order; - uint i; - for (i= 0; i != num_level; i++) - { - int ret= my_strnncoll_uca_onelevel(cs, &my_any_uca_scanner_handler, - &cs->uca->level[i], - s, slen, t, tlen, FALSE); - if (ret) - return ret; - } - return 0; -} - static inline int 
my_space_weight(const MY_UCA_WEIGHT_LEVEL *level) @@ -31924,258 +31772,6 @@ my_char_weight_addr(const MY_UCA_WEIGHT_LEVEL *level, uint wc) } -/* - Compares two strings according to the collation, - ignoring trailing spaces. - - SYNOPSIS: - my_strnncollsp_uca() - cs Character set information - s First string - slen First string length - t Second string - tlen Seconf string length - level DUCETweight level - - NOTES: - Works exactly the same with my_strnncoll_uca(), - but ignores trailing spaces. - - In the while() comparison these situations are possible: - 1. (s_res>0) and (t_res>0) and (s_res == t_res) - Weights are the same so far, continue comparison - 2. (s_res>0) and (t_res>0) and (s_res!=t_res) - A difference has been found, return. - 3. (s_res>0) and (t_res<0) - We have reached the end of the second string, or found - an illegal multibyte sequence in the second string. - Compare the first string to an infinite array of - space characters until difference is found, or until - the end of the first string. - 4. (s_res<0) and (t_res>0) - We have reached the end of the first string, or found - an illegal multibyte sequence in the first string. - Compare the second string to an infinite array of - space characters until difference is found or until - the end of the second steing. - 5. (s_res<0) and (t_res<0) - Both scanners returned -1. It means we have riched - the end-of-string of illegal-sequence in both strings - at the same time. Return 0, strings are equal. 
- - RETURN - Difference between two strings, according to the collation: - 0 - means strings are equal - negative number - means the first string is smaller - positive number - means the first string is bigger -*/ - -static int my_strnncollsp_uca_onelevel(CHARSET_INFO *cs, - my_uca_scanner_handler *scanner_handler, - const MY_UCA_WEIGHT_LEVEL *level, - const uchar *s, size_t slen, - const uchar *t, size_t tlen) -{ - my_uca_scanner sscanner, tscanner; - int s_res, t_res; - - scanner_handler->init(&sscanner, cs, level, s, slen); - scanner_handler->init(&tscanner, cs, level, t, tlen); - - do - { - s_res= scanner_handler->next(&sscanner); - t_res= scanner_handler->next(&tscanner); - } while ( s_res == t_res && s_res >0); - - if (s_res > 0 && t_res < 0) - { - /* Calculate weight for SPACE character */ - t_res= my_space_weight(level); - - /* compare the first string to spaces */ - do - { - if (s_res != t_res) - return (s_res - t_res); - s_res= scanner_handler->next(&sscanner); - } while (s_res > 0); - return 0; - } - - if (s_res < 0 && t_res > 0) - { - /* Calculate weight for SPACE character */ - s_res= my_space_weight(level); - - /* compare the second string to spaces */ - do - { - if (s_res != t_res) - return (s_res - t_res); - t_res= scanner_handler->next(&tscanner); - } while (t_res > 0); - return 0; - } - - return ( s_res - t_res ); -} - -static int my_strnncollsp_uca(CHARSET_INFO *cs, - my_uca_scanner_handler *scanner_handler, - const uchar *s, size_t slen, - const uchar *t, size_t tlen) -{ - return my_strnncollsp_uca_onelevel(cs, scanner_handler, &cs->uca->level[0], - s, slen, t, tlen); -} - -static int my_strnncollsp_uca_multilevel(CHARSET_INFO *cs, - my_uca_scanner_handler *scanner_handler, - const uchar *s, size_t slen, - const uchar *t, size_t tlen) -{ - uint num_level= cs->levels_for_order; - uint i; - for (i= 0; i != num_level; i++) - { - int ret= my_strnncollsp_uca_onelevel(cs, scanner_handler, - &cs->uca->level[i], s, slen, t, tlen); - if (ret) - return 
ret; - } - return 0; -} - -/* - Calculates hash value for the given string, - according to the collation, and ignoring trailing spaces. - - SYNOPSIS: - my_hash_sort_uca() - cs Character set information - s String - slen String's length - n1 First hash parameter - n2 Second hash parameter - - NOTES: - Scans consequently weights and updates - hash parameters n1 and n2. In a case insensitive collation, - upper and lower case of the same letter will return the same - weight sequence, and thus will produce the same hash values - in n1 and n2. - - This functions is used for one-level and for multi-level collations. - We intentionally use only primary level in multi-level collations. - This helps to have PARTITION BY KEY put primarily equal records - into the same partition. E.g. in utf8_thai_520_ci records that differ - only in tone marks go into the same partition. - - RETURN - N/A -*/ - -static void my_hash_sort_uca(CHARSET_INFO *cs, - my_uca_scanner_handler *scanner_handler, - const uchar *s, size_t slen, - ulong *nr1, ulong *nr2) -{ - int s_res; - my_uca_scanner scanner; - int space_weight= my_space_weight(&cs->uca->level[0]); - register ulong m1= *nr1, m2= *nr2; - - scanner_handler->init(&scanner, cs, &cs->uca->level[0], s, slen); - - while ((s_res= scanner_handler->next(&scanner)) >0) - { - if (s_res == space_weight) - { - /* Combine all spaces to be able to skip end spaces */ - uint count= 0; - do - { - count++; - if ((s_res= scanner_handler->next(&scanner)) <= 0) - { - /* Skip strings at end of string */ - goto end; - } - } - while (s_res == space_weight); - - /* Add back that has for the space characters */ - do - { - /* - We can't use MY_HASH_ADD_16() here as we, because of a misstake - in the original code, where we added the 16 byte variable the - opposite way. Changing this would cause old partitioned tables - to fail. 
- */ - MY_HASH_ADD(m1, m2, space_weight >> 8); - MY_HASH_ADD(m1, m2, space_weight & 0xFF); - } - while (--count != 0); - - } - /* See comment above why we can't use MY_HASH_ADD_16() */ - MY_HASH_ADD(m1, m2, s_res >> 8); - MY_HASH_ADD(m1, m2, s_res & 0xFF); - } -end: - *nr1= m1; - *nr2= m2; -} - - -static void my_hash_sort_uca_nopad(CHARSET_INFO *cs, - my_uca_scanner_handler *scanner_handler, - const uchar *s, size_t slen, - ulong *nr1, ulong *nr2) -{ - int s_res; - my_uca_scanner scanner; - register ulong m1= *nr1, m2= *nr2; - - scanner_handler->init(&scanner, cs, &cs->uca->level[0], s, slen); - - while ((s_res= scanner_handler->next(&scanner)) >0) - { - /* See comment above why we can't use MY_HASH_ADD_16() */ - MY_HASH_ADD(m1, m2, s_res >> 8); - MY_HASH_ADD(m1, m2, s_res & 0xFF); - } - *nr1= m1; - *nr2= m2; -} - - -static uchar * -my_strnxfrm_uca_onelevel_internal(CHARSET_INFO *cs, - my_uca_scanner_handler *scanner_handler, - MY_UCA_WEIGHT_LEVEL *level, - uchar *dst, uchar *de, uint *nweights, - const uchar *src, size_t srclen) -{ - my_uca_scanner scanner; - int s_res; - - DBUG_ASSERT(src || !srclen); - - scanner_handler->init(&scanner, cs, level, src, srclen); - for (; dst < de && *nweights && - (s_res= scanner_handler->next(&scanner)) > 0 ; (*nweights)--) - { - *dst++= s_res >> 8; - if (dst < de) - *dst++= s_res & 0xFF; - } - return dst; -} - - static uchar * my_strnxfrm_uca_padn(uchar *dst, uchar *de, uint nweights, int weight) { @@ -32202,27 +31798,6 @@ my_strnxfrm_uca_pad(uchar *dst, uchar *de, int weight) } -static uchar * -my_strnxfrm_uca_onelevel(CHARSET_INFO *cs, - my_uca_scanner_handler *scanner_handler, - MY_UCA_WEIGHT_LEVEL *level, - uchar *dst, uchar *de, uint nweights, - const uchar *src, size_t srclen, uint flags) -{ - uchar *d0= dst; - - dst= my_strnxfrm_uca_onelevel_internal(cs, scanner_handler, level, - dst, de, &nweights, - src, srclen); - DBUG_ASSERT(dst <= de); - if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) - dst= 
my_strnxfrm_uca_padn(dst, de, nweights, my_space_weight(level)); - DBUG_ASSERT(dst <= de); - my_strxfrm_desc_and_reverse(d0, dst, flags, 0); - return dst; -} - - /* Return the minimum possible weight on a level. */ @@ -32233,136 +31808,6 @@ static uint min_weight_on_level(MY_UCA_WEIGHT_LEVEL *level) } -static uchar * -my_strnxfrm_uca_nopad_onelevel(CHARSET_INFO *cs, - my_uca_scanner_handler *scanner_handler, - MY_UCA_WEIGHT_LEVEL *level, - uchar *dst, uchar *de, uint nweights, - const uchar *src, size_t srclen, uint flags) -{ - uchar *d0= dst; - - dst= my_strnxfrm_uca_onelevel_internal(cs, scanner_handler, level, - dst, de, &nweights, - src, srclen); - DBUG_ASSERT(dst <= de); - /* Pad with the minimum possible weight on this level */ - if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) - dst= my_strnxfrm_uca_padn(dst, de, nweights, min_weight_on_level(level)); - DBUG_ASSERT(dst <= de); - my_strxfrm_desc_and_reverse(d0, dst, flags, 0); - return dst; -} - - -/* - For the given string creates its "binary image", suitable - to be used in binary comparison, i.e. in memcmp(). - - SYNOPSIS: - my_strnxfrm_uca() - cs Character set information - dst Where to write the image - dstlen Space available for the image, in bytes - src The source string - srclen Length of the source string, in bytes - - NOTES: - In a loop, scans weights from the source string and writes - them into the binary image. In a case insensitive collation, - upper and lower cases of the same letter will produce the - same image subsequences. When we have reached the end-of-string - or found an illegal multibyte sequence, the loop stops. - - It is impossible to restore the original string using its - binary image. - - Binary images are used for bulk comparison purposes, - e.g. in ORDER BY, when it is more efficient to create - a binary image and use it instead of weight scanner - for the original strings for every comparison. 
- - RETURN - Number of bytes that have been written into the binary image. -*/ - - -static size_t -my_strnxfrm_uca(CHARSET_INFO *cs, - my_uca_scanner_handler *scanner_handler, - uchar *dst, size_t dstlen, uint nweights, - const uchar *src, size_t srclen, uint flags) -{ - uchar *d0= dst; - uchar *de= dst + dstlen; - - dst= my_strnxfrm_uca_onelevel(cs, scanner_handler, &cs->uca->level[0], - dst, de, nweights, src, srclen, flags); - /* - This can probably be changed to memset(dst, 0, de - dst), - like my_strnxfrm_uca_multilevel() does. - */ - if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) - dst= my_strnxfrm_uca_pad(dst, de, my_space_weight(&cs->uca->level[0])); - return dst - d0; -} - - -static size_t -my_strnxfrm_uca_nopad(CHARSET_INFO *cs, - my_uca_scanner_handler *scanner_handler, - uchar *dst, size_t dstlen, uint nweights, - const uchar *src, size_t srclen, uint flags) -{ - uchar *d0= dst; - uchar *de= dst + dstlen; - - dst= my_strnxfrm_uca_nopad_onelevel(cs, scanner_handler, &cs->uca->level[0], - dst, de, nweights, src, srclen, flags); - if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) - { - memset(dst, 0, de - dst); - dst= de; - } - return dst - d0; -} - - -static size_t -my_strnxfrm_uca_multilevel(CHARSET_INFO *cs, - my_uca_scanner_handler *scanner_handler, - uchar *dst, size_t dstlen, uint nweights, - const uchar *src, size_t srclen, uint flags) -{ - uint num_level= cs->levels_for_order; - uchar *d0= dst; - uchar *de= dst + dstlen; - uint current_level; - - for (current_level= 0; current_level != num_level; current_level++) - { - if (!(flags & MY_STRXFRM_LEVEL_ALL) || - (flags & (MY_STRXFRM_LEVEL1 << current_level))) - dst= cs->state & MY_CS_NOPAD ? 
- my_strnxfrm_uca_nopad_onelevel(cs, scanner_handler, - &cs->uca->level[current_level], - dst, de, nweights, - src, srclen, flags) : - my_strnxfrm_uca_onelevel(cs, scanner_handler, - &cs->uca->level[current_level], - dst, de, nweights, - src, srclen, flags); - } - - if (dst < de && (flags & MY_STRXFRM_PAD_TO_MAXLEN)) - { - memset(dst, 0, de - dst); - dst= de; - } - - return dst - d0; -} - /* This function compares if two characters are the same. The sign +1 or -1 does not matter. The only @@ -32568,6 +32013,23 @@ int my_wildcmp_uca(CHARSET_INFO *cs, /* + Tests if an optimized "no contraction" handler can be used for + the given collation. +*/ +static my_bool +my_uca_collation_can_optimize_no_contractions(CHARSET_INFO *cs) +{ + uint i; + for (i= 0; i < cs->levels_for_order ; i++) + { + if (my_uca_have_contractions_quick(&cs->uca->level[i])) + return FALSE; + } + return TRUE; +} + + +/* Collation language is implemented according to subset of ICU Collation Customization (tailorings): http://icu.sourceforge.net/userguide/Collate_Customization.html @@ -34250,8 +33712,74 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, } -MY_COLLATION_HANDLER my_collation_any_uca_handler_multilevel; -MY_COLLATION_HANDLER my_collation_generic_uca_nopad_handler_multilevel; +static my_bool +create_tailoring(struct charset_info_st *cs, + MY_CHARSET_LOADER *loader); + +static my_bool +my_coll_init_uca(struct charset_info_st *cs, MY_CHARSET_LOADER *loader) +{ + cs->pad_char= ' '; + cs->ctype= my_charset_utf8_unicode_ci.ctype; + if (!cs->caseinfo) + cs->caseinfo= &my_unicase_default; + return create_tailoring(cs, loader); +} + + +static size_t my_strnxfrmlen_any_uca(CHARSET_INFO *cs, size_t len) +{ + /* UCA uses 2 bytes per weight */ + return (len + cs->mbmaxlen - 1) / cs->mbmaxlen * cs->strxfrm_multiply * 2; +} + +static size_t my_strnxfrmlen_any_uca_multilevel(CHARSET_INFO *cs, size_t len) +{ + return my_strnxfrmlen_any_uca(cs, len) * cs->levels_for_order; +} + + +/* + 
This structure is used at the collation initialization time, to switch + from a full-featured collation handler to a "no contraction" collation + handler if the collation is known not to have any contractions. +*/ +typedef struct +{ + MY_COLLATION_HANDLER *pad; + MY_COLLATION_HANDLER *nopad; + MY_COLLATION_HANDLER *multilevel_pad; + MY_COLLATION_HANDLER *multilevel_nopad; +} MY_COLLATION_HANDLER_PACKAGE; + + +static void my_uca_handler_map(struct charset_info_st *cs, + const MY_COLLATION_HANDLER_PACKAGE *from, + const MY_COLLATION_HANDLER_PACKAGE *to) +{ + if (cs->coll == from->pad) cs->coll= to->pad; + else if (cs->coll == from->nopad) cs->coll= to->nopad; + else if (cs->coll == from->multilevel_pad) cs->coll= to->multilevel_pad; + else if (cs->coll == from->multilevel_nopad) cs->coll= to->multilevel_nopad; +} + + +/* + Define generic collation handlers for multi-level collations with tailoring: + + my_uca_collation_handler_nopad_multilevel_generic + my_uca_collation_handler_multilevel_generic + + TODO: Use faster character-set specific versions of MY_COLLATION_HANDLER + instead of generic. +*/ +#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _generic +#define MY_MB_WC(scanner, wc, beg, end) (scanner->cs->cset->mb_wc(scanner->cs, wc, beg, end)) +#define MY_LIKE_RANGE my_like_range_generic +#define MY_UCA_ASCII_OPTIMIZE 0 +#define MY_UCA_COMPILE_CONTRACTIONS 1 +#define MY_UCA_COLL_INIT my_coll_init_uca +#include "ctype-uca.ic" /* @@ -34336,8 +33864,8 @@ create_tailoring(struct charset_info_st *cs, cs->uca[0]= new_uca; if (cs->levels_for_order > 1) cs->coll= (cs->state & MY_CS_NOPAD) ? - &my_collation_generic_uca_nopad_handler_multilevel : - &my_collation_any_uca_handler_multilevel; + &my_uca_collation_handler_nopad_multilevel_generic : + &my_uca_collation_handler_multilevel_generic; ex: (loader->free)(rules.rule); @@ -34346,235 +33874,17 @@ ex: return rc; } -/* - Universal CHARSET_INFO compatible wrappers - for the above internal functions. 
- Should work for any character set. -*/ - -static my_bool -my_coll_init_uca(struct charset_info_st *cs, MY_CHARSET_LOADER *loader) -{ - cs->pad_char= ' '; - cs->ctype= my_charset_utf8_unicode_ci.ctype; - if (!cs->caseinfo) - cs->caseinfo= &my_unicase_default; - return create_tailoring(cs, loader); -} - - -static int my_strnncoll_any_uca(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool t_is_prefix) -{ - return my_strnncoll_uca(cs, &my_any_uca_scanner_handler, - s, slen, t, tlen, t_is_prefix); -} - -static int my_strnncoll_any_uca_multilevel(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool t_is_prefix) -{ - return my_strnncoll_uca_multilevel(cs, &my_any_uca_scanner_handler, - s, slen, t, tlen, t_is_prefix); -} - -static int my_strnncollsp_any_uca(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen) -{ - return my_strnncollsp_uca(cs, &my_any_uca_scanner_handler, s, slen, t, tlen); -} - - -static int my_strnncollsp_generic_uca_nopad(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen) -{ - return my_strnncoll_uca(cs, &my_any_uca_scanner_handler, - s, slen, t, tlen, FALSE); -} - - -static int my_strnncollsp_any_uca_multilevel(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen) -{ - return my_strnncollsp_uca_multilevel(cs, &my_any_uca_scanner_handler, - s, slen, t, tlen); -} - -static void my_hash_sort_any_uca(CHARSET_INFO *cs, - const uchar *s, size_t slen, - ulong *n1, ulong *n2) -{ - my_hash_sort_uca(cs, &my_any_uca_scanner_handler, s, slen, n1, n2); -} - -static void my_hash_sort_generic_uca_nopad(CHARSET_INFO *cs, - const uchar *s, size_t slen, - ulong *n1, ulong *n2) -{ - my_hash_sort_uca_nopad(cs, &my_any_uca_scanner_handler, s, slen, n1, n2); -} - -static size_t my_strnxfrm_any_uca(CHARSET_INFO *cs, - uchar *dst, size_t dstlen, uint nweights, - const uchar *src, size_t srclen, uint flags) -{ - return 
my_strnxfrm_uca(cs, &my_any_uca_scanner_handler, - dst, dstlen, nweights, src, srclen, flags); -} - -static size_t my_strnxfrm_generic_uca_nopad(CHARSET_INFO *cs, - uchar *dst, size_t dstlen, - uint nweights, - const uchar *src, size_t srclen, - uint flags) -{ - return my_strnxfrm_uca_nopad(cs, &my_any_uca_scanner_handler, - dst, dstlen, nweights, src, srclen, flags); -} - -static size_t my_strnxfrm_any_uca_multilevel(CHARSET_INFO *cs, - uchar *dst, size_t dstlen, - uint nweights, const uchar *src, - size_t srclen, uint flags) -{ - return my_strnxfrm_uca_multilevel(cs, &my_any_uca_scanner_handler, - dst, dstlen, nweights, src, srclen, - flags); -} - -static size_t my_strnxfrmlen_any_uca(CHARSET_INFO *cs, size_t len) -{ - /* UCA uses 2 bytes per weight */ - return (len + cs->mbmaxlen - 1) / cs->mbmaxlen * cs->strxfrm_multiply * 2; -} - -static size_t my_strnxfrmlen_any_uca_multilevel(CHARSET_INFO *cs, size_t len) -{ - return my_strnxfrmlen_any_uca(cs, len) * cs->levels_for_order; -} - - -/* NO PAD handler for character sets with mbminlen==1 */ -MY_COLLATION_HANDLER my_collation_mb_uca_nopad_handler = -{ - my_coll_init_uca, - my_strnncoll_any_uca, - my_strnncollsp_generic_uca_nopad, - my_strnxfrm_generic_uca_nopad, - my_strnxfrmlen_any_uca, - my_like_range_mb, - my_wildcmp_uca, - NULL, - my_instr_mb, - my_hash_sort_generic_uca_nopad, - my_propagate_complex -}; - - -/* NO PAD handler for character sets with mbminlen>=1 */ -MY_COLLATION_HANDLER my_collation_generic_uca_nopad_handler = -{ - my_coll_init_uca, - my_strnncoll_any_uca, - my_strnncollsp_generic_uca_nopad, - my_strnxfrm_generic_uca_nopad, - my_strnxfrmlen_any_uca, - my_like_range_generic, - my_wildcmp_uca, - NULL, - my_instr_mb, - my_hash_sort_generic_uca_nopad, - my_propagate_complex -}; - - -MY_COLLATION_HANDLER my_collation_any_uca_handler_multilevel= -{ - my_coll_init_uca, - my_strnncoll_any_uca_multilevel, - my_strnncollsp_any_uca_multilevel, - my_strnxfrm_any_uca_multilevel, - 
my_strnxfrmlen_any_uca_multilevel, - my_like_range_generic, - my_wildcmp_uca, - NULL, - my_instr_mb, - my_hash_sort_any_uca, - my_propagate_complex -}; - - -MY_COLLATION_HANDLER my_collation_generic_uca_nopad_handler_multilevel = -{ - my_coll_init_uca, - my_strnncoll_any_uca_multilevel, - my_strnncollsp_generic_uca_nopad_multilevel, - my_strnxfrm_any_uca_multilevel, - my_strnxfrmlen_any_uca_multilevel, - my_like_range_generic, - my_wildcmp_uca, - NULL, - my_instr_mb, - my_hash_sort_generic_uca_nopad, - my_propagate_complex -}; - #ifdef HAVE_CHARSET_ucs2 -/* - UCS2 optimized CHARSET_INFO compatible wrappers. -*/ -static int my_strnncoll_ucs2_uca(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool t_is_prefix) -{ - return my_strnncoll_uca(cs, &my_any_uca_scanner_handler, - s, slen, t, tlen, t_is_prefix); -} -static int my_strnncollsp_ucs2_uca(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen) -{ - return my_strnncollsp_uca(cs, &my_any_uca_scanner_handler, s, slen, t, tlen); -} - -static void my_hash_sort_ucs2_uca(CHARSET_INFO *cs, - const uchar *s, size_t slen, - ulong *n1, ulong *n2) -{ - my_hash_sort_uca(cs, &my_any_uca_scanner_handler, s, slen, n1, n2); -} - -static size_t my_strnxfrm_ucs2_uca(CHARSET_INFO *cs, - uchar *dst, size_t dstlen, uint nweights, - const uchar *src, size_t srclen, uint flags) -{ - return my_strnxfrm_uca(cs, &my_any_uca_scanner_handler, - dst, dstlen, nweights, src, srclen, flags); -} - -MY_COLLATION_HANDLER my_collation_ucs2_uca_handler = -{ - my_coll_init_uca, /* init */ - my_strnncoll_ucs2_uca, - my_strnncollsp_ucs2_uca, - my_strnxfrm_ucs2_uca, - my_strnxfrmlen_any_uca, - my_like_range_generic, - my_wildcmp_uca, - NULL, - my_instr_mb, - my_hash_sort_ucs2_uca, - my_propagate_complex -}; +#include "ctype-ucs2.h" +#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _ucs2 +#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_ucs2_quick(wc, beg, end)) +#define MY_LIKE_RANGE 
my_like_range_generic +#define MY_UCA_ASCII_OPTIMIZE 0 +#define MY_UCA_COMPILE_CONTRACTIONS 1 +#define MY_UCA_COLL_INIT my_coll_init_uca +#include "ctype-uca.ic" #define MY_CS_UCS2_UCA_FLAGS (MY_CS_COMMON_UCA_FLAGS|MY_CS_NONASCII) @@ -34609,7 +33919,7 @@ struct charset_info_st my_charset_ucs2_unicode_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; struct charset_info_st my_charset_ucs2_icelandic_uca_ci= @@ -34641,7 +33951,7 @@ struct charset_info_st my_charset_ucs2_icelandic_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; struct charset_info_st my_charset_ucs2_latvian_uca_ci= @@ -34673,7 +33983,7 @@ struct charset_info_st my_charset_ucs2_latvian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; struct charset_info_st my_charset_ucs2_romanian_uca_ci= @@ -34705,7 +34015,7 @@ struct charset_info_st my_charset_ucs2_romanian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; struct charset_info_st my_charset_ucs2_slovenian_uca_ci= @@ -34737,7 +34047,7 @@ struct charset_info_st my_charset_ucs2_slovenian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; struct charset_info_st my_charset_ucs2_polish_uca_ci= @@ -34769,7 +34079,7 @@ struct charset_info_st my_charset_ucs2_polish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; struct charset_info_st 
my_charset_ucs2_estonian_uca_ci= @@ -34801,7 +34111,7 @@ struct charset_info_st my_charset_ucs2_estonian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; struct charset_info_st my_charset_ucs2_spanish_uca_ci= @@ -34833,7 +34143,7 @@ struct charset_info_st my_charset_ucs2_spanish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; struct charset_info_st my_charset_ucs2_swedish_uca_ci= @@ -34865,7 +34175,7 @@ struct charset_info_st my_charset_ucs2_swedish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; struct charset_info_st my_charset_ucs2_turkish_uca_ci= @@ -34897,7 +34207,7 @@ struct charset_info_st my_charset_ucs2_turkish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; struct charset_info_st my_charset_ucs2_czech_uca_ci= @@ -34929,7 +34239,7 @@ struct charset_info_st my_charset_ucs2_czech_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; @@ -34962,7 +34272,7 @@ struct charset_info_st my_charset_ucs2_danish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; struct charset_info_st my_charset_ucs2_lithuanian_uca_ci= @@ -34994,7 +34304,7 @@ struct charset_info_st my_charset_ucs2_lithuanian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + 
&my_uca_collation_handler_ucs2 }; struct charset_info_st my_charset_ucs2_slovak_uca_ci= @@ -35026,7 +34336,7 @@ struct charset_info_st my_charset_ucs2_slovak_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; struct charset_info_st my_charset_ucs2_spanish2_uca_ci= @@ -35058,7 +34368,7 @@ struct charset_info_st my_charset_ucs2_spanish2_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; @@ -35091,7 +34401,7 @@ struct charset_info_st my_charset_ucs2_roman_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; @@ -35124,7 +34434,7 @@ struct charset_info_st my_charset_ucs2_persian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; @@ -35157,7 +34467,7 @@ struct charset_info_st my_charset_ucs2_esperanto_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; @@ -35190,7 +34500,7 @@ struct charset_info_st my_charset_ucs2_hungarian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; struct charset_info_st my_charset_ucs2_sinhala_uca_ci= @@ -35222,7 +34532,7 @@ struct charset_info_st my_charset_ucs2_sinhala_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; @@ -35256,7 +34566,7 @@ struct charset_info_st my_charset_ucs2_german2_uca_ci= 0, /* 
escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; struct charset_info_st my_charset_ucs2_croatian_mysql561_uca_ci= @@ -35288,7 +34598,7 @@ struct charset_info_st my_charset_ucs2_croatian_mysql561_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; @@ -35321,7 +34631,7 @@ struct charset_info_st my_charset_ucs2_croatian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; @@ -35354,7 +34664,7 @@ struct charset_info_st my_charset_ucs2_myanmar_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; @@ -35387,7 +34697,7 @@ struct charset_info_st my_charset_ucs2_thai_520_w2= 0, /* escape_with_backslash_is_dangerous */ 2, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_any_uca_handler_multilevel + &my_uca_collation_handler_multilevel_ucs2 }; struct charset_info_st my_charset_ucs2_unicode_520_ci= @@ -35419,7 +34729,7 @@ struct charset_info_st my_charset_ucs2_unicode_520_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; @@ -35452,7 +34762,7 @@ struct charset_info_st my_charset_ucs2_vietnamese_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_ucs2_uca_handler + &my_uca_collation_handler_ucs2 }; @@ -35485,7 +34795,7 @@ struct charset_info_st my_charset_ucs2_unicode_nopad_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_generic_uca_nopad_handler + 
&my_uca_collation_handler_nopad_ucs2 }; @@ -35518,7 +34828,7 @@ struct charset_info_st my_charset_ucs2_unicode_520_nopad_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_ucs2_handler, - &my_collation_generic_uca_nopad_handler + &my_uca_collation_handler_nopad_ucs2 }; @@ -35526,20 +34836,38 @@ struct charset_info_st my_charset_ucs2_unicode_520_nopad_ci= #ifdef HAVE_CHARSET_utf8 -MY_COLLATION_HANDLER my_collation_any_uca_handler = + +static my_bool +my_uca_coll_init_utf8mb3(struct charset_info_st *cs, MY_CHARSET_LOADER *loader); + +#include "ctype-utf8.h" +#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf8mb3 +#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb3_quick(wc, beg, end)) +#define MY_LIKE_RANGE my_like_range_mb +#define MY_UCA_ASCII_OPTIMIZE 1 +#define MY_UCA_COMPILE_CONTRACTIONS 1 +#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb3 +#include "ctype-uca.ic" + +#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _no_contractions_utf8mb3 +#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb3_quick(wc, beg, end)) +#define MY_LIKE_RANGE my_like_range_mb +#define MY_UCA_ASCII_OPTIMIZE 1 +#define MY_UCA_COMPILE_CONTRACTIONS 0 +#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb3 +#include "ctype-uca.ic" + + +static my_bool +my_uca_coll_init_utf8mb3(struct charset_info_st *cs, MY_CHARSET_LOADER *loader) { - my_coll_init_uca, /* init */ - my_strnncoll_any_uca, - my_strnncollsp_any_uca, - my_strnxfrm_any_uca, - my_strnxfrmlen_any_uca, - my_like_range_mb, - my_wildcmp_uca, - NULL, - my_instr_mb, - my_hash_sort_any_uca, - my_propagate_complex -}; + if (my_coll_init_uca(cs, loader)) + return TRUE; + if (my_uca_collation_can_optimize_no_contractions(cs)) + my_uca_handler_map(cs, &my_uca_package_utf8mb3, + &my_uca_package_no_contractions_utf8mb3); + return FALSE; +} /* @@ -35602,7 +34930,7 @@ struct charset_info_st my_charset_utf8_unicode_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, 
- &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; @@ -35635,7 +34963,7 @@ struct charset_info_st my_charset_utf8_icelandic_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; struct charset_info_st my_charset_utf8_latvian_uca_ci= @@ -35667,7 +34995,7 @@ struct charset_info_st my_charset_utf8_latvian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; struct charset_info_st my_charset_utf8_romanian_uca_ci= @@ -35699,7 +35027,7 @@ struct charset_info_st my_charset_utf8_romanian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; struct charset_info_st my_charset_utf8_slovenian_uca_ci= @@ -35731,7 +35059,7 @@ struct charset_info_st my_charset_utf8_slovenian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; struct charset_info_st my_charset_utf8_polish_uca_ci= @@ -35763,7 +35091,7 @@ struct charset_info_st my_charset_utf8_polish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; struct charset_info_st my_charset_utf8_estonian_uca_ci= @@ -35795,7 +35123,7 @@ struct charset_info_st my_charset_utf8_estonian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; struct charset_info_st my_charset_utf8_spanish_uca_ci= @@ -35827,7 +35155,7 @@ struct charset_info_st my_charset_utf8_spanish_uca_ci= 0, /* 
escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; struct charset_info_st my_charset_utf8_swedish_uca_ci= @@ -35859,7 +35187,7 @@ struct charset_info_st my_charset_utf8_swedish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; struct charset_info_st my_charset_utf8_turkish_uca_ci= @@ -35891,7 +35219,7 @@ struct charset_info_st my_charset_utf8_turkish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; struct charset_info_st my_charset_utf8_czech_uca_ci= @@ -35923,7 +35251,7 @@ struct charset_info_st my_charset_utf8_czech_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; @@ -35956,7 +35284,7 @@ struct charset_info_st my_charset_utf8_danish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; struct charset_info_st my_charset_utf8_lithuanian_uca_ci= @@ -35988,7 +35316,7 @@ struct charset_info_st my_charset_utf8_lithuanian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; struct charset_info_st my_charset_utf8_slovak_uca_ci= @@ -36020,7 +35348,7 @@ struct charset_info_st my_charset_utf8_slovak_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; struct charset_info_st my_charset_utf8_spanish2_uca_ci= @@ -36052,7 +35380,7 @@ struct 
charset_info_st my_charset_utf8_spanish2_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; struct charset_info_st my_charset_utf8_roman_uca_ci= @@ -36084,7 +35412,7 @@ struct charset_info_st my_charset_utf8_roman_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; struct charset_info_st my_charset_utf8_persian_uca_ci= @@ -36116,7 +35444,7 @@ struct charset_info_st my_charset_utf8_persian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; struct charset_info_st my_charset_utf8_esperanto_uca_ci= @@ -36148,7 +35476,7 @@ struct charset_info_st my_charset_utf8_esperanto_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; struct charset_info_st my_charset_utf8_hungarian_uca_ci= @@ -36180,7 +35508,7 @@ struct charset_info_st my_charset_utf8_hungarian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; struct charset_info_st my_charset_utf8_sinhala_uca_ci= @@ -36212,7 +35540,7 @@ struct charset_info_st my_charset_utf8_sinhala_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; @@ -36245,7 +35573,7 @@ struct charset_info_st my_charset_utf8_german2_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; struct charset_info_st 
my_charset_utf8_croatian_mysql561_uca_ci= @@ -36277,7 +35605,7 @@ struct charset_info_st my_charset_utf8_croatian_mysql561_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; @@ -36310,7 +35638,7 @@ struct charset_info_st my_charset_utf8_croatian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; @@ -36343,7 +35671,7 @@ struct charset_info_st my_charset_utf8_myanmar_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; @@ -36376,7 +35704,7 @@ struct charset_info_st my_charset_utf8_unicode_520_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; struct charset_info_st my_charset_utf8_thai_520_w2= @@ -36408,7 +35736,7 @@ struct charset_info_st my_charset_utf8_thai_520_w2= 0, /* escape_with_backslash_is_dangerous */ 2, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler_multilevel + &my_uca_collation_handler_multilevel_utf8mb3 }; struct charset_info_st my_charset_utf8_vietnamese_ci= @@ -36440,7 +35768,7 @@ struct charset_info_st my_charset_utf8_vietnamese_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb3 }; @@ -36473,7 +35801,7 @@ struct charset_info_st my_charset_utf8_unicode_nopad_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_mb_uca_nopad_handler + &my_uca_collation_handler_nopad_utf8mb3 }; @@ -36506,7 +35834,7 @@ struct charset_info_st my_charset_utf8_unicode_520_nopad_ci= 0, 
/* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_mb_uca_nopad_handler + &my_uca_collation_handler_nopad_utf8mb3 }; #endif /* HAVE_CHARSET_utf8 */ @@ -36514,6 +35842,39 @@ struct charset_info_st my_charset_utf8_unicode_520_nopad_ci= #ifdef HAVE_CHARSET_utf8mb4 +static my_bool +my_uca_coll_init_utf8mb4(struct charset_info_st *cs, MY_CHARSET_LOADER *loader); + + +#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf8mb4 +#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb4_quick(wc, beg, end)) +#define MY_LIKE_RANGE my_like_range_mb +#define MY_UCA_ASCII_OPTIMIZE 1 +#define MY_UCA_COMPILE_CONTRACTIONS 1 +#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb4 +#include "ctype-uca.ic" + +#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _no_contractions_utf8mb4 +#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb4_quick(wc, beg, end)) +#define MY_LIKE_RANGE my_like_range_mb +#define MY_UCA_ASCII_OPTIMIZE 1 +#define MY_UCA_COMPILE_CONTRACTIONS 0 +#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb4 +#include "ctype-uca.ic" + + +static my_bool +my_uca_coll_init_utf8mb4(struct charset_info_st *cs, MY_CHARSET_LOADER *loader) +{ + if (my_coll_init_uca(cs, loader)) + return TRUE; + if (my_uca_collation_can_optimize_no_contractions(cs)) + my_uca_handler_map(cs, &my_uca_package_utf8mb4, + &my_uca_package_no_contractions_utf8mb4); + return FALSE; +} + + extern MY_CHARSET_HANDLER my_charset_utf8mb4_handler; #define MY_CS_UTF8MB4_UCA_FLAGS (MY_CS_COMMON_UCA_FLAGS|MY_CS_UNICODE_SUPPLEMENT) @@ -36548,7 +35909,7 @@ struct charset_info_st my_charset_utf8mb4_unicode_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; @@ -36581,7 +35942,7 @@ struct charset_info_st my_charset_utf8mb4_icelandic_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - 
&my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; struct charset_info_st my_charset_utf8mb4_latvian_uca_ci= @@ -36613,7 +35974,7 @@ struct charset_info_st my_charset_utf8mb4_latvian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; struct charset_info_st my_charset_utf8mb4_romanian_uca_ci= @@ -36645,7 +36006,7 @@ struct charset_info_st my_charset_utf8mb4_romanian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; struct charset_info_st my_charset_utf8mb4_slovenian_uca_ci= @@ -36677,7 +36038,7 @@ struct charset_info_st my_charset_utf8mb4_slovenian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; struct charset_info_st my_charset_utf8mb4_polish_uca_ci= @@ -36709,7 +36070,7 @@ struct charset_info_st my_charset_utf8mb4_polish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; struct charset_info_st my_charset_utf8mb4_estonian_uca_ci= @@ -36741,7 +36102,7 @@ struct charset_info_st my_charset_utf8mb4_estonian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; struct charset_info_st my_charset_utf8mb4_spanish_uca_ci= @@ -36773,7 +36134,7 @@ struct charset_info_st my_charset_utf8mb4_spanish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; struct charset_info_st my_charset_utf8mb4_swedish_uca_ci= @@ 
-36805,7 +36166,7 @@ struct charset_info_st my_charset_utf8mb4_swedish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; struct charset_info_st my_charset_utf8mb4_turkish_uca_ci= @@ -36837,7 +36198,7 @@ struct charset_info_st my_charset_utf8mb4_turkish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; struct charset_info_st my_charset_utf8mb4_czech_uca_ci= @@ -36869,7 +36230,7 @@ struct charset_info_st my_charset_utf8mb4_czech_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; @@ -36902,7 +36263,7 @@ struct charset_info_st my_charset_utf8mb4_danish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; struct charset_info_st my_charset_utf8mb4_lithuanian_uca_ci= @@ -36934,7 +36295,7 @@ struct charset_info_st my_charset_utf8mb4_lithuanian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; struct charset_info_st my_charset_utf8mb4_slovak_uca_ci= @@ -36966,7 +36327,7 @@ struct charset_info_st my_charset_utf8mb4_slovak_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; struct charset_info_st my_charset_utf8mb4_spanish2_uca_ci= @@ -36998,7 +36359,7 @@ struct charset_info_st my_charset_utf8mb4_spanish2_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - 
&my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; struct charset_info_st my_charset_utf8mb4_roman_uca_ci= @@ -37030,7 +36391,7 @@ struct charset_info_st my_charset_utf8mb4_roman_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; struct charset_info_st my_charset_utf8mb4_persian_uca_ci= @@ -37062,7 +36423,7 @@ struct charset_info_st my_charset_utf8mb4_persian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; struct charset_info_st my_charset_utf8mb4_esperanto_uca_ci= @@ -37094,7 +36455,7 @@ struct charset_info_st my_charset_utf8mb4_esperanto_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; struct charset_info_st my_charset_utf8mb4_hungarian_uca_ci= @@ -37126,7 +36487,7 @@ struct charset_info_st my_charset_utf8mb4_hungarian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; struct charset_info_st my_charset_utf8mb4_sinhala_uca_ci= @@ -37158,7 +36519,7 @@ struct charset_info_st my_charset_utf8mb4_sinhala_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; struct charset_info_st my_charset_utf8mb4_german2_uca_ci= @@ -37190,7 +36551,7 @@ struct charset_info_st my_charset_utf8mb4_german2_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; struct charset_info_st 
my_charset_utf8mb4_croatian_mysql561_uca_ci= @@ -37222,7 +36583,7 @@ struct charset_info_st my_charset_utf8mb4_croatian_mysql561_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; @@ -37255,7 +36616,7 @@ struct charset_info_st my_charset_utf8mb4_croatian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; @@ -37288,7 +36649,7 @@ struct charset_info_st my_charset_utf8mb4_myanmar_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; struct charset_info_st my_charset_utf8mb4_thai_520_w2= @@ -37320,7 +36681,7 @@ struct charset_info_st my_charset_utf8mb4_thai_520_w2= 0, /* escape_with_backslash_is_dangerous */ 2, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler_multilevel + &my_uca_collation_handler_multilevel_utf8mb4 }; struct charset_info_st my_charset_utf8mb4_unicode_520_ci= @@ -37352,7 +36713,7 @@ struct charset_info_st my_charset_utf8mb4_unicode_520_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; @@ -37385,7 +36746,7 @@ struct charset_info_st my_charset_utf8mb4_vietnamese_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_any_uca_handler + &my_uca_collation_handler_utf8mb4 }; @@ -37418,7 +36779,7 @@ struct charset_info_st my_charset_utf8mb4_unicode_nopad_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_mb_uca_nopad_handler + &my_uca_collation_handler_nopad_utf8mb4 }; @@ -37451,7 +36812,7 @@ struct 
charset_info_st my_charset_utf8mb4_unicode_520_nopad_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8mb4_handler, - &my_collation_mb_uca_nopad_handler + &my_uca_collation_handler_nopad_utf8mb4 }; @@ -37460,20 +36821,14 @@ struct charset_info_st my_charset_utf8mb4_unicode_520_nopad_ci= #ifdef HAVE_CHARSET_utf32 -MY_COLLATION_HANDLER my_collation_utf32_uca_handler = -{ - my_coll_init_uca, /* init */ - my_strnncoll_any_uca, - my_strnncollsp_any_uca, - my_strnxfrm_any_uca, - my_strnxfrmlen_any_uca, - my_like_range_generic, - my_wildcmp_uca, - NULL, - my_instr_mb, - my_hash_sort_any_uca, - my_propagate_complex -}; +#include "ctype-utf32.h" +#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf32 +#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf32_quick(wc, beg, end)) +#define MY_LIKE_RANGE my_like_range_generic +#define MY_UCA_ASCII_OPTIMIZE 0 +#define MY_UCA_COMPILE_CONTRACTIONS 1 +#define MY_UCA_COLL_INIT my_coll_init_uca +#include "ctype-uca.ic" extern MY_CHARSET_HANDLER my_charset_utf32_handler; @@ -37510,7 +36865,7 @@ struct charset_info_st my_charset_utf32_unicode_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; @@ -37543,7 +36898,7 @@ struct charset_info_st my_charset_utf32_icelandic_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; struct charset_info_st my_charset_utf32_latvian_uca_ci= @@ -37575,7 +36930,7 @@ struct charset_info_st my_charset_utf32_latvian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; struct charset_info_st my_charset_utf32_romanian_uca_ci= @@ -37607,7 +36962,7 @@ struct charset_info_st my_charset_utf32_romanian_uca_ci= 0, /* 
escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; struct charset_info_st my_charset_utf32_slovenian_uca_ci= @@ -37639,7 +36994,7 @@ struct charset_info_st my_charset_utf32_slovenian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; struct charset_info_st my_charset_utf32_polish_uca_ci= @@ -37671,7 +37026,7 @@ struct charset_info_st my_charset_utf32_polish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; struct charset_info_st my_charset_utf32_estonian_uca_ci= @@ -37703,7 +37058,7 @@ struct charset_info_st my_charset_utf32_estonian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; struct charset_info_st my_charset_utf32_spanish_uca_ci= @@ -37735,7 +37090,7 @@ struct charset_info_st my_charset_utf32_spanish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; struct charset_info_st my_charset_utf32_swedish_uca_ci= @@ -37767,7 +37122,7 @@ struct charset_info_st my_charset_utf32_swedish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; struct charset_info_st my_charset_utf32_turkish_uca_ci= @@ -37799,7 +37154,7 @@ struct charset_info_st my_charset_utf32_turkish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; struct 
charset_info_st my_charset_utf32_czech_uca_ci= @@ -37831,7 +37186,7 @@ struct charset_info_st my_charset_utf32_czech_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; @@ -37864,7 +37219,7 @@ struct charset_info_st my_charset_utf32_danish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; struct charset_info_st my_charset_utf32_lithuanian_uca_ci= @@ -37896,7 +37251,7 @@ struct charset_info_st my_charset_utf32_lithuanian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; struct charset_info_st my_charset_utf32_slovak_uca_ci= @@ -37928,7 +37283,7 @@ struct charset_info_st my_charset_utf32_slovak_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; struct charset_info_st my_charset_utf32_spanish2_uca_ci= @@ -37960,7 +37315,7 @@ struct charset_info_st my_charset_utf32_spanish2_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; struct charset_info_st my_charset_utf32_roman_uca_ci= @@ -37992,7 +37347,7 @@ struct charset_info_st my_charset_utf32_roman_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; struct charset_info_st my_charset_utf32_persian_uca_ci= @@ -38024,7 +37379,7 @@ struct charset_info_st my_charset_utf32_persian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - 
&my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; struct charset_info_st my_charset_utf32_esperanto_uca_ci= @@ -38056,7 +37411,7 @@ struct charset_info_st my_charset_utf32_esperanto_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; struct charset_info_st my_charset_utf32_hungarian_uca_ci= @@ -38088,7 +37443,7 @@ struct charset_info_st my_charset_utf32_hungarian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; struct charset_info_st my_charset_utf32_sinhala_uca_ci= @@ -38120,7 +37475,7 @@ struct charset_info_st my_charset_utf32_sinhala_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; struct charset_info_st my_charset_utf32_german2_uca_ci= @@ -38152,7 +37507,7 @@ struct charset_info_st my_charset_utf32_german2_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; struct charset_info_st my_charset_utf32_croatian_mysql561_uca_ci= @@ -38184,7 +37539,7 @@ struct charset_info_st my_charset_utf32_croatian_mysql561_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; struct charset_info_st my_charset_utf32_croatian_uca_ci= @@ -38216,7 +37571,7 @@ struct charset_info_st my_charset_utf32_croatian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; @@ -38249,7 +37604,7 @@ struct charset_info_st 
my_charset_utf32_myanmar_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; @@ -38282,7 +37637,7 @@ struct charset_info_st my_charset_utf32_thai_520_w2= 0, /* escape_with_backslash_is_dangerous */ 2, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_any_uca_handler_multilevel + &my_uca_collation_handler_multilevel_utf32 }; @@ -38315,7 +37670,7 @@ struct charset_info_st my_charset_utf32_unicode_520_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; @@ -38348,7 +37703,7 @@ struct charset_info_st my_charset_utf32_vietnamese_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_utf32_uca_handler + &my_uca_collation_handler_utf32 }; @@ -38381,7 +37736,7 @@ struct charset_info_st my_charset_utf32_unicode_nopad_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_generic_uca_nopad_handler + &my_uca_collation_handler_nopad_utf32 }; @@ -38414,7 +37769,7 @@ struct charset_info_st my_charset_utf32_unicode_520_nopad_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf32_handler, - &my_collation_generic_uca_nopad_handler + &my_uca_collation_handler_nopad_utf32 }; @@ -38424,21 +37779,14 @@ struct charset_info_st my_charset_utf32_unicode_520_nopad_ci= #ifdef HAVE_CHARSET_utf16 - -MY_COLLATION_HANDLER my_collation_utf16_uca_handler = -{ - my_coll_init_uca, /* init */ - my_strnncoll_any_uca, - my_strnncollsp_any_uca, - my_strnxfrm_any_uca, - my_strnxfrmlen_any_uca, - my_like_range_generic, - my_wildcmp_uca, - NULL, - my_instr_mb, - my_hash_sort_any_uca, - my_propagate_complex -}; +#include "ctype-utf16.h" +#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf16 
+#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf16_quick(wc, beg, end)) +#define MY_LIKE_RANGE my_like_range_generic +#define MY_UCA_ASCII_OPTIMIZE 0 +#define MY_UCA_COMPILE_CONTRACTIONS 1 +#define MY_UCA_COLL_INIT my_coll_init_uca +#include "ctype-uca.ic" extern MY_CHARSET_HANDLER my_charset_utf16_handler; @@ -38475,7 +37823,7 @@ struct charset_info_st my_charset_utf16_unicode_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; @@ -38508,7 +37856,7 @@ struct charset_info_st my_charset_utf16_icelandic_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; struct charset_info_st my_charset_utf16_latvian_uca_ci= @@ -38540,7 +37888,7 @@ struct charset_info_st my_charset_utf16_latvian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; struct charset_info_st my_charset_utf16_romanian_uca_ci= @@ -38572,7 +37920,7 @@ struct charset_info_st my_charset_utf16_romanian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; struct charset_info_st my_charset_utf16_slovenian_uca_ci= @@ -38604,7 +37952,7 @@ struct charset_info_st my_charset_utf16_slovenian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; struct charset_info_st my_charset_utf16_polish_uca_ci= @@ -38636,7 +37984,7 @@ struct charset_info_st my_charset_utf16_polish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + 
&my_uca_collation_handler_utf16 }; struct charset_info_st my_charset_utf16_estonian_uca_ci= @@ -38668,7 +38016,7 @@ struct charset_info_st my_charset_utf16_estonian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; struct charset_info_st my_charset_utf16_spanish_uca_ci= @@ -38700,7 +38048,7 @@ struct charset_info_st my_charset_utf16_spanish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; struct charset_info_st my_charset_utf16_swedish_uca_ci= @@ -38732,7 +38080,7 @@ struct charset_info_st my_charset_utf16_swedish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; struct charset_info_st my_charset_utf16_turkish_uca_ci= @@ -38764,7 +38112,7 @@ struct charset_info_st my_charset_utf16_turkish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; struct charset_info_st my_charset_utf16_czech_uca_ci= @@ -38796,7 +38144,7 @@ struct charset_info_st my_charset_utf16_czech_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; @@ -38829,7 +38177,7 @@ struct charset_info_st my_charset_utf16_danish_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; struct charset_info_st my_charset_utf16_lithuanian_uca_ci= @@ -38861,7 +38209,7 @@ struct charset_info_st my_charset_utf16_lithuanian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* 
levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; struct charset_info_st my_charset_utf16_slovak_uca_ci= @@ -38893,7 +38241,7 @@ struct charset_info_st my_charset_utf16_slovak_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; struct charset_info_st my_charset_utf16_spanish2_uca_ci= @@ -38925,7 +38273,7 @@ struct charset_info_st my_charset_utf16_spanish2_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; struct charset_info_st my_charset_utf16_roman_uca_ci= @@ -38957,7 +38305,7 @@ struct charset_info_st my_charset_utf16_roman_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; struct charset_info_st my_charset_utf16_persian_uca_ci= @@ -38989,7 +38337,7 @@ struct charset_info_st my_charset_utf16_persian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; struct charset_info_st my_charset_utf16_esperanto_uca_ci= @@ -39021,7 +38369,7 @@ struct charset_info_st my_charset_utf16_esperanto_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; struct charset_info_st my_charset_utf16_hungarian_uca_ci= @@ -39053,7 +38401,7 @@ struct charset_info_st my_charset_utf16_hungarian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; struct charset_info_st 
my_charset_utf16_sinhala_uca_ci= @@ -39085,7 +38433,7 @@ struct charset_info_st my_charset_utf16_sinhala_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; struct charset_info_st my_charset_utf16_german2_uca_ci= @@ -39117,7 +38465,7 @@ struct charset_info_st my_charset_utf16_german2_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; @@ -39150,7 +38498,7 @@ struct charset_info_st my_charset_utf16_croatian_mysql561_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; @@ -39183,7 +38531,7 @@ struct charset_info_st my_charset_utf16_croatian_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; @@ -39216,7 +38564,7 @@ struct charset_info_st my_charset_utf16_myanmar_uca_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; @@ -39249,7 +38597,7 @@ struct charset_info_st my_charset_utf16_thai_520_w2= 0, /* escape_with_backslash_is_dangerous */ 2, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_any_uca_handler_multilevel + &my_uca_collation_handler_multilevel_utf16 }; @@ -39282,7 +38630,7 @@ struct charset_info_st my_charset_utf16_unicode_520_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; @@ -39315,7 +38663,7 @@ struct charset_info_st my_charset_utf16_vietnamese_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order 
*/ &my_charset_utf16_handler, - &my_collation_utf16_uca_handler + &my_uca_collation_handler_utf16 }; @@ -39348,7 +38696,7 @@ struct charset_info_st my_charset_utf16_unicode_nopad_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_generic_uca_nopad_handler + &my_uca_collation_handler_nopad_utf16 }; @@ -39381,7 +38729,7 @@ struct charset_info_st my_charset_utf16_unicode_520_nopad_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16_handler, - &my_collation_generic_uca_nopad_handler + &my_uca_collation_handler_nopad_utf16 }; diff --git a/strings/ctype-uca.ic b/strings/ctype-uca.ic new file mode 100644 index 00000000000..70c10199e3e --- /dev/null +++ b/strings/ctype-uca.ic @@ -0,0 +1,839 @@ +/* + Copyright (c) 2018 MariaDB Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + + +#ifndef MY_FUNCTION_NAME +#error MY_FUNCTION_NAME is not defined +#endif +#ifndef MY_MB_WC +#error MY_MB_WC is not defined +#endif +#ifndef MY_LIKE_RANGE +#error MY_LIKE_RANGE is not defined +#endif +#ifndef MY_UCA_ASCII_OPTIMIZE +#error MY_ASCII_OPTIMIZE is not defined +#endif +#ifndef MY_UCA_COMPILE_CONTRACTIONS +#error MY_UCA_COMPILE_CONTRACTIONS is not defined +#endif +#ifndef MY_UCA_COLL_INIT +#error MY_UCA_COLL_INIT is not defined +#endif + + +static inline int +MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner) +{ + /* + Check if the weights for the previous character have been + already fully scanned. If yes, then get the next character and + initialize wbeg and wlength to its weight string. + */ + + if (scanner->wbeg[0]) /* More weights left from the previous step: */ + return *scanner->wbeg++; /* return the next weight from expansion */ + + do + { + const uint16 *wpage; + my_wc_t wc[MY_UCA_MAX_CONTRACTION]; + int mblen; + + /* Get next character */ +#if MY_UCA_ASCII_OPTIMIZE + /* Get next ASCII character */ + if (scanner->sbeg < scanner->send && scanner->sbeg[0] < 0x80) + { + wc[0]= scanner->sbeg[0]; + scanner->sbeg+= 1; + +#if MY_UCA_COMPILE_CONTRACTIONS + if (my_uca_needs_context_handling(scanner->level, wc[0])) + { + uint16 *cweight= my_uca_context_weight_find(scanner, wc); + if (cweight) + return *cweight; + } +#endif + + scanner->page= 0; + scanner->code= (int) wc[0]; + scanner->wbeg= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0]; + if (scanner->wbeg[0]) + return *scanner->wbeg++; + continue; + } + else +#endif + /* Get next MB character */ + if (((mblen= MY_MB_WC(scanner, wc, scanner->sbeg, + scanner->send)) <= 0)) + { + if (scanner->sbeg >= scanner->send) + return -1; /* No more bytes, end of line reached 
*/ + /* + There are some more bytes left. Non-positive mb_len means that + we got an incomplete or a bad byte sequence. Consume mbminlen bytes. + */ + if ((scanner->sbeg+= scanner->cs->mbminlen) > scanner->send) + { + /* For safety purposes don't go beyond the string range. */ + scanner->sbeg= scanner->send; + } + /* + Treat every complete or incomplete mbminlen unit as a weight which is + greater than weight for any possible normal character. + 0xFFFF is greater than any possible weight in the UCA weight table. + */ + return 0xFFFF; + } + + scanner->sbeg+= mblen; + if (wc[0] > scanner->level->maxchar) + { + /* Return 0xFFFD as weight for all characters outside BMP */ + scanner->wbeg= nochar; + return 0xFFFD; + } + +#if MY_UCA_COMPILE_CONTRACTIONS + if (my_uca_needs_context_handling(scanner->level, wc[0])) + { + uint16 *cweight= my_uca_context_weight_find(scanner, wc); + if (cweight) + return *cweight; + } +#endif + + /* Process single character */ + scanner->page= wc[0] >> 8; + scanner->code= wc[0] & 0xFF; + + /* If weight page for w[0] does not exist, then calculate algoritmically */ + if (!(wpage= scanner->level->weights[scanner->page])) + return my_uca_scanner_next_implicit(scanner); + + /* Calculate pointer to w[0]'s weight, using page and offset */ + scanner->wbeg= wpage + + scanner->code * scanner->level->lengths[scanner->page]; + } while (!scanner->wbeg[0]); /* Skip ignorable characters */ + + return *scanner->wbeg++; +} + + + +/* + Compares two strings according to the collation + + SYNOPSIS: + strnncoll_onelevel() + cs Character set information + level Weight level (0 primary, 1 secondary, 2 tertiary, etc) + s First string + slen First string length + t Second string + tlen Seconf string length + level DUCETweight level + + NOTES: + Initializes two weight scanners and gets weights + corresponding to two strings in a loop. If weights are not + the same at some step then returns their difference. 
+ + In the while() comparison these situations are possible: + 1. (s_res>0) and (t_res>0) and (s_res == t_res) + Weights are the same so far, continue comparison + 2. (s_res>0) and (t_res>0) and (s_res!=t_res) + A difference has been found, return. + 3. (s_res>0) and (t_res<0) + We have reached the end of the second string, or found + an illegal multibyte sequence in the second string. + Return a positive number, i.e. the first string is bigger. + 4. (s_res<0) and (t_res>0) + We have reached the end of the first string, or found + an illegal multibyte sequence in the first string. + Return a negative number, i.e. the second string is bigger. + 5. (s_res<0) and (t_res<0) + Both scanners returned -1. It means we have riched + the end-of-string of illegal-sequence in both strings + at the same time. Return 0, strings are equal. + + RETURN + Difference between two strings, according to the collation: + 0 - means strings are equal + negative number - means the first string is smaller + positive number - means the first string is bigger +*/ + +static int +MY_FUNCTION_NAME(strnncoll_onelevel)(CHARSET_INFO *cs, + const MY_UCA_WEIGHT_LEVEL *level, + const uchar *s, size_t slen, + const uchar *t, size_t tlen, + my_bool t_is_prefix) +{ + my_uca_scanner sscanner; + my_uca_scanner tscanner; + int s_res; + int t_res; + + my_uca_scanner_init_any(&sscanner, cs, level, s, slen); + my_uca_scanner_init_any(&tscanner, cs, level, t, tlen); + + do + { + s_res= MY_FUNCTION_NAME(scanner_next)(&sscanner); + t_res= MY_FUNCTION_NAME(scanner_next)(&tscanner); + } while ( s_res == t_res && s_res >0); + + return (t_is_prefix && t_res < 0) ? 0 : (s_res - t_res); +} + + +/* + One-level, PAD SPACE. +*/ +static int +MY_FUNCTION_NAME(strnncoll)(CHARSET_INFO *cs, + const uchar *s, size_t slen, + const uchar *t, size_t tlen, + my_bool t_is_prefix) +{ + return MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[0], + s, slen, t, tlen, t_is_prefix); +} + + +/* + Multi-level, PAD SPACE. 
+*/ +static int +MY_FUNCTION_NAME(strnncoll_multilevel)(CHARSET_INFO *cs, + const uchar *s, size_t slen, + const uchar *t, size_t tlen, + my_bool t_is_prefix) +{ + uint i, num_level= cs->levels_for_order; + for (i= 0; i != num_level; i++) + { + int ret= MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[i], + s, slen, t, tlen, + t_is_prefix); + if (ret) + return ret; + } + return 0; +} + + +/* + Compares two strings according to the collation, + ignoring trailing spaces. + + SYNOPSIS: + strnncollsp_onelevel() + cs Character set information + level UCA weight level + s First string + slen First string length + t Second string + tlen Seconf string length + level DUCETweight level + + NOTES: + Works exactly the same with my_strnncoll_uca(), + but ignores trailing spaces. + + In the while() comparison these situations are possible: + 1. (s_res>0) and (t_res>0) and (s_res == t_res) + Weights are the same so far, continue comparison + 2. (s_res>0) and (t_res>0) and (s_res!=t_res) + A difference has been found, return. + 3. (s_res>0) and (t_res<0) + We have reached the end of the second string, or found + an illegal multibyte sequence in the second string. + Compare the first string to an infinite array of + space characters until difference is found, or until + the end of the first string. + 4. (s_res<0) and (t_res>0) + We have reached the end of the first string, or found + an illegal multibyte sequence in the first string. + Compare the second string to an infinite array of + space characters until difference is found or until + the end of the second steing. + 5. (s_res<0) and (t_res<0) + Both scanners returned -1. It means we have riched + the end-of-string of illegal-sequence in both strings + at the same time. Return 0, strings are equal. 
+ + RETURN + Difference between two strings, according to the collation: + 0 - means strings are equal + negative number - means the first string is smaller + positive number - means the first string is bigger +*/ + +static int +MY_FUNCTION_NAME(strnncollsp_onelevel)(CHARSET_INFO *cs, + const MY_UCA_WEIGHT_LEVEL *level, + const uchar *s, size_t slen, + const uchar *t, size_t tlen) +{ + my_uca_scanner sscanner, tscanner; + int s_res, t_res; + + my_uca_scanner_init_any(&sscanner, cs, level, s, slen); + my_uca_scanner_init_any(&tscanner, cs, level, t, tlen); + + do + { + s_res= MY_FUNCTION_NAME(scanner_next)(&sscanner); + t_res= MY_FUNCTION_NAME(scanner_next)(&tscanner); + } while ( s_res == t_res && s_res >0); + + if (s_res > 0 && t_res < 0) + { + /* Calculate weight for SPACE character */ + t_res= my_space_weight(level); + + /* compare the first string to spaces */ + do + { + if (s_res != t_res) + return (s_res - t_res); + s_res= MY_FUNCTION_NAME(scanner_next)(&sscanner); + } while (s_res > 0); + return 0; + } + + if (s_res < 0 && t_res > 0) + { + /* Calculate weight for SPACE character */ + s_res= my_space_weight(level); + + /* compare the second string to spaces */ + do + { + if (s_res != t_res) + return (s_res - t_res); + t_res= MY_FUNCTION_NAME(scanner_next)(&tscanner); + } while (t_res > 0); + return 0; + } + + return ( s_res - t_res ); +} + + +/* + One-level, PAD SPACE +*/ +static int +MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs, + const uchar *s, size_t slen, + const uchar *t, size_t tlen) +{ + return MY_FUNCTION_NAME(strnncollsp_onelevel)(cs, &cs->uca->level[0], + s, slen, t, tlen); +} + + +/* + One-level, NO PAD +*/ +static int +MY_FUNCTION_NAME(strnncollsp_nopad)(CHARSET_INFO *cs, + const uchar *s, size_t slen, + const uchar *t, size_t tlen) +{ + return MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[0], + s, slen, t, tlen, FALSE); +} + + +/* + Multi-level, PAD SPACE +*/ +static int +MY_FUNCTION_NAME(strnncollsp_multilevel)(CHARSET_INFO *cs, + 
const uchar *s, size_t slen, + const uchar *t, size_t tlen) +{ + + uint i, num_level= cs->levels_for_order; + for (i= 0; i != num_level; i++) + { + int ret= MY_FUNCTION_NAME(strnncollsp_onelevel)(cs, &cs->uca->level[i], + s, slen, t, tlen); + if (ret) + return ret; + } + return 0; +} + + +/* + Multi-level, NO PAD +*/ +static int +MY_FUNCTION_NAME(strnncollsp_nopad_multilevel)(CHARSET_INFO *cs, + const uchar *s, size_t slen, + const uchar *t, size_t tlen) +{ + uint num_level= cs->levels_for_order; + uint i; + for (i= 0; i != num_level; i++) + { + int ret= MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[i], + s, slen, t, tlen, FALSE); + if (ret) + return ret; + } + return 0; +} + + + +/* + Calculates hash value for the given string, + according to the collation, and ignoring trailing spaces. + + SYNOPSIS: + hash_sort() + cs Character set information + s String + slen String's length + n1 First hash parameter + n2 Second hash parameter + + NOTES: + Scans consequently weights and updates + hash parameters n1 and n2. In a case insensitive collation, + upper and lower case of the same letter will return the same + weight sequence, and thus will produce the same hash values + in n1 and n2. + + This functions is used for one-level and for multi-level collations. + We intentionally use only primary level in multi-level collations. + This helps to have PARTITION BY KEY put primarily equal records + into the same partition. E.g. in utf8_thai_520_ci records that differ + only in tone marks go into the same partition. 
+ + RETURN + N/A +*/ + +static void +MY_FUNCTION_NAME(hash_sort)(CHARSET_INFO *cs, + const uchar *s, size_t slen, + ulong *nr1, ulong *nr2) +{ + int s_res; + my_uca_scanner scanner; + int space_weight= my_space_weight(&cs->uca->level[0]); + register ulong m1= *nr1, m2= *nr2; + + my_uca_scanner_init_any(&scanner, cs, &cs->uca->level[0], s, slen); + + while ((s_res= MY_FUNCTION_NAME(scanner_next)(&scanner)) >0) + { + if (s_res == space_weight) + { + /* Combine all spaces to be able to skip end spaces */ + uint count= 0; + do + { + count++; + if ((s_res= MY_FUNCTION_NAME(scanner_next)(&scanner)) <= 0) + { + /* Skip strings at end of string */ + goto end; + } + } + while (s_res == space_weight); + + /* Add back that has for the space characters */ + do + { + /* + We can't use MY_HASH_ADD_16() here as we, because of a misstake + in the original code, where we added the 16 byte variable the + opposite way. Changing this would cause old partitioned tables + to fail. + */ + MY_HASH_ADD(m1, m2, space_weight >> 8); + MY_HASH_ADD(m1, m2, space_weight & 0xFF); + } + while (--count != 0); + + } + /* See comment above why we can't use MY_HASH_ADD_16() */ + MY_HASH_ADD(m1, m2, s_res >> 8); + MY_HASH_ADD(m1, m2, s_res & 0xFF); + } +end: + *nr1= m1; + *nr2= m2; +} + + +static void +MY_FUNCTION_NAME(hash_sort_nopad)(CHARSET_INFO *cs, + const uchar *s, size_t slen, + ulong *nr1, ulong *nr2) +{ + int s_res; + my_uca_scanner scanner; + register ulong m1= *nr1, m2= *nr2; + + my_uca_scanner_init_any(&scanner, cs, &cs->uca->level[0], s, slen); + + while ((s_res= MY_FUNCTION_NAME(scanner_next)(&scanner)) >0) + { + /* See comment above why we can't use MY_HASH_ADD_16() */ + MY_HASH_ADD(m1, m2, s_res >> 8); + MY_HASH_ADD(m1, m2, s_res & 0xFF); + } + *nr1= m1; + *nr2= m2; +} + + + +/* + For the given string creates its "binary image", suitable + to be used in binary comparison, i.e. in memcmp(). 
+ + SYNOPSIS: + my_strnxfrm_uca() + cs Character set information + dst Where to write the image + dstlen Space available for the image, in bytes + src The source string + srclen Length of the source string, in bytes + + NOTES: + In a loop, scans weights from the source string and writes + them into the binary image. In a case insensitive collation, + upper and lower cases of the same letter will produce the + same image subsequences. When we have reached the end-of-string + or found an illegal multibyte sequence, the loop stops. + + It is impossible to restore the original string using its + binary image. + + Binary images are used for bulk comparison purposes, + e.g. in ORDER BY, when it is more efficient to create + a binary image and use it instead of weight scanner + for the original strings for every comparison. + + RETURN + Number of bytes that have been written into the binary image. +*/ + +static uchar * +MY_FUNCTION_NAME(strnxfrm_onelevel_internal)(CHARSET_INFO *cs, + MY_UCA_WEIGHT_LEVEL *level, + uchar *dst, uchar *de, + uint *nweights, + const uchar *src, size_t srclen) +{ + my_uca_scanner scanner; + int s_res; + + DBUG_ASSERT(src || !srclen); + +#if MY_UCA_ASCII_OPTIMIZE && !MY_UCA_COMPILE_CONTRACTIONS + /* + Fast path for the ASCII range with no contractions. + */ + { + const uchar *de2= de - 1; /* Last position where 2 bytes fit */ + const uint16 *weights0= level->weights[0]; + uint lengths0= level->lengths[0]; + for ( ; ; src++, srclen--) + { + const uint16 *weight; + if (!srclen || !*nweights) + return dst; /* Done */ + if (*src > 0x7F) + break; /* Non-ASCII */ + + weight= weights0 + (((uint) *src) * lengths0); + if (!(s_res= *weight)) + continue; /* Ignorable */ + if (weight[1]) /* Expansion (e.g. 
in a user defined collation */ + break; + + /* Here we have a character with extactly one 2-byte UCA weight */ + if (dst < de2) /* Most typical case is when both bytes fit */ + { + *dst++= s_res >> 8; + *dst++= s_res & 0xFF; + (*nweights)--; + continue; + } + if (dst >= de) /* No space left in "dst" */ + return dst; + *dst++= s_res >> 8; /* There is space only for one byte */ + (*nweights)--; + return dst; + } + } +#endif + + my_uca_scanner_init_any(&scanner, cs, level, src, srclen); + for (; dst < de && *nweights && + (s_res= MY_FUNCTION_NAME(scanner_next)(&scanner)) > 0 ; (*nweights)--) + { + *dst++= s_res >> 8; + if (dst < de) + *dst++= s_res & 0xFF; + } + return dst; +} + + +static uchar * +MY_FUNCTION_NAME(strnxfrm_onelevel)(CHARSET_INFO *cs, + MY_UCA_WEIGHT_LEVEL *level, + uchar *dst, uchar *de, uint nweights, + const uchar *src, size_t srclen, uint flags) +{ + uchar *d0= dst; + dst= MY_FUNCTION_NAME(strnxfrm_onelevel_internal)(cs, level, + dst, de, &nweights, + src, srclen); + DBUG_ASSERT(dst <= de); + if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) + dst= my_strnxfrm_uca_padn(dst, de, nweights, my_space_weight(level)); + DBUG_ASSERT(dst <= de); + my_strxfrm_desc_and_reverse(d0, dst, flags, 0); + return dst; +} + + + +static uchar * +MY_FUNCTION_NAME(strnxfrm_nopad_onelevel)(CHARSET_INFO *cs, + MY_UCA_WEIGHT_LEVEL *level, + uchar *dst, uchar *de, uint nweights, + const uchar *src, size_t srclen, + uint flags) +{ + uchar *d0= dst; + dst= MY_FUNCTION_NAME(strnxfrm_onelevel_internal)(cs, level, + dst, de, &nweights, + src, srclen); + DBUG_ASSERT(dst <= de); + /* Pad with the minimum possible weight on this level */ + if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) + dst= my_strnxfrm_uca_padn(dst, de, nweights, min_weight_on_level(level)); + DBUG_ASSERT(dst <= de); + my_strxfrm_desc_and_reverse(d0, dst, flags, 0); + return dst; +} + + +static size_t +MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs, + uchar *dst, size_t dstlen, uint 
nweights, + const uchar *src, size_t srclen, uint flags) +{ + uchar *d0= dst; + uchar *de= dst + dstlen; + + /* + There are two ways to handle trailing spaces for PAD SPACE collations: + 1. Keep trailing spaces as they are, so have strnxfrm_onelevel() scan + spaces as normal characters. This will call scanner_next() for every + trailing space and calculate its weight using UCA weights. + 2. Strip trailing spaces before calling strnxfrm_onelevel(), as it will + append weights for implicit spaces anyway, up to the desired key size. + This will effectively generate exactly the same sortable key result. + The latter is much faster. + */ + + if (flags & MY_STRXFRM_PAD_WITH_SPACE) + srclen= cs->cset->lengthsp(cs, (const char*) src, srclen); + dst= MY_FUNCTION_NAME(strnxfrm_onelevel)(cs, &cs->uca->level[0], + dst, de, nweights, + src, srclen, flags); + /* + This can probably be changed to memset(dst, 0, de - dst), + like my_strnxfrm_uca_multilevel() does. + */ + if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) + dst= my_strnxfrm_uca_pad(dst, de, my_space_weight(&cs->uca->level[0])); + return dst - d0; +} + + +static size_t +MY_FUNCTION_NAME(strnxfrm_nopad)(CHARSET_INFO *cs, + uchar *dst, size_t dstlen, + uint nweights, + const uchar *src, size_t srclen, + uint flags) +{ + uchar *d0= dst; + uchar *de= dst + dstlen; + + dst= MY_FUNCTION_NAME(strnxfrm_nopad_onelevel)(cs, &cs->uca->level[0], + dst, de, nweights, + src, srclen, flags); + if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) + { + memset(dst, 0, de - dst); + dst= de; + } + return dst - d0; +} + + +static size_t +MY_FUNCTION_NAME(strnxfrm_multilevel)(CHARSET_INFO *cs, + uchar *dst, size_t dstlen, + uint nweights, + const uchar *src, size_t srclen, + uint flags) +{ + uint num_level= cs->levels_for_order; + uchar *d0= dst; + uchar *de= dst + dstlen; + uint current_level; + + for (current_level= 0; current_level != num_level; current_level++) + { + if (!(flags & MY_STRXFRM_LEVEL_ALL) || + (flags & 
(MY_STRXFRM_LEVEL1 << current_level))) + dst= cs->state & MY_CS_NOPAD ? + MY_FUNCTION_NAME(strnxfrm_nopad_onelevel)(cs, + &cs->uca->level[current_level], + dst, de, nweights, + src, srclen, flags) : + MY_FUNCTION_NAME(strnxfrm_onelevel)(cs, + &cs->uca->level[current_level], + dst, de, nweights, + src, srclen, flags); + } + + if (dst < de && (flags & MY_STRXFRM_PAD_TO_MAXLEN)) + { + memset(dst, 0, de - dst); + dst= de; + } + + return dst - d0; +} + + +/* + One-level, PAD SPACE +*/ +MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler)= +{ + MY_UCA_COLL_INIT, + MY_FUNCTION_NAME(strnncoll), + MY_FUNCTION_NAME(strnncollsp), + MY_FUNCTION_NAME(strnxfrm), + my_strnxfrmlen_any_uca, + MY_LIKE_RANGE, + my_wildcmp_uca, + NULL, /* strcasecmp() */ + my_instr_mb, + MY_FUNCTION_NAME(hash_sort), + my_propagate_complex +}; + + +/* + One-level, NO PAD + For character sets with mbminlen==1 use MY_LIKE_RANGE=my_like_range_mb + For character sets with mbminlen>=2 use MY_LIKE_RANGE=my_like_range_generic +*/ +MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad)= +{ + MY_UCA_COLL_INIT, + MY_FUNCTION_NAME(strnncoll), + MY_FUNCTION_NAME(strnncollsp_nopad), + MY_FUNCTION_NAME(strnxfrm_nopad), + my_strnxfrmlen_any_uca, + MY_LIKE_RANGE, /* my_like_range_mb or my_like_range_generic */ + my_wildcmp_uca, + NULL, /* strcasecmp() */ + my_instr_mb, + MY_FUNCTION_NAME(hash_sort_nopad), + my_propagate_complex +}; + + +/* + Multi-level, PAD SPACE +*/ +MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_multilevel)= +{ + MY_UCA_COLL_INIT, + MY_FUNCTION_NAME(strnncoll_multilevel), + MY_FUNCTION_NAME(strnncollsp_multilevel), + MY_FUNCTION_NAME(strnxfrm_multilevel), + my_strnxfrmlen_any_uca_multilevel, + MY_LIKE_RANGE, + my_wildcmp_uca, + NULL, /* strcasecmp() */ + my_instr_mb, + MY_FUNCTION_NAME(hash_sort), + my_propagate_complex +}; + + +/* + Multi-level, NO PAD +*/ +MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad_multilevel)= +{ + MY_UCA_COLL_INIT, + 
MY_FUNCTION_NAME(strnncoll_multilevel), + MY_FUNCTION_NAME(strnncollsp_nopad_multilevel), + MY_FUNCTION_NAME(strnxfrm_multilevel), + my_strnxfrmlen_any_uca_multilevel, + MY_LIKE_RANGE, + my_wildcmp_uca, + NULL, /* strcasecmp() */ + my_instr_mb, + MY_FUNCTION_NAME(hash_sort), + my_propagate_complex +}; + + +MY_COLLATION_HANDLER_PACKAGE MY_FUNCTION_NAME(package)= +{ + &MY_FUNCTION_NAME(collation_handler), + &MY_FUNCTION_NAME(collation_handler_nopad), + &MY_FUNCTION_NAME(collation_handler_multilevel), + &MY_FUNCTION_NAME(collation_handler_nopad_multilevel) +}; + + +#undef MY_FUNCTION_NAME +#undef MY_MB_WC +#undef MY_LIKE_RANGE +#undef MY_UCA_ASCII_OPTIMIZE +#undef MY_UCA_COMPILE_CONTRACTIONS +#undef MY_UCA_COLL_INIT diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index 7596b7f2168..28e7def3ddf 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -23,6 +23,8 @@ #include <my_sys.h> #include <stdarg.h> +#include "ctype-unidata.h" + #if defined(HAVE_CHARSET_utf16) || defined(HAVE_CHARSET_ucs2) #define HAVE_CHARSET_mb2 @@ -1184,35 +1186,7 @@ my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)), but the JSON functions needed my_utf16_uni() so the #ifdef was moved lower. 
*/ - - -/* - D800..DB7F - Non-provate surrogate high (896 pages) - DB80..DBFF - Private surrogate high (128 pages) - DC00..DFFF - Surrogate low (1024 codes in a page) -*/ -#define MY_UTF16_SURROGATE_HIGH_FIRST 0xD800 -#define MY_UTF16_SURROGATE_HIGH_LAST 0xDBFF -#define MY_UTF16_SURROGATE_LOW_FIRST 0xDC00 -#define MY_UTF16_SURROGATE_LOW_LAST 0xDFFF - -#define MY_UTF16_HIGH_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xD8) -#define MY_UTF16_LOW_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xDC) -/* Test if a byte is a leading byte of a high or low surrogate head: */ -#define MY_UTF16_SURROGATE_HEAD(x) ((((uchar) (x)) & 0xF8) == 0xD8) -/* Test if a Unicode code point is a high or low surrogate head */ -#define MY_UTF16_SURROGATE(x) (((x) & 0xF800) == 0xD800) - -#define MY_UTF16_WC2(a, b) ((a << 8) + b) - -/* - a= 110110?? (<< 18) - b= ???????? (<< 10) - c= 110111?? (<< 8) - d= ???????? (<< 0) -*/ -#define MY_UTF16_WC4(a, b, c, d) (((a & 3) << 18) + (b << 10) + \ - ((c & 3) << 8) + d + 0x10000) +#include "ctype-utf16.h" #define IS_MB2_CHAR(b0,b1) (!MY_UTF16_SURROGATE_HEAD(b0)) #define IS_MB4_CHAR(b0,b1,b2,b3) (MY_UTF16_HIGH_HEAD(b0) && MY_UTF16_LOW_HEAD(b2)) @@ -1220,10 +1194,17 @@ my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)), static inline int my_weight_mb2_utf16mb2_general_ci(uchar b0, uchar b1) { my_wc_t wc= MY_UTF16_WC2(b0, b1); - MY_UNICASE_CHARACTER *page= my_unicase_default.page[wc >> 8]; + MY_UNICASE_CHARACTER *page= my_unicase_default_pages[wc >> 8]; return (int) (page ? 
page[wc & 0xFF].sort : wc); } #define MY_FUNCTION_NAME(x) my_ ## x ## _utf16_general_ci +#define DEFINE_STRNXFRM_UNICODE +#define DEFINE_STRNXFRM_UNICODE_NOPAD +#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_utf16_quick(pwc, s, e) +#define OPTIMIZE_ASCII 0 +#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR +#define UNICASE_PAGE0 my_unicase_default_page00 +#define UNICASE_PAGES my_unicase_default_pages #define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) #define WEIGHT_MB2(b0,b1) my_weight_mb2_utf16mb2_general_ci(b0,b1) #define WEIGHT_MB4(b0,b1,b2,b3) MY_CS_REPLACEMENT_CHARACTER @@ -1261,32 +1242,7 @@ static inline int my_weight_mb2_utf16mb2_general_ci(uchar b0, uchar b1) my_utf16_uni(CHARSET_INFO *cs __attribute__((unused)), my_wc_t *pwc, const uchar *s, const uchar *e) { - if (s + 2 > e) - return MY_CS_TOOSMALL2; - - /* - High bytes: 0xD[89AB] = B'110110??' - Low bytes: 0xD[CDEF] = B'110111??' - Surrogate mask: 0xFC = B'11111100' - */ - - if (MY_UTF16_HIGH_HEAD(*s)) /* Surrogate head */ - { - if (s + 4 > e) - return MY_CS_TOOSMALL4; - - if (!MY_UTF16_LOW_HEAD(s[2])) /* Broken surrigate pair */ - return MY_CS_ILSEQ; - - *pwc= MY_UTF16_WC4(s[0], s[1], s[2], s[3]); - return 4; - } - - if (MY_UTF16_LOW_HEAD(*s)) /* Low surrogate part without high part */ - return MY_CS_ILSEQ; - - *pwc= MY_UTF16_WC2(s[0], s[1]); - return 2; + return my_mb_wc_utf16_quick(pwc, s, e); } @@ -1546,7 +1502,7 @@ static MY_COLLATION_HANDLER my_collation_utf16_general_ci_handler = NULL, /* init */ my_strnncoll_utf16_general_ci, my_strnncollsp_utf16_general_ci, - my_strnxfrm_unicode, + my_strnxfrm_utf16_general_ci, my_strnxfrmlen_unicode, my_like_range_generic, my_wildcmp_utf16_ci, @@ -1578,7 +1534,7 @@ static MY_COLLATION_HANDLER my_collation_utf16_general_nopad_ci_handler = NULL, /* init */ my_strnncoll_utf16_general_ci, my_strnncollsp_utf16_general_nopad_ci, - my_strnxfrm_unicode_nopad, + my_strnxfrm_nopad_utf16_general_ci, my_strnxfrmlen_unicode, my_like_range_generic, my_wildcmp_utf16_ci, @@ 
-1775,6 +1731,13 @@ struct charset_info_st my_charset_utf16_nopad_bin= #define IS_MB4_CHAR(b0,b1,b2,b3) (MY_UTF16_HIGH_HEAD(b1) && MY_UTF16_LOW_HEAD(b3)) #define MY_FUNCTION_NAME(x) my_ ## x ## _utf16le_general_ci +#define DEFINE_STRNXFRM_UNICODE +#define DEFINE_STRNXFRM_UNICODE_NOPAD +#define MY_MB_WC(cs, pwc, s, e) (cs->cset->mb_wc(cs, pwc, s, e)) +#define OPTIMIZE_ASCII 0 +#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR +#define UNICASE_PAGE0 my_unicase_default_page00 +#define UNICASE_PAGES my_unicase_default_pages #define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) #define WEIGHT_MB2(b0,b1) my_weight_mb2_utf16mb2_general_ci(b1,b0) #define WEIGHT_MB4(b0,b1,b2,b3) MY_CS_REPLACEMENT_CHARACTER @@ -1879,7 +1842,7 @@ static MY_COLLATION_HANDLER my_collation_utf16le_general_ci_handler = NULL, /* init */ my_strnncoll_utf16le_general_ci, my_strnncollsp_utf16le_general_ci, - my_strnxfrm_unicode, + my_strnxfrm_utf16le_general_ci, my_strnxfrmlen_unicode, my_like_range_generic, my_wildcmp_utf16_ci, @@ -1911,7 +1874,7 @@ static MY_COLLATION_HANDLER my_collation_utf16le_general_nopad_ci_handler = NULL, /* init */ my_strnncoll_utf16le_general_ci, my_strnncollsp_utf16le_general_nopad_ci, - my_strnxfrm_unicode_nopad, + my_strnxfrm_nopad_utf16le_general_ci, my_strnxfrmlen_unicode, my_like_range_generic, my_wildcmp_utf16_ci, @@ -2109,6 +2072,8 @@ struct charset_info_st my_charset_utf16le_nopad_bin= #ifdef HAVE_CHARSET_utf32 +#include "ctype-utf32.h" + /* Check is b0 and b1 start a valid UTF32 four-byte sequence. Don't accept characters greater than U+10FFFF. 
@@ -2117,8 +2082,6 @@ struct charset_info_st my_charset_utf16le_nopad_bin= #define IS_MB4_CHAR(b0,b1,b2,b3) (IS_UTF32_MBHEAD4(b0,b1)) -#define MY_UTF32_WC4(b0,b1,b2,b3) ((((my_wc_t)b0) << 24) + (b1 << 16) + \ - (b2 << 8) + (b3)) static inline int my_weight_utf32_general_ci(uchar b0, uchar b1, uchar b2, uchar b3) @@ -2126,12 +2089,19 @@ static inline int my_weight_utf32_general_ci(uchar b0, uchar b1, my_wc_t wc= MY_UTF32_WC4(b0, b1, b2, b3); if (wc <= 0xFFFF) { - MY_UNICASE_CHARACTER *page= my_unicase_default.page[wc >> 8]; + MY_UNICASE_CHARACTER *page= my_unicase_default_pages[wc >> 8]; return (int) (page ? page[wc & 0xFF].sort : wc); } return MY_CS_REPLACEMENT_CHARACTER; } #define MY_FUNCTION_NAME(x) my_ ## x ## _utf32_general_ci +#define DEFINE_STRNXFRM_UNICODE +#define DEFINE_STRNXFRM_UNICODE_NOPAD +#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_utf32_quick(pwc, s, e) +#define OPTIMIZE_ASCII 0 +#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR +#define UNICASE_PAGE0 my_unicase_default_page00 +#define UNICASE_PAGES my_unicase_default_pages #define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) #define WEIGHT_MB4(b0,b1,b2,b3) my_weight_utf32_general_ci(b0, b1, b2, b3) #include "strcoll.ic" @@ -2161,10 +2131,7 @@ static int my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)), my_wc_t *pwc, const uchar *s, const uchar *e) { - if (s + 4 > e) - return MY_CS_TOOSMALL4; - *pwc= MY_UTF32_WC4(s[0], s[1], s[2], s[3]); - return *pwc > 0x10FFFF ? 
MY_CS_ILSEQ : 4; + return my_mb_wc_utf32_quick(pwc, s, e); } @@ -2698,7 +2665,7 @@ static MY_COLLATION_HANDLER my_collation_utf32_general_ci_handler = NULL, /* init */ my_strnncoll_utf32_general_ci, my_strnncollsp_utf32_general_ci, - my_strnxfrm_unicode, + my_strnxfrm_utf32_general_ci, my_strnxfrmlen_unicode, my_like_range_generic, my_wildcmp_utf32_ci, @@ -2730,7 +2697,7 @@ static MY_COLLATION_HANDLER my_collation_utf32_general_nopad_ci_handler = NULL, /* init */ my_strnncoll_utf32_general_ci, my_strnncollsp_utf32_general_nopad_ci, - my_strnxfrm_unicode_nopad, + my_strnxfrm_nopad_utf32_general_ci, my_strnxfrmlen_unicode, my_like_range_generic, my_wildcmp_utf32_ci, @@ -2928,6 +2895,8 @@ struct charset_info_st my_charset_utf32_nopad_bin= #ifdef HAVE_CHARSET_ucs2 +#include "ctype-ucs2.h" + static const uchar ctype_ucs2[] = { 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 40, 40, 40, 40, 40, 32, 32, @@ -2995,20 +2964,30 @@ static const uchar to_upper_ucs2[] = { static inline int my_weight_mb2_ucs2_general_ci(uchar b0, uchar b1) { my_wc_t wc= UCS2_CODE(b0, b1); - MY_UNICASE_CHARACTER *page= my_unicase_default.page[wc >> 8]; + MY_UNICASE_CHARACTER *page= my_unicase_default_pages[wc >> 8]; return (int) (page ? 
page[wc & 0xFF].sort : wc); } -#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_general_ci -#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) -#define WEIGHT_MB2(b0,b1) my_weight_mb2_ucs2_general_ci(b0,b1) +#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_general_ci +#define DEFINE_STRNXFRM_UNICODE +#define DEFINE_STRNXFRM_UNICODE_NOPAD +#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_ucs2_quick(pwc, s, e) +#define OPTIMIZE_ASCII 0 +#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR +#define UNICASE_PAGE0 my_unicase_default_page00 +#define UNICASE_PAGES my_unicase_default_pages +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB2(b0,b1) my_weight_mb2_ucs2_general_ci(b0,b1) #include "strcoll.ic" -#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_bin -#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) -#define WEIGHT_MB2(b0,b1) UCS2_CODE(b0,b1) +#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_bin +#define DEFINE_STRNXFRM_UNICODE_BIN2 +#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_ucs2_quick(pwc, s, e) +#define OPTIMIZE_ASCII 0 +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB2(b0,b1) UCS2_CODE(b0,b1) #include "strcoll.ic" @@ -3037,11 +3016,7 @@ my_charlen_ucs2(CHARSET_INFO *cs __attribute__((unused)), static int my_ucs2_uni(CHARSET_INFO *cs __attribute__((unused)), my_wc_t * pwc, const uchar *s, const uchar *e) { - if (s+2 > e) /* Need 2 characters */ - return MY_CS_TOOSMALL2; - - *pwc= ((uchar)s[0]) * 256 + ((uchar)s[1]); - return 2; + return my_mb_wc_ucs2_quick(pwc, s, e); } static int my_uni_ucs2(CHARSET_INFO *cs __attribute__((unused)) , @@ -3280,7 +3255,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler = NULL, /* init */ my_strnncoll_ucs2_general_ci, my_strnncollsp_ucs2_general_ci, - my_strnxfrm_unicode, + my_strnxfrm_ucs2_general_ci, my_strnxfrmlen_unicode, my_like_range_generic, my_wildcmp_ucs2_ci, @@ -3296,7 +3271,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_bin_handler = NULL, /* init */ my_strnncoll_ucs2_bin, 
my_strnncollsp_ucs2_bin, - my_strnxfrm_unicode, + my_strnxfrm_ucs2_bin, my_strnxfrmlen_unicode, my_like_range_generic, my_wildcmp_ucs2_bin, @@ -3312,7 +3287,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_general_nopad_ci_handler = NULL, /* init */ my_strnncoll_ucs2_general_ci, my_strnncollsp_ucs2_general_nopad_ci, - my_strnxfrm_unicode_nopad, + my_strnxfrm_nopad_ucs2_general_ci, my_strnxfrmlen_unicode, my_like_range_generic, my_wildcmp_ucs2_ci, @@ -3328,7 +3303,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_nopad_bin_handler = NULL, /* init */ my_strnncoll_ucs2_bin, my_strnncollsp_ucs2_nopad_bin, - my_strnxfrm_unicode_nopad, + my_strnxfrm_nopad_ucs2_bin, my_strnxfrmlen_unicode, my_like_range_generic, my_wildcmp_ucs2_bin, diff --git a/strings/ctype-ucs2.h b/strings/ctype-ucs2.h new file mode 100644 index 00000000000..c989324172d --- /dev/null +++ b/strings/ctype-ucs2.h @@ -0,0 +1,32 @@ +/* + Copyright (c) 2018 MariaDB Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#ifndef _CTYPE_UCS2_H +#define _CTYPE_UCS2_H + + +static inline int +my_mb_wc_ucs2_quick(my_wc_t * pwc, const uchar *s, const uchar *e) +{ + if (s+2 > e) /* Need 2 characters */ + return MY_CS_TOOSMALL2; + *pwc= ((uchar)s[0]) * 256 + ((uchar)s[1]); + return 2; +} + + +#endif /* _CTYPE_UCS2_H */ diff --git a/strings/ctype-unidata.h b/strings/ctype-unidata.h new file mode 100644 index 00000000000..6712f5e1d79 --- /dev/null +++ b/strings/ctype-unidata.h @@ -0,0 +1,31 @@ +#ifndef CTYPE_UNIDATA_H_INCLUDED +#define CTYPE_UNIDATA_H_INCLUDED +/* + Copyright (c) 2018 MariaDB Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#define MY_UNICASE_INFO_DEFAULT_MAXCHAR 0xFFFF +extern MY_UNICASE_CHARACTER my_unicase_default_page00[256]; +extern MY_UNICASE_CHARACTER *my_unicase_default_pages[256]; + +size_t my_strxfrm_pad_nweights_unicode(uchar *str, uchar *strend, size_t nweights); +size_t my_strxfrm_pad_unicode(uchar *str, uchar *strend); + + +#define PUT_WC_BE2_HAVE_1BYTE(dst, de, wc) \ + do { *dst++= (uchar) (wc >> 8); if (dst < de) *dst++= (uchar) (wc & 0xFF); } while(0) + +#endif /* CTYPE_UNIDATA_H_INCLUDED */ diff --git a/strings/ctype-utf16.h b/strings/ctype-utf16.h new file mode 100644 index 00000000000..d4cf4664f97 --- /dev/null +++ b/strings/ctype-utf16.h @@ -0,0 +1,80 @@ +/* + Copyright (c) 2018 MariaDB Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#ifndef _CTYPE_UTF16_H +#define _CTYPE_UTF16_H + +/* + D800..DB7F - Non-provate surrogate high (896 pages) + DB80..DBFF - Private surrogate high (128 pages) + DC00..DFFF - Surrogate low (1024 codes in a page) +*/ +#define MY_UTF16_SURROGATE_HIGH_FIRST 0xD800 +#define MY_UTF16_SURROGATE_HIGH_LAST 0xDBFF +#define MY_UTF16_SURROGATE_LOW_FIRST 0xDC00 +#define MY_UTF16_SURROGATE_LOW_LAST 0xDFFF + +#define MY_UTF16_HIGH_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xD8) +#define MY_UTF16_LOW_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xDC) +/* Test if a byte is a leading byte of a high or low surrogate head: */ +#define MY_UTF16_SURROGATE_HEAD(x) ((((uchar) (x)) & 0xF8) == 0xD8) +/* Test if a Unicode code point is a high or low surrogate head */ +#define MY_UTF16_SURROGATE(x) (((x) & 0xF800) == 0xD800) + +#define MY_UTF16_WC2(a, b) ((a << 8) + b) + +/* + a= 110110?? (<< 18) + b= ???????? (<< 10) + c= 110111?? (<< 8) + d= ???????? (<< 0) +*/ +#define MY_UTF16_WC4(a, b, c, d) (((a & 3) << 18) + (b << 10) + \ + ((c & 3) << 8) + d + 0x10000) + +static inline int +my_mb_wc_utf16_quick(my_wc_t *pwc, const uchar *s, const uchar *e) +{ + if (s + 2 > e) + return MY_CS_TOOSMALL2; + + /* + High bytes: 0xD[89AB] = B'110110??' + Low bytes: 0xD[CDEF] = B'110111??' 
+ Surrogate mask: 0xFC = B'11111100' + */ + + if (MY_UTF16_HIGH_HEAD(*s)) /* Surrogate head */ + { + if (s + 4 > e) + return MY_CS_TOOSMALL4; + + if (!MY_UTF16_LOW_HEAD(s[2])) /* Broken surrigate pair */ + return MY_CS_ILSEQ; + + *pwc= MY_UTF16_WC4(s[0], s[1], s[2], s[3]); + return 4; + } + + if (MY_UTF16_LOW_HEAD(*s)) /* Low surrogate part without high part */ + return MY_CS_ILSEQ; + + *pwc= MY_UTF16_WC2(s[0], s[1]); + return 2; +} + +#endif /* _CTYPE_UTF16_H */ diff --git a/strings/ctype-utf32.h b/strings/ctype-utf32.h new file mode 100644 index 00000000000..e295dc6d081 --- /dev/null +++ b/strings/ctype-utf32.h @@ -0,0 +1,33 @@ +/* + Copyright (c) 2018 MariaDB Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#ifndef _CTYPE_UTF32_H +#define _CTYPE_UTF32_H + +#define MY_UTF32_WC4(b0,b1,b2,b3) ((((my_wc_t)b0) << 24) + (b1 << 16) + \ + (b2 << 8) + (b3)) + +static inline int +my_mb_wc_utf32_quick(my_wc_t *pwc, const uchar *s, const uchar *e) +{ + if (s + 4 > e) + return MY_CS_TOOSMALL4; + *pwc= MY_UTF32_WC4(s[0], s[1], s[2], s[3]); + return *pwc > 0x10FFFF ? 
MY_CS_ILSEQ : 4; +} + +#endif /* _CTYPE_UTF32_H */ diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 4ef376dccc8..4ddb086b734 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -26,78 +26,10 @@ #define EILSEQ ENOENT #endif -/* Detect special bytes and sequences */ -#define IS_CONTINUATION_BYTE(c) (((uchar) (c) ^ 0x80) < 0x40) -/* - Check MB2 character assuming that b0 is alredy known to be >= 0xC2. - Use this macro if the caller already checked b0 for: - - an MB1 character - - an unused gap between MB1 and MB2HEAD -*/ -#define IS_UTF8MB2_STEP2(b0,b1) (((uchar) (b0) < 0xE0) && \ - IS_CONTINUATION_BYTE((uchar) b1)) +#include "ctype-utf8.h" +#include "ctype-unidata.h" -/* - Check MB3 character assuming that b0 is already known to be - in the valid MB3HEAD range [0xE0..0xEF]. -*/ -#define IS_UTF8MB3_STEP2(b0,b1,b2) (IS_CONTINUATION_BYTE(b1) && \ - IS_CONTINUATION_BYTE(b2) && \ - ((uchar) b0 >= 0xe1 || (uchar) b1 >= 0xa0)) - -/* - Check MB3 character assuming that b0 is already known to be >= 0xE0, - but is not checked for the high end 0xF0 yet. - Use this macro if the caller already checked b0 for: - - an MB1 character - - an unused gap between MB1 and MB2HEAD - - an MB2HEAD -*/ -#define IS_UTF8MB3_STEP3(b0,b1,b2) (((uchar) (b0) < 0xF0) && \ - IS_UTF8MB3_STEP2(b0,b1,b2)) - -/* - UTF-8 quick four-byte mask: - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - Encoding allows to encode U+00010000..U+001FFFFF - - The maximum character defined in the Unicode standard is U+0010FFFF. - Higher characters U+00110000..U+001FFFFF are not used. 
- - 11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min) - 11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max) - - Valid codes: - [F0][90..BF][80..BF][80..BF] - [F1][80..BF][80..BF][80..BF] - [F2][80..BF][80..BF][80..BF] - [F3][80..BF][80..BF][80..BF] - [F4][80..8F][80..BF][80..BF] -*/ - -/* - Check MB4 character assuming that b0 is already - known to be in the range [0xF0..0xF4] -*/ -#define IS_UTF8MB4_STEP2(b0,b1,b2,b3) (IS_CONTINUATION_BYTE(b1) && \ - IS_CONTINUATION_BYTE(b2) && \ - IS_CONTINUATION_BYTE(b3) && \ - (b0 >= 0xf1 || b1 >= 0x90) && \ - (b0 <= 0xf3 || b1 <= 0x8F)) -#define IS_UTF8MB4_STEP3(b0,b1,b2,b3) (((uchar) (b0) < 0xF5) && \ - IS_UTF8MB4_STEP2(b0,b1,b2,b3)) - -/* Convert individual bytes to Unicode code points */ -#define UTF8MB2_CODE(b0,b1) (((my_wc_t) ((uchar) b0 & 0x1f) << 6) |\ - ((my_wc_t) ((uchar) b1 ^ 0x80))) -#define UTF8MB3_CODE(b0,b1,b2) (((my_wc_t) ((uchar) b0 & 0x0f) << 12) |\ - ((my_wc_t) ((uchar) b1 ^ 0x80) << 6) |\ - ((my_wc_t) ((uchar) b2 ^ 0x80))) -#define UTF8MB4_CODE(b0,b1,b2,b3) (((my_wc_t) ((uchar) b0 & 0x07) << 18) |\ - ((my_wc_t) ((uchar) b1 ^ 0x80) << 12) |\ - ((my_wc_t) ((uchar) b2 ^ 0x80) << 6) |\ - (my_wc_t) ((uchar) b3 ^ 0x80)) /* Definitions for strcoll.ic */ #define IS_MB1_CHAR(x) ((uchar) (x) < 0x80) @@ -180,7 +112,7 @@ int my_valid_mbcharlen_utf8mb3(const uchar *s, const uchar *e) #include "my_uctype.h" -static MY_UNICASE_CHARACTER plane00[]={ +MY_UNICASE_CHARACTER my_unicase_default_page00[]={ {0x0000,0x0000,0x0000}, {0x0001,0x0001,0x0001}, {0x0002,0x0002,0x0002}, {0x0003,0x0003,0x0003}, {0x0004,0x0004,0x0004}, {0x0005,0x0005,0x0005}, @@ -313,7 +245,7 @@ static MY_UNICASE_CHARACTER plane00[]={ /* - Almost similar to plane00, but maps sorting order + Almost similar to my_unicase_default_page00, but maps sorting order for U+00DF to 0x00DF instead of 0x0053. 
*/ static MY_UNICASE_CHARACTER plane00_mysql500[]={ @@ -1759,9 +1691,10 @@ static MY_UNICASE_CHARACTER planeFF[]={ }; -static MY_UNICASE_CHARACTER *my_unicase_pages_default[256]= +MY_UNICASE_CHARACTER *my_unicase_default_pages[256]= { - plane00, plane01, plane02, plane03, plane04, plane05, NULL, NULL, + my_unicase_default_page00, + plane01, plane02, plane03, plane04, plane05, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, plane1E, plane1F, @@ -1798,8 +1731,8 @@ static MY_UNICASE_CHARACTER *my_unicase_pages_default[256]= MY_UNICASE_INFO my_unicase_default= { - 0xFFFF, - my_unicase_pages_default + MY_UNICASE_INFO_DEFAULT_MAXCHAR, + my_unicase_default_pages }; @@ -4646,7 +4579,7 @@ my_wildcmp_unicode(CHARSET_INFO *cs, @return Result length */ -static size_t +size_t my_strxfrm_pad_nweights_unicode(uchar *str, uchar *strend, size_t nweights) { uchar *str0; @@ -4675,7 +4608,7 @@ my_strxfrm_pad_nweights_unicode(uchar *str, uchar *strend, size_t nweights) @return Result length */ -static size_t +size_t my_strxfrm_pad_unicode(uchar *str, uchar *strend) { uchar *str0= str; @@ -4690,95 +4623,6 @@ my_strxfrm_pad_unicode(uchar *str, uchar *strend) } -size_t my_strnxfrm_unicode_internal(CHARSET_INFO *cs, - uchar *dst, uchar *de, uint *nweights, - const uchar *src, const uchar *se) -{ - my_wc_t UNINIT_VAR(wc); - int res; - uchar *dst0= dst; - MY_UNICASE_INFO *uni_plane= (cs->state & MY_CS_BINSORT) ? - NULL : cs->caseinfo; - - DBUG_ASSERT(src || !se); - - for (; dst < de && *nweights; (*nweights)--) - { - if ((res= cs->cset->mb_wc(cs, &wc, src, se)) <= 0) - break; - src+= res; - - if (uni_plane) - my_tosort_unicode(uni_plane, &wc, cs->state); - - *dst++= (uchar) (wc >> 8); - if (dst < de) - *dst++= (uchar) (wc & 0xFF); - } - return dst - dst0; -} - - -/* - Store sorting weights using 2 bytes per character. 
- - This function is shared between - - utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin - which support BMP only (U+0000..U+FFFF). - - utf8mb4_general_ci, utf16_general_ci, utf32_general_ci, - which map all supplementary characters to weight 0xFFFD. -*/ -size_t -my_strnxfrm_unicode(CHARSET_INFO *cs, - uchar *dst, size_t dstlen, uint nweights, - const uchar *src, size_t srclen, uint flags) -{ - uchar *dst0= dst; - uchar *de= dst + dstlen; - dst+= my_strnxfrm_unicode_internal(cs, dst, de, &nweights, - src, src + srclen); - DBUG_ASSERT(dst <= de); /* Safety */ - - if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) - dst+= my_strxfrm_pad_nweights_unicode(dst, de, nweights); - - my_strxfrm_desc_and_reverse(dst0, dst, flags, 0); - - if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) - dst+= my_strxfrm_pad_unicode(dst, de); - return dst - dst0; -} - - -size_t -my_strnxfrm_unicode_nopad(CHARSET_INFO *cs, - uchar *dst, size_t dstlen, uint nweights, - const uchar *src, size_t srclen, uint flags) -{ - uchar *dst0= dst; - uchar *de= dst + dstlen; - dst+= my_strnxfrm_unicode_internal(cs, dst, de, &nweights, - src, src + srclen); - DBUG_ASSERT(dst <= de); /* Safety */ - - if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) - { - size_t len= de - dst; - set_if_smaller(len, nweights * 2); - memset(dst, 0x00, len); - dst+= len; - } - - my_strxfrm_desc_and_reverse(dst0, dst, flags, 0); - - if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) - { - memset(dst, 0x00, de - dst); - dst= de; - } - return dst - dst0; -} - /* For BMP-only collations that use 2 bytes per weight. 
*/ @@ -4977,42 +4821,7 @@ static const uchar to_upper_utf8[] = { static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), my_wc_t * pwc, const uchar *s, const uchar *e) { - uchar c; - - if (s >= e) - return MY_CS_TOOSMALL; - - c= s[0]; - if (c < 0x80) - { - *pwc = c; - return 1; - } - else if (c < 0xc2) - return MY_CS_ILSEQ; - else if (c < 0xe0) - { - if (s+2 > e) /* We need 2 characters */ - return MY_CS_TOOSMALL2; - - if (!(IS_CONTINUATION_BYTE(s[1]))) - return MY_CS_ILSEQ; - - *pwc= UTF8MB2_CODE(c, s[1]); - return 2; - } - else if (c < 0xf0) - { - if (s+3 > e) /* We need 3 characters */ - return MY_CS_TOOSMALL3; - - if (!IS_UTF8MB3_STEP2(c, s[1], s[2])) - return MY_CS_ILSEQ; - - *pwc= UTF8MB3_CODE(c, s[1], s[2]); - return 3; - } - return MY_CS_ILSEQ; + return my_mb_wc_utf8mb3_quick(pwc, s, e); } @@ -5308,7 +5117,7 @@ int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t) It represents a single byte character. Convert it into weight according to collation. */ - s_wc= plane00[(uchar) s[0]].tolower; + s_wc= my_unicase_default_page00[(uchar) s[0]].tolower; s++; } else @@ -5350,7 +5159,7 @@ int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t) if ((uchar) t[0] < 128) { /* Convert single byte character into weight */ - t_wc= plane00[(uchar) t[0]].tolower; + t_wc= my_unicase_default_page00[(uchar) t[0]].tolower; t++; } else @@ -5413,14 +5222,14 @@ int my_charlen_utf8(CHARSET_INFO *cs __attribute__((unused)), static inline int my_weight_mb1_utf8_general_ci(uchar b) { - return (int) plane00[b & 0xFF].sort; + return (int) my_unicase_default_page00[b & 0xFF].sort; } static inline int my_weight_mb2_utf8_general_ci(uchar b0, uchar b1) { my_wc_t wc= UTF8MB2_CODE(b0, b1); - MY_UNICASE_CHARACTER *page= my_unicase_pages_default[wc >> 8]; + MY_UNICASE_CHARACTER *page= my_unicase_default_pages[wc >> 8]; return (int) (page ? 
page[wc & 0xFF].sort : wc); } @@ -5428,16 +5237,23 @@ static inline int my_weight_mb2_utf8_general_ci(uchar b0, uchar b1) static inline int my_weight_mb3_utf8_general_ci(uchar b0, uchar b1, uchar b2) { my_wc_t wc= UTF8MB3_CODE(b0, b1, b2); - MY_UNICASE_CHARACTER *page= my_unicase_pages_default[wc >> 8]; + MY_UNICASE_CHARACTER *page= my_unicase_default_pages[wc >> 8]; return (int) (page ? page[wc & 0xFF].sort : wc); } -#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_general_ci -#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) -#define WEIGHT_MB1(x) my_weight_mb1_utf8_general_ci(x) -#define WEIGHT_MB2(x,y) my_weight_mb2_utf8_general_ci(x,y) -#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8_general_ci(x,y,z) +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_general_ci +#define DEFINE_STRNXFRM_UNICODE +#define DEFINE_STRNXFRM_UNICODE_NOPAD +#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_utf8mb3_quick(pwc, s, e) +#define OPTIMIZE_ASCII 1 +#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR +#define UNICASE_PAGE0 my_unicase_default_page00 +#define UNICASE_PAGES my_unicase_default_pages +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB1(x) my_weight_mb1_utf8_general_ci(x) +#define WEIGHT_MB2(x,y) my_weight_mb2_utf8_general_ci(x,y) +#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8_general_ci(x,y,z) #include "strcoll.ic" @@ -5473,19 +5289,28 @@ my_weight_mb3_utf8_general_mysql500_ci(uchar b0, uchar b1, uchar b2) } -#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_general_mysql500_ci -#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) -#define WEIGHT_MB1(x) my_weight_mb1_utf8_general_mysql500_ci(x) -#define WEIGHT_MB2(x,y) my_weight_mb2_utf8_general_mysql500_ci(x,y) -#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8_general_mysql500_ci(x,y,z) +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_general_mysql500_ci +#define DEFINE_STRNXFRM_UNICODE +#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_utf8mb3_quick(pwc, s, e) +#define OPTIMIZE_ASCII 1 +#define UNICASE_MAXCHAR 
MY_UNICASE_INFO_DEFAULT_MAXCHAR +#define UNICASE_PAGE0 plane00_mysql500 +#define UNICASE_PAGES my_unicase_pages_mysql500 +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB1(x) my_weight_mb1_utf8_general_mysql500_ci(x) +#define WEIGHT_MB2(x,y) my_weight_mb2_utf8_general_mysql500_ci(x,y) +#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8_general_mysql500_ci(x,y,z) #include "strcoll.ic" -#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_bin -#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) -#define WEIGHT_MB1(x) ((int) (uchar) (x)) -#define WEIGHT_MB2(x,y) ((int) UTF8MB2_CODE(x,y)) -#define WEIGHT_MB3(x,y,z) ((int) UTF8MB3_CODE(x,y,z)) +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_bin +#define DEFINE_STRNXFRM_UNICODE_BIN2 +#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_utf8mb3_quick(pwc, s, e) +#define OPTIMIZE_ASCII 1 +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB1(x) ((int) (uchar) (x)) +#define WEIGHT_MB2(x,y) ((int) UTF8MB2_CODE(x,y)) +#define WEIGHT_MB3(x,y,z) ((int) UTF8MB3_CODE(x,y,z)) #include "strcoll.ic" @@ -5534,7 +5359,7 @@ static MY_COLLATION_HANDLER my_collation_utf8_general_ci_handler = NULL, /* init */ my_strnncoll_utf8_general_ci, my_strnncollsp_utf8_general_ci, - my_strnxfrm_unicode, + my_strnxfrm_utf8_general_ci, my_strnxfrmlen_unicode, my_like_range_mb, my_wildcmp_utf8, @@ -5550,7 +5375,7 @@ static MY_COLLATION_HANDLER my_collation_utf8_general_mysql500_ci_handler = NULL, /* init */ my_strnncoll_utf8_general_mysql500_ci, my_strnncollsp_utf8_general_mysql500_ci, - my_strnxfrm_unicode, + my_strnxfrm_utf8_general_mysql500_ci, my_strnxfrmlen_unicode, my_like_range_mb, my_wildcmp_utf8, @@ -5566,7 +5391,7 @@ static MY_COLLATION_HANDLER my_collation_utf8_bin_handler = NULL, /* init */ my_strnncoll_utf8_bin, my_strnncollsp_utf8_bin, - my_strnxfrm_unicode, + my_strnxfrm_utf8_bin, my_strnxfrmlen_unicode, my_like_range_mb, my_wildcmp_mb_bin, @@ -5582,7 +5407,7 @@ static MY_COLLATION_HANDLER my_collation_utf8_general_nopad_ci_handler 
= NULL, /* init */ my_strnncoll_utf8_general_ci, my_strnncollsp_utf8_general_nopad_ci, - my_strnxfrm_unicode_nopad, + my_strnxfrm_nopad_utf8_general_ci, my_strnxfrmlen_unicode, my_like_range_mb, my_wildcmp_utf8, @@ -5598,7 +5423,7 @@ static MY_COLLATION_HANDLER my_collation_utf8_nopad_bin_handler = NULL, /* init */ my_strnncoll_utf8_bin, my_strnncollsp_utf8_nopad_bin, - my_strnxfrm_unicode_nopad, + my_strnxfrm_nopad_utf8_bin, my_strnxfrmlen_unicode, my_like_range_mb, my_wildcmp_mb_bin, @@ -5927,7 +5752,7 @@ static MY_COLLATION_HANDLER my_collation_cs_handler = NULL, /* init */ my_strnncoll_utf8_cs, my_strnncollsp_utf8_cs, - my_strnxfrm_unicode, + my_strnxfrm_utf8_general_ci, my_strnxfrmlen_unicode, my_like_range_simple, my_wildcmp_mb, @@ -7212,13 +7037,30 @@ my_charlen_filename(CHARSET_INFO *cs, const uchar *str, const uchar *end) #undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN /* my_well_formed_char_length_filename */ +#define MY_FUNCTION_NAME(x) my_ ## x ## _filename +#define DEFINE_STRNNCOLL 0 +#define DEFINE_STRNXFRM_UNICODE +#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_filename(cs, pwc, s, e) +#define OPTIMIZE_ASCII 0 +#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR +#define UNICASE_PAGE0 my_unicase_default_page00 +#define UNICASE_PAGES my_unicase_default_pages + +/* +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB1(x) my_weight_mb1_utf8_general_ci(x) +#define WEIGHT_MB2(x,y) my_weight_mb2_utf8_general_ci(x,y) +#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8_general_ci(x,y,z) +*/ +#include "strcoll.ic" + static MY_COLLATION_HANDLER my_collation_filename_handler = { NULL, /* init */ my_strnncoll_simple, my_strnncollsp_simple, - my_strnxfrm_unicode, + my_strnxfrm_filename, my_strnxfrmlen_unicode, my_like_range_mb, my_wildcmp_utf8, @@ -7375,52 +7217,7 @@ static int my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), my_wc_t * pwc, const uchar *s, const uchar *e) { - uchar c; - - if (s >= e) - return MY_CS_TOOSMALL; - - c= s[0]; - 
if (c < 0x80) - { - *pwc= c; - return 1; - } - else if (c < 0xc2) - return MY_CS_ILSEQ; - else if (c < 0xe0) - { - if (s + 2 > e) /* We need 2 characters */ - return MY_CS_TOOSMALL2; - - if (!(IS_CONTINUATION_BYTE(s[1]))) - return MY_CS_ILSEQ; - - *pwc= UTF8MB2_CODE(c, s[1]); - return 2; - } - else if (c < 0xf0) - { - if (s + 3 > e) /* We need 3 characters */ - return MY_CS_TOOSMALL3; - - if (!IS_UTF8MB3_STEP2(c, s[1], s[2])) - return MY_CS_ILSEQ; - - *pwc= UTF8MB3_CODE(c, s[1], s[2]); - return 3; - } - else if (c < 0xf5) - { - if (s + 4 > e) /* We need 4 characters */ - return MY_CS_TOOSMALL4; - - if (!IS_UTF8MB4_STEP2(c, s[1], s[2], s[3])) - return MY_CS_ILSEQ; - *pwc= UTF8MB4_CODE(c, s[1], s[2], s[3]); - return 4; - } - return MY_CS_ILSEQ; + return my_mb_wc_utf8mb4_quick(pwc, s, e); } @@ -7752,7 +7549,7 @@ my_strcasecmp_utf8mb4(CHARSET_INFO *cs, const char *s, const char *t) It represents a single byte character. Convert it into weight according to collation. */ - s_wc= plane00[(uchar) s[0]].tolower; + s_wc= my_unicase_default_page00[(uchar) s[0]].tolower; s++; } else @@ -7776,7 +7573,7 @@ my_strcasecmp_utf8mb4(CHARSET_INFO *cs, const char *s, const char *t) if ((uchar) t[0] < 128) { /* Convert single byte character into weight */ - t_wc= plane00[(uchar) t[0]].tolower; + t_wc= my_unicase_default_page00[(uchar) t[0]].tolower; t++; } else @@ -7847,6 +7644,13 @@ my_charlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), #define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4_general_ci +#define DEFINE_STRNXFRM_UNICODE +#define DEFINE_STRNXFRM_UNICODE_NOPAD +#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_utf8mb4_quick(pwc, s, e) +#define OPTIMIZE_ASCII 1 +#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR +#define UNICASE_PAGE0 my_unicase_default_page00 +#define UNICASE_PAGES my_unicase_default_pages #define IS_MB4_CHAR(b0,b1,b2,b3) IS_UTF8MB4_STEP3(b0,b1,b2,b3) #define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) #define WEIGHT_MB1(b0) my_weight_mb1_utf8_general_ci(b0) @@ 
-7897,7 +7701,7 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler= NULL, /* init */ my_strnncoll_utf8mb4_general_ci, my_strnncollsp_utf8mb4_general_ci, - my_strnxfrm_unicode, + my_strnxfrm_utf8mb4_general_ci, my_strnxfrmlen_unicode, my_like_range_mb, my_wildcmp_utf8mb4, @@ -7929,7 +7733,7 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_general_nopad_ci_handler= NULL, /* init */ my_strnncoll_utf8mb4_general_ci, my_strnncollsp_utf8mb4_general_nopad_ci, - my_strnxfrm_unicode_nopad, + my_strnxfrm_nopad_utf8mb4_general_ci, my_strnxfrmlen_unicode, my_like_range_mb, my_wildcmp_utf8mb4, diff --git a/strings/ctype-utf8.h b/strings/ctype-utf8.h new file mode 100644 index 00000000000..9a44c1658f2 --- /dev/null +++ b/strings/ctype-utf8.h @@ -0,0 +1,190 @@ +/* + Copyright (c) 2018 MariaDB Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#ifndef _CTYPE_UTF8_H +#define _CTYPE_UTF8_H + +/* Detect special bytes and sequences */ +#define IS_CONTINUATION_BYTE(c) (((uchar) (c) ^ 0x80) < 0x40) + +/* + Check MB2 character assuming that b0 is already known to be >= 0xC2. 
+ Use this macro if the caller already checked b0 for: + - an MB1 character + - an unused gap between MB1 and MB2HEAD +*/ +#define IS_UTF8MB2_STEP2(b0,b1) (((uchar) (b0) < 0xE0) && \ + IS_CONTINUATION_BYTE((uchar) b1)) + +/* + Check MB3 character assuming that b0 is already known to be + in the valid MB3HEAD range [0xE0..0xEF]. +*/ +#define IS_UTF8MB3_STEP2(b0,b1,b2) (IS_CONTINUATION_BYTE(b1) && \ + IS_CONTINUATION_BYTE(b2) && \ + ((uchar) b0 >= 0xe1 || (uchar) b1 >= 0xa0)) + +/* + Check MB3 character assuming that b0 is already known to be >= 0xE0, + but is not checked for the high end 0xF0 yet. + Use this macro if the caller already checked b0 for: + - an MB1 character + - an unused gap between MB1 and MB2HEAD + - an MB2HEAD +*/ +#define IS_UTF8MB3_STEP3(b0,b1,b2) (((uchar) (b0) < 0xF0) && \ + IS_UTF8MB3_STEP2(b0,b1,b2)) + +/* + UTF-8 quick four-byte mask: + 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + Encoding allows to encode U+00010000..U+001FFFFF + + The maximum character defined in the Unicode standard is U+0010FFFF. + Higher characters U+00110000..U+001FFFFF are not used. 
+ + 11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min) + 11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max) + + Valid codes: + [F0][90..BF][80..BF][80..BF] + [F1][80..BF][80..BF][80..BF] + [F2][80..BF][80..BF][80..BF] + [F3][80..BF][80..BF][80..BF] + [F4][80..8F][80..BF][80..BF] +*/ + +/* + Check MB4 character assuming that b0 is already + known to be in the range [0xF0..0xF4] +*/ +#define IS_UTF8MB4_STEP2(b0,b1,b2,b3) (IS_CONTINUATION_BYTE(b1) && \ + IS_CONTINUATION_BYTE(b2) && \ + IS_CONTINUATION_BYTE(b3) && \ + (b0 >= 0xf1 || b1 >= 0x90) && \ + (b0 <= 0xf3 || b1 <= 0x8F)) +#define IS_UTF8MB4_STEP3(b0,b1,b2,b3) (((uchar) (b0) < 0xF5) && \ + IS_UTF8MB4_STEP2(b0,b1,b2,b3)) + +/* Convert individual bytes to Unicode code points */ +#define UTF8MB2_CODE(b0,b1) (((my_wc_t) ((uchar) b0 & 0x1f) << 6) |\ + ((my_wc_t) ((uchar) b1 ^ 0x80))) +#define UTF8MB3_CODE(b0,b1,b2) (((my_wc_t) ((uchar) b0 & 0x0f) << 12) |\ + ((my_wc_t) ((uchar) b1 ^ 0x80) << 6) |\ + ((my_wc_t) ((uchar) b2 ^ 0x80))) +#define UTF8MB4_CODE(b0,b1,b2,b3) (((my_wc_t) ((uchar) b0 & 0x07) << 18) |\ + ((my_wc_t) ((uchar) b1 ^ 0x80) << 12) |\ + ((my_wc_t) ((uchar) b2 ^ 0x80) << 6) |\ + (my_wc_t) ((uchar) b3 ^ 0x80)) + +static inline int +my_mb_wc_utf8mb3_quick(my_wc_t * pwc, const uchar *s, const uchar *e) +{ + uchar c; + + if (s >= e) + return MY_CS_TOOSMALL; + + c= s[0]; + if (c < 0x80) + { + *pwc = c; + return 1; + } + else if (c < 0xc2) + return MY_CS_ILSEQ; + else if (c < 0xe0) + { + if (s+2 > e) /* We need 2 characters */ + return MY_CS_TOOSMALL2; + + if (!(IS_CONTINUATION_BYTE(s[1]))) + return MY_CS_ILSEQ; + + *pwc= UTF8MB2_CODE(c, s[1]); + return 2; + } + else if (c < 0xf0) + { + if (s+3 > e) /* We need 3 characters */ + return MY_CS_TOOSMALL3; + + if (!IS_UTF8MB3_STEP2(c, s[1], s[2])) + return MY_CS_ILSEQ; + + *pwc= UTF8MB3_CODE(c, s[1], s[2]); + return 3; + } + return MY_CS_ILSEQ; +} + + +#ifdef HAVE_CHARSET_utf8mb4 +static inline int 
+my_mb_wc_utf8mb4_quick(my_wc_t *pwc, const uchar *s, const uchar *e) +{ + uchar c; + + if (s >= e) + return MY_CS_TOOSMALL; + + c= s[0]; + if (c < 0x80) + { + *pwc= c; + return 1; + } + else if (c < 0xc2) + return MY_CS_ILSEQ; + else if (c < 0xe0) + { + if (s + 2 > e) /* We need 2 characters */ + return MY_CS_TOOSMALL2; + + if (!(IS_CONTINUATION_BYTE(s[1]))) + return MY_CS_ILSEQ; + + *pwc= UTF8MB2_CODE(c, s[1]); + return 2; + } + else if (c < 0xf0) + { + if (s + 3 > e) /* We need 3 characters */ + return MY_CS_TOOSMALL3; + + if (!IS_UTF8MB3_STEP2(c, s[1], s[2])) + return MY_CS_ILSEQ; + + *pwc= UTF8MB3_CODE(c, s[1], s[2]); + return 3; + } + else if (c < 0xf5) + { + if (s + 4 > e) /* We need 4 characters */ + return MY_CS_TOOSMALL4; + + if (!IS_UTF8MB4_STEP2(c, s[1], s[2], s[3])) + return MY_CS_ILSEQ; + *pwc= UTF8MB4_CODE(c, s[1], s[2], s[3]); + return 4; + } + return MY_CS_ILSEQ; +} +#endif /* HAVE_CHARSET_utf8mb4*/ + + +#endif /* _CTYPE_UTF8_H */ diff --git a/strings/json_lib.c b/strings/json_lib.c index 24c79cb9044..3763ac4ed54 100644 --- a/strings/json_lib.c +++ b/strings/json_lib.c @@ -1845,3 +1845,252 @@ int json_path_compare(const json_path_t *a, const json_path_t *b, return json_path_parts_compare(a->steps+1, a->last_step, b->steps+1, b->last_step, vt); } + + +static enum json_types smart_read_value(json_engine_t *je, + const char **value, int *value_len) +{ + if (json_read_value(je)) + goto err_return; + + *value= (char *) je->value; + + if (json_value_scalar(je)) + *value_len= je->value_len; + else + { + if (json_skip_level(je)) + goto err_return; + + *value_len= (int) ((char *) je->s.c_str - *value); + } + + return je->value_type; + +err_return: + return JSV_BAD_JSON; +} + + +enum json_types json_type(const char *js, const char *js_end, + const char **value, int *value_len) +{ + json_engine_t je; + + json_scan_start(&je, &my_charset_utf8mb4_bin,(const uchar *) js, + (const uchar *) js_end); + + return smart_read_value(&je, value, value_len); +} + + +enum 
json_types json_get_array_item(const char *js, const char *js_end, + int n_item, + const char **value, int *value_len) +{ + json_engine_t je; + int c_item= 0; + + json_scan_start(&je, &my_charset_utf8mb4_bin,(const uchar *) js, + (const uchar *) js_end); + + if (json_read_value(&je) || + je.value_type != JSON_VALUE_ARRAY) + goto err_return; + + while (!json_scan_next(&je)) + { + switch (je.state) + { + case JST_VALUE: + if (c_item == n_item) + return smart_read_value(&je, value, value_len); + + if (json_skip_key(&je)) + goto err_return; + + c_item++; + break; + + case JST_ARRAY_END: + *value= (const char *) (je.s.c_str - je.sav_c_len); + *value_len= c_item; + return JSV_NOTHING; + } + } + +err_return: + return JSV_BAD_JSON; +} + + +/** Simple json lookup for a value by the key. + + Expects JSON object. + Only scans the 'first level' of the object, not + the nested structures. + + @param js [in] json object to search in + @param js_end [in] end of json string + @param key [in] key to search for + @param key_end [in] - " - + @param value_start [out] pointer into js (value or closing }) + @param value_len [out] length of the value found or number of keys + + @retval the type of the key value + @retval JSV_BAD_JSON - syntax error found reading JSON. + or not JSON object. + @retval JSV_NOTHING - no such key found. 
+*/ +enum json_types json_get_object_key(const char *js, const char *js_end, + const char *key, + const char **value, int *value_len) +{ + const char *key_end= key + strlen(key); + json_engine_t je; + json_string_t key_name; + int n_keys= 0; + + json_string_set_cs(&key_name, &my_charset_utf8mb4_bin); + + json_scan_start(&je, &my_charset_utf8mb4_bin,(const uchar *) js, + (const uchar *) js_end); + + if (json_read_value(&je) || + je.value_type != JSON_VALUE_OBJECT) + goto err_return; + + while (!json_scan_next(&je)) + { + switch (je.state) + { + case JST_KEY: + n_keys++; + json_string_set_str(&key_name, (const uchar *) key, + (const uchar *) key_end); + if (json_key_matches(&je, &key_name)) + return smart_read_value(&je, value, value_len); + + if (json_skip_key(&je)) + goto err_return; + + break; + + case JST_OBJ_END: + *value= (const char *) (je.s.c_str - je.sav_c_len); + *value_len= n_keys; + return JSV_NOTHING; + } + } + +err_return: + return JSV_BAD_JSON; +} + + +enum json_types json_get_object_nkey(const char *js,const char *js_end, int nkey, + const char **keyname, const char **keyname_end, + const char **value, int *value_len) +{ + return JSV_NOTHING; +} + + +/** Check if json is valid (well-formed) + + @retval 1 - success, json is well-formed + @retval 0 - error, json is invalid +*/ +int json_valid(const char *js, size_t js_len, CHARSET_INFO *cs) +{ + json_engine_t je; + json_scan_start(&je, cs, (const uchar *) js, (const uchar *) js + js_len); + while (json_scan_next(&je) == 0) /* no-op */ ; + return je.s.error == 0; +} + + +/* + Expects the JSON object as an js argument, and the key name. + Looks for this key in the object and returns + the location of all the text related to it. + The text includes the comma, separating this key. + + comma_pos - the hint where the comma is. It is important + if you plan to replace the key rather than just cut. + 1 - comma is on the left + 2 - comma is on the right. 
+ 0 - no comma at all (the object has just this single key) + + if no such key found *key_start is set to NULL. +*/ +int json_locate_key(const char *js, const char *js_end, + const char *kname, + const char **key_start, const char **key_end, + int *comma_pos) +{ + const char *kname_end= kname + strlen(kname); + json_engine_t je; + json_string_t key_name; + int t_next, c_len, match_result; + + json_string_set_cs(&key_name, &my_charset_utf8mb4_bin); + + json_scan_start(&je, &my_charset_utf8mb4_bin,(const uchar *) js, + (const uchar *) js_end); + + if (json_read_value(&je) || + je.value_type != JSON_VALUE_OBJECT) + goto err_return; + + *key_start= (const char *) je.s.c_str; + *comma_pos= 0; + + while (!json_scan_next(&je)) + { + switch (je.state) + { + case JST_KEY: + json_string_set_str(&key_name, (const uchar *) kname, + (const uchar *) kname_end); + match_result= json_key_matches(&je, &key_name); + if (json_skip_key(&je)) + goto err_return; + get_first_nonspace(&je.s, &t_next, &c_len); + je.s.c_str-= c_len; + + if (match_result) + { + *key_end= (const char *) je.s.c_str; + + if (*comma_pos == 1) + return 0; + + DBUG_ASSERT(*comma_pos == 0); + + if (t_next == C_COMMA) + { + *key_end+= c_len; + *comma_pos= 2; + } + else if (t_next == C_RCURB) + *comma_pos= 0; + else + goto err_return; + return 0; + } + + *key_start= (const char *) je.s.c_str; + *comma_pos= 1; + break; + + case JST_OBJ_END: + *key_start= NULL; + return 0; + } + } + +err_return: + return 1; + +} diff --git a/strings/strcoll.ic b/strings/strcoll.ic index c647a5ef57e..9dfccb9018c 100644 --- a/strings/strcoll.ic +++ b/strings/strcoll.ic @@ -15,11 +15,18 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ - #ifndef MY_FUNCTION_NAME #error MY_FUNCTION_NAME is not defined #endif +/* + Define strnncoll() and strnncollsp() by default, + unless "#define DEFINE_STRNNCOLL 0" is specified. 
+*/ +#ifndef DEFINE_STRNNCOLL +#define DEFINE_STRNNCOLL 1 +#endif + /* The weight for automatically padded spaces when comparing strings with @@ -54,6 +61,8 @@ #endif +#if DEFINE_STRNNCOLL + /** Scan a valid character, or a bad byte, or an auto-padded space from a string and calculate the weight of the scanned sequence. @@ -278,6 +287,8 @@ MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)), } #endif +#endif /* DEFINE_STRNNCOLL */ + #ifdef DEFINE_STRNXFRM #ifndef WEIGHT_MB2_FRM @@ -322,11 +333,261 @@ MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs, #endif /* DEFINE_STRNXFRM */ +#if defined(DEFINE_STRNXFRM_UNICODE) || defined(DEFINE_STRNXFRM_UNICODE_NOPAD) + +/* + Store sorting weights using 2 bytes per character. + + This function is shared between + - utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin + which support BMP only (U+0000..U+FFFF). + - utf8mb4_general_ci, utf16_general_ci, utf32_general_ci, + which map all supplementary characters to weight 0xFFFD. +*/ + +#ifndef MY_MB_WC +#error MY_MB_WC must be defined for DEFINE_STRNXFRM_UNICODE +#endif + +#ifndef OPTIMIZE_ASCII +#error OPTIMIZE_ASCII must be defined for DEFINE_STRNXFRM_UNICODE +#endif + +#ifndef UNICASE_MAXCHAR +#error UNICASE_MAXCHAR must be defined for DEFINE_STRNXFRM_UNICODE +#endif + +#ifndef UNICASE_PAGE0 +#error UNICASE_PAGE0 must be defined for DEFINE_STRNXFRM_UNICODE +#endif + +#ifndef UNICASE_PAGES +#error UNICASE_PAGES must be defined for DEFINE_STRNXFRM_UNICODE +#endif + + +static size_t +MY_FUNCTION_NAME(strnxfrm_internal)(CHARSET_INFO *cs, + uchar *dst, uchar *de, + uint *nweights, + const uchar *src, const uchar *se) +{ + my_wc_t UNINIT_VAR(wc); + uchar *dst0= dst; + + DBUG_ASSERT(src || !se); + DBUG_ASSERT((cs->state & MY_CS_LOWER_SORT) == 0); + DBUG_ASSERT(0x7F <= UNICASE_MAXCHAR); + + for (; dst < de && *nweights; (*nweights)--) + { + int res; +#if OPTIMIZE_ASCII + if (src >= se) + break; + if (src[0] <= 0x7F) + { + wc= UNICASE_PAGE0[*src++].sort; + 
PUT_WC_BE2_HAVE_1BYTE(dst, de, wc); + continue; + } +#endif + if ((res= MY_MB_WC(cs, &wc, src, se)) <= 0) + break; + src+= res; + if (wc <= UNICASE_MAXCHAR) + { + MY_UNICASE_CHARACTER *page; + if ((page= UNICASE_PAGES[wc >> 8])) + wc= page[wc & 0xFF].sort; + } + else + wc= MY_CS_REPLACEMENT_CHARACTER; + PUT_WC_BE2_HAVE_1BYTE(dst, de, wc); + } + return dst - dst0; +} + + +static size_t +MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs, + uchar *dst, size_t dstlen, uint nweights, + const uchar *src, size_t srclen, uint flags) +{ + uchar *dst0= dst; + uchar *de= dst + dstlen; + dst+= MY_FUNCTION_NAME(strnxfrm_internal)(cs, dst, de, &nweights, + src, src + srclen); + DBUG_ASSERT(dst <= de); /* Safety */ + + if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) + dst+= my_strxfrm_pad_nweights_unicode(dst, de, nweights); + + my_strxfrm_desc_and_reverse(dst0, dst, flags, 0); + + if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) + dst+= my_strxfrm_pad_unicode(dst, de); + return dst - dst0; +} + + +#ifdef DEFINE_STRNXFRM_UNICODE_NOPAD +static size_t +MY_FUNCTION_NAME(strnxfrm_nopad)(CHARSET_INFO *cs, + uchar *dst, size_t dstlen, + uint nweights, + const uchar *src, size_t srclen, uint flags) +{ + uchar *dst0= dst; + uchar *de= dst + dstlen; + dst+= MY_FUNCTION_NAME(strnxfrm_internal)(cs, dst, de, &nweights, + src, src + srclen); + DBUG_ASSERT(dst <= de); /* Safety */ + + if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) + { + size_t len= de - dst; + set_if_smaller(len, nweights * 2); + memset(dst, 0x00, len); + dst+= len; + } + + my_strxfrm_desc_and_reverse(dst0, dst, flags, 0); + + if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) + { + memset(dst, 0x00, de - dst); + dst= de; + } + return dst - dst0; +} +#endif + +#endif /* DEFINE_STRNXFRM_UNICODE || DEFINE_STRNXFRM_UNICODE_NOPAD */ + + + +#ifdef DEFINE_STRNXFRM_UNICODE_BIN2 + +/* + Store sorting weights using 2 bytes per character. 
+ + These functions are shared between + - utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin + which support BMP only (U+0000..U+FFFF). + - utf8mb4_general_ci, utf16_general_ci, utf32_general_ci, + which map all supplementary characters to weight 0xFFFD. +*/ + +#ifndef MY_MB_WC +#error MY_MB_WC must be defined for DEFINE_STRNXFRM_UNICODE_BIN2 +#endif + +#ifndef OPTIMIZE_ASCII +#error OPTIMIZE_ASCII must be defined for DEFINE_STRNXFRM_UNICODE_BIN2 +#endif + + +static size_t +MY_FUNCTION_NAME(strnxfrm_internal)(CHARSET_INFO *cs, + uchar *dst, uchar *de, + uint *nweights, + const uchar *src, + const uchar *se) +{ + my_wc_t UNINIT_VAR(wc); + uchar *dst0= dst; + + DBUG_ASSERT(src || !se); + + for (; dst < de && *nweights; (*nweights)--) + { + int res; +#if OPTIMIZE_ASCII + if (src >= se) + break; + if (src[0] <= 0x7F) + { + wc= *src++; + PUT_WC_BE2_HAVE_1BYTE(dst, de, wc); + continue; + } +#endif + if ((res= MY_MB_WC(cs, &wc, src, se)) <= 0) + break; + src+= res; + if (wc > 0xFFFF) + wc= MY_CS_REPLACEMENT_CHARACTER; + PUT_WC_BE2_HAVE_1BYTE(dst, de, wc); + } + return dst - dst0; +} + + +static size_t +MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs, + uchar *dst, size_t dstlen, uint nweights, + const uchar *src, size_t srclen, uint flags) +{ + uchar *dst0= dst; + uchar *de= dst + dstlen; + dst+= MY_FUNCTION_NAME(strnxfrm_internal)(cs, dst, de, &nweights, + src, src + srclen); + DBUG_ASSERT(dst <= de); /* Safety */ + + if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) + dst+= my_strxfrm_pad_nweights_unicode(dst, de, nweights); + + my_strxfrm_desc_and_reverse(dst0, dst, flags, 0); + + if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) + dst+= my_strxfrm_pad_unicode(dst, de); + return dst - dst0; +} + + +static size_t +MY_FUNCTION_NAME(strnxfrm_nopad)(CHARSET_INFO *cs, + uchar *dst, size_t dstlen, uint nweights, + const uchar *src, size_t srclen, uint flags) +{ + uchar *dst0= dst; + uchar *de= dst + dstlen; + dst+= MY_FUNCTION_NAME(strnxfrm_internal)(cs, 
dst, de, &nweights, + src, src + srclen); + DBUG_ASSERT(dst <= de); /* Safety */ + + if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) + { + size_t len= de - dst; + set_if_smaller(len, nweights * 2); + memset(dst, 0x00, len); + dst+= len; + } + + my_strxfrm_desc_and_reverse(dst0, dst, flags, 0); + + if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) + { + memset(dst, 0x00, de - dst); + dst= de; + } + return dst - dst0; +} + +#endif /* DEFINE_STRNXFRM_UNICODE_BIN2 */ + + /* We usually include this file at least two times from the same source file, for the _ci and the _bin collations. Prepare for the second inclusion. */ #undef MY_FUNCTION_NAME +#undef MY_MB_WC +#undef OPTIMIZE_ASCII +#undef UNICASE_MAXCHAR +#undef UNICASE_PAGE0 +#undef UNICASE_PAGES #undef WEIGHT_ILSEQ #undef WEIGHT_MB1 #undef WEIGHT_MB2 @@ -335,4 +596,8 @@ MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs, #undef WEIGHT_PAD_SPACE #undef WEIGHT_MB2_FRM #undef DEFINE_STRNXFRM +#undef DEFINE_STRNXFRM_UNICODE +#undef DEFINE_STRNXFRM_UNICODE_NOPAD +#undef DEFINE_STRNXFRM_UNICODE_BIN2 +#undef DEFINE_STRNNCOLL #undef DEFINE_STRNNCOLLSP_NOPAD |