summary refs log tree commit diff
path: root/strings
diff options
context:
space:
mode:
author Alexander Barkov <bar@mariadb.com> 2018-10-16 19:10:57 +0400
committer Alexander Barkov <bar@mariadb.com> 2018-10-18 07:49:58 +0400
commit 475c6ec551fa8847f8993e7cb2d3ff1119f29f5a (patch)
tree 94cd3cd9fa24d54c69601fc27e35abd746fe9d95 /strings
parent d88c136b9fb409cbf6421635a6175329e7182cd7 (diff)
download mariadb-git-475c6ec551fa8847f8993e7cb2d3ff1119f29f5a.tar.gz
MDEV-17474 Change Unicode collation implementation from "handler" to "inline" style (part#2)
Additional changes: 1. Adding a fast path for ASCII characters 2. Adding dedicated MY_COLLATION_HANDLERs for collations with no contractions (for utf8 and for utf8mb4 character sets). The choice between the full-featured handler and the "no contraction" handler is made at the collation initialization time.
Diffstat (limited to 'strings')
-rw-r--r-- strings/ctype-uca.c 183
-rw-r--r-- strings/ctype-uca.ic 88
2 files changed, 239 insertions, 32 deletions
diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c
index 4c670861a9f..320dc1ee225 100644
--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
@@ -31410,6 +31410,28 @@ my_uca_can_be_previous_context_tail(const MY_CONTRACTIONS *list, my_wc_t wc)
/**
+ Check if a character needs previous/next context handling:
+ - can be a previous context tail
+ - can be a contraction start
+
+ @param level Pointer to UCA weight level data
+ @param wc Code point
+
+ @return
+ @retval FALSE - does not need context handling
+ @retval TRUE - needs context handling
+*/
+
+static inline my_bool
+my_uca_needs_context_handling(const MY_UCA_WEIGHT_LEVEL *level, my_wc_t wc)
+{
+ return level->contractions.nitems > 0 && /* no contractions: always FALSE */
+ level->contractions.flags[wc & MY_UCA_CNT_FLAG_MASK] & /* flags slot of the masked code point */
+ (MY_UCA_PREVIOUS_CONTEXT_TAIL | MY_UCA_CNT_HEAD); /* previous context tail or contraction head */
+}
+
+
+/**
Compare two wide character strings, wide analog to strncmp().
@param a Pointer to the first string
@@ -31543,6 +31565,60 @@ my_uca_previous_context_find(my_uca_scanner *scanner,
return NULL;
}
+
+/*
+ Find a context dependent weight of a character.
+ @param scanner - UCA weight scanner. The caller should set
+ its members "page" and "code" to the previous character
+ (or to zeros if there is no previous character).
+ @param wc - an array of wide characters which has at least
+ MY_UCA_MAX_CONTRACTION elements, where wc[0] is set
+ to the current character (whose weight is being resolved).
+ The values of wc[i>0] are not important, but if wc[0]
+ appears to be a known contraction head, the function
+ will collect further contraction parts into wc[i>0].
+ If wc[0] and the previous character make a previous context
+ pair, then wc[1] is set to the previous character.
+
+ @retval NULL if no contextual weight was found for wc[0]
+ @retval non null pointer to a zero-terminated weight string otherwise
+*/
+static inline uint16 *
+my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t *wc)
+{
+ uint16 *cweight;
+ DBUG_ASSERT(scanner->level->contractions.nitems); /* precondition: the level has contractions */
+ /*
+ If we have scanned a character which can have previous context,
+ and there were some more characters already before,
+ then reconstruct codepoint of the previous character
+ from "page" and "code" into wc[1], and verify that {wc[1], wc[0]}
+ together form a real previous context pair.
+ Note, we support only 2-character long sequences with previous
+ context at the moment. CLDR does not have longer sequences.
+ */
+ if (my_uca_can_be_previous_context_tail(&scanner->level->contractions,
+ wc[0]) &&
+ scanner->wbeg != nochar && /* if not the very first character */
+ my_uca_can_be_previous_context_head(&scanner->level->contractions,
+ (wc[1]= ((scanner->page << 8) +
+ scanner->code))) &&
+ (cweight= my_uca_previous_context_find(scanner, wc[1], wc[0])))
+ {
+ scanner->page= scanner->code= 0; /* Clear for the next character */
+ return cweight;
+ }
+ else if (my_uca_can_be_contraction_head(&scanner->level->contractions,
+ wc[0]))
+ {
+ /* Check if wc[0] starts a contraction */
+ if ((cweight= my_uca_scanner_contraction_find(scanner, wc)))
+ return cweight;
+ }
+ return NULL; /* neither a previous context pair nor a contraction was found */
+}
+
+
/****************************************************************/
/**
@@ -31935,6 +32011,23 @@ int my_wildcmp_uca(CHARSET_INFO *cs,
/*
+ Tests if an optimized "no contraction" handler can be used for
+ the given collation: TRUE only if no level checked below reports contractions.
+*/
+static my_bool
+my_uca_collation_can_optimize_no_contractions(CHARSET_INFO *cs)
+{
+ uint i;
+ for (i= 0; i < cs->levels_for_order ; i++) /* check the first levels_for_order levels */
+ {
+ if (my_uca_have_contractions_quick(&cs->uca->level[i]))
+ return FALSE; /* one level with contractions is enough to refuse */
+ }
+ return TRUE;
+}
+
+
+/*
Collation language is implemented according to
subset of ICU Collation Customization (tailorings):
http://icu.sourceforge.net/userguide/Collate_Customization.html
@@ -33645,6 +33738,31 @@ static size_t my_strnxfrmlen_any_uca_multilevel(CHARSET_INFO *cs, size_t len)
/*
+ This structure is used at the collation initialization time, to switch
+ from a full-featured collation handler to a "no contraction" collation
+ handler if the collation is known not to have any contractions.
+*/
+typedef struct
+{
+ MY_COLLATION_HANDLER *pad; /* the ..._collation_handler variant */
+ MY_COLLATION_HANDLER *nopad; /* the ..._collation_handler_nopad variant */
+ MY_COLLATION_HANDLER *multilevel_pad; /* the ..._collation_handler_multilevel variant */
+ MY_COLLATION_HANDLER *multilevel_nopad; /* the ..._collation_handler_nopad_multilevel variant */
+} MY_COLLATION_HANDLER_PACKAGE;
+
+
+static void my_uca_handler_map(struct charset_info_st *cs, /* collation whose cs->coll may be replaced */
+ const MY_COLLATION_HANDLER_PACKAGE *from, /* handlers to look cs->coll up in */
+ const MY_COLLATION_HANDLER_PACKAGE *to) /* handlers to switch to */
+{
+ if (cs->coll == from->pad) cs->coll= to->pad; /* swap for the same-named member of "to" */
+ else if (cs->coll == from->nopad) cs->coll= to->nopad;
+ else if (cs->coll == from->multilevel_pad) cs->coll= to->multilevel_pad;
+ else if (cs->coll == from->multilevel_nopad) cs->coll= to->multilevel_nopad;
+} /* cs->coll is left unchanged if it matches no member of "from" */
+
+
+/*
Define generic collation handlers for multi-level collations with tailoring:
my_uca_collation_handler_nopad_multilevel_generic
@@ -33656,6 +33774,9 @@ static size_t my_strnxfrmlen_any_uca_multilevel(CHARSET_INFO *cs, size_t len)
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _generic
#define MY_MB_WC(scanner, wc, beg, end) (scanner->cs->cset->mb_wc(scanner->cs, wc, beg, end))
#define MY_LIKE_RANGE my_like_range_generic
+#define MY_UCA_ASCII_OPTIMIZE 0
+#define MY_UCA_COMPILE_CONTRACTIONS 1
+#define MY_UCA_COLL_INIT my_coll_init_uca
#include "ctype-uca.ic"
@@ -33758,6 +33879,9 @@ ex:
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _ucs2
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_ucs2_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_generic
+#define MY_UCA_ASCII_OPTIMIZE 0
+#define MY_UCA_COMPILE_CONTRACTIONS 1
+#define MY_UCA_COLL_INIT my_coll_init_uca
#include "ctype-uca.ic"
@@ -34711,12 +34835,38 @@ struct charset_info_st my_charset_ucs2_unicode_520_nopad_ci=
#ifdef HAVE_CHARSET_utf8
+static my_bool
+my_uca_coll_init_utf8mb3(struct charset_info_st *cs, MY_CHARSET_LOADER *loader);
+
#include "ctype-utf8.h"
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf8mb3
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb3_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_mb
+#define MY_UCA_ASCII_OPTIMIZE 1
+#define MY_UCA_COMPILE_CONTRACTIONS 1
+#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb3
#include "ctype-uca.ic"
+#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _no_contractions_utf8mb3
+#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb3_quick(wc, beg, end))
+#define MY_LIKE_RANGE my_like_range_mb
+#define MY_UCA_ASCII_OPTIMIZE 1
+#define MY_UCA_COMPILE_CONTRACTIONS 0
+#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb3
+#include "ctype-uca.ic"
+
+
+static my_bool /* TRUE if my_coll_init_uca() returned non-zero, FALSE otherwise */
+my_uca_coll_init_utf8mb3(struct charset_info_st *cs, MY_CHARSET_LOADER *loader)
+{
+ if (my_coll_init_uca(cs, loader)) /* common UCA initialization first */
+ return TRUE;
+ if (my_uca_collation_can_optimize_no_contractions(cs)) /* no level has contractions */
+ my_uca_handler_map(cs, &my_uca_package_utf8mb3, /* full-featured utf8mb3 handlers */
+ &my_uca_package_no_contractions_utf8mb3); /* "no contractions" utf8mb3 handlers */
+ return FALSE;
+}
+
/*
We consider bytes with code more than 127 as a letter.
@@ -35690,12 +35840,39 @@ struct charset_info_st my_charset_utf8_unicode_520_nopad_ci=
#ifdef HAVE_CHARSET_utf8mb4
+static my_bool
+my_uca_coll_init_utf8mb4(struct charset_info_st *cs, MY_CHARSET_LOADER *loader);
+
+
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf8mb4
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb4_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_mb
+#define MY_UCA_ASCII_OPTIMIZE 1
+#define MY_UCA_COMPILE_CONTRACTIONS 1
+#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb4
+#include "ctype-uca.ic"
+
+#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _no_contractions_utf8mb4
+#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb4_quick(wc, beg, end))
+#define MY_LIKE_RANGE my_like_range_mb
+#define MY_UCA_ASCII_OPTIMIZE 1
+#define MY_UCA_COMPILE_CONTRACTIONS 0
+#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb4
#include "ctype-uca.ic"
+static my_bool /* TRUE if my_coll_init_uca() returned non-zero, FALSE otherwise */
+my_uca_coll_init_utf8mb4(struct charset_info_st *cs, MY_CHARSET_LOADER *loader)
+{
+ if (my_coll_init_uca(cs, loader)) /* common UCA initialization first */
+ return TRUE;
+ if (my_uca_collation_can_optimize_no_contractions(cs)) /* no level has contractions */
+ my_uca_handler_map(cs, &my_uca_package_utf8mb4, /* full-featured utf8mb4 handlers */
+ &my_uca_package_no_contractions_utf8mb4); /* "no contractions" utf8mb4 handlers */
+ return FALSE;
+}
+
+
extern MY_CHARSET_HANDLER my_charset_utf8mb4_handler;
#define MY_CS_UTF8MB4_UCA_FLAGS (MY_CS_COMMON_UCA_FLAGS|MY_CS_UNICODE_SUPPLEMENT)
@@ -36646,6 +36823,9 @@ struct charset_info_st my_charset_utf8mb4_unicode_520_nopad_ci=
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf32
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf32_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_generic
+#define MY_UCA_ASCII_OPTIMIZE 0
+#define MY_UCA_COMPILE_CONTRACTIONS 1
+#define MY_UCA_COLL_INIT my_coll_init_uca
#include "ctype-uca.ic"
@@ -37601,6 +37781,9 @@ struct charset_info_st my_charset_utf32_unicode_520_nopad_ci=
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf16
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf16_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_generic
+#define MY_UCA_ASCII_OPTIMIZE 0
+#define MY_UCA_COMPILE_CONTRACTIONS 1
+#define MY_UCA_COLL_INIT my_coll_init_uca
#include "ctype-uca.ic"
diff --git a/strings/ctype-uca.ic b/strings/ctype-uca.ic
index 7b2ca3447dd..de002efab1b 100644
--- a/strings/ctype-uca.ic
+++ b/strings/ctype-uca.ic
@@ -25,6 +25,15 @@
#ifndef MY_LIKE_RANGE
#error MY_LIKE_RANGE is not defined
#endif
+#ifndef MY_UCA_ASCII_OPTIMIZE
+#error MY_ASCII_OPTIMIZE is not defined
+#endif
+#ifndef MY_UCA_COMPILE_CONTRACTIONS
+#error MY_UCA_COMPILE_CONTRACTIONS is not defined
+#endif
+#ifndef MY_UCA_COLL_INIT
+#error MY_UCA_COLL_INIT is not defined
+#endif
static inline int
@@ -46,6 +55,32 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
int mblen;
/* Get next character */
+#if MY_UCA_ASCII_OPTIMIZE
+ /* Get next ASCII character */
+ if (scanner->sbeg < scanner->send && scanner->sbeg[0] < 0x80)
+ {
+ wc[0]= scanner->sbeg[0];
+ scanner->sbeg+= 1;
+
+#if MY_UCA_COMPILE_CONTRACTIONS
+ if (my_uca_needs_context_handling(scanner->level, wc[0]))
+ {
+ uint16 *cweight= my_uca_context_weight_find(scanner, wc);
+ if (cweight)
+ return *cweight;
+ }
+#endif
+
+ scanner->page= 0;
+ scanner->code= (int) wc[0];
+ scanner->wbeg= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0];
+ if (scanner->wbeg[0])
+ return *scanner->wbeg++;
+ continue;
+ }
+ else
+#endif
+ /* Get next MB character */
if (((mblen= MY_MB_WC(scanner, wc, scanner->sbeg,
scanner->send)) <= 0))
{
@@ -76,37 +111,14 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
return 0xFFFD;
}
- if (my_uca_have_contractions_quick(scanner->level))
+#if MY_UCA_COMPILE_CONTRACTIONS
+ if (my_uca_needs_context_handling(scanner->level, wc[0]))
{
- uint16 *cweight;
- /*
- If we have scanned a character which can have previous context,
- and there were some more characters already before,
- then reconstruct codepoint of the previous character
- from "page" and "code" into w[1], and verify that {wc[1], wc[0]}
- together form a real previous context pair.
- Note, we support only 2-character long sequences with previous
- context at the moment. CLDR does not have longer sequences.
- */
- if (my_uca_can_be_previous_context_tail(&scanner->level->contractions,
- wc[0]) &&
- scanner->wbeg != nochar && /* if not the very first character */
- my_uca_can_be_previous_context_head(&scanner->level->contractions,
- (wc[1]= ((scanner->page << 8) +
- scanner->code))) &&
- (cweight= my_uca_previous_context_find(scanner, wc[1], wc[0])))
- {
- scanner->page= scanner->code= 0; /* Clear for the next character */
+ uint16 *cweight= my_uca_context_weight_find(scanner, wc);
+ if (cweight)
return *cweight;
- }
- else if (my_uca_can_be_contraction_head(&scanner->level->contractions,
- wc[0]))
- {
- /* Check if w[0] starts a contraction */
- if ((cweight= my_uca_scanner_contraction_find(scanner, wc)))
- return *cweight;
- }
}
+#endif
/* Process single character */
scanner->page= wc[0] >> 8;
@@ -685,7 +697,7 @@ MY_FUNCTION_NAME(strnxfrm_multilevel)(CHARSET_INFO *cs,
*/
MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler)=
{
- my_coll_init_uca,
+ MY_UCA_COLL_INIT,
MY_FUNCTION_NAME(strnncoll),
MY_FUNCTION_NAME(strnncollsp),
MY_FUNCTION_NAME(strnxfrm),
@@ -706,7 +718,7 @@ MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler)=
*/
MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad)=
{
- my_coll_init_uca,
+ MY_UCA_COLL_INIT,
MY_FUNCTION_NAME(strnncoll),
MY_FUNCTION_NAME(strnncollsp_nopad),
MY_FUNCTION_NAME(strnxfrm_nopad),
@@ -725,7 +737,7 @@ MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad)=
*/
MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_multilevel)=
{
- my_coll_init_uca,
+ MY_UCA_COLL_INIT,
MY_FUNCTION_NAME(strnncoll_multilevel),
MY_FUNCTION_NAME(strnncollsp_multilevel),
MY_FUNCTION_NAME(strnxfrm_multilevel),
@@ -744,7 +756,7 @@ MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_multilevel)=
*/
MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad_multilevel)=
{
- my_coll_init_uca,
+ MY_UCA_COLL_INIT,
MY_FUNCTION_NAME(strnncoll_multilevel),
MY_FUNCTION_NAME(strnncollsp_nopad_multilevel),
MY_FUNCTION_NAME(strnxfrm_multilevel),
@@ -758,6 +770,18 @@ MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad_multilevel)=
};
+MY_COLLATION_HANDLER_PACKAGE MY_FUNCTION_NAME(package)= /* the four handlers of this instantiation */
+{
+ &MY_FUNCTION_NAME(collation_handler), /* pad */
+ &MY_FUNCTION_NAME(collation_handler_nopad), /* nopad */
+ &MY_FUNCTION_NAME(collation_handler_multilevel), /* multilevel_pad */
+ &MY_FUNCTION_NAME(collation_handler_nopad_multilevel) /* multilevel_nopad */
+};
+
+
#undef MY_FUNCTION_NAME
#undef MY_MB_WC
#undef MY_LIKE_RANGE
+#undef MY_UCA_ASCII_OPTIMIZE
+#undef MY_UCA_COMPILE_CONTRACTIONS
+#undef MY_UCA_COLL_INIT