diff options
author | Alexander Barkov <bar@mariadb.org> | 2016-11-08 20:57:19 +0400 |
---|---|---|
committer | Alexander Barkov <bar@mariadb.org> | 2016-11-08 20:57:19 +0400 |
commit | 0259b3cbbe47448beb385a33348af56300004aa6 (patch) | |
tree | 56ad0b2aa945a60a9a5ff2934a5931bd7eab1a48 /mysys/charset.c | |
parent | 90c5b2f505855c13e0bdc2e73e11fa83983712bd (diff) | |
download | mariadb-git-0259b3cbbe47448beb385a33348af56300004aa6.tar.gz |
MDEV-11255 LDML: allow defining 2-level UCA collations
Diffstat (limited to 'mysys/charset.c')
-rw-r--r-- | mysys/charset.c | 45 |
1 files changed, 38 insertions, 7 deletions
diff --git a/mysys/charset.c b/mysys/charset.c index 016d0fc3012..8939b6d7a4f 100644 --- a/mysys/charset.c +++ b/mysys/charset.c @@ -258,12 +258,38 @@ static my_bool simple_cs_is_full(CHARSET_INFO *cs) #if defined(HAVE_UCA_COLLATIONS) && (defined(HAVE_CHARSET_ucs2) || defined(HAVE_CHARSET_utf8)) +/** + Initialize a loaded collation. + @param [OUT] to - The new charset_info_st structure to initialize. + @param [IN] from - A template collation, to fill the missing data from. + @param [IN] loaded - The collation data loaded from the LDML file. + some data may be missing in "loaded". +*/ static void -copy_uca_collation(struct charset_info_st *to, CHARSET_INFO *from) +copy_uca_collation(struct charset_info_st *to, CHARSET_INFO *from, + CHARSET_INFO *loaded) { to->cset= from->cset; to->coll= from->coll; - to->strxfrm_multiply= from->strxfrm_multiply; + /* + Single-level UCA collations have strxfrm_multiply=8. + In case of a multi-level UCA collation we use strxfrm_multiply=4. + That means MY_COLLATION_HANDLER::strnxfrmlen() will request the caller + to allocate a buffer of smaller size for each level, for performance purposes, + and to fit longer VARCHARs to @@max_sort_length. + This makes filesort produce non-precise order for some rare Unicode + characters that produce more than 4 weights (long expansions). + UCA requires 2 bytes per weight multiplied by the number of levels. + In case of a 2-level collation, each character requires 4*2=8 bytes. + Therefore, the longest VARCHAR that fits into the default @@max_sort_length + is 1024/8=VARCHAR(128). With strxfrm_multiply==8, only VARCHAR(64) + would fit. + Note, the built-in collation utf8_thai_520_w2 also uses strxfrm_multiply=4, + for the same purpose. + TODO: we could add a new LDML syntax to choose strxfrm_multiply value. + */ + to->strxfrm_multiply= loaded->levels_for_order > 1 ? 
+ 4 : from->strxfrm_multiply; to->min_sort_char= from->min_sort_char; to->max_sort_char= from->max_sort_char; to->mbminlen= from->mbminlen; @@ -312,7 +338,8 @@ static int add_collation(struct charset_info_st *cs) #if defined(HAVE_CHARSET_ucs2) && defined(HAVE_UCA_COLLATIONS) copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ? &my_charset_ucs2_unicode_nopad_ci : - &my_charset_ucs2_unicode_ci); + &my_charset_ucs2_unicode_ci, + cs); newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII; #endif } @@ -321,7 +348,8 @@ static int add_collation(struct charset_info_st *cs) #if defined (HAVE_CHARSET_utf8) && defined(HAVE_UCA_COLLATIONS) copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ? &my_charset_utf8_unicode_nopad_ci : - &my_charset_utf8_unicode_ci); + &my_charset_utf8_unicode_ci, + cs); newcs->ctype= my_charset_utf8_unicode_ci.ctype; if (init_state_maps(newcs)) return MY_XML_ERROR; @@ -332,7 +360,8 @@ static int add_collation(struct charset_info_st *cs) #if defined (HAVE_CHARSET_utf8mb4) && defined(HAVE_UCA_COLLATIONS) copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ? &my_charset_utf8mb4_unicode_nopad_ci : - &my_charset_utf8mb4_unicode_ci); + &my_charset_utf8mb4_unicode_ci, + cs); newcs->ctype= my_charset_utf8mb4_unicode_ci.ctype; newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED; #endif @@ -342,7 +371,8 @@ static int add_collation(struct charset_info_st *cs) #if defined (HAVE_CHARSET_utf16) && defined(HAVE_UCA_COLLATIONS) copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ? &my_charset_utf16_unicode_nopad_ci : - &my_charset_utf16_unicode_ci); + &my_charset_utf16_unicode_ci, + cs); newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII; #endif } @@ -351,7 +381,8 @@ static int add_collation(struct charset_info_st *cs) #if defined (HAVE_CHARSET_utf32) && defined(HAVE_UCA_COLLATIONS) copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ? 
&my_charset_utf32_unicode_nopad_ci : - &my_charset_utf32_unicode_ci); + &my_charset_utf32_unicode_ci, + cs); newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII; #endif } |