MDEV-11255 LDML: allow defining 2-level UCA collations

author: Alexander Barkov <bar@mariadb.org> 2016-11-08 20:57:19 +0400
committer: Alexander Barkov <bar@mariadb.org> 2016-11-08 20:57:19 +0400
commit: 0259b3cbbe47448beb385a33348af56300004aa6 (patch)
tree: 56ad0b2aa945a60a9a5ff2934a5931bd7eab1a48 /mysys/charset.c
parent: 90c5b2f505855c13e0bdc2e73e11fa83983712bd (diff)
download: mariadb-git-0259b3cbbe47448beb385a33348af56300004aa6.tar.gz
1 files changed, 38 insertions, 7 deletions
diff --git a/mysys/charset.c b/mysys/charset.c
index 016d0fc3012..8939b6d7a4f 100644
--- a/mysys/charset.c
+++ b/mysys/charset.c
@@ -258,12 +258,38 @@ static my_bool simple_cs_is_full(CHARSET_INFO *cs)
 
 
 #if defined(HAVE_UCA_COLLATIONS) && (defined(HAVE_CHARSET_ucs2) || defined(HAVE_CHARSET_utf8))
+/**
+  Initialize a loaded collation.
+  @param [OUT] to     - The new charset_info_st structure to initialize.
+  @param [IN]  from   - A template collation, to fill the missing data from.
+  @param [IN]  loaded - The collation data loaded from the LDML file.
+                        Some data may be missing in "loaded".
+*/
 static void
-copy_uca_collation(struct charset_info_st *to, CHARSET_INFO *from)
+copy_uca_collation(struct charset_info_st *to, CHARSET_INFO *from,
+                   CHARSET_INFO *loaded)
 {
   to->cset= from->cset;
   to->coll= from->coll;
-  to->strxfrm_multiply= from->strxfrm_multiply;
+  /*
+    Single-level UCA collations have strxfrm_multiply=8.
+    In case of a multi-level UCA collation we use strxfrm_multiply=4.
+    That means MY_COLLATION_HANDLER::strnxfrmlen() will request the caller
+    to allocate a smaller buffer for each level, for performance purposes,
+    and to fit longer VARCHARs into @@max_sort_length.
+    This makes filesort produce non-precise order for some rare Unicode
+    characters that produce more than 4 weights (long expansions).
+    UCA requires 2 bytes per weight multiplied by the number of levels.
+    In case of a 2-level collation, each character requires 4*2=8 bytes.
+    Therefore, the longest VARCHAR that fits into the default @@max_sort_length
+    is 1024/8=VARCHAR(128). With strxfrm_multiply==8, only VARCHAR(64)
+    would fit.
+    Note, the built-in collation utf8_thai_520_w2 also uses strxfrm_multiply=4,
+    for the same purpose.
+    TODO: we could add a new LDML syntax to choose strxfrm_multiply value.
+  */
+  to->strxfrm_multiply= loaded->levels_for_order > 1 ?
+                        4 : from->strxfrm_multiply;
   to->min_sort_char= from->min_sort_char;
   to->max_sort_char= from->max_sort_char;
   to->mbminlen= from->mbminlen;
@@ -312,7 +338,8 @@ static int add_collation(struct charset_info_st *cs)
 #if defined(HAVE_CHARSET_ucs2) && defined(HAVE_UCA_COLLATIONS)
         copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
                                   &my_charset_ucs2_unicode_nopad_ci :
-                                  &my_charset_ucs2_unicode_ci);
+                                  &my_charset_ucs2_unicode_ci,
+                                  cs);
         newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
 #endif        
       }
@@ -321,7 +348,8 @@ static int add_collation(struct charset_info_st *cs)
 #if defined (HAVE_CHARSET_utf8) && defined(HAVE_UCA_COLLATIONS)
         copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
                                   &my_charset_utf8_unicode_nopad_ci :
-                                  &my_charset_utf8_unicode_ci);
+                                  &my_charset_utf8_unicode_ci,
+                                  cs);
         newcs->ctype= my_charset_utf8_unicode_ci.ctype;
         if (init_state_maps(newcs))
           return MY_XML_ERROR;
@@ -332,7 +360,8 @@ static int add_collation(struct charset_info_st *cs)
 #if defined (HAVE_CHARSET_utf8mb4) && defined(HAVE_UCA_COLLATIONS)
         copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
                                   &my_charset_utf8mb4_unicode_nopad_ci :
-                                  &my_charset_utf8mb4_unicode_ci);
+                                  &my_charset_utf8mb4_unicode_ci,
+                                  cs);
         newcs->ctype= my_charset_utf8mb4_unicode_ci.ctype;
         newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED;
 #endif
@@ -342,7 +371,8 @@ static int add_collation(struct charset_info_st *cs)
 #if defined (HAVE_CHARSET_utf16) && defined(HAVE_UCA_COLLATIONS)
         copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
                                   &my_charset_utf16_unicode_nopad_ci :
-                                  &my_charset_utf16_unicode_ci);
+                                  &my_charset_utf16_unicode_ci,
+                                  cs);
         newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
 #endif
       }
@@ -351,7 +381,8 @@ static int add_collation(struct charset_info_st *cs)
 #if defined (HAVE_CHARSET_utf32) && defined(HAVE_UCA_COLLATIONS)
         copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
                                   &my_charset_utf32_unicode_nopad_ci :
-                                  &my_charset_utf32_unicode_ci);
+                                  &my_charset_utf32_unicode_ci,
+                                  cs);
         newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
 #endif
       }
author	Alexander Barkov <bar@mariadb.org>	2016-11-08 20:57:19 +0400
committer	Alexander Barkov <bar@mariadb.org>	2016-11-08 20:57:19 +0400
commit	0259b3cbbe47448beb385a33348af56300004aa6 (patch)
tree	56ad0b2aa945a60a9a5ff2934a5931bd7eab1a48 /mysys/charset.c
parent	90c5b2f505855c13e0bdc2e73e11fa83983712bd (diff)
download	mariadb-git-0259b3cbbe47448beb385a33348af56300004aa6.tar.gz