diff options
author | Michael Widenius <monty@askmonty.org> | 2009-11-30 14:42:24 +0200 |
---|---|---|
committer | Michael Widenius <monty@askmonty.org> | 2009-11-30 14:42:24 +0200 |
commit | 4c14f9f23c725fd15b6b3c29bff07d925047e8ac (patch) | |
tree | a7c47791ad6bd1a5c5c718fd3d611cdb9f40340f /strings | |
parent | 84911a9fd0a31a03a1692040d21ecbe0198a01d7 (diff) | |
download | mariadb-git-4c14f9f23c725fd15b6b3c29bff07d925047e8ac.tar.gz |
Added more general support for sorting 2 characters as one (contractions)
Added support for Croatian sorting orders utf8_croatian_ci and ucs2_croatian_ci.
Patch done by Alexander Barkov. See http://www.collation-charts.org/articles/croatian.htm
mysql-test/r/ctype_uca.result:
Added testing of Croatian sort order
mysql-test/t/ctype_uca.test:
Added testing of Croatian sort order
Diffstat (limited to 'strings')
-rw-r--r-- | strings/ctype-mb.c | 11 | ||||
-rw-r--r-- | strings/ctype-uca.c | 356 | ||||
-rw-r--r-- | strings/ctype-ucs2.c | 13 |
3 files changed, 313 insertions, 67 deletions
diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c index 903811e2ab9..51f93954bd5 100644 --- a/strings/ctype-mb.c +++ b/strings/ctype-mb.c @@ -567,8 +567,7 @@ my_bool my_like_range_mb(CHARSET_INFO *cs, char *min_end= min_str + res_length; char *max_end= max_str + res_length; size_t maxcharlen= res_length / cs->mbmaxlen; - const char *contraction_flags= cs->contractions ? - ((const char*) cs->contractions) + 0x40*0x40 : NULL; + my_bool have_contractions= my_uca_have_contractions(cs); for (; ptr != end && min_str != min_end && maxcharlen ; maxcharlen--) { @@ -636,8 +635,8 @@ fill_max_and_min: 'ab\min\min\min\min' and 'ab\max\max\max\max'. */ - if (contraction_flags && ptr + 1 < end && - contraction_flags[(uchar) *ptr]) + if (have_contractions && ptr + 1 < end && + my_uca_can_be_contraction_head(cs, (uchar) *ptr)) { /* Ptr[0] is a contraction head. */ @@ -659,8 +658,8 @@ fill_max_and_min: is not a contraction, then we put only ptr[0], and continue with ptr[1] on the next loop. */ - if (contraction_flags[(uchar) ptr[1]] && - cs->contractions[(*ptr-0x40)*0x40 + ptr[1] - 0x40]) + if (my_uca_can_be_contraction_tail(cs, (uchar) ptr[1]) && + my_uca_contraction2_weight(cs, (uchar) ptr[0], (uchar) ptr[1])) { /* Contraction found */ if (maxcharlen == 1 || min_str + 1 >= min_end) diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index 8a51f9d7f75..7b162618ed1 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -6713,6 +6713,16 @@ static const char hungarian[]= "&U < \\u00FC <<< \\u00DC << \\u0171 <<< \\u0170"; +static const char croatian[]= + +"&C < \\u010D <<< \\u010C < \\u0107 <<< \\u0106 " +"&D < d\\u017E <<< \\u01C6 <<< D\\u017E <<< \\u01C5 <<< D\\u017D <<< \\u01C4 " +" < \\u0111 <<< \\u0110 " +"&L < lj <<< \\u01C9 <<< Lj <<< \\u01C8 <<< LJ <<< \\u01C7 " +"&N < nj <<< \\u01CC <<< Nj <<< \\u01CB <<< NJ <<< \\u01CA " +"&S < \\u0161 <<< \\u0160 " +"&Z < \\u017E <<< \\u017D"; + /* Unicode Collation Algorithm: Collation element (weight) scanner, @@ -6726,7 +6736,7 @@ typedef struct my_uca_scanner_st const uchar *send; /* End of the input string */ uchar *uca_length; uint16 **uca_weight; - uint16 *contractions; + MY_CONTRACTIONS *contractions; uint16 implicit[2]; int page; int code; @@ -6747,6 +6757,164 @@ typedef struct my_uca_scanner_handler_st static uint16 nochar[]= {0,0}; +#define MY_UCA_CNT_FLAG_SIZE 4096 +#define MY_UCA_CNT_FLAG_MASK 4095 + +#define MY_UCA_CNT_HEAD 1 +#define MY_UCA_CNT_TAIL 2 + + + + +/********** Helper functions to handle contraction ************/ + + +/** + Mark a character as a contraction part + + @cs Pointer to CHARSET_INFO data + @wc Unicode code point + @flag flag: "is contraction head", "is contraction tail" +*/ + +static void +my_uca_add_contraction_flag(CHARSET_INFO *cs, my_wc_t wc, int flag) +{ + cs->contractions->flags[wc & MY_UCA_CNT_FLAG_MASK]|= flag; +} + + +/** + Add a new contraction into contraction list + + @cs Pointer to CHARSET_INFO data + @wc Unicode code points of the characters + @len Number of characters + + @return New contraction + @retval Pointer to a newly added contraction +*/ + +static MY_CONTRACTION * +my_uca_add_contraction(CHARSET_INFO *cs, + my_wc_t *wc, int len __attribute__((unused))) +{ + MY_CONTRACTIONS *list= cs->contractions; + MY_CONTRACTION *next= &list->item[list->nitems]; + DBUG_ASSERT(len == 2); /* We currently support only contraction2 */ + next->ch[0]= wc[0]; + next->ch[1]= wc[1]; + list->nitems++; + return next; +} + + +/** + Allocate and initialize memory for contraction list and flags + + @cs Pointer to CHARSET_INFO data + @alloc Memory allocation function (typically points to my_alloc_once) + @n Number of contractions + + @return Error code + @retval 0 - memory allocated successfully + @retval 1 - not enough memory +*/ + +static my_bool +my_uca_alloc_contractions(CHARSET_INFO *cs, void *(*alloc)(size_t), size_t n) +{ + uint size= n * sizeof(MY_CONTRACTION); + if (!(cs->contractions= (*alloc)(sizeof(MY_CONTRACTIONS)))) + return 1; + bzero(cs->contractions, sizeof(MY_CONTRACTIONS)); + if (!(cs->contractions->item= (*alloc)(size)) || + !(cs->contractions->flags= (char*) (*alloc)(MY_UCA_CNT_FLAG_SIZE))) + return 1; + bzero((void*) cs->contractions->item, size); + bzero((void*) cs->contractions->flags, MY_UCA_CNT_FLAG_SIZE); + return 0; +} + + +/** + Check if UCA data has contractions (public version) + + @cs Pointer to CHARSET_INFO data + @retval 0 - no contraction, 1 - have contractions. +*/ + +my_bool +my_uca_have_contractions(CHARSET_INFO *cs) +{ + return cs->contractions != NULL; +} + + +/** + Check if a character can be contraction head + + @cs Pointer to CHARSET_INFO data + @wc Code point + + @retval 0 - cannot be contraction head + @retval 1 - can be contraction head +*/ + +my_bool +my_uca_can_be_contraction_head(CHARSET_INFO *cs, my_wc_t wc) +{ + return cs->contractions->flags[wc & MY_UCA_CNT_FLAG_MASK] & MY_UCA_CNT_HEAD; +} + + +/** + Check if a character can be contraction tail + + @cs Pointer to CHARSET_INFO data + @wc Code point + + @retval 0 - cannot be contraction tail + @retval 1 - can be contraction tail +*/ + +my_bool +my_uca_can_be_contraction_tail(CHARSET_INFO *cs, my_wc_t wc) +{ + return cs->contractions->flags[wc & MY_UCA_CNT_FLAG_MASK] & MY_UCA_CNT_TAIL; +} + + +/** + Find a contraction and return its weight array + + @cs Pointer to CHARSET data + @wc1 First character + @wc2 Second character + + @return Weight array + @retval NULL - no contraction found + @retval ptr - contraction weight array +*/ + +uint16 * +my_uca_contraction2_weight(CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2) +{ + MY_CONTRACTIONS *list= cs->contractions; + MY_CONTRACTION *c, *last; + for (c= list->item, last= &list->item[list->nitems]; c < last; c++) + { + if (c->ch[0] == wc1 && c->ch[1] == wc2) + { + return c->weight; + } + } + return NULL; +} + + + + #ifdef HAVE_CHARSET_ucs2 /* Initialize collation weight scanner @@ -6766,7 +6934,7 @@ static uint16 nochar[]= {0,0}; */ static void my_uca_scanner_init_ucs2(my_uca_scanner *scanner, - CHARSET_INFO *cs __attribute__((unused)), + CHARSET_INFO *cs, const uchar *str, size_t length) { scanner->wbeg= nochar; @@ -6777,6 +6945,7 @@ static void my_uca_scanner_init_ucs2(my_uca_scanner *scanner, scanner->uca_length= cs->sort_order; scanner->uca_weight= cs->sort_order_big; scanner->contractions= cs->contractions; + scanner->cs= cs; return; } @@ -6865,18 +7034,23 @@ static int my_uca_scanner_next_ucs2(my_uca_scanner *scanner) if (scanner->contractions && (scanner->sbeg <= scanner->send)) { - int cweight; + my_wc_t wc1= ((scanner->page << 8) | scanner->code); - if (!scanner->page && !scanner->sbeg[0] && - (scanner->sbeg[1] > 0x40) && (scanner->sbeg[1] < 0x80) && - (scanner->code > 0x40) && (scanner->code < 0x80) && - (cweight= scanner->contractions[(scanner->code-0x40)*0x40+scanner->sbeg[1]-0x40])) + if (my_uca_can_be_contraction_head(scanner->cs, wc1)) + { + uint16 *cweight; + my_wc_t wc2= (((my_wc_t) scanner->sbeg[0]) << 8) | scanner->sbeg[1]; + if (my_uca_can_be_contraction_tail(scanner->cs, wc2) && + (cweight= my_uca_contraction2_weight(scanner->cs, + scanner->code, + scanner->sbeg[1]))) { scanner->implicit[0]= 0; scanner->wbeg= scanner->implicit; scanner->sbeg+=2; - return cweight; + return *cweight; } + } } if (!ucaw[scanner->page]) @@ -6959,23 +7133,22 @@ static int my_uca_scanner_next_any(my_uca_scanner *scanner) scanner->code= wc & 0xFF; scanner->sbeg+= mb_len; - if (scanner->contractions && !scanner->page && - (scanner->code > 0x40) && (scanner->code < 0x80)) + if (my_uca_have_contractions(scanner->cs) && + my_uca_can_be_contraction_head(scanner->cs, wc)) { - uint page1, code1, cweight; + my_wc_t wc2; + uint16 *cweight; - if (((mb_len= scanner->cs->cset->mb_wc(scanner->cs, &wc, + if (((mb_len= scanner->cs->cset->mb_wc(scanner->cs, &wc2, scanner->sbeg, scanner->send)) >=0) && - (!(page1= (wc >> 8))) && - ((code1= (wc & 0xFF)) > 0x40) && - (code1 < 0x80) && - (cweight= scanner->contractions[(scanner->code-0x40)*0x40 + code1-0x40])) + my_uca_can_be_contraction_tail(scanner->cs, wc2) && + (cweight= my_uca_contraction2_weight(scanner->cs, wc, wc2))) { scanner->implicit[0]= 0; scanner->wbeg= scanner->implicit; scanner->sbeg+= mb_len; - return cweight; + return *cweight; } } @@ -7012,6 +7185,33 @@ static my_uca_scanner_handler my_any_uca_scanner_handler= my_uca_scanner_next_any }; + + +/** + Helper function: + Find address of weights of the given character. + + @weights UCA weight array + @lengths UCA length array + @ch character Unicode code point + + @return Weight array + @retval pointer to weight array for the given character, + or NULL if this page does not have implicit weights. +*/ + +static inline uint16 * +my_char_weight_addr(CHARSET_INFO *cs, uint wc) +{ + uint page= (wc >> 8); + uint ofst= wc & 0xFF; + return cs->sort_order_big[page] ? + cs->sort_order_big[page] + ofst * cs->sort_order[page] : + NULL; +} + + + /* Compares two strings according to the collation @@ -7683,8 +7883,8 @@ ex: typedef struct my_coll_rule_item_st { - uint base; /* Base character */ - uint curr[2]; /* Current character */ + my_wc_t base; /* Base character */ + my_wc_t curr[2]; /* Current character */ int diff[3]; /* Primary, Secondary and Tertiary difference */ } MY_COLL_RULE; @@ -7834,6 +8034,7 @@ static int my_coll_rule_parse(MY_COLL_RULE *rule, size_t mitems, static my_bool create_tailoring(CHARSET_INFO *cs, void *(*alloc)(size_t)) { MY_COLL_RULE rule[MY_MAX_COLL_RULE]; + MY_COLL_RULE *r, *rfirst, *rlast; char errstr[128]; uchar *newlengths; uint16 **newweights; @@ -7858,6 +8059,9 @@ static my_bool create_tailoring(CHARSET_INFO *cs, void *(*alloc)(size_t)) return 1; } + rfirst= rule; + rlast= rule + rc; + if (!cs->caseinfo) cs->caseinfo= my_unicase_default; @@ -7941,44 +8145,21 @@ static my_bool create_tailoring(CHARSET_INFO *cs, void *(*alloc)(size_t)) /* Now process contractions */ if (ncontractions) { - /* - 8K for weights for basic latin letter pairs, - plus 256 bytes for "is contraction part" flags. - */ - uint size= 0x40*0x40*sizeof(uint16) + 256; - char *contraction_flags; - if (!(cs->contractions= (uint16*) (*alloc)(size))) - return 1; - bzero((void*)cs->contractions, size); - contraction_flags= ((char*) cs->contractions) + 0x40*0x40; - for (i=0; i < rc; i++) + if (my_uca_alloc_contractions(cs, alloc, ncontractions)) + return 1; + for (r= rfirst; r < rlast; r++) { - if (rule[i].curr[1]) + uint16 *to; + if (r->curr[1]) /* Contraction */ { - uint pageb= (rule[i].base >> 8) & 0xFF; - uint chb= rule[i].base & 0xFF; - uint16 *offsb= defweights[pageb] + chb*deflengths[pageb]; - uint offsc; - - if (offsb[1] || - rule[i].curr[0] < 0x40 || rule[i].curr[0] > 0x7f || - rule[i].curr[1] < 0x40 || rule[i].curr[1] > 0x7f) - { - /* - TODO: add error reporting; - We support only basic latin letters contractions at this point. - Also, We don't support contractions with weight longer than one. - Otherwise, we'd need much more memory. - */ - return 1; - } - offsc= (rule[i].curr[0]-0x40)*0x40+(rule[i].curr[1]-0x40); - - /* Copy base weight applying primary difference */ - cs->contractions[offsc]= offsb[0] + rule[i].diff[0]; - /* Mark both letters as "is contraction part */ - contraction_flags[rule[i].curr[0]]= 1; - contraction_flags[rule[i].curr[1]]= 1; + /* Mark both letters as "is contraction part" */ + my_uca_add_contraction_flag(cs, r->curr[0], MY_UCA_CNT_HEAD); + my_uca_add_contraction_flag(cs, r->curr[1], MY_UCA_CNT_TAIL); + to= my_uca_add_contraction(cs, r->curr, 2)->weight; + /* Copy weight from the reset character */ + to[0]= my_char_weight_addr(cs, r->base)[0]; + /* Apply primary difference */ + to[0]+= r->diff[0]; } } } @@ -8701,6 +8882,39 @@ CHARSET_INFO my_charset_ucs2_hungarian_uca_ci= }; +CHARSET_INFO my_charset_ucs2_croatian_uca_ci= +{ + 149,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "ucs2", /* cs name */ + "ucs2_croatian_ci", /* name */ + "", /* comment */ + croatian, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + my_unicase_default, /* caseinfo */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* caseup_multiply */ + 1, /* casedn_multiply */ + 2, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + ' ', /* pad char */ + 0, /* escape_with_backslash_is_dangerous */ + &my_charset_ucs2_handler, + &my_collation_ucs2_uca_handler +}; + + #endif @@ -9358,6 +9572,38 @@ CHARSET_INFO my_charset_utf8_hungarian_uca_ci= &my_collation_any_uca_handler }; +CHARSET_INFO my_charset_utf8_croatian_uca_ci= +{ + 213,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "utf8", /* cs name */ + "utf8_croatian_ci", /* name */ + "", /* comment */ + croatian, /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + my_unicase_default, /* caseinfo */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* caseup_multiply */ + 1, /* casedn_multiply */ + 1, /* mbminlen */ + 3, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + ' ', /* pad char */ + 0, /* escape_with_backslash_is_dangerous */ + &my_charset_utf8_handler, + &my_collation_any_uca_handler +}; + #endif /* HAVE_CHARSET_utf8 */ #endif /* HAVE_UCA_COLLATIONS */ diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index f030c08523c..2607e0f6d43 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -1526,8 +1526,7 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs, char *min_org=min_str; char *min_end=min_str+res_length; size_t charlen= res_length / cs->mbmaxlen; - const char *contraction_flags= cs->contractions ? - ((const char*) cs->contractions) + 0x40*0x40 : NULL; + my_bool have_contractions= my_uca_have_contractions(cs); for ( ; ptr + 1 < end && min_str + 1 < min_end && charlen > 0 ; ptr+=2, charlen--) @@ -1567,8 +1566,9 @@ fill_max_and_min: return 0; } - if (contraction_flags && ptr + 3 < end && - ptr[0] == '\0' && contraction_flags[(uchar) ptr[1]]) + if (have_contractions && ptr + 3 < end && + ptr[0] == '\0' && + my_uca_can_be_contraction_head(cs, (uchar) ptr[1])) { /* Contraction head found */ if (ptr[2] == '\0' && (ptr[3] == w_one || ptr[3] == w_many)) @@ -1581,8 +1581,9 @@ fill_max_and_min: Check if the second letter can be contraction part, and if two letters really produce a contraction. */ - if (ptr[2] == '\0' && contraction_flags[(uchar) ptr[3]] && - cs->contractions[(ptr[1]-0x40)*0x40 + ptr[3] - 0x40]) + if (ptr[2] == '\0' && + my_uca_can_be_contraction_tail(cs, (uchar) ptr[3]) && + my_uca_contraction2_weight(cs,(uchar) ptr[1], (uchar) ptr[3])) { /* Contraction found */ if (charlen == 1 || min_str + 2 >= min_end) |