diff options
author | unknown <bar@bar.intranet.mysql.r18.ru> | 2004-06-03 17:45:53 +0500 |
---|---|---|
committer | unknown <bar@bar.intranet.mysql.r18.ru> | 2004-06-03 17:45:53 +0500 |
commit | 5a2b1ba6d06603dd7300046d61dc9b02ccb9a134 (patch) | |
tree | d2caac6cf74f71de0ad9187035aaebb5214b4309 | |
parent | 2a32bb2be3e65680649f2cfc402d0da7683c73f3 (diff) | |
download | mariadb-git-5a2b1ba6d06603dd7300046d61dc9b02ccb9a134.tar.gz |
Unicode collations: WL#916
XML and "collation customization" language parsers.
-rw-r--r-- | mysys/charset.c | 502 | ||||
-rw-r--r-- | strings/ctype.c | 53 |
2 files changed, 547 insertions, 8 deletions
diff --git a/mysys/charset.c b/mysys/charset.c index d801fcdbd76..62068beccae 100644 --- a/mysys/charset.c +++ b/mysys/charset.c @@ -21,6 +21,344 @@ #include <my_dir.h> #include <my_xml.h> + +/* + Collation language is implemented according to + subset of ICU Collation Customization (tailorings): + http://oss.software.ibm.com/icu/userguide/Collate_Customization.html + + Collation language elements: + Delimiters: + space - skipped + + <char> := A-Z | a-z | \uXXXX + + Shift command: + <shift> := & - reset at this letter. + + Diff command: + <d1> := < - Identifies a primary difference. + <d2> := << - Identifies a secondary difference. + <d3> := <<< - Idenfifies a tertiary difference. + + + Collation rules: + <ruleset> := <rule> { <ruleset> } + + <rule> := <d1> <string> + | <d2> <string> + | <d3> <string> + | <shift> <char> + + <string> := <char> [ <string> ] + + An example, Polish collation: + + &A < \u0105 <<< \u0104 + &C < \u0107 <<< \u0106 + &E < \u0119 <<< \u0118 + &L < \u0142 <<< \u0141 + &N < \u0144 <<< \u0143 + &O < \u00F3 <<< \u00D3 + &S < \u015B <<< \u015A + &Z < \u017A <<< \u017B +*/ + + +typedef enum my_coll_lexem_num_en +{ + MY_COLL_LEXEM_EOF = 0, + MY_COLL_LEXEM_DIFF = 1, + MY_COLL_LEXEM_SHIFT = 4, + MY_COLL_LEXEM_CHAR = 5, + MY_COLL_LEXEM_ERROR = 6 +} my_coll_lexem_num; + + +typedef struct my_coll_lexem_st +{ + const char *beg; + const char *end; + const char *prev; + int diff; + int code; +} MY_COLL_LEXEM; + + +/* + Initialize collation rule lexical anilizer + + SYNOPSIS + my_coll_lexem_init + lexem Lex analizer to init + str Const string to parse + strend End of the string + USAGE + + RETURN VALUES + N/A +*/ + +static void my_coll_lexem_init(MY_COLL_LEXEM *lexem, + const char *str, const char *strend) +{ + lexem->beg= str; + lexem->prev= str; + lexem->end= strend; + lexem->diff= 0; + lexem->code= 0; +} + + +/* + Print collation customization expression parse error, with context. + + SYNOPSIS + my_coll_lexem_print_error + lexem Lex analizer to take context from + errstr sting to write error to + errsize errstr size + txt error message + USAGE + + RETURN VALUES + N/A +*/ + +static void my_coll_lexem_print_error(MY_COLL_LEXEM *lexem, + char *errstr, size_t errsize, + const char *txt) +{ + char tail[30]; + size_t len= lexem->end - lexem->prev; + strmake (tail, lexem->prev, min(len, sizeof(tail)-1)); + errstr[errsize-1]= '\0'; + my_snprintf(errstr,errsize-1,"%s at '%s'", txt, tail); +} + + +/* + Convert a hex digit into its numeric value + + SYNOPSIS + ch2x + ch hex digit to convert + USAGE + + RETURN VALUES + an integer value in the range 0..15 + -1 on error +*/ + +static int ch2x(int ch) +{ + if (ch >= '0' && ch <= '9') + return ch - '0'; + + if (ch >= 'a' && ch <= 'f') + return 10 + ch - 'a'; + + if (ch >= 'A' && ch <= 'Z') + return 10 + ch - 'A'; + + return -1; +} + + +/* + Collation language lexical parser: + Scans the next lexem. + + SYNOPSIS + my_coll_lexem_next + lexem Lex analizer, previously initialized by + my_coll_lexem_init. + USAGE + Call this function in a loop + + RETURN VALUES + Lexem number: eof, diff, shift, char or error. +*/ + +static my_coll_lexem_num my_coll_lexem_next(MY_COLL_LEXEM *lexem) +{ + for ( ;lexem->beg < lexem->end ; lexem->beg++) + { + lexem->prev= lexem->beg; + if (lexem->beg[0] == ' ' || lexem->beg[0] == '\t' || + lexem->beg[0] == '\r' || lexem->beg[0] == '\n') + continue; + + if (lexem->beg[0] == '&') + { + lexem->beg++; + return MY_COLL_LEXEM_SHIFT; + } + + if (lexem->beg[0] == '<') + { + for (lexem->beg++, lexem->diff=1; + (lexem->beg < lexem->end) && + (lexem->beg[0] == '<') && (lexem->diff<3); + lexem->beg++, lexem->diff++); + return MY_COLL_LEXEM_DIFF; + } + + if ((lexem->beg[0] >= 'a' && lexem->beg[0] <= 'z') || + (lexem->beg[0] >= 'A' && lexem->beg[0] <= 'Z')) + { + lexem->code= lexem->beg[0]; + lexem->beg++; + return MY_COLL_LEXEM_CHAR; + } + + if ((lexem->beg[0] == '\\') && + (lexem->beg+2 < lexem->end) && + (lexem->beg[1] == 'u')) + { + int ch; + + lexem->code= 0; + for (lexem->beg+=2; + (lexem->beg < lexem->end) && ((ch= ch2x(lexem->beg[0])) >= 0) ; + lexem->beg++) + { + lexem->code= (lexem->code << 4) + ch; + } + return MY_COLL_LEXEM_CHAR; + } + + return MY_COLL_LEXEM_ERROR; + } + return MY_COLL_LEXEM_EOF; +} + + +/* + Collation rule item +*/ + +typedef struct my_coll_rule_item_st +{ + uint base; /* Base character */ + uint curr; /* Current character */ + int diff[3]; /* Primary, Secondary and Tertiary difference */ +} MY_COLL_RULE; + + +/* + Collation language syntax parser. + Uses lexical parser. + + SYNOPSIS + my_coll_rule_parse + rule Collation rule list to load to. + str A string containin collation language expression. + strend End of the string. + USAGE + + RETURN VALUES + 0 - OK + 1 - ERROR, e.g. too many items. +*/ + +static int my_coll_rule_parse(MY_COLL_RULE *rule, size_t mitems, + const char *str, const char *strend, + char *errstr, size_t errsize) +{ + MY_COLL_LEXEM lexem; + my_coll_lexem_num lexnum; + my_coll_lexem_num prevlexnum= MY_COLL_LEXEM_ERROR; + MY_COLL_RULE item; + int state= 0; + size_t nitems= 0; + + /* Init all variables */ + errstr[0]= '\0'; + bzero(&item, sizeof(item)); + my_coll_lexem_init(&lexem, str, strend); + + while ((lexnum= my_coll_lexem_next(&lexem))) + { + if (lexnum == MY_COLL_LEXEM_ERROR) + { + my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Unknown character"); + return -1; + } + + switch (state) { + case 0: + if (lexnum != MY_COLL_LEXEM_SHIFT) + { + my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& expected"); + return -1; + } + prevlexnum= lexnum; + state= 2; + continue; + + case 1: + if (lexnum != MY_COLL_LEXEM_SHIFT && lexnum != MY_COLL_LEXEM_DIFF) + { + my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& or < expected"); + return -1; + } + prevlexnum= lexnum; + state= 2; + continue; + + case 2: + if (lexnum != MY_COLL_LEXEM_CHAR) + { + my_coll_lexem_print_error(&lexem,errstr,errsize-1,"character expected"); + return -1; + } + + if (prevlexnum == MY_COLL_LEXEM_SHIFT) + { + item.base= lexem.code; + item.diff[0]= 0; + item.diff[1]= 0; + item.diff[2]= 0; + } + else if (prevlexnum == MY_COLL_LEXEM_DIFF) + { + item.curr= lexem.code; + if (lexem.diff == 3) + { + item.diff[2]++; + } + else if (lexem.diff == 2) + { + item.diff[1]++; + item.diff[2]= 0; + } + else if (lexem.diff == 1) + { + item.diff[0]++; + item.diff[1]= 0; + item.diff[2]= 0; + } + if (nitems >= mitems) + { + my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Too many rules"); + return -1; + } + rule[nitems++]= item; + } + else + { + my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Should never happen"); + return -1; + } + state= 1; + continue; + } + } + return (size_t) nitems; +} + + typedef struct { int nchars; @@ -284,6 +622,144 @@ err: } +#ifdef HAVE_CHARSET_ucs2 + +#define MY_MAX_COLL_RULE 64 + +/* + This function copies an UCS2 collation from + the default Unicode Collation Algorithm (UCA) + weights applying tailorings, i.e. a set of + alternative weights for some characters. + + The default UCA weights are stored in my_charset_ucs2_general_uca. + They consist of 256 pages, 256 character each. + + If a page is not overwritten by tailoring rules, + it is copies as is from UCA as is. + + If a page contains some overwritten characters, it is + allocated. Untouched characters are copied from the + default weights. +*/ + +static int ucs2_copy_data(CHARSET_INFO *to, CHARSET_INFO *from) +{ + MY_COLL_RULE rule[MY_MAX_COLL_RULE]; + char errstr[128]; + uchar *newlengths; + uint16 **newweights; + const uchar *deflengths= my_charset_ucs2_general_uca.sort_order; + uint16 **defweights= my_charset_ucs2_general_uca.sort_order_big; + int rc, i; + + to->number= from->number ? from->number : to->number; + + if (from->csname) + if (!(to->csname= my_once_strdup(from->csname,MYF(MY_WME)))) + goto err; + + if (from->name) + if (!(to->name= my_once_strdup(from->name,MYF(MY_WME)))) + goto err; + + if (from->comment) + if (!(to->comment= my_once_strdup(from->comment,MYF(MY_WME)))) + goto err; + + to->strxfrm_multiply= my_charset_ucs2_general_uca.strxfrm_multiply; + to->min_sort_char= my_charset_ucs2_general_uca.min_sort_char; + to->max_sort_char= my_charset_ucs2_general_uca.max_sort_char; + to->mbminlen= 2; + to->mbmaxlen= 2; + + + /* Parse ICU Collation Customization expression */ + if ((rc= my_coll_rule_parse(rule, MY_MAX_COLL_RULE, + from->sort_order, + from->sort_order + strlen(from->sort_order), + errstr, sizeof(errstr))) <= 0) + { + /* + TODO: add error message reporting. + printf("Error: %d '%s'\n", rc, errstr); + */ + return 1; + } + + + if (!(newweights= (uint16**) my_once_alloc(256*sizeof(uint16*),MYF(MY_WME)))) + goto err; + bzero(newweights, 256*sizeof(uint16*)); + + if (!(newlengths= (uchar*) my_once_memdup(deflengths,256,MYF(MY_WME)))) + goto err; + + /* + Calculate maximum lenghts for the pages + which will be overwritten. + */ + for (i=0; i < rc; i++) + { + uint pageb= (rule[i].base >> 8) & 0xFF; + uint pagec= (rule[i].curr >> 8) & 0xFF; + + if (newlengths[pagec] < deflengths[pageb]) + newlengths[pagec]= deflengths[pageb]; + } + + for (i=0; i < rc; i++) + { + uint pageb= (rule[i].base >> 8) & 0xFF; + uint pagec= (rule[i].curr >> 8) & 0xFF; + uint chb, chc; + + if (!newweights[pagec]) + { + /* Alloc new page and copy the default UCA weights */ + uint size= 256*newlengths[pagec]*sizeof(uint16); + + if (!(newweights[pagec]= (uint16*) my_once_alloc(size,MYF(MY_WME)))) + goto err; + bzero((void*) newweights[pagec], size); + + for (chc=0 ; chc < 256; chc++) + { + memcpy(newweights[pagec] + chc*newlengths[pagec], + defweights[pagec] + chc*deflengths[pagec], + deflengths[pagec]*sizeof(uint16)); + } + } + + /* + Aply the alternative rule: + shift to the base character and primary difference. + */ + chc= rule[i].curr & 0xFF; + chb= rule[i].base & 0xFF; + memcpy(newweights[pagec] + chc*newlengths[pagec], + defweights[pageb] + chb*deflengths[pageb], + deflengths[pageb]*sizeof(uint16)); + /* Apply primary difference */ + newweights[pagec][chc*newlengths[pagec]]+= rule[i].diff[0]; + } + + /* Copy non-overwritten pages from the default UCA weights */ + for (i= 0; i < 256 ; i++) + if (!newweights[i]) + newweights[i]= defweights[i]; + + to->sort_order= newlengths; + to->sort_order_big= newweights; + + return 0; + +err: + return 1; +} +#endif + + static my_bool simple_cs_is_full(CHARSET_INFO *cs) { return ((cs->csname && cs->tab_to_uni && cs->ctype && cs->to_upper && @@ -315,14 +791,28 @@ static int add_collation(CHARSET_INFO *cs) if (!(all_charsets[cs->number]->state & MY_CS_COMPILED)) { - simple_cs_init_functions(all_charsets[cs->number]); - if (simple_cs_copy_data(all_charsets[cs->number],cs)) - return MY_XML_ERROR; - if (simple_cs_is_full(all_charsets[cs->number])) + if (!strcmp(cs->csname,"ucs2") ) { - all_charsets[cs->number]->state |= MY_CS_LOADED; +#ifdef HAVE_CHARSET_ucs2 + CHARSET_INFO *new= all_charsets[cs->number]; + new->cset= my_charset_ucs2_general_uca.cset; + new->coll= my_charset_ucs2_general_uca.coll; + if (ucs2_copy_data(new, cs)) + return MY_XML_ERROR; + new->state |= MY_CS_AVAILABLE | MY_CS_LOADED; +#endif + } + else + { + simple_cs_init_functions(all_charsets[cs->number]); + if (simple_cs_copy_data(all_charsets[cs->number],cs)) + return MY_XML_ERROR; + if (simple_cs_is_full(all_charsets[cs->number])) + { + all_charsets[cs->number]->state |= MY_CS_LOADED; + } + all_charsets[cs->number]->state|= MY_CS_AVAILABLE; } - all_charsets[cs->number]->state|= MY_CS_AVAILABLE; } else { diff --git a/strings/ctype.c b/strings/ctype.c index cbd13111b70..44bf20ada5c 100644 --- a/strings/ctype.c +++ b/strings/ctype.c @@ -22,6 +22,23 @@ #endif +/* + + This files implements routines which parse XML based + character set and collation description files. + + Unicode collations are encoded according to + + Unicode Technical Standard #35 + Locale Data Markup Language (LDML) + http://www.unicode.org/reports/tr35/ + + and converted into ICU string according to + + Collation Customization + http://oss.software.ibm.com/icu/userguide/Collate_Customization.html + +*/ static char *mstr(char *str,const char *src,uint l1,uint l2) { @@ -54,6 +71,11 @@ struct my_cs_file_section_st #define _CS_PRIMARY_ID 15 #define _CS_BINARY_ID 16 #define _CS_CSDESCRIPT 17 +#define _CS_RESET 18 +#define _CS_DIFF1 19 +#define _CS_DIFF2 20 +#define _CS_DIFF3 21 + static struct my_cs_file_section_st sec[] = { @@ -83,6 +105,10 @@ static struct my_cs_file_section_st sec[] = {_CS_ORDER, "charsets.charset.collation.order"}, {_CS_FLAG, "charsets.charset.collation.flag"}, {_CS_COLLMAP, "charsets.charset.collation.map"}, + {_CS_RESET, "charsets.charset.collation.rules.reset"}, + {_CS_DIFF1, "charsets.charset.collation.rules.p"}, + {_CS_DIFF2, "charsets.charset.collation.rules.s"}, + {_CS_DIFF3, "charsets.charset.collation.rules.t"}, {0, NULL} }; @@ -109,6 +135,7 @@ typedef struct my_cs_file_info uchar sort_order[MY_CS_SORT_ORDER_TABLE_SIZE]; uint16 tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE]; char comment[MY_CS_CSDESCR_SIZE]; + size_t sort_order_length; CHARSET_INFO cs; int (*add_collation)(CHARSET_INFO *cs); } MY_CHARSET_LOADER; @@ -156,9 +183,11 @@ static int cs_enter(MY_XML_PARSER *st,const char *attr, uint len) struct my_cs_file_section_st *s= cs_file_sec(attr,len); if ( s && (s->state == _CS_CHARSET)) - { bzero(&i->cs,sizeof(i->cs)); - } + + if (s && (s->state == _CS_COLLATION)) + i->sort_order_length= 0; + return MY_XML_OK; } @@ -242,6 +271,26 @@ static int cs_value(MY_XML_PARSER *st,const char *attr, uint len) fill_uchar(i->ctype,MY_CS_CTYPE_TABLE_SIZE,attr,len); i->cs.ctype=i->ctype; break; + case _CS_RESET: + case _CS_DIFF1: + case _CS_DIFF2: + case _CS_DIFF3: + { + /* + Convert collation description from + Locale Data Markup Language (LDML) + into ICU Collation Customization expression. + */ + char arg[16]; + const char *cmd[]= {"&","<","<<","<<<"}; + i->cs.sort_order= i->sort_order; + mstr(arg,attr,len,sizeof(arg)-1); + if (i->sort_order_length + 20 < sizeof(i->sort_order)) + { + char *dst= i->sort_order_length + i->sort_order; + i->sort_order_length+= sprintf(dst," %s %s",cmd[state-_CS_RESET],arg); + } + } } return MY_XML_OK; } |