diff options
author | unknown <bar@mysql.com> | 2004-06-11 16:29:16 +0500 |
---|---|---|
committer | unknown <bar@mysql.com> | 2004-06-11 16:29:16 +0500 |
commit | 5275eb21c25eef0c5b799315da10f26b779fbfe9 (patch) | |
tree | 72245220b637b8d5caf3b8267ea6801af59f0bc7 /mysys/charset.c | |
parent | 4047b5ade3b02d73c60eac7ddae7e5e928de7fd3 (diff) | |
download | mariadb-git-5275eb21c25eef0c5b799315da10f26b779fbfe9.tar.gz |
Allocate memory when a character set is requested:
- For simple character sets: from_uni convertion table.
- For UCA: alternative weight arrays.
Use mbminlen instead of MY_CS_NONTEXT
Diffstat (limited to 'mysys/charset.c')
-rw-r--r-- | mysys/charset.c | 562 |
1 files changed, 14 insertions, 548 deletions
diff --git a/mysys/charset.c b/mysys/charset.c index d2d71689d7b..165fa19e3d5 100644 --- a/mysys/charset.c +++ b/mysys/charset.c @@ -23,354 +23,6 @@ /* - Collation language is implemented according to - subset of ICU Collation Customization (tailorings): - http://oss.software.ibm.com/icu/userguide/Collate_Customization.html - - Collation language elements: - Delimiters: - space - skipped - - <char> := A-Z | a-z | \uXXXX - - Shift command: - <shift> := & - reset at this letter. - - Diff command: - <d1> := < - Identifies a primary difference. - <d2> := << - Identifies a secondary difference. - <d3> := <<< - Idenfifies a tertiary difference. - - - Collation rules: - <ruleset> := <rule> { <ruleset> } - - <rule> := <d1> <string> - | <d2> <string> - | <d3> <string> - | <shift> <char> - - <string> := <char> [ <string> ] - - An example, Polish collation: - - &A < \u0105 <<< \u0104 - &C < \u0107 <<< \u0106 - &E < \u0119 <<< \u0118 - &L < \u0142 <<< \u0141 - &N < \u0144 <<< \u0143 - &O < \u00F3 <<< \u00D3 - &S < \u015B <<< \u015A - &Z < \u017A <<< \u017B -*/ - - -typedef enum my_coll_lexem_num_en -{ - MY_COLL_LEXEM_EOF = 0, - MY_COLL_LEXEM_DIFF = 1, - MY_COLL_LEXEM_SHIFT = 4, - MY_COLL_LEXEM_CHAR = 5, - MY_COLL_LEXEM_ERROR = 6 -} my_coll_lexem_num; - - -typedef struct my_coll_lexem_st -{ - const char *beg; - const char *end; - const char *prev; - int diff; - int code; -} MY_COLL_LEXEM; - - -/* - Initialize collation rule lexical anilizer - - SYNOPSIS - my_coll_lexem_init - lexem Lex analizer to init - str Const string to parse - strend End of the string - USAGE - - RETURN VALUES - N/A -*/ - -static void my_coll_lexem_init(MY_COLL_LEXEM *lexem, - const char *str, const char *strend) -{ - lexem->beg= str; - lexem->prev= str; - lexem->end= strend; - lexem->diff= 0; - lexem->code= 0; -} - - -/* - Print collation customization expression parse error, with context. - - SYNOPSIS - my_coll_lexem_print_error - lexem Lex analizer to take context from - errstr sting to write error to - errsize errstr size - txt error message - USAGE - - RETURN VALUES - N/A -*/ - -static void my_coll_lexem_print_error(MY_COLL_LEXEM *lexem, - char *errstr, size_t errsize, - const char *txt) -{ - char tail[30]; - size_t len= lexem->end - lexem->prev; - strmake (tail, lexem->prev, min(len, sizeof(tail)-1)); - errstr[errsize-1]= '\0'; - my_snprintf(errstr,errsize-1,"%s at '%s'", txt, tail); -} - - -/* - Convert a hex digit into its numeric value - - SYNOPSIS - ch2x - ch hex digit to convert - USAGE - - RETURN VALUES - an integer value in the range 0..15 - -1 on error -*/ - -static int ch2x(int ch) -{ - if (ch >= '0' && ch <= '9') - return ch - '0'; - - if (ch >= 'a' && ch <= 'f') - return 10 + ch - 'a'; - - if (ch >= 'A' && ch <= 'F') - return 10 + ch - 'A'; - - return -1; -} - - -/* - Collation language lexical parser: - Scans the next lexem. - - SYNOPSIS - my_coll_lexem_next - lexem Lex analizer, previously initialized by - my_coll_lexem_init. - USAGE - Call this function in a loop - - RETURN VALUES - Lexem number: eof, diff, shift, char or error. -*/ - -static my_coll_lexem_num my_coll_lexem_next(MY_COLL_LEXEM *lexem) -{ - for ( ;lexem->beg < lexem->end ; lexem->beg++) - { - lexem->prev= lexem->beg; - if (lexem->beg[0] == ' ' || lexem->beg[0] == '\t' || - lexem->beg[0] == '\r' || lexem->beg[0] == '\n') - continue; - - if (lexem->beg[0] == '&') - { - lexem->beg++; - return MY_COLL_LEXEM_SHIFT; - } - - if (lexem->beg[0] == '<') - { - for (lexem->beg++, lexem->diff=1; - (lexem->beg < lexem->end) && - (lexem->beg[0] == '<') && (lexem->diff<3); - lexem->beg++, lexem->diff++); - return MY_COLL_LEXEM_DIFF; - } - - if ((lexem->beg[0] >= 'a' && lexem->beg[0] <= 'z') || - (lexem->beg[0] >= 'A' && lexem->beg[0] <= 'Z')) - { - lexem->code= lexem->beg[0]; - lexem->beg++; - return MY_COLL_LEXEM_CHAR; - } - - if ((lexem->beg[0] == '\\') && - (lexem->beg+2 < lexem->end) && - (lexem->beg[1] == 'u')) - { - int ch; - - lexem->code= 0; - for (lexem->beg+=2; - (lexem->beg < lexem->end) && ((ch= ch2x(lexem->beg[0])) >= 0) ; - lexem->beg++) - { - lexem->code= (lexem->code << 4) + ch; - } - return MY_COLL_LEXEM_CHAR; - } - - return MY_COLL_LEXEM_ERROR; - } - return MY_COLL_LEXEM_EOF; -} - - -/* - Collation rule item -*/ - -typedef struct my_coll_rule_item_st -{ - uint base; /* Base character */ - uint curr; /* Current character */ - int diff[3]; /* Primary, Secondary and Tertiary difference */ -} MY_COLL_RULE; - - -/* - Collation language syntax parser. - Uses lexical parser. - - SYNOPSIS - my_coll_rule_parse - rule Collation rule list to load to. - str A string containin collation language expression. - strend End of the string. - USAGE - - RETURN VALUES - 0 - OK - 1 - ERROR, e.g. too many items. -*/ - -static int my_coll_rule_parse(MY_COLL_RULE *rule, size_t mitems, - const char *str, const char *strend, - char *errstr, size_t errsize) -{ - MY_COLL_LEXEM lexem; - my_coll_lexem_num lexnum; - my_coll_lexem_num prevlexnum= MY_COLL_LEXEM_ERROR; - MY_COLL_RULE item; - int state= 0; - size_t nitems= 0; - - /* Init all variables */ - errstr[0]= '\0'; - bzero(&item, sizeof(item)); - my_coll_lexem_init(&lexem, str, strend); - - while ((lexnum= my_coll_lexem_next(&lexem))) - { - if (lexnum == MY_COLL_LEXEM_ERROR) - { - my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Unknown character"); - return -1; - } - - switch (state) { - case 0: - if (lexnum != MY_COLL_LEXEM_SHIFT) - { - my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& expected"); - return -1; - } - prevlexnum= lexnum; - state= 2; - continue; - - case 1: - if (lexnum != MY_COLL_LEXEM_SHIFT && lexnum != MY_COLL_LEXEM_DIFF) - { - my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& or < expected"); - return -1; - } - prevlexnum= lexnum; - state= 2; - continue; - - case 2: - if (lexnum != MY_COLL_LEXEM_CHAR) - { - my_coll_lexem_print_error(&lexem,errstr,errsize-1,"character expected"); - return -1; - } - - if (prevlexnum == MY_COLL_LEXEM_SHIFT) - { - item.base= lexem.code; - item.diff[0]= 0; - item.diff[1]= 0; - item.diff[2]= 0; - } - else if (prevlexnum == MY_COLL_LEXEM_DIFF) - { - item.curr= lexem.code; - if (lexem.diff == 3) - { - item.diff[2]++; - } - else if (lexem.diff == 2) - { - item.diff[1]++; - item.diff[2]= 0; - } - else if (lexem.diff == 1) - { - item.diff[0]++; - item.diff[1]= 0; - item.diff[2]= 0; - } - if (nitems >= mitems) - { - my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Too many rules"); - return -1; - } - rule[nitems++]= item; - } - else - { - my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Should never happen"); - return -1; - } - state= 1; - continue; - } - } - return (size_t) nitems; -} - - -typedef struct -{ - int nchars; - MY_UNI_IDX uidx; -} uni_idx; - -#define PLANE_SIZE 0x100 -#define PLANE_NUM 0x100 -#define PLANE_NUMBER(x) (((x)>>8) % PLANE_NUM) - - -/* The code below implements this functionality: - Initializing charset related structures @@ -484,91 +136,6 @@ static void simple_cs_init_functions(CHARSET_INFO *cs) } -static int pcmp(const void * f, const void * s) -{ - const uni_idx *F= (const uni_idx*) f; - const uni_idx *S= (const uni_idx*) s; - int res; - - if (!(res=((S->nchars)-(F->nchars)))) - res=((F->uidx.from)-(S->uidx.to)); - return res; -} - - -static my_bool create_fromuni(CHARSET_INFO *cs) -{ - uni_idx idx[PLANE_NUM]; - int i,n; - - /* Clear plane statistics */ - bzero(idx,sizeof(idx)); - - /* Count number of characters in each plane */ - for (i=0; i< 0x100; i++) - { - uint16 wc=cs->tab_to_uni[i]; - int pl= PLANE_NUMBER(wc); - - if (wc || !i) - { - if (!idx[pl].nchars) - { - idx[pl].uidx.from=wc; - idx[pl].uidx.to=wc; - }else - { - idx[pl].uidx.from=wc<idx[pl].uidx.from?wc:idx[pl].uidx.from; - idx[pl].uidx.to=wc>idx[pl].uidx.to?wc:idx[pl].uidx.to; - } - idx[pl].nchars++; - } - } - - /* Sort planes in descending order */ - qsort(&idx,PLANE_NUM,sizeof(uni_idx),&pcmp); - - for (i=0; i < PLANE_NUM; i++) - { - int ch,numchars; - - /* Skip empty plane */ - if (!idx[i].nchars) - break; - - numchars=idx[i].uidx.to-idx[i].uidx.from+1; - if (!(idx[i].uidx.tab=(uchar*) my_once_alloc(numchars * - sizeof(*idx[i].uidx.tab), - MYF(MY_WME)))) - return TRUE; - - bzero(idx[i].uidx.tab,numchars*sizeof(*idx[i].uidx.tab)); - - for (ch=1; ch < PLANE_SIZE; ch++) - { - uint16 wc=cs->tab_to_uni[ch]; - if (wc >= idx[i].uidx.from && wc <= idx[i].uidx.to && wc) - { - int ofs= wc - idx[i].uidx.from; - idx[i].uidx.tab[ofs]= ch; - } - } - } - - /* Allocate and fill reverse table for each plane */ - n=i; - if (!(cs->tab_from_uni= (MY_UNI_IDX*) my_once_alloc(sizeof(MY_UNI_IDX)*(n+1), - MYF(MY_WME)))) - return TRUE; - - for (i=0; i< n; i++) - cs->tab_from_uni[i]= idx[i].uidx; - - /* Set end-of-list marker */ - bzero(&cs->tab_from_uni[i],sizeof(MY_UNI_IDX)); - return FALSE; -} - static int simple_cs_copy_data(CHARSET_INFO *to, CHARSET_INFO *from) { @@ -622,8 +189,6 @@ static int simple_cs_copy_data(CHARSET_INFO *to, CHARSET_INFO *from) if (!(to->tab_to_uni= (uint16*) my_once_memdup((char*)from->tab_to_uni, sz, MYF(MY_WME)))) goto err; - if (create_fromuni(to)) - goto err; } to->mbminlen= 1; to->mbmaxlen= 1; @@ -754,117 +319,6 @@ static my_tailoring tailoring[]= } }; -#define MY_MAX_COLL_RULE 64 - -/* - This function copies an UCS2 collation from - the default Unicode Collation Algorithm (UCA) - weights applying tailorings, i.e. a set of - alternative weights for some characters. - - The default UCA weights are stored in my_charset_ucs2_general_uca. - They consist of 256 pages, 256 character each. - - If a page is not overwritten by tailoring rules, - it is copies as is from UCA as is. - - If a page contains some overwritten characters, it is - allocated. Untouched characters are copied from the - default weights. -*/ - -static my_bool create_tailoring(CHARSET_INFO *cs) -{ - MY_COLL_RULE rule[MY_MAX_COLL_RULE]; - char errstr[128]; - uchar *newlengths; - uint16 **newweights; - const uchar *deflengths= my_charset_ucs2_general_uca.sort_order; - uint16 **defweights= my_charset_ucs2_general_uca.sort_order_big; - int rc, i; - - if (!cs->tailoring) - return 1; - - /* Parse ICU Collation Customization expression */ - if ((rc= my_coll_rule_parse(rule, MY_MAX_COLL_RULE, - cs->tailoring, - cs->tailoring + strlen(cs->tailoring), - errstr, sizeof(errstr))) <= 0) - { - /* - TODO: add error message reporting. - printf("Error: %d '%s'\n", rc, errstr); - */ - return 1; - } - - if (!(newweights= (uint16**) my_once_alloc(256*sizeof(uint16*),MYF(MY_WME)))) - return 1; - bzero(newweights, 256*sizeof(uint16*)); - - if (!(newlengths= (uchar*) my_once_memdup(deflengths,256,MYF(MY_WME)))) - return 1; - - /* - Calculate maximum lenghts for the pages - which will be overwritten. - */ - for (i=0; i < rc; i++) - { - uint pageb= (rule[i].base >> 8) & 0xFF; - uint pagec= (rule[i].curr >> 8) & 0xFF; - - if (newlengths[pagec] < deflengths[pageb]) - newlengths[pagec]= deflengths[pageb]; - } - - for (i=0; i < rc; i++) - { - uint pageb= (rule[i].base >> 8) & 0xFF; - uint pagec= (rule[i].curr >> 8) & 0xFF; - uint chb, chc; - - if (!newweights[pagec]) - { - /* Alloc new page and copy the default UCA weights */ - uint size= 256*newlengths[pagec]*sizeof(uint16); - - if (!(newweights[pagec]= (uint16*) my_once_alloc(size,MYF(MY_WME)))) - return 1; - bzero((void*) newweights[pagec], size); - - for (chc=0 ; chc < 256; chc++) - { - memcpy(newweights[pagec] + chc*newlengths[pagec], - defweights[pagec] + chc*deflengths[pagec], - deflengths[pagec]*sizeof(uint16)); - } - } - - /* - Aply the alternative rule: - shift to the base character and primary difference. - */ - chc= rule[i].curr & 0xFF; - chb= rule[i].base & 0xFF; - memcpy(newweights[pagec] + chc*newlengths[pagec], - defweights[pageb] + chb*deflengths[pageb], - deflengths[pageb]*sizeof(uint16)); - /* Apply primary difference */ - newweights[pagec][chc*newlengths[pagec]]+= rule[i].diff[0]; - } - - /* Copy non-overwritten pages from the default UCA weights */ - for (i= 0; i < 256 ; i++) - if (!newweights[i]) - newweights[i]= defweights[i]; - - cs->sort_order= newlengths; - cs->sort_order_big= newweights; - - return 0; -} static int ucs2_copy_data(CHARSET_INFO *to, CHARSET_INFO *from) @@ -894,7 +348,7 @@ static int ucs2_copy_data(CHARSET_INFO *to, CHARSET_INFO *from) to->mbminlen= 2; to->mbmaxlen= 2; - return create_tailoring(to); + return 0; err: return 1; @@ -997,7 +451,7 @@ static my_bool init_uca_charsets() CHARSET_INFO cs= my_charset_ucs2_general_uca; char name[64]; - cs.state= MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONTEXT; + cs.state= MY_CS_STRNXFRM|MY_CS_UNICODE; for (t= tailoring; t->tailoring; t++) { cs.number= 128 + t->number; @@ -1083,6 +537,10 @@ void add_compiled_collation(CHARSET_INFO *cs) cs->state|= MY_CS_AVAILABLE; } +static void *cs_alloc(uint size) +{ + return my_once_alloc(size, MYF(MY_WME)); +} #ifdef __NETWARE__ @@ -1207,6 +665,14 @@ static CHARSET_INFO *get_internal_charset(uint cs_number, myf flags) cs= (cs->state & MY_CS_AVAILABLE) ? cs : NULL; } pthread_mutex_unlock(&THR_LOCK_charset); + if (cs && !(cs->state & MY_CS_READY)) + { + if ((cs->cset->init && cs->cset->init(cs, cs_alloc)) || + (cs->coll->init && cs->coll->init(cs, cs_alloc))) + cs= NULL; + else + cs->state|= MY_CS_READY; + } return cs; } |