summaryrefslogtreecommitdiff
path: root/mysys/charset.c
diff options
context:
space:
mode:
authorunknown <bar@mysql.com>2004-06-11 16:29:16 +0500
committerunknown <bar@mysql.com>2004-06-11 16:29:16 +0500
commit5275eb21c25eef0c5b799315da10f26b779fbfe9 (patch)
tree72245220b637b8d5caf3b8267ea6801af59f0bc7 /mysys/charset.c
parent4047b5ade3b02d73c60eac7ddae7e5e928de7fd3 (diff)
downloadmariadb-git-5275eb21c25eef0c5b799315da10f26b779fbfe9.tar.gz
Allocate memory when a character set is requested:
- For simple character sets: from_uni convertion table. - For UCA: alternative weight arrays. Use mbminlen instead of MY_CS_NONTEXT
Diffstat (limited to 'mysys/charset.c')
-rw-r--r--mysys/charset.c562
1 files changed, 14 insertions, 548 deletions
diff --git a/mysys/charset.c b/mysys/charset.c
index d2d71689d7b..165fa19e3d5 100644
--- a/mysys/charset.c
+++ b/mysys/charset.c
@@ -23,354 +23,6 @@
/*
- Collation language is implemented according to
- subset of ICU Collation Customization (tailorings):
- http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
-
- Collation language elements:
- Delimiters:
- space - skipped
-
- <char> := A-Z | a-z | \uXXXX
-
- Shift command:
- <shift> := & - reset at this letter.
-
- Diff command:
- <d1> := < - Identifies a primary difference.
- <d2> := << - Identifies a secondary difference.
- <d3> := <<< - Idenfifies a tertiary difference.
-
-
- Collation rules:
- <ruleset> := <rule> { <ruleset> }
-
- <rule> := <d1> <string>
- | <d2> <string>
- | <d3> <string>
- | <shift> <char>
-
- <string> := <char> [ <string> ]
-
- An example, Polish collation:
-
- &A < \u0105 <<< \u0104
- &C < \u0107 <<< \u0106
- &E < \u0119 <<< \u0118
- &L < \u0142 <<< \u0141
- &N < \u0144 <<< \u0143
- &O < \u00F3 <<< \u00D3
- &S < \u015B <<< \u015A
- &Z < \u017A <<< \u017B
-*/
-
-
-typedef enum my_coll_lexem_num_en
-{
- MY_COLL_LEXEM_EOF = 0,
- MY_COLL_LEXEM_DIFF = 1,
- MY_COLL_LEXEM_SHIFT = 4,
- MY_COLL_LEXEM_CHAR = 5,
- MY_COLL_LEXEM_ERROR = 6
-} my_coll_lexem_num;
-
-
-typedef struct my_coll_lexem_st
-{
- const char *beg;
- const char *end;
- const char *prev;
- int diff;
- int code;
-} MY_COLL_LEXEM;
-
-
-/*
- Initialize collation rule lexical anilizer
-
- SYNOPSIS
- my_coll_lexem_init
- lexem Lex analizer to init
- str Const string to parse
- strend End of the string
- USAGE
-
- RETURN VALUES
- N/A
-*/
-
-static void my_coll_lexem_init(MY_COLL_LEXEM *lexem,
- const char *str, const char *strend)
-{
- lexem->beg= str;
- lexem->prev= str;
- lexem->end= strend;
- lexem->diff= 0;
- lexem->code= 0;
-}
-
-
-/*
- Print collation customization expression parse error, with context.
-
- SYNOPSIS
- my_coll_lexem_print_error
- lexem Lex analizer to take context from
- errstr sting to write error to
- errsize errstr size
- txt error message
- USAGE
-
- RETURN VALUES
- N/A
-*/
-
-static void my_coll_lexem_print_error(MY_COLL_LEXEM *lexem,
- char *errstr, size_t errsize,
- const char *txt)
-{
- char tail[30];
- size_t len= lexem->end - lexem->prev;
- strmake (tail, lexem->prev, min(len, sizeof(tail)-1));
- errstr[errsize-1]= '\0';
- my_snprintf(errstr,errsize-1,"%s at '%s'", txt, tail);
-}
-
-
-/*
- Convert a hex digit into its numeric value
-
- SYNOPSIS
- ch2x
- ch hex digit to convert
- USAGE
-
- RETURN VALUES
- an integer value in the range 0..15
- -1 on error
-*/
-
-static int ch2x(int ch)
-{
- if (ch >= '0' && ch <= '9')
- return ch - '0';
-
- if (ch >= 'a' && ch <= 'f')
- return 10 + ch - 'a';
-
- if (ch >= 'A' && ch <= 'F')
- return 10 + ch - 'A';
-
- return -1;
-}
-
-
-/*
- Collation language lexical parser:
- Scans the next lexem.
-
- SYNOPSIS
- my_coll_lexem_next
- lexem Lex analizer, previously initialized by
- my_coll_lexem_init.
- USAGE
- Call this function in a loop
-
- RETURN VALUES
- Lexem number: eof, diff, shift, char or error.
-*/
-
-static my_coll_lexem_num my_coll_lexem_next(MY_COLL_LEXEM *lexem)
-{
- for ( ;lexem->beg < lexem->end ; lexem->beg++)
- {
- lexem->prev= lexem->beg;
- if (lexem->beg[0] == ' ' || lexem->beg[0] == '\t' ||
- lexem->beg[0] == '\r' || lexem->beg[0] == '\n')
- continue;
-
- if (lexem->beg[0] == '&')
- {
- lexem->beg++;
- return MY_COLL_LEXEM_SHIFT;
- }
-
- if (lexem->beg[0] == '<')
- {
- for (lexem->beg++, lexem->diff=1;
- (lexem->beg < lexem->end) &&
- (lexem->beg[0] == '<') && (lexem->diff<3);
- lexem->beg++, lexem->diff++);
- return MY_COLL_LEXEM_DIFF;
- }
-
- if ((lexem->beg[0] >= 'a' && lexem->beg[0] <= 'z') ||
- (lexem->beg[0] >= 'A' && lexem->beg[0] <= 'Z'))
- {
- lexem->code= lexem->beg[0];
- lexem->beg++;
- return MY_COLL_LEXEM_CHAR;
- }
-
- if ((lexem->beg[0] == '\\') &&
- (lexem->beg+2 < lexem->end) &&
- (lexem->beg[1] == 'u'))
- {
- int ch;
-
- lexem->code= 0;
- for (lexem->beg+=2;
- (lexem->beg < lexem->end) && ((ch= ch2x(lexem->beg[0])) >= 0) ;
- lexem->beg++)
- {
- lexem->code= (lexem->code << 4) + ch;
- }
- return MY_COLL_LEXEM_CHAR;
- }
-
- return MY_COLL_LEXEM_ERROR;
- }
- return MY_COLL_LEXEM_EOF;
-}
-
-
-/*
- Collation rule item
-*/
-
-typedef struct my_coll_rule_item_st
-{
- uint base; /* Base character */
- uint curr; /* Current character */
- int diff[3]; /* Primary, Secondary and Tertiary difference */
-} MY_COLL_RULE;
-
-
-/*
- Collation language syntax parser.
- Uses lexical parser.
-
- SYNOPSIS
- my_coll_rule_parse
- rule Collation rule list to load to.
- str A string containin collation language expression.
- strend End of the string.
- USAGE
-
- RETURN VALUES
- 0 - OK
- 1 - ERROR, e.g. too many items.
-*/
-
-static int my_coll_rule_parse(MY_COLL_RULE *rule, size_t mitems,
- const char *str, const char *strend,
- char *errstr, size_t errsize)
-{
- MY_COLL_LEXEM lexem;
- my_coll_lexem_num lexnum;
- my_coll_lexem_num prevlexnum= MY_COLL_LEXEM_ERROR;
- MY_COLL_RULE item;
- int state= 0;
- size_t nitems= 0;
-
- /* Init all variables */
- errstr[0]= '\0';
- bzero(&item, sizeof(item));
- my_coll_lexem_init(&lexem, str, strend);
-
- while ((lexnum= my_coll_lexem_next(&lexem)))
- {
- if (lexnum == MY_COLL_LEXEM_ERROR)
- {
- my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Unknown character");
- return -1;
- }
-
- switch (state) {
- case 0:
- if (lexnum != MY_COLL_LEXEM_SHIFT)
- {
- my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& expected");
- return -1;
- }
- prevlexnum= lexnum;
- state= 2;
- continue;
-
- case 1:
- if (lexnum != MY_COLL_LEXEM_SHIFT && lexnum != MY_COLL_LEXEM_DIFF)
- {
- my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& or < expected");
- return -1;
- }
- prevlexnum= lexnum;
- state= 2;
- continue;
-
- case 2:
- if (lexnum != MY_COLL_LEXEM_CHAR)
- {
- my_coll_lexem_print_error(&lexem,errstr,errsize-1,"character expected");
- return -1;
- }
-
- if (prevlexnum == MY_COLL_LEXEM_SHIFT)
- {
- item.base= lexem.code;
- item.diff[0]= 0;
- item.diff[1]= 0;
- item.diff[2]= 0;
- }
- else if (prevlexnum == MY_COLL_LEXEM_DIFF)
- {
- item.curr= lexem.code;
- if (lexem.diff == 3)
- {
- item.diff[2]++;
- }
- else if (lexem.diff == 2)
- {
- item.diff[1]++;
- item.diff[2]= 0;
- }
- else if (lexem.diff == 1)
- {
- item.diff[0]++;
- item.diff[1]= 0;
- item.diff[2]= 0;
- }
- if (nitems >= mitems)
- {
- my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Too many rules");
- return -1;
- }
- rule[nitems++]= item;
- }
- else
- {
- my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Should never happen");
- return -1;
- }
- state= 1;
- continue;
- }
- }
- return (size_t) nitems;
-}
-
-
-typedef struct
-{
- int nchars;
- MY_UNI_IDX uidx;
-} uni_idx;
-
-#define PLANE_SIZE 0x100
-#define PLANE_NUM 0x100
-#define PLANE_NUMBER(x) (((x)>>8) % PLANE_NUM)
-
-
-/*
The code below implements this functionality:
- Initializing charset related structures
@@ -484,91 +136,6 @@ static void simple_cs_init_functions(CHARSET_INFO *cs)
}
-static int pcmp(const void * f, const void * s)
-{
- const uni_idx *F= (const uni_idx*) f;
- const uni_idx *S= (const uni_idx*) s;
- int res;
-
- if (!(res=((S->nchars)-(F->nchars))))
- res=((F->uidx.from)-(S->uidx.to));
- return res;
-}
-
-
-static my_bool create_fromuni(CHARSET_INFO *cs)
-{
- uni_idx idx[PLANE_NUM];
- int i,n;
-
- /* Clear plane statistics */
- bzero(idx,sizeof(idx));
-
- /* Count number of characters in each plane */
- for (i=0; i< 0x100; i++)
- {
- uint16 wc=cs->tab_to_uni[i];
- int pl= PLANE_NUMBER(wc);
-
- if (wc || !i)
- {
- if (!idx[pl].nchars)
- {
- idx[pl].uidx.from=wc;
- idx[pl].uidx.to=wc;
- }else
- {
- idx[pl].uidx.from=wc<idx[pl].uidx.from?wc:idx[pl].uidx.from;
- idx[pl].uidx.to=wc>idx[pl].uidx.to?wc:idx[pl].uidx.to;
- }
- idx[pl].nchars++;
- }
- }
-
- /* Sort planes in descending order */
- qsort(&idx,PLANE_NUM,sizeof(uni_idx),&pcmp);
-
- for (i=0; i < PLANE_NUM; i++)
- {
- int ch,numchars;
-
- /* Skip empty plane */
- if (!idx[i].nchars)
- break;
-
- numchars=idx[i].uidx.to-idx[i].uidx.from+1;
- if (!(idx[i].uidx.tab=(uchar*) my_once_alloc(numchars *
- sizeof(*idx[i].uidx.tab),
- MYF(MY_WME))))
- return TRUE;
-
- bzero(idx[i].uidx.tab,numchars*sizeof(*idx[i].uidx.tab));
-
- for (ch=1; ch < PLANE_SIZE; ch++)
- {
- uint16 wc=cs->tab_to_uni[ch];
- if (wc >= idx[i].uidx.from && wc <= idx[i].uidx.to && wc)
- {
- int ofs= wc - idx[i].uidx.from;
- idx[i].uidx.tab[ofs]= ch;
- }
- }
- }
-
- /* Allocate and fill reverse table for each plane */
- n=i;
- if (!(cs->tab_from_uni= (MY_UNI_IDX*) my_once_alloc(sizeof(MY_UNI_IDX)*(n+1),
- MYF(MY_WME))))
- return TRUE;
-
- for (i=0; i< n; i++)
- cs->tab_from_uni[i]= idx[i].uidx;
-
- /* Set end-of-list marker */
- bzero(&cs->tab_from_uni[i],sizeof(MY_UNI_IDX));
- return FALSE;
-}
-
static int simple_cs_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
{
@@ -622,8 +189,6 @@ static int simple_cs_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
if (!(to->tab_to_uni= (uint16*) my_once_memdup((char*)from->tab_to_uni,
sz, MYF(MY_WME))))
goto err;
- if (create_fromuni(to))
- goto err;
}
to->mbminlen= 1;
to->mbmaxlen= 1;
@@ -754,117 +319,6 @@ static my_tailoring tailoring[]=
}
};
-#define MY_MAX_COLL_RULE 64
-
-/*
- This function copies an UCS2 collation from
- the default Unicode Collation Algorithm (UCA)
- weights applying tailorings, i.e. a set of
- alternative weights for some characters.
-
- The default UCA weights are stored in my_charset_ucs2_general_uca.
- They consist of 256 pages, 256 character each.
-
- If a page is not overwritten by tailoring rules,
- it is copies as is from UCA as is.
-
- If a page contains some overwritten characters, it is
- allocated. Untouched characters are copied from the
- default weights.
-*/
-
-static my_bool create_tailoring(CHARSET_INFO *cs)
-{
- MY_COLL_RULE rule[MY_MAX_COLL_RULE];
- char errstr[128];
- uchar *newlengths;
- uint16 **newweights;
- const uchar *deflengths= my_charset_ucs2_general_uca.sort_order;
- uint16 **defweights= my_charset_ucs2_general_uca.sort_order_big;
- int rc, i;
-
- if (!cs->tailoring)
- return 1;
-
- /* Parse ICU Collation Customization expression */
- if ((rc= my_coll_rule_parse(rule, MY_MAX_COLL_RULE,
- cs->tailoring,
- cs->tailoring + strlen(cs->tailoring),
- errstr, sizeof(errstr))) <= 0)
- {
- /*
- TODO: add error message reporting.
- printf("Error: %d '%s'\n", rc, errstr);
- */
- return 1;
- }
-
- if (!(newweights= (uint16**) my_once_alloc(256*sizeof(uint16*),MYF(MY_WME))))
- return 1;
- bzero(newweights, 256*sizeof(uint16*));
-
- if (!(newlengths= (uchar*) my_once_memdup(deflengths,256,MYF(MY_WME))))
- return 1;
-
- /*
- Calculate maximum lenghts for the pages
- which will be overwritten.
- */
- for (i=0; i < rc; i++)
- {
- uint pageb= (rule[i].base >> 8) & 0xFF;
- uint pagec= (rule[i].curr >> 8) & 0xFF;
-
- if (newlengths[pagec] < deflengths[pageb])
- newlengths[pagec]= deflengths[pageb];
- }
-
- for (i=0; i < rc; i++)
- {
- uint pageb= (rule[i].base >> 8) & 0xFF;
- uint pagec= (rule[i].curr >> 8) & 0xFF;
- uint chb, chc;
-
- if (!newweights[pagec])
- {
- /* Alloc new page and copy the default UCA weights */
- uint size= 256*newlengths[pagec]*sizeof(uint16);
-
- if (!(newweights[pagec]= (uint16*) my_once_alloc(size,MYF(MY_WME))))
- return 1;
- bzero((void*) newweights[pagec], size);
-
- for (chc=0 ; chc < 256; chc++)
- {
- memcpy(newweights[pagec] + chc*newlengths[pagec],
- defweights[pagec] + chc*deflengths[pagec],
- deflengths[pagec]*sizeof(uint16));
- }
- }
-
- /*
- Aply the alternative rule:
- shift to the base character and primary difference.
- */
- chc= rule[i].curr & 0xFF;
- chb= rule[i].base & 0xFF;
- memcpy(newweights[pagec] + chc*newlengths[pagec],
- defweights[pageb] + chb*deflengths[pageb],
- deflengths[pageb]*sizeof(uint16));
- /* Apply primary difference */
- newweights[pagec][chc*newlengths[pagec]]+= rule[i].diff[0];
- }
-
- /* Copy non-overwritten pages from the default UCA weights */
- for (i= 0; i < 256 ; i++)
- if (!newweights[i])
- newweights[i]= defweights[i];
-
- cs->sort_order= newlengths;
- cs->sort_order_big= newweights;
-
- return 0;
-}
static int ucs2_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
@@ -894,7 +348,7 @@ static int ucs2_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
to->mbminlen= 2;
to->mbmaxlen= 2;
- return create_tailoring(to);
+ return 0;
err:
return 1;
@@ -997,7 +451,7 @@ static my_bool init_uca_charsets()
CHARSET_INFO cs= my_charset_ucs2_general_uca;
char name[64];
- cs.state= MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONTEXT;
+ cs.state= MY_CS_STRNXFRM|MY_CS_UNICODE;
for (t= tailoring; t->tailoring; t++)
{
cs.number= 128 + t->number;
@@ -1083,6 +537,10 @@ void add_compiled_collation(CHARSET_INFO *cs)
cs->state|= MY_CS_AVAILABLE;
}
+static void *cs_alloc(uint size)
+{
+ return my_once_alloc(size, MYF(MY_WME));
+}
#ifdef __NETWARE__
@@ -1207,6 +665,14 @@ static CHARSET_INFO *get_internal_charset(uint cs_number, myf flags)
cs= (cs->state & MY_CS_AVAILABLE) ? cs : NULL;
}
pthread_mutex_unlock(&THR_LOCK_charset);
+ if (cs && !(cs->state & MY_CS_READY))
+ {
+ if ((cs->cset->init && cs->cset->init(cs, cs_alloc)) ||
+ (cs->coll->init && cs->coll->init(cs, cs_alloc)))
+ cs= NULL;
+ else
+ cs->state|= MY_CS_READY;
+ }
return cs;
}