summaryrefslogtreecommitdiff
path: root/mysys/charset.c
diff options
context:
space:
mode:
authorunknown <bar@bar.intranet.mysql.r18.ru>2004-06-03 17:45:53 +0500
committerunknown <bar@bar.intranet.mysql.r18.ru>2004-06-03 17:45:53 +0500
commitc5f8d17447f9d48ecb98004029c946341c03a6d1 (patch)
treed2caac6cf74f71de0ad9187035aaebb5214b4309 /mysys/charset.c
parent7d10bea2537d839476fbeab54f47513959f6031d (diff)
downloadmariadb-git-c5f8d17447f9d48ecb98004029c946341c03a6d1.tar.gz
Unicode collations: WL#916
XML and "collation customization" language parsers.
Diffstat (limited to 'mysys/charset.c')
-rw-r--r--mysys/charset.c502
1 files changed, 496 insertions, 6 deletions
diff --git a/mysys/charset.c b/mysys/charset.c
index d801fcdbd76..62068beccae 100644
--- a/mysys/charset.c
+++ b/mysys/charset.c
@@ -21,6 +21,344 @@
#include <my_dir.h>
#include <my_xml.h>
+
+/*
+ Collation language is implemented according to
+ subset of ICU Collation Customization (tailorings):
+ http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
+
+ Collation language elements:
+ Delimiters:
+ space - skipped
+
+ <char> := A-Z | a-z | \uXXXX
+
+ Shift command:
+ <shift> := & - reset at this letter.
+
+ Diff command:
+ <d1> := < - Identifies a primary difference.
+ <d2> := << - Identifies a secondary difference.
+ <d3> := <<< - Idenfifies a tertiary difference.
+
+
+ Collation rules:
+ <ruleset> := <rule> { <ruleset> }
+
+ <rule> := <d1> <string>
+ | <d2> <string>
+ | <d3> <string>
+ | <shift> <char>
+
+ <string> := <char> [ <string> ]
+
+ An example, Polish collation:
+
+ &A < \u0105 <<< \u0104
+ &C < \u0107 <<< \u0106
+ &E < \u0119 <<< \u0118
+ &L < \u0142 <<< \u0141
+ &N < \u0144 <<< \u0143
+ &O < \u00F3 <<< \u00D3
+ &S < \u015B <<< \u015A
+ &Z < \u017A <<< \u017B
+*/
+
+
+typedef enum my_coll_lexem_num_en
+{
+ MY_COLL_LEXEM_EOF = 0,
+ MY_COLL_LEXEM_DIFF = 1,
+ MY_COLL_LEXEM_SHIFT = 4,
+ MY_COLL_LEXEM_CHAR = 5,
+ MY_COLL_LEXEM_ERROR = 6
+} my_coll_lexem_num;
+
+
+typedef struct my_coll_lexem_st
+{
+ const char *beg;
+ const char *end;
+ const char *prev;
+ int diff;
+ int code;
+} MY_COLL_LEXEM;
+
+
+/*
+ Initialize collation rule lexical anilizer
+
+ SYNOPSIS
+ my_coll_lexem_init
+ lexem Lex analizer to init
+ str Const string to parse
+ strend End of the string
+ USAGE
+
+ RETURN VALUES
+ N/A
+*/
+
+static void my_coll_lexem_init(MY_COLL_LEXEM *lexem,
+ const char *str, const char *strend)
+{
+ lexem->beg= str;
+ lexem->prev= str;
+ lexem->end= strend;
+ lexem->diff= 0;
+ lexem->code= 0;
+}
+
+
+/*
+ Print collation customization expression parse error, with context.
+
+ SYNOPSIS
+ my_coll_lexem_print_error
+ lexem Lex analizer to take context from
+ errstr sting to write error to
+ errsize errstr size
+ txt error message
+ USAGE
+
+ RETURN VALUES
+ N/A
+*/
+
+static void my_coll_lexem_print_error(MY_COLL_LEXEM *lexem,
+ char *errstr, size_t errsize,
+ const char *txt)
+{
+ char tail[30];
+ size_t len= lexem->end - lexem->prev;
+ strmake (tail, lexem->prev, min(len, sizeof(tail)-1));
+ errstr[errsize-1]= '\0';
+ my_snprintf(errstr,errsize-1,"%s at '%s'", txt, tail);
+}
+
+
+/*
+ Convert a hex digit into its numeric value
+
+ SYNOPSIS
+ ch2x
+ ch hex digit to convert
+ USAGE
+
+ RETURN VALUES
+ an integer value in the range 0..15
+ -1 on error
+*/
+
+static int ch2x(int ch)
+{
+ if (ch >= '0' && ch <= '9')
+ return ch - '0';
+
+ if (ch >= 'a' && ch <= 'f')
+ return 10 + ch - 'a';
+
+ if (ch >= 'A' && ch <= 'Z')
+ return 10 + ch - 'A';
+
+ return -1;
+}
+
+
+/*
+ Collation language lexical parser:
+ Scans the next lexem.
+
+ SYNOPSIS
+ my_coll_lexem_next
+ lexem Lex analizer, previously initialized by
+ my_coll_lexem_init.
+ USAGE
+ Call this function in a loop
+
+ RETURN VALUES
+ Lexem number: eof, diff, shift, char or error.
+*/
+
+static my_coll_lexem_num my_coll_lexem_next(MY_COLL_LEXEM *lexem)
+{
+ for ( ;lexem->beg < lexem->end ; lexem->beg++)
+ {
+ lexem->prev= lexem->beg;
+ if (lexem->beg[0] == ' ' || lexem->beg[0] == '\t' ||
+ lexem->beg[0] == '\r' || lexem->beg[0] == '\n')
+ continue;
+
+ if (lexem->beg[0] == '&')
+ {
+ lexem->beg++;
+ return MY_COLL_LEXEM_SHIFT;
+ }
+
+ if (lexem->beg[0] == '<')
+ {
+ for (lexem->beg++, lexem->diff=1;
+ (lexem->beg < lexem->end) &&
+ (lexem->beg[0] == '<') && (lexem->diff<3);
+ lexem->beg++, lexem->diff++);
+ return MY_COLL_LEXEM_DIFF;
+ }
+
+ if ((lexem->beg[0] >= 'a' && lexem->beg[0] <= 'z') ||
+ (lexem->beg[0] >= 'A' && lexem->beg[0] <= 'Z'))
+ {
+ lexem->code= lexem->beg[0];
+ lexem->beg++;
+ return MY_COLL_LEXEM_CHAR;
+ }
+
+ if ((lexem->beg[0] == '\\') &&
+ (lexem->beg+2 < lexem->end) &&
+ (lexem->beg[1] == 'u'))
+ {
+ int ch;
+
+ lexem->code= 0;
+ for (lexem->beg+=2;
+ (lexem->beg < lexem->end) && ((ch= ch2x(lexem->beg[0])) >= 0) ;
+ lexem->beg++)
+ {
+ lexem->code= (lexem->code << 4) + ch;
+ }
+ return MY_COLL_LEXEM_CHAR;
+ }
+
+ return MY_COLL_LEXEM_ERROR;
+ }
+ return MY_COLL_LEXEM_EOF;
+}
+
+
+/*
+ Collation rule item
+*/
+
+typedef struct my_coll_rule_item_st
+{
+ uint base; /* Base character */
+ uint curr; /* Current character */
+ int diff[3]; /* Primary, Secondary and Tertiary difference */
+} MY_COLL_RULE;
+
+
+/*
+ Collation language syntax parser.
+ Uses lexical parser.
+
+ SYNOPSIS
+ my_coll_rule_parse
+ rule Collation rule list to load to.
+ str A string containin collation language expression.
+ strend End of the string.
+ USAGE
+
+ RETURN VALUES
+ 0 - OK
+ 1 - ERROR, e.g. too many items.
+*/
+
+static int my_coll_rule_parse(MY_COLL_RULE *rule, size_t mitems,
+ const char *str, const char *strend,
+ char *errstr, size_t errsize)
+{
+ MY_COLL_LEXEM lexem;
+ my_coll_lexem_num lexnum;
+ my_coll_lexem_num prevlexnum= MY_COLL_LEXEM_ERROR;
+ MY_COLL_RULE item;
+ int state= 0;
+ size_t nitems= 0;
+
+ /* Init all variables */
+ errstr[0]= '\0';
+ bzero(&item, sizeof(item));
+ my_coll_lexem_init(&lexem, str, strend);
+
+ while ((lexnum= my_coll_lexem_next(&lexem)))
+ {
+ if (lexnum == MY_COLL_LEXEM_ERROR)
+ {
+ my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Unknown character");
+ return -1;
+ }
+
+ switch (state) {
+ case 0:
+ if (lexnum != MY_COLL_LEXEM_SHIFT)
+ {
+ my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& expected");
+ return -1;
+ }
+ prevlexnum= lexnum;
+ state= 2;
+ continue;
+
+ case 1:
+ if (lexnum != MY_COLL_LEXEM_SHIFT && lexnum != MY_COLL_LEXEM_DIFF)
+ {
+ my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& or < expected");
+ return -1;
+ }
+ prevlexnum= lexnum;
+ state= 2;
+ continue;
+
+ case 2:
+ if (lexnum != MY_COLL_LEXEM_CHAR)
+ {
+ my_coll_lexem_print_error(&lexem,errstr,errsize-1,"character expected");
+ return -1;
+ }
+
+ if (prevlexnum == MY_COLL_LEXEM_SHIFT)
+ {
+ item.base= lexem.code;
+ item.diff[0]= 0;
+ item.diff[1]= 0;
+ item.diff[2]= 0;
+ }
+ else if (prevlexnum == MY_COLL_LEXEM_DIFF)
+ {
+ item.curr= lexem.code;
+ if (lexem.diff == 3)
+ {
+ item.diff[2]++;
+ }
+ else if (lexem.diff == 2)
+ {
+ item.diff[1]++;
+ item.diff[2]= 0;
+ }
+ else if (lexem.diff == 1)
+ {
+ item.diff[0]++;
+ item.diff[1]= 0;
+ item.diff[2]= 0;
+ }
+ if (nitems >= mitems)
+ {
+ my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Too many rules");
+ return -1;
+ }
+ rule[nitems++]= item;
+ }
+ else
+ {
+ my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Should never happen");
+ return -1;
+ }
+ state= 1;
+ continue;
+ }
+ }
+ return (size_t) nitems;
+}
+
+
typedef struct
{
int nchars;
@@ -284,6 +622,144 @@ err:
}
+#ifdef HAVE_CHARSET_ucs2
+
+#define MY_MAX_COLL_RULE 64
+
+/*
+ This function copies an UCS2 collation from
+ the default Unicode Collation Algorithm (UCA)
+ weights applying tailorings, i.e. a set of
+ alternative weights for some characters.
+
+ The default UCA weights are stored in my_charset_ucs2_general_uca.
+ They consist of 256 pages, 256 character each.
+
+ If a page is not overwritten by tailoring rules,
+ it is copies as is from UCA as is.
+
+ If a page contains some overwritten characters, it is
+ allocated. Untouched characters are copied from the
+ default weights.
+*/
+
+static int ucs2_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
+{
+ MY_COLL_RULE rule[MY_MAX_COLL_RULE];
+ char errstr[128];
+ uchar *newlengths;
+ uint16 **newweights;
+ const uchar *deflengths= my_charset_ucs2_general_uca.sort_order;
+ uint16 **defweights= my_charset_ucs2_general_uca.sort_order_big;
+ int rc, i;
+
+ to->number= from->number ? from->number : to->number;
+
+ if (from->csname)
+ if (!(to->csname= my_once_strdup(from->csname,MYF(MY_WME))))
+ goto err;
+
+ if (from->name)
+ if (!(to->name= my_once_strdup(from->name,MYF(MY_WME))))
+ goto err;
+
+ if (from->comment)
+ if (!(to->comment= my_once_strdup(from->comment,MYF(MY_WME))))
+ goto err;
+
+ to->strxfrm_multiply= my_charset_ucs2_general_uca.strxfrm_multiply;
+ to->min_sort_char= my_charset_ucs2_general_uca.min_sort_char;
+ to->max_sort_char= my_charset_ucs2_general_uca.max_sort_char;
+ to->mbminlen= 2;
+ to->mbmaxlen= 2;
+
+
+ /* Parse ICU Collation Customization expression */
+ if ((rc= my_coll_rule_parse(rule, MY_MAX_COLL_RULE,
+ from->sort_order,
+ from->sort_order + strlen(from->sort_order),
+ errstr, sizeof(errstr))) <= 0)
+ {
+ /*
+ TODO: add error message reporting.
+ printf("Error: %d '%s'\n", rc, errstr);
+ */
+ return 1;
+ }
+
+
+ if (!(newweights= (uint16**) my_once_alloc(256*sizeof(uint16*),MYF(MY_WME))))
+ goto err;
+ bzero(newweights, 256*sizeof(uint16*));
+
+ if (!(newlengths= (uchar*) my_once_memdup(deflengths,256,MYF(MY_WME))))
+ goto err;
+
+ /*
+ Calculate maximum lenghts for the pages
+ which will be overwritten.
+ */
+ for (i=0; i < rc; i++)
+ {
+ uint pageb= (rule[i].base >> 8) & 0xFF;
+ uint pagec= (rule[i].curr >> 8) & 0xFF;
+
+ if (newlengths[pagec] < deflengths[pageb])
+ newlengths[pagec]= deflengths[pageb];
+ }
+
+ for (i=0; i < rc; i++)
+ {
+ uint pageb= (rule[i].base >> 8) & 0xFF;
+ uint pagec= (rule[i].curr >> 8) & 0xFF;
+ uint chb, chc;
+
+ if (!newweights[pagec])
+ {
+ /* Alloc new page and copy the default UCA weights */
+ uint size= 256*newlengths[pagec]*sizeof(uint16);
+
+ if (!(newweights[pagec]= (uint16*) my_once_alloc(size,MYF(MY_WME))))
+ goto err;
+ bzero((void*) newweights[pagec], size);
+
+ for (chc=0 ; chc < 256; chc++)
+ {
+ memcpy(newweights[pagec] + chc*newlengths[pagec],
+ defweights[pagec] + chc*deflengths[pagec],
+ deflengths[pagec]*sizeof(uint16));
+ }
+ }
+
+ /*
+ Aply the alternative rule:
+ shift to the base character and primary difference.
+ */
+ chc= rule[i].curr & 0xFF;
+ chb= rule[i].base & 0xFF;
+ memcpy(newweights[pagec] + chc*newlengths[pagec],
+ defweights[pageb] + chb*deflengths[pageb],
+ deflengths[pageb]*sizeof(uint16));
+ /* Apply primary difference */
+ newweights[pagec][chc*newlengths[pagec]]+= rule[i].diff[0];
+ }
+
+ /* Copy non-overwritten pages from the default UCA weights */
+ for (i= 0; i < 256 ; i++)
+ if (!newweights[i])
+ newweights[i]= defweights[i];
+
+ to->sort_order= newlengths;
+ to->sort_order_big= newweights;
+
+ return 0;
+
+err:
+ return 1;
+}
+#endif
+
+
static my_bool simple_cs_is_full(CHARSET_INFO *cs)
{
return ((cs->csname && cs->tab_to_uni && cs->ctype && cs->to_upper &&
@@ -315,14 +791,28 @@ static int add_collation(CHARSET_INFO *cs)
if (!(all_charsets[cs->number]->state & MY_CS_COMPILED))
{
- simple_cs_init_functions(all_charsets[cs->number]);
- if (simple_cs_copy_data(all_charsets[cs->number],cs))
- return MY_XML_ERROR;
- if (simple_cs_is_full(all_charsets[cs->number]))
+ if (!strcmp(cs->csname,"ucs2") )
{
- all_charsets[cs->number]->state |= MY_CS_LOADED;
+#ifdef HAVE_CHARSET_ucs2
+ CHARSET_INFO *new= all_charsets[cs->number];
+ new->cset= my_charset_ucs2_general_uca.cset;
+ new->coll= my_charset_ucs2_general_uca.coll;
+ if (ucs2_copy_data(new, cs))
+ return MY_XML_ERROR;
+ new->state |= MY_CS_AVAILABLE | MY_CS_LOADED;
+#endif
+ }
+ else
+ {
+ simple_cs_init_functions(all_charsets[cs->number]);
+ if (simple_cs_copy_data(all_charsets[cs->number],cs))
+ return MY_XML_ERROR;
+ if (simple_cs_is_full(all_charsets[cs->number]))
+ {
+ all_charsets[cs->number]->state |= MY_CS_LOADED;
+ }
+ all_charsets[cs->number]->state|= MY_CS_AVAILABLE;
}
- all_charsets[cs->number]->state|= MY_CS_AVAILABLE;
}
else
{