summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorunknown <tomas@mc05.(none)>2004-06-03 14:45:54 +0200
committerunknown <tomas@mc05.(none)>2004-06-03 14:45:54 +0200
commit3b2201c837ec2c329dc905b7da3e4f06c926afe6 (patch)
tree84e98cdd8485c4571584925f280a134aa241ad45
parent8dd79895e5c6edf7d65049a2b62cf81b69a5beb0 (diff)
parent5a2b1ba6d06603dd7300046d61dc9b02ccb9a134 (diff)
downloadmariadb-git-3b2201c837ec2c329dc905b7da3e4f06c926afe6.tar.gz
Merge tulin@bk-internal.mysql.com:/home/bk/mysql-4.1
into mc05.(none):/space2/tomas/mysql-4.1-ndb-test
-rw-r--r--mysys/charset.c502
-rw-r--r--strings/ctype.c53
2 files changed, 547 insertions, 8 deletions
diff --git a/mysys/charset.c b/mysys/charset.c
index d801fcdbd76..62068beccae 100644
--- a/mysys/charset.c
+++ b/mysys/charset.c
@@ -21,6 +21,344 @@
#include <my_dir.h>
#include <my_xml.h>
+
+/*
+ The collation language is implemented according to
+ a subset of ICU Collation Customization (tailorings):
+ http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
+
+ Collation language elements:
+ Delimiters:
+ space - skipped
+
+ <char> := A-Z | a-z | \uXXXX
+
+ Shift command:
+ <shift> := & - reset at this letter.
+
+ Diff command:
+ <d1> := < - Identifies a primary difference.
+ <d2> := << - Identifies a secondary difference.
+ <d3> := <<< - Identifies a tertiary difference.
+
+
+ Collation rules:
+ <ruleset> := <rule> { <ruleset> }
+
+ <rule> := <d1> <string>
+ | <d2> <string>
+ | <d3> <string>
+ | <shift> <char>
+
+ <string> := <char> [ <string> ]
+
+ An example, Polish collation:
+
+ &A < \u0105 <<< \u0104
+ &C < \u0107 <<< \u0106
+ &E < \u0119 <<< \u0118
+ &L < \u0142 <<< \u0141
+ &N < \u0144 <<< \u0143
+ &O < \u00F3 <<< \u00D3
+ &S < \u015B <<< \u015A
+ &Z < \u017A <<< \u017B
+*/
+
+
+typedef enum my_coll_lexem_num_en
+{
+ MY_COLL_LEXEM_EOF = 0, /* end of input; zero, so a while() over the lexer stops on it */
+ MY_COLL_LEXEM_DIFF = 1, /* '<', '<<' or '<<<' */
+ MY_COLL_LEXEM_SHIFT = 4, /* '&' */
+ MY_COLL_LEXEM_CHAR = 5, /* A-Z, a-z or a \u hex escape */
+ MY_COLL_LEXEM_ERROR = 6 /* any other input */
+} my_coll_lexem_num;
+
+
+typedef struct my_coll_lexem_st
+{
+ const char *beg; /* Current scan position; advanced past each lexem */
+ const char *end; /* End of the input string */
+ const char *prev; /* Start of the most recently scanned lexem (used as error context) */
+ int diff; /* Difference level (1..3) of the last diff lexem */
+ int code; /* Character code of the last char lexem */
+} MY_COLL_LEXEM;
+
+
+/*
+ Initialize collation rule lexical analyzer
+
+ SYNOPSIS
+ my_coll_lexem_init
+ lexem Lex analyzer to init
+ str Const string to parse
+ strend End of the string
+ USAGE
+
+ RETURN VALUES
+ N/A
+*/
+
+static void my_coll_lexem_init(MY_COLL_LEXEM *lexem,
+ const char *str, const char *strend)
+{
+ lexem->beg= str; /* scanning starts at the first byte */
+ lexem->prev= str; /* nothing scanned yet: error context is the start */
+ lexem->end= strend; /* scanning stops here */
+ lexem->diff= 0; /* no diff lexem seen yet */
+ lexem->code= 0; /* no char lexem seen yet */
+}
+
+
+/*
+ Print collation customization expression parse error, with context.
+
+ SYNOPSIS
+ my_coll_lexem_print_error
+ lexem Lex analyzer to take context from
+ errstr string to write error to
+ errsize errstr size
+ txt error message
+ USAGE
+
+ RETURN VALUES
+ N/A
+*/
+
+static void my_coll_lexem_print_error(MY_COLL_LEXEM *lexem,
+ char *errstr, size_t errsize,
+ const char *txt)
+{
+ char tail[30]; /* excerpt of the input starting at the offending lexem */
+ size_t len= lexem->end - lexem->prev; /* bytes from the last lexem start to the end */
+ strmake (tail, lexem->prev, min(len, sizeof(tail)-1)); /* at most 29 bytes; presumably NUL-terminated by strmake - confirm */
+ errstr[errsize-1]= '\0'; /* NOTE(review): indexes errsize-1, so errsize must be > 0 */
+ my_snprintf(errstr,errsize-1,"%s at '%s'", txt, tail); /* message format: <txt> at '<tail>' */
+}
+
+
+/*
+ Convert a hex digit into its numeric value
+
+ SYNOPSIS
+ ch2x
+ ch hex digit to convert
+ USAGE
+
+ RETURN VALUES
+ an integer value in the range 0..15
+ -1 on error
+*/
+
+static int ch2x(int ch)
+{
+ if (ch >= '0' && ch <= '9')
+ return ch - '0';
+
+ if (ch >= 'a' && ch <= 'f')
+ return 10 + ch - 'a';
+
+ if (ch >= 'A' && ch <= 'F') /* upper bound is 'F': 'G'..'Z' are not hex digits */
+ return 10 + ch - 'A';
+
+ return -1; /* not a hex digit */
+}
+
+
+/*
+ Collation language lexical parser:
+ Scans the next lexem.
+
+ SYNOPSIS
+ my_coll_lexem_next
+ lexem Lex analyzer, previously initialized by
+ my_coll_lexem_init.
+ USAGE
+ Call this function in a loop
+
+ RETURN VALUES
+ Lexem number: eof, diff, shift, char or error.
+*/
+
+static my_coll_lexem_num my_coll_lexem_next(MY_COLL_LEXEM *lexem)
+{
+ for ( ;lexem->beg < lexem->end ; lexem->beg++)
+ {
+ lexem->prev= lexem->beg; /* remember the lexem start for error messages */
+ if (lexem->beg[0] == ' ' || lexem->beg[0] == '\t' ||
+ lexem->beg[0] == '\r' || lexem->beg[0] == '\n')
+ continue; /* delimiters are skipped */
+
+ if (lexem->beg[0] == '&')
+ {
+ lexem->beg++;
+ return MY_COLL_LEXEM_SHIFT;
+ }
+
+ if (lexem->beg[0] == '<')
+ {
+ for (lexem->beg++, lexem->diff=1; /* count consecutive '<', at most 3 */
+ (lexem->beg < lexem->end) &&
+ (lexem->beg[0] == '<') && (lexem->diff<3);
+ lexem->beg++, lexem->diff++);
+ return MY_COLL_LEXEM_DIFF;
+ }
+
+ if ((lexem->beg[0] >= 'a' && lexem->beg[0] <= 'z') ||
+ (lexem->beg[0] >= 'A' && lexem->beg[0] <= 'Z'))
+ {
+ lexem->code= lexem->beg[0]; /* a plain letter stands for its own code */
+ lexem->beg++;
+ return MY_COLL_LEXEM_CHAR;
+ }
+
+ if ((lexem->beg[0] == '\\') &&
+ (lexem->beg+2 < lexem->end) &&
+ (lexem->beg[1] == 'u'))
+ {
+ int ch;
+ const char *digits= lexem->beg + 2; /* where the hex digits must start */
+ lexem->code= 0;
+ for (lexem->beg+=2;
+ (lexem->beg < lexem->end) && ((ch= ch2x(lexem->beg[0])) >= 0) ;
+ lexem->beg++)
+ {
+ lexem->code= (lexem->code << 4) + ch; /* accumulate one hex digit */
+ }
+ return lexem->beg > digits ? MY_COLL_LEXEM_CHAR : MY_COLL_LEXEM_ERROR; /* an escape without any hex digit is an error */
+ }
+
+ return MY_COLL_LEXEM_ERROR; /* unrecognized character */
+ }
+ return MY_COLL_LEXEM_EOF; /* input exhausted */
+}
+
+
+/*
+ Collation rule item
+*/
+
+typedef struct my_coll_rule_item_st
+{
+ uint base; /* Base character: the one given after '&' */
+ uint curr; /* Current character: the one given after '<', '<<' or '<<<' */
+ int diff[3]; /* Primary, Secondary and Tertiary difference counters since the last '&' */
+} MY_COLL_RULE;
+
+
+/*
+ Collation language syntax parser.
+ Uses lexical parser.
+
+ SYNOPSIS
+ my_coll_rule_parse
+ rule Collation rule list to load to.
+ str A string containing a collation language expression.
+ strend End of the string.
+ USAGE
+
+ RETURN VALUES
+ >= 0 - OK, the number of rule items stored in "rule".
+ -1 - ERROR, e.g. too many items; a message is written to errstr.
+*/
+
+static int my_coll_rule_parse(MY_COLL_RULE *rule, size_t mitems,
+ const char *str, const char *strend,
+ char *errstr, size_t errsize)
+{
+ MY_COLL_LEXEM lexem;
+ my_coll_lexem_num lexnum;
+ my_coll_lexem_num prevlexnum= MY_COLL_LEXEM_ERROR; /* last '&' or '<' command seen */
+ MY_COLL_RULE item; /* rule under construction; copied into rule[] on each diff */
+ int state= 0; /* 0: '&' expected; 1: '&' or '<' expected; 2: character expected */
+ size_t nitems= 0; /* number of entries stored in rule[] so far */
+
+ /* Init all variables */
+ errstr[0]= '\0';
+ bzero(&item, sizeof(item));
+ my_coll_lexem_init(&lexem, str, strend);
+
+ while ((lexnum= my_coll_lexem_next(&lexem))) /* ends on MY_COLL_LEXEM_EOF, which is 0 */
+ {
+ if (lexnum == MY_COLL_LEXEM_ERROR)
+ {
+ my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Unknown character");
+ return -1;
+ }
+
+ switch (state) {
+ case 0: /* start of input: only a reset ('&') may open the rules */
+ if (lexnum != MY_COLL_LEXEM_SHIFT)
+ {
+ my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& expected");
+ return -1;
+ }
+ prevlexnum= lexnum;
+ state= 2;
+ continue;
+
+ case 1: /* after a character: a new command ('&' or '<') must follow */
+ if (lexnum != MY_COLL_LEXEM_SHIFT && lexnum != MY_COLL_LEXEM_DIFF)
+ {
+ my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& or < expected");
+ return -1;
+ }
+ prevlexnum= lexnum;
+ state= 2;
+ continue;
+
+ case 2: /* after a command: its character operand must follow */
+ if (lexnum != MY_COLL_LEXEM_CHAR)
+ {
+ my_coll_lexem_print_error(&lexem,errstr,errsize-1,"character expected");
+ return -1;
+ }
+
+ if (prevlexnum == MY_COLL_LEXEM_SHIFT)
+ {
+ item.base= lexem.code; /* reset: new base, all difference counters cleared */
+ item.diff[0]= 0;
+ item.diff[1]= 0;
+ item.diff[2]= 0;
+ }
+ else if (prevlexnum == MY_COLL_LEXEM_DIFF)
+ {
+ item.curr= lexem.code;
+ if (lexem.diff == 3)
+ {
+ item.diff[2]++; /* tertiary step */
+ }
+ else if (lexem.diff == 2)
+ {
+ item.diff[1]++; /* secondary step resets the tertiary counter */
+ item.diff[2]= 0;
+ }
+ else if (lexem.diff == 1)
+ {
+ item.diff[0]++; /* primary step resets the lower-level counters */
+ item.diff[1]= 0;
+ item.diff[2]= 0;
+ }
+ if (nitems >= mitems) /* rule[] holds at most mitems entries */
+ {
+ my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Too many rules");
+ return -1;
+ }
+ rule[nitems++]= item; /* one output entry per diff command */
+ }
+ else
+ {
+ my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Should never happen");
+ return -1;
+ }
+ state= 1;
+ continue;
+ }
+ }
+ return (size_t) nitems; /* number of stored items, converted to the int return type */
+}
+
+
typedef struct
{
int nchars;
@@ -284,6 +622,144 @@ err:
}
+#ifdef HAVE_CHARSET_ucs2
+
+#define MY_MAX_COLL_RULE 64
+
+/*
+ This function copies an UCS2 collation from
+ the default Unicode Collation Algorithm (UCA)
+ weights applying tailorings, i.e. a set of
+ alternative weights for some characters.
+
+ The default UCA weights are stored in my_charset_ucs2_general_uca.
+ They consist of 256 pages, 256 characters each.
+
+ If a page is not overwritten by tailoring rules,
+ it is copied as is from UCA.
+
+ If a page contains some overwritten characters, it is
+ allocated. Untouched characters are copied from the
+ default weights.
+*/
+
+static int ucs2_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
+{
+ MY_COLL_RULE rule[MY_MAX_COLL_RULE]; /* parsed tailoring rules */
+ char errstr[128]; /* receives the parser's error message (currently not reported) */
+ uchar *newlengths; /* per-page weight length for the new collation */
+ uint16 **newweights; /* per-page weight tables for the new collation */
+ const uchar *deflengths= my_charset_ucs2_general_uca.sort_order; /* default per-page weight lengths */
+ uint16 **defweights= my_charset_ucs2_general_uca.sort_order_big; /* default per-page weight tables */
+ int rc, i;
+
+ to->number= from->number ? from->number : to->number; /* keep the existing number when "from" has none */
+
+ if (from->csname)
+ if (!(to->csname= my_once_strdup(from->csname,MYF(MY_WME))))
+ goto err;
+
+ if (from->name)
+ if (!(to->name= my_once_strdup(from->name,MYF(MY_WME))))
+ goto err;
+
+ if (from->comment)
+ if (!(to->comment= my_once_strdup(from->comment,MYF(MY_WME))))
+ goto err;
+
+ to->strxfrm_multiply= my_charset_ucs2_general_uca.strxfrm_multiply;
+ to->min_sort_char= my_charset_ucs2_general_uca.min_sort_char;
+ to->max_sort_char= my_charset_ucs2_general_uca.max_sort_char;
+ to->mbminlen= 2;
+ to->mbmaxlen= 2;
+
+
+ /* Parse ICU Collation Customization expression */
+ if ((rc= my_coll_rule_parse(rule, MY_MAX_COLL_RULE,
+ from->sort_order, /* here sort_order holds the rule text, read as a NUL-terminated string */
+ from->sort_order + strlen(from->sort_order),
+ errstr, sizeof(errstr))) <= 0)
+ {
+ /*
+ TODO: add error message reporting.
+ printf("Error: %d '%s'\n", rc, errstr);
+ */
+ return 1; /* NOTE(review): an empty rule list (rc == 0) is also reported as failure */
+ }
+
+
+ if (!(newweights= (uint16**) my_once_alloc(256*sizeof(uint16*),MYF(MY_WME))))
+ goto err;
+ bzero(newweights, 256*sizeof(uint16*)); /* NULL marks a page that is not overwritten (yet) */
+
+ if (!(newlengths= (uchar*) my_once_memdup(deflengths,256,MYF(MY_WME))))
+ goto err;
+
+ /*
+ Calculate maximum lengths for the pages
+ which will be overwritten.
+ */
+ for (i=0; i < rc; i++)
+ {
+ uint pageb= (rule[i].base >> 8) & 0xFF; /* page of the base character */
+ uint pagec= (rule[i].curr >> 8) & 0xFF; /* page of the tailored character */
+
+ if (newlengths[pagec] < deflengths[pageb]) /* tailored page must fit the base character's weights */
+ newlengths[pagec]= deflengths[pageb];
+ }
+
+ for (i=0; i < rc; i++)
+ {
+ uint pageb= (rule[i].base >> 8) & 0xFF;
+ uint pagec= (rule[i].curr >> 8) & 0xFF;
+ uint chb, chc;
+
+ if (!newweights[pagec])
+ {
+ /* Alloc new page and copy the default UCA weights */
+ uint size= 256*newlengths[pagec]*sizeof(uint16);
+
+ if (!(newweights[pagec]= (uint16*) my_once_alloc(size,MYF(MY_WME))))
+ goto err;
+ bzero((void*) newweights[pagec], size); /* slots beyond the default length stay zero */
+
+ for (chc=0 ; chc < 256; chc++)
+ {
+ memcpy(newweights[pagec] + chc*newlengths[pagec], /* NOTE(review): assumes defweights[pagec] is non-NULL - confirm */
+ defweights[pagec] + chc*deflengths[pagec],
+ deflengths[pagec]*sizeof(uint16));
+ }
+ }
+
+ /*
+ Apply the alternative rule:
+ shift to the base character and primary difference.
+ */
+ chc= rule[i].curr & 0xFF; /* offset of the tailored character within its page */
+ chb= rule[i].base & 0xFF; /* offset of the base character within its page */
+ memcpy(newweights[pagec] + chc*newlengths[pagec],
+ defweights[pageb] + chb*deflengths[pageb],
+ deflengths[pageb]*sizeof(uint16));
+ /* Apply primary difference */
+ newweights[pagec][chc*newlengths[pagec]]+= rule[i].diff[0]; /* only diff[0] is used; diff[1] and diff[2] are not applied here */
+ }
+
+ /* Copy non-overwritten pages from the default UCA weights */
+ for (i= 0; i < 256 ; i++)
+ if (!newweights[i])
+ newweights[i]= defweights[i]; /* pointer is shared with the default table, not copied */
+
+ to->sort_order= newlengths;
+ to->sort_order_big= newweights;
+
+ return 0;
+
+err:
+ return 1;
+}
+#endif
+
+
static my_bool simple_cs_is_full(CHARSET_INFO *cs)
{
return ((cs->csname && cs->tab_to_uni && cs->ctype && cs->to_upper &&
@@ -315,14 +791,28 @@ static int add_collation(CHARSET_INFO *cs)
if (!(all_charsets[cs->number]->state & MY_CS_COMPILED))
{
- simple_cs_init_functions(all_charsets[cs->number]);
- if (simple_cs_copy_data(all_charsets[cs->number],cs))
- return MY_XML_ERROR;
- if (simple_cs_is_full(all_charsets[cs->number]))
+ if (!strcmp(cs->csname,"ucs2") )
{
- all_charsets[cs->number]->state |= MY_CS_LOADED;
+#ifdef HAVE_CHARSET_ucs2
+ CHARSET_INFO *new= all_charsets[cs->number];
+ new->cset= my_charset_ucs2_general_uca.cset;
+ new->coll= my_charset_ucs2_general_uca.coll;
+ if (ucs2_copy_data(new, cs))
+ return MY_XML_ERROR;
+ new->state |= MY_CS_AVAILABLE | MY_CS_LOADED;
+#endif
+ }
+ else
+ {
+ simple_cs_init_functions(all_charsets[cs->number]);
+ if (simple_cs_copy_data(all_charsets[cs->number],cs))
+ return MY_XML_ERROR;
+ if (simple_cs_is_full(all_charsets[cs->number]))
+ {
+ all_charsets[cs->number]->state |= MY_CS_LOADED;
+ }
+ all_charsets[cs->number]->state|= MY_CS_AVAILABLE;
}
- all_charsets[cs->number]->state|= MY_CS_AVAILABLE;
}
else
{
diff --git a/strings/ctype.c b/strings/ctype.c
index cbd13111b70..44bf20ada5c 100644
--- a/strings/ctype.c
+++ b/strings/ctype.c
@@ -22,6 +22,23 @@
#endif
+/*
+
+ This file implements routines which parse XML based
+ character set and collation description files.
+
+ Unicode collations are encoded according to
+
+ Unicode Technical Standard #35
+ Locale Data Markup Language (LDML)
+ http://www.unicode.org/reports/tr35/
+
+ and converted into ICU string according to
+
+ Collation Customization
+ http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
+
+*/
static char *mstr(char *str,const char *src,uint l1,uint l2)
{
@@ -54,6 +71,11 @@ struct my_cs_file_section_st
#define _CS_PRIMARY_ID 15
#define _CS_BINARY_ID 16
#define _CS_CSDESCRIPT 17
+#define _CS_RESET 18
+#define _CS_DIFF1 19
+#define _CS_DIFF2 20
+#define _CS_DIFF3 21
+
static struct my_cs_file_section_st sec[] =
{
@@ -83,6 +105,10 @@ static struct my_cs_file_section_st sec[] =
{_CS_ORDER, "charsets.charset.collation.order"},
{_CS_FLAG, "charsets.charset.collation.flag"},
{_CS_COLLMAP, "charsets.charset.collation.map"},
+ {_CS_RESET, "charsets.charset.collation.rules.reset"},
+ {_CS_DIFF1, "charsets.charset.collation.rules.p"},
+ {_CS_DIFF2, "charsets.charset.collation.rules.s"},
+ {_CS_DIFF3, "charsets.charset.collation.rules.t"},
{0, NULL}
};
@@ -109,6 +135,7 @@ typedef struct my_cs_file_info
uchar sort_order[MY_CS_SORT_ORDER_TABLE_SIZE];
uint16 tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE];
char comment[MY_CS_CSDESCR_SIZE];
+ size_t sort_order_length;
CHARSET_INFO cs;
int (*add_collation)(CHARSET_INFO *cs);
} MY_CHARSET_LOADER;
@@ -156,9 +183,11 @@ static int cs_enter(MY_XML_PARSER *st,const char *attr, uint len)
struct my_cs_file_section_st *s= cs_file_sec(attr,len);
if ( s && (s->state == _CS_CHARSET))
- {
bzero(&i->cs,sizeof(i->cs));
- }
+
+ if (s && (s->state == _CS_COLLATION))
+ i->sort_order_length= 0;
+
return MY_XML_OK;
}
@@ -242,6 +271,26 @@ static int cs_value(MY_XML_PARSER *st,const char *attr, uint len)
fill_uchar(i->ctype,MY_CS_CTYPE_TABLE_SIZE,attr,len);
i->cs.ctype=i->ctype;
break;
+ case _CS_RESET:
+ case _CS_DIFF1:
+ case _CS_DIFF2:
+ case _CS_DIFF3:
+ {
+ /*
+ Convert collation description from
+ Locale Data Markup Language (LDML)
+ into ICU Collation Customization expression.
+ */
+ char arg[16];
+ const char *cmd[]= {"&","<","<<","<<<"};
+ i->cs.sort_order= i->sort_order;
+ mstr(arg,attr,len,sizeof(arg)-1);
+ if (i->sort_order_length + 20 < sizeof(i->sort_order))
+ {
+ char *dst= i->sort_order_length + i->sort_order;
+ i->sort_order_length+= sprintf(dst," %s %s",cmd[state-_CS_RESET],arg);
+ }
+ }
}
return MY_XML_OK;
}