diff options
author | unknown <bar@mysql.com> | 2006-02-02 10:07:47 +0400 |
---|---|---|
committer | unknown <bar@mysql.com> | 2006-02-02 10:07:47 +0400 |
commit | 4fa4383ba8ac841b9d24d5502811ad06ce2d801e (patch) | |
tree | 379982f5e16065284e45a307694ab32d835c9795 /strings | |
parent | 55c304a17b25ed19af984bf3b923e5aaf966f74a (diff) | |
download | mariadb-git-4fa4383ba8ac841b9d24d5502811ad06ce2d801e.tar.gz |
WL#1386 - CTYPE table for unicode character sets
A prerequisite for several fulltext and XML bugs.
MY_CHARSET_HANDLER now has a new function "ctype"
to detect the type of the next character in a string
(i.e. digit, letter, space, punctuation, control, etc),
which now works correctly for both 8bit and multibyte charsets.
Previously only 8bit charsets worked correctly,
while any multibyte character was considered a letter
in multibyte charsets.
Many files:
Adding the new "ctype" function to the MY_CHARSET_HANDLER initializers
Makefile.am:
Adding build rules for uctypedump,
a dump tool to create my_uctype.h
using Unicode Character Database file.
m_ctype.h:
Adding declaration of my_uni_ctype,
ctype data for Unicode.
Adding new member into MY_CHARSET_HANDLER
Makefile.am:
Adding my_uctype.h into noinst_HEADERS
my_uctype.h, uctypedump.c:
New files:
ctype data for Unicode,
and the tool to generate it from
a Unicode Character Database file.
include/Makefile.am:
Adding my_uctype.h
include/m_ctype.h:
Adding declaration of my_uni_ctype,
ctype data for Unicode.
strings/Makefile.am:
Adding build rules for uctypedump,
a dump tool to create my_uctype.h
using Unicode Character Database file.
strings/ctype-big5.c:
Adding new function
strings/ctype-bin.c:
Adding new function
strings/ctype-cp932.c:
Adding new function
strings/ctype-euc_kr.c:
Adding new function
strings/ctype-eucjpms.c:
Adding new function
strings/ctype-gb2312.c:
Adding new function
strings/ctype-gbk.c:
Adding new function
strings/ctype-latin1.c:
Adding new function
strings/ctype-mb.c:
Adding new function
strings/ctype-simple.c:
Adding new function
strings/ctype-sjis.c:
Adding new function
strings/ctype-tis620.c:
Adding new function
strings/ctype-ucs2.c:
Adding new function
strings/ctype-ujis.c:
Adding new function
strings/ctype-utf8.c:
Adding new function
Diffstat (limited to 'strings')
-rw-r--r-- | strings/Makefile.am | 4 | ||||
-rw-r--r-- | strings/ctype-big5.c | 1 | ||||
-rw-r--r-- | strings/ctype-bin.c | 1 | ||||
-rw-r--r-- | strings/ctype-cp932.c | 1 | ||||
-rw-r--r-- | strings/ctype-euc_kr.c | 1 | ||||
-rw-r--r-- | strings/ctype-eucjpms.c | 1 | ||||
-rw-r--r-- | strings/ctype-gb2312.c | 1 | ||||
-rw-r--r-- | strings/ctype-gbk.c | 1 | ||||
-rw-r--r-- | strings/ctype-latin1.c | 1 | ||||
-rw-r--r-- | strings/ctype-mb.c | 16 | ||||
-rw-r--r-- | strings/ctype-simple.c | 14 | ||||
-rw-r--r-- | strings/ctype-sjis.c | 1 | ||||
-rw-r--r-- | strings/ctype-tis620.c | 1 | ||||
-rw-r--r-- | strings/ctype-ucs2.c | 1 | ||||
-rw-r--r-- | strings/ctype-ujis.c | 1 | ||||
-rw-r--r-- | strings/ctype-utf8.c | 4 | ||||
-rw-r--r-- | strings/uctypedump.c | 226 |
17 files changed, 276 insertions, 0 deletions
diff --git a/strings/Makefile.am b/strings/Makefile.am index b423d395778..fcb97a071f1 100644 --- a/strings/Makefile.am +++ b/strings/Makefile.am @@ -40,6 +40,7 @@ endif libmystrings_a_SOURCES = $(ASRCS) $(CSRCS) noinst_PROGRAMS = conf_to_src +CLEANFILES = str_test uctypedump test_decimal # Default charset definitions EXTRA_DIST = ctype-big5.c ctype-cp932.c ctype-czech.c ctype-eucjpms.c ctype-euc_kr.c ctype-win1250ch.c \ ctype-gb2312.c ctype-gbk.c ctype-sjis.c ctype-utf8.c \ @@ -77,6 +78,9 @@ FLAGS=$(DEFS) $(INCLUDES) $(CPPFLAGS) $(CFLAGS) @NOINST_LDFLAGS@ str_test: str_test.c $(pkglib_LIBRARIES) $(LINK) $(FLAGS) -DMAIN $(INCLUDES) $(srcdir)/str_test.c $(LDADD) $(pkglib_LIBRARIES) +uctypedump: uctypedump.c + $(LINK) $(INCLUDES) $(srcdir)/uctypedump.c + test_decimal$(EXEEXT): decimal.c $(pkglib_LIBRARIES) $(CP) $(srcdir)/decimal.c ./test_decimal.c $(LINK) $(FLAGS) -DMAIN ./test_decimal.c $(LDADD) $(pkglib_LIBRARIES) diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c index 460215418f8..c1efc08e012 100644 --- a/strings/ctype-big5.c +++ b/strings/ctype-big5.c @@ -6356,6 +6356,7 @@ static MY_CHARSET_HANDLER my_charset_big5_handler= my_numcells_8bit, my_mb_wc_big5, /* mb_wc */ my_wc_mb_big5, /* wc_mb */ + my_mb_ctype_mb, my_caseup_str_mb, my_casedn_str_mb, my_caseup_mb, diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c index 973a6ebf12a..557f8f76647 100644 --- a/strings/ctype-bin.c +++ b/strings/ctype-bin.c @@ -503,6 +503,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_numcells_8bit, my_mb_wc_bin, my_wc_mb_bin, + my_mb_ctype_8bit, my_case_str_bin, my_case_str_bin, my_case_bin, diff --git a/strings/ctype-cp932.c b/strings/ctype-cp932.c index 1e3126865f3..38f625916a0 100644 --- a/strings/ctype-cp932.c +++ b/strings/ctype-cp932.c @@ -5478,6 +5478,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_numcells_cp932, my_mb_wc_cp932, /* mb_wc */ my_wc_mb_cp932, /* wc_mb */ + my_mb_ctype_mb, my_caseup_str_8bit, my_casedn_str_8bit, my_caseup_8bit, diff --git 
a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c index 5e357e0b65c..4ed25f8ee6f 100644 --- a/strings/ctype-euc_kr.c +++ b/strings/ctype-euc_kr.c @@ -8697,6 +8697,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_numcells_8bit, my_mb_wc_euc_kr, /* mb_wc */ my_wc_mb_euc_kr, /* wc_mb */ + my_mb_ctype_mb, my_caseup_str_mb, my_casedn_str_mb, my_caseup_mb, diff --git a/strings/ctype-eucjpms.c b/strings/ctype-eucjpms.c index f9210fcb10e..4fb2bbbd0a7 100644 --- a/strings/ctype-eucjpms.c +++ b/strings/ctype-eucjpms.c @@ -8663,6 +8663,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_numcells_eucjp, my_mb_wc_euc_jp, /* mb_wc */ my_wc_mb_euc_jp, /* wc_mb */ + my_mb_ctype_mb, my_caseup_str_mb, my_casedn_str_mb, my_caseup_mb, diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c index f3938cc27ba..109a89ef907 100644 --- a/strings/ctype-gb2312.c +++ b/strings/ctype-gb2312.c @@ -5748,6 +5748,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_numcells_8bit, my_mb_wc_gb2312, /* mb_wc */ my_wc_mb_gb2312, /* wc_mb */ + my_mb_ctype_mb, my_caseup_str_mb, my_casedn_str_mb, my_caseup_mb, diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c index edc595875d7..7d45969d251 100644 --- a/strings/ctype-gbk.c +++ b/strings/ctype-gbk.c @@ -10001,6 +10001,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_numcells_8bit, my_mb_wc_gbk, my_wc_mb_gbk, + my_mb_ctype_mb, my_caseup_str_mb, my_casedn_str_mb, my_caseup_mb, diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c index 746cb5a4003..3dae13cfadc 100644 --- a/strings/ctype-latin1.c +++ b/strings/ctype-latin1.c @@ -397,6 +397,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_numcells_8bit, my_mb_wc_latin1, my_wc_mb_latin1, + my_mb_ctype_8bit, my_caseup_str_8bit, my_casedn_str_8bit, my_caseup_8bit, diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c index a3e10ba7650..e1899d693a4 100644 --- a/strings/ctype-mb.c +++ b/strings/ctype-mb.c @@ -914,6 +914,22 @@ uint my_numcells_mb(CHARSET_INFO *cs, const char *b, 
const char *e) } +int my_mb_ctype_mb(CHARSET_INFO *cs, int *ctype, + const unsigned char *s, const unsigned char *e) +{ + my_wc_t wc; + int res= cs->cset->mb_wc(cs, &wc, s, e); + if (res <= 0) + *ctype= 0; + else + *ctype= my_uni_ctype[wc>>8].ctype ? + my_uni_ctype[wc>>8].ctype[wc&0xFF] : + my_uni_ctype[wc>>8].pctype; + return res; +} + + + MY_COLLATION_HANDLER my_collation_mb_bin_handler = { NULL, /* init */ diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c index d6f2f0e5fe5..17d828b0ff5 100644 --- a/strings/ctype-simple.c +++ b/strings/ctype-simple.c @@ -1354,6 +1354,19 @@ longlong my_strtoll10_8bit(CHARSET_INFO *cs __attribute__((unused)), } +int my_mb_ctype_8bit(CHARSET_INFO *cs, int *ctype, + const unsigned char *s, const unsigned char *e) +{ + if (s >= e) + { + *ctype= 0; + return MY_CS_TOOFEW(0); + } + *ctype= cs->ctype[*s]; + return 1; +} + + /* Check if a constant can be propagated @@ -1420,6 +1433,7 @@ MY_CHARSET_HANDLER my_charset_8bit_handler= my_numcells_8bit, my_mb_wc_8bit, my_wc_mb_8bit, + my_mb_ctype_8bit, my_caseup_str_8bit, my_casedn_str_8bit, my_caseup_8bit, diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c index 398aea08b05..bf70b4a05db 100644 --- a/strings/ctype-sjis.c +++ b/strings/ctype-sjis.c @@ -4649,6 +4649,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_numcells_sjis, my_mb_wc_sjis, /* mb_wc */ my_wc_mb_sjis, /* wc_mb */ + my_mb_ctype_mb, my_caseup_str_8bit, my_casedn_str_8bit, my_caseup_8bit, diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c index 1200644de3c..2ae0142d510 100644 --- a/strings/ctype-tis620.c +++ b/strings/ctype-tis620.c @@ -877,6 +877,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_numcells_8bit, my_mb_wc_tis620, /* mb_wc */ my_wc_mb_tis620, /* wc_mb */ + my_mb_ctype_8bit, my_caseup_str_8bit, my_casedn_str_8bit, my_caseup_8bit, diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index e2629f445cb..9b9e4aedff3 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ 
-1615,6 +1615,7 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler= my_numcells_mb, my_ucs2_uni, /* mb_wc */ my_uni_ucs2, /* wc_mb */ + my_mb_ctype_mb, my_caseup_str_ucs2, my_casedn_str_ucs2, my_caseup_ucs2, diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c index 696eecaa794..a3e48a704f4 100644 --- a/strings/ctype-ujis.c +++ b/strings/ctype-ujis.c @@ -8531,6 +8531,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_numcells_eucjp, my_mb_wc_euc_jp, /* mb_wc */ my_wc_mb_euc_jp, /* wc_mb */ + my_mb_ctype_mb, my_caseup_str_mb, my_casedn_str_mb, my_caseup_mb, diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 62f405f6049..27f3a57883c 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -41,6 +41,8 @@ #ifdef HAVE_UNIDATA +#include "my_uctype.h" + static MY_UNICASE_INFO plane00[]={ {0x0000,0x0000,0x0000}, {0x0001,0x0001,0x0001}, {0x0002,0x0002,0x0002}, {0x0003,0x0003,0x0003}, @@ -2534,6 +2536,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler= my_numcells_mb, my_utf8_uni, my_uni_utf8, + my_mb_ctype_mb, my_caseup_str_utf8, my_casedn_str_utf8, my_caseup_utf8, @@ -4027,6 +4030,7 @@ static MY_CHARSET_HANDLER my_charset_filename_handler= my_numcells_mb, my_mb_wc_filename, my_wc_mb_filename, + my_mb_ctype_mb, my_caseup_str_utf8, my_casedn_str_utf8, my_caseup_utf8, diff --git a/strings/uctypedump.c b/strings/uctypedump.c new file mode 100644 index 00000000000..174a5909d48 --- /dev/null +++ b/strings/uctypedump.c @@ -0,0 +1,226 @@ +/* +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +*/ +#include <my_global.h> +#include <m_string.h> +#include <m_ctype.h> +#include "m_ctype.h" + + +typedef struct my_ctype_name_st +{ + const char *name; + int val; +} MY_CTYPE_NAME_ST; + + +static MY_CTYPE_NAME_ST my_ctype_name[]= +{ + {"Lu", _MY_U}, /* Letter, Uppercase */ + {"Ll", _MY_L}, /* Letter, Lowercase */ + {"Lt", _MY_U}, /* Letter, Titlecase */ + {"Lm", _MY_L}, /* Letter, Modifier */ + {"Lo", _MY_L}, /* Letter, other */ + + {"Nd", _MY_NMR}, /* Number, 
Decimal Digit */ + {"Nl", _MY_NMR|_MY_U|_MY_L}, /* Number, Letter */ + {"No", _MY_NMR|_MY_PNT}, /* Number, Other */ + + {"Mn", _MY_L|_MY_PNT}, /* Mark, Nonspacing */ + {"Mc", _MY_L|_MY_PNT}, /* Mark, Spacing Combining */ + {"Me", _MY_L|_MY_PNT}, /* Mark, Enclosing */ + + {"Pc", _MY_PNT}, /* Punctuation, Connector */ + {"Pd", _MY_PNT}, /* Punctuation, Dash */ + {"Ps", _MY_PNT}, /* Punctuation, Open */ + {"Pe", _MY_PNT}, /* Punctuation, Close */ + {"Pi", _MY_PNT}, /* Punctuation, Initial quote */ + {"Pf", _MY_PNT}, /* Punctuation, Final quote */ + {"Po", _MY_PNT}, /* Punctuation, Other */ + + {"Sm", _MY_PNT}, /* Symbol, Math */ + {"Sc", _MY_PNT}, /* Symbol, Currency */ + {"Sk", _MY_PNT}, /* Symbol, Modifier */ + {"So", _MY_PNT}, /* Symbol, Other */ + + {"Zs", _MY_SPC}, /* Separator, Space */ + {"Zl", _MY_SPC}, /* Separator, Line */ + {"Zp", _MY_SPC}, /* Separator, Paragraph */ + + {"Cc", _MY_CTR}, /* Other, Control */ + {"Cf", _MY_CTR}, /* Other, Format */ + {"Cs", _MY_CTR}, /* Other, Surrogate */ + {"Co", _MY_CTR}, /* Other, Private Use */ + {"Cn", _MY_CTR}, /* Other, Not Assigned */ + {NULL, 0} +}; + + +static int +ctypestr2num(const char *tok) +{ + MY_CTYPE_NAME_ST *p; + for (p= my_ctype_name; p->name; p++) + { + if (!strncasecmp(p->name, tok, 2)) + return p->val; + } + return 0; +} + + +int main(int ac, char ** av) +{ + char str[1024]; + unsigned char ctypea[64*1024]; + size_t i; + size_t plane; + MY_UNI_CTYPE uctype[256]; + FILE *f= stdin; + + if (ac > 1 && av[1] && !(f= fopen(av[1],"r"))) + { + fprintf(stderr, "Can't open file %s\n", av[1]); + exit(1); + } + bzero(&ctypea,sizeof(ctypea)); + bzero(&uctype, sizeof(uctype)); + + printf("/*\n"); + printf(" Unicode ctype data\n"); + printf(" Generated from %s\n", av[1] ? 
av[1] : "stdin"); + printf("*/\n"); + + while(fgets(str, sizeof(str), f)) + { + size_t n= 0, code= 0; + char *s,*e; + int ctype= 0; + + for(s= str; s; ) + { + char *end; + char tok[1024]=""; + e=strchr(s,';'); + if(e) + { + strncpy(tok,s,(unsigned int)(e-s)); + tok[e-s]=0; + } + else + { + strcpy(tok,s); + } + + end=tok+strlen(tok); + + switch(n) + { + case 0: code= strtol(tok,&end,16);break; + case 2: ctype= ctypestr2num(tok);break; + } + + n++; + if(e) s=e+1; + else s=e; + } + if(code<=0xFFFF) + { + ctypea[code]= ctype; + } + } + + /* Fill digits */ + for (i= '0'; i <= '9'; i++) + ctypea[i]= _MY_NMR; + + for (i= 'a'; i <= 'z'; i++) + ctypea[i]|= _MY_X; + for (i= 'A'; i <= 'Z'; i++) + ctypea[i]|= _MY_X; + + + /* Fill ideographs */ + + /* CJK Ideographs Extension A (U+3400 - U+4DB5) */ + for(i=0x3400;i<=0x4DB5;i++) + { + ctypea[i]= _MY_L | _MY_U; + } + + /* CJK Ideographs (U+4E00 - U+9FA5) */ + for(i=0x4E00;i<=0x9FA5;i++){ + ctypea[i]= _MY_L | _MY_U; + } + + /* Hangul Syllables (U+AC00 - U+D7A3) */ + for(i=0xAC00;i<=0xD7A3;i++) + { + ctypea[i]= _MY_L | _MY_U; + } + + + /* Calc plane parameters */ + for(plane=0;plane<256;plane++) + { + size_t character; + uctype[plane].ctype= ctypea+plane*256; + + uctype[plane].pctype= uctype[plane].ctype[0]; + for(character=1;character<256;character++) + { + if (uctype[plane].ctype[character] != uctype[plane].pctype) + { + uctype[plane].pctype= 0; /* Mixed plane */ + break; + } + } + if (character==256) /* All the same, no needs to dump whole plane */ + uctype[plane].ctype= NULL; + } + + /* Dump mixed planes */ + for(plane=0;plane<256;plane++) + { + if(uctype[plane].ctype) + { + int charnum=0; + int num=0; + + printf("static unsigned char uctype_page%02X[256]=\n{\n",plane); + + for(charnum=0;charnum<256;charnum++) + { + int cod; + + cod=(plane<<8)+charnum; + printf(" %2d%s",uctype[plane].ctype[charnum],charnum<255?",":""); + + num++; + if(num==16) + { + printf("\n"); + num=0; + } + } + printf("};\n\n"); + } + } + + + /* Dump plane 
index */ + printf("MY_UNI_CTYPE my_uni_ctype[256]={\n"); + for(plane=0;plane<256;plane++) + { + char plane_name[128]="NULL"; + if(uctype[plane].ctype){ + sprintf(plane_name,"uctype_page%02X",plane); + } + printf("\t{%d,%s}%s\n",uctype[plane].pctype,plane_name,plane<255?",":""); + } + printf("};\n"); + + return 0; +} |