diff options
author | unknown <monty@mysql.com> | 2004-10-29 19:26:52 +0300 |
---|---|---|
committer | unknown <monty@mysql.com> | 2004-10-29 19:26:52 +0300 |
commit | f095274fe8c3d3394d6c0ce0a68f4bea04311999 (patch) | |
tree | 23bcc9a71fe7237887a111b158e30f5a6bb665d3 /strings | |
parent | f41bba8c6156a7adf4c67dfa75e16112767a5d3c (diff) | |
parent | 5be6c328f5a9f78f37176bbbd88a538fa3b65fe9 (diff) | |
download | mariadb-git-f095274fe8c3d3394d6c0ce0a68f4bea04311999.tar.gz |
merge with 4.1
BitKeeper/etc/ignore:
auto-union
BitKeeper/etc/logging_ok:
auto-union
BitKeeper/triggers/post-commit:
Auto merged
Docs/Support/texi2html:
Auto merged
Makefile.am:
Auto merged
client/Makefile.am:
Auto merged
client/mysql.cc:
Auto merged
client/mysqldump.c:
Auto merged
include/my_base.h:
Auto merged
include/my_global.h:
Auto merged
include/my_pthread.h:
Auto merged
include/my_sys.h:
Auto merged
include/my_time.h:
Auto merged
include/mysql.h:
Auto merged
include/mysql_com.h:
Auto merged
innobase/buf/buf0buf.c:
Auto merged
innobase/include/row0mysql.h:
Auto merged
innobase/row/row0sel.c:
Auto merged
libmysql/libmysql.c:
Auto merged
libmysqld/examples/Makefile.am:
Auto merged
myisam/mi_check.c:
Auto merged
mysql-test/include/ps_modify.inc:
Auto merged
mysql-test/install_test_db.sh:
Auto merged
mysql-test/r/alter_table.result:
Auto merged
mysql-test/r/auto_increment.result:
Auto merged
mysql-test/r/bdb.result:
Auto merged
mysql-test/r/ctype_latin1_de.result:
Auto merged
mysql-test/r/ctype_recoding.result:
Auto merged
mysql-test/r/fulltext.result:
Auto merged
mysql-test/r/func_gconcat.result:
Auto merged
mysql-test/r/func_group.result:
Auto merged
mysql-test/r/func_if.result:
Auto merged
mysql-test/t/derived.test:
Auto merged
mysql-test/t/insert.test:
merge with 4.1
Fixed test case to not use 'if exists' when it shouldn't
mysql-test/t/range.test:
merge with 4.1
Added missing drop table
sql/ha_ndbcluster.cc:
merge with 4.1
Simple optimization: use max() instead of ? :
sql/item_func.cc:
merge with 4.1
(Added back old variable names for easier merges)
sql/opt_range.cc:
merge with 4.1
Removed argument 'parent_alloc' from QUICK_RANGE_SELECT as this was not used
Added assert if using QUICK_GROUP_MIN_MAX_SELECT with parent_alloc as the init() function can't handle this
Changed back get_quick_select_for_ref() to use its own alloc root because this function may be called several times for one query
sql/sql_handler.cc:
merge with 4.1
change variable 'err' to 'error' as same function had a label named 'err'
sql/sql_update.cc:
Use multi-update code from 5.0 instead of 4.1
We will fix the locking code shortly in 5.0 to be faster than in 4.1
Diffstat (limited to 'strings')
-rw-r--r-- | strings/CHARSET_INFO.txt | 230 | ||||
-rw-r--r-- | strings/Makefile.am | 2 | ||||
-rw-r--r-- | strings/ctype-big5.c | 3 | ||||
-rw-r--r-- | strings/ctype-bin.c | 1 | ||||
-rw-r--r-- | strings/ctype-euc_kr.c | 3 | ||||
-rw-r--r-- | strings/ctype-gb2312.c | 3 | ||||
-rw-r--r-- | strings/ctype-gbk.c | 3 | ||||
-rw-r--r-- | strings/ctype-latin1.c | 1 | ||||
-rw-r--r-- | strings/ctype-mb.c | 182 | ||||
-rw-r--r-- | strings/ctype-simple.c | 10 | ||||
-rw-r--r-- | strings/ctype-sjis.c | 69 | ||||
-rw-r--r-- | strings/ctype-tis620.c | 1 | ||||
-rw-r--r-- | strings/ctype-uca.c | 157 | ||||
-rw-r--r-- | strings/ctype-ucs2.c | 363 | ||||
-rw-r--r-- | strings/ctype-ujis.c | 37 | ||||
-rw-r--r-- | strings/ctype-utf8.c | 294 | ||||
-rw-r--r-- | strings/xml.c | 7 |
17 files changed, 1083 insertions, 283 deletions
diff --git a/strings/CHARSET_INFO.txt b/strings/CHARSET_INFO.txt new file mode 100644 index 00000000000..f7a10f95880 --- /dev/null +++ b/strings/CHARSET_INFO.txt @@ -0,0 +1,230 @@ + +CHARSET_INFO +============ +A structure containing data for charset+collation pair implementation. + +Virtual functions which use this data are collected +into separate structures MY_CHARSET_HANDLER and +MY_COLLATION_HANDLER. + + +typedef struct charset_info_st +{ + uint number; + uint primary_number; + uint binary_number; + uint state; + + const char *csname; + const char *name; + const char *comment; + + uchar *ctype; + uchar *to_lower; + uchar *to_upper; + uchar *sort_order; + + uint16 *tab_to_uni; + MY_UNI_IDX *tab_from_uni; + + uchar state_map[256]; + uchar ident_map[256]; + + uint strxfrm_multiply; + uint mbminlen; + uint mbmaxlen; + char max_sort_char; /* For LIKE optimization */ + + MY_CHARSET_HANDLER *cset; + MY_COLLATION_HANDLER *coll; + +} CHARSET_INFO; + + +CHARSET_INFO fields description: +=============================== + + +Numbers (identifiers) +--------------------- + +number - an ID uniquely identifying this charset+collation pair. + +primary_number - ID of a charset+collation pair, which consists +of the same character set and the default collation of this +character set. Not really used now. Intended to optimize some +parts of the code where we need to find the default collation +using its non-default counterpart for the given character set. + +binary_number - ID of a charset+collation pair, which consists +of the same character set and the binary collation of this +character set. Not really used now. + +Names +----- + + csname - name of the character set for this charset+collation pair. + name - name of the collation for this charset+collation pair. + comment - a text comment, displayed in "Description" column of + SHOW CHARACTER SET output. 
+ +Conversion tables +----------------- + + ctype - pointer to array[257] of "type of characters" + bit mask for each character, e.g. if a + character is a digit or a letter or a separator, etc. + + Monty 2004-10-21: + If you look at the macros, we use ctype[(char)+1]. + ctype[0] is traditionally in most ctype libraries + reserved for EOF (-1). The idea is that you can use + the result from fgetc() directly with ctype[]. As + we have to be compatible with external ctype[] versions, + it's better to do it the same way as they do... + + to_lower - pointer to array[256] used in LCASE() + to_upper - pointer to array[256] used in UCASE() + sort_order - pointer to array[256] used for strings comparison + + + +Unicode conversion data +----------------------- +For 8bit character sets: + +tab_to_uni : array[256] of charset->Unicode translation +tab_from_uni: a structure for Unicode->charset translation + +Non-8 bit charsets have their own structures per charset +hidden in the corresponding ctype-xxx.c file and don't use +tab_to_uni and tab_from_uni tables. + + +Parser maps +----------- +state_map[] +ident_map[] + + These maps are to quickly identify if a character is +an identifier part, a digit, a special character, +or a part of another SQL language lexical item. + +Probably can be combined with ctype array in the future. +But for some reasons these two arrays are used in the parser, +while a separate ctype[] array is used in the other part of the +code, like fulltext, etc. + + +Misc fields +----------- + + strxfrm_multiply - how many times a sort key (i.e. a string + which can be passed into memcmp() for comparison) + can be longer than the original string. + Usually it is 1. For some complex + collations it can be bigger. For example + in latin1_german2_ci, a sort key is up to + twice as long as the original string. + e.g. Letter 'A' with two dots above is + substituted with 'AE'. + mbminlen - minimum multibyte sequence length. + Now always 1 except ucs2. 
For ucs2 + it is 2. + mbmaxlen - maximum multibyte sequence length. + 1 for 8bit charsets. Can also be 2 or 3. + + + +MY_CHARSET_HANDLER +================== + +MY_CHARSET_HANDLER is a collection of character-set +related routines. Defined in m_ctype.h. It has the +following set of functions: + +Multibyte routines +------------------ +ismbchar() - detects if the given string is a multibyte sequence +mbcharlen() - returns length of multibyte sequence starting with + the given character +numchars() - returns number of characters in the given string, e.g. + in SQL function CHAR_LENGTH(). +charpos() - calculates the offset of the given position in the string. + Used in SQL functions LEFT(), RIGHT(), SUBSTRING(), + INSERT() + +well_formed_length() + - finds the length of correctly formed multibyte beginning. + Used in INSERTs to cut a beginning of the given string + which is + a) "well formed" according to the given character set. + b) can fit into the given data type + Terminates the string in the good position, taking into account + multibyte character boundaries. + +lengthsp() - returns the length of the given string without trailing spaces. + + +Unicode conversion routines +--------------------------- +mb_wc - converts the left multibyte sequence into its Unicode code. +wc_mb - converts the given Unicode code into multibyte sequence. + + +Case and sort conversion +------------------------ +caseup_str - converts the given 0-terminated string into the upper case +casedn_str - converts the given 0-terminated string into the lower case +caseup - converts the given string into the upper case using length +casedn - converts the given string into the lower case using length + +Number-to-string conversion routines +------------------------------------ +snprintf() +long10_to_str() +longlong10_to_str() + +The names are pretty self-descriptive. + +String padding routines +----------------------- +fill() - writes the given Unicode value into the given string + with the given length. 
Used to pad the string, usually + with the space character, according to the given charset. + +String-to-number conversion routines +------------------------------------ +strntol() +strntoul() +strntoll() +strntoull() +strntod() + +These functions do almost the same thing as their +STDLIB counterparts, but also: + - accept length instead of 0-terminator + - and are character set dependent + +Simple scanner routines +----------------------- +scan() - to skip leading spaces in the given string. + Used when a string value is inserted into a numeric field. + + + +MY_COLLATION_HANDLER +==================== +strnncoll() - compares two strings according to the given collation +strnncollsp() - like the above but ignores trailing spaces +strnxfrm() - makes a sort key suitable for memcmp() corresponding + to the given string +like_range() - creates a LIKE range, for optimizer +wildcmp() - wildcard comparison, for LIKE +strcasecmp() - 0-terminated string comparison +instr() - finds the first substring occurrence in the string +hash_sort() - calculates hash value taking into account + the collation rules, e.g. case-insensitivity, + accent sensitivity, etc. + +
\ No newline at end of file diff --git a/strings/Makefile.am b/strings/Makefile.am index 31b5195d5cb..f8fcfbc5ea3 100644 --- a/strings/Makefile.am +++ b/strings/Makefile.am @@ -57,7 +57,7 @@ EXTRA_DIST = ctype-big5.c ctype-czech.c ctype-euc_kr.c ctype-win1250ch.c \ t_ctype.h libmystrings_a_LIBADD= -conf_to_src_SOURCES = conf_to_src.c xml.c ctype.c +conf_to_src_SOURCES = conf_to_src.c xml.c ctype.c bcmp.c conf_to_src_LDADD= #force static linking of conf_to_src - essential when linking against #custom installation of libc diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c index 3f35f7504ac..8345c53202c 100644 --- a/strings/ctype-big5.c +++ b/strings/ctype-big5.c @@ -6290,7 +6290,7 @@ static MY_CHARSET_HANDLER my_charset_big5_handler= my_charpos_mb, my_well_formed_len_mb, my_lengthsp_8bit, - my_numcells_mb, + my_numcells_8bit, my_mb_wc_big5, /* mb_wc */ my_wc_mb_big5, /* wc_mb */ my_caseup_str_mb, @@ -6306,6 +6306,7 @@ static MY_CHARSET_HANDLER my_charset_big5_handler= my_strntoll_8bit, my_strntoull_8bit, my_strntod_8bit, + my_strtoll10_8bit, my_scan_8bit }; diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c index 42dc0ab086d..7d17f62c8d0 100644 --- a/strings/ctype-bin.c +++ b/strings/ctype-bin.c @@ -465,6 +465,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strntoll_8bit, my_strntoull_8bit, my_strntod_8bit, + my_strtoll10_8bit, my_scan_8bit }; diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c index 43a50b0dfbe..ee792d9c3e4 100644 --- a/strings/ctype-euc_kr.c +++ b/strings/ctype-euc_kr.c @@ -8657,7 +8657,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_charpos_mb, my_well_formed_len_mb, my_lengthsp_8bit, - my_numcells_mb, + my_numcells_8bit, my_mb_wc_euc_kr, /* mb_wc */ my_wc_mb_euc_kr, /* wc_mb */ my_caseup_str_mb, @@ -8673,6 +8673,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strntoll_8bit, my_strntoull_8bit, my_strntod_8bit, + my_strtoll10_8bit, my_scan_8bit }; diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c 
index 8d97ac9ca1d..f17cc94723f 100644 --- a/strings/ctype-gb2312.c +++ b/strings/ctype-gb2312.c @@ -5708,7 +5708,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_charpos_mb, my_well_formed_len_mb, my_lengthsp_8bit, - my_numcells_mb, + my_numcells_8bit, my_mb_wc_gb2312, /* mb_wc */ my_wc_mb_gb2312, /* wc_mb */ my_caseup_str_mb, @@ -5724,6 +5724,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strntoll_8bit, my_strntoull_8bit, my_strntod_8bit, + my_strtoll10_8bit, my_scan_8bit }; diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c index 9400fb08f2b..0be56e8d946 100644 --- a/strings/ctype-gbk.c +++ b/strings/ctype-gbk.c @@ -9939,7 +9939,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_charpos_mb, my_well_formed_len_mb, my_lengthsp_8bit, - my_numcells_mb, + my_numcells_8bit, my_mb_wc_gbk, my_wc_mb_gbk, my_caseup_str_mb, @@ -9955,6 +9955,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strntoll_8bit, my_strntoull_8bit, my_strntod_8bit, + my_strtoll10_8bit, my_scan_8bit }; diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c index aea517811ab..5f1850b7772 100644 --- a/strings/ctype-latin1.c +++ b/strings/ctype-latin1.c @@ -403,6 +403,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strntoll_8bit, my_strntoull_8bit, my_strntod_8bit, + my_strtoll10_8bit, my_scan_8bit }; diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c index 2548a68ab19..7d81766c4cb 100644 --- a/strings/ctype-mb.c +++ b/strings/ctype-mb.c @@ -123,8 +123,7 @@ int my_strcasecmp_mb(CHARSET_INFO * cs,const char *s, const char *t) ** 1 if matched with wildcard */ -#define INC_PTR(cs,A,B) A+=((use_mb_flag && \ - my_ismbchar(cs,A,B)) ? my_ismbchar(cs,A,B) : 1) +#define INC_PTR(cs,A,B) A+=(my_ismbchar(cs,A,B) ? 
my_ismbchar(cs,A,B) : 1) #define likeconv(s,A) (uchar) (s)->sort_order[(uchar) (A)] @@ -135,8 +134,6 @@ int my_wildcmp_mb(CHARSET_INFO *cs, { int result= -1; /* Not found, using wildcards */ - bool use_mb_flag=use_mb(cs); - while (wildstr != wildend) { while (*wildstr != w_many && *wildstr != w_one) @@ -144,8 +141,7 @@ int my_wildcmp_mb(CHARSET_INFO *cs, int l; if (*wildstr == escape && wildstr+1 != wildend) wildstr++; - if (use_mb_flag && - (l = my_ismbchar(cs, wildstr, wildend))) + if ((l = my_ismbchar(cs, wildstr, wildend))) { if (str+l > str_end || memcmp(str, wildstr, l) != 0) return 1; @@ -200,41 +196,30 @@ int my_wildcmp_mb(CHARSET_INFO *cs, cmp= *++wildstr; mb=wildstr; - LINT_INIT(mblen); - if (use_mb_flag) - mblen = my_ismbchar(cs, wildstr, wildend); + mblen= my_ismbchar(cs, wildstr, wildend); INC_PTR(cs,wildstr,wildend); /* This is compared trough cmp */ cmp=likeconv(cs,cmp); do { - if (use_mb_flag) - { - for (;;) + for (;;) + { + if (str >= str_end) + return -1; + if (mblen) { - if (str >= str_end) - return -1; - if (mblen) + if (str+mblen <= str_end && memcmp(str, mb, mblen) == 0) { - if (str+mblen <= str_end && memcmp(str, mb, mblen) == 0) - { - str += mblen; - break; - } - } - else if (!my_ismbchar(cs, str, str_end) && - likeconv(cs,*str) == cmp) - { - str++; + str += mblen; break; } - INC_PTR(cs,str, str_end); } - } - else - { - while (str != str_end && likeconv(cs,*str) != cmp) + else if (!my_ismbchar(cs, str, str_end) && + likeconv(cs,*str) == cmp) + { str++; - if (str++ == str_end) return (-1); + break; + } + INC_PTR(cs,str, str_end); } { int tmp=my_wildcmp_mb(cs,str,str_end,wildstr,wildend,escape,w_one, @@ -458,6 +443,97 @@ static void my_hash_sort_mb_bin(CHARSET_INFO *cs __attribute__((unused)), } } +/* +** Calculate min_str and max_str that ranges a LIKE string. +** Arguments: +** ptr Pointer to LIKE string. +** ptr_length Length of LIKE string. +** escape Escape character in LIKE. (Normally '\'). 
+** All escape characters should be removed from min_str and max_str +** res_length Length of min_str and max_str. +** min_str Smallest case sensitive string that ranges LIKE. +** Should be space padded to res_length. +** max_str Largest case sensitive string that ranges LIKE. +** Normally padded with the biggest character sort value. +** +** The function should return 0 if ok and 1 if the LIKE string can't be +** optimized ! +*/ + +my_bool my_like_range_mb(CHARSET_INFO *cs, + const char *ptr,uint ptr_length, + pbool escape, pbool w_one, pbool w_many, + uint res_length, + char *min_str,char *max_str, + uint *min_length,uint *max_length) +{ + const char *end=ptr+ptr_length; + char *min_org=min_str; + char *min_end=min_str+res_length; + char *max_end=max_str+res_length; + + for (; ptr != end && min_str != min_end ; ptr++) + { + if (*ptr == escape && ptr+1 != end) + { + ptr++; /* Skip escape */ + *min_str++= *max_str++ = *ptr; + continue; + } + if (*ptr == w_one || *ptr == w_many) /* '_' and '%' in SQL */ + { + char buf[10]; + uint buflen; + uint charlen= my_charpos(cs, min_org, min_str, res_length/cs->mbmaxlen); + + if (charlen < (uint) (min_str - min_org)) + min_str= min_org + charlen; + + /* Write min key */ + *min_length= (uint) (min_str - min_org); + *max_length=res_length; + do + { + *min_str++= (char) cs->min_sort_char; + } while (min_str != min_end); + + /* + Write max key: create a buffer with multibyte + representation of the max_sort_char character, + and copy it into max_str in a loop. + */ + buflen= cs->cset->wc_mb(cs, cs->max_sort_char, (uchar*) buf, + (uchar*) buf + sizeof(buf)); + DBUG_ASSERT(buflen > 0); + do + { + if ((max_str + buflen) <= max_end) + { + /* Enough space for max characer */ + memcpy(max_str, buf, buflen); + max_str+= buflen; + } + else + { + /* + There is no space for whole multibyte + character, then add trailing spaces. 
+ */ + + *max_str++= ' '; + } + } while (max_str != max_end); + return 0; + } + *min_str++= *max_str++ = *ptr; + } + *min_length= *max_length = (uint) (min_str - min_org); + + while (min_str != min_end) + *min_str++ = *max_str++ = ' '; /* Because if key compression */ + return 0; +} + static int my_wildcmp_mb_bin(CHARSET_INFO *cs, const char *str,const char *str_end, const char *wildstr,const char *wildend, @@ -465,8 +541,6 @@ static int my_wildcmp_mb_bin(CHARSET_INFO *cs, { int result= -1; /* Not found, using wildcards */ - bool use_mb_flag=use_mb(cs); - while (wildstr != wildend) { while (*wildstr != w_many && *wildstr != w_one) @@ -474,8 +548,7 @@ static int my_wildcmp_mb_bin(CHARSET_INFO *cs, int l; if (*wildstr == escape && wildstr+1 != wildend) wildstr++; - if (use_mb_flag && - (l = my_ismbchar(cs, wildstr, wildend))) + if ((l = my_ismbchar(cs, wildstr, wildend))) { if (str+l > str_end || memcmp(str, wildstr, l) != 0) return 1; @@ -530,42 +603,31 @@ static int my_wildcmp_mb_bin(CHARSET_INFO *cs, cmp= *++wildstr; mb=wildstr; - LINT_INIT(mblen); - if (use_mb_flag) - mblen = my_ismbchar(cs, wildstr, wildend); + mblen= my_ismbchar(cs, wildstr, wildend); INC_PTR(cs,wildstr,wildend); /* This is compared trough cmp */ do { - if (use_mb_flag) - { - for (;;) + for (;;) + { + if (str >= str_end) + return -1; + if (mblen) { - if (str >= str_end) - return -1; - if (mblen) - { - if (str+mblen <= str_end && memcmp(str, mb, mblen) == 0) - { - str += mblen; - break; - } - } - else if (!my_ismbchar(cs, str, str_end) && *str == cmp) + if (str+mblen <= str_end && memcmp(str, mb, mblen) == 0) { - str++; + str += mblen; break; } - INC_PTR(cs,str, str_end); } - } - else - { - while (str != str_end && *str != cmp) + else if (!my_ismbchar(cs, str, str_end) && *str == cmp) + { str++; - if (str++ == str_end) return (-1); + break; + } + INC_PTR(cs,str, str_end); } { - int tmp=my_wildcmp_mb(cs,str,str_end,wildstr,wildend,escape,w_one,w_many); + int 
tmp=my_wildcmp_mb_bin(cs,str,str_end,wildstr,wildend,escape,w_one,w_many); if (tmp <= 0) return (tmp); } diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c index 84bfcb0b171..a019665a235 100644 --- a/strings/ctype-simple.c +++ b/strings/ctype-simple.c @@ -27,8 +27,7 @@ int my_strnxfrm_simple(CHARSET_INFO * cs, const uchar *src, uint srclen) { uchar *map= cs->sort_order; - DBUG_ASSERT(len >= srclen); - len= min(len,srclen); + set_if_smaller(len, srclen); if (dest != src) { const uchar *end; @@ -1284,6 +1283,12 @@ static my_bool my_coll_init_simple(CHARSET_INFO *cs, } +longlong my_strtoll10_8bit(CHARSET_INFO *cs __attribute__((unused)), + const char *nptr, char **endptr, int *error) +{ + return my_strtoll10(nptr, endptr, error); +} + MY_CHARSET_HANDLER my_charset_8bit_handler= { @@ -1310,6 +1315,7 @@ MY_CHARSET_HANDLER my_charset_8bit_handler= my_strntoll_8bit, my_strntoull_8bit, my_strntod_8bit, + my_strtoll10_8bit, my_scan_8bit }; diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c index b4cfee0f24a..4176ff2e538 100644 --- a/strings/ctype-sjis.c +++ b/strings/ctype-sjis.c @@ -4534,6 +4534,70 @@ my_mb_wc_sjis(CHARSET_INFO *cs __attribute__((unused)), return 2; } +static +uint my_numcells_sjis(CHARSET_INFO *cs __attribute__((unused)), + const char *str, const char *strend) +{ + uint clen= 0; + const unsigned char *b= (const unsigned char *) str; + const unsigned char *e= (const unsigned char *) strend; + + for (clen= 0; b < e; ) + { + if (*b >= 0xA1 && *b <= 0xDF) + { + clen++; + b++; + } + else if (*b > 0x7F) + { + clen+= 2; + b+= 2; + } + else + { + clen++; + b++; + } + } + return clen; +} + +/* + Returns a well formed length of a SJIS string. + CP932 additional characters are also accepted. +*/ +static +uint my_well_formed_len_sjis(CHARSET_INFO *cs __attribute__((unused)), + const char *b, const char *e, uint pos) +{ + const char *b0= b; + while (pos && b < e) + { + /* + Cast to int8 for extra safety. 
+ "char" can be unsigned by default + on some platforms. + */ + if (((int8)b[0]) >= 0) + { + /* Single byte character */ + b+= 1; + } + else if (issjishead((uchar)*b) && (e-b)>1 && issjistail((uchar)b[1])) + { + /* Double byte character */ + b+= 2; + } + else + { + /* Wrong byte sequence */ + break; + } + } + return b - b0; +} + static MY_COLLATION_HANDLER my_collation_ci_handler = { @@ -4556,9 +4620,9 @@ static MY_CHARSET_HANDLER my_charset_handler= mbcharlen_sjis, my_numchars_mb, my_charpos_mb, - my_well_formed_len_mb, + my_well_formed_len_sjis, my_lengthsp_8bit, - my_numcells_mb, + my_numcells_sjis, my_mb_wc_sjis, /* mb_wc */ my_wc_mb_sjis, /* wc_mb */ my_caseup_str_8bit, @@ -4574,6 +4638,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strntoll_8bit, my_strntoull_8bit, my_strntod_8bit, + my_strtoll10_8bit, my_scan_8bit }; diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c index 420c5b5582e..a2ba4783591 100644 --- a/strings/ctype-tis620.c +++ b/strings/ctype-tis620.c @@ -946,6 +946,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strntoll_8bit, my_strntoull_8bit, my_strntod_8bit, + my_strtoll10_8bit, my_scan_8bit }; diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index cecc3be5045..91af7af0c54 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -6658,6 +6658,42 @@ static const char roman[]= /* i.e. Classical Latin */ "& V << u <<< U "; /* + Persian collation support was provided by + Jody McIntyre <mysql@modernduck.com> + + To: internals@lists.mysql.com + Subject: Persian UTF8 collation support + Date: 17.08.2004 + + Contraction is not implemented. Some implementations do perform + contraction but others do not, and it is able to sort all my test + strings correctly. + + Jody. 
+*/ +static const char persian[]= + "& \\u066D < \\u064E < \\uFE76 < \\uFE77 < \\u0650 < \\uFE7A < \\uFE7B" + " < \\u064F < \\uFE78 < \\uFE79 < \\u064B < \\uFE70 < \\uFE71" + " < \\u064D < \\uFE74 < \\u064C < \\uFE72" + "& \\uFE7F < \\u0653 < \\u0654 < \\u0655 < \\u0670" + "& \\u0669 < \\u0622 < \\u0627 < \\u0671 < \\u0621 < \\u0623 < \\u0625" + " < \\u0624 < \\u0626" + "& \\u0642 < \\u06A9 < \\u0643" + "& \\u0648 < \\u0647 < \\u0629 < \\u06C0 < \\u06CC < \\u0649 < \\u064A" + "& \\uFE80 < \\uFE81 < \\uFE82 < \\uFE8D < \\uFE8E < \\uFB50 < \\uFB51" + " < \\uFE80 < \\uFE83 < \\uFE84 < \\uFE87 < \\uFE88 < \\uFE85" + " < \\uFE86 < \\u0689 < \\u068A" + "& \\uFEAE < \\uFDFC" + "& \\uFED8 < \\uFB8E < \\uFB8F < \\uFB90 < \\uFB91 < \\uFED9 < \\uFEDA" + " < \\uFEDB < \\uFEDC" + "& \\uFEEE < \\uFEE9 < \\uFEEA < \\uFEEB < \\uFEEC < \\uFE93 < \\uFE94" + " < \\uFBA4 < \\uFBA5 < \\uFBFC < \\uFBFD < \\uFBFE < \\uFBFF" + " < \\uFEEF < \\uFEF0 < \\uFEF1 < \\uFEF2 < \\uFEF3 < \\uFEF4" + " < \\uFEF5 < \\uFEF6 < \\uFEF7 < \\uFEF8 < \\uFEF9 < \\uFEFA" + " < \\uFEFB < \\uFEFC"; + + +/* Unicode Collation Algorithm: Collation element (weight) scanner, for consequent scan of collations @@ -6876,7 +6912,8 @@ static int my_uca_scanner_next_any(my_uca_scanner *scanner) int mblen; if (((mblen= scanner->cs->cset->mb_wc(scanner->cs, &wc, - scanner->sbeg, scanner->send)) < 0)) + scanner->sbeg, + scanner->send)) <= 0)) return -1; scanner->page= wc >> 8; @@ -7015,6 +7052,28 @@ static int my_strnncoll_uca(CHARSET_INFO *cs, NOTES: Works exactly the same with my_strnncoll_uca(), but ignores trailing spaces. + + In the while() comparison these situations are possible: + 1. (s_res>0) and (t_res>0) and (s_res == t_res) + Weights are the same so far, continue comparison + 2. (s_res>0) and (t_res>0) and (s_res!=t_res) + A difference has been found, return. + 3. (s_res>0) and (t_res<0) + We have reached the end of the second string, or found + an illegal multibyte sequence in the second string. 
+ Compare the first string to an infinite array of + space characters until difference is found, or until + the end of the first string. + 4. (s_res<0) and (t_res>0) + We have reached the end of the first string, or found + an illegal multibyte sequence in the first string. + Compare the second string to an infinite array of + space characters until difference is found or until + the end of the second steing. + 5. (s_res<0) and (t_res<0) + Both scanners returned -1. It means we have riched + the end-of-string of illegal-sequence in both strings + at the same time. Return 0, strings are equal. RETURN Difference between two strings, according to the collation: @@ -7033,9 +7092,6 @@ static int my_strnncollsp_uca(CHARSET_INFO *cs, int s_res; int t_res; - slen= cs->cset->lengthsp(cs, (char*) s, slen); - tlen= cs->cset->lengthsp(cs, (char*) t, tlen); - scanner_handler->init(&sscanner, cs, s, slen); scanner_handler->init(&tscanner, cs, t, tlen); @@ -7044,6 +7100,36 @@ static int my_strnncollsp_uca(CHARSET_INFO *cs, s_res= scanner_handler->next(&sscanner); t_res= scanner_handler->next(&tscanner); } while ( s_res == t_res && s_res >0); + + if (s_res > 0 && t_res < 0) + { + /* Calculate weight for SPACE character */ + t_res= cs->sort_order_big[0][0x20 * cs->sort_order[0]]; + + /* compare the first string to spaces */ + do + { + if (s_res != t_res) + return (s_res - t_res); + s_res= scanner_handler->next(&sscanner); + } while (s_res > 0); + return 0; + } + + if (s_res < 0 && t_res > 0) + { + /* Calculate weight for SPACE character */ + s_res= cs->sort_order_big[0][0x20 * cs->sort_order[0]]; + + /* compare the second string to spaces */ + do + { + if (s_res != t_res) + return (s_res - t_res); + t_res= scanner_handler->next(&tscanner); + } while (t_res > 0); + return 0; + } return ( s_res - t_res ); } @@ -7670,7 +7756,7 @@ static int my_coll_rule_parse(MY_COLL_RULE *rule, size_t mitems, return (size_t) nitems; } -#define MY_MAX_COLL_RULE 64 +#define MY_MAX_COLL_RULE 128 /* This 
function copies an UCS2 collation from @@ -7918,7 +8004,7 @@ MY_COLLATION_HANDLER my_collation_ucs2_uca_handler = my_strnncoll_ucs2_uca, my_strnncollsp_ucs2_uca, my_strnxfrm_ucs2_uca, - my_like_range_simple, + my_like_range_ucs2, my_wildcmp_uca, NULL, my_instr_mb, @@ -8359,6 +8445,35 @@ CHARSET_INFO my_charset_ucs2_roman_uca_ci= &my_collation_ucs2_uca_handler }; + +CHARSET_INFO my_charset_ucs2_persian_uca_ci= +{ + 144,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "ucs2", /* cs name */ + "ucs2_persian_ci", /* name */ + "", /* comment */ + persian, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 2, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_ucs2_handler, + &my_collation_ucs2_uca_handler +}; + + #endif @@ -8369,7 +8484,7 @@ MY_COLLATION_HANDLER my_collation_any_uca_handler = my_strnncoll_any_uca, my_strnncollsp_any_uca, my_strnxfrm_any_uca, - my_like_range_simple, + my_like_range_mb, my_wildcmp_uca, NULL, my_instr_mb, @@ -8837,4 +8952,32 @@ CHARSET_INFO my_charset_utf8_roman_uca_ci= &my_charset_utf8_handler, &my_collation_any_uca_handler }; + +CHARSET_INFO my_charset_utf8_persian_uca_ci= +{ + 208,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "utf8", /* cs name */ + "utf8_persian_ci", /* name */ + "", /* comment */ + persian, /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + 
&my_charset_utf8_handler, + &my_collation_any_uca_handler +}; + #endif diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index c6e55ee8f0e..851c2044f47 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -18,6 +18,7 @@ /* UCS2 support. Written by Alexander Barkov <bar@mysql.com> */ #include <my_global.h> +#include <my_sys.h> #include "m_string.h" #include "m_ctype.h" #include <errno.h> @@ -852,7 +853,6 @@ bs: return (negative ? -((longlong) res) : (longlong) res); } - double my_strntod_ucs2(CHARSET_INFO *cs __attribute__((unused)), char *nptr, uint length, char **endptr, int *err) @@ -1000,6 +1000,188 @@ cnv: } +#undef ULONGLONG_MAX +#define ULONGLONG_MAX (~(ulonglong) 0) +#define MAX_NEGATIVE_NUMBER ((ulonglong) LL(0x8000000000000000)) +#define INIT_CNT 9 +#define LFACTOR ULL(1000000000) +#define LFACTOR1 ULL(10000000000) +#define LFACTOR2 ULL(100000000000) + +static unsigned long lfactor[9]= +{ + 1L, 10L, 100L, 1000L, 10000L, 100000L, 1000000L, 10000000L, 100000000L +}; + + +longlong my_strtoll10_ucs2(CHARSET_INFO *cs __attribute__((unused)), + const char *nptr, char **endptr, int *error) +{ + const char *s, *end, *start, *n_end, *true_end; + unsigned char c; + unsigned long i, j, k; + ulonglong li; + int negative; + ulong cutoff, cutoff2, cutoff3; + + s= nptr; + /* If fixed length string */ + if (endptr) + { + /* Make sure string length is even */ + end= s + ((*endptr - s) / 2) * 2; + while (s < end && !s[0] && (s[1] == ' ' || s[1] == '\t')) + s+= 2; + if (s == end) + goto no_conv; + } + else + { + /* We don't support null terminated strings in UCS2 */ + goto no_conv; + } + + /* Check for a sign. 
*/ + negative= 0; + if (!s[0] && s[1] == '-') + { + *error= -1; /* Mark as negative number */ + negative= 1; + s+= 2; + if (s == end) + goto no_conv; + cutoff= MAX_NEGATIVE_NUMBER / LFACTOR2; + cutoff2= (MAX_NEGATIVE_NUMBER % LFACTOR2) / 100; + cutoff3= MAX_NEGATIVE_NUMBER % 100; + } + else + { + *error= 0; + if (!s[0] && s[1] == '+') + { + s+= 2; + if (s == end) + goto no_conv; + } + cutoff= ULONGLONG_MAX / LFACTOR2; + cutoff2= ULONGLONG_MAX % LFACTOR2 / 100; + cutoff3= ULONGLONG_MAX % 100; + } + + /* Handle case where we have a lot of pre-zero */ + if (!s[0] && s[1] == '0') + { + i= 0; + do + { + s+= 2; + if (s == end) + goto end_i; /* Return 0 */ + } + while (!s[0] && s[1] == '0'); + n_end= s + 2 * INIT_CNT; + } + else + { + /* Read first digit to check that it's a valid number */ + if (s[0] || (c= (s[1]-'0')) > 9) + goto no_conv; + i= c; + s+= 2; + n_end= s + 2 * (INIT_CNT-1); + } + + /* Handle first 9 digits and store them in i */ + if (n_end > end) + n_end= end; + for (; s != n_end ; s+= 2) + { + if (s[0] || (c= (s[1]-'0')) > 9) + goto end_i; + i= i*10+c; + } + if (s == end) + goto end_i; + + /* Handle next 9 digits and store them in j */ + j= 0; + start= s; /* Used to know how much to shift i */ + n_end= true_end= s + 2 * INIT_CNT; + if (n_end > end) + n_end= end; + do + { + if (s[0] || (c= (s[1]-'0')) > 9) + goto end_i_and_j; + j= j*10+c; + s+= 2; + } while (s != n_end); + if (s == end) + { + if (s != true_end) + goto end_i_and_j; + goto end3; + } + if (s[0] || (c= (s[1]-'0')) > 9) + goto end3; + + /* Handle the next 1 or 2 digits and store them in k */ + k=c; + s+= 2; + if (s == end || s[0] || (c= (s[1]-'0')) > 9) + goto end4; + k= k*10+c; + s+= 2; + *endptr= (char*) s; + + /* number string should have ended here */ + if (s != end && !s[0] && (c= (s[1]-'0')) <= 9) + goto overflow; + + /* Check that we didn't get an overflow with the last digit */ + if (i > cutoff || (i == cutoff && ((j > cutoff2 || j == cutoff2) && + k > cutoff3))) + goto overflow; + 
li=i*LFACTOR2+ (ulonglong) j*100 + k; + return (longlong) li; + +overflow: /* *endptr is set here */ + *error= MY_ERRNO_ERANGE; + return negative ? LONGLONG_MIN : (longlong) ULONGLONG_MAX; + +end_i: + *endptr= (char*) s; + return (negative ? ((longlong) -(long) i) : (longlong) i); + +end_i_and_j: + li= (ulonglong) i * lfactor[(uint) (s-start) / 2] + j; + *endptr= (char*) s; + return (negative ? -((longlong) li) : (longlong) li); + +end3: + li=(ulonglong) i*LFACTOR+ (ulonglong) j; + *endptr= (char*) s; + return (negative ? -((longlong) li) : (longlong) li); + +end4: + li=(ulonglong) i*LFACTOR1+ (ulonglong) j * 10 + k; + *endptr= (char*) s; + if (negative) + { + if (li > MAX_NEGATIVE_NUMBER) + goto overflow; + return -((longlong) li); + } + return (longlong) li; + +no_conv: + /* There was no number to convert. */ + *error= MY_ERRNO_EDOM; + *endptr= (char *) nptr; + return 0; +} + + static uint my_numchars_ucs2(CHARSET_INFO *cs __attribute__((unused)), const char *b, const char *e) @@ -1049,172 +1231,14 @@ uint my_lengthsp_ucs2(CHARSET_INFO *cs __attribute__((unused)), } -/* -** Compare string against string with wildcard -** 0 if matched -** -1 if not matched with wildcard -** 1 if matched with wildcard -*/ - -static -int my_wildcmp_ucs2(CHARSET_INFO *cs, - const char *str,const char *str_end, - const char *wildstr,const char *wildend, - int escape, int w_one, int w_many, - MY_UNICASE_INFO **weights) -{ - int result= -1; /* Not found, using wildcards */ - my_wc_t s_wc, w_wc; - int scan, plane; - - while (wildstr != wildend) - { - - while (1) - { - scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr, - (const uchar*)wildend); - if (scan <= 0) - return 1; - - if (w_wc == (my_wc_t)escape) - { - wildstr+= scan; - scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr, - (const uchar*)wildend); - if (scan <= 0) - return 1; - } - - if (w_wc == (my_wc_t)w_many) - { - result= 1; /* Found an anchor char */ - break; - } - - wildstr+= scan; - scan= my_ucs2_uni(cs, &s_wc, (const 
uchar*)str, (const uchar*)str_end); - if (scan <=0) - return 1; - str+= scan; - - if (w_wc == (my_wc_t)w_one) - { - result= 1; /* Found an anchor char */ - } - else - { - if (weights) - { - plane=(s_wc>>8) & 0xFF; - s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc; - plane=(w_wc>>8) & 0xFF; - w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc; - } - if (s_wc != w_wc) - return 1; /* No match */ - } - if (wildstr == wildend) - return (str != str_end); /* Match if both are at end */ - } - - - if (w_wc == (my_wc_t)w_many) - { /* Found w_many */ - - /* Remove any '%' and '_' from the wild search string */ - for ( ; wildstr != wildend ; ) - { - scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr, - (const uchar*)wildend); - if (scan <= 0) - return 1; - - if (w_wc == (my_wc_t)w_many) - { - wildstr+= scan; - continue; - } - - if (w_wc == (my_wc_t)w_one) - { - wildstr+= scan; - scan= my_ucs2_uni(cs, &s_wc, (const uchar*)str, - (const uchar*)str_end); - if (scan <=0) - return 1; - str+= scan; - continue; - } - break; /* Not a wild character */ - } - - if (wildstr == wildend) - return 0; /* Ok if w_many is last */ - - if (str == str_end) - return -1; - - scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr, - (const uchar*)wildend); - if (scan <= 0) - return 1; - - if (w_wc == (my_wc_t)escape) - { - wildstr+= scan; - scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr, - (const uchar*)wildend); - if (scan <= 0) - return 1; - } - - while (1) - { - /* Skip until the first character from wildstr is found */ - while (str != str_end) - { - scan= my_ucs2_uni(cs,&s_wc, (const uchar*)str, - (const uchar*)str_end); - if (scan <= 0) - return 1; - if (weights) - { - plane=(s_wc>>8) & 0xFF; - s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc; - plane=(w_wc>>8) & 0xFF; - w_wc = weights[plane] ? 
weights[plane][w_wc & 0xFF].sort : w_wc; - } - - if (s_wc == w_wc) - break; - str+= scan; - } - if (str == str_end) - return -1; - - result= my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend,escape, - w_one,w_many,weights); - - if (result <= 0) - return result; - - str+= scan; - } - } - } - return (str != str_end ? 1 : 0); -} - - static int my_wildcmp_ucs2_ci(CHARSET_INFO *cs, const char *str,const char *str_end, const char *wildstr,const char *wildend, int escape, int w_one, int w_many) { - return my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend, - escape,w_one,w_many,uni_plane); + return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend, + escape,w_one,w_many,uni_plane); } @@ -1224,8 +1248,8 @@ int my_wildcmp_ucs2_bin(CHARSET_INFO *cs, const char *wildstr,const char *wildend, int escape, int w_one, int w_many) { - return my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend, - escape,w_one,w_many,NULL); + return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend, + escape,w_one,w_many,NULL); } @@ -1345,10 +1369,10 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs, } if (ptr[0] == '\0' && ptr[1] == w_one) /* '_' in SQL */ { - *min_str++= (char) cs->min_sort_char >> 8; - *min_str++= (char) cs->min_sort_char & 255; - *max_str++= (char) cs->max_sort_char >> 8; - *max_str++= (char) cs->max_sort_char & 255; + *min_str++= (char) (cs->min_sort_char >> 8); + *min_str++= (char) (cs->min_sort_char & 255); + *max_str++= (char) (cs->max_sort_char >> 8); + *max_str++= (char) (cs->max_sort_char & 255); continue; } if (ptr[0] == '\0' && ptr[1] == w_many) /* '%' in SQL */ @@ -1358,8 +1382,8 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs, do { *min_str++ = 0; *min_str++ = 0; - *max_str++ = (char) cs->max_sort_char >>8; - *max_str++ = (char) cs->max_sort_char & 255; + *max_str++ = (char) (cs->max_sort_char >> 8); + *max_str++ = (char) (cs->max_sort_char & 255); } while (min_str + 1 < min_end); return 0; } @@ -1439,6 +1463,7 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler= my_strntoll_ucs2, 
my_strntoull_ucs2, my_strntod_ucs2, + my_strtoll10_ucs2, my_scan_8bit }; diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c index 37c26a3bbc4..94673a20795 100644 --- a/strings/ctype-ujis.c +++ b/strings/ctype-ujis.c @@ -8252,6 +8252,40 @@ my_jisx0212_uni_onechar(int code){ [xA1-xFE][xA1-xFE] # JIS X 0208:1997 (two bytes/char) */ +static +uint my_numcells_eucjp(CHARSET_INFO *cs __attribute__((unused)), + const char *str, const char *strend) +{ + uint clen= 0; + const unsigned char *b= (const unsigned char *) str; + const unsigned char *e= (const unsigned char *) strend; + + for (clen= 0; b < e; ) + { + if (*b == 0x8E) + { + clen++; + b+= 2; + } + else if (*b == 0x8F) + { + clen+= 2; + b+= 3; + } + else if (*b & 0x80) + { + clen+= 2; + b+= 2; + } + else + { + clen++; + b++; + } + } + return clen; +} + static int my_mb_wc_euc_jp(CHARSET_INFO *cs,my_wc_t *pwc, const uchar *s, const uchar *e) { @@ -8443,7 +8477,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_charpos_mb, my_well_formed_len_mb, my_lengthsp_8bit, - my_numcells_mb, + my_numcells_eucjp, my_mb_wc_euc_jp, /* mb_wc */ my_wc_mb_euc_jp, /* wc_mb */ my_caseup_str_mb, @@ -8459,6 +8493,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strntoll_8bit, my_strntoull_8bit, my_strntod_8bit, + my_strtoll10_8bit, my_scan_8bit }; diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 5e339725b1a..b3097649158 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -1518,6 +1518,161 @@ MY_UNICASE_INFO *uni_plane[256]={ }; + +/* +** Compare string against string with wildcard +** This function is used in UTF8 and UCS2 +** +** 0 if matched +** -1 if not matched with wildcard +** 1 if matched with wildcard +*/ + +int my_wildcmp_unicode(CHARSET_INFO *cs, + const char *str,const char *str_end, + const char *wildstr,const char *wildend, + int escape, int w_one, int w_many, + MY_UNICASE_INFO **weights) +{ + int result= -1; /* Not found, using wildcards */ + my_wc_t s_wc, w_wc; + int scan, plane; + int 
(*mb_wc)(struct charset_info_st *cs, my_wc_t *wc, + const unsigned char *s,const unsigned char *e); + mb_wc= cs->cset->mb_wc; + + while (wildstr != wildend) + { + while (1) + { + if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr, + (const uchar*)wildend)) <= 0) + return 1; + + if (w_wc == (my_wc_t)escape) + { + wildstr+= scan; + if ((scan= mb_wc(cs,&w_wc, (const uchar*)wildstr, + (const uchar*)wildend)) <= 0) + return 1; + } + + if (w_wc == (my_wc_t)w_many) + { + result= 1; /* Found an anchor char */ + break; + } + + wildstr+= scan; + if ((scan= mb_wc(cs, &s_wc, (const uchar*)str, + (const uchar*)str_end)) <=0) + return 1; + str+= scan; + + if (w_wc == (my_wc_t)w_one) + { + result= 1; /* Found an anchor char */ + } + else + { + if (weights) + { + plane=(s_wc>>8) & 0xFF; + s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc; + plane=(w_wc>>8) & 0xFF; + w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc; + } + if (s_wc != w_wc) + return 1; /* No match */ + } + if (wildstr == wildend) + return (str != str_end); /* Match if both are at end */ + } + + + if (w_wc == (my_wc_t)w_many) + { /* Found w_many */ + + /* Remove any '%' and '_' from the wild search string */ + for ( ; wildstr != wildend ; ) + { + if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr, + (const uchar*)wildend)) <= 0) + return 1; + + if (w_wc == (my_wc_t)w_many) + { + wildstr+= scan; + continue; + } + + if (w_wc == (my_wc_t)w_one) + { + wildstr+= scan; + if ((scan= mb_wc(cs, &s_wc, (const uchar*)str, + (const uchar*)str_end)) <=0) + return 1; + str+= scan; + continue; + } + break; /* Not a wild character */ + } + + if (wildstr == wildend) + return 0; /* Ok if w_many is last */ + + if (str == str_end) + return -1; + + if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr, + (const uchar*)wildend)) <=0) + return 1; + + if (w_wc == (my_wc_t)escape) + { + wildstr+= scan; + if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr, + (const uchar*)wildend)) <=0) + return 1; + } + + while (1) + { 
+ /* Skip until the first character from wildstr is found */ + while (str != str_end) + { + if ((scan= mb_wc(cs, &s_wc, (const uchar*)str, + (const uchar*)str_end)) <=0) + return 1; + if (weights) + { + plane=(s_wc>>8) & 0xFF; + s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc; + plane=(w_wc>>8) & 0xFF; + w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc; + } + + if (s_wc == w_wc) + break; + str+= scan; + } + if (str == str_end) + return -1; + + result= my_wildcmp_unicode(cs, str, str_end, wildstr, wildend, + escape, w_one, w_many, + weights); + + if (result <= 0) + return result; + + str+= scan; + } + } + } + return (str != str_end ? 1 : 0); +} + #endif @@ -1948,50 +2103,120 @@ static int my_strnncollsp_utf8(CHARSET_INFO *cs, } -static int my_strncasecmp_utf8(CHARSET_INFO *cs, - const char *s, const char *t, uint len) -{ - int s_res,t_res; - my_wc_t s_wc,t_wc; - const char *se=s+len; - const char *te=t+len; +/* + Compare 0-terminated UTF8 strings. - while ( s < se && t < te ) - { - int plane; + SYNOPSIS + my_strcasecmp_utf8() + cs character set handler + s First 0-terminated string to compare + t Second 0-terminated string to compare - s_res=my_utf8_uni(cs,&s_wc, (const uchar*)s, (const uchar*)se); - t_res=my_utf8_uni(cs,&t_wc, (const uchar*)t, (const uchar*)te); + IMPLEMENTATION - if ( s_res <= 0 || t_res <= 0 ) + RETURN + - negative number if s < t + - positive number if s > t + - 0 if the strings are equal +*/ + +static +int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t) +{ + while (s[0] && t[0]) + { + my_wc_t s_wc,t_wc; + + /* + Cast to int8 for extra safety. + char can be unsigned by default + on some platforms. + */ + if (((int8)s[0]) >= 0) { - /* Incorrect string, compare byte by byte value */ - return bincmp(s, se, t, te); + /* + s[0] is between 0 and 127. + It represents a single byte character. + Convert it into weight according to collation. 
+ */ + s_wc= plane00[(uchar) s[0]].tolower; + s++; } - - plane=(s_wc>>8) & 0xFF; - s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].tolower : s_wc; - - plane=(t_wc>>8) & 0xFF; - t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].tolower : t_wc; - + else + { + int plane, res; + + /* + Scan a multibyte character. + + In the future it would be worth writing a special version of my_utf8_uni() + for 0-terminated strings which will not take length into account. Now + we call the regular version of my_utf8_uni() with s+3 in the + last argument. s+3 is enough to scan any multibyte sequence. + + Calling the regular version of my_utf8_uni is safe for 0-terminated + strings: we will never lose the end of the string: + If we have 0 character in the middle of a multibyte sequence, + then my_utf8_uni will always return a negative number, so the + loop will finish. + */ + + res= my_utf8_uni(cs,&s_wc, (const uchar*)s, (const uchar*) s + 3); + + /* + In the case of wrong multibyte sequence we will + call strcmp() for byte-to-byte comparison. + */ + if (res <= 0) + return strcmp(s, t); + s+= res; + + /* Convert Unicode code into weight according to collation */ + plane=(s_wc>>8) & 0xFF; + s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].tolower : s_wc; + } + + + /* Do the same for the second string */ + + if (((int8)t[0]) >= 0) + { + /* Convert single byte character into weight */ + t_wc= plane00[(uchar) t[0]].tolower; + t++; + } + else + { + int plane; + int res=my_utf8_uni(cs,&t_wc, (const uchar*)t, (const uchar*) t + 3); + if (res <= 0) + return strcmp(s, t); + t+= res; + + /* Convert code into weight */ + plane=(t_wc>>8) & 0xFF; + t_wc = uni_plane[plane] ? 
uni_plane[plane][t_wc & 0xFF].tolower : t_wc; + } + + /* Now we have two weights, let's compare them */ if ( s_wc != t_wc ) return ((int) s_wc) - ((int) t_wc); - - s+=s_res; - t+=t_res; } - return ( (se-s) - (te-t) ); + return ((int)(uchar)s[0]) - ((int) (uchar) t[0]); } -static int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t) + +static +int my_wildcmp_utf8(CHARSET_INFO *cs, + const char *str,const char *str_end, + const char *wildstr,const char *wildend, + int escape, int w_one, int w_many) { - uint s_len=strlen(s); - uint t_len=strlen(t); - uint len = (s_len > t_len) ? s_len : t_len; - return my_strncasecmp_utf8(cs, s, t, len); + return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend, + escape,w_one,w_many,uni_plane); } + static int my_strnxfrm_utf8(CHARSET_INFO *cs, uchar *dst, uint dstlen, const uchar *src, uint srclen) @@ -2059,8 +2284,8 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = my_strnncoll_utf8, my_strnncollsp_utf8, my_strnxfrm_utf8, - my_like_range_simple, - my_wildcmp_mb, + my_like_range_mb, + my_wildcmp_utf8, my_strcasecmp_utf8, my_instr_mb, my_hash_sort_utf8 @@ -2091,6 +2316,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler= my_strntoll_8bit, my_strntoull_8bit, my_strntod_8bit, + my_strtoll10_8bit, my_scan_8bit }; @@ -2118,7 +2344,7 @@ CHARSET_INFO my_charset_utf8_general_ci= 1, /* mbminlen */ 3, /* mbmaxlen */ 0, /* min_sort_char */ - 255, /* max_sort_char */ + 0xFFFF, /* max_sort_char */ &my_charset_utf8_handler, &my_collation_ci_handler }; diff --git a/strings/xml.c b/strings/xml.c index 7d7839e1603..6ba52ea41a8 100644 --- a/strings/xml.c +++ b/strings/xml.c @@ -81,10 +81,11 @@ static int my_xml_scan(MY_XML_PARSER *p,MY_XML_ATTR *a) a->beg=p->cur; a->end=p->cur; - if (!memcmp(p->cur,"<!--",4)) + if (!bcmp(p->cur,"<!--",4)) { - for( ; (p->cur < p->end) && memcmp(p->cur, "-->", 3); p->cur++); - if(!memcmp(p->cur, "-->", 3)) + for( ; (p->cur < p->end) && bcmp(p->cur, "-->", 3); p->cur++) + {} + if (!bcmp(p->cur, 
"-->", 3)) p->cur+=3; a->end=p->cur; lex=MY_XML_COMMENT; |