diff options
-rw-r--r-- | client/client_priv.h | 2 | ||||
-rw-r--r-- | client/mysqldump.c | 61 | ||||
-rw-r--r-- | include/m_ctype.h | 5 | ||||
-rw-r--r-- | innobase/srv/srv0srv.c | 8 | ||||
-rw-r--r-- | mysql-test/r/ctype_utf8.result | 9 | ||||
-rw-r--r-- | mysql-test/r/union.result | 78 | ||||
-rw-r--r-- | mysql-test/t/ctype_utf8.test | 8 | ||||
-rw-r--r-- | mysql-test/t/union.test | 55 | ||||
-rw-r--r-- | sql/item.cc | 4 | ||||
-rw-r--r-- | sql/sql_union.cc | 20 | ||||
-rw-r--r-- | strings/CHARSET_INFO.txt | 221 | ||||
-rw-r--r-- | strings/ctype-ucs2.c | 166 | ||||
-rw-r--r-- | strings/ctype-utf8.c | 168 |
13 files changed, 628 insertions, 177 deletions
diff --git a/client/client_priv.h b/client/client_priv.h index 328c051905c..f16ec0e802b 100644 --- a/client/client_priv.h +++ b/client/client_priv.h @@ -45,5 +45,5 @@ enum options_client OPT_COMPATIBLE, OPT_RECONNECT, OPT_DELIMITER, OPT_SECURE_AUTH, OPT_OPEN_FILES_LIMIT, OPT_SET_CHARSET, OPT_CREATE_OPTIONS, OPT_START_POSITION, OPT_STOP_POSITION, OPT_START_DATETIME, OPT_STOP_DATETIME, - OPT_SIGINT_IGNORE + OPT_SIGINT_IGNORE, OPT_HEXBLOB }; diff --git a/client/mysqldump.c b/client/mysqldump.c index 56505afd235..1686278096b 100644 --- a/client/mysqldump.c +++ b/client/mysqldump.c @@ -81,7 +81,8 @@ static my_bool verbose=0,tFlag=0,cFlag=0,dFlag=0,quick= 1, extended_insert= 1, opt_alldbs=0,opt_create_db=0,opt_first_slave=0,opt_set_charset, opt_autocommit=0,opt_master_data,opt_disable_keys=1,opt_xml=0, opt_delete_master_logs=0, tty_password=0, - opt_single_transaction=0, opt_comments= 0, opt_compact= 0; + opt_single_transaction=0, opt_comments= 0, opt_compact= 0, + opt_hex_blob=0; static ulong opt_max_allowed_packet, opt_net_buffer_length; static MYSQL mysql_connection,*sock=0; static char insert_pat[12 * 1024],*opt_password=0,*current_user=0, @@ -316,6 +317,8 @@ static struct my_option my_long_options[] = {"comments", 'i', "Write additional information.", (gptr*) &opt_comments, (gptr*) &opt_comments, 0, GET_BOOL, NO_ARG, 1, 0, 0, 0, 0, 0}, + {"hex-blob", OPT_HEXBLOB, "Dump BLOBs in HEX. this mode does not work with extended-insert", + (gptr*) &opt_hex_blob, (gptr*) &opt_hex_blob, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} }; @@ -1507,6 +1510,7 @@ static void dumpTable(uint numFields, char *table) for (i = 0; i < mysql_num_fields(res); i++) { + int is_blob; if (!(field = mysql_fetch_field(res))) { sprintf(query,"%s: Not enough fields from table %s! Aborting.\n", @@ -1515,6 +1519,17 @@ static void dumpTable(uint numFields, char *table) error= EX_CONSCHECK; goto err; } + + /* + 63 is my_charset_bin. 
If charsetnr is not 63, + we have not a BLOB but a TEXT column. + we'll dump it in hex only BLOB columns. + */ + is_blob= (opt_hex_blob && field->charsetnr == 63 && + (field->type == FIELD_TYPE_BLOB || + field->type == FIELD_TYPE_LONG_BLOB || + field->type == FIELD_TYPE_MEDIUM_BLOB || + field->type == FIELD_TYPE_TINY_BLOB)) ? 1 : 0; if (extended_insert) { ulong length = lengths[i]; @@ -1535,12 +1550,28 @@ static void dumpTable(uint numFields, char *table) error= EX_EOM; goto err; } - dynstr_append(&extended_row,"'"); - extended_row.length += - mysql_real_escape_string(&mysql_connection, - &extended_row.str[extended_row.length],row[i],length); - extended_row.str[extended_row.length]='\0'; - dynstr_append(&extended_row,"'"); + if (opt_hex_blob && is_blob) + { + ulong counter; + unsigned char *ptr= row[i]; + dynstr_append(&extended_row, "0x"); + for (counter = 0; counter < lengths[i]; counter++) + { + char xx[3]; + sprintf(xx, "%02X", ptr[counter]); + dynstr_append(&extended_row, xx); + } + } + else + { + dynstr_append(&extended_row,"'"); + extended_row.length += + mysql_real_escape_string(&mysql_connection, + &extended_row.str[extended_row.length], + row[i],length); + extended_row.str[extended_row.length]='\0'; + dynstr_append(&extended_row,"'"); + } } else { @@ -1591,8 +1622,20 @@ static void dumpTable(uint numFields, char *table) print_quoted_xml(md_result_file, row[i], lengths[i]); fputs("</field>\n", md_result_file); } - else - unescape(md_result_file, row[i], lengths[i]); + else if (opt_hex_blob && is_blob) + { /* sakaik got this idea. 
*/ + ulong counter; + char xx[4]; + unsigned char *ptr= row[i]; + fputs("0x", md_result_file); + for (counter = 0; counter < lengths[i]; counter++) + { + sprintf(xx, "%02X", ptr[counter]); + fputs(xx, md_result_file); + } + } + else + unescape(md_result_file, row[i], lengths[i]); } else { diff --git a/include/m_ctype.h b/include/m_ctype.h index 1f42b514a1b..ddc21070547 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -365,6 +365,11 @@ uint my_instr_mb(struct charset_info_st *, const char *s, uint s_length, my_match_t *match, uint nmatch); +int my_wildcmp_unicode(CHARSET_INFO *cs, + const char *str, const char *str_end, + const char *wildstr, const char *wildend, + int escape, int w_one, int w_many, + MY_UNICASE_INFO **weights); extern my_bool my_parse_charset_xml(const char *bug, uint len, int (*add)(CHARSET_INFO *cs)); diff --git a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c index d913d77fdfc..b34ae9f36f4 100644 --- a/innobase/srv/srv0srv.c +++ b/innobase/srv/srv0srv.c @@ -951,7 +951,13 @@ retry: trx->op_info = "sleeping before joining InnoDB queue"; - os_thread_sleep(50000); + /* Peter Zaitsev suggested that we take the sleep away + altogether. But the sleep may be good in pathological + situations of lots of thread switches. Simply put some + threads aside for a while to reduce the number of thread + switches. 
*/ + + os_thread_sleep(10000); trx->op_info = ""; diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result index 2e8bbc8fa92..e65eb96cb68 100644 --- a/mysql-test/r/ctype_utf8.result +++ b/mysql-test/r/ctype_utf8.result @@ -63,6 +63,15 @@ select 'A' like 'a' collate utf8_bin; select _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%'); _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%') 1 +select convert(_latin1'Günter André' using utf8) like CONVERT(_latin1'GÜNTER%' USING utf8); +convert(_latin1'Günter André' using utf8) like CONVERT(_latin1'GÜNTER%' USING utf8) +1 +select CONVERT(_koi8r'×ÁÓÑ' USING utf8) LIKE CONVERT(_koi8r'÷áóñ' USING utf8); +CONVERT(_koi8r'×ÁÓÑ' USING utf8) LIKE CONVERT(_koi8r'÷áóñ' USING utf8) +1 +select CONVERT(_koi8r'÷áóñ' USING utf8) LIKE CONVERT(_koi8r'×ÁÓÑ' USING utf8); +CONVERT(_koi8r'÷áóñ' USING utf8) LIKE CONVERT(_koi8r'×ÁÓÑ' USING utf8) +1 SELECT 'a' = 'a '; 'a' = 'a ' 1 diff --git a/mysql-test/r/union.result b/mysql-test/r/union.result index fbd4f8e11dc..7820cd1d6ff 100644 --- a/mysql-test/r/union.result +++ b/mysql-test/r/union.result @@ -1033,3 +1033,81 @@ a No aaa,bbb drop table t1,t2,t3,t4; +create table t1 as +(select _latin1'test') union +(select _latin1'TEST') union +(select _latin1'TeST'); +show create table t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `test` char(4) NOT NULL default '' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +select count(*) from t1; +count(*) +1 +drop table t1; +create table t1 as +(select _latin1'test' collate latin1_bin) union +(select _latin1'TEST') union +(select _latin1'TeST'); +show create table t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `_latin1'test' collate latin1_bin` char(4) character set latin1 collate latin1_bin NOT NULL default '' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +select count(*) from t1; +count(*) +3 +drop table t1; +create table t1 as +(select _latin1'test') union +(select _latin1'TEST' collate latin1_bin) union +(select 
_latin1'TeST'); +show create table t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `test` char(4) character set latin1 collate latin1_bin NOT NULL default '' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +select count(*) from t1; +count(*) +3 +drop table t1; +create table t1 as +(select _latin1'test') union +(select _latin1'TEST') union +(select _latin1'TeST' collate latin1_bin); +show create table t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `test` char(4) character set latin1 collate latin1_bin NOT NULL default '' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +select count(*) from t1; +count(*) +3 +drop table t1; +create table t2 ( +a char character set latin1 collate latin1_swedish_ci, +b char character set latin1 collate latin1_bin); +create table t1 as +(select a from t2) union +(select b from t2); +ERROR HY000: Illegal mix of collations for operation 'UNION' +create table t1 as +(select a collate latin1_german1_ci from t2) union +(select b from t2); +show create table t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a collate latin1_german1_ci` char(1) character set latin1 collate latin1_german1_ci default NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +drop table t1; +create table t1 as +(select a from t2) union +(select b collate latin1_german1_ci from t2); +show create table t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` char(1) character set latin1 collate latin1_german1_ci default NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +drop table t1; +drop table t2; diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test index c74bb59ae6b..238cd6daef3 100644 --- a/mysql-test/t/ctype_utf8.test +++ b/mysql-test/t/ctype_utf8.test @@ -33,6 +33,14 @@ select 'A' like 'a'; select 'A' like 'a' collate utf8_bin; select _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%'); +# Bug #6040: can't retrieve records with umlaut +# characters in case insensitive manner. 
+# Case insensitive search LIKE comparison +# was broken for multibyte characters: +select convert(_latin1'Günter André' using utf8) like CONVERT(_latin1'GÜNTER%' USING utf8); +select CONVERT(_koi8r'×ÁÓÑ' USING utf8) LIKE CONVERT(_koi8r'÷áóñ' USING utf8); +select CONVERT(_koi8r'÷áóñ' USING utf8) LIKE CONVERT(_koi8r'×ÁÓÑ' USING utf8); + # # Check the following: # "a" == "a " diff --git a/mysql-test/t/union.test b/mysql-test/t/union.test index c5e72e85835..6e16a2b02aa 100644 --- a/mysql-test/t/union.test +++ b/mysql-test/t/union.test @@ -595,3 +595,58 @@ select a as a from t3 union select "1"; select a as a from t4 union select a from t3; select a as a from t1 union select a from t4; drop table t1,t2,t3,t4; + +# +# Bug #6139 UNION doesn't understand collate in the column of second select +# +create table t1 as +(select _latin1'test') union +(select _latin1'TEST') union +(select _latin1'TeST'); +show create table t1; +select count(*) from t1; +drop table t1; + +create table t1 as +(select _latin1'test' collate latin1_bin) union +(select _latin1'TEST') union +(select _latin1'TeST'); +show create table t1; +select count(*) from t1; +drop table t1; + +create table t1 as +(select _latin1'test') union +(select _latin1'TEST' collate latin1_bin) union +(select _latin1'TeST'); +show create table t1; +select count(*) from t1; +drop table t1; + +create table t1 as +(select _latin1'test') union +(select _latin1'TEST') union +(select _latin1'TeST' collate latin1_bin); +show create table t1; +select count(*) from t1; +drop table t1; + +create table t2 ( +a char character set latin1 collate latin1_swedish_ci, +b char character set latin1 collate latin1_bin); +--error 1271 +create table t1 as +(select a from t2) union +(select b from t2); +create table t1 as +(select a collate latin1_german1_ci from t2) union +(select b from t2); +show create table t1; +drop table t1; +create table t1 as +(select a from t2) union +(select b collate latin1_german1_ci from t2); +show create table t1; 
+drop table t1; +drop table t2; + diff --git a/sql/item.cc b/sql/item.cc index 0366ea29485..304579d65a2 100644 --- a/sql/item.cc +++ b/sql/item.cc @@ -2558,8 +2558,8 @@ bool Item_type_holder::join_types(THD *thd, Item *item) if (use_new_field || use_expression_type || (new_result_type != item_type) || (new_length > max_length) || (!maybe_null && item->maybe_null) || - (item_type == STRING_RESULT && - !my_charset_same(collation.collation, item->collation.collation))) + (item_type == STRING_RESULT && + collation.collation != item->collation.collation)) { if (use_expression_type || item->type() != Item::FIELD_ITEM) field_example= 0; diff --git a/sql/sql_union.cc b/sql/sql_union.cc index b46cfc05538..fc2d2a3a5e4 100644 --- a/sql/sql_union.cc +++ b/sql/sql_union.cc @@ -264,9 +264,27 @@ int st_select_lex_unit::prepare(THD *thd_arg, select_result *sel_result, } } - // it is not single select if (first_select->next_select()) { + + // it is not single select + + /* + Check that it was possible to aggregate + all collations together for UNION. + */ + List_iterator_fast<Item> tp(types); + Item *type; + while ((type= tp++)) + { + if (type->result_type() == STRING_RESULT && + type->collation.derivation == DERIVATION_NONE) + { + my_error(ER_CANT_AGGREGATE_NCOLLATIONS, MYF(0), "UNION"); + goto err; + } + } + union_result->tmp_table_param.field_count= types.elements; if (!(table= create_tmp_table(thd_arg, &union_result->tmp_table_param, types, diff --git a/strings/CHARSET_INFO.txt b/strings/CHARSET_INFO.txt new file mode 100644 index 00000000000..883000e7ade --- /dev/null +++ b/strings/CHARSET_INFO.txt @@ -0,0 +1,221 @@ + +CHARSET_INFO +============ +A structure containing data for charset+collation pair implementation. + +Virtual functions which use this data are collected +into separate structures MY_CHARSET_HANDLER and +MY_COLLATION_HANDLER. 
+ + +typedef struct charset_info_st +{ + uint number; + uint primary_number; + uint binary_number; + uint state; + + const char *csname; + const char *name; + const char *comment; + + uchar *ctype; + uchar *to_lower; + uchar *to_upper; + uchar *sort_order; + + uint16 *tab_to_uni; + MY_UNI_IDX *tab_from_uni; + + uchar state_map[256]; + uchar ident_map[256]; + + uint strxfrm_multiply; + uint mbminlen; + uint mbmaxlen; + char max_sort_char; /* For LIKE optimization */ + + MY_CHARSET_HANDLER *cset; + MY_COLLATION_HANDLER *coll; + +} CHARSET_INFO; + + +CHARSET_INFO fields description: +=============================== + + +Numbers (identifiers) +--------------------- + +number - an ID uniquely identifying this charset+collation pair. + +primary_number - ID of a charset+collation pair, which consists +of the same character set and the default collation of this +character set. Not really used now. Intended to optimize some +parts of the code where we need to find the default collation +using its non-default counterpart for the given character set. + +binary_number - ID of a charset+collation pair, which consists +of the same character set and the binary collation of this +character set. Not really used now. + +Names +----- + + csname - name of the character set for this charset+collation pair. + name - name of the collation for this charset+collation pair. + comment - a text comment, displayed in "Description" column of + SHOW CHARACTER SET output. + +Conversion tables +----------------- + + ctype - pointer to array[257] of "type of characters" + bit mask for each character, e.g. if a + character is a digit or a letter or a separator, etc. 
+ to_lower - pointer to array[256] used in LCASE() + to_upper - pointer to array[256] used in UCASE() + sort_order - pointer to array[256] used for strings comparison + + + +Unicode conversion data +----------------------- +For 8bit character sets: + +tab_to_uni : array[256] of charset->Unicode translation +tab_from_uni: a structure for Unicode->charset translation + +Non-8 bit charsets have their own structures per charset +hidden in corresponding ctype-xxx.c file and don't use +tab_to_uni and tab_from_uni tables. + + +Parser maps +----------- +state_map[] +ident_map[] + + These maps are to quickly identify if a character is +an identifier part, a digit, a special character, +or a part of other SQL language lexical item. + +Probably can be combined with ctype array in the future. +But for some reasons these two arrays are used in the parser, +while a separate ctype[] array is used in the other part of the +code, like fulltext, etc. + + +Misc fields +----------- + + strxfrm_multiply - how many times a sort key (i.e. a string + which can be passed into memcmp() for comparison) + can be longer than the original string. + Usually it is 1. For some complex + collations it can be bigger. For example + in latin1_german2_ci, a sort key is up to + twice as long as the original string. + e.g. Letter 'A' with two dots above is + substituted with 'AE'. + mbminlen - minimum multibyte sequence length. + Now always 1 except ucs2. For ucs2 + it is 2. + mbmaxlen - maximum multibyte sequence length. + 1 for 8bit charsets. Can be also 2 or 3. + + + +MY_CHARSET_HANDLER +================== + +MY_CHARSET_HANDLER is a collection of character-set +related routines. Defined in m_ctype.h. 
Has the +following set of functions: + +Multibyte routines +------------------ +ismbchar() - detects if the given string is a multibyte sequence +mbcharlen() - returns length of multibyte sequence starting with + the given character +numchars() - returns number of characters in the given string, e.g. + in SQL function CHAR_LENGTH(). +charpos() - calculates the offset of the given position in the string. + Used in SQL functions LEFT(), RIGHT(), SUBSTRING(), + INSERT() + +well_formed_length() + - finds the length of correctly formed multibyte beginning. + Used in INSERTs to cut a beginning of the given string + which is + a) "well formed" according to the given character set. + b) can fit into the given data type + Terminates the string in the good position, taking into account + multibyte character boundaries. + +lengthsp() - returns the length of the given string without trailing spaces. + + +Unicode conversion routines +--------------------------- +mb_wc - converts the left multibyte sequence into its Unicode code. +wc_mb - converts the given Unicode code into multibyte sequence. + + +Case and sort conversion +------------------------ +caseup_str - converts the given 0-terminated string into the upper case +casedn_str - converts the given 0-terminated string into the lower case +caseup - converts the given string into the upper case using length +casedn - converts the given string into the lower case using length + +Number-to-string conversion routines +------------------------------------ +snprintf() +long10_to_str() +longlong10_to_str() + +The names are pretty self-descriptive. + +String padding routines +----------------------- +fill() - writes the given Unicode value into the given string + with the given length. Used to pad the string, usually + with space character, according to the given charset. 
+ +String-to-number conversion routines +------------------------------------ +strntol() +strntoul() +strntoll() +strntoull() +strntod() + +These functions do almost the same thing as their +STDLIB counterparts, but also: + - accept length instead of 0-terminator + - and are character set dependent + +Simple scanner routines +----------------------- +scan() - to skip leading spaces in the given string. + Used when a string value is inserted into a numeric field. + + + +MY_COLLATION_HANDLER +==================== +strnncoll() - compares two strings according to the given collation +strnncollsp() - like the above but ignores trailing spaces +strnxfrm() - makes a sort key suitable for memcmp() corresponding + to the given string +like_range() - creates a LIKE range, for optimizer +wildcmp() - wildcard comparison, for LIKE +strcasecmp() - 0-terminated string comparison +instr() - finds the first substring occurrence in the string +hash_sort() - calculates hash value taking into account + the collation rules, e.g. case-insensitivity, + accent sensitivity, etc. + +
\ No newline at end of file diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index 3247e1d7424..851c2044f47 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -1231,172 +1231,14 @@ uint my_lengthsp_ucs2(CHARSET_INFO *cs __attribute__((unused)), } -/* -** Compare string against string with wildcard -** 0 if matched -** -1 if not matched with wildcard -** 1 if matched with wildcard -*/ - -static -int my_wildcmp_ucs2(CHARSET_INFO *cs, - const char *str,const char *str_end, - const char *wildstr,const char *wildend, - int escape, int w_one, int w_many, - MY_UNICASE_INFO **weights) -{ - int result= -1; /* Not found, using wildcards */ - my_wc_t s_wc, w_wc; - int scan, plane; - - while (wildstr != wildend) - { - - while (1) - { - scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr, - (const uchar*)wildend); - if (scan <= 0) - return 1; - - if (w_wc == (my_wc_t)escape) - { - wildstr+= scan; - scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr, - (const uchar*)wildend); - if (scan <= 0) - return 1; - } - - if (w_wc == (my_wc_t)w_many) - { - result= 1; /* Found an anchor char */ - break; - } - - wildstr+= scan; - scan= my_ucs2_uni(cs, &s_wc, (const uchar*)str, (const uchar*)str_end); - if (scan <=0) - return 1; - str+= scan; - - if (w_wc == (my_wc_t)w_one) - { - result= 1; /* Found an anchor char */ - } - else - { - if (weights) - { - plane=(s_wc>>8) & 0xFF; - s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc; - plane=(w_wc>>8) & 0xFF; - w_wc = weights[plane] ? 
weights[plane][w_wc & 0xFF].sort : w_wc; - } - if (s_wc != w_wc) - return 1; /* No match */ - } - if (wildstr == wildend) - return (str != str_end); /* Match if both are at end */ - } - - - if (w_wc == (my_wc_t)w_many) - { /* Found w_many */ - - /* Remove any '%' and '_' from the wild search string */ - for ( ; wildstr != wildend ; ) - { - scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr, - (const uchar*)wildend); - if (scan <= 0) - return 1; - - if (w_wc == (my_wc_t)w_many) - { - wildstr+= scan; - continue; - } - - if (w_wc == (my_wc_t)w_one) - { - wildstr+= scan; - scan= my_ucs2_uni(cs, &s_wc, (const uchar*)str, - (const uchar*)str_end); - if (scan <=0) - return 1; - str+= scan; - continue; - } - break; /* Not a wild character */ - } - - if (wildstr == wildend) - return 0; /* Ok if w_many is last */ - - if (str == str_end) - return -1; - - scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr, - (const uchar*)wildend); - if (scan <= 0) - return 1; - - if (w_wc == (my_wc_t)escape) - { - wildstr+= scan; - scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr, - (const uchar*)wildend); - if (scan <= 0) - return 1; - } - - while (1) - { - /* Skip until the first character from wildstr is found */ - while (str != str_end) - { - scan= my_ucs2_uni(cs,&s_wc, (const uchar*)str, - (const uchar*)str_end); - if (scan <= 0) - return 1; - if (weights) - { - plane=(s_wc>>8) & 0xFF; - s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc; - plane=(w_wc>>8) & 0xFF; - w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc; - } - - if (s_wc == w_wc) - break; - str+= scan; - } - if (str == str_end) - return -1; - - result= my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend,escape, - w_one,w_many,weights); - - if (result <= 0) - return result; - - str+= scan; - } - } - } - return (str != str_end ? 
1 : 0); -} - - static int my_wildcmp_ucs2_ci(CHARSET_INFO *cs, const char *str,const char *str_end, const char *wildstr,const char *wildend, int escape, int w_one, int w_many) { - return my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend, - escape,w_one,w_many,uni_plane); + return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend, + escape,w_one,w_many,uni_plane); } @@ -1406,8 +1248,8 @@ int my_wildcmp_ucs2_bin(CHARSET_INFO *cs, const char *wildstr,const char *wildend, int escape, int w_one, int w_many) { - return my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend, - escape,w_one,w_many,NULL); + return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend, + escape,w_one,w_many,NULL); } diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index fd6610b72b1..f5d40fb8ded 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -1518,6 +1518,161 @@ MY_UNICASE_INFO *uni_plane[256]={ }; + +/* +** Compare string against string with wildcard +** This function is used in UTF8 and UCS2 +** +** 0 if matched +** -1 if not matched with wildcard +** 1 if matched with wildcard +*/ + +int my_wildcmp_unicode(CHARSET_INFO *cs, + const char *str,const char *str_end, + const char *wildstr,const char *wildend, + int escape, int w_one, int w_many, + MY_UNICASE_INFO **weights) +{ + int result= -1; /* Not found, using wildcards */ + my_wc_t s_wc, w_wc; + int scan, plane; + int (*mb_wc)(struct charset_info_st *cs, my_wc_t *wc, + const unsigned char *s,const unsigned char *e); + mb_wc= cs->cset->mb_wc; + + while (wildstr != wildend) + { + while (1) + { + if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr, + (const uchar*)wildend)) <= 0) + return 1; + + if (w_wc == (my_wc_t)escape) + { + wildstr+= scan; + if ((scan= mb_wc(cs,&w_wc, (const uchar*)wildstr, + (const uchar*)wildend)) <= 0) + return 1; + } + + if (w_wc == (my_wc_t)w_many) + { + result= 1; /* Found an anchor char */ + break; + } + + wildstr+= scan; + if ((scan= mb_wc(cs, &s_wc, (const uchar*)str, + (const uchar*)str_end)) <=0) + 
return 1; + str+= scan; + + if (w_wc == (my_wc_t)w_one) + { + result= 1; /* Found an anchor char */ + } + else + { + if (weights) + { + plane=(s_wc>>8) & 0xFF; + s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc; + plane=(w_wc>>8) & 0xFF; + w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc; + } + if (s_wc != w_wc) + return 1; /* No match */ + } + if (wildstr == wildend) + return (str != str_end); /* Match if both are at end */ + } + + + if (w_wc == (my_wc_t)w_many) + { /* Found w_many */ + + /* Remove any '%' and '_' from the wild search string */ + for ( ; wildstr != wildend ; ) + { + if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr, + (const uchar*)wildend)) <= 0) + return 1; + + if (w_wc == (my_wc_t)w_many) + { + wildstr+= scan; + continue; + } + + if (w_wc == (my_wc_t)w_one) + { + wildstr+= scan; + if ((scan= mb_wc(cs, &s_wc, (const uchar*)str, + (const uchar*)str_end)) <=0) + return 1; + str+= scan; + continue; + } + break; /* Not a wild character */ + } + + if (wildstr == wildend) + return 0; /* Ok if w_many is last */ + + if (str == str_end) + return -1; + + if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr, + (const uchar*)wildend)) <=0) + return 1; + + if (w_wc == (my_wc_t)escape) + { + wildstr+= scan; + if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr, + (const uchar*)wildend)) <=0) + return 1; + } + + while (1) + { + /* Skip until the first character from wildstr is found */ + while (str != str_end) + { + if ((scan= mb_wc(cs, &s_wc, (const uchar*)str, + (const uchar*)str_end)) <=0) + return 1; + if (weights) + { + plane=(s_wc>>8) & 0xFF; + s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc; + plane=(w_wc>>8) & 0xFF; + w_wc = weights[plane] ? 
weights[plane][w_wc & 0xFF].sort : w_wc; + } + + if (s_wc == w_wc) + break; + str+= scan; + } + if (str == str_end) + return -1; + + result= my_wildcmp_unicode(cs, str, str_end, wildstr, wildend, + escape, w_one, w_many, + weights); + + if (result <= 0) + return result; + + str+= scan; + } + } + } + return (str != str_end ? 1 : 0); +} + #endif @@ -1992,6 +2147,17 @@ static int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t) return my_strncasecmp_utf8(cs, s, t, len); } +static +int my_wildcmp_utf8(CHARSET_INFO *cs, + const char *str,const char *str_end, + const char *wildstr,const char *wildend, + int escape, int w_one, int w_many) +{ + return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend, + escape,w_one,w_many,uni_plane); +} + + static int my_strnxfrm_utf8(CHARSET_INFO *cs, uchar *dst, uint dstlen, const uchar *src, uint srclen) @@ -2060,7 +2226,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = my_strnncollsp_utf8, my_strnxfrm_utf8, my_like_range_mb, - my_wildcmp_mb, + my_wildcmp_utf8, my_strcasecmp_utf8, my_instr_mb, my_hash_sort_utf8 |