diff options
-rw-r--r-- | include/m_ctype.h | 11 | ||||
-rw-r--r-- | mysql-test/r/ctype_ucs.result | 26 | ||||
-rw-r--r-- | mysql-test/r/ctype_utf8.result | 36 | ||||
-rw-r--r-- | mysql-test/r/func_time.result | 16 | ||||
-rw-r--r-- | mysql-test/t/ctype_ucs.test | 29 | ||||
-rw-r--r-- | mysql-test/t/ctype_utf8.test | 40 | ||||
-rw-r--r-- | mysql-test/t/func_time.test | 26 | ||||
-rw-r--r-- | mysys/charset.c | 3 | ||||
-rw-r--r-- | sql/item.cc | 46 | ||||
-rw-r--r-- | sql/item.h | 43 | ||||
-rw-r--r-- | sql/item_func.cc | 2 | ||||
-rw-r--r-- | sql/item_strfunc.cc | 3 | ||||
-rw-r--r-- | sql/item_timefunc.cc | 6 | ||||
-rw-r--r-- | sql/sql_lex.cc | 6 | ||||
-rw-r--r-- | sql/sql_lex.h | 4 | ||||
-rw-r--r-- | sql/sql_string.cc | 2 | ||||
-rw-r--r-- | sql/sql_yacc.yy | 88 | ||||
-rw-r--r-- | strings/conf_to_src.c | 10 | ||||
-rw-r--r-- | strings/ctype-extra.c | 4 | ||||
-rw-r--r-- | strings/ctype.c | 86 |
20 files changed, 431 insertions, 56 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h index 9f21ac16a05..218ec2daadb 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -78,8 +78,14 @@ extern MY_UNICASE_INFO *my_unicase_turkish[256]; #define MY_CS_READY 256 /* if a charset is initialized */ #define MY_CS_AVAILABLE 512 /* If either compiled-in or loaded*/ #define MY_CS_CSSORT 1024 /* if case sensitive sort order */ +#define MY_CS_PUREASCII 2048 /* if a charset is pure ascii */ #define MY_CHARSET_UNDEFINED 0 +/* Character repertoire flags */ +#define MY_REPERTOIRE_ASCII 1 /* Pure ASCII U+0000..U+007F */ +#define MY_REPERTOIRE_EXTENDED 2 /* Extended characters: U+0080..U+FFFF */ +#define MY_REPERTOIRE_UNICODE30 3 /* ASCII | EXTENDED: U+0000..U+FFFF */ + typedef struct my_uni_idx_st { @@ -436,6 +442,11 @@ my_bool my_propagate_simple(CHARSET_INFO *cs, const uchar *str, uint len); my_bool my_propagate_complex(CHARSET_INFO *cs, const uchar *str, uint len); +uint my_string_repertoire(CHARSET_INFO *cs, const char *str, ulong len); +my_bool my_charset_is_ascii_based(CHARSET_INFO *cs); +my_bool my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs); + + #define _MY_U 01 /* Upper case */ #define _MY_L 02 /* Lower case */ #define _MY_NMR 04 /* Numeral (digit) */ diff --git a/mysql-test/r/ctype_ucs.result b/mysql-test/r/ctype_ucs.result index 006f4193ca1..023267c227c 100644 --- a/mysql-test/r/ctype_ucs.result +++ b/mysql-test/r/ctype_ucs.result @@ -896,4 +896,30 @@ select hex(convert(s1 using latin1)) from t1; hex(convert(s1 using latin1)) 7F drop table t1; +create table t1 (a varchar(15) character set ascii not null, b int); +insert into t1 values ('a',1); +select concat(a,if(b<10,_ucs2 0x0061,_ucs2 0x0062)) from t1; +concat(a,if(b<10,_ucs2 0x0061,_ucs2 0x0062)) +aa +select concat(a,if(b>10,_ucs2 0x0061,_ucs2 0x0062)) from t1; +concat(a,if(b>10,_ucs2 0x0061,_ucs2 0x0062)) +ab +select * from t1 where a=if(b<10,_ucs2 0x0061,_ucs2 0x0062); +a b +a 1 +select * from t1 where a=if(b>10,_ucs2 0x0061,_ucs2 0x0062); +a b +select concat(a,if(b<10,_ucs2 0x00C0,_ucs2 0x0062)) from t1; +ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation 'concat' +select concat(a,if(b>10,_ucs2 0x00C0,_ucs2 0x0062)) from t1; +ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation 'concat' +select concat(a,if(b<10,_ucs2 0x0062,_ucs2 0x00C0)) from t1; +ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation 'concat' +select concat(a,if(b>10,_ucs2 0x0062,_ucs2 0x00C0)) from t1; +ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation 'concat' +select * from t1 where a=if(b<10,_ucs2 0x00C0,_ucs2 0x0062); +ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation '=' +select * from t1 where a=if(b<10,_ucs2 0x0062,_ucs2 0x00C0); +ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation '=' +drop table t1; End of 5.0 tests diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result index d38480dced1..710cac388a5 100644 --- a/mysql-test/r/ctype_utf8.result +++ b/mysql-test/r/ctype_utf8.result @@ -1639,6 +1639,42 @@ coercibility(col1) collation(col1) 0 utf8_swedish_ci drop view v1, v2; drop table t1; +set names utf8; +create table t1 (a varchar(10) character set latin1, b int); +insert into t1 values ('a',1); +select concat(a, if(b>10, N'x', N'y')) from t1; +concat(a, if(b>10, N'x', N'y')) +ay +select concat(a, if(b>10, N'æ', N'ß')) from t1; +ERROR HY000: Illegal mix of collations (latin1_swedish_ci,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation 'concat' +drop table t1; +set names utf8; +create table t1 (a varchar(10) character set latin1, b int); +insert into t1 values ('a',1); +select concat(a, if(b>10, _utf8'x', _utf8'y')) from t1; +concat(a, if(b>10, _utf8'x', _utf8'y')) +ay +select concat(a, if(b>10, _utf8'æ', _utf8'ß')) from t1; +ERROR HY000: Illegal mix of collations (latin1_swedish_ci,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation 'concat' +drop table t1; +set names utf8; +create table t1 (a varchar(10) character set latin1, b int); +insert into t1 values ('a',1); +select concat(a, if(b>10, _utf8 0x78, _utf8 0x79)) from t1; +concat(a, if(b>10, _utf8 0x78, _utf8 0x79)) +ay +select concat(a, if(b>10, _utf8 0xC3A6, _utf8 0xC3AF)) from t1; +ERROR HY000: Illegal mix of collations (latin1_swedish_ci,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation 'concat' +drop table t1; +set names utf8; +create table t1 (a varchar(10) character set latin1, b int); +insert into t1 values ('a',1); +select concat(a, if(b>10, 'x' 'x', 'y' 'y')) from t1; +concat(a, if(b>10, 'x' 'x', 'y' 'y')) +ayy +select concat(a, if(b>10, 'x' 'æ', 'y' 'ß')) from t1; +ERROR HY000: Illegal mix of collations (latin1_swedish_ci,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation 'concat' +drop table t1; CREATE TABLE t1 ( colA int(11) NOT NULL, colB varchar(255) character set utf8 NOT NULL, diff --git a/mysql-test/r/func_time.result b/mysql-test/r/func_time.result index f92ecac329a..ee8b8c1e908 100644 --- a/mysql-test/r/func_time.result +++ b/mysql-test/r/func_time.result @@ -1246,3 +1246,19 @@ SELECT TIME_FORMAT(SEC_TO_TIME(a),"%H:%i:%s") FROM (SELECT 3020399 AS a UNION SE TIME_FORMAT(SEC_TO_TIME(a),"%H:%i:%s") 838:59:58 838:59:59 +set names latin1; +create table t1 (a varchar(15) character set ascii not null); +insert into t1 values ('070514-000000'); +select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1; +concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) +# +set names swe7; +select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1; +ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (swe7_swedish_ci,COERCIBLE) for operation 'concat' +set names latin1; +set lc_time_names=fr_FR; +select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1; +ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (latin1_swedish_ci,COERCIBLE) for operation 'concat' +set lc_time_names=en_US; +drop table t1; +End of 5.0 tests diff --git a/mysql-test/t/ctype_ucs.test b/mysql-test/t/ctype_ucs.test index 18a18d6c632..bca3a9c3a96 100644 --- a/mysql-test/t/ctype_ucs.test +++ b/mysql-test/t/ctype_ucs.test @@ -622,4 +622,33 @@ select hex(s2) from t1; select hex(convert(s1 using latin1)) from t1; drop table t1; +# +# Conversion from UCS2 to ASCII is possible +# if the UCS2 string consists of only ASCII characters +# +create table t1 (a varchar(15) character set ascii not null, b int); +insert into t1 values ('a',1); +select concat(a,if(b<10,_ucs2 0x0061,_ucs2 0x0062)) from t1; +select concat(a,if(b>10,_ucs2 0x0061,_ucs2 0x0062)) from t1; +select * from t1 where a=if(b<10,_ucs2 0x0061,_ucs2 0x0062); +select * from t1 where a=if(b>10,_ucs2 0x0061,_ucs2 0x0062); + +# +# Conversion from UCS2 to ASCII is not possible if +# the UCS2 string has non-ASCII characters +# +--error 1267 +select concat(a,if(b<10,_ucs2 0x00C0,_ucs2 0x0062)) from t1; +--error 1267 +select concat(a,if(b>10,_ucs2 0x00C0,_ucs2 0x0062)) from t1; +--error 1267 +select concat(a,if(b<10,_ucs2 0x0062,_ucs2 0x00C0)) from t1; +--error 1267 +select concat(a,if(b>10,_ucs2 0x0062,_ucs2 0x00C0)) from t1; +--error 1267 +select * from t1 where a=if(b<10,_ucs2 0x00C0,_ucs2 0x0062); +--error 1267 +select * from t1 where a=if(b<10,_ucs2 0x0062,_ucs2 0x00C0); +drop table t1; + --echo End of 5.0 tests diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test index 0f1a7cf8c84..f8eed0bae9a 100644 --- a/mysql-test/t/ctype_utf8.test +++ b/mysql-test/t/ctype_utf8.test @@ -1314,6 +1314,46 @@ select coercibility(col1), collation(col1) from v2; drop view v1, v2; drop table t1; +# +# Check conversion of NCHAR strings to subset (e.g. latin1). +# Conversion is possible if string repertoire is ASCII. +# Conversion is not possible if the string have extended characters +# +set names utf8; +create table t1 (a varchar(10) character set latin1, b int); +insert into t1 values ('a',1); +select concat(a, if(b>10, N'x', N'y')) from t1; +--error 1267 +select concat(a, if(b>10, N'æ', N'ß')) from t1; +drop table t1; + +# Conversion tests for character set introducers +set names utf8; +create table t1 (a varchar(10) character set latin1, b int); +insert into t1 values ('a',1); +select concat(a, if(b>10, _utf8'x', _utf8'y')) from t1; +--error 1267 +select concat(a, if(b>10, _utf8'æ', _utf8'ß')) from t1; +drop table t1; + +# Conversion tests for introducer + HEX string +set names utf8; +create table t1 (a varchar(10) character set latin1, b int); +insert into t1 values ('a',1); +select concat(a, if(b>10, _utf8 0x78, _utf8 0x79)) from t1; +--error 1267 +select concat(a, if(b>10, _utf8 0xC3A6, _utf8 0xC3AF)) from t1; +drop table t1; + +# Conversion tests for "text_literal TEXT_STRING_literal" syntax structure +set names utf8; +create table t1 (a varchar(10) character set latin1, b int); +insert into t1 values ('a',1); +select concat(a, if(b>10, 'x' 'x', 'y' 'y')) from t1; +--error 1267 +select concat(a, if(b>10, 'x' 'æ', 'y' 'ß')) from t1; +drop table t1; + # # Bug#19960: Inconsistent results when joining diff --git a/mysql-test/t/func_time.test b/mysql-test/t/func_time.test index 4f35a681228..86c848983fa 100644 --- a/mysql-test/t/func_time.test +++ b/mysql-test/t/func_time.test @@ -752,3 +752,29 @@ DROP TABLE t1; # Check if using GROUP BY with TIME_FORMAT() produces correct results SELECT TIME_FORMAT(SEC_TO_TIME(a),"%H:%i:%s") FROM (SELECT 3020399 AS a UNION SELECT 3020398 ) x GROUP BY 1; + +# +# Bug#28875 Conversion between ASCII and LATIN1 charsets does not function +# +set names latin1; +create table t1 (a varchar(15) character set ascii not null); +insert into t1 values ('070514-000000'); +# Conversion of date_format() result to ASCII +# is safe with the default locale en_US +--replace_column 1 # +select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1; +# Error for swe7: it is not ASCII compatible +set names swe7; +--error 1267 +select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1; +set names latin1; +# Conversion of date_format() result to ASCII +# is not safe with the non-default locale fr_FR +# because month and day names can have accented characters +set lc_time_names=fr_FR; +--error 1267 +select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1; +set lc_time_names=en_US; +drop table t1; + +--echo End of 5.0 tests diff --git a/mysys/charset.c b/mysys/charset.c index 9ea17c6515c..4c3f2d0ab71 100644 --- a/mysys/charset.c +++ b/mysys/charset.c @@ -277,6 +277,9 @@ static int add_collation(CHARSET_INFO *cs) if (sort_order && sort_order['A'] < sort_order['a'] && sort_order['a'] < sort_order['B']) all_charsets[cs->number]->state|= MY_CS_CSSORT; + + if (my_charset_is_8bit_pure_ascii(all_charsets[cs->number])) + all_charsets[cs->number]->state|= MY_CS_PUREASCII; } } else diff --git a/sql/item.cc b/sql/item.cc index 9612fbc5243..c99946ac9d2 100644 --- a/sql/item.cc +++ b/sql/item.cc @@ -1327,6 +1327,25 @@ void Item::split_sum_func2(THD *thd, Item **ref_pointer_array, } +static bool +left_is_superset(DTCollation *left, DTCollation *right) +{ + /* Allow convert to Unicode */ + if (left->collation->state & MY_CS_UNICODE && + (left->derivation < right->derivation || + (left->derivation == right->derivation && + !(right->collation->state & MY_CS_UNICODE)))) + return TRUE; + /* Allow convert from ASCII */ + if (right->repertoire == MY_REPERTOIRE_ASCII && + (left->derivation < right->derivation || + (left->derivation == right->derivation && + !(left->repertoire == MY_REPERTOIRE_ASCII)))) + return TRUE; + /* Disallow conversion otherwise */ + return FALSE; +} + /* Aggregate two collations together taking into account their coercibility (aka derivation): @@ -1391,18 +1410,12 @@ bool DTCollation::aggregate(DTCollation &dt, uint flags) ; // Do nothing } else if ((flags & MY_COLL_ALLOW_SUPERSET_CONV) && - collation->state & MY_CS_UNICODE && - (derivation < dt.derivation || - (derivation == dt.derivation && - !(dt.collation->state & MY_CS_UNICODE)))) + left_is_superset(this, &dt)) { // Do nothing } else if ((flags & MY_COLL_ALLOW_SUPERSET_CONV) && - dt.collation->state & MY_CS_UNICODE && - (dt.derivation < derivation || - (dt.derivation == derivation && - !(collation->state & MY_CS_UNICODE)))) + left_is_superset(&dt, this)) { set(dt); } @@ -1421,7 +1434,7 @@ bool DTCollation::aggregate(DTCollation &dt, uint flags) else { // Cannot apply conversion - set(0, DERIVATION_NONE); + set(0, DERIVATION_NONE, 0); return 1; } } @@ -1443,8 +1456,8 @@ bool DTCollation::aggregate(DTCollation &dt, uint flags) { if (derivation == DERIVATION_EXPLICIT) { - set(0, DERIVATION_NONE); - return 1; + set(0, DERIVATION_NONE, 0); + return 1; } if (collation->state & MY_CS_BINSORT) return 0; @@ -1458,6 +1471,7 @@ bool DTCollation::aggregate(DTCollation &dt, uint flags) set(bin, DERIVATION_NONE); } } + repertoire|= dt.repertoire; return 0; } @@ -1597,12 +1611,16 @@ bool agg_item_charsets(DTCollation &coll, const char *fname, { Item* conv; uint32 dummy_offset; - if (!String::needs_conversion(0, coll.collation, - (*arg)->collation.collation, + if (!String::needs_conversion(0, (*arg)->collation.collation, + coll.collation, &dummy_offset)) continue; - if (!(conv= (*arg)->safe_charset_converter(coll.collation))) + if (!(conv= (*arg)->safe_charset_converter(coll.collation)) && + ((*arg)->collation.repertoire == MY_REPERTOIRE_ASCII)) + conv= new Item_func_conv_charset(*arg, coll.collation, 1); + + if (!conv) { if (nargs >=2 && nargs <= 3) { diff --git a/sql/item.h b/sql/item.h index 23f6977a0f8..3c699c0eda3 100644 --- a/sql/item.h +++ b/sql/item.h @@ -49,29 +49,50 @@ class DTCollation { public: CHARSET_INFO *collation; enum Derivation derivation; + uint repertoire; + void set_repertoire_from_charset(CHARSET_INFO *cs) + { + repertoire= cs->state & MY_CS_PUREASCII ? + MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30; + } DTCollation() { collation= &my_charset_bin; derivation= DERIVATION_NONE; + repertoire= MY_REPERTOIRE_UNICODE30; } DTCollation(CHARSET_INFO *collation_arg, Derivation derivation_arg) { collation= collation_arg; derivation= derivation_arg; + set_repertoire_from_charset(collation_arg); } void set(DTCollation &dt) { collation= dt.collation; derivation= dt.derivation; + repertoire= dt.repertoire; } void set(CHARSET_INFO *collation_arg, Derivation derivation_arg) { collation= collation_arg; derivation= derivation_arg; + set_repertoire_from_charset(collation_arg); + } + void set(CHARSET_INFO *collation_arg, + Derivation derivation_arg, + uint repertoire_arg) + { + collation= collation_arg; + derivation= derivation_arg; + repertoire= repertoire_arg; } void set(CHARSET_INFO *collation_arg) - { collation= collation_arg; } + { + collation= collation_arg; + set_repertoire_from_charset(collation_arg); + } void set(Derivation derivation_arg) { derivation= derivation_arg; } bool aggregate(DTCollation &dt, uint flags= 0); @@ -1672,10 +1693,11 @@ class Item_string :public Item { public: Item_string(const char *str,uint length, - CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE) + CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE, + uint repertoire= MY_REPERTOIRE_UNICODE30) { - collation.set(cs, dv); - str_value.set_or_copy_aligned(str,length,cs); + str_value.set_or_copy_aligned(str, length, cs); + collation.set(cs, dv, repertoire); /* We have to have a different max_length than 'length' here to ensure that we get the right length if we do use the item @@ -1699,10 +1721,11 @@ public: fixed= 1; } Item_string(const char *name_par, const char *str, uint length, - CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE) + CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE, + uint repertoire= MY_REPERTOIRE_UNICODE30) { - collation.set(cs, dv); - str_value.set_or_copy_aligned(str,length,cs); + str_value.set_or_copy_aligned(str, length, cs); + collation.set(cs, dv, repertoire); max_length= str_value.numchars()*cs->mbmaxlen; set_name(name_par, 0, cs); decimals=NOT_FIXED_DEC; @@ -1718,6 +1741,12 @@ public: str_value.copy(str_arg, length_arg, collation.collation); max_length= str_value.numchars() * collation.collation->mbmaxlen; } + void set_repertoire_from_value() + { + collation.repertoire= my_string_repertoire(str_value.charset(), + str_value.ptr(), + str_value.length()); + } enum Type type() const { return STRING_ITEM; } double val_real(); longlong val_int(); diff --git a/sql/item_func.cc b/sql/item_func.cc index c70cfa1ce2a..d03d497dfd0 100644 --- a/sql/item_func.cc +++ b/sql/item_func.cc @@ -3767,7 +3767,7 @@ static user_var_entry *get_variable(HASH *hash, LEX_STRING &name, entry->value=0; entry->length=0; entry->update_query_id=0; - entry->collation.set(NULL, DERIVATION_IMPLICIT); + entry->collation.set(NULL, DERIVATION_IMPLICIT, 0); entry->unsigned_flag= 0; /* If we are here, we were called from a SET or a query which sets a diff --git a/sql/item_strfunc.cc b/sql/item_strfunc.cc index 20a4b64640a..0c11c9eece8 100644 --- a/sql/item_strfunc.cc +++ b/sql/item_strfunc.cc @@ -2673,7 +2673,8 @@ void Item_func_set_collation::fix_length_and_dec() colname, args[0]->collation.collation->csname); return; } - collation.set(set_collation, DERIVATION_EXPLICIT); + collation.set(set_collation, DERIVATION_EXPLICIT, + args[0]->collation.repertoire); max_length= args[0]->max_length; } diff --git a/sql/item_timefunc.cc b/sql/item_timefunc.cc index 873e2833a1e..ae18e4786d7 100644 --- a/sql/item_timefunc.cc +++ b/sql/item_timefunc.cc @@ -1718,7 +1718,11 @@ void Item_func_date_format::fix_length_and_dec() Item *arg1= args[1]->this_item(); decimals=0; - collation.set(thd->variables.collation_connection); + CHARSET_INFO *cs= thd->variables.collation_connection; + uint32 repertoire= arg1->collation.repertoire; + if (!thd->variables.lc_time_names->is_ascii) + repertoire|= MY_REPERTOIRE_EXTENDED; + collation.set(cs, arg1->collation.derivation, repertoire); if (arg1->type() == STRING_ITEM) { // Optimize the normal case fixed_length=1; diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc index a62d8b383a5..7911da69862 100644 --- a/sql/sql_lex.cc +++ b/sql/sql_lex.cc @@ -311,10 +311,12 @@ static char *get_text(Lex_input_stream *lip) uint found_escape=0; CHARSET_INFO *cs= lip->m_thd->charset(); + lip->tok_bitmap= 0; sep= yyGetLast(); // String should end with this while (lip->ptr != lip->end_of_query) { - c = yyGet(); + c= yyGet(); + lip->tok_bitmap|= c; #ifdef USE_MB { int l; @@ -605,6 +607,7 @@ int MYSQLlex(void *arg, void *yythd) break; } yylval->lex_str.length= lip->yytoklen; + lex->text_string_is_7bit= (lip->tok_bitmap & 0x80) ? 0 : 1; return(NCHAR_STRING); case MY_LEX_IDENT_OR_HEX: @@ -926,6 +929,7 @@ int MYSQLlex(void *arg, void *yythd) break; } yylval->lex_str.length=lip->yytoklen; + lex->text_string_is_7bit= (lip->tok_bitmap & 0x80) ? 0 : 1; return(TEXT_STRING); case MY_LEX_COMMENT: // Comment diff --git a/sql/sql_lex.h b/sql/sql_lex.h index bfa6c05974f..b9c6abd2b06 100644 --- a/sql/sql_lex.h +++ b/sql/sql_lex.h @@ -957,6 +957,9 @@ public: /** Position of ';' in the stream, to delimit multiple queries. */ const char* found_semicolon; + + /** Token character bitmaps, to detect 7bit strings. */ + uchar tok_bitmap; /** SQL_MODE = IGNORE_SPACE. */ bool ignore_space; @@ -994,6 +997,7 @@ typedef struct st_lex : public Query_tables_list gptr yacc_yyss,yacc_yyvs; THD *thd; CHARSET_INFO *charset, *underscore_charset; + bool text_string_is_7bit; /* store original leaf_tables for INSERT SELECT and PS/SP */ TABLE_LIST *leaf_tables_insert; /* Position (first character index) of SELECT of CREATE VIEW statement */ diff --git a/sql/sql_string.cc b/sql/sql_string.cc index 9d7df73cd7a..a87074c3359 100644 --- a/sql/sql_string.cc +++ b/sql/sql_string.cc @@ -263,6 +263,8 @@ bool String::needs_conversion(uint32 arg_length, (to_cs == &my_charset_bin) || (to_cs == from_cs) || my_charset_same(from_cs, to_cs) || + (my_charset_is_ascii_based(to_cs) && + my_charset_is_8bit_pure_ascii(from_cs)) || ((from_cs == &my_charset_bin) && (!(*offset=(arg_length % to_cs->mbminlen))))) return FALSE; diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy index 638da3b1bb0..a53cadafda8 100644 --- a/sql/sql_yacc.yy +++ b/sql/sql_yacc.yy @@ -7523,18 +7523,54 @@ opt_load_data_set_spec: /* Common definitions */ text_literal: - TEXT_STRING_literal - { - THD *thd= YYTHD; - $$ = new Item_string($1.str,$1.length,thd->variables.collation_connection); - } - | NCHAR_STRING - { $$= new Item_string($1.str,$1.length,national_charset_info); } - | UNDERSCORE_CHARSET TEXT_STRING - { $$ = new Item_string($2.str,$2.length,Lex->underscore_charset); } - | text_literal TEXT_STRING_literal - { ((Item_string*) $1)->append($2.str,$2.length); } - ; + TEXT_STRING + { + LEX_STRING tmp; + THD *thd= YYTHD; + CHARSET_INFO *cs_con= thd->variables.collation_connection; + CHARSET_INFO *cs_cli= thd->variables.character_set_client; + uint repertoire= thd->lex->text_string_is_7bit && + my_charset_is_ascii_based(cs_cli) ? + MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30; + if (thd->charset_is_collation_connection || + (repertoire == MY_REPERTOIRE_ASCII && + my_charset_is_ascii_based(cs_con))) + tmp= $1; + else + thd->convert_string(&tmp, cs_con, $1.str, $1.length, cs_cli); + $$= new Item_string(tmp.str, tmp.length, cs_con, + DERIVATION_COERCIBLE, repertoire); + } + | NCHAR_STRING + { + uint repertoire= Lex->text_string_is_7bit ? + MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30; + DBUG_ASSERT(my_charset_is_ascii_based(national_charset_info)); + $$= new Item_string($1.str, $1.length, national_charset_info, + DERIVATION_COERCIBLE, repertoire); + } + | UNDERSCORE_CHARSET TEXT_STRING + { + $$= new Item_string($2.str, $2.length, Lex->underscore_charset); + ((Item_string*) $$)->set_repertoire_from_value(); + } + | text_literal TEXT_STRING_literal + { + Item_string* item= (Item_string*) $1; + item->append($2.str, $2.length); + if (!(item->collation.repertoire & MY_REPERTOIRE_EXTENDED)) + { + /* + If the string has been pure ASCII so far, + check the new part. + */ + CHARSET_INFO *cs= YYTHD->variables.collation_connection; + item->collation.repertoire|= my_string_repertoire(cs, + $2.str, + $2.length); + } + } + ; text_string: TEXT_STRING_literal @@ -7606,20 +7642,22 @@ literal: | TRUE_SYM { $$= new Item_int((char*) "TRUE",1,1); } | HEX_NUM { $$ = new Item_hex_string($1.str, $1.length);} | BIN_NUM { $$= new Item_bin_string($1.str, $1.length); } - | UNDERSCORE_CHARSET HEX_NUM - { - Item *tmp= new Item_hex_string($2.str, $2.length); - /* - it is OK only emulate fix_fieds, because we need only + | UNDERSCORE_CHARSET HEX_NUM + { + Item *tmp= new Item_hex_string($2.str, $2.length); + /* + it is OK only emulate fix_fieds, because we need only value of constant - */ - String *str= tmp ? - tmp->quick_fix_field(), tmp->val_str((String*) 0) : - (String*) 0; - $$= new Item_string(str ? str->ptr() : "", - str ? str->length() : 0, - Lex->underscore_charset); - } + */ + String *str= tmp ? + tmp->quick_fix_field(), tmp->val_str((String*) 0) : + (String*) 0; + $$= new Item_string(str ? str->ptr() : "", + str ? str->length() : 0, + Lex->underscore_charset); + if ($$) + ((Item_string *) $$)->set_repertoire_from_value(); + } | UNDERSCORE_CHARSET BIN_NUM { Item *tmp= new Item_bin_string($2.str, $2.length); diff --git a/strings/conf_to_src.c b/strings/conf_to_src.c index 75776d5e6d0..dc2a300a2ec 100644 --- a/strings/conf_to_src.c +++ b/strings/conf_to_src.c @@ -179,14 +179,16 @@ is_case_sensitive(CHARSET_INFO *cs) cs->sort_order['a'] < cs->sort_order['B']) ? 1 : 0; } + void dispcset(FILE *f,CHARSET_INFO *cs) { fprintf(f,"{\n"); fprintf(f," %d,%d,%d,\n",cs->number,0,0); - fprintf(f," MY_CS_COMPILED%s%s%s,\n", - cs->state & MY_CS_BINSORT ? "|MY_CS_BINSORT" : "", - cs->state & MY_CS_PRIMARY ? "|MY_CS_PRIMARY" : "", - is_case_sensitive(cs) ? "|MY_CS_CSSORT" : ""); + fprintf(f," MY_CS_COMPILED%s%s%s%s,\n", + cs->state & MY_CS_BINSORT ? "|MY_CS_BINSORT" : "", + cs->state & MY_CS_PRIMARY ? "|MY_CS_PRIMARY" : "", + is_case_sensitive(cs) ? "|MY_CS_CSSORT" : "", + my_charset_is_8bit_pure_ascii(cs) ? "|MY_CS_PUREASCII" : ""); if (cs->name) { diff --git a/strings/ctype-extra.c b/strings/ctype-extra.c index bf45b5b5d75..38aa3a05adf 100644 --- a/strings/ctype-extra.c +++ b/strings/ctype-extra.c @@ -6722,7 +6722,7 @@ CHARSET_INFO compiled_charsets[] = { #ifdef HAVE_CHARSET_ascii { 11,0,0, - MY_CS_COMPILED|MY_CS_PRIMARY, + MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_PUREASCII, "ascii", /* cset name */ "ascii_general_ci", /* coll name */ "", /* comment */ @@ -7811,7 +7811,7 @@ CHARSET_INFO compiled_charsets[] = { #ifdef HAVE_CHARSET_ascii { 65,0,0, - MY_CS_COMPILED|MY_CS_BINSORT, + MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_PUREASCII, "ascii", /* cset name */ "ascii_bin", /* coll name */ "", /* comment */ diff --git a/strings/ctype.c b/strings/ctype.c index e7399c5438b..372a1a8a468 100644 --- a/strings/ctype.c +++ b/strings/ctype.c @@ -306,3 +306,89 @@ my_bool my_parse_charset_xml(const char *buf, uint len, my_xml_parser_free(&p); return rc; } + + +/* + Check repertoire: detect pure ascii strings +*/ +uint +my_string_repertoire(CHARSET_INFO *cs, const char *str, ulong length) +{ + const char *strend= str + length; + if (cs->mbminlen == 1) + { + for ( ; str < strend; str++) + { + if (((uchar) *str) > 0x7F) + return MY_REPERTOIRE_UNICODE30; + } + } + else + { + my_wc_t wc; + int chlen; + for (; (chlen= cs->cset->mb_wc(cs, &wc, str, strend)) > 0; str+= chlen) + { + if (wc > 0x7F) + return MY_REPERTOIRE_UNICODE30; + } + } + return MY_REPERTOIRE_ASCII; +} + + +/* + Detect whether a character set is ASCII compatible. + + Returns TRUE for: + + - all 8bit character sets whose Unicode mapping of 0x7B is '{' + (ignores swe7 which maps 0x7B to "LATIN LETTER A WITH DIAERESIS") + + - all multi-byte character sets having mbminlen == 1 + (ignores ucs2 whose mbminlen is 2) + + TODO: + + When merging to 5.2, this function should be changed + to check a new flag MY_CS_NONASCII, + + return (cs->flag & MY_CS_NONASCII) ? 0 : 1; + + This flag was previously added into 5.2 under terms + of WL#3759 "Optimize identifier conversion in client-server protocol" + especially to mark character sets not compatible with ASCII. + + We won't backport this flag to 5.0 or 5.1. + This function is Ok for 5.0 and 5.1, because we're not going + to introduce new tricky character sets between 5.0 and 5.2. +*/ +my_bool +my_charset_is_ascii_based(CHARSET_INFO *cs) +{ + return + (cs->mbmaxlen == 1 && cs->tab_to_uni && cs->tab_to_uni['{'] == '{') || + (cs->mbminlen == 1 && cs->mbmaxlen > 1); +} + + +/* + Detect if a character set is 8bit, + and it is pure ascii, i.e. doesn't have + characters outside U+0000..U+007F + This functions is shared between "conf_to_src" + and dynamic charsets loader in "mysqld". +*/ +my_bool +my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs) +{ + size_t code; + if (!cs->tab_to_uni) + return 0; + for (code= 0; code < 256; code++) + { + if (cs->tab_to_uni[code] > 0x7F) + return 0; + } + return 1; +} |