diff options
author | unknown <bar@mysql.com/bar.myoffice.izhnet.ru> | 2007-08-03 15:25:23 +0500 |
---|---|---|
committer | unknown <bar@mysql.com/bar.myoffice.izhnet.ru> | 2007-08-03 15:25:23 +0500 |
commit | 53df09a9a6a99b82e2a8869eb16737a78772b29e (patch) | |
tree | 01c0ee2f5244a4d68fc6cdfb6555fa1af4589a8c | |
parent | b307fc4d8fd5380cec948c07550b5ae73624e274 (diff) | |
download | mariadb-git-53df09a9a6a99b82e2a8869eb16737a78772b29e.tar.gz |
Bug#28875 Conversion between ASCII and LATIN1 charsets does not function
(Regression, caused by a patch for the bug 22646).
Problem: when the result type of date_format() was changed from
binary string to character string, mixing date_format()
with an ASCII column in CONCAT() stopped working.
Fix:
- adding "repertoire" flag into DTCollation class,
to mark items which can return only pure ASCII strings.
- allow character set conversion from pure ASCII to other character sets.
include/m_ctype.h:
Defining new flags.
Adding new function prototypes.
mysql-test/r/ctype_ucs.result:
Adding tests.
mysql-test/r/ctype_utf8.result:
Adding tests.
mysql-test/r/func_time.result:
Adding tests.
mysql-test/t/ctype_ucs.test:
Adding tests.
mysql-test/t/ctype_utf8.test:
Adding tests.
mysql-test/t/func_time.test:
Adding test.
mysys/charset.c:
Adding pure ASCII detection when loading a dynamic character set.
sql/item.cc:
- Moving detection of a Unicode superset into function.
- Adding detection of an ASCII subset.
- Adding creation of a to-ASCII character set converter when
safe_charset_converter() failed and the argument's
repertoire is known to be pure ASCII.
sql/item.h:
- Adding "repertoire" member into DTCollation class.
- Adding "repertoire" argument to constructors.
- Adding new methods:
set_repertoire_from_charset()
set_repertoire_from_value()
sql/item_func.cc:
Adding "repertoire" argument.
sql/item_strfunc.cc:
Adding "repertoire" argument.
sql/item_timefunc.cc:
Initializing the result repertoire taking into account the "is_ascii"
flag of the current locale.
sql/sql_lex.cc:
Detect 7bit strings, return in Lex->text_string_is_7bit.
sql/sql_lex.h:
Adding new member into LEX structure.
Adding new member into Lex_input_stream
sql/sql_string.cc:
Allow simple copy from pure ASCII to an ASCII-based character set.
sql/sql_yacc.yy:
Depending on Lex->text_string_is_7bit and character set features,
create Item_string with MY_REPERTOIRE_ASCII when it is possible.
strings/conf_to_src.c:
- Adding printing of the "MY_CS_PUREASCII" flag
- Adding printing of copyright
strings/ctype-extra.c:
Recreating ctype-extra.c: ascii_general_ci and ascii_bin
are now marked with MY_CS_PUREASCII flag.
strings/ctype.c:
Adding new functions.
-rw-r--r-- | include/m_ctype.h | 11 | ||||
-rw-r--r-- | mysql-test/r/ctype_ucs.result | 26 | ||||
-rw-r--r-- | mysql-test/r/ctype_utf8.result | 36 | ||||
-rw-r--r-- | mysql-test/r/func_time.result | 16 | ||||
-rw-r--r-- | mysql-test/t/ctype_ucs.test | 30 | ||||
-rw-r--r-- | mysql-test/t/ctype_utf8.test | 40 | ||||
-rw-r--r-- | mysql-test/t/func_time.test | 26 | ||||
-rw-r--r-- | mysys/charset.c | 3 | ||||
-rw-r--r-- | sql/item.cc | 46 | ||||
-rw-r--r-- | sql/item.h | 43 | ||||
-rw-r--r-- | sql/item_func.cc | 2 | ||||
-rw-r--r-- | sql/item_strfunc.cc | 3 | ||||
-rw-r--r-- | sql/item_timefunc.cc | 6 | ||||
-rw-r--r-- | sql/sql_lex.cc | 6 | ||||
-rw-r--r-- | sql/sql_lex.h | 4 | ||||
-rw-r--r-- | sql/sql_string.cc | 2 | ||||
-rw-r--r-- | sql/sql_yacc.yy | 88 | ||||
-rw-r--r-- | strings/conf_to_src.c | 33 | ||||
-rw-r--r-- | strings/ctype-extra.c | 7 | ||||
-rw-r--r-- | strings/ctype.c | 86 |
20 files changed, 457 insertions, 57 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h index 9f21ac16a05..218ec2daadb 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -78,8 +78,14 @@ extern MY_UNICASE_INFO *my_unicase_turkish[256]; #define MY_CS_READY 256 /* if a charset is initialized */ #define MY_CS_AVAILABLE 512 /* If either compiled-in or loaded*/ #define MY_CS_CSSORT 1024 /* if case sensitive sort order */ +#define MY_CS_PUREASCII 2048 /* if a charset is pure ascii */ #define MY_CHARSET_UNDEFINED 0 +/* Character repertoire flags */ +#define MY_REPERTOIRE_ASCII 1 /* Pure ASCII U+0000..U+007F */ +#define MY_REPERTOIRE_EXTENDED 2 /* Extended characters: U+0080..U+FFFF */ +#define MY_REPERTOIRE_UNICODE30 3 /* ASCII | EXTENDED: U+0000..U+FFFF */ + typedef struct my_uni_idx_st { @@ -436,6 +442,11 @@ my_bool my_propagate_simple(CHARSET_INFO *cs, const uchar *str, uint len); my_bool my_propagate_complex(CHARSET_INFO *cs, const uchar *str, uint len); +uint my_string_repertoire(CHARSET_INFO *cs, const char *str, ulong len); +my_bool my_charset_is_ascii_based(CHARSET_INFO *cs); +my_bool my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs); + + #define _MY_U 01 /* Upper case */ #define _MY_L 02 /* Lower case */ #define _MY_NMR 04 /* Numeral (digit) */ diff --git a/mysql-test/r/ctype_ucs.result b/mysql-test/r/ctype_ucs.result index 960953b3c5e..350fc3f6bd6 100644 --- a/mysql-test/r/ctype_ucs.result +++ b/mysql-test/r/ctype_ucs.result @@ -865,4 +865,30 @@ blob 65535 65535 text 65535 65535 text 65535 32767 drop table t1; +create table t1 (a varchar(15) character set ascii not null, b int); +insert into t1 values ('a',1); +select concat(a,if(b<10,_ucs2 0x0061,_ucs2 0x0062)) from t1; +concat(a,if(b<10,_ucs2 0x0061,_ucs2 0x0062)) +aa +select concat(a,if(b>10,_ucs2 0x0061,_ucs2 0x0062)) from t1; +concat(a,if(b>10,_ucs2 0x0061,_ucs2 0x0062)) +ab +select * from t1 where a=if(b<10,_ucs2 0x0061,_ucs2 0x0062); +a b +a 1 +select * from t1 where a=if(b>10,_ucs2 0x0061,_ucs2 0x0062); +a b +select 
concat(a,if(b<10,_ucs2 0x00C0,_ucs2 0x0062)) from t1; +ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation 'concat' +select concat(a,if(b>10,_ucs2 0x00C0,_ucs2 0x0062)) from t1; +ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation 'concat' +select concat(a,if(b<10,_ucs2 0x0062,_ucs2 0x00C0)) from t1; +ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation 'concat' +select concat(a,if(b>10,_ucs2 0x0062,_ucs2 0x00C0)) from t1; +ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation 'concat' +select * from t1 where a=if(b<10,_ucs2 0x00C0,_ucs2 0x0062); +ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation '=' +select * from t1 where a=if(b<10,_ucs2 0x0062,_ucs2 0x00C0); +ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation '=' +drop table t1; End of 5.0 tests diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result index 216b5f393fb..3b20ded7361 100644 --- a/mysql-test/r/ctype_utf8.result +++ b/mysql-test/r/ctype_utf8.result @@ -1639,6 +1639,42 @@ coercibility(col1) collation(col1) 0 utf8_swedish_ci drop view v1, v2; drop table t1; +set names utf8; +create table t1 (a varchar(10) character set latin1, b int); +insert into t1 values ('a',1); +select concat(a, if(b>10, N'x', N'y')) from t1; +concat(a, if(b>10, N'x', N'y')) +ay +select concat(a, if(b>10, N'æ', N'ß')) from t1; +ERROR HY000: Illegal mix of collations (latin1_swedish_ci,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation 'concat' +drop table t1; +set names utf8; +create table t1 (a varchar(10) character set latin1, b int); +insert into t1 values ('a',1); +select concat(a, if(b>10, _utf8'x', _utf8'y')) from t1; +concat(a, if(b>10, 
_utf8'x', _utf8'y')) +ay +select concat(a, if(b>10, _utf8'æ', _utf8'ß')) from t1; +ERROR HY000: Illegal mix of collations (latin1_swedish_ci,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation 'concat' +drop table t1; +set names utf8; +create table t1 (a varchar(10) character set latin1, b int); +insert into t1 values ('a',1); +select concat(a, if(b>10, _utf8 0x78, _utf8 0x79)) from t1; +concat(a, if(b>10, _utf8 0x78, _utf8 0x79)) +ay +select concat(a, if(b>10, _utf8 0xC3A6, _utf8 0xC3AF)) from t1; +ERROR HY000: Illegal mix of collations (latin1_swedish_ci,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation 'concat' +drop table t1; +set names utf8; +create table t1 (a varchar(10) character set latin1, b int); +insert into t1 values ('a',1); +select concat(a, if(b>10, 'x' 'x', 'y' 'y')) from t1; +concat(a, if(b>10, 'x' 'x', 'y' 'y')) +ayy +select concat(a, if(b>10, 'x' 'æ', 'y' 'ß')) from t1; +ERROR HY000: Illegal mix of collations (latin1_swedish_ci,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation 'concat' +drop table t1; CREATE TABLE t1 ( colA int(11) NOT NULL, colB varchar(255) character set utf8 NOT NULL, diff --git a/mysql-test/r/func_time.result b/mysql-test/r/func_time.result index 56ea72a8ee3..2207cd27243 100644 --- a/mysql-test/r/func_time.result +++ b/mysql-test/r/func_time.result @@ -1246,3 +1246,19 @@ SELECT TIME_FORMAT(SEC_TO_TIME(a),"%H:%i:%s") FROM (SELECT 3020399 AS a UNION SE TIME_FORMAT(SEC_TO_TIME(a),"%H:%i:%s") 838:59:58 838:59:59 +set names latin1; +create table t1 (a varchar(15) character set ascii not null); +insert into t1 values ('070514-000000'); +select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1; +concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) +# +set names swe7; +select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1; +ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (swe7_swedish_ci,COERCIBLE) for operation 'concat' +set names latin1; +set 
lc_time_names=fr_FR; +select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1; +ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (latin1_swedish_ci,COERCIBLE) for operation 'concat' +set lc_time_names=en_US; +drop table t1; +End of 5.0 tests diff --git a/mysql-test/t/ctype_ucs.test b/mysql-test/t/ctype_ucs.test index c3320159c41..d1dd2378bd0 100644 --- a/mysql-test/t/ctype_ucs.test +++ b/mysql-test/t/ctype_ucs.test @@ -594,4 +594,34 @@ select data_type, character_octet_length, character_maximum_length from information_schema.columns where table_name='t1'; drop table t1; +# +# Conversion from UCS2 to ASCII is possible +# if the UCS2 string consists of only ASCII characters +# +create table t1 (a varchar(15) character set ascii not null, b int); +insert into t1 values ('a',1); +select concat(a,if(b<10,_ucs2 0x0061,_ucs2 0x0062)) from t1; +select concat(a,if(b>10,_ucs2 0x0061,_ucs2 0x0062)) from t1; +select * from t1 where a=if(b<10,_ucs2 0x0061,_ucs2 0x0062); +select * from t1 where a=if(b>10,_ucs2 0x0061,_ucs2 0x0062); + +# +# Conversion from UCS2 to ASCII is not possible if +# the UCS2 string has non-ASCII characters +# +--error 1267 +select concat(a,if(b<10,_ucs2 0x00C0,_ucs2 0x0062)) from t1; +--error 1267 +select concat(a,if(b>10,_ucs2 0x00C0,_ucs2 0x0062)) from t1; +--error 1267 +select concat(a,if(b<10,_ucs2 0x0062,_ucs2 0x00C0)) from t1; +--error 1267 +select concat(a,if(b>10,_ucs2 0x0062,_ucs2 0x00C0)) from t1; +--error 1267 +select * from t1 where a=if(b<10,_ucs2 0x00C0,_ucs2 0x0062); +--error 1267 +select * from t1 where a=if(b<10,_ucs2 0x0062,_ucs2 0x00C0); +drop table t1; + + --echo End of 5.0 tests diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test index c4637d14edc..603c60faf82 100644 --- a/mysql-test/t/ctype_utf8.test +++ b/mysql-test/t/ctype_utf8.test @@ -1314,6 +1314,46 @@ select coercibility(col1), collation(col1) from v2; drop view v1, v2; drop table t1; +# +# Check conversion of 
NCHAR strings to subset (e.g. latin1). +# Conversion is possible if string repertoire is ASCII. +# Conversion is not possible if the string have extended characters +# +set names utf8; +create table t1 (a varchar(10) character set latin1, b int); +insert into t1 values ('a',1); +select concat(a, if(b>10, N'x', N'y')) from t1; +--error 1267 +select concat(a, if(b>10, N'æ', N'ß')) from t1; +drop table t1; + +# Conversion tests for character set introducers +set names utf8; +create table t1 (a varchar(10) character set latin1, b int); +insert into t1 values ('a',1); +select concat(a, if(b>10, _utf8'x', _utf8'y')) from t1; +--error 1267 +select concat(a, if(b>10, _utf8'æ', _utf8'ß')) from t1; +drop table t1; + +# Conversion tests for introducer + HEX string +set names utf8; +create table t1 (a varchar(10) character set latin1, b int); +insert into t1 values ('a',1); +select concat(a, if(b>10, _utf8 0x78, _utf8 0x79)) from t1; +--error 1267 +select concat(a, if(b>10, _utf8 0xC3A6, _utf8 0xC3AF)) from t1; +drop table t1; + +# Conversion tests for "text_literal TEXT_STRING_literal" syntax structure +set names utf8; +create table t1 (a varchar(10) character set latin1, b int); +insert into t1 values ('a',1); +select concat(a, if(b>10, 'x' 'x', 'y' 'y')) from t1; +--error 1267 +select concat(a, if(b>10, 'x' 'æ', 'y' 'ß')) from t1; +drop table t1; + # # Bug#19960: Inconsistent results when joining diff --git a/mysql-test/t/func_time.test b/mysql-test/t/func_time.test index da909dc578f..96d064fdf41 100644 --- a/mysql-test/t/func_time.test +++ b/mysql-test/t/func_time.test @@ -752,3 +752,29 @@ DROP TABLE t1; # Check if using GROUP BY with TIME_FORMAT() produces correct results SELECT TIME_FORMAT(SEC_TO_TIME(a),"%H:%i:%s") FROM (SELECT 3020399 AS a UNION SELECT 3020398 ) x GROUP BY 1; + +# +# Bug#28875 Conversion between ASCII and LATIN1 charsets does not function +# +set names latin1; +create table t1 (a varchar(15) character set ascii not null); +insert into t1 values 
('070514-000000'); +# Conversion of date_format() result to ASCII +# is safe with the default locale en_US +--replace_column 1 # +select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1; +# Error for swe7: it is not ASCII compatible +set names swe7; +--error 1267 +select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1; +set names latin1; +# Conversion of date_format() result to ASCII +# is not safe with the non-default locale fr_FR +# because month and day names can have accented characters +set lc_time_names=fr_FR; +--error 1267 +select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1; +set lc_time_names=en_US; +drop table t1; + +--echo End of 5.0 tests diff --git a/mysys/charset.c b/mysys/charset.c index 9ea17c6515c..4c3f2d0ab71 100644 --- a/mysys/charset.c +++ b/mysys/charset.c @@ -277,6 +277,9 @@ static int add_collation(CHARSET_INFO *cs) if (sort_order && sort_order['A'] < sort_order['a'] && sort_order['a'] < sort_order['B']) all_charsets[cs->number]->state|= MY_CS_CSSORT; + + if (my_charset_is_8bit_pure_ascii(all_charsets[cs->number])) + all_charsets[cs->number]->state|= MY_CS_PUREASCII; } } else diff --git a/sql/item.cc b/sql/item.cc index 92ea35072f9..30fc32706fd 100644 --- a/sql/item.cc +++ b/sql/item.cc @@ -1296,6 +1296,25 @@ void Item::split_sum_func2(THD *thd, Item **ref_pointer_array, } +static bool +left_is_superset(DTCollation *left, DTCollation *right) +{ + /* Allow convert to Unicode */ + if (left->collation->state & MY_CS_UNICODE && + (left->derivation < right->derivation || + (left->derivation == right->derivation && + !(right->collation->state & MY_CS_UNICODE)))) + return TRUE; + /* Allow convert from ASCII */ + if (right->repertoire == MY_REPERTOIRE_ASCII && + (left->derivation < right->derivation || + (left->derivation == right->derivation && + !(left->repertoire == MY_REPERTOIRE_ASCII)))) + return TRUE; + /* Disallow conversion otherwise */ + return FALSE; +} + /* Aggregate two 
collations together taking into account their coercibility (aka derivation): @@ -1360,18 +1379,12 @@ bool DTCollation::aggregate(DTCollation &dt, uint flags) ; // Do nothing } else if ((flags & MY_COLL_ALLOW_SUPERSET_CONV) && - collation->state & MY_CS_UNICODE && - (derivation < dt.derivation || - (derivation == dt.derivation && - !(dt.collation->state & MY_CS_UNICODE)))) + left_is_superset(this, &dt)) { // Do nothing } else if ((flags & MY_COLL_ALLOW_SUPERSET_CONV) && - dt.collation->state & MY_CS_UNICODE && - (dt.derivation < derivation || - (dt.derivation == derivation && - !(collation->state & MY_CS_UNICODE)))) + left_is_superset(&dt, this)) { set(dt); } @@ -1390,7 +1403,7 @@ bool DTCollation::aggregate(DTCollation &dt, uint flags) else { // Cannot apply conversion - set(0, DERIVATION_NONE); + set(0, DERIVATION_NONE, 0); return 1; } } @@ -1412,8 +1425,8 @@ bool DTCollation::aggregate(DTCollation &dt, uint flags) { if (derivation == DERIVATION_EXPLICIT) { - set(0, DERIVATION_NONE); - return 1; + set(0, DERIVATION_NONE, 0); + return 1; } if (collation->state & MY_CS_BINSORT) return 0; @@ -1427,6 +1440,7 @@ bool DTCollation::aggregate(DTCollation &dt, uint flags) set(bin, DERIVATION_NONE); } } + repertoire|= dt.repertoire; return 0; } @@ -1566,12 +1580,16 @@ bool agg_item_charsets(DTCollation &coll, const char *fname, { Item* conv; uint32 dummy_offset; - if (!String::needs_conversion(0, coll.collation, - (*arg)->collation.collation, + if (!String::needs_conversion(0, (*arg)->collation.collation, + coll.collation, &dummy_offset)) continue; - if (!(conv= (*arg)->safe_charset_converter(coll.collation))) + if (!(conv= (*arg)->safe_charset_converter(coll.collation)) && + ((*arg)->collation.repertoire == MY_REPERTOIRE_ASCII)) + conv= new Item_func_conv_charset(*arg, coll.collation, 1); + + if (!conv) { if (nargs >=2 && nargs <= 3) { diff --git a/sql/item.h b/sql/item.h index 11dce3a7758..9a45314651a 100644 --- a/sql/item.h +++ b/sql/item.h @@ -49,29 +49,50 @@ class 
DTCollation { public: CHARSET_INFO *collation; enum Derivation derivation; + uint repertoire; + void set_repertoire_from_charset(CHARSET_INFO *cs) + { + repertoire= cs->state & MY_CS_PUREASCII ? + MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30; + } DTCollation() { collation= &my_charset_bin; derivation= DERIVATION_NONE; + repertoire= MY_REPERTOIRE_UNICODE30; } DTCollation(CHARSET_INFO *collation_arg, Derivation derivation_arg) { collation= collation_arg; derivation= derivation_arg; + set_repertoire_from_charset(collation_arg); } void set(DTCollation &dt) { collation= dt.collation; derivation= dt.derivation; + repertoire= dt.repertoire; } void set(CHARSET_INFO *collation_arg, Derivation derivation_arg) { collation= collation_arg; derivation= derivation_arg; + set_repertoire_from_charset(collation_arg); + } + void set(CHARSET_INFO *collation_arg, + Derivation derivation_arg, + uint repertoire_arg) + { + collation= collation_arg; + derivation= derivation_arg; + repertoire= repertoire_arg; } void set(CHARSET_INFO *collation_arg) - { collation= collation_arg; } + { + collation= collation_arg; + set_repertoire_from_charset(collation_arg); + } void set(Derivation derivation_arg) { derivation= derivation_arg; } bool aggregate(DTCollation &dt, uint flags= 0); @@ -1650,10 +1671,11 @@ class Item_string :public Item { public: Item_string(const char *str,uint length, - CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE) + CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE, + uint repertoire= MY_REPERTOIRE_UNICODE30) { - collation.set(cs, dv); - str_value.set_or_copy_aligned(str,length,cs); + str_value.set_or_copy_aligned(str, length, cs); + collation.set(cs, dv, repertoire); /* We have to have a different max_length than 'length' here to ensure that we get the right length if we do use the item @@ -1677,10 +1699,11 @@ public: fixed= 1; } Item_string(const char *name_par, const char *str, uint length, - CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE) + CHARSET_INFO 
*cs, Derivation dv= DERIVATION_COERCIBLE, + uint repertoire= MY_REPERTOIRE_UNICODE30) { - collation.set(cs, dv); - str_value.set_or_copy_aligned(str,length,cs); + str_value.set_or_copy_aligned(str, length, cs); + collation.set(cs, dv, repertoire); max_length= str_value.numchars()*cs->mbmaxlen; set_name(name_par, 0, cs); decimals=NOT_FIXED_DEC; @@ -1696,6 +1719,12 @@ public: str_value.copy(str_arg, length_arg, collation.collation); max_length= str_value.numchars() * collation.collation->mbmaxlen; } + void set_repertoire_from_value() + { + collation.repertoire= my_string_repertoire(str_value.charset(), + str_value.ptr(), + str_value.length()); + } enum Type type() const { return STRING_ITEM; } double val_real(); longlong val_int(); diff --git a/sql/item_func.cc b/sql/item_func.cc index 580d19fbd4e..c2943197a7c 100644 --- a/sql/item_func.cc +++ b/sql/item_func.cc @@ -3751,7 +3751,7 @@ static user_var_entry *get_variable(HASH *hash, LEX_STRING &name, entry->value=0; entry->length=0; entry->update_query_id=0; - entry->collation.set(NULL, DERIVATION_IMPLICIT); + entry->collation.set(NULL, DERIVATION_IMPLICIT, 0); entry->unsigned_flag= 0; /* If we are here, we were called from a SET or a query which sets a diff --git a/sql/item_strfunc.cc b/sql/item_strfunc.cc index f9a0f715985..3d59bd27972 100644 --- a/sql/item_strfunc.cc +++ b/sql/item_strfunc.cc @@ -2672,7 +2672,8 @@ void Item_func_set_collation::fix_length_and_dec() colname, args[0]->collation.collation->csname); return; } - collation.set(set_collation, DERIVATION_EXPLICIT); + collation.set(set_collation, DERIVATION_EXPLICIT, + args[0]->collation.repertoire); max_length= args[0]->max_length; } diff --git a/sql/item_timefunc.cc b/sql/item_timefunc.cc index 9aabd068d25..62093154097 100644 --- a/sql/item_timefunc.cc +++ b/sql/item_timefunc.cc @@ -1717,7 +1717,11 @@ void Item_func_date_format::fix_length_and_dec() Item *arg1= args[1]->this_item(); decimals=0; - collation.set(thd->variables.collation_connection); + 
CHARSET_INFO *cs= thd->variables.collation_connection; + uint32 repertoire= arg1->collation.repertoire; + if (!thd->variables.lc_time_names->is_ascii) + repertoire|= MY_REPERTOIRE_EXTENDED; + collation.set(cs, arg1->collation.derivation, repertoire); if (arg1->type() == STRING_ITEM) { // Optimize the normal case fixed_length=1; diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc index cbfba3d4d80..f74c963e26d 100644 --- a/sql/sql_lex.cc +++ b/sql/sql_lex.cc @@ -311,10 +311,12 @@ static char *get_text(Lex_input_stream *lip) uint found_escape=0; CHARSET_INFO *cs= lip->m_thd->charset(); + lip->tok_bitmap= 0; sep= yyGetLast(); // String should end with this while (lip->ptr != lip->end_of_query) { - c = yyGet(); + c= yyGet(); + lip->tok_bitmap|= c; #ifdef USE_MB { int l; @@ -605,6 +607,7 @@ int MYSQLlex(void *arg, void *yythd) break; } yylval->lex_str.length= lip->yytoklen; + lex->text_string_is_7bit= (lip->tok_bitmap & 0x80) ? 0 : 1; return(NCHAR_STRING); case MY_LEX_IDENT_OR_HEX: @@ -926,6 +929,7 @@ int MYSQLlex(void *arg, void *yythd) break; } yylval->lex_str.length=lip->yytoklen; + lex->text_string_is_7bit= (lip->tok_bitmap & 0x80) ? 0 : 1; return(TEXT_STRING); case MY_LEX_COMMENT: // Comment diff --git a/sql/sql_lex.h b/sql/sql_lex.h index f8405ef14ca..4b96218de80 100644 --- a/sql/sql_lex.h +++ b/sql/sql_lex.h @@ -957,6 +957,9 @@ public: /** Position of ';' in the stream, to delimit multiple queries. */ const char* found_semicolon; + + /** Token character bitmaps, to detect 7bit strings. */ + uchar tok_bitmap; /** SQL_MODE = IGNORE_SPACE. 
*/ bool ignore_space; @@ -994,6 +997,7 @@ typedef struct st_lex : public Query_tables_list gptr yacc_yyss,yacc_yyvs; THD *thd; CHARSET_INFO *charset, *underscore_charset; + bool text_string_is_7bit; /* store original leaf_tables for INSERT SELECT and PS/SP */ TABLE_LIST *leaf_tables_insert; /* Position (first character index) of SELECT of CREATE VIEW statement */ diff --git a/sql/sql_string.cc b/sql/sql_string.cc index 9d7df73cd7a..a87074c3359 100644 --- a/sql/sql_string.cc +++ b/sql/sql_string.cc @@ -263,6 +263,8 @@ bool String::needs_conversion(uint32 arg_length, (to_cs == &my_charset_bin) || (to_cs == from_cs) || my_charset_same(from_cs, to_cs) || + (my_charset_is_ascii_based(to_cs) && + my_charset_is_8bit_pure_ascii(from_cs)) || ((from_cs == &my_charset_bin) && (!(*offset=(arg_length % to_cs->mbminlen))))) return FALSE; diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy index 5ae2f6db581..a45ebc4c640 100644 --- a/sql/sql_yacc.yy +++ b/sql/sql_yacc.yy @@ -7509,18 +7509,54 @@ opt_load_data_set_spec: /* Common definitions */ text_literal: - TEXT_STRING_literal - { - THD *thd= YYTHD; - $$ = new Item_string($1.str,$1.length,thd->variables.collation_connection); - } - | NCHAR_STRING - { $$= new Item_string($1.str,$1.length,national_charset_info); } - | UNDERSCORE_CHARSET TEXT_STRING - { $$ = new Item_string($2.str,$2.length,Lex->underscore_charset); } - | text_literal TEXT_STRING_literal - { ((Item_string*) $1)->append($2.str,$2.length); } - ; + TEXT_STRING + { + LEX_STRING tmp; + THD *thd= YYTHD; + CHARSET_INFO *cs_con= thd->variables.collation_connection; + CHARSET_INFO *cs_cli= thd->variables.character_set_client; + uint repertoire= thd->lex->text_string_is_7bit && + my_charset_is_ascii_based(cs_cli) ? 
+ MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30; + if (thd->charset_is_collation_connection || + (repertoire == MY_REPERTOIRE_ASCII && + my_charset_is_ascii_based(cs_con))) + tmp= $1; + else + thd->convert_string(&tmp, cs_con, $1.str, $1.length, cs_cli); + $$= new Item_string(tmp.str, tmp.length, cs_con, + DERIVATION_COERCIBLE, repertoire); + } + | NCHAR_STRING + { + uint repertoire= Lex->text_string_is_7bit ? + MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30; + DBUG_ASSERT(my_charset_is_ascii_based(national_charset_info)); + $$= new Item_string($1.str, $1.length, national_charset_info, + DERIVATION_COERCIBLE, repertoire); + } + | UNDERSCORE_CHARSET TEXT_STRING + { + $$= new Item_string($2.str, $2.length, Lex->underscore_charset); + ((Item_string*) $$)->set_repertoire_from_value(); + } + | text_literal TEXT_STRING_literal + { + Item_string* item= (Item_string*) $1; + item->append($2.str, $2.length); + if (!(item->collation.repertoire & MY_REPERTOIRE_EXTENDED)) + { + /* + If the string has been pure ASCII so far, + check the new part. + */ + CHARSET_INFO *cs= YYTHD->variables.collation_connection; + item->collation.repertoire|= my_string_repertoire(cs, + $2.str, + $2.length); + } + } + ; text_string: TEXT_STRING_literal @@ -7592,20 +7628,22 @@ literal: | TRUE_SYM { $$= new Item_int((char*) "TRUE",1,1); } | HEX_NUM { $$ = new Item_hex_string($1.str, $1.length);} | BIN_NUM { $$= new Item_bin_string($1.str, $1.length); } - | UNDERSCORE_CHARSET HEX_NUM - { - Item *tmp= new Item_hex_string($2.str, $2.length); - /* - it is OK only emulate fix_fieds, because we need only + | UNDERSCORE_CHARSET HEX_NUM + { + Item *tmp= new Item_hex_string($2.str, $2.length); + /* + it is OK only emulate fix_fieds, because we need only value of constant - */ - String *str= tmp ? - tmp->quick_fix_field(), tmp->val_str((String*) 0) : - (String*) 0; - $$= new Item_string(str ? str->ptr() : "", - str ? str->length() : 0, - Lex->underscore_charset); - } + */ + String *str= tmp ? 
+ tmp->quick_fix_field(), tmp->val_str((String*) 0) : + (String*) 0; + $$= new Item_string(str ? str->ptr() : "", + str ? str->length() : 0, + Lex->underscore_charset); + if ($$) + ((Item_string *) $$)->set_repertoire_from_value(); + } | UNDERSCORE_CHARSET BIN_NUM { Item *tmp= new Item_bin_string($2.str, $2.length); diff --git a/strings/conf_to_src.c b/strings/conf_to_src.c index e2ac9846c85..dc2a300a2ec 100644 --- a/strings/conf_to_src.c +++ b/strings/conf_to_src.c @@ -179,14 +179,16 @@ is_case_sensitive(CHARSET_INFO *cs) cs->sort_order['a'] < cs->sort_order['B']) ? 1 : 0; } + void dispcset(FILE *f,CHARSET_INFO *cs) { fprintf(f,"{\n"); fprintf(f," %d,%d,%d,\n",cs->number,0,0); - fprintf(f," MY_CS_COMPILED%s%s%s,\n", - cs->state & MY_CS_BINSORT ? "|MY_CS_BINSORT" : "", - cs->state & MY_CS_PRIMARY ? "|MY_CS_PRIMARY" : "", - is_case_sensitive(cs) ? "|MY_CS_CSSORT" : ""); + fprintf(f," MY_CS_COMPILED%s%s%s%s,\n", + cs->state & MY_CS_BINSORT ? "|MY_CS_BINSORT" : "", + cs->state & MY_CS_PRIMARY ? "|MY_CS_PRIMARY" : "", + is_case_sensitive(cs) ? "|MY_CS_CSSORT" : "", + my_charset_is_8bit_pure_ascii(cs) ? "|MY_CS_PUREASCII" : ""); if (cs->name) { @@ -243,6 +245,28 @@ void dispcset(FILE *f,CHARSET_INFO *cs) } +static void +fprint_copyright(FILE *file) +{ + fprintf(file, +"/* Copyright (C) 2000-2007 MySQL AB\n" +"\n" +" This program is free software; you can redistribute it and/or modify\n" +" it under the terms of the GNU General Public License as published by\n" +" the Free Software Foundation; version 2 of the License.\n" +"\n" +" This program is distributed in the hope that it will be useful,\n" +" but WITHOUT ANY WARRANTY; without even the implied warranty of\n" +" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the\n" +" GNU General Public License for more details.\n" +"\n" +" You should have received a copy of the GNU General Public License\n" +" along with this program; if not, write to the Free Software\n" +" Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */\n" +"\n"); +} + + int main(int argc, char **argv __attribute__((unused))) { @@ -283,6 +307,7 @@ main(int argc, char **argv __attribute__((unused))) "directory:\n"); fprintf(f, " ./conf_to_src ../sql/share/charsets/ > FILE\n"); fprintf(f, "*/\n\n"); + fprint_copyright(f); fprintf(f,"#include <my_global.h>\n"); fprintf(f,"#include <m_ctype.h>\n\n"); diff --git a/strings/ctype-extra.c b/strings/ctype-extra.c index 1c20828ea54..2a7fcbd383e 100644 --- a/strings/ctype-extra.c +++ b/strings/ctype-extra.c @@ -5,7 +5,8 @@ To re-generate, run the following in the strings/ directory: ./conf_to_src ../sql/share/charsets/ > FILE */ -/* Copyright (C) 2000-2003 MySQL AB + +/* Copyright (C) 2000-2007 MySQL AB This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -6721,7 +6722,7 @@ CHARSET_INFO compiled_charsets[] = { #ifdef HAVE_CHARSET_ascii { 11,0,0, - MY_CS_COMPILED|MY_CS_PRIMARY, + MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_PUREASCII, "ascii", /* cset name */ "ascii_general_ci", /* coll name */ "", /* comment */ @@ -7810,7 +7811,7 @@ CHARSET_INFO compiled_charsets[] = { #ifdef HAVE_CHARSET_ascii { 65,0,0, - MY_CS_COMPILED|MY_CS_BINSORT, + MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_PUREASCII, "ascii", /* cset name */ "ascii_bin", /* coll name */ "", /* comment */ diff --git a/strings/ctype.c b/strings/ctype.c index e7399c5438b..372a1a8a468 100644 --- a/strings/ctype.c +++ b/strings/ctype.c @@ -306,3 +306,89 @@ my_bool my_parse_charset_xml(const char *buf, uint len, my_xml_parser_free(&p); return rc; } + + +/* + Check repertoire: detect pure ascii strings +*/ +uint +my_string_repertoire(CHARSET_INFO *cs, const char *str, ulong 
length) +{ + const char *strend= str + length; + if (cs->mbminlen == 1) + { + for ( ; str < strend; str++) + { + if (((uchar) *str) > 0x7F) + return MY_REPERTOIRE_UNICODE30; + } + } + else + { + my_wc_t wc; + int chlen; + for (; (chlen= cs->cset->mb_wc(cs, &wc, str, strend)) > 0; str+= chlen) + { + if (wc > 0x7F) + return MY_REPERTOIRE_UNICODE30; + } + } + return MY_REPERTOIRE_ASCII; +} + + +/* + Detect whether a character set is ASCII compatible. + + Returns TRUE for: + + - all 8bit character sets whose Unicode mapping of 0x7B is '{' + (ignores swe7 which maps 0x7B to "LATIN LETTER A WITH DIAERESIS") + + - all multi-byte character sets having mbminlen == 1 + (ignores ucs2 whose mbminlen is 2) + + TODO: + + When merging to 5.2, this function should be changed + to check a new flag MY_CS_NONASCII, + + return (cs->flag & MY_CS_NONASCII) ? 0 : 1; + + This flag was previously added into 5.2 under terms + of WL#3759 "Optimize identifier conversion in client-server protocol" + especially to mark character sets not compatible with ASCII. + + We won't backport this flag to 5.0 or 5.1. + This function is Ok for 5.0 and 5.1, because we're not going + to introduce new tricky character sets between 5.0 and 5.2. +*/ +my_bool +my_charset_is_ascii_based(CHARSET_INFO *cs) +{ + return + (cs->mbmaxlen == 1 && cs->tab_to_uni && cs->tab_to_uni['{'] == '{') || + (cs->mbminlen == 1 && cs->mbmaxlen > 1); +} + + +/* + Detect if a character set is 8bit, + and it is pure ascii, i.e. doesn't have + characters outside U+0000..U+007F + This functions is shared between "conf_to_src" + and dynamic charsets loader in "mysqld". +*/ +my_bool +my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs) +{ + size_t code; + if (!cs->tab_to_uni) + return 0; + for (code= 0; code < 256; code++) + { + if (cs->tab_to_uni[code] > 0x7F) + return 0; + } + return 1; +} |