summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorunknown <bar@mysql.com/bar.myoffice.izhnet.ru>2007-08-03 15:25:23 +0500
committerunknown <bar@mysql.com/bar.myoffice.izhnet.ru>2007-08-03 15:25:23 +0500
commit53df09a9a6a99b82e2a8869eb16737a78772b29e (patch)
tree01c0ee2f5244a4d68fc6cdfb6555fa1af4589a8c
parentb307fc4d8fd5380cec948c07550b5ae73624e274 (diff)
downloadmariadb-git-53df09a9a6a99b82e2a8869eb16737a78772b29e.tar.gz
Bug#28875 Conversion between ASCII and LATIN1 charsets does not function
(Regression, caused by a patch for the bug 22646). Problem: when result type of date_format() was changed from binary string to character string, mixing date_format() with a ascii column in CONCAT() stopped to work. Fix: - adding "repertoire" flag into DTCollation class, to mark items which can return only pure ASCII strings. - allow character set conversion from pure ASCII to other character sets. include/m_ctype.h: Defining new flags. Adding new function prototypes. mysql-test/r/ctype_ucs.result: Adding tests. mysql-test/r/ctype_utf8.result: Adding tests. mysql-test/r/func_time.result: Adding tests. mysql-test/t/ctype_ucs.test: Adding tests. mysql-test/t/ctype_utf8.test: Adding tests. mysql-test/t/func_time.test: Adding test. mysys/charset.c: Adding pure ASCII detection when loading a dynamic character set. sql/item.cc: - Moving detection of a Unicode superset into function. - Adding detection of a ASCII subset. - Adding creation of to-ASCII character set convertor when safe_charset_converter() failed and when the argument. repertoire is know to be pure ASCII. sql/item.h: - Adding "repertoire" member into DTCollation class. - Adding "repertoire" argument to constructors. - Adding new methods: set_repertoire_from_charset() set_repertoire_from_value() sql/item_func.cc: Adding "repertoire" argument. sql/item_strfunc.cc: Adding "repertoire" argument. sql/item_timefunc.cc: Initializing the result repertoire taking into account the "is_ascii" flag of the current locale. sql/sql_lex.cc: Detect 7bit strings, return in Lex->text_string_is_7bit. sql/sql_lex.h: Adding new member into LEX structure. Adding new member into Lex_input_stream sql/sql_string.cc: Allow simple copy from pure ASCII to a ASCII-based character set. sql/sql_yacc.yy: Depening on Lex->text_string_is_7bit and character set features, create Item_string with MY_REPERTOIRE_ASCII when it is possible. strings/conf_to_src.c: - Adding printing of the "MY_CS_PUREASCII" flag - Adding printing of copyright strings/ctype-extra.c: Recreating ctype-extra.c: ascii_general_ci and ascii_bin are now marked with MY_CS_PUREASCII flag. strings/ctype.c: Adding new functions.
-rw-r--r--include/m_ctype.h11
-rw-r--r--mysql-test/r/ctype_ucs.result26
-rw-r--r--mysql-test/r/ctype_utf8.result36
-rw-r--r--mysql-test/r/func_time.result16
-rw-r--r--mysql-test/t/ctype_ucs.test30
-rw-r--r--mysql-test/t/ctype_utf8.test40
-rw-r--r--mysql-test/t/func_time.test26
-rw-r--r--mysys/charset.c3
-rw-r--r--sql/item.cc46
-rw-r--r--sql/item.h43
-rw-r--r--sql/item_func.cc2
-rw-r--r--sql/item_strfunc.cc3
-rw-r--r--sql/item_timefunc.cc6
-rw-r--r--sql/sql_lex.cc6
-rw-r--r--sql/sql_lex.h4
-rw-r--r--sql/sql_string.cc2
-rw-r--r--sql/sql_yacc.yy88
-rw-r--r--strings/conf_to_src.c33
-rw-r--r--strings/ctype-extra.c7
-rw-r--r--strings/ctype.c86
20 files changed, 457 insertions, 57 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h
index 9f21ac16a05..218ec2daadb 100644
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -78,8 +78,14 @@ extern MY_UNICASE_INFO *my_unicase_turkish[256];
#define MY_CS_READY 256 /* if a charset is initialized */
#define MY_CS_AVAILABLE 512 /* If either compiled-in or loaded*/
#define MY_CS_CSSORT 1024 /* if case sensitive sort order */
+#define MY_CS_PUREASCII 2048 /* if a charset is pure ascii */
#define MY_CHARSET_UNDEFINED 0
+/* Character repertoire flags */
+#define MY_REPERTOIRE_ASCII 1 /* Pure ASCII U+0000..U+007F */
+#define MY_REPERTOIRE_EXTENDED 2 /* Extended characters: U+0080..U+FFFF */
+#define MY_REPERTOIRE_UNICODE30 3 /* ASCII | EXTENDED: U+0000..U+FFFF */
+
typedef struct my_uni_idx_st
{
@@ -436,6 +442,11 @@ my_bool my_propagate_simple(CHARSET_INFO *cs, const uchar *str, uint len);
my_bool my_propagate_complex(CHARSET_INFO *cs, const uchar *str, uint len);
+uint my_string_repertoire(CHARSET_INFO *cs, const char *str, ulong len);
+my_bool my_charset_is_ascii_based(CHARSET_INFO *cs);
+my_bool my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs);
+
+
#define _MY_U 01 /* Upper case */
#define _MY_L 02 /* Lower case */
#define _MY_NMR 04 /* Numeral (digit) */
diff --git a/mysql-test/r/ctype_ucs.result b/mysql-test/r/ctype_ucs.result
index 960953b3c5e..350fc3f6bd6 100644
--- a/mysql-test/r/ctype_ucs.result
+++ b/mysql-test/r/ctype_ucs.result
@@ -865,4 +865,30 @@ blob 65535 65535
text 65535 65535
text 65535 32767
drop table t1;
+create table t1 (a varchar(15) character set ascii not null, b int);
+insert into t1 values ('a',1);
+select concat(a,if(b<10,_ucs2 0x0061,_ucs2 0x0062)) from t1;
+concat(a,if(b<10,_ucs2 0x0061,_ucs2 0x0062))
+aa
+select concat(a,if(b>10,_ucs2 0x0061,_ucs2 0x0062)) from t1;
+concat(a,if(b>10,_ucs2 0x0061,_ucs2 0x0062))
+ab
+select * from t1 where a=if(b<10,_ucs2 0x0061,_ucs2 0x0062);
+a b
+a 1
+select * from t1 where a=if(b>10,_ucs2 0x0061,_ucs2 0x0062);
+a b
+select concat(a,if(b<10,_ucs2 0x00C0,_ucs2 0x0062)) from t1;
+ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation 'concat'
+select concat(a,if(b>10,_ucs2 0x00C0,_ucs2 0x0062)) from t1;
+ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation 'concat'
+select concat(a,if(b<10,_ucs2 0x0062,_ucs2 0x00C0)) from t1;
+ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation 'concat'
+select concat(a,if(b>10,_ucs2 0x0062,_ucs2 0x00C0)) from t1;
+ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation 'concat'
+select * from t1 where a=if(b<10,_ucs2 0x00C0,_ucs2 0x0062);
+ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation '='
+select * from t1 where a=if(b<10,_ucs2 0x0062,_ucs2 0x00C0);
+ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation '='
+drop table t1;
End of 5.0 tests
diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result
index 216b5f393fb..3b20ded7361 100644
--- a/mysql-test/r/ctype_utf8.result
+++ b/mysql-test/r/ctype_utf8.result
@@ -1639,6 +1639,42 @@ coercibility(col1) collation(col1)
0 utf8_swedish_ci
drop view v1, v2;
drop table t1;
+set names utf8;
+create table t1 (a varchar(10) character set latin1, b int);
+insert into t1 values ('a',1);
+select concat(a, if(b>10, N'x', N'y')) from t1;
+concat(a, if(b>10, N'x', N'y'))
+ay
+select concat(a, if(b>10, N'æ', N'ß')) from t1;
+ERROR HY000: Illegal mix of collations (latin1_swedish_ci,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation 'concat'
+drop table t1;
+set names utf8;
+create table t1 (a varchar(10) character set latin1, b int);
+insert into t1 values ('a',1);
+select concat(a, if(b>10, _utf8'x', _utf8'y')) from t1;
+concat(a, if(b>10, _utf8'x', _utf8'y'))
+ay
+select concat(a, if(b>10, _utf8'æ', _utf8'ß')) from t1;
+ERROR HY000: Illegal mix of collations (latin1_swedish_ci,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation 'concat'
+drop table t1;
+set names utf8;
+create table t1 (a varchar(10) character set latin1, b int);
+insert into t1 values ('a',1);
+select concat(a, if(b>10, _utf8 0x78, _utf8 0x79)) from t1;
+concat(a, if(b>10, _utf8 0x78, _utf8 0x79))
+ay
+select concat(a, if(b>10, _utf8 0xC3A6, _utf8 0xC3AF)) from t1;
+ERROR HY000: Illegal mix of collations (latin1_swedish_ci,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation 'concat'
+drop table t1;
+set names utf8;
+create table t1 (a varchar(10) character set latin1, b int);
+insert into t1 values ('a',1);
+select concat(a, if(b>10, 'x' 'x', 'y' 'y')) from t1;
+concat(a, if(b>10, 'x' 'x', 'y' 'y'))
+ayy
+select concat(a, if(b>10, 'x' 'æ', 'y' 'ß')) from t1;
+ERROR HY000: Illegal mix of collations (latin1_swedish_ci,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation 'concat'
+drop table t1;
CREATE TABLE t1 (
colA int(11) NOT NULL,
colB varchar(255) character set utf8 NOT NULL,
diff --git a/mysql-test/r/func_time.result b/mysql-test/r/func_time.result
index 56ea72a8ee3..2207cd27243 100644
--- a/mysql-test/r/func_time.result
+++ b/mysql-test/r/func_time.result
@@ -1246,3 +1246,19 @@ SELECT TIME_FORMAT(SEC_TO_TIME(a),"%H:%i:%s") FROM (SELECT 3020399 AS a UNION SE
TIME_FORMAT(SEC_TO_TIME(a),"%H:%i:%s")
838:59:58
838:59:59
+set names latin1;
+create table t1 (a varchar(15) character set ascii not null);
+insert into t1 values ('070514-000000');
+select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1;
+concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull'))
+#
+set names swe7;
+select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1;
+ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (swe7_swedish_ci,COERCIBLE) for operation 'concat'
+set names latin1;
+set lc_time_names=fr_FR;
+select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1;
+ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (latin1_swedish_ci,COERCIBLE) for operation 'concat'
+set lc_time_names=en_US;
+drop table t1;
+End of 5.0 tests
diff --git a/mysql-test/t/ctype_ucs.test b/mysql-test/t/ctype_ucs.test
index c3320159c41..d1dd2378bd0 100644
--- a/mysql-test/t/ctype_ucs.test
+++ b/mysql-test/t/ctype_ucs.test
@@ -594,4 +594,34 @@ select data_type, character_octet_length, character_maximum_length
from information_schema.columns where table_name='t1';
drop table t1;
+#
+# Conversion from UCS2 to ASCII is possible
+# if the UCS2 string consists of only ASCII characters
+#
+create table t1 (a varchar(15) character set ascii not null, b int);
+insert into t1 values ('a',1);
+select concat(a,if(b<10,_ucs2 0x0061,_ucs2 0x0062)) from t1;
+select concat(a,if(b>10,_ucs2 0x0061,_ucs2 0x0062)) from t1;
+select * from t1 where a=if(b<10,_ucs2 0x0061,_ucs2 0x0062);
+select * from t1 where a=if(b>10,_ucs2 0x0061,_ucs2 0x0062);
+
+#
+# Conversion from UCS2 to ASCII is not possible if
+# the UCS2 string has non-ASCII characters
+#
+--error 1267
+select concat(a,if(b<10,_ucs2 0x00C0,_ucs2 0x0062)) from t1;
+--error 1267
+select concat(a,if(b>10,_ucs2 0x00C0,_ucs2 0x0062)) from t1;
+--error 1267
+select concat(a,if(b<10,_ucs2 0x0062,_ucs2 0x00C0)) from t1;
+--error 1267
+select concat(a,if(b>10,_ucs2 0x0062,_ucs2 0x00C0)) from t1;
+--error 1267
+select * from t1 where a=if(b<10,_ucs2 0x00C0,_ucs2 0x0062);
+--error 1267
+select * from t1 where a=if(b<10,_ucs2 0x0062,_ucs2 0x00C0);
+drop table t1;
+
+
--echo End of 5.0 tests
diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test
index c4637d14edc..603c60faf82 100644
--- a/mysql-test/t/ctype_utf8.test
+++ b/mysql-test/t/ctype_utf8.test
@@ -1314,6 +1314,46 @@ select coercibility(col1), collation(col1) from v2;
drop view v1, v2;
drop table t1;
+#
+# Check conversion of NCHAR strings to subset (e.g. latin1).
+# Conversion is possible if string repertoire is ASCII.
+# Conversion is not possible if the string have extended characters
+#
+set names utf8;
+create table t1 (a varchar(10) character set latin1, b int);
+insert into t1 values ('a',1);
+select concat(a, if(b>10, N'x', N'y')) from t1;
+--error 1267
+select concat(a, if(b>10, N'æ', N'ß')) from t1;
+drop table t1;
+
+# Conversion tests for character set introducers
+set names utf8;
+create table t1 (a varchar(10) character set latin1, b int);
+insert into t1 values ('a',1);
+select concat(a, if(b>10, _utf8'x', _utf8'y')) from t1;
+--error 1267
+select concat(a, if(b>10, _utf8'æ', _utf8'ß')) from t1;
+drop table t1;
+
+# Conversion tests for introducer + HEX string
+set names utf8;
+create table t1 (a varchar(10) character set latin1, b int);
+insert into t1 values ('a',1);
+select concat(a, if(b>10, _utf8 0x78, _utf8 0x79)) from t1;
+--error 1267
+select concat(a, if(b>10, _utf8 0xC3A6, _utf8 0xC3AF)) from t1;
+drop table t1;
+
+# Conversion tests for "text_literal TEXT_STRING_literal" syntax structure
+set names utf8;
+create table t1 (a varchar(10) character set latin1, b int);
+insert into t1 values ('a',1);
+select concat(a, if(b>10, 'x' 'x', 'y' 'y')) from t1;
+--error 1267
+select concat(a, if(b>10, 'x' 'æ', 'y' 'ß')) from t1;
+drop table t1;
+
#
# Bug#19960: Inconsistent results when joining
diff --git a/mysql-test/t/func_time.test b/mysql-test/t/func_time.test
index da909dc578f..96d064fdf41 100644
--- a/mysql-test/t/func_time.test
+++ b/mysql-test/t/func_time.test
@@ -752,3 +752,29 @@ DROP TABLE t1;
# Check if using GROUP BY with TIME_FORMAT() produces correct results
SELECT TIME_FORMAT(SEC_TO_TIME(a),"%H:%i:%s") FROM (SELECT 3020399 AS a UNION SELECT 3020398 ) x GROUP BY 1;
+
+#
+# Bug#28875 Conversion between ASCII and LATIN1 charsets does not function
+#
+set names latin1;
+create table t1 (a varchar(15) character set ascii not null);
+insert into t1 values ('070514-000000');
+# Conversion of date_format() result to ASCII
+# is safe with the default locale en_US
+--replace_column 1 #
+select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1;
+# Error for swe7: it is not ASCII compatible
+set names swe7;
+--error 1267
+select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1;
+set names latin1;
+# Conversion of date_format() result to ASCII
+# is not safe with the non-default locale fr_FR
+# because month and day names can have accented characters
+set lc_time_names=fr_FR;
+--error 1267
+select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1;
+set lc_time_names=en_US;
+drop table t1;
+
+--echo End of 5.0 tests
diff --git a/mysys/charset.c b/mysys/charset.c
index 9ea17c6515c..4c3f2d0ab71 100644
--- a/mysys/charset.c
+++ b/mysys/charset.c
@@ -277,6 +277,9 @@ static int add_collation(CHARSET_INFO *cs)
if (sort_order && sort_order['A'] < sort_order['a'] &&
sort_order['a'] < sort_order['B'])
all_charsets[cs->number]->state|= MY_CS_CSSORT;
+
+ if (my_charset_is_8bit_pure_ascii(all_charsets[cs->number]))
+ all_charsets[cs->number]->state|= MY_CS_PUREASCII;
}
}
else
diff --git a/sql/item.cc b/sql/item.cc
index 92ea35072f9..30fc32706fd 100644
--- a/sql/item.cc
+++ b/sql/item.cc
@@ -1296,6 +1296,25 @@ void Item::split_sum_func2(THD *thd, Item **ref_pointer_array,
}
+static bool
+left_is_superset(DTCollation *left, DTCollation *right)
+{
+ /* Allow convert to Unicode */
+ if (left->collation->state & MY_CS_UNICODE &&
+ (left->derivation < right->derivation ||
+ (left->derivation == right->derivation &&
+ !(right->collation->state & MY_CS_UNICODE))))
+ return TRUE;
+ /* Allow convert from ASCII */
+ if (right->repertoire == MY_REPERTOIRE_ASCII &&
+ (left->derivation < right->derivation ||
+ (left->derivation == right->derivation &&
+ !(left->repertoire == MY_REPERTOIRE_ASCII))))
+ return TRUE;
+ /* Disallow conversion otherwise */
+ return FALSE;
+}
+
/*
Aggregate two collations together taking
into account their coercibility (aka derivation):
@@ -1360,18 +1379,12 @@ bool DTCollation::aggregate(DTCollation &dt, uint flags)
; // Do nothing
}
else if ((flags & MY_COLL_ALLOW_SUPERSET_CONV) &&
- collation->state & MY_CS_UNICODE &&
- (derivation < dt.derivation ||
- (derivation == dt.derivation &&
- !(dt.collation->state & MY_CS_UNICODE))))
+ left_is_superset(this, &dt))
{
// Do nothing
}
else if ((flags & MY_COLL_ALLOW_SUPERSET_CONV) &&
- dt.collation->state & MY_CS_UNICODE &&
- (dt.derivation < derivation ||
- (dt.derivation == derivation &&
- !(collation->state & MY_CS_UNICODE))))
+ left_is_superset(&dt, this))
{
set(dt);
}
@@ -1390,7 +1403,7 @@ bool DTCollation::aggregate(DTCollation &dt, uint flags)
else
{
// Cannot apply conversion
- set(0, DERIVATION_NONE);
+ set(0, DERIVATION_NONE, 0);
return 1;
}
}
@@ -1412,8 +1425,8 @@ bool DTCollation::aggregate(DTCollation &dt, uint flags)
{
if (derivation == DERIVATION_EXPLICIT)
{
- set(0, DERIVATION_NONE);
- return 1;
+ set(0, DERIVATION_NONE, 0);
+ return 1;
}
if (collation->state & MY_CS_BINSORT)
return 0;
@@ -1427,6 +1440,7 @@ bool DTCollation::aggregate(DTCollation &dt, uint flags)
set(bin, DERIVATION_NONE);
}
}
+ repertoire|= dt.repertoire;
return 0;
}
@@ -1566,12 +1580,16 @@ bool agg_item_charsets(DTCollation &coll, const char *fname,
{
Item* conv;
uint32 dummy_offset;
- if (!String::needs_conversion(0, coll.collation,
- (*arg)->collation.collation,
+ if (!String::needs_conversion(0, (*arg)->collation.collation,
+ coll.collation,
&dummy_offset))
continue;
- if (!(conv= (*arg)->safe_charset_converter(coll.collation)))
+ if (!(conv= (*arg)->safe_charset_converter(coll.collation)) &&
+ ((*arg)->collation.repertoire == MY_REPERTOIRE_ASCII))
+ conv= new Item_func_conv_charset(*arg, coll.collation, 1);
+
+ if (!conv)
{
if (nargs >=2 && nargs <= 3)
{
diff --git a/sql/item.h b/sql/item.h
index 11dce3a7758..9a45314651a 100644
--- a/sql/item.h
+++ b/sql/item.h
@@ -49,29 +49,50 @@ class DTCollation {
public:
CHARSET_INFO *collation;
enum Derivation derivation;
+ uint repertoire;
+ void set_repertoire_from_charset(CHARSET_INFO *cs)
+ {
+ repertoire= cs->state & MY_CS_PUREASCII ?
+ MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30;
+ }
DTCollation()
{
collation= &my_charset_bin;
derivation= DERIVATION_NONE;
+ repertoire= MY_REPERTOIRE_UNICODE30;
}
DTCollation(CHARSET_INFO *collation_arg, Derivation derivation_arg)
{
collation= collation_arg;
derivation= derivation_arg;
+ set_repertoire_from_charset(collation_arg);
}
void set(DTCollation &dt)
{
collation= dt.collation;
derivation= dt.derivation;
+ repertoire= dt.repertoire;
}
void set(CHARSET_INFO *collation_arg, Derivation derivation_arg)
{
collation= collation_arg;
derivation= derivation_arg;
+ set_repertoire_from_charset(collation_arg);
+ }
+ void set(CHARSET_INFO *collation_arg,
+ Derivation derivation_arg,
+ uint repertoire_arg)
+ {
+ collation= collation_arg;
+ derivation= derivation_arg;
+ repertoire= repertoire_arg;
}
void set(CHARSET_INFO *collation_arg)
- { collation= collation_arg; }
+ {
+ collation= collation_arg;
+ set_repertoire_from_charset(collation_arg);
+ }
void set(Derivation derivation_arg)
{ derivation= derivation_arg; }
bool aggregate(DTCollation &dt, uint flags= 0);
@@ -1650,10 +1671,11 @@ class Item_string :public Item
{
public:
Item_string(const char *str,uint length,
- CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE)
+ CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE,
+ uint repertoire= MY_REPERTOIRE_UNICODE30)
{
- collation.set(cs, dv);
- str_value.set_or_copy_aligned(str,length,cs);
+ str_value.set_or_copy_aligned(str, length, cs);
+ collation.set(cs, dv, repertoire);
/*
We have to have a different max_length than 'length' here to
ensure that we get the right length if we do use the item
@@ -1677,10 +1699,11 @@ public:
fixed= 1;
}
Item_string(const char *name_par, const char *str, uint length,
- CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE)
+ CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE,
+ uint repertoire= MY_REPERTOIRE_UNICODE30)
{
- collation.set(cs, dv);
- str_value.set_or_copy_aligned(str,length,cs);
+ str_value.set_or_copy_aligned(str, length, cs);
+ collation.set(cs, dv, repertoire);
max_length= str_value.numchars()*cs->mbmaxlen;
set_name(name_par, 0, cs);
decimals=NOT_FIXED_DEC;
@@ -1696,6 +1719,12 @@ public:
str_value.copy(str_arg, length_arg, collation.collation);
max_length= str_value.numchars() * collation.collation->mbmaxlen;
}
+ void set_repertoire_from_value()
+ {
+ collation.repertoire= my_string_repertoire(str_value.charset(),
+ str_value.ptr(),
+ str_value.length());
+ }
enum Type type() const { return STRING_ITEM; }
double val_real();
longlong val_int();
diff --git a/sql/item_func.cc b/sql/item_func.cc
index 580d19fbd4e..c2943197a7c 100644
--- a/sql/item_func.cc
+++ b/sql/item_func.cc
@@ -3751,7 +3751,7 @@ static user_var_entry *get_variable(HASH *hash, LEX_STRING &name,
entry->value=0;
entry->length=0;
entry->update_query_id=0;
- entry->collation.set(NULL, DERIVATION_IMPLICIT);
+ entry->collation.set(NULL, DERIVATION_IMPLICIT, 0);
entry->unsigned_flag= 0;
/*
If we are here, we were called from a SET or a query which sets a
diff --git a/sql/item_strfunc.cc b/sql/item_strfunc.cc
index f9a0f715985..3d59bd27972 100644
--- a/sql/item_strfunc.cc
+++ b/sql/item_strfunc.cc
@@ -2672,7 +2672,8 @@ void Item_func_set_collation::fix_length_and_dec()
colname, args[0]->collation.collation->csname);
return;
}
- collation.set(set_collation, DERIVATION_EXPLICIT);
+ collation.set(set_collation, DERIVATION_EXPLICIT,
+ args[0]->collation.repertoire);
max_length= args[0]->max_length;
}
diff --git a/sql/item_timefunc.cc b/sql/item_timefunc.cc
index 9aabd068d25..62093154097 100644
--- a/sql/item_timefunc.cc
+++ b/sql/item_timefunc.cc
@@ -1717,7 +1717,11 @@ void Item_func_date_format::fix_length_and_dec()
Item *arg1= args[1]->this_item();
decimals=0;
- collation.set(thd->variables.collation_connection);
+ CHARSET_INFO *cs= thd->variables.collation_connection;
+ uint32 repertoire= arg1->collation.repertoire;
+ if (!thd->variables.lc_time_names->is_ascii)
+ repertoire|= MY_REPERTOIRE_EXTENDED;
+ collation.set(cs, arg1->collation.derivation, repertoire);
if (arg1->type() == STRING_ITEM)
{ // Optimize the normal case
fixed_length=1;
diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc
index cbfba3d4d80..f74c963e26d 100644
--- a/sql/sql_lex.cc
+++ b/sql/sql_lex.cc
@@ -311,10 +311,12 @@ static char *get_text(Lex_input_stream *lip)
uint found_escape=0;
CHARSET_INFO *cs= lip->m_thd->charset();
+ lip->tok_bitmap= 0;
sep= yyGetLast(); // String should end with this
while (lip->ptr != lip->end_of_query)
{
- c = yyGet();
+ c= yyGet();
+ lip->tok_bitmap|= c;
#ifdef USE_MB
{
int l;
@@ -605,6 +607,7 @@ int MYSQLlex(void *arg, void *yythd)
break;
}
yylval->lex_str.length= lip->yytoklen;
+ lex->text_string_is_7bit= (lip->tok_bitmap & 0x80) ? 0 : 1;
return(NCHAR_STRING);
case MY_LEX_IDENT_OR_HEX:
@@ -926,6 +929,7 @@ int MYSQLlex(void *arg, void *yythd)
break;
}
yylval->lex_str.length=lip->yytoklen;
+ lex->text_string_is_7bit= (lip->tok_bitmap & 0x80) ? 0 : 1;
return(TEXT_STRING);
case MY_LEX_COMMENT: // Comment
diff --git a/sql/sql_lex.h b/sql/sql_lex.h
index f8405ef14ca..4b96218de80 100644
--- a/sql/sql_lex.h
+++ b/sql/sql_lex.h
@@ -957,6 +957,9 @@ public:
/** Position of ';' in the stream, to delimit multiple queries. */
const char* found_semicolon;
+
+ /** Token character bitmaps, to detect 7bit strings. */
+ uchar tok_bitmap;
/** SQL_MODE = IGNORE_SPACE. */
bool ignore_space;
@@ -994,6 +997,7 @@ typedef struct st_lex : public Query_tables_list
gptr yacc_yyss,yacc_yyvs;
THD *thd;
CHARSET_INFO *charset, *underscore_charset;
+ bool text_string_is_7bit;
/* store original leaf_tables for INSERT SELECT and PS/SP */
TABLE_LIST *leaf_tables_insert;
/* Position (first character index) of SELECT of CREATE VIEW statement */
diff --git a/sql/sql_string.cc b/sql/sql_string.cc
index 9d7df73cd7a..a87074c3359 100644
--- a/sql/sql_string.cc
+++ b/sql/sql_string.cc
@@ -263,6 +263,8 @@ bool String::needs_conversion(uint32 arg_length,
(to_cs == &my_charset_bin) ||
(to_cs == from_cs) ||
my_charset_same(from_cs, to_cs) ||
+ (my_charset_is_ascii_based(to_cs) &&
+ my_charset_is_8bit_pure_ascii(from_cs)) ||
((from_cs == &my_charset_bin) &&
(!(*offset=(arg_length % to_cs->mbminlen)))))
return FALSE;
diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy
index 5ae2f6db581..a45ebc4c640 100644
--- a/sql/sql_yacc.yy
+++ b/sql/sql_yacc.yy
@@ -7509,18 +7509,54 @@ opt_load_data_set_spec:
/* Common definitions */
text_literal:
- TEXT_STRING_literal
- {
- THD *thd= YYTHD;
- $$ = new Item_string($1.str,$1.length,thd->variables.collation_connection);
- }
- | NCHAR_STRING
- { $$= new Item_string($1.str,$1.length,national_charset_info); }
- | UNDERSCORE_CHARSET TEXT_STRING
- { $$ = new Item_string($2.str,$2.length,Lex->underscore_charset); }
- | text_literal TEXT_STRING_literal
- { ((Item_string*) $1)->append($2.str,$2.length); }
- ;
+ TEXT_STRING
+ {
+ LEX_STRING tmp;
+ THD *thd= YYTHD;
+ CHARSET_INFO *cs_con= thd->variables.collation_connection;
+ CHARSET_INFO *cs_cli= thd->variables.character_set_client;
+ uint repertoire= thd->lex->text_string_is_7bit &&
+ my_charset_is_ascii_based(cs_cli) ?
+ MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30;
+ if (thd->charset_is_collation_connection ||
+ (repertoire == MY_REPERTOIRE_ASCII &&
+ my_charset_is_ascii_based(cs_con)))
+ tmp= $1;
+ else
+ thd->convert_string(&tmp, cs_con, $1.str, $1.length, cs_cli);
+ $$= new Item_string(tmp.str, tmp.length, cs_con,
+ DERIVATION_COERCIBLE, repertoire);
+ }
+ | NCHAR_STRING
+ {
+ uint repertoire= Lex->text_string_is_7bit ?
+ MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30;
+ DBUG_ASSERT(my_charset_is_ascii_based(national_charset_info));
+ $$= new Item_string($1.str, $1.length, national_charset_info,
+ DERIVATION_COERCIBLE, repertoire);
+ }
+ | UNDERSCORE_CHARSET TEXT_STRING
+ {
+ $$= new Item_string($2.str, $2.length, Lex->underscore_charset);
+ ((Item_string*) $$)->set_repertoire_from_value();
+ }
+ | text_literal TEXT_STRING_literal
+ {
+ Item_string* item= (Item_string*) $1;
+ item->append($2.str, $2.length);
+ if (!(item->collation.repertoire & MY_REPERTOIRE_EXTENDED))
+ {
+ /*
+ If the string has been pure ASCII so far,
+ check the new part.
+ */
+ CHARSET_INFO *cs= YYTHD->variables.collation_connection;
+ item->collation.repertoire|= my_string_repertoire(cs,
+ $2.str,
+ $2.length);
+ }
+ }
+ ;
text_string:
TEXT_STRING_literal
@@ -7592,20 +7628,22 @@ literal:
| TRUE_SYM { $$= new Item_int((char*) "TRUE",1,1); }
| HEX_NUM { $$ = new Item_hex_string($1.str, $1.length);}
| BIN_NUM { $$= new Item_bin_string($1.str, $1.length); }
- | UNDERSCORE_CHARSET HEX_NUM
- {
- Item *tmp= new Item_hex_string($2.str, $2.length);
- /*
- it is OK only emulate fix_fieds, because we need only
+ | UNDERSCORE_CHARSET HEX_NUM
+ {
+ Item *tmp= new Item_hex_string($2.str, $2.length);
+ /*
+ it is OK only emulate fix_fieds, because we need only
value of constant
- */
- String *str= tmp ?
- tmp->quick_fix_field(), tmp->val_str((String*) 0) :
- (String*) 0;
- $$= new Item_string(str ? str->ptr() : "",
- str ? str->length() : 0,
- Lex->underscore_charset);
- }
+ */
+ String *str= tmp ?
+ tmp->quick_fix_field(), tmp->val_str((String*) 0) :
+ (String*) 0;
+ $$= new Item_string(str ? str->ptr() : "",
+ str ? str->length() : 0,
+ Lex->underscore_charset);
+ if ($$)
+ ((Item_string *) $$)->set_repertoire_from_value();
+ }
| UNDERSCORE_CHARSET BIN_NUM
{
Item *tmp= new Item_bin_string($2.str, $2.length);
diff --git a/strings/conf_to_src.c b/strings/conf_to_src.c
index e2ac9846c85..dc2a300a2ec 100644
--- a/strings/conf_to_src.c
+++ b/strings/conf_to_src.c
@@ -179,14 +179,16 @@ is_case_sensitive(CHARSET_INFO *cs)
cs->sort_order['a'] < cs->sort_order['B']) ? 1 : 0;
}
+
void dispcset(FILE *f,CHARSET_INFO *cs)
{
fprintf(f,"{\n");
fprintf(f," %d,%d,%d,\n",cs->number,0,0);
- fprintf(f," MY_CS_COMPILED%s%s%s,\n",
- cs->state & MY_CS_BINSORT ? "|MY_CS_BINSORT" : "",
- cs->state & MY_CS_PRIMARY ? "|MY_CS_PRIMARY" : "",
- is_case_sensitive(cs) ? "|MY_CS_CSSORT" : "");
+ fprintf(f," MY_CS_COMPILED%s%s%s%s,\n",
+ cs->state & MY_CS_BINSORT ? "|MY_CS_BINSORT" : "",
+ cs->state & MY_CS_PRIMARY ? "|MY_CS_PRIMARY" : "",
+ is_case_sensitive(cs) ? "|MY_CS_CSSORT" : "",
+ my_charset_is_8bit_pure_ascii(cs) ? "|MY_CS_PUREASCII" : "");
if (cs->name)
{
@@ -243,6 +245,28 @@ void dispcset(FILE *f,CHARSET_INFO *cs)
}
+static void
+fprint_copyright(FILE *file)
+{
+ fprintf(file,
+"/* Copyright (C) 2000-2007 MySQL AB\n"
+"\n"
+" This program is free software; you can redistribute it and/or modify\n"
+" it under the terms of the GNU General Public License as published by\n"
+" the Free Software Foundation; version 2 of the License.\n"
+"\n"
+" This program is distributed in the hope that it will be useful,\n"
+" but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
+" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
+" GNU General Public License for more details.\n"
+"\n"
+" You should have received a copy of the GNU General Public License\n"
+" along with this program; if not, write to the Free Software\n"
+" Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */\n"
+"\n");
+}
+
+
int
main(int argc, char **argv __attribute__((unused)))
{
@@ -283,6 +307,7 @@ main(int argc, char **argv __attribute__((unused)))
"directory:\n");
fprintf(f, " ./conf_to_src ../sql/share/charsets/ > FILE\n");
fprintf(f, "*/\n\n");
+ fprint_copyright(f);
fprintf(f,"#include <my_global.h>\n");
fprintf(f,"#include <m_ctype.h>\n\n");
diff --git a/strings/ctype-extra.c b/strings/ctype-extra.c
index 1c20828ea54..2a7fcbd383e 100644
--- a/strings/ctype-extra.c
+++ b/strings/ctype-extra.c
@@ -5,7 +5,8 @@
To re-generate, run the following in the strings/ directory:
./conf_to_src ../sql/share/charsets/ > FILE
*/
-/* Copyright (C) 2000-2003 MySQL AB
+
+/* Copyright (C) 2000-2007 MySQL AB
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -6721,7 +6722,7 @@ CHARSET_INFO compiled_charsets[] = {
#ifdef HAVE_CHARSET_ascii
{
11,0,0,
- MY_CS_COMPILED|MY_CS_PRIMARY,
+ MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_PUREASCII,
"ascii", /* cset name */
"ascii_general_ci", /* coll name */
"", /* comment */
@@ -7810,7 +7811,7 @@ CHARSET_INFO compiled_charsets[] = {
#ifdef HAVE_CHARSET_ascii
{
65,0,0,
- MY_CS_COMPILED|MY_CS_BINSORT,
+ MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_PUREASCII,
"ascii", /* cset name */
"ascii_bin", /* coll name */
"", /* comment */
diff --git a/strings/ctype.c b/strings/ctype.c
index e7399c5438b..372a1a8a468 100644
--- a/strings/ctype.c
+++ b/strings/ctype.c
@@ -306,3 +306,89 @@ my_bool my_parse_charset_xml(const char *buf, uint len,
my_xml_parser_free(&p);
return rc;
}
+
+
+/*
+ Check repertoire: detect pure ascii strings
+*/
+uint
+my_string_repertoire(CHARSET_INFO *cs, const char *str, ulong length)
+{
+ const char *strend= str + length;
+ if (cs->mbminlen == 1)
+ {
+ for ( ; str < strend; str++)
+ {
+ if (((uchar) *str) > 0x7F)
+ return MY_REPERTOIRE_UNICODE30;
+ }
+ }
+ else
+ {
+ my_wc_t wc;
+ int chlen;
+ for (; (chlen= cs->cset->mb_wc(cs, &wc, str, strend)) > 0; str+= chlen)
+ {
+ if (wc > 0x7F)
+ return MY_REPERTOIRE_UNICODE30;
+ }
+ }
+ return MY_REPERTOIRE_ASCII;
+}
+
+
+/*
+ Detect whether a character set is ASCII compatible.
+
+ Returns TRUE for:
+
+ - all 8bit character sets whose Unicode mapping of 0x7B is '{'
+ (ignores swe7 which maps 0x7B to "LATIN LETTER A WITH DIAERESIS")
+
+ - all multi-byte character sets having mbminlen == 1
+ (ignores ucs2 whose mbminlen is 2)
+
+ TODO:
+
+ When merging to 5.2, this function should be changed
+ to check a new flag MY_CS_NONASCII,
+
+ return (cs->flag & MY_CS_NONASCII) ? 0 : 1;
+
+ This flag was previously added into 5.2 under terms
+ of WL#3759 "Optimize identifier conversion in client-server protocol"
+ especially to mark character sets not compatible with ASCII.
+
+ We won't backport this flag to 5.0 or 5.1.
+ This function is Ok for 5.0 and 5.1, because we're not going
+ to introduce new tricky character sets between 5.0 and 5.2.
+*/
+my_bool
+my_charset_is_ascii_based(CHARSET_INFO *cs)
+{
+ return
+ (cs->mbmaxlen == 1 && cs->tab_to_uni && cs->tab_to_uni['{'] == '{') ||
+ (cs->mbminlen == 1 && cs->mbmaxlen > 1);
+}
+
+
+/*
+ Detect if a character set is 8bit,
+ and it is pure ascii, i.e. doesn't have
+ characters outside U+0000..U+007F
+ This functions is shared between "conf_to_src"
+ and dynamic charsets loader in "mysqld".
+*/
+my_bool
+my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs)
+{
+ size_t code;
+ if (!cs->tab_to_uni)
+ return 0;
+ for (code= 0; code < 256; code++)
+ {
+ if (cs->tab_to_uni[code] > 0x7F)
+ return 0;
+ }
+ return 1;
+}