Bug#28875 Conversion between ASCII and LATIN1 charsets does not function

(Regression, caused by a patch for the bug 22646). Problem: when result type of date_format() was changed from binary string to character string, mixing date_format() with a ascii column in CONCAT() stopped to work. Fix: - adding "repertoire" flag into DTCollation class, to mark items which can return only pure ASCII strings. - allow character set conversion from pure ASCII to other character sets. include/m_ctype.h: Defining new flags. Adding new function prototypes. mysql-test/r/ctype_ucs.result: Adding tests. mysql-test/r/ctype_utf8.result: Adding tests. mysql-test/r/func_time.result: Adding tests. mysql-test/t/ctype_ucs.test: Adding tests. mysql-test/t/ctype_utf8.test: Adding tests. mysql-test/t/func_time.test: Adding test. mysys/charset.c: Adding pure ASCII detection when loading a dynamic character set. sql/item.cc: - Moving detection of a Unicode superset into function. - Adding detection of a ASCII subset. - Adding creation of to-ASCII character set convertor when safe_charset_converter() failed and when the argument. repertoire is know to be pure ASCII. sql/item.h: - Adding "repertoire" member into DTCollation class. - Adding "repertoire" argument to constructors. - Adding new methods: set_repertoire_from_charset() set_repertoire_from_value() sql/item_func.cc: Adding "repertoire" argument. sql/item_strfunc.cc: Adding "repertoire" argument. sql/item_timefunc.cc: Initializing the result repertoire taking into account the "is_ascii" flag of the current locale. sql/sql_lex.cc: Detect 7bit strings, return in Lex->text_string_is_7bit. sql/sql_lex.h: Adding new member into LEX structure. Adding new member into Lex_input_stream sql/sql_string.cc: Allow simple copy from pure ASCII to a ASCII-based character set. sql/sql_yacc.yy: Depening on Lex->text_string_is_7bit and character set features, create Item_string with MY_REPERTOIRE_ASCII when it is possible. strings/conf_to_src.c: - Adding printing of the "MY_CS_PUREASCII" flag - Adding printing of copyright strings/ctype-extra.c: Recreating ctype-extra.c: ascii_general_ci and ascii_bin are now marked with MY_CS_PUREASCII flag. strings/ctype.c: Adding new functions.
author: unknown <bar@mysql.com/bar.myoffice.izhnet.ru> 2007-08-03 15:25:23 +0500
committer: unknown <bar@mysql.com/bar.myoffice.izhnet.ru> 2007-08-03 15:25:23 +0500
commit: 53df09a9a6a99b82e2a8869eb16737a78772b29e (patch)
tree: 01c0ee2f5244a4d68fc6cdfb6555fa1af4589a8c
parent: b307fc4d8fd5380cec948c07550b5ae73624e274 (diff)
download: mariadb-git-53df09a9a6a99b82e2a8869eb16737a78772b29e.tar.gz
20 files changed, 457 insertions, 57 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h
index 9f21ac16a05..218ec2daadb 100644
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -78,8 +78,14 @@ extern MY_UNICASE_INFO *my_unicase_turkish[256];
 #define MY_CS_READY	256    /* if a charset is initialized    */
 #define MY_CS_AVAILABLE	512    /* If either compiled-in or loaded*/
 #define MY_CS_CSSORT	1024   /* if case sensitive sort order   */	
+#define MY_CS_PUREASCII 2048   /* if a charset is pure ascii     */
 #define MY_CHARSET_UNDEFINED 0
 
+/* Character repertoire flags */
+#define MY_REPERTOIRE_ASCII      1 /* Pure ASCII            U+0000..U+007F */
+#define MY_REPERTOIRE_EXTENDED   2 /* Extended characters:  U+0080..U+FFFF */
+#define MY_REPERTOIRE_UNICODE30  3 /* ASCII | EXTENDED:     U+0000..U+FFFF */
+
 
 typedef struct my_uni_idx_st
 {
@@ -436,6 +442,11 @@ my_bool my_propagate_simple(CHARSET_INFO *cs, const uchar *str, uint len);
 my_bool my_propagate_complex(CHARSET_INFO *cs, const uchar *str, uint len);
 
 
+uint my_string_repertoire(CHARSET_INFO *cs, const char *str, ulong len);
+my_bool my_charset_is_ascii_based(CHARSET_INFO *cs);
+my_bool my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs);
+
+
 #define	_MY_U	01	/* Upper case */
 #define	_MY_L	02	/* Lower case */
 #define	_MY_NMR	04	/* Numeral (digit) */
diff --git a/mysql-test/r/ctype_ucs.result b/mysql-test/r/ctype_ucs.result
index 960953b3c5e..350fc3f6bd6 100644
--- a/mysql-test/r/ctype_ucs.result
+++ b/mysql-test/r/ctype_ucs.result
@@ -865,4 +865,30 @@ blob	65535	65535
 text	65535	65535
 text	65535	32767
 drop table t1;
+create table t1 (a varchar(15) character set ascii not null, b int);
+insert into t1 values ('a',1);
+select concat(a,if(b<10,_ucs2 0x0061,_ucs2 0x0062)) from t1;
+concat(a,if(b<10,_ucs2 0x0061,_ucs2 0x0062))
+aa
+select concat(a,if(b>10,_ucs2 0x0061,_ucs2 0x0062)) from t1;
+concat(a,if(b>10,_ucs2 0x0061,_ucs2 0x0062))
+ab
+select * from t1 where a=if(b<10,_ucs2 0x0061,_ucs2 0x0062);
+a	b
+a	1
+select * from t1 where a=if(b>10,_ucs2 0x0061,_ucs2 0x0062);
+a	b
+select concat(a,if(b<10,_ucs2 0x00C0,_ucs2 0x0062)) from t1;
+ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation 'concat'
+select concat(a,if(b>10,_ucs2 0x00C0,_ucs2 0x0062)) from t1;
+ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation 'concat'
+select concat(a,if(b<10,_ucs2 0x0062,_ucs2 0x00C0)) from t1;
+ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation 'concat'
+select concat(a,if(b>10,_ucs2 0x0062,_ucs2 0x00C0)) from t1;
+ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation 'concat'
+select * from t1 where a=if(b<10,_ucs2 0x00C0,_ucs2 0x0062);
+ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation '='
+select * from t1 where a=if(b<10,_ucs2 0x0062,_ucs2 0x00C0);
+ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation '='
+drop table t1;
 End of 5.0 tests
diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result
index 216b5f393fb..3b20ded7361 100644
--- a/mysql-test/r/ctype_utf8.result
+++ b/mysql-test/r/ctype_utf8.result
@@ -1639,6 +1639,42 @@ coercibility(col1)	collation(col1)
 0	utf8_swedish_ci
 drop view v1, v2;
 drop table t1;
+set names utf8;
+create table t1 (a varchar(10) character set latin1, b int);
+insert into t1 values ('a',1);
+select concat(a, if(b>10, N'x', N'y')) from t1;
+concat(a, if(b>10, N'x', N'y'))
+ay
+select concat(a, if(b>10, N'æ', N'ß')) from t1;
+ERROR HY000: Illegal mix of collations (latin1_swedish_ci,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation 'concat'
+drop table t1;
+set names utf8;
+create table t1 (a varchar(10) character set latin1, b int);
+insert into t1 values ('a',1);
+select concat(a, if(b>10, _utf8'x', _utf8'y')) from t1;
+concat(a, if(b>10, _utf8'x', _utf8'y'))
+ay
+select concat(a, if(b>10, _utf8'æ', _utf8'ß')) from t1;
+ERROR HY000: Illegal mix of collations (latin1_swedish_ci,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation 'concat'
+drop table t1;
+set names utf8;
+create table t1 (a varchar(10) character set latin1, b int);
+insert into t1 values ('a',1);
+select concat(a, if(b>10, _utf8 0x78, _utf8 0x79)) from t1;
+concat(a, if(b>10, _utf8 0x78, _utf8 0x79))
+ay
+select concat(a, if(b>10, _utf8 0xC3A6, _utf8 0xC3AF)) from t1;
+ERROR HY000: Illegal mix of collations (latin1_swedish_ci,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation 'concat'
+drop table t1;
+set names utf8;
+create table t1 (a varchar(10) character set latin1, b int);
+insert into t1 values ('a',1);
+select concat(a, if(b>10, 'x' 'x', 'y' 'y')) from t1;
+concat(a, if(b>10, 'x' 'x', 'y' 'y'))
+ayy
+select concat(a, if(b>10, 'x' 'æ', 'y' 'ß')) from t1;
+ERROR HY000: Illegal mix of collations (latin1_swedish_ci,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation 'concat'
+drop table t1;
 CREATE TABLE t1 (
 colA int(11) NOT NULL,
 colB varchar(255) character set utf8 NOT NULL,
diff --git a/mysql-test/r/func_time.result b/mysql-test/r/func_time.result
index 56ea72a8ee3..2207cd27243 100644
--- a/mysql-test/r/func_time.result
+++ b/mysql-test/r/func_time.result
@@ -1246,3 +1246,19 @@ SELECT TIME_FORMAT(SEC_TO_TIME(a),"%H:%i:%s") FROM (SELECT 3020399 AS a UNION SE
 TIME_FORMAT(SEC_TO_TIME(a),"%H:%i:%s")
 838:59:58
 838:59:59
+set names latin1;
+create table t1 (a varchar(15) character set ascii not null);
+insert into t1 values ('070514-000000');
+select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1;
+concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull'))
+#
+set names swe7;
+select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1;
+ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (swe7_swedish_ci,COERCIBLE) for operation 'concat'
+set names latin1;
+set lc_time_names=fr_FR;
+select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1;
+ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (latin1_swedish_ci,COERCIBLE) for operation 'concat'
+set lc_time_names=en_US;
+drop table t1;
+End of 5.0 tests
diff --git a/mysql-test/t/ctype_ucs.test b/mysql-test/t/ctype_ucs.test
index c3320159c41..d1dd2378bd0 100644
--- a/mysql-test/t/ctype_ucs.test
+++ b/mysql-test/t/ctype_ucs.test
@@ -594,4 +594,34 @@ select data_type, character_octet_length, character_maximum_length
   from information_schema.columns where table_name='t1';
 drop table t1;
 
+#
+# Conversion from UCS2 to ASCII is possible
+# if the UCS2 string consists of only ASCII characters
+#
+create table t1 (a varchar(15) character set ascii not null, b int);
+insert into t1 values ('a',1);
+select concat(a,if(b<10,_ucs2 0x0061,_ucs2 0x0062)) from t1;
+select concat(a,if(b>10,_ucs2 0x0061,_ucs2 0x0062)) from t1;
+select * from t1 where a=if(b<10,_ucs2 0x0061,_ucs2 0x0062);
+select * from t1 where a=if(b>10,_ucs2 0x0061,_ucs2 0x0062);
+
+#
+# Conversion from UCS2 to ASCII is not possible if 
+# the UCS2 string has non-ASCII characters
+#
+--error 1267
+select concat(a,if(b<10,_ucs2 0x00C0,_ucs2 0x0062)) from t1;
+--error 1267
+select concat(a,if(b>10,_ucs2 0x00C0,_ucs2 0x0062)) from t1;
+--error 1267
+select concat(a,if(b<10,_ucs2 0x0062,_ucs2 0x00C0)) from t1;
+--error 1267
+select concat(a,if(b>10,_ucs2 0x0062,_ucs2 0x00C0)) from t1;
+--error 1267
+select * from t1 where a=if(b<10,_ucs2 0x00C0,_ucs2 0x0062);
+--error 1267
+select * from t1 where a=if(b<10,_ucs2 0x0062,_ucs2 0x00C0);
+drop table t1;
+
+
 --echo End of 5.0 tests
diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test
index c4637d14edc..603c60faf82 100644
--- a/mysql-test/t/ctype_utf8.test
+++ b/mysql-test/t/ctype_utf8.test
@@ -1314,6 +1314,46 @@ select coercibility(col1), collation(col1) from v2;
 drop view v1, v2;
 drop table t1;
 
+#
+# Check conversion of NCHAR strings to subset (e.g. latin1).
+# Conversion is possible if string repertoire is ASCII.
+# Conversion is not possible if the string have extended characters
+#
+set names utf8;
+create table t1 (a varchar(10) character set latin1, b int);
+insert into t1 values ('a',1);
+select concat(a, if(b>10, N'x', N'y')) from t1;
+--error 1267
+select concat(a, if(b>10, N'æ', N'ß')) from t1;
+drop table t1;
+
+# Conversion tests for character set introducers
+set names utf8;
+create table t1 (a varchar(10) character set latin1, b int);
+insert into t1 values ('a',1);
+select concat(a, if(b>10, _utf8'x', _utf8'y')) from t1;
+--error 1267
+select concat(a, if(b>10, _utf8'æ', _utf8'ß')) from t1;
+drop table t1;
+
+# Conversion tests for introducer + HEX string
+set names utf8;
+create table t1 (a varchar(10) character set latin1, b int);
+insert into t1 values ('a',1);
+select concat(a, if(b>10, _utf8 0x78, _utf8 0x79)) from t1;
+--error 1267
+select concat(a, if(b>10, _utf8 0xC3A6, _utf8 0xC3AF)) from t1;
+drop table t1;
+
+# Conversion tests for "text_literal TEXT_STRING_literal" syntax structure
+set names utf8;
+create table t1 (a varchar(10) character set latin1, b int);
+insert into t1 values ('a',1);
+select concat(a, if(b>10, 'x' 'x', 'y' 'y')) from t1;
+--error 1267
+select concat(a, if(b>10, 'x' 'æ', 'y' 'ß')) from t1;
+drop table t1;
+
 
 #
 # Bug#19960: Inconsistent results when joining
diff --git a/mysql-test/t/func_time.test b/mysql-test/t/func_time.test
index da909dc578f..96d064fdf41 100644
--- a/mysql-test/t/func_time.test
+++ b/mysql-test/t/func_time.test
@@ -752,3 +752,29 @@ DROP TABLE t1;
 # Check if using GROUP BY with TIME_FORMAT() produces correct results
 
 SELECT TIME_FORMAT(SEC_TO_TIME(a),"%H:%i:%s") FROM (SELECT 3020399 AS a UNION SELECT 3020398 ) x GROUP BY 1;
+
+#
+# Bug#28875 Conversion between ASCII and LATIN1 charsets does not function
+#
+set names latin1;
+create table t1 (a varchar(15) character set ascii not null);
+insert into t1 values ('070514-000000');
+# Conversion of date_format() result to ASCII
+# is safe with the default locale en_US
+--replace_column 1 #
+select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1;
+# Error for swe7: it is not ASCII compatible
+set names swe7;
+--error 1267
+select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1;
+set names latin1;
+# Conversion of date_format() result to ASCII
+# is not safe with the non-default locale fr_FR
+# because month and day names can have accented characters
+set lc_time_names=fr_FR;
+--error 1267
+select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1;
+set lc_time_names=en_US;
+drop table t1;
+
+--echo End of 5.0 tests
diff --git a/mysys/charset.c b/mysys/charset.c
index 9ea17c6515c..4c3f2d0ab71 100644
--- a/mysys/charset.c
+++ b/mysys/charset.c
@@ -277,6 +277,9 @@ static int add_collation(CHARSET_INFO *cs)
         if (sort_order && sort_order['A'] < sort_order['a'] &&
                           sort_order['a'] < sort_order['B'])
           all_charsets[cs->number]->state|= MY_CS_CSSORT; 
+
+        if (my_charset_is_8bit_pure_ascii(all_charsets[cs->number]))
+          all_charsets[cs->number]->state|= MY_CS_PUREASCII;
       }
     }
     else
diff --git a/sql/item.cc b/sql/item.cc
index 92ea35072f9..30fc32706fd 100644
--- a/sql/item.cc
+++ b/sql/item.cc
@@ -1296,6 +1296,25 @@ void Item::split_sum_func2(THD *thd, Item **ref_pointer_array,
 }
 
 
+static bool
+left_is_superset(DTCollation *left, DTCollation *right)
+{
+  /* Allow convert to Unicode */
+  if (left->collation->state & MY_CS_UNICODE &&
+      (left->derivation < right->derivation ||
+       (left->derivation == right->derivation &&
+        !(right->collation->state & MY_CS_UNICODE))))
+    return TRUE;
+  /* Allow convert from ASCII */
+  if (right->repertoire == MY_REPERTOIRE_ASCII &&
+      (left->derivation < right->derivation ||
+       (left->derivation == right->derivation &&
+        !(left->repertoire == MY_REPERTOIRE_ASCII))))
+    return TRUE;
+  /* Disallow conversion otherwise */
+  return FALSE;
+}
+
 /*
    Aggregate two collations together taking
    into account their coercibility (aka derivation):
@@ -1360,18 +1379,12 @@ bool DTCollation::aggregate(DTCollation &dt, uint flags)
        ; // Do nothing
     }
     else if ((flags & MY_COLL_ALLOW_SUPERSET_CONV) &&
-             collation->state & MY_CS_UNICODE &&
-             (derivation < dt.derivation ||
-             (derivation == dt.derivation &&
-             !(dt.collation->state & MY_CS_UNICODE))))
+             left_is_superset(this, &dt))
     {
       // Do nothing
     }
     else if ((flags & MY_COLL_ALLOW_SUPERSET_CONV) &&
-             dt.collation->state & MY_CS_UNICODE &&
-             (dt.derivation < derivation ||
-              (dt.derivation == derivation &&
-             !(collation->state & MY_CS_UNICODE))))
+             left_is_superset(&dt, this))
     {
       set(dt);
     }
@@ -1390,7 +1403,7 @@ bool DTCollation::aggregate(DTCollation &dt, uint flags)
     else
     {
       // Cannot apply conversion
-      set(0, DERIVATION_NONE);
+      set(0, DERIVATION_NONE, 0);
       return 1;
     }
   }
@@ -1412,8 +1425,8 @@ bool DTCollation::aggregate(DTCollation &dt, uint flags)
     {
       if (derivation == DERIVATION_EXPLICIT)
       {
-	set(0, DERIVATION_NONE);
-	return 1;
+        set(0, DERIVATION_NONE, 0);
+        return 1;
       }
       if (collation->state & MY_CS_BINSORT)
         return 0;
@@ -1427,6 +1440,7 @@ bool DTCollation::aggregate(DTCollation &dt, uint flags)
       set(bin, DERIVATION_NONE);
     }
   }
+  repertoire|= dt.repertoire;
   return 0;
 }
 
@@ -1566,12 +1580,16 @@ bool agg_item_charsets(DTCollation &coll, const char *fname,
   {
     Item* conv;
     uint32 dummy_offset;
-    if (!String::needs_conversion(0, coll.collation,
-                                  (*arg)->collation.collation,
+    if (!String::needs_conversion(0, (*arg)->collation.collation,
+                                  coll.collation,
                                   &dummy_offset))
       continue;
 
-    if (!(conv= (*arg)->safe_charset_converter(coll.collation)))
+    if (!(conv= (*arg)->safe_charset_converter(coll.collation)) &&
+        ((*arg)->collation.repertoire == MY_REPERTOIRE_ASCII))
+      conv= new Item_func_conv_charset(*arg, coll.collation, 1);
+
+    if (!conv)
     {
       if (nargs >=2 && nargs <= 3)
       {
diff --git a/sql/item.h b/sql/item.h
index 11dce3a7758..9a45314651a 100644
--- a/sql/item.h
+++ b/sql/item.h
@@ -49,29 +49,50 @@ class DTCollation {
 public:
   CHARSET_INFO     *collation;
   enum Derivation derivation;
+  uint repertoire;
   
+  void set_repertoire_from_charset(CHARSET_INFO *cs)
+  {
+    repertoire= cs->state & MY_CS_PUREASCII ?
+                MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30;
+  }
   DTCollation()
   {
     collation= &my_charset_bin;
     derivation= DERIVATION_NONE;
+    repertoire= MY_REPERTOIRE_UNICODE30;
   }
   DTCollation(CHARSET_INFO *collation_arg, Derivation derivation_arg)
   {
     collation= collation_arg;
     derivation= derivation_arg;
+    set_repertoire_from_charset(collation_arg);
   }
   void set(DTCollation &dt)
   { 
     collation= dt.collation;
     derivation= dt.derivation;
+    repertoire= dt.repertoire;
   }
   void set(CHARSET_INFO *collation_arg, Derivation derivation_arg)
   {
     collation= collation_arg;
     derivation= derivation_arg;
+    set_repertoire_from_charset(collation_arg);
+  }
+  void set(CHARSET_INFO *collation_arg,
+           Derivation derivation_arg,
+           uint repertoire_arg)
+  {
+    collation= collation_arg;
+    derivation= derivation_arg;
+    repertoire= repertoire_arg;
   }
   void set(CHARSET_INFO *collation_arg)
-  { collation= collation_arg; }
+  {
+    collation= collation_arg;
+    set_repertoire_from_charset(collation_arg);
+  }
   void set(Derivation derivation_arg)
   { derivation= derivation_arg; }
   bool aggregate(DTCollation &dt, uint flags= 0);
@@ -1650,10 +1671,11 @@ class Item_string :public Item
 {
 public:
   Item_string(const char *str,uint length,
-  	      CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE)
+              CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE,
+              uint repertoire= MY_REPERTOIRE_UNICODE30)
   {
-    collation.set(cs, dv);
-    str_value.set_or_copy_aligned(str,length,cs);
+    str_value.set_or_copy_aligned(str, length, cs);
+    collation.set(cs, dv, repertoire);
     /*
       We have to have a different max_length than 'length' here to
       ensure that we get the right length if we do use the item
@@ -1677,10 +1699,11 @@ public:
     fixed= 1;
   }
   Item_string(const char *name_par, const char *str, uint length,
-	      CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE)
+              CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE,
+              uint repertoire= MY_REPERTOIRE_UNICODE30)
   {
-    collation.set(cs, dv);
-    str_value.set_or_copy_aligned(str,length,cs);
+    str_value.set_or_copy_aligned(str, length, cs);
+    collation.set(cs, dv, repertoire);
     max_length= str_value.numchars()*cs->mbmaxlen;
     set_name(name_par, 0, cs);
     decimals=NOT_FIXED_DEC;
@@ -1696,6 +1719,12 @@ public:
     str_value.copy(str_arg, length_arg, collation.collation);
     max_length= str_value.numchars() * collation.collation->mbmaxlen;
   }
+  void set_repertoire_from_value()
+  {
+    collation.repertoire= my_string_repertoire(str_value.charset(),
+                                               str_value.ptr(),
+                                               str_value.length());
+  }
   enum Type type() const { return STRING_ITEM; }
   double val_real();
   longlong val_int();
diff --git a/sql/item_func.cc b/sql/item_func.cc
index 580d19fbd4e..c2943197a7c 100644
--- a/sql/item_func.cc
+++ b/sql/item_func.cc
@@ -3751,7 +3751,7 @@ static user_var_entry *get_variable(HASH *hash, LEX_STRING &name,
     entry->value=0;
     entry->length=0;
     entry->update_query_id=0;
-    entry->collation.set(NULL, DERIVATION_IMPLICIT);
+    entry->collation.set(NULL, DERIVATION_IMPLICIT, 0);
     entry->unsigned_flag= 0;
     /*
       If we are here, we were called from a SET or a query which sets a
diff --git a/sql/item_strfunc.cc b/sql/item_strfunc.cc
index f9a0f715985..3d59bd27972 100644
--- a/sql/item_strfunc.cc
+++ b/sql/item_strfunc.cc
@@ -2672,7 +2672,8 @@ void Item_func_set_collation::fix_length_and_dec()
              colname, args[0]->collation.collation->csname);
     return;
   }
-  collation.set(set_collation, DERIVATION_EXPLICIT);
+  collation.set(set_collation, DERIVATION_EXPLICIT,
+                args[0]->collation.repertoire);
   max_length= args[0]->max_length;
 }
 
diff --git a/sql/item_timefunc.cc b/sql/item_timefunc.cc
index 9aabd068d25..62093154097 100644
--- a/sql/item_timefunc.cc
+++ b/sql/item_timefunc.cc
@@ -1717,7 +1717,11 @@ void Item_func_date_format::fix_length_and_dec()
   Item *arg1= args[1]->this_item();
 
   decimals=0;
-  collation.set(thd->variables.collation_connection);
+  CHARSET_INFO *cs= thd->variables.collation_connection;
+  uint32 repertoire= arg1->collation.repertoire;
+  if (!thd->variables.lc_time_names->is_ascii)
+    repertoire|= MY_REPERTOIRE_EXTENDED;
+  collation.set(cs, arg1->collation.derivation, repertoire);
   if (arg1->type() == STRING_ITEM)
   {						// Optimize the normal case
     fixed_length=1;
diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc
index cbfba3d4d80..f74c963e26d 100644
--- a/sql/sql_lex.cc
+++ b/sql/sql_lex.cc
@@ -311,10 +311,12 @@ static char *get_text(Lex_input_stream *lip)
   uint found_escape=0;
   CHARSET_INFO *cs= lip->m_thd->charset();
 
+  lip->tok_bitmap= 0;
   sep= yyGetLast();			// String should end with this
   while (lip->ptr != lip->end_of_query)
   {
-    c = yyGet();
+    c= yyGet();
+    lip->tok_bitmap|= c;
 #ifdef USE_MB
     {
       int l;
@@ -605,6 +607,7 @@ int MYSQLlex(void *arg, void *yythd)
 	break;
       }
       yylval->lex_str.length= lip->yytoklen;
+      lex->text_string_is_7bit= (lip->tok_bitmap & 0x80) ? 0 : 1;
       return(NCHAR_STRING);
 
     case MY_LEX_IDENT_OR_HEX:
@@ -926,6 +929,7 @@ int MYSQLlex(void *arg, void *yythd)
 	break;
       }
       yylval->lex_str.length=lip->yytoklen;
+      lex->text_string_is_7bit= (lip->tok_bitmap & 0x80) ? 0 : 1;
       return(TEXT_STRING);
 
     case MY_LEX_COMMENT:			//  Comment
diff --git a/sql/sql_lex.h b/sql/sql_lex.h
index f8405ef14ca..4b96218de80 100644
--- a/sql/sql_lex.h
+++ b/sql/sql_lex.h
@@ -957,6 +957,9 @@ public:
 
   /** Position of ';' in the stream, to delimit multiple queries. */
   const char* found_semicolon;
+  
+  /** Token character bitmaps, to detect 7bit strings. */
+  uchar tok_bitmap;
 
   /** SQL_MODE = IGNORE_SPACE. */
   bool ignore_space;
@@ -994,6 +997,7 @@ typedef struct st_lex : public Query_tables_list
   gptr yacc_yyss,yacc_yyvs;
   THD *thd;
   CHARSET_INFO *charset, *underscore_charset;
+  bool text_string_is_7bit;
   /* store original leaf_tables for INSERT SELECT and PS/SP */
   TABLE_LIST *leaf_tables_insert;
   /* Position (first character index) of SELECT of CREATE VIEW statement */
diff --git a/sql/sql_string.cc b/sql/sql_string.cc
index 9d7df73cd7a..a87074c3359 100644
--- a/sql/sql_string.cc
+++ b/sql/sql_string.cc
@@ -263,6 +263,8 @@ bool String::needs_conversion(uint32 arg_length,
       (to_cs == &my_charset_bin) || 
       (to_cs == from_cs) ||
       my_charset_same(from_cs, to_cs) ||
+      (my_charset_is_ascii_based(to_cs) &&
+       my_charset_is_8bit_pure_ascii(from_cs)) ||
       ((from_cs == &my_charset_bin) &&
        (!(*offset=(arg_length % to_cs->mbminlen)))))
     return FALSE;
diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy
index 5ae2f6db581..a45ebc4c640 100644
--- a/sql/sql_yacc.yy
+++ b/sql/sql_yacc.yy
@@ -7509,18 +7509,54 @@ opt_load_data_set_spec:
 /* Common definitions */
 
 text_literal:
-	TEXT_STRING_literal
-	{
-	  THD *thd= YYTHD;
-	  $$ = new Item_string($1.str,$1.length,thd->variables.collation_connection);
-	}
-	| NCHAR_STRING
-	{ $$=  new Item_string($1.str,$1.length,national_charset_info); }
-	| UNDERSCORE_CHARSET TEXT_STRING
-	  { $$ = new Item_string($2.str,$2.length,Lex->underscore_charset); }
-	| text_literal TEXT_STRING_literal
-	  { ((Item_string*) $1)->append($2.str,$2.length); }
-	;
+        TEXT_STRING
+        {
+          LEX_STRING tmp;
+          THD *thd= YYTHD;
+          CHARSET_INFO *cs_con= thd->variables.collation_connection;
+          CHARSET_INFO *cs_cli= thd->variables.character_set_client;
+          uint repertoire= thd->lex->text_string_is_7bit &&
+                             my_charset_is_ascii_based(cs_cli) ?
+                           MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30;
+          if (thd->charset_is_collation_connection ||
+              (repertoire == MY_REPERTOIRE_ASCII &&
+               my_charset_is_ascii_based(cs_con)))
+            tmp= $1;
+          else
+            thd->convert_string(&tmp, cs_con, $1.str, $1.length, cs_cli);
+          $$= new Item_string(tmp.str, tmp.length, cs_con,
+                              DERIVATION_COERCIBLE, repertoire);
+        }
+        | NCHAR_STRING
+        {
+          uint repertoire= Lex->text_string_is_7bit ?
+                           MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30;
+          DBUG_ASSERT(my_charset_is_ascii_based(national_charset_info));
+          $$= new Item_string($1.str, $1.length, national_charset_info,
+                              DERIVATION_COERCIBLE, repertoire);
+        }
+        | UNDERSCORE_CHARSET TEXT_STRING
+          {
+            $$= new Item_string($2.str, $2.length, Lex->underscore_charset);
+            ((Item_string*) $$)->set_repertoire_from_value();
+          }
+        | text_literal TEXT_STRING_literal
+          {
+            Item_string* item= (Item_string*) $1;
+            item->append($2.str, $2.length);
+            if (!(item->collation.repertoire & MY_REPERTOIRE_EXTENDED))
+            {
+              /*
+                 If the string has been pure ASCII so far,
+                 check the new part.
+              */
+              CHARSET_INFO *cs= YYTHD->variables.collation_connection;
+              item->collation.repertoire|= my_string_repertoire(cs,
+                                                                $2.str,
+                                                                $2.length);
+            }
+          }
+        ;
 
 text_string:
 	TEXT_STRING_literal
@@ -7592,20 +7628,22 @@ literal:
 	| TRUE_SYM	{ $$= new Item_int((char*) "TRUE",1,1); }
 	| HEX_NUM	{ $$ =	new Item_hex_string($1.str, $1.length);}
 	| BIN_NUM	{ $$= new Item_bin_string($1.str, $1.length); }
-	| UNDERSCORE_CHARSET HEX_NUM
-	  {
-	    Item *tmp= new Item_hex_string($2.str, $2.length);
-	    /*
-	      it is OK only emulate fix_fieds, because we need only
+        | UNDERSCORE_CHARSET HEX_NUM
+          {
+            Item *tmp= new Item_hex_string($2.str, $2.length);
+            /*
+              it is OK only emulate fix_fieds, because we need only
               value of constant
-	    */
-	    String *str= tmp ?
-	      tmp->quick_fix_field(), tmp->val_str((String*) 0) :
-	      (String*) 0;
-	    $$= new Item_string(str ? str->ptr() : "",
-				str ? str->length() : 0,
-				Lex->underscore_charset);
-	  }
+            */
+            String *str= tmp ?
+              tmp->quick_fix_field(), tmp->val_str((String*) 0) :
+              (String*) 0;
+            $$= new Item_string(str ? str->ptr() : "",
+                                str ? str->length() : 0,
+                                Lex->underscore_charset);
+            if ($$)
+              ((Item_string *) $$)->set_repertoire_from_value();
+          }
 	| UNDERSCORE_CHARSET BIN_NUM
           {
 	    Item *tmp= new Item_bin_string($2.str, $2.length);
diff --git a/strings/conf_to_src.c b/strings/conf_to_src.c
index e2ac9846c85..dc2a300a2ec 100644
--- a/strings/conf_to_src.c
+++ b/strings/conf_to_src.c
@@ -179,14 +179,16 @@ is_case_sensitive(CHARSET_INFO *cs)
          cs->sort_order['a'] < cs->sort_order['B']) ? 1 : 0;
 }
 
+
 void dispcset(FILE *f,CHARSET_INFO *cs)
 {
   fprintf(f,"{\n");
   fprintf(f,"  %d,%d,%d,\n",cs->number,0,0);
-  fprintf(f,"  MY_CS_COMPILED%s%s%s,\n",
-          cs->state & MY_CS_BINSORT ? "|MY_CS_BINSORT" : "",
-          cs->state & MY_CS_PRIMARY ? "|MY_CS_PRIMARY" : "",
-          is_case_sensitive(cs)     ? "|MY_CS_CSSORT"  : "");
+  fprintf(f,"  MY_CS_COMPILED%s%s%s%s,\n",
+          cs->state & MY_CS_BINSORT         ? "|MY_CS_BINSORT"   : "",
+          cs->state & MY_CS_PRIMARY         ? "|MY_CS_PRIMARY"   : "",
+          is_case_sensitive(cs)             ? "|MY_CS_CSSORT"    : "",
+          my_charset_is_8bit_pure_ascii(cs) ? "|MY_CS_PUREASCII" : "");
   
   if (cs->name)
   {
@@ -243,6 +245,28 @@ void dispcset(FILE *f,CHARSET_INFO *cs)
 }
 
 
+static void
+fprint_copyright(FILE *file)
+{
+  fprintf(file,
+"/* Copyright (C) 2000-2007 MySQL AB\n"
+"\n"
+"   This program is free software; you can redistribute it and/or modify\n"
+"   it under the terms of the GNU General Public License as published by\n"
+"   the Free Software Foundation; version 2 of the License.\n"
+"\n"
+"   This program is distributed in the hope that it will be useful,\n"
+"   but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
+"   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n"
+"   GNU General Public License for more details.\n"
+"\n"
+"   You should have received a copy of the GNU General Public License\n"
+"   along with this program; if not, write to the Free Software\n"
+"   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA */\n"
+"\n");
+}
+
+
 int
 main(int argc, char **argv  __attribute__((unused)))
 {
@@ -283,6 +307,7 @@ main(int argc, char **argv  __attribute__((unused)))
           "directory:\n");
   fprintf(f, "    ./conf_to_src ../sql/share/charsets/ > FILE\n");
   fprintf(f, "*/\n\n");
+  fprint_copyright(f);
   fprintf(f,"#include <my_global.h>\n");
   fprintf(f,"#include <m_ctype.h>\n\n");
   
diff --git a/strings/ctype-extra.c b/strings/ctype-extra.c
index 1c20828ea54..2a7fcbd383e 100644
--- a/strings/ctype-extra.c
+++ b/strings/ctype-extra.c
@@ -5,7 +5,8 @@
   To re-generate, run the following in the strings/ directory:
     ./conf_to_src ../sql/share/charsets/ > FILE
 */
-/* Copyright (C) 2000-2003 MySQL AB
+
+/* Copyright (C) 2000-2007 MySQL AB
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -6721,7 +6722,7 @@ CHARSET_INFO compiled_charsets[] = {
 #ifdef HAVE_CHARSET_ascii
 {
   11,0,0,
-  MY_CS_COMPILED|MY_CS_PRIMARY,
+  MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_PUREASCII,
   "ascii",                     /* cset name     */
   "ascii_general_ci",                     /* coll name     */
   "",                       /* comment       */
@@ -7810,7 +7811,7 @@ CHARSET_INFO compiled_charsets[] = {
 #ifdef HAVE_CHARSET_ascii
 {
   65,0,0,
-  MY_CS_COMPILED|MY_CS_BINSORT,
+  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_PUREASCII,
   "ascii",                     /* cset name     */
   "ascii_bin",                     /* coll name     */
   "",                       /* comment       */
diff --git a/strings/ctype.c b/strings/ctype.c
index e7399c5438b..372a1a8a468 100644
--- a/strings/ctype.c
+++ b/strings/ctype.c
@@ -306,3 +306,89 @@ my_bool my_parse_charset_xml(const char *buf, uint len,
   my_xml_parser_free(&p);
   return rc;
 }
+
+
+/*
+  Check repertoire: detect pure ascii strings
+*/
+uint
+my_string_repertoire(CHARSET_INFO *cs, const char *str, ulong length)
+{
+  const char *strend= str + length;
+  if (cs->mbminlen == 1)
+  {
+    for ( ; str < strend; str++)
+    {
+      if (((uchar) *str) > 0x7F)
+        return MY_REPERTOIRE_UNICODE30;
+    }
+  }
+  else
+  {
+    my_wc_t wc;
+    int chlen;
+    for (; (chlen= cs->cset->mb_wc(cs, &wc, str, strend)) > 0; str+= chlen)
+    {
+      if (wc > 0x7F)
+        return MY_REPERTOIRE_UNICODE30;
+    }
+  }
+  return MY_REPERTOIRE_ASCII;
+}
+
+
+/*
+  Detect whether a character set is ASCII compatible.
+
+  Returns TRUE for:
+  
+  - all 8bit character sets whose Unicode mapping of 0x7B is '{'
+    (ignores swe7 which maps 0x7B to "LATIN LETTER A WITH DIAERESIS")
+  
+  - all multi-byte character sets having mbminlen == 1
+    (ignores ucs2 whose mbminlen is 2)
+  
+  TODO:
+  
+  When merging to 5.2, this function should be changed
+  to check a new flag MY_CS_NONASCII, 
+  
+     return (cs->flag & MY_CS_NONASCII) ? 0 : 1;
+  
+  This flag was previously added into 5.2 under terms
+  of WL#3759 "Optimize identifier conversion in client-server protocol"
+  especially to mark character sets not compatible with ASCII.
+  
+  We won't backport this flag to 5.0 or 5.1.
+  This function is Ok for 5.0 and 5.1, because we're not going
+  to introduce new tricky character sets between 5.0 and 5.2.
+*/
+my_bool
+my_charset_is_ascii_based(CHARSET_INFO *cs)
+{
+  return 
+    (cs->mbmaxlen == 1 && cs->tab_to_uni && cs->tab_to_uni['{'] == '{') ||
+    (cs->mbminlen == 1 && cs->mbmaxlen > 1);
+}
+
+
+/*
+  Detect if a character set is 8bit,
+  and it is pure ascii, i.e. doesn't have
+  characters outside U+0000..U+007F
+  This functions is shared between "conf_to_src"
+  and dynamic charsets loader in "mysqld".
+*/
+my_bool
+my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs)
+{
+  size_t code;
+  if (!cs->tab_to_uni)
+    return 0;
+  for (code= 0; code < 256; code++)
+  {
+    if (cs->tab_to_uni[code] > 0x7F)
+      return 0;
+  }
+  return 1;
+}
author	unknown <bar@mysql.com/bar.myoffice.izhnet.ru>	2007-08-03 15:25:23 +0500
committer	unknown <bar@mysql.com/bar.myoffice.izhnet.ru>	2007-08-03 15:25:23 +0500
commit	53df09a9a6a99b82e2a8869eb16737a78772b29e (patch)
tree	01c0ee2f5244a4d68fc6cdfb6555fa1af4589a8c
parent	b307fc4d8fd5380cec948c07550b5ae73624e274 (diff)
download	mariadb-git-53df09a9a6a99b82e2a8869eb16737a78772b29e.tar.gz