summary | refs | log | tree | commit | diff
path: root/strings
diff options
context:
space:
mode:
author: Alexander Barkov <bar@mariadb.com> 2020-05-07 19:20:17 +0400
committer: Alexander Barkov <bar@mariadb.com> 2020-05-09 16:01:30 +0400
commit: cfe5ee90c8e4b9dfa98a41fcd299197a59261be7 (patch)
tree: 35fdaabac55d4b36d228bc9600112e986850b162 /strings
parent: c675886dcdecd29571bd08605a409325ee81004c (diff)
download: mariadb-git-cfe5ee90c8e4b9dfa98a41fcd299197a59261be7.tar.gz
MDEV-22043 Special character leads to assertion in my_wc_to_printable_generic on 10.5.2 (debug)
The code did not take into account that:
- U+005C (backslash) can occupy more than mbminlen bytes (e.g. in sjis)
- Some character sets do not have a code for U+005C (e.g. swe7)
Adding a new function my_wc_to_printable to MY_CHARSET_HANDLER makes it easier to cover all of these special cases.
Diffstat (limited to 'strings')
-rw-r--r-- strings/ctype-big5.c 1
-rw-r--r-- strings/ctype-bin.c 1
-rw-r--r-- strings/ctype-cp932.c 1
-rw-r--r-- strings/ctype-euc_kr.c 1
-rw-r--r-- strings/ctype-eucjpms.c 1
-rw-r--r-- strings/ctype-gb2312.c 1
-rw-r--r-- strings/ctype-gbk.c 1
-rw-r--r-- strings/ctype-latin1.c 1
-rw-r--r-- strings/ctype-simple.c 1
-rw-r--r-- strings/ctype-sjis.c 10
-rw-r--r-- strings/ctype-tis620.c 1
-rw-r--r-- strings/ctype-ucs2.c 4
-rw-r--r-- strings/ctype-ujis.c 1
-rw-r--r-- strings/ctype-utf8.c 13
-rw-r--r-- strings/ctype.c 64
-rw-r--r-- strings/strings_def.h 15
16 files changed, 105 insertions, 12 deletions
diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c
index 3991a219ab5..945bbdfdc62 100644
--- a/strings/ctype-big5.c
+++ b/strings/ctype-big5.c
@@ -6800,6 +6800,7 @@ static MY_CHARSET_HANDLER my_charset_big5_handler=
my_well_formed_char_length_big5,
my_copy_fix_mb,
my_native_to_mb_big5,
+ my_wc_to_printable_generic
};
struct charset_info_st my_charset_big5_chinese_ci=
diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c
index 0324c0665e2..fe28752a3f7 100644
--- a/strings/ctype-bin.c
+++ b/strings/ctype-bin.c
@@ -560,6 +560,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_well_formed_char_length_8bit,
my_copy_8bit,
my_wc_mb_bin,
+ my_wc_to_printable_generic
};
diff --git a/strings/ctype-cp932.c b/strings/ctype-cp932.c
index bf97d1feb83..45b5bde9510 100644
--- a/strings/ctype-cp932.c
+++ b/strings/ctype-cp932.c
@@ -34756,6 +34756,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_well_formed_char_length_cp932,
my_copy_fix_mb,
my_native_to_mb_cp932,
+ my_wc_to_printable_generic
};
diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c
index deb13957900..0362f799fc6 100644
--- a/strings/ctype-euc_kr.c
+++ b/strings/ctype-euc_kr.c
@@ -10046,6 +10046,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_well_formed_char_length_euckr,
my_copy_fix_mb,
my_native_to_mb_euckr,
+ my_wc_to_printable_generic
};
diff --git a/strings/ctype-eucjpms.c b/strings/ctype-eucjpms.c
index 118e8286703..1dd179fed57 100644
--- a/strings/ctype-eucjpms.c
+++ b/strings/ctype-eucjpms.c
@@ -67584,6 +67584,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_well_formed_char_length_eucjpms,
my_copy_fix_mb,
my_native_to_mb_eucjpms,
+ my_wc_to_printable_generic
};
diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c
index 166619bf5cc..266799f32a3 100644
--- a/strings/ctype-gb2312.c
+++ b/strings/ctype-gb2312.c
@@ -6451,6 +6451,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_well_formed_char_length_gb2312,
my_copy_fix_mb,
my_native_to_mb_gb2312,
+ my_wc_to_printable_generic
};
diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c
index efaa2e5c728..fa6dba9bfb5 100644
--- a/strings/ctype-gbk.c
+++ b/strings/ctype-gbk.c
@@ -10733,6 +10733,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_well_formed_char_length_gbk,
my_copy_fix_mb,
my_native_to_mb_gbk,
+ my_wc_to_printable_generic
};
diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c
index f9fa1488aa6..53ce27e491e 100644
--- a/strings/ctype-latin1.c
+++ b/strings/ctype-latin1.c
@@ -423,6 +423,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_well_formed_char_length_8bit,
my_copy_8bit,
my_wc_mb_bin, /* native_to_mb */
+ my_wc_to_printable_generic
};
diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c
index 975cb503872..eac05ea68f5 100644
--- a/strings/ctype-simple.c
+++ b/strings/ctype-simple.c
@@ -2088,6 +2088,7 @@ MY_CHARSET_HANDLER my_charset_8bit_handler=
my_well_formed_char_length_8bit,
my_copy_8bit,
my_wc_mb_bin, /* native_to_mb */
+ my_wc_to_printable_8bit
};
MY_COLLATION_HANDLER my_collation_8bit_simple_ci_handler =
diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c
index 902034b435d..e1c6a871772 100644
--- a/strings/ctype-sjis.c
+++ b/strings/ctype-sjis.c
@@ -34004,6 +34004,15 @@ size_t my_numcells_sjis(CHARSET_INFO *cs __attribute__((unused)),
}
+static int
+my_wc_to_printable_sjis(CHARSET_INFO *cs, my_wc_t wc,
+ uchar *str, uchar *end)
+{
+ return my_wc_to_printable_ex(cs, wc, str, end,
+ '\\', 2, 1);
+}
+
+
/*
sjis_chinese_ci and sjis_bin sort character blocks in this order:
1. [00..7F] - 7BIT characters (ASCII)
@@ -34135,6 +34144,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_well_formed_char_length_sjis,
my_copy_fix_mb,
my_native_to_mb_sjis,
+ my_wc_to_printable_sjis
};
diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c
index 6a351c05823..772294fb5c0 100644
--- a/strings/ctype-tis620.c
+++ b/strings/ctype-tis620.c
@@ -905,6 +905,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_well_formed_char_length_8bit,
my_copy_8bit,
my_wc_mb_bin, /* native_to_mb */
+ my_wc_to_printable_generic
};
diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c
index e4234a9582a..d764849c01e 100644
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@@ -1591,6 +1591,7 @@ MY_CHARSET_HANDLER my_charset_utf16_handler=
my_well_formed_char_length_utf16,
my_copy_fix_mb2_or_mb4,
my_uni_utf16,
+ my_wc_to_printable_generic
};
@@ -1931,6 +1932,7 @@ static MY_CHARSET_HANDLER my_charset_utf16le_handler=
my_well_formed_char_length_utf16,
my_copy_fix_mb2_or_mb4,
my_uni_utf16le,
+ my_wc_to_printable_generic
};
@@ -2753,6 +2755,7 @@ MY_CHARSET_HANDLER my_charset_utf32_handler=
my_well_formed_char_length_utf32,
my_copy_fix_mb2_or_mb4,
my_uni_utf32,
+ my_wc_to_printable_generic
};
@@ -3343,6 +3346,7 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler=
my_well_formed_char_length_ucs2,
my_copy_fix_mb2_or_mb4,
my_uni_ucs2,
+ my_wc_to_printable_generic
};
diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c
index 949f3aadc36..9ec3b578549 100644
--- a/strings/ctype-ujis.c
+++ b/strings/ctype-ujis.c
@@ -67328,6 +67328,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_well_formed_char_length_ujis,
my_copy_fix_mb,
my_native_to_mb_ujis,
+ my_wc_to_printable_generic
};
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index 3329b6d23ef..b8e71b1f7a9 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -5466,6 +5466,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb3_handler=
my_well_formed_char_length_utf8mb3,
my_copy_fix_mb,
my_uni_utf8mb3,
+ my_wc_to_printable_generic
};
@@ -7030,6 +7031,16 @@ my_charlen_filename(CHARSET_INFO *cs, const uchar *str, const uchar *end)
}
+static int
+my_wc_to_printable_filename(CHARSET_INFO *cs, my_wc_t wc,
+ uchar *str, uchar *end)
+{
+ return my_wc_to_printable_ex(cs, wc, str, end,
+ '\\', 5, 1);
+}
+
+
+
#define MY_FUNCTION_NAME(x) my_ ## x ## _filename
#define CHARLEN(cs,str,end) my_charlen_filename(cs,str,end)
#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
@@ -7102,6 +7113,7 @@ static MY_CHARSET_HANDLER my_charset_filename_handler=
my_well_formed_char_length_filename,
my_copy_fix_mb,
my_wc_mb_filename,
+ my_wc_to_printable_filename
};
@@ -7792,6 +7804,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler=
my_well_formed_char_length_utf8mb4,
my_copy_fix_mb,
my_wc_mb_utf8mb4,
+ my_wc_to_printable_generic
};
diff --git a/strings/ctype.c b/strings/ctype.c
index 3fbe4143da2..4df9b9c2f09 100644
--- a/strings/ctype.c
+++ b/strings/ctype.c
@@ -1020,7 +1020,7 @@ my_is_printable(my_wc_t wc)
}
-static uint to_printable_8bit(uchar *dst, my_wc_t wc)
+static uint to_printable_8bit(uchar *dst, my_wc_t wc, uint bs)
{
/*
This function is used only in context of error messages for now.
@@ -1028,7 +1028,7 @@ static uint to_printable_8bit(uchar *dst, my_wc_t wc)
when a message is put into diagnostics area.
*/
DBUG_ASSERT(wc < 0x10000);
- *dst++= '\\';
+ *dst++= (char) bs;
*dst++= _dig_vec_upper[(wc >> 12) & 0x0F];
*dst++= _dig_vec_upper[(wc >> 8) & 0x0F];
*dst++= _dig_vec_upper[(wc >> 4) & 0x0F];
@@ -1037,18 +1037,25 @@ static uint to_printable_8bit(uchar *dst, my_wc_t wc)
}
+static uint my_printable_length(uint bslen, uint diglen)
+{
+ return bslen + (MY_CS_PRINTABLE_CHAR_LENGTH - 1) * diglen;
+}
+
+
/**
Encode a Unicode character "wc" into a printable string.
This function is suitable for any character set, including
ASCII-incompatible multi-byte character sets, e.g. ucs2, utf16, utf32.
*/
int
-my_wc_to_printable_generic(CHARSET_INFO *cs, my_wc_t wc,
- uchar *str, uchar *end)
+my_wc_to_printable_ex(CHARSET_INFO *cs, my_wc_t wc,
+ uchar *str, uchar *end,
+ uint bs, uint bslen, uint diglen)
{
uchar *str0;
uint i, length;
- uchar tmp[MY_CS_PRINTABLE_CHAR_LENGTH];
+ uchar tmp[MY_CS_PRINTABLE_CHAR_LENGTH * MY_CS_MBMAXLEN];
if (my_is_printable(wc))
{
@@ -1057,27 +1064,62 @@ my_wc_to_printable_generic(CHARSET_INFO *cs, my_wc_t wc,
return mblen;
}
- if (str + MY_CS_PRINTABLE_CHAR_LENGTH * cs->mbminlen > end)
- return MY_CS_TOOSMALLN(MY_CS_PRINTABLE_CHAR_LENGTH * cs->mbminlen);
+ if (str + my_printable_length(bslen, diglen) > end)
+ return MY_CS_TOOSMALLN(my_printable_length(bslen, diglen));
if ((cs->state & MY_CS_NONASCII) == 0)
- return to_printable_8bit(str, wc);
+ return to_printable_8bit(str, wc, bs);
- length= to_printable_8bit(tmp, wc);
+ length= to_printable_8bit(tmp, wc, bs);
str0= str;
for (i= 0; i < length; i++)
{
- if (my_ci_wc_mb(cs, tmp[i], str, end) != (int) cs->mbminlen)
+ uint expected_length= i == 0 ? bslen : diglen;
+ if (my_ci_wc_mb(cs, tmp[i], str, end) != (int) expected_length)
{
DBUG_ASSERT(0);
return MY_CS_ILSEQ;
}
- str+= cs->mbminlen;
+ str+= expected_length;
}
return (int) (str - str0);
}
+int
+my_wc_to_printable_8bit(CHARSET_INFO *cs, my_wc_t wc,
+ uchar *str, uchar *end)
+{
+ /*
+ Special case: swe7 does not have the backslash character.
+ Use dot instead of backslash for escaping.
+ */
+ uint bs= cs->tab_to_uni && cs->tab_to_uni['\\'] != '\\' ? '.' : '\\';
+ DBUG_ASSERT(cs->mbminlen == 1);
+ /*
+ Additionally, if the original swe7 string contains backslashes,
+ replace them with dots, so this error message:
+ Invalid swe7 character string: '\xEF\xBC\xB4'
+ is displayed as:
+ Invalid swe7 character string: '.xEF.xBC.xB4'
+ which is more readable than what would happen without '\'-to-dot mapping:
+ Invalid swe7 character string: '.005CxEF.005CxBC.005CxB4'
+ */
+ if (bs == '.' && wc == '\\')
+ wc= '.';
+ return my_wc_to_printable_ex(cs, wc, str, end, bs, 1, 1);
+}
+
+
+int
+my_wc_to_printable_generic(CHARSET_INFO *cs, my_wc_t wc,
+ uchar *str, uchar *end)
+{
+ return my_wc_to_printable_ex(cs, wc, str, end, '\\',
+ cs->mbminlen, cs->mbminlen);
+}
+
+
/*
Convert a string between two character sets.
'to' must be large enough to store (form_length * to_cs->mbmaxlen) bytes.
diff --git a/strings/strings_def.h b/strings/strings_def.h
index b3727321e19..d4f51bcd0a5 100644
--- a/strings/strings_def.h
+++ b/strings/strings_def.h
@@ -117,4 +117,17 @@ uint my_8bit_collation_flags_from_data(CHARSET_INFO *cs);
#define MY_HASH_ADD_16(A, B, value) \
do { MY_HASH_ADD(A, B, ((value) & 0xFF)) ; MY_HASH_ADD(A, B, ((value >>8 ))); } while(0)
-#endif
+
+#define my_wc_t ulong
+
+int my_wc_to_printable_ex(CHARSET_INFO *cs, my_wc_t wc,
+ uchar *s, uchar *e,
+ uint bs, uint bslen, uint diglen);
+
+int my_wc_to_printable_generic(CHARSET_INFO *cs, my_wc_t wc,
+ uchar *s, uchar *e);
+
+int my_wc_to_printable_8bit(CHARSET_INFO *cs, my_wc_t wc,
+ uchar *s, uchar *e);
+
+#endif /*STRINGS_DEF_INCLUDED */