diff options
author | Alexander Barkov <bar@mariadb.com> | 2020-05-07 19:20:17 +0400 |
---|---|---|
committer | Alexander Barkov <bar@mariadb.com> | 2020-05-09 16:01:30 +0400 |
commit | cfe5ee90c8e4b9dfa98a41fcd299197a59261be7 (patch) | |
tree | 35fdaabac55d4b36d228bc9600112e986850b162 /strings | |
parent | c675886dcdecd29571bd08605a409325ee81004c (diff) | |
download | mariadb-git-cfe5ee90c8e4b9dfa98a41fcd299197a59261be7.tar.gz |
MDEV-22043 Special character leads to assertion in my_wc_to_printable_generic on 10.5.2 (debug)
The code did not take into account that:
- U+005C (backslash) can occupy more than mbminlen bytes (e.g. in sjis)
- Some character sets do not have a code for U+005C (e.g. swe7)
Adding a new function my_wc_to_printable to MY_CHARSET_HANDLER to
make it easier to cover all special cases.
Diffstat (limited to 'strings')
-rw-r--r-- | strings/ctype-big5.c | 1 | ||||
-rw-r--r-- | strings/ctype-bin.c | 1 | ||||
-rw-r--r-- | strings/ctype-cp932.c | 1 | ||||
-rw-r--r-- | strings/ctype-euc_kr.c | 1 | ||||
-rw-r--r-- | strings/ctype-eucjpms.c | 1 | ||||
-rw-r--r-- | strings/ctype-gb2312.c | 1 | ||||
-rw-r--r-- | strings/ctype-gbk.c | 1 | ||||
-rw-r--r-- | strings/ctype-latin1.c | 1 | ||||
-rw-r--r-- | strings/ctype-simple.c | 1 | ||||
-rw-r--r-- | strings/ctype-sjis.c | 10 | ||||
-rw-r--r-- | strings/ctype-tis620.c | 1 | ||||
-rw-r--r-- | strings/ctype-ucs2.c | 4 | ||||
-rw-r--r-- | strings/ctype-ujis.c | 1 | ||||
-rw-r--r-- | strings/ctype-utf8.c | 13 | ||||
-rw-r--r-- | strings/ctype.c | 64 | ||||
-rw-r--r-- | strings/strings_def.h | 15 |
16 files changed, 105 insertions, 12 deletions
diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c index 3991a219ab5..945bbdfdc62 100644 --- a/strings/ctype-big5.c +++ b/strings/ctype-big5.c @@ -6800,6 +6800,7 @@ static MY_CHARSET_HANDLER my_charset_big5_handler= my_well_formed_char_length_big5, my_copy_fix_mb, my_native_to_mb_big5, + my_wc_to_printable_generic }; struct charset_info_st my_charset_big5_chinese_ci= diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c index 0324c0665e2..fe28752a3f7 100644 --- a/strings/ctype-bin.c +++ b/strings/ctype-bin.c @@ -560,6 +560,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_well_formed_char_length_8bit, my_copy_8bit, my_wc_mb_bin, + my_wc_to_printable_generic }; diff --git a/strings/ctype-cp932.c b/strings/ctype-cp932.c index bf97d1feb83..45b5bde9510 100644 --- a/strings/ctype-cp932.c +++ b/strings/ctype-cp932.c @@ -34756,6 +34756,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_well_formed_char_length_cp932, my_copy_fix_mb, my_native_to_mb_cp932, + my_wc_to_printable_generic }; diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c index deb13957900..0362f799fc6 100644 --- a/strings/ctype-euc_kr.c +++ b/strings/ctype-euc_kr.c @@ -10046,6 +10046,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_well_formed_char_length_euckr, my_copy_fix_mb, my_native_to_mb_euckr, + my_wc_to_printable_generic }; diff --git a/strings/ctype-eucjpms.c b/strings/ctype-eucjpms.c index 118e8286703..1dd179fed57 100644 --- a/strings/ctype-eucjpms.c +++ b/strings/ctype-eucjpms.c @@ -67584,6 +67584,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_well_formed_char_length_eucjpms, my_copy_fix_mb, my_native_to_mb_eucjpms, + my_wc_to_printable_generic }; diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c index 166619bf5cc..266799f32a3 100644 --- a/strings/ctype-gb2312.c +++ b/strings/ctype-gb2312.c @@ -6451,6 +6451,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_well_formed_char_length_gb2312, my_copy_fix_mb, my_native_to_mb_gb2312, + 
my_wc_to_printable_generic }; diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c index efaa2e5c728..fa6dba9bfb5 100644 --- a/strings/ctype-gbk.c +++ b/strings/ctype-gbk.c @@ -10733,6 +10733,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_well_formed_char_length_gbk, my_copy_fix_mb, my_native_to_mb_gbk, + my_wc_to_printable_generic }; diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c index f9fa1488aa6..53ce27e491e 100644 --- a/strings/ctype-latin1.c +++ b/strings/ctype-latin1.c @@ -423,6 +423,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_well_formed_char_length_8bit, my_copy_8bit, my_wc_mb_bin, /* native_to_mb */ + my_wc_to_printable_generic }; diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c index 975cb503872..eac05ea68f5 100644 --- a/strings/ctype-simple.c +++ b/strings/ctype-simple.c @@ -2088,6 +2088,7 @@ MY_CHARSET_HANDLER my_charset_8bit_handler= my_well_formed_char_length_8bit, my_copy_8bit, my_wc_mb_bin, /* native_to_mb */ + my_wc_to_printable_8bit }; MY_COLLATION_HANDLER my_collation_8bit_simple_ci_handler = diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c index 902034b435d..e1c6a871772 100644 --- a/strings/ctype-sjis.c +++ b/strings/ctype-sjis.c @@ -34004,6 +34004,15 @@ size_t my_numcells_sjis(CHARSET_INFO *cs __attribute__((unused)), } +static int +my_wc_to_printable_sjis(CHARSET_INFO *cs, my_wc_t wc, + uchar *str, uchar *end) +{ + return my_wc_to_printable_ex(cs, wc, str, end, + '\\', 2, 1); +} + + /* sjis_chinese_ci and sjis_bin sort character blocks in this order: 1. 
[00..7F] - 7BIT characters (ASCII) @@ -34135,6 +34144,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_well_formed_char_length_sjis, my_copy_fix_mb, my_native_to_mb_sjis, + my_wc_to_printable_sjis }; diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c index 6a351c05823..772294fb5c0 100644 --- a/strings/ctype-tis620.c +++ b/strings/ctype-tis620.c @@ -905,6 +905,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_well_formed_char_length_8bit, my_copy_8bit, my_wc_mb_bin, /* native_to_mb */ + my_wc_to_printable_generic }; diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index e4234a9582a..d764849c01e 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -1591,6 +1591,7 @@ MY_CHARSET_HANDLER my_charset_utf16_handler= my_well_formed_char_length_utf16, my_copy_fix_mb2_or_mb4, my_uni_utf16, + my_wc_to_printable_generic }; @@ -1931,6 +1932,7 @@ static MY_CHARSET_HANDLER my_charset_utf16le_handler= my_well_formed_char_length_utf16, my_copy_fix_mb2_or_mb4, my_uni_utf16le, + my_wc_to_printable_generic }; @@ -2753,6 +2755,7 @@ MY_CHARSET_HANDLER my_charset_utf32_handler= my_well_formed_char_length_utf32, my_copy_fix_mb2_or_mb4, my_uni_utf32, + my_wc_to_printable_generic }; @@ -3343,6 +3346,7 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler= my_well_formed_char_length_ucs2, my_copy_fix_mb2_or_mb4, my_uni_ucs2, + my_wc_to_printable_generic }; diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c index 949f3aadc36..9ec3b578549 100644 --- a/strings/ctype-ujis.c +++ b/strings/ctype-ujis.c @@ -67328,6 +67328,7 @@ static MY_CHARSET_HANDLER my_charset_handler= my_well_formed_char_length_ujis, my_copy_fix_mb, my_native_to_mb_ujis, + my_wc_to_printable_generic }; diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 3329b6d23ef..b8e71b1f7a9 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -5466,6 +5466,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb3_handler= my_well_formed_char_length_utf8mb3, my_copy_fix_mb, my_uni_utf8mb3, + 
my_wc_to_printable_generic }; @@ -7030,6 +7031,16 @@ my_charlen_filename(CHARSET_INFO *cs, const uchar *str, const uchar *end) } +static int +my_wc_to_printable_filename(CHARSET_INFO *cs, my_wc_t wc, + uchar *str, uchar *end) +{ + return my_wc_to_printable_ex(cs, wc, str, end, + '\\', 5, 1); +} + + + #define MY_FUNCTION_NAME(x) my_ ## x ## _filename #define CHARLEN(cs,str,end) my_charlen_filename(cs,str,end) #define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN @@ -7102,6 +7113,7 @@ static MY_CHARSET_HANDLER my_charset_filename_handler= my_well_formed_char_length_filename, my_copy_fix_mb, my_wc_mb_filename, + my_wc_to_printable_filename }; @@ -7792,6 +7804,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler= my_well_formed_char_length_utf8mb4, my_copy_fix_mb, my_wc_mb_utf8mb4, + my_wc_to_printable_generic }; diff --git a/strings/ctype.c b/strings/ctype.c index 3fbe4143da2..4df9b9c2f09 100644 --- a/strings/ctype.c +++ b/strings/ctype.c @@ -1020,7 +1020,7 @@ my_is_printable(my_wc_t wc) } -static uint to_printable_8bit(uchar *dst, my_wc_t wc) +static uint to_printable_8bit(uchar *dst, my_wc_t wc, uint bs) { /* This function is used only in context of error messages for now. @@ -1028,7 +1028,7 @@ static uint to_printable_8bit(uchar *dst, my_wc_t wc) when a message is put into diagnostics area. */ DBUG_ASSERT(wc < 0x10000); - *dst++= '\\'; + *dst++= (char) bs; *dst++= _dig_vec_upper[(wc >> 12) & 0x0F]; *dst++= _dig_vec_upper[(wc >> 8) & 0x0F]; *dst++= _dig_vec_upper[(wc >> 4) & 0x0F]; @@ -1037,18 +1037,25 @@ static uint to_printable_8bit(uchar *dst, my_wc_t wc) } +static uint my_printable_length(uint bslen, uint diglen) +{ + return bslen + (MY_CS_PRINTABLE_CHAR_LENGTH - 1) * diglen; +} + + /** Encode an Unicode character "wc" into a printable string. This function is suitable for any character set, including ASCII-incompatible multi-byte character sets, e.g. ucs2, utf16, utf32. 
*/ int -my_wc_to_printable_generic(CHARSET_INFO *cs, my_wc_t wc, - uchar *str, uchar *end) +my_wc_to_printable_ex(CHARSET_INFO *cs, my_wc_t wc, + uchar *str, uchar *end, + uint bs, uint bslen, uint diglen) { uchar *str0; uint i, length; - uchar tmp[MY_CS_PRINTABLE_CHAR_LENGTH]; + uchar tmp[MY_CS_PRINTABLE_CHAR_LENGTH * MY_CS_MBMAXLEN]; if (my_is_printable(wc)) { @@ -1057,27 +1064,62 @@ my_wc_to_printable_generic(CHARSET_INFO *cs, my_wc_t wc, return mblen; } - if (str + MY_CS_PRINTABLE_CHAR_LENGTH * cs->mbminlen > end) - return MY_CS_TOOSMALLN(MY_CS_PRINTABLE_CHAR_LENGTH * cs->mbminlen); + if (str + my_printable_length(bslen, diglen) > end) + return MY_CS_TOOSMALLN(my_printable_length(bslen, diglen)); if ((cs->state & MY_CS_NONASCII) == 0) - return to_printable_8bit(str, wc); + return to_printable_8bit(str, wc, bs); - length= to_printable_8bit(tmp, wc); + length= to_printable_8bit(tmp, wc, bs); str0= str; for (i= 0; i < length; i++) { - if (my_ci_wc_mb(cs, tmp[i], str, end) != (int) cs->mbminlen) + uint expected_length= i == 0 ? bslen : diglen; + if (my_ci_wc_mb(cs, tmp[i], str, end) != (int) expected_length) { DBUG_ASSERT(0); return MY_CS_ILSEQ; } - str+= cs->mbminlen; + str+= expected_length; } return (int) (str - str0); } +int +my_wc_to_printable_8bit(CHARSET_INFO *cs, my_wc_t wc, + uchar *str, uchar *end) +{ + /* + Special case: swe7 does not have the backslash character. + Use dot instead of backslash for escaping. + */ + uint bs= cs->tab_to_uni && cs->tab_to_uni['\\'] != '\\' ? '.' : '\\'; + DBUG_ASSERT(cs->mbminlen == 1); + /* + Additionally, if the original swe7 string contains backslashes, + replace them to dots, so this error message: + Invalid swe7 character string: '\xEF\xBC\xB4' + is displayed as: + Invalid swe7 character string: '.xEF.xBC.xB4' + which is more readable than what would happen without '\'-to-dot mapping: + Invalid swe7 character string: '.005CxEF.005CxBC.005CxB4' + */ + if (bs == '.' 
&& wc == '\\') + wc= '.'; + return my_wc_to_printable_ex(cs, wc, str, end, bs, 1, 1); +} + + +int +my_wc_to_printable_generic(CHARSET_INFO *cs, my_wc_t wc, + uchar *str, uchar *end) +{ + return my_wc_to_printable_ex(cs, wc, str, end, '\\', + cs->mbminlen, cs->mbminlen); +} + + /* Convert a string between two character sets. 'to' must be large enough to store (form_length * to_cs->mbmaxlen) bytes. diff --git a/strings/strings_def.h b/strings/strings_def.h index b3727321e19..d4f51bcd0a5 100644 --- a/strings/strings_def.h +++ b/strings/strings_def.h @@ -117,4 +117,17 @@ uint my_8bit_collation_flags_from_data(CHARSET_INFO *cs); #define MY_HASH_ADD_16(A, B, value) \ do { MY_HASH_ADD(A, B, ((value) & 0xFF)) ; MY_HASH_ADD(A, B, ((value >>8 ))); } while(0) -#endif + +#define my_wc_t ulong + +int my_wc_to_printable_ex(CHARSET_INFO *cs, my_wc_t wc, + uchar *s, uchar *e, + uint bs, uint bslen, uint diglen); + +int my_wc_to_printable_generic(CHARSET_INFO *cs, my_wc_t wc, + uchar *s, uchar *e); + +int my_wc_to_printable_8bit(CHARSET_INFO *cs, my_wc_t wc, + uchar *s, uchar *e); + +#endif /*STRINGS_DEF_INCLUDED */ |