diff options
author | Alexander Barkov <bar@mariadb.org> | 2015-03-16 12:14:31 +0400 |
---|---|---|
committer | Alexander Barkov <bar@mariadb.org> | 2015-03-16 12:14:31 +0400 |
commit | f48dc5ccc7246c5ca9ebad2acc2d1d56a3470f9d (patch) | |
tree | dad0c76c6303556b7a92431957b08dec4ff8c559 | |
parent | c4b268add0475c6633f35eaf8cf4c7fbcad298f6 (diff) | |
download | mariadb-git-f48dc5ccc7246c5ca9ebad2acc2d1d56a3470f9d.tar.gz |
Moving the conversion code from String_copier::well_formed_copy()
to my_convert_fix(), a new function in /strings.
-rw-r--r-- | include/m_ctype.h | 38 | ||||
-rw-r--r-- | sql/sql_string.cc | 71 | ||||
-rw-r--r-- | sql/sql_string.h | 7 | ||||
-rw-r--r-- | strings/ctype.c | 73 |
4 files changed, 117 insertions, 72 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h index 7f4ccee2a3e..7df59488029 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -382,6 +382,16 @@ typedef struct } MY_STRCOPY_STATUS; +/* + A structure to return the statistics of a Unicode string conversion. +*/ +typedef struct +{ + MY_STRCOPY_STATUS m_native_copy_status; + const char *m_cannot_convert_error_pos; +} MY_STRCONV_STATUS; + + /* See strings/CHARSET_INFO.txt about information on this structure */ struct my_charset_handler_st { @@ -852,10 +862,38 @@ const MY_CONTRACTIONS *my_charset_get_contractions(CHARSET_INFO *cs, extern size_t my_vsnprintf_ex(CHARSET_INFO *cs, char *to, size_t n, const char* fmt, va_list ap); +/* + Convert a string between two character sets. + Bad byte sequences as well as characters that cannot be + encoded in the destination character set are replaced with '?'. +*/ uint32 my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, const char *from, uint32 from_length, CHARSET_INFO *from_cs, uint *errors); +/* + Convert a string between two character sets. + Bad byte sequences as well as characters that cannot be + encoded in the destination character set are replaced with '?'. + Not more than "nchars" characters are copied. + Conversion statistics is returned in "status" and is set as follows: + - status->m_native_copy_status.m_source_end_pos - to the position + between (src) and (src+src_length), where the function stopped reading + the source string. + - status->m_native_copy_status.m_well_formed_error_pos - to the position + between (src) and (src+src_length), where the first badly formed byte + sequence was found, or to NULL if the string was well formed in the + given range. + - status->m_cannot_convert_error_pos - to the position + between (src) and (src+src_length), where the first character that + cannot be represented in the destination character set was found, + or to NULL if all characters in the given range were successfully + converted. 
+*/ +size_t my_convert_fix(CHARSET_INFO *dstcs, char *dst, size_t dst_length, + CHARSET_INFO *srccs, const char *src, size_t src_length, + size_t nchars, MY_STRCONV_STATUS *status); + #define _MY_U 01 /* Upper case */ #define _MY_L 02 /* Lower case */ #define _MY_NMR 04 /* Numeral (digit) */ diff --git a/sql/sql_string.cc b/sql/sql_string.cc index a0b63956ed0..1b8ea936c0d 100644 --- a/sql/sql_string.cc +++ b/sql/sql_string.cc @@ -914,8 +914,6 @@ String_copier::well_formed_copy(CHARSET_INFO *to_cs, const char *from, uint from_length, uint nchars) { - uint res; - if ((to_cs == &my_charset_bin) || (from_cs == &my_charset_bin) || (to_cs == from_cs) || @@ -923,73 +921,10 @@ String_copier::well_formed_copy(CHARSET_INFO *to_cs, { m_cannot_convert_error_pos= NULL; return to_cs->cset->copy_fix(to_cs, to, to_length, from, from_length, - nchars, this); - } - else - { - int cnvres; - my_wc_t wc; - my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc; - my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb; - const uchar *from_end= (const uchar*) from + from_length; - uchar *to_end= (uchar*) to + to_length; - char *to_start= to; - m_well_formed_error_pos= NULL; - m_cannot_convert_error_pos= NULL; - - for ( ; nchars; nchars--) - { - const char *from_prev= from; - if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0) - from+= cnvres; - else if (cnvres == MY_CS_ILSEQ) - { - if (!m_well_formed_error_pos) - m_well_formed_error_pos= from; - from++; - wc= '?'; - } - else if (cnvres > MY_CS_TOOSMALL) - { - /* - A correct multibyte sequence detected - But it doesn't have Unicode mapping. 
- */ - if (!m_cannot_convert_error_pos) - m_cannot_convert_error_pos= from; - from+= (-cnvres); - wc= '?'; - } - else - { - if ((uchar *) from >= from_end) - break; // End of line - // Incomplete byte sequence - if (!m_well_formed_error_pos) - m_well_formed_error_pos= from; - from++; - wc= '?'; - } -outp: - if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0) - to+= cnvres; - else if (cnvres == MY_CS_ILUNI && wc != '?') - { - if (!m_cannot_convert_error_pos) - m_cannot_convert_error_pos= from_prev; - wc= '?'; - goto outp; - } - else - { - from= from_prev; - break; - } - } - m_source_end_pos= from; - res= (uint) (to - to_start); + nchars, &m_native_copy_status); } - return res; + return my_convert_fix(to_cs, to, to_length, from_cs, from, from_length, + nchars, this); } diff --git a/sql/sql_string.h b/sql/sql_string.h index d89adb6bf51..4c02a46cf67 100644 --- a/sql/sql_string.h +++ b/sql/sql_string.h @@ -43,14 +43,13 @@ inline uint32 copy_and_convert(char *to, uint32 to_length, } -class String_copier: private MY_STRCOPY_STATUS +class String_copier: private MY_STRCONV_STATUS { - const char *m_cannot_convert_error_pos; public: const char *source_end_pos() const - { return m_source_end_pos; } + { return m_native_copy_status.m_source_end_pos; } const char *well_formed_error_pos() const - { return m_well_formed_error_pos; } + { return m_native_copy_status.m_well_formed_error_pos; } const char *cannot_convert_error_pos() const { return m_cannot_convert_error_pos; } const char *most_important_error_pos() const diff --git a/strings/ctype.c b/strings/ctype.c index 048fbe3d368..aa40e2b338c 100644 --- a/strings/ctype.c +++ b/strings/ctype.c @@ -1161,3 +1161,76 @@ my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, DBUG_ASSERT(FALSE); // Should never get to here return 0; // Make compiler happy } + + +size_t +my_convert_fix(CHARSET_INFO *to_cs, char *to, size_t to_length, + CHARSET_INFO *from_cs, const char *from, size_t from_length, + size_t nchars, 
MY_STRCONV_STATUS *status) +{ + int cnvres; + my_wc_t wc; + my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc; + my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb; + const uchar *from_end= (const uchar*) from + from_length; + uchar *to_end= (uchar*) to + to_length; + char *to_start= to; + + DBUG_ASSERT(to_cs != &my_charset_bin); + DBUG_ASSERT(from_cs != &my_charset_bin); + + status->m_native_copy_status.m_well_formed_error_pos= NULL; + status->m_cannot_convert_error_pos= NULL; + + for ( ; nchars; nchars--) + { + const char *from_prev= from; + if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0) + from+= cnvres; + else if (cnvres == MY_CS_ILSEQ) + { + if (!status->m_native_copy_status.m_well_formed_error_pos) + status->m_native_copy_status.m_well_formed_error_pos= from; + from++; + wc= '?'; + } + else if (cnvres > MY_CS_TOOSMALL) + { + /* + A correct multibyte sequence detected + But it doesn't have Unicode mapping. + */ + if (!status->m_cannot_convert_error_pos) + status->m_cannot_convert_error_pos= from; + from+= (-cnvres); + wc= '?'; + } + else + { + if ((uchar *) from >= from_end) + break; // End of line + // Incomplete byte sequence + if (!status->m_native_copy_status.m_well_formed_error_pos) + status->m_native_copy_status.m_well_formed_error_pos= from; + from++; + wc= '?'; + } +outp: + if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0) + to+= cnvres; + else if (cnvres == MY_CS_ILUNI && wc != '?') + { + if (!status->m_cannot_convert_error_pos) + status->m_cannot_convert_error_pos= from_prev; + wc= '?'; + goto outp; + } + else + { + from= from_prev; + break; + } + } + status->m_native_copy_status.m_source_end_pos= from; + return to - to_start; +} |