summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexander Barkov <bar@mariadb.org>2015-03-16 12:14:31 +0400
committerAlexander Barkov <bar@mariadb.org>2015-03-16 12:14:31 +0400
commitf48dc5ccc7246c5ca9ebad2acc2d1d56a3470f9d (patch)
treedad0c76c6303556b7a92431957b08dec4ff8c559
parentc4b268add0475c6633f35eaf8cf4c7fbcad298f6 (diff)
downloadmariadb-git-f48dc5ccc7246c5ca9ebad2acc2d1d56a3470f9d.tar.gz
Moving the conversion code from String::well_formed_copy()
to my_convert_fix() - a new function in /strings.
-rw-r--r--include/m_ctype.h38
-rw-r--r--sql/sql_string.cc71
-rw-r--r--sql/sql_string.h7
-rw-r--r--strings/ctype.c73
4 files changed, 117 insertions, 72 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h
index 7f4ccee2a3e..7df59488029 100644
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -382,6 +382,16 @@ typedef struct
} MY_STRCOPY_STATUS;
+/*
+  A structure to return the statistics of a Unicode string conversion.
+  It is filled in by my_convert_fix(), which resets both error positions
+  to NULL before converting.
+*/
+typedef struct
+{
+ MY_STRCOPY_STATUS m_native_copy_status; /* holds m_source_end_pos and m_well_formed_error_pos */
+ const char *m_cannot_convert_error_pos; /* first source character that could not be converted, or NULL */
+} MY_STRCONV_STATUS;
+
+
/* See strings/CHARSET_INFO.txt about information on this structure */
struct my_charset_handler_st
{
@@ -852,10 +862,38 @@ const MY_CONTRACTIONS *my_charset_get_contractions(CHARSET_INFO *cs,
extern size_t my_vsnprintf_ex(CHARSET_INFO *cs, char *to, size_t n,
const char* fmt, va_list ap);
+/*
+ Convert a string between two character sets.
+ Bad byte sequences as well as characters that cannot be
+ encoded in the destination character set are replaced with '?'.
+*/
uint32 my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
const char *from, uint32 from_length,
CHARSET_INFO *from_cs, uint *errors);
+/*
+ Convert a string between two character sets.
+ Bad byte sequences as well as characters that cannot be
+ encoded in the destination character set are replaced with '?'.
+ Not more than "nchars" characters are copied.
+ Conversion statistics are returned in "status" and are set as follows:
+ - status->m_native_copy_status.m_source_end_pos - to the position
+ between (src) and (src+src_length), where the function stopped reading
+ the source string.
+ - status->m_native_copy_status.m_well_formed_error_pos - to the position
+ between (src) and (src+src_length), where the first badly formed byte
+ sequence was found, or to NULL if the string was well formed in the
+ given range.
+ - status->m_cannot_convert_error_pos - to the position
+ between (src) and (src+src_length), where the first character that
+ cannot be represented in the destination character set was found,
+ or to NULL if all characters in the given range were successfully
+ converted.
+*/
+size_t my_convert_fix(CHARSET_INFO *dstcs, char *dst, size_t dst_length,
+ CHARSET_INFO *srccs, const char *src, size_t src_length,
+ size_t nchars, MY_STRCONV_STATUS *status);
+
#define _MY_U 01 /* Upper case */
#define _MY_L 02 /* Lower case */
#define _MY_NMR 04 /* Numeral (digit) */
diff --git a/sql/sql_string.cc b/sql/sql_string.cc
index a0b63956ed0..1b8ea936c0d 100644
--- a/sql/sql_string.cc
+++ b/sql/sql_string.cc
@@ -914,8 +914,6 @@ String_copier::well_formed_copy(CHARSET_INFO *to_cs,
const char *from, uint from_length,
uint nchars)
{
- uint res;
-
if ((to_cs == &my_charset_bin) ||
(from_cs == &my_charset_bin) ||
(to_cs == from_cs) ||
@@ -923,73 +921,10 @@ String_copier::well_formed_copy(CHARSET_INFO *to_cs,
{
m_cannot_convert_error_pos= NULL;
return to_cs->cset->copy_fix(to_cs, to, to_length, from, from_length,
- nchars, this);
- }
- else
- {
- int cnvres;
- my_wc_t wc;
- my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
- my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
- const uchar *from_end= (const uchar*) from + from_length;
- uchar *to_end= (uchar*) to + to_length;
- char *to_start= to;
- m_well_formed_error_pos= NULL;
- m_cannot_convert_error_pos= NULL;
-
- for ( ; nchars; nchars--)
- {
- const char *from_prev= from;
- if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
- from+= cnvres;
- else if (cnvres == MY_CS_ILSEQ)
- {
- if (!m_well_formed_error_pos)
- m_well_formed_error_pos= from;
- from++;
- wc= '?';
- }
- else if (cnvres > MY_CS_TOOSMALL)
- {
- /*
- A correct multibyte sequence detected
- But it doesn't have Unicode mapping.
- */
- if (!m_cannot_convert_error_pos)
- m_cannot_convert_error_pos= from;
- from+= (-cnvres);
- wc= '?';
- }
- else
- {
- if ((uchar *) from >= from_end)
- break; // End of line
- // Incomplete byte sequence
- if (!m_well_formed_error_pos)
- m_well_formed_error_pos= from;
- from++;
- wc= '?';
- }
-outp:
- if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
- to+= cnvres;
- else if (cnvres == MY_CS_ILUNI && wc != '?')
- {
- if (!m_cannot_convert_error_pos)
- m_cannot_convert_error_pos= from_prev;
- wc= '?';
- goto outp;
- }
- else
- {
- from= from_prev;
- break;
- }
- }
- m_source_end_pos= from;
- res= (uint) (to - to_start);
+ nchars, &m_native_copy_status);
}
- return res;
+ return my_convert_fix(to_cs, to, to_length, from_cs, from, from_length,
+ nchars, this);
}
diff --git a/sql/sql_string.h b/sql/sql_string.h
index d89adb6bf51..4c02a46cf67 100644
--- a/sql/sql_string.h
+++ b/sql/sql_string.h
@@ -43,14 +43,13 @@ inline uint32 copy_and_convert(char *to, uint32 to_length,
}
-class String_copier: private MY_STRCOPY_STATUS
+class String_copier: private MY_STRCONV_STATUS
{
- const char *m_cannot_convert_error_pos;
public:
const char *source_end_pos() const
- { return m_source_end_pos; }
+ { return m_native_copy_status.m_source_end_pos; }
const char *well_formed_error_pos() const
- { return m_well_formed_error_pos; }
+ { return m_native_copy_status.m_well_formed_error_pos; }
const char *cannot_convert_error_pos() const
{ return m_cannot_convert_error_pos; }
const char *most_important_error_pos() const
diff --git a/strings/ctype.c b/strings/ctype.c
index 048fbe3d368..aa40e2b338c 100644
--- a/strings/ctype.c
+++ b/strings/ctype.c
@@ -1161,3 +1161,76 @@ my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
DBUG_ASSERT(FALSE); // Should never get to here
return 0; // Make compiler happy
}
+
+/*
+  Convert at most "nchars" characters of "from" (character set from_cs)
+  into the buffer "to" (character set to_cs), one character at a time:
+  each character is decoded with from_cs->cset->mb_wc and encoded with
+  to_cs->cset->wc_mb.
+
+  A '?' is written in place of:
+  - a byte for which mb_wc returns MY_CS_ILSEQ, or an incomplete
+    sequence before the end of the source (one byte is skipped);
+  - a source sequence that has no Unicode mapping;
+  - a character for which wc_mb returns MY_CS_ILUNI.
+
+  The loop stops after "nchars" characters, at the end of the source,
+  or when wc_mb fails to store a character; in the last case that
+  character is left unread.
+
+  On return "status" is set as follows:
+  - m_native_copy_status.m_well_formed_error_pos: the first position where
+    a bad or incomplete byte sequence was found, or NULL;
+  - m_cannot_convert_error_pos: the start of the first character that
+    could not be converted, or NULL;
+  - m_native_copy_status.m_source_end_pos: the position in "from" where
+    reading stopped.
+
+  Neither character set may be my_charset_bin (checked with DBUG_ASSERT).
+
+  Returns the number of bytes written to "to".
+*/
+size_t
+my_convert_fix(CHARSET_INFO *to_cs, char *to, size_t to_length,
+ CHARSET_INFO *from_cs, const char *from, size_t from_length,
+ size_t nchars, MY_STRCONV_STATUS *status)
+{
+ int cnvres; /* result of the most recent mb_wc or wc_mb call */
+ my_wc_t wc; /* the character being transferred, or '?' as a replacement */
+ my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
+ my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
+ const uchar *from_end= (const uchar*) from + from_length;
+ uchar *to_end= (uchar*) to + to_length;
+ char *to_start= to; /* kept to compute the returned length */
+
+ DBUG_ASSERT(to_cs != &my_charset_bin);
+ DBUG_ASSERT(from_cs != &my_charset_bin);
+
+ status->m_native_copy_status.m_well_formed_error_pos= NULL;
+ status->m_cannot_convert_error_pos= NULL;
+
+ for ( ; nchars; nchars--) /* one iteration per output character */
+ {
+ const char *from_prev= from; /* start of the current source character */
+ if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
+ from+= cnvres; /* decoded: skip the bytes that were consumed */
+ else if (cnvres == MY_CS_ILSEQ)
+ {
+ if (!status->m_native_copy_status.m_well_formed_error_pos)
+ status->m_native_copy_status.m_well_formed_error_pos= from; /* only the first error is recorded */
+ from++; /* skip a single byte */
+ wc= '?';
+ }
+ else if (cnvres > MY_CS_TOOSMALL)
+ {
+ /*
+ A correct multibyte sequence was detected,
+ but it doesn't have a Unicode mapping.
+ The sequence length is (-cnvres) bytes.
+ */
+ if (!status->m_cannot_convert_error_pos)
+ status->m_cannot_convert_error_pos= from;
+ from+= (-cnvres);
+ wc= '?';
+ }
+ else
+ {
+ if ((uchar *) from >= from_end)
+ break; // End of the source string
+ // Incomplete byte sequence
+ if (!status->m_native_copy_status.m_well_formed_error_pos)
+ status->m_native_copy_status.m_well_formed_error_pos= from;
+ from++;
+ wc= '?';
+ }
+outp: /* also re-entered below with wc == '?' */
+ if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
+ to+= cnvres;
+ else if (cnvres == MY_CS_ILUNI && wc != '?') /* the wc != '?' test keeps the retry from looping */
+ {
+ if (!status->m_cannot_convert_error_pos)
+ status->m_cannot_convert_error_pos= from_prev;
+ wc= '?';
+ goto outp; /* retry with the replacement character */
+ }
+ else
+ {
+ from= from_prev; /* nothing was written: rewind so the character counts as unread */
+ break; /* presumably no room left in "to" - TODO confirm against wc_mb return codes */
+ }
+ }
+ status->m_native_copy_status.m_source_end_pos= from;
+ return to - to_start; /* number of bytes written */
+}