summaryrefslogtreecommitdiff
path: root/sql/sql_lex.cc
diff options
context:
space:
mode:
Diffstat (limited to 'sql/sql_lex.cc')
-rw-r--r--sql/sql_lex.cc205
1 files changed, 197 insertions, 8 deletions
diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc
index 304feea5f5c..639f0d2325d 100644
--- a/sql/sql_lex.cc
+++ b/sql/sql_lex.cc
@@ -123,15 +123,19 @@ Lex_input_stream::Lex_input_stream(THD *thd,
m_end_of_query(buffer + length),
m_tok_start_prev(NULL),
m_buf(buffer),
+ m_buf_length(length),
m_echo(true),
m_cpp_tok_start(NULL),
m_cpp_tok_start_prev(NULL),
m_cpp_tok_end(NULL),
+ m_body_utf8(NULL),
+ m_cpp_utf8_processed_ptr(NULL),
next_state(MY_LEX_START),
found_semicolon(NULL),
ignore_space(test(thd->variables.sql_mode & MODE_IGNORE_SPACE)),
stmt_prepare_mode(FALSE),
- in_comment(NO_COMMENT)
+ in_comment(NO_COMMENT),
+ m_underscore_cs(NULL)
{
m_cpp_buf= (char*) thd->alloc(length + 1);
m_cpp_ptr= m_cpp_buf;
@@ -140,6 +144,133 @@ Lex_input_stream::Lex_input_stream(THD *thd,
Lex_input_stream::~Lex_input_stream()
{}
+/**
+ The operation is called from the parser in order to
+ 1) designate the intention to have utf8 body;
+ 1) Indicate to the lexer that we will need a utf8 representation of this
+ statement;
+ 2) Determine the beginning of the body.
+
+ @param thd Thread context.
+ @param begin_ptr Pointer to the start of the body in the pre-processed
+ buffer.
+*/
+
+void Lex_input_stream::body_utf8_start(THD *thd, const char *begin_ptr)
+{
+ DBUG_ASSERT(begin_ptr);
+ DBUG_ASSERT(m_cpp_buf <= begin_ptr && begin_ptr <= m_cpp_buf + m_buf_length);
+
+ uint body_utf8_length=
+ (m_buf_length / thd->variables.character_set_client->mbminlen) *
+ my_charset_utf8_bin.mbmaxlen;
+
+ m_body_utf8= (char *) thd->alloc(body_utf8_length + 1);
+ m_body_utf8_ptr= m_body_utf8;
+ *m_body_utf8_ptr= 0;
+
+ m_cpp_utf8_processed_ptr= begin_ptr;
+}
+
+/**
+ The operation appends unprocessed part of pre-processed buffer till
+ the given pointer (ptr) and sets m_cpp_utf8_processed_ptr to end_ptr.
+
+ The idea is that some tokens in the pre-processed buffer (like character
+ set introducers) should be skipped.
+
+ Example:
+ CPP buffer: SELECT 'str1', _latin1 'str2';
+ m_cpp_utf8_processed_ptr -- points at the "SELECT ...";
+ In order to skip "_latin1", the following call should be made:
+ body_utf8_append(<pointer to "_latin1 ...">, <pointer to " 'str2'...">)
+
+ @param ptr Pointer in the pre-processed buffer, which specifies the
+ end of the chunk, which should be appended to the utf8
+ body.
+ @param end_ptr Pointer in the pre-processed buffer, to which
+ m_cpp_utf8_processed_ptr will be set in the end of the
+ operation.
+*/
+
+void Lex_input_stream::body_utf8_append(const char *ptr,
+ const char *end_ptr)
+{
+ DBUG_ASSERT(m_cpp_buf <= ptr && ptr <= m_cpp_buf + m_buf_length);
+ DBUG_ASSERT(m_cpp_buf <= end_ptr && end_ptr <= m_cpp_buf + m_buf_length);
+
+ if (!m_body_utf8)
+ return;
+
+ if (m_cpp_utf8_processed_ptr >= ptr)
+ return;
+
+ int bytes_to_copy= ptr - m_cpp_utf8_processed_ptr;
+
+ memcpy(m_body_utf8_ptr, m_cpp_utf8_processed_ptr, bytes_to_copy);
+ m_body_utf8_ptr += bytes_to_copy;
+ *m_body_utf8_ptr= 0;
+
+ m_cpp_utf8_processed_ptr= end_ptr;
+}
+
+/**
+ The operation appends unprocessed part of the pre-processed buffer till
+ the given pointer (ptr) and sets m_cpp_utf8_processed_ptr to ptr.
+
+ @param ptr Pointer in the pre-processed buffer, which specifies the end
+ of the chunk, which should be appended to the utf8 body.
+*/
+
+void Lex_input_stream::body_utf8_append(const char *ptr)
+{
+ body_utf8_append(ptr, ptr);
+}
+
+/**
+ The operation converts the specified text literal to the utf8 and appends
+ the result to the utf8-body.
+
+ @param thd Thread context.
+ @param txt Text literal.
+ @param txt_cs Character set of the text literal.
+ @param end_ptr Pointer in the pre-processed buffer, to which
+ m_cpp_utf8_processed_ptr will be set in the end of the
+ operation.
+*/
+
+void Lex_input_stream::body_utf8_append_literal(THD *thd,
+ const LEX_STRING *txt,
+ CHARSET_INFO *txt_cs,
+ const char *end_ptr)
+{
+ if (!m_cpp_utf8_processed_ptr)
+ return;
+
+ LEX_STRING utf_txt;
+
+ if (!my_charset_same(txt_cs, &my_charset_utf8_general_ci))
+ {
+ thd->convert_string(&utf_txt,
+ &my_charset_utf8_general_ci,
+ txt->str, txt->length,
+ txt_cs);
+ }
+ else
+ {
+ utf_txt.str= txt->str;
+ utf_txt.length= txt->length;
+ }
+
+ /* NOTE: utf_txt.length is in bytes, not in symbols. */
+
+ memcpy(m_body_utf8_ptr, utf_txt.str, utf_txt.length);
+ m_body_utf8_ptr += utf_txt.length;
+ *m_body_utf8_ptr= 0;
+
+ m_cpp_utf8_processed_ptr= end_ptr;
+}
+
/*
This is called before every query that is to be parsed.
@@ -231,6 +362,7 @@ void lex_start(THD *thd)
lex->server_options.socket= 0;
lex->server_options.owner= 0;
lex->server_options.port= -1;
+
DBUG_VOID_RETURN;
}
@@ -311,6 +443,10 @@ static LEX_STRING get_token(Lex_input_stream *lip, uint skip, uint length)
lip->yyUnget(); // ptr points now after last token char
tmp.length=lip->yytoklen=length;
tmp.str= lip->m_thd->strmake(lip->get_tok_start() + skip, tmp.length);
+
+ lip->m_cpp_text_start= lip->get_cpp_tok_start() + skip;
+ lip->m_cpp_text_end= lip->m_cpp_text_start + tmp.length;
+
return tmp;
}
@@ -334,10 +470,17 @@ static LEX_STRING get_quoted_token(Lex_input_stream *lip,
from= lip->get_tok_start() + skip;
to= tmp.str;
end= to+length;
+
+ lip->m_cpp_text_start= lip->get_cpp_tok_start() + skip;
+ lip->m_cpp_text_end= lip->m_cpp_text_start + length;
+
for ( ; to != end; )
{
if ((*to++= *from++) == quote)
+ {
from++; // Skip double quotes
+ lip->m_cpp_text_start++;
+ }
}
*to= 0; // End null for safety
return tmp;
@@ -402,6 +545,10 @@ static char *get_text(Lex_input_stream *lip, int pre_skip, int post_skip)
if (!(start= (char*) lip->m_thd->alloc((uint) (end-str)+1)))
return (char*) ""; // Sql_alloc has set error flag
+
+ lip->m_cpp_text_start= lip->get_cpp_tok_start() + pre_skip;
+ lip->m_cpp_text_end= lip->get_cpp_ptr() - post_skip;
+
if (!found_escape)
{
lip->yytoklen=(uint) (end-str);
@@ -743,18 +890,33 @@ int MYSQLlex(void *arg, void *yythd)
}
yylval->lex_str=get_token(lip, 0, length);
- /*
+ /*
Note: "SELECT _bla AS 'alias'"
_bla should be considered as a IDENT if charset haven't been found.
- So we don't use MYF(MY_WME) with get_charset_by_csname to avoid
+ So we don't use MYF(MY_WME) with get_charset_by_csname to avoid
producing an error.
*/
- if ((yylval->lex_str.str[0]=='_') &&
- (lex->underscore_charset=
- get_charset_by_csname(yylval->lex_str.str + 1,
- MY_CS_PRIMARY,MYF(0))))
- return(UNDERSCORE_CHARSET);
+ if (yylval->lex_str.str[0] == '_')
+ {
+ CHARSET_INFO *cs= get_charset_by_csname(yylval->lex_str.str + 1,
+ MY_CS_PRIMARY, MYF(0));
+ if (cs)
+ {
+ yylval->charset= cs;
+ lip->m_underscore_cs= cs;
+
+ lip->body_utf8_append(lip->m_cpp_text_start,
+ lip->get_cpp_tok_start() + length);
+ return(UNDERSCORE_CHARSET);
+ }
+ }
+
+ lip->body_utf8_append(lip->m_cpp_text_start);
+
+ lip->body_utf8_append_literal(thd, &yylval->lex_str, cs,
+ lip->m_cpp_text_end);
+
return(result_state); // IDENT or IDENT_QUOTED
case MY_LEX_IDENT_SEP: // Found ident and now '.'
@@ -852,6 +1014,12 @@ int MYSQLlex(void *arg, void *yythd)
lip->next_state=MY_LEX_IDENT_SEP;// Next is '.'
yylval->lex_str= get_token(lip, 0, lip->yyLength());
+
+ lip->body_utf8_append(lip->m_cpp_text_start);
+
+ lip->body_utf8_append_literal(thd, &yylval->lex_str, cs,
+ lip->m_cpp_text_end);
+
return(result_state);
case MY_LEX_USER_VARIABLE_DELIMITER: // Found quote char
@@ -887,6 +1055,12 @@ int MYSQLlex(void *arg, void *yythd)
if (c == quote_char)
lip->yySkip(); // Skip end `
lip->next_state= MY_LEX_START;
+
+ lip->body_utf8_append(lip->m_cpp_text_start);
+
+ lip->body_utf8_append_literal(thd, &yylval->lex_str, cs,
+ lip->m_cpp_text_end);
+
return(IDENT_QUOTED);
}
case MY_LEX_INT_OR_REAL: // Complete int or incomplete real
@@ -995,6 +1169,15 @@ int MYSQLlex(void *arg, void *yythd)
break;
}
yylval->lex_str.length=lip->yytoklen;
+
+ lip->body_utf8_append(lip->m_cpp_text_start);
+
+ lip->body_utf8_append_literal(thd, &yylval->lex_str,
+ lip->m_underscore_cs ? lip->m_underscore_cs : cs,
+ lip->m_cpp_text_end);
+
+ lip->m_underscore_cs= NULL;
+
return(TEXT_STRING);
case MY_LEX_COMMENT: // Comment
@@ -1201,6 +1384,12 @@ int MYSQLlex(void *arg, void *yythd)
return(tokval); // Was keyword
}
yylval->lex_str=get_token(lip, 0, length);
+
+ lip->body_utf8_append(lip->m_cpp_text_start);
+
+ lip->body_utf8_append_literal(thd, &yylval->lex_str, cs,
+ lip->m_cpp_text_end);
+
return(result_state);
}
}