From 765ae6e82165d1bc4cf6cc9f0d556d66a5e172d1 Mon Sep 17 00:00:00 2001 From: Alexander Barkov Date: Sun, 21 Apr 2019 12:07:30 +0400 Subject: MDEV-19239 ERROR 1300 (HY000): Invalid utf8 character string in 10.3.13-MariaDB A sequence of digits followed by 'e' and a multi-byte character, e.g.: SELECT 123eXYzzz FROM t1; was not scanned correctly (where XY is a multi-byte character). The multi-byte head byte X was appended to 123e separately from the multi-byte tail byte Y, so a pointer to "Yzzz" was passed into scan_ident_start(), which failed on a bad multi-byte sequence. After this change, scan_ident_start() gets a pointer to "XYzzz", so it correctly sees the whole multi-byte character. --- sql/sql_lex.cc | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'sql') diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc index c52005e7683..b5ff060ecc6 100644 --- a/sql/sql_lex.cc +++ b/sql/sql_lex.cc @@ -1587,9 +1587,27 @@ int Lex_input_stream::lex_one_token(YYSTYPE *yylval, THD *thd) return(FLOAT_NUM); } } + /* + We've found: + - A sequence of digits + - Followed by 'e' or 'E' + - Followed by some byte XX which is not a known mantissa start, + and it's known to be a valid identifier part. + XX can be either a 8bit identifier character, or a multi-byte head. + */ yyUnget(); + return scan_ident_start(thd, &yylval->ident_cli); } - // fall through + /* + We've found: + - A sequence of digits + - Followed by some character XX, which is neither 'e' nor 'E', + and it's known to be a valid identifier part. + XX can be a 8bit identifier character, or a multi-byte head. + */ + yyUnget(); + return scan_ident_start(thd, &yylval->ident_cli); + case MY_LEX_IDENT_START: // We come here after '.' return scan_ident_start(thd, &yylval->ident_cli); -- cgit v1.2.1