summaryrefslogtreecommitdiff
path: root/mysql-test/include/ctype_unescape.inc
diff options
context:
space:
mode:
authorAlexander Barkov <bar@mariadb.org>2014-09-12 12:57:27 +0400
committerAlexander Barkov <bar@mariadb.org>2014-09-12 12:57:27 +0400
commit6a576f1a581b75ac86d85c1aa80c3386f8a55cd3 (patch)
tree3d0352087b7b40647e35a02f5a43db226424d56a /mysql-test/include/ctype_unescape.inc
parentd2ae40ac2e9867fcf67ee3b3135de2905d463a12 (diff)
downloadmariadb-git-6a576f1a581b75ac86d85c1aa80c3386f8a55cd3.tar.gz
Adding thorough tests covering what happens with escaped sequences
in the SQL parser. Various backslash escapes and quote-quote escaped sequences are covered in combination with single and multi-byte characters. This is especially important for the character sets that can have 0x5C as the second byte in a multi-byte character (big5, cp932, gbk, sjis). swe7 is also a special character set, because in swe7 0x5C is used for both escape character and for "LATIN CAPITAL LETTER O WITH DIAERESIS".
Diffstat (limited to 'mysql-test/include/ctype_unescape.inc')
-rw-r--r--mysql-test/include/ctype_unescape.inc327
1 files changed, 327 insertions, 0 deletions
diff --git a/mysql-test/include/ctype_unescape.inc b/mysql-test/include/ctype_unescape.inc
new file mode 100644
index 00000000000..5d67bf8d189
--- /dev/null
+++ b/mysql-test/include/ctype_unescape.inc
@@ -0,0 +1,327 @@
+--echo # Start of ctype_unescape.inc
+
+#
+# Testing how string literals with backslash and quote-quote are unescaped.
+# The tests assume that single quote (') is used as a delimiter.
+#
+
+#
+# Make sure that the parser really works using the character set we need.
+# We use binary strings to compose strings, to be able to test get malformed
+# sequences, which are possible as a result of mysql_real_escape_string().
+# The important thing for this test is to make the parser unescape using
+# the client character set, rather than binary. Currently it works exactly
+# that way by default, so the query below should return @@character_set_client
+#
+SET @query=_binary'SELECT CHARSET(\'test\'),@@character_set_client,@@character_set_connection';
+PREPARE stmt FROM @query;
+EXECUTE stmt;
+DEALLOCATE PREPARE stmt;
+
+let $CHARSET=`SELECT @@character_set_connection`;
+
+CREATE TABLE allbytes (a VARBINARY(10));
+
+#
+# Create various byte sequences to test. Testing the full banch of
+# possible combinations takes about 2 minutes. So this test provides
+# variants to run with:
+# - the full set of possible combinations
+# - a reduced test of combinations for selected bytes only
+#
+
+# Create selected byte combinations
+if ($ctype_unescape_combinations == 'selected')
+{
+--echo # Using selected bytes combinations
+--source include/bytes.inc
+#
+# Populate "selected_bytes" with bytes that have a special meaning.
+# We'll use "selected_bytes" to generate byte seqeunces,
+# instead of the full possible byte combinations, to reduce test time.
+#
+CREATE TABLE selected_bytes (a VARBINARY(10));
+
+# Bytes that have a special meaning in all character sets:
+# 0x00 - mysql_real_escape_string() quotes this to '\0'
+# 0x0D - mysql_real_escape_string() quotes this to '\r'
+# 0x0A - mysql_real_escape_string() quotes this to '\n'
+# 0x1A - mysql_real_escape_string() quotes this to '\Z'
+# 0x08 - mysql_real_escape_string() does not quote this,
+# but '\b' is unescaped to 0x08.
+# 0x09 - mysql_real_escape_string() does not quote this,
+# but '\t' is unescaped to 0x09.
+# 0x30 - '0', as in '\0'
+# 0x5A - 'Z', as in '\Z'
+# 0x62 - 'b', as in '\b'
+# 0x6E - 'n', as in '\n'
+# 0x72 - 't', as in '\r'
+# 0x74 - 't', as in '\t'
+
+INSERT INTO selected_bytes (a) VALUES ('\0'),('\b'),('\t'),('\r'),('\n'),('\Z');
+INSERT INTO selected_bytes (a) VALUES ('0'),('b'),('t'),('r'),('n'),('Z');
+
+# 0x22 - double quote
+# 0x25 - percent sign, '\%' is preserved as is for LIKE.
+# 0x27 - single quote
+# 0x5C - backslash
+# 0x5F - underscore, '\_' is preserved as is for LIKE.
+INSERT INTO selected_bytes (a) VALUES ('\\'),('_'),('%'),(0x22),(0x27);
+
+# Some bytes do not have any special meaning, for example basic Latin letters.
+# Let's add, one should be enough for a good enough coverage.
+INSERT INTO selected_bytes (a) VALUES ('a');
+
+#
+# This maps summarizes bytes that have a special
+# meaning in various character sets:
+#
+# MBHEAD MBTAIL NONASCII-8BIT BAD
+# ------ ------ -------------- ----------
+# big5: [A1..F9] [40..7E,A1..FE] N/A [80..A0,FA..FF]
+# cp932: [81..9F,E0..FC] [40..7E,80..FC] [A1..DF] [FD..FF]
+# gbk: [81..FE] [40..7E,80..FE] N/A [FF]
+# sjis: [81..9F,E0..FC] [40..7E,80..FC] [A1..DF] [FD..FF]
+# swe7: N/A N/A [5B..5E,7B..7E] [80..FF]
+#
+
+INSERT INTO selected_bytes (a) VALUES
+(0x3F), # 7bit
+(0x40), # 7bit mbtail
+(0x7E), # 7bit mbtail nonascii-8bit
+(0x7F), # 7bit nonascii-8bit
+(0x80), # mbtail bad-mb
+(0x81), # mbhead mbtail
+(0x9F), # mbhead mbtail bad-mb
+(0xA0), # mbhead mbtail bad-mb
+(0xA1), # mbhead mbtail nonascii-8bit
+(0xE0), # mbhead mbtai
+(0xEF), # mbhead mbtail
+(0xF9), # mbhead mbtail
+(0xFA), # mbhead mbtail bad-mb
+(0xFC), # mbhead mbtail bad-mb
+(0xFD), # mbhead mbtail bad-mb
+(0xFE), # mbhead mbtial bad-mb
+(0xFF); # bad-mb
+
+#
+# Now populate the test table
+#
+
+# Use all single bytes, this is cheap, there are only 256 values.
+INSERT INTO allbytes (a) SELECT a FROM bytes;
+
+# Add selected bytes combinations
+INSERT INTO allbytes (a) SELECT CONCAT(t1.a,t2.a) FROM selected_bytes t1,selected_bytes t2;
+INSERT INTO allbytes (a) SELECT CONCAT(0x5C,t1.a,t2.a) FROM selected_bytes t1,selected_bytes t2;
+INSERT INTO allbytes (a) SELECT CONCAT(0x5C,t1.a,0x5C,t2.a) FROM selected_bytes t1,selected_bytes t2;
+DROP TABLE selected_bytes;
+
+# Delete all non-single byte sequences that do not have
+# backslashes or quotes at all. There is nothing special with these strings.
+DELETE FROM allbytes WHERE
+ OCTET_LENGTH(a)>1 AND
+ LOCATE(0x5C,a)=0 AND
+ a NOT LIKE '%\'%' AND
+ a NOT LIKE '%"%';
+
+}
+
+if ($ctype_unescape_combinations=='')
+{
+--echo # Using full byte combinations
+--source include/bytes2.inc
+INSERT INTO allbytes (a) SELECT a FROM bytes;
+INSERT INTO allbytes (a) SELECT CONCAT(hi,lo) FROM bytes2;
+INSERT INTO allbytes (a) SELECT CONCAT(0x5C,hi,lo) FROM bytes2;
+INSERT INTO allbytes (a) SELECT CONCAT(0x5C,hi,0x5C,lo) FROM bytes2;
+}
+
+
+DELIMITER //;
+
+#
+# A procedure that make an SQL query using 'val' as a string literal.
+# The result of the query execution is written into the table 't1'.
+# NULL in t1.b means that query failed due to syntax error,
+# typically because of mis-interpreted closing quote delimiter.
+#
+CREATE PROCEDURE p1(val VARBINARY(10))
+BEGIN
+ DECLARE EXIT HANDLER FOR SQLSTATE '42000' INSERT INTO t1 (a,b) VALUES(val,NULL);
+ SET @query=CONCAT(_binary"INSERT INTO t1 (a,b) VALUES (0x",HEX(val),",'",val,"')");
+ PREPARE stmt FROM @query;
+ EXECUTE stmt;
+ DEALLOCATE PREPARE stmt;
+END//
+
+#
+# A procedure that iterates through all records in "allbytes".
+# And runs p1() for every record.
+#
+CREATE PROCEDURE p2()
+BEGIN
+ DECLARE val VARBINARY(10);
+ DECLARE done INT DEFAULT FALSE;
+ DECLARE stmt CURSOR FOR SELECT a FROM allbytes;
+ DECLARE CONTINUE HANDLER FOR NOT FOUND SET done=TRUE;
+ OPEN stmt;
+read_loop1: LOOP
+ FETCH stmt INTO val;
+ IF done THEN
+ LEAVE read_loop1;
+ END IF;
+ CALL p1(val);
+END LOOP;
+ CLOSE stmt;
+END//
+
+
+# A function that converts the value from binary to $CHARSET
+# and check if it has changed. CONVERT() fixes malformed strings.
+# So if the string changes in CONVERT(), it means it was not wellformed.
+--eval CREATE FUNCTION iswellformed(a VARBINARY(256)) RETURNS INT RETURN a=BINARY CONVERT(a USING $CHARSET);
+
+#
+# A function that approximately reproduces how the SQL parser
+# would unescape a binary string.
+#
+CREATE FUNCTION unescape(a VARBINARY(256)) RETURNS VARBINARY(256)
+BEGIN
+ # We need to do it in a way to avoid producing new escape sequences
+ # First, enclose all known escsape sequences to '{{xx}}'
+ # - Backslash not followed by a LIKE pattern characters _ and %
+ # - Double escapes
+ # This uses PCRE Branch Reset Groups: (?|(alt1)|(alt2)|(alt3)).
+ # So '\\1' in the last argument always means the match, no matter
+ # which alternative it came from.
+ SET a=REGEXP_REPLACE(a,'(?|(\\\\[^_%])|(\\x{27}\\x{27}))','{{\\1}}');
+ # Now unescape all enclosed standard escape sequences
+ SET a=REPLACE(a,'{{\\0}}', '\0');
+ SET a=REPLACE(a,'{{\\b}}', '\b');
+ SET a=REPLACE(a,'{{\\t}}', '\t');
+ SET a=REPLACE(a,'{{\\r}}', '\r');
+ SET a=REPLACE(a,'{{\\n}}', '\n');
+ SET a=REPLACE(a,'{{\\Z}}', '\Z');
+ SET a=REPLACE(a,'{{\\\'}}', '\'');
+ # Unescape double quotes
+ SET a=REPLACE(a,'{{\'\'}}', '\'');
+ # Unescape the rest: all other \x sequences mean just 'x'
+ SET a=REGEXP_REPLACE(a, '{{\\\\(.|\\R)}}', '\\1');
+ RETURN a;
+END//
+
+
+#
+# A function that checks what happened during unescaping.
+#
+# @param a - the value before unescaping
+# @param b - the value after unescaping
+#
+# The following return values are possible:
+# - SyntErr - b IS NULL, which means syntax error happened in p1().
+# - Preserv - the value was not modified during unescaping.
+# This is possible if 0x5C was treated as mbtail.
+# Or only LIKE escape sequences were found: '\_' and '\%'.
+# - Trivial - only 0x5C were removed.
+# - Regular - the value was unescaped like a binary string.
+# Some standard escape sequences were found.
+# No special multi-byte handling happened.
+# - Special - Something else happened. Should not happen.
+#
+CREATE FUNCTION unescape_type(a VARBINARY(256),b VARBINARY(256)) RETURNS VARBINARY(256)
+BEGIN
+ RETURN CASE
+ WHEN b IS NULL THEN '[SyntErr]'
+ WHEN a=b THEN CASE
+ WHEN OCTET_LENGTH(a)=1 THEN '[Preserve]'
+ WHEN a RLIKE '\\\\[_%]' THEN '[Preserve][LIKE]'
+ WHEN a RLIKE '^[[:ascii:]]+$' THEN '[Preserve][ASCII]'
+ ELSE '[Preserv][MB]' END
+ WHEN REPLACE(a,0x5C,'')=b THEN '[Trivial]'
+ WHEN UNESCAPE(a)=b THEN '[Regular]'
+ ELSE '[Special]' END;
+END//
+
+
+#
+# Check what happened with wellformedness during unescaping
+# @param a - the value before unescaping
+# @param b - the value after unescaping
+#
+# Returned values:
+# [FIXED] - the value was malformed and become wellformed after unescaping
+# [BROKE] - the value was wellformed and become malformed after unescaping
+# [ILSEQ] - both values (before unescaping and after unescaping) are malformed
+# '' - both values are wellformed
+#
+CREATE FUNCTION wellformedness(a VARBINARY(256), b VARBINARY(256))
+ RETURNS VARBINARY(256)
+BEGIN
+ RETURN CASE
+ WHEN b IS NULL THEN ''
+ WHEN NOT iswellformed(a) AND iswellformed(b) THEN '[FIXED]'
+ WHEN iswellformed(a) AND NOT iswellformed(b) THEN '[BROKE]'
+ WHEN NOT iswellformed(a) AND NOT iswellformed(b) THEN '[ILSEQ]'
+ ELSE ''
+ END;
+END//
+
+
+#
+# Check if the value could be generated by mysql_real_escape_string(),
+# or can only come from a direct user input.
+#
+# @param a - the value before unescaping
+#
+# Returns:
+# [USER] - if the value could not be generated by mysql_real_escape_string()
+# '' - if the value was possibly generated by mysql_real_escape_string()
+#
+#
+CREATE FUNCTION mysql_real_escape_string_generated(a VARBINARY(256))
+ RETURNS VARBINARY(256)
+BEGIN
+ DECLARE a1 BINARY(1) DEFAULT SUBSTR(a,1,1);
+ DECLARE a2 BINARY(1) DEFAULT SUBSTR(a,2,1);
+ DECLARE a3 BINARY(1) DEFAULT SUBSTR(a,3,1);
+ DECLARE a4 BINARY(1) DEFAULT SUBSTR(a,4,1);
+ DECLARE a2a4 BINARY(2) DEFAULT CONCAT(a2,a4);
+ RETURN CASE
+ WHEN (a1=0x5C) AND
+ (a3=0x5C) AND
+ (a2>0x7F) AND
+ (a4 NOT IN ('_','%','0','t','r','n','Z')) AND
+ iswellformed(a2a4) THEN '[USER]'
+ ELSE ''
+ END;
+END//
+
+DELIMITER ;//
+
+
+CREATE TABLE t1 (a VARBINARY(10),b VARBINARY(10));
+CALL p2();
+# Avoid "Invalid XXX character string" warnings
+# We mark malformed strings in the output anyway
+--disable_warnings
+# All records marked with '[BAD]' mean that the string was unescaped
+# in a unexpected way, that means there is a bug in UNESCAPE() above.
+SELECT HEX(a),HEX(b),
+ CONCAT(unescape_type(a,b),
+ wellformedness(a,b),
+ mysql_real_escape_string_generated(a),
+ IF(UNESCAPE(a)<>b,CONCAT('[BAD',HEX(UNESCAPE(a)),']'),'')) AS comment
+FROM t1 ORDER BY LENGTH(a),a;
+--enable_warnings
+DROP TABLE t1;
+DROP PROCEDURE p1;
+DROP PROCEDURE p2;
+DROP FUNCTION unescape;
+DROP FUNCTION unescape_type;
+DROP FUNCTION wellformedness;
+DROP FUNCTION mysql_real_escape_string_generated;
+DROP FUNCTION iswellformed;
+DROP TABLE allbytes;
+
+--echo # End of ctype_backslash.inc