Fix de-escaping checks so that we will reject \000 as well as other invalidly
encoded sequences. Per discussion of a couple of days ago.
author: Tom Lane <tgl@sss.pgh.pa.us> 2009-04-19 21:08:54 +0000
committer: Tom Lane <tgl@sss.pgh.pa.us> 2009-04-19 21:08:54 +0000
commit: 22c922269f5f8a80267389e1c879c0b65fbba902 (patch)
tree: f82f522d11c899bbdad662da74c1d01fcaa5b04d /src/backend
parent: c1c40e580a5498ae7804270fe20dd8023fc7a9d6 (diff)
download: postgresql-22c922269f5f8a80267389e1c879c0b65fbba902.tar.gz
2 files changed, 28 insertions, 26 deletions
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 90ceb77bbb..9ba7c5fc03 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.307 2009/03/31 22:12:46 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.308 2009/04/19 21:08:54 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -2718,7 +2718,7 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
 		char	   *start_ptr;
 		char	   *end_ptr;
 		int			input_len;
-		bool		saw_high_bit = false;
+		bool		saw_non_ascii = false;
 
 		/* Make sure space remains in fieldvals[] */
 		if (fieldno >= maxfields)
@@ -2783,8 +2783,8 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
 								}
 							}
 							c = val & 0377;
-							if (IS_HIGHBIT_SET(c))
-								saw_high_bit = true;
+							if (c == '\0' || IS_HIGHBIT_SET(c))
+								saw_non_ascii = true;
 						}
 						break;
 					case 'x':
@@ -2808,8 +2808,8 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
 									}
 								}
 								c = val & 0xff;
-								if (IS_HIGHBIT_SET(c))
-									saw_high_bit = true;
+								if (c == '\0' || IS_HIGHBIT_SET(c))
+									saw_non_ascii = true;
 							}
 						}
 						break;
@@ -2847,11 +2847,11 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
 		*output_ptr++ = '\0';
 
 		/*
-		 * If we de-escaped a char with the high bit set, make sure we still
+		 * If we de-escaped a non-7-bit-ASCII char, make sure we still
 		 * have valid data for the db encoding. Avoid calling strlen here for
 		 * the sake of efficiency.
 		 */
-		if (saw_high_bit)
+		if (saw_non_ascii)
 		{
 			char	   *fld = fieldvals[fieldno];
 
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index a3d4d857c8..8551cd2753 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -24,7 +24,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.150 2009/04/14 22:18:47 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.151 2009/04/19 21:08:54 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -60,7 +60,7 @@ bool			escape_string_warning = true;
 bool			standard_conforming_strings = false;
 
 static bool		warn_on_first_escape;
-static bool     saw_high_bit = false;
+static bool		saw_non_ascii = false;
 
 /*
  * literalbuf is used to accumulate literal values when multiple rules
@@ -453,7 +453,7 @@ other			.
 
 {xqstart}		{
 					warn_on_first_escape = true;
-					saw_high_bit = false;
+					saw_non_ascii = false;
 					SET_YYLLOC();
 					if (standard_conforming_strings)
 						BEGIN(xq);
@@ -463,7 +463,7 @@ other			.
 				}
 {xestart}		{
 					warn_on_first_escape = false;
-					saw_high_bit = false;
+					saw_non_ascii = false;
 					SET_YYLLOC();
 					BEGIN(xe);
 					startlit();
@@ -477,10 +477,11 @@ other			.
 <xq,xe>{quotefail} {
 					yyless(1);
 					BEGIN(INITIAL);
-					/* check that the data remains valid if it might have been
+					/*
+					 * check that the data remains valid if it might have been
 					 * made invalid by unescaping any chars.
 					 */
-					if (saw_high_bit)
+					if (saw_non_ascii)
 						pg_verifymbstr(literalbuf, literallen, false);
 					yylval.str = litbufdup();
 					return SCONST;
@@ -526,16 +527,16 @@ other			.
 
 					check_escape_warning();
 					addlitchar(c);
-					if (IS_HIGHBIT_SET(c))
-						saw_high_bit = true;
+					if (c == '\0' || IS_HIGHBIT_SET(c))
+						saw_non_ascii = true;
 				}
 <xe>{xehexesc}  {
 					unsigned char c = strtoul(yytext+2, NULL, 16);
 
 					check_escape_warning();
 					addlitchar(c);
-					if (IS_HIGHBIT_SET(c))
-						saw_high_bit = true;
+					if (c == '\0' || IS_HIGHBIT_SET(c))
+						saw_non_ascii = true;
 				}
 <xq,xe,xus>{quotecontinue} {
 					/* ignore */
@@ -1083,6 +1084,11 @@ litbuf_udeescape(unsigned char escape)
 	}
 
 	*out = '\0';
+	/*
+	 * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
+	 * codes; but it's probably not worth the trouble, since this isn't
+	 * likely to be a performance-critical path.
+	 */
 	pg_verifymbstr(new, out - new, false);
 	return new;
 }
@@ -1090,14 +1096,6 @@ litbuf_udeescape(unsigned char escape)
 static unsigned char
 unescape_single_char(unsigned char c)
 {
-	/* Normally we wouldn't expect to see \n where n has its high bit set
-	 * but we set the flag to check the string if we do get it, so
-	 * that this doesn't become a way of getting around the coding validity
-	 * checks.
-	 */
-	if (IS_HIGHBIT_SET(c))
-		saw_high_bit = true;
-
 	switch (c)
 	{
 		case 'b':
@@ -1111,6 +1109,10 @@ unescape_single_char(unsigned char c)
 		case 't':
 			return '\t';
 		default:
+			/* check for backslash followed by non-7-bit-ASCII */
+			if (c == '\0' || IS_HIGHBIT_SET(c))
+				saw_non_ascii = true;
+
 			return c;
 	}
 }
author	Tom Lane <tgl@sss.pgh.pa.us>	2009-04-19 21:08:54 +0000
committer	Tom Lane <tgl@sss.pgh.pa.us>	2009-04-19 21:08:54 +0000
commit	22c922269f5f8a80267389e1c879c0b65fbba902 (patch)
tree	f82f522d11c899bbdad662da74c1d01fcaa5b04d /src/backend
parent	c1c40e580a5498ae7804270fe20dd8023fc7a9d6 (diff)
download	postgresql-22c922269f5f8a80267389e1c879c0b65fbba902.tar.gz