Fix \xa0 matching both [\s] [\S], et al.

This bug stemmed from Latin1 characters not matching any (non-complemented) character class in /d semantics when the target string is not utf8, but having Unicode semantics when it is. The solution here is to add a special flag. There were several tests that relied on the broken behavior; specifically, they tested that \xff isn't a printable word character even in utf8. I changed the deparse test to instead use a non-printable code point, and I changed the ones in re_tests to be TODOs, and will change them back using /a when that is shortly added.
author: Karl Williamson <public@khwilliamson.com> 2011-01-15 13:42:58 -0700
committer: Karl Williamson <public@khwilliamson.com> 2011-01-16 08:18:54 -0700
commit: 11454c594f22abc5945e69a46fc965363dbf326e (patch)
tree: 8e51baaf062d5e28410294b9cac63f791c63ced2
parent: f424400810b6af341e96230836690da51c37b812 (diff)
download: perl-11454c594f22abc5945e69a46fc965363dbf326e.tar.gz
5 files changed, 30 insertions, 10 deletions
diff --git a/dist/B-Deparse/t/deparse.t b/dist/B-Deparse/t/deparse.t
index 89a449383f..50baa90ee5 100644
--- a/dist/B-Deparse/t/deparse.t
+++ b/dist/B-Deparse/t/deparse.t
@@ -247,7 +247,7 @@ my $foo;
 $_ .= <ARGV> . <$foo>;
 ####
 # \x{}
-my $foo = "Ab\x{100}\200\x{200}\377Cd\000Ef\x{1000}\cA\x{2000}\cZ";
+my $foo = "Ab\x{100}\200\x{200}\237Cd\000Ef\x{1000}\cA\x{2000}\cZ";
 ####
 # s///e
 s/x/'y';/e;
diff --git a/regcomp.c b/regcomp.c
index fa8f44ba98..d83c9dee98 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -707,10 +707,9 @@ S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *c
 
     ANYOF_CLASS_ZERO(cl);
     ANYOF_BITMAP_SETALL(cl);
-    cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
+    cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
     if (LOC)
 	cl->flags |= ANYOF_LOCALE;
-    cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
 }
 
 /* Can match anything (initialization) */
@@ -783,6 +782,8 @@ S_cl_and(struct regnode_charclass_class *cl,
 
     if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
 	cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
+    if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
+	cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 
     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
 	!(and_with->flags & ANYOF_INVERT)) {
@@ -850,6 +851,8 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
     }
     if (or_with->flags & ANYOF_EOS)
 	cl->flags |= ANYOF_EOS;
+    if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
+	cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
 
     if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 	cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
@@ -8232,9 +8235,12 @@ case ANYOF_N##NAME:                                     \
             if (! TEST_7) stored +=                     \
                         S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
         }                                               \
-        for (value = 128; value < 256; value++) {         \
-                        S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
-        }                                               \
+	/* For a non-ut8 target string with DEPENDS semantics, all above ASCII \
+	 * Latin1 code points match the complement of any of the classes.  But \
+	 * in utf8, they have their Unicode semantics, so can't just set them  \
+	 * in the bitmap, or else regexec.c will think they matched when they  \
+	 * shouldn't. */                                                       \
+	ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8;  \
     }                                                   \
     yesno = '!';                                        \
     what = WORD;                                        \
@@ -9824,6 +9830,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
         
         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
         
+	if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
+	    sv_catpvs(sv, "{non-utf8-latin1-all}");
+	}
+
         /* output information about the unicode matching */
 	if (flags & ANYOF_UNICODE_ALL)
 	    sv_catpvs(sv, "{unicode_all}");
diff --git a/regcomp.h b/regcomp.h
index 0dc4374973..96e7ae14f6 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -362,6 +362,10 @@ struct regnode_charclass_class {
 /* Matches every code point 0x100 and above*/
 #define ANYOF_UNICODE_ALL	0x40
 
+/* Match all Latin1 characters that aren't ASCII when the target string is not
+ * in utf8. */
+#define ANYOF_NON_UTF8_LATIN1_ALL 0x80
+
 #define ANYOF_FLAGS_ALL		0xff
 
 /* Character classes for node->classflags of ANYOF */
diff --git a/regexec.c b/regexec.c
index ca88d9f5ab..be0feeb80c 100644
--- a/regexec.c
+++ b/regexec.c
@@ -6336,6 +6336,12 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
     if (c < 256) {
 	if (ANYOF_BITMAP_TEST(n, c))
 	    match = TRUE;
+	else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
+		&& ! utf8_target
+		&& ! isASCII(c))
+	{
+	    match = TRUE;
+	}
 
 	else if (flags & ANYOF_LOCALE) {
 	    PL_reg_flags |= RF_tainted;
diff --git a/t/re/re_tests b/t/re/re_tests
index 02da1e1bdc..041296aa54 100644
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -628,15 +628,15 @@ $(?<=^(a))	a	y	$1	a
 ([[:upper:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	AB
 ([[:xdigit:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	ABcd01
 ([[:^alpha:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	01
-([[:^alnum:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	__--  ${nulnul}${ffff}
+([[:^alnum:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	yT	$1	__--  ${nulnul}${ffff}
 ([[:^ascii:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	${ffff}
 ([[:^cntrl:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	ABcd01Xy__--  
 ([[:^digit:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	ABcd
 ([[:^lower:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	AB
-([[:^print:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	${nulnul}${ffff}
+([[:^print:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	yT	$1	${nulnul}${ffff}
 ([[:^punct:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	ABcd01Xy
 ([[:^space:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	ABcd01Xy__--
-([[:^word:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	--  ${nulnul}${ffff}
+([[:^word:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	yT	$1	--  ${nulnul}${ffff}
 ([[:^upper:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	cd01
 ([[:^xdigit:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	Xy__--  ${nulnul}${ffff}
 [[:foo:]]	-	c	-	POSIX class [:foo:] unknown
@@ -1406,7 +1406,7 @@ foo(\h)bar	foo\tbar	y	$1	\t
 /\N{U+41}\x{c1}/i	a\x{e1}	y	$&	a\x{e1}
 /[\N{U+41}\x{c1}]/i	\x{e1}	y	$&	\x{e1}
 
-[\s][\S]	\x{a0}\x{a0}	nT	-	-	# Unicode complements should not match same character
+[\s][\S]	\x{a0}\x{a0}	n	-	-	# Unicode complements should not match same character
 
 # was generating malformed utf8
 '[\x{100}\xff]'i	\x{ff}	y	$&	\x{ff}
author	Karl Williamson <public@khwilliamson.com>	2011-01-15 13:42:58 -0700
committer	Karl Williamson <public@khwilliamson.com>	2011-01-16 08:18:54 -0700
commit	11454c594f22abc5945e69a46fc965363dbf326e (patch)
tree	8e51baaf062d5e28410294b9cac63f791c63ced2
parent	f424400810b6af341e96230836690da51c37b812 (diff)
download	perl-11454c594f22abc5945e69a46fc965363dbf326e.tar.gz