Add portability warning for re 'strict'

When one end of a range in a bracketed character class is specified as Unicode, the whole range is treated as Unicode. Currently this is not warned about, though it is somewhat like mixing apples and oranges. This commit adds a warning, but only under "use re 'strict'", and it now documents the behavior when only one end of the range is Unicode.
author: Karl Williamson <khw@cpan.org> 2015-01-19 12:47:41 -0700
committer: Karl Williamson <khw@cpan.org> 2015-01-20 10:38:03 -0700
commit: b927b7e95b7031e0f55821c537e194ad78fd3a09 (patch)
tree: 3e7a60c4a058774bc9d7c0faf3bb869624863010 /regcomp.c
parent: dfa1e3a5359a09e2f8ad21051412248182399696 (diff)
download: perl-b927b7e95b7031e0f55821c537e194ad78fd3a09.tar.gz
1 files changed, 19 insertions, 14 deletions
diff --git a/regcomp.c b/regcomp.c
index 6e0daba7c2..7d5c50ca18 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -641,6 +641,12 @@ static const scan_data_t zero_scan_data =
 	    REPORT_LOCATION_ARGS(offset));		\
 } STMT_END
 
+#define	vWARN(loc, m) STMT_START {				        \
+    const IV offset = loc - RExC_precomp;				\
+    __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,	\
+	    REPORT_LOCATION_ARGS(offset));	        \
+} STMT_END
+
 #define	vWARN_dep(loc, m) STMT_START {				        \
     const IV offset = loc - RExC_precomp;				\
     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,	\
@@ -13810,7 +13816,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      * runtime locale is UTF-8 */
     SV* only_utf8_locale_list = NULL;
 
-#ifdef EBCDIC
     /* In a range, if one of the endpoints is non-character-set portable,
      * meaning that it hard-codes a code point that may mean a different
      * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
@@ -13822,7 +13827,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      * to Unicode (i.e. non-ASCII), each code point in it should be considered
      * to be a Unicode value.  */
     bool unicode_range = FALSE;
-#endif
     bool invert = FALSE;    /* Is this class to be complemented */
 
     bool warn_super = ALWAYS_WARN_SUPER;
@@ -13926,9 +13930,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 	if (!range) {
 	    rangebegin = RExC_parse;
 	    element_count++;
-#ifdef EBCDIC
             non_portable_endpoint = 0;
-#endif
 	}
 	if (UTF) {
 	    value = utf8n_to_uvchr((U8*)RExC_parse,
@@ -14027,10 +14029,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                         prevvalue = save_prevvalue;
                         continue;   /* Back to top of loop to get next char */
                     }
+
                     /* Here, is a single code point, and <value> contains it */
-#ifdef EBCDIC
                     unicode_range = TRUE;   /* \N{} are Unicode */
-#endif
                 }
                 break;
 	    case 'p':
@@ -14228,9 +14229,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 			vFAIL(error_msg);
 		    }
 		}
-#ifdef EBCDIC
                 non_portable_endpoint++;
-#endif
 		if (IN_ENCODING && value < 0x100) {
 		    goto recode_encoding;
 		}
@@ -14250,17 +14249,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 			vFAIL(error_msg);
 		    }
 		}
-#ifdef EBCDIC
                 non_portable_endpoint++;
-#endif
 		if (IN_ENCODING && value < 0x100)
 		    goto recode_encoding;
 		break;
 	    case 'c':
 		value = grok_bslash_c(*RExC_parse++, PASS2);
-#ifdef EBCDIC
                 non_portable_endpoint++;
-#endif
 		break;
 	    case '0': case '1': case '2': case '3': case '4':
 	    case '5': case '6': case '7':
@@ -14288,9 +14283,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                             (void)ReREFCNT_inc(RExC_rx_sv);
                         }
                     }
-#ifdef EBCDIC
                     non_portable_endpoint++;
-#endif
 		    if (IN_ENCODING && value < 0x100)
 			goto recode_encoding;
 		    break;
@@ -14662,6 +14655,18 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             }
         }
 
+        if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
+            if (range) {
+
+                /* If the range starts above 255, everything is portable and
+                 * likely to be so for any foreseeable character set, so don't
+                 * warn. */
+                if (unicode_range && non_portable_endpoint && prevvalue < 256) {
+                    vWARN(RExC_parse, "Both or neither range ends should be Unicode");
+                }
+            }
+        }
+
         /* Deal with this element of the class */
 	if (! SIZE_ONLY) {
 #ifndef EBCDIC
author	Karl Williamson <khw@cpan.org>	2015-01-19 12:47:41 -0700
committer	Karl Williamson <khw@cpan.org>	2015-01-20 10:38:03 -0700
commit	b927b7e95b7031e0f55821c537e194ad78fd3a09 (patch)
tree	3e7a60c4a058774bc9d7c0faf3bb869624863010 /regcomp.c
parent	dfa1e3a5359a09e2f8ad21051412248182399696 (diff)
download	perl-b927b7e95b7031e0f55821c537e194ad78fd3a09.tar.gz