PATCH: deprecation warnings for unreasonable charnames

Prior to now just about anything has been legal for a character name in \N{...}. This means that legal code was broken by having \N{3,4} for example mean [^\n]{3,4}. Such code doesn't come from standard charnames, but from legal custom translators. This patch deprecates "unreasonable" names. handy.h is changed by the addition of macros that taken together define the names we deem reasonable, namely alpha beginning with alphanumerics and some punctuations as continuations. toke.c is changed to parse each name and to raise a warning if any problematic characters are found. Some tests and diagnostic documentation are also included.
author: Karl Williamson <khw@khw-desktop.(none)> 2010-02-19 23:53:36 -0700
committer: Steve Hay <steve.m.hay@googlemail.com> 2010-02-20 11:03:53 +0000
commit: cb233ae346c666d88ee890fc837f4cd3195c1f0b (patch)
tree: 0d414ca0a66af489c1d4764b5aec0f2061aa6b72 /toke.c
parent: c3c4140635dd08363a20c93a8c8b6d8e7464b891 (diff)
download: perl-cb233ae346c666d88ee890fc837f4cd3195c1f0b.tar.gz
1 files changed, 64 insertions, 1 deletions
diff --git a/toke.c b/toke.c
index 361d7d202c..7167004c1f 100644
--- a/toke.c
+++ b/toke.c
@@ -3206,7 +3206,70 @@ S_scan_const(pTHX_ char *start)
 			d += len;
 		    }
 		    SvREFCNT_dec(res);
-		}
+
+		    /* Deprecate non-approved name syntax */
+		    if (ckWARN_d(WARN_DEPRECATED)) {
+			bool problematic = FALSE;
+			char* i = s;
+
+			/* For non-ut8 input, look to see that the first
+			 * character is an alpha, then loop through the rest
+			 * checking that each is a continuation */
+			if (! this_utf8) {
+			    if (! isALPHAU(*i)) problematic = TRUE;
+			    else for (i = s + 1; i < e; i++) {
+				if (isCHARNAME_CONT(*i)) continue;
+				problematic = TRUE;
+				break;
+			    }
+			}
+			else {
+			    /* Similarly for utf8.  For invariants can check
+			     * directly.  We accept anything above the latin1
+			     * range because it is immaterial to Perl if it is
+			     * correct or not, and is expensive to check.  But
+			     * it is fairly easy in the latin1 range to convert
+			     * the variants into a single character and check
+			     * those */
+			    if (UTF8_IS_INVARIANT(*i)) {
+				if (! isALPHAU(*i)) problematic = TRUE;
+			    } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
+				if (! isALPHAU(UNI_TO_NATIVE(UTF8_ACCUMULATE(*i,
+									    *(i+1)))))
+				{
+				    problematic = TRUE;
+				}
+			    }
+			    if (! problematic) for (i = s + UTF8SKIP(s);
+						    i < e;
+						    i+= UTF8SKIP(i))
+			    {
+				if (UTF8_IS_INVARIANT(*i)) {
+				    if (isCHARNAME_CONT(*i)) continue;
+				} else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
+				    continue;
+				} else if (isCHARNAME_CONT(
+					    UNI_TO_NATIVE(
+					    UTF8_ACCUMULATE(*i, *(i+1)))))
+				{
+				    continue;
+				}
+				problematic = TRUE;
+				break;
+			    }
+			}
+			if (problematic) {
+			    char *string;
+			    Newx(string, e - i + 1, char);
+			    Copy(i, string, e - i, char);
+			    string[e - i] = '\0';
+			    Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
+				"Deprecated character(s) in \\N{...} starting at '%s'",
+				string);
+			    Safefree(string);
+			}
+		    }
+		} /* End \N{NAME} */
 #ifdef EBCDIC
 		if (!dorange) 
 		    native_range = FALSE; /* \N{} is defined to be Unicode */
author	Karl Williamson <khw@khw-desktop.(none)>	2010-02-19 23:53:36 -0700
committer	Steve Hay <steve.m.hay@googlemail.com>	2010-02-20 11:03:53 +0000
commit	cb233ae346c666d88ee890fc837f4cd3195c1f0b (patch)
tree	0d414ca0a66af489c1d4764b5aec0f2061aa6b72 /toke.c
parent	c3c4140635dd08363a20c93a8c8b6d8e7464b891 (diff)
download	perl-cb233ae346c666d88ee890fc837f4cd3195c1f0b.tar.gz