stop regex engine reading beyond end of string

Historically the regex engine has assumed that any string passed to it will have a trailing null char. This isn't normally an issue in perl code, since perl strings *are* null terminated; but it could cause problems with strings returned by XS code, or with someone calling the regex engine directly from XS, with strend not pointing at a null char. The engine currently relies on there being a null char in the following ways. First, when at the end of string, the main loop of regmatch() still reads in the 'next' character (i.e. the character following the end of string) even if it doesn't make any use of it. This precludes using memory mapped files as strings for example, since the read off the end would SEGV. Second, the matching algorithm often required the trailing character to be \0 to work correctly: the test for 'EOF' was "if next char is null *and* locinput >= PL_regeol, then stop". So a random non-null trailing char could cause an overshoot. Thirdly, some match ops require the trailing char to be null to operate correctly; for example, \b applied at the end of the string only happens to work because the trailing char (\0) happens to match \W. Also, some utf8 ops will try to extract the code point at the end, which can result in multiple bytes past the end of string being read, and possible problems if they don't correspond to well-formed utf8. The main fix is in S_regmatch, where the 'read next char' code has been updated to set it to a special value, NEXTCHR_EOS instead, if we would be reading past the end of the string. Lots of other random bits in the regex engine needed to be fixed up too. To track these down, I temporarily hacked regexec_flags() to make a copy of the string but without trailing \0, then ran all the t/re/*.t tests under valgrind to flush out all buffer overruns. So I think I've removed most of the bad code, but by no means all of it. The code within the various functions in regexec.c is far too complex to be able to visually audit the code with any confidence.
author: David Mitchell <davem@iabyn.com> 2012-09-21 10:29:04 +0100
committer: David Mitchell <davem@iabyn.com> 2012-09-26 09:41:10 +0100
commit: 7016d6ebb4afd4eb7b71b00f15b7515b5e45fee8 (patch)
tree: 880deaa8b0e09d4369797bf9deec676802d10442
parent: 895cc420d0398ff184560679b40f5f2c0af72366 (diff)
download: perl-7016d6ebb4afd4eb7b71b00f15b7515b5e45fee8.tar.gz
5 files changed, 224 insertions, 78 deletions
diff --git a/MANIFEST b/MANIFEST
index a3e752b3ad..a6884d0fa2 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -3989,6 +3989,7 @@ ext/XS-APItest/t/blockhooks.t	XS::APItest: tests for PL_blockhooks
 ext/XS-APItest/t/Block.pm	Helper for ./blockhooks.t
 ext/XS-APItest/t/call_checker.t	test call checker plugin API
 ext/XS-APItest/t/caller.t	XS::APItest: tests for caller_cx
+ext/XS-APItest/t/callregexec.t	XS::APItest: tests for CALLREGEXEC()
 ext/XS-APItest/t/call.t		XS::APItest extension
 ext/XS-APItest/t/check_warnings.t	test scope of "Too late for CHECK"
 ext/XS-APItest/t/cleanup.t	test stack behaviour on unwinding
diff --git a/ext/XS-APItest/APItest.pm b/ext/XS-APItest/APItest.pm
index 749af95b8d..f33b80b656 100644
--- a/ext/XS-APItest/APItest.pm
+++ b/ext/XS-APItest/APItest.pm
@@ -5,7 +5,7 @@ use strict;
 use warnings;
 use Carp;
 
-our $VERSION = '0.43';
+our $VERSION = '0.44';
 
 require XSLoader;
 
diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs
index 08694e6c3c..357b033f7f 100644
--- a/ext/XS-APItest/APItest.xs
+++ b/ext/XS-APItest/APItest.xs
@@ -3407,6 +3407,30 @@ CODE:
 OUTPUT:
     RETVAL
 
+ # provide access to CALLREGEXEC, except replace pointers within the
+ # string with offsets from the start of the string
+
+I32
+callregexec(SV *prog, STRLEN stringarg, STRLEN strend, I32 minend, SV *sv, U32 nosave)
+CODE:
+    {
+	STRLEN len;
+	char *strbeg;
+	if (SvROK(prog))
+	    prog = SvRV(prog);
+	strbeg = SvPV_force(sv, len);
+	RETVAL = CALLREGEXEC((REGEXP *)prog,
+			    strbeg + stringarg,
+			    strbeg + strend,
+			    strbeg,
+			    minend,
+			    sv,
+			    NULL, /* data */
+			    nosave);
+    }
+OUTPUT:
+    RETVAL
+
 
 MODULE = XS::APItest PACKAGE = XS::APItest::AUTOLOADtest
 
diff --git a/ext/XS-APItest/t/callregexec.t b/ext/XS-APItest/t/callregexec.t
new file mode 100644
index 0000000000..3111390686
--- /dev/null
+++ b/ext/XS-APItest/t/callregexec.t
@@ -0,0 +1,66 @@
+#!perl
+
+# test CALLREGEXEC()
+# (currently it just checks that it handles non-\0 terminated strings;
+# full tests haven't been added yet)
+
+use warnings;
+use strict;
+
+use XS::APItest;
+*callregexec = *XS::APItest::callregexec;
+
+use Test::More tests => 50;
+
+# Test that the regex engine can handle strings without terminating \0
+# XXX This is by no means comprehensive; it doesn't test all ops, nor all
+# code paths within those ops (especially not utf8).
+
+
+# this sub takes a string that has an extraneous char at the end.
+# First see if the string (less the last char) matches the regex;
+# then see if that string (including the last char) matches when
+# calling callregexec(), but with the length arg set to 1 char less than
+# the length of the string.
+# In theory the result should be the same for both matches, since
+# they should both not 'see' the final char.
+
+sub try {
+    my ($str, $re, $exp, $desc) = @_;
+
+    my $str1 = substr($str, 0, -1);
+    ok !!$exp == !!($str1 =~ $re), "$desc str =~ qr";
+
+    my $bytes = do { use bytes; length $str1 };
+    ok  !!$exp == !!callregexec($re, 0, $bytes, 0, $str, 0),
+	    "$desc callregexec";
+}
+
+
+{
+    try "\nx",         qr/\n^/m,          0, 'MBOL';
+    try "ax",          qr/a$/m,           1, 'MEOL';
+    try "ax",          qr/a$/s,           1, 'SEOL';
+    try "abx",         qr/^(ab|X)./s,     0, 'SANY';
+    try "abx",         qr/^(ab|X)\C/,     0, 'CANY';
+    try "abx",         qr/^(ab|X)./,      0, 'REG_ANY';
+    try "abx",         qr/^ab(c|d|e|x)/,  0, 'TRIE/TRIEC';
+    try "abx",         qr/^abx/,          0, 'EXACT';
+    try "abx",         qr/^ABX/i,         0, 'EXACTF';
+    try "abx",         qr/^ab\b/,         1, 'BOUND';
+    try "ab-",         qr/^ab\B/,         0, 'NBOUND';
+    try "aas",         qr/a[st]/,         0, 'ANYOF';
+    try "aas",         qr/a[s\xDF]/i,     0, 'ANYOFV';
+    try "ab1",         qr/ab\d/,          0, 'DIGIT';
+    try "ab\n",        qr/ab[[:ascii:]]/, 0, 'POSIX';
+    try "aP\x{307}",   qr/^a\X/,          1, 'CLUMP 1';
+    try "aP\x{307}x",  qr/^a\X/,          1, 'CLUMP 2';
+    try "\x{100}\r\n", qr/^\x{100}\X/,    1, 'CLUMP 3';
+    try "abb",         qr/^a(b)\1/,       0, 'REF';
+    try "ab\n",        qr/^.+\R/,         0, 'LNBREAK';
+    try "ab\n",        qr/^.+\v/,         0, 'VERTWS';
+    try "abx",         qr/^.+\V/,         1, 'NVERTWS';
+    try "ab\t",        qr/^.+\h/,         0, 'HORIZWS';
+    try "abx",         qr/^.+\H/,         1, 'NHORIZWS';
+    try "abx",         qr/a.*x/,          0, 'CURLY';
+}
diff --git a/regexec.c b/regexec.c
index 6ace8b60c9..989affafb4 100644
--- a/regexec.c
+++ b/regexec.c
@@ -121,6 +121,18 @@
 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 
+
+#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
+#define NEXTCHR_IS_EOS (nextchr < 0)
+
+#define SET_nextchr \
+    nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
+
+#define SET_locinput(p) \
+    locinput = (p);  \
+    SET_nextchr
+
+
 /* these are unrolled below in the CCC_TRY_XXX defined */
 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
     if (!CAT2(PL_utf8_,class)) { \
@@ -165,7 +177,7 @@
  * fails, or advance to the next character */
 
 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
-    if (locinput >= PL_regeol) {                                              \
+    if (NEXTCHR_IS_EOS) {                                              \
 	sayNO;                                                                \
     }                                                                         \
     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
@@ -219,14 +231,14 @@
 	_CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 		       CLASS, STR)                                            \
     case NAMEA:                                                               \
-	if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
+	if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) {                      \
 	    sayNO;                                                            \
 	}                                                                     \
 	/* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 	locinput++;                                        \
 	break;                                                                \
     case NNAMEA:                                                              \
-	if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
+	if (NEXTCHR_IS_EOS || FUNCA(nextchr)) {                        \
 	    sayNO;                                                            \
 	}                                                                     \
         goto increment_locinput;                                              \
@@ -597,7 +609,21 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 	goto fail;
     }
                 
-    strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
+    /* XXX we need to pass strbeg as a separate arg: the following is
+     * guesswork and can be wrong... */
+    if (sv && SvPOK(sv)) {
+        char * p   = SvPVX(sv);
+        STRLEN cur = SvCUR(sv); 
+        if (p <= strpos && strpos < p + cur) {
+            strbeg = p;
+            assert(p <= strend && strend <= p + cur);
+        }
+        else
+            strbeg = strend - cur;
+    }
+    else 
+        strbeg = strpos;
+
     PL_regeol = strend;
     if (utf8_target) {
 	if (!prog->check_utf8 && prog->check_substr)
@@ -1238,7 +1264,7 @@ STMT_START {                                              \
 
 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
 STMT_START {                                          \
-    while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
+    while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) {     \
 	CoDe                                          \
 	s += uskip;                                   \
     }                                                 \
@@ -1763,32 +1789,32 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 	    break;
 	case LNBREAK:
 	    REXEC_FBC_CSCAN(
-		is_LNBREAK_utf8(s),
-		is_LNBREAK_latin1(s)
+		is_LNBREAK_utf8_safe(s, strend),
+		is_LNBREAK_latin1_safe(s, strend)
 	    );
 	    break;
 	case VERTWS:
 	    REXEC_FBC_CSCAN(
-		is_VERTWS_utf8(s),
-		is_VERTWS_latin1(s)
+		is_VERTWS_utf8_safe(s, strend),
+		is_VERTWS_latin1_safe(s, strend)
 	    );
 	    break;
 	case NVERTWS:
 	    REXEC_FBC_CSCAN(
-		!is_VERTWS_utf8(s),
-		!is_VERTWS_latin1(s)
+		!is_VERTWS_utf8_safe(s, strend),
+		!is_VERTWS_latin1_safe(s, strend)
 	    );
 	    break;
 	case HORIZWS:
 	    REXEC_FBC_CSCAN(
-		is_HORIZWS_utf8(s),
-		is_HORIZWS_latin1(s)
+		is_HORIZWS_utf8_safe(s, strend),
+		is_HORIZWS_latin1_safe(s, strend)
 	    );
 	    break;
 	case NHORIZWS:
 	    REXEC_FBC_CSCAN(
-		!is_HORIZWS_utf8(s),
-		!is_HORIZWS_latin1(s)
+		!is_HORIZWS_utf8_safe(s, strend),
+		!is_HORIZWS_latin1_safe(s, strend)
 	    );	    
 	    break;
 	case POSIXA:
@@ -1923,16 +1949,24 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                             
                         }
                         points[pointpos++ % maxlen]= uc;
-			REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
+                        if (foldlen || uc < (U8*)strend) {
+                            REXEC_TRIE_READ_CHAR(trie_type, trie,
+                                             widecharmap, uc,
 					     uscan, len, uvc, charid, foldlen,
 					     foldbuf, uniflags);
-                        DEBUG_TRIE_EXECUTE_r({
-                            dump_exec_pos( (char *)uc, c, strend, real_start, 
-                                s,   utf8_target );
-                            PerlIO_printf(Perl_debug_log,
-                                " Charid:%3u CP:%4"UVxf" ",
-                                 charid, uvc);
-                        });
+                            DEBUG_TRIE_EXECUTE_r({
+                                dump_exec_pos( (char *)uc, c, strend,
+                                            real_start, s, utf8_target);
+                                PerlIO_printf(Perl_debug_log,
+                                    " Charid:%3u CP:%4"UVxf" ",
+                                     charid, uvc);
+                            });
+                        }
+                        else {
+                            len = 0;
+                            charid = 0;
+                        }
+
 
                         do {
 #ifdef DEBUGGING
@@ -2380,7 +2414,11 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre
 		while (s <= last1) {
 		    if (regtry(&reginfo, &s))
 			goto got_it;
-		    s += UTF8SKIP(s);
+                    if (s >= last1) {
+                        s++; /* to break out of outer loop */
+                        break;
+                    }
+                    s += UTF8SKIP(s);
 		}
 	    }
 	    else {
@@ -2682,7 +2720,6 @@ phooey:
         Safefree(prog->offs);
         prog->offs = swap;
     }
-
     return 0;
 }
 
@@ -3288,7 +3325,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	st = PL_regmatch_state = S_push_slab(aTHX);
 
     /* Note that nextchr is a byte even in UTF */
-    nextchr = UCHARAT(locinput);
+    SET_nextchr;
     scan = prog;
     while (scan != NULL) {
 
@@ -3313,8 +3350,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
       reenter_switch:
 
-        nextchr = UCHARAT(locinput);
-        assert(nextchr >= 0);
+        SET_nextchr;
 
 	switch (state_num) {
 	case BOL: /*  /^../  */
@@ -3327,7 +3363,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
 	case MBOL: /*  /^../m  */
 	    if (locinput == PL_bostr ||
-		((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
+		(!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
 	    {
 		break;
 	    }
@@ -3359,36 +3395,36 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		goto seol;
 
 	case MEOL: /* /..$/m  */
-	    if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
+	    if (!NEXTCHR_IS_EOS && nextchr != '\n')
 		sayNO;
 	    break;
 
 	case SEOL: /* /..$/s  */
 	  seol:
-	    if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
+	    if (!NEXTCHR_IS_EOS && nextchr != '\n')
 		sayNO;
 	    if (PL_regeol - locinput > 1)
 		sayNO;
 	    break;
 
 	case EOS: /*  \z  */
-	    if (PL_regeol != locinput)
+	    if (!NEXTCHR_IS_EOS)
 		sayNO;
 	    break;
 
 	case SANY: /*  /./s  */
-	    if (!nextchr && locinput >= PL_regeol)
+	    if (NEXTCHR_IS_EOS)
 		sayNO;
             goto increment_locinput;
 
 	case CANY: /*  \C  */
-	    if (!nextchr && locinput >= PL_regeol)
+	    if (NEXTCHR_IS_EOS)
 		sayNO;
 	    locinput++;
 	    break;
 
 	case REG_ANY: /*  /./  */
-	    if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
+	    if ((NEXTCHR_IS_EOS) || nextchr == '\n')
 		sayNO;
             goto increment_locinput;
 
@@ -3399,7 +3435,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             /* In this case the charclass data is available inline so
                we can fail fast without a lot of extra overhead. 
              */
-            if(!ANYOF_BITMAP_TEST(scan, nextchr)) {
+            if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
                 DEBUG_EXECUTE_r(
                     PerlIO_printf(Perl_debug_log,
                               "%*s  %sfailed to match trie start class...%s\n",
@@ -3464,7 +3500,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
                 U32 state = trie->startstate;
 
-                if (trie->bitmap && !TRIE_BITMAP_TEST(trie, nextchr) ) {
+                if (   trie->bitmap
+                    && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
+                {
         	    if (trie->states[ state ].wordnum) {
         	         DEBUG_EXECUTE_r(
                             PerlIO_printf(Perl_debug_log,
@@ -3537,7 +3575,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		    });
 
 		    /* read a char and goto next state */
-		    if ( base ) {
+		    if ( base && (foldlen || uc < (U8*)PL_regeol)) {
 			I32 offset;
 			REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
 					     uscan, len, uvc, charid, foldlen,
@@ -3808,6 +3846,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	    locinput += ln;
 	    break;
 	    }
+
 	case EXACTFL: {          /*  /abc/il      */
 	    re_fold_t folder;
 	    const U8 * fold_array;
@@ -3899,12 +3938,17 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		}
 		if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
 		    ln = isALNUM_uni(ln);
-		    LOAD_UTF8_CHARCLASS_ALNUM();
-		    n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
+                    if (NEXTCHR_IS_EOS)
+                        n = 0;
+                    else {
+                        LOAD_UTF8_CHARCLASS_ALNUM();
+                        n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
+                                                                utf8_target);
+                    }
 		}
 		else {
 		    ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
-		    n = isALNUM_LC_utf8((U8*)locinput);
+		    n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
 		}
 	    }
 	    else {
@@ -3925,20 +3969,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		switch (FLAGS(scan)) {
 		    case REGEX_UNICODE_CHARSET:
 			ln = isWORDCHAR_L1(ln);
-			n = isWORDCHAR_L1(nextchr);
+			n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
 			break;
 		    case REGEX_LOCALE_CHARSET:
 			ln = isALNUM_LC(ln);
-			n = isALNUM_LC(nextchr);
+			n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
 			break;
 		    case REGEX_DEPENDS_CHARSET:
 			ln = isALNUM(ln);
-			n = isALNUM(nextchr);
+			n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
 			break;
 		    case REGEX_ASCII_RESTRICTED_CHARSET:
 		    case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
 			ln = isWORDCHAR_A(ln);
-			n = isWORDCHAR_A(nextchr);
+			n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
 			break;
 		    default:
 			Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
@@ -3953,19 +3997,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
 	case ANYOFV: /*  /[abx{df}]/i  */
 	case ANYOF:  /*  /[abc]/       */
+            if (NEXTCHR_IS_EOS)
+                sayNO;
 	    if (utf8_target || state_num == ANYOFV) {
 	        STRLEN inclasslen = PL_regeol - locinput;
-		if (locinput >= PL_regeol)
-		    sayNO;
-
 	        if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
 		    sayNO;
 		locinput += inclasslen;
 		break;
 	    }
 	    else {
-		if (!nextchr && locinput >= PL_regeol)
-		    sayNO;
 		if (!REGINCLASS(rex, scan, (U8*)locinput))
 		    sayNO;
 		locinput++;
@@ -3993,7 +4034,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		digit, "0");
 
         case POSIXA: /* /[[:ascii:]]/ etc */
-            if (locinput >= PL_regeol || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
+            if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
                 sayNO;
             }
             /* Matched a utf8-invariant, so don't have to worry about utf8 */
@@ -4001,7 +4042,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             break;
 
         case NPOSIXA: /*  /[^[:ascii:]]/  etc */
-            if (locinput >= PL_regeol || _generic_isCC_A(nextchr, FLAGS(scan))) {
+            if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
                 sayNO;
             }
             goto increment_locinput;
@@ -4040,7 +4081,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	       Prepend, that one will be a suitable Begin.
 	    */
 
-	    if (locinput >= PL_regeol)
+	    if (NEXTCHR_IS_EOS)
 		sayNO;
 	    if  (! utf8_target) {
 
@@ -4056,7 +4097,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
 		/* Utf8: See if is ( CR LF ); already know that locinput <
 		 * PL_regeol, so locinput+1 is in bounds */
-		if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
+		if ( nextchr == '\r' && locinput+1 < PL_regeol
+                        && UCHARAT(locinput + 1) == '\n')
+                {
 		    locinput += 2;
 		}
 		else {
@@ -4311,7 +4354,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	    }
 
 	    /* Not utf8:  Inline the first character, for speed. */
-	    if (UCHARAT(s) != nextchr &&
+	    if (!NEXTCHR_IS_EOS &&
+                UCHARAT(s) != nextchr &&
 		(type == REF ||
 		 UCHARAT(s) != fold_array[nextchr]))
 		sayNO;
@@ -5325,7 +5369,8 @@ NULL
 		    (int)(REPORT_CODE_OFF+(depth*2)),
 		    "", (IV)ST.count)
 		);
-	    if (ST.c1 != CHRTEST_VOID
+	    if (       !NEXTCHR_IS_EOS
+                    && ST.c1 != CHRTEST_VOID
 		    && nextchr != ST.c1
 		    && nextchr != ST.c2)
 	    {
@@ -5379,8 +5424,7 @@ NULL
 	    if (ST.count == ARG1(ST.me) /* min */)
 		sayNO;
 	    ST.count--;
-	    locinput = HOPc(locinput, -ST.alen);
-            nextchr = UCHARAT(locinput);
+	    SET_locinput(HOPc(locinput, -ST.alen));
 	    goto curlym_do_B; /* try to match B */
 
 #undef ST
@@ -5511,8 +5555,7 @@ NULL
 		minmod = 0;
 		if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
 		    sayNO;
-                locinput = li;
-                nextchr = UCHARAT(locinput);
+                SET_locinput(li);
 		ST.count = ST.min;
 		REGCP_SET(ST.cp);
 		if (ST.c1 == CHRTEST_VOID)
@@ -5549,8 +5592,7 @@ NULL
 		ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
 		if (ST.count < ST.min)
 		    sayNO;
-                locinput = li;
-                nextchr = UCHARAT(locinput);
+                SET_locinput(li);
 		if ((ST.count > ST.min)
 		    && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
 		{
@@ -5690,12 +5732,15 @@ NULL
             }
 	    {
 		UV c = 0;
-		if (ST.c1 != CHRTEST_VOID)
+		if (ST.c1 != CHRTEST_VOID && locinput < PL_regeol)
 		    c = utf8_target ? utf8n_to_uvchr((U8*)locinput,
 					   UTF8_MAXBYTES, 0, uniflags)
 				: (UV) UCHARAT(locinput);
 		/* If it could work, try it. */
-		if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
+		if (ST.c1 == CHRTEST_VOID
+                    || (locinput < PL_regeol &&
+                        (c == (UV)ST.c1 || c == (UV)ST.c2)))
+                {
 		    CURLY_SETPAREN(ST.paren, ST.count);
 		    PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
 		    assert(0); /* NOTREACHED */
@@ -5949,7 +5994,7 @@ NULL
 #undef ST
 
         case LNBREAK: /* \R */
-            if ((n=is_LNBREAK(locinput,utf8_target))) {
+            if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
                 locinput += n;
             } else
                 sayNO;
@@ -5957,7 +6002,7 @@ NULL
 
 #define CASE_CLASS(nAmE)                              \
         case nAmE:                                    \
-	    if (locinput >= PL_regeol)                \
+	    if (NEXTCHR_IS_EOS)                       \
 		sayNO;                                \
             if ((n=is_##nAmE(locinput,utf8_target))) {    \
                 locinput += n;                        \
@@ -5965,7 +6010,7 @@ NULL
                 sayNO;                                \
             break;                                    \
         case N##nAmE:                                 \
-	    if (locinput >= PL_regeol)                \
+	    if (NEXTCHR_IS_EOS)                       \
 		sayNO;                                \
             if ((n=is_##nAmE(locinput,utf8_target))) {    \
                 sayNO;                                \
@@ -5988,6 +6033,7 @@ NULL
         increment_locinput:
             if (utf8_target) {
                 locinput += PL_utf8skip[nextchr];
+                /* locinput is allowed to go 1 char off the end, but not 2+ */
                 if (locinput > PL_regeol)
                     sayNO;
             }
@@ -6654,7 +6700,8 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     case LNBREAK:
         if (utf8_target) {
 	    loceol = PL_regeol;
-	    while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
+	    while (hardcount < max && scan < loceol &&
+                    (c=is_LNBREAK_utf8_safe(scan, loceol))) {
 		scan += c;
 		hardcount++;
 	    }
@@ -6664,7 +6711,7 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	      because we have a null terminated string, but we
 	      have to use hardcount in this situation
 	    */
-	    while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
+	    while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
 		scan+=c;
 		hardcount++;
 	    }
@@ -6673,24 +6720,28 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     case HORIZWS:
         if (utf8_target) {
 	    loceol = PL_regeol;
-	    while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
+	    while (hardcount < max && scan < loceol &&
+                    (c=is_HORIZWS_utf8_safe(scan, loceol)))
+            {
 		scan += c;
 		hardcount++;
 	    }
 	} else {
-	    while (scan < loceol && is_HORIZWS_latin1(scan)) 
+	    while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol)) 
 		scan++;		
 	}	
 	break;
     case NHORIZWS:
         if (utf8_target) {
 	    loceol = PL_regeol;
-	    while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
+	    while (hardcount < max && scan < loceol &&
+                        !is_HORIZWS_utf8_safe(scan, loceol))
+            {
 		scan += UTF8SKIP(scan);
 		hardcount++;
 	    }
 	} else {
-	    while (scan < loceol && !is_HORIZWS_latin1(scan))
+	    while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
 		scan++;
 
 	}	
@@ -6698,12 +6749,14 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     case VERTWS:
         if (utf8_target) {
 	    loceol = PL_regeol;
-	    while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
+	    while (hardcount < max && scan < loceol &&
+                            (c=is_VERTWS_utf8_safe(scan, loceol)))
+            {
 		scan += c;
 		hardcount++;
 	    }
 	} else {
-	    while (scan < loceol && is_VERTWS_latin1(scan)) 
+	    while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol)) 
 		scan++;
 
 	}	
@@ -6711,12 +6764,14 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     case NVERTWS:
         if (utf8_target) {
 	    loceol = PL_regeol;
-	    while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
+	    while (hardcount < max && scan < loceol &&
+                                !is_VERTWS_utf8_safe(scan, loceol))
+            {
 		scan += UTF8SKIP(scan);
 		hardcount++;
 	    }
 	} else {
-	    while (scan < loceol && !is_VERTWS_latin1(scan)) 
+	    while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol)) 
 		scan++;
           
 	}
author	David Mitchell <davem@iabyn.com>	2012-09-21 10:29:04 +0100
committer	David Mitchell <davem@iabyn.com>	2012-09-26 09:41:10 +0100
commit	7016d6ebb4afd4eb7b71b00f15b7515b5e45fee8 (patch)
tree	880deaa8b0e09d4369797bf9deec676802d10442
parent	895cc420d0398ff184560679b40f5f2c0af72366 (diff)
download	perl-7016d6ebb4afd4eb7b71b00f15b7515b5e45fee8.tar.gz