PATCH: gh #17319 Segfault

It turns out that one isn't supposed to fill in the offset to the next regnode at node creation time. And this node is like EXACTish, so the string stuff isn't accounted for in its regcomp.sym definition
author: Karl Williamson <khw@cpan.org> 2019-11-22 15:28:13 -0700
committer: Karl Williamson <khw@cpan.org> 2019-11-22 15:54:44 -0700
commit: 53d42e43e359facdd83b313c1f4b70f9ff559a70 (patch)
tree: d1e8547bbdfa043aaa7cd7a23b58ffcbc25a8fdc
parent: 008bb368ebc18adc42e95769e4ebbd7d5545ce3d (diff)
download: perl-53d42e43e359facdd83b313c1f4b70f9ff559a70.tar.gz
6 files changed, 18 insertions, 12 deletions
diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod
index ce411e6d29..c8c251d27a 100644
--- a/pod/perldebguts.pod
+++ b/pod/perldebguts.pod
@@ -621,7 +621,7 @@ will be lost.
  ANYOFHr          sv 1       Like ANYOFH, but the flags field contains
                              packed bounds for all matchable UTF-8 start
                              bytes.
- ANYOFHs          sv anyofhs Like ANYOFHb, but has a string field that
+ ANYOFHs          sv 1       Like ANYOFHb, but has a string field that
                              gives the leading matchable UTF-8 bytes;
                              flags field is len
  ANYOFR           packed 1   Matches any character in the range given by
diff --git a/regcomp.c b/regcomp.c
index 1a6ab15b9c..bb3cd6618c 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -19516,15 +19516,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                            regarglen[op] + STR_SZ(len),
                                            "anyofhs");
                         FILL_NODE(ret, op);
-                        RExC_emit += 1 + regarglen[op]
-                                   - 1 + STR_SZ(len); /* Replace the [1]
-                                                         element of the struct
-                                                         by the real value */
-                        REGNODE_p(ret)->flags = len;
+                        ((struct regnode_anyofhs *) REGNODE_p(ret))->str_len
+                                                                        = len;
                         Copy(low_utf8,  /* Add the common bytes */
                            ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
                            len, U8);
-                        NEXT_OFF(REGNODE_p(ret)) = regarglen[op] + STR_SZ(len);
+                        RExC_emit += NODE_SZ_STR(REGNODE_p(ret));
                         set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
                                                   NULL, only_utf8_locale_list);
                         goto not_anyof;
@@ -22571,7 +22568,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
 	else if ( op == PLUS || op == STAR) {
 	    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
 	}
-	else if (PL_regkind[(U8)op] == EXACT) {
+	else if (PL_regkind[(U8)op] == EXACT || op == ANYOFHs) {
             /* Literal string, where present. */
 	    node += NODE_SZ_STR(node) - 1;
 	    node = NEXTOPER(node);
diff --git a/regcomp.h b/regcomp.h
index 3f7dd31391..e58534278d 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -165,7 +165,7 @@ struct regnode_lstring { /* Constructed this way to keep the string aligned. */
 };
 
 struct regnode_anyofhs { /* Constructed this way to keep the string aligned. */
-    U8	flags;
+    U8	str_len;
     U8  type;
     U16 next_off;
     U32 arg1;                           /* set by set_ANYOF_arg() */
diff --git a/regcomp.sym b/regcomp.sym
index 2f4018d62d..a8ff034083 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -82,7 +82,7 @@ ANYOFPOSIXL ANYOF,      sv charclass_posixl S    ; Like ANYOFL, but matches [[:p
 ANYOFH      ANYOF,      sv 1 S    ; Like ANYOF, but only has "High" matches, none in the bitmap; the flags field contains the lowest matchable UTF-8 start byte
 ANYOFHb     ANYOF,      sv 1 S    ; Like ANYOFH, but all matches share the same UTF-8 start byte, given in the flags field
 ANYOFHr     ANYOF,      sv 1 S    ; Like ANYOFH, but the flags field contains packed bounds for all matchable UTF-8 start bytes.
-ANYOFHs     ANYOF,      sv anyofhs S ; Like ANYOFHb, but has a string field that gives the leading matchable UTF-8 bytes; flags field is len
+ANYOFHs     ANYOF,      sv 1 S    ; Like ANYOFHb, but has a string field that gives the leading matchable UTF-8 bytes; flags field is len
 ANYOFR      ANYOFR,     packed 1  S  ; Matches any character in the range given by its packed args: upper 12 bits is the max delta from the base lower 20; the flags field contains the lowest matchable UTF-8 start byte
 ANYOFRb     ANYOFR,     packed 1  S ; Like ANYOFR, but all matches share the same UTF-8 start byte, given in the flags field
 # There is no ANYOFRr because khw doesn't think there are likely to be real-world cases where such a large range is used.
diff --git a/regnodes.h b/regnodes.h
index fa90f50b14..89f8ecc2e7 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -351,7 +351,7 @@ static const U8 regarglen[] = {
 	EXTRA_SIZE(struct regnode_1),        	/* ANYOFH       */
 	EXTRA_SIZE(struct regnode_1),        	/* ANYOFHb      */
 	EXTRA_SIZE(struct regnode_1),        	/* ANYOFHr      */
-	EXTRA_SIZE(struct regnode_anyofhs),  	/* ANYOFHs      */
+	EXTRA_SIZE(struct regnode_1),        	/* ANYOFHs      */
 	EXTRA_SIZE(struct regnode_1),        	/* ANYOFR       */
 	EXTRA_SIZE(struct regnode_1),        	/* ANYOFRb      */
 	EXTRA_SIZE(struct regnode_1),        	/* ANYOFM       */
diff --git a/t/re/pat.t b/t/re/pat.t
index ccf494c302..7d07d9981e 100644
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -25,7 +25,7 @@ BEGIN {
 skip_all('no re module') unless defined &DynaLoader::boot_DynaLoader;
 skip_all_without_unicode_tables();
 
-plan tests => 1005;  # Update this when adding/deleting tests.
+plan tests => 1011;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -2207,6 +2207,15 @@ SKIP:
         unlike("\x{4000004}", $pat, "4000004 isn't in pattern");
         like("\x{4000005}", $pat, "4000005 is in pattern");
         unlike("\x{4000006}", $pat, "4000006 isn't in pattern");
+
+        # gh #17319
+        $pat = qr/[\N{U+200D}\N{U+2000}]()/;
+        unlike("\x{1FFF}", $pat, "1FFF isn't in pattern");
+        like("\x{2000}", $pat, "2000 is in pattern");
+        unlike("\x{2001}", $pat, "2001 isn't in pattern");
+        unlike("\x{200C}", $pat, "200C isn't in pattern");
+        like("\x{200D}", $pat, "200 is in pattern");
+        unlike("\x{200E}", $pat, "200E isn't in pattern");
     }
 
 } # End of sub run_tests
author	Karl Williamson <khw@cpan.org>	2019-11-22 15:28:13 -0700
committer	Karl Williamson <khw@cpan.org>	2019-11-22 15:54:44 -0700
commit	53d42e43e359facdd83b313c1f4b70f9ff559a70 (patch)
tree	d1e8547bbdfa043aaa7cd7a23b58ffcbc25a8fdc
parent	008bb368ebc18adc42e95769e4ebbd7d5545ce3d (diff)
download	perl-53d42e43e359facdd83b313c1f4b70f9ff559a70.tar.gz