merge revision(s) eccfc978fd6f65332eb70c9a46fbb4d5110bbe0a: [Backport #19379]

Fix parsing of regexps that toggle extended mode on/off inside regexp

This was broken in ec3542229b29ec93062e9d90e877ea29d3c19472. That commit didn't handle cases where extended mode was turned on/off inside the regexp. There are two ways to turn extended mode on/off:

```
/(?-x:#y)#z
/x =~ '#y'

/(?-x)#y(?x)#z
/x =~ '#y'
```

These can be nested inside the same regexp:

```
/(?-x:(?x)#x
(?-x)#y)#z
/x =~ '#y'
```

As you can probably imagine, this makes handling these regexps somewhat complex. Due to the nesting inside portions of regexps, the unescape_nonascii function needs to be recursive. In recursive mode, it needs to track both opening and closing parentheses, similar to how it already tracked opening and closing brackets for character classes.

When scanning the regexp and coming to `(?` not followed by `#`, scan for options, and use `x` and `-` to determine whether to turn extended mode on or off. For `:`, indicating that only the current regexp section should have the extended mode switched, recurse with the extended mode set or unset. For `)`, indicating that the remainder of the regexp (or of the current regexp portion, if already recursing) should have extended mode turned on or off, just change the extended mode flag and keep scanning.

While testing this, I noticed that `a`, `d`, and `u` are accepted as options, in addition to `i`, `m`, and `x`, but I can't see where those options are documented. I'm not sure whether or not handling `a`, `d`, and `u` as options is a bug.

Fixes [Bug #19379]

---
 re.c                     | 153 +++++++++++++++++++++++++++++++++++++----------
 test/ruby/test_regexp.rb |  56 +++++++++++++++++
 2 files changed, 176 insertions(+), 33 deletions(-)
author: NARUSE, Yui <naruse@airemix.jp> 2023-01-31 15:28:01 +0900
committer: NARUSE, Yui <naruse@airemix.jp> 2023-01-31 15:28:01 +0900
commit: ca75332f46c39804e06cd37c2608cbdef0aebf05 (patch)
tree: 1f185f23ff0e2349a56b2ac4ea109635df71ea85
parent: 5a2b28909ece2e1310250180f097bfcb7b0203dc (diff)
download: ruby-ca75332f46c39804e06cd37c2608cbdef0aebf05.tar.gz
3 files changed, 177 insertions, 34 deletions
diff --git a/re.c b/re.c
index 7a74318558..65317a4a5f 100644
--- a/re.c
+++ b/re.c
@@ -2801,14 +2801,18 @@ unescape_unicode_bmp(const char **pp, const char *end,
 }
 
 static int
-unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
+unescape_nonascii0(const char **pp, const char *end, rb_encoding *enc,
         VALUE buf, rb_encoding **encp, int *has_property,
-        onig_errmsg_buffer err, int options)
+        onig_errmsg_buffer err, int options, int recurse)
 {
+    const char *p = *pp;
     unsigned char c;
     char smallbuf[2];
     int in_char_class = 0;
+    int parens = 1; /* ignored unless recurse is true */
+    int extended_mode = options & ONIG_OPTION_EXTEND;
 
+begin_scan:
     while (p < end) {
         int chlen = rb_enc_precise_mbclen(p, end, enc);
         if (!MBCLEN_CHARFOUND_P(chlen)) {
@@ -2920,7 +2924,7 @@ escape_asis:
             break;
 
           case '#':
-            if ((options & ONIG_OPTION_EXTEND) && !in_char_class) {
+            if (extended_mode && !in_char_class) {
                 /* consume and ignore comment in extended regexp */
                 while ((p < end) && ((c = *p++) != '\n'));
                 break;
@@ -2937,51 +2941,134 @@ escape_asis:
             }
             rb_str_buf_cat(buf, (char *)&c, 1);
             break;
+          case ')':
+            rb_str_buf_cat(buf, (char *)&c, 1);
+            if (!in_char_class && recurse) {
+                if (--parens == 0) {
+                    *pp = p;
+                    return 0;
+                }
+            }
+            break;
           case '(':
-            if (!in_char_class && p + 1 < end && *p == '?' && *(p+1) == '#') {
-                /* (?# is comment inside any regexp, and content inside should be ignored */
-                const char *orig_p = p;
-                int cont = 1;
-
-                while (cont && (p < end)) {
-                    switch (c = *p++) {
-                      default:
-                        if (!(c & 0x80)) break;
-                        --p;
-                        /* fallthrough */
-                      case '\\':
-                        chlen = rb_enc_precise_mbclen(p, end, enc);
-                        if (!MBCLEN_CHARFOUND_P(chlen)) {
-                            goto invalid_multibyte;
+            if (!in_char_class && p + 1 < end && *p == '?') {
+                if (*(p+1) == '#') {
+                    /* (?# is comment inside any regexp, and content inside should be ignored */
+                    const char *orig_p = p;
+                    int cont = 1;
+
+                    while (cont && (p < end)) {
+                        switch (c = *p++) {
+                          default:
+                            if (!(c & 0x80)) break;
+                            --p;
+                            /* fallthrough */
+                          case '\\':
+                            chlen = rb_enc_precise_mbclen(p, end, enc);
+                            if (!MBCLEN_CHARFOUND_P(chlen)) {
+                                goto invalid_multibyte;
+                            }
+                            p += MBCLEN_CHARFOUND_LEN(chlen);
+                            break;
+                          case ')':
+                            cont = 0;
+                            break;
                         }
-                        p += MBCLEN_CHARFOUND_LEN(chlen);
-                        break;
-                      case ')':
-                        cont = 0;
-                        break;
                     }
-                }
 
-                if (cont) {
-                    /* unterminated (?#, rewind so it is syntax error */
-                    p = orig_p;
-                    c = '(';
-                    rb_str_buf_cat(buf, (char *)&c, 1);
+                    if (cont) {
+                        /* unterminated (?#, rewind so it is syntax error */
+                        p = orig_p;
+                        c = '(';
+                        rb_str_buf_cat(buf, (char *)&c, 1);
+                    }
+                    break;
+                } else {
+                    /* potential change of extended option */
+                    int invert = 0;
+                    int local_extend = 0;
+                    const char *s;
+
+                    if (recurse) {
+                        parens++;
+                    }
+
+                    for(s = p+1; s < end; s++) {
+                        switch(*s) {
+                            case 'x':
+                                local_extend = invert ? -1 : 1;
+                                break;
+                            case '-':
+                                invert = 1;
+                                break;
+                            case ':':
+                            case ')':
+                                if (local_extend == 0 ||
+                                    (local_extend == -1 && !extended_mode) ||
+                                    (local_extend == 1 && extended_mode)) {
+                                    /* no changes to extended flag */
+                                    goto fallthrough;
+                                }
+
+                                if (*s == ':') {
+                                    /* change extended flag until ')' */
+                                    int local_options = options;
+                                    if (local_extend == 1) {
+                                         local_options |= ONIG_OPTION_EXTEND;
+                                    } else {
+                                         local_options &= ~ONIG_OPTION_EXTEND;
+                                    }
+
+                                    rb_str_buf_cat(buf, (char *)&c, 1);
+                                    int ret = unescape_nonascii0(&p, end, enc, buf, encp,
+                                                                has_property, err,
+                                                                local_options, 1);
+                                    if (ret < 0) return ret;
+                                    goto begin_scan;
+                                } else {
+                                    /* change extended flag for rest of expression */
+                                    extended_mode = local_extend == 1;
+                                    goto fallthrough;
+                                }
+                            case 'i':
+                            case 'm':
+                            case 'a':
+                            case 'd':
+                            case 'u':
+                                /* other option flags, ignored during scanning */
+                                break;
+                            default:
+                                /* other character, no extended flag change*/
+                                goto fallthrough;
+                        }
+                    }
                 }
+            } else if (!in_char_class && recurse) {
+                parens++;
             }
-            else {
-                rb_str_buf_cat(buf, (char *)&c, 1);
-            }
-            break;
+            /* FALLTHROUGH */
           default:
+fallthrough:
             rb_str_buf_cat(buf, (char *)&c, 1);
             break;
         }
     }
 
+    if (recurse) {
+        *pp = p;
+    }
     return 0;
 }
 
+static int
+unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
+        VALUE buf, rb_encoding **encp, int *has_property,
+        onig_errmsg_buffer err, int options)
+{
+    return unescape_nonascii0(&p, end, enc, buf, encp, has_property,
+                              err, options, 0);
+}
+
 static VALUE
 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
         rb_encoding **fixed_enc, onig_errmsg_buffer err, int options)
diff --git a/test/ruby/test_regexp.rb b/test/ruby/test_regexp.rb
index 98bf41d2f1..c871580aeb 100644
--- a/test/ruby/test_regexp.rb
+++ b/test/ruby/test_regexp.rb
@@ -144,6 +144,62 @@ class TestRegexp < Test::Unit::TestCase
     assert_raise(SyntaxError) {eval "/# \\users/"}
   end
 
+  def test_nonextended_section_of_extended_regexp_bug_19379
+    assert_separately([], <<-'RUBY')
+      re = /(?-x:#)/x
+      assert_match(re, '#')
+      assert_not_match(re, '-')
+
+      re = /(?xi:#
+      y)/
+      assert_match(re, 'Y')
+      assert_not_match(re, '-')
+
+      re = /(?mix:#
+      y)/
+      assert_match(re, 'Y')
+      assert_not_match(re, '-')
+
+      re = /(?x-im:#
+      y)/i
+      assert_match(re, 'y')
+      assert_not_match(re, 'Y')
+
+      re = /(?-imx:(?xim:#
+      y))/x
+      assert_match(re, 'y')
+      assert_not_match(re, '-')
+
+      re = /(?x)#
+      y/
+      assert_match(re, 'y')
+      assert_not_match(re, 'Y')
+
+      re = /(?mx-i)#
+      y/i
+      assert_match(re, 'y')
+      assert_not_match(re, 'Y')
+
+      re = /(?-imx:(?xim:#
+      (?-x)y#))/x
+      assert_match(re, 'Y#')
+      assert_not_match(re, '-#')
+
+      re = /(?imx:#
+      (?-xim:#(?im)#(?x)#
+      )#
+      (?x)#
+      y)/
+      assert_match(re, '###Y')
+      assert_not_match(re, '###-')
+
+      re = %r{#c-\w+/comment/[\w-]+}
+      re = %r{https?://[^/]+#{re}}x
+      assert_match(re, 'http://foo#c-x/comment/bar')
+      assert_not_match(re, 'http://foo#cx/comment/bar')
+    RUBY
+  end
+
   def test_union
     assert_equal :ok, begin
       Regexp.union(
diff --git a/version.h b/version.h
index a26c469164..2dc130ab2b 100644
--- a/version.h
+++ b/version.h
@@ -11,7 +11,7 @@
 # define RUBY_VERSION_MINOR RUBY_API_VERSION_MINOR
 #define RUBY_VERSION_TEENY 0
 #define RUBY_RELEASE_DATE RUBY_RELEASE_YEAR_STR"-"RUBY_RELEASE_MONTH_STR"-"RUBY_RELEASE_DAY_STR
-#define RUBY_PATCHLEVEL 23
+#define RUBY_PATCHLEVEL 24
 
 #include "ruby/version.h"
 #include "ruby/internal/abi.h"
author	NARUSE, Yui <naruse@airemix.jp>	2023-01-31 15:28:01 +0900
committer	NARUSE, Yui <naruse@airemix.jp>	2023-01-31 15:28:01 +0900
commit	ca75332f46c39804e06cd37c2608cbdef0aebf05 (patch)
tree	1f185f23ff0e2349a56b2ac4ea109635df71ea85
parent	5a2b28909ece2e1310250180f097bfcb7b0203dc (diff)
download	ruby-ca75332f46c39804e06cd37c2608cbdef0aebf05.tar.gz