diff options
-rw-r--r-- | re.c | 153 | ||||
-rw-r--r-- | test/ruby/test_regexp.rb | 56 |
2 files changed, 176 insertions, 33 deletions
@@ -2801,14 +2801,18 @@ unescape_unicode_bmp(const char **pp, const char *end, } static int -unescape_nonascii(const char *p, const char *end, rb_encoding *enc, +unescape_nonascii0(const char **pp, const char *end, rb_encoding *enc, VALUE buf, rb_encoding **encp, int *has_property, - onig_errmsg_buffer err, int options) + onig_errmsg_buffer err, int options, int recurse) { + const char *p = *pp; unsigned char c; char smallbuf[2]; int in_char_class = 0; + int parens = 1; /* ignored unless recurse is true */ + int extended_mode = options & ONIG_OPTION_EXTEND; +begin_scan: while (p < end) { int chlen = rb_enc_precise_mbclen(p, end, enc); if (!MBCLEN_CHARFOUND_P(chlen)) { @@ -2920,7 +2924,7 @@ escape_asis: break; case '#': - if ((options & ONIG_OPTION_EXTEND) && !in_char_class) { + if (extended_mode && !in_char_class) { /* consume and ignore comment in extended regexp */ while ((p < end) && ((c = *p++) != '\n')); break; @@ -2937,51 +2941,134 @@ escape_asis: } rb_str_buf_cat(buf, (char *)&c, 1); break; + case ')': + rb_str_buf_cat(buf, (char *)&c, 1); + if (!in_char_class && recurse) { + if (--parens == 0) { + *pp = p; + return 0; + } + } + break; case '(': - if (!in_char_class && p + 1 < end && *p == '?' && *(p+1) == '#') { - /* (?# is comment inside any regexp, and content inside should be ignored */ - const char *orig_p = p; - int cont = 1; - - while (cont && (p < end)) { - switch (c = *p++) { - default: - if (!(c & 0x80)) break; - --p; - /* fallthrough */ - case '\\': - chlen = rb_enc_precise_mbclen(p, end, enc); - if (!MBCLEN_CHARFOUND_P(chlen)) { - goto invalid_multibyte; + if (!in_char_class && p + 1 < end && *p == '?') { + if (*(p+1) == '#') { + /* (?# is comment inside any regexp, and content inside should be ignored */ + const char *orig_p = p; + int cont = 1; + + while (cont && (p < end)) { + switch (c = *p++) { + default: + if (!(c & 0x80)) break; + --p; + /* fallthrough */ + case '\\': + chlen = rb_enc_precise_mbclen(p, end, enc); + if (!MBCLEN_CHARFOUND_P(chlen)) { + goto invalid_multibyte; + } + p += MBCLEN_CHARFOUND_LEN(chlen); + break; + case ')': + cont = 0; + break; } - p += MBCLEN_CHARFOUND_LEN(chlen); - break; - case ')': - cont = 0; - break; } - } - if (cont) { - /* unterminated (?#, rewind so it is syntax error */ - p = orig_p; - c = '('; - rb_str_buf_cat(buf, (char *)&c, 1); + if (cont) { + /* unterminated (?#, rewind so it is syntax error */ + p = orig_p; + c = '('; + rb_str_buf_cat(buf, (char *)&c, 1); + } + break; + } else { + /* potential change of extended option */ + int invert = 0; + int local_extend = 0; + const char *s; + + if (recurse) { + parens++; + } + + for(s = p+1; s < end; s++) { + switch(*s) { + case 'x': + local_extend = invert ? -1 : 1; + break; + case '-': + invert = 1; + break; + case ':': + case ')': + if (local_extend == 0 || + (local_extend == -1 && !extended_mode) || + (local_extend == 1 && extended_mode)) { + /* no changes to extended flag */ + goto fallthrough; + } + + if (*s == ':') { + /* change extended flag until ')' */ + int local_options = options; + if (local_extend == 1) { + local_options |= ONIG_OPTION_EXTEND; + } else { + local_options &= ~ONIG_OPTION_EXTEND; + } + + rb_str_buf_cat(buf, (char *)&c, 1); + int ret = unescape_nonascii0(&p, end, enc, buf, encp, + has_property, err, + local_options, 1); + if (ret < 0) return ret; + goto begin_scan; + } else { + /* change extended flag for rest of expression */ + extended_mode = local_extend == 1; + goto fallthrough; + } + case 'i': + case 'm': + case 'a': + case 'd': + case 'u': + /* other option flags, ignored during scanning */ + break; + default: + /* other character, no extended flag change*/ + goto fallthrough; + } + } } + } else if (!in_char_class && recurse) { + parens++; } - else { - rb_str_buf_cat(buf, (char *)&c, 1); - } - break; + /* FALLTHROUGH */ default: +fallthrough: rb_str_buf_cat(buf, (char *)&c, 1); break; } } + if (recurse) { + *pp = p; + } return 0; } +static int +unescape_nonascii(const char *p, const char *end, rb_encoding *enc, + VALUE buf, rb_encoding **encp, int *has_property, + onig_errmsg_buffer err, int options) +{ + return unescape_nonascii0(&p, end, enc, buf, encp, has_property, + err, options, 0); +} + static VALUE rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc, rb_encoding **fixed_enc, onig_errmsg_buffer err, int options) diff --git a/test/ruby/test_regexp.rb b/test/ruby/test_regexp.rb index 98bf41d2f1..c871580aeb 100644 --- a/test/ruby/test_regexp.rb +++ b/test/ruby/test_regexp.rb @@ -144,6 +144,62 @@ class TestRegexp < Test::Unit::TestCase assert_raise(SyntaxError) {eval "/# \\users/"} end + def test_nonextended_section_of_extended_regexp_bug_19379 + assert_separately([], <<-'RUBY') + re = /(?-x:#)/x + assert_match(re, '#') + assert_not_match(re, '-') + + re = /(?xi:# + y)/ + assert_match(re, 'Y') + assert_not_match(re, '-') + + re = /(?mix:# + y)/ + assert_match(re, 'Y') + assert_not_match(re, '-') + + re = /(?x-im:# + y)/i + assert_match(re, 'y') + assert_not_match(re, 'Y') + + re = /(?-imx:(?xim:# + y))/x + assert_match(re, 'y') + assert_not_match(re, '-') + + re = /(?x)# + y/ + assert_match(re, 'y') + assert_not_match(re, 'Y') + + re = /(?mx-i)# + y/i + assert_match(re, 'y') + assert_not_match(re, 'Y') + + re = /(?-imx:(?xim:# + (?-x)y#))/x + assert_match(re, 'Y#') + assert_not_match(re, '-#') + + re = /(?imx:# + (?-xim:#(?im)#(?x)# + )# + (?x)# + y)/ + assert_match(re, '###Y') + assert_not_match(re, '###-') + + re = %r{#c-\w+/comment/[\w-]+} + re = %r{https?://[^/]+#{re}}x + assert_match(re, 'http://foo#c-x/comment/bar') + assert_not_match(re, 'http://foo#cx/comment/bar') + RUBY + end + def test_union assert_equal :ok, begin Regexp.union( |