diff options
-rw-r--r-- | ChangeLog | 12 | ||||
-rw-r--r-- | include/ruby/encoding.h | 2 | ||||
-rw-r--r-- | re.c | 117 | ||||
-rw-r--r-- | string.c | 17 | ||||
-rw-r--r-- | test/ruby/test_m17n.rb | 210 |
5 files changed, 318 insertions, 40 deletions
@@ -1,3 +1,15 @@ +Sun Nov 25 22:21:35 2007 Tanaka Akira <akr@fsij.org> + + * include/ruby/encoding.h (rb_enc_str_asciionly_p): declared. + (rb_enc_str_asciicompat_p): defined. + + * re.c (rb_reg_initialize_str): use rb_enc_str_asciionly_p. + (rb_reg_quote): return ascii-8bit string if the argument is + ascii-only to generate encoding generic regexp if possible. + (rb_reg_s_union): fix encoding handling. [ruby-dev:32094] + + * string.c (rb_enc_str_asciionly_p): defined. + Sun Nov 25 12:12:03 2007 Eric Hodel <drbrain@segment7.net> * gem_prelude.rb: Import fast-loading gem_prelude.rb from RubyGems. diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index aaea7b3ca5..93fe77de2d 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -100,6 +100,8 @@ int rb_enc_tolower(int c, rb_encoding *enc); ID rb_intern3(const char*, long, rb_encoding*); int rb_enc_symname_p(const char*, rb_encoding*); int rb_enc_str_coderange(VALUE); +int rb_enc_str_asciionly_p(VALUE); +#define rb_enc_str_asciicompat_p(str) rb_enc_asciicompat(rb_enc_get(str)) VALUE rb_enc_from_encoding(rb_encoding *enc); rb_encoding *rb_enc_primary(void); rb_encoding *rb_enc_default(void); @@ -1268,7 +1268,7 @@ rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc, static int rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err) { - if (rb_enc_str_coderange(str) != ENC_CODERANGE_SINGLE) { + if (!rb_enc_str_asciionly_p(str)) { options |= ARG_ENCODING_FIXED; } return rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), rb_enc_get(str), @@ -1654,6 +1654,7 @@ rb_reg_quote(VALUE str) char *s, *send, *t; VALUE tmp; int c; + int ascii_only = rb_enc_str_asciionly_p(str); s = RSTRING_PTR(str); send = s + RSTRING_LEN(str); @@ -1677,11 +1678,17 @@ rb_reg_quote(VALUE str) goto meta_found; } } + if (ascii_only && rb_enc_get_index(str) != 0) { + str = rb_str_new3(str); + rb_enc_associate(str, rb_enc_from_index(0)); + } return str; meta_found: tmp = rb_str_new(0, RSTRING_LEN(str)*2); - rb_enc_copy(tmp, str); + if (!ascii_only) { + rb_enc_copy(tmp, str); + } t = RSTRING_PTR(tmp); /* copy upto metacharacter */ memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str)); @@ -1802,43 +1809,115 @@ rb_reg_s_union(VALUE self, VALUE args0) return rb_class_new_instance(1, args, rb_cRegexp); } else if (argc == 1) { - VALUE v; - v = rb_check_regexp_type(rb_ary_entry(args0, 0)); - if (!NIL_P(v)) - return v; + VALUE arg = rb_ary_entry(args0, 0); + VALUE re = rb_check_regexp_type(arg); + if (!NIL_P(re)) + return re; else { - VALUE args[1]; - args[0] = rb_reg_s_quote(Qnil, RARRAY_PTR(args0)[0]); - return rb_class_new_instance(1, args, rb_cRegexp); + VALUE quoted; + quoted = rb_reg_s_quote(Qnil, arg); + return rb_reg_new(quoted, 0); } } else { int i; VALUE source = rb_str_buf_new(0); - int mbs = Qfalse; - rb_encoding *enc = 0; + rb_encoding *enc; + + int has_asciionly_string = 0; + rb_encoding *has_ascii_compat_string = 0; + rb_encoding *has_ascii_incompat_string = 0; + + int has_generic_regexp = 0; + rb_encoding *has_ascii_compat_fixed_regexp = 0; + rb_encoding *has_ascii_incompat_regexp = 0; for (i = 0; i < argc; i++) { volatile VALUE v; VALUE e = rb_ary_entry(args0, i); + if (0 < i) - rb_str_buf_cat2(source, "|"); + rb_str_buf_cat2(source, "|"); /* xxx: UTF-16 */ + v = rb_check_regexp_type(e); if (!NIL_P(v)) { + rb_encoding *enc0 = rb_enc_get(v); + if (!rb_enc_asciicompat(enc0)) { + if (!has_ascii_incompat_regexp) { + has_ascii_incompat_regexp = enc0; + } + else { + if (has_ascii_incompat_regexp != enc0) + rb_raise(rb_eArgError, "regexp encodings differ"); + } + } + else if (ENCODING_GET(v) != 0 || FL_TEST(v, KCODE_FIXED)) { + if (!has_ascii_compat_fixed_regexp) { + has_ascii_compat_fixed_regexp = enc0; + } + else { + if (has_ascii_compat_fixed_regexp != enc0) + rb_raise(rb_eArgError, "regexp encodings differ"); + } + } + else { + has_generic_regexp = 1; + } v = rb_reg_to_s(v); } else { + StringValue(e); + if (!rb_enc_str_asciicompat_p(e)) { + rb_encoding *enc0 = rb_enc_get(e); + if (!has_ascii_incompat_string) { + has_ascii_incompat_string = enc0; + } + else { + if (has_ascii_incompat_string != enc0) + rb_raise(rb_eArgError, "regexp encodings differ"); + } + } + else if (rb_enc_str_asciionly_p(e)) { + has_asciionly_string = 1; + } + else { + rb_encoding *enc0 = rb_enc_get(e); + if (!has_ascii_compat_string) { + has_ascii_compat_string = enc0; + } + else { + if (has_ascii_compat_string != enc0) + rb_raise(rb_eArgError, "regexp encodings differ"); + } + } v = rb_reg_s_quote(Qnil, e); } - if (mbs || rb_enc_str_coderange(v) != ENC_CODERANGE_SINGLE) { - if (!enc) enc = rb_enc_get(v); - else if (mbs && enc != rb_enc_get(v)) { - rb_raise(rb_eArgError, "regexp encodings differ"); - } - mbs = Qtrue; - } rb_str_append(source, v); } + if (has_ascii_incompat_string || has_ascii_incompat_regexp) { + if (has_asciionly_string || has_ascii_compat_string || + has_generic_regexp || has_ascii_compat_fixed_regexp) + rb_raise(rb_eArgError, "regexp encodings differ"); + if (has_ascii_incompat_string && has_ascii_incompat_regexp && + has_ascii_incompat_string != has_ascii_incompat_regexp) + rb_raise(rb_eArgError, "regexp encodings differ"); + enc = has_ascii_incompat_string; + if (enc == 0) + enc = has_ascii_incompat_regexp; + } + else if (has_ascii_compat_string || has_ascii_compat_fixed_regexp) { + if (has_ascii_compat_string && has_ascii_compat_fixed_regexp && + has_ascii_compat_string != has_ascii_compat_fixed_regexp) + rb_raise(rb_eArgError, "regexp encodings differ"); + enc = has_ascii_compat_string; + if (enc == 0) + enc = has_ascii_compat_fixed_regexp; + } + else { + enc = rb_enc_from_index(0); + } + + rb_enc_associate(source, enc); return rb_class_new_instance(1, &source, rb_cRegexp); } } @@ -129,6 +129,23 @@ rb_enc_str_coderange(VALUE str) return cr; } +int rb_enc_str_asciionly_p(VALUE str) +{ + rb_encoding *enc = rb_enc_get(str); + + if (rb_enc_asciicompat(enc) && + rb_enc_str_coderange(str) == ENC_CODERANGE_SINGLE) { + char *ptr = RSTRING_PTR(str); + long len = RSTRING_LEN(str); + long i; + for (i = 0; i < len; i++) + if (ptr[i] & 0x80) + return Qfalse; + return Qtrue; + } + return Qfalse; +} + static inline void str_mod_check(VALUE s, char *p, long len) { diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb index c50c6b8384..61b4309469 100644 --- a/test/ruby/test_m17n.rb +++ b/test/ruby/test_m17n.rb @@ -46,30 +46,71 @@ class TestM17N < Test::Unit::TestCase #assert_raise(SyntaxError) { eval('/\xc0\x20/u') } end + def assert_regexp_generic_encoding(r) + %w[ASCII-8BIT EUC-JP Shift_JIS UTF-8].each {|ename| + # "\xc0\xa1" is a valid sequence for ASCII-8BIT, EUC-JP, Shift_JIS and UTF-8. + assert_nothing_raised { r =~ "\xc0\xa1".force_encoding(ename) } + } + end + + def assert_regexp_fixed_encoding(r) + %w[ASCII-8BIT EUC-JP Shift_JIS UTF-8].each {|ename| + enc = Encoding.find(ename) + if enc == r.encoding + assert_nothing_raised { r =~ "\xc0\xa1".force_encoding(enc) } + else + assert_raise(ArgumentError) { r =~ "\xc0\xa1".force_encoding(enc) } + end + } + end + + def assert_regexp_generic_ascii(r) + assert_encoding("ASCII-8BIT", r.encoding) + assert_regexp_generic_encoding(r) + end + + def assert_regexp_fixed_ascii8bit(r) + assert_encoding("ASCII-8BIT", r.encoding) + assert_regexp_fixed_encoding(r) + end + + def assert_regexp_fixed_eucjp(r) + assert_encoding("EUC-JP", r.encoding) + assert_regexp_fixed_encoding(r) + end + + def assert_regexp_fixed_sjis(r) + assert_encoding("Shift_JIS", r.encoding) + assert_regexp_fixed_encoding(r) + end + + def assert_regexp_fixed_utf8(r) + assert_encoding("UTF-8", r.encoding) + assert_regexp_fixed_encoding(r) + end + def test_regexp_generic r = /a/ - assert_encoding("ASCII-8BIT", r.encoding) + assert_regexp_generic_ascii(r) assert_equal(0, r =~ a("a")) assert_equal(0, r =~ e("a")) assert_equal(0, r =~ s("a")) assert_equal(0, r =~ u("a")) - - # "\xc0\xa1" is a valid sequence for ASCII-8BIT, EUC-JP, Shift_JIS and UTF-8. assert_equal(nil, r =~ a("\xc0\xa1")) assert_equal(nil, r =~ e("\xc0\xa1")) assert_equal(nil, r =~ s("\xc0\xa1")) assert_equal(nil, r =~ u("\xc0\xa1")) - r = eval(a(%{/\xc0\xa1/})) - assert_encoding("ASCII-8BIT", r.encoding) - assert_equal(nil, r =~ a("a")) - assert_equal(nil, r =~ e("a")) - assert_equal(nil, r =~ s("a")) - assert_equal(nil, r =~ u("a")) - assert_equal(0, r =~ a("\xc0\xa1")) - assert_raise(ArgumentError) { r =~ e("\xc0\xa1") } - assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } - assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + r = Regexp.new("a".force_encoding("ASCII-8BIT")) + assert_regexp_generic_ascii(r) + assert_equal(0, r =~ a("a")) + assert_equal(0, r =~ e("a")) + assert_equal(0, r =~ s("a")) + assert_equal(0, r =~ u("a")) + assert_equal(nil, r =~ a("\xc0\xa1")) + assert_equal(nil, r =~ e("\xc0\xa1")) + assert_equal(nil, r =~ s("\xc0\xa1")) + assert_equal(nil, r =~ u("\xc0\xa1")) # xxx: /\xc0\xa1/ should be restricted only for ASCII-8BIT? # r = /\xc0\xa1/ @@ -86,7 +127,7 @@ class TestM17N < Test::Unit::TestCase def test_regexp_ascii r = /a/n - assert_encoding("ASCII-8BIT", r.encoding) + assert_regexp_fixed_ascii8bit(r) assert_equal(0, r =~ a("a")) assert_equal(0, r =~ e("a")) assert_equal(0, r =~ s("a")) @@ -97,7 +138,18 @@ class TestM17N < Test::Unit::TestCase assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } r = /\xc0\xa1/n - assert_encoding("ASCII-8BIT", r.encoding) + assert_regexp_fixed_ascii8bit(r) + assert_equal(nil, r =~ a("a")) + assert_equal(nil, r =~ e("a")) + assert_equal(nil, r =~ s("a")) + assert_equal(nil, r =~ u("a")) + assert_equal(0, r =~ a("\xc0\xa1")) + assert_raise(ArgumentError) { r =~ e("\xc0\xa1") } + assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } + assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + + r = eval(a(%{/\xc0\xa1/})) + assert_regexp_fixed_ascii8bit(r) assert_equal(nil, r =~ a("a")) assert_equal(nil, r =~ e("a")) assert_equal(nil, r =~ s("a")) @@ -108,7 +160,7 @@ class TestM17N < Test::Unit::TestCase assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } r = eval(%{/\xc0\xa1/n}.force_encoding("ASCII-8BIT")) - assert_encoding("ASCII-8BIT", r.encoding) + assert_regexp_fixed_ascii8bit(r) assert_equal(nil, r =~ a("a")) assert_equal(nil, r =~ e("a")) assert_equal(nil, r =~ s("a")) @@ -119,7 +171,9 @@ class TestM17N < Test::Unit::TestCase assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } r = eval(%q{/\xc0\xa1/}.force_encoding("ASCII-8BIT")) + # assert_regexp_fixed_ascii8bit(r) assert_encoding("ASCII-8BIT", r.encoding) + # assert_regexp_fixed_encoding(r) assert_equal(nil, r =~ a("a")) assert_equal(nil, r =~ e("a")) assert_equal(nil, r =~ s("a")) @@ -128,12 +182,22 @@ class TestM17N < Test::Unit::TestCase # assert_raise(ArgumentError) { r =~ e("\xc0\xa1") } # assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } # assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } - end def test_regexp_euc r = /a/e - assert_encoding("EUC-JP", r.encoding) + assert_regexp_fixed_eucjp(r) + assert_equal(0, r =~ a("a")) + assert_equal(0, r =~ e("a")) + assert_equal(0, r =~ s("a")) + assert_equal(0, r =~ u("a")) + assert_raise(ArgumentError) { r =~ a("\xc0\xa1") } + assert_equal(nil, r =~ e("\xc0\xa1")) + assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } + assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + + r = Regexp.new("a".force_encoding("EUC-JP")) + assert_regexp_fixed_eucjp(r) assert_equal(0, r =~ a("a")) assert_equal(0, r =~ e("a")) assert_equal(0, r =~ s("a")) @@ -144,7 +208,7 @@ class TestM17N < Test::Unit::TestCase assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } r = /\xc0\xa1/e - assert_encoding("EUC-JP", r.encoding) + assert_regexp_fixed_eucjp(r) assert_equal(nil, r =~ a("a")) assert_equal(nil, r =~ e("a")) assert_equal(nil, r =~ s("a")) @@ -155,7 +219,7 @@ class TestM17N < Test::Unit::TestCase assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } r = eval(%{/\xc0\xa1/}.force_encoding("EUC-JP")) - assert_encoding("EUC-JP", r.encoding) + assert_regexp_fixed_eucjp(r) assert_equal(nil, r =~ a("a")) assert_equal(nil, r =~ e("a")) assert_equal(nil, r =~ s("a")) @@ -166,7 +230,7 @@ class TestM17N < Test::Unit::TestCase assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } r = eval(%q{/\xc0\xa1/}.force_encoding("EUC-JP")) - assert_encoding("EUC-JP", r.encoding) + assert_regexp_fixed_eucjp(r) assert_equal(nil, r =~ a("a")) assert_equal(nil, r =~ e("a")) assert_equal(nil, r =~ s("a")) @@ -175,6 +239,7 @@ class TestM17N < Test::Unit::TestCase assert_equal(0, r =~ e("\xc0\xa1")) assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + end def test_begin_end_offset @@ -198,4 +263,107 @@ class TestM17N < Test::Unit::TestCase assert_equal([1,2], $~.offset(0)) end + def test_quote + assert_regexp_generic_ascii(/#{Regexp.quote(a("a"))}#{Regexp.quote(e("e"))}/) + + # Regexp.quote returns ASCII-8BIT string for ASCII only string + # to make generic regexp if possible. + assert_encoding("ASCII-8BIT", Regexp.quote(a("")).encoding) + assert_encoding("ASCII-8BIT", Regexp.quote(e("")).encoding) + assert_encoding("ASCII-8BIT", Regexp.quote(s("")).encoding) + assert_encoding("ASCII-8BIT", Regexp.quote(u("")).encoding) + assert_encoding("ASCII-8BIT", Regexp.quote(a("a")).encoding) + assert_encoding("ASCII-8BIT", Regexp.quote(e("a")).encoding) + assert_encoding("ASCII-8BIT", Regexp.quote(s("a")).encoding) + assert_encoding("ASCII-8BIT", Regexp.quote(u("a")).encoding) + + assert_encoding("ASCII-8BIT", Regexp.quote(a("\xc0\xa1")).encoding) + assert_encoding("EUC-JP", Regexp.quote(e("\xc0\xa1")).encoding) + assert_encoding("Shift_JIS", Regexp.quote(s("\xc0\xa1")).encoding) + assert_encoding("UTF-8", Regexp.quote(u("\xc0\xa1")).encoding) + end + + def test_union_0 + r = Regexp.union + assert_regexp_generic_ascii(r) + assert(r !~ a("")) + assert(r !~ e("")) + assert(r !~ s("")) + assert(r !~ u("")) + end + + def test_union_1_asciionly_string + assert_regexp_generic_ascii(Regexp.union(a(""))) + assert_regexp_generic_ascii(Regexp.union(e(""))) + assert_regexp_generic_ascii(Regexp.union(s(""))) + assert_regexp_generic_ascii(Regexp.union(u(""))) + assert_regexp_generic_ascii(Regexp.union(a("a"))) + assert_regexp_generic_ascii(Regexp.union(e("a"))) + assert_regexp_generic_ascii(Regexp.union(s("a"))) + assert_regexp_generic_ascii(Regexp.union(u("a"))) + assert_regexp_generic_ascii(Regexp.union(a("\t"))) + assert_regexp_generic_ascii(Regexp.union(e("\t"))) + assert_regexp_generic_ascii(Regexp.union(s("\t"))) + assert_regexp_generic_ascii(Regexp.union(u("\t"))) + end + + def test_union_1_nonascii_string + assert_regexp_fixed_ascii8bit(Regexp.union(a("\xc0\xa1"))) + assert_regexp_fixed_eucjp(Regexp.union(e("\xc0\xa1"))) + assert_regexp_fixed_sjis(Regexp.union(s("\xc0\xa1"))) + assert_regexp_fixed_utf8(Regexp.union(u("\xc0\xa1"))) + end + + def test_union_1_regexp + assert_regexp_generic_ascii(Regexp.union(//)) + assert_regexp_fixed_ascii8bit(Regexp.union(//n)) + assert_regexp_fixed_eucjp(Regexp.union(//e)) + assert_regexp_fixed_sjis(Regexp.union(//s)) + assert_regexp_fixed_utf8(Regexp.union(//u)) + end + + def test_union_2_asciionly_strings + ary = [a(""), e(""), s(""), u("")] + ary.each {|s1| + ary.each {|s2| + assert_regexp_generic_ascii(Regexp.union(s1, s2)) + } + } + end + + def test_union_2_strings + ary = [ + a(""), e(""), s(""), u(""), + a("\xc0\xa1"), e("\xc0\xa1"), s("\xc0\xa1"), u("\xc0\xa1") + ] + ary.each {|s1| + ary.each {|s2| + if s1.empty? + if s2.empty? + assert_regexp_generic_ascii(Regexp.union(s1, s2)) + else + r = Regexp.union(s1, s2) + assert_regexp_fixed_encoding(r) + assert_equal(s2.encoding, r.encoding) + end + else + if s2.empty? + r = Regexp.union(s1, s2) + assert_regexp_fixed_encoding(r) + assert_equal(s1.encoding, r.encoding) + else + if s1.encoding == s2.encoding + r = Regexp.union(s1, s2) + assert_regexp_fixed_encoding(r) + assert_equal(s1.encoding, r.encoding) + else + assert_raise(ArgumentError) { Regexp.union(s1, s2) } + end + end + end + } + } + end + + end |