diff options
author | Koichi Sasada <ko1@atdot.net> | 2022-12-15 17:53:55 +0900 |
---|---|---|
committer | Koichi Sasada <ko1@atdot.net> | 2022-12-16 10:04:37 +0900 |
commit | ae19ac5b5ba1e613a788addeb6d4d67fa65f6518 (patch) | |
tree | a431cdff5e73e1b48edb1a2f0b30578162736e68 | |
parent | 15b60bb1a48b98e2ba267c88551803df5f88355e (diff) | |
download | ruby-ae19ac5b5ba1e613a788addeb6d4d67fa65f6518.tar.gz |
fixed encoding table
This reduces global lock acquiring for reading.
https://bugs.ruby-lang.org/issues/18949
-rw-r--r-- | NEWS.md | 2 | ||||
-rw-r--r-- | bootstraptest/test_ractor.rb | 4 | ||||
-rw-r--r-- | encoding.c | 154 | ||||
-rw-r--r-- | test/ruby/test_encoding.rb | 16 |
4 files changed, 61 insertions, 115 deletions
@@ -202,6 +202,8 @@ Note: We're only listing outstanding class updates. try to dynamically guess the endian based on a byte order mark. Use `Encoding::UTF_16BE`/`UTF_16LE` and `Encoding::UTF_32BE`/`UTF_32LE` instead. This change speeds up getting the encoding of a String. [[Feature #18949]] + * Limit maximum encoding set size by 256. + If exceeding maximum size, `EncodingError` will be raised. [[Feature #18949]] * Enumerator diff --git a/bootstraptest/test_ractor.rb b/bootstraptest/test_ractor.rb index deffa476e5..4f659374f2 100644 --- a/bootstraptest/test_ractor.rb +++ b/bootstraptest/test_ractor.rb @@ -1474,7 +1474,7 @@ assert_equal "#{N}#{N}", %Q{ } # enc_table -assert_equal "#{N/10}", %Q{ +assert_equal "100", %Q{ Ractor.new do loop do Encoding.find("test-enc-#{rand(5_000)}").inspect @@ -1483,7 +1483,7 @@ assert_equal "#{N/10}", %Q{ end src = Encoding.find("UTF-8") - #{N/10}.times{|i| + 100.times{|i| src.replicate("test-enc-\#{i}") } } diff --git a/encoding.c b/encoding.c index b8e7f790b8..7c09939ef4 100644 --- a/encoding.c +++ b/encoding.c @@ -56,9 +56,8 @@ int rb_encdb_alias(const char *alias, const char *orig); static ID id_encoding; VALUE rb_cEncoding; -#define DEFAULT_ENCODING_LIST_CAPA 128 -static VALUE rb_default_encoding_list; -static VALUE rb_additional_encoding_list; +#define ENCODING_LIST_CAPA 256 +static VALUE rb_encoding_list; struct rb_encoding_entry { const char *name; @@ -67,9 +66,8 @@ struct rb_encoding_entry { }; static struct enc_table { - struct rb_encoding_entry *list; + struct rb_encoding_entry list[ENCODING_LIST_CAPA]; int count; - int size; st_table *names; } global_enc_table; @@ -128,47 +126,25 @@ enc_new(rb_encoding *encoding) static void enc_list_update(int index, rb_raw_encoding *encoding) { - if (index < DEFAULT_ENCODING_LIST_CAPA) { - VALUE list = rb_default_encoding_list; - if (list && NIL_P(rb_ary_entry(list, index))) { - /* initialize encoding data */ - rb_ary_store(list, index, enc_new(encoding)); - } - } - else { - RB_VM_LOCK_ENTER(); - { - VALUE list = rb_additional_encoding_list; - if (list && NIL_P(rb_ary_entry(list, index))) { - /* initialize encoding data */ - rb_ary_store(list, index - DEFAULT_ENCODING_LIST_CAPA, enc_new(encoding)); - } - } - RB_VM_LOCK_LEAVE(); + RUBY_ASSERT(index < ENCODING_LIST_CAPA); + + VALUE list = rb_encoding_list; + if (list && NIL_P(rb_ary_entry(list, index))) { + /* initialize encoding data */ + rb_ary_store(list, index, enc_new(encoding)); } } static VALUE enc_list_lookup(int idx) { - VALUE list, enc; + VALUE list, enc = Qnil; - if (idx < DEFAULT_ENCODING_LIST_CAPA) { - if (!(list = rb_default_encoding_list)) { - rb_bug("rb_enc_from_encoding_index(%d): no rb_default_encoding_list", idx); - } + if (idx < ENCODING_LIST_CAPA) { + list = rb_encoding_list; + RUBY_ASSERT(list); enc = rb_ary_entry(list, idx); } - else { - RB_VM_LOCK_ENTER(); - { - if (!(list = rb_additional_encoding_list)) { - rb_bug("rb_enc_from_encoding_index(%d): no rb_additional_encoding_list", idx); - } - enc = rb_ary_entry(list, idx - DEFAULT_ENCODING_LIST_CAPA); - } - RB_VM_LOCK_LEAVE(); - } if (NIL_P(enc)) { rb_bug("rb_enc_from_encoding_index(%d): not created yet", idx); @@ -345,16 +321,10 @@ rb_find_encoding(VALUE enc) static int enc_table_expand(struct enc_table *enc_table, int newsize) { - struct rb_encoding_entry *ent; - int count = newsize; - - if (enc_table->size >= newsize) return newsize; - newsize = (newsize + 7) / 8 * 8; - ent = REALLOC_N(enc_table->list, struct rb_encoding_entry, newsize); - memset(ent + enc_table->size, 0, sizeof(*ent)*(newsize - enc_table->size)); - enc_table->list = ent; - enc_table->size = newsize; - return count; + if (newsize > ENCODING_LIST_CAPA) { + rb_raise(rb_eEncodingError, "too many encoding (> %d)", ENCODING_LIST_CAPA); + } + return newsize; } static int @@ -413,17 +383,7 @@ enc_from_index(struct enc_table *enc_table, int index) rb_encoding * rb_enc_from_index(int index) { - rb_encoding *enc; - - switch (index) { - case ENCINDEX_ASCII_8BIT: return global_enc_ascii; - case ENCINDEX_UTF_8: return global_enc_utf_8; - case ENCINDEX_US_ASCII: return global_enc_us_ascii; - default: - GLOBAL_ENC_TABLE_EVAL(enc_table, - enc = enc_from_index(enc_table, index)); - return enc; - } + return enc_from_index(&global_enc_table, index); } int @@ -484,11 +444,14 @@ rb_encdb_declare(const char *name) } static void -enc_check_duplication(struct enc_table *enc_table, const char *name) +enc_check_addable(struct enc_table *enc_table, const char *name) { if (enc_registered(enc_table, name) >= 0) { rb_raise(rb_eArgError, "encoding %s is already registered", name); } + else if (!valid_encoding_name_p(name)) { + rb_raise(rb_eArgError, "invalid encoding name: %s", name); + } } static rb_encoding* @@ -524,11 +487,7 @@ rb_enc_set_base(const char *name, const char *orig) int rb_enc_set_dummy(int index) { - rb_encoding *enc; - - GLOBAL_ENC_TABLE_EVAL(enc_table, - enc = enc_table->list[index].enc); - + rb_encoding *enc = global_enc_table.list[index].enc; ENC_SET_DUMMY((rb_raw_encoding *)enc); return index; } @@ -538,7 +497,7 @@ enc_replicate(struct enc_table *enc_table, const char *name, rb_encoding *encodi { int idx; - enc_check_duplication(enc_table, name); + enc_check_addable(enc_table, name); idx = enc_register(enc_table, name, encoding); if (idx < 0) rb_raise(rb_eArgError, "invalid encoding name: %s", name); set_base_encoding(enc_table, idx, encoding); @@ -727,7 +686,7 @@ rb_enc_alias(const char *alias, const char *orig) GLOBAL_ENC_TABLE_ENTER(enc_table); { - enc_check_duplication(enc_table, alias); + enc_check_addable(enc_table, alias); if ((idx = rb_enc_find_index(orig)) < 0) { r = -1; } @@ -764,7 +723,7 @@ rb_enc_init(struct enc_table *enc_table) { enc_table_expand(enc_table, ENCODING_COUNT + 1); if (!enc_table->names) { - enc_table->names = st_init_strcasetable(); + enc_table->names = st_init_strcasetable_with_size(ENCODING_LIST_CAPA); } #define OnigEncodingASCII_8BIT OnigEncodingASCII #define ENC_REGISTER(enc) enc_register_at(enc_table, ENCINDEX_##enc, rb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc) @@ -877,11 +836,9 @@ rb_enc_autoload(rb_encoding *enc) int rb_enc_find_index(const char *name) { - int i; + int i = enc_registered(&global_enc_table, name); rb_encoding *enc; - GLOBAL_ENC_TABLE_EVAL(enc_table, i = enc_registered(enc_table, name)); - if (i < 0) { i = load_encoding(name); } @@ -1368,10 +1325,7 @@ enc_names(VALUE self) args[0] = (VALUE)rb_to_encoding_index(self); args[1] = rb_ary_new2(0); - - GLOBAL_ENC_TABLE_EVAL(enc_table, - st_foreach(enc_table->names, enc_names_i, (st_data_t)args)); - + st_foreach(global_enc_table.names, enc_names_i, (st_data_t)args); return args[1]; } @@ -1397,14 +1351,7 @@ static VALUE enc_list(VALUE klass) { VALUE ary = rb_ary_new2(0); - - RB_VM_LOCK_ENTER(); - { - rb_ary_replace(ary, rb_default_encoding_list); - rb_ary_concat(ary, rb_additional_encoding_list); - } - RB_VM_LOCK_LEAVE(); - + rb_ary_replace(ary, rb_encoding_list); return ary; } @@ -1553,15 +1500,17 @@ rb_locale_encindex(void) if (idx < 0) idx = ENCINDEX_UTF_8; - GLOBAL_ENC_TABLE_ENTER(enc_table); - if (enc_registered(enc_table, "locale") < 0) { + if (enc_registered(&global_enc_table, "locale") < 0) { # if defined _WIN32 void Init_w32_codepage(void); Init_w32_codepage(); # endif - enc_alias_internal(enc_table, "locale", idx); + GLOBAL_ENC_TABLE_ENTER(enc_table); + { + enc_alias_internal(enc_table, "locale", idx); + } + GLOBAL_ENC_TABLE_LEAVE(); } - GLOBAL_ENC_TABLE_LEAVE(); return idx; } @@ -1575,13 +1524,8 @@ rb_locale_encoding(void) int rb_filesystem_encindex(void) { - int idx; - - GLOBAL_ENC_TABLE_EVAL(enc_table, - idx = enc_registered(enc_table, "filesystem")); - - if (idx < 0) - idx = ENCINDEX_ASCII_8BIT; + int idx = enc_registered(&global_enc_table, "filesystem"); + if (idx < 0) idx = ENCINDEX_ASCII_8BIT; return idx; } @@ -1872,15 +1816,8 @@ rb_enc_name_list_i(st_data_t name, st_data_t idx, st_data_t arg) static VALUE rb_enc_name_list(VALUE klass) { - VALUE ary; - - GLOBAL_ENC_TABLE_ENTER(enc_table); - { - ary = rb_ary_new2(enc_table->names->num_entries); - st_foreach(enc_table->names, rb_enc_name_list_i, (st_data_t)ary); - } - GLOBAL_ENC_TABLE_LEAVE(); - + VALUE ary = rb_ary_new2(global_enc_table.names->num_entries); + st_foreach(global_enc_table.names, rb_enc_name_list_i, (st_data_t)ary); return ary; } @@ -1926,8 +1863,7 @@ rb_enc_aliases(VALUE klass) aliases[0] = rb_hash_new(); aliases[1] = rb_ary_new(); - GLOBAL_ENC_TABLE_EVAL(enc_table, - st_foreach(enc_table->names, rb_enc_aliases_enc_i, (st_data_t)aliases)); + st_foreach(global_enc_table.names, rb_enc_aliases_enc_i, (st_data_t)aliases); return aliases[0]; } @@ -1996,13 +1932,7 @@ Init_Encoding(void) struct enc_table *enc_table = &global_enc_table; - if (DEFAULT_ENCODING_LIST_CAPA < enc_table->count) rb_bug("DEFAULT_ENCODING_LIST_CAPA is too small"); - - list = rb_additional_encoding_list = rb_ary_new(); - RBASIC_CLEAR_CLASS(list); - rb_gc_register_mark_object(list); - - list = rb_default_encoding_list = rb_ary_new2(DEFAULT_ENCODING_LIST_CAPA); + list = rb_encoding_list = rb_ary_new2(ENCODING_LIST_CAPA); RBASIC_CLEAR_CLASS(list); rb_gc_register_mark_object(list); @@ -2024,5 +1954,5 @@ Init_encodings(void) void rb_enc_foreach_name(int (*func)(st_data_t name, st_data_t idx, st_data_t arg), st_data_t arg) { - GLOBAL_ENC_TABLE_EVAL(enc_table, st_foreach(enc_table->names, func, arg)); + st_foreach(global_enc_table.names, func, arg); } diff --git a/test/ruby/test_encoding.rb b/test/ruby/test_encoding.rb index 64af8b488a..cde2951413 100644 --- a/test/ruby/test_encoding.rb +++ b/test/ruby/test_encoding.rb @@ -69,7 +69,7 @@ class TestEncoding < Test::Unit::TestCase def test_extra_encoding assert_separately([], "#{<<~"begin;"}\n#{<<~'end;'}") begin; - 200.times {|i| + 100.times {|i| EnvUtil.suppress_warning { Encoding::UTF_8.replicate("dummy#{i}") } } e = Encoding.list.last @@ -160,4 +160,18 @@ class TestEncoding < Test::Unit::TestCase end end; end + + def test_exceed_encoding_table_size + assert_separately(%w[--disable=gems], "#{<<~"begin;"}\n#{<<~'end;'}") + begin; + begin + enc = Encoding::UTF_8 + 1_000.times{|i| EnvUtil.suppress_warning{ enc.replicate("R_#{i}") } } # now 256 entries + rescue EncodingError => e + assert_match(/too many encoding/, e.message) + else + assert false + end + end; + end end |