From 6525b6f760ccd9612c9546b0313ab1c7e4af5e66 Mon Sep 17 00:00:00 2001 From: Benoit Daloze Date: Sat, 10 Sep 2022 19:15:49 +0200 Subject: Remove get_actual_encoding() and the dynamic endian detection for dummy UTF-16/UTF-32 * And simplify callers of get_actual_encoding(). * See [Feature #18949]. * See https://github.com/ruby/ruby/pull/6322#issuecomment-1242758474 --- enc/encdb.c | 2 +- enc/utf_16_32.h | 2 +- encoding.c | 9 ------- internal/encoding.h | 1 - string.c | 61 +++++++-------------------------------------- test/ruby/test_m17n.rb | 42 ++++++++----------------------- test/ruby/test_transcode.rb | 12 ++++----- 7 files changed, 27 insertions(+), 102 deletions(-) diff --git a/enc/encdb.c b/enc/encdb.c index a1936df804..8247e9ff6a 100644 --- a/enc/encdb.c +++ b/enc/encdb.c @@ -17,7 +17,7 @@ #define ENC_DEFINE(name) rb_encdb_declare(name) #define ENC_SET_BASE(name, orig) rb_enc_set_base((name), (orig)) #define ENC_SET_DUMMY(name, orig) rb_enc_set_dummy(name) -#define ENC_DUMMY_UNICODE(name) rb_encdb_set_unicode(rb_enc_set_dummy(ENC_REPLICATE((name), name "BE"))) +#define ENC_DUMMY_UNICODE(name) ENC_DUMMY(name) void Init_encdb(void) diff --git a/enc/utf_16_32.h b/enc/utf_16_32.h index 9f9216d8ff..4d669019bf 100644 --- a/enc/utf_16_32.h +++ b/enc/utf_16_32.h @@ -1,5 +1,5 @@ #include "regenc.h" /* dummy for unsupported, stateful encoding */ -#define ENC_DUMMY_UNICODE(name) ENC_REPLICATE(name, name "BE") +#define ENC_DUMMY_UNICODE(name) ENC_DUMMY(name) ENC_DUMMY_UNICODE("UTF-16"); ENC_DUMMY_UNICODE("UTF-32"); diff --git a/encoding.c b/encoding.c index 5c4dfad3f0..b8e7f790b8 100644 --- a/encoding.c +++ b/encoding.c @@ -50,7 +50,6 @@ void rb_encdb_declare(const char *name); int rb_encdb_replicate(const char *name, const char *orig); int rb_encdb_dummy(const char *name); int rb_encdb_alias(const char *alias, const char *orig); -void rb_encdb_set_unicode(int index); #pragma GCC visibility pop #endif @@ -760,14 +759,6 @@ rb_encdb_alias(const char *alias, const char *orig) return r; } -void -rb_encdb_set_unicode(int index) -{ - rb_raw_encoding *enc = (rb_raw_encoding *)rb_enc_from_index(index); - ASSUME(enc); - enc->flags |= ONIGENC_FLAG_UNICODE; -} - static void rb_enc_init(struct enc_table *enc_table) { diff --git a/internal/encoding.h b/internal/encoding.h index c48cb24b04..853426a58d 100644 --- a/internal/encoding.h +++ b/internal/encoding.h @@ -24,7 +24,6 @@ int rb_encdb_dummy(const char *name); void rb_encdb_declare(const char *name); void rb_enc_set_base(const char *name, const char *orig); int rb_enc_set_dummy(int index); -void rb_encdb_set_unicode(int index); PUREFUNC(int rb_data_is_encoding(VALUE obj)); #endif /* INTERNAL_ENCODING_H */ diff --git a/string.c b/string.c index c9ec827708..ac0a2acb11 100644 --- a/string.c +++ b/string.c @@ -355,40 +355,10 @@ rb_debug_rstring_null_ptr(const char *func) /* symbols for [up|down|swap]case/capitalize options */ static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold; -static rb_encoding * -get_actual_encoding(const int encidx, VALUE str) -{ - const unsigned char *q; - - switch (encidx) { - case ENCINDEX_UTF_16: - if (RSTRING_LEN(str) < 2) break; - q = (const unsigned char *)RSTRING_PTR(str); - if (q[0] == 0xFE && q[1] == 0xFF) { - return rb_enc_get_from_index(ENCINDEX_UTF_16BE); - } - if (q[0] == 0xFF && q[1] == 0xFE) { - return rb_enc_get_from_index(ENCINDEX_UTF_16LE); - } - return rb_ascii8bit_encoding(); - case ENCINDEX_UTF_32: - if (RSTRING_LEN(str) < 4) break; - q = (const unsigned char *)RSTRING_PTR(str); - if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) { - return rb_enc_get_from_index(ENCINDEX_UTF_32BE); - } - if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) { - return rb_enc_get_from_index(ENCINDEX_UTF_32LE); - } - return rb_ascii8bit_encoding(); - } - return rb_enc_from_index(encidx); -} - static rb_encoding * get_encoding(VALUE str) { - return get_actual_encoding(ENCODING_GET(str), str); + return rb_enc_from_index(ENCODING_GET(str)); } static void @@ -832,21 +802,15 @@ rb_enc_cr_str_exact_copy(VALUE dest, VALUE src) } static int -enc_coderange_scan(VALUE str, rb_encoding *enc, int encidx) +enc_coderange_scan(VALUE str, rb_encoding *enc) { - if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) && - rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) { - return ENC_CODERANGE_BROKEN; - } - else { - return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc); - } + return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc); } int rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc) { - return enc_coderange_scan(str, enc, rb_enc_to_index(enc)); + return enc_coderange_scan(str, enc); } int @@ -855,9 +819,7 @@ rb_enc_str_coderange(VALUE str) int cr = ENC_CODERANGE(str); if (cr == ENC_CODERANGE_UNKNOWN) { - int encidx = ENCODING_GET(str); - rb_encoding *enc = rb_enc_from_index(encidx); - cr = enc_coderange_scan(str, enc, encidx); + cr = enc_coderange_scan(str, get_encoding(str)); ENC_CODERANGE_SET(str, cr); } return cr; @@ -1123,7 +1085,7 @@ is_enc_ascii_string(VALUE str, rb_encoding *enc) int encidx = rb_enc_to_index(enc); if (rb_enc_get_index(str) == encidx) return is_ascii_string(str); - return enc_coderange_scan(str, enc, encidx) == ENC_CODERANGE_7BIT; + return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT; } VALUE @@ -6730,7 +6692,7 @@ VALUE rb_str_inspect(VALUE str) { int encidx = ENCODING_GET(str); - rb_encoding *enc = rb_enc_from_index(encidx), *actenc; + rb_encoding *enc = rb_enc_from_index(encidx); const char *p, *pend, *prev; char buf[CHAR_ESC_LEN + 1]; VALUE result = rb_str_buf_new(0); @@ -6745,11 +6707,6 @@ rb_str_inspect(VALUE str) p = RSTRING_PTR(str); pend = RSTRING_END(str); prev = p; - actenc = get_actual_encoding(encidx, str); - if (actenc != enc) { - enc = actenc; - if (unicode_p) unicode_p = rb_enc_unicode_p(enc); - } while (p < pend) { unsigned int c, cc; int n; @@ -9374,7 +9331,7 @@ rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj) { size_t grapheme_cluster_count = 0; regex_t *reg_grapheme_cluster = NULL; - rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str)); + rb_encoding *enc = get_encoding(str); const char *ptr, *end; if (!rb_enc_unicode_p(enc)) { @@ -9402,7 +9359,7 @@ rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary) { VALUE orig = str; regex_t *reg_grapheme_cluster = NULL; - rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str)); + rb_encoding *enc = get_encoding(str); const char *ptr0, *ptr, *end; if (!rb_enc_unicode_p(enc)) { diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb index da04ae7fa7..28293ffffc 100644 --- a/test/ruby/test_m17n.rb +++ b/test/ruby/test_m17n.rb @@ -226,38 +226,16 @@ class TestM17N < Test::Unit::TestCase end end - STR_WITHOUT_BOM = "\u3042".freeze - STR_WITH_BOM = "\uFEFF\u3042".freeze - bug8940 = '[ruby-core:59757] [Bug #8940]' - bug9415 = '[ruby-dev:47895] [Bug #9415]' - %w/UTF-16 UTF-32/.each do |enc| - %w/BE LE/.each do |endian| - bom = "\uFEFF".encode("#{enc}#{endian}").force_encoding(enc) - - define_method("test_utf_16_32_inspect(#{enc}#{endian})") do - s = STR_WITHOUT_BOM.encode(enc + endian) - # When a UTF-16/32 string doesn't have a BOM, - # inspect as a dummy encoding string. - assert_equal(s.dup.force_encoding("ISO-2022-JP").inspect, - s.dup.force_encoding(enc).inspect) - assert_normal_exit("#{bom.b.dump}.force_encoding('#{enc}').inspect", bug8940) - end - - define_method("test_utf_16_32_codepoints(#{enc}#{endian})") do - assert_equal([0xFEFF], bom.codepoints, bug9415) - end - - define_method("test_utf_16_32_ord(#{enc}#{endian})") do - assert_equal(0xFEFF, bom.ord, bug9415) - end - - define_method("test_utf_16_32_inspect(#{enc}#{endian}-BOM)") do - s = STR_WITH_BOM.encode(enc + endian) - # When a UTF-16/32 string has a BOM, - # inspect as a particular encoding string. - assert_equal(s.inspect, - s.dup.force_encoding(enc).inspect) - end + def test_utf_dummy_are_like_regular_dummy_encodings + [Encoding::UTF_16, Encoding::UTF_32].each do |enc| + s = "\u3042".encode("UTF-32BE") + assert_equal(s.dup.force_encoding("ISO-2022-JP").inspect, s.dup.force_encoding(enc).inspect) + s = "\x00\x00\xFE\xFF" + assert_equal(s.dup.force_encoding("ISO-2022-JP").inspect, s.dup.force_encoding(enc).inspect) + + assert_equal [0, 0, 254, 255], "\x00\x00\xFE\xFF".force_encoding(enc).codepoints + assert_equal 0, "\x00\x00\xFE\xFF".force_encoding(enc).ord + assert_equal 255, "\xFF\xFE\x00\x00".force_encoding(enc).ord end end diff --git a/test/ruby/test_transcode.rb b/test/ruby/test_transcode.rb index 73737be0ad..24ee9b9533 100644 --- a/test/ruby/test_transcode.rb +++ b/test/ruby/test_transcode.rb @@ -2232,12 +2232,12 @@ class TestTranscode < Test::Unit::TestCase assert_equal("U+3042", "\u{3042}".encode("US-ASCII", fallback: fallback)) end - bug8940 = '[ruby-core:57318] [Bug #8940]' - %w[UTF-32 UTF-16].each do |enc| - define_method("test_pseudo_encoding_inspect(#{enc})") do - assert_normal_exit("'aaa'.encode('#{enc}').inspect", bug8940) - assert_equal(4, 'aaa'.encode(enc).length, "should count in #{enc} with BOM") - end + def test_pseudo_encoding_inspect + s = 'aaa'.encode "UTF-16" + assert_equal '"\xFE\xFF\x00\x61\x00\x61\x00\x61"', s.inspect + + s = 'aaa'.encode "UTF-32" + assert_equal '"\x00\x00\xFE\xFF\x00\x00\x00\x61\x00\x00\x00\x61\x00\x00\x00\x61"', s.inspect end def test_encode_with_invalid_chars -- cgit v1.2.1