diff options
Diffstat (limited to 'transcode.c')
-rw-r--r-- | transcode.c | 129 |
1 files changed, 75 insertions, 54 deletions
diff --git a/transcode.c b/transcode.c index 4d1107ddc4..b82729cb5b 100644 --- a/transcode.c +++ b/transcode.c @@ -14,9 +14,9 @@ #include "transcode_data.h" #include <ctype.h> -VALUE rb_eConversionUndefined; -VALUE rb_eInvalidByteSequence; -VALUE rb_eNoConverter; +VALUE rb_eConversionUndefinedError; +VALUE rb_eInvalidByteSequenceError; +VALUE rb_eNoConverterError; VALUE rb_cEncodingConverter; @@ -52,6 +52,7 @@ typedef struct rb_transcoding { unsigned int next_table; VALUE next_info; unsigned char next_byte; + unsigned int output_index; int recognized_len; /* already interpreted */ int readagain_len; /* not yet interpreted */ @@ -80,6 +81,10 @@ typedef struct rb_transcoding { ((tc)->transcoder->max_output <= sizeof((tc)->writebuf.ary) ? \ (tc)->writebuf.ary : \ (tc)->writebuf.ptr) +#define TRANSCODING_WRITEBUF_SIZE(tc) \ + ((tc)->transcoder->max_output <= sizeof((tc)->writebuf.ary) ? \ + sizeof((tc)->writebuf.ary) : \ + (tc)->transcoder->max_output) #define TRANSCODING_STATE_EMBED_MAX sizeof(union rb_transcoding_state_t) #define TRANSCODING_STATE(tc) \ ((tc)->transcoder->state_size <= sizeof((tc)->state) ? \ @@ -580,10 +585,10 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, SUSPEND_OBUF(19); *out_p++ = getBT3(next_info); continue; case STR1: - next_byte = 0; /* index */ - while (next_byte < BYTE_ADDR(STR1_BYTEINDEX(next_info))[0]) { - SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+next_byte]; - next_byte++; + tc->output_index = 0; + while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) { + SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index]; + tc->output_index++; } continue; case FUNii: @@ -600,9 +605,12 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, case FUNio: SUSPEND_OBUF(13); if (tr->max_output <= out_stop - out_p) - out_p += (VALUE)(*tr->func_io)(TRANSCODING_STATE(tc), next_info, out_p); + out_p += tr->func_io(TRANSCODING_STATE(tc), + next_info, out_p, out_stop - out_p); else { - writebuf_len = (VALUE)(*tr->func_io)(TRANSCODING_STATE(tc), next_info, TRANSCODING_WRITEBUF(tc)); + writebuf_len = tr->func_io(TRANSCODING_STATE(tc), + next_info, + TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc)); writebuf_off = 0; while (writebuf_off < writebuf_len) { SUSPEND_OBUF(20); @@ -617,11 +625,15 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, SUSPEND_OBUF(14); if (tr->max_output <= out_stop - out_p) { char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len); - out_p += (VALUE)(*tr->func_so)(TRANSCODING_STATE(tc), char_start, (size_t)char_len, out_p); + out_p += tr->func_so(TRANSCODING_STATE(tc), + char_start, (size_t)char_len, + out_p, out_stop - out_p); } else { char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len); - writebuf_len = (VALUE)(*tr->func_so)(TRANSCODING_STATE(tc), char_start, (size_t)char_len, TRANSCODING_WRITEBUF(tc)); + writebuf_len = tr->func_so(TRANSCODING_STATE(tc), + char_start, (size_t)char_len, + TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc)); writebuf_off = 0; while (writebuf_off < writebuf_len) { SUSPEND_OBUF(22); @@ -675,10 +687,12 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, if (tr->finish_func) { SUSPEND_OBUF(4); if (tr->max_output <= out_stop - out_p) { - out_p += tr->finish_func(TRANSCODING_STATE(tc), out_p); + out_p += tr->finish_func(TRANSCODING_STATE(tc), + out_p, out_stop - out_p); } else { - writebuf_len = tr->finish_func(TRANSCODING_STATE(tc), TRANSCODING_WRITEBUF(tc)); + writebuf_len = tr->finish_func(TRANSCODING_STATE(tc), + TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc)); writebuf_off = 0; while (writebuf_off < writebuf_len) { SUSPEND_OBUF(23); @@ -1946,7 +1960,7 @@ rb_econv_open_exc(const char *sname, const char *dname, int ecflags) mesg = rb_str_new_cstr("code converter not found ("); econv_description(sname, dname, ecflags, mesg); rb_str_cat2(mesg, ")"); - exc = rb_exc_new3(rb_eNoConverter, mesg); + exc = rb_exc_new3(rb_eNoConverterError, mesg); return exc; } @@ -1983,7 +1997,7 @@ make_econv_exception(rb_econv_t *ec) ec->last_error.source_encoding); } - exc = rb_exc_new3(rb_eInvalidByteSequence, mesg); + exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg); rb_ivar_set(exc, rb_intern("error_bytes"), bytes); rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2); rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse); @@ -2009,7 +2023,7 @@ make_econv_exception(rb_econv_t *ec) StringValueCStr(dumped), ec->last_error.source_encoding, ec->last_error.destination_encoding); - exc = rb_exc_new3(rb_eConversionUndefined, mesg); + exc = rb_exc_new3(rb_eConversionUndefinedError, mesg); idx = rb_enc_find_index(ec->last_error.source_encoding); if (0 <= idx) rb_enc_associate_index(bytes, idx); @@ -2274,16 +2288,18 @@ econv_opts(VALUE opt) v = rb_hash_aref(opt, sym_xml); if (!NIL_P(v)) { - v = rb_convert_type(v, T_SYMBOL, "Symbol", "to_sym"); if (v==sym_text) { ecflags |= ECONV_XML_TEXT_DECORATOR|ECONV_UNDEF_HEX_CHARREF; } else if (v==sym_attr) { ecflags |= ECONV_XML_ATTR_CONTENT_DECORATOR|ECONV_XML_ATTR_QUOTE_DECORATOR|ECONV_UNDEF_HEX_CHARREF; } - else { + else if (TYPE(v) == T_SYMBOL) { rb_raise(rb_eArgError, "unexpected value for xml option: %s", rb_id2name(SYM2ID(v))); } + else { + rb_raise(rb_eArgError, "unexpected value for xml option"); + } } v = rb_hash_aref(opt, sym_universal_newline); @@ -2570,7 +2586,12 @@ str_encode(int argc, VALUE *argv, VALUE str) int encidx = str_transcode(argc, argv, &newstr); if (encidx < 0) return rb_str_dup(str); - RBASIC(newstr)->klass = rb_obj_class(str); + if (newstr == str) { + newstr = rb_str_dup(str); + } + else { + RBASIC(newstr)->klass = rb_obj_class(str); + } return str_encode_associate(newstr, encidx); } @@ -3361,8 +3382,8 @@ econv_primitive_convert(int argc, VALUE *argv, VALUE self) * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP") * * If a conversion error occur, - * Encoding::ConversionUndefined or - * Encoding::InvalidByteSequence is raised. + * Encoding::ConversionUndefinedError or + * Encoding::InvalidByteSequenceError is raised. * */ static VALUE @@ -3614,7 +3635,7 @@ econv_insert_output(VALUE self, VALUE string) * some bytes are buffered to be converted later. * The latter bytes can be put back. * It can be observed by - * Encoding::InvalidByteSequence#readagain_bytes and + * Encoding::InvalidByteSequenceError#readagain_bytes and * Encoding::Converter#primitive_errinfo. * * ec = Encoding::Converter.new("utf-16le", "iso-8859-1") @@ -3663,14 +3684,14 @@ econv_putback(int argc, VALUE *argv, VALUE self) * It returns nil if the last conversion is not an error. * * "error" means that - * Encoding::InvalidByteSequence and Encoding::ConversionUndefined for + * Encoding::InvalidByteSequenceError and Encoding::ConversionUndefinedError for * Encoding::Converter#convert and * :invalid_byte_sequence, :incomplete_input and :undefined_conversion for * Encoding::Converter#primitive_convert. * * ec = Encoding::Converter.new("utf-8", "iso-8859-1") * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence - * p ec.last_error #=> #<Encoding::InvalidByteSequence: "\xF1" followed by "a" on UTF-8> + * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8> * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full * p ec.last_error #=> nil * @@ -3708,7 +3729,7 @@ econv_get_replacement(VALUE self) ret = make_replacement(ec); if (ret == -1) { - rb_raise(rb_eConversionUndefined, "replacement character setup failed"); + rb_raise(rb_eConversionUndefinedError, "replacement character setup failed"); } enc = rb_enc_find(ec->replacement_enc); @@ -3742,8 +3763,8 @@ econv_set_replacement(VALUE self, VALUE arg) rb_enc_name(enc)); if (ret == -1) { - /* xxx: rb_eInvalidByteSequence? */ - rb_raise(rb_eConversionUndefined, "replacement character setup failed"); + /* xxx: rb_eInvalidByteSequenceError? */ + rb_raise(rb_eConversionUndefinedError, "replacement character setup failed"); } return arg; @@ -3784,7 +3805,7 @@ ecerr_source_encoding_name(VALUE self) * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP * begin * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP. - * rescue Encoding::ConversionUndefined + * rescue Encoding::ConversionUndefinedError * p $!.source_encoding #=> #<Encoding:UTF-8> * p $!.destination_encoding #=> #<Encoding:EUC-JP> * p $!.source_encoding_name #=> "UTF-8" @@ -3826,12 +3847,12 @@ ecerr_destination_encoding(VALUE self) * call-seq: * ecerr.error_char -> string * - * returns the one-character string which cause Encoding::ConversionUndefined. + * returns the one-character string which cause Encoding::ConversionUndefinedError. * * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") * begin * ec.convert("\xa0") - * rescue Encoding::ConversionUndefined + * rescue Encoding::ConversionUndefinedError * puts $!.error_char.dump #=> "\xC2\xA0" * p $!.error_char.encoding #=> #<Encoding:UTF-8> * end @@ -3847,13 +3868,13 @@ ecerr_error_char(VALUE self) * call-seq: * ecerr.error_bytes -> string * - * returns the discarded bytes when Encoding::InvalidByteSequence occur. + * returns the discarded bytes when Encoding::InvalidByteSequenceError occur. * * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") * begin * ec.convert("abc\xA1\xFFdef") - * rescue Encoding::InvalidByteSequence - * p $! #=> #<Encoding::InvalidByteSequence: "\xA1" followed by "\xFF" on EUC-JP> + * rescue Encoding::InvalidByteSequenceError + * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP> * puts $!.error_bytes.dump #=> "\xA1" * puts $!.readagain_bytes.dump #=> "\xFF" * end @@ -3868,7 +3889,7 @@ ecerr_error_bytes(VALUE self) * call-seq: * ecerr.readagain_bytes -> string * - * returns the bytes to be read again when Encoding::InvalidByteSequence occur. + * returns the bytes to be read again when Encoding::InvalidByteSequenceError occur. */ static VALUE ecerr_readagain_bytes(VALUE self) @@ -3887,16 +3908,16 @@ ecerr_readagain_bytes(VALUE self) * * begin * ec.convert("abc\xA1z") - * rescue Encoding::InvalidByteSequence - * p $! #=> #<Encoding::InvalidByteSequence: "\xA1" followed by "z" on EUC-JP> + * rescue Encoding::InvalidByteSequenceError + * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP> * p $!.incomplete_input? #=> false * end * * begin * ec.convert("abc\xA1") * ec.finish - * rescue Encoding::InvalidByteSequence - * p $! #=> #<Encoding::InvalidByteSequence: incomplete "\xA1" on EUC-JP> + * rescue Encoding::InvalidByteSequenceError + * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP> * p $!.incomplete_input? #=> true * end */ @@ -3911,9 +3932,9 @@ extern void Init_newline(void); void Init_transcode(void) { - rb_eConversionUndefined = rb_define_class_under(rb_cEncoding, "ConversionUndefined", rb_eStandardError); - rb_eInvalidByteSequence = rb_define_class_under(rb_cEncoding, "InvalidByteSequence", rb_eStandardError); - rb_eNoConverter = rb_define_class_under(rb_cEncoding, "NoConverter", rb_eStandardError); + rb_eConversionUndefinedError = rb_define_class_under(rb_cEncoding, "ConversionUndefinedError", rb_eStandardError); + rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eStandardError); + rb_eNoConverterError = rb_define_class_under(rb_cEncoding, "NoConverterError", rb_eStandardError); transcoder_table = st_init_strcasetable(); @@ -3973,19 +3994,19 @@ Init_transcode(void) rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR)); rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR)); - rb_define_method(rb_eConversionUndefined, "source_encoding_name", ecerr_source_encoding_name, 0); - rb_define_method(rb_eConversionUndefined, "destination_encoding_name", ecerr_destination_encoding_name, 0); - rb_define_method(rb_eConversionUndefined, "source_encoding", ecerr_source_encoding, 0); - rb_define_method(rb_eConversionUndefined, "destination_encoding", ecerr_destination_encoding, 0); - rb_define_method(rb_eConversionUndefined, "error_char", ecerr_error_char, 0); - - rb_define_method(rb_eInvalidByteSequence, "source_encoding_name", ecerr_source_encoding_name, 0); - rb_define_method(rb_eInvalidByteSequence, "destination_encoding_name", ecerr_destination_encoding_name, 0); - rb_define_method(rb_eInvalidByteSequence, "source_encoding", ecerr_source_encoding, 0); - rb_define_method(rb_eInvalidByteSequence, "destination_encoding", ecerr_destination_encoding, 0); - rb_define_method(rb_eInvalidByteSequence, "error_bytes", ecerr_error_bytes, 0); - rb_define_method(rb_eInvalidByteSequence, "readagain_bytes", ecerr_readagain_bytes, 0); - rb_define_method(rb_eInvalidByteSequence, "incomplete_input?", ecerr_incomplete_input, 0); + rb_define_method(rb_eConversionUndefinedError, "source_encoding_name", ecerr_source_encoding_name, 0); + rb_define_method(rb_eConversionUndefinedError, "destination_encoding_name", ecerr_destination_encoding_name, 0); + rb_define_method(rb_eConversionUndefinedError, "source_encoding", ecerr_source_encoding, 0); + rb_define_method(rb_eConversionUndefinedError, "destination_encoding", ecerr_destination_encoding, 0); + rb_define_method(rb_eConversionUndefinedError, "error_char", ecerr_error_char, 0); + + rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0); + rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0); + rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0); + rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0); + rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0); + rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0); + rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0); Init_newline(); } |