summaryrefslogtreecommitdiff
path: root/transcode.c
diff options
context:
space:
mode:
Diffstat (limited to 'transcode.c')
-rw-r--r--transcode.c129
1 files changed, 75 insertions, 54 deletions
diff --git a/transcode.c b/transcode.c
index 4d1107ddc4..b82729cb5b 100644
--- a/transcode.c
+++ b/transcode.c
@@ -14,9 +14,9 @@
#include "transcode_data.h"
#include <ctype.h>
-VALUE rb_eConversionUndefined;
-VALUE rb_eInvalidByteSequence;
-VALUE rb_eNoConverter;
+VALUE rb_eConversionUndefinedError;
+VALUE rb_eInvalidByteSequenceError;
+VALUE rb_eNoConverterError;
VALUE rb_cEncodingConverter;
@@ -52,6 +52,7 @@ typedef struct rb_transcoding {
unsigned int next_table;
VALUE next_info;
unsigned char next_byte;
+ unsigned int output_index;
int recognized_len; /* already interpreted */
int readagain_len; /* not yet interpreted */
@@ -80,6 +81,10 @@ typedef struct rb_transcoding {
((tc)->transcoder->max_output <= sizeof((tc)->writebuf.ary) ? \
(tc)->writebuf.ary : \
(tc)->writebuf.ptr)
+#define TRANSCODING_WRITEBUF_SIZE(tc) \
+ ((tc)->transcoder->max_output <= sizeof((tc)->writebuf.ary) ? \
+ sizeof((tc)->writebuf.ary) : \
+ (tc)->transcoder->max_output)
#define TRANSCODING_STATE_EMBED_MAX sizeof(union rb_transcoding_state_t)
#define TRANSCODING_STATE(tc) \
((tc)->transcoder->state_size <= sizeof((tc)->state) ? \
@@ -580,10 +585,10 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
continue;
case STR1:
- next_byte = 0; /* index */
- while (next_byte < BYTE_ADDR(STR1_BYTEINDEX(next_info))[0]) {
- SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+next_byte];
- next_byte++;
+ tc->output_index = 0;
+ while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
+ SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
+ tc->output_index++;
}
continue;
case FUNii:
@@ -600,9 +605,12 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
case FUNio:
SUSPEND_OBUF(13);
if (tr->max_output <= out_stop - out_p)
- out_p += (VALUE)(*tr->func_io)(TRANSCODING_STATE(tc), next_info, out_p);
+ out_p += tr->func_io(TRANSCODING_STATE(tc),
+ next_info, out_p, out_stop - out_p);
else {
- writebuf_len = (VALUE)(*tr->func_io)(TRANSCODING_STATE(tc), next_info, TRANSCODING_WRITEBUF(tc));
+ writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
+ next_info,
+ TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
writebuf_off = 0;
while (writebuf_off < writebuf_len) {
SUSPEND_OBUF(20);
@@ -617,11 +625,15 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
SUSPEND_OBUF(14);
if (tr->max_output <= out_stop - out_p) {
char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
- out_p += (VALUE)(*tr->func_so)(TRANSCODING_STATE(tc), char_start, (size_t)char_len, out_p);
+ out_p += tr->func_so(TRANSCODING_STATE(tc),
+ char_start, (size_t)char_len,
+ out_p, out_stop - out_p);
}
else {
char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
- writebuf_len = (VALUE)(*tr->func_so)(TRANSCODING_STATE(tc), char_start, (size_t)char_len, TRANSCODING_WRITEBUF(tc));
+ writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
+ char_start, (size_t)char_len,
+ TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
writebuf_off = 0;
while (writebuf_off < writebuf_len) {
SUSPEND_OBUF(22);
@@ -675,10 +687,12 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
if (tr->finish_func) {
SUSPEND_OBUF(4);
if (tr->max_output <= out_stop - out_p) {
- out_p += tr->finish_func(TRANSCODING_STATE(tc), out_p);
+ out_p += tr->finish_func(TRANSCODING_STATE(tc),
+ out_p, out_stop - out_p);
}
else {
- writebuf_len = tr->finish_func(TRANSCODING_STATE(tc), TRANSCODING_WRITEBUF(tc));
+ writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
+ TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
writebuf_off = 0;
while (writebuf_off < writebuf_len) {
SUSPEND_OBUF(23);
@@ -1946,7 +1960,7 @@ rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
mesg = rb_str_new_cstr("code converter not found (");
econv_description(sname, dname, ecflags, mesg);
rb_str_cat2(mesg, ")");
- exc = rb_exc_new3(rb_eNoConverter, mesg);
+ exc = rb_exc_new3(rb_eNoConverterError, mesg);
return exc;
}
@@ -1983,7 +1997,7 @@ make_econv_exception(rb_econv_t *ec)
ec->last_error.source_encoding);
}
- exc = rb_exc_new3(rb_eInvalidByteSequence, mesg);
+ exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
rb_ivar_set(exc, rb_intern("error_bytes"), bytes);
rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2);
rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse);
@@ -2009,7 +2023,7 @@ make_econv_exception(rb_econv_t *ec)
StringValueCStr(dumped),
ec->last_error.source_encoding,
ec->last_error.destination_encoding);
- exc = rb_exc_new3(rb_eConversionUndefined, mesg);
+ exc = rb_exc_new3(rb_eConversionUndefinedError, mesg);
idx = rb_enc_find_index(ec->last_error.source_encoding);
if (0 <= idx)
rb_enc_associate_index(bytes, idx);
@@ -2274,16 +2288,18 @@ econv_opts(VALUE opt)
v = rb_hash_aref(opt, sym_xml);
if (!NIL_P(v)) {
- v = rb_convert_type(v, T_SYMBOL, "Symbol", "to_sym");
if (v==sym_text) {
ecflags |= ECONV_XML_TEXT_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
}
else if (v==sym_attr) {
ecflags |= ECONV_XML_ATTR_CONTENT_DECORATOR|ECONV_XML_ATTR_QUOTE_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
}
- else {
+ else if (TYPE(v) == T_SYMBOL) {
rb_raise(rb_eArgError, "unexpected value for xml option: %s", rb_id2name(SYM2ID(v)));
}
+ else {
+ rb_raise(rb_eArgError, "unexpected value for xml option");
+ }
}
v = rb_hash_aref(opt, sym_universal_newline);
@@ -2570,7 +2586,12 @@ str_encode(int argc, VALUE *argv, VALUE str)
int encidx = str_transcode(argc, argv, &newstr);
if (encidx < 0) return rb_str_dup(str);
- RBASIC(newstr)->klass = rb_obj_class(str);
+ if (newstr == str) {
+ newstr = rb_str_dup(str);
+ }
+ else {
+ RBASIC(newstr)->klass = rb_obj_class(str);
+ }
return str_encode_associate(newstr, encidx);
}
@@ -3361,8 +3382,8 @@ econv_primitive_convert(int argc, VALUE *argv, VALUE self)
* puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
*
* If a conversion error occur,
- * Encoding::ConversionUndefined or
- * Encoding::InvalidByteSequence is raised.
+ * Encoding::ConversionUndefinedError or
+ * Encoding::InvalidByteSequenceError is raised.
*
*/
static VALUE
@@ -3614,7 +3635,7 @@ econv_insert_output(VALUE self, VALUE string)
* some bytes are buffered to be converted later.
* The latter bytes can be put back.
* It can be observed by
- * Encoding::InvalidByteSequence#readagain_bytes and
+ * Encoding::InvalidByteSequenceError#readagain_bytes and
* Encoding::Converter#primitive_errinfo.
*
* ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
@@ -3663,14 +3684,14 @@ econv_putback(int argc, VALUE *argv, VALUE self)
* It returns nil if the last conversion is not an error.
*
* "error" means that
- * Encoding::InvalidByteSequence and Encoding::ConversionUndefined for
+ * Encoding::InvalidByteSequenceError and Encoding::ConversionUndefinedError for
* Encoding::Converter#convert and
* :invalid_byte_sequence, :incomplete_input and :undefined_conversion for
* Encoding::Converter#primitive_convert.
*
* ec = Encoding::Converter.new("utf-8", "iso-8859-1")
* p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
- * p ec.last_error #=> #<Encoding::InvalidByteSequence: "\xF1" followed by "a" on UTF-8>
+ * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
* p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
* p ec.last_error #=> nil
*
@@ -3708,7 +3729,7 @@ econv_get_replacement(VALUE self)
ret = make_replacement(ec);
if (ret == -1) {
- rb_raise(rb_eConversionUndefined, "replacement character setup failed");
+ rb_raise(rb_eConversionUndefinedError, "replacement character setup failed");
}
enc = rb_enc_find(ec->replacement_enc);
@@ -3742,8 +3763,8 @@ econv_set_replacement(VALUE self, VALUE arg)
rb_enc_name(enc));
if (ret == -1) {
- /* xxx: rb_eInvalidByteSequence? */
- rb_raise(rb_eConversionUndefined, "replacement character setup failed");
+ /* xxx: rb_eInvalidByteSequenceError? */
+ rb_raise(rb_eConversionUndefinedError, "replacement character setup failed");
}
return arg;
@@ -3784,7 +3805,7 @@ ecerr_source_encoding_name(VALUE self)
* ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
* begin
* ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
- * rescue Encoding::ConversionUndefined
+ * rescue Encoding::ConversionUndefinedError
* p $!.source_encoding #=> #<Encoding:UTF-8>
* p $!.destination_encoding #=> #<Encoding:EUC-JP>
* p $!.source_encoding_name #=> "UTF-8"
@@ -3826,12 +3847,12 @@ ecerr_destination_encoding(VALUE self)
* call-seq:
* ecerr.error_char -> string
*
- * returns the one-character string which cause Encoding::ConversionUndefined.
+ * returns the one-character string which cause Encoding::ConversionUndefinedError.
*
* ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
* begin
* ec.convert("\xa0")
- * rescue Encoding::ConversionUndefined
+ * rescue Encoding::ConversionUndefinedError
* puts $!.error_char.dump #=> "\xC2\xA0"
* p $!.error_char.encoding #=> #<Encoding:UTF-8>
* end
@@ -3847,13 +3868,13 @@ ecerr_error_char(VALUE self)
* call-seq:
* ecerr.error_bytes -> string
*
- * returns the discarded bytes when Encoding::InvalidByteSequence occur.
+ * returns the discarded bytes when Encoding::InvalidByteSequenceError occur.
*
* ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
* begin
* ec.convert("abc\xA1\xFFdef")
- * rescue Encoding::InvalidByteSequence
- * p $! #=> #<Encoding::InvalidByteSequence: "\xA1" followed by "\xFF" on EUC-JP>
+ * rescue Encoding::InvalidByteSequenceError
+ * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
* puts $!.error_bytes.dump #=> "\xA1"
* puts $!.readagain_bytes.dump #=> "\xFF"
* end
@@ -3868,7 +3889,7 @@ ecerr_error_bytes(VALUE self)
* call-seq:
* ecerr.readagain_bytes -> string
*
- * returns the bytes to be read again when Encoding::InvalidByteSequence occur.
+ * returns the bytes to be read again when Encoding::InvalidByteSequenceError occur.
*/
static VALUE
ecerr_readagain_bytes(VALUE self)
@@ -3887,16 +3908,16 @@ ecerr_readagain_bytes(VALUE self)
*
* begin
* ec.convert("abc\xA1z")
- * rescue Encoding::InvalidByteSequence
- * p $! #=> #<Encoding::InvalidByteSequence: "\xA1" followed by "z" on EUC-JP>
+ * rescue Encoding::InvalidByteSequenceError
+ * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
* p $!.incomplete_input? #=> false
* end
*
* begin
* ec.convert("abc\xA1")
* ec.finish
- * rescue Encoding::InvalidByteSequence
- * p $! #=> #<Encoding::InvalidByteSequence: incomplete "\xA1" on EUC-JP>
+ * rescue Encoding::InvalidByteSequenceError
+ * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
* p $!.incomplete_input? #=> true
* end
*/
@@ -3911,9 +3932,9 @@ extern void Init_newline(void);
void
Init_transcode(void)
{
- rb_eConversionUndefined = rb_define_class_under(rb_cEncoding, "ConversionUndefined", rb_eStandardError);
- rb_eInvalidByteSequence = rb_define_class_under(rb_cEncoding, "InvalidByteSequence", rb_eStandardError);
- rb_eNoConverter = rb_define_class_under(rb_cEncoding, "NoConverter", rb_eStandardError);
+ rb_eConversionUndefinedError = rb_define_class_under(rb_cEncoding, "ConversionUndefinedError", rb_eStandardError);
+ rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eStandardError);
+ rb_eNoConverterError = rb_define_class_under(rb_cEncoding, "NoConverterError", rb_eStandardError);
transcoder_table = st_init_strcasetable();
@@ -3973,19 +3994,19 @@ Init_transcode(void)
rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
- rb_define_method(rb_eConversionUndefined, "source_encoding_name", ecerr_source_encoding_name, 0);
- rb_define_method(rb_eConversionUndefined, "destination_encoding_name", ecerr_destination_encoding_name, 0);
- rb_define_method(rb_eConversionUndefined, "source_encoding", ecerr_source_encoding, 0);
- rb_define_method(rb_eConversionUndefined, "destination_encoding", ecerr_destination_encoding, 0);
- rb_define_method(rb_eConversionUndefined, "error_char", ecerr_error_char, 0);
-
- rb_define_method(rb_eInvalidByteSequence, "source_encoding_name", ecerr_source_encoding_name, 0);
- rb_define_method(rb_eInvalidByteSequence, "destination_encoding_name", ecerr_destination_encoding_name, 0);
- rb_define_method(rb_eInvalidByteSequence, "source_encoding", ecerr_source_encoding, 0);
- rb_define_method(rb_eInvalidByteSequence, "destination_encoding", ecerr_destination_encoding, 0);
- rb_define_method(rb_eInvalidByteSequence, "error_bytes", ecerr_error_bytes, 0);
- rb_define_method(rb_eInvalidByteSequence, "readagain_bytes", ecerr_readagain_bytes, 0);
- rb_define_method(rb_eInvalidByteSequence, "incomplete_input?", ecerr_incomplete_input, 0);
+ rb_define_method(rb_eConversionUndefinedError, "source_encoding_name", ecerr_source_encoding_name, 0);
+ rb_define_method(rb_eConversionUndefinedError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
+ rb_define_method(rb_eConversionUndefinedError, "source_encoding", ecerr_source_encoding, 0);
+ rb_define_method(rb_eConversionUndefinedError, "destination_encoding", ecerr_destination_encoding, 0);
+ rb_define_method(rb_eConversionUndefinedError, "error_char", ecerr_error_char, 0);
+
+ rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
+ rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
+ rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
+ rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
+ rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
+ rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
+ rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
Init_newline();
}