diff options
Diffstat (limited to 'transcode.c')
-rw-r--r-- | transcode.c | 39 |
1 files changed, 38 insertions, 1 deletions
diff --git a/transcode.c b/transcode.c index dba26a2394..7683d4c9be 100644 --- a/transcode.c +++ b/transcode.c @@ -21,7 +21,7 @@ VALUE rb_eConverterNotFoundError; VALUE rb_cEncodingConverter; -static VALUE sym_invalid, sym_undef, sym_replace; +static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback; static VALUE sym_xml, sym_text, sym_attr; static VALUE sym_universal_newline; static VALUE sym_crlf_newline; @@ -2256,17 +2256,37 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, unsigned char *out_start = *out_pos; int max_output; VALUE exc; + VALUE fallback = Qnil; ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts); if (!ec) rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags)); + if (!NIL_P(ecopts) && TYPE(ecopts) == T_HASH) + fallback = rb_hash_aref(ecopts, sym_fallback); last_tc = ec->last_tc; max_output = last_tc ? last_tc->transcoder->max_output : 1; resume: ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0); + if (!NIL_P(fallback) && ret == econv_undefined_conversion) { + VALUE rep = rb_enc_str_new( + (const char *)ec->last_error.error_bytes_start, + ec->last_error.error_bytes_len, + rb_enc_find(ec->last_error.source_encoding)); + rep = rb_hash_lookup2(fallback, rep, Qundef); + if (rep != Qundef) { + StringValue(rep); + ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep), + RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep))); + if (ret == -1) { + rb_raise(rb_eArgError, "too big fallback string"); + } + goto resume; + } + } + if (ret == econv_invalid_byte_sequence || ret == econv_incomplete_input || ret == econv_undefined_conversion) { @@ -2442,6 +2462,7 @@ rb_econv_prepare_opts(VALUE opthash, VALUE *opts) return 0; } ecflags = econv_opts(opthash); + v = rb_hash_aref(opthash, sym_replace); if (!NIL_P(v)) { StringValue(v); @@ -2456,6 +2477,16 @@ rb_econv_prepare_opts(VALUE opthash, VALUE *opts) rb_hash_aset(newhash, sym_replace, v); } + v = rb_hash_aref(opthash, sym_fallback); + if (!NIL_P(v)) { + v = rb_convert_type(v, T_HASH, "Hash", "to_hash"); + if (!NIL_P(v)) { + if (NIL_P(newhash)) + newhash = rb_hash_new(); + rb_hash_aset(newhash, sym_fallback, v); + } + } + if (!NIL_P(newhash)) rb_hash_freeze(newhash); *opts = newhash; @@ -2728,6 +2759,11 @@ str_encode_bang(int argc, VALUE *argv, VALUE str) * :replace :: * Sets the replacement string to the value. The default replacement * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise. + * :fallback :: + * Sets the replacement string by the hash for undefined character. + * Its key is a such undefined character encoded in source encoding + * of current transcoder. Its value can be any encoding until it + * can be converted into the destination encoding of the transcoder. * :xml :: * The value must be <code>:text</code> or <code>:attr</code>. * If the value is <code>:text</code> <code>#encode</code> replaces @@ -4193,6 +4229,7 @@ Init_transcode(void) sym_invalid = ID2SYM(rb_intern("invalid")); sym_undef = ID2SYM(rb_intern("undef")); sym_replace = ID2SYM(rb_intern("replace")); + sym_fallback = ID2SYM(rb_intern("fallback")); sym_xml = ID2SYM(rb_intern("xml")); sym_text = ID2SYM(rb_intern("text")); sym_attr = ID2SYM(rb_intern("attr")); |