diff options
Diffstat (limited to 'ext/Encode/Encode.xs')
-rw-r--r-- | ext/Encode/Encode.xs | 546 |
1 files changed, 61 insertions, 485 deletions
diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index 229359e7f2..9c30c4a9d7 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -7,15 +7,14 @@ #include "def_t.h" #define FBCHAR 0xFFFd +#define FBCHAR_UTF8 "\xEF\xBF\xBD" #define BOM_BE 0xFeFF #define BOM16LE 0xFFFe #define BOM32LE 0xFFFe0000 - -#define valid_ucs2(x) ((0 <= (x) && (x) < 0xD800) || (0xDFFF < (x) && (x) <= 0xFFFF)) - #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF ) #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 ) #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF ) +#define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) ) static UV enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian) @@ -53,7 +52,7 @@ enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian) void enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value) { - U8 *d = SvGROW(result,SvCUR(result)+size); + U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size); switch(endian) { case 'v': case 'V': @@ -93,452 +92,6 @@ enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value) } UNIMPLEMENTED(_encoded_utf8_to_bytes, I32) UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) -#if defined(USE_PERLIO) && !defined(USE_SFIO) -/* Define an encoding "layer" in the perliol.h sense. - The layer defined here "inherits" in an object-oriented sense from the - "perlio" layer with its PerlIOBuf_* "methods". - The implementation is particularly efficient as until Encode settles down - there is no point in tryint to tune it. - - The layer works by overloading the "fill" and "flush" methods. - - "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API - to convert the encoded data to UTF-8 form, then copies it back to the - buffer. The "base class's" read methods then see the UTF-8 data. - - "flush" transforms the UTF-8 data deposited by the "base class's write - method in the buffer back into the encoded form using the encode OO perl API, - then copies data back into the buffer and calls "SUPER::flush. - - Note that "flush" is _also_ called for read mode - we still do the (back)-translate - so that the the base class's "flush" sees the correct number of encoded chars - for positioning the seek pointer. (This double translation is the worst performance - issue - particularly with all-perl encode engine.) - -*/ -#include "perliol.h" -typedef struct { - PerlIOBuf base; /* PerlIOBuf stuff */ - SV *bufsv; /* buffer seen by layers above */ - SV *dataSV; /* data we have read from layer below */ - SV *enc; /* the encoding object */ -} PerlIOEncode; - -SV * -PerlIOEncode_getarg(pTHX_ PerlIO * f, CLONE_PARAMS * param, int flags) -{ - PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); - SV *sv = &PL_sv_undef; - if (e->enc) { - dSP; - ENTER; - SAVETMPS; - PUSHMARK(sp); - XPUSHs(e->enc); - PUTBACK; - if (perl_call_method("name", G_SCALAR) == 1) { - SPAGAIN; - sv = newSVsv(POPs); - PUTBACK; - } - } - return sv; -} - -IV -PerlIOEncode_pushed(pTHX_ PerlIO * f, const char *mode, SV * arg) -{ - PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); - dSP; - IV code; - code = PerlIOBuf_pushed(aTHX_ f, mode, Nullsv); - ENTER; - SAVETMPS; - PUSHMARK(sp); - XPUSHs(arg); - PUTBACK; - if (perl_call_pv("Encode::find_encoding", G_SCALAR) != 1) { - /* should never happen */ - Perl_die(aTHX_ "Encode::find_encoding did not return a value"); - return -1; - } - SPAGAIN; - e->enc = POPs; - PUTBACK; - if (!SvROK(e->enc)) { - e->enc = Nullsv; - errno = EINVAL; - Perl_warner(aTHX_ packWARN(WARN_IO), "Cannot find encoding \"%" SVf "\"", - arg); - code = -1; - } - else { - SvREFCNT_inc(e->enc); - PerlIOBase(f)->flags |= PERLIO_F_UTF8; - } - FREETMPS; - LEAVE; - return code; -} - -IV -PerlIOEncode_popped(pTHX_ PerlIO * f) -{ - PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); - if (e->enc) { - SvREFCNT_dec(e->enc); - e->enc = Nullsv; - } - if (e->bufsv) { - SvREFCNT_dec(e->bufsv); - e->bufsv = Nullsv; - } - if (e->dataSV) { - SvREFCNT_dec(e->dataSV); - e->dataSV = Nullsv; - } - return 0; -} - -STDCHAR * -PerlIOEncode_get_base(pTHX_ PerlIO * f) -{ - PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); - if (!e->base.bufsiz) - e->base.bufsiz = 1024; - if (!e->bufsv) { - e->bufsv = newSV(e->base.bufsiz); - sv_setpvn(e->bufsv, "", 0); - } - e->base.buf = (STDCHAR *) SvPVX(e->bufsv); - if (!e->base.ptr) - e->base.ptr = e->base.buf; - if (!e->base.end) - e->base.end = e->base.buf; - if (e->base.ptr < e->base.buf - || e->base.ptr > e->base.buf + SvLEN(e->bufsv)) { - Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr, - e->base.buf + SvLEN(e->bufsv)); - abort(); - } - if (SvLEN(e->bufsv) < e->base.bufsiz) { - SSize_t poff = e->base.ptr - e->base.buf; - SSize_t eoff = e->base.end - e->base.buf; - e->base.buf = (STDCHAR *) SvGROW(e->bufsv, e->base.bufsiz); - e->base.ptr = e->base.buf + poff; - e->base.end = e->base.buf + eoff; - } - if (e->base.ptr < e->base.buf - || e->base.ptr > e->base.buf + SvLEN(e->bufsv)) { - Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr, - e->base.buf + SvLEN(e->bufsv)); - abort(); - } - return e->base.buf; -} - -IV -PerlIOEncode_fill(pTHX_ PerlIO * f) -{ - PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); - dSP; - IV code = 0; - PerlIO *n; - SSize_t avail; - if (PerlIO_flush(f) != 0) - return -1; - n = PerlIONext(f); - if (!PerlIO_fast_gets(n)) { - /* Things get too messy if we don't have a buffer layer - push a :perlio to do the job */ - char mode[8]; - n = PerlIO_push(aTHX_ n, &PerlIO_perlio, PerlIO_modestr(f,mode), Nullsv); - if (!n) { - Perl_die(aTHX_ "panic: cannot push :perlio for %p",f); - } - } - ENTER; - SAVETMPS; - retry: - avail = PerlIO_get_cnt(n); - if (avail <= 0) { - avail = PerlIO_fill(n); - if (avail == 0) { - avail = PerlIO_get_cnt(n); - } - else { - if (!PerlIO_error(n) && PerlIO_eof(n)) - avail = 0; - } - } - if (avail > 0) { - STDCHAR *ptr = PerlIO_get_ptr(n); - SSize_t use = avail; - SV *uni; - char *s; - STRLEN len = 0; - e->base.ptr = e->base.end = (STDCHAR *) Nullch; - (void) PerlIOEncode_get_base(aTHX_ f); - if (!e->dataSV) - e->dataSV = newSV(0); - if (SvTYPE(e->dataSV) < SVt_PV) { - sv_upgrade(e->dataSV,SVt_PV); - } - if (SvCUR(e->dataSV)) { - /* something left over from last time - create a normal - SV with new data appended - */ - if (use + SvCUR(e->dataSV) > e->base.bufsiz) { - use = e->base.bufsiz - SvCUR(e->dataSV); - } - sv_catpvn(e->dataSV,(char*)ptr,use); - } - else { - /* Create a "dummy" SV to represent the available data from layer below */ - if (SvLEN(e->dataSV) && SvPVX(e->dataSV)) { - Safefree(SvPVX(e->dataSV)); - } - if (use > e->base.bufsiz) { - use = e->base.bufsiz; - } - SvPVX(e->dataSV) = (char *) ptr; - SvLEN(e->dataSV) = 0; /* Hands off sv.c - it isn't yours */ - SvCUR_set(e->dataSV,use); - SvPOK_only(e->dataSV); - } - SvUTF8_off(e->dataSV); - PUSHMARK(sp); - XPUSHs(e->enc); - XPUSHs(e->dataSV); - XPUSHs(&PL_sv_yes); - PUTBACK; - if (perl_call_method("decode", G_SCALAR) != 1) { - Perl_die(aTHX_ "panic: decode did not return a value"); - } - SPAGAIN; - uni = POPs; - PUTBACK; - /* Now get translated string (forced to UTF-8) and use as buffer */ - if (SvPOK(uni)) { - s = SvPVutf8(uni, len); -#ifdef PARANOID_ENCODE_CHECKS - if (len && !is_utf8_string((U8*)s,len)) { - Perl_warn(aTHX_ "panic: decode did not return UTF-8 '%.*s'",(int) len,s); - } -#endif - } - if (len > 0) { - /* Got _something */ - /* if decode gave us back dataSV then data may vanish when - we do ptrcnt adjust - so take our copy now. - (The copy is a pain - need a put-it-here option for decode.) - */ - sv_setpvn(e->bufsv,s,len); - e->base.ptr = e->base.buf = (STDCHAR*)SvPVX(e->bufsv); - e->base.end = e->base.ptr + SvCUR(e->bufsv); - PerlIOBase(f)->flags |= PERLIO_F_RDBUF; - SvUTF8_on(e->bufsv); - - /* Adjust ptr/cnt not taking anything which - did not translate - not clear this is a win */ - /* compute amount we took */ - use -= SvCUR(e->dataSV); - PerlIO_set_ptrcnt(n, ptr+use, (avail-use)); - /* and as we did not take it it isn't pending */ - SvCUR_set(e->dataSV,0); - } else { - /* Got nothing - assume partial character so we need some more */ - /* Make sure e->dataSV is a normal SV before re-filling as - buffer alias will change under us - */ - s = SvPV(e->dataSV,len); - sv_setpvn(e->dataSV,s,len); - PerlIO_set_ptrcnt(n, ptr+use, (avail-use)); - goto retry; - } - FREETMPS; - LEAVE; - return code; - } - else { - if (avail == 0) - PerlIOBase(f)->flags |= PERLIO_F_EOF; - else - PerlIOBase(f)->flags |= PERLIO_F_ERROR; - return -1; - } -} - -IV -PerlIOEncode_flush(pTHX_ PerlIO * f) -{ - PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); - IV code = 0; - if (e->bufsv && (e->base.ptr > e->base.buf)) { - dSP; - SV *str; - char *s; - STRLEN len; - SSize_t count = 0; - if (PerlIOBase(f)->flags & PERLIO_F_WRBUF) { - /* Write case encode the buffer and write() to layer below */ - ENTER; - SAVETMPS; - PUSHMARK(sp); - XPUSHs(e->enc); - SvCUR_set(e->bufsv, e->base.ptr - e->base.buf); - SvUTF8_on(e->bufsv); - XPUSHs(e->bufsv); - XPUSHs(&PL_sv_yes); - PUTBACK; - if (perl_call_method("encode", G_SCALAR) != 1) { - Perl_die(aTHX_ "panic: encode did not return a value"); - } - SPAGAIN; - str = POPs; - PUTBACK; - s = SvPV(str, len); - count = PerlIO_write(PerlIONext(f),s,len); - if (count != len) { - code = -1; - } - FREETMPS; - LEAVE; - if (PerlIO_flush(PerlIONext(f)) != 0) { - code = -1; - } - if (SvCUR(e->bufsv)) { - /* Did not all translate */ - e->base.ptr = e->base.buf+SvCUR(e->bufsv); - return code; - } - } - else if (PerlIOBase(f)->flags & PERLIO_F_RDBUF) { - /* read case */ - /* if we have any untranslated stuff then unread that first */ - if (e->dataSV && SvCUR(e->dataSV)) { - s = SvPV(e->dataSV, len); - count = PerlIO_unread(PerlIONext(f),s,len); - if (count != len) { - code = -1; - } - } - /* See if there is anything left in the buffer */ - if (e->base.ptr < e->base.end) { - /* Bother - have unread data. - re-encode and unread() to layer below - */ - ENTER; - SAVETMPS; - str = sv_newmortal(); - sv_upgrade(str, SVt_PV); - SvPVX(str) = (char*)e->base.ptr; - SvLEN(str) = 0; - SvCUR_set(str, e->base.end - e->base.ptr); - SvPOK_only(str); - SvUTF8_on(str); - PUSHMARK(sp); - XPUSHs(e->enc); - XPUSHs(str); - XPUSHs(&PL_sv_yes); - PUTBACK; - if (perl_call_method("encode", G_SCALAR) != 1) { - Perl_die(aTHX_ "panic: encode did not return a value"); - } - SPAGAIN; - str = POPs; - PUTBACK; - s = SvPV(str, len); - count = PerlIO_unread(PerlIONext(f),s,len); - if (count != len) { - code = -1; - } - FREETMPS; - LEAVE; - } - } - e->base.ptr = e->base.end = e->base.buf; - PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF); - } - return code; -} - -IV -PerlIOEncode_close(pTHX_ PerlIO * f) -{ - PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); - IV code = PerlIOBase_close(aTHX_ f); - if (e->bufsv) { - if (e->base.buf && e->base.ptr > e->base.buf) { - Perl_croak(aTHX_ "Close with partial character"); - } - SvREFCNT_dec(e->bufsv); - e->bufsv = Nullsv; - } - e->base.buf = NULL; - e->base.ptr = NULL; - e->base.end = NULL; - PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF); - return code; -} - -Off_t -PerlIOEncode_tell(pTHX_ PerlIO * f) -{ - PerlIOBuf *b = PerlIOSelf(f, PerlIOBuf); - /* Unfortunately the only way to get a postion is to (re-)translate, - the UTF8 we have in bufefr and then ask layer below - */ - PerlIO_flush(f); - if (b->buf && b->ptr > b->buf) { - Perl_croak(aTHX_ "Cannot tell at partial character"); - } - return PerlIO_tell(PerlIONext(f)); -} - -PerlIO * -PerlIOEncode_dup(pTHX_ PerlIO * f, PerlIO * o, - CLONE_PARAMS * params, int flags) -{ - if ((f = PerlIOBase_dup(aTHX_ f, o, params, flags))) { - PerlIOEncode *fe = PerlIOSelf(f, PerlIOEncode); - PerlIOEncode *oe = PerlIOSelf(o, PerlIOEncode); - if (oe->enc) { - fe->enc = PerlIO_sv_dup(aTHX_ oe->enc, params); - } - } - return f; -} - -PerlIO_funcs PerlIO_encode = { - "encoding", - sizeof(PerlIOEncode), - PERLIO_K_BUFFERED|PERLIO_K_DESTRUCT, - PerlIOEncode_pushed, - PerlIOEncode_popped, - PerlIOBuf_open, - PerlIOEncode_getarg, - PerlIOBase_fileno, - PerlIOEncode_dup, - PerlIOBuf_read, - PerlIOBuf_unread, - PerlIOBuf_write, - PerlIOBuf_seek, - PerlIOEncode_tell, - PerlIOEncode_close, - PerlIOEncode_flush, - PerlIOEncode_fill, - PerlIOBase_eof, - PerlIOBase_error, - PerlIOBase_clearerr, - PerlIOBase_setlinebuf, - PerlIOEncode_get_base, - PerlIOBuf_bufsiz, - PerlIOBuf_get_ptr, - PerlIOBuf_get_cnt, - PerlIOBuf_set_ptrcnt, -}; -#endif /* encode layer */ void Encode_XSEncoding(pTHX_ encode_t * enc) @@ -636,33 +189,56 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, case ENCODE_NOREP: if (dir == enc->f_utf8) { - if (!check && ckWARN_d(WARN_UTF8)) { - STRLEN clen; - UV ch = - utf8n_to_uvuni(s + slen, (SvCUR(src) - slen), - &clen, 0); - Perl_warner(aTHX_ packWARN(WARN_UTF8), - "\"\\N{U+%" UVxf - "}\" does not map to %s", ch, - enc->name[0]); - /* FIXME: Skip over the character, copy in replacement and continue - * but that is messy so for now just fail. - */ - return &PL_sv_undef; + STRLEN clen; + UV ch = + utf8n_to_uvuni(s + slen, (SvCUR(src) - slen), + &clen, 0); + if (!check) { /* fallback char */ + sdone += slen + clen; + ddone += dlen + enc->replen; + sv_catpvn(dst, enc->rep, enc->replen); } - else { - return &PL_sv_undef; + else if (check == -1){ /* perlqq */ + SV* perlqq = + sv_2mortal(newSVpvf("\\x{%x}", ch)); + sdone += slen + clen; + ddone += dlen + SvLEN(perlqq); + sv_catsv(dst, perlqq); + } + else { + Perl_croak(aTHX_ + "\"\\N{U+%" UVxf + "}\" does not map to %s", ch, + enc->name[0]); } + } + else { + if (!check){ /* fallback char */ + sdone += slen + 1; + ddone += dlen + strlen(FBCHAR_UTF8); + sv_catpv(dst, FBCHAR_UTF8); } + else if (check == -1){ /* perlqq */ + SV* perlqq = + sv_2mortal(newSVpvf("\\x%02X", s[slen])); + sdone += slen + 1; + ddone += dlen + SvLEN(perlqq); + sv_catsv(dst, perlqq); + } else { - /* UTF-8 is supposed to be "Universal" so should not happen - for real characters, but some encodings have non-assigned - codes which may occur. - */ - Perl_croak(aTHX_ "%s \"\\x%02X\" does not map to Unicode (%d)", - enc->name[0], (U8) s[slen], code); + /* UTF-8 is supposed to be "Universal" so should not + happen for real characters, but some encodings + have non-assigned codes which may occur. */ + Perl_croak(aTHX_ "%s \"\\x%02X\" " + "does not map to Unicode (%d)", + enc->name[0], (U8) s[slen], code); } - break; + } + dlen = SvCUR(dst); + d = SvPVX(dst) + dlen; + s = SvPVX(src) + sdone; + slen = tlen - sdone; + break; default: Perl_croak(aTHX_ "Unexpected code %d converting %s %s", @@ -722,10 +298,10 @@ CODE: } void -Method_decode(obj,src,check = FALSE) +Method_decode(obj,src,check = 0) SV * obj SV * src -bool check +int check CODE: { encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); @@ -735,10 +311,10 @@ CODE: } void -Method_encode(obj,src,check = FALSE) +Method_encode(obj,src,check = 0) SV * obj SV * src -bool check +int check CODE: { encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); @@ -761,8 +337,8 @@ CODE: int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0)); SV *result = newSVpvn("",0); STRLEN ulen; - U8 *s = SvPVbyte(str,ulen); - U8 *e = SvEND(str); + U8 *s = (U8 *)SvPVbyte(str,ulen); + U8 *e = (U8 *)SvEND(str); ST(0) = sv_2mortal(result); SvUTF8_on(result); @@ -790,7 +366,7 @@ CODE: while (s < e && s+size <= e) { UV ord = enc_unpack(aTHX_ &s,e,size,endian); U8 *d; - if (size != 4 && !valid_ucs2(ord)) { + if (size != 4 && invalid_ucs2(ord)) { if (ucs2) { if (SvTRUE(chk)) { croak("%s:no surrogates allowed %"UVxf, @@ -851,8 +427,8 @@ CODE: int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0)); SV *result = newSVpvn("",0); STRLEN ulen; - U8 *s = SvPVutf8(utf8,ulen); - U8 *e = SvEND(utf8); + U8 *s = (U8 *)SvPVutf8(utf8,ulen); + U8 *e = (U8 *)SvEND(utf8); ST(0) = sv_2mortal(result); if (!endian) { endian = (size == 4) ? 'N' : 'n'; @@ -866,7 +442,7 @@ CODE: STRLEN len; UV ord = utf8n_to_uvuni(s, e-s, &len, 0); s += len; - if (size != 4 && !valid_ucs2(ord)) { + if (size != 4 && invalid_ucs2(ord)) { if (!issurrogate(ord)){ if (ucs2) { if (SvTRUE(chk)) { @@ -999,9 +575,9 @@ _utf8_to_bytes(sv, ...) RETVAL bool -is_utf8(sv, check = FALSE) +is_utf8(sv, check = 0) SV * sv -bool check +int check CODE: { if (SvGMAGICAL(sv)) /* it could be $1, for example */ @@ -1056,7 +632,7 @@ _utf8_off(sv) BOOT: { #if defined(USE_PERLIO) && !defined(USE_SFIO) - PerlIO_define_layer(aTHX_ &PerlIO_encode); +/* PerlIO_define_layer(aTHX_ &PerlIO_encode); */ #endif #include "def_t.exh" } |