summaryrefslogtreecommitdiff
path: root/ext/Encode/Encode.xs
diff options
context:
space:
mode:
Diffstat (limited to 'ext/Encode/Encode.xs')
-rw-r--r--ext/Encode/Encode.xs546
1 files changed, 61 insertions, 485 deletions
diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs
index 229359e7f2..9c30c4a9d7 100644
--- a/ext/Encode/Encode.xs
+++ b/ext/Encode/Encode.xs
@@ -7,15 +7,14 @@
#include "def_t.h"
#define FBCHAR 0xFFFd
+#define FBCHAR_UTF8 "\xEF\xBF\xBD"
#define BOM_BE 0xFeFF
#define BOM16LE 0xFFFe
#define BOM32LE 0xFFFe0000
-
-#define valid_ucs2(x) ((0 <= (x) && (x) < 0xD800) || (0xDFFF < (x) && (x) <= 0xFFFF))
-
#define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
#define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
#define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
+#define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
static UV
enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian)
@@ -53,7 +52,7 @@ enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian)
void
enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value)
{
- U8 *d = SvGROW(result,SvCUR(result)+size);
+ U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size);
switch(endian) {
case 'v':
case 'V':
@@ -93,452 +92,6 @@ enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value)
}
UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
-#if defined(USE_PERLIO) && !defined(USE_SFIO)
-/* Define an encoding "layer" in the perliol.h sense.
- The layer defined here "inherits" in an object-oriented sense from the
- "perlio" layer with its PerlIOBuf_* "methods".
- The implementation is particularly efficient as until Encode settles down
- there is no point in tryint to tune it.
-
- The layer works by overloading the "fill" and "flush" methods.
-
- "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API
- to convert the encoded data to UTF-8 form, then copies it back to the
- buffer. The "base class's" read methods then see the UTF-8 data.
-
- "flush" transforms the UTF-8 data deposited by the "base class's write
- method in the buffer back into the encoded form using the encode OO perl API,
- then copies data back into the buffer and calls "SUPER::flush.
-
- Note that "flush" is _also_ called for read mode - we still do the (back)-translate
- so that the the base class's "flush" sees the correct number of encoded chars
- for positioning the seek pointer. (This double translation is the worst performance
- issue - particularly with all-perl encode engine.)
-
-*/
-#include "perliol.h"
-typedef struct {
- PerlIOBuf base; /* PerlIOBuf stuff */
- SV *bufsv; /* buffer seen by layers above */
- SV *dataSV; /* data we have read from layer below */
- SV *enc; /* the encoding object */
-} PerlIOEncode;
-
-SV *
-PerlIOEncode_getarg(pTHX_ PerlIO * f, CLONE_PARAMS * param, int flags)
-{
- PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
- SV *sv = &PL_sv_undef;
- if (e->enc) {
- dSP;
- ENTER;
- SAVETMPS;
- PUSHMARK(sp);
- XPUSHs(e->enc);
- PUTBACK;
- if (perl_call_method("name", G_SCALAR) == 1) {
- SPAGAIN;
- sv = newSVsv(POPs);
- PUTBACK;
- }
- }
- return sv;
-}
-
-IV
-PerlIOEncode_pushed(pTHX_ PerlIO * f, const char *mode, SV * arg)
-{
- PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
- dSP;
- IV code;
- code = PerlIOBuf_pushed(aTHX_ f, mode, Nullsv);
- ENTER;
- SAVETMPS;
- PUSHMARK(sp);
- XPUSHs(arg);
- PUTBACK;
- if (perl_call_pv("Encode::find_encoding", G_SCALAR) != 1) {
- /* should never happen */
- Perl_die(aTHX_ "Encode::find_encoding did not return a value");
- return -1;
- }
- SPAGAIN;
- e->enc = POPs;
- PUTBACK;
- if (!SvROK(e->enc)) {
- e->enc = Nullsv;
- errno = EINVAL;
- Perl_warner(aTHX_ packWARN(WARN_IO), "Cannot find encoding \"%" SVf "\"",
- arg);
- code = -1;
- }
- else {
- SvREFCNT_inc(e->enc);
- PerlIOBase(f)->flags |= PERLIO_F_UTF8;
- }
- FREETMPS;
- LEAVE;
- return code;
-}
-
-IV
-PerlIOEncode_popped(pTHX_ PerlIO * f)
-{
- PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
- if (e->enc) {
- SvREFCNT_dec(e->enc);
- e->enc = Nullsv;
- }
- if (e->bufsv) {
- SvREFCNT_dec(e->bufsv);
- e->bufsv = Nullsv;
- }
- if (e->dataSV) {
- SvREFCNT_dec(e->dataSV);
- e->dataSV = Nullsv;
- }
- return 0;
-}
-
-STDCHAR *
-PerlIOEncode_get_base(pTHX_ PerlIO * f)
-{
- PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
- if (!e->base.bufsiz)
- e->base.bufsiz = 1024;
- if (!e->bufsv) {
- e->bufsv = newSV(e->base.bufsiz);
- sv_setpvn(e->bufsv, "", 0);
- }
- e->base.buf = (STDCHAR *) SvPVX(e->bufsv);
- if (!e->base.ptr)
- e->base.ptr = e->base.buf;
- if (!e->base.end)
- e->base.end = e->base.buf;
- if (e->base.ptr < e->base.buf
- || e->base.ptr > e->base.buf + SvLEN(e->bufsv)) {
- Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr,
- e->base.buf + SvLEN(e->bufsv));
- abort();
- }
- if (SvLEN(e->bufsv) < e->base.bufsiz) {
- SSize_t poff = e->base.ptr - e->base.buf;
- SSize_t eoff = e->base.end - e->base.buf;
- e->base.buf = (STDCHAR *) SvGROW(e->bufsv, e->base.bufsiz);
- e->base.ptr = e->base.buf + poff;
- e->base.end = e->base.buf + eoff;
- }
- if (e->base.ptr < e->base.buf
- || e->base.ptr > e->base.buf + SvLEN(e->bufsv)) {
- Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr,
- e->base.buf + SvLEN(e->bufsv));
- abort();
- }
- return e->base.buf;
-}
-
-IV
-PerlIOEncode_fill(pTHX_ PerlIO * f)
-{
- PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
- dSP;
- IV code = 0;
- PerlIO *n;
- SSize_t avail;
- if (PerlIO_flush(f) != 0)
- return -1;
- n = PerlIONext(f);
- if (!PerlIO_fast_gets(n)) {
- /* Things get too messy if we don't have a buffer layer
- push a :perlio to do the job */
- char mode[8];
- n = PerlIO_push(aTHX_ n, &PerlIO_perlio, PerlIO_modestr(f,mode), Nullsv);
- if (!n) {
- Perl_die(aTHX_ "panic: cannot push :perlio for %p",f);
- }
- }
- ENTER;
- SAVETMPS;
- retry:
- avail = PerlIO_get_cnt(n);
- if (avail <= 0) {
- avail = PerlIO_fill(n);
- if (avail == 0) {
- avail = PerlIO_get_cnt(n);
- }
- else {
- if (!PerlIO_error(n) && PerlIO_eof(n))
- avail = 0;
- }
- }
- if (avail > 0) {
- STDCHAR *ptr = PerlIO_get_ptr(n);
- SSize_t use = avail;
- SV *uni;
- char *s;
- STRLEN len = 0;
- e->base.ptr = e->base.end = (STDCHAR *) Nullch;
- (void) PerlIOEncode_get_base(aTHX_ f);
- if (!e->dataSV)
- e->dataSV = newSV(0);
- if (SvTYPE(e->dataSV) < SVt_PV) {
- sv_upgrade(e->dataSV,SVt_PV);
- }
- if (SvCUR(e->dataSV)) {
- /* something left over from last time - create a normal
- SV with new data appended
- */
- if (use + SvCUR(e->dataSV) > e->base.bufsiz) {
- use = e->base.bufsiz - SvCUR(e->dataSV);
- }
- sv_catpvn(e->dataSV,(char*)ptr,use);
- }
- else {
- /* Create a "dummy" SV to represent the available data from layer below */
- if (SvLEN(e->dataSV) && SvPVX(e->dataSV)) {
- Safefree(SvPVX(e->dataSV));
- }
- if (use > e->base.bufsiz) {
- use = e->base.bufsiz;
- }
- SvPVX(e->dataSV) = (char *) ptr;
- SvLEN(e->dataSV) = 0; /* Hands off sv.c - it isn't yours */
- SvCUR_set(e->dataSV,use);
- SvPOK_only(e->dataSV);
- }
- SvUTF8_off(e->dataSV);
- PUSHMARK(sp);
- XPUSHs(e->enc);
- XPUSHs(e->dataSV);
- XPUSHs(&PL_sv_yes);
- PUTBACK;
- if (perl_call_method("decode", G_SCALAR) != 1) {
- Perl_die(aTHX_ "panic: decode did not return a value");
- }
- SPAGAIN;
- uni = POPs;
- PUTBACK;
- /* Now get translated string (forced to UTF-8) and use as buffer */
- if (SvPOK(uni)) {
- s = SvPVutf8(uni, len);
-#ifdef PARANOID_ENCODE_CHECKS
- if (len && !is_utf8_string((U8*)s,len)) {
- Perl_warn(aTHX_ "panic: decode did not return UTF-8 '%.*s'",(int) len,s);
- }
-#endif
- }
- if (len > 0) {
- /* Got _something */
- /* if decode gave us back dataSV then data may vanish when
- we do ptrcnt adjust - so take our copy now.
- (The copy is a pain - need a put-it-here option for decode.)
- */
- sv_setpvn(e->bufsv,s,len);
- e->base.ptr = e->base.buf = (STDCHAR*)SvPVX(e->bufsv);
- e->base.end = e->base.ptr + SvCUR(e->bufsv);
- PerlIOBase(f)->flags |= PERLIO_F_RDBUF;
- SvUTF8_on(e->bufsv);
-
- /* Adjust ptr/cnt not taking anything which
- did not translate - not clear this is a win */
- /* compute amount we took */
- use -= SvCUR(e->dataSV);
- PerlIO_set_ptrcnt(n, ptr+use, (avail-use));
- /* and as we did not take it it isn't pending */
- SvCUR_set(e->dataSV,0);
- } else {
- /* Got nothing - assume partial character so we need some more */
- /* Make sure e->dataSV is a normal SV before re-filling as
- buffer alias will change under us
- */
- s = SvPV(e->dataSV,len);
- sv_setpvn(e->dataSV,s,len);
- PerlIO_set_ptrcnt(n, ptr+use, (avail-use));
- goto retry;
- }
- FREETMPS;
- LEAVE;
- return code;
- }
- else {
- if (avail == 0)
- PerlIOBase(f)->flags |= PERLIO_F_EOF;
- else
- PerlIOBase(f)->flags |= PERLIO_F_ERROR;
- return -1;
- }
-}
-
-IV
-PerlIOEncode_flush(pTHX_ PerlIO * f)
-{
- PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
- IV code = 0;
- if (e->bufsv && (e->base.ptr > e->base.buf)) {
- dSP;
- SV *str;
- char *s;
- STRLEN len;
- SSize_t count = 0;
- if (PerlIOBase(f)->flags & PERLIO_F_WRBUF) {
- /* Write case encode the buffer and write() to layer below */
- ENTER;
- SAVETMPS;
- PUSHMARK(sp);
- XPUSHs(e->enc);
- SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
- SvUTF8_on(e->bufsv);
- XPUSHs(e->bufsv);
- XPUSHs(&PL_sv_yes);
- PUTBACK;
- if (perl_call_method("encode", G_SCALAR) != 1) {
- Perl_die(aTHX_ "panic: encode did not return a value");
- }
- SPAGAIN;
- str = POPs;
- PUTBACK;
- s = SvPV(str, len);
- count = PerlIO_write(PerlIONext(f),s,len);
- if (count != len) {
- code = -1;
- }
- FREETMPS;
- LEAVE;
- if (PerlIO_flush(PerlIONext(f)) != 0) {
- code = -1;
- }
- if (SvCUR(e->bufsv)) {
- /* Did not all translate */
- e->base.ptr = e->base.buf+SvCUR(e->bufsv);
- return code;
- }
- }
- else if (PerlIOBase(f)->flags & PERLIO_F_RDBUF) {
- /* read case */
- /* if we have any untranslated stuff then unread that first */
- if (e->dataSV && SvCUR(e->dataSV)) {
- s = SvPV(e->dataSV, len);
- count = PerlIO_unread(PerlIONext(f),s,len);
- if (count != len) {
- code = -1;
- }
- }
- /* See if there is anything left in the buffer */
- if (e->base.ptr < e->base.end) {
- /* Bother - have unread data.
- re-encode and unread() to layer below
- */
- ENTER;
- SAVETMPS;
- str = sv_newmortal();
- sv_upgrade(str, SVt_PV);
- SvPVX(str) = (char*)e->base.ptr;
- SvLEN(str) = 0;
- SvCUR_set(str, e->base.end - e->base.ptr);
- SvPOK_only(str);
- SvUTF8_on(str);
- PUSHMARK(sp);
- XPUSHs(e->enc);
- XPUSHs(str);
- XPUSHs(&PL_sv_yes);
- PUTBACK;
- if (perl_call_method("encode", G_SCALAR) != 1) {
- Perl_die(aTHX_ "panic: encode did not return a value");
- }
- SPAGAIN;
- str = POPs;
- PUTBACK;
- s = SvPV(str, len);
- count = PerlIO_unread(PerlIONext(f),s,len);
- if (count != len) {
- code = -1;
- }
- FREETMPS;
- LEAVE;
- }
- }
- e->base.ptr = e->base.end = e->base.buf;
- PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF);
- }
- return code;
-}
-
-IV
-PerlIOEncode_close(pTHX_ PerlIO * f)
-{
- PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
- IV code = PerlIOBase_close(aTHX_ f);
- if (e->bufsv) {
- if (e->base.buf && e->base.ptr > e->base.buf) {
- Perl_croak(aTHX_ "Close with partial character");
- }
- SvREFCNT_dec(e->bufsv);
- e->bufsv = Nullsv;
- }
- e->base.buf = NULL;
- e->base.ptr = NULL;
- e->base.end = NULL;
- PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF);
- return code;
-}
-
-Off_t
-PerlIOEncode_tell(pTHX_ PerlIO * f)
-{
- PerlIOBuf *b = PerlIOSelf(f, PerlIOBuf);
- /* Unfortunately the only way to get a postion is to (re-)translate,
- the UTF8 we have in bufefr and then ask layer below
- */
- PerlIO_flush(f);
- if (b->buf && b->ptr > b->buf) {
- Perl_croak(aTHX_ "Cannot tell at partial character");
- }
- return PerlIO_tell(PerlIONext(f));
-}
-
-PerlIO *
-PerlIOEncode_dup(pTHX_ PerlIO * f, PerlIO * o,
- CLONE_PARAMS * params, int flags)
-{
- if ((f = PerlIOBase_dup(aTHX_ f, o, params, flags))) {
- PerlIOEncode *fe = PerlIOSelf(f, PerlIOEncode);
- PerlIOEncode *oe = PerlIOSelf(o, PerlIOEncode);
- if (oe->enc) {
- fe->enc = PerlIO_sv_dup(aTHX_ oe->enc, params);
- }
- }
- return f;
-}
-
-PerlIO_funcs PerlIO_encode = {
- "encoding",
- sizeof(PerlIOEncode),
- PERLIO_K_BUFFERED|PERLIO_K_DESTRUCT,
- PerlIOEncode_pushed,
- PerlIOEncode_popped,
- PerlIOBuf_open,
- PerlIOEncode_getarg,
- PerlIOBase_fileno,
- PerlIOEncode_dup,
- PerlIOBuf_read,
- PerlIOBuf_unread,
- PerlIOBuf_write,
- PerlIOBuf_seek,
- PerlIOEncode_tell,
- PerlIOEncode_close,
- PerlIOEncode_flush,
- PerlIOEncode_fill,
- PerlIOBase_eof,
- PerlIOBase_error,
- PerlIOBase_clearerr,
- PerlIOBase_setlinebuf,
- PerlIOEncode_get_base,
- PerlIOBuf_bufsiz,
- PerlIOBuf_get_ptr,
- PerlIOBuf_get_cnt,
- PerlIOBuf_set_ptrcnt,
-};
-#endif /* encode layer */
void
Encode_XSEncoding(pTHX_ encode_t * enc)
@@ -636,33 +189,56 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
case ENCODE_NOREP:
if (dir == enc->f_utf8) {
- if (!check && ckWARN_d(WARN_UTF8)) {
- STRLEN clen;
- UV ch =
- utf8n_to_uvuni(s + slen, (SvCUR(src) - slen),
- &clen, 0);
- Perl_warner(aTHX_ packWARN(WARN_UTF8),
- "\"\\N{U+%" UVxf
- "}\" does not map to %s", ch,
- enc->name[0]);
- /* FIXME: Skip over the character, copy in replacement and continue
- * but that is messy so for now just fail.
- */
- return &PL_sv_undef;
+ STRLEN clen;
+ UV ch =
+ utf8n_to_uvuni(s + slen, (SvCUR(src) - slen),
+ &clen, 0);
+ if (!check) { /* fallback char */
+ sdone += slen + clen;
+ ddone += dlen + enc->replen;
+ sv_catpvn(dst, enc->rep, enc->replen);
}
- else {
- return &PL_sv_undef;
+ else if (check == -1){ /* perlqq */
+ SV* perlqq =
+ sv_2mortal(newSVpvf("\\x{%x}", ch));
+ sdone += slen + clen;
+ ddone += dlen + SvLEN(perlqq);
+ sv_catsv(dst, perlqq);
+ }
+ else {
+ Perl_croak(aTHX_
+ "\"\\N{U+%" UVxf
+ "}\" does not map to %s", ch,
+ enc->name[0]);
}
+ }
+ else {
+ if (!check){ /* fallback char */
+ sdone += slen + 1;
+ ddone += dlen + strlen(FBCHAR_UTF8);
+ sv_catpv(dst, FBCHAR_UTF8);
}
+ else if (check == -1){ /* perlqq */
+ SV* perlqq =
+ sv_2mortal(newSVpvf("\\x%02X", s[slen]));
+ sdone += slen + 1;
+ ddone += dlen + SvLEN(perlqq);
+ sv_catsv(dst, perlqq);
+ }
else {
- /* UTF-8 is supposed to be "Universal" so should not happen
- for real characters, but some encodings have non-assigned
- codes which may occur.
- */
- Perl_croak(aTHX_ "%s \"\\x%02X\" does not map to Unicode (%d)",
- enc->name[0], (U8) s[slen], code);
+ /* UTF-8 is supposed to be "Universal" so should not
+ happen for real characters, but some encodings
+ have non-assigned codes which may occur. */
+ Perl_croak(aTHX_ "%s \"\\x%02X\" "
+ "does not map to Unicode (%d)",
+ enc->name[0], (U8) s[slen], code);
}
- break;
+ }
+ dlen = SvCUR(dst);
+ d = SvPVX(dst) + dlen;
+ s = SvPVX(src) + sdone;
+ slen = tlen - sdone;
+ break;
default:
Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
@@ -722,10 +298,10 @@ CODE:
}
void
-Method_decode(obj,src,check = FALSE)
+Method_decode(obj,src,check = 0)
SV * obj
SV * src
-bool check
+int check
CODE:
{
encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
@@ -735,10 +311,10 @@ CODE:
}
void
-Method_encode(obj,src,check = FALSE)
+Method_encode(obj,src,check = 0)
SV * obj
SV * src
-bool check
+int check
CODE:
{
encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
@@ -761,8 +337,8 @@ CODE:
int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
SV *result = newSVpvn("",0);
STRLEN ulen;
- U8 *s = SvPVbyte(str,ulen);
- U8 *e = SvEND(str);
+ U8 *s = (U8 *)SvPVbyte(str,ulen);
+ U8 *e = (U8 *)SvEND(str);
ST(0) = sv_2mortal(result);
SvUTF8_on(result);
@@ -790,7 +366,7 @@ CODE:
while (s < e && s+size <= e) {
UV ord = enc_unpack(aTHX_ &s,e,size,endian);
U8 *d;
- if (size != 4 && !valid_ucs2(ord)) {
+ if (size != 4 && invalid_ucs2(ord)) {
if (ucs2) {
if (SvTRUE(chk)) {
croak("%s:no surrogates allowed %"UVxf,
@@ -851,8 +427,8 @@ CODE:
int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
SV *result = newSVpvn("",0);
STRLEN ulen;
- U8 *s = SvPVutf8(utf8,ulen);
- U8 *e = SvEND(utf8);
+ U8 *s = (U8 *)SvPVutf8(utf8,ulen);
+ U8 *e = (U8 *)SvEND(utf8);
ST(0) = sv_2mortal(result);
if (!endian) {
endian = (size == 4) ? 'N' : 'n';
@@ -866,7 +442,7 @@ CODE:
STRLEN len;
UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
s += len;
- if (size != 4 && !valid_ucs2(ord)) {
+ if (size != 4 && invalid_ucs2(ord)) {
if (!issurrogate(ord)){
if (ucs2) {
if (SvTRUE(chk)) {
@@ -999,9 +575,9 @@ _utf8_to_bytes(sv, ...)
RETVAL
bool
-is_utf8(sv, check = FALSE)
+is_utf8(sv, check = 0)
SV * sv
-bool check
+int check
CODE:
{
if (SvGMAGICAL(sv)) /* it could be $1, for example */
@@ -1056,7 +632,7 @@ _utf8_off(sv)
BOOT:
{
#if defined(USE_PERLIO) && !defined(USE_SFIO)
- PerlIO_define_layer(aTHX_ &PerlIO_encode);
+/* PerlIO_define_layer(aTHX_ &PerlIO_encode); */
#endif
#include "def_t.exh"
}