summaryrefslogtreecommitdiff
path: root/cpan/Encode
diff options
context:
space:
mode:
authorSteve Hay <steve.m.hay@googlemail.com>2018-02-12 20:37:36 +0000
committerSteve Hay <steve.m.hay@googlemail.com>2018-02-12 20:37:36 +0000
commitc31ca2013f287840fcddf498ead9602666569966 (patch)
tree4f2383e147c320c2e2cba3b369f1f8690c02849b /cpan/Encode
parentc16e7f98327a78a23d0eba94da62bf70782165ae (diff)
downloadperl-c31ca2013f287840fcddf498ead9602666569966.tar.gz
Upgrade Encode from version 2.94 to 2.96
Diffstat (limited to 'cpan/Encode')
-rw-r--r--cpan/Encode/Encode.pm4
-rw-r--r--cpan/Encode/Encode.xs192
-rw-r--r--cpan/Encode/Encode/encode.h7
-rw-r--r--cpan/Encode/Unicode/Unicode.pm2
-rw-r--r--cpan/Encode/Unicode/Unicode.xs6
-rw-r--r--cpan/Encode/encengine.c84
-rw-r--r--cpan/Encode/encoding.pm4
7 files changed, 193 insertions, 106 deletions
diff --git a/cpan/Encode/Encode.pm b/cpan/Encode/Encode.pm
index 249ac6b138..c1de56100f 100644
--- a/cpan/Encode/Encode.pm
+++ b/cpan/Encode/Encode.pm
@@ -1,5 +1,5 @@
#
-# $Id: Encode.pm,v 2.94 2018/01/09 05:53:00 dankogai Exp dankogai $
+# $Id: Encode.pm,v 2.96 2018/02/11 05:32:30 dankogai Exp $
#
package Encode;
use strict;
@@ -7,7 +7,7 @@ use warnings;
use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
our $VERSION;
BEGIN {
- $VERSION = sprintf "%d.%02d", q$Revision: 2.94 $ =~ /(\d+)/g;
+ $VERSION = sprintf "%d.%02d", q$Revision: 2.96 $ =~ /(\d+)/g;
require XSLoader;
XSLoader::load( __PACKAGE__, $VERSION );
}
diff --git a/cpan/Encode/Encode.xs b/cpan/Encode/Encode.xs
index 6c077bec3a..bc4a77d6d2 100644
--- a/cpan/Encode/Encode.xs
+++ b/cpan/Encode/Encode.xs
@@ -1,5 +1,5 @@
/*
- $Id: Encode.xs,v 2.41 2017/06/10 17:23:50 dankogai Exp $
+ $Id: Encode.xs,v 2.42 2018/02/08 00:26:15 dankogai Exp $
*/
#define PERL_NO_GET_CONTEXT
@@ -35,6 +35,13 @@ UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
#define SvIV_nomg SvIV
#endif
+#ifndef UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+# define UTF8_DISALLOW_ILLEGAL_INTERCHANGE 0
+# define UTF8_ALLOW_NON_STRICT (UTF8_ALLOW_FE_FF|UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
+#else
+# define UTF8_ALLOW_NON_STRICT 0
+#endif
+
static void
Encode_XSEncoding(pTHX_ encode_t * enc)
{
@@ -177,65 +184,66 @@ encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, U8 *
if (offset) {
s += *offset;
if (slen > *offset){ /* safeguard against slen overflow */
- slen -= *offset;
+ slen -= *offset;
}else{
- slen = 0;
+ slen = 0;
}
tlen = slen;
}
if (slen == 0){
- SvCUR_set(dst, 0);
- SvPOK_only(dst);
- goto ENCODE_END;
+ SvCUR_set(dst, 0);
+ SvPOK_only(dst);
+ goto ENCODE_END;
}
while( (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check,
- trm, trmlen)) )
+ trm, trmlen)) )
{
- SvCUR_set(dst, dlen+ddone);
- SvPOK_only(dst);
-
- if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL ||
- code == ENCODE_FOUND_TERM) {
- break;
- }
- switch (code) {
- case ENCODE_NOSPACE:
- {
- STRLEN more = 0; /* make sure you initialize! */
- STRLEN sleft;
- sdone += slen;
- ddone += dlen;
- sleft = tlen - sdone;
+ SvCUR_set(dst, dlen+ddone);
+ SvPOK_only(dst);
+
+ if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL ||
+ code == ENCODE_FOUND_TERM) {
+ break;
+ }
+ switch (code) {
+ case ENCODE_NOSPACE:
+ {
+ STRLEN more = 0; /* make sure you initialize! */
+ STRLEN sleft;
+ sdone += slen;
+ ddone += dlen;
+ sleft = tlen - sdone;
#if ENCODE_XS_PROFILE >= 2
- Perl_warn(aTHX_
- "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
- more, sdone, sleft, SvLEN(dst));
+ Perl_warn(aTHX_
+ "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
+ more, sdone, sleft, SvLEN(dst));
#endif
- if (sdone != 0) { /* has src ever been processed ? */
+ if (sdone != 0) { /* has src ever been processed ? */
#if ENCODE_XS_USEFP == 2
- more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone
- - SvLEN(dst);
+ more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone
+ - SvLEN(dst);
#elif ENCODE_XS_USEFP
- more = (STRLEN)((1.0*SvLEN(dst)+1)/sdone * sleft);
+ more = (STRLEN)((1.0*SvLEN(dst)+1)/sdone * sleft);
#else
- /* safe until SvLEN(dst) == MAX_INT/16 */
- more = (16*SvLEN(dst)+1)/sdone/16 * sleft;
+ /* safe until SvLEN(dst) == MAX_INT/16 */
+ more = (16*SvLEN(dst)+1)/sdone/16 * sleft;
#endif
+ }
+ more += UTF8_MAXLEN; /* insurance policy */
+ d = (U8 *) SvGROW(dst, SvLEN(dst) + more);
+ /* dst need to grow need MORE bytes! */
+ if (ddone >= SvLEN(dst)) {
+ Perl_croak(aTHX_ "Destination couldn't be grown.");
+ }
+ dlen = SvLEN(dst)-ddone-1;
+ d += ddone;
+ s += slen;
+ slen = tlen-sdone;
+ continue;
}
- more += UTF8_MAXLEN; /* insurance policy */
- d = (U8 *) SvGROW(dst, SvLEN(dst) + more);
- /* dst need to grow need MORE bytes! */
- if (ddone >= SvLEN(dst)) {
- Perl_croak(aTHX_ "Destination couldn't be grown.");
- }
- dlen = SvLEN(dst)-ddone-1;
- d += ddone;
- s += slen;
- slen = tlen-sdone;
- continue;
- }
+
case ENCODE_NOREP:
/* encoding */
if (dir == enc->f_utf8) {
@@ -319,18 +327,18 @@ encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, U8 *
}
/* settle variables when fallback */
d = (U8 *)SvEND(dst);
- dlen = SvLEN(dst) - ddone - 1;
+ dlen = SvLEN(dst) - ddone - 1;
s = (U8*)SvPVX(src) + sdone;
slen = tlen - sdone;
break;
- default:
- Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
- code, (dir == enc->f_utf8) ? "to" : "from",
- enc->name[0]);
- return &PL_sv_undef;
- }
- }
+ default:
+ Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
+ code, (dir == enc->f_utf8) ? "to" : "from",
+ enc->name[0]);
+ return &PL_sv_undef;
+ }
+ } /* End of looping through the string */
ENCODE_SET_SRC:
if (check && !(check & ENCODE_LEAVE_SRC)){
sdone = SvCUR(src) - (slen+sdone);
@@ -354,7 +362,7 @@ encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, U8 *
}
#endif
- if (offset)
+ if (offset)
*offset += sdone + slen;
ENCODE_END:
@@ -378,6 +386,13 @@ strict_utf8(pTHX_ SV* sv)
return SvTRUE(*svp);
}
+/* Modern perls have the capability to do this more efficiently and portably */
+#ifdef is_utf8_string_loc_flags
+# define CAN_USE_BASE_PERL
+#endif
+
+#ifndef CAN_USE_BASE_PERL
+
/*
* https://github.com/dankogai/p5-encode/pull/56#issuecomment-231959126
*/
@@ -433,10 +448,27 @@ convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen)
return uv;
}
+#endif /* CAN_USE_BASE_PERL */
+
static U8*
process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
bool encode, bool strict, bool stop_at_partial)
{
+ /* Copies the purportedly UTF-8 encoded string starting at 's' and ending
+ * at 'e' - 1 to 'dst', checking as it goes along that the string actually
+ * is valid UTF-8. There are two levels of strictness checking. If
+ * 'strict' is FALSE, the string is checked for being well-formed UTF-8, as
+ * extended by Perl. Additionally, if 'strict' is TRUE, above-Unicode code
+ * points, surrogates, and non-character code points are checked for. When
+ * invalid input is encountered, some action is taken, exactly what depends
+ * on the flags in 'check_sv'. 'encode' gives if this is from an encode
+ * operation (if TRUE), or a decode one. This function returns the
+ * position in 's' of the start of the next character beyond where it got
+ * to. If there were no problems, that will be 'e'. If 'stop_at_partial'
+ * is TRUE, if the final character before 'e' is incomplete, but valid as
+ * far as is available, no action will be taken on that partial character,
+ * and the return value will point to its first byte */
+
UV uv;
STRLEN ulen;
SV *fallback_cb;
@@ -445,6 +477,9 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
STRLEN dlen;
char esc[UTF8_MAXLEN * 6 + 1];
STRLEN i;
+ const U32 flags = (strict)
+ ? UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+ : UTF8_ALLOW_NON_STRICT;
if (SvROK(check_sv)) {
/* croak("UTF-8 decoder doesn't support callback CHECK"); */
@@ -462,7 +497,44 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
dlen = (s && e && s < e) ? e-s+1 : 1;
d = (U8 *) SvGROW(dst, dlen);
+ stop_at_partial = stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL);
+
while (s < e) {
+
+#ifdef CAN_USE_BASE_PERL /* Use the much faster, portable implementation if
+ available */
+
+ /* If there were no errors, this will be 'e'; otherwise it will point
+ * to the first byte of the erroneous input */
+ const U8* e_or_where_failed;
+ bool valid = is_utf8_string_loc_flags(s, e - s, &e_or_where_failed, flags);
+ STRLEN len = e_or_where_failed - s;
+
+ /* Copy as far as was successful */
+ Move(s, d, len, U8);
+ d += len;
+ s = (U8 *) e_or_where_failed;
+
+ /* Are done if it was valid, or we are accepting partial characters and
+ * the only error is that the final bytes form a partial character */
+ if ( LIKELY(valid)
+ || ( stop_at_partial
+ && is_utf8_valid_partial_char_flags(s, e, flags)))
+ {
+ break;
+ }
+
+ /* Here, was not valid. If is 'strict', and is legal extended UTF-8,
+ * we know it is a code point whose value we can calculate, just not
+ * one accepted under strict. Otherwise, it is malformed in some way.
+ * In either case, the system function can calculate either the code
+ * point, or the best substitution for it */
+ uv = utf8n_to_uvchr(s, e - s, &ulen, UTF8_ALLOW_ANY);
+
+#else /* Use code for earlier perls */
+
+ PERL_UNUSED_VAR(flags);
+
if (UTF8_IS_INVARIANT(*s)) {
*d++ = *s++;
continue;
@@ -483,7 +555,7 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
else
ulen = 1;
- if ((stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) && ulen == (STRLEN)(e-s))
+ if (stop_at_partial && ulen == (STRLEN)(e-s))
break;
goto malformed_byte;
@@ -512,6 +584,16 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
ulen = 1;
malformed:
+
+#endif /* The two versions for processing come back together here, for the
+ * error handling code.
+ *
+ * Here, we are looping through the input and found an error.
+ * 'uv' is the code point in error if calculable, or the REPLACEMENT
+ * CHARACTER if not.
+ * 'ulen' is how many bytes of input this iteration of the loop
+ * consumes */
+
if (!encode && (check & (ENCODE_DIE_ON_ERR|ENCODE_WARN_ON_ERR|ENCODE_PERLQQ)))
for (i=0; i<ulen; ++i) sprintf(esc+4*i, "\\x%02X", s[i]);
if (check & ENCODE_DIE_ON_ERR){
@@ -617,7 +699,7 @@ PPCODE:
utf8_safe_downgrade(aTHX_ &src, &s, &slen, modify);
e = s+slen;
- /*
+ /*
* PerlIO check -- we assume the object is of PerlIO if renewed
*/
ENTER; SAVETMPS;
@@ -627,7 +709,7 @@ PPCODE:
if (call_method("renewed",G_SCALAR) == 1) {
SPAGAIN;
renewed = (bool)POPi;
- PUTBACK;
+ PUTBACK;
#if 0
fprintf(stderr, "renewed == %d\n", renewed);
#endif
diff --git a/cpan/Encode/Encode/encode.h b/cpan/Encode/Encode/encode.h
index 5fbcf76ad3..df5554f1cb 100644
--- a/cpan/Encode/Encode/encode.h
+++ b/cpan/Encode/Encode/encode.h
@@ -88,7 +88,12 @@ extern void Encode_DefineEncoding(encode_t *enc);
#define ENCODE_FALLBACK 4
#define ENCODE_FOUND_TERM 5
-#define FBCHAR_UTF8 "\xEF\xBF\xBD"
+/* Use the perl core value if available; it is portable to EBCDIC */
+#ifdef REPLACEMENT_CHARACTER_UTF8
+# define FBCHAR_UTF8 REPLACEMENT_CHARACTER_UTF8
+#else
+# define FBCHAR_UTF8 "\xEF\xBF\xBD"
+#endif
#define ENCODE_DIE_ON_ERR 0x0001 /* croaks immediately */
#define ENCODE_WARN_ON_ERR 0x0002 /* warn on error; may proceed */
diff --git a/cpan/Encode/Unicode/Unicode.pm b/cpan/Encode/Unicode/Unicode.pm
index c56745d7b1..2a8b477784 100644
--- a/cpan/Encode/Unicode/Unicode.pm
+++ b/cpan/Encode/Unicode/Unicode.pm
@@ -3,7 +3,7 @@ package Encode::Unicode;
use strict;
use warnings;
-our $VERSION = do { my @r = ( q$Revision: 2.16 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
+our $VERSION = do { my @r = ( q$Revision: 2.17 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
use XSLoader;
XSLoader::load( __PACKAGE__, $VERSION );
diff --git a/cpan/Encode/Unicode/Unicode.xs b/cpan/Encode/Unicode/Unicode.xs
index b3b1d2fea8..b459786d16 100644
--- a/cpan/Encode/Unicode/Unicode.xs
+++ b/cpan/Encode/Unicode/Unicode.xs
@@ -1,5 +1,5 @@
/*
- $Id: Unicode.xs,v 2.16 2017/06/10 17:23:50 dankogai Exp $
+ $Id: Unicode.xs,v 2.17 2018/02/08 00:26:15 dankogai Exp $
*/
#define PERL_NO_GET_CONTEXT
@@ -315,7 +315,7 @@ CODE:
resultbuflen = SvLEN(result);
}
- d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord,
+ d = uvchr_to_utf8_flags(resultbuf+SvCUR(result), ord,
UNICODE_WARN_ILLEGAL_INTERCHANGE);
SvCUR_set(result, d - (U8 *)SvPVX(result));
}
@@ -407,7 +407,7 @@ CODE:
}
while (s < e && s+UTF8SKIP(s) <= e) {
STRLEN len;
- UV ord = utf8n_to_uvuni(s, e-s, &len, (UTF8_DISALLOW_SURROGATE
+ UV ord = utf8n_to_uvchr(s, e-s, &len, (UTF8_DISALLOW_SURROGATE
|UTF8_WARN_SURROGATE
|UTF8_DISALLOW_FE_FF
|UTF8_WARN_FE_FF
diff --git a/cpan/Encode/encengine.c b/cpan/Encode/encengine.c
index bddf556b35..67613a89e3 100644
--- a/cpan/Encode/encengine.c
+++ b/cpan/Encode/encengine.c
@@ -102,56 +102,56 @@ do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
U8 *dend = d + dlen, *dlast = d;
int code = 0;
while (s < send) {
- const encpage_t *e = enc;
- U8 byte = *s;
- while (byte > e->max)
- e++;
- if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) {
- const U8 *cend = s + (e->slen & 0x7f);
- if (cend <= send) {
- STRLEN n;
- if ((n = e->dlen)) {
- const U8 *out = e->seq + n * (byte - e->min);
- U8 *oend = d + n;
- if (dst) {
- if (oend <= dend) {
- while (d < oend)
- *d++ = *out++;
+ const encpage_t *e = enc;
+ U8 byte = *s;
+ while (byte > e->max)
+ e++;
+ if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) {
+ const U8 *cend = s + (e->slen & 0x7f);
+ if (cend <= send) {
+ STRLEN n;
+ if ((n = e->dlen)) {
+ const U8 *out = e->seq + n * (byte - e->min);
+ U8 *oend = d + n;
+ if (dst) {
+ if (oend <= dend) {
+ while (d < oend)
+ *d++ = *out++;
+ }
+ else {
+ /* Out of space */
+ code = ENCODE_NOSPACE;
+ break;
+ }
+ }
+ else
+ d = oend;
+ }
+ enc = e->next;
+ s++;
+ if (s == cend) {
+ if (approx && (e->slen & 0x80))
+ code = ENCODE_FALLBACK;
+ last = s;
+ if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
+ code = ENCODE_FOUND_TERM;
+ break;
+ }
+ dlast = d;
+ }
}
else {
- /* Out of space */
- code = ENCODE_NOSPACE;
+ /* partial source character */
+ code = ENCODE_PARTIAL;
break;
}
- }
- else
- d = oend;
- }
- enc = e->next;
- s++;
- if (s == cend) {
- if (approx && (e->slen & 0x80))
- code = ENCODE_FALLBACK;
- last = s;
- if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
- code = ENCODE_FOUND_TERM;
- break;
- }
- dlast = d;
- }
}
else {
- /* partial source character */
- code = ENCODE_PARTIAL;
- break;
+ /* Cannot represent */
+ code = ENCODE_NOREP;
+ break;
}
}
- else {
- /* Cannot represent */
- code = ENCODE_NOREP;
- break;
- }
- }
*slen = last - src;
*dout = d - dst;
return code;
diff --git a/cpan/Encode/encoding.pm b/cpan/Encode/encoding.pm
index 1e82070afd..c3f324d29f 100644
--- a/cpan/Encode/encoding.pm
+++ b/cpan/Encode/encoding.pm
@@ -1,6 +1,6 @@
-# $Id: encoding.pm,v 2.21 2017/10/06 22:21:53 dankogai Exp dankogai $
+# $Id: encoding.pm,v 2.22 2018/02/11 05:32:03 dankogai Exp $
package encoding;
-our $VERSION = sprintf "%d.%02d", q$Revision: 2.21 $ =~ /(\d+)/g;
+our $VERSION = sprintf "%d.%02d", q$Revision: 2.22 $ =~ /(\d+)/g;
use Encode;
use strict;