Upgrade Encode from version 2.94 to 2.96

author: Steve Hay <steve.m.hay@googlemail.com> 2018-02-12 20:37:36 +0000
committer: Steve Hay <steve.m.hay@googlemail.com> 2018-02-12 20:37:36 +0000
commit: c31ca2013f287840fcddf498ead9602666569966 (patch)
tree: 4f2383e147c320c2e2cba3b369f1f8690c02849b /cpan/Encode
parent: c16e7f98327a78a23d0eba94da62bf70782165ae (diff)
download: perl-c31ca2013f287840fcddf498ead9602666569966.tar.gz
7 files changed, 193 insertions, 106 deletions
diff --git a/cpan/Encode/Encode.pm b/cpan/Encode/Encode.pm
index 249ac6b138..c1de56100f 100644
--- a/cpan/Encode/Encode.pm
+++ b/cpan/Encode/Encode.pm
@@ -1,5 +1,5 @@
 #
-# $Id: Encode.pm,v 2.94 2018/01/09 05:53:00 dankogai Exp dankogai $
+# $Id: Encode.pm,v 2.96 2018/02/11 05:32:30 dankogai Exp $
 #
 package Encode;
 use strict;
@@ -7,7 +7,7 @@ use warnings;
 use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
 our $VERSION;
 BEGIN {
-    $VERSION = sprintf "%d.%02d", q$Revision: 2.94 $ =~ /(\d+)/g;
+    $VERSION = sprintf "%d.%02d", q$Revision: 2.96 $ =~ /(\d+)/g;
     require XSLoader;
     XSLoader::load( __PACKAGE__, $VERSION );
 }
diff --git a/cpan/Encode/Encode.xs b/cpan/Encode/Encode.xs
index 6c077bec3a..bc4a77d6d2 100644
--- a/cpan/Encode/Encode.xs
+++ b/cpan/Encode/Encode.xs
@@ -1,5 +1,5 @@
 /*
- $Id: Encode.xs,v 2.41 2017/06/10 17:23:50 dankogai Exp $
+ $Id: Encode.xs,v 2.42 2018/02/08 00:26:15 dankogai Exp $
  */
 
 #define PERL_NO_GET_CONTEXT
@@ -35,6 +35,13 @@ UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
 #define SvIV_nomg SvIV
 #endif
 
+#ifndef UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+#  define UTF8_DISALLOW_ILLEGAL_INTERCHANGE 0
+#  define UTF8_ALLOW_NON_STRICT (UTF8_ALLOW_FE_FF|UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
+#else
+#  define UTF8_ALLOW_NON_STRICT 0
+#endif
+
 static void
 Encode_XSEncoding(pTHX_ encode_t * enc)
 {
@@ -177,65 +184,66 @@ encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, U8 *
     if (offset) {
       s += *offset;
       if (slen > *offset){ /* safeguard against slen overflow */
-      slen -= *offset;
+          slen -= *offset;
       }else{
-      slen = 0;
+          slen = 0;
       }
       tlen = slen;
     }
 
     if (slen == 0){
-    SvCUR_set(dst, 0);
-    SvPOK_only(dst);
-    goto ENCODE_END;
+        SvCUR_set(dst, 0);
+        SvPOK_only(dst);
+        goto ENCODE_END;
     }
 
     while( (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check,
-                 trm, trmlen)) ) 
+                 trm, trmlen)) )
     {
-    SvCUR_set(dst, dlen+ddone);
-    SvPOK_only(dst);
-    
-    if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL ||
-        code == ENCODE_FOUND_TERM) {
-        break;
-    }
-    switch (code) {
-    case ENCODE_NOSPACE:
-    {	
-        STRLEN more = 0; /* make sure you initialize! */
-        STRLEN sleft;
-        sdone += slen;
-        ddone += dlen;
-        sleft = tlen - sdone;
+        SvCUR_set(dst, dlen+ddone);
+        SvPOK_only(dst);
+
+        if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL ||
+            code == ENCODE_FOUND_TERM) {
+            break;
+        }
+        switch (code) {
+        case ENCODE_NOSPACE:
+        {
+            STRLEN more = 0; /* make sure you initialize! */
+            STRLEN sleft;
+            sdone += slen;
+            ddone += dlen;
+            sleft = tlen - sdone;
 #if ENCODE_XS_PROFILE >= 2
-        Perl_warn(aTHX_
-              "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
-              more, sdone, sleft, SvLEN(dst));
+            Perl_warn(aTHX_
+                  "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
+                  more, sdone, sleft, SvLEN(dst));
 #endif
-        if (sdone != 0) { /* has src ever been processed ? */
+            if (sdone != 0) { /* has src ever been processed ? */
 #if   ENCODE_XS_USEFP == 2
-        more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone
-            - SvLEN(dst);
+                more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone
+                    - SvLEN(dst);
 #elif ENCODE_XS_USEFP
-        more = (STRLEN)((1.0*SvLEN(dst)+1)/sdone * sleft);
+                more = (STRLEN)((1.0*SvLEN(dst)+1)/sdone * sleft);
 #else
-        /* safe until SvLEN(dst) == MAX_INT/16 */
-        more = (16*SvLEN(dst)+1)/sdone/16 * sleft;
+            /* safe until SvLEN(dst) == MAX_INT/16 */
+                more = (16*SvLEN(dst)+1)/sdone/16 * sleft;
 #endif
+            }
+            more += UTF8_MAXLEN; /* insurance policy */
+            d = (U8 *) SvGROW(dst, SvLEN(dst) + more);
+            /* dst need to grow need MORE bytes! */
+            if (ddone >= SvLEN(dst)) {
+                Perl_croak(aTHX_ "Destination couldn't be grown.");
+            }
+            dlen = SvLEN(dst)-ddone-1;
+            d   += ddone;
+            s   += slen;
+            slen = tlen-sdone;
+            continue;
         }
-        more += UTF8_MAXLEN; /* insurance policy */
-        d = (U8 *) SvGROW(dst, SvLEN(dst) + more);
-        /* dst need to grow need MORE bytes! */
-        if (ddone >= SvLEN(dst)) {
-        Perl_croak(aTHX_ "Destination couldn't be grown.");
-        }
-        dlen = SvLEN(dst)-ddone-1;
-        d   += ddone;
-        s   += slen;
-        slen = tlen-sdone;
-        continue;
-    }
+
     case ENCODE_NOREP:
         /* encoding */	
         if (dir == enc->f_utf8) {
@@ -319,18 +327,18 @@ encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, U8 *
         }
         /* settle variables when fallback */
         d    = (U8 *)SvEND(dst);
-            dlen = SvLEN(dst) - ddone - 1;
+        dlen = SvLEN(dst) - ddone - 1;
         s    = (U8*)SvPVX(src) + sdone;
         slen = tlen - sdone;
         break;
 
-    default:
-        Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
-               code, (dir == enc->f_utf8) ? "to" : "from",
-               enc->name[0]);
-        return &PL_sv_undef;
-    }
-    }
+        default:
+            Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
+                   code, (dir == enc->f_utf8) ? "to" : "from",
+                   enc->name[0]);
+            return &PL_sv_undef;
+        }
+    }   /* End of looping through the string */
  ENCODE_SET_SRC:
     if (check && !(check & ENCODE_LEAVE_SRC)){
     sdone = SvCUR(src) - (slen+sdone);
@@ -354,7 +362,7 @@ encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, U8 *
     }
 #endif
 
-    if (offset) 
+    if (offset)
       *offset += sdone + slen;
 
  ENCODE_END:
@@ -378,6 +386,13 @@ strict_utf8(pTHX_ SV* sv)
     return SvTRUE(*svp);
 }
 
+/* Modern perls have the capability to do this more efficiently and portably */
+#ifdef is_utf8_string_loc_flags
+# define CAN_USE_BASE_PERL
+#endif
+
+#ifndef CAN_USE_BASE_PERL
+
 /*
  * https://github.com/dankogai/p5-encode/pull/56#issuecomment-231959126
  */
@@ -433,10 +448,27 @@ convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen)
     return uv;
 }
 
+#endif  /* CAN_USE_BASE_PERL */
+
 static U8*
 process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
              bool encode, bool strict, bool stop_at_partial)
 {
+    /* Copies the purportedly UTF-8 encoded string starting at 's' and ending
+     * at 'e' - 1 to 'dst', checking as it goes along that the string actually
+     * is valid UTF-8.  There are two levels of strictness checking.  If
+     * 'strict' is FALSE, the string is checked for being well-formed UTF-8, as
+     * extended by Perl.  Additionally, if 'strict' is TRUE, above-Unicode code
+     * points, surrogates, and non-character code points are checked for.  When
+     * invalid input is encountered, some action is taken, exactly what depends
+     * on the flags in 'check_sv'.  'encode' gives if this is from an encode
+     * operation (if TRUE), or a decode one.  This function returns the
+     * position in 's' of the start of the next character beyond where it got
+     * to.  If there were no problems, that will be 'e'.  If 'stop_at_partial'
+     * is TRUE, if the final character before 'e' is incomplete, but valid as
+     * far as is available, no action will be taken on that partial character,
+     * and the return value will point to its first byte */
+
     UV uv;
     STRLEN ulen;
     SV *fallback_cb;
@@ -445,6 +477,9 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
     STRLEN dlen;
     char esc[UTF8_MAXLEN * 6 + 1];
     STRLEN i;
+    const U32 flags = (strict)
+                    ? UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+                    : UTF8_ALLOW_NON_STRICT;
 
     if (SvROK(check_sv)) {
 	/* croak("UTF-8 decoder doesn't support callback CHECK"); */
@@ -462,7 +497,44 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
     dlen = (s && e && s < e) ? e-s+1 : 1;
     d = (U8 *) SvGROW(dst, dlen);
 
+    stop_at_partial = stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL);
+
     while (s < e) {
+
+#ifdef CAN_USE_BASE_PERL    /* Use the much faster, portable implementation if
+                               available */
+
+        /* If there were no errors, this will be 'e'; otherwise it will point
+         * to the first byte of the erroneous input */
+        const U8* e_or_where_failed;
+        bool valid = is_utf8_string_loc_flags(s, e - s, &e_or_where_failed, flags);
+        STRLEN len = e_or_where_failed - s;
+
+        /* Copy as far as was successful */
+        Move(s, d, len, U8);
+        d += len;
+        s = (U8 *) e_or_where_failed;
+
+        /* Are done if it was valid, or we are accepting partial characters and
+         * the only error is that the final bytes form a partial character */
+        if (    LIKELY(valid)
+            || (   stop_at_partial
+                && is_utf8_valid_partial_char_flags(s, e, flags)))
+        {
+            break;
+        }
+
+        /* Here, was not valid.  If is 'strict', and is legal extended UTF-8,
+         * we know it is a code point whose value we can calculate, just not
+         * one accepted under strict.  Otherwise, it is malformed in some way.
+         * In either case, the system function can calculate either the code
+         * point, or the best substitution for it */
+        uv = utf8n_to_uvchr(s, e - s, &ulen, UTF8_ALLOW_ANY);
+
+#else   /* Use code for earlier perls */
+
+        PERL_UNUSED_VAR(flags);
+
         if (UTF8_IS_INVARIANT(*s)) {
             *d++ = *s++;
             continue;
@@ -483,7 +555,7 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
                 else
                     ulen = 1;
 
-                if ((stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) && ulen == (STRLEN)(e-s))
+                if (stop_at_partial && ulen == (STRLEN)(e-s))
                     break;
 
                 goto malformed_byte;
@@ -512,6 +584,16 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
             ulen = 1;
 
     malformed:
+
+#endif  /* The two versions for processing come back together here, for the
+         * error handling code.
+         *
+         * Here, we are looping through the input and found an error.
+         * 'uv' is the code point in error if calculable, or the REPLACEMENT
+         *      CHARACTER if not.
+         * 'ulen' is how many bytes of input this iteration of the loop
+         *        consumes */
+
         if (!encode && (check & (ENCODE_DIE_ON_ERR|ENCODE_WARN_ON_ERR|ENCODE_PERLQQ)))
             for (i=0; i<ulen; ++i) sprintf(esc+4*i, "\\x%02X", s[i]);
         if (check & ENCODE_DIE_ON_ERR){
@@ -617,7 +699,7 @@ PPCODE:
         utf8_safe_downgrade(aTHX_ &src, &s, &slen, modify);
     e = s+slen;
 
-    /* 
+    /*
      * PerlIO check -- we assume the object is of PerlIO if renewed
      */
     ENTER; SAVETMPS;
@@ -627,7 +709,7 @@ PPCODE:
     if (call_method("renewed",G_SCALAR) == 1) {
     SPAGAIN;
     renewed = (bool)POPi;
-    PUTBACK; 
+    PUTBACK;
 #if 0
     fprintf(stderr, "renewed == %d\n", renewed);
 #endif
diff --git a/cpan/Encode/Encode/encode.h b/cpan/Encode/Encode/encode.h
index 5fbcf76ad3..df5554f1cb 100644
--- a/cpan/Encode/Encode/encode.h
+++ b/cpan/Encode/Encode/encode.h
@@ -88,7 +88,12 @@ extern void Encode_DefineEncoding(encode_t *enc);
 #define ENCODE_FALLBACK 4
 #define ENCODE_FOUND_TERM 5
 
-#define FBCHAR_UTF8		"\xEF\xBF\xBD"
+/* Use the perl core value if available; it is portable to EBCDIC */
+#ifdef REPLACEMENT_CHARACTER_UTF8
+#  define FBCHAR_UTF8		REPLACEMENT_CHARACTER_UTF8
+#else
+#  define FBCHAR_UTF8           "\xEF\xBF\xBD"
+#endif
 
 #define  ENCODE_DIE_ON_ERR     0x0001 /* croaks immediately */
 #define  ENCODE_WARN_ON_ERR    0x0002 /* warn on error; may proceed */
diff --git a/cpan/Encode/Unicode/Unicode.pm b/cpan/Encode/Unicode/Unicode.pm
index c56745d7b1..2a8b477784 100644
--- a/cpan/Encode/Unicode/Unicode.pm
+++ b/cpan/Encode/Unicode/Unicode.pm
@@ -3,7 +3,7 @@ package Encode::Unicode;
 use strict;
 use warnings;
 
-our $VERSION = do { my @r = ( q$Revision: 2.16 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
+our $VERSION = do { my @r = ( q$Revision: 2.17 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
 
 use XSLoader;
 XSLoader::load( __PACKAGE__, $VERSION );
diff --git a/cpan/Encode/Unicode/Unicode.xs b/cpan/Encode/Unicode/Unicode.xs
index b3b1d2fea8..b459786d16 100644
--- a/cpan/Encode/Unicode/Unicode.xs
+++ b/cpan/Encode/Unicode/Unicode.xs
@@ -1,5 +1,5 @@
 /*
- $Id: Unicode.xs,v 2.16 2017/06/10 17:23:50 dankogai Exp $
+ $Id: Unicode.xs,v 2.17 2018/02/08 00:26:15 dankogai Exp $
  */
 
 #define PERL_NO_GET_CONTEXT
@@ -315,7 +315,7 @@ CODE:
 	    resultbuflen = SvLEN(result);
 	}
 
-	d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord,
+	d = uvchr_to_utf8_flags(resultbuf+SvCUR(result), ord,
                                             UNICODE_WARN_ILLEGAL_INTERCHANGE);
 	SvCUR_set(result, d - (U8 *)SvPVX(result));
     }
@@ -407,7 +407,7 @@ CODE:
     }
     while (s < e && s+UTF8SKIP(s) <= e) {
 	STRLEN len;
-	UV ord = utf8n_to_uvuni(s, e-s, &len, (UTF8_DISALLOW_SURROGATE
+	UV ord = utf8n_to_uvchr(s, e-s, &len, (UTF8_DISALLOW_SURROGATE
                                                |UTF8_WARN_SURROGATE
                                                |UTF8_DISALLOW_FE_FF
                                                |UTF8_WARN_FE_FF
diff --git a/cpan/Encode/encengine.c b/cpan/Encode/encengine.c
index bddf556b35..67613a89e3 100644
--- a/cpan/Encode/encengine.c
+++ b/cpan/Encode/encengine.c
@@ -102,56 +102,56 @@ do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
     U8 *dend = d + dlen, *dlast = d;
     int code = 0;
     while (s < send) {
-    const encpage_t *e = enc;
-    U8 byte = *s;
-    while (byte > e->max)
-        e++;
-    if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) {
-        const U8 *cend = s + (e->slen & 0x7f);
-        if (cend <= send) {
-        STRLEN n;
-        if ((n = e->dlen)) {
-            const U8 *out = e->seq + n * (byte - e->min);
-            U8 *oend = d + n;
-            if (dst) {
-            if (oend <= dend) {
-                while (d < oend)
-                *d++ = *out++;
+        const encpage_t *e = enc;
+        U8 byte = *s;
+        while (byte > e->max)
+            e++;
+        if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) {
+            const U8 *cend = s + (e->slen & 0x7f);
+            if (cend <= send) {
+                STRLEN n;
+                if ((n = e->dlen)) {
+                    const U8 *out = e->seq + n * (byte - e->min);
+                    U8 *oend = d + n;
+                    if (dst) {
+                        if (oend <= dend) {
+                            while (d < oend)
+                                *d++ = *out++;
+                        }
+                        else {
+                            /* Out of space */
+                            code = ENCODE_NOSPACE;
+                            break;
+                        }
+                    }
+                    else
+                        d = oend;
+                }
+                enc = e->next;
+                s++;
+                if (s == cend) {
+                    if (approx && (e->slen & 0x80))
+                        code = ENCODE_FALLBACK;
+                    last = s;
+                    if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
+                        code = ENCODE_FOUND_TERM;
+                        break;
+                    }
+                    dlast = d;
+                }
             }
             else {
-                /* Out of space */
-                code = ENCODE_NOSPACE;
+                /* partial source character */
+                code = ENCODE_PARTIAL;
                 break;
             }
-            }
-            else
-            d = oend;
-        }
-        enc = e->next;
-        s++;
-        if (s == cend) {
-            if (approx && (e->slen & 0x80))
-            code = ENCODE_FALLBACK;
-            last = s;
-            if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
-              code = ENCODE_FOUND_TERM;
-              break;
-            }
-            dlast = d;
-        }
         }
         else {
-        /* partial source character */
-        code = ENCODE_PARTIAL;
-        break;
+            /* Cannot represent */
+            code = ENCODE_NOREP;
+            break;
         }
     }
-    else {
-        /* Cannot represent */
-        code = ENCODE_NOREP;
-        break;
-    }
-    }
     *slen = last - src;
     *dout = d - dst;
     return code;
diff --git a/cpan/Encode/encoding.pm b/cpan/Encode/encoding.pm
index 1e82070afd..c3f324d29f 100644
--- a/cpan/Encode/encoding.pm
+++ b/cpan/Encode/encoding.pm
@@ -1,6 +1,6 @@
-# $Id: encoding.pm,v 2.21 2017/10/06 22:21:53 dankogai Exp dankogai $
+# $Id: encoding.pm,v 2.22 2018/02/11 05:32:03 dankogai Exp $
 package encoding;
-our $VERSION = sprintf "%d.%02d", q$Revision: 2.21 $ =~ /(\d+)/g;
+our $VERSION = sprintf "%d.%02d", q$Revision: 2.22 $ =~ /(\d+)/g;
 
 use Encode;
 use strict;
author	Steve Hay <steve.m.hay@googlemail.com>	2018-02-12 20:37:36 +0000
committer	Steve Hay <steve.m.hay@googlemail.com>	2018-02-12 20:37:36 +0000
commit	c31ca2013f287840fcddf498ead9602666569966 (patch)
tree	4f2383e147c320c2e2cba3b369f1f8690c02849b /cpan/Encode
parent	c16e7f98327a78a23d0eba94da62bf70782165ae (diff)
download	perl-c31ca2013f287840fcddf498ead9602666569966.tar.gz