Add new function utf8_from_bytes_loc()

This is currently undocumented externally, so we can change the API if needed. This is like utf8_from_bytes(), but in the case of not being able to convert the whole string, it converts the initial substring that is convertible, and tells you where it had to stop.
author: Karl Williamson <khw@cpan.org> 2017-06-13 21:37:22 -0600
committer: Karl Williamson <khw@cpan.org> 2017-06-14 11:08:24 -0600
commit: 976c1b0821cf2c6b33779dfb8e251deaee29f7bd (patch)
tree: 9acf9412ad51295251008ef9c2cd89b055453ab1 /utf8.c
parent: 23b37b1241a38c682dd4cd59cd6355024048868d (diff)
download: perl-976c1b0821cf2c6b33779dfb8e251deaee29f7bd.tar.gz
1 files changed, 96 insertions, 33 deletions
diff --git a/utf8.c b/utf8.c
index d9ceef0a37..ebc4ad7fe4 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1994,58 +1994,121 @@ having saved the value of C<*lenp> before the call, and subtracting the
 after-call value of C<*lenp> from it.
 
 =cut
-*/
+
+There is a macro that avoids this function call, but this is retained for
+anyone who calls it with the Perl_ prefix */
 
 U8 *
 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
 {
-    U8 *d;
-    const U8 *start = s;
-    const U8 *send;
-    Size_t count = 0;
-
     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
     PERL_UNUSED_CONTEXT;
-    if (!*is_utf8p)
-        return (U8 *)start;
-
-    /* ensure valid UTF-8 and chars < 256 before converting string */
-    for (send = s + *lenp; s < send;) {
-        if (! UTF8_IS_INVARIANT(*s)) {
-            if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
-                return (U8 *)start;
-            }
-            count++;
-            s++;
-	}
-        s++;
-    }
 
-    *is_utf8p = FALSE;
+    return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
+}
+
+/*
+No = here because currently externally undocumented
+for apidoc bytes_from_utf8_loc
+
+Like C<L</bytes_from_utf8>()>, but takes an extra parameter, a pointer to where
+to store the location of the first character in C<"s"> that cannot be
+converted to non-UTF8.
+
+If that parameter is C<NULL>, this function behaves identically to
+C<bytes_from_utf8>.
+
+Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
+C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>.
+
+Otherwise, the function returns a newly created C<NUL>-terminated string
+containing the non-UTF8 equivalent of the convertible first portion of
+C<"s">.  C<*lenp> is set to its length, not including the terminating C<NUL>.
+If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
+and C<*first_non_downgradable> is set to C<NULL>.
+
+Otherwise, C<*first_non_downgradable> set to point to the first byte of the
+first character in the original string that wasn't converted.  C<*is_utf8p> is
+unchanged.  Note that the new string may have length 0.
+
+Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and
+C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
+converts as many characters in it as possible stopping at the first one it
+finds one that can't be converted to non-UTF-8.  C<*first_non_downgradable> is
+set to point to that.  The function returns the portion that could be converted
+in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
+not including the terminating C<NUL>.  If the very first character in the
+original could not be converted, C<*lenp> will be 0, and the new string will
+contain just a single C<NUL>.  If the entire input string was converted,
+C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>.
+
+Upon successful return, the number of variants in the converted portion of the
+string can be computed by having saved the value of C<*lenp> before the call,
+and subtracting the after-call value of C<*lenp> from it.
+
+=cut
+
+
+*/
+
+U8 *
+Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted)
+{
+    U8 *d;
+    const U8 *original = s;
+    U8 *converted_start;
+    const U8 *send = s + *lenp;
+
+    PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
+
+    if (! *is_utf8p) {
+        if (first_unconverted) {
+            *first_unconverted = NULL;
+        }
+
+        return (U8 *) original;
+    }
 
-    Newx(d, (*lenp) - count + 1, U8);
+    Newx(d, (*lenp) + 1, U8);
 
-    if (LIKELY(count)) {
-        s = start; start = d;
+    converted_start = d;
         while (s < send) {
             U8 c = *s++;
             if (! UTF8_IS_INVARIANT(c)) {
-                /* Then it is two-byte encoded */
+
+            /* Then it is multi-byte encoded.  If the code point is above 0xFF,
+             * have to stop now */
+            if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
+                if (first_unconverted) {
+                    *first_unconverted = s - 1;
+                    goto finish_and_return;
+                }
+                else {
+                    Safefree(converted_start);
+                    return (U8 *) original;
+                }
+            }
+
                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
                 s++;
             }
             *d++ = c;
         }
-        *d = '\0';
-        *lenp = d - start;
 
-        return (U8 *)start;
-    }
-    else {
-        Copy(start, d, *lenp, U8);
-        *(d + *lenp) = '\0';
-        return (U8 *)d;
+    /* Here, converted the whole of the input */
+    *is_utf8p = FALSE;
+    if (first_unconverted) {
+        *first_unconverted = NULL;
     }
+
+  finish_and_return:
+        *d = '\0';
+        *lenp = d - converted_start;
+
+    /* Trim unused space */
+    Renew(converted_start, *lenp + 1, U8);
+
+    return converted_start;
 }
 
 /*
author	Karl Williamson <khw@cpan.org>	2017-06-13 21:37:22 -0600
committer	Karl Williamson <khw@cpan.org>	2017-06-14 11:08:24 -0600
commit	976c1b0821cf2c6b33779dfb8e251deaee29f7bd (patch)
tree	9acf9412ad51295251008ef9c2cd89b055453ab1 /utf8.c
parent	23b37b1241a38c682dd4cd59cd6355024048868d (diff)
download	perl-976c1b0821cf2c6b33779dfb8e251deaee29f7bd.tar.gz