diff options
author | Karl Williamson <khw@cpan.org> | 2017-06-13 21:37:22 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2017-06-14 11:08:24 -0600 |
commit | 976c1b0821cf2c6b33779dfb8e251deaee29f7bd (patch) | |
tree | 9acf9412ad51295251008ef9c2cd89b055453ab1 /utf8.c | |
parent | 23b37b1241a38c682dd4cd59cd6355024048868d (diff) | |
download | perl-976c1b0821cf2c6b33779dfb8e251deaee29f7bd.tar.gz |
Add new function utf8_from_bytes_loc()
This is currently undocumented externally, so we can change the API if
needed.
This is like utf8_from_bytes(), but in the case of not being able to
convert the whole string, it converts the initial substring that is
convertible, and tells you where it had to stop.
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 129 |
1 files changed, 96 insertions, 33 deletions
@@ -1994,58 +1994,121 @@ having saved the value of C<*lenp> before the call, and subtracting the after-call value of C<*lenp> from it. =cut -*/ + +There is a macro that avoids this function call, but this is retained for +anyone who calls it with the Perl_ prefix */ U8 * Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p) { - U8 *d; - const U8 *start = s; - const U8 *send; - Size_t count = 0; - PERL_ARGS_ASSERT_BYTES_FROM_UTF8; PERL_UNUSED_CONTEXT; - if (!*is_utf8p) - return (U8 *)start; - - /* ensure valid UTF-8 and chars < 256 before converting string */ - for (send = s + *lenp; s < send;) { - if (! UTF8_IS_INVARIANT(*s)) { - if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) { - return (U8 *)start; - } - count++; - s++; - } - s++; - } - *is_utf8p = FALSE; + return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL); +} + +/* +No = here because currently externally undocumented +for apidoc bytes_from_utf8_loc + +Like C<L</bytes_from_utf8>()>, but takes an extra parameter, a pointer to where +to store the location of the first character in C<"s"> that cannot be +converted to non-UTF8. + +If that parameter is C<NULL>, this function behaves identically to +C<bytes_from_utf8>. + +Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to +C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>. + +Otherwise, the function returns a newly created C<NUL>-terminated string +containing the non-UTF8 equivalent of the convertible first portion of +C<"s">. C<*lenp> is set to its length, not including the terminating C<NUL>. +If the entire input string was converted, C<*is_utf8p> is set to a FALSE value, +and C<*first_non_downgradable> is set to C<NULL>. + +Otherwise, C<*first_non_downgradable> set to point to the first byte of the +first character in the original string that wasn't converted. C<*is_utf8p> is +unchanged. Note that the new string may have length 0. + +Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and +C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and +converts as many characters in it as possible stopping at the first one it +finds one that can't be converted to non-UTF-8. C<*first_non_downgradable> is +set to point to that. The function returns the portion that could be converted +in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length, +not including the terminating C<NUL>. If the very first character in the +original could not be converted, C<*lenp> will be 0, and the new string will +contain just a single C<NUL>. If the entire input string was converted, +C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>. + +Upon successful return, the number of variants in the converted portion of the +string can be computed by having saved the value of C<*lenp> before the call, +and subtracting the after-call value of C<*lenp> from it. + +=cut + + +*/ + +U8 * +Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted) +{ + U8 *d; + const U8 *original = s; + U8 *converted_start; + const U8 *send = s + *lenp; + + PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC; + + if (! *is_utf8p) { + if (first_unconverted) { + *first_unconverted = NULL; + } + + return (U8 *) original; + } - Newx(d, (*lenp) - count + 1, U8); + Newx(d, (*lenp) + 1, U8); - if (LIKELY(count)) { - s = start; start = d; + converted_start = d; while (s < send) { U8 c = *s++; if (! UTF8_IS_INVARIANT(c)) { - /* Then it is two-byte encoded */ + + /* Then it is multi-byte encoded. If the code point is above 0xFF, + * have to stop now */ + if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) { + if (first_unconverted) { + *first_unconverted = s - 1; + goto finish_and_return; + } + else { + Safefree(converted_start); + return (U8 *) original; + } + } + c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s); s++; } *d++ = c; } - *d = '\0'; - *lenp = d - start; - return (U8 *)start; - } - else { - Copy(start, d, *lenp, U8); - *(d + *lenp) = '\0'; - return (U8 *)d; + /* Here, converted the whole of the input */ + *is_utf8p = FALSE; + if (first_unconverted) { + *first_unconverted = NULL; } + + finish_and_return: + *d = '\0'; + *lenp = d - converted_start; + + /* Trim unused space */ + Renew(converted_start, *lenp + 1, U8); + + return converted_start; } /* |