summaryrefslogtreecommitdiff
path: root/inline.h
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2017-11-18 14:05:07 -0700
committerKarl Williamson <khw@cpan.org>2017-11-26 10:19:46 -0700
commitc9cd936b8f828f1c71873b459ceaec2300fec6c7 (patch)
treebb2864c47121b20cf204b5cae5cd580f78e75169 /inline.h
parentdd237e826e98a1f24e4b18cc6176152da1f1fd58 (diff)
downloadperl-c9cd936b8f828f1c71873b459ceaec2300fec6c7.tar.gz
Add is_utf8_non_invariant_string()
This function tells whether or not its argument is a sequence of bytes that is legal Perl-extended-UTF-8, and which either requires UTF-8 (because it contains wide characters) or would have a different representation when not under UTF-8. This paradigm is used in several places in the perl core to decide whether to turn on an SV's utf8 flag. None of those places realized that there was a simple way to avoid rescanning the string (though perhaps a good C optimizer would). This commit creates a function that does this task without the rescan; the next commits will convert to use this function.
Diffstat (limited to 'inline.h')
-rw-r--r--inline.h45
1 files changed, 45 insertions, 0 deletions
diff --git a/inline.h b/inline.h
index aa8798ed59..309d74f435 100644
--- a/inline.h
+++ b/inline.h
@@ -486,6 +486,51 @@ C<L</is_utf8_fixed_width_buf_loclen_flags>>,
#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
+#if defined(PERL_CORE) || defined (PERL_EXT)
+
+/*
+=for apidoc is_utf8_non_invariant_string
+
+Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
+C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
+UTF-8; otherwise returns FALSE.
+
+A TRUE return means that at least one code point represented by the sequence
+either is a wide character not representable as a single byte, or the
+representation differs depending on whether the sequence is encoded in UTF-8 or
+not.
+
+See also
+C<L<perlapi/is_utf8_invariant_string>>,
+C<L<perlapi/is_utf8_string>>
+
+=cut
+
+This is commonly used to determine if a SV's UTF-8 flag should be turned on.
+It needn't be if its string is entirely UTF-8 invariant, and it shouldn't be if
+it otherwise contains invalid UTF-8.
+
+It is an internal function because khw thinks that XS code shouldn't be working
+at this low a level. A valid use case could change that.
+
+*/
+
+PERL_STATIC_INLINE bool
+S_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
+{
+ const U8 * first_variant; /* out-parameter of the _loc call below; presumably the first variant byte */
+ /* s: start of the byte sequence to examine; len: its length in bytes */
+ PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
+ /* If the invariance check succeeds for the whole string, it cannot qualify: FALSE */
+ if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+ return FALSE;
+ }
+ /* Validate only from first_variant to the end, so the leading bytes are not rescanned */
+ return is_utf8_string(first_variant, len - (first_variant - s));
+}
+
+#endif
+
/*
=for apidoc is_strict_utf8_string