summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--embed.fnc2
-rw-r--r--embed.h2
-rw-r--r--inline.h30
-rw-r--r--proto.h7
-rw-r--r--t/lib/warnings/utf81
-rw-r--r--utf8.c112
-rw-r--r--utf8.h25
7 files changed, 136 insertions, 43 deletions
diff --git a/embed.fnc b/embed.fnc
index e2ac1272a0..08b248307b 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -678,7 +678,7 @@ ApR |I32 |is_lvalue_sub
: Used in cop.h
XopR |I32 |was_lvalue_sub
#ifndef PERL_NO_INLINE_FUNCTIONS
-AiMRn |STRLEN |_is_utf8_char_slow|NN const U8 *s|NN const U8 *e
+ApMRnP |STRLEN |_is_utf8_char_slow|NN const U8 * const s|const STRLEN len
#endif
ADMpPR |U32 |to_uni_upper_lc|U32 c
ADMpPR |U32 |to_uni_title_lc|U32 c
diff --git a/embed.h b/embed.h
index 9213a5df88..c9683ee32c 100644
--- a/embed.h
+++ b/embed.h
@@ -786,7 +786,7 @@
#define my_popen(a,b) Perl_my_popen(aTHX_ a,b)
#endif
#if !defined(PERL_NO_INLINE_FUNCTIONS)
-#define _is_utf8_char_slow S__is_utf8_char_slow
+#define _is_utf8_char_slow Perl__is_utf8_char_slow
#define append_utf8_from_native_byte S_append_utf8_from_native_byte
#define av_top_index(a) S_av_top_index(aTHX_ a)
#define cx_popblock(a) S_cx_popblock(aTHX_ a)
diff --git a/inline.h b/inline.h
index f709572ff6..0dcc733851 100644
--- a/inline.h
+++ b/inline.h
@@ -277,36 +277,6 @@ S_append_utf8_from_native_byte(const U8 byte, U8** dest)
}
/*
-
-A helper function for the macro isUTF8_CHAR(), which should be used instead of
-this function. The macro will handle smaller code points directly saving time,
-using this function as a fall-back for higher code points.
-
-Tests if the first bytes of string C<s> form a valid UTF-8 character. 0 is
-returned if the bytes starting at C<s> up to but not including C<e> do not form a
-complete well-formed UTF-8 character; otherwise the number of bytes in the
-character is returned.
-
-Note that an INVARIANT (i.e. ASCII on non-EBCDIC) character is a valid UTF-8
-character.
-
-=cut */
-PERL_STATIC_INLINE STRLEN
-S__is_utf8_char_slow(const U8 *s, const U8 *e)
-{
- dTHX; /* The function called below requires thread context */
-
- STRLEN actual_len;
-
- PERL_ARGS_ASSERT__IS_UTF8_CHAR_SLOW;
-
- assert(e >= s);
- utf8n_to_uvchr(s, e - s, &actual_len, UTF8_CHECK_ONLY);
-
- return (actual_len == (STRLEN) -1) ? 0 : actual_len;
-}
-
-/*
=for apidoc valid_utf8_to_uvchr
Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
the next character in the input UTF-8 string C<s> is well-formed (I<e.g.>,
diff --git a/proto.h b/proto.h
index a0fd435e52..498589648e 100644
--- a/proto.h
+++ b/proto.h
@@ -3800,10 +3800,11 @@ STATIC SV * S_incpush_if_exists(pTHX_ AV *const av, SV *dir, SV *const stem);
# endif
#endif
#if !defined(PERL_NO_INLINE_FUNCTIONS)
-PERL_STATIC_INLINE STRLEN S__is_utf8_char_slow(const U8 *s, const U8 *e)
- __attribute__warn_unused_result__;
+PERL_CALLCONV STRLEN Perl__is_utf8_char_slow(const U8 * const s, const STRLEN len)
+ __attribute__warn_unused_result__
+ __attribute__pure__;
#define PERL_ARGS_ASSERT__IS_UTF8_CHAR_SLOW \
- assert(s); assert(e)
+ assert(s)
PERL_STATIC_INLINE void S_append_utf8_from_native_byte(const U8 byte, U8** dest);
#define PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE \
diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8
index 4263c04958..947dea467f 100644
--- a/t/lib/warnings/utf8
+++ b/t/lib/warnings/utf8
@@ -756,7 +756,6 @@ Operation "uc" returns its argument for non-Unicode code point 0x7F+ at - line \
Use of code point 0x80+ is deprecated; the permissible max is 0x7F+ at - line \d+.
Operation "uc" returns its argument for non-Unicode code point 0x80+ at - line \d+.
Code point 0x7F+ is not Unicode, may not be portable in print at - line \d+.
-Use of code point 0x80+ is deprecated; the permissible max is 0x7F+ in print at - line \d+.
########
# NAME [perl #127262]
BEGIN{
diff --git a/utf8.c b/utf8.c
index bd9b0c3b1c..f24402de4e 100644
--- a/utf8.c
+++ b/utf8.c
@@ -337,6 +337,118 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
/*
+A helper function for the macro isUTF8_CHAR(), which should be used instead of
+this function. The macro will handle smaller code points directly saving time,
+using this function as a fall-back for higher code points. This function
+assumes that it is not called with an invariant character, and that
+'s + len - 1' is within bounds of the string 's'.
+
+Tests if the string C<s> of at least length 'len' is a valid variant UTF-8
+character. 0 is returned if not, otherwise, 'len' is returned.
+
+*/
+
+STRLEN
+Perl__is_utf8_char_slow(const U8 * const s, const STRLEN len)
+{
+ const U8 *e;
+ const U8 *x, *y;
+
+ PERL_ARGS_ASSERT__IS_UTF8_CHAR_SLOW;
+
+ if (UNLIKELY(! UTF8_IS_START(*s))) {
+ return 0;
+ }
+
+ e = s + len;
+
+ for (x = s + 1; x < e; x++) {
+ if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
+ return 0;
+ }
+ }
+
+#ifndef EBCDIC
+
+ /* Here is syntactically valid. Make sure this isn't the start of an
+ * overlong. These values were found by manually inspecting the UTF-8
+ * patterns. See the tables in utf8.h and utfebcdic.h */
+
+ /* This is not needed on modern perls where C0 and C1 are not considered
+ * start bytes. */
+#if 0
+ if (UNLIKELY(*s < 0xC2)) {
+ return 0;
+ }
+#endif
+
+ if (len > 1) {
+ if ( (*s == 0xE0 && UNLIKELY(s[1] < 0xA0))
+ || (*s == 0xF0 && UNLIKELY(s[1] < 0x90))
+ || (*s == 0xF8 && UNLIKELY(s[1] < 0x88))
+ || (*s == 0xFC && UNLIKELY(s[1] < 0x84))
+ || (*s == 0xFE && UNLIKELY(s[1] < 0x82)))
+ {
+ return 0;
+ }
+ if ((len > 6 && UNLIKELY(*s == 0xFF) && UNLIKELY(s[6] < 0x81))) {
+ return 0;
+ }
+ }
+
+#else /* For EBCDIC, we use I8, which is the same on all code pages */
+ {
+ const U8 s0 = NATIVE_UTF8_TO_I8(*s);
+
+ /* On modern perls C0-C4 aren't considered start bytes */
+ if ( /* s0 < 0xC5 || */ s0 == 0xE0) {
+ return 0;
+ }
+
+ if (len >= 1) {
+ const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
+
+ if ( (s0 == 0xF0 && UNLIKELY(s1 < 0xB0))
+ || (s0 == 0xF8 && UNLIKELY(s1 < 0xA8))
+ || (s0 == 0xFC && UNLIKELY(s1 < 0xA4))
+ || (s0 == 0xFE && UNLIKELY(s1 < 0x82)))
+ {
+ return 0;
+ }
+ if ((len > 7 && UNLIKELY(s0 == 0xFF) && UNLIKELY(s[7] < 0xA1))) {
+ return 0;
+ }
+ }
+ }
+
+#endif
+
+ /* Now see if this would overflow a UV on this platform. See if the UTF8
+ * for this code point is larger than that for the highest representable
+ * code point */
+ y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
+
+ for (x = s; x < e; x++, y++) {
+
+ /* If the same at this byte, go on to the next */
+ if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
+ continue;
+ }
+
+ /* If this is larger, it overflows */
+ if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
+ return 0;
+ }
+
+ /* But if smaller, it won't */
+ break;
+ }
+
+ return len;
+}
+
+/*
+
=for apidoc utf8n_to_uvchr
THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
diff --git a/utf8.h b/utf8.h
index 29b052cd64..b940eaf3ee 100644
--- a/utf8.h
+++ b/utf8.h
@@ -826,13 +826,24 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
=for apidoc Am|STRLEN|isUTF8_CHAR|const U8 *s|const U8 *e
-Returns the number of bytes beginning at C<s> which form a legal UTF-8 (or
-UTF-EBCDIC) encoded character, looking no further than S<C<e - s>> bytes into
-C<s>. Returns 0 if the sequence starting at C<s> through S<C<e - 1>> is not
-well-formed UTF-8.
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
+code point; otherwise it evaluates to 0. If non-zero, the value gives how many
+many bytes starting at C<s> comprise the code point's representation.
-Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
-machines) is a valid UTF-8 character.
+The code point can be any that will fit in a UV on this machine, using Perl's
+extension to official UTF-8 to represent those higher than the Unicode maximum
+of 0x10FFFF. That means that this macro is used to efficiently decide if the
+next few bytes in C<s> is legal UTF-8 for a single character. Use
+L</is_utf8_string>(), L</is_utf8_string_loclen>(), and
+L</is_utf8_string_loc>() to check entire strings.
+
+Note that it is deprecated to use code points higher than what will fit in an
+IV. This macro does not raise any warnings for such code points, treating them
+as valid.
+
+Note also that a UTF-8 INVARIANT character (i.e. ASCII on non-EBCDIC machines)
+is a valid UTF-8 character.
=cut
*/
@@ -845,7 +856,7 @@ machines) is a valid UTF-8 character.
? 0 \
: LIKELY(IS_UTF8_CHAR_FAST(UTF8SKIP(s))) \
? is_UTF8_CHAR_utf8_no_length_checks(s) \
- : _is_utf8_char_slow(s, e))
+ : _is_utf8_char_slow(s, UTF8SKIP(s)))
#define is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)