Create isUTF8_CHAR() macro and use it

This macro will inline the code to determine if a character is well-formed UTF-8 for code points below a certain value, falling back to a slower function for larger ones. On ASCII platforms, it will inline for well-beyond all legal Unicode code points. On EBCDIC, it currently does it for code points up to 0x3FFF. This could be increased, but our porting tests do the regen every time to make sure everything is ok, and making it larger slows that down. This is worked around on ASCII by normally commenting out the code that generates this info, but including in utf8.h a version that did get generated. This is static information and won't change. (This could be done for EBCDIC too, but I chose not to at this time as each code page has a different macro generated, and it gets ugly getting all of them in utf8.h) Using this macro allowed for simplification of several functions in utf8.c
author: Karl Williamson <khw@cpan.org> 2014-05-05 20:43:47 -0600
committer: Karl Williamson <khw@cpan.org> 2014-05-31 11:42:40 -0600
commit: 6302f837102d66f532a1c151f7299abbef3a15dd (patch)
tree: 242fe154607368b270e65e29f81f8aed4214259c
parent: d9f92374c5f4b19ed46c29c6710922b80429de59 (diff)
download: perl-6302f837102d66f532a1c151f7299abbef3a15dd.tar.gz
7 files changed, 73 insertions, 103 deletions
diff --git a/embed.fnc b/embed.fnc
index 9d6915d429..35756e401b 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -633,7 +633,7 @@ pR	|OP*	|invert		|NULLOK OP* cmd
 ApR	|I32	|is_lvalue_sub
 : Used in cop.h
 XopR	|I32	|was_lvalue_sub
-iRn	|STRLEN	|_is_utf8_char_slow|NN const U8 *s|const STRLEN len
+iRn	|STRLEN	|_is_utf8_char_slow|NN const U8 *s|NN const U8 *e
 ADMpPR	|U32	|to_uni_upper_lc|U32 c
 ADMpPR	|U32	|to_uni_title_lc|U32 c
 ADMpPR	|U32	|to_uni_lower_lc|U32 c
diff --git a/inline.h b/inline.h
index 34d9b3b866..fff7499f01 100644
--- a/inline.h
+++ b/inline.h
@@ -239,24 +239,19 @@ S_isALNUM_lazy(pTHX_ const char* p)
 }
 
 /*
-Tests if the first C<len> bytes of string C<s> form a valid UTF-8
-character.  Note that an INVARIANT (i.e. ASCII on non-EBCDIC) character is a
-valid UTF-8 character.  The number of bytes in the UTF-8 character
-will be returned if it is valid, otherwise 0.
-
-This is the "slow" version as opposed to the "fast" version which is
-the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
-difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
-or less you should use the IS_UTF8_CHAR(), for lengths of five or more
-you should use the _slow().  In practice this means that the _slow()
-will be used very rarely, since the maximum Unicode code point (as of
-Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
-the "Perl extended UTF-8" (e.g, the infamous 'v-strings') will encode into
-five bytes or more.
+A helper function for the macro isUTF8_CHAR(), which should be used instead of
+this function.  The macro will handle smaller code points directly saving time,
+using this function as a fall-back for higher code points.
+Tests if the first bytes of string C<s> form a valid UTF-8 character.  0 is
+returned if the bytes starting at C<s> up to but not including C<e> do not form a
+complete well-formed UTF-8 character; otherwise the number of bytes in the
+character is returned.
+Note that an INVARIANT (i.e. ASCII on non-EBCDIC) character is a valid UTF-8
+character.
 
 =cut */
 PERL_STATIC_INLINE STRLEN
-S__is_utf8_char_slow(const U8 *s, const STRLEN len)
+S__is_utf8_char_slow(const U8 *s, const U8 *e)
 {
     dTHX;   /* The function called below requires thread context */
 
@@ -264,7 +259,8 @@ S__is_utf8_char_slow(const U8 *s, const STRLEN len)
 
     PERL_ARGS_ASSERT__IS_UTF8_CHAR_SLOW;
 
-    utf8n_to_uvchr(s, len, &actual_len, UTF8_CHECK_ONLY);
+    assert(e >= s);
+    utf8n_to_uvchr(s, e - s, &actual_len, UTF8_CHECK_ONLY);
 
     return (actual_len == (STRLEN) -1) ? 0 : actual_len;
 }
diff --git a/pod/perlguts.pod b/pod/perlguts.pod
index 5fe7e566dc..74a7df95ec 100644
--- a/pod/perlguts.pod
+++ b/pod/perlguts.pod
@@ -2706,7 +2706,7 @@ In general, you either have to know what you're dealing with, or you
 have to guess.  The API function C<is_utf8_string> can help; it'll tell
 you if a string contains only valid UTF-8 characters.  However, it can't
 do the work for you.  On a character-by-character basis,
-C<is_utf8_char_buf>
+C<isUTF8_CHAR>
 will tell you whether the current character in a string is valid UTF-8. 
 
 =head2 How does UTF-8 represent Unicode characters?
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index 06cd938556..0482d92596 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -1701,7 +1701,7 @@ are valid UTF-8.
 
 =item *
 
-C<is_utf8_char_buf(buf, buf_end)> returns true if the pointer points to
+C<isUTF8_CHAR(buf, buf_end)> returns true if the pointer points to
 a valid UTF-8 character.
 
 =item *
diff --git a/proto.h b/proto.h
index 2d50a7518d..d4a8589a22 100644
--- a/proto.h
+++ b/proto.h
@@ -57,11 +57,12 @@ PERL_CALLCONV bool	Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
 #define PERL_ARGS_ASSERT__IS_UTF8_FOO	\
 	assert(p)
 
-PERL_STATIC_INLINE STRLEN	S__is_utf8_char_slow(const U8 *s, const STRLEN len)
+PERL_STATIC_INLINE STRLEN	S__is_utf8_char_slow(const U8 *s, const U8 *e)
 			__attribute__warn_unused_result__
-			__attribute__nonnull__(1);
+			__attribute__nonnull__(1)
+			__attribute__nonnull__(2);
 #define PERL_ARGS_ASSERT__IS_UTF8_CHAR_SLOW	\
-	assert(s)
+	assert(s); assert(e)
 
 PERL_CALLCONV bool	Perl__is_utf8_mark(pTHX_ const U8 *p)
 			__attribute__warn_unused_result__
diff --git a/utf8.c b/utf8.c
index d4cbbe729a..8d62b5f831 100644
--- a/utf8.c
+++ b/utf8.c
@@ -310,13 +310,7 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 /*
 =for apidoc is_utf8_char_buf
 
-Returns the number of bytes that comprise the first UTF-8 encoded character in
-buffer C<buf>.  C<buf_end> should point to one position beyond the end of the
-buffer.  0 is returned if C<buf> does not point to a complete, valid UTF-8
-encoded character.
-
-Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
-machines) is a valid UTF-8 character.
+This is identical to the macro isUTF8_CHAR.
 
 =cut */
 
@@ -324,22 +318,9 @@ STRLEN
 Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
 {
 
-    STRLEN len;
-
     PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
 
-    if (buf_end <= buf) {
-	return 0;
-    }
-
-    len = buf_end - buf;
-    if (len > UTF8SKIP(buf)) {
-	len = UTF8SKIP(buf);
-    }
-
-    if (IS_UTF8_CHAR_FAST(len))
-        return IS_UTF8_CHAR(buf, len) ? len : 0;
-    return _is_utf8_char_slow(buf, len);
+    return isUTF8_CHAR(buf, buf_end);
 }
 
 /*
@@ -362,7 +343,7 @@ Perl_is_utf8_char(const U8 *s)
     PERL_ARGS_ASSERT_IS_UTF8_CHAR;
 
     /* Assumes we have enough space, which is why this is deprecated */
-    return is_utf8_char_buf(s, s + UTF8SKIP(s));
+    return isUTF8_CHAR(s, s + UTF8SKIP(s));
 }
 
 
@@ -389,28 +370,11 @@ Perl_is_utf8_string(const U8 *s, STRLEN len)
     PERL_ARGS_ASSERT_IS_UTF8_STRING;
 
     while (x < send) {
-	 /* Inline the easy bits of is_utf8_char() here for speed... */
-	 if (UTF8_IS_INVARIANT(*x)) {
-	    x++;
-	 }
-	 else {
-	      /* ... and call is_utf8_char() only if really needed. */
-	     const STRLEN c = UTF8SKIP(x);
-	     const U8* const next_char_ptr = x + c;
-
-	     if (next_char_ptr > send) {
-		 return FALSE;
-	     }
-
-	     if (IS_UTF8_CHAR_FAST(c)) {
-	         if (!IS_UTF8_CHAR(x, c))
-		     return FALSE;
-	     }
-	     else if (! _is_utf8_char_slow(x, c)) {
-		 return FALSE;
-	     }
-	     x = next_char_ptr;
-	 }
+        STRLEN len = isUTF8_CHAR(x, send);
+        if (UNLIKELY(! len)) {
+            return FALSE;
+        }
+        x += len;
     }
 
     return TRUE;
@@ -444,34 +408,17 @@ Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 {
     const U8* const send = s + (len ? len : strlen((const char *)s));
     const U8* x = s;
-    STRLEN c;
     STRLEN outlen = 0;
 
     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
 
     while (x < send) {
-	 const U8* next_char_ptr;
-
-	 /* Inline the easy bits of is_utf8_char() here for speed... */
-	 if (UTF8_IS_INVARIANT(*x))
-	     next_char_ptr = x + 1;
-	 else {
-	     /* ... and call is_utf8_char() only if really needed. */
-	     c = UTF8SKIP(x);
-	     next_char_ptr = c + x;
-	     if (next_char_ptr > send) {
-		 goto out;
-	     }
-	     if (IS_UTF8_CHAR_FAST(c)) {
-	         if (!IS_UTF8_CHAR(x, c))
-		     c = 0;
-	     } else
-	         c = _is_utf8_char_slow(x, c);
-	     if (!c)
-	         goto out;
-	 }
-         x = next_char_ptr;
-	 outlen++;
+        STRLEN len = isUTF8_CHAR(x, send);
+        if (UNLIKELY(! len)) {
+            goto out;
+        }
+        x += len;
+        outlen++;
     }
 
  out:
@@ -1811,7 +1758,7 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
      * as far as there being enough bytes available in it to accommodate the
      * character without reading beyond the end, and pass that number on to the
      * validating routine */
-    if (! is_utf8_char_buf(p, p + UTF8SKIP(p))) {
+    if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
         if (ckWARN_d(WARN_UTF8)) {
             Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
 		    "Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
diff --git a/utf8.h b/utf8.h
index a6b21277e0..a18faa2e35 100644
--- a/utf8.h
+++ b/utf8.h
@@ -577,12 +577,17 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 	 ((end) > (input) + 1) && \
 	 toFOLD((input)[0]) == 's' && \
 	 toFOLD((input)[1]) == 's')
+
 #define SHARP_S_SKIP 2
 
 /* If you want to exclude surrogates, and beyond legal Unicode, see the blame
  * log for earlier versions which gave details for these */
 
-/* regen/regcharclass.pl generates is_UTF8_CHAR_utf8() macros for up to these
+/* A helper macro for isUTF8_CHAR, so use that one, and not this one.  This is
+ * retained solely for backwards compatibility and may be deprecated and
+ * removed in a future Perl version.
+ *
+ * regen/regcharclass.pl generates is_UTF8_CHAR_utf8() macros for up to these
  * number of bytes.  So this has to be coordinated with that file */
 #ifdef EBCDIC
 #   define IS_UTF8_CHAR_FAST(n) ((n) <= 3)
@@ -591,11 +596,15 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 #endif
 
 #ifndef EBCDIC
-/* This was generated by regen/regcharclass.pl, and then moved here.  The lines
- * that generated it were then commented out.  This was done solely because it
- * takes on the order of 10 minutes to generate, and is never going to change.
- * The EBCDIC equivalent hasn't been commented out in regcharclass.pl, so it
- * should generate and run the correct stuff */
+/* A helper macro for isUTF8_CHAR, so use that one instead of this.  This was
+ * generated by regen/regcharclass.pl, and then moved here.  The lines that
+ * generated it were then commented out.  This was done solely because it takes
+ * on the order of 10 minutes to generate, and is never going to change, unless
+ * the generated code is improved.
+ *
+ * The EBCDIC versions have been cut to not cover all of legal Unicode, so
+ * don't take too long to generate, and there is a separate one for each code
+ * page, so they are in regcharclass.h instead of here */
 /*
 	UTF8_CHAR: Matches utf8 from 1 to 4 bytes
 
@@ -629,13 +638,30 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 : 0 )
 #endif
 
-/* IS_UTF8_CHAR(p) is strictly speaking wrong (not UTF-8) because it
- * (1) allows UTF-8 encoded UTF-16 surrogates
- * (2) it allows code points past U+10FFFF.
- * The Perl_is_utf8_char() full "slow" code will handle the Perl
- * "extended UTF-8". */
-#define IS_UTF8_CHAR(p, n)      (is_UTF8_CHAR_utf8_safe(p, (p) + (n)) == n)
-
+/*
+ * =for apidoc isUTF8_CHAR
+ *
+ * Returns the number of bytes beginning at C<s> which form a legal UTF-8 (or
+ * UTF-EBCDIC) encoded character, looking no further than C<e - s> bytes into
+ * C<s>.  Returns 0 if the sequence starting at C<s> through C<e - 1> is not
+ * well-formed UTF-8
+
+Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
+machines) is a valid UTF-8 character. */
+
+#define isUTF8_CHAR(s, e)   (((e) <= (s))                                   \
+                             ? 0                                            \
+                             : (UTF8_IS_INVARIANT(*s))                      \
+                               ? 1                                          \
+                               : (((e) - (s)) < UTF8SKIP(s))                \
+                                 ? 0                                        \
+                                 : (IS_UTF8_CHAR_FAST(UTF8SKIP(s)))         \
+                                   ? is_UTF8_CHAR_utf8_safe(s,e)            \
+                                   : _is_utf8_char_slow(s, e))
+
+/* Do not use; should be deprecated.  Use isUTF8_CHAR() instead; this is
+ * retained solely for backwards compatibility */
+#define IS_UTF8_CHAR(p, n)      (isUTF8_CHAR(p, (p) + (n)) == n)
 
 #endif /* H_UTF8 */
author	Karl Williamson <khw@cpan.org>	2014-05-05 20:43:47 -0600
committer	Karl Williamson <khw@cpan.org>	2014-05-31 11:42:40 -0600
commit	6302f837102d66f532a1c151f7299abbef3a15dd (patch)
tree	242fe154607368b270e65e29f81f8aed4214259c
parent	d9f92374c5f4b19ed46c29c6710922b80429de59 (diff)
download	perl-6302f837102d66f532a1c151f7299abbef3a15dd.tar.gz