summaryrefslogtreecommitdiff
path: root/utf8.h
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2018-06-25 19:11:46 -0600
committerKarl Williamson <khw@cpan.org>2018-07-05 14:47:18 -0600
commit8ed185f9c23fbf6e4b07abbe602e675889d07d4a (patch)
tree2c67fc73b1f4fa6023f5a4be18e0d158ca083c32 /utf8.h
parent3de6d1419f40062b902c4f65a73bde456095805f (diff)
downloadperl-8ed185f9c23fbf6e4b07abbe602e675889d07d4a.tar.gz
Make isUTF8_char() an inline function
It was a macro that used a trie. This changes to use the dfa constructed in previous commits. I didn't bother with taking measurements. A dfa should require fewer conditionals to be executed for many code points.
Diffstat (limited to 'utf8.h')
-rw-r--r--utf8.h73
1 files changed, 0 insertions, 73 deletions
diff --git a/utf8.h b/utf8.h
index a164ee495b..69fcacc4c0 100644
--- a/utf8.h
+++ b/utf8.h
@@ -316,33 +316,6 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
#define isUTF8_POSSIBLY_PROBLEMATIC(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
(U8) c >= 0xED)
-/* A helper macro for isUTF8_CHAR, so use that one instead of this. This was
- * generated by regen/regcharclass.pl, and then moved here. Then it was
- * hand-edited to add some LIKELY() calls, presuming that malformations are
- * unlikely. The lines that generated it were then commented out. This was
- * done because it takes on the order of 10 minutes to generate, and is never
- * going to change, unless the generated code is improved, and figuring out
- * the LIKELYs there would be hard.
- *
- UTF8_CHAR: Matches legal UTF-8 variant code points up through 0x1FFFFFF
-
- 0x80 - 0x1FFFFF
-*/
-/*** GENERATED CODE ***/
-#define is_UTF8_CHAR_utf8_no_length_checks(s) \
-( ( 0xC2 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xDF ) ? \
- ( LIKELY( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \
-: ( 0xE0 == ((const U8*)s)[0] ) ? \
- ( LIKELY( ( ( ((const U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-: ( 0xE1 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xEF ) ? \
- ( LIKELY( ( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-: ( 0xF0 == ((const U8*)s)[0] ) ? \
- ( LIKELY( ( ( 0x90 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0xBF ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
-: ( ( ( ( 0xF1 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xF7 ) && LIKELY( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) ) && LIKELY( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) && LIKELY( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )
-
-/* The above macro handles UTF-8 that has this start byte as the maximum */
-#define _IS_UTF8_CHAR_HIGHEST_START_BYTE 0xF7
-
/* A helper macro for isSTRICT_UTF8_CHAR, so use that one instead of this.
* Like is_UTF8_CHAR_utf8_no_length_checks(), this was moved here and LIKELYs
* added manually.
@@ -1022,52 +995,6 @@ point's representation.
#define SHARP_S_SKIP 2
-/*
-
-=for apidoc Am|STRLEN|isUTF8_CHAR|const U8 *s|const U8 *e
-
-Evaluates to non-zero if the first few bytes of the string starting at C<s> and
-looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
-that represents some code point; otherwise it evaluates to 0. If non-zero, the
-value gives how many bytes starting at C<s> comprise the code point's
-representation. Any bytes remaining before C<e>, but beyond the ones needed to
-form the first code point in C<s>, are not examined.
-
-The code point can be any that will fit in a UV on this machine, using Perl's
-extension to official UTF-8 to represent those higher than the Unicode maximum
-of 0x10FFFF. That means that this macro is used to efficiently decide if the
-next few bytes in C<s> is legal UTF-8 for a single character.
-
-Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
-defined by Unicode to be fully interchangeable across applications;
-C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
-#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
-code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
-
-Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
-C<L</is_utf8_string_loclen>> to check entire strings.
-
-Note that it is deprecated to use code points higher than what will fit in an
-IV. This macro does not raise any warnings for such code points, treating them
-as valid.
-
-Note also that a UTF-8 INVARIANT character (i.e. ASCII on non-EBCDIC machines)
-is a valid UTF-8 character.
-
-=cut
-*/
-
-#define isUTF8_CHAR(s, e) \
- (UNLIKELY((e) <= (s)) \
- ? 0 \
- : (UTF8_IS_INVARIANT(*s)) \
- ? 1 \
- : UNLIKELY(((e) - (s)) < UTF8SKIP(s)) \
- ? 0 \
- : LIKELY(NATIVE_UTF8_TO_I8(*s) <= _IS_UTF8_CHAR_HIGHEST_START_BYTE) \
- ? is_UTF8_CHAR_utf8_no_length_checks(s) \
- : _is_utf8_char_helper(s, e, 0))
-
#define is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)
#define bytes_from_utf8(s, lenp, is_utf8p) \
bytes_from_utf8_loc(s, lenp, is_utf8p, 0)