summaryrefslogtreecommitdiff
path: root/utf8.h
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2018-06-27 22:01:53 -0600
committerKarl Williamson <khw@cpan.org>2018-07-05 14:47:19 -0600
commit67049a5ffa8b7757041edb8f972a0a74fbe5d63d (patch)
tree7427d4c3d7a9b99d58a33e86647478916343871d /utf8.h
parente6a4ffc3f7aa69cbf3e5e83518e40e529a34b75b (diff)
downloadperl-67049a5ffa8b7757041edb8f972a0a74fbe5d63d.tar.gz
Make isSTRICT_UTF8_CHAR() an inline function
It was a macro that used a trie. This changes to use the dfa constructed in previous commits. I didn't bother with taking measurements. A dfa should have fewer conditionals for many code points.
Diffstat (limited to 'utf8.h')
-rw-r--r--utf8.h91
1 files changed, 3 insertions, 88 deletions
diff --git a/utf8.h b/utf8.h
index 69fcacc4c0..87b4351873 100644
--- a/utf8.h
+++ b/utf8.h
@@ -316,58 +316,12 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
#define isUTF8_POSSIBLY_PROBLEMATIC(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
(U8) c >= 0xED)
-/* A helper macro for isSTRICT_UTF8_CHAR, so use that one instead of this.
+/* A helper macro for isC9_STRICT_UTF8_CHAR, so use that one instead of this.
* Like is_UTF8_CHAR_utf8_no_length_checks(), this was moved here and LIKELYs
* added manually.
*
- STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points, no
- surrrogates nor non-character code points
-*/
-/*** GENERATED CODE ***/
-#define is_STRICT_UTF8_CHAR_utf8_no_length_checks(s) \
-( ( 0xC2 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xDF ) ? \
- ( LIKELY( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \
-: ( 0xE0 == ((const U8*)s)[0] ) ? \
- ( LIKELY( ( ( ((const U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-: ( ( 0xE1 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xEC ) || 0xEE == ((const U8*)s)[0] ) ?\
- ( ( ( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-: ( 0xED == ((const U8*)s)[0] ) ? \
- ( LIKELY( ( ( ((const U8*)s)[1] & 0xE0 ) == 0x80 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-: ( 0xEF == ((const U8*)s)[0] ) ? \
- ( ( ( 0x80 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0xB6 ) || ( 0xB8 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0xBE ) ) ?\
- ( LIKELY( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ? 3 : 0 ) \
- : ( 0xB7 == ((const U8*)s)[1] ) ? \
- ( LIKELY( ( ((const U8*)s)[2] & 0xF0 ) == 0x80 || ( ((const U8*)s)[2] & 0xF0 ) == 0xB0 ) ? 3 : 0 )\
- : ( ( 0xBF == ((const U8*)s)[1] ) && ( 0x80 <= ((const U8*)s)[2] && ((const U8*)s)[2] <= 0xBD ) ) ? 3 : 0 )\
-: ( 0xF0 == ((const U8*)s)[0] ) ? \
- ( ( ( 0x90 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0x9E ) || ( 0xA0 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0xAE ) || ( 0xB0 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0xBE ) ) ?\
- ( LIKELY( ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
- : ( ((const U8*)s)[1] == 0x9F || ( ( ((const U8*)s)[1] & 0xEF ) == 0xAF ) ) ? \
- ( ( 0x80 <= ((const U8*)s)[2] && ((const U8*)s)[2] <= 0xBE ) ? \
- ( LIKELY( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ? 4 : 0 ) \
- : LIKELY( ( 0xBF == ((const U8*)s)[2] ) && ( 0x80 <= ((const U8*)s)[3] && ((const U8*)s)[3] <= 0xBD ) ) ? 4 : 0 )\
- : 0 ) \
-: ( 0xF1 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xF3 ) ? \
- ( ( ( ( ((const U8*)s)[1] & 0xC8 ) == 0x80 ) || ( ( ((const U8*)s)[1] & 0xCC ) == 0x88 ) || ( ( ((const U8*)s)[1] & 0xCE ) == 0x8C ) || ( ( ((const U8*)s)[1] & 0xCF ) == 0x8E ) ) ?\
- ( LIKELY( ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
- : ( ( ((const U8*)s)[1] & 0xCF ) == 0x8F ) ? \
- ( ( 0x80 <= ((const U8*)s)[2] && ((const U8*)s)[2] <= 0xBE ) ? \
- ( LIKELY( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ? 4 : 0 ) \
- : LIKELY( ( 0xBF == ((const U8*)s)[2] ) && ( 0x80 <= ((const U8*)s)[3] && ((const U8*)s)[3] <= 0xBD ) ) ? 4 : 0 )\
- : 0 ) \
-: ( 0xF4 == ((const U8*)s)[0] ) ? \
- ( ( 0x80 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0x8E ) ? \
- ( LIKELY( ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
- : ( 0x8F == ((const U8*)s)[1] ) ? \
- ( ( 0x80 <= ((const U8*)s)[2] && ((const U8*)s)[2] <= 0xBE ) ? \
- ( LIKELY( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ? 4 : 0 ) \
- : LIKELY( ( 0xBF == ((const U8*)s)[2] ) && ( 0x80 <= ((const U8*)s)[3] && ((const U8*)s)[3] <= 0xBD ) ) ? 4 : 0 )\
- : 0 ) \
-: 0 )
-
-/* Similarly,
- C9_STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code
- points, no surrogates
+ C9_STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points,
+ no surrogates
0x0080 - 0xD7FF
0xE000 - 0x10FFFF
*/
@@ -1001,45 +955,6 @@ point's representation.
/*
-=for apidoc Am|STRLEN|isSTRICT_UTF8_CHAR|const U8 *s|const U8 *e
-
-Evaluates to non-zero if the first few bytes of the string starting at C<s> and
-looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
-Unicode code point completely acceptable for open interchange between all
-applications; otherwise it evaluates to 0. If non-zero, the value gives how
-many bytes starting at C<s> comprise the code point's representation. Any
-bytes remaining before C<e>, but beyond the ones needed to form the first code
-point in C<s>, are not examined.
-
-The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
-be a surrogate nor a non-character code point. Thus this excludes any code
-point from Perl's extended UTF-8.
-
-This is used to efficiently decide if the next few bytes in C<s> is
-legal Unicode-acceptable UTF-8 for a single character.
-
-Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
-#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
-code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
-and C<L</isUTF8_CHAR_flags>> for a more customized definition.
-
-Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
-C<L</is_strict_utf8_string_loclen>> to check entire strings.
-
-=cut
-*/
-
-#define isSTRICT_UTF8_CHAR(s, e) \
- (UNLIKELY((e) <= (s)) \
- ? 0 \
- : (UTF8_IS_INVARIANT(*s)) \
- ? 1 \
- : UNLIKELY(((e) - (s)) < UTF8SKIP(s)) \
- ? 0 \
- : is_STRICT_UTF8_CHAR_utf8_no_length_checks(s))
-
-/*
-
=for apidoc Am|STRLEN|isC9_STRICT_UTF8_CHAR|const U8 *s|const U8 *e
Evaluates to non-zero if the first few bytes of the string starting at C<s> and