diff options
-rw-r--r-- | ebcdic_tables.h | 78 | ||||
-rw-r--r-- | embed.fnc | 2 | ||||
-rw-r--r-- | embed.h | 1 | ||||
-rw-r--r-- | inline.h | 57 | ||||
-rw-r--r-- | perl.h | 94 | ||||
-rw-r--r-- | proto.h | 7 | ||||
-rw-r--r-- | regcharclass.h | 2 | ||||
-rw-r--r-- | regen/ebcdic.pl | 91 | ||||
-rwxr-xr-x | regen/regcharclass.pl | 47 | ||||
-rw-r--r-- | utf8.h | 62 |
10 files changed, 331 insertions, 110 deletions
diff --git a/ebcdic_tables.h b/ebcdic_tables.h index c75399afce..08c7dc6bdb 100644 --- a/ebcdic_tables.h +++ b/ebcdic_tables.h @@ -373,6 +373,45 @@ SOFTWARE. }; # endif + +/* The table below is adapted from + * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + * See copyright notice at the beginning of this file. + */ + +# ifndef DOINIT +# EXTCONST U8 C9_utf8_dfa_tab[]; +# else +# EXTCONST U8 C9_utf8_dfa_tab[] = { +/* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ +/*0_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +/*1_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +/*2_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +/*3_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +/*4_ */ 0, 9, 9, 10, 10, 10, 10, 10, 10, 11, 11, 0, 0, 0, 0, 0, +/*5_ */ 0, 11, 11, 11, 11, 11, 11, 12, 12, 12, 0, 0, 0, 0, 0, 0, +/*6_ */ 0, 0, 12, 12, 12, 13, 13, 12, 12, 12, 12, 0, 0, 0, 0, 0, +/*7_ */ 12, 12, 12, 12, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, +/*8_ */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, +/*9_ */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, +/*A_ */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 2, 2, +/*B_ */ 2, 2, 2, 2, 2, 2, 2, 1, 3, 3, 3, 3, 3, 0, 3, 3, +/*C_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, +/*D_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 6, 5, 4, 4, +/*E_ */ 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 8, 7, 1, +/*F_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, +/*N0= 0*/ 0, 1, 14, 28, 42, 70, 56, 98, 84, 1, 1, 1, 1, 1, +/*N1=14*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, +/*N2=28*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 14, 14, 14, 14, 14, +/*N3=42*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 28, 28, 28, 28, 28, +/*N4=56*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 28, 28, +/*N5=70*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 28, 28, 28, 28, 1, +/*N6=84*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 42, 42, 42, +/*N7=98*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 42, 1, 1, 1, 1 +/* 0 1 2 3 4 5 6 7 8 9 10 11 12 13*/ +}; +# endif + #endif /* EBCDIC 1047 */ #if 'A' == 193 /* EBCDIC 037 */ \ @@ -711,6 +750,45 @@ 
SOFTWARE. }; # endif + +/* The table below is adapted from + * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + * See copyright notice at the beginning of this file. + */ + +# ifndef DOINIT +# EXTCONST U8 C9_utf8_dfa_tab[]; +# else +# EXTCONST U8 C9_utf8_dfa_tab[] = { +/* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ +/*0_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +/*1_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +/*2_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +/*3_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +/*4_ */ 0, 9, 9, 10, 10, 10, 10, 10, 10, 11, 11, 0, 0, 0, 0, 0, +/*5_ */ 0, 11, 11, 11, 11, 11, 11, 12, 12, 12, 0, 0, 0, 0, 0, 12, +/*6_ */ 0, 0, 12, 12, 13, 13, 12, 12, 12, 12, 12, 0, 0, 0, 0, 0, +/*7_ */ 12, 12, 12, 1, 1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, +/*8_ */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, +/*9_ */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, +/*A_ */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, +/*B_ */ 0, 2, 2, 2, 2, 2, 1, 3, 3, 3, 0, 0, 3, 3, 3, 3, +/*C_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, +/*D_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 6, 5, 4, 4, +/*E_ */ 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 8, 7, 1, +/*F_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, +/*N0= 0*/ 0, 1, 14, 28, 42, 70, 56, 98, 84, 1, 1, 1, 1, 1, +/*N1=14*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, +/*N2=28*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 14, 14, 14, 14, 14, +/*N3=42*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 28, 28, 28, 28, 28, +/*N4=56*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 28, 28, +/*N5=70*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 28, 28, 28, 28, 1, +/*N6=84*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 42, 42, 42, +/*N7=98*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 42, 1, 1, 1, 1 +/* 0 1 2 3 4 5 6 7 8 9 10 11 12 13*/ +}; +# endif + #endif /* EBCDIC 037 */ #endif /* PERL_EBCDIC_TABLES_H_ */ @@ -829,6 +829,8 @@ AnidR |Size_t |isUTF8_CHAR|NN const U8 * const s0 \ |NN const U8 * const e AnidR |Size_t |isSTRICT_UTF8_CHAR |NN const U8 * const s0 \ |NN const U8 * 
const e +AnidR |Size_t |isC9_STRICT_UTF8_CHAR |NN const U8 * const s0 \ + |NN const U8 * const e AnmdpR |bool |is_utf8_string |NN const U8 *s|STRLEN len AnidR |bool |is_utf8_string_flags \ |NN const U8 *s|STRLEN len|const U32 flags @@ -259,6 +259,7 @@ #define init_stacks() Perl_init_stacks(aTHX) #define init_tm(a) Perl_init_tm(aTHX_ a) #define intro_my() Perl_intro_my(aTHX) +#define isC9_STRICT_UTF8_CHAR S_isC9_STRICT_UTF8_CHAR #define isSTRICT_UTF8_CHAR S_isSTRICT_UTF8_CHAR #define isUTF8_CHAR S_isUTF8_CHAR #define is_c9strict_utf8_string_loclen S_is_c9strict_utf8_string_loclen @@ -1156,6 +1156,63 @@ S_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) /* +=for apidoc Am|STRLEN|isC9_STRICT_UTF8_CHAR|const U8 *s|const U8 *e + +Evaluates to non-zero if the first few bytes of the string starting at C<s> and +looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some +Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero, +the value gives how many bytes starting at C<s> comprise the code point's +representation. Any bytes remaining before C<e>, but beyond the ones needed to +form the first code point in C<s>, are not examined. + +The largest acceptable code point is the Unicode maximum 0x10FFFF. This +differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character +code points. This corresponds to +L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>. +which said that non-character code points are merely discouraged rather than +completely forbidden in open interchange. See +L<perlunicode/Noncharacter code points>. + +Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and +C<L</isUTF8_CHAR_flags>> for a more customized definition. + +Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and +C<L</is_c9strict_utf8_string_loclen>> to check entire strings. 
+ +=cut + +This uses an adaptation of the tables and algorithm given in +http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive +documentation of the original version. A copyright notice for the original +version is given at the beginning of this file. The Perl adaptation is +documented at the definition of C9_utf8_dfa_tab[]. + +*/ + +PERL_STATIC_INLINE Size_t +S_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) +{ + const U8 * s = s0; + UV state = 0; + + PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR; + + while (s < e && LIKELY(state != 1)) { + state = C9_utf8_dfa_tab[256 + state + C9_utf8_dfa_tab[*s]]; + + if (state != 0) { + s++; + continue; + } + + return s - s0 + 1; + } + + return 0; +} + +/* + =for apidoc is_strict_utf8_string_loc Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the @@ -5749,10 +5749,104 @@ EXTCONST U8 strict_utf8_dfa_tab[] = { /*N11*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, }; +/* And below is yet another version of the above tables that accepts only UTF-8 + * as defined by Corrigendum #9. Hence no surrogates nor non-Unicode, but + * it allows non-characters. This is isomorphic to the original table + * in http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + * + * The classes are + * 00-7F 0 + * 80-8F 9 + * 90-9F 10 + * A0-BF 11 + * C0,C1 1 + * C2-DF 2 + * E0 7 + * E1-EC 3 + * ED 4 + * EE-EF 3 + * F0 8 + * F1-F3 6 (6 bits can be stripped) + * F4 5 (only 5 can be stripped) + * F5-FF 1 + */ + +EXTCONST U8 C9_utf8_dfa_tab[] = { + /* The first part of the table maps bytes to character classes to reduce + * the size of the transition table and create bitmasks.
*/ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*00-0F*/ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*10-1F*/ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*20-2F*/ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*30-3F*/ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*40-4F*/ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*50-5F*/ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*60-6F*/ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*70-7F*/ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, /*80-8F*/ + 10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10, /*90-9F*/ + 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, /*A0-AF*/ + 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, /*B0-BF*/ + 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /*C0-CF*/ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /*D0-DF*/ + 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, /*E0-EF*/ + 8, 6, 6, 6, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*F0-FF*/ + +/* The second part is a transition table that maps a combination + * of a state of the automaton and a character class to a new state, called a + * node. The nodes are: + * N0 The initial state, and final accepting one. + * N1 Any one continuation byte (80-BF) left. This is transitioned to + * immediately when the start byte indicates a two-byte sequence + * N2 Any two continuation bytes left. + * N3 Any three continuation bytes left. + * N4 Start byte is E0. Continuation bytes 80-9F are illegal (overlong); + * the other continuations transition to state N1 + * N5 Start byte is ED. Continuation bytes A0-BF all lead to surrogates, + * so are illegal. The other continuations transition to state N1. + * N6 Start byte is F0. Continuation bytes 80-8F are illegal (overlong); + * the other continuations transition to N2 + * N7 Start byte is F4. Continuation bytes 90-BF are illegal + * (non-unicode); the other continuations transition to N2 + * 1 Reject. 
All transitions not mentioned above (except the single + * byte ones (as they are always legal) are to this state. + */ + +# undef N0 +# undef N1 +# undef N2 +# undef N3 +# undef N4 +# undef N5 +# undef N6 +# undef N7 +# undef NUM_CLASSES +# define NUM_CLASSES 12 +# define N0 0 +# define N1 ((N0) + NUM_CLASSES) +# define N2 ((N1) + NUM_CLASSES) +# define N3 ((N2) + NUM_CLASSES) +# define N4 ((N3) + NUM_CLASSES) +# define N5 ((N4) + NUM_CLASSES) +# define N6 ((N5) + NUM_CLASSES) +# define N7 ((N6) + NUM_CLASSES) + +/*Class: 0 1 2 3 4 5 6 7 8 9 10 11 */ +/*N0*/ 0, 1, N1, N2, N5, N7, N3, N4, N6, 1, 1, 1, +/*N1*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, +/*N2*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, N1, N1, N1, +/*N3*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, N2, N2, N2, + +/*N4*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, N1, +/*N5*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, N1, N1, 1, +/*N6*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, N2, N2, +/*N7*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, N2, 1, 1, +}; + # else /* End of is DOINIT */ EXTCONST U8 perl_extended_utf8_dfa_tab[]; EXTCONST U8 strict_utf8_dfa_tab[]; +EXTCONST U8 C9_utf8_dfa_tab[]; # endif #endif /* end of isn't EBCDIC */ @@ -1407,6 +1407,13 @@ PERL_CALLCONV bool Perl_io_close(pTHX_ IO* io, GV *gv, bool not_implicit, bool w #define PERL_ARGS_ASSERT_IO_CLOSE \ assert(io) #ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE Size_t S_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR \ + assert(s0); assert(e) +#endif + +#ifndef PERL_NO_INLINE_FUNCTIONS PERL_STATIC_INLINE Size_t S_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR \ diff --git a/regcharclass.h b/regcharclass.h index 3c2a94cb66..96a483ca8c 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -1910,6 +1910,6 @@ * 6aaacc29ce24746bcb2bf82a920fcf90e07cf92d75325199c50f40754d39bb72 lib/unicore/mktables * 
21653d2744fdd071f9ef138c805393901bb9547cf3e777ebf50215a191f986ea lib/unicore/version * 4bb677187a1a64e39d48f2e341b5ecb6c99857e49d7a79cf503bd8a3c709999b regen/charset_translations.pl - * 10e68a929fd4a2cb367c0caf2f82104de6b1c769a4fc4db8a5d87408c6b3c07b regen/regcharclass.pl + * f4fe24c2a005b636f6e7418df060c6c9a57ddcae4219b54d0cb18baf312dbc3a regen/regcharclass.pl * 393f8d882713a3ba227351ad0f00ea4839fda74fcf77dcd1cdf31519925adba5 regen/regcharclass_multi_char_folds.pl * ex: set ro: */ diff --git a/regen/ebcdic.pl b/regen/ebcdic.pl index 33da6df551..ffc294d8d4 100644 --- a/regen/ebcdic.pl +++ b/regen/ebcdic.pl @@ -666,6 +666,97 @@ END output_table(\@strict_utf8_dfa, "strict_utf8_dfa_tab", $NUM_CLASSES); } + { + # This generates the dfa table for C9 strict UTF-8, which rejects + # surrogates and above Unicode, but allows non-characters. + # + # The classes are + # 00-9F 0 Always legal at start + # A0-A1 9 Not legal immediately after start bytes F0 F8 + # A2-A7 10 Not legal immediately after start bytes F0 F8 F9 + # A8-AF 11 Not legal immediately after start bytes F0 F9 + # B0-B5,B8-BF 12 Not legal immediately after start byte F9 + # B6,B7 13 + # C0-C4 1 (reject, all are overlong) + # C5-DF 2 Accepts any legal continuation + # E0 1 (reject, all are overlong) + # E1-EF 3 Accepts any legal continuation + # F0 6 (has overlongs) + # F1 5 (has surrogates) + # F2-F7 4 Accepts any legal continuation + # F8 8 (has overlongs) + # F9 7 (has non-Unicode) + # FA-FF 1 (reject, all are non-Unicode) + # + # The first part of the table maps bytes to character classes to reduce + # the size of the transition table and create bitmasks. + # + # The second part is a transition table that maps a combination of a + # state of the automaton and a character class to a new state. The + # numbering of the original nodes is retained, but some have been split + # so that there are new nodes. They mean: + # N0 The initial state, and final accepting one. + # N1 One continuation byte (A0-BF) left.
This is transitioned to + # immediately when the start byte indicates a two-byte sequence + # N2 Two continuation bytes left. + # N3 Three continuation bytes left. + # N4 Start byte is F0. Continuation bytes A[0-F] are illegal + # (overlong); the other continuations transition to N2 + # N5 Start byte is F1. B6 and B7 are illegal (surrogates); the + # other continuations transition to N2 + # N6 Start byte is F8. Continuation bytes A[0-7] are illegal + # (overlong); the other continuations transition to N3 + # N7 Start byte is F9. Continuation bytes A0 and A1 transition to + # N3; the other continuation bytes are illegal (non-Unicode) + # 1 Reject. All transitions not mentioned above (except the single + # byte ones (as they are always legal) are to this state. + + my $NUM_CLASSES = 14; + my $N0 = 0; + my $N1 = $N0 + $NUM_CLASSES; + my $N2 = $N1 + $NUM_CLASSES; + my $N3 = $N2 + $NUM_CLASSES; + my $N4 = $N3 + $NUM_CLASSES; + my $N5 = $N4 + $NUM_CLASSES; + my $N6 = $N5 + $NUM_CLASSES; + my $N7 = $N6 + $NUM_CLASSES; + + my @C9_utf8_dfa; + my @i8 = ( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F + 9, 9,10,10,10,10,10,10,11,11,11,11,11,11,11,11, # A0-AF + 12,12,12,12,12,12,13,13,12,12,12,12,12,12,12,12, # B0-BF + 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF + 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF + 6, 5, 4, 4, 4, 4, 4, 4, 8, 7, 1, 1, 1, 1, 1, 1, # F0-FF + ); + 
$C9_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255); + push @C9_utf8_dfa, ( + # Class: + # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 + 0,1,$N1,$N2,$N3,$N5,$N4,$N7,$N6, 1, 1, 1, 1, 1, # N0 + 1,1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, # N1 + 1,1, 1, 1, 1, 1, 1, 1, 1,$N1, $N1, $N1, $N1, $N1, # N2 + 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, $N2, # N3 + + 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, # N4 + 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, 1, # N5 + 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3, $N3, $N3, # N6 + 1,1, 1, 1, 1, 1, 1, 1, 1,$N3, 1, 1, 1, 1, # N7 + ); + output_table(\@C9_utf8_dfa, "C9_utf8_dfa_tab", $NUM_CLASSES); + } + print $out_fh get_conditional_compile_line_end(); } diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index 3dee00060b..84936360d9 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -1637,53 +1637,6 @@ SURROGATE: Surrogate code points => UTF8 :safe \p{_Perl_Surrogate} -# This program was run with this enabled, and the results copied to utf8.h and -# utfebcdic.h; then this was commented out because it takes so long to figure -# out these 2 million code points. The results would not change unless utf8.h -# decides it wants a different maximum, or this program creates better -# optimizations. Trying with 5 bytes used too much memory to calculate. -# -# We don't generate code for invariants here because the EBCDIC form is too -# complicated and would slow things down; instead the user should test for -# invariants first. -# -# 0x1FFFFF was chosen because for both UTF-8 and UTF-EBCDIC, its start byte -# is the same as 0x10FFFF, and it includes all the above-Unicode code points -# that have that start byte. In other words, it is the natural stopping place -# that includes all Unicode code points. 
-# -#STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points, no surrrogates nor non-character code points -#=> UTF8 :no_length_checks only_ebcdic_platform -#0x00A0 - 0xD7FF -#0xE000 - 0xFDCF -#0xFDF0 - 0xFFFD -#0x10000 - 0x1FFFD -#0x20000 - 0x2FFFD -#0x30000 - 0x3FFFD -#0x40000 - 0x4FFFD -#0x50000 - 0x5FFFD -#0x60000 - 0x6FFFD -#0x70000 - 0x7FFFD -#0x80000 - 0x8FFFD -#0x90000 - 0x9FFFD -#0xA0000 - 0xAFFFD -#0xB0000 - 0xBFFFD -#0xC0000 - 0xCFFFD -#0xD0000 - 0xDFFFD -#0xE0000 - 0xEFFFD -#0xF0000 - 0xFFFFD -#0x100000 - 0x10FFFD - -#C9_STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points, no surrogates -#=> UTF8 :no_length_checks only_ascii_platform -#0x0080 - 0xD7FF -#0xE000 - 0x10FFFF -# -#C9_STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points including non-character code points, no surrogates -#=> UTF8 :no_length_checks only_ebcdic_platform -#0x00A0 - 0xD7FF -#0xE000 - 0x10FFFF - QUOTEMETA: Meta-characters that \Q should quote => high :fast \p{_Perl_Quotemeta} @@ -316,31 +316,6 @@ C<cp> is Unicode if above 255; otherwise is platform-native. #define isUTF8_POSSIBLY_PROBLEMATIC(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ (U8) c >= 0xED) -/* A helper macro for isC9_STRICT_UTF8_CHAR, so use that one instead of this. - * Like is_UTF8_CHAR_utf8_no_length_checks(), this was moved here and LIKELYs - * added manually. - * - C9_STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points, - no surrogates - 0x0080 - 0xD7FF - 0xE000 - 0x10FFFF -*/ -/*** GENERATED CODE ***/ -#define is_C9_STRICT_UTF8_CHAR_utf8_no_length_checks(s) \ -( ( 0xC2 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xDF ) ? \ - ( LIKELY( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \ -: ( 0xE0 == ((const U8*)s)[0] ) ? \ - ( LIKELY( ( ( ((const U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 
3 : 0 )\ -: ( ( 0xE1 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xEC ) || ( ((const U8*)s)[0] & 0xFE ) == 0xEE ) ?\ - ( LIKELY( ( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\ -: ( 0xED == ((const U8*)s)[0] ) ? \ - ( LIKELY( ( ( ((const U8*)s)[1] & 0xE0 ) == 0x80 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\ -: ( 0xF0 == ((const U8*)s)[0] ) ? \ - ( LIKELY( ( ( 0x90 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0xBF ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\ -: ( 0xF1 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xF3 ) ? \ - ( LIKELY( ( ( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\ -: LIKELY( ( ( ( 0xF4 == ((const U8*)s)[0] ) && ( ( ((const U8*)s)[1] & 0xF0 ) == 0x80 ) ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 ) - #define UNICODE_IS_PERL_EXTENDED(uv) UNLIKELY((UV) (uv) > 0x7FFFFFFF) #endif /* EBCDIC vs ASCII */ @@ -955,43 +930,6 @@ point's representation. /* -=for apidoc Am|STRLEN|isC9_STRICT_UTF8_CHAR|const U8 *s|const U8 *e - -Evaluates to non-zero if the first few bytes of the string starting at C<s> and -looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some -Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero, -the value gives how many bytes starting at C<s> comprise the code point's -representation. Any bytes remaining before C<e>, but beyond the ones needed to -form the first code point in C<s>, are not examined. - -The largest acceptable code point is the Unicode maximum 0x10FFFF. This -differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character -code points. This corresponds to -L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>. 
-which said that non-character code points are merely discouraged rather than -completely forbidden in open interchange. See -L<perlunicode/Noncharacter code points>. - -Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and -C<L</isUTF8_CHAR_flags>> for a more customized definition. - -Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and -C<L</is_c9strict_utf8_string_loclen>> to check entire strings. - -=cut -*/ - -#define isC9_STRICT_UTF8_CHAR(s, e) \ - (UNLIKELY((e) <= (s)) \ - ? 0 \ - : (UTF8_IS_INVARIANT(*s)) \ - ? 1 \ - : UNLIKELY(((e) - (s)) < UTF8SKIP(s)) \ - ? 0 \ - : is_C9_STRICT_UTF8_CHAR_utf8_no_length_checks(s)) - -/* - =for apidoc Am|STRLEN|isUTF8_CHAR_flags|const U8 *s|const U8 *e| const U32 flags Evaluates to non-zero if the first few bytes of the string starting at C<s> and |