diff options
-rw-r--r-- | embed.fnc | 7 | ||||
-rw-r--r-- | embed.h | 7 | ||||
-rw-r--r-- | embedvar.h | 7 | ||||
-rw-r--r-- | intrpvar.h | 7 | ||||
-rw-r--r-- | proto.h | 42 | ||||
-rw-r--r-- | regen/unicode_constants.pl | 3 | ||||
-rw-r--r-- | regexec.c | 78 | ||||
-rw-r--r-- | sv.c | 8 | ||||
-rw-r--r-- | unicode_constants.h | 3 | ||||
-rw-r--r-- | utf8.c | 100 |
10 files changed, 27 insertions, 235 deletions
@@ -661,16 +661,9 @@ ApR |bool |is_utf8_punct |NN const U8 *p ApR |bool |is_utf8_xdigit |NN const U8 *p ApR |bool |is_utf8_mark |NN const U8 *p EXpR |bool |is_utf8_X_extend |NN const U8 *p -EXpR |bool |is_utf8_X_prepend |NN const U8 *p EXpR |bool |is_utf8_X_regular_begin|NN const U8 *p -EXpR |bool |is_utf8_X_special_begin|NN const U8 *p -EXpR |bool |is_utf8_X_L |NN const U8 *p -EXpR |bool |is_utf8_X_RI |NN const U8 *p :not currently used EXpR |bool |is_utf8_X_LV |NN const U8 *p EXpR |bool |is_utf8_X_LVT |NN const U8 *p -EXpR |bool |is_utf8_X_LV_LVT_V |NN const U8 *p -EXpR |bool |is_utf8_X_T |NN const U8 *p -EXpR |bool |is_utf8_X_V |NN const U8 *p : Used in perly.y p |OP* |jmaybe |NN OP *o : Used in pp.c @@ -855,16 +855,9 @@ #define _is_utf8__perl_idstart(a) Perl__is_utf8__perl_idstart(aTHX_ a) #define av_reify(a) Perl_av_reify(aTHX_ a) #define current_re_engine() Perl_current_re_engine(aTHX) -#define is_utf8_X_L(a) Perl_is_utf8_X_L(aTHX_ a) #define is_utf8_X_LVT(a) Perl_is_utf8_X_LVT(aTHX_ a) -#define is_utf8_X_LV_LVT_V(a) Perl_is_utf8_X_LV_LVT_V(aTHX_ a) -#define is_utf8_X_RI(a) Perl_is_utf8_X_RI(aTHX_ a) -#define is_utf8_X_T(a) Perl_is_utf8_X_T(aTHX_ a) -#define is_utf8_X_V(a) Perl_is_utf8_X_V(aTHX_ a) #define is_utf8_X_extend(a) Perl_is_utf8_X_extend(aTHX_ a) -#define is_utf8_X_prepend(a) Perl_is_utf8_X_prepend(aTHX_ a) #define is_utf8_X_regular_begin(a) Perl_is_utf8_X_regular_begin(aTHX_ a) -#define is_utf8_X_special_begin(a) Perl_is_utf8_X_special_begin(aTHX_ a) #define op_clear(a) Perl_op_clear(aTHX_ a) #define qerror(a) Perl_qerror(aTHX_ a) #define reg_named_buff(a,b,c,d) Perl_reg_named_buff(aTHX_ a,b,c,d) diff --git a/embedvar.h b/embedvar.h index 877e81161f..d3eeaf03a0 100644 --- a/embedvar.h +++ b/embedvar.h @@ -353,16 +353,9 @@ #define PL_unitcheckav_save (vTHX->Iunitcheckav_save) #define PL_unlockhook (vTHX->Iunlockhook) #define PL_unsafe (vTHX->Iunsafe) -#define PL_utf8_X_L (vTHX->Iutf8_X_L) #define PL_utf8_X_LVT (vTHX->Iutf8_X_LVT) -#define PL_utf8_X_LV_LVT_V (vTHX->Iutf8_X_LV_LVT_V) -#define PL_utf8_X_RI (vTHX->Iutf8_X_RI) -#define PL_utf8_X_T (vTHX->Iutf8_X_T) -#define PL_utf8_X_V (vTHX->Iutf8_X_V) #define PL_utf8_X_extend (vTHX->Iutf8_X_extend) -#define PL_utf8_X_prepend (vTHX->Iutf8_X_prepend) #define PL_utf8_X_regular_begin (vTHX->Iutf8_X_regular_begin) -#define PL_utf8_X_special_begin (vTHX->Iutf8_X_special_begin) #define PL_utf8_alnum (vTHX->Iutf8_alnum) #define PL_utf8_alpha (vTHX->Iutf8_alpha) #define PL_utf8_blank (vTHX->Iutf8_blank) diff --git a/intrpvar.h b/intrpvar.h index 94b7425c10..641cac6268 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -628,14 +628,7 @@ PERLVAR(I, utf8_xdigit, SV *) PERLVAR(I, utf8_mark, SV *) PERLVAR(I, utf8_X_regular_begin, SV *) PERLVAR(I, utf8_X_extend, SV *) -PERLVAR(I, utf8_X_prepend, SV *) -PERLVAR(I, utf8_X_special_begin, SV *) -PERLVAR(I, utf8_X_L, SV *) PERLVAR(I, utf8_X_LVT, SV *) -PERLVAR(I, utf8_X_RI, SV *) -PERLVAR(I, utf8_X_T, SV *) -PERLVAR(I, utf8_X_V, SV *) -PERLVAR(I, utf8_X_LV_LVT_V, SV *) PERLVAR(I, utf8_toupper, SV *) PERLVAR(I, utf8_totitle, SV *) PERLVAR(I, utf8_tolower, SV *) @@ -1764,66 +1764,24 @@ PERL_CALLCONV bool Perl_is_uni_xdigit_lc(pTHX_ UV c) __attribute__warn_unused_result__ __attribute__pure__; -PERL_CALLCONV bool Perl_is_utf8_X_L(pTHX_ const U8 *p) - __attribute__warn_unused_result__ - __attribute__nonnull__(pTHX_1); -#define PERL_ARGS_ASSERT_IS_UTF8_X_L \ - assert(p) - PERL_CALLCONV bool Perl_is_utf8_X_LVT(pTHX_ const U8 *p) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_IS_UTF8_X_LVT \ assert(p) -PERL_CALLCONV bool Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p) - __attribute__warn_unused_result__ - __attribute__nonnull__(pTHX_1); -#define PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V \ - assert(p) - -PERL_CALLCONV bool Perl_is_utf8_X_RI(pTHX_ const U8 *p) - __attribute__warn_unused_result__ - __attribute__nonnull__(pTHX_1); -#define PERL_ARGS_ASSERT_IS_UTF8_X_RI \ - assert(p) - -PERL_CALLCONV bool Perl_is_utf8_X_T(pTHX_ const U8 *p) - __attribute__warn_unused_result__ - __attribute__nonnull__(pTHX_1); -#define PERL_ARGS_ASSERT_IS_UTF8_X_T \ - assert(p) - -PERL_CALLCONV bool Perl_is_utf8_X_V(pTHX_ const U8 *p) - __attribute__warn_unused_result__ - __attribute__nonnull__(pTHX_1); -#define PERL_ARGS_ASSERT_IS_UTF8_X_V \ - assert(p) - PERL_CALLCONV bool Perl_is_utf8_X_extend(pTHX_ const U8 *p) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND \ assert(p) -PERL_CALLCONV bool Perl_is_utf8_X_prepend(pTHX_ const U8 *p) - __attribute__warn_unused_result__ - __attribute__nonnull__(pTHX_1); -#define PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND \ - assert(p) - PERL_CALLCONV bool Perl_is_utf8_X_regular_begin(pTHX_ const U8 *p) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_IS_UTF8_X_REGULAR_BEGIN \ assert(p) -PERL_CALLCONV bool Perl_is_utf8_X_special_begin(pTHX_ const U8 *p) - __attribute__warn_unused_result__ - __attribute__nonnull__(pTHX_1); -#define PERL_ARGS_ASSERT_IS_UTF8_X_SPECIAL_BEGIN \ - assert(p) - PERL_CALLCONV bool Perl_is_utf8_alnum(pTHX_ const U8 *p) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1); diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl index 73ec4ae0bf..56e53491b1 100644 --- a/regen/unicode_constants.pl +++ b/regen/unicode_constants.pl @@ -127,9 +127,6 @@ __DATA__ 03C5 first 03C5 tail -1100 -1160 -11A8 2010 string 007F native @@ -145,14 +145,7 @@ /* No asserts are done for some of these, in case called on a */ \ /* Unicode version in which they map to nothing */ \ LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin); \ LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* empty in most releases*/ \ - LOAD_UTF8_CHARCLASS(X_L, HANGUL_CHOSEONG_KIYEOK_UTF8); \ - LOAD_UTF8_CHARCLASS(X_LV_LVT_V, HANGUL_JUNGSEONG_FILLER_UTF8); \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_RI); /* empty in many releases */ \ - LOAD_UTF8_CHARCLASS(X_T, HANGUL_JONGSEONG_KIYEOK_UTF8); \ - LOAD_UTF8_CHARCLASS(X_V, HANGUL_JUNGSEONG_FILLER_UTF8) #define PLACEHOLDER /* Something for the preprocessor to grab onto */ @@ -4058,6 +4051,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) locinput += 2; } else { + STRLEN len; + /* In case have to backtrack to beginning, then match '.' */ char *starting = locinput; @@ -4066,16 +4061,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) LOAD_UTF8_CHARCLASS_GCB(); - /* Match (prepend)*, but don't bother trying if empty (as - * being set to _undef indicates) */ - if (PL_utf8_X_prepend != &PL_sv_undef) { - while (locinput < PL_regeol - && swash_fetch(PL_utf8_X_prepend, - (U8*)locinput, utf8_target)) - { - previous_prepend = locinput; - locinput += UTF8SKIP(locinput); - } + /* Match (prepend)* */ + while (locinput < PL_regeol + && (len = is_GCB_Prepend_utf8(locinput))) + { + previous_prepend = locinput; + locinput += len; } /* As noted above, if we matched a prepend character, but @@ -4085,8 +4076,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) && (locinput >= PL_regeol || (! swash_fetch(PL_utf8_X_regular_begin, (U8*)locinput, utf8_target) - && ! swash_fetch(PL_utf8_X_special_begin, - (U8*)locinput, utf8_target))) + && ! is_GCB_SPECIAL_BEGIN_utf8(locinput))) ) { locinput = previous_prepend; @@ -4101,9 +4091,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) (U8*)locinput, utf8_target)) { locinput += UTF8SKIP(locinput); } - else if (! swash_fetch(PL_utf8_X_special_begin, - (U8*)locinput, utf8_target)) - { + else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) { /* Here did not match the required 'Begin' in the * second term. So just match the very first @@ -4115,26 +4103,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* Here is a special begin. It can be composed of * several individual characters. One possibility is * RI+ */ - if (swash_fetch(PL_utf8_X_RI, - (U8*)locinput, utf8_target)) - { - locinput += UTF8SKIP(locinput); + if ((len = is_GCB_RI_utf8(locinput))) { + locinput += len; while (locinput < PL_regeol - && swash_fetch(PL_utf8_X_RI, - (U8*)locinput, utf8_target)) + && (len = is_GCB_RI_utf8(locinput))) { - locinput += UTF8SKIP(locinput); + locinput += len; } - } else /* Another possibility is T+ */ - if (swash_fetch(PL_utf8_X_T, - (U8*)locinput, utf8_target)) - { - locinput += UTF8SKIP(locinput); + } else if ((len = is_GCB_T_utf8(locinput))) { + /* Another possibility is T+ */ + locinput += len; while (locinput < PL_regeol - && swash_fetch(PL_utf8_X_T, - (U8*)locinput, utf8_target)) + && (len = is_GCB_T_utf8(locinput))) { - locinput += UTF8SKIP(locinput); + locinput += len; } } else { @@ -4145,10 +4127,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* Match L* */ while (locinput < PL_regeol - && swash_fetch(PL_utf8_X_L, - (U8*)locinput, utf8_target)) + && (len = is_GCB_L_utf8(locinput))) { - locinput += UTF8SKIP(locinput); + locinput += len; } /* Here, have exhausted L*. If the next character @@ -4158,8 +4139,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) * Are done. */ if (locinput < PL_regeol - && swash_fetch(PL_utf8_X_LV_LVT_V, - (U8*)locinput, utf8_target)) + && is_GCB_LV_LVT_V_utf8(locinput)) { /* Otherwise keep going. Must be LV, LVT or V. @@ -4172,22 +4152,18 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) * V* */ locinput += UTF8SKIP(locinput); while (locinput < PL_regeol - && swash_fetch(PL_utf8_X_V, - (U8*)locinput, - utf8_target)) + && (len = is_GCB_V_utf8(locinput))) { - locinput += UTF8SKIP(locinput); + locinput += len; } } /* And any of LV, LVT, or V can be followed - * by T* */ + * by T* */ while (locinput < PL_regeol - && swash_fetch(PL_utf8_X_T, - (U8*)locinput, - utf8_target)) + && (len = is_GCB_T_utf8(locinput))) { - locinput += UTF8SKIP(locinput); + locinput += len; } } } @@ -13376,15 +13376,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, PL_utf8_mark = sv_dup_inc(proto_perl->Iutf8_mark, param); PL_utf8_X_regular_begin = sv_dup_inc(proto_perl->Iutf8_X_regular_begin, param); PL_utf8_X_extend = sv_dup_inc(proto_perl->Iutf8_X_extend, param); - PL_utf8_X_prepend = sv_dup_inc(proto_perl->Iutf8_X_prepend, param); - PL_utf8_X_special_begin = sv_dup_inc(proto_perl->Iutf8_X_special_begin, param); - PL_utf8_X_L = sv_dup_inc(proto_perl->Iutf8_X_L, param); - /*not currently used: PL_utf8_X_LV = sv_dup_inc(proto_perl->Iutf8_X_LV, param);*/ PL_utf8_X_LVT = sv_dup_inc(proto_perl->Iutf8_X_LVT, param); - PL_utf8_X_RI = sv_dup_inc(proto_perl->Iutf8_X_RI, param); - PL_utf8_X_T = sv_dup_inc(proto_perl->Iutf8_X_T, param); - PL_utf8_X_V = sv_dup_inc(proto_perl->Iutf8_X_V, param); - PL_utf8_X_LV_LVT_V = sv_dup_inc(proto_perl->Iutf8_X_LV_LVT_V, param); PL_utf8_toupper = sv_dup_inc(proto_perl->Iutf8_toupper, param); PL_utf8_totitle = sv_dup_inc(proto_perl->Iutf8_totitle, param); PL_utf8_tolower = sv_dup_inc(proto_perl->Iutf8_tolower, param); diff --git a/unicode_constants.h b/unicode_constants.h index f915d59b8e..82081d910a 100644 --- a/unicode_constants.h +++ b/unicode_constants.h @@ -30,9 +30,6 @@ #define GREEK_SMALL_LETTER_UPSILON_UTF8_FIRST_BYTE 0xCF /* U+03C5 */ #define GREEK_SMALL_LETTER_UPSILON_UTF8_TAIL "\x85" /* U+03C5 */ -#define HANGUL_CHOSEONG_KIYEOK_UTF8 "\xE1\x84\x80" /* U+1100 */ -#define HANGUL_JUNGSEONG_FILLER_UTF8 "\xE1\x85\xA0" /* U+1160 */ -#define HANGUL_JONGSEONG_KIYEOK_UTF8 "\xE1\x86\xA8" /* U+11A8 */ #define HYPHEN_UTF8 "\xE2\x80\x90" /* U+2010 */ #define DELETE_NATIVE 0x007F /* U+007F */ @@ -2229,76 +2229,6 @@ Perl_is_utf8_X_extend(pTHX_ const U8 *p) return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend"); } -bool -Perl_is_utf8_X_prepend(pTHX_ const U8 *p) -{ - /* If no code points in the Unicode version being worked on match - * GCB=Prepend, this will set PL_utf8_X_prepend to &PL_sv_undef during its - * first call. Otherwise, it will set it to a swash created for it. - * swash_fetch() hence can't be used without checking first if it is valid - * to do so. */ - - dVAR; - bool initialized = cBOOL(PL_utf8_X_prepend); - bool ret; - - PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND; - - if (PL_utf8_X_prepend == &PL_sv_undef) { - return FALSE; - } - - if ((ret = is_utf8_common(p, &PL_utf8_X_prepend, "_X_GCB_Prepend")) - || initialized) - { - return ret; - } - - /* Here the code point being checked was not a prepend, and we hadn't - * initialized PL_utf8_X_prepend, so we don't know if it is just this - * particular input code point that didn't match, or if the table is - * completely empty. The is_utf8_common() call did the initialization, so - * we can inspect the swash's inversion list to find out. If there are no - * elements in its inversion list, it's empty, and nothing will ever match, - * so set things up so we can skip the check in future calls. */ - if (_invlist_len(_get_swash_invlist(PL_utf8_X_prepend)) == 0) { - SvREFCNT_dec(PL_utf8_X_prepend); - PL_utf8_X_prepend = &PL_sv_undef; - } - - return FALSE; -} - -bool -Perl_is_utf8_X_special_begin(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_X_SPECIAL_BEGIN; - - return is_utf8_common(p, &PL_utf8_X_special_begin, "_X_Special_Begin"); -} - -bool -Perl_is_utf8_X_L(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_X_L; - - return is_utf8_common(p, &PL_utf8_X_L, "_X_GCB_L"); -} - -bool -Perl_is_utf8_X_RI(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_X_RI; - - return is_utf8_common(p, &PL_utf8_X_RI, "_X_RI"); -} - /* These constants are for finding GCB=LV and GCB=LVT. These are for the * pre-composed Hangul syllables, which are all in a contiguous block and * arranged there in such a way so as to facilitate alorithmic determination of @@ -2367,35 +2297,6 @@ Perl_is_utf8_X_LVT(pTHX_ const U8 *p) && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */ } -bool -Perl_is_utf8_X_T(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_X_T; - - return is_utf8_common(p, &PL_utf8_X_T, "_X_GCB_T"); -} - -bool -Perl_is_utf8_X_V(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_X_V; - - return is_utf8_common(p, &PL_utf8_X_V, "_X_GCB_V"); -} - -bool -Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V; - - return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V"); -} bool Perl__is_utf8_quotemeta(pTHX_ const U8 *p) @@ -2408,7 +2309,6 @@ Perl__is_utf8_quotemeta(pTHX_ const U8 *p) return is_utf8_common(p, &PL_utf8_quotemeta, "_Perl_Quotemeta"); } - /* =for apidoc to_utf8_case |