summaryrefslogtreecommitdiff
path: root/regexec.c
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2015-02-17 15:03:32 -0700
committerKarl Williamson <khw@cpan.org>2015-02-19 22:55:01 -0700
commit64935bc6975bb01af403817752e88d6540c8711d (patch)
tree6a1619ac46d1501a5b296a2b69d9af0b98db8c58 /regexec.c
parent0e0b935601a8b7a2c56653412a94a36f986bc34f (diff)
downloadperl-64935bc6975bb01af403817752e88d6540c8711d.tar.gz
Add qr/\b{gcb}/
A function implements seeing if the space between any two characters is a grapheme cluster break. Afer I wrote this, I realized that an array lookup might be a better implementation, but the deadline for v5.22 was too close to change it. I did see that my gcc optimized it down to an array lookup. This makes the implementation of \X go from being complicated to trivial.
Diffstat (limited to 'regexec.c')
-rw-r--r--regexec.c500
1 files changed, 298 insertions, 202 deletions
diff --git a/regexec.c b/regexec.c
index a4fea0a735..95dae10cae 100644
--- a/regexec.c
+++ b/regexec.c
@@ -37,6 +37,9 @@
#include "re_top.h"
#endif
+#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
+ "Use of \\b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
+
/*
* pregcomp and pregexec -- regsub and regerror are not used in perl
*
@@ -191,18 +194,6 @@ static const char* const non_utf8_target_but_utf8_required
PL_XPosix_ptrs[_CC_WORDCHAR], \
LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
-#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
- STMT_START { \
- LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
- "_X_regular_begin", \
- NULL, \
- LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \
- LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
- "_X_extend", \
- NULL, \
- COMBINING_GRAVE_ACCENT_UTF8); \
- } STMT_END
-
#define PLACEHOLDER /* Something for the preprocessor to grab onto */
/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
@@ -262,16 +253,6 @@ static const char* const non_utf8_target_but_utf8_required
} \
} STMT_END
-/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
- * These are for the pre-composed Hangul syllables, which are all in a
- * contiguous block and arranged there in such a way so as to facilitate
- * alorithmic determination of their characteristics. As such, they don't need
- * a swash, but can be determined by simple arithmetic. Almost all are
- * GCB=LVT, but every 28th one is a GCB=LV */
-#define SBASE 0xAC00 /* Start of block */
-#define SCount 11172 /* Length of block */
-#define TCount 28
-
#define SLAB_FIRST(s) (&(s)->states[0])
#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
@@ -1728,6 +1709,33 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */ \
FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
+/* Takes a pointer to an inversion list, a pointer to its corresponding
+ * inversion map, and a code point, and returns the code point's value
+ * according to the two arrays. It assumes that all code points have a value.
+ * This is used as the base macro for macros for particular properties */
+#define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
+ invmap[_invlist_search(invlist, cp)]
+
+/* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
+ * of a code point, returning the value for the first code point in the string.
+ * And it takes the particular macro name that finds the desired value given a
+ * code point. Merely convert the UTF-8 to code point and call the cp macro */
+#define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
+ (__ASSERT_(pos < strend) \
+ /* Note assumes is valid UTF-8 */ \
+ (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
+
+/* Returns the GCB value for the input code point */
+#define getGCB_VAL_CP(cp) \
+ _generic_GET_BREAK_VAL_CP( \
+ PL_GCB_invlist, \
+ Grapheme_Cluster_Break_invmap, \
+ (cp))
+
+/* Returns the GCB value for the first code point in the UTF-8 encoded string
+ * bounded by pos and strend */
+#define getGCB_VAL_UTF8(pos, strend) \
+ _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
/* We know what class REx starts with. Try to find this position... */
/* if reginfo->intuit, its a dryrun */
@@ -1937,30 +1945,120 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
case BOUNDL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ if (FLAGS(c) != TRADITIONAL_BOUND) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+ B_ON_NON_UTF8_LOCALE_IS_WRONG);
+ goto do_boundu;
+ }
+
FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
break;
+
case NBOUNDL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ if (FLAGS(c) != TRADITIONAL_BOUND) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+ B_ON_NON_UTF8_LOCALE_IS_WRONG);
+ goto do_nboundu;
+ }
+
FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
break;
- case BOUND:
+
+ case BOUND: /* regcomp.c makes sure that this only has the traditional \b
+ meaning */
+ assert(FLAGS(c) == TRADITIONAL_BOUND);
+
FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
break;
- case BOUNDA:
+
+ case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
+ meaning */
+ assert(FLAGS(c) == TRADITIONAL_BOUND);
+
FBC_BOUND_A(isWORDCHAR_A);
break;
- case NBOUND:
+
+ case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
+ meaning */
+ assert(FLAGS(c) == TRADITIONAL_BOUND);
+
FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
break;
- case NBOUNDA:
+
+ case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
+ meaning */
+ assert(FLAGS(c) == TRADITIONAL_BOUND);
+
FBC_NBOUND_A(isWORDCHAR_A);
break;
- case BOUNDU:
- FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
- break;
+
case NBOUNDU:
- FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
+ if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
+ FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
+ break;
+ }
+
+ do_nboundu:
+
+ to_complement = 1;
+ /* FALLTHROUGH */
+
+ case BOUNDU:
+ do_boundu:
+ switch((bound_type) FLAGS(c)) {
+ case TRADITIONAL_BOUND:
+ FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
+ break;
+ case GCB_BOUND:
+ if (s == reginfo->strbeg) { /* GCB always matches at begin and
+ end */
+ if (to_complement ^ cBOOL(reginfo->intuit
+ || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+ s += (utf8_target) ? UTF8SKIP(s) : 1;
+ }
+
+ if (utf8_target) {
+ PL_GCB_enum before = getGCB_VAL_UTF8(
+ reghop3((U8*)s, -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend);
+ while (s < strend) {
+ PL_GCB_enum after = getGCB_VAL_UTF8((U8*) s,
+ (U8*) reginfo->strend);
+ if (to_complement ^ isGCB(before, after)) {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+ before = after;
+ }
+ s += UTF8SKIP(s);
+ }
+ }
+ else { /* Not utf8. Everything is a GCB except between CR and
+ LF */
+ while (s < strend) {
+ if (to_complement ^ (UCHARAT(s - 1) != '\r'
+ || UCHARAT(s) != '\n'))
+ {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+ s++;
+ }
+ }
+ }
+
+ if (to_complement ^ cBOOL(reginfo->intuit || regtry(reginfo, &s))) {
+ goto got_it;
+ }
+ break;
+ }
break;
+
case LNBREAK:
REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
is_LNBREAK_latin1_safe(s, strend)
@@ -3892,6 +3990,105 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
return TRUE;
}
+/* This creates a single number by combining two, with 'before' being like the
+ * 10's digit, but this isn't necessarily base 10; it is base however many
+ * elements of the enum there are */
+#define GCBcase(before, after) ((PL_GCB_ENUM_COUNT * before) + after)
+
+STATIC bool
+S_isGCB(const PL_GCB_enum before, const PL_GCB_enum after)
+{
+ /* returns a boolean indicating if there is a Grapheme Cluster Boundary
+ * between the inputs. See http://www.unicode.org/reports/tr29/ */
+
+ switch (GCBcase(before, after)) {
+
+ /* Break at the start and end of text.
+ GB1. sot ÷
+ GB2. ÷ eot
+
+ Break before and after controls except between CR and LF
+ GB4. ( Control | CR | LF ) ÷
+ GB5. ÷ ( Control | CR | LF )
+
+ Otherwise, break everywhere.
+ GB10. Any ÷ Any */
+ default:
+ return TRUE;
+
+ /* Do not break between a CR and LF.
+ GB3. CR × LF */
+ case GCBcase(PL_GCB_CR, PL_GCB_LF):
+ return FALSE;
+
+ /* Do not break Hangul syllable sequences.
+ GB6. L × ( L | V | LV | LVT ) */
+ case GCBcase(PL_GCB_L, PL_GCB_L):
+ case GCBcase(PL_GCB_L, PL_GCB_V):
+ case GCBcase(PL_GCB_L, PL_GCB_LV):
+ case GCBcase(PL_GCB_L, PL_GCB_LVT):
+ return FALSE;
+
+ /* GB7. ( LV | V ) × ( V | T ) */
+ case GCBcase(PL_GCB_LV, PL_GCB_V):
+ case GCBcase(PL_GCB_LV, PL_GCB_T):
+ case GCBcase(PL_GCB_V, PL_GCB_V):
+ case GCBcase(PL_GCB_V, PL_GCB_T):
+ return FALSE;
+
+ /* GB8. ( LVT | T) × T */
+ case GCBcase(PL_GCB_LVT, PL_GCB_T):
+ case GCBcase(PL_GCB_T, PL_GCB_T):
+ return FALSE;
+
+ /* Do not break between regional indicator symbols.
+ GB8a. Regional_Indicator × Regional_Indicator */
+ case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_Regional_Indicator):
+ return FALSE;
+
+ /* Do not break before extending characters.
+ GB9. × Extend */
+ case GCBcase(PL_GCB_Other, PL_GCB_Extend):
+ case GCBcase(PL_GCB_Extend, PL_GCB_Extend):
+ case GCBcase(PL_GCB_L, PL_GCB_Extend):
+ case GCBcase(PL_GCB_LV, PL_GCB_Extend):
+ case GCBcase(PL_GCB_LVT, PL_GCB_Extend):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_Extend):
+ case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_Extend):
+ case GCBcase(PL_GCB_SpacingMark, PL_GCB_Extend):
+ case GCBcase(PL_GCB_T, PL_GCB_Extend):
+ case GCBcase(PL_GCB_V, PL_GCB_Extend):
+ return FALSE;
+
+ /* Do not break before SpacingMarks, or after Prepend characters.
+ GB9a. × SpacingMark */
+ case GCBcase(PL_GCB_Other, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_Extend, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_L, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_LV, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_LVT, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_SpacingMark, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_T, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_V, PL_GCB_SpacingMark):
+ return FALSE;
+
+ /* GB9b. Prepend × */
+ case GCBcase(PL_GCB_Prepend, PL_GCB_Other):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_L):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_LV):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_LVT):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_Prepend):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_Regional_Indicator):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_T):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_V):
+ return FALSE;
+ }
+
+ NOT_REACHED;
+}
+
/* returns -1 on failure, $+[0] on success */
STATIC SSize_t
S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
@@ -3964,6 +4161,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
int to_complement; /* Invert the result? */
_char_class_number classnum;
bool is_utf8_pat = reginfo->is_utf8_pat;
+ bool match = FALSE;
+
#ifdef DEBUGGING
GET_RE_DEBUG_FLAGS_DECL;
@@ -4623,13 +4822,21 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
break;
}
- /* XXX At that point regcomp.c would no longer * have to set the FLAGS fields of these */
case NBOUNDL: /* /\B/l */
to_complement = 1;
/* FALLTHROUGH */
case BOUNDL: /* /\b/l */
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ if (FLAGS(scan) != TRADITIONAL_BOUND) {
+ if (! IN_UTF8_CTYPE_LOCALE) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+ B_ON_NON_UTF8_LOCALE_IS_WRONG);
+ }
+ goto boundu;
+ }
+
if (utf8_target) {
if (locinput == reginfo->strbeg)
ln = isWORDCHAR_LC('\n');
@@ -4696,9 +4903,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
/* FALLTHROUGH */
case BOUNDU: /* /\b/u */
+
+ boundu:
if (utf8_target) {
- bound_utf8:
+ bound_utf8:
+ switch((bound_type) FLAGS(scan)) {
+ case TRADITIONAL_BOUND:
ln = (locinput == reginfo->strbeg)
? isWORDCHAR_L1('\n')
: isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
@@ -4706,18 +4917,55 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
n = (NEXTCHR_IS_EOS)
? isWORDCHAR_L1('\n')
: isWORDCHAR_utf8((U8*)locinput);
+
+ match = ln != n;
+ break;
+ case GCB_BOUND:
+ if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
+ match = TRUE; /* GCB always matches at begin and
+ end */
+ }
+ else {
+ /* Find the gcb values of previous and current
+ * chars, then see if is a break point */
+ match = isGCB(getGCB_VAL_UTF8(
+ reghop3((U8*)locinput,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend),
+ getGCB_VAL_UTF8((U8*) locinput,
+ (U8*) reginfo->strend));
+ }
+ break;
+ }
}
- else {
+ else { /* Not utf8 target */
+ switch((bound_type) FLAGS(scan)) {
+ case TRADITIONAL_BOUND:
ln = (locinput == reginfo->strbeg)
? isWORDCHAR_L1('\n')
: isWORDCHAR_L1(UCHARAT(locinput - 1));
n = (NEXTCHR_IS_EOS)
? isWORDCHAR_L1('\n')
: isWORDCHAR_L1(nextchr);
+ match = ln != n;
+ break;
+ case GCB_BOUND:
+ if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
+ match = TRUE; /* GCB always matches at begin and
+ end */
+ }
+ else { /* Only CR-LF combo isn't a GCB in 0-255
+ range */
+ match = UCHARAT(locinput - 1) != '\r'
+ || UCHARAT(locinput) != '\n';
+ }
+ break;
+ }
}
- if (to_complement ^ (ln == n)) {
+ if (to_complement ^ ! match) {
sayNO;
}
break;
@@ -4921,38 +5169,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
case CLUMP: /* Match \X: logical Unicode character. This is defined as
a Unicode extended Grapheme Cluster */
- /* From http://www.unicode.org/reports/tr29 (5.2 version). An
- extended Grapheme Cluster is:
-
- CR LF
- | Prepend* Begin Extend*
- | .
-
- Begin is: ( Special_Begin | ! Control )
- Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
- Extend is: ( Grapheme_Extend | Spacing_Mark )
- Control is: [ GCB_Control | CR | LF ]
- Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
-
- If we create a 'Regular_Begin' = Begin - Special_Begin, then
- we can rewrite
-
- Begin is ( Regular_Begin + Special Begin )
-
- It turns out that 98.4% of all Unicode code points match
- Regular_Begin. Doing it this way eliminates a table match in
- the previous implementation for almost all Unicode code points.
-
- There is a subtlety with Prepend* which showed up in testing.
- Note that the Begin, and only the Begin is required in:
- | Prepend* Begin Extend*
- Also, Begin contains '! Control'. A Prepend must be a
- '! Control', which means it must also be a Begin. What it
- comes down to is that if we match Prepend* and then find no
- suitable Begin afterwards, that if we backtrack the last
- Prepend, that one will be a suitable Begin.
- */
-
if (NEXTCHR_IS_EOS)
sayNO;
if (! utf8_target) {
@@ -4970,147 +5186,27 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
}
else {
- /* Utf8: See if is ( CR LF ); already know that locinput <
- * reginfo->strend, so locinput+1 is in bounds */
- if ( nextchr == '\r' && locinput+1 < reginfo->strend
- && UCHARAT(locinput + 1) == '\n')
- {
- locinput += 2;
- }
- else {
- STRLEN len;
-
- /* In case have to backtrack to beginning, then match '.' */
- char *starting = locinput;
-
- /* In case have to backtrack the last prepend */
- char *previous_prepend = NULL;
+ /* Get the gcb type for the current character */
+ PL_GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
+ (U8*) reginfo->strend);
- LOAD_UTF8_CHARCLASS_GCB();
-
- /* Match (prepend)* */
- while (locinput < reginfo->strend
- && (len = is_GCB_Prepend_utf8(locinput)))
- {
- previous_prepend = locinput;
- locinput += len;
- }
-
- /* As noted above, if we matched a prepend character, but
- * the next thing won't match, back off the last prepend we
- * matched, as it is guaranteed to match the begin */
- if (previous_prepend
- && (locinput >= reginfo->strend
- || (! swash_fetch(PL_utf8_X_regular_begin,
- (U8*)locinput, utf8_target)
- && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
- )
- {
- locinput = previous_prepend;
- }
-
- /* Note that here we know reginfo->strend > locinput, as we
- * tested that upon input to this switch case, and if we
- * moved locinput forward, we tested the result just above
- * and it either passed, or we backed off so that it will
- * now pass */
- if (swash_fetch(PL_utf8_X_regular_begin,
- (U8*)locinput, utf8_target)) {
- locinput += UTF8SKIP(locinput);
+ /* Then scan through the input until we get to the first
+ * character whose type is supposed to be a gcb with the
+ * current character. (There is always a break at the
+ * end-of-input) */
+ locinput += UTF8SKIP(locinput);
+ while (locinput < reginfo->strend) {
+ PL_GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
+ (U8*) reginfo->strend);
+ if (isGCB(prev_gcb, cur_gcb)) {
+ break;
}
- else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
-
- /* Here did not match the required 'Begin' in the
- * second term. So just match the very first
- * character, the '.' of the final term of the regex */
- locinput = starting + UTF8SKIP(starting);
- goto exit_utf8;
- } else {
-
- /* Here is a special begin. It can be composed of
- * several individual characters. One possibility is
- * RI+ */
- if ((len = is_GCB_RI_utf8(locinput))) {
- locinput += len;
- while (locinput < reginfo->strend
- && (len = is_GCB_RI_utf8(locinput)))
- {
- locinput += len;
- }
- } else if ((len = is_GCB_T_utf8(locinput))) {
- /* Another possibility is T+ */
- locinput += len;
- while (locinput < reginfo->strend
- && (len = is_GCB_T_utf8(locinput)))
- {
- locinput += len;
- }
- } else {
- /* Here, neither RI+ nor T+; must be some other
- * Hangul. That means it is one of the others: L,
- * LV, LVT or V, and matches:
- * L* (L | LVT T* | V * V* T* | LV V* T*) */
-
- /* Match L* */
- while (locinput < reginfo->strend
- && (len = is_GCB_L_utf8(locinput)))
- {
- locinput += len;
- }
-
- /* Here, have exhausted L*. If the next character
- * is not an LV, LVT nor V, it means we had to have
- * at least one L, so matches L+ in the original
- * equation, we have a complete hangul syllable.
- * Are done. */
+ prev_gcb = cur_gcb;
+ locinput += UTF8SKIP(locinput);
+ }
- if (locinput < reginfo->strend
- && is_GCB_LV_LVT_V_utf8(locinput))
- {
- /* Otherwise keep going. Must be LV, LVT or V.
- * See if LVT, by first ruling out V, then LV */
- if (! is_GCB_V_utf8(locinput)
- /* All but every TCount one is LV */
- && (valid_utf8_to_uvchr((U8 *) locinput,
- NULL)
- - SBASE)
- % TCount != 0)
- {
- locinput += UTF8SKIP(locinput);
- } else {
-
- /* Must be V or LV. Take it, then match
- * V* */
- locinput += UTF8SKIP(locinput);
- while (locinput < reginfo->strend
- && (len = is_GCB_V_utf8(locinput)))
- {
- locinput += len;
- }
- }
- /* And any of LV, LVT, or V can be followed
- * by T* */
- while (locinput < reginfo->strend
- && (len = is_GCB_T_utf8(locinput)))
- {
- locinput += len;
- }
- }
- }
- }
-
- /* Match any extender */
- while (locinput < reginfo->strend
- && swash_fetch(PL_utf8_X_extend,
- (U8*)locinput, utf8_target))
- {
- locinput += UTF8SKIP(locinput);
- }
- }
- exit_utf8:
- if (locinput > reginfo->strend) sayNO;
}
break;